Compare commits

..

74 Commits

Author SHA1 Message Date
86cc7add2d Fix status code errors 2024-11-04 14:40:16 -05:00
f529f8fcd2 Fix JSON decoding error 2024-11-04 14:26:15 -05:00
24119db4b1 Update README badge order 2024-10-25 23:48:01 -04:00
a060b41791 Update README 2024-10-25 23:45:20 -04:00
febda81f7b Fix incorrect ifup commands 2024-10-24 12:50:01 -04:00
34e1335fce Break system packages 2024-10-24 12:41:06 -04:00
640bdc0552 Add more missing dependencies 2024-10-17 15:26:53 -04:00
2097bf954b Add support for Debian 12 2024-10-17 15:25:06 -04:00
1038d5c576 Ensure interface is up before bootstrap init 2024-10-17 13:14:52 -04:00
d2b792c414 Update install dependencies for Debian 12 2024-10-17 12:50:03 -04:00
0907e1d7d2 Add OSD count configuration handling 2024-09-23 12:58:49 -04:00
c0acaafc61 Update description of detect: strings 2024-08-30 11:07:11 -04:00
40f30ce467 Perform second submodule update after init 2023-10-24 10:47:20 -04:00
32457f2427 Append to backends in apt-cacher-ng 2023-09-05 10:52:51 -04:00
96c9643753 Add python3-ansible-runner dependency 2023-09-05 10:51:09 -04:00
700d09d54f Add missing lock_file var 2023-09-05 10:50:31 -04:00
1dc4f98432 Fix name of key_file variable 2023-09-05 10:49:03 -04:00
cfe40da677 Add python3-git to installed packages 2023-09-05 10:47:16 -04:00
9e0e2f0c76 Lower initial wait time to 30s 2023-09-01 19:23:22 +00:00
ed0ab06d2c Add support for Debian release 2023-08-31 16:50:17 +00:00
286d7aad44 Fix incorrect error handling 2023-08-30 17:24:55 +00:00
3f0b0b2d7b Add support for customized mirror and apt-cacher 2023-08-30 14:30:13 +00:00
6cfc75e321 Simplify stdout/stderr handling 2023-08-30 09:47:45 -04:00
cac2e2a2b8 Better handle errors in TFTP bootstrap 2023-08-30 09:37:26 -04:00
7230ba6121 Fix cluster name 2023-03-13 16:13:52 +00:00
9cb675a60f Fix name of Ansible bootstrap trigger var 2023-01-16 19:37:51 +00:00
090e39694c Fix various delays and bugs 2023-01-16 19:37:19 +00:00
83118331a5 Wait longer for final poweroff and adjust msgs 2022-10-27 16:39:37 +00:00
05bd7d1711 Fix bad ref to cluster 2022-10-27 16:35:49 +00:00
5db1438328 Fix completion for all nodes 2022-10-27 15:31:05 +00:00
1c3c59b6f0 Add check if node is registered
Avoids reprovisioning existing nodes until they are manually removed
from the database.
2022-10-26 18:10:30 +00:00
b4796ef4c9 Revert incorrect reference call and use right data 2022-10-26 14:53:04 +00:00
d025a47d82 Correct returns for copy and script execution 2022-10-26 14:41:06 +00:00
2681ccf577 Fix incorrect class reference 2022-10-26 13:54:39 +00:00
247fc866a2 Handle return codes from paramiko commands 2022-10-25 21:17:12 +00:00
96ac9bcb75 Add additional detail in commit 2022-10-25 21:03:30 +00:00
884867989a Reduce second wait and add hook after 2022-10-25 20:01:51 +00:00
08ba856288 Standardize names and lock config 2022-10-25 19:25:38 +00:00
00ac00ae2c Add locking to git commands
Avoids conflicting attempts when multiple hosts check in at once.
2022-10-25 19:25:38 +00:00
fdae20c3c6 Add additional detail for status diagram 2022-10-24 10:11:09 -04:00
41d259101d Add additional details on detect strings 2022-10-24 10:09:43 -04:00
5ceb589af7 Update second reference too 2022-10-24 09:55:45 -04:00
9b589e5be1 Mention webhook system in readme 2022-10-24 09:54:04 -04:00
9b3a8c9d40 Unify remaining message 2022-08-02 18:25:47 +00:00
8d8898077f Add additional error state webhooks
Ensures that webhooks are sent for failures during Redfish setup.
2022-08-02 18:23:54 +00:00
f927118f5b Refactor completion of cluster configuration
Instead of rebooting one final time before shutting down, just shut it
down directly after performing hooks. This allows hooks to prestage
things like the `--autostart` flag of a VM properly.
2022-07-08 19:19:29 +00:00
8d7e3d8dc8 Handle login/logout failures more gracefully 2022-07-08 14:32:50 +00:00
bdf72f90c5 Fix bad octal conversion 2022-07-07 23:29:51 -04:00
8201535b5b Add triggerword support for completed runs 2022-07-07 21:09:22 +00:00
00dd050151 Alter final message text 2022-07-07 21:02:49 +00:00
75de65fe71 Quote error in messages 2022-07-07 20:26:50 +00:00
0ca705ddd9 Make sure mode is set as integer 2022-07-07 20:19:04 +00:00
5c2653395d Update message icon for init 2022-07-07 19:25:10 +00:00
30f8368886 Remove git pull message
This happens much too frequently to notify on.
2022-07-07 19:07:35 +00:00
21bcf32ac7 Move git messages around 2022-07-07 18:34:33 +00:00
f10e0930e4 Fix bad hook variable 2022-07-07 18:10:27 +00:00
a30c94bcb4 Correct messages for install start/finish 2022-07-07 17:45:27 +00:00
0b026faaca Fix boot source override further 2022-07-07 17:34:46 +00:00
6ab39cd2e2 Add further delay to avoid errors 2022-07-07 17:20:35 +00:00
405b4a63f7 Fix boot source override 2022-07-07 17:19:30 +00:00
6acacc153c Move delay further 2022-07-07 17:15:38 +00:00
668b5c9939 Fix bug with mgr attributes 2022-07-07 17:11:00 +00:00
fe3d79b5f1 Fix typo in boot override configuration 2022-07-07 16:52:35 +00:00
59228ed2eb Add better messages and move normalization wait 2022-07-07 16:50:25 +00:00
a30ac176f4 Improve boot override handling for Dell 2022-07-07 16:23:29 +00:00
fe6f34a3eb Add note message type and improve messages 2022-07-07 16:23:18 +00:00
9a9caae923 Fix where path is expanded 2022-07-07 15:39:06 +00:00
0565a6b635 Fix relative paths in copy hook 2022-07-06 22:25:25 +00:00
26c86a40fd Add replcfg option to pool creation 2022-07-06 20:05:28 +00:00
d68c3a32b5 Fix bug with network hooks 2022-07-06 19:50:24 +00:00
c86a14477c Add sudo option to script hooks 2022-07-06 19:42:25 +00:00
61f7f33c92 Adjust some completion messages 2022-07-06 19:03:24 +00:00
b0fb48925f Adjust message of git initialization 2022-07-06 18:41:20 +00:00
c690cdb920 Fix some bugs 2022-07-06 18:37:16 +00:00
16 changed files with 568 additions and 328 deletions

View File

@@ -1,18 +1,46 @@
<p align="center">
<img alt="Logo banner" src="https://docs.parallelvirtualcluster.org/en/latest/images/pvc_logo_black.png"/>
<br/><br/>
<a href="https://www.parallelvirtualcluster.org"><img alt="Website" src="https://img.shields.io/badge/visit-website-blue"/></a>
<a href="https://github.com/parallelvirtualcluster/pvc/releases"><img alt="Latest Release" src="https://img.shields.io/github/release-pre/parallelvirtualcluster/pvc"/></a>
<a href="https://docs.parallelvirtualcluster.org/en/latest/?badge=latest"><img alt="Documentation Status" src="https://readthedocs.org/projects/parallelvirtualcluster/badge/?version=latest"/></a>
<a href="https://github.com/parallelvirtualcluster/pvc"><img alt="License" src="https://img.shields.io/github/license/parallelvirtualcluster/pvc"/></a>
<a href="https://github.com/psf/black"><img alt="Code style: Black" src="https://img.shields.io/badge/code%20style-black-000000.svg"/></a>
</p>
## What is PVC?
PVC is a Linux KVM-based hyperconverged infrastructure (HCI) virtualization cluster solution that is fully Free Software, scalable, redundant, self-healing, self-managing, and designed for administrator simplicity. It is an alternative to other HCI solutions such as Ganeti, Harvester, Nutanix, and VMware, as well as to other common virtualization stacks such as Proxmox and OpenStack.
PVC is a complete HCI solution, built from well-known and well-trusted Free Software tools, to assist an administrator in creating and managing a cluster of servers to run virtual machines, as well as self-managing several important aspects including storage failover, node failure and recovery, virtual machine failure and recovery, and network plumbing. It is designed to act consistently, reliably, and unobtrusively, letting the administrator concentrate on more important things.
PVC is highly scalable. From a minimum (production) node count of 3, up to 12 or more, and supporting many dozens of VMs, PVC scales along with your workload and requirements. Deploy a cluster once and grow it as your needs expand.
As a consequence of its features, PVC makes administrating very high-uptime VMs extremely easy, featuring VM live migration, built-in always-enabled shared storage with transparent multi-node replication, and consistent network plumbing throughout the cluster. Nodes can also be seamlessly removed from or added to service, with zero VM downtime, to facilitate maintenance, upgrades, or other work.
PVC also features an optional, fully customizable VM provisioning framework, designed to automate and simplify VM deployments using custom provisioning profiles, scripts, and CloudInit userdata API support.
Installation of PVC is accomplished by two main components: a [Node installer ISO](https://github.com/parallelvirtualcluster/pvc-installer) which creates on-demand installer ISOs, and an [Ansible role framework](https://github.com/parallelvirtualcluster/pvc-ansible) to configure, bootstrap, and administrate the nodes. Installation can also be fully automated with a companion [cluster bootstrapping system](https://github.com/parallelvirtualcluster/pvc-bootstrap). Once up, the cluster is managed via an HTTP REST API, accessible via a Python Click CLI client ~~or WebUI~~ (eventually).
Just give it physical servers, and it will run your VMs without you having to think about it, all in just an hour or two of setup time.
More information about PVC, its motivations, the hardware requirements, and setting up and managing a cluster [can be found over at our docs page](https://docs.parallelvirtualcluster.org).
# PVC Bootstrap System
The PVC bootstrap system provides a convenient way to deploy PVC clusters. Rather than manual node installation, this system provides a fully-automated deployment from node powering to cluster readiness, based on pre-configured values. It is useful if an administrator will deploy several PVC clusters or for repeated re-deployment for testing purposes.
## Setup
# Setup
Setting up the PVC bootstrap system manually is very complicated, and has thus been automated with an installer script instead of providing a Debian or PIP package.
### Preparing to use the PVC Bootstrap system
## Preparing to use the PVC Bootstrap system
1. Prepare a Git repository to store cluster configurations. This can be done automatically with the `create-local-repo.sh` script in the [PVC Ansible](https://github.com/parallelvirtualcluster/pvc-ansible) repository.
1. Create `group_vars` for each cluster you plan to bootstrap. Additionally, ensure you configure the `bootstrap.yml` file for each cluster with the relevant details of the hardware you will be using. This step can be repeated for each cluster in the future as new clusters are required, and the system will automatically pull changes to the local PVC repository once configured.
### Preparing a PVC Bootstrap host
## Preparing a PVC Bootstrap host
1. The recommended OS for a PVC Bootstrap host is Debian GNU/Linux 10+. In terms of hardware, there are several supported options:
@@ -28,7 +56,7 @@ Setting up the PVC bootstrap system manually is very complicated, and has thus b
1. Run the `./install-pvcbootstrapd.sh` script from the root of the repository to install the PVC Bootstrap system on the host. It will prompt for several configuration parameters. The final steps will take some time (up to 2 hours on a Raspberry Pi 4B) so be patient.
### Networking for Bootstrap
## Networking for Bootstrap
When using the pvcbootstrapd system, a dedicated network is required to provide bootstrap DHCP and TFTP to the cluster. This network can either have a dedicated, upstream router that does not provide DHCP, or the network can be routed with network address translation (NAT) through the bootstrap host. By default, the installer will configure the latter automatically using a second NIC separate from the upstream NIC of the bootstrap host, or via a vLAN on top of the single NIC.
@@ -48,7 +76,7 @@ Consider the following diagram for reference:
![Overall Network Topology](/docs/images/pvcbootstrapd-net.png)
### Deploying a Cluster with PVC Bootstrap - Redfish
## Deploying a Cluster with PVC Bootstrap - Redfish
Redfish is an industry-standard RESTful API for interfacing with the BMC (baseband management controller, or out-of-band network management system) on modern (post ~2015) servers from most vendors, including Dell iDRAC, HP iLO, Cisco CIMC, Lenovo XCC, and Supermicro X10 and newer BMCs. Redfish allows remote management, data collection, and configuration from the BMC in a standardized way across server vendors.
@@ -60,11 +88,11 @@ The PVC Bootstrap system is designed to heavily leverage Redfish in its automati
1. Connect power to the servers, but do not manually power on the servers - Redfish will handle this aspect after characterizing each host, as well as manage boot, RAID array creation (as documented in `bootstrap.yml`), BIOS configuration, etc.
1. Wait for the cluster bootstrapping to complete; you can watch the output of the `pvcbootstrapd` and `pvcbootstrapd-worker` services on the Bootstrap host to see progress. If supported, the indicator LEDs of the nodes will be lit during setup and will be disabled upon completion to provide a physical indication of the process.
1. Wait for the cluster bootstrapping to complete; you can watch the output of the `pvcbootstrapd` and `pvcbootstrapd-worker` services on the Bootstrap host to see progress, or configure the system to send webhooks to a remote target (e.g. Slack/Mattermost messages). If supported, the indicator LEDs of the nodes will be lit during setup and will be disabled upon completion to provide a physical indication of the process.
1. Verify and power off the servers and put them into production; you may need to complete several post-install tasks (for instance setting the production BMC networking via `sudo ifup ipmi` on each node) before the cluster is completely finished.
### Deploying a Cluster with PVC Bootstrap - Non-Redfish
## Deploying a Cluster with PVC Bootstrap - Non-Redfish
The PVC Bootstrap system can still handle nodes without Redfish support, for instance older servers or those from non-compliant vendors. There is however more manual setup in the process. The steps are thus:
@@ -84,11 +112,11 @@ The PVC Bootstrap system can still handle nodes without Redfish support, for ins
1. Power on the servers and set them to boot temporarily (one time) from PXE.
1. Wait for the cluster bootstrapping to complete; you can watch the output of the `pvcbootstrapd` and `pvcbootstrapd-worker` services on the Bootstrap host to see progress. If supported, the indicator LEDs of the nodes will be lit during setup and will be disabled upon completion to provide a physical indication of the process.
1. Wait for the cluster bootstrapping to complete; you can watch the output of the `pvcbootstrapd` and `pvcbootstrapd-worker` services on the Bootstrap host to see progress, or configure the system to send webhooks to a remote target (e.g. Slack/Mattermost messages). If supported, the indicator LEDs of the nodes will be lit during setup and will be disabled upon completion to provide a physical indication of the process.
1. Verify and power off the servers and put them into production; you may need to complete several post-install tasks (for instance setting the production BMC networking via `sudo ifup ipmi` on each node) before the cluster is completely finished.
#### `host-MAC.ipxe`
### `host-MAC.ipxe`
```
#!ipxe
@@ -106,7 +134,7 @@ The PVC Bootstrap system can still handle nodes without Redfish support, for ins
set imgargs-host ARGUMENTS
```
#### `host-MAC.preseed`
### `host-MAC.preseed`
```
# The name of this file is "host-123456abcdef.preseed", where "123456abcdef" is the MAC address of the
@@ -127,9 +155,9 @@ set imgargs-host ARGUMENTS
# This file is thus not designed to be used by humans, and its values are seeded via options in
# the cluster-local Ansible group_vars, though it can be used as a manual template if required.
###
### General definitions/overrides
###
##
## General definitions/overrides
##
# The Debian release to use (overrides the default)
debrelease="bullseye"
@@ -143,20 +171,31 @@ addpkglist="ca-certificates"
filesystem="ext4"
###
### Per-host definitions (required)
###
##
## Per-host definitions (required)
##
# The hostname of the system (set per-run)
target_hostname="hv1.example.tld"
# The target system disk path
# The target system disk path; must be a single disk (mdadm/software RAID is not supported)
# This will usually use a `detect` string. A "detect" string is a string in the form "detect:<NAME>:<HUMAN-SIZE>:<ID>".
# Detect strings allow for automatic determination of Linux block device paths from known basic information
# about disks by leveraging "lsscsi"/"nvme" on the target host.
# The "NAME" should be some descriptive identifier that would be part of the device's Model information, for instance
# the manufacturer (e.g. "INTEL") or a similar unique string (e.g. "BOSS" for Dell BOSS cards).
# The "HUMAN-SIZE" should be the labeled human-readable size of the device (e.g. "480GB", "1.92TB").
# The "ID" specifies the Nth 0-indexed device which matches the "NAME" and "HUMAN-SIZE" values (e.g. "2" would match the
# third device with the corresponding "NAME" and "HUMAN-SIZE").
# When matching against sizes, there is +/- 3% flexibility to account for base-1000 vs. base-1024 differences and
# rounding errors.
# The "NAME" may contain whitespace but if so the entire detect string should be quoted, and is case-insensitive.
target_disk="detect:LOGICAL:146GB:0"
# SSH key method (usually tftp)
# SSH key fetch method (usually tftp)
target_keys_method="tftp"
# SSH key path (usually keys.txt)
# SSH key fetch path (usually keys.txt)
target_keys_path="keys.txt"
# Deploy username (usually deploy)
@@ -177,8 +216,8 @@ target_deploy_user="deploy"
pvcbootstrapd_checkin_uri="http://10.255.255.1:9999/checkin/host"
```
## Bootstrap Process
# Bootstrap Process
This diagram outlines the various states the nodes and clusters will be in throughout the setup process along with the individual steps for reference.
This diagram outlines the various states the nodes and clusters will be in throughout the setup process along with the individual steps for reference. Which node starts characterizing first can be random, but is shown as `node1` for clarity. For non-Redfish installs, the first several steps must be completed manually as referenced above.
![PVC Bootstrap Process](/docs/images/pvcbootstrapd-process.png)

View File

@@ -27,12 +27,8 @@ case "$( cat /etc/debian_version )" in
10.*)
CELERY_ARGS="worker --app pvcbootstrapd.flaskapi.celery --concurrency 99 --pool gevent --loglevel DEBUG"
;;
11.*)
CELERY_ARGS="--app pvcbootstrapd.flaskapi.celery worker --concurrency 99 --pool gevent --loglevel DEBUG"
;;
*)
echo "Invalid Debian version found!"
exit 1
CELERY_ARGS="--app pvcbootstrapd.flaskapi.celery worker --concurrency 99 --pool gevent --loglevel DEBUG"
;;
esac

View File

@@ -58,15 +58,24 @@ pvc:
# Per-host TFTP path (almost always "/host" under "root_path"; must be writable)
host_path: "/srv/tftp/pvc-installer/host"
# Debian repository configuration
repo:
# Mirror path; defaults to using the apt-cacher-ng instance located on the current machine
# Replace "10.199.199.254" if you change "dhcp" -> "address" above
mirror: http://10.199.199.254:3142/ftp.debian.org/debian
# Default Debian release for new clusters. Must be supported by PVC ("buster", "bullseye", "bookworm").
release: bookworm
# PVC Ansible repository configuration
# Note: If "path" does not exist, "remote" will be cloned to it via Git using SSH private key "keyfile".
# Note: If "path" does not exist, "remote" will be cloned to it via Git using SSH private key "key_file".
# Note: The VCS will be refreshed regularly via the API in response to webhooks.
ansible:
# Path to the VCS repository
path: "/var/home/joshua/pvc"
# Path to the deploy key (if applicable) used to clone and pull the repository
keyfile: "/var/home/joshua/id_ed25519.joshua.key"
key_file: "/var/home/joshua/id_ed25519.joshua.key"
# Git remote URI for the repository
remote: "ssh://git@git.bonifacelabs.ca:2222/bonifacelabs/pvc.git"
@@ -77,6 +86,9 @@ pvc:
# Clusters configuration file
clusters_file: "clusters.yml"
# Lock file to use for Git interaction
lock_file: "/run/pvcbootstrapd.lock"
# Filenames of the various group_vars components of a cluster
# Generally with pvc-ansible this will contain 2 files: "base.yml", and "pvc.yml"; refer to the
# pvc-ansible documentation and examples for details on these files.
@@ -101,10 +113,14 @@ pvc:
action: post
# Icons to use for various status types; embedded in the message with `{icon}`
icons:
info: "❕" # A note about an event
begin: "🤞" # A task is beginning
success: "✅" # A task succeeded
failure: "❌" # A task failed
completed: "👌" # A task is completed
# A trigger word (no whitespace) added to the end of the completed cluster message; this can be used
# for pings in various chat systems (e.g. Mattermost)
completed_triggerword: "#pvcbootstrapcompleted"
# The webhook body elements; this is specific to the webhook target, and is converted into raw
# JSON before sending.
# Two special variables are used: "{icon}" displays one of the above icons, and "{message}" displays

View File

@@ -21,12 +21,16 @@ pvc:
tftp:
root_path: "ROOT_DIRECTORY/tftp"
host_path: "ROOT_DIRECTORY/tftp/host"
repo:
mirror: http://BOOTSTRAP_ADDRESS:3142/UPSTREAM_MIRROR
release: DEBIAN_RELEASE
ansible:
path: "ROOT_DIRECTORY/repo"
keyfile: "ROOT_DIRECTORY/id_ed25519"
key_file: "ROOT_DIRECTORY/id_ed25519"
remote: "GIT_REMOTE"
branch: "GIT_BRANCH"
clusters_file: "clusters.yml"
lock_file: "/run/pvcbootstrapd.lock"
cspec_files:
base: "base.yml"
pvc: "pvc.yml"
@@ -35,7 +39,9 @@ pvc:
enabled: false
uri: https://mattermost.domain.tld/hooks/asecretstring
action: post
completed_triggerword: "#pvcbootstrapcompleted"
icons:
info: "❕" # A note about an event
begin: "🤞" # A task is beginning
success: "✅" # A task succeeded
failure: "❌" # A task failed

View File

@@ -121,6 +121,7 @@ def read_config():
o_queue = o_base["queue"]
o_dhcp = o_base["dhcp"]
o_tftp = o_base["tftp"]
o_repo = o_base["repo"]
o_ansible = o_base["ansible"]
o_notifications = o_base["notifications"]
except KeyError as k:
@@ -178,8 +179,17 @@ def read_config():
f"Missing second-level key '{key}' under 'tftp'"
)
# Get the Repo configuration
for key in ["mirror", "release"]:
try:
config[f"repo_{key}"] = o_repo[key]
except Exception:
raise MalformedConfigurationError(
f"Missing second-level key '{key}' under 'repo'"
)
# Get the Ansible configuration
for key in ["path", "keyfile", "remote", "branch", "clusters_file"]:
for key in ["path", "key_file", "remote", "branch", "clusters_file", "lock_file"]:
try:
config[f"ansible_{key}"] = o_ansible[key]
except Exception:
@@ -205,7 +215,7 @@ def read_config():
)
# Get the Notifications configuration
for key in ["enabled", "uri", "action", "icons", "body"]:
for key in ["enabled", "uri", "action", "icons", "body", "completed_triggerword"]:
try:
config[f"notifications_{key}"] = o_notifications[key]
except Exception:
@@ -248,10 +258,7 @@ def entrypoint():
print("|----------------------------------------------------------|")
print("")
notifications.send_webhook(config, "begin", "Starting up pvcbootstrapd")
cspec = git.load_cspec_yaml(config)
print(cspec)
notifications.send_webhook(config, "info", "Initializing pvcbootstrapd")
# Initialize the database
db.init_database(config)
@@ -264,7 +271,7 @@ def entrypoint():
if "--init-only" in argv:
print("Successfully initialized pvcbootstrapd; exiting.")
notifications.send_webhook(config, "success", "Successfully initialized pvcbootstrapd")
notifications.send_webhook(config, "completed", "Successfully initialized pvcbootstrapd")
exit(0)
# Start DNSMasq
@@ -277,14 +284,14 @@ def entrypoint():
def term(signum="", frame=""):
print("Received TERM, exiting.")
notifications.send_webhook(config, "begin", "Received TERM, exiting pvcbootstrapd")
notifications.send_webhook(config, "info", "Received TERM, exiting pvcbootstrapd")
cleanup(0)
signal.signal(signal.SIGTERM, term)
signal.signal(signal.SIGINT, term)
signal.signal(signal.SIGQUIT, term)
notifications.send_webhook(config, "success", "Started up pvcbootstrapd")
notifications.send_webhook(config, "info", "Starting up pvcbootstrapd")
# Start Flask
pvcbootstrapd.app.run(

View File

@@ -54,8 +54,8 @@ def run_bootstrap(config, cspec, cluster, nodes):
logger.info("Waiting 60s before starting Ansible bootstrap.")
sleep(60)
logger.info("Starting Ansible bootstrap of cluster {cluster.name}")
notifications.send_webhook(config, "begin", f"Starting Ansible bootstrap of cluster {cluster.name}")
logger.info(f"Starting Ansible bootstrap of cluster {cluster.name}")
notifications.send_webhook(config, "begin", f"Cluster {cluster.name}: Starting Ansible bootstrap")
# Run the Ansible playbooks
with tempfile.TemporaryDirectory(prefix="pvc-ansible-bootstrap_") as pdir:
@@ -66,8 +66,8 @@ def run_bootstrap(config, cspec, cluster, nodes):
limit=f"{cluster.name}",
playbook=f"{config['ansible_path']}/pvc.yml",
extravars={
"ansible_ssh_private_key_file": config["ansible_keyfile"],
"bootstrap": "yes",
"ansible_ssh_private_key_file": config["ansible_key_file"],
"do_bootstrap": "yes",
},
forks=len(nodes),
verbosity=2,
@@ -76,11 +76,11 @@ def run_bootstrap(config, cspec, cluster, nodes):
logger.info("{}: {}".format(r.status, r.rc))
logger.info(r.stats)
if r.rc == 0:
git.commit_repository(config)
git.commit_repository(config, f"Generated files for cluster '{cluster.name}'")
git.push_repository(config)
notifications.send_webhook(config, "success", f"Completed Ansible bootstrap of cluster {cluster.name}")
notifications.send_webhook(config, "success", f"Cluster {cluster.name}: Completed Ansible bootstrap")
else:
notifications.send_webhook(config, "failure", f"Failed Ansible bootstrap of cluster {cluster.name}; check pvcbootstrapd logs")
notifications.send_webhook(config, "failure", f"Cluster {cluster.name}: Failed Ansible bootstrap; check pvcbootstrapd logs")
except Exception as e:
logger.warning(f"Error: {e}")
notifications.send_webhook(config, "failure", f"Failed Ansible bootstrap of cluster {cluster.name} with error {e}; check pvcbootstrapd logs")
notifications.send_webhook(config, "failure", f"Cluster {cluster.name}: Failed Ansible bootstrap with error '{e}'; check pvcbootstrapd logs")

View File

@@ -67,7 +67,7 @@ def init_database(config):
(id INTEGER PRIMARY KEY AUTOINCREMENT,
cluster INTEGER NOT NULL,
state TEXT NOT NULL,
name TEXT UNIQUE NOT NULL,
name TEXT NOT NULL,
nodeid INTEGER NOT NULL,
bmc_macaddr TEXT NOT NULL,
bmc_ipaddr TEXT NOT NULL,

View File

@@ -22,6 +22,7 @@
import os.path
import git
import yaml
from filelock import FileLock
import pvcbootstrapd.lib.notifications as notifications
@@ -36,7 +37,7 @@ def init_repository(config):
Clone the Ansible git repository
"""
try:
git_ssh_cmd = f"ssh -i {config['ansible_keyfile']} -o StrictHostKeyChecking=no"
git_ssh_cmd = f"ssh -i {config['ansible_key_file']} -o StrictHostKeyChecking=no"
if not os.path.exists(config["ansible_path"]):
print(
f"First run: cloning repository {config['ansible_remote']} branch {config['ansible_branch']} to {config['ansible_path']}"
@@ -52,7 +53,6 @@ def init_repository(config):
g = git.cmd.Git(f"{config['ansible_path']}")
g.checkout(config["ansible_branch"])
g.submodule("update", "--init", env=dict(GIT_SSH_COMMAND=git_ssh_cmd))
notifications.send_webhook(config, "success", "First run: successfully initialized Git repository")
except Exception as e:
print(f"Error: {e}")
@@ -61,24 +61,30 @@ def pull_repository(config):
"""
Pull (with rebase) the Ansible git repository
"""
with FileLock(config['ansible_lock_file']):
logger.info(f"Updating local configuration repository {config['ansible_path']}")
try:
git_ssh_cmd = f"ssh -i {config['ansible_keyfile']} -o StrictHostKeyChecking=no"
git_ssh_cmd = f"ssh -i {config['ansible_key_file']} -o StrictHostKeyChecking=no"
g = git.cmd.Git(f"{config['ansible_path']}")
logger.debug("Performing git pull")
g.pull(rebase=True, env=dict(GIT_SSH_COMMAND=git_ssh_cmd))
logger.debug("Performing git submodule update")
g.submodule("update", "--init", env=dict(GIT_SSH_COMMAND=git_ssh_cmd))
g.submodule("update", env=dict(GIT_SSH_COMMAND=git_ssh_cmd))
except Exception as e:
logger.warn(e)
notifications.send_webhook(config, "failure", "Failed to update Git repository")
logger.info("Completed repository synchonization")
def commit_repository(config):
def commit_repository(config, message="Generic commit"):
"""
Commit uncommitted changes to the Ansible git repository
"""
with FileLock(config['ansible_lock_file']):
logger.info(
f"Committing changes to local configuration repository {config['ansible_path']}"
)
try:
g = git.cmd.Git(f"{config['ansible_path']}")
g.add("--all")
@@ -89,28 +95,34 @@ def commit_repository(config):
g.commit(
"-m",
"Automated commit from PVC Bootstrap Ansible subsystem",
"-m",
message,
author="PVC Bootstrap <git@pvcbootstrapd>",
env=commit_env,
)
notifications.send_webhook(config, "success", "Successfully committed to Git repository")
except Exception as e:
logger.warn(e)
notifications.send_webhook(config, "failure", "Failed to commit to Git repository")
def push_repository(config):
"""
Push changes to the default remote
"""
with FileLock(config['ansible_lock_file']):
logger.info(
f"Pushing changes from local configuration repository {config['ansible_path']}"
)
try:
git_ssh_cmd = f"ssh -i {config['ansible_keyfile']} -o StrictHostKeyChecking=no"
git_ssh_cmd = f"ssh -i {config['ansible_key_file']} -o StrictHostKeyChecking=no"
g = git.Repo(f"{config['ansible_path']}")
origin = g.remote(name="origin")
origin.push(env=dict(GIT_SSH_COMMAND=git_ssh_cmd))
notifications.send_webhook(config, "success", "Successfully pushed Git repository")
except Exception as e:
logger.warn(e)
notifications.send_webhook(config, "failure", "Failed to push Git repository")
def load_cspec_yaml(config):

View File

@@ -43,7 +43,7 @@ def run_paramiko(config, node_address):
ssh_client.connect(
hostname=node_address,
username=config["deploy_username"],
key_filename=config["ansible_keyfile"],
key_filename=config["ansible_key_file"],
)
yield ssh_client
ssh_client.close()
@@ -69,6 +69,7 @@ def run_hook_osddb(config, targets, args):
stdin, stdout, stderr = c.exec_command(pvc_cmd_string)
logger.debug(stdout.readlines())
logger.debug(stderr.readlines())
return stdout.channel.recv_exit_status()
def run_hook_osd(config, targets, args):
@@ -83,13 +84,14 @@ def run_hook_osd(config, targets, args):
weight = args.get("weight", 1)
ext_db_flag = args.get("ext_db", False)
ext_db_ratio = args.get("ext_db_ratio", 0.05)
osd_count = args.get("osd_count", 1)
logger.info(f"Creating OSD on node {node_name} device {device} weight {weight}")
# Using a direct command on the target here is somewhat messy, but avoids many
# complexities of determining a valid API listen address, etc.
pvc_cmd_string = (
f"pvc storage osd add --yes {node_name} {device} --weight {weight}"
f"pvc storage osd add --yes {node_name} {device} --weight {weight} --osd-count {osd_count}"
)
if ext_db_flag:
pvc_cmd_string = f"{pvc_cmd_string} --ext-db --ext-db-ratio {ext_db_ratio}"
@@ -98,6 +100,7 @@ def run_hook_osd(config, targets, args):
stdin, stdout, stderr = c.exec_command(pvc_cmd_string)
logger.debug(stdout.readlines())
logger.debug(stderr.readlines())
return stdout.channel.recv_exit_status()
def run_hook_pool(config, targets, args):
@@ -111,14 +114,15 @@ def run_hook_pool(config, targets, args):
name = args["name"]
pgs = args.get("pgs", "64")
tier = args.get("tier", "default") # Does nothing yet
replcfg = args.get("replcfg", "copies=3,mincopies=2")
logger.info(
f"Creating storage pool on node {node_name} name {name} pgs {pgs} tier {tier}"
f"Creating storage pool on node {node_name} name {name} pgs {pgs} tier {tier} replcfg {replcfg}"
)
# Using a direct command on the target here is somewhat messy, but avoids many
# complexities of determining a valid API listen address, etc.
pvc_cmd_string = f"pvc storage pool add {name} {pgs}"
pvc_cmd_string = f"pvc storage pool add {name} {pgs} --replcfg {replcfg}"
with run_paramiko(config, node_address) as c:
stdin, stdout, stderr = c.exec_command(pvc_cmd_string)
@@ -126,7 +130,7 @@ def run_hook_pool(config, targets, args):
logger.debug(stderr.readlines())
# This only runs once on whatever the first node is
break
return stdout.channel.recv_exit_status()
def run_hook_network(config, targets, args):
@@ -157,7 +161,7 @@ def run_hook_network(config, targets, args):
for dns_server in dns_servers:
pvc_cmd_string = f"{pvc_cmd_string} --dns-server {dns_server}"
is_ip4 = args["ip4"]
is_ip4 = args.get("ip4", False)
if is_ip4:
ip4_network = args["ip4_network"]
pvc_cmd_string = f"{pvc_cmd_string} --ipnet {ip4_network}"
@@ -165,7 +169,7 @@ def run_hook_network(config, targets, args):
ip4_gateway = args["ip4_gateway"]
pvc_cmd_string = f"{pvc_cmd_string} --gateway {ip4_gateway}"
ip4_dhcp = args["ip4_dhcp"]
ip4_dhcp = args.get("ip4_dhcp", False)
if ip4_dhcp:
pvc_cmd_string = f"{pvc_cmd_string} --dhcp"
ip4_dhcp_start = args["ip4_dhcp_start"]
@@ -174,7 +178,7 @@ def run_hook_network(config, targets, args):
else:
pvc_cmd_string = f"{pvc_cmd_string} --no-dhcp"
is_ip6 = args["ip6"]
is_ip6 = args.get("ip6", False)
if is_ip6:
ip6_network = args["ip6_network"]
pvc_cmd_string = f"{pvc_cmd_string} --ipnet6 {ip6_network}"
@@ -190,7 +194,7 @@ def run_hook_network(config, targets, args):
logger.debug(stderr.readlines())
# This only runs once on whatever the first node is
break
return stdout.channel.recv_exit_status()
def run_hook_copy(config, targets, args):
@@ -209,16 +213,21 @@ def run_hook_copy(config, targets, args):
with run_paramiko(config, node_address) as c:
for sfile, dfile, dmode in zip(source, destination, mode):
if not match(r"^/", sfile):
sfile = f"{config['ansible_path']}/{sfile}"
tc = c.open_sftp()
tc.put(sfile, dfile)
tc.chmod(dfile, dmode)
tc.chmod(dfile, int(dmode, 8))
tc.close()
return 0
def run_hook_script(config, targets, args):
"""
Run a script on the targets
"""
return_status = 0
for node in targets:
node_name = node.name
node_address = node.host_ipaddr
@@ -227,6 +236,7 @@ def run_hook_script(config, targets, args):
source = args.get("source", None)
path = args.get("path", None)
arguments = args.get("arguments", [])
use_sudo = args.get("use_sudo", False)
logger.info(f"Running script on node {node_name}")
@@ -262,9 +272,16 @@ def run_hook_script(config, targets, args):
else:
remote_command = remote_path
if use_sudo:
remote_command = f"sudo {remote_command}"
stdin, stdout, stderr = c.exec_command(remote_command)
logger.debug(stdout.readlines())
logger.debug(stderr.readlines())
if stdout.channel.recv_exit_status() != 0:
return_status = stdout.channel.recv_exit_status()
return return_status
def run_hook_webhook(config, targets, args):
@@ -312,7 +329,7 @@ def run_hooks(config, cspec, cluster, nodes):
logger.info("Waiting 300s before starting hook run.")
sleep(300)
notifications.send_webhook(config, "begin", f"Running hook tasks for cluster {cluster.name}")
notifications.send_webhook(config, "begin", f"Cluster {cluster.name}: Running post-setup hook tasks")
cluster_hooks = cspec["hooks"][cluster.name]
@@ -338,22 +355,15 @@ def run_hooks(config, cspec, cluster, nodes):
# Run the hook function
try:
notifications.send_webhook(config, "begin", f"Cluster {cluster.name}: Running hook task '{hook_name}'")
hook_functions[hook_type](config, target_nodes, hook_args)
retcode = hook_functions[hook_type](config, target_nodes, hook_args)
if retcode > 0:
raise Exception(f"Hook returned with code {retcode}")
notifications.send_webhook(config, "success", f"Cluster {cluster.name}: Completed hook task '{hook_name}'")
except Exception as e:
logger.warning(f"Error running hook: {e}")
notifications.send_webhook(config, "failure", f"Cluster {cluster.name}: Failed hook task '{hook_name}' with error {e}")
notifications.send_webhook(config, "failure", f"Cluster {cluster.name}: Failed hook task '{hook_name}' with error '{e}'")
# Wait 5s between hooks
sleep(5)
# Restart nodes to complete setup
hook_functions["script"](
config,
cluster_nodes,
{
"script": "#!/usr/bin/env bash\necho bootstrapped | sudo tee /etc/pvc-install.hooks\nsudo reboot"
},
)
notifications.send_webhook(config, "success", f"Completed hook tasks for cluster {cluster.name}")
notifications.send_webhook(config, "success", f"Cluster {cluster.name}: Completed post-setup hook tasks")

View File

@@ -84,3 +84,16 @@ def set_boot_state(config, cspec, data, state):
db.update_node_state(config, cspec_cluster, cspec_hostname, state)
node = db.get_node(config, cspec_cluster, name=cspec_hostname)
logger.debug(node)
def set_completed(config, cspec, cluster):
nodes = list()
for bmc_macaddr in cspec["bootstrap"]:
if cspec["bootstrap"][bmc_macaddr]["node"]["cluster"] == cluster:
nodes.append(cspec["bootstrap"][bmc_macaddr])
for node in nodes:
cspec_cluster = node["node"]["cluster"]
cspec_hostname = node["node"]["hostname"]
db.update_node_state(config, cspec_cluster, cspec_hostname, "completed")
node = db.get_node(config, cspec_cluster, name=cspec_hostname)
logger.debug(node)

View File

@@ -66,8 +66,8 @@ def add_preseed(config, cspec_node, host_macaddr, system_drive_target):
# We use the dhcp_address here to allow the listen_address to be 0.0.0.0
rendered = template.render(
debrelease=cspec_node.get("config", {}).get("release"),
debmirror=cspec_node.get("config", {}).get("mirror"),
debrelease=config.get("repo_release"),
debmirror=config.get("repo_mirror"),
addpkglist=add_packages,
filesystem=cspec_node.get("config", {}).get("filesystem"),
skip_blockcheck=False,

View File

@@ -50,8 +50,24 @@ def dnsmasq_checkin(config, data):
)
cspec = git.load_cspec_yaml(config)
is_in_bootstrap_map = True if data["macaddr"] in cspec["bootstrap"] else False
try:
if is_in_bootstrap_map:
notifications.send_webhook(config, "begin", f"New host checkin from MAC '{data['macaddr']}' as host {cspec['bootstrap'][data['macaddr']]['node']['fqdn']}")
cspec_cluster = cspec["bootstrap"][data["macaddr"]]["node"]["cluster"]
is_registered = True if data["macaddr"] in [x.bmc_macaddr for x in db.get_nodes_in_cluster(config, cspec_cluster)] else False
else:
is_registered = False
except Exception:
is_registered = False
if not is_in_bootstrap_map:
logger.warn(f"Device '{data['macaddr']}' not in bootstrap map; ignoring.")
return
if is_registered:
logger.info(f"Device '{data['macaddr']}' has already been bootstrapped; ignoring.")
return
notifications.send_webhook(config, "info", f"New host checkin from MAC {data['macaddr']} as host {cspec['bootstrap'][data['macaddr']]['node']['fqdn']} in cluster {cspec['bootstrap'][data['macaddr']]['node']['cluster']}")
if (
cspec["bootstrap"][data["macaddr"]]["bmc"].get("redfish", None)
is not None
@@ -66,8 +82,6 @@ def dnsmasq_checkin(config, data):
logger.info(f"Is device '{data['macaddr']}' Redfish capable? {is_redfish}")
if is_redfish:
redfish.redfish_init(config, cspec, data)
else:
logger.warn(f"Device '{data['macaddr']}' not in bootstrap map; ignoring.")
return
@@ -83,28 +97,29 @@ def host_checkin(config, data):
"""
Handle checkins from the PVC node
"""
logger.info(f"Registering checkin for host {data['hostname']}")
logger.info(f"Registering checkin for {data['bmc_macaddr']}")
logger.debug(f"data = {data}")
cspec = git.load_cspec_yaml(config)
bmc_macaddr = data["bmc_macaddr"]
cspec_cluster = cspec["bootstrap"][bmc_macaddr]["node"]["cluster"]
cspec_fqdn = cspec["bootstrap"][bmc_macaddr]["node"]["fqdn"]
if data["action"] in ["install-start"]:
# Node install has started
logger.info(f"Registering install start for host {data['hostname']}")
notifications.send_webhook(config, "begin", f"Cluster {cspec_cluster}: Registering install start for host {data['hostname']}")
logger.info(f"Registering install start for host {cspec_fqdn}")
notifications.send_webhook(config, "begin", f"Cluster {cspec_cluster}: Base install starting for host {cspec_fqdn}")
host.installer_init(config, cspec, data)
elif data["action"] in ["install-complete"]:
# Node install has finished
logger.info(f"Registering install complete for host {data['hostname']}")
notifications.send_webhook(config, "begin", f"Cluster {cspec_cluster}: Registering install complete for host {data['hostname']}")
logger.info(f"Registering install complete for host {cspec_fqdn}")
notifications.send_webhook(config, "success", f"Cluster {cspec_cluster}: Base install completed for host {cspec_fqdn}")
host.installer_complete(config, cspec, data)
elif data["action"] in ["system-boot_initial"]:
# Node has booted for the first time and can begin Ansible runs once all nodes up
logger.info(f"Registering first boot for host {data['hostname']}")
notifications.send_webhook(config, "begin", f"Cluster {cspec_cluster}: Registering first boot for host {data['hostname']}")
logger.info(f"Registering first boot for host {cspec_fqdn}")
notifications.send_webhook(config, "info", f"Cluster {cspec_cluster}: Registering first boot for host {cspec_fqdn}")
target_state = "booted-initial"
host.set_boot_state(config, cspec, data, target_state)
@@ -122,8 +137,8 @@ def host_checkin(config, data):
elif data["action"] in ["system-boot_configured"]:
# Node has been booted after Ansible run and can begin hook runs
logger.info(f"Registering post-Ansible boot for host {data['hostname']}")
notifications.send_webhook(config, "begin", f"Cluster {cspec_cluster}: Registering post-Ansible boot for host {data['hostname']}")
logger.info(f"Registering post-Ansible boot for host {cspec_fqdn}")
notifications.send_webhook(config, "info", f"Cluster {cspec_cluster}: Registering post-Ansible boot for host {cspec_fqdn}")
target_state = "booted-configured"
host.set_boot_state(config, cspec, data, target_state)
@@ -139,21 +154,9 @@ def host_checkin(config, data):
hooks.run_hooks(config, cspec, cluster, ready_nodes)
elif data["action"] in ["system-boot_completed"]:
# Node has been fully configured and can be shut down for the final time
logger.info(f"Registering post-hooks boot for host {data['hostname']}")
notifications.send_webhook(config, "begin", f"Cluster {cspec_cluster}: Registering post-hooks boot for host {data['hostname']}")
target_state = "booted-completed"
host.set_completed(config, cspec, cspec_cluster)
host.set_boot_state(config, cspec, data, target_state)
sleep(1)
all_nodes = db.get_nodes_in_cluster(config, cspec_cluster)
ready_nodes = [node for node in all_nodes if node.state == target_state]
logger.info(f"Ready: {len(ready_nodes)} All: {len(all_nodes)}")
if len(ready_nodes) >= len(all_nodes):
cluster = db.update_cluster_state(config, cspec_cluster, "completed")
notifications.send_webhook(config, "completed", f"Cluster {cspec_cluster} deployment completed")
# Hosts will now power down ready for real activation in production
sleep(300)
cluster = db.update_cluster_state(config, cspec_cluster, "completed")
notifications.send_webhook(config, "completed", f"Cluster {cspec_cluster}: PVC bootstrap deployment completed")

View File

@@ -38,6 +38,9 @@ def send_webhook(config, status, message):
logger.debug(f"Sending notification to {config['notifications_uri']}")
if status == "completed" and config["notifications_completed_triggerword"]:
message = f"{message} {config['notifications_completed_triggerword']}"
# Get the body data
body = config['notifications_body']
formatted_body = dict()

View File

@@ -42,37 +42,6 @@ logger = get_task_logger(__name__)
#
# Helper Classes
#
class AuthenticationException(Exception):
def __init__(self, error=None, response=None):
if error is not None:
self.short_message = error
else:
self.short_message = "Generic authentication failure"
if response is not None:
rinfo = response.json()["error"]["@Message.ExtendedInfo"][0]
if rinfo.get("Message") is not None:
self.full_message = rinfo["Message"]
self.res_message = rinfo["Resolution"]
self.severity = rinfo["Severity"]
self.message_id = rinfo["MessageId"]
else:
self.full_message = ""
self.res_message = ""
self.severity = "Fatal"
self.message_id = rinfo["MessageId"]
self.status_code = response.status_code
else:
self.status_code = None
def __str__(self):
if self.status_code is not None:
message = f"{self.short_message}: {self.full_message} {self.res_message} (HTTP Code: {self.status_code}, Severity: {self.severity}, ID: {self.message_id})"
else:
message = f"{self.short_message}"
return str(message)
class RedfishSession:
def __init__(self, host, username, password):
# Disable urllib3 warnings
@@ -103,12 +72,27 @@ class RedfishSession:
sleep(2)
tries += 1
if login_response is None:
logger.error("Failed to log in to Redfish")
if login_response is None or login_response.status_code not in [200, 201]:
try:
rinfo = response.json()["error"]["@Message.ExtendedInfo"][0]
except Exception:
rinfo = {}
if rinfo.get("Message") is not None:
full_message = rinfo["Message"]
res_message = rinfo["Resolution"]
severity = rinfo["Severity"]
message_id = rinfo["MessageId"]
else:
full_message = ""
res_message = ""
severity = "Fatal"
message_id = rinfo.get("MessageId", "No message ID")
status_code = login_response.status_code
failure_message = f"Redfish failure: {full_message} {res_message} (HTTP Code: {status_code}, Severity: {severity}, ID: {message_id})"
logger.error(f"Failed to log in to Redfish at {host}")
logger.error(failure_message)
return
if login_response.status_code not in [200, 201]:
raise AuthenticationException("Login failed", response=login_response)
logger.info(f"Logged in to Redfish at {host} successfully")
self.host = host
@@ -135,7 +119,25 @@ class RedfishSession:
)
if logout_response.status_code not in [200, 201]:
raise AuthenticationException("Logout failed", response=logout_response)
try:
rinfo = response.json()["error"]["@Message.ExtendedInfo"][0]
except Exception:
rinfo = {}
if rinfo.get("Message") is not None:
full_message = rinfo["Message"]
res_message = rinfo["Resolution"]
severity = rinfo["Severity"]
message_id = rinfo["MessageId"]
else:
full_message = ""
res_message = ""
severity = "Fatal"
message_id = rinfo.get("MessageId", "No message ID")
status_code = logout_response.status_code
failure_message = f"Redfish failure: {full_message} {res_message} (HTTP Code: {status_code}, Severity: {severity}, ID: {message_id})"
logger.error(f"Failed to log out of Redfish at {host}")
logger.error(failure_message)
return
logger.info(f"Logged out of Redfish at {self.host} successfully")
def get(self, uri):
@@ -194,16 +196,19 @@ class RedfishSession:
logger.debug(f"POST payload: {payload}")
response = requests.post(url, data=payload, headers=self.headers, verify=False)
logger.debug(f"Response: {response.status_code}")
if response.status_code in [200, 201, 204]:
if response.status_code in [201, 204]:
return {"response": "ok"}
elif response.status_code in [200]:
try:
return response.json()
except json.decoder.JSONDecodeError as e:
except Exception:
return {"json_err": e}
else:
try:
rinfo = response.json()["error"]["@Message.ExtendedInfo"][0]
except json.decoder.JSONDecodeError:
except Exception:
logger.debug(response)
raise
@@ -574,6 +579,7 @@ def set_power_state(session, system_root, redfish_vendor, state):
"""
Set the system power state to the desired state
"""
logger.debug(f"Calling set_power_state with {session}, {system_root}, {redfish_vendor}, {state}")
state_values = {
"default": {
"on": "On",
@@ -620,19 +626,42 @@ def set_boot_override(session, system_root, redfish_vendor, target):
"""
Set the system boot override to the desired target
"""
try:
print(redfish_vendor)
system_detail = session.get(system_root)
boot_targets = system_detail["Boot"]["BootSourceOverrideSupported"]
def set_boot_override_dell():
try:
boot_targets = system_detail["Boot"]["BootSourceOverrideTarget@Redfish.AllowableValues"]
except KeyError:
logger.warn(f"Failed to set boot override, no BootSourceOverrideSupported key at {system_detail}")
return False
if target not in boot_targets:
logger.warn(f"Failed to set boot override, key {target} not in {boot_targets}")
return False
session.patch(system_root, {"Boot": {"BootSourceOverrideMode": "UEFI", "BootSourceOverrideTarget": target}})
return True
def set_boot_override_generic():
try:
boot_targets = system_detail["Boot"]["BootSourceOverrideSupported"]
except KeyError:
logger.warn(f"Failed to set boot override, no BootSourceOverrideSupported key at {system_detail}")
return False
if target not in boot_targets:
logger.warn(f"Failed to set boot override, key {target} not in {boot_targets}")
return False
session.patch(system_root, {"Boot": {"BootSourceOverrideTarget": target}})
return True
if redfish_vendor == "Dell":
return set_boot_override_dell()
else:
return set_boot_override_generic()
def check_redfish(config, data):
"""
@@ -688,8 +717,12 @@ def redfish_init(config, cspec, data):
cspec_cluster = cspec_node["node"]["cluster"]
cspec_hostname = cspec_node["node"]["hostname"]
cspec_fqdn = cspec_node["node"]["fqdn"]
notifications.send_webhook(config, "begin", f"Cluster {cspec_cluster}: Beginning Redfish initialization of host {cspec_hostname}")
logger.info("Waiting 30 seconds for system normalization")
sleep(30)
notifications.send_webhook(config, "begin", f"Cluster {cspec_cluster}: Beginning Redfish initialization of host {cspec_fqdn}")
cluster = db.get_cluster(config, name=cspec_cluster)
if cluster is None:
@@ -713,11 +746,19 @@ def redfish_init(config, cspec, data):
# Create the session and log in
session = RedfishSession(bmc_host, bmc_username, bmc_password)
if session.host is None:
logger.info("Aborting Redfish configuration; reboot BMC to try again.")
notifications.send_webhook(config, "failure", f"Cluster {cspec_cluster}: Failed to log in to Redfish for host {cspec_fqdn} at {bmc_host}. Check pvcbootstrapd logs and reset this host's BMC to retry.")
logger.error("Aborting Redfish configuration; reset BMC to retry.")
del session
return
notifications.send_webhook(config, "success", f"Cluster {cspec_cluster}: Logged in to Redfish for host {cspec_fqdn} at {bmc_host}")
logger.info("Waiting 30 seconds for system normalization")
sleep(30)
logger.info("Characterizing node...")
notifications.send_webhook(config, "begin", f"Cluster {cspec_cluster}: Beginning Redfish characterization of host {cspec_fqdn} at {bmc_host}")
try:
# Get Refish bases
logger.debug("Getting redfish bases")
redfish_base_root = "/redfish/v1"
@@ -740,9 +781,6 @@ def redfish_init(config, cspec, data):
set_power_state(session, system_root, redfish_vendor, "off")
set_indicator_state(session, system_root, redfish_vendor, "on")
logger.info("Waiting 60 seconds for system normalization")
sleep(60)
# Get the system details
logger.debug("Get the system details")
system_detail = session.get(system_root)
@@ -758,24 +796,29 @@ def redfish_init(config, cspec, data):
try:
ethernet_root = system_detail["EthernetInterfaces"]["@odata.id"].rstrip("/")
ethernet_detail = session.get(ethernet_root)
logger.debug(f"Found Ethernet detail: {ethernet_detail}")
embedded_ethernet_detail_members = [e for e in ethernet_detail["Members"] if "Embedded" in e["@odata.id"]]
embedded_ethernet_detail_members.sort(key = lambda k: k["@odata.id"])
logger.debug(f"Found Ethernet members: {embedded_ethernet_detail_members}")
first_interface_root = embedded_ethernet_detail_members[0]["@odata.id"].rstrip("/")
first_interface_detail = session.get(first_interface_root)
# Something went wrong, so fall back
except KeyError:
except Exception:
first_interface_detail = dict()
logger.debug(f"First interface detail: {first_interface_detail}")
logger.debug(f"HostCorrelation detail: {system_detail.get('HostCorrelation', {})}")
# Try to get the MAC address directly from the interface detail (Redfish standard)
logger.debug("Try to get the MAC address directly from the interface detail (Redfish standard)")
if first_interface_detail.get("MACAddress") is not None:
logger.debug("Try to get the MAC address directly from the interface detail (Redfish standard)")
bootstrap_mac_address = first_interface_detail["MACAddress"].strip().lower()
# Try to get the MAC address from the HostCorrelation->HostMACAddress (HP DL360x G8)
elif len(system_detail.get("HostCorrelation", {}).get("HostMACAddress", [])) > 0:
logger.debug("Try to get the MAC address from the HostCorrelation (HP iLO)")
bootstrap_mac_address = (
system_detail["HostCorrelation"]["HostMACAddress"][0].strip().lower()
)
# We can't find it, so use a dummy value
# We can't find it, so abort
else:
logger.error("Could not find a valid MAC address for the bootstrap interface.")
return
@@ -804,24 +847,49 @@ def redfish_init(config, cspec, data):
host_ipaddr,
)
logger.debug(node)
except Exception as e:
notifications.send_webhook(config, "failure", f"Cluster {cspec_cluster}: Failed to characterize Redfish for host {cspec_fqdn} at {bmc_host}. Check pvcbootstrapd logs and reset this host's BMC to retry.")
logger.error(f"Cluster {cspec_cluster}: Failed to characterize Redfish for host {cspec_fqdn} at {bmc_host}: {e}")
logger.error("Aborting Redfish configuration; reset BMC to retry.")
del session
return
logger.info("Waiting 60 seconds for system normalization")
sleep(60)
logger.info("Determining system disk...")
try:
storage_root = system_detail.get("Storage", {}).get("@odata.id")
system_drive_target = get_system_drive_target(session, cspec_node, storage_root)
if system_drive_target is None:
logger.error(
"No valid drives found; configure a single system drive as a 'detect:' string or Linux '/dev' path instead and try again."
"No valid drives found; configure a single system drive as a 'detect:' string or Linux '/dev' path instead and retry."
)
return
logger.info(f"Found system disk {system_drive_target}")
except Exception as e:
notifications.send_webhook(config, "failure", f"Cluster {cspec_cluster}: Failed to configure system disk for host {cspec_fqdn} at {bmc_host}. Check pvcbootstrapd logs and reset this host's BMC to retry.")
logger.error(f"Cluster {cspec_cluster}: Failed to configure system disk for host {cspec_fqdn} at {bmc_host}: {e}")
logger.error("Aborting Redfish configuration; reset BMC to retry.")
del session
return
# Create our preseed configuration
logger.info("Creating node boot configurations...")
try:
installer.add_pxe(config, cspec_node, host_macaddr)
installer.add_preseed(config, cspec_node, host_macaddr, system_drive_target)
except Exception as e:
notifications.send_webhook(config, "failure", f"Cluster {cspec_cluster}: Failed to generate PXE configurations for host {cspec_fqdn} at {bmc_host}. Check pvcbootstrapd logs and reset this host's BMC to retry.")
logger.error(f"Cluster {cspec_cluster}: Failed to generate PXE configurations for host {cspec_fqdn} at {bmc_host}: {e}")
logger.error("Aborting Redfish configuration; reset BMC to retry.")
del session
return
# Adjust any BIOS settings
if len(cspec_node["bmc"].get("bios_settings", {}).items()) > 0:
logger.info("Adjusting BIOS settings...")
try:
bios_root = system_detail.get("Bios", {}).get("@odata.id")
if bios_root is not None:
bios_detail = session.get(bios_root)
@@ -829,36 +897,64 @@ def redfish_init(config, cspec, data):
for setting, value in cspec_node["bmc"].get("bios_settings", {}).items():
if setting not in bios_attributes:
continue
payload = {"Attributes": {setting: value}}
session.patch(f"{bios_root}/Settings", payload)
except Exception as e:
notifications.send_webhook(config, "failure", f"Cluster {cspec_cluster}: Failed to set BIOS settings for host {cspec_fqdn} at {bmc_host}. Check pvcbootstrapd logs and reset this host's BMC to retry.")
logger.error(f"Cluster {cspec_cluster}: Failed to set BIOS settings for host {cspec_fqdn} at {bmc_host}: {e}")
logger.error("Aborting Redfish configuration; reset BMC to retry.")
del session
return
# Adjust any Manager settings
if len(cspec_node["bmc"].get("manager_settings", {}).items()) > 0:
logger.info("Adjusting Manager settings...")
try:
mgrattribute_root = f"{manager_root}/Attributes"
mgrattribute_detail = session.get(mgrattribute_root)
mgrattribute_attributes = list(mgrattribute_detail["Attributes"].keys())
for setting, value in cspec_node["bmc"].get("manager_settings", {}).items():
if setting not in bios_attributes:
if setting not in mgrattribute_attributes:
continue
payload = {"Attributes": {setting: value}}
session.patch(mgrattribute_root, payload)
except Exception as e:
notifications.send_webhook(config, "failure", f"Cluster {cspec_cluster}: Failed to set BMC settings for host {cspec_fqdn} at {bmc_host}. Check pvcbootstrapd logs and reset this host's BMC to retry.")
logger.error(f"Cluster {cspec_cluster}: Failed to set BMC settings for host {cspec_fqdn} at {bmc_host}: {e}")
logger.error("Aborting Redfish configuration; reset BMC to retry.")
del session
return
# Set boot override to Pxe for the installer boot
logger.info("Setting temporary PXE boot...")
try:
set_boot_override(session, system_root, redfish_vendor, "Pxe")
except Exception as e:
notifications.send_webhook(config, "failure", f"Cluster {cspec_cluster}: Failed to set PXE boot override for host {cspec_fqdn} at {bmc_host}. Check pvcbootstrapd logs and reset this host's BMC to retry.")
logger.error(f"Cluster {cspec_cluster}: Failed to set PXE boot override for host {cspec_fqdn} at {bmc_host}: {e}")
logger.error("Aborting Redfish configuration; reset BMC to retry.")
del session
return
notifications.send_webhook(config, "success", f"Cluster {cspec_cluster}: Completed Redfish initialization of host {cspec_fqdn}")
# Turn on the system
logger.info("Powering on node...")
try:
set_power_state(session, system_root, redfish_vendor, "on")
notifications.send_webhook(config, "info", f"Cluster {cspec_cluster}: Powering on host {cspec_fqdn}")
except Exception as e:
notifications.send_webhook(config, "failure", f"Cluster {cspec_cluster}: Failed to power on host {cspec_fqdn} at {bmc_host}. Check pvcbootstrapd logs and reset this host's BMC to retry.")
logger.error(f"Cluster {cspec_cluster}: Failed to power on host {cspec_fqdn} at {bmc_host}: {e}")
logger.error("Aborting Redfish configuration; reset BMC to retry.")
del session
return
node = db.update_node_state(config, cspec_cluster, cspec_hostname, "pxe-booting")
notifications.send_webhook(config, "success", f"Cluster {cspec_cluster}: Completed Redfish initialization of host {cspec_hostname}")
logger.info("Waiting for completion of node and cluster installation...")
# Wait for the system to install and be configured
while node.state != "booted-completed":
while node.state != "completed":
sleep(60)
# Keep the Redfish session alive
session.get(redfish_base_root)
@@ -866,7 +962,7 @@ def redfish_init(config, cspec, data):
node = db.get_node(config, cspec_cluster, name=cspec_hostname)
# Graceful shutdown of the machine
notifications.send_webhook(config, "begin", f"Cluster {cspec_cluster}: Powering off host {cspec_hostname}")
notifications.send_webhook(config, "info", f"Cluster {cspec_cluster}: Shutting down host {cspec_fqdn}")
set_power_state(session, system_root, redfish_vendor, "GracefulShutdown")
system_power_state = "On"
while system_power_state != "Off":
@@ -877,7 +973,8 @@ def redfish_init(config, cspec, data):
# Turn off the indicator to indicate bootstrap has completed
set_indicator_state(session, system_root, redfish_vendor, "off")
notifications.send_webhook(config, "completed", f"Cluster {cspec_cluster}: Powered off host {cspec_hostname}")
notifications.send_webhook(config, "success", f"Cluster {cspec_cluster}: Powered off host {cspec_fqdn}")
# We must delete the session
del session

View File

@@ -21,16 +21,18 @@
import os.path
import shutil
from subprocess import run
import pvcbootstrapd.lib.notifications as notifications
def build_tftp_repository(config):
# Generate an installer config
build_cmd = f"{config['ansible_path']}/pvc-installer/buildpxe.sh -o {config['tftp_root_path']} -u {config['deploy_username']}"
print(f"Building TFTP contents via pvc-installer command: {build_cmd}")
notifications.send_webhook(config, "begin", f"Building TFTP contents via pvc-installer command: {build_cmd}")
os.system(build_cmd)
build_cmd = [ f"{config['ansible_path']}/pvc-installer/buildpxe.sh", "-o", config['tftp_root_path'], "-u", config['deploy_username'], "-m", config["repo_mirror"] ]
print(f"Building TFTP contents via pvc-installer command: {' '.join(build_cmd)}")
notifications.send_webhook(config, "begin", f"Building TFTP contents via pvc-installer command: {' '.join(build_cmd)}")
ret = run(build_cmd)
return True if ret.returncode == 0 else False
def init_tftp(config):
@@ -43,8 +45,13 @@ def init_tftp(config):
os.makedirs(config["tftp_root_path"])
os.makedirs(config["tftp_host_path"])
shutil.copyfile(
f"{config['ansible_keyfile']}.pub", f"{config['tftp_root_path']}/keys.txt"
f"{config['ansible_key_file']}.pub", f"{config['tftp_root_path']}/keys.txt"
)
build_tftp_repository(config)
result = build_tftp_repository(config)
if result:
print("First run: successfully initialized TFTP root and contents")
notifications.send_webhook(config, "success", "First run: successfully initialized TFTP root and contents")
else:
print("First run: failed initialized TFTP root and contents; see logs above")
notifications.send_webhook(config, "failure", "First run: failed initialized TFTP root and contents; check pvcbootstrapd logs")

View File

@@ -95,12 +95,35 @@ if [[ -z ${deploy_username} ]]; then
fi
echo
echo "Please enter an upstream Debian mirror (hostname+directory without scheme) to use (e.g. ftp.debian.org/debian):"
echo -n "[ftp.debian.org/debian] > "
read upstream_mirror
if [[ -z ${upstream_mirror} ]]; then
upstream_mirror="ftp.debian.org/debian"
fi
echo
echo "Please enter the default Debian release for new clusters (e.g. 'bullseye', 'bookworm'):"
echo -n "[bookworm] > "
read debian_release
if [[ -z ${debian_release} ]]; then
debian_release="bookworm"
fi
echo
echo "Proceeding with setup!"
echo
echo "Installing APT dependencies..."
sudo apt-get update
sudo apt-get install --yes vlan iptables dnsmasq redis python3 python3-pip python3-requests sqlite3 celery pxelinux syslinux-common live-build debootstrap uuid-runtime qemu-user-static
sudo apt-get install --yes vlan iptables dnsmasq redis python3 python3-pip python3-requests python3-git python3-ansible-runner python3-filelock python3-flask python3-paramiko python3-flask-restful python3-gevent python3-redis sqlite3 celery pxelinux syslinux-common live-build debootstrap uuid-runtime qemu-user-static apt-cacher-ng
echo "Configuring apt-cacher-ng..."
sudo systemctl enable --now apt-cacher-ng
if ! grep -q ${upstream_mirror} /etc/apt-cacher-ng/backends_debian; then
echo "http://${upstream_mirror}" | sudo tee -a /etc/apt-cacher-ng/backends_debian &>/dev/null
sudo systemctl restart apt-cacher-ng
fi
echo "Configuring dnsmasq..."
sudo systemctl disable --now dnsmasq
@@ -115,7 +138,7 @@ echo "Installing pvcbootstrapd..."
cp -a bootstrap-daemon ${root_directory}/pvcbootstrapd
echo "Installing PIP dependencies..."
sudo pip3 install -r ${root_directory}/pvcbootstrapd/requirements.txt
sudo pip3 install --break-system-packages -r ${root_directory}/pvcbootstrapd/requirements.txt
echo "Determining IP addresses..."
bootstrap_address="$( awk -F'.' '{ print $1"."$2"."$3".1" }' <<<"${bootstrap_network}" )"
@@ -131,6 +154,8 @@ sed -i "s|BOOTSTRAP_DHCPSTART|${bootstrap_dhcpstart}|" ${root_directory}/pvcboot
sed -i "s|BOOTSTRAP_DHCPEND|${bootstrap_dhcpend}|" ${root_directory}/pvcbootstrapd/pvcbootstrapd.yaml
sed -i "s|GIT_REMOTE|${git_remote}|" ${root_directory}/pvcbootstrapd/pvcbootstrapd.yaml
sed -i "s|GIT_BRANCH|${git_branch}|" ${root_directory}/pvcbootstrapd/pvcbootstrapd.yaml
sed -i "s|UPSTREAM_MIRROR|${upstream_mirror}|" ${root_directory}/pvcbootstrapd/pvcbootstrapd.yaml
sed -i "s|DEBIAN_RELEASE|${debian_release}|" ${root_directory}/pvcbootstrapd/pvcbootstrapd.yaml
echo "Creating network configuration for interface ${bootstrap_interface} (is vLAN? ${is_bootstrap_interface_vlan})..."
if [[ "${is_bootstrap_interface_vlan}" == "yes" ]]; then
@@ -241,6 +266,12 @@ case ${start_flag} in
;;
*)
echo
if [[ "${is_bootstrap_interface_vlan}" == "yes" ]]; then
sudo ifup vlan${bootstrap_vlan}
else
sudo ifup ${bootstrap_interface}
fi
sudo service apt-cacher-ng restart
export PVCD_CONFIG_FILE="${root_directory}/pvcbootstrapd/pvcbootstrapd.yaml"
${root_directory}/pvcbootstrapd/pvcbootstrapd.py --init-only
;;