Compare commits

..

2 Commits

Author SHA1 Message Date
f927118f5b Refactor completion of cluster configuration
Instead of rebooting one final time before shutting down, just shut it
down directly after performing hooks. This allows hooks to prestage
things like the `--autostart` flag of a VM properly.
2022-07-08 19:19:29 +00:00
8d7e3d8dc8 Handle login/logout failures more gracefully 2022-07-08 14:32:50 +00:00
3 changed files with 46 additions and 60 deletions

View File

@ -354,13 +354,4 @@ def run_hooks(config, cspec, cluster, nodes):
# Wait 5s between hooks # Wait 5s between hooks
sleep(5) sleep(5)
# Restart nodes to complete setup
hook_functions["script"](
config,
cluster_nodes,
{
"script": "#!/usr/bin/env bash\necho bootstrapped | sudo tee /etc/pvc-install.hooks\nsudo reboot"
},
)
notifications.send_webhook(config, "success", f"Cluster {cluster.name}: Completed post-setup hook tasks") notifications.send_webhook(config, "success", f"Cluster {cluster.name}: Completed post-setup hook tasks")

View File

@ -140,21 +140,11 @@ def host_checkin(config, data):
hooks.run_hooks(config, cspec, cluster, ready_nodes) hooks.run_hooks(config, cspec, cluster, ready_nodes)
elif data["action"] in ["system-boot_completed"]: target_state = "completed"
# Node has been fully configured and can be shut down for the final time for node in all_nodes:
logger.info(f"Registering post-hooks boot for host {cspec_fqdn}")
notifications.send_webhook(config, "info", f"Cluster {cspec_cluster}: Registering post-hooks boot for host {cspec_fqdn}")
target_state = "booted-completed"
host.set_boot_state(config, cspec, data, target_state) host.set_boot_state(config, cspec, data, target_state)
sleep(1)
all_nodes = db.get_nodes_in_cluster(config, cspec_cluster)
ready_nodes = [node for node in all_nodes if node.state == target_state]
logger.info(f"Ready: {len(ready_nodes)} All: {len(all_nodes)}")
if len(ready_nodes) >= len(all_nodes):
# Hosts will now power down ready for real activation in production # Hosts will now power down ready for real activation in production
sleep(30) sleep(60)
cluster = db.update_cluster_state(config, cspec_cluster, "completed") cluster = db.update_cluster_state(config, cspec_cluster, "completed")
notifications.send_webhook(config, "completed", f"Cluster {cspec_cluster}: PVC bootstrap deployment completed") notifications.send_webhook(config, "completed", f"Cluster {cspec_cluster}: PVC bootstrap deployment completed")

View File

@ -42,37 +42,6 @@ logger = get_task_logger(__name__)
# #
# Helper Classes # Helper Classes
# #
class AuthenticationException(Exception):
    """Authentication failure, optionally enriched from an HTTP error response.

    Parameters:
        error: Short description of the failure. Defaults to
            "Generic authentication failure" when omitted.
        response: Optional response object exposing ``json()`` and
            ``status_code``. Its body is expected to carry the details under
            ``["error"]["@Message.ExtendedInfo"][0]``; any other shape (or a
            non-JSON body) is tolerated and yields empty details.

    Attributes:
        short_message, full_message, res_message, severity, message_id and
        status_code are always set. ``status_code`` is None when no response
        was supplied.
    """

    def __init__(self, error=None, response=None):
        if error is not None:
            self.short_message = error
        else:
            self.short_message = "Generic authentication failure"

        # Defaults so every attribute exists even without a usable response.
        self.full_message = ""
        self.res_message = ""
        self.severity = "Fatal"
        self.message_id = "No message ID"
        self.status_code = None

        if response is not None:
            rinfo = self._extended_info(response)
            if rinfo.get("Message") is not None:
                self.full_message = rinfo["Message"]
                self.res_message = rinfo.get("Resolution", "")
                self.severity = rinfo.get("Severity", "Fatal")
            self.message_id = rinfo.get("MessageId", "No message ID")
            self.status_code = response.status_code

        # Populate Exception.args so repr() and pickling keep the message.
        super().__init__(self.short_message)

    @staticmethod
    def _extended_info(response):
        """Return the first extended-info entry of the body, or {} if absent."""
        try:
            rinfo = response.json()["error"]["@Message.ExtendedInfo"][0]
        except (ValueError, KeyError, IndexError, TypeError):
            # Non-JSON body or unexpected structure: building the exception
            # must not raise and hide the original authentication failure.
            return {}
        return rinfo if isinstance(rinfo, dict) else {}

    def __str__(self):
        if self.status_code is not None:
            message = f"{self.short_message}: {self.full_message} {self.res_message} (HTTP Code: {self.status_code}, Severity: {self.severity}, ID: {self.message_id})"
        else:
            message = f"{self.short_message}"
        return str(message)
class RedfishSession: class RedfishSession:
def __init__(self, host, username, password): def __init__(self, host, username, password):
# Disable urllib3 warnings # Disable urllib3 warnings
@ -104,7 +73,25 @@ class RedfishSession:
tries += 1 tries += 1
if login_response is None or login_response.status_code not in [200, 201]: if login_response is None or login_response.status_code not in [200, 201]:
raise AuthenticationException("Login failed", response=login_response) try:
rinfo = response.json()["error"]["@Message.ExtendedInfo"][0]
except Exception:
rinfo = {}
if rinfo.get("Message") is not None:
full_message = rinfo["Message"]
res_message = rinfo["Resolution"]
severity = rinfo["Severity"]
message_id = rinfo["MessageId"]
else:
full_message = ""
res_message = ""
severity = "Fatal"
message_id = rinfo.get("MessageId", "No message ID")
status_code = login_response.status_code
failure_message = f"Redfish failure: {full_message} {res_message} (HTTP Code: {status_code}, Severity: {severity}, ID: {message_id})"
logger.error(f"Failed to log in to Redfish at {host}")
logger.error(failure_message)
return
logger.info(f"Logged in to Redfish at {host} successfully") logger.info(f"Logged in to Redfish at {host} successfully")
@ -132,7 +119,25 @@ class RedfishSession:
) )
if logout_response.status_code not in [200, 201]: if logout_response.status_code not in [200, 201]:
raise AuthenticationException("Logout failed", response=logout_response) try:
rinfo = response.json()["error"]["@Message.ExtendedInfo"][0]
except Exception:
rinfo = {}
if rinfo.get("Message") is not None:
full_message = rinfo["Message"]
res_message = rinfo["Resolution"]
severity = rinfo["Severity"]
message_id = rinfo["MessageId"]
else:
full_message = ""
res_message = ""
severity = "Fatal"
message_id = rinfo.get("MessageId", "No message ID")
status_code = logout_response.status_code
failure_message = f"Redfish failure: {full_message} {res_message} (HTTP Code: {status_code}, Severity: {severity}, ID: {message_id})"
logger.error(f"Failed to log out of Redfish at {host}")
logger.error(failure_message)
return
logger.info(f"Logged out of Redfish at {self.host} successfully") logger.info(f"Logged out of Redfish at {self.host} successfully")
def get(self, uri): def get(self, uri):
@ -738,16 +743,16 @@ def redfish_init(config, cspec, data):
session = RedfishSession(bmc_host, bmc_username, bmc_password) session = RedfishSession(bmc_host, bmc_username, bmc_password)
if session.host is None: if session.host is None:
notifications.send_webhook(config, "failure", f"Cluster {cspec_cluster}: Failed to log in to Redfish for host {cspec_fqdn} at {bmc_host}") notifications.send_webhook(config, "failure", f"Cluster {cspec_cluster}: Failed to log in to Redfish for host {cspec_fqdn} at {bmc_host}")
logger.info("Aborting Redfish configuration; reboot BMC to try again.") logger.error("Aborting Redfish configuration; reboot BMC to try again.")
del session del session
return return
notifications.send_webhook(config, "success", f"Cluster {cspec_cluster}: Logged in to Redfish for host {cspec_fqdn} at {bmc_host}") notifications.send_webhook(config, "success", f"Cluster {cspec_cluster}: Logged in to Redfish for host {cspec_fqdn} at {bmc_host}")
logger.info("Characterizing node...")
logger.info("Waiting 60 seconds for system normalization") logger.info("Waiting 60 seconds for system normalization")
sleep(60) sleep(60)
logger.info("Characterizing node...")
# Get Redfish bases # Get Redfish bases
logger.debug("Getting redfish bases") logger.debug("Getting redfish bases")
redfish_base_root = "/redfish/v1" redfish_base_root = "/redfish/v1"
@ -890,7 +895,7 @@ def redfish_init(config, cspec, data):
logger.info("Waiting for completion of node and cluster installation...") logger.info("Waiting for completion of node and cluster installation...")
# Wait for the system to install and be configured # Wait for the system to install and be configured
while node.state != "booted-completed": while node.state != "completed":
sleep(60) sleep(60)
# Keep the Redfish session alive # Keep the Redfish session alive
session.get(redfish_base_root) session.get(redfish_base_root)