From 8d8898077fc56fa43593e9c98ee41d57766610b5 Mon Sep 17 00:00:00 2001 From: "Joshua M. Boniface" Date: Tue, 2 Aug 2022 18:23:54 +0000 Subject: [PATCH] Add additional error state webhooks Ensures that webhooks are sent for failures during Redfish setup. --- bootstrap-daemon/pvcbootstrapd/lib/redfish.py | 269 +++++++++++------- 1 file changed, 159 insertions(+), 110 deletions(-) diff --git a/bootstrap-daemon/pvcbootstrapd/lib/redfish.py b/bootstrap-daemon/pvcbootstrapd/lib/redfish.py index 3139272..483bd9d 100755 --- a/bootstrap-daemon/pvcbootstrapd/lib/redfish.py +++ b/bootstrap-daemon/pvcbootstrapd/lib/redfish.py @@ -743,7 +743,7 @@ def redfish_init(config, cspec, data): session = RedfishSession(bmc_host, bmc_username, bmc_password) if session.host is None: notifications.send_webhook(config, "failure", f"Cluster {cspec_cluster}: Failed to log in to Redfish for host {cspec_fqdn} at {bmc_host}") - logger.error("Aborting Redfish configuration; reboot BMC to try again.") + logger.error("Aborting Redfish configuration; reset BMC to retry.") del session return notifications.send_webhook(config, "success", f"Cluster {cspec_cluster}: Logged in to Redfish for host {cspec_fqdn} at {bmc_host}") @@ -752,144 +752,193 @@ def redfish_init(config, cspec, data): sleep(60) logger.info("Characterizing node...") - - # Get Refish bases - logger.debug("Getting redfish bases") - redfish_base_root = "/redfish/v1" - redfish_base_detail = session.get(redfish_base_root) - - redfish_vendor = list(redfish_base_detail["Oem"].keys())[0] - redfish_name = redfish_base_detail["Name"] - redfish_version = redfish_base_detail["RedfishVersion"] - - managers_base_root = redfish_base_detail["Managers"]["@odata.id"].rstrip("/") - managers_base_detail = session.get(managers_base_root) - manager_root = managers_base_detail["Members"][0]["@odata.id"].rstrip("/") - - systems_base_root = redfish_base_detail["Systems"]["@odata.id"].rstrip("/") - systems_base_detail = session.get(systems_base_root) - system_root = systems_base_detail["Members"][0]["@odata.id"].rstrip("/") - - # Force off the system and turn on the indicator - logger.debug("Force off the system and turn on the indicator") - set_power_state(session, system_root, redfish_vendor, "off") - set_indicator_state(session, system_root, redfish_vendor, "on") - - # Get the system details - logger.debug("Get the system details") - system_detail = session.get(system_root) - - system_sku = system_detail["SKU"].strip() - system_serial = system_detail["SerialNumber"].strip() - system_power_state = system_detail["PowerState"].strip() - system_indicator_state = system_detail["IndicatorLED"].strip() - system_health_state = system_detail["Status"]["Health"].strip() - - # Walk down the EthernetInterfaces construct to get the bootstrap interface MAC address - logger.debug("Walk down the EthernetInterfaces construct to get the bootstrap interface MAC address") try: - ethernet_root = system_detail["EthernetInterfaces"]["@odata.id"].rstrip("/") - ethernet_detail = session.get(ethernet_root) - embedded_ethernet_detail_members = [e for e in ethernet_detail["Members"] if "Embedded" in e["@odata.id"]] - embedded_ethernet_detail_members.sort(key = lambda k: k["@odata.id"]) - first_interface_root = embedded_ethernet_detail_members[0]["@odata.id"].rstrip("/") - first_interface_detail = session.get(first_interface_root) - # Something went wrong, so fall back - except KeyError: - first_interface_detail = dict() - # Try to get the MAC address directly from the interface detail (Redfish standard) - logger.debug("Try to get the MAC address directly from the interface detail (Redfish standard)") - if first_interface_detail.get("MACAddress") is not None: - bootstrap_mac_address = first_interface_detail["MACAddress"].strip().lower() - # Try to get the MAC address from the HostCorrelation->HostMACAddress (HP DL360x G8) - elif len(system_detail.get("HostCorrelation", {}).get("HostMACAddress", [])) > 0: - bootstrap_mac_address = ( - system_detail["HostCorrelation"]["HostMACAddress"][0].strip().lower() + # Get Refish bases + logger.debug("Getting redfish bases") + redfish_base_root = "/redfish/v1" + redfish_base_detail = session.get(redfish_base_root) + + redfish_vendor = list(redfish_base_detail["Oem"].keys())[0] + redfish_name = redfish_base_detail["Name"] + redfish_version = redfish_base_detail["RedfishVersion"] + + managers_base_root = redfish_base_detail["Managers"]["@odata.id"].rstrip("/") + managers_base_detail = session.get(managers_base_root) + manager_root = managers_base_detail["Members"][0]["@odata.id"].rstrip("/") + + systems_base_root = redfish_base_detail["Systems"]["@odata.id"].rstrip("/") + systems_base_detail = session.get(systems_base_root) + system_root = systems_base_detail["Members"][0]["@odata.id"].rstrip("/") + + # Force off the system and turn on the indicator + logger.debug("Force off the system and turn on the indicator") + set_power_state(session, system_root, redfish_vendor, "off") + set_indicator_state(session, system_root, redfish_vendor, "on") + + # Get the system details + logger.debug("Get the system details") + system_detail = session.get(system_root) + + system_sku = system_detail["SKU"].strip() + system_serial = system_detail["SerialNumber"].strip() + system_power_state = system_detail["PowerState"].strip() + system_indicator_state = system_detail["IndicatorLED"].strip() + system_health_state = system_detail["Status"]["Health"].strip() + + # Walk down the EthernetInterfaces construct to get the bootstrap interface MAC address + logger.debug("Walk down the EthernetInterfaces construct to get the bootstrap interface MAC address") + try: + ethernet_root = system_detail["EthernetInterfaces"]["@odata.id"].rstrip("/") + ethernet_detail = session.get(ethernet_root) + embedded_ethernet_detail_members = [e for e in ethernet_detail["Members"] if "Embedded" in e["@odata.id"]] + embedded_ethernet_detail_members.sort(key = lambda k: k["@odata.id"]) + first_interface_root = embedded_ethernet_detail_members[0]["@odata.id"].rstrip("/") + first_interface_detail = session.get(first_interface_root) + # Something went wrong, so fall back + except KeyError: + first_interface_detail = dict() + + # Try to get the MAC address directly from the interface detail (Redfish standard) + logger.debug("Try to get the MAC address directly from the interface detail (Redfish standard)") + if first_interface_detail.get("MACAddress") is not None: + bootstrap_mac_address = first_interface_detail["MACAddress"].strip().lower() + # Try to get the MAC address from the HostCorrelation->HostMACAddress (HP DL360x G8) + elif len(system_detail.get("HostCorrelation", {}).get("HostMACAddress", [])) > 0: + bootstrap_mac_address = ( + system_detail["HostCorrelation"]["HostMACAddress"][0].strip().lower() + ) + # We can't find it, so use a dummy value + else: + logger.error("Could not find a valid MAC address for the bootstrap interface.") + return + + # Display the system details + logger.info("Found details from node characterization:") + logger.info(f"> System Manufacturer: {redfish_vendor}") + logger.info(f"> System Redfish Version: {redfish_version}") + logger.info(f"> System Redfish Name: {redfish_name}") + logger.info(f"> System SKU: {system_sku}") + logger.info(f"> System Serial: {system_serial}") + logger.info(f"> Power State: {system_power_state}") + logger.info(f"> Indicator LED: {system_indicator_state}") + logger.info(f"> Health State: {system_health_state}") + logger.info(f"> Bootstrap NIC MAC: {bootstrap_mac_address}") + + # Update node host MAC address + host_macaddr = bootstrap_mac_address + node = db.update_node_addresses( + config, + cspec_cluster, + cspec_hostname, + bmc_macaddr, + bmc_ipaddr, + host_macaddr, + host_ipaddr, ) - # We can't find it, so use a dummy value - else: - logger.error("Could not find a valid MAC address for the bootstrap interface.") + logger.debug(node) + except Exception as e: + notifications.send_webhook(config, "failure", f"Cluster {cspec_cluster}: Failed to characterize Redfish for host {cspec_fqdn} at {bmc_host}. Check pvcbootstrapd logs and reset this host's BMC to retry.") + logger.error(f"Cluster {cspec_cluster}: Failed to characterize Redfish for host {cspec_fqdn} at {bmc_host}: {e}") + logger.error("Aborting Redfish configuration; reset BMC to retry.") + del session return - # Display the system details - logger.info("Found details from node characterization:") - logger.info(f"> System Manufacturer: {redfish_vendor}") - logger.info(f"> System Redfish Version: {redfish_version}") - logger.info(f"> System Redfish Name: {redfish_name}") - logger.info(f"> System SKU: {system_sku}") - logger.info(f"> System Serial: {system_serial}") - logger.info(f"> Power State: {system_power_state}") - logger.info(f"> Indicator LED: {system_indicator_state}") - logger.info(f"> Health State: {system_health_state}") - logger.info(f"> Bootstrap NIC MAC: {bootstrap_mac_address}") - - # Update node host MAC address - host_macaddr = bootstrap_mac_address - node = db.update_node_addresses( - config, - cspec_cluster, - cspec_hostname, - bmc_macaddr, - bmc_ipaddr, - host_macaddr, - host_ipaddr, - ) - logger.debug(node) - logger.info("Waiting 60 seconds for system normalization") sleep(60) logger.info("Determining system disk...") - storage_root = system_detail.get("Storage", {}).get("@odata.id") - system_drive_target = get_system_drive_target(session, cspec_node, storage_root) - if system_drive_target is None: - logger.error( - "No valid drives found; configure a single system drive as a 'detect:' string or Linux '/dev' path instead and try again." - ) + try: + storage_root = system_detail.get("Storage", {}).get("@odata.id") + system_drive_target = get_system_drive_target(session, cspec_node, storage_root) + if system_drive_target is None: + logger.error( + "No valid drives found; configure a single system drive as a 'detect:' string or Linux '/dev' path instead and retry." + ) + return + logger.info(f"Found system disk {system_drive_target}") + except Exception as e: + notifications.send_webhook(config, "failure", f"Cluster {cspec_cluster}: Failed to configure system disk for host {cspec_fqdn} at {bmc_host}. Check pvcbootstrapd logs and reset this host's BMC to retry.") + logger.error(f"Cluster {cspec_cluster}: Failed to configure system disk for host {cspec_fqdn} at {bmc_host}: {e}") + logger.error("Aborting Redfish configuration; reset BMC to retry.") + del session return - logger.info(f"Found system disk {system_drive_target}") # Create our preseed configuration logger.info("Creating node boot configurations...") - installer.add_pxe(config, cspec_node, host_macaddr) - installer.add_preseed(config, cspec_node, host_macaddr, system_drive_target) + try: + installer.add_pxe(config, cspec_node, host_macaddr) + installer.add_preseed(config, cspec_node, host_macaddr, system_drive_target) + except Exception as e: + notifications.send_webhook(config, "failure", f"Cluster {cspec_cluster}: Failed to generate PXE configurations for host {cspec_fqdn} at {bmc_host}. Check pvcbootstrapd logs and reset this host's BMC to retry.") + logger.error(f"Cluster {cspec_cluster}: Failed to generate PXE configurations for host {cspec_fqdn} at {bmc_host}: {e}") + logger.error("Aborting Redfish configuration; reset BMC to retry.") + del session + return # Adjust any BIOS settings logger.info("Adjusting BIOS settings...") - bios_root = system_detail.get("Bios", {}).get("@odata.id") - if bios_root is not None: - bios_detail = session.get(bios_root) - bios_attributes = list(bios_detail["Attributes"].keys()) - for setting, value in cspec_node["bmc"].get("bios_settings", {}).items(): - if setting not in bios_attributes: - continue + try: + bios_root = system_detail.get("Bios", {}).get("@odata.id") + if bios_root is not None: + bios_detail = session.get(bios_root) + bios_attributes = list(bios_detail["Attributes"].keys()) + for setting, value in cspec_node["bmc"].get("bios_settings", {}).items(): + if setting not in bios_attributes: + continue - payload = {"Attributes": {setting: value}} - session.patch(f"{bios_root}/Settings", payload) + payload = {"Attributes": {setting: value}} + session.patch(f"{bios_root}/Settings", payload) + except Exception as e: + notifications.send_webhook(config, "failure", f"Cluster {cspec_cluster}: Failed to set BIOS settings for host {cspec_fqdn} at {bmc_host}. Check pvcbootstrapd logs and reset this host's BMC to retry.") + logger.error(f"Cluster {cspec_cluster}: Failed to set BIOS settings for host {cspec_fqdn} at {bmc_host}: {e}") + logger.error("Aborting Redfish configuration; reset BMC to retry.") + del session + return # Adjust any Manager settings logger.info("Adjusting Manager settings...") - mgrattribute_root = f"{manager_root}/Attributes" - mgrattribute_detail = session.get(mgrattribute_root) - mgrattribute_attributes = list(mgrattribute_detail["Attributes"].keys()) - for setting, value in cspec_node["bmc"].get("manager_settings", {}).items(): - if setting not in mgrattribute_attributes: - continue + try: + mgrattribute_root = f"{manager_root}/Attributes" + mgrattribute_detail = session.get(mgrattribute_root) + mgrattribute_attributes = list(mgrattribute_detail["Attributes"].keys()) + for setting, value in cspec_node["bmc"].get("manager_settings", {}).items(): + if setting not in mgrattribute_attributes: + continue - payload = {"Attributes": {setting: value}} - session.patch(mgrattribute_root, payload) + payload = {"Attributes": {setting: value}} + session.patch(mgrattribute_root, payload) + except Exception as e: + notifications.send_webhook(config, "failure", f"Cluster {cspec_cluster}: Failed to set BMC settings for host {cspec_fqdn} at {bmc_host}. Check pvcbootstrapd logs and reset this host's BMC to retry.") + logger.error(f"Cluster {cspec_cluster}: Failed to set BMC settings for host {cspec_fqdn} at {bmc_host}: {e}") + logger.error("Aborting Redfish configuration; reset BMC to retry.") + del session + return # Set boot override to Pxe for the installer boot logger.info("Setting temporary PXE boot...") - set_boot_override(session, system_root, redfish_vendor, "Pxe") + try: + set_boot_override(session, system_root, redfish_vendor, "Pxe") + except Exception as e: + notifications.send_webhook(config, "failure", f"Cluster {cspec_cluster}: Failed to set PXE boot override for host {cspec_fqdn} at {bmc_host}. Check pvcbootstrapd logs and reset this host's BMC to retry.") + logger.error(f"Cluster {cspec_cluster}: Failed to set PXE boot override for host {cspec_fqdn} at {bmc_host}: {e}") + logger.error("Aborting Redfish configuration; reset BMC to retry.") + del session + return notifications.send_webhook(config, "success", f"Cluster {cspec_cluster}: Completed Redfish initialization of host {cspec_fqdn}") # Turn on the system logger.info("Powering on node...") - set_power_state(session, system_root, redfish_vendor, "on") - notifications.send_webhook(config, "info", f"Cluster {cspec_cluster}: Powering on host {cspec_fqdn}") + try: + set_power_state(session, system_root, redfish_vendor, "on") + notifications.send_webhook(config, "info", f"Cluster {cspec_cluster}: Powering on host {cspec_fqdn}") + except Exception as e: + notifications.send_webhook(config, "failure", f"Cluster {cspec_cluster}: Failed to power on host {cspec_fqdn} at {bmc_host}. Check pvcbootstrapd logs and reset this host's BMC to retry.") + logger.error(f"Cluster {cspec_cluster}: Failed to power on host {cspec_fqdn} at {bmc_host}: {e}") + logger.error("Aborting Redfish configuration; reset BMC to retry.") + del session + return node = db.update_node_state(config, cspec_cluster, cspec_hostname, "pxe-booting")