#!/usr/bin/env python3

# redfish.py - PVC Cluster Auto-bootstrap Redfish libraries
# Part of the Parallel Virtual Cluster (PVC) system
#
# Copyright (C) 2018-2021 Joshua M. Boniface
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, version 3.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <https://www.gnu.org/licenses/>.
#
###############################################################################

# Refs:
# https://downloads.dell.com/manuals/all-products/esuprt_software/esuprt_it_ops_datcentr_mgmt/dell-management-solution-resources_white-papers11_en-us.pdf
# https://downloads.dell.com/solutions/dell-management-solution-resources/RESTfulSerConfig-using-iDRAC-REST%20API%28DTC%20copy%29.pdf

import requests
import urllib3
import json
import re
import math

from sys import stderr, argv
from time import sleep

from celery.utils.log import get_task_logger

import pvcbootstrapd.lib.installer as installer
import pvcbootstrapd.lib.db as db

logger = get_task_logger(__name__)


#
# Helper Classes
#
class AuthenticationException(Exception):
    """
    Raised when a Redfish session login or logout fails; carries the
    extended error information from the BMC response, if available.
    """

    def __init__(self, error=None, response=None):
        if error is not None:
            self.short_message = error
        else:
            self.short_message = "Generic authentication failure"

        if response is not None:
            # BUGFIX: store the HTTP code on the instance; the original
            # assigned it back onto the response object, so __str__ (which
            # referenced an undefined local "response") could never use it.
            self.status_code = response.status_code
            rinfo = response.json()['error']['@Message.ExtendedInfo'][0]
            if rinfo.get('Message') is not None:
                self.full_message = rinfo['Message']
                self.res_message = rinfo['Resolution']
                self.severity = rinfo['Severity']
                self.message_id = rinfo['MessageId']
            else:
                self.full_message = ''
                self.res_message = ''
                self.severity = 'Fatal'
                # BUGFIX: this branch means 'Message' was absent, so
                # 'MessageId' may be absent too; don't KeyError here.
                self.message_id = rinfo.get('MessageId', 'N/A')
        else:
            self.status_code = None

    def __str__(self):
        if self.status_code is not None:
            message = f"{self.short_message}: {self.full_message} {self.res_message} (HTTP Code: {self.status_code}, Severity: {self.severity}, ID: {self.message_id})"
        else:
            message = f"{self.short_message}"
        return str(message)


class RedfishSession:
    """
    A login session against a Redfish BMC.

    On construction, logs in (with retries) and stores the X-Auth-Token;
    self.host remains None if the login could not be performed, which
    callers use as the failure sentinel. The session is logged out when
    the object is deleted.
    """

    def __init__(self, host, username, password):
        # Disable urllib3 warnings (we always use verify=False against BMCs
        # with self-signed certificates)
        urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

        # Perform login
        login_payload = {"UserName": username, "Password": password}
        login_uri = f"{host}/redfish/v1/Sessions"
        login_headers = {'content-type': 'application/json'}

        # self.host is the "login succeeded" sentinel; leave it None until
        # we have a valid session
        self.host = None
        login_response = None

        tries = 1
        max_tries = 25
        while tries < max_tries:
            logger.info(f"Trying to log in to Redfish ({tries}/{max_tries - 1})...")
            try:
                login_response = requests.post(
                    login_uri,
                    data=json.dumps(login_payload),
                    headers=login_headers,
                    verify=False,
                    timeout=5
                )
                break
            except Exception:
                # BMC not ready yet; back off briefly and retry
                sleep(2)
                tries += 1

        if login_response is None:
            logger.error("Failed to log in to Redfish")
            return

        if login_response.status_code not in [200, 201]:
            raise AuthenticationException("Login failed", response=login_response)
        logger.info(f"Logged in to Redfish at {host} successfully")

        self.host = host
        self.token = login_response.headers.get('X-Auth-Token')
        self.headers = {
            'content-type': 'application/json',
            'x-auth-token': self.token
        }

        # The Location header is the session resource; it may be absolute
        # or host-relative
        logout_uri = login_response.headers.get('Location')
        if re.match(r"^/", logout_uri):
            self.logout_uri = f"{host}{logout_uri}"
        else:
            self.logout_uri = logout_uri

    def __del__(self):
        # Only log out if the login actually succeeded
        if self.host is None:
            return

        logout_headers = {
            "Content-Type": "application/json",
            "X-Auth-Token": self.token
        }

        logout_response = requests.delete(
            self.logout_uri, headers=logout_headers, verify=False, timeout=15
        )

        if logout_response.status_code not in [200, 201]:
            raise AuthenticationException("Logout failed", response=logout_response)

        # BUGFIX: the original referenced an undefined local "host" here
        logger.info(f"Logged out of Redfish at {self.host} successfully")

    def _log_request_error(self, method, url, response):
        """Log the extended error information from a failed Redfish request."""
        rinfo = response.json()['error']['@Message.ExtendedInfo'][0]

        if rinfo.get('Message') is not None:
            message = f"{rinfo['Message']} {rinfo['Resolution']}"
            severity = rinfo['Severity']
            message_id = rinfo['MessageId']
        else:
            message = rinfo
            severity = 'Error'
            message_id = 'N/A'

        logger.warning(f"! Error: {method} request to {url} failed")
        logger.warning(f"! HTTP Code: {response.status_code} Severity: {severity} ID: {message_id}")
        logger.warning(f"! Details: {message}")

    def get(self, uri):
        """GET the given URI; return the parsed JSON body, or None on error."""
        url = f"{self.host}{uri}"
        response = requests.get(url, headers=self.headers, verify=False)

        if response.status_code in [200, 201]:
            return response.json()

        self._log_request_error('GET', url, response)
        return None

    def delete(self, uri):
        """DELETE the given URI; return the parsed JSON body, or None on error."""
        url = f"{self.host}{uri}"
        response = requests.delete(url, headers=self.headers, verify=False)

        if response.status_code in [200, 201]:
            return response.json()

        self._log_request_error('DELETE', url, response)
        return None

    def post(self, uri, data):
        """POST data (JSON-serialized) to the given URI; return the parsed JSON body, or None on error."""
        url = f"{self.host}{uri}"
        payload = json.dumps(data)
        response = requests.post(url, data=payload, headers=self.headers, verify=False)

        if response.status_code in [200, 201]:
            return response.json()

        self._log_request_error('POST', url, response)
        return None

    def put(self, uri, data):
        """PUT data (JSON-serialized) to the given URI; return the parsed JSON body, or None on error."""
        url = f"{self.host}{uri}"
        payload = json.dumps(data)
        response = requests.put(url, data=payload, headers=self.headers, verify=False)

        if response.status_code in [200, 201]:
            return response.json()

        self._log_request_error('PUT', url, response)
        return None

    def patch(self, uri, data):
        """PATCH data (JSON-serialized) to the given URI; return the parsed JSON body, or None on error."""
        url = f"{self.host}{uri}"
        payload = json.dumps(data)
        response = requests.patch(url, data=payload, headers=self.headers, verify=False)

        if response.status_code in [200, 201]:
            return response.json()

        self._log_request_error('PATCH', url, response)
        return None


#
# Helper functions
#
def format_bytes_tohuman(databytes):
    """
    Format a string of bytes into a human-readable value (using base-1000)
    """
    # Matrix of human-to-byte values
    byte_unit_matrix = {
        "B": 1,
        "KB": 1000,
        "MB": 1000 * 1000,
        "GB": 1000 * 1000 * 1000,
        "TB": 1000 * 1000 * 1000 * 1000,
        "PB": 1000 * 1000 * 1000 * 1000 * 1000,
        "EB": 1000 * 1000 * 1000 * 1000 * 1000 * 1000,
    }

    datahuman = ""
    # Walk the units from largest to smallest; use the first one where the
    # value fits in three digits
    for unit in sorted(byte_unit_matrix, key=byte_unit_matrix.get, reverse=True):
        if unit in ['TB', 'PB', 'EB']:
            # Handle the situation where we might want to round to integer values
            # for some entries (2TB) but not others (e.g. 1.92TB). We round if the
            # result is within +/- 2% of the integer value, otherwise we use two
            # decimal places.
            new_bytes = databytes / byte_unit_matrix[unit]
            new_bytes_plustwopct = new_bytes * 1.02
            new_bytes_minustwopct = new_bytes * 0.98
            ceiled_bytes = int(math.ceil(databytes / byte_unit_matrix[unit]))
            rounded_bytes = round(databytes / byte_unit_matrix[unit], 2)
            if ceiled_bytes > new_bytes_minustwopct and ceiled_bytes < new_bytes_plustwopct:
                new_bytes = ceiled_bytes
            else:
                new_bytes = rounded_bytes
        else:
            # BUGFIX: the original never computed new_bytes for B/KB/MB/GB,
            # reusing a stale value from the TB/PB/EB branch
            new_bytes = int(math.ceil(databytes / byte_unit_matrix[unit]))

        # Round up if 5 or more digits
        if new_bytes > 999:
            # We can jump down another level
            continue

        # We're at the end, display with this size
        datahuman = "{}{}".format(new_bytes, unit)
        break

    return datahuman


def get_system_drive_target(session, cspec_node, storage_root):
    """
    Determine the system drive target for the installer
    """
    # Handle an invalid >2 number of system disks, use only first 2
    if len(cspec_node['config']['system_disks']) > 2:
        cspec_drives = cspec_node['config']['system_disks'][0:2]
    else:
        cspec_drives = cspec_node['config']['system_disks']

    # If we have no storage root, we just return the first entry from
    # the cspec_drives as-is and hope the administrator has the right
    # format here.
    if storage_root is None:
        return cspec_drives[0]
    # We proceed with Redfish configuration to determine the disks
    else:
        storage_detail = session.get(storage_root)

        # Grab a full list of drives
        drive_list = list()
        for storage_member in storage_detail['Members']:
            storage_member_root = storage_member['@odata.id']
            storage_member_detail = session.get(storage_member_root)
            for drive in storage_member_detail['Drives']:
                drive_root = drive['@odata.id']
                drive_detail = session.get(drive_root)
                drive_list.append(drive_detail)

        system_drives = list()

        # Iterate through each drive and include those that match
        for cspec_drive in cspec_drives:
            # BUGFIX: the original misspelled this as "cspect_drive",
            # raising NameError for any 'detect:'/'/dev' spec drive
            if re.match(r"^\/dev", cspec_drive) or re.match(r"^detect:", cspec_drive):
                # We only match the first drive that has these conditions for use in the preseed config
                logger.info("Found a drive with a 'detect:' string or Linux '/dev' path, using it for bootstrap.")
                return cspec_drive

            # Match any chassis-ID spec drives
            for drive in drive_list:
                # Like "Disk.Bay.2:Enclosure.Internal.0-1:RAID.Integrated.1-1"
                drive_name = drive['Id'].split(':')[0]
                # Craft up the cspec version of this
                # NOTE(review): the example above says "Disk.Bay." but this
                # builds "Drive.Bay." — confirm against the target BMC naming
                cspec_drive_name = f"Drive.Bay.{cspec_drive}"
                if drive_name == cspec_drive_name:
                    system_drives.append(drive)

        # We found a single drive, so determine its actual detect string
        if len(system_drives) == 1:
            logger.info("Found a single drive matching the requested chassis ID, using it as the system disk.")

            # Get the model's first word
            drive_model = system_drives[0].get('Model', 'INVALID').split()[0]
            # Get and convert the size in bytes value to human
            drive_size_bytes = system_drives[0].get('CapacityBytes', 0)
            drive_size_human = format_bytes_tohuman(drive_size_bytes)

            # Get the drive ID out of all the valid entries
            # How this works is that, for each non-array disk, we must find what position our exact disk is
            # So for example, say we want disk 3 out of 4, and all 4 are the same size and model and not in
            # another (RAID) volume. This will give us an index of 2. Then in the installer this will match
            # the 3rd list entry from "lsscsi". This is probably an unneccessary hack, since people will
            # probably just give the first disk if they want one, or 2 disks if they want a RAID-1, but this
            # is here just in case
            idx = 0
            # BUGFIX: ensure the index is defined even if no drive matches
            index = 0
            for drive in drive_list:
                list_drive_model = drive.get('Model', 'INVALID').split()[0]
                list_drive_size_bytes = drive.get('CapacityBytes', 0)
                # A drive whose first Volume link points back at itself is a
                # bare (non-RAID) disk; any other target means it is part of
                # an array. BUGFIX: the original crashed when the Links or
                # Volumes entries were missing or empty.
                volume_links = drive.get('Links', {}).get('Volumes') or [{}]
                first_volume_id = (volume_links[0].get('@odata.id') or '').split('/')[-1]
                list_drive_in_array = first_volume_id != drive.get('Id')
                if drive_model == list_drive_model and drive_size_bytes == list_drive_size_bytes and not list_drive_in_array:
                    index = idx
                idx += 1
            drive_id = index

            # Create the target string
            system_drive_target = f"detect:{drive_model}:{drive_size_human}:{drive_id}"

        # We found two drives, so create a RAID-1 array then determine the volume's detect string
        elif len(system_drives) == 2:
            logger.info("Found two drives matching the requested chassis IDs, creating a RAID-1 and using it as the system disk.")

            drive_one = system_drives[0]
            drive_one_id = drive_one.get('Id', 'INVALID')
            drive_one_path = drive_one.get('@odata.id', 'INVALID')
            drive_one_controller = drive_one_id.split(':')[-1]
            drive_two = system_drives[1]
            drive_two_id = drive_two.get('Id', 'INVALID')
            drive_two_path = drive_two.get('@odata.id', 'INVALID')
            drive_two_controller = drive_two_id.split(':')[-1]

            # Determine that the drives are on the same controller
            if drive_one_controller != drive_two_controller:
                logger.error("Two drives are not on the same controller; this should not happen")
                return None

            # Get the controller details
            controller_root = f"{storage_root}/{drive_one_controller}"
            controller_detail = session.get(controller_root)

            # Get the name of the controller (for crafting the detect string)
            controller_name = controller_detail.get('Name', 'INVALID').split()[0]

            # Get the volume root for the controller
            controller_volume_root = controller_detail.get('Volumes', {}).get('@odata.id')

            # Get the pre-creation list of volumes on the controller
            controller_volumes_pre = [volume['@odata.id'] for volume in session.get(controller_volume_root).get('Members', [])]

            # Create the RAID-1 volume
            payload = {
                "VolumeType": "Mirrored",
                "Drives": [
                    {"@odata.id": drive_one_path},
                    {"@odata.id": drive_two_path}
                ]
            }
            session.post(controller_volume_root, payload)

            # Wait for the volume to be created
            new_volume_list = []
            while len(new_volume_list) < 1:
                sleep(5)
                controller_volumes_post = [volume['@odata.id'] for volume in session.get(controller_volume_root).get('Members', [])]
                new_volume_list = list(set(controller_volumes_post).difference(controller_volumes_pre))
            new_volume_root = new_volume_list[0]

            # Get the IDX of the volume out of any others
            volume_id = 0
            for idx, volume in enumerate(controller_volumes_post):
                if volume == new_volume_root:
                    volume_id = idx
                    break

            # Get and convert the size in bytes value to human
            volume_detail = session.get(new_volume_root)
            volume_size_bytes = volume_detail.get('CapacityBytes', 0)
            volume_size_human = format_bytes_tohuman(volume_size_bytes)

            # Create the target string
            system_drive_target = f"detect:{controller_name}:{volume_size_human}:{volume_id}"

        # We found too few or too many drives, error
        else:
            system_drive_target = None

        return system_drive_target


#
# Redfish Task functions
#
def set_indicator_state(session, system_root, redfish_vendor, state):
    """
    Set the system indicator LED to the desired state (on/off)

    Returns True if a change was issued, False if the state is already
    correct or the request could not be composed.
    """
    state_values_write = {
        'Dell': {
            'on': 'Blinking',
            'off': 'Off',
        },
        'default': {
            'on': 'Lit',
            'off': 'Off',
        },
    }
    state_values_read = {
        'default': {
            'on': 'Lit',
            'off': 'Off',
        },
    }

    try:
        # Allow vendor-specific overrides
        if redfish_vendor not in state_values_write:
            redfish_vendor = "default"
        # Allow nice names ("on"/"off")
        if state in state_values_write[redfish_vendor]:
            state = state_values_write[redfish_vendor][state]

        # Get current state
        system_detail = session.get(system_root)
        current_state = system_detail['IndicatorLED']
    except KeyError:
        return False

    try:
        state_read = state
        # Allow vendor-specific overrides
        if redfish_vendor not in state_values_read:
            redfish_vendor = "default"
        # Allow nice names ("on"/"off")
        if state_read in state_values_read[redfish_vendor]:
            state_read = state_values_read[redfish_vendor][state_read]

        # Already in the desired state; nothing to do
        if state_read == current_state:
            return False
    except KeyError:
        return False

    session.patch(system_root, {"IndicatorLED": state})
    return True


def set_power_state(session, system_root, redfish_vendor, state):
    """
    Set the system power state to the desired state

    Returns True if a reset was issued, False if the system is already in
    the target state or the request is not allowable.
    """
    state_values = {
        'default': {
            'on': 'On',
            'off': 'ForceOff',
        },
    }

    try:
        # Allow vendor-specific overrides
        if redfish_vendor not in state_values:
            redfish_vendor = "default"
        # Allow nice names ("on"/"off")
        if state in state_values[redfish_vendor]:
            state = state_values[redfish_vendor][state]

        # Get current state, target URI, and allowable values
        system_detail = session.get(system_root)
        current_state = system_detail['PowerState']
        power_root = system_detail['Actions']['#ComputerSystem.Reset']['target']
        power_choices = system_detail['Actions']['#ComputerSystem.Reset']['ResetType@Redfish.AllowableValues']
    except KeyError:
        return False

    # Remap some namings so we can check the current state against the target state
    if state in ['ForceOff']:
        target_state = 'Off'
    else:
        target_state = state

    if target_state == current_state:
        return False

    if state not in power_choices:
        return False

    session.post(power_root, {"ResetType": state})
    return True


def set_boot_override(session, system_root, redfish_vendor, target):
    """
    Set the system boot override to the desired target

    Returns True if the override was issued, False if the target is not
    supported by the system.
    """
    try:
        system_detail = session.get(system_root)
        boot_targets = system_detail['Boot']['BootSourceOverrideSupported']
    except KeyError:
        return False

    if target not in boot_targets:
        return False

    session.patch(system_root, {"Boot": {"BootSourceOverrideTarget": target}})
    return True


def check_redfish(config, data):
    """
    Validate that a BMC is Redfish-capable

    Polls https://<ipaddr>/redfish/v1 up to 30 times; returns True if the
    endpoint answers with HTTP 200, False otherwise.
    """
    headers = {"Content-Type": "application/json"}

    logger.info("Checking for Redfish response...")
    count = 0
    while True:
        try:
            count += 1
            if count > 30:
                retcode = 500
                logger.warning("Aborted after 300s; device too slow or not booting.")
                break
            resp = requests.get(f"https://{data['ipaddr']}/redfish/v1", headers=headers, verify=False, timeout=10)
            # BUGFIX: requests exposes the HTTP code as .status_code,
            # not .retcode
            retcode = resp.status_code
            break
        except Exception:
            logger.info(f"Attempt {count}...")
            # Pace the retries so an immediately-refused connection does not
            # spin through all 30 attempts in under a second
            sleep(10)
            continue

    return retcode == 200


#
# Entry function
#
def redfish_init(config, cspec, data):
    """
    Initialize a new node with Redfish
    """
    bmc_ipaddr = data['ipaddr']
    bmc_macaddr = data['macaddr']
    bmc_host = f"https://{bmc_ipaddr}"

    cspec_node = cspec['bootstrap'][bmc_macaddr]
    logger.debug(f"cspec_node = {cspec_node}")

    bmc_username = cspec_node['bmc']['username']
    bmc_password = cspec_node['bmc']['password']

    host_macaddr = ''
    host_ipaddr = ''
    cspec_cluster = cspec_node['node']['cluster']
    cspec_hostname = cspec_node['node']['hostname']
    # Derive the numeric node ID from the digits in the hostname
    cspec_nid = int(''.join(filter(str.isdigit, cspec_hostname)))

    cluster = db.get_cluster(config, name=cspec_cluster)
    if cluster is None:
        cluster = db.add_cluster(config, cspec_cluster, "provisioning")
    logger.debug(cluster)

    node = db.get_node(config, cspec_cluster, name=cspec_hostname)
    if node is None:
        node = db.add_node(config, cspec_cluster, "characterizing", cspec_hostname, cspec_nid, bmc_macaddr, bmc_ipaddr, host_macaddr, host_ipaddr)
    else:
        node = db.update_node_addresses(config, cspec_cluster, cspec_hostname, bmc_macaddr, bmc_ipaddr, host_macaddr, host_ipaddr)
    logger.debug(node)

    # Create the session and log in
    session = RedfishSession(bmc_host, bmc_username, bmc_password)
    if session.host is None:
        logger.info("Aborting Redfish configuration; reboot BMC to try again.")
        del session
        return

    logger.info("Characterizing node...")
    # Get Redfish bases
    redfish_base_root = '/redfish/v1'
    redfish_base_detail = session.get(redfish_base_root)
    redfish_vendor = list(redfish_base_detail['Oem'].keys())[0]
    redfish_name = redfish_base_detail['Name']
    redfish_version = redfish_base_detail['RedfishVersion']

    systems_base_root = redfish_base_detail['Systems']['@odata.id'].rstrip('/')
    systems_base_detail = session.get(systems_base_root)
    system_root = systems_base_detail['Members'][0]['@odata.id'].rstrip('/')

    # Force off the system and turn on the indicator
    set_power_state(session, system_root, redfish_vendor, 'off')
    set_indicator_state(session, system_root, redfish_vendor, 'on')

    # Get the system details
    system_detail = session.get(system_root)
    system_sku = system_detail['SKU'].strip()
    system_serial = system_detail['SerialNumber'].strip()
    system_power_state = system_detail['PowerState'].strip()
    system_indicator_state = system_detail['IndicatorLED'].strip()
    system_health_state = system_detail['Status']['Health'].strip()

    # Walk down the EthernetInterfaces construct to get the bootstrap interface MAC address
    try:
        ethernet_root = system_detail['EthernetInterfaces']['@odata.id'].rstrip('/')
        ethernet_detail = session.get(ethernet_root)
        first_interface_root = ethernet_detail['Members'][0]['@odata.id'].rstrip('/')
        first_interface_detail = session.get(first_interface_root)
    # Something went wrong, so fall back
    except KeyError:
        first_interface_detail = dict()

    # Try to get the MAC address directly from the interface detail (Redfish standard)
    if first_interface_detail.get('MACAddress') is not None:
        bootstrap_mac_address = first_interface_detail['MACAddress'].strip().lower()
    # Try to get the MAC address from the HostCorrelation->HostMACAddress (HP DL360x G8)
    elif len(system_detail.get('HostCorrelation', {}).get('HostMACAddress', [])) > 0:
        bootstrap_mac_address = system_detail['HostCorrelation']['HostMACAddress'][0].strip().lower()
    # We can't find it, so abort
    else:
        logger.error("Could not find a valid MAC address for the bootstrap interface.")
        return

    # Display the system details
    logger.info("Found details from node characterization:")
    logger.info(f"> System Manufacturer: {redfish_vendor}")
    logger.info(f"> System Redfish Version: {redfish_version}")
    logger.info(f"> System Redfish Name: {redfish_name}")
    logger.info(f"> System SKU: {system_sku}")
    logger.info(f"> System Serial: {system_serial}")
    logger.info(f"> Power State: {system_power_state}")
    logger.info(f"> Indicator LED: {system_indicator_state}")
    logger.info(f"> Health State: {system_health_state}")
    logger.info(f"> Bootstrap NIC MAC: {bootstrap_mac_address}")

    # Update node host MAC address
    host_macaddr = bootstrap_mac_address
    node = db.update_node_addresses(config, cspec_cluster, cspec_hostname, bmc_macaddr, bmc_ipaddr, host_macaddr, host_ipaddr)
    logger.debug(node)

    logger.info("Determining system disk...")
    storage_root = system_detail.get('Storage', {}).get('@odata.id')
    system_drive_target = get_system_drive_target(session, cspec_node, storage_root)
    if system_drive_target is None:
        logger.error("No valid drives found; configure a single system drive as a 'detect:' string or Linux '/dev' path instead and try again.")
        return
    logger.info(f"Found system disk {system_drive_target}")

    # Create our preseed configuration
    logger.info("Creating node boot configurations...")
    installer.add_pxe(config, cspec_node, host_macaddr)
    installer.add_preseed(config, cspec_node, host_macaddr, system_drive_target)

    # Adjust any BIOS settings
    logger.info("Adjusting BIOS settings...")
    bios_root = system_detail.get('Bios', {}).get('@odata.id')
    if bios_root is not None:
        bios_detail = session.get(bios_root)
        bios_attributes = list(bios_detail['Attributes'].keys())
        for setting, value in cspec_node['bmc'].get('bios_settings', {}).items():
            # Skip settings the system does not expose
            if setting not in bios_attributes:
                continue
            payload = {"Attributes": {setting: value}}
            session.patch(f"{bios_root}/Settings", payload)

    # Set boot override to Pxe for the installer boot
    logger.info("Setting temporary PXE boot...")
    set_boot_override(session, system_root, redfish_vendor, 'Pxe')

    # Turn on the system
    logger.info("Powering on node...")
    set_power_state(session, system_root, redfish_vendor, 'on')

    node = db.update_node_state(config, cspec_cluster, cspec_hostname, 'pxe-booting')

    logger.info("Waiting for completion of node and cluster installation...")
    # Wait for the system to install and be configured
    while node.state != "booted-completed":
        sleep(60)
        # Keep the Redfish session alive
        session.get(redfish_base_root)
        # Refresh our node state
        node = db.get_node(config, cspec_cluster, name=cspec_hostname)

    # Graceful shutdown of the machine
    set_power_state(session, system_root, redfish_vendor, 'GracefulShutdown')
    system_power_state = "On"
    while system_power_state != "Off":
        sleep(5)
        # Refresh our power state from the system details
        system_detail = session.get(system_root)
        system_power_state = system_detail['PowerState'].strip()

    # Turn off the indicator to indicate bootstrap has completed
    set_indicator_state(session, system_root, redfish_vendor, 'off')

    # We must delete the session to trigger the Redfish logout
    del session
    return