From d6b8808448a3c7fe6bc4624c77356396a433e9c9 Mon Sep 17 00:00:00 2001
From: "Joshua M. Boniface"
Date: Fri, 10 Nov 2023 09:30:34 -0500
Subject: [PATCH] Clean up fencing handler

1. Remove all format strings in favour of f-strings
2. Ensure all logger messages have a prefix
3. Add a few more logger messages for clarity
---
 node-daemon/pvcnoded/util/fencing.py | 164 ++++++++++++++++++---------
 1 file changed, 110 insertions(+), 54 deletions(-)

diff --git a/node-daemon/pvcnoded/util/fencing.py b/node-daemon/pvcnoded/util/fencing.py
index fc5d2b93..15956428 100644
--- a/node-daemon/pvcnoded/util/fencing.py
+++ b/node-daemon/pvcnoded/util/fencing.py
@@ -42,20 +42,24 @@ def fence_node(node_name, zkhandler, config, logger):
         if node_daemon_state == "dead":
             failcount += 1
             logger.out(
-                'Node "{}" failed {}/{} saving throws'.format(
-                    node_name, failcount, failcount_limit
-                ),
+                f"Node {node_name} failed {failcount}/{failcount_limit} saving throws",
                 state="s",
+                prefix=f"fencing {node_name}",
             )
         # It changed back to something else so it must be alive
         else:
             logger.out(
-                'Node "{}" passed a saving throw; canceling fence'.format(node_name),
+                f"Node {node_name} passed a saving throw; cancelling fence",
                 state="o",
+                prefix=f"fencing {node_name}",
             )
             return
 
-    logger.out('Fencing node "{}" via IPMI reboot signal'.format(node_name), state="s")
+    logger.out(
+        f"Fencing node {node_name} via IPMI reboot signal",
+        state="s",
+        prefix=f"fencing {node_name}",
+    )
 
     # Get IPMI information
     ipmi_hostname = zkhandler.read(("node.ipmi.hostname", node_name))
@@ -63,19 +67,24 @@ def fence_node(node_name, zkhandler, config, logger):
     ipmi_password = zkhandler.read(("node.ipmi.password", node_name))
 
     # Shoot it in the head
-    fence_status = reboot_via_ipmi(ipmi_hostname, ipmi_username, ipmi_password, logger)
+    fence_status = reboot_via_ipmi(
+        node_name, ipmi_hostname, ipmi_username, ipmi_password, logger
+    )
 
     # Hold to ensure the fence takes effect and system stabilizes
     logger.out(
-        'Waiting {}s for fence of node "{}" to take effect'.format(
-            config["keepalive_interval"], node_name
-        ),
+        f"Waiting {config['keepalive_interval']}s for fence of node {node_name} to take effect",
         state="i",
+        prefix=f"fencing {node_name}",
    )
     time.sleep(config["keepalive_interval"])
 
     if fence_status:
-        logger.out('Marking node "{}" as fenced'.format(node_name), state="i")
+        logger.out(
+            f"Marking node {node_name} as fenced",
+            state="i",
+            prefix=f"fencing {node_name}",
+        )
         while True:
             try:
                 zkhandler.write([(("node.state.daemon", node_name), "fenced")])
@@ -86,7 +95,9 @@ def fence_node(node_name, zkhandler, config, logger):
     # Force into secondary network state if needed
     if node_name in config["coordinators"]:
         logger.out(
-            'Forcing secondary status for node "{}"'.format(node_name), state="i"
+            f"Forcing secondary coordinator state for node {node_name}",
+            state="i",
+            prefix=f"fencing {node_name}",
         )
         zkhandler.write([(("node.state.router", node_name), "secondary")])
         if zkhandler.read("base.config.primary_node") == node_name:
@@ -108,7 +119,9 @@ def fence_node(node_name, zkhandler, config, logger):
 # Migrate hosts away from a fenced node
 def migrateFromFencedNode(zkhandler, node_name, config, logger):
     logger.out(
-        'Migrating VMs from dead node "{}" to new hosts'.format(node_name), state="i"
+        f"Migrating VMs from dead node {node_name} to new hosts",
+        state="i",
+        prefix=f"fencing {node_name}",
     )
 
     # Get the list of VMs
@@ -121,14 +134,20 @@ def migrateFromFencedNode(zkhandler, node_name, config, logger):
 
     # Migrate a VM after a flush
     def fence_migrate_vm(dom_uuid):
+        logger.out(
+            f"Flushing locks of VM {dom_uuid} due to fence",
+            state="i",
+            prefix=f"fencing {node_name}",
+        )
         vm_worker_flush_locks(zkhandler, None, dom_uuid, force_unlock=True)
 
         target_node = common.findTargetNode(zkhandler, dom_uuid)
 
         if target_node is not None:
             logger.out(
-                'Migrating VM "{}" to node "{}"'.format(dom_uuid, target_node),
+                f"Migrating VM {dom_uuid} to node {target_node}",
                 state="i",
+                prefix=f"fencing {node_name}",
             )
             zkhandler.write(
                 [
@@ -137,12 +156,16 @@ def migrateFromFencedNode(zkhandler, node_name, config, logger):
                     (("domain.last_node", dom_uuid), node_name),
                 ]
             )
+            logger.out(
+                f"Successfully migrated running VM {dom_uuid} to node {target_node}",
+                state="o",
+                prefix=f"fencing {node_name}",
+            )
         else:
             logger.out(
-                'No target node found for VM "{}"; VM will autostart on next unflush/ready of current node'.format(
-                    dom_uuid
-                ),
+                f"No target node found for VM {dom_uuid}; marking autostart=True on current node",
                 state="i",
+                prefix=f"fencing {node_name}",
             )
             zkhandler.write(
                 {
@@ -150,6 +173,11 @@ def migrateFromFencedNode(zkhandler, node_name, config, logger):
                     (("domain.meta.autostart", dom_uuid), "True"),
                 }
             )
+            logger.out(
+                f"Successfully marked autostart for running VM {dom_uuid} on current node",
+                state="o",
+                prefix=f"fencing {node_name}",
+            )
 
     # Loop through the VMs
     for dom_uuid in dead_node_running_domains:
@@ -159,113 +187,141 @@ def migrateFromFencedNode(zkhandler, node_name, config, logger):
             logger.out(
                 f"Failed to migrate VM {dom_uuid}, continuing: {e}",
                 state="w",
+                prefix=f"fencing {node_name}",
             )
 
     # Set node in flushed state for easy remigrating when it comes back
     zkhandler.write([(("node.state.domain", node_name), "flushed")])
     logger.out(
-        'All VMs flushed from dead node "{}" to new hosts'.format(node_name), state="i"
+        f"All VMs flushed from dead node {node_name} to other nodes",
+        state="i",
+        prefix=f"fencing {node_name}",
     )
 
 
 #
 # Perform an IPMI fence
 #
-def reboot_via_ipmi(ipmi_hostname, ipmi_user, ipmi_password, logger):
+def reboot_via_ipmi(node_name, ipmi_hostname, ipmi_user, ipmi_password, logger):
     # Power off the node the node
-    logger.out("Sending power off to dead node", state="i")
-    ipmi_command_stop = (
-        "/usr/bin/ipmitool -I lanplus -H {} -U {} -P {} chassis power off".format(
-            ipmi_hostname, ipmi_user, ipmi_password
-        )
+    logger.out(
+        "Sending power off to dead node",
+        state="i",
+        prefix=f"fencing {node_name}",
     )
     ipmi_stop_retcode, ipmi_stop_stdout, ipmi_stop_stderr = common.run_os_command(
-        ipmi_command_stop
+        f"/usr/bin/ipmitool -I lanplus -H {ipmi_hostname} -U {ipmi_user} -P {ipmi_password} chassis power off"
     )
-
     if ipmi_stop_retcode != 0:
-        logger.out(f"Failed to power off dead node: {ipmi_stop_stderr}", state="e")
+        logger.out(
+            f"Failed to power off dead node: {ipmi_stop_stderr}",
+            state="e",
+            prefix=f"fencing {node_name}",
+        )
+
+    logger.out(
+        "Waiting 5s for power off to take effect",
+        state="i",
+        prefix=f"fencing {node_name}",
+    )
     time.sleep(5)
 
     # Check the chassis power state
-    logger.out("Checking power state of dead node", state="i")
-    ipmi_command_status = (
-        "/usr/bin/ipmitool -I lanplus -H {} -U {} -P {} chassis power status".format(
-            ipmi_hostname, ipmi_user, ipmi_password
-        )
+    logger.out(
+        "Checking power state of dead node",
+        state="i",
+        prefix=f"fencing {node_name}",
    )
     ipmi_status_retcode, ipmi_status_stdout, ipmi_status_stderr = common.run_os_command(
-        ipmi_command_status
+        f"/usr/bin/ipmitool -I lanplus -H {ipmi_hostname} -U {ipmi_user} -P {ipmi_password} chassis power status"
     )
     if ipmi_status_retcode == 0:
         logger.out(
-            f"Current chassis power state is: {ipmi_status_stdout.strip()}", state="i"
+            f"Current chassis power state is: {ipmi_status_stdout.strip()}",
+            state="i",
+            prefix=f"fencing {node_name}",
         )
     else:
-        logger.out("Current chassis power state is: Unknown", state="w")
+        logger.out(
+            "Current chassis power state is: Unknown",
+            state="w",
+            prefix=f"fencing {node_name}",
+        )
 
     # Power on the node
-    logger.out("Sending power on to dead node", state="i")
-    ipmi_command_start = (
-        "/usr/bin/ipmitool -I lanplus -H {} -U {} -P {} chassis power on".format(
-            ipmi_hostname, ipmi_user, ipmi_password
-        )
+    logger.out(
+        "Sending power on to dead node",
+        state="i",
+        prefix=f"fencing {node_name}",
    )
     ipmi_start_retcode, ipmi_start_stdout, ipmi_start_stderr = common.run_os_command(
-        ipmi_command_start
+        f"/usr/bin/ipmitool -I lanplus -H {ipmi_hostname} -U {ipmi_user} -P {ipmi_password} chassis power on"
     )
 
     if ipmi_start_retcode != 0:
-        logger.out(f"Failed to power on dead node: {ipmi_start_stderr}", state="w")
+        logger.out(
+            f"Failed to power on dead node: {ipmi_start_stderr}",
+            state="w",
+            prefix=f"fencing {node_name}",
+        )
+
+    logger.out(
+        "Waiting 2s for power on to take effect",
+        state="i",
+        prefix=f"fencing {node_name}",
+    )
     time.sleep(2)
 
     # Check the chassis power state
-    logger.out("Checking power state of dead node", state="i")
-    ipmi_command_status = (
-        "/usr/bin/ipmitool -I lanplus -H {} -U {} -P {} chassis power status".format(
-            ipmi_hostname, ipmi_user, ipmi_password
-        )
+    logger.out(
+        "Checking power state of dead node",
+        state="i",
+        prefix=f"fencing {node_name}",
    )
     ipmi_status_retcode, ipmi_status_stdout, ipmi_status_stderr = common.run_os_command(
-        ipmi_command_status
+        f"/usr/bin/ipmitool -I lanplus -H {ipmi_hostname} -U {ipmi_user} -P {ipmi_password} chassis power status"
     )
 
     if ipmi_stop_retcode == 0:
         if ipmi_status_stdout.strip() == "Chassis Power is on":
             # We successfully rebooted the node and it is powered on; this is a succeessful fence
-            logger.out("Successfully rebooted dead node", state="o")
+            logger.out(
+                "Successfully rebooted dead node; proceeding with fence recovery action",
+                state="o",
+                prefix=f"fencing {node_name}",
+            )
             return True
         elif ipmi_status_stdout.strip() == "Chassis Power is off":
             # We successfully rebooted the node but it is powered off; this might be expected or not, but the node is confirmed off so we can call it a successful fence
             logger.out(
-                "Chassis power is in confirmed off state after successfuly IPMI reboot; proceeding with fence-flush",
+                "Chassis power is in confirmed off state after successful IPMI reboot; proceeding with fence recovery action",
                 state="o",
+                prefix=f"fencing {node_name}",
             )
             return True
         else:
             # We successfully rebooted the node but it is in some unknown power state; since this might indicate a silent failure, we must call it a failed fence
             logger.out(
-                "Chassis power is in an unknown state ({}) after successful IPMI reboot; not performing fence-flush".format(
-                    ipmi_status_stdout.strip()
-                ),
+                f"Chassis power is in an unknown state ({ipmi_status_stdout.strip()}) after successful IPMI reboot; NOT proceeding with fence recovery action",
                 state="e",
+                prefix=f"fencing {node_name}",
             )
             return False
     else:
         if ipmi_status_stdout.strip() == "Chassis Power is off":
             # We failed to reboot the node but it is powered off; it has probably suffered a serious hardware failure, but the node is confirmed off so we can call it a successful fence
             logger.out(
-                "Chassis power is in confirmed off state after failed IPMI reboot; proceeding with fence-flush",
+                "Chassis power is in confirmed off state after failed IPMI reboot; proceeding with fence recovery action",
                 state="o",
+                prefix=f"fencing {node_name}",
            )
             return True
         else:
             # We failed to reboot the node but it is in some unknown power state (including "on"); since this might indicate a silent failure, we must call it a failed fence
             logger.out(
-                "Chassis power is not in confirmed off state after failed IPMI reboot; not performing fence-flush",
+                "Chassis power is not in confirmed off state after failed IPMI reboot; NOT proceeding with fence recovery action",
                 state="e",
+                prefix=f"fencing {node_name}",
             )
             return False
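
For reviewers, here is the logging convention this patch converges on, shown side by side for one representative call. Both forms are taken verbatim from the hunks above (the names logger, node_name, failcount and failcount_limit come from fence_node as shown in the diff); the snippet is purely illustrative and not itself part of the patch.

    # Old style (format string, no prefix), as removed by this patch:
    logger.out(
        'Node "{}" failed {}/{} saving throws'.format(
            node_name, failcount, failcount_limit
        ),
        state="s",
    )

    # New style (f-string message plus a per-node prefix), as added by this patch,
    # so log lines from concurrent fence operations can be told apart:
    logger.out(
        f"Node {node_name} failed {failcount}/{failcount_limit} saving throws",
        state="s",
        prefix=f"fencing {node_name}",
    )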