Clean up fencing handler

1. Remove all format strings in favour of f-strings
2. Ensure all logger messages have a prefix
3. Add a few more logger messages for clarity
This commit is contained in:
Joshua Boniface 2023-11-10 09:30:34 -05:00
parent 83c4c6633d
commit d6b8808448
1 changed file with 110 additions and 54 deletions

View File

@ -42,20 +42,24 @@ def fence_node(node_name, zkhandler, config, logger):
if node_daemon_state == "dead": if node_daemon_state == "dead":
failcount += 1 failcount += 1
logger.out( logger.out(
'Node "{}" failed {}/{} saving throws'.format( f"Node {node_name} failed {failcount}/{failcount_limit} saving throws",
node_name, failcount, failcount_limit
),
state="s", state="s",
prefix=f"fencing {node_name}",
) )
# It changed back to something else so it must be alive # It changed back to something else so it must be alive
else: else:
logger.out( logger.out(
'Node "{}" passed a saving throw; canceling fence'.format(node_name), f"Node {node_name} passed a saving throw; cancelling fance",
state="o", state="o",
prefix=f"fencing {node_name}",
) )
return return
logger.out('Fencing node "{}" via IPMI reboot signal'.format(node_name), state="s") logger.out(
f"Fencing node {node_name} via IPMI reboot signal",
state="s",
prefix=f"fencing {node_name}",
)
# Get IPMI information # Get IPMI information
ipmi_hostname = zkhandler.read(("node.ipmi.hostname", node_name)) ipmi_hostname = zkhandler.read(("node.ipmi.hostname", node_name))
@ -63,19 +67,24 @@ def fence_node(node_name, zkhandler, config, logger):
ipmi_password = zkhandler.read(("node.ipmi.password", node_name)) ipmi_password = zkhandler.read(("node.ipmi.password", node_name))
# Shoot it in the head # Shoot it in the head
fence_status = reboot_via_ipmi(ipmi_hostname, ipmi_username, ipmi_password, logger) fence_status = reboot_via_ipmi(
node_name, ipmi_hostname, ipmi_username, ipmi_password, logger
)
# Hold to ensure the fence takes effect and system stabilizes # Hold to ensure the fence takes effect and system stabilizes
logger.out( logger.out(
'Waiting {}s for fence of node "{}" to take effect'.format( f"Waiting {config['keepalive_interval']}s for fence of node {node_name} to take effect",
config["keepalive_interval"], node_name
),
state="i", state="i",
prefix=f"fencing {node_name}",
) )
time.sleep(config["keepalive_interval"]) time.sleep(config["keepalive_interval"])
if fence_status: if fence_status:
logger.out('Marking node "{}" as fenced'.format(node_name), state="i") logger.out(
f"Marking node {node_name} as fenced",
state="i",
prefix=f"fencing {node_name}",
)
while True: while True:
try: try:
zkhandler.write([(("node.state.daemon", node_name), "fenced")]) zkhandler.write([(("node.state.daemon", node_name), "fenced")])
@ -86,7 +95,9 @@ def fence_node(node_name, zkhandler, config, logger):
# Force into secondary network state if needed # Force into secondary network state if needed
if node_name in config["coordinators"]: if node_name in config["coordinators"]:
logger.out( logger.out(
'Forcing secondary status for node "{}"'.format(node_name), state="i" f"Forcing secondary coordinator state for node {node_name}",
state="i",
prefix=f"fencing {node_name}",
) )
zkhandler.write([(("node.state.router", node_name), "secondary")]) zkhandler.write([(("node.state.router", node_name), "secondary")])
if zkhandler.read("base.config.primary_node") == node_name: if zkhandler.read("base.config.primary_node") == node_name:
@ -108,7 +119,9 @@ def fence_node(node_name, zkhandler, config, logger):
# Migrate hosts away from a fenced node # Migrate hosts away from a fenced node
def migrateFromFencedNode(zkhandler, node_name, config, logger): def migrateFromFencedNode(zkhandler, node_name, config, logger):
logger.out( logger.out(
'Migrating VMs from dead node "{}" to new hosts'.format(node_name), state="i" f"Migrating VMs from dead node {node_name} to new hosts",
state="i",
prefix=f"fencing {node_name}",
) )
# Get the list of VMs # Get the list of VMs
@ -121,14 +134,20 @@ def migrateFromFencedNode(zkhandler, node_name, config, logger):
# Migrate a VM after a flush # Migrate a VM after a flush
def fence_migrate_vm(dom_uuid): def fence_migrate_vm(dom_uuid):
logger.out(
f"Flushing locks of VM {dom_uuid} due to fence",
state="i",
prefix=f"fencing {node_name}",
)
vm_worker_flush_locks(zkhandler, None, dom_uuid, force_unlock=True) vm_worker_flush_locks(zkhandler, None, dom_uuid, force_unlock=True)
target_node = common.findTargetNode(zkhandler, dom_uuid) target_node = common.findTargetNode(zkhandler, dom_uuid)
if target_node is not None: if target_node is not None:
logger.out( logger.out(
'Migrating VM "{}" to node "{}"'.format(dom_uuid, target_node), f"Migrating VM {dom_uuid} to node {target_node}",
state="i", state="i",
prefix=f"fencing {node_name}",
) )
zkhandler.write( zkhandler.write(
[ [
@ -137,12 +156,16 @@ def migrateFromFencedNode(zkhandler, node_name, config, logger):
(("domain.last_node", dom_uuid), node_name), (("domain.last_node", dom_uuid), node_name),
] ]
) )
logger.out(
f"Successfully migrated running VM {dom_uuid} to node {target_node}",
state="o",
prefix=f"fencing {node_name}",
)
else: else:
logger.out( logger.out(
'No target node found for VM "{}"; VM will autostart on next unflush/ready of current node'.format( f"No target node found for VM {dom_uuid}; marking autostart=True on current node",
dom_uuid
),
state="i", state="i",
prefix=f"fencing {node_name}",
) )
zkhandler.write( zkhandler.write(
{ {
@ -150,6 +173,11 @@ def migrateFromFencedNode(zkhandler, node_name, config, logger):
(("domain.meta.autostart", dom_uuid), "True"), (("domain.meta.autostart", dom_uuid), "True"),
} }
) )
logger.out(
f"Successfully marked autostart for running VM {dom_uuid} on current node",
state="o",
prefix=f"fencing {node_name}",
)
# Loop through the VMs # Loop through the VMs
for dom_uuid in dead_node_running_domains: for dom_uuid in dead_node_running_domains:
@ -159,113 +187,141 @@ def migrateFromFencedNode(zkhandler, node_name, config, logger):
logger.out( logger.out(
f"Failed to migrate VM {dom_uuid}, continuing: {e}", f"Failed to migrate VM {dom_uuid}, continuing: {e}",
state="w", state="w",
prefix=f"fencing {node_name}",
) )
# Set node in flushed state for easy remigrating when it comes back # Set node in flushed state for easy remigrating when it comes back
zkhandler.write([(("node.state.domain", node_name), "flushed")]) zkhandler.write([(("node.state.domain", node_name), "flushed")])
logger.out( logger.out(
'All VMs flushed from dead node "{}" to new hosts'.format(node_name), state="i" f"All VMs flushed from dead node {node_name} to other nodes",
state="i",
prefix=f"fencing {node_name}",
) )
# #
# Perform an IPMI fence # Perform an IPMI fence
# #
def reboot_via_ipmi(ipmi_hostname, ipmi_user, ipmi_password, logger): def reboot_via_ipmi(node_name, ipmi_hostname, ipmi_user, ipmi_password, logger):
# Power off the node # Power off the node
logger.out("Sending power off to dead node", state="i") logger.out(
ipmi_command_stop = ( "Sending power off to dead node",
"/usr/bin/ipmitool -I lanplus -H {} -U {} -P {} chassis power off".format( state="i",
ipmi_hostname, ipmi_user, ipmi_password prefix=f"fencing {node_name}",
)
) )
ipmi_stop_retcode, ipmi_stop_stdout, ipmi_stop_stderr = common.run_os_command( ipmi_stop_retcode, ipmi_stop_stdout, ipmi_stop_stderr = common.run_os_command(
ipmi_command_stop f"/usr/bin/ipmitool -I lanplus -H {ipmi_hostname} -U {ipmi_user} -P {ipmi_password} chassis power off"
) )
if ipmi_stop_retcode != 0: if ipmi_stop_retcode != 0:
logger.out(f"Failed to power off dead node: {ipmi_stop_stderr}", state="e") logger.out(
f"Failed to power off dead node: {ipmi_stop_stderr}",
state="e",
prefix=f"fencing {node_name}",
)
logger.out(
"Waiting 5s for power off to take effect",
state="i",
prefix=f"fencing {node_name}",
)
time.sleep(5) time.sleep(5)
# Check the chassis power state # Check the chassis power state
logger.out("Checking power state of dead node", state="i") logger.out(
ipmi_command_status = ( "Checking power state of dead node",
"/usr/bin/ipmitool -I lanplus -H {} -U {} -P {} chassis power status".format( state="i",
ipmi_hostname, ipmi_user, ipmi_password prefix=f"fencing {node_name}",
)
) )
ipmi_status_retcode, ipmi_status_stdout, ipmi_status_stderr = common.run_os_command( ipmi_status_retcode, ipmi_status_stdout, ipmi_status_stderr = common.run_os_command(
ipmi_command_status f"/usr/bin/ipmitool -I lanplus -H {ipmi_hostname} -U {ipmi_user} -P {ipmi_password} chassis power status"
) )
if ipmi_status_retcode == 0: if ipmi_status_retcode == 0:
logger.out( logger.out(
f"Current chassis power state is: {ipmi_status_stdout.strip()}", state="i" f"Current chassis power state is: {ipmi_status_stdout.strip()}",
state="i",
prefix=f"fencing {node_name}",
) )
else: else:
logger.out("Current chassis power state is: Unknown", state="w") logger.out(
"Current chassis power state is: Unknown",
state="w",
prefix=f"fencing {node_name}",
)
# Power on the node # Power on the node
logger.out("Sending power on to dead node", state="i") logger.out(
ipmi_command_start = ( "Sending power on to dead node",
"/usr/bin/ipmitool -I lanplus -H {} -U {} -P {} chassis power on".format( state="i",
ipmi_hostname, ipmi_user, ipmi_password prefix=f"fencing {node_name}",
)
) )
ipmi_start_retcode, ipmi_start_stdout, ipmi_start_stderr = common.run_os_command( ipmi_start_retcode, ipmi_start_stdout, ipmi_start_stderr = common.run_os_command(
ipmi_command_start f"/usr/bin/ipmitool -I lanplus -H {ipmi_hostname} -U {ipmi_user} -P {ipmi_password} chassis power on"
) )
if ipmi_start_retcode != 0: if ipmi_start_retcode != 0:
logger.out(f"Failed to power on dead node: {ipmi_start_stderr}", state="w") logger.out(
f"Failed to power on dead node: {ipmi_start_stderr}",
state="w",
prefix=f"fencing {node_name}",
)
logger.out(
"Waiting 2s for power on to take effect",
state="i",
prefix=f"fencing {node_name}",
)
time.sleep(2) time.sleep(2)
# Check the chassis power state # Check the chassis power state
logger.out("Checking power state of dead node", state="i") logger.out(
ipmi_command_status = ( "Checking power state of dead node",
"/usr/bin/ipmitool -I lanplus -H {} -U {} -P {} chassis power status".format( state="i",
ipmi_hostname, ipmi_user, ipmi_password prefix=f"fencing {node_name}",
)
) )
ipmi_status_retcode, ipmi_status_stdout, ipmi_status_stderr = common.run_os_command( ipmi_status_retcode, ipmi_status_stdout, ipmi_status_stderr = common.run_os_command(
ipmi_command_status f"/usr/bin/ipmitool -I lanplus -H {ipmi_hostname} -U {ipmi_user} -P {ipmi_password} chassis power status"
) )
if ipmi_stop_retcode == 0: if ipmi_stop_retcode == 0:
if ipmi_status_stdout.strip() == "Chassis Power is on": if ipmi_status_stdout.strip() == "Chassis Power is on":
# We successfully rebooted the node and it is powered on; this is a successful fence # We successfully rebooted the node and it is powered on; this is a successful fence
logger.out("Successfully rebooted dead node", state="o") logger.out(
"Successfully rebooted dead node; proceeding with fence recovery action",
state="o",
prefix=f"fencing {node_name}",
)
return True return True
elif ipmi_status_stdout.strip() == "Chassis Power is off": elif ipmi_status_stdout.strip() == "Chassis Power is off":
# We successfully rebooted the node but it is powered off; this might be expected or not, but the node is confirmed off so we can call it a successful fence # We successfully rebooted the node but it is powered off; this might be expected or not, but the node is confirmed off so we can call it a successful fence
logger.out( logger.out(
"Chassis power is in confirmed off state after successfuly IPMI reboot; proceeding with fence-flush", "Chassis power is in confirmed off state after successfuly IPMI reboot; proceeding with fence recovery action",
state="o", state="o",
prefix=f"fencing {node_name}",
) )
return True return True
else: else:
# We successfully rebooted the node but it is in some unknown power state; since this might indicate a silent failure, we must call it a failed fence # We successfully rebooted the node but it is in some unknown power state; since this might indicate a silent failure, we must call it a failed fence
logger.out( logger.out(
"Chassis power is in an unknown state ({}) after successful IPMI reboot; not performing fence-flush".format( f"Chassis power is in an unknown state ({ipmi_status_stdout.strip()}) after successful IPMI reboot; NOT proceeding fence recovery action",
ipmi_status_stdout.strip()
),
state="e", state="e",
prefix=f"fencing {node_name}",
) )
return False return False
else: else:
if ipmi_status_stdout.strip() == "Chassis Power is off": if ipmi_status_stdout.strip() == "Chassis Power is off":
# We failed to reboot the node but it is powered off; it has probably suffered a serious hardware failure, but the node is confirmed off so we can call it a successful fence # We failed to reboot the node but it is powered off; it has probably suffered a serious hardware failure, but the node is confirmed off so we can call it a successful fence
logger.out( logger.out(
"Chassis power is in confirmed off state after failed IPMI reboot; proceeding with fence-flush", "Chassis power is in confirmed off state after failed IPMI reboot; proceeding with fence recovery action",
state="o", state="o",
prefix=f"fencing {node_name}",
) )
return True return True
else: else:
# We failed to reboot the node but it is in some unknown power state (including "on"); since this might indicate a silent failure, we must call it a failed fence # We failed to reboot the node but it is in some unknown power state (including "on"); since this might indicate a silent failure, we must call it a failed fence
logger.out( logger.out(
"Chassis power is not in confirmed off state after failed IPMI reboot; not performing fence-flush", "Chassis power is not in confirmed off state after failed IPMI reboot; NOT proceeding wiht fence recovery action",
state="e", state="e",
prefix=f"fencing {node_name}",
) )
return False return False