From c6d552ae57d53e4b5bacc644b13bd26f2fdb189d Mon Sep 17 00:00:00 2001 From: "Joshua M. Boniface" Date: Tue, 13 Jul 2021 17:17:14 -0400 Subject: [PATCH] Rework success checks for IPMI fencing Previously, if the node failed to restart, it was declared a "bad fence" and no further action would be taken. However, there are some situations, for instance critical hardware failures, where intelligent systems will not attempt (or succeed at) starting up the node in such a case, which would result in dead, known-offline nodes without recovery. Tweak this behaviour somewhat. The main path of Reboot -> Check On -> Success + fence-flush is retained, but some additional side-paths are now defined: 1. We attempt to power "on" the chassis 1 second after the reboot, just in case it is off and can be recovered. We then wait another 2 seconds and check the power status (as we did before). 2. If the reboot succeeded, follow this series of choices: a. If the chassis is on, the fence succeeded. b. If the chassis is off, the fence "succeeded" as well. c. If the chassis is in some other state, the fence failed. 3. If the reboot failed, follow this series of choices: a. If the chassis is off, the fence itself failed, but we can treat it as "succeeded" since the chassis is in a known-offline state. This is the most likely situation when there is a critical hardware failure, and the server's IPMI does not allow itself to start back up again. b. If the chassis is in any other state ("on" or unknown), the fence itself failed and we must treat this as a fence failure. Overall, this should alleviate the aforementioned issue of a critical failure rendering the node persistently "off" not triggering a fence-flush and ensure fencing is more robust. 
--- node-daemon/pvcnoded/fencing.py | 47 ++++++++++++++++++++++----------- 1 file changed, 31 insertions(+), 16 deletions(-) diff --git a/node-daemon/pvcnoded/fencing.py b/node-daemon/pvcnoded/fencing.py index ac312d25..c04050ee 100644 --- a/node-daemon/pvcnoded/fencing.py +++ b/node-daemon/pvcnoded/fencing.py @@ -133,31 +133,46 @@ def rebootViaIPMI(ipmi_hostname, ipmi_user, ipmi_password, logger): if ipmi_reset_retcode != 0: logger.out('Failed to reboot dead node', state='e') print(ipmi_reset_stderr) - return False + + time.sleep(1) + + # Power on the node (just in case it is offline) + ipmi_command_start = '/usr/bin/ipmitool -I lanplus -H {} -U {} -P {} chassis power on'.format( + ipmi_hostname, ipmi_user, ipmi_password + ) + ipmi_start_retcode, ipmi_start_stdout, ipmi_start_stderr = common.run_os_command(ipmi_command_start) time.sleep(2) - # Ensure the node is powered on + # Check the chassis power state + logger.out('Checking power state of dead node', state='i') ipmi_command_status = '/usr/bin/ipmitool -I lanplus -H {} -U {} -P {} chassis power status'.format( ipmi_hostname, ipmi_user, ipmi_password ) ipmi_status_retcode, ipmi_status_stdout, ipmi_status_stderr = common.run_os_command(ipmi_command_status) - # Trigger a power start if needed - if ipmi_status_stdout != "Chassis Power is on": - ipmi_command_start = '/usr/bin/ipmitool -I lanplus -H {} -U {} -P {} chassis power on'.format( - ipmi_hostname, ipmi_user, ipmi_password - ) - ipmi_start_retcode, ipmi_start_stdout, ipmi_start_stderr = common.run_os_command(ipmi_command_start) - - if ipmi_start_retcode != 0: - logger.out('Failed to start powered-off dead node', state='e') - print(ipmi_reset_stderr) + if ipmi_reset_retcode == 0: + if ipmi_status_stdout == "Chassis Power is on": + # We successfully rebooted the node and it is powered on; this is a succeessful fence + logger.out('Successfully rebooted dead node', state='o') + return True + elif ipmi_status_stdout == "Chassis Power is off": + # We successfully 
rebooted the node but it is powered off; this might be expected or not, but the node is confirmed off so we can call it a successful fence + logger.out('Chassis power is in confirmed off state after successfuly IPMI reboot; proceeding with fence-flush', state='o') + return True + else: + # We successfully rebooted the node but it is in some unknown power state; since this might indicate a silent failure, we must call it a failed fence + logger.out('Chassis power is in an unknown state after successful IPMI reboot; not performing fence-flush', state='e') + return False + else: + if ipmi_status_stdout == "Chassis Power is off": + # We failed to reboot the node but it is powered off; it has probably suffered a serious hardware failure, but the node is confirmed off so we can call it a successful fence + logger.out('Chassis power is in confirmed off state after failed IPMI reboot; proceeding with fence-flush', state='o') + return True + else: + # We failed to reboot the node but it is in some unknown power state (including "on"); since this might indicate a silent failure, we must call it a failed fence + logger.out('Chassis power is not in confirmed off state after failed IPMI reboot; not performing fence-flush', state='e') return False - - # Declare success - logger.out('Successfully rebooted dead node', state='o') - return True #