Compare commits

...

2 Commits

Author SHA1 Message Date
dcb9c0d12c Improve fence handling conditions
Use the intermediate output text when judging the fence status, rather
than the retcode of the stop command, as this should be more reliable.
2024-05-08 10:55:15 -04:00
f6e856bf98 Fix debug output on timeout 2024-05-06 10:49:57 -04:00
2 changed files with 19 additions and 12 deletions

View File

@ -108,9 +108,10 @@ class UploadProgressBar(object):
class ErrorResponse(requests.Response): class ErrorResponse(requests.Response):
def __init__(self, json_data, status_code): def __init__(self, json_data, status_code, headers):
self.json_data = json_data self.json_data = json_data
self.status_code = status_code self.status_code = status_code
self.headers = headers
def json(self): def json(self):
return self.json_data return self.json_data
@ -206,7 +207,7 @@ def call_api(
except Exception as e: except Exception as e:
message = "Failed to connect to the API: {}".format(e) message = "Failed to connect to the API: {}".format(e)
code = response.status_code if response else 504 code = response.status_code if response else 504
response = ErrorResponse({"message": message}, code) response = ErrorResponse({"message": message}, code, None)
# Display debug output # Display debug output
if config["debug"]: if config["debug"]:

View File

@ -253,12 +253,16 @@ def reboot_via_ipmi(node_name, ipmi_hostname, ipmi_user, ipmi_password, logger):
state="i", state="i",
prefix=f"fencing {node_name}", prefix=f"fencing {node_name}",
) )
ipmi_status_retcode, ipmi_status_stdout, ipmi_status_stderr = common.run_os_command( (
ipmi_intermediate_status_retcode,
ipmi_intermediate_status_stdout,
ipmi_intermediate_status_stderr,
) = common.run_os_command(
f"/usr/bin/ipmitool -I lanplus -H {ipmi_hostname} -U {ipmi_user} -P {ipmi_password} chassis power status" f"/usr/bin/ipmitool -I lanplus -H {ipmi_hostname} -U {ipmi_user} -P {ipmi_password} chassis power status"
) )
if ipmi_status_retcode == 0: if ipmi_intermediate_status_retcode == 0:
logger.out( logger.out(
f"Current chassis power state is: {ipmi_status_stdout.strip()}", f"Current chassis power state is: {ipmi_intermediate_status_stdout.strip()}",
state="i", state="i",
prefix=f"fencing {node_name}", prefix=f"fencing {node_name}",
) )
@ -299,12 +303,14 @@ def reboot_via_ipmi(node_name, ipmi_hostname, ipmi_user, ipmi_password, logger):
state="i", state="i",
prefix=f"fencing {node_name}", prefix=f"fencing {node_name}",
) )
ipmi_status_retcode, ipmi_status_stdout, ipmi_status_stderr = common.run_os_command( ipmi_final_status_retcode, ipmi_final_status_stdout, ipmi_final_status_stderr = (
f"/usr/bin/ipmitool -I lanplus -H {ipmi_hostname} -U {ipmi_user} -P {ipmi_password} chassis power status" common.run_os_command(
f"/usr/bin/ipmitool -I lanplus -H {ipmi_hostname} -U {ipmi_user} -P {ipmi_password} chassis power status"
)
) )
if ipmi_stop_retcode == 0: if ipmi_intermediate_status_stdout.strip() == "Chassis power is off":
if ipmi_status_stdout.strip() == "Chassis Power is on": if ipmi_final_status_stdout.strip() == "Chassis Power is on":
# We successfully rebooted the node and it is powered on; this is a successful fence # We successfully rebooted the node and it is powered on; this is a successful fence
logger.out( logger.out(
"Successfully rebooted dead node; proceeding with fence recovery action", "Successfully rebooted dead node; proceeding with fence recovery action",
@ -312,7 +318,7 @@ def reboot_via_ipmi(node_name, ipmi_hostname, ipmi_user, ipmi_password, logger):
prefix=f"fencing {node_name}", prefix=f"fencing {node_name}",
) )
return True return True
elif ipmi_status_stdout.strip() == "Chassis Power is off": elif ipmi_final_status_stdout.strip() == "Chassis Power is off":
# We successfully rebooted the node but it is powered off; this might be expected or not, but the node is confirmed off so we can call it a successful fence # We successfully rebooted the node but it is powered off; this might be expected or not, but the node is confirmed off so we can call it a successful fence
logger.out( logger.out(
"Chassis power is in confirmed off state after successfuly IPMI reboot; proceeding with fence recovery action", "Chassis power is in confirmed off state after successfuly IPMI reboot; proceeding with fence recovery action",
@ -323,13 +329,13 @@ def reboot_via_ipmi(node_name, ipmi_hostname, ipmi_user, ipmi_password, logger):
else: else:
# We successfully rebooted the node but it is in some unknown power state; since this might indicate a silent failure, we must call it a failed fence # We successfully rebooted the node but it is in some unknown power state; since this might indicate a silent failure, we must call it a failed fence
logger.out( logger.out(
f"Chassis power is in an unknown state ({ipmi_status_stdout.strip()}) after successful IPMI reboot; NOT proceeding fence recovery action", f"Chassis power is in an unknown state ({ipmi_final_status_stdout.strip()}) after successful IPMI reboot; NOT proceeding fence recovery action",
state="e", state="e",
prefix=f"fencing {node_name}", prefix=f"fencing {node_name}",
) )
return False return False
else: else:
if ipmi_status_stdout.strip() == "Chassis Power is off": if ipmi_final_status_stdout.strip() == "Chassis Power is off":
# We failed to reboot the node but it is powered off; it has probably suffered a serious hardware failure, but the node is confirmed off so we can call it a successful fence # We failed to reboot the node but it is powered off; it has probably suffered a serious hardware failure, but the node is confirmed off so we can call it a successful fence
logger.out( logger.out(
"Chassis power is in confirmed off state after failed IPMI reboot; proceeding with fence recovery action", "Chassis power is in confirmed off state after failed IPMI reboot; proceeding with fence recovery action",