Fix several bugs in fence handling

1. Output from ipmitool was not being stripped, and stray newlines were
throwing off the comparisons. Fixes this.

2. Several stages were lacking meaningful messages. Adds these in so the
output is more clear about what is going on.

3. Reduce the sleep time after a fence to just 1x the
keepalive_interval, rather than 2x, because this seemed like excessively
long even for slow IPMI interfaces, especially since we're checking the
power state now anyways.

4. Set the node daemon state to an explicit 'fenced' state after a
successful fence to indicate to users that the node was indeed fenced
successfully and not still 'dead'.
This commit is contained in:
Joshua Boniface 2021-09-26 20:07:30 -04:00
parent 9f41373324
commit c8134d3a1c
1 changed files with 14 additions and 6 deletions

View File

@ -56,8 +56,15 @@ def fence_node(node_name, zkhandler, config, logger):
# Shoot it in the head # Shoot it in the head
fence_status = reboot_via_ipmi(ipmi_hostname, ipmi_username, ipmi_password, logger) fence_status = reboot_via_ipmi(ipmi_hostname, ipmi_username, ipmi_password, logger)
# Hold to ensure the fence takes effect and system stabilizes # Hold to ensure the fence takes effect and system stabilizes
time.sleep(config['keepalive_interval'] * 2) logger.out('Waiting {}s for fence of node "{}" to take effect'.format(config['keepalive_interval'], node_name), state='i')
time.sleep(config['keepalive_interval'])
if fence_status:
logger.out('Marking node "{}" as fenced'.format(node_name), state='i')
zkhandler.write([
(('node.state.daemon', node_name), 'fenced')
])
# Force into secondary network state if needed # Force into secondary network state if needed
if node_name in config['coordinators']: if node_name in config['coordinators']:
@ -119,6 +126,7 @@ def migrateFromFencedNode(zkhandler, node_name, config, logger):
zkhandler.write([ zkhandler.write([
(('node.state.domain', node_name), 'flushed') (('node.state.domain', node_name), 'flushed')
]) ])
logger.out('All VMs flushed from dead node "{}" to new hosts'.format(node_name), state='i')
# #
@ -152,20 +160,20 @@ def reboot_via_ipmi(ipmi_hostname, ipmi_user, ipmi_password, logger):
ipmi_status_retcode, ipmi_status_stdout, ipmi_status_stderr = common.run_os_command(ipmi_command_status) ipmi_status_retcode, ipmi_status_stdout, ipmi_status_stderr = common.run_os_command(ipmi_command_status)
if ipmi_reset_retcode == 0: if ipmi_reset_retcode == 0:
if ipmi_status_stdout == "Chassis Power is on": if ipmi_status_stdout.strip() == "Chassis Power is on":
# We successfully rebooted the node and it is powered on; this is a succeessful fence # We successfully rebooted the node and it is powered on; this is a succeessful fence
logger.out('Successfully rebooted dead node', state='o') logger.out('Successfully rebooted dead node', state='o')
return True return True
elif ipmi_status_stdout == "Chassis Power is off": elif ipmi_status_stdout.strip() == "Chassis Power is off":
# We successfully rebooted the node but it is powered off; this might be expected or not, but the node is confirmed off so we can call it a successful fence # We successfully rebooted the node but it is powered off; this might be expected or not, but the node is confirmed off so we can call it a successful fence
logger.out('Chassis power is in confirmed off state after successfuly IPMI reboot; proceeding with fence-flush', state='o') logger.out('Chassis power is in confirmed off state after successfuly IPMI reboot; proceeding with fence-flush', state='o')
return True return True
else: else:
# We successfully rebooted the node but it is in some unknown power state; since this might indicate a silent failure, we must call it a failed fence # We successfully rebooted the node but it is in some unknown power state; since this might indicate a silent failure, we must call it a failed fence
logger.out('Chassis power is in an unknown state after successful IPMI reboot; not performing fence-flush', state='e') logger.out('Chassis power is in an unknown state ({}) after successful IPMI reboot; not performing fence-flush'.format(ipmi_status_stdout.strip()), state='e')
return False return False
else: else:
if ipmi_status_stdout == "Chassis Power is off": if ipmi_status_stdout.strip() == "Chassis Power is off":
# We failed to reboot the node but it is powered off; it has probably suffered a serious hardware failure, but the node is confirmed off so we can call it a successful fence # We failed to reboot the node but it is powered off; it has probably suffered a serious hardware failure, but the node is confirmed off so we can call it a successful fence
logger.out('Chassis power is in confirmed off state after failed IPMI reboot; proceeding with fence-flush', state='o') logger.out('Chassis power is in confirmed off state after failed IPMI reboot; proceeding with fence-flush', state='o')
return True return True
@ -181,7 +189,7 @@ def reboot_via_ipmi(ipmi_hostname, ipmi_user, ipmi_password, logger):
def verify_ipmi(ipmi_hostname, ipmi_user, ipmi_password): def verify_ipmi(ipmi_hostname, ipmi_user, ipmi_password):
ipmi_command = f'/usr/bin/ipmitool -I lanplus -H {ipmi_hostname} -U {ipmi_user} -P {ipmi_password} chassis power status' ipmi_command = f'/usr/bin/ipmitool -I lanplus -H {ipmi_hostname} -U {ipmi_user} -P {ipmi_password} chassis power status'
retcode, stdout, stderr = common.run_os_command(ipmi_command, timeout=2) retcode, stdout, stderr = common.run_os_command(ipmi_command, timeout=2)
if retcode == 0 and stdout != "Chassis Power is on": if retcode == 0 and stdout.strip() != "Chassis Power is on":
return True return True
else: else:
return False return False