2018-10-22 20:20:27 -04:00
#!/usr/bin/env python3
2021-08-21 02:46:11 -04:00
# fencing.py - Utility functions for pvcnoded fencing
2018-10-22 20:20:27 -04:00
# Part of the Parallel Virtual Cluster (PVC) system
#
2023-12-29 11:16:59 -05:00
# Copyright (C) 2018-2024 Joshua M. Boniface <joshua@boniface.me>
2018-10-22 20:20:27 -04:00
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
2021-03-25 16:57:17 -04:00
# the Free Software Foundation, version 3.
2018-10-22 20:20:27 -04:00
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.
#
###############################################################################
import time
2024-10-10 16:38:19 -04:00
from kazoo . exceptions import LockTimeout
2021-06-01 12:17:25 -04:00
import daemon_lib . common as common
2021-08-21 02:46:11 -04:00
2023-11-10 01:28:41 -05:00
from daemon_lib . vm import vm_worker_flush_locks
2018-10-22 20:20:27 -04:00
2020-11-07 14:45:24 -05:00
2018-10-22 20:20:27 -04:00
#
2024-10-10 16:38:19 -04:00
# Fence monitor thread entrypoint
#
def fence_monitor(zkhandler, config, logger):
    """
    Check every node once and start fencing for any that has stopped checking in.

    The whole pass runs under the exclusive lock on "base.config.fence_lock". If
    the lock cannot be acquired within (keepalive_interval - 1) seconds, the pass
    is skipped, since another node is already processing fences.

    A node is considered dead when its daemon state is "run" but its last
    keepalive is older than keepalive_interval * fence_intervals seconds.

    Arguments:
        zkhandler: handler providing exclusivelock/writelock/children/read/write
        config: mapping with "keepalive_interval" and "fence_intervals"
        logger: object providing out(message, state=..., prefix=...)
    """
    fence_lock = zkhandler.exclusivelock("base.config.fence_lock")

    try:
        fence_lock.acquire(timeout=config["keepalive_interval"] - 1)

        for node_name in zkhandler.children("base.node"):
            # If the node's details cannot be read, fall back to values that can
            # never satisfy the "dead" test below
            try:
                daemon_state = zkhandler.read(("node.state.daemon", node_name))
                last_keepalive = int(zkhandler.read(("node.keepalive", node_name)))
            except Exception:
                daemon_state, last_keepalive = "unknown", 0

            # Keepalives older than this timestamp count as missed. This is
            # recomputed per node because fencing a previous node blocks this loop.
            deadline = int(time.time()) - (
                int(config["keepalive_interval"]) * int(config["fence_intervals"])
            )

            if not (last_keepalive < deadline and daemon_state == "run"):
                logger.out(
                    f"Node {node_name} is OK; last checkin is {deadline - last_keepalive}s from threshold, node state is '{daemon_state}'",
                    state="d",
                    prefix="fence-thread",
                )
                continue

            logger.out(
                f"Node {node_name} seems dead; starting monitor for fencing",
                state="w",
            )

            with zkhandler.writelock(("node.state.daemon", node_name)):
                # If we lost the lock race, the winner has already marked the node
                # dead; re-check so we do not start a second fence for it
                if zkhandler.read(("node.state.daemon", node_name)) != "dead":
                    # Mark the node dead first, then run the fence
                    zkhandler.write([(("node.state.daemon", node_name), "dead")])
                    # NOTE: This is a direct call, not a subthread, and it blocks
                    # this loop on purpose so only one node is fenced at a time
                    fence_node(zkhandler, config, logger, node_name)
    except LockTimeout:
        logger.out(
            "Fence monitor thread failed to acquire exclusive lock; skipping", state="i"
        )
    except Exception as e:
        logger.out(f"Fence monitor thread failed: {e}", state="w")
    finally:
        # The pass is over either way, so give up the global lock
        fence_lock.release()
#
# Fence action function
2018-10-22 20:20:27 -04:00
#
2024-10-10 16:38:19 -04:00
def fence_node(zkhandler, config, logger, node_name):
    """
    Give a presumed-dead node a series of "saving throws", then fence it via IPMI.

    The node's daemon state is polled once per keepalive interval. If it ever
    reads anything other than "dead", the node is considered alive again and the
    fence is cancelled. After six consecutive "dead" readings the node is
    power-cycled with reboot_via_ipmi(), and then:

      * on a successful fence, its daemon state is set to "fenced";
      * if it is a coordinator, its router state is forced to "secondary" and, if
        it was the primary node, the primary is reset to "none";
      * its VMs are migrated away, depending on the "successful_fence" and
        "failed_fence" policies;
      * its resource counters are reset to zero.

    Arguments:
        zkhandler: handler providing read/write
        config: mapping with "keepalive_interval", "coordinators",
                "successful_fence", "failed_fence" and "suicide_intervals"
        logger: object providing out(message, state=..., prefix=...)
        node_name: name of the node to fence

    Returns None in all cases.
    """
    log_prefix = f"fencing {node_name}"

    # The node gets exactly 6 saving throws, one keepalive interval apart, to come
    # back online before we kill it
    failcount_limit = 6
    failcount = 0
    while failcount < failcount_limit:
        # Wait one keepalive interval
        time.sleep(config["keepalive_interval"])
        # Get the state
        node_daemon_state = zkhandler.read(("node.state.daemon", node_name))
        # Is it still 'dead'
        if node_daemon_state == "dead":
            failcount += 1
            logger.out(
                f"Node {node_name} failed {failcount}/{failcount_limit} saving throws",
                state="s",
                prefix=log_prefix,
            )
        # It changed back to something else so it must be alive
        else:
            logger.out(
                f"Node {node_name} passed a saving throw; cancelling fence",
                state="o",
                prefix=log_prefix,
            )
            return

    logger.out(
        f"Fencing node {node_name} via IPMI reboot signal",
        state="s",
        prefix=log_prefix,
    )

    # Get IPMI information
    ipmi_hostname = zkhandler.read(("node.ipmi.hostname", node_name))
    ipmi_username = zkhandler.read(("node.ipmi.username", node_name))
    ipmi_password = zkhandler.read(("node.ipmi.password", node_name))

    # Shoot it in the head
    fence_status = reboot_via_ipmi(
        node_name, ipmi_hostname, ipmi_username, ipmi_password, logger
    )

    # Hold to ensure the fence takes effect and system stabilizes
    logger.out(
        f"Waiting {config['keepalive_interval']}s for fence of node {node_name} to take effect",
        state="i",
        prefix=log_prefix,
    )
    time.sleep(config["keepalive_interval"])

    if fence_status:
        logger.out(
            f"Marking node {node_name} as fenced",
            state="i",
            prefix=log_prefix,
        )
        # This write must eventually succeed, so keep retrying. Report each
        # failure and pause between attempts so a persistent error neither goes
        # unnoticed nor spins the CPU.
        while True:
            try:
                zkhandler.write([(("node.state.daemon", node_name), "fenced")])
                break
            except Exception as e:
                logger.out(
                    f"Failed to mark node {node_name} as fenced, retrying: {e}",
                    state="w",
                    prefix=log_prefix,
                )
                time.sleep(1)

    # Force into secondary network state if needed
    if node_name in config["coordinators"]:
        logger.out(
            f"Forcing secondary coordinator state for node {node_name}",
            state="i",
            prefix=log_prefix,
        )
        zkhandler.write([(("node.state.router", node_name), "secondary")])
        if zkhandler.read("base.config.primary_node") == node_name:
            zkhandler.write([("base.config.primary_node", "none")])

    # If the fence succeeded and successful_fence is migrate
    if fence_status and config["successful_fence"] == "migrate":
        migrateFromFencedNode(zkhandler, node_name, config, logger)

    # If the fence failed and failed_fence is migrate, only migrate when node
    # suicide is enabled. The value is normalised to a string so that an integer
    # 0 and the string "0" both count as disabled.
    if (
        not fence_status
        and config["failed_fence"] == "migrate"
        and str(config["suicide_intervals"]) != "0"
    ):
        migrateFromFencedNode(zkhandler, node_name, config, logger)

    # Reset all node resource values
    logger.out(
        f"Resetting all resource values for dead node {node_name} to zero",
        state="i",
        prefix=log_prefix,
    )
    zkhandler.write(
        [
            (("node.running_domains", node_name), "0"),
            (("node.count.provisioned_domains", node_name), "0"),
            (("node.cpu.load", node_name), "0"),
            (("node.vcpu.allocated", node_name), "0"),
            (("node.memory.total", node_name), "0"),
            (("node.memory.used", node_name), "0"),
            (("node.memory.free", node_name), "0"),
            (("node.memory.allocated", node_name), "0"),
            (("node.memory.provisioned", node_name), "0"),
            (("node.monitoring.health", node_name), None),
        ]
    )
2020-11-07 14:45:24 -05:00
2018-10-22 20:20:27 -04:00
# Migrate hosts away from a fenced node
2021-06-01 11:53:21 -04:00
def migrateFromFencedNode(zkhandler, node_name, config, logger):
    """
    Move every VM recorded as running on a fenced node to another node.

    The node's domain state is set to "fence-flush" while this runs and to
    "flushed" once every VM has been handled. For each VM, its locks are
    force-flushed and a target node is looked up. If one is found, the VM is set
    to start there; otherwise it is marked stopped with autostart enabled. A
    failure on one VM is logged and does not stop the remaining VMs.

    Arguments:
        zkhandler: handler providing read/write
        node_name: name of the fenced node
        config: daemon configuration (not used here; kept for the call signature)
        logger: object providing out(message, state=..., prefix=...)
    """
    log_prefix = f"fencing {node_name}"

    logger.out(
        f"Migrating VMs from dead node {node_name} to new hosts",
        state="i",
        prefix=log_prefix,
    )

    # Get the list of VMs
    dead_node_running_domains = zkhandler.read(
        ("node.running_domains", node_name)
    ).split()

    # Set the node to a custom domainstate so we know what's happening
    zkhandler.write([(("node.state.domain", node_name), "fence-flush")])

    # Migrate a VM after a flush
    def fence_migrate_vm(dom_uuid):
        logger.out(
            f"Flushing locks of VM {dom_uuid} due to fence",
            state="i",
            prefix=log_prefix,
        )
        vm_worker_flush_locks(zkhandler, None, dom_uuid, force_unlock=True)

        target_node = common.findTargetNode(zkhandler, dom_uuid)

        if target_node is not None:
            logger.out(
                f"Migrating VM {dom_uuid} to node {target_node}",
                state="i",
                prefix=log_prefix,
            )
            zkhandler.write(
                [
                    (("domain.state", dom_uuid), "start"),
                    (("domain.node", dom_uuid), target_node),
                    (("domain.last_node", dom_uuid), node_name),
                ]
            )
            logger.out(
                f"Successfully migrated running VM {dom_uuid} to node {target_node}",
                state="o",
                prefix=log_prefix,
            )
        else:
            logger.out(
                f"No target node found for VM {dom_uuid}; marking autostart=True on current node",
                state="i",
                prefix=log_prefix,
            )
            # Pass a list, like every other write in this file. The previous set
            # literal gave the key/value pairs no defined order.
            zkhandler.write(
                [
                    (("domain.state", dom_uuid), "stopped"),
                    (("domain.meta.autostart", dom_uuid), "True"),
                ]
            )
            logger.out(
                f"Successfully marked autostart for running VM {dom_uuid} on current node",
                state="o",
                prefix=log_prefix,
            )

    # Loop through the VMs
    for dom_uuid in dead_node_running_domains:
        if dom_uuid in ("0", 0):
            # Skip the invalid "0" UUID we sometimes get
            continue
        try:
            fence_migrate_vm(dom_uuid)
        except Exception as e:
            logger.out(
                f"Failed to migrate VM {dom_uuid}, continuing: {e}",
                state="w",
                prefix=log_prefix,
            )

    # Set node in flushed state for easy remigrating when it comes back
    zkhandler.write([(("node.state.domain", node_name), "flushed")])
    logger.out(
        f"All VMs flushed from dead node {node_name} to other nodes",
        state="i",
        prefix=log_prefix,
    )
2018-10-22 20:20:27 -04:00
2020-11-07 14:45:24 -05:00
2018-10-22 20:20:27 -04:00
#
# Perform an IPMI fence
#
2023-11-10 09:30:34 -05:00
def reboot_via_ipmi ( node_name , ipmi_hostname , ipmi_user , ipmi_password , logger ) :
2021-10-12 10:59:09 -04:00
# Power off the node the node
2023-11-10 09:30:34 -05:00
logger . out (
" Sending power off to dead node " ,
state = " i " ,
prefix = f " fencing { node_name } " ,
2021-11-06 03:02:43 -04:00
)
ipmi_stop_retcode , ipmi_stop_stdout , ipmi_stop_stderr = common . run_os_command (
2023-11-10 09:30:34 -05:00
f " /usr/bin/ipmitool -I lanplus -H { ipmi_hostname } -U { ipmi_user } -P { ipmi_password } chassis power off "
2018-10-22 20:20:27 -04:00
)
2021-10-12 10:59:09 -04:00
if ipmi_stop_retcode != 0 :
2023-11-10 09:30:34 -05:00
logger . out (
f " Failed to power off dead node: { ipmi_stop_stderr } " ,
state = " e " ,
prefix = f " fencing { node_name } " ,
)
Rework success checks for IPMI fencing
Previously, if the node failed to restart, it was declared a "bad fence"
and no further action would be taken. However, there are some
situations, for instance critical hardware failures, where intelligent
systems will not attempt (or succeed at) starting up the node in such a
case, which would result in dead, known-offline nodes without recovery.
Tweak this behaviour somewhat. The main path of Reboot -> Check On ->
Success + fence-flush is retained, but some additional side-paths are
now defined:
1. We attempt to power "on" the chassis 1 second after the reboot, just
in case it is off and can be recovered. We then wait another 2 seconds
and check the power status (as we did before).
2. If the reboot succeeded, follow this series of choices:
a. If the chassis is on, the fence succeeded.
b. If the chassis is off, the fence "succeeded" as well.
c. If the chassis is in some other state, the fence failed.
3. If the reboot failed, follow this series of choices:
a. If the chassis is off, the fence itself failed, but we can treat
it as "succeeded" since the chassis is in a known-offline state.
This is the most likely situation when there is a critical hardware
failure, and the server's IPMI does not allow itself to start back
up again.
b. If the chassis is in any other state ("on" or unknown), the fence
itself failed and we must treat this as a fence failure.
Overall, this should alleviate the aforementioned issue of a critical
failure rendering the node persistently "off" not triggering a
fence-flush and ensure fencing is more robust.
2021-07-13 17:17:14 -04:00
2023-11-10 09:30:34 -05:00
logger . out (
" Waiting 5s for power off to take effect " ,
state = " i " ,
prefix = f " fencing { node_name } " ,
)
2021-10-27 16:24:17 -04:00
time . sleep ( 5 )
Rework success checks for IPMI fencing
Previously, if the node failed to restart, it was declared a "bad fence"
and no further action would be taken. However, there are some
situations, for instance critical hardware failures, where intelligent
systems will not attempt (or succeed at) starting up the node in such a
case, which would result in dead, known-offline nodes without recovery.
Tweak this behaviour somewhat. The main path of Reboot -> Check On ->
Success + fence-flush is retained, but some additional side-paths are
now defined:
1. We attempt to power "on" the chassis 1 second after the reboot, just
in case it is off and can be recovered. We then wait another 2 seconds
and check the power status (as we did before).
2. If the reboot succeeded, follow this series of choices:
a. If the chassis is on, the fence succeeded.
b. If the chassis is off, the fence "succeeded" as well.
c. If the chassis is in some other state, the fence failed.
3. If the reboot failed, follow this series of choices:
a. If the chassis is off, the fence itself failed, but we can treat
it as "succeeded" since the chassis is in a known-offline state.
This is the most likely situation when there is a critical hardware
failure, and the server's IPMI does not allow itself to start back
up again.
b. If the chassis is in any other state ("on" or unknown), the fence
itself failed and we must treat this as a fence failure.
Overall, this should alleviate the aforementioned issue of a critical
failure rendering the node persistently "off" not triggering a
fence-flush and ensure fencing is more robust.
2021-07-13 17:17:14 -04:00
2021-10-12 10:59:09 -04:00
# Check the chassis power state
2023-11-10 09:30:34 -05:00
logger . out (
" Checking power state of dead node " ,
state = " i " ,
prefix = f " fencing { node_name } " ,
2021-11-06 03:02:43 -04:00
)
2024-05-08 10:55:15 -04:00
(
ipmi_intermediate_status_retcode ,
ipmi_intermediate_status_stdout ,
ipmi_intermediate_status_stderr ,
) = common . run_os_command (
2023-11-10 09:30:34 -05:00
f " /usr/bin/ipmitool -I lanplus -H { ipmi_hostname } -U { ipmi_user } -P { ipmi_password } chassis power status "
2021-10-12 10:59:09 -04:00
)
2024-05-08 10:55:15 -04:00
if ipmi_intermediate_status_retcode == 0 :
2021-11-06 03:02:43 -04:00
logger . out (
2024-05-08 10:55:15 -04:00
f " Current chassis power state is: { ipmi_intermediate_status_stdout . strip ( ) } " ,
2023-11-10 09:30:34 -05:00
state = " i " ,
prefix = f " fencing { node_name } " ,
2021-11-06 03:02:43 -04:00
)
2021-10-12 10:59:09 -04:00
else :
2023-11-10 09:30:34 -05:00
logger . out (
" Current chassis power state is: Unknown " ,
state = " w " ,
prefix = f " fencing { node_name } " ,
)
2021-10-12 10:59:09 -04:00
# Power on the node
2023-11-10 09:30:34 -05:00
logger . out (
" Sending power on to dead node " ,
state = " i " ,
prefix = f " fencing { node_name } " ,
2021-11-06 03:02:43 -04:00
)
ipmi_start_retcode , ipmi_start_stdout , ipmi_start_stderr = common . run_os_command (
2023-11-10 09:30:34 -05:00
f " /usr/bin/ipmitool -I lanplus -H { ipmi_hostname } -U { ipmi_user } -P { ipmi_password } chassis power on "
Rework success checks for IPMI fencing
Previously, if the node failed to restart, it was declared a "bad fence"
and no further action would be taken. However, there are some
situations, for instance critical hardware failures, where intelligent
systems will not attempt (or succeed at) starting up the node in such a
case, which would result in dead, known-offline nodes without recovery.
Tweak this behaviour somewhat. The main path of Reboot -> Check On ->
Success + fence-flush is retained, but some additional side-paths are
now defined:
1. We attempt to power "on" the chassis 1 second after the reboot, just
in case it is off and can be recovered. We then wait another 2 seconds
and check the power status (as we did before).
2. If the reboot succeeded, follow this series of choices:
a. If the chassis is on, the fence succeeded.
b. If the chassis is off, the fence "succeeded" as well.
c. If the chassis is in some other state, the fence failed.
3. If the reboot failed, follow this series of choices:
a. If the chassis is off, the fence itself failed, but we can treat
it as "succeeded" since the chassis is in a known-offline state.
This is the most likely situation when there is a critical hardware
failure, and the server's IPMI does not allow itself to start back
up again.
b. If the chassis is in any other state ("on" or unknown), the fence
itself failed and we must treat this as a fence failure.
Overall, this should alleviate the aforementioned issue of a critical
failure rendering the node persistently "off" not triggering a
fence-flush and ensure fencing is more robust.
2021-07-13 17:17:14 -04:00
)
2020-12-15 02:45:38 -05:00
2021-10-12 10:59:09 -04:00
if ipmi_start_retcode != 0 :
2023-11-10 09:30:34 -05:00
logger . out (
f " Failed to power on dead node: { ipmi_start_stderr } " ,
state = " w " ,
prefix = f " fencing { node_name } " ,
)
2021-10-12 10:59:09 -04:00
2023-11-10 09:30:34 -05:00
logger . out (
" Waiting 2s for power on to take effect " ,
state = " i " ,
prefix = f " fencing { node_name } " ,
)
2019-08-07 11:35:49 -04:00
time . sleep ( 2 )
2018-11-23 20:02:31 -05:00
Rework success checks for IPMI fencing
Previously, if the node failed to restart, it was declared a "bad fence"
and no further action would be taken. However, there are some
situations, for instance critical hardware failures, where intelligent
systems will not attempt (or succeed at) starting up the node in such a
case, which would result in dead, known-offline nodes without recovery.
Tweak this behaviour somewhat. The main path of Reboot -> Check On ->
Success + fence-flush is retained, but some additional side-paths are
now defined:
1. We attempt to power "on" the chassis 1 second after the reboot, just
in case it is off and can be recovered. We then wait another 2 seconds
and check the power status (as we did before).
2. If the reboot succeeded, follow this series of choices:
a. If the chassis is on, the fence succeeded.
b. If the chassis is off, the fence "succeeded" as well.
c. If the chassis is in some other state, the fence failed.
3. If the reboot failed, follow this series of choices:
a. If the chassis is off, the fence itself failed, but we can treat
it as "succeeded" since the chassis is in a known-offline state.
This is the most likely situation when there is a critical hardware
failure, and the server's IPMI does not allow itself to start back
up again.
b. If the chassis is in any other state ("on" or unknown), the fence
itself failed and we must treat this as a fence failure.
Overall, this should alleviate the aforementioned issue of a critical
failure rendering the node persistently "off" not triggering a
fence-flush and ensure fencing is more robust.
2021-07-13 17:17:14 -04:00
# Check the chassis power state
2023-11-10 09:30:34 -05:00
logger . out (
" Checking power state of dead node " ,
state = " i " ,
prefix = f " fencing { node_name } " ,
2021-11-06 03:02:43 -04:00
)
2024-05-08 10:55:15 -04:00
ipmi_final_status_retcode , ipmi_final_status_stdout , ipmi_final_status_stderr = (
common . run_os_command (
f " /usr/bin/ipmitool -I lanplus -H { ipmi_hostname } -U { ipmi_user } -P { ipmi_password } chassis power status "
)
2018-11-23 20:02:31 -05:00
)
2024-05-08 10:55:15 -04:00
if ipmi_intermediate_status_stdout . strip ( ) == " Chassis power is off " :
if ipmi_final_status_stdout . strip ( ) == " Chassis Power is on " :
Rework success checks for IPMI fencing
Previously, if the node failed to restart, it was declared a "bad fence"
and no further action would be taken. However, there are some
situations, for instance critical hardware failures, where intelligent
systems will not attempt (or succeed at) starting up the node in such a
case, which would result in dead, known-offline nodes without recovery.
Tweak this behaviour somewhat. The main path of Reboot -> Check On ->
Success + fence-flush is retained, but some additional side-paths are
now defined:
1. We attempt to power "on" the chassis 1 second after the reboot, just
in case it is off and can be recovered. We then wait another 2 seconds
and check the power status (as we did before).
2. If the reboot succeeded, follow this series of choices:
a. If the chassis is on, the fence succeeded.
b. If the chassis is off, the fence "succeeded" as well.
c. If the chassis is in some other state, the fence failed.
3. If the reboot failed, follow this series of choices:
a. If the chassis is off, the fence itself failed, but we can treat
it as "succeeded" since the chassis is in a known-offline state.
This is the most likely situation when there is a critical hardware
failure, and the server's IPMI does not allow itself to start back
up again.
b. If the chassis is in any other state ("on" or unknown), the fence
itself failed and we must treat this as a fence failure.
Overall, this should alleviate the aforementioned issue of a critical
failure rendering the node persistently "off" not triggering a
fence-flush and ensure fencing is more robust.
2021-07-13 17:17:14 -04:00
# We successfully rebooted the node and it is powered on; this is a succeessful fence
2023-11-10 09:30:34 -05:00
logger . out (
" Successfully rebooted dead node; proceeding with fence recovery action " ,
state = " o " ,
prefix = f " fencing { node_name } " ,
)
Rework success checks for IPMI fencing
Previously, if the node failed to restart, it was declared a "bad fence"
and no further action would be taken. However, there are some
situations, for instance critical hardware failures, where intelligent
systems will not attempt (or succeed at) starting up the node in such a
case, which would result in dead, known-offline nodes without recovery.
Tweak this behaviour somewhat. The main path of Reboot -> Check On ->
Success + fence-flush is retained, but some additional side-paths are
now defined:
1. We attempt to power "on" the chassis 1 second after the reboot, just
in case it is off and can be recovered. We then wait another 2 seconds
and check the power status (as we did before).
2. If the reboot succeeded, follow this series of choices:
a. If the chassis is on, the fence succeeded.
b. If the chassis is off, the fence "succeeded" as well.
c. If the chassis is in some other state, the fence failed.
3. If the reboot failed, follow this series of choices:
a. If the chassis is off, the fence itself failed, but we can treat
it as "succeeded" since the chassis is in a known-offline state.
This is the most likely situation when there is a critical hardware
failure, and the server's IPMI does not allow itself to start back
up again.
b. If the chassis is in any other state ("on" or unknown), the fence
itself failed and we must treat this as a fence failure.
Overall, this should alleviate the aforementioned issue of a critical
failure rendering the node persistently "off" not triggering a
fence-flush and ensure fencing is more robust.
2021-07-13 17:17:14 -04:00
return True
2024-05-08 10:55:15 -04:00
elif ipmi_final_status_stdout . strip ( ) == " Chassis Power is off " :
Rework success checks for IPMI fencing
Previously, if the node failed to restart, it was declared a "bad fence"
and no further action would be taken. However, there are some
situations, for instance critical hardware failures, where intelligent
systems will not attempt (or succeed at) starting up the node in such a
case, which would result in dead, known-offline nodes without recovery.
Tweak this behaviour somewhat. The main path of Reboot -> Check On ->
Success + fence-flush is retained, but some additional side-paths are
now defined:
1. We attempt to power "on" the chassis 1 second after the reboot, just
in case it is off and can be recovered. We then wait another 2 seconds
and check the power status (as we did before).
2. If the reboot succeeded, follow this series of choices:
a. If the chassis is on, the fence succeeded.
b. If the chassis is off, the fence "succeeded" as well.
c. If the chassis is in some other state, the fence failed.
3. If the reboot failed, follow this series of choices:
a. If the chassis is off, the fence itself failed, but we can treat
it as "succeeded" since the chassis is in a known-offline state.
This is the most likely situation when there is a critical hardware
failure, and the server's IPMI does not allow itself to start back
up again.
b. If the chassis is in any other state ("on" or unknown), the fence
itself failed and we must treat this as a fence failure.
Overall, this should alleviate the aforementioned issue of a critical
failure rendering the node persistently "off" not triggering a
fence-flush and ensure fencing is more robust.
2021-07-13 17:17:14 -04:00
# We successfully rebooted the node but it is powered off; this might be expected or not, but the node is confirmed off so we can call it a successful fence
2021-11-06 03:02:43 -04:00
logger . out (
2023-11-10 09:30:34 -05:00
" Chassis power is in confirmed off state after successfuly IPMI reboot; proceeding with fence recovery action " ,
2021-11-06 03:02:43 -04:00
state = " o " ,
2023-11-10 09:30:34 -05:00
prefix = f " fencing { node_name } " ,
2021-11-06 03:02:43 -04:00
)
Rework success checks for IPMI fencing
Previously, if the node failed to restart, it was declared a "bad fence"
and no further action would be taken. However, there are some
situations, for instance critical hardware failures, where intelligent
systems will not attempt (or succeed at) starting up the node in such a
case, which would result in dead, known-offline nodes without recovery.
Tweak this behaviour somewhat. The main path of Reboot -> Check On ->
Success + fence-flush is retained, but some additional side-paths are
now defined:
1. We attempt to power "on" the chassis 1 second after the reboot, just
in case it is off and can be recovered. We then wait another 2 seconds
and check the power status (as we did before).
2. If the reboot succeeded, follow this series of choices:
a. If the chassis is on, the fence succeeded.
b. If the chassis is off, the fence "succeeded" as well.
c. If the chassis is in some other state, the fence failed.
3. If the reboot failed, follow this series of choices:
a. If the chassis is off, the fence itself failed, but we can treat
it as "succeeded" since the chassis is in a known-offline state.
This is the most likely situation when there is a critical hardware
failure, and the server's IPMI does not allow itself to start back
up again.
b. If the chassis is in any other state ("on" or unknown), the fence
itself failed and we must treat this as a fence failure.
Overall, this should alleviate the aforementioned issue of a critical
failure rendering the node persistently "off" not triggering a
fence-flush and ensure fencing is more robust.
2021-07-13 17:17:14 -04:00
return True
else :
# We successfully rebooted the node but it is in some unknown power state; since this might indicate a silent failure, we must call it a failed fence
2021-11-06 03:02:43 -04:00
logger . out (
2024-05-08 10:55:15 -04:00
f " Chassis power is in an unknown state ( { ipmi_final_status_stdout . strip ( ) } ) after successful IPMI reboot; NOT proceeding fence recovery action " ,
2021-11-06 03:02:43 -04:00
state = " e " ,
2023-11-10 09:30:34 -05:00
prefix = f " fencing { node_name } " ,
2021-11-06 03:02:43 -04:00
)
Rework success checks for IPMI fencing
Previously, if the node failed to restart, it was declared a "bad fence"
and no further action would be taken. However, there are some
situations, for instance critical hardware failures, where intelligent
systems will not attempt (or succeed at) starting up the node in such a
case, which would result in dead, known-offline nodes without recovery.
Tweak this behaviour somewhat. The main path of Reboot -> Check On ->
Success + fence-flush is retained, but some additional side-paths are
now defined:
1. We attempt to power "on" the chassis 1 second after the reboot, just
in case it is off and can be recovered. We then wait another 2 seconds
and check the power status (as we did before).
2. If the reboot succeeded, follow this series of choices:
a. If the chassis is on, the fence succeeded.
b. If the chassis is off, the fence "succeeded" as well.
c. If the chassis is in some other state, the fence failed.
3. If the reboot failed, follow this series of choices:
a. If the chassis is off, the fence itself failed, but we can treat
it as "succeeded" since the chassis is in a known-offline state.
This is the most likely situation when there is a critical hardware
failure, and the server's IPMI does not allow itself to start back
up again.
b. If the chassis is in any other state ("on" or unknown), the fence
itself failed and we must treat this as a fence failure.
Overall, this should alleviate the aforementioned issue of a critical
failure rendering the node persistently "off" not triggering a
fence-flush and ensure fencing is more robust.
2021-07-13 17:17:14 -04:00
return False
else :
2024-05-08 10:55:15 -04:00
if ipmi_final_status_stdout . strip ( ) == " Chassis Power is off " :
Rework success checks for IPMI fencing
Previously, if the node failed to restart, it was declared a "bad fence"
and no further action would be taken. However, there are some
situations, for instance critical hardware failures, where intelligent
systems will not attempt (or succeed at) starting up the node in such a
case, which would result in dead, known-offline nodes without recovery.
Tweak this behaviour somewhat. The main path of Reboot -> Check On ->
Success + fence-flush is retained, but some additional side-paths are
now defined:
1. We attempt to power "on" the chassis 1 second after the reboot, just
in case it is off and can be recovered. We then wait another 2 seconds
and check the power status (as we did before).
2. If the reboot succeeded, follow this series of choices:
a. If the chassis is on, the fence succeeded.
b. If the chassis is off, the fence "succeeded" as well.
c. If the chassis is in some other state, the fence failed.
3. If the reboot failed, follow this series of choices:
a. If the chassis is off, the fence itself failed, but we can treat
it as "succeeded" since the chassis is in a known-offline state.
This is the most likely situation when there is a critical hardware
failure, and the server's IPMI does not allow itself to start back
up again.
b. If the chassis is in any other state ("on" or unknown), the fence
itself failed and we must treat this as a fence failure.
Overall, this should alleviate the aforementioned issue of a critical
failure rendering the node persistently "off" not triggering a
fence-flush and ensure fencing is more robust.
2021-07-13 17:17:14 -04:00
# We failed to reboot the node but it is powered off; it has probably suffered a serious hardware failure, but the node is confirmed off so we can call it a successful fence
2021-11-06 03:02:43 -04:00
logger . out (
2023-11-10 09:30:34 -05:00
" Chassis power is in confirmed off state after failed IPMI reboot; proceeding with fence recovery action " ,
2021-11-06 03:02:43 -04:00
state = " o " ,
2023-11-10 09:30:34 -05:00
prefix = f " fencing { node_name } " ,
2021-11-06 03:02:43 -04:00
)
Rework success checks for IPMI fencing
Previously, if the node failed to restart, it was declared a "bad fence"
and no further action would be taken. However, there are some
situations, for instance critical hardware failures, where intelligent
systems will not attempt (or succeed at) starting up the node in such a
case, which would result in dead, known-offline nodes without recovery.
Tweak this behaviour somewhat. The main path of Reboot -> Check On ->
Success + fence-flush is retained, but some additional side-paths are
now defined:
1. We attempt to power "on" the chassis 1 second after the reboot, just
in case it is off and can be recovered. We then wait another 2 seconds
and check the power status (as we did before).
2. If the reboot succeeded, follow this series of choices:
a. If the chassis is on, the fence succeeded.
b. If the chassis is off, the fence "succeeded" as well.
c. If the chassis is in some other state, the fence failed.
3. If the reboot failed, follow this series of choices:
a. If the chassis is off, the fence itself failed, but we can treat
it as "succeeded" since the chassis is in a known-offline state.
This is the most likely situation when there is a critical hardware
failure, and the server's IPMI does not allow itself to start back
up again.
b. If the chassis is in any other state ("on" or unknown), the fence
itself failed and we must treat this as a fence failure.
Overall, this should alleviate the aforementioned issue of a critical
failure rendering the node persistently "off" not triggering a
fence-flush and ensure fencing is more robust.
2021-07-13 17:17:14 -04:00
return True
else :
# We failed to reboot the node but it is in some unknown power state (including "on"); since this might indicate a silent failure, we must call it a failed fence
2021-11-06 03:02:43 -04:00
logger . out (
2023-11-10 09:30:34 -05:00
" Chassis power is not in confirmed off state after failed IPMI reboot; NOT proceeding wiht fence recovery action " ,
2021-11-06 03:02:43 -04:00
state = " e " ,
2023-11-10 09:30:34 -05:00
prefix = f " fencing { node_name } " ,
2021-11-06 03:02:43 -04:00
)
2020-12-15 02:45:38 -05:00
return False
2020-08-13 14:38:05 -04:00
2020-11-07 14:45:24 -05:00
2020-08-13 14:38:05 -04:00
#
# Verify that IPMI connectivity to this host exists (used during node init)
#
2021-08-21 02:46:11 -04:00
def verify_ipmi(ipmi_hostname, ipmi_user, ipmi_password):
    # Ask the BMC at ipmi_hostname for its chassis power status over the
    # lanplus interface, authenticating with the supplied user and password.
    # NOTE(review): the password is embedded in the command string via -P.
    status_command = (
        f"/usr/bin/ipmitool -I lanplus -H {ipmi_hostname} "
        f"-U {ipmi_user} -P {ipmi_password} chassis power status"
    )

    # The command is given a 2-second timeout; stderr is not inspected.
    retcode, stdout, _ = common.run_os_command(status_command, timeout=2)

    # Connectivity is considered verified only when the command exited 0 AND
    # the reported state is exactly "on"; anything else returns False.
    return retcode == 0 and stdout.strip() == "Chassis Power is on"