pvc/node-daemon/pvcnoded/fencing.py

#!/usr/bin/env python3

# fencing.py - PVC daemon function library, node fencing functions
# Part of the Parallel Virtual Cluster (PVC) system
#
#    Copyright (C) 2018-2020 Joshua M. Boniface <joshua@boniface.me>
#
#    This program is free software: you can redistribute it and/or modify
#    it under the terms of the GNU General Public License as published by
#    the Free Software Foundation, either version 3 of the License, or
#    (at your option) any later version.
#
#    This program is distributed in the hope that it will be useful,
#    but WITHOUT ANY WARRANTY; without even the implied warranty of
#    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#    GNU General Public License for more details.
#
#    You should have received a copy of the GNU General Public License
#    along with this program.  If not, see <https://www.gnu.org/licenses/>.
#
###############################################################################

import time

import pvcnoded.zkhandler as zkhandler
import pvcnoded.common as common
import pvcnoded.VMInstance as VMInstance


#
# Fence thread entry function
#
def fenceNode(node_name, zk_conn, config, logger):
    """
    Fence thread entry point: confirm a node stays dead, then reset it via IPMI.

    The node's daemon state is polled once per keepalive interval. As soon as
    any poll reports a state other than 'dead', the fence is cancelled. If
    every poll reports 'dead', the node is reset through IPMI, its router state
    is forced to secondary when it is a coordinator, and its VMs are optionally
    migrated according to the fence policy in the configuration.

    Parameters:
        node_name: name of the node being considered for fencing
        zk_conn: connection handle passed through to the zkhandler functions
        config: mapping read for 'keepalive_interval', 'coordinators',
            'successful_fence', 'failed_fence' and 'suicide_intervals'
        logger: object exposing out(message, state=...)

    Returns:
        None
    """
    # Number of consecutive 'dead' observations ("saving throws") tolerated
    # before the node is shot; one observation is made per keepalive interval
    # (30 seconds in total when keepalive_interval is 5).
    max_saving_throws = 6
    daemon_state_key = '/nodes/{}/daemonstate'.format(node_name)

    for throw in range(1, max_saving_throws + 1):
        # Give the node one keepalive interval to check back in
        time.sleep(config['keepalive_interval'])

        # Anything other than 'dead' means the node recovered on its own
        if zkhandler.readdata(zk_conn, daemon_state_key) != 'dead':
            logger.out('Node "{}" passed a saving throw; canceling fence'.format(node_name), state='o')
            return

        logger.out('Node "{}" failed {}/{} saving throws'.format(node_name, throw, max_saving_throws), state='w')

    logger.out('Fencing node "{}" via IPMI reboot signal'.format(node_name), state='w')

    # Fetch the node's IPMI address and credentials (hostname, username, password)
    ipmi_hostname, ipmi_username, ipmi_password = [
        zkhandler.readdata(zk_conn, '/nodes/{}/{}'.format(node_name, ipmi_key))
        for ipmi_key in ('ipmihostname', 'ipmiusername', 'ipmipassword')
    ]

    # Perform the actual fence
    fence_status = rebootViaIPMI(ipmi_hostname, ipmi_username, ipmi_password, logger)

    # Hold for two keepalive intervals so the fence takes effect and the
    # system stabilizes before any further state is changed
    time.sleep(config['keepalive_interval'] * 2)

    # A fenced coordinator must not keep the primary role
    is_coordinator = node_name in config['coordinators']
    if is_coordinator:
        logger.out('Forcing secondary status for node "{}"'.format(node_name), state='i')
        zkhandler.writedata(zk_conn, {'/nodes/{}/routerstate'.format(node_name): 'secondary'})
        current_primary = zkhandler.readdata(zk_conn, '/primary_node')
        if current_primary == node_name:
            zkhandler.writedata(zk_conn, {'/primary_node': 'none'})

    # Decide whether the node's VMs should be migrated away
    if fence_status:
        # Fence succeeded: migrate only if successful_fence asks for it
        migrate_wanted = config['successful_fence'] == 'migrate'
    else:
        # Fence failed: migrate only if failed_fence asks for it and
        # suicide_intervals is not '0'.
        # NOTE(review): this compares against the string '0'; confirm the
        # config value is stored as a string rather than an integer.
        migrate_wanted = config['failed_fence'] == 'migrate' and config['suicide_intervals'] != '0'

    if migrate_wanted:
        migrateFromFencedNode(zk_conn, node_name, config, logger)


# Migrate hosts away from a fenced node
def migrateFromFencedNode(zk_conn, node_name, config, logger):
    """
    Reassign every VM recorded as running on a fenced node.

    The node's domain state is set to 'fence-flush' while the work is in
    progress and to 'flushed' once all VMs have been handled. For each VM the
    locks are flushed, then a target node is looked up: if one is found the VM
    is set to 'start' on it, otherwise the VM is marked 'stopped' with
    node_autostart set to 'True'.

    Parameters:
        zk_conn: connection handle passed through to the zkhandler functions
        node_name: name of the fenced node whose VMs are being moved
        config: configuration mapping, passed through to common.findTargetNode
        logger: object exposing out(message, state=...)

    Returns:
        None
    """
    logger.out('Migrating VMs from dead node "{}" to new hosts'.format(node_name), state='i')

    domainstate_key = '/nodes/{}/domainstate'.format(node_name)

    # Whitespace-separated list of the VM UUIDs recorded for the dead node
    running_domains_raw = zkhandler.readdata(zk_conn, '/nodes/{}/runningdomains'.format(node_name))
    vm_uuids = running_domains_raw.split()

    # Mark the node with a custom domainstate so it is clear what is happening
    zkhandler.writedata(zk_conn, {domainstate_key: 'fence-flush'})

    for vm_uuid in vm_uuids:
        # Flush the VM's locks before it is reassigned
        VMInstance.flush_locks(zk_conn, logger, vm_uuid)

        new_node = common.findTargetNode(zk_conn, config, logger, vm_uuid)

        if new_node is None:
            # Nowhere to run the VM: stop it and flag it for autostart
            logger.out('No target node found for VM "{}"; VM will autostart on next unflush/ready of current node'.format(vm_uuid), state='i')
            vm_updates = {
                '/domains/{}/state'.format(vm_uuid): 'stopped',
                '/domains/{}/node_autostart'.format(vm_uuid): 'True'
            }
        else:
            # Start the VM on the chosen node, remembering where it came from
            logger.out('Migrating VM "{}" to node "{}"'.format(vm_uuid, new_node), state='i')
            vm_updates = {
                '/domains/{}/state'.format(vm_uuid): 'start',
                '/domains/{}/node'.format(vm_uuid): new_node,
                '/domains/{}/lastnode'.format(vm_uuid): node_name
            }

        zkhandler.writedata(zk_conn, vm_updates)

    # Leave the node in flushed state for easy remigrating when it comes back
    zkhandler.writedata(zk_conn, {domainstate_key: 'flushed'})


#
# Perform an IPMI fence
#
def rebootViaIPMI(ipmi_hostname, ipmi_user, ipmi_password, logger):
    """
    Forcibly reset a node through its IPMI interface and make sure it is on.

    A 'chassis power reset' is issued first. If it fails, the function returns
    immediately rather than issuing further commands. After a short pause the
    chassis power status is queried, and a 'chassis power on' is sent only when
    the status does not report that the chassis is powered on.

    Parameters:
        ipmi_hostname: address of the node's IPMI interface
        ipmi_user: IPMI username
        ipmi_password: IPMI password
        logger: object exposing out(message, state=...)

    Returns:
        True if the reset (and, when needed, the power-on) succeeded,
        False if either command returned a non-zero exit code.
    """
    # NOTE(review): the credentials are interpolated unquoted into a command
    # string; values containing whitespace or shell metacharacters may not
    # survive depending on how common.run_os_command tokenizes it - confirm.
    ipmi_command_base = '/usr/bin/ipmitool -I lanplus -H {} -U {} -P {} chassis power'.format(
        ipmi_hostname, ipmi_user, ipmi_password
    )

    # Forcibly reboot the node
    ipmi_reset_retcode, ipmi_reset_stdout, ipmi_reset_stderr = common.run_os_command(
        '{} reset'.format(ipmi_command_base)
    )

    if ipmi_reset_retcode != 0:
        logger.out('Failed to reboot dead node', state='e')
        print(ipmi_reset_stderr)
        return False

    # Give the BMC a moment to act on the reset before querying it
    time.sleep(2)

    # Ensure the node is powered on
    ipmi_status_retcode, ipmi_status_stdout, ipmi_status_stderr = common.run_os_command(
        '{} status'.format(ipmi_command_base)
    )

    # Strip surrounding whitespace (ipmitool terminates its output with a
    # newline) so that a powered-on chassis is actually recognized
    power_status = (ipmi_status_stdout or '').strip()

    # Trigger a power start if needed
    if power_status != "Chassis Power is on":
        ipmi_start_retcode, ipmi_start_stdout, ipmi_start_stderr = common.run_os_command(
            '{} on'.format(ipmi_command_base)
        )

        if ipmi_start_retcode != 0:
            logger.out('Failed to start powered-off dead node', state='e')
            # Report the stderr of the command that actually failed
            print(ipmi_start_stderr)
            return False

    # Declare success
    logger.out('Successfully rebooted dead node', state='o')
    return True


#
# Verify that IPMI connectivity to this host exists (used during node init)
#
def verifyIPMI(ipmi_hostname, ipmi_user, ipmi_password):
    """
    Check that the IPMI interface answers a chassis power status query.

    Parameters:
        ipmi_hostname: address of the IPMI interface to query
        ipmi_user: IPMI username
        ipmi_password: IPMI password

    Returns:
        True if ipmitool exits successfully and reports that the chassis is
        powered on, False otherwise (including when the command fails or
        exceeds the 2 second timeout passed to common.run_os_command).
    """
    ipmi_command_status = '/usr/bin/ipmitool -I lanplus -H {} -U {} -P {} chassis power status'.format(
        ipmi_hostname, ipmi_user, ipmi_password
    )
    ipmi_status_retcode, ipmi_status_stdout, ipmi_status_stderr = common.run_os_command(ipmi_command_status, timeout=2)

    # Strip surrounding whitespace (ipmitool terminates its output with a
    # newline) and require the expected status line; the previous '!='
    # comparison inverted this check.
    power_status = (ipmi_status_stdout or '').strip()
    return ipmi_status_retcode == 0 and power_status == "Chassis Power is on"
Move Zookeeper update out of NodeInstance and into the main Daemon 2018-10-22 20:20:27 -04:00			`#!/usr/bin/env python3`

			`# fencing.py - PVC daemon function library, node fencing functions`
			`# Part of the Parallel Virtual Cluster (PVC) system`
			`#`
Update copyright header year to 2020 2020-01-08 19:38:02 -05:00			`# Copyright (C) 2018-2020 Joshua M. Boniface <joshua@boniface.me>`
Move Zookeeper update out of NodeInstance and into the main Daemon 2018-10-22 20:20:27 -04:00			`#`
			`# This program is free software: you can redistribute it and/or modify`
			`# it under the terms of the GNU General Public License as published by`
			`# the Free Software Foundation, either version 3 of the License, or`
			`# (at your option) any later version.`
			`#`
			`# This program is distributed in the hope that it will be useful,`
			`# but WITHOUT ANY WARRANTY; without even the implied warranty of`
			`# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the`
			`# GNU General Public License for more details.`
			`#`
			`# You should have received a copy of the GNU General Public License`
			`# along with this program. If not, see <https://www.gnu.org/licenses/>.`
			`#`
			`###############################################################################`

			`import time`

Use consistent naming of components Rename "pvcd" to "pvcnoded", and "pvc-api" to "pvcapid" so names for the daemons are fully consistent. Update the names of the configuration files as well to match this new formatting. References #79 2020-02-08 19:16:19 -05:00			`import pvcnoded.zkhandler as zkhandler`
			`import pvcnoded.common as common`
			`import pvcnoded.VMInstance as VMInstance`
Move Zookeeper update out of NodeInstance and into the main Daemon 2018-10-22 20:20:27 -04:00
Lint: E302 expected 2 blank lines, found X 2020-11-07 14:45:24 -05:00
Move Zookeeper update out of NodeInstance and into the main Daemon 2018-10-22 20:20:27 -04:00			`#`
			`# Fence thread entry function`
			`#`
			`def fenceNode(node_name, zk_conn, config, logger):`
Adjust fence failcount limit to 6 (30s) The previous saving throw limit (3/15s) seems to have been too low. I was observing bizarre failures where a node would be fenced while it was still starting up. Some of this may have been related to Zookeeper connections taking too long, but this was inconsistent. Increase this to 6 saving throws (30s). This provides significantly more time for a node to properly check in on startup before another node fences it. In the real world, 15s vs 30s isn't that big of a downtime change, but prevents false-positive fences. 2020-08-05 22:36:28 -04:00			`# We allow exactly 6 saving throws (30 seconds) for the host to come back online or we kill it`
			`failcount_limit = 6`
Move Zookeeper update out of NodeInstance and into the main Daemon 2018-10-22 20:20:27 -04:00			`failcount = 0`
Adjust fence failcount limit to 6 (30s) The previous saving throw limit (3/15s) seems to have been too low. I was observing bizarre failures where a node would be fenced while it was still starting up. Some of this may have been related to Zookeeper connections taking too long, but this was inconsistent. Increase this to 6 saving throws (30s). This provides significantly more time for a node to properly check in on startup before another node fences it. In the real world, 15s vs 30s isn't that big of a downtime change, but prevents false-positive fences. 2020-08-05 22:36:28 -04:00			`while failcount < failcount_limit:`
Move Zookeeper update out of NodeInstance and into the main Daemon 2018-10-22 20:20:27 -04:00			`# Wait 5 seconds`
Fix incorrect keepalive interval setting 2020-10-26 11:44:45 -04:00			`time.sleep(config['keepalive_interval'])`
Move Zookeeper update out of NodeInstance and into the main Daemon 2018-10-22 20:20:27 -04:00			`# Get the state`
			`node_daemon_state = zkhandler.readdata(zk_conn, '/nodes/{}/daemonstate'.format(node_name))`
			`# Is it still 'dead'`
			`if node_daemon_state == 'dead':`
			`failcount += 1`
Adjust fence failcount limit to 6 (30s) The previous saving throw limit (3/15s) seems to have been too low. I was observing bizarre failures where a node would be fenced while it was still starting up. Some of this may have been related to Zookeeper connections taking too long, but this was inconsistent. Increase this to 6 saving throws (30s). This provides significantly more time for a node to properly check in on startup before another node fences it. In the real world, 15s vs 30s isn't that big of a downtime change, but prevents false-positive fences. 2020-08-05 22:36:28 -04:00			`logger.out('Node "{}" failed {}/{} saving throws'.format(node_name, failcount, failcount_limit), state='w')`
Move Zookeeper update out of NodeInstance and into the main Daemon 2018-10-22 20:20:27 -04:00			`# It changed back to something else so it must be alive`
			`else:`
			`logger.out('Node "{}" passed a saving throw; canceling fence'.format(node_name), state='o')`
			`return`

			`logger.out('Fencing node "{}" via IPMI reboot signal'.format(node_name), state='w')`

			`# Get IPMI information`
			`ipmi_hostname = zkhandler.readdata(zk_conn, '/nodes/{}/ipmihostname'.format(node_name))`
			`ipmi_username = zkhandler.readdata(zk_conn, '/nodes/{}/ipmiusername'.format(node_name))`
			`ipmi_password = zkhandler.readdata(zk_conn, '/nodes/{}/ipmipassword'.format(node_name))`

			`# Shoot it in the head`
			`fence_status = rebootViaIPMI(ipmi_hostname, ipmi_username, ipmi_password, logger)`
Tie fence timers to keepalive_interval Also wait 2 full keepalive intervals after fencing before doing anything else, to give the Ceph cluster a chance to recover. 2020-08-15 12:38:03 -04:00			`# Hold to ensure the fence takes effect and system stabilizes`
Fix incorrect keepalive interval setting 2020-10-26 11:44:45 -04:00			`time.sleep(config['keepalive_interval'] * 2)`
Move Zookeeper update out of NodeInstance and into the main Daemon 2018-10-22 20:20:27 -04:00
			`# Force into secondary network state if needed`
Fix bad split on list 2019-03-13 19:26:08 -04:00			`if node_name in config['coordinators']:`
Fix missing char in log message 2020-08-11 12:40:35 -04:00			`logger.out('Forcing secondary status for node "{}"'.format(node_name), state='i')`
Lint: E202 whitespace before '}' 2020-11-07 12:57:42 -05:00			`zkhandler.writedata(zk_conn, {'/nodes/{}/routerstate'.format(node_name): 'secondary'})`
Move Zookeeper update out of NodeInstance and into the main Daemon 2018-10-22 20:20:27 -04:00			`if zkhandler.readdata(zk_conn, '/primary_node') == node_name:`
Lint: E202 whitespace before '}' 2020-11-07 12:57:42 -05:00			`zkhandler.writedata(zk_conn, {'/primary_node': 'none'})`
Remove extra whitespaces on blank lines 2019-06-25 22:31:04 -04:00
Move Zookeeper update out of NodeInstance and into the main Daemon 2018-10-22 20:20:27 -04:00			`# If the fence succeeded and successful_fence is migrate`
Add logging and use better conditional 2020-08-05 21:57:36 -04:00			`if fence_status and config['successful_fence'] == 'migrate':`
Implement VM metadata and use it Implements the storing of three VM metadata attributes: 1. Node limits - allows specifying a list of hosts on which the VM must run. This limit influences the migration behaviour of VMs. 2. Per-VM node selectors - allows each VM to have its migration autoselection method specified, to automatically allow different methods per VM based on the administrator's preferences. 3. VM autorestart - allows a VM to be automatically restarted from a stopped state, presumably due to a failure to find a target node (either due to limits or otherwise) during a flush/fence recovery, on the next node unflush/ready state of its home hypervisor. Useful mostly in conjunction with limits to ensure that VMs which were shut down due to there being no valid migration targets are started back up when their node becomes ready again. Includes the full client interaction with these metadata options, including printing, as well as defining a new function to modify this metadata. For the CLI it is set/modified either on `vm define` or via the `vm meta` command. For the API it is set/modified either on a POST to the `/vm` endpoint (during VM definition) or on POST to the `/vm/<vm>` endpoint. For the API this replaces the previous reserved word for VM creation from scratch as this will no longer be implemented in-daemon (see #22). Closes #52 2019-10-12 01:17:39 -04:00			`migrateFromFencedNode(zk_conn, node_name, config, logger)`
Move Zookeeper update out of NodeInstance and into the main Daemon 2018-10-22 20:20:27 -04:00
			`# If the fence failed and failed_fence is migrate`
Add logging and use better conditional 2020-08-05 21:57:36 -04:00			`if not fence_status and config['failed_fence'] == 'migrate' and config['suicide_intervals'] != '0':`
Implement VM metadata and use it Implements the storing of three VM metadata attributes: 1. Node limits - allows specifying a list of hosts on which the VM must run. This limit influences the migration behaviour of VMs. 2. Per-VM node selectors - allows each VM to have its migration autoselection method specified, to automatically allow different methods per VM based on the administrator's preferences. 3. VM autorestart - allows a VM to be automatically restarted from a stopped state, presumably due to a failure to find a target node (either due to limits or otherwise) during a flush/fence recovery, on the next node unflush/ready state of its home hypervisor. Useful mostly in conjunction with limits to ensure that VMs which were shut down due to there being no valid migration targets are started back up when their node becomes ready again. Includes the full client interaction with these metadata options, including printing, as well as defining a new function to modify this metadata. For the CLI it is set/modified either on `vm define` or via the `vm meta` command. For the API it is set/modified either on a POST to the `/vm` endpoint (during VM definition) or on POST to the `/vm/<vm>` endpoint. For the API this replaces the previous reserved word for VM creation from scratch as this will no longer be implemented in-daemon (see #22). Closes #52 2019-10-12 01:17:39 -04:00			`migrateFromFencedNode(zk_conn, node_name, config, logger)`
Move Zookeeper update out of NodeInstance and into the main Daemon 2018-10-22 20:20:27 -04:00
Lint: E302 expected 2 blank lines, found X 2020-11-07 14:45:24 -05:00
Move Zookeeper update out of NodeInstance and into the main Daemon 2018-10-22 20:20:27 -04:00			`# Migrate hosts away from a fenced node`
Implement VM metadata and use it Implements the storing of three VM metadata attributes: 1. Node limits - allows specifying a list of hosts on which the VM must run. This limit influences the migration behaviour of VMs. 2. Per-VM node selectors - allows each VM to have its migration autoselection method specified, to automatically allow different methods per VM based on the administrator's preferences. 3. VM autorestart - allows a VM to be automatically restarted from a stopped state, presumably due to a failure to find a target node (either due to limits or otherwise) during a flush/fence recovery, on the next node unflush/ready state of its home hypervisor. Useful mostly in conjunction with limits to ensure that VMs which were shut down due to there being no valid migration targets are started back up when their node becomes ready again. Includes the full client interaction with these metadata options, including printing, as well as defining a new function to modify this metadata. For the CLI it is set/modified either on `vm define` or via the `vm meta` command. For the API it is set/modified either on a POST to the `/vm` endpoint (during VM definition) or on POST to the `/vm/<vm>` endpoint. For the API this replaces the previous reserved word for VM creation from scratch as this will no longer be implemented in-daemon (see #22). Closes #52 2019-10-12 01:17:39 -04:00			`def migrateFromFencedNode(zk_conn, node_name, config, logger):`
Use consistent terminology in fence message 2019-07-10 11:54:56 -04:00			`logger.out('Migrating VMs from dead node "{}" to new hosts'.format(node_name), state='i')`
Improve fencing migrate layout Open the option to do this in parallel with some threads 2020-08-05 22:26:01 -04:00
			`# Get the list of VMs`
Move Zookeeper update out of NodeInstance and into the main Daemon 2018-10-22 20:20:27 -04:00			`dead_node_running_domains = zkhandler.readdata(zk_conn, '/nodes/{}/runningdomains'.format(node_name)).split()`
Improve fencing migrate layout Open the option to do this in parallel with some threads 2020-08-05 22:26:01 -04:00
			`# Set the node to a custom domainstate so we know what's happening`
Lint: E202 whitespace before '}' 2020-11-07 12:57:42 -05:00			`zkhandler.writedata(zk_conn, {'/nodes/{}/domainstate'.format(node_name): 'fence-flush'})`
Improve fencing migrate layout Open the option to do this in parallel with some threads 2020-08-05 22:26:01 -04:00
			`# Migrate a VM after a flush`
			`def fence_migrate_vm(dom_uuid):`
Move lock flushing to VMInstance Prepares for reuse of this function via client commands. 2019-08-07 13:36:56 -04:00			`VMInstance.flush_locks(zk_conn, logger, dom_uuid)`
Fix bugs with fencing 2019-07-09 19:17:53 -04:00
Add additional logging to flush selector Adds additional debug logging to the flush selector to determine how any why any given node is selected. Useful for troubleshooting strange choices. 2020-10-20 11:08:30 -04:00			`target_node = common.findTargetNode(zk_conn, config, logger, dom_uuid)`
Make IPMI handling a bit better 2018-11-23 20:02:31 -05:00
Implement VM metadata and use it Implements the storing of three VM metadata attributes: 1. Node limits - allows specifying a list of hosts on which the VM must run. This limit influences the migration behaviour of VMs. 2. Per-VM node selectors - allows each VM to have its migration autoselection method specified, to automatically allow different methods per VM based on the administrator's preferences. 3. VM autorestart - allows a VM to be automatically restarted from a stopped state, presumably due to a failure to find a target node (either due to limits or otherwise) during a flush/fence recovery, on the next node unflush/ready state of its home hypervisor. Useful mostly in conjunction with limits to ensure that VMs which were shut down due to there being no valid migration targets are started back up when their node becomes ready again. Includes the full client interaction with these metadata options, including printing, as well as defining a new function to modify this metadata. For the CLI it is set/modified either on `vm define` or via the `vm meta` command. For the API it is set/modified either on a POST to the `/vm` endpoint (during VM definition) or on POST to the `/vm/<vm>` endpoint. For the API this replaces the previous reserved word for VM creation from scratch as this will no longer be implemented in-daemon (see #22). Closes #52 2019-10-12 01:17:39 -04:00			`if target_node is not None:`
			`logger.out('Migrating VM "{}" to node "{}"'.format(dom_uuid, target_node), state='i')`
			`zkhandler.writedata(zk_conn, {`
			`'/domains/{}/state'.format(dom_uuid): 'start',`
			`'/domains/{}/node'.format(dom_uuid): target_node,`
			`'/domains/{}/lastnode'.format(dom_uuid): node_name`
			`})`
			`else:`
			`logger.out('No target node found for VM "{}"; VM will autostart on next unflush/ready of current node'.format(dom_uuid), state='i')`
			`zkhandler.writedata(zk_conn, {`
			`'/domains/{}/state'.format(dom_uuid): 'stopped',`
			`'/domains/{}/node_autostart'.format(dom_uuid): 'True'`
Fix minor bugs 2019-10-12 01:36:50 -04:00			`})`
Move Zookeeper update out of NodeInstance and into the main Daemon 2018-10-22 20:20:27 -04:00
Improve fencing migrate layout Open the option to do this in parallel with some threads 2020-08-05 22:26:01 -04:00			`# Loop through the VMs`
			`for dom_uuid in dead_node_running_domains:`
			`fence_migrate_vm(dom_uuid)`

Move Zookeeper update out of NodeInstance and into the main Daemon 2018-10-22 20:20:27 -04:00			`# Set node in flushed state for easy remigrating when it comes back`
Lint: E202 whitespace before '}' 2020-11-07 12:57:42 -05:00			`zkhandler.writedata(zk_conn, {'/nodes/{}/domainstate'.format(node_name): 'flushed'})`
Move Zookeeper update out of NodeInstance and into the main Daemon 2018-10-22 20:20:27 -04:00
Lint: E302 expected 2 blank lines, found X 2020-11-07 14:45:24 -05:00
Move Zookeeper update out of NodeInstance and into the main Daemon 2018-10-22 20:20:27 -04:00			`#`
			`# Perform an IPMI fence`
			`#`
			`def rebootViaIPMI(ipmi_hostname, ipmi_user, ipmi_password, logger):`
Make IPMI handling a bit better 2018-11-23 20:02:31 -05:00			`# Forcibly reboot the node`
			`ipmi_command_reset = '/usr/bin/ipmitool -I lanplus -H {} -U {} -P {} chassis power reset'.format(`
Move Zookeeper update out of NodeInstance and into the main Daemon 2018-10-22 20:20:27 -04:00			`ipmi_hostname, ipmi_user, ipmi_password`
			`)`
Make IPMI handling a bit better 2018-11-23 20:02:31 -05:00			`ipmi_reset_retcode, ipmi_reset_stdout, ipmi_reset_stderr = common.run_os_command(ipmi_command_reset)`

Revamp fencing order Prevents unnecessarily excessive timeouts if IPMI connections time out; before, would have to go through 3 timed out commands at ~20s each before failure was registered; reduced to 1 if the first times out. 2020-12-15 02:45:38 -05:00			`if ipmi_reset_retcode != 0:`
			`logger.out('Failed to reboot dead node', state='e')`
			`print(ipmi_reset_stderr)`
			`return False`

Improve fence output on failure and increase delay 2019-08-07 11:35:49 -04:00			`time.sleep(2)`
Make IPMI handling a bit better 2018-11-23 20:02:31 -05:00
			`# Ensure the node is powered on`
			`ipmi_command_status = '/usr/bin/ipmitool -I lanplus -H {} -U {} -P {} chassis power status'.format(`
			`ipmi_hostname, ipmi_user, ipmi_password`
			`)`
			`ipmi_status_retcode, ipmi_status_stdout, ipmi_status_stderr = common.run_os_command(ipmi_command_status)`

			`# Trigger a power start if needed`
			`if ipmi_status_stdout != "Chassis Power is on":`
Use correct IPMItool command to start server 2018-12-07 12:36:53 -05:00			`ipmi_command_start = '/usr/bin/ipmitool -I lanplus -H {} -U {} -P {} chassis power on'.format(`
Make IPMI handling a bit better 2018-11-23 20:02:31 -05:00			`ipmi_hostname, ipmi_user, ipmi_password`
			`)`
			`ipmi_start_retcode, ipmi_start_stdout, ipmi_start_stderr = common.run_os_command(ipmi_command_start)`

Revamp fencing order Prevents unnecessarily excessive timeouts if IPMI connections time out; before, would have to go through 3 timed out commands at ~20s each before failure was registered; reduced to 1 if the first times out. 2020-12-15 02:45:38 -05:00			`if ipmi_start_retcode != 0:`
			`logger.out('Failed to start powered-off dead node', state='e')`
			`print(ipmi_reset_stderr)`
			`return False`

			`# Declare success`
			`logger.out('Successfully rebooted dead node', state='o')`
			`return True`
Warn if fencing will fail Verify our IPMI state on startup, and then warn if fencing will fail. For now, this is sufficient, but in future (requires refactoring) we might want to adjust how fencing occurs based on this information. 2020-08-13 14:38:05 -04:00
Lint: E302 expected 2 blank lines, found X 2020-11-07 14:45:24 -05:00
Warn if fencing will fail Verify our IPMI state on startup, and then warn if fencing will fail. For now, this is sufficient, but in future (requires refactoring) we might want to adjust how fencing occurs based on this information. 2020-08-13 14:38:05 -04:00			`#`
			`# Verify that IPMI connectivity to this host exists (used during node init)`
			`#`
			`def verifyIPMI(ipmi_hostname, ipmi_user, ipmi_password):`
			`ipmi_command_status = '/usr/bin/ipmitool -I lanplus -H {} -U {} -P {} chassis power status'.format(`
			`ipmi_hostname, ipmi_user, ipmi_password`
			`)`
			`ipmi_status_retcode, ipmi_status_stdout, ipmi_status_stderr = common.run_os_command(ipmi_command_status, timeout=2)`
			`if ipmi_status_retcode == 0 and ipmi_status_stdout != "Chassis Power is on":`
			`return True`
			`else:`
			`return False`