From 481c6fa4457ce3dc944bf784386e3f8eebe70c17 Mon Sep 17 00:00:00 2001
From: Joshua Boniface
Date: Thu, 14 Jun 2018 12:00:48 -0400
Subject: [PATCH] Combine fencenode into NodeInstance.py

---
 NodeInstance.py | 80 ++++++++++++++++++++++++++++++++++++++++++--
 ansiiprint.py   |  2 +-
 fencenode.py    | 89 -------------------------------------------------
 pvcd.py         |  1 -
 4 files changed, 79 insertions(+), 93 deletions(-)
 delete mode 100644 fencenode.py

diff --git a/NodeInstance.py b/NodeInstance.py
index 2e0c2627..79e320a3 100644
--- a/NodeInstance.py
+++ b/NodeInstance.py
@@ -20,7 +20,7 @@
 #
 ###############################################################################
 
-import os, sys, psutil, socket, time, libvirt, kazoo.client, threading, fencenode, ansiiprint
+import os, sys, psutil, socket, time, libvirt, kazoo.client, threading, subprocess, ansiiprint
 
 class NodeInstance():
     # Initialization function
@@ -255,7 +255,7 @@ class NodeInstance():
             if node_keepalive < node_deadtime and node_daemon_state == 'run':
                 ansiiprint.echo('Node {} seems dead - starting monitor for fencing'.format(node_name), '', 'w')
                 self.zk.set('/nodes/{}/daemonstate'.format(node_name), 'dead'.encode('ascii'))
-                fence_thread = threading.Thread(target=fencenode.fence, args=(node_name, self.zk), kwargs={})
+                fence_thread = threading.Thread(target=fenceNode, args=(node_name, self.zk), kwargs={})
                 fence_thread.start()
 
             # Update the arrays
@@ -295,3 +295,79 @@ class NodeInstance():
         ansiiprint.echo('{}Active nodes:{} {}'.format(ansiiprint.bold(), ansiiprint.end(), ' '.join(self.active_node_list)), '', 'c')
         ansiiprint.echo('{}Inactive nodes:{} {}'.format(ansiiprint.bold(), ansiiprint.end(), ' '.join(self.inactive_node_list)), '', 'c')
         ansiiprint.echo('{}Flushed nodes:{} {}'.format(ansiiprint.bold(), ansiiprint.end(), ' '.join(self.flushed_node_list)), '', 'c')
+
+#
+# Fence thread entry function
+#
+# Started in its own thread by NodeInstance once a node looks dead. Checks the
+# node's daemonstate every 5 seconds; after 3 checks that all read 'dead', the
+# node is rebooted via IPMI and each of its running domains is reassigned to the
+# started hypervisor with the most free memory. Returns early if the node recovers.
+#
+def fenceNode(node_name, zk):
+    failcount = 0
+    while failcount < 3:
+        # Wait 5 seconds
+        time.sleep(5)
+        # Get the state (use the zk handle passed in; there is no self here)
+        node_daemon_state = zk.get('/nodes/{}/daemonstate'.format(node_name))[0].decode('ascii')
+        # Is it still 'dead'
+        if node_daemon_state == 'dead':
+            failcount += 1
+            ansiiprint.echo('Node "{}" failed {} saving throws'.format(node_name, failcount), '', 'w')
+        # It changed back to something else so it must be alive
+        else:
+            ansiiprint.echo('Node "{}" passed a saving throw; canceling fence'.format(node_name), '', 'o')
+            return
+
+    ansiiprint.echo('Fencing node "{}" via IPMI reboot signal'.format(node_name), '', 'e')
+
+    ipmi_hostname = zk.get('/nodes/{}/ipmihostname'.format(node_name))[0].decode('ascii')
+    ipmi_username = zk.get('/nodes/{}/ipmiusername'.format(node_name))[0].decode('ascii')
+    ipmi_password = zk.get('/nodes/{}/ipmipassword'.format(node_name))[0].decode('ascii')
+    rebootViaIPMI(ipmi_hostname, ipmi_username, ipmi_password)
+
+    ansiiprint.echo('Moving VMs from dead hypervisor "{}" to new hosts'.format(node_name), '', 'i')
+    dead_node_running_domains = zk.get('/nodes/{}/runningdomains'.format(node_name))[0].decode('ascii').split()
+    for dom_uuid in dead_node_running_domains:
+        # Reset the pick per domain so a previous domain's target is never reused
+        most_memfree = 0
+        target_hypervisor = None
+        hypervisor_list = zk.get_children('/nodes')
+        current_hypervisor = zk.get('/domains/{}/hypervisor'.format(dom_uuid))[0].decode('ascii')
+        for hypervisor in hypervisor_list:
+            state = zk.get('/nodes/{}/state'.format(hypervisor))[0].decode('ascii')
+            if state != 'start' or hypervisor == current_hypervisor:
+                continue
+
+            memfree = int(zk.get('/nodes/{}/memfree'.format(hypervisor))[0].decode('ascii'))
+            if memfree > most_memfree:
+                most_memfree = memfree
+                target_hypervisor = hypervisor
+
+        # Nothing eligible: leave the domain's records untouched rather than crash
+        if target_hypervisor is None:
+            ansiiprint.echo('No eligible hypervisor found for VM "{}"; leaving it in place'.format(dom_uuid), '', 'e')
+            continue
+
+        ansiiprint.echo('Moving VM "{}" to hypervisor "{}"'.format(dom_uuid, target_hypervisor), '', 'i')
+        transaction = zk.transaction()
+        transaction.set_data('/domains/{}/state'.format(dom_uuid), 'start'.encode('ascii'))
+        transaction.set_data('/domains/{}/hypervisor'.format(dom_uuid), target_hypervisor.encode('ascii'))
+        transaction.set_data('/domains/{}/lasthypervisor'.format(dom_uuid), current_hypervisor.encode('ascii'))
+        transaction.commit()
+
+#
+# Perform an IPMI fence
+#
+# Runs 'ipmitool ... chassis power reset' against the given IPMI host and reports
+# success or failure based on the command's exit status.
+#
+def rebootViaIPMI(ipmi_hostname, ipmi_user, ipmi_password):
+    ipmi_command = ['ipmitool', '-H', ipmi_hostname, '-U', ipmi_user, '-P', ipmi_password, 'chassis', 'power', 'reset']
+    ipmi_command_output = subprocess.run(ipmi_command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+    # subprocess.run() returns a CompletedProcess; its exit status is .returncode
+    if ipmi_command_output.returncode == 0:
+        ansiiprint.echo('Successfully rebooted dead node', '', 'o')
+    else:
+        ansiiprint.echo('Failed to reboot dead node', '', 'e')
diff --git a/ansiiprint.py b/ansiiprint.py
index 5fa9b33b..f7f3ec77 100644
--- a/ansiiprint.py
+++ b/ansiiprint.py
@@ -20,7 +20,7 @@
 #
 ###############################################################################
 
-import os, sys, socket, time, datetime, libvirt, kazoo.client, threading, fencenode, ansiiprint
+import os, sys, socket, time, datetime, libvirt, kazoo.client, threading, ansiiprint
 
 # ANSII colours for output
 def red():
diff --git a/fencenode.py b/fencenode.py
deleted file mode 100644
index d639db86..00000000
--- a/fencenode.py
+++ /dev/null
@@ -1,89 +0,0 @@
-#!/usr/bin/env python3
-
-# fencenode.py - Supplemental functions to handle fencing of dead nodes
-# Part of the Parallel Virtual Cluster (PVC) system
-#
-# Copyright (C) 2018 Joshua M. Boniface
-#
-# This program is free software: you can redistribute it and/or modify
-# it under the terms of the GNU General Public License as published by
-# the Free Software Foundation, either version 3 of the License, or
-# (at your option) any later version.
-#
-# This program is distributed in the hope that it will be useful,
-# but WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-# GNU General Public License for more details.
-#
-# You should have received a copy of the GNU General Public License
-# along with this program. If not, see .
-#
-###############################################################################
-
-import os, sys, libvirt, uuid, kazoo.client, time, subprocess, re, ansiiprint
-
-#
-# Trigger function
-#
-def fence(node_name, zk):
-    failcount = 0
-    while failcount < 3:
-        # Wait 5 seconds
-        time.sleep(5)
-        # Get the state
-        node_daemon_state = self.zk.get('/nodes/{}/daemonstate'.format(node_name))[0].decode('ascii')
-        # Is it still 'dead'
-        if node_daemon_state == 'dead':
-            failcount += 1
-            ansiiprint.echo('Node "{}" failed {} saving throws'.format(node_name, failcount), '', 'w')
-        # It changed back to something else so it must be alive
-        else:
-            ansiiprint.echo('Node "{}" passed a saving throw; canceling fence'.format(node_name), '', 'o')
-            return
-
-    ansiiprint.echo('Fencing node "{}" via IPMI reboot signal'.format(node_name), '', 'e')
-
-    ipmi_hostname = zk.get('/nodes/{}/ipmihostname'.format(node_name))[0].decode('ascii')
-    ipmi_username = zk.get('/nodes/{}/ipmiusername'.format(node_name))[0].decode('ascii')
-    ipmi_password = zk.get('/nodes/{}/ipmipassword'.format(node_name))[0].decode('ascii')
-    rebootViaIPMI(ipmi_hostname, ipmi_user, ipmi_password)
-
-    ansiiprint.echo('Moving VMs from dead hypervisor "{}" to new hosts'.format(node_name), '', 'i')
-    dead_node_running_domains = zk.get('/nodes/{}/runningdomains'.format(node_name))[0].decode('ascii').split()
-    for dom_uuid in dead_node_running_domains:
-        most_memfree = 0
-        hypervisor_list = zk.get_children('/nodes')
-        current_hypervisor = zk.get('/domains/{}/hypervisor'.format(dom_uuid))[0].decode('ascii')
-        for hypervisor in hypervisor_list:
-            state = zk.get('/nodes/{}/state'.format(hypervisor))[0].decode('ascii')
-            if state != 'start' or hypervisor == current_hypervisor:
-                continue
-
-            memfree = int(zk.get('/nodes/{}/memfree'.format(hypervisor))[0].decode('ascii'))
-            if memfree > most_memfree:
-                most_memfree = memfree
-                target_hypervisor = hypervisor
-
-        ansiiprint.echo('Moving VM "{}" to hypervisor "{}"'.format(dom_uuid, target_hypervisor), '', 'i')
-        transaction = zk.transaction()
-        transaction.set_data('/domains/{}/state'.format(dom_uuid), 'start'.encode('ascii'))
-        transaction.set_data('/domains/{}/hypervisor'.format(dom_uuid), target_hypervisor.encode('ascii'))
-        transaction.set_data('/domains/{}/lasthypervisor'.format(dom_uuid), current_hypervisor.encode('ascii'))
-        transaction.commit()
-
-#def getIPMIAddress():
-#    ipmi_command = ['bash', '/fakeipmi.sh']
-#
-#    # Get the IPMI address
-#    ipmi_lan_output = subprocess.run(ipmi_command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
-#    ipmi_lan_parsed = ipmi_lan_output.stdout.decode('ascii').split('\n')
-#    ipmi_lan_address = [s for s in ipmi_lan_parsed if re.search('IP Address[ ]*:', s)][0].split(':')[-1].strip()
-#    return ipmi_lan_address
-
-def rebootViaIPMI(ipmi_hostname, ipmi_user, ipmi_password):
-    ipmi_command = ['ipmitool', '-H', ipmi_hostname, '-U', ipmi_user, '-P', ipmi_password, 'chassis', 'power', 'reset']
-    ipmi_command_output = subprocess.run(ipmi_command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
-    if ipmi_command_output == 0:
-        ansiiprint.echo('Successfully rebooted dead node', '', 'o')
-    else:
-        ansiiprint.echo('Failed to reboot dead node', '', 'e')
diff --git a/pvcd.py b/pvcd.py
index d60be315..e8f00696 100755
--- a/pvcd.py
+++ b/pvcd.py
@@ -34,7 +34,6 @@ import time
 import atexit
 import configparser
 import apscheduler.schedulers.background
-import fencenode
 import ansiiprint
 
 print(ansiiprint.bold() + "pvcd - Parallel Virtual Cluster management daemon" + ansiiprint.end())