pvc/pvcd.py

270 lines
9.8 KiB
Python
Raw Normal View History

2018-05-31 20:26:44 -04:00
#!/usr/bin/env python3
# pvcd.py - PVC hypervisor node daemon
# Part of the Parallel Virtual Cluster (PVC) system
#
# Copyright (C) 2018 Joshua M. Boniface <joshua@boniface.me>
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.
#
###############################################################################
import kazoo.client
2018-05-31 20:26:44 -04:00
import libvirt
import sys
2018-06-08 12:19:48 -04:00
import os
2018-06-16 22:30:17 -04:00
import signal
2018-05-31 21:49:23 -04:00
import socket
2018-06-12 12:07:57 -04:00
import psutil
import subprocess
2018-05-31 20:26:44 -04:00
import uuid
import time
2018-06-08 12:19:48 -04:00
import configparser
import apscheduler.schedulers.background
2018-06-17 14:55:13 -04:00
import pvcd.ansiiprint as ansiiprint
2018-06-26 22:52:55 -04:00
import pvcd.zkhandler as zkhandler
2018-06-17 14:55:13 -04:00
import pvcd.VMInstance as VMInstance
import pvcd.NodeInstance as NodeInstance
2018-05-31 20:26:44 -04:00
2018-06-08 12:19:48 -04:00
print(ansiiprint.bold() + "pvcd - Parallel Virtual Cluster management daemon" + ansiiprint.end())

# The configuration file location has no built-in default: it must be supplied
# through the PVCD_CONFIG_FILE environment variable.
try:
    pvcd_config_file = os.environ['PVCD_CONFIG_FILE']
except KeyError:
    # Only a missing variable is expected here; catching everything would also
    # hide unrelated failures (and KeyboardInterrupt) behind this message.
    print('ERROR: The "PVCD_CONFIG_FILE" environment variable must be set before starting pvcd.')
    # sys.exit() rather than exit(): the latter is injected by the `site`
    # module and is not guaranteed to exist in every interpreter setup.
    sys.exit(1)

# Split the system hostname at the first dot into a short name and a domain;
# the domain is an empty string when the hostname contains no dot.
myhostname = socket.gethostname()
myshorthostname = myhostname.split('.', 1)[0]
mydomainname = ''.join(myhostname.split('.', 1)[1:])
# Names of the configuration values the daemon requires (a list of keys, not a
# dictionary). readConfig() looks each one up in the host's own section first
# and falls back to the [default] section.
config_values = [
    # Cluster connectivity and keepalive timing
    'zookeeper', 'keepalive_interval',
    # Fencing behaviour
    'fence_intervals', 'suicide_intervals', 'successful_fence', 'failed_fence',
    # IPMI access details
    'ipmi_hostname', 'ipmi_username', 'ipmi_password',
]
2018-06-08 12:19:48 -04:00
def readConfig(pvcd_config_file, myhostname, required_keys=None):
    """Load this host's settings from the INI-style pvcd configuration file.

    Each required key is taken from the section named after ``myhostname``
    when that section provides it, and from the ``[default]`` section
    otherwise. If the host has no section of its own, ``[default]`` is used
    for everything.

    Args:
        pvcd_config_file: Path of the configuration file to read.
        myhostname: Hostname whose section should be preferred; also used to
            build the fallback IPMI hostname.
        required_keys: Iterable of keys that must be present. Defaults to the
            module-level ``config_values`` list.

    Returns:
        A dict mapping every required key to its (string) value. An empty
        ``ipmi_hostname`` is replaced by ``<shortname>-lom.<domain>``.

    Raises:
        SystemExit: With exit code 1 (after printing an error) when the file
            cannot be parsed, has neither a host nor a ``[default]`` section,
            or lacks a required value.
    """
    if required_keys is None:
        required_keys = config_values

    print('Loading configuration from file {}'.format(pvcd_config_file))

    o_config = configparser.ConfigParser()
    try:
        # read() silently skips files it cannot open; that case is caught
        # below because neither section will exist.
        o_config.read(pvcd_config_file)
    except configparser.Error:
        # Syntactically broken file: report it instead of a raw traceback.
        print('ERROR: Config file is not valid!')
        sys.exit(1)

    try:
        entries = o_config[myhostname]
    except KeyError:
        try:
            entries = o_config['default']
        except KeyError:
            print('ERROR: Config file is not valid!')
            sys.exit(1)

    config = {}
    for entry in required_keys:
        try:
            config[entry] = entries[entry]
        except KeyError:
            # Not set for this host specifically; fall back to [default].
            try:
                config[entry] = o_config['default'][entry]
            except KeyError:
                print('ERROR: Config file missing required value "{}" for this host!'.format(entry))
                sys.exit(1)

    # Handle an empty ipmi_hostname: derive it from the hostname argument
    # (short name + "-lom." + domain) rather than from module globals, so the
    # result always matches the host this call was made for.
    if config.get('ipmi_hostname') == '':
        shortname, _, domainname = myhostname.partition('.')
        config['ipmi_hostname'] = shortname + '-lom.' + domainname

    return config
2018-05-31 20:26:44 -04:00
# Get config
config = readConfig(pvcd_config_file, myhostname)

# Check that libvirtd is listening on TCP: open a throwaway connection to the
# loopback address and close it again straight away.
libvirt_check_name = "qemu+tcp://127.0.0.1:16509/system"
print('Connecting to Libvirt instance at {}'.format(libvirt_check_name))
try:
    lv_conn = libvirt.open(libvirt_check_name)
except libvirt.libvirtError:
    # Treat a refused/failed connection the same as a None handle below.
    lv_conn = None
if lv_conn is None:
    print('ERROR: Failed to open local libvirt connection via TCP; required for PVC!')
    sys.exit(1)
lv_conn.close()
# Connect to the Zookeeper host(s) named by the 'zookeeper' config value
zk_conn = kazoo.client.KazooClient(hosts=config['zookeeper'])
try:
    print('Connecting to Zookeeper instance at {}'.format(config['zookeeper']))
    zk_conn.start()
except Exception:
    # Top-level boundary: any connection failure is fatal for the daemon.
    # KeyboardInterrupt/SystemExit are deliberately not caught here.
    print('ERROR: Failed to connect to Zookeeper')
    sys.exit(1)
# Handle zookeeper failures
def zk_listener(state):
    """React to Zookeeper connection state changes.

    On SUSPENDED the keepalive timer is stopped and a brand-new client is
    started in a retry loop (one attempt per second) until it connects; the
    module-level ``zk_conn`` is then rebound to that client. On CONNECTED a
    fresh keepalive timer is created. Every other state is ignored.

    Args:
        state: The new connection state, compared against
            ``kazoo.client.KazooState`` members.

    NOTE(review): the replacement client created here never has this listener
    added to it in the visible code, and the retry loop blocks the calling
    thread until a connection succeeds -- confirm both are intended.
    """
    global zk_conn, update_timer

    if state == kazoo.client.KazooState.SUSPENDED:
        ansiiprint.echo('Connection to Zookeeper lost; retrying', '', 'e')

        # Stop keepalive thread
        stopKeepaliveTimer(update_timer)

        while True:
            _zk_conn = kazoo.client.KazooClient(hosts=config['zookeeper'])
            try:
                _zk_conn.start()
            except Exception:
                # Connection attempt failed; wait a moment and try again.
                # (Narrowed from a bare except so interpreter-exit exceptions
                # are not swallowed by the retry loop.)
                time.sleep(1)
            else:
                zk_conn = _zk_conn
                break
    elif state == kazoo.client.KazooState.CONNECTED:
        ansiiprint.echo('Connection to Zookeeper started', '', 'o')

        # Start keepalive thread
        update_timer = createKeepaliveTimer()


zk_conn.add_listener(zk_listener)
# Cleanup function
def cleanup(signum, frame):
    """Signal handler: mark the node stopped, tear down, and exit.

    Writes 'stop' to this node's daemonstate key, closes the Zookeeper
    connection, stops the keepalive timer and exits with status 0.

    Args:
        signum: Number of the signal being handled (unused).
        frame: Interrupted stack frame (unused).

    Raises:
        SystemExit: Always, with exit code 0, once teardown has been attempted.
    """
    ansiiprint.echo('Terminating daemon', '', 'e')

    # Set stop state in Zookeeper. This is best-effort: a failure to write
    # (e.g. the connection is already gone) must not prevent the remaining
    # teardown steps from running.
    try:
        zkhandler.writedata(zk_conn, { '/nodes/{}/daemonstate'.format(myhostname): 'stop' })
    except Exception as e:
        ansiiprint.echo('Failed to set stop state in Zookeeper: {}'.format(e), '', 'e')

    # Close the Zookeeper connection; errors are ignored on purpose since we
    # are shutting down regardless.
    try:
        zk_conn.stop()
        zk_conn.close()
    except Exception:
        pass

    # Stop keepalive thread
    stopKeepaliveTimer(update_timer)

    # Exit
    sys.exit(0)


# Handle signals gracefully
signal.signal(signal.SIGTERM, cleanup)
signal.signal(signal.SIGINT, cleanup)
signal.signal(signal.SIGQUIT, cleanup)
2018-06-16 22:30:17 -04:00
# Gather useful data about our host for staticdata
# Static data format (in list order): 'cpu_count', 'kernel', 'os', 'arch'
staticdata = []
staticdata.append(str(psutil.cpu_count()))
# uname flags, in stored order: -r kernel release, -o operating system, -m machine architecture
for uname_flag in ('-r', '-o', '-m'):
    staticdata.append(subprocess.run(['uname', uname_flag], stdout=subprocess.PIPE).stdout.decode('ascii').strip())

# Print static data on start. The stored order is left untouched (it is also
# written to Zookeeper); each label picks the list index that actually holds
# its value: arch is staticdata[3] and kernel is staticdata[1].
print('{0}Node hostname:{1} {2}'.format(ansiiprint.bold(), ansiiprint.end(), myhostname))
print('{0}IPMI hostname:{1} {2}'.format(ansiiprint.bold(), ansiiprint.end(), config['ipmi_hostname']))
print('{0}Machine details:{1}'.format(ansiiprint.bold(), ansiiprint.end()))
print(' {0}CPUs:{1} {2}'.format(ansiiprint.bold(), ansiiprint.end(), staticdata[0]))
print(' {0}Arch:{1} {2}'.format(ansiiprint.bold(), ansiiprint.end(), staticdata[3]))
print(' {0}OS:{1} {2}'.format(ansiiprint.bold(), ansiiprint.end(), staticdata[2]))
print(' {0}Kernel:{1} {2}'.format(ansiiprint.bold(), ansiiprint.end(), staticdata[1]))
2018-06-11 01:58:40 -04:00
2018-05-31 22:55:44 -04:00
# Make sure this node has an entry under /nodes in Zookeeper: refresh its
# static data when it is already there, otherwise create the full key set in
# a single transaction.
_node_root = '/nodes/{}'.format(myhostname)
if not zk_conn.exists(_node_root):
    print("Node is " + ansiiprint.red() + "absent" + ansiiprint.end() + " in Zookeeper; adding new node")
    keepalive_time = int(time.time())
    transaction = zk_conn.transaction()
    transaction.create(_node_root, 'hypervisor'.encode('ascii'))
    # Child keys and their initial values, in creation order
    _initial_keys = (
        # Basic state information
        ('daemonstate', 'stop'),
        ('domainstate', 'ready'),
        ('staticdata', ' '.join(staticdata)),
        ('memfree', '0'),
        ('memused', '0'),
        ('memalloc', '0'),
        ('cpuload', '0.0'),
        ('runningdomains', ''),
        ('domainscount', '0'),
        # Keepalives and fencing information
        ('keepalive', str(keepalive_time)),
        ('ipmihostname', config['ipmi_hostname']),
        ('ipmiusername', config['ipmi_username']),
        ('ipmipassword', config['ipmi_password']),
    )
    for _key, _value in _initial_keys:
        transaction.create('{}/{}'.format(_node_root, _key), _value.encode('ascii'))
    transaction.commit()
else:
    print("Node is " + ansiiprint.green() + "present" + ansiiprint.end() + " in Zookeeper")
    # Update static data just in case it's changed
    zkhandler.writedata(zk_conn, { '/nodes/{}/staticdata'.format(myhostname): ' '.join(staticdata) })

# Whichever branch ran, the daemon now reports itself as initialising
zkhandler.writedata(zk_conn, { '/nodes/{}/daemonstate'.format(myhostname): 'init' })
2018-06-12 12:07:57 -04:00
2018-05-31 23:28:26 -04:00
# Shared state filled in by the Zookeeper watches below:
# t_node maps node name -> NodeInstance, s_domain maps domain -> VMInstance,
# and the two lists hold the most recent children of /nodes and /domains.
t_node = {}
s_domain = {}
node_list = []
domain_list = []
2018-05-31 22:55:44 -04:00
@zk_conn.ChildrenWatch('/nodes')
def updatenodes(new_node_list):
    """Children watch on /nodes: track the node list and its instances.

    Stores the new list in the module-level ``node_list``, prints it, then
    for each listed node either hands the shared ``t_node`` mapping to the
    existing instance or creates a NodeInstance for a node not seen before.

    Args:
        new_node_list: Current child names of /nodes.
    """
    global node_list
    node_list = new_node_list
    print(ansiiprint.blue() + 'Node list: ' + ansiiprint.end() + '{}'.format(' '.join(node_list)))

    for node in node_list:
        if node not in t_node:
            # First time we see this node: build its instance
            t_node[node] = NodeInstance.NodeInstance(myhostname, node, t_node, s_domain, zk_conn, config)
            continue
        # Known node: pass it the current mapping of nodes
        t_node[node].updatenodelist(t_node)
2018-05-31 20:26:44 -04:00
@zk_conn.ChildrenWatch('/domains')
def updatedomains(new_domain_list):
    """Children watch on /domains: track the domain list and its instances.

    Stores the new list in the module-level ``domain_list``, prints it, and
    creates a VMInstance for every domain not yet present in ``s_domain``,
    after which each known node is handed the ``s_domain`` mapping.

    Args:
        new_domain_list: Current child names of /domains.
    """
    global domain_list
    domain_list = new_domain_list
    print(ansiiprint.blue() + 'Domain list: ' + ansiiprint.end() + '{}'.format(' '.join(domain_list)))

    for domain in domain_list:
        if domain not in s_domain:
            s_domain[domain] = VMInstance.VMInstance(domain, zk_conn, config, t_node[myhostname])
            # NOTE(review): the original indentation was not available; this
            # loop is assumed to run once per newly added domain -- confirm.
            for node in node_list:
                if node in t_node:
                    t_node[node].updatedomainlist(s_domain)
2018-05-31 20:26:44 -04:00
# The keepalive job is the update_zookeeper method of this host's own
# node instance
this_node = t_node[myhostname]
update_zookeeper = this_node.update_zookeeper


# Create timer to update this node in Zookeeper
def createKeepaliveTimer():
    """Start a background scheduler running the keepalive job.

    The job (``update_zookeeper``) is scheduled at a fixed interval of
    ``config['keepalive_interval']`` seconds.

    Returns:
        The started BackgroundScheduler, to be handed to stopKeepaliveTimer().
    """
    interval = int(config['keepalive_interval'])
    ansiiprint.echo('Starting keepalive timer ({} second interval)'.format(interval), '', 'o')

    scheduler = apscheduler.schedulers.background.BackgroundScheduler()
    scheduler.add_job(update_zookeeper, 'interval', seconds=interval)
    scheduler.start()
    return scheduler
def stopKeepaliveTimer(update_timer):
    """Announce and shut down a keepalive scheduler.

    Args:
        update_timer: Scheduler object as returned by createKeepaliveTimer().
    """
    ansiiprint.echo('Stopping keepalive timer', '', 'c')
    update_timer.shutdown()
# Start keepalive thread
update_timer = createKeepaliveTimer()

# Tick loop: the real work happens in the scheduler and the Zookeeper watches,
# so the main thread only sleeps in short slices. Any exception raised while
# sleeping (including the SystemExit from the signal handler) ends the loop.
while True:
    try:
        time.sleep(0.1)
    except BaseException:
        break