diff --git a/lint b/lint index a7638001..808e3e6b 100755 --- a/lint +++ b/lint @@ -6,7 +6,7 @@ if ! which flake8 &>/dev/null; then fi flake8 \ - --ignore=E501 \ + --ignore=E501,E241 \ --exclude=debian,api-daemon/migrations/versions,api-daemon/provisioner/examples ret=$? if [[ $ret -eq 0 ]]; then diff --git a/node-daemon/pvcnoded.py b/node-daemon/pvcnoded.py index 20c1734d..49dba9c2 100755 --- a/node-daemon/pvcnoded.py +++ b/node-daemon/pvcnoded.py @@ -20,3 +20,5 @@ ############################################################################### import pvcnoded.Daemon # noqa: F401 + +pvcnoded.Daemon.entrypoint() diff --git a/node-daemon/pvcnoded.sample.yaml b/node-daemon/pvcnoded.sample.yaml index 37097538..360a14b8 100644 --- a/node-daemon/pvcnoded.sample.yaml +++ b/node-daemon/pvcnoded.sample.yaml @@ -182,15 +182,15 @@ pvc: device: ens4 # mtu: Upstream interface MTU; use 9000 for jumbo frames (requires switch support) mtu: 1500 - # address: Upstream interface IP address, options: None, by-id, / - address: None + # address: Upstream interface IP address, options: by-id, / + address: by-id # cluster: Cluster (VNIC) physical interface device cluster: # device: Cluster (VNIC) interface device name device: ens4 # mtu: Cluster (VNIC) interface MTU; use 9000 for jumbo frames (requires switch support) mtu: 1500 - # address: Cluster (VNIC) interface IP address, options: None, by-id, / + # address: Cluster (VNIC) interface IP address, options: by-id, / address: by-id # storage: Storage (Ceph OSD) physical interface device storage: @@ -198,7 +198,7 @@ pvc: device: ens4 # mtu: Storage (Ceph OSD) interface MTU; use 9000 for jumbo frames (requires switch support) mtu: 1500 - # address: Storage (Ceph OSD) interface IP address, options: None, by-id, / + # address: Storage (Ceph OSD) interface IP address, options: by-id, / address: by-id # storage; PVC storage configuration # OPTIONAL if enable_storage: False diff --git a/node-daemon/pvcnoded/CephInstance.py 
b/node-daemon/pvcnoded/CephInstance.py deleted file mode 100644 index f7214302..00000000 --- a/node-daemon/pvcnoded/CephInstance.py +++ /dev/null @@ -1,428 +0,0 @@ -#!/usr/bin/env python3 - -# CephInstance.py - Class implementing a PVC node Ceph instance -# Part of the Parallel Virtual Cluster (PVC) system -# -# Copyright (C) 2018-2021 Joshua M. Boniface -# -# This program is free software: you can redistribute it and/or modify -# it under the terms of the GNU General Public License as published by -# the Free Software Foundation, version 3. -# -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. -# -# You should have received a copy of the GNU General Public License -# along with this program. If not, see . -# -############################################################################### - -import time -import json -import psutil - -import daemon_lib.common as common - - -class CephOSDInstance(object): - def __init__(self, zkhandler, this_node, osd_id): - self.zkhandler = zkhandler - self.this_node = this_node - self.osd_id = osd_id - self.node = None - self.size = None - self.stats = dict() - - @self.zkhandler.zk_conn.DataWatch(self.zkhandler.schema.path('osd.node', self.osd_id)) - def watch_osd_node(data, stat, event=''): - if event and event.type == 'DELETED': - # The key has been deleted after existing before; terminate this watcher - # because this class instance is about to be reaped in Daemon.py - return False - - try: - data = data.decode('ascii') - except AttributeError: - data = '' - - if data and data != self.node: - self.node = data - - @self.zkhandler.zk_conn.DataWatch(self.zkhandler.schema.path('osd.stats', self.osd_id)) - def watch_osd_stats(data, stat, event=''): - if event and event.type == 'DELETED': - # The key has been deleted after existing before; 
terminate this watcher - # because this class instance is about to be reaped in Daemon.py - return False - - try: - data = data.decode('ascii') - except AttributeError: - data = '' - - if data and data != self.stats: - self.stats = json.loads(data) - - -def add_osd(zkhandler, logger, node, device, weight): - # We are ready to create a new OSD on this node - logger.out('Creating new OSD disk on block device {}'.format(device), state='i') - try: - # 1. Create an OSD; we do this so we know what ID will be gen'd - retcode, stdout, stderr = common.run_os_command('ceph osd create') - if retcode: - print('ceph osd create') - print(stdout) - print(stderr) - raise - osd_id = stdout.rstrip() - - # 2. Remove that newly-created OSD - retcode, stdout, stderr = common.run_os_command('ceph osd rm {}'.format(osd_id)) - if retcode: - print('ceph osd rm') - print(stdout) - print(stderr) - raise - - # 3a. Zap the disk to ensure it is ready to go - logger.out('Zapping disk {}'.format(device), state='i') - retcode, stdout, stderr = common.run_os_command('ceph-volume lvm zap --destroy {}'.format(device)) - if retcode: - print('ceph-volume lvm zap') - print(stdout) - print(stderr) - raise - - # 3b. Create the OSD for real - logger.out('Preparing LVM for new OSD disk with ID {} on {}'.format(osd_id, device), state='i') - retcode, stdout, stderr = common.run_os_command( - 'ceph-volume lvm prepare --bluestore --data {device}'.format( - osdid=osd_id, - device=device - ) - ) - if retcode: - print('ceph-volume lvm prepare') - print(stdout) - print(stderr) - raise - - # 4a. 
Get OSD FSID - logger.out('Getting OSD FSID for ID {} on {}'.format(osd_id, device), state='i') - retcode, stdout, stderr = common.run_os_command( - 'ceph-volume lvm list {device}'.format( - osdid=osd_id, - device=device - ) - ) - for line in stdout.split('\n'): - if 'osd fsid' in line: - osd_fsid = line.split()[-1] - - if not osd_fsid: - print('ceph-volume lvm list') - print('Could not find OSD fsid in data:') - print(stdout) - print(stderr) - raise - - # 4b. Activate the OSD - logger.out('Activating new OSD disk with ID {}'.format(osd_id, device), state='i') - retcode, stdout, stderr = common.run_os_command( - 'ceph-volume lvm activate --bluestore {osdid} {osdfsid}'.format( - osdid=osd_id, - osdfsid=osd_fsid - ) - ) - if retcode: - print('ceph-volume lvm activate') - print(stdout) - print(stderr) - raise - - # 5. Add it to the crush map - logger.out('Adding new OSD disk with ID {} to CRUSH map'.format(osd_id), state='i') - retcode, stdout, stderr = common.run_os_command( - 'ceph osd crush add osd.{osdid} {weight} root=default host={node}'.format( - osdid=osd_id, - weight=weight, - node=node - ) - ) - if retcode: - print('ceph osd crush add') - print(stdout) - print(stderr) - raise - time.sleep(0.5) - - # 6. Verify it started - retcode, stdout, stderr = common.run_os_command( - 'systemctl status ceph-osd@{osdid}'.format( - osdid=osd_id - ) - ) - if retcode: - print('systemctl status') - print(stdout) - print(stderr) - raise - - # 7. 
Add the new OSD to the list - logger.out('Adding new OSD disk with ID {} to Zookeeper'.format(osd_id), state='i') - zkhandler.write([ - (('osd', osd_id), ''), - (('osd.node', osd_id), node), - (('osd.device', osd_id), device), - (('osd.stats', osd_id), '{}'), - ]) - - # Log it - logger.out('Created new OSD disk with ID {}'.format(osd_id), state='o') - return True - except Exception as e: - # Log it - logger.out('Failed to create new OSD disk: {}'.format(e), state='e') - return False - - -def remove_osd(zkhandler, logger, osd_id, osd_obj): - logger.out('Removing OSD disk {}'.format(osd_id), state='i') - try: - # 1. Verify the OSD is present - retcode, stdout, stderr = common.run_os_command('ceph osd ls') - osd_list = stdout.split('\n') - if osd_id not in osd_list: - logger.out('Could not find OSD {} in the cluster'.format(osd_id), state='e') - return True - - # 1. Set the OSD out so it will flush - logger.out('Setting out OSD disk with ID {}'.format(osd_id), state='i') - retcode, stdout, stderr = common.run_os_command('ceph osd out {}'.format(osd_id)) - if retcode: - print('ceph osd out') - print(stdout) - print(stderr) - raise - - # 2. Wait for the OSD to flush - logger.out('Flushing OSD disk with ID {}'.format(osd_id), state='i') - osd_string = str() - while True: - try: - retcode, stdout, stderr = common.run_os_command('ceph pg dump osds --format json') - dump_string = json.loads(stdout) - for osd in dump_string: - if str(osd['osd']) == osd_id: - osd_string = osd - num_pgs = osd_string['num_pgs'] - if num_pgs > 0: - time.sleep(5) - else: - raise - except Exception: - break - - # 3. 
Stop the OSD process and wait for it to be terminated - logger.out('Stopping OSD disk with ID {}'.format(osd_id), state='i') - retcode, stdout, stderr = common.run_os_command('systemctl stop ceph-osd@{}'.format(osd_id)) - if retcode: - print('systemctl stop') - print(stdout) - print(stderr) - raise - - # FIXME: There has to be a better way to do this /shrug - while True: - is_osd_up = False - # Find if there is a process named ceph-osd with arg '--id {id}' - for p in psutil.process_iter(attrs=['name', 'cmdline']): - if 'ceph-osd' == p.info['name'] and '--id {}'.format(osd_id) in ' '.join(p.info['cmdline']): - is_osd_up = True - # If there isn't, continue - if not is_osd_up: - break - - # 4. Determine the block devices - retcode, stdout, stderr = common.run_os_command('readlink /var/lib/ceph/osd/ceph-{}/block'.format(osd_id)) - vg_name = stdout.split('/')[-2] # e.g. /dev/ceph-/osd-block- - retcode, stdout, stderr = common.run_os_command('vgs --separator , --noheadings -o pv_name {}'.format(vg_name)) - pv_block = stdout.strip() - - # 5. Zap the volumes - logger.out('Zapping OSD disk with ID {} on {}'.format(osd_id, pv_block), state='i') - retcode, stdout, stderr = common.run_os_command('ceph-volume lvm zap --destroy {}'.format(pv_block)) - if retcode: - print('ceph-volume lvm zap') - print(stdout) - print(stderr) - raise - - # 6. Purge the OSD from Ceph - logger.out('Purging OSD disk with ID {}'.format(osd_id), state='i') - retcode, stdout, stderr = common.run_os_command('ceph osd purge {} --yes-i-really-mean-it'.format(osd_id)) - if retcode: - print('ceph osd purge') - print(stdout) - print(stderr) - raise - - # 7. 
Delete OSD from ZK - logger.out('Deleting OSD disk with ID {} from Zookeeper'.format(osd_id), state='i') - zkhandler.delete(('osd', osd_id), recursive=True) - - # Log it - logger.out('Removed OSD disk with ID {}'.format(osd_id), state='o') - return True - except Exception as e: - # Log it - logger.out('Failed to purge OSD disk with ID {}: {}'.format(osd_id, e), state='e') - return False - - -class CephPoolInstance(object): - def __init__(self, zkhandler, this_node, name): - self.zkhandler = zkhandler - self.this_node = this_node - self.name = name - self.pgs = '' - self.stats = dict() - - @self.zkhandler.zk_conn.DataWatch(self.zkhandler.schema.path('pool.pgs', self.name)) - def watch_pool_node(data, stat, event=''): - if event and event.type == 'DELETED': - # The key has been deleted after existing before; terminate this watcher - # because this class instance is about to be reaped in Daemon.py - return False - - try: - data = data.decode('ascii') - except AttributeError: - data = '' - - if data and data != self.pgs: - self.pgs = data - - @self.zkhandler.zk_conn.DataWatch(self.zkhandler.schema.path('pool.stats', self.name)) - def watch_pool_stats(data, stat, event=''): - if event and event.type == 'DELETED': - # The key has been deleted after existing before; terminate this watcher - # because this class instance is about to be reaped in Daemon.py - return False - - try: - data = data.decode('ascii') - except AttributeError: - data = '' - - if data and data != self.stats: - self.stats = json.loads(data) - - -class CephVolumeInstance(object): - def __init__(self, zkhandler, this_node, pool, name): - self.zkhandler = zkhandler - self.this_node = this_node - self.pool = pool - self.name = name - self.stats = dict() - - @self.zkhandler.zk_conn.DataWatch(self.zkhandler.schema.path('volume.stats', f'{self.pool}/{self.name}')) - def watch_volume_stats(data, stat, event=''): - if event and event.type == 'DELETED': - # The key has been deleted after existing before; 
terminate this watcher - # because this class instance is about to be reaped in Daemon.py - return False - - try: - data = data.decode('ascii') - except AttributeError: - data = '' - - if data and data != self.stats: - self.stats = json.loads(data) - - -class CephSnapshotInstance(object): - def __init__(self, zkhandler, this_node, pool, volume, name): - self.zkhandler = zkhandler - self.this_node = this_node - self.pool = pool - self.volume = volume - self.name = name - self.stats = dict() - - @self.zkhandler.zk_conn.DataWatch(self.zkhandler.schema.path('snapshot.stats', f'{self.pool}/{self.volume}/{self.name}')) - def watch_snapshot_stats(data, stat, event=''): - if event and event.type == 'DELETED': - # The key has been deleted after existing before; terminate this watcher - # because this class instance is about to be reaped in Daemon.py - return False - - try: - data = data.decode('ascii') - except AttributeError: - data = '' - - if data and data != self.stats: - self.stats = json.loads(data) - - -# Primary command function -# This command pipe is only used for OSD adds and removes -def run_command(zkhandler, logger, this_node, data, d_osd): - # Get the command and args - command, args = data.split() - - # Adding a new OSD - if command == 'osd_add': - node, device, weight = args.split(',') - if node == this_node.name: - # Lock the command queue - zk_lock = zkhandler.writelock('base.cmd.ceph') - with zk_lock: - # Add the OSD - result = add_osd(zkhandler, logger, node, device, weight) - # Command succeeded - if result: - # Update the command queue - zkhandler.write([ - ('base.cmd.ceph', 'success-{}'.format(data)) - ]) - # Command failed - else: - # Update the command queue - zkhandler.write([ - ('base.cmd.ceph', 'failure-{}'.format(data)) - ]) - # Wait 1 seconds before we free the lock, to ensure the client hits the lock - time.sleep(1) - - # Removing an OSD - elif command == 'osd_remove': - osd_id = args - - # Verify osd_id is in the list - if d_osd[osd_id] and 
d_osd[osd_id].node == this_node.name: - # Lock the command queue - zk_lock = zkhandler.writelock('base.cmd.ceph') - with zk_lock: - # Remove the OSD - result = remove_osd(zkhandler, logger, osd_id, d_osd[osd_id]) - # Command succeeded - if result: - # Update the command queue - zkhandler.write([ - ('base.cmd.ceph', 'success-{}'.format(data)) - ]) - # Command failed - else: - # Update the command queue - zkhandler.write([ - ('base.cmd.ceph', 'failure-{}'.format(data)) - ]) - # Wait 1 seconds before we free the lock, to ensure the client hits the lock - time.sleep(1) diff --git a/node-daemon/pvcnoded/Daemon.py b/node-daemon/pvcnoded/Daemon.py index 01d4cf32..baf9def6 100644 --- a/node-daemon/pvcnoded/Daemon.py +++ b/node-daemon/pvcnoded/Daemon.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 -# Daemon.py - Node daemon +# Daemon.py - PVC Node daemon main entrypoing # Part of the Parallel Virtual Cluster (PVC) system # # Copyright (C) 2018-2021 Joshua M. Boniface @@ -19,1975 +19,680 @@ # ############################################################################### -import kazoo.client -import libvirt -import sys -import os -import signal -import psutil -import subprocess -import time -import re -import yaml -import json +import pvcnoded.util.keepalive +import pvcnoded.util.config +import pvcnoded.util.fencing +import pvcnoded.util.networking +import pvcnoded.util.services +import pvcnoded.util.libvirt +import pvcnoded.util.zookeeper -from socket import gethostname -from datetime import datetime -from threading import Thread -from ipaddress import ip_address, ip_network -from apscheduler.schedulers.background import BackgroundScheduler -from distutils.util import strtobool -from queue import Queue -from xml.etree import ElementTree -from rados import Rados +import pvcnoded.objects.DNSAggregatorInstance as DNSAggregatorInstance +import pvcnoded.objects.MetadataAPIInstance as MetadataAPIInstance +import pvcnoded.objects.VMInstance as VMInstance +import 
pvcnoded.objects.NodeInstance as NodeInstance +import pvcnoded.objects.VXNetworkInstance as VXNetworkInstance +import pvcnoded.objects.SRIOVVFInstance as SRIOVVFInstance +import pvcnoded.objects.CephInstance as CephInstance -from daemon_lib.zkhandler import ZKHandler - -import pvcnoded.fencing as fencing import daemon_lib.log as log import daemon_lib.common as common -import pvcnoded.VMInstance as VMInstance -import pvcnoded.NodeInstance as NodeInstance -import pvcnoded.VXNetworkInstance as VXNetworkInstance -import pvcnoded.SRIOVVFInstance as SRIOVVFInstance -import pvcnoded.DNSAggregatorInstance as DNSAggregatorInstance -import pvcnoded.CephInstance as CephInstance -import pvcnoded.MetadataAPIInstance as MetadataAPIInstance +from time import sleep +from distutils.util import strtobool -# Version string for startup output +import os +import sys +import signal +import re +import json + +# Daemon version version = '0.9.32' -############################################################################### -# PVCD - node daemon startup program -############################################################################### -# -# The PVC daemon starts a node and configures all the required components for -# the node to run. It determines which of the 3 daemon modes it should be in -# during initial setup based on hostname and the config file, and then starts -# any required services. 
The 3 daemon modes are: -# * leader: the cluster leader, follows the Zookeeper leader -# * coordinator: a Zookeeper cluster member -# * hypervisor: a hypervisor without any cluster intelligence -# -############################################################################### -############################################################################### -# Daemon functions -############################################################################### +########################################################## +# Entrypoint +########################################################## -# Ensure update_timer, this_node, and d_domain are None until they're set for real -# Ensures cleanup() doesn't fail due to these items not being created yet -update_timer = None -this_node = None -d_domain = None +def entrypoint(): + keepalive_timer = None + # Get our configuration + config = pvcnoded.util.config.get_configuration() + config['pvcnoded_version'] = version -# Create timer to update this node in Zookeeper -def startKeepaliveTimer(): - # Create our timer object - update_timer = BackgroundScheduler() - interval = int(config['keepalive_interval']) - logger.out('Starting keepalive timer ({} second interval)'.format(interval), state='s') - update_timer.add_job(node_keepalive, 'interval', seconds=interval) - update_timer.start() - node_keepalive() - return update_timer + # Set some useful booleans for later (fewer characters) + debug = config['debug'] + if debug: + print('DEBUG MODE ENABLED') + # Create and validate our directories + pvcnoded.util.config.validate_directories(config) -def stopKeepaliveTimer(): - global update_timer - try: - update_timer.shutdown() - logger.out('Stopping keepalive timer', state='s') - except Exception: - pass + # Set up the logger instance + logger = log.Logger(config) + # Print our startup message + logger.out('') + logger.out('|----------------------------------------------------------|') + logger.out('| |') + logger.out('| ███████████ ▜█▙ 
▟█▛ █████ █ █ █ |') + logger.out('| ██ ▜█▙ ▟█▛ ██ |') + logger.out('| ███████████ ▜█▙ ▟█▛ ██ |') + logger.out('| ██ ▜█▙▟█▛ ███████████ |') + logger.out('| |') + logger.out('|----------------------------------------------------------|') + logger.out('| Parallel Virtual Cluster node daemon v{0: <18} |'.format(version)) + logger.out('| Debug: {0: <49} |'.format(str(config['debug']))) + logger.out('| FQDN: {0: <50} |'.format(config['node_fqdn'])) + logger.out('| Host: {0: <50} |'.format(config['node_hostname'])) + logger.out('| ID: {0: <52} |'.format(config['node_id'])) + logger.out('| IPMI hostname: {0: <41} |'.format(config['ipmi_hostname'])) + logger.out('| Machine details: |') + logger.out('| CPUs: {0: <48} |'.format(config['static_data'][0])) + logger.out('| Arch: {0: <48} |'.format(config['static_data'][3])) + logger.out('| OS: {0: <50} |'.format(config['static_data'][2])) + logger.out('| Kernel: {0: <46} |'.format(config['static_data'][1])) + logger.out('|----------------------------------------------------------|') + logger.out('') + logger.out(f'Starting pvcnoded on host {config["node_fqdn"]}', state='s') -############################################################################### -# PHASE 1a - Configuration parsing -############################################################################### - -# Get the config file variable from the environment -try: - pvcnoded_config_file = os.environ['PVCD_CONFIG_FILE'] -except Exception: - print('ERROR: The "PVCD_CONFIG_FILE" environment variable must be set before starting pvcnoded.') - os._exit(1) - -# Set local hostname and domain variables -myfqdn = gethostname() -myhostname = myfqdn.split('.', 1)[0] -mydomainname = ''.join(myfqdn.split('.', 1)[1:]) -try: - mynodeid = re.findall(r'\d+', myhostname)[-1] -except IndexError: - mynodeid = 1 - -# Maintenance mode off by default -maintenance = False - -# Gather useful data about our host -# Static data format: 'cpu_count', 'arch', 'os', 'kernel' -staticdata = [] 
-staticdata.append(str(psutil.cpu_count())) -staticdata.append(subprocess.run(['uname', '-r'], stdout=subprocess.PIPE).stdout.decode('ascii').strip()) -staticdata.append(subprocess.run(['uname', '-o'], stdout=subprocess.PIPE).stdout.decode('ascii').strip()) -staticdata.append(subprocess.run(['uname', '-m'], stdout=subprocess.PIPE).stdout.decode('ascii').strip()) - - -# Read and parse the config file -def readConfig(pvcnoded_config_file, myhostname): - print('Loading configuration from file "{}"'.format(pvcnoded_config_file)) - - with open(pvcnoded_config_file, 'r') as cfgfile: - try: - o_config = yaml.load(cfgfile, Loader=yaml.SafeLoader) - except Exception as e: - print('ERROR: Failed to parse configuration file: {}'.format(e)) - os._exit(1) - - # Handle the basic config (hypervisor-only) - try: - config_general = { - 'node': o_config['pvc']['node'], - 'coordinators': o_config['pvc']['cluster']['coordinators'], - 'enable_hypervisor': o_config['pvc']['functions']['enable_hypervisor'], - 'enable_networking': o_config['pvc']['functions']['enable_networking'], - 'enable_storage': o_config['pvc']['functions']['enable_storage'], - 'enable_api': o_config['pvc']['functions']['enable_api'], - 'dynamic_directory': o_config['pvc']['system']['configuration']['directories']['dynamic_directory'], - 'log_directory': o_config['pvc']['system']['configuration']['directories']['log_directory'], - 'console_log_directory': o_config['pvc']['system']['configuration']['directories']['console_log_directory'], - 'file_logging': o_config['pvc']['system']['configuration']['logging']['file_logging'], - 'stdout_logging': o_config['pvc']['system']['configuration']['logging']['stdout_logging'], - 'zookeeper_logging': o_config['pvc']['system']['configuration']['logging'].get('zookeeper_logging', False), - 'log_colours': o_config['pvc']['system']['configuration']['logging']['log_colours'], - 'log_dates': o_config['pvc']['system']['configuration']['logging']['log_dates'], - 'log_keepalives': 
o_config['pvc']['system']['configuration']['logging']['log_keepalives'], - 'log_keepalive_cluster_details': o_config['pvc']['system']['configuration']['logging']['log_keepalive_cluster_details'], - 'log_keepalive_storage_details': o_config['pvc']['system']['configuration']['logging']['log_keepalive_storage_details'], - 'console_log_lines': o_config['pvc']['system']['configuration']['logging']['console_log_lines'], - 'node_log_lines': o_config['pvc']['system']['configuration']['logging'].get('node_log_lines', 0), - 'vm_shutdown_timeout': int(o_config['pvc']['system']['intervals']['vm_shutdown_timeout']), - 'keepalive_interval': int(o_config['pvc']['system']['intervals']['keepalive_interval']), - 'fence_intervals': int(o_config['pvc']['system']['intervals']['fence_intervals']), - 'suicide_intervals': int(o_config['pvc']['system']['intervals']['suicide_intervals']), - 'successful_fence': o_config['pvc']['system']['fencing']['actions']['successful_fence'], - 'failed_fence': o_config['pvc']['system']['fencing']['actions']['failed_fence'], - 'migration_target_selector': o_config['pvc']['system']['migration']['target_selector'], - 'ipmi_hostname': o_config['pvc']['system']['fencing']['ipmi']['host'], - 'ipmi_username': o_config['pvc']['system']['fencing']['ipmi']['user'], - 'ipmi_password': o_config['pvc']['system']['fencing']['ipmi']['pass'] - } - except Exception as e: - print('ERROR: Failed to load configuration: {}'.format(e)) - cleanup(failure=True) - config = config_general - - # Handle debugging config - try: - config_debug = { - 'debug': o_config['pvc']['debug'] - } - except Exception: - config_debug = { - 'debug': False - } - config = {**config, **config_debug} - - # Handle the networking config if config['enable_networking']: - try: - config_networking = { - 'cluster_domain': o_config['pvc']['cluster']['networks']['cluster']['domain'], - 'vni_floating_ip': o_config['pvc']['cluster']['networks']['cluster']['floating_ip'], - 'vni_network': 
o_config['pvc']['cluster']['networks']['cluster']['network'], - 'storage_domain': o_config['pvc']['cluster']['networks']['storage']['domain'], - 'storage_floating_ip': o_config['pvc']['cluster']['networks']['storage']['floating_ip'], - 'storage_network': o_config['pvc']['cluster']['networks']['storage']['network'], - 'upstream_domain': o_config['pvc']['cluster']['networks']['upstream']['domain'], - 'upstream_floating_ip': o_config['pvc']['cluster']['networks']['upstream']['floating_ip'], - 'upstream_network': o_config['pvc']['cluster']['networks']['upstream']['network'], - 'upstream_gateway': o_config['pvc']['cluster']['networks']['upstream']['gateway'], - 'pdns_postgresql_host': o_config['pvc']['coordinator']['dns']['database']['host'], - 'pdns_postgresql_port': o_config['pvc']['coordinator']['dns']['database']['port'], - 'pdns_postgresql_dbname': o_config['pvc']['coordinator']['dns']['database']['name'], - 'pdns_postgresql_user': o_config['pvc']['coordinator']['dns']['database']['user'], - 'pdns_postgresql_password': o_config['pvc']['coordinator']['dns']['database']['pass'], - 'metadata_postgresql_host': o_config['pvc']['coordinator']['metadata']['database']['host'], - 'metadata_postgresql_port': o_config['pvc']['coordinator']['metadata']['database']['port'], - 'metadata_postgresql_dbname': o_config['pvc']['coordinator']['metadata']['database']['name'], - 'metadata_postgresql_user': o_config['pvc']['coordinator']['metadata']['database']['user'], - 'metadata_postgresql_password': o_config['pvc']['coordinator']['metadata']['database']['pass'], - 'bridge_dev': o_config['pvc']['system']['configuration']['networking']['bridge_device'], - 'vni_dev': o_config['pvc']['system']['configuration']['networking']['cluster']['device'], - 'vni_mtu': o_config['pvc']['system']['configuration']['networking']['cluster']['mtu'], - 'vni_dev_ip': o_config['pvc']['system']['configuration']['networking']['cluster']['address'], - 'storage_dev': 
o_config['pvc']['system']['configuration']['networking']['storage']['device'], - 'storage_mtu': o_config['pvc']['system']['configuration']['networking']['storage']['mtu'], - 'storage_dev_ip': o_config['pvc']['system']['configuration']['networking']['storage']['address'], - 'upstream_dev': o_config['pvc']['system']['configuration']['networking']['upstream']['device'], - 'upstream_mtu': o_config['pvc']['system']['configuration']['networking']['upstream']['mtu'], - 'upstream_dev_ip': o_config['pvc']['system']['configuration']['networking']['upstream']['address'], - } + if config['enable_sriov']: + # Set up SR-IOV devices + pvcnoded.util.networking.setup_sriov(logger, config) - # Check if SR-IOV is enabled and activate - config_networking['enable_sriov'] = o_config['pvc']['system']['configuration']['networking'].get('sriov_enable', False) - if config_networking['enable_sriov']: - config_networking['sriov_device'] = list(o_config['pvc']['system']['configuration']['networking']['sriov_device']) + # Set up our interfaces + pvcnoded.util.networking.setup_interfaces(logger, config) - except Exception as e: - print('ERROR: Failed to load configuration: {}'.format(e)) - cleanup(failure=True) - config = {**config, **config_networking} + # Get list of coordinator nodes + coordinator_nodes = config['coordinators'] - # Create the by-id address entries - for net in ['vni', 'storage', 'upstream']: - address_key = '{}_dev_ip'.format(net) - floating_key = '{}_floating_ip'.format(net) - network_key = '{}_network'.format(net) - - # Verify the network provided is valid - try: - network = ip_network(config[network_key]) - except Exception: - print('ERROR: Network address {} for {} is not valid!'.format(config[network_key], network_key)) - cleanup(failure=True) - - # If we should be autoselected - if config[address_key] == 'by-id': - # Construct an IP from the relevant network - # The NodeID starts at 1, but indexes start at 0 - address_id = int(mynodeid) - 1 - # Grab the nth address from 
the network - config[address_key] = '{}/{}'.format(list(network.hosts())[address_id], network.prefixlen) - - # Verify that the floating IP is valid - - try: - # Set the ipaddr - floating_addr = ip_address(config[floating_key].split('/')[0]) - # Verify we're in the network - if floating_addr not in list(network.hosts()): - raise - except Exception: - print('ERROR: Floating address {} for {} is not valid!'.format(config[floating_key], floating_key)) - cleanup(failure=True) - - # Handle the storage config - if config['enable_storage']: - try: - config_storage = { - 'ceph_config_file': o_config['pvc']['system']['configuration']['storage']['ceph_config_file'], - 'ceph_admin_keyring': o_config['pvc']['system']['configuration']['storage']['ceph_admin_keyring'] - } - except Exception as e: - print('ERROR: Failed to load configuration: {}'.format(e)) - cleanup(failure=True) - config = {**config, **config_storage} - - # Handle an empty ipmi_hostname - if config['ipmi_hostname'] == '': - config['ipmi_hostname'] = myhostname + '-lom.' 
+ mydomainname - - return config - - -# Get the config object from readConfig() -config = readConfig(pvcnoded_config_file, myhostname) -debug = config['debug'] -if debug: - print('DEBUG MODE ENABLED') - -# Handle the enable values -enable_hypervisor = config['enable_hypervisor'] -enable_networking = config['enable_networking'] -enable_sriov = config['enable_sriov'] -enable_storage = config['enable_storage'] - -############################################################################### -# PHASE 1b - Prepare filesystem directories -############################################################################### - -# Define our dynamic directory schema -# / -# dnsmasq/ -# pdns/ -# nft/ -config['dnsmasq_dynamic_directory'] = config['dynamic_directory'] + '/dnsmasq' -config['pdns_dynamic_directory'] = config['dynamic_directory'] + '/pdns' -config['nft_dynamic_directory'] = config['dynamic_directory'] + '/nft' - -# Create our dynamic directories if they don't exist -if not os.path.exists(config['dynamic_directory']): - os.makedirs(config['dynamic_directory']) - os.makedirs(config['dnsmasq_dynamic_directory']) - os.makedirs(config['pdns_dynamic_directory']) - os.makedirs(config['nft_dynamic_directory']) - -# Define our log directory schema -# / -# dnsmasq/ -# pdns/ -# nft/ -config['dnsmasq_log_directory'] = config['log_directory'] + '/dnsmasq' -config['pdns_log_directory'] = config['log_directory'] + '/pdns' -config['nft_log_directory'] = config['log_directory'] + '/nft' - -# Create our log directories if they don't exist -if not os.path.exists(config['log_directory']): - os.makedirs(config['log_directory']) - os.makedirs(config['dnsmasq_log_directory']) - os.makedirs(config['pdns_log_directory']) - os.makedirs(config['nft_log_directory']) - -############################################################################### -# PHASE 1c - Set up logging -############################################################################### - -logger = log.Logger(config) - -# 
Print our startup messages -logger.out('') -logger.out('|----------------------------------------------------------|') -logger.out('| |') -logger.out('| ███████████ ▜█▙ ▟█▛ █████ █ █ █ |') -logger.out('| ██ ▜█▙ ▟█▛ ██ |') -logger.out('| ███████████ ▜█▙ ▟█▛ ██ |') -logger.out('| ██ ▜█▙▟█▛ ███████████ |') -logger.out('| |') -logger.out('|----------------------------------------------------------|') -logger.out('| Parallel Virtual Cluster node daemon v{0: <18} |'.format(version)) -logger.out('| Debug: {0: <49} |'.format(str(config['debug']))) -logger.out('| FQDN: {0: <50} |'.format(myfqdn)) -logger.out('| Host: {0: <50} |'.format(myhostname)) -logger.out('| ID: {0: <52} |'.format(mynodeid)) -logger.out('| IPMI hostname: {0: <41} |'.format(config['ipmi_hostname'])) -logger.out('| Machine details: |') -logger.out('| CPUs: {0: <48} |'.format(staticdata[0])) -logger.out('| Arch: {0: <48} |'.format(staticdata[3])) -logger.out('| OS: {0: <50} |'.format(staticdata[2])) -logger.out('| Kernel: {0: <46} |'.format(staticdata[1])) -logger.out('|----------------------------------------------------------|') -logger.out('') - -logger.out('Starting pvcnoded on host {}'.format(myfqdn), state='s') - -# Define some colours for future messages if applicable -if config['log_colours']: - fmt_end = logger.fmt_end - fmt_bold = logger.fmt_bold - fmt_blue = logger.fmt_blue - fmt_cyan = logger.fmt_cyan - fmt_green = logger.fmt_green - fmt_yellow = logger.fmt_yellow - fmt_red = logger.fmt_red - fmt_purple = logger.fmt_purple -else: - fmt_end = '' - fmt_bold = '' - fmt_blue = '' - fmt_cyan = '' - fmt_green = '' - fmt_yellow = '' - fmt_red = '' - fmt_purple = '' - -############################################################################### -# PHASE 2a - Activate SR-IOV support -############################################################################### - -# This happens before other networking steps to enable using VFs for cluster functions. 
-if enable_networking and enable_sriov: - logger.out('Setting up SR-IOV device support', state='i') - # Enable unsafe interruptts for the vfio_iommu_type1 kernel module - try: - common.run_os_command('modprobe vfio_iommu_type1 allow_unsafe_interrupts=1') - with open('/sys/module/vfio_iommu_type1/parameters/allow_unsafe_interrupts', 'w') as mfh: - mfh.write('Y') - except Exception: - logger.out('Failed to enable kernel modules; SR-IOV may fail.', state='w') - - # Loop through our SR-IOV NICs and enable the numvfs for each - for device in config['sriov_device']: - logger.out('Preparing SR-IOV PF {} with {} VFs'.format(device['phy'], device['vfcount']), state='i') - try: - with open('/sys/class/net/{}/device/sriov_numvfs'.format(device['phy']), 'r') as vfh: - current_sriov_count = vfh.read().strip() - with open('/sys/class/net/{}/device/sriov_numvfs'.format(device['phy']), 'w') as vfh: - vfh.write(str(device['vfcount'])) - except FileNotFoundError: - logger.out('Failed to open SR-IOV configuration for PF {}; device may not support SR-IOV.'.format(device), state='w') - except OSError: - logger.out('Failed to set SR-IOV VF count for PF {} to {}; already set to {}.'.format(device['phy'], device['vfcount'], current_sriov_count), state='w') - - if device.get('mtu', None) is not None: - logger.out('Setting SR-IOV PF {} to MTU {}'.format(device['phy'], device['mtu']), state='i') - common.run_os_command('ip link set {} mtu {} up'.format(device['phy'], device['mtu'])) - - -############################################################################### -# PHASE 2b - Create local IP addresses for static networks -############################################################################### - -if enable_networking: - # VNI configuration - vni_dev = config['vni_dev'] - vni_mtu = config['vni_mtu'] - vni_dev_ip = config['vni_dev_ip'] - logger.out('Setting up VNI network interface {} with MTU {}'.format(vni_dev, vni_mtu), state='i') - common.run_os_command('ip link set {} mtu {} 
up'.format(vni_dev, vni_mtu)) - - # Cluster bridge configuration - logger.out('Setting up Cluster network bridge on interface {} with IP {}'.format(vni_dev, vni_dev_ip), state='i') - common.run_os_command('brctl addbr brcluster') - common.run_os_command('brctl addif brcluster {}'.format(vni_dev)) - common.run_os_command('ip link set brcluster mtu {} up'.format(vni_mtu)) - common.run_os_command('ip address add {} dev {}'.format(vni_dev_ip, 'brcluster')) - - # Storage configuration - storage_dev = config['storage_dev'] - storage_mtu = config['storage_mtu'] - storage_dev_ip = config['storage_dev_ip'] - logger.out('Setting up Storage network interface {} with MTU {}'.format(storage_dev, vni_mtu), state='i') - common.run_os_command('ip link set {} mtu {} up'.format(storage_dev, storage_mtu)) - - # Storage bridge configuration - if storage_dev == vni_dev: - logger.out('Adding Storage network IP {} to VNI Cluster bridge brcluster'.format(storage_dev_ip), state='i') - common.run_os_command('ip address add {} dev {}'.format(storage_dev_ip, 'brcluster')) + if config['node_hostname'] in coordinator_nodes: + # We are indeed a coordinator node + config['daemon_mode'] = 'coordinator' + logger.out(f'This node is a {logger.fmt_blue}coordinator{logger.fmt_end}', state='i') else: - logger.out('Setting up Storage network bridge on interface {} with IP {}'.format(vni_dev, vni_dev_ip), state='i') - common.run_os_command('brctl addbr brstorage') - common.run_os_command('brctl addif brstorage {}'.format(storage_dev)) - common.run_os_command('ip link set brstorage mtu {} up'.format(storage_mtu)) - common.run_os_command('ip address add {} dev {}'.format(storage_dev_ip, 'brstorage')) + # We are a hypervisor node + config['daemon_mode'] = 'hypervisor' + logger.out(f'This node is a {logger.fmt_cyan}hypervisor{logger.fmt_end}', state='i') - # Upstream configuration - upstream_dev = config['upstream_dev'] - upstream_mtu = config['upstream_mtu'] - upstream_dev_ip = config['upstream_dev_ip'] - 
logger.out('Setting up Upstream network interface {} with MTU {}'.format(upstream_dev, upstream_mtu), state='i') - common.run_os_command('ip link set {} mtu {} up'.format(upstream_dev, upstream_mtu)) + pvcnoded.util.services.start_system_services(logger, config) - # Upstream bridge configuration - if upstream_dev == vni_dev: - logger.out('Adding Upstream network IP {} to VNI Cluster bridge brcluster'.format(upstream_dev_ip), state='i') - common.run_os_command('ip address add {} dev {}'.format(upstream_dev_ip, 'brcluster')) - else: - logger.out('Setting up Upstream network bridge on interface {} with IP {}'.format(vni_dev, vni_dev_ip), state='i') - common.run_os_command('brctl addbr brupstream') - common.run_os_command('brctl addif brupstream {}'.format(upstream_dev)) - common.run_os_command('ip link set brupstream mtu {} up'.format(upstream_mtu)) - common.run_os_command('ip address add {} dev {}'.format(upstream_dev_ip, 'brupstream')) + # Connect to Zookeeper and return our handler and current schema version + zkhandler, node_schema_version = pvcnoded.util.zookeeper.connect(logger, config) - # Add upstream default gateway - upstream_gateway = config.get('upstream_gateway', None) - if upstream_gateway: - logger.out('Setting up Upstream default gateway IP {}'.format(upstream_gateway), state='i') - if upstream_dev == vni_dev: - common.run_os_command('ip route add default via {} dev {}'.format(upstream_gateway, 'brcluster')) + # Watch for a global schema update and fire + # This will only change by the API when triggered after seeing all nodes can update + @zkhandler.zk_conn.DataWatch(zkhandler.schema.path('base.schema.version')) + def update_schema(new_schema_version, stat, event=''): + nonlocal zkhandler, keepalive_timer, node_schema_version + + try: + new_schema_version = int(new_schema_version.decode('ascii')) + except Exception: + new_schema_version = 0 + + if new_schema_version == node_schema_version: + return True + + logger.out('Hot update of schema version 
started', state='s') + logger.out(f'Current version: {node_schema_version} New version: {new_schema_version}', state='s') + + # Prevent any keepalive updates while this happens + if keepalive_timer is not None: + pvcnoded.util.keepalive.stop_keepalive_timer() + sleep(1) + + # Perform the migration (primary only) + if zkhandler.read('base.config.primary_node') == config['node_hostname']: + logger.out('Primary node acquiring exclusive lock', state='s') + # Wait for things to settle + sleep(0.5) + # Acquire a write lock on the root key + with zkhandler.exclusivelock('base.schema.version'): + # Perform the schema migration tasks + logger.out('Performing schema update', state='s') + if new_schema_version > node_schema_version: + zkhandler.schema.migrate(zkhandler, new_schema_version) + if new_schema_version < node_schema_version: + zkhandler.schema.rollback(zkhandler, new_schema_version) + # Wait for the exclusive lock to be lifted else: - common.run_os_command('ip route add default via {} dev {}'.format(upstream_gateway, 'brupstream')) + logger.out('Non-primary node acquiring read lock', state='s') + # Wait for things to settle + sleep(1) + # Wait for a read lock + lock = zkhandler.readlock('base.schema.version') + lock.acquire() + # Wait a bit more for the primary to return to normal + sleep(1) - logger.out('Waiting 3s for networking to come up', state='s') - time.sleep(3) - -############################################################################### -# PHASE 2c - Prepare sysctl for pvcnoded -############################################################################### - -if enable_networking: - # Enable routing functions - common.run_os_command('sysctl net.ipv4.ip_forward=1') - common.run_os_command('sysctl net.ipv6.ip_forward=1') - - # Send redirects - common.run_os_command('sysctl net.ipv4.conf.all.send_redirects=1') - common.run_os_command('sysctl net.ipv4.conf.default.send_redirects=1') - common.run_os_command('sysctl net.ipv6.conf.all.send_redirects=1') - 
common.run_os_command('sysctl net.ipv6.conf.default.send_redirects=1') - - # Accept source routes - common.run_os_command('sysctl net.ipv4.conf.all.accept_source_route=1') - common.run_os_command('sysctl net.ipv4.conf.default.accept_source_route=1') - common.run_os_command('sysctl net.ipv6.conf.all.accept_source_route=1') - common.run_os_command('sysctl net.ipv6.conf.default.accept_source_route=1') - - # Disable RP filtering on the VNI Cluster and Upstream interfaces (to allow traffic pivoting) - common.run_os_command('sysctl net.ipv4.conf.{}.rp_filter=0'.format(config['vni_dev'])) - common.run_os_command('sysctl net.ipv4.conf.{}.rp_filter=0'.format(config['upstream_dev'])) - common.run_os_command('sysctl net.ipv4.conf.brcluster.rp_filter=0') - common.run_os_command('sysctl net.ipv4.conf.brupstream.rp_filter=0') - common.run_os_command('sysctl net.ipv6.conf.{}.rp_filter=0'.format(config['vni_dev'])) - common.run_os_command('sysctl net.ipv6.conf.{}.rp_filter=0'.format(config['upstream_dev'])) - common.run_os_command('sysctl net.ipv6.conf.brcluster.rp_filter=0') - common.run_os_command('sysctl net.ipv6.conf.brupstream.rp_filter=0') - -############################################################################### -# PHASE 3a - Determine coordinator mode -############################################################################### - -# What is the list of coordinator hosts -coordinator_nodes = config['coordinators'] - -if myhostname in coordinator_nodes: - # We are indeed a coordinator host - config['daemon_mode'] = 'coordinator' - # Start the zookeeper service using systemctl - logger.out('Node is a ' + fmt_blue + 'coordinator' + fmt_end, state='i') -else: - config['daemon_mode'] = 'hypervisor' - -############################################################################### -# PHASE 3b - Start system daemons -############################################################################### -if config['daemon_mode'] == 'coordinator': - logger.out('Starting 
Zookeeper daemon', state='i') - common.run_os_command('systemctl start zookeeper.service') - -if enable_hypervisor: - logger.out('Starting Libvirt daemon', state='i') - common.run_os_command('systemctl start libvirtd.service') - -if enable_networking: - if config['daemon_mode'] == 'coordinator': - logger.out('Starting Patroni daemon', state='i') - common.run_os_command('systemctl start patroni.service') - logger.out('Starting FRRouting daemon', state='i') - common.run_os_command('systemctl start frr.service') - -if enable_storage: - if config['daemon_mode'] == 'coordinator': - logger.out('Starting Ceph monitor daemon', state='i') - common.run_os_command('systemctl start ceph-mon@{}'.format(myhostname)) - logger.out('Starting Ceph manager daemon', state='i') - common.run_os_command('systemctl start ceph-mgr@{}'.format(myhostname)) - -logger.out('Waiting 3s for daemons to start', state='s') -time.sleep(3) - -############################################################################### -# PHASE 4 - Attempt to connect to the coordinators and start zookeeper client -############################################################################### - -# Create an instance of the handler -zkhandler = ZKHandler(config, logger=logger) - -try: - logger.out('Connecting to Zookeeper cluster nodes {}'.format(config['coordinators']), state='i') - # Start connection - zkhandler.connect(persistent=True) -except Exception as e: - logger.out('ERROR: Failed to connect to Zookeeper cluster: {}'.format(e), state='e') - os._exit(1) - -logger.out('Validating Zookeeper schema', state='i') - -try: - node_schema_version = int(zkhandler.read(('node.data.active_schema', myhostname))) -except Exception: - node_schema_version = int(zkhandler.read('base.schema.version')) - if node_schema_version is None: - node_schema_version = 0 - zkhandler.write([ - (('node.data.active_schema', myhostname), node_schema_version) - ]) - -# Load in the current node schema version 
-zkhandler.schema.load(node_schema_version) - -# Record the latest intalled schema version -latest_schema_version = zkhandler.schema.find_latest() -logger.out('Latest installed schema is {}'.format(latest_schema_version), state='i') -zkhandler.write([ - (('node.data.latest_schema', myhostname), latest_schema_version) -]) - - -# Watch for a global schema update and fire -# This will only change by the API when triggered after seeing all nodes can update -@zkhandler.zk_conn.DataWatch(zkhandler.schema.path('base.schema.version')) -def update_schema(new_schema_version, stat, event=''): - global zkhandler, update_timer, node_schema_version - - try: - new_schema_version = int(new_schema_version.decode('ascii')) - except Exception: - new_schema_version = 0 - - if new_schema_version == node_schema_version: - return True - - logger.out('Hot update of schema version started', state='s') - logger.out('Current version: {} New version: {}'.format(node_schema_version, new_schema_version), state='s') - - # Prevent any keepalive updates while this happens - if update_timer is not None: - stopKeepaliveTimer() - time.sleep(1) - - # Perform the migration (primary only) - if zkhandler.read('base.config.primary_node') == myhostname: - logger.out('Primary node acquiring exclusive lock', state='s') - # Wait for things to settle - time.sleep(0.5) - # Acquire a write lock on the root key - with zkhandler.exclusivelock('base.schema.version'): - # Perform the schema migration tasks - logger.out('Performing schema update', state='s') - if new_schema_version > node_schema_version: - zkhandler.schema.migrate(zkhandler, new_schema_version) - if new_schema_version < node_schema_version: - zkhandler.schema.rollback(zkhandler, new_schema_version) - # Wait for the exclusive lock to be lifted - else: - logger.out('Non-primary node acquiring read lock', state='s') - # Wait for things to settle - time.sleep(1) - # Wait for a read lock - lock = zkhandler.readlock('base.schema.version') - lock.acquire() 
- # Wait a bit more for the primary to return to normal - time.sleep(1) - - # Update the local schema version - logger.out('Updating node target schema version', state='s') - zkhandler.write([ - (('node.data.active_schema', myhostname), new_schema_version) - ]) - node_schema_version = new_schema_version - - # Restart the API daemons if applicable - logger.out('Restarting services', state='s') - common.run_os_command('systemctl restart pvcapid-worker.service') - if zkhandler.read('base.config.primary_node') == myhostname: - common.run_os_command('systemctl restart pvcapid.service') - - # Restart ourselves with the new schema - logger.out('Reloading node daemon', state='s') - try: - zkhandler.disconnect(persistent=True) - del zkhandler - except Exception: - pass - os.execv(sys.argv[0], sys.argv) - - -# If we are the last node to get a schema update, fire the master update -if latest_schema_version > node_schema_version: - node_latest_schema_version = list() - for node in zkhandler.children('base.node'): - node_latest_schema_version.append(int(zkhandler.read(('node.data.latest_schema', node)))) - - # This is true if all elements of the latest schema version are identical to the latest version, - # i.e. they have all had the latest schema installed and ready to load. 
- if node_latest_schema_version.count(latest_schema_version) == len(node_latest_schema_version): + # Update the local schema version + logger.out('Updating node target schema version', state='s') zkhandler.write([ - ('base.schema.version', latest_schema_version) + (('node.data.active_schema', config['node_hostname']), new_schema_version) + ]) + node_schema_version = new_schema_version + + # Restart the API daemons if applicable + logger.out('Restarting services', state='s') + common.run_os_command('systemctl restart pvcapid-worker.service') + if zkhandler.read('base.config.primary_node') == config['node_hostname']: + common.run_os_command('systemctl restart pvcapid.service') + + # Restart ourselves with the new schema + logger.out('Reloading node daemon', state='s') + try: + zkhandler.disconnect(persistent=True) + del zkhandler + except Exception: + pass + os.execv(sys.argv[0], sys.argv) + + # Validate the schema + pvcnoded.util.zookeeper.validate_schema(logger, zkhandler) + + # Define a cleanup function + def cleanup(failure=False): + nonlocal logger, zkhandler, keepalive_timer, d_domain + + logger.out('Terminating pvcnoded and cleaning up', state='s') + + # Set shutdown state in Zookeeper + zkhandler.write([ + (('node.state.daemon', config['node_hostname']), 'shutdown') ]) -# Validate our schema against the active version -if not zkhandler.schema.validate(zkhandler, logger): - logger.out('Found schema violations, applying', state='i') - zkhandler.schema.apply(zkhandler) -else: - logger.out('Schema successfully validated', state='o') + # Waiting for any flushes to complete + logger.out('Waiting for any active flushes', state='s') + if this_node is not None: + while this_node.flush_thread is not None: + sleep(0.5) + # Stop console logging on all VMs + logger.out('Stopping domain console watchers', state='s') + if d_domain is not None: + for domain in d_domain: + if d_domain[domain].getnode() == config['node_hostname']: + try: + 
d_domain[domain].console_log_instance.stop() + except Exception: + pass -############################################################################### -# PHASE 5 - Gracefully handle termination -############################################################################### + # Force into secondary coordinator state if needed + try: + if this_node.router_state == 'primary': + zkhandler.write([ + ('base.config.primary_node', 'none') + ]) + logger.out('Waiting for primary migration', state='s') + while this_node.router_state != 'secondary': + sleep(0.5) + except Exception: + pass + # Stop keepalive thread + try: + pvcnoded.util.keepalive.stop_keepalive_timer(logger, keepalive_timer) -# Cleanup function -def cleanup(failure=False): - global logger, zkhandler, update_timer, d_domain + logger.out('Performing final keepalive update', state='s') + pvcnoded.util.keepalive.node_keepalive(logger, config, zkhandler, this_node) + except Exception: + pass - logger.out('Terminating pvcnoded and cleaning up', state='s') + # Set stop state in Zookeeper + zkhandler.write([ + (('node.state.daemon', config['node_hostname']), 'stop') + ]) - # Set shutdown state in Zookeeper - zkhandler.write([ - (('node.state.daemon', myhostname), 'shutdown') - ]) + # Forcibly terminate dnsmasq because it gets stuck sometimes + common.run_os_command('killall dnsmasq') - # Waiting for any flushes to complete - logger.out('Waiting for any active flushes', state='s') - if this_node is not None: - while this_node.flush_thread is not None: - time.sleep(0.5) + # Close the Zookeeper connection + try: + zkhandler.disconnect(persistent=True) + del zkhandler + except Exception: + pass - # Stop console logging on all VMs - logger.out('Stopping domain console watchers', state='s') - if d_domain is not None: - for domain in d_domain: - if d_domain[domain].getnode() == myhostname: - try: - d_domain[domain].console_log_instance.stop() - except Exception: - pass + logger.out('Terminated pvc daemon', state='s') + 
logger.terminate() - # Force into secondary coordinator state if needed + if failure: + retcode = 1 + else: + retcode = 0 + + os._exit(retcode) + + # Termination function + def term(signum='', frame=''): + cleanup(failure=False) + + # Hangup (logrotate) function + def hup(signum='', frame=''): + if config['file_logging']: + logger.hup() + + # Handle signals gracefully + signal.signal(signal.SIGTERM, term) + signal.signal(signal.SIGINT, term) + signal.signal(signal.SIGQUIT, term) + signal.signal(signal.SIGHUP, hup) + + # Set up this node in Zookeeper + pvcnoded.util.zookeeper.setup_node(logger, config, zkhandler) + + # Check that the primary node key exists and create it with us as primary if not try: - if this_node.router_state == 'primary': + current_primary = zkhandler.read('base.config.primary_node') + except Exception: + current_primary = 'none' + + if current_primary and current_primary != 'none': + logger.out(f'Current primary node is {logger.fmt_blue}{current_primary}{logger.fmt_end}', state='i') + else: + if config['daemon_mode'] == 'coordinator': + logger.out('No primary node found; setting us as primary', state='i') zkhandler.write([ - ('base.config.primary_node', 'none') + ('base.config.primary_node', config['node_hostname']) ]) - logger.out('Waiting for primary migration', state='s') - while this_node.router_state != 'secondary': - time.sleep(0.5) - except Exception: - pass - # Stop keepalive thread - try: - stopKeepaliveTimer() + # Ensure that IPMI is reachable and working + if not pvcnoded.util.fencing.verify_ipmi(config['ipmi_hostname'], config['ipmi_username'], config['ipmi_password']): + logger.out('Our IPMI is not reachable; fencing of this node will likely fail', state='w') - logger.out('Performing final keepalive update', state='s') - node_keepalive() - except Exception: - pass - - # Set stop state in Zookeeper - zkhandler.write([ - (('node.state.daemon', myhostname), 'stop') - ]) - - # Forcibly terminate dnsmasq because it gets stuck sometimes 
- common.run_os_command('killall dnsmasq') - - # Close the Zookeeper connection - try: - zkhandler.disconnect(persistent=True) - del zkhandler - except Exception: - pass - - logger.out('Terminated pvc daemon', state='s') - logger.terminate() - - if failure: - retcode = 1 - else: - retcode = 0 - - os._exit(retcode) - - -# Termination function -def term(signum='', frame=''): - cleanup(failure=False) - - -# Hangup (logrotate) function -def hup(signum='', frame=''): - if config['file_logging']: - logger.hup() - - -# Handle signals gracefully -signal.signal(signal.SIGTERM, term) -signal.signal(signal.SIGINT, term) -signal.signal(signal.SIGQUIT, term) -signal.signal(signal.SIGHUP, hup) - -############################################################################### -# PHASE 6 - Prepare host in Zookeeper -############################################################################### - -# Check if our node exists in Zookeeper, and create it if not -if config['daemon_mode'] == 'coordinator': - init_routerstate = 'secondary' -else: - init_routerstate = 'client' - -if zkhandler.exists(('node', myhostname)): - logger.out("Node is " + fmt_green + "present" + fmt_end + " in Zookeeper", state='i') - # Update static data just in case it's changed - zkhandler.write([ - (('node', myhostname), config['daemon_mode']), - (('node.mode', myhostname), config['daemon_mode']), - (('node.state.daemon', myhostname), 'init'), - (('node.state.router', myhostname), init_routerstate), - (('node.data.static', myhostname), ' '.join(staticdata)), - (('node.data.pvc_version', myhostname), version), - (('node.ipmi.hostname', myhostname), config['ipmi_hostname']), - (('node.ipmi.username', myhostname), config['ipmi_username']), - (('node.ipmi.password', myhostname), config['ipmi_password']), - ]) -else: - logger.out("Node is " + fmt_red + "absent" + fmt_end + " in Zookeeper; adding new node", state='i') - keepalive_time = int(time.time()) - zkhandler.write([ - (('node', myhostname), 
config['daemon_mode']), - (('node.keepalive', myhostname), str(keepalive_time)), - (('node.mode', myhostname), config['daemon_mode']), - (('node.state.daemon', myhostname), 'init'), - (('node.state.domain', myhostname), 'flushed'), - (('node.state.router', myhostname), init_routerstate), - (('node.data.static', myhostname), ' '.join(staticdata)), - (('node.data.pvc_version', myhostname), version), - (('node.ipmi.hostname', myhostname), config['ipmi_hostname']), - (('node.ipmi.username', myhostname), config['ipmi_username']), - (('node.ipmi.password', myhostname), config['ipmi_password']), - (('node.memory.total', myhostname), '0'), - (('node.memory.used', myhostname), '0'), - (('node.memory.free', myhostname), '0'), - (('node.memory.allocated', myhostname), '0'), - (('node.memory.provisioned', myhostname), '0'), - (('node.vcpu.allocated', myhostname), '0'), - (('node.cpu.load', myhostname), '0.0'), - (('node.running_domains', myhostname), '0'), - (('node.count.provisioned_domains', myhostname), '0'), - (('node.count.networks', myhostname), '0'), - ]) - -# Check that the primary key exists, and create it with us as master if not -try: - current_primary = zkhandler.read('base.config.primary_node') -except kazoo.exceptions.NoNodeError: - current_primary = 'none' - -if current_primary and current_primary != 'none': - logger.out('Current primary node is {}{}{}.'.format(fmt_blue, current_primary, fmt_end), state='i') -else: - if config['daemon_mode'] == 'coordinator': - logger.out('No primary node found; setting us as primary.', state='i') - zkhandler.write([ - ('base.config.primary_node', myhostname) - ]) - -############################################################################### -# PHASE 7a - Ensure IPMI is reachable and working -############################################################################### -if not fencing.verifyIPMI(config['ipmi_hostname'], config['ipmi_username'], config['ipmi_password']): - logger.out('Our IPMI is not reachable; fencing of 
this node will likely fail', state='w') - -############################################################################### -# PHASE 7b - Ensure Libvirt is working -############################################################################### - -if enable_hypervisor: - # Check that libvirtd is listening TCP - libvirt_check_name = "qemu+tcp://{}:16509/system".format(myhostname) - logger.out('Connecting to Libvirt daemon at {}'.format(libvirt_check_name), state='i') - try: - lv_conn = libvirt.open(libvirt_check_name) - lv_conn.close() - except Exception as e: - logger.out('ERROR: Failed to connect to Libvirt daemon: {}'.format(e), state='e') + # Validate libvirt + if not pvcnoded.util.libvirt.validate_libvirtd(logger, config): cleanup(failure=True) -############################################################################### -# PHASE 7c - Ensure NFT is running on the local host -############################################################################### + # Set up NFT + pvcnoded.util.networking.create_nft_configuration(logger, config) -if enable_networking: - logger.out("Creating NFT firewall configuration", state='i') + # Create our object dictionaries + logger.out('Setting up objects', state='i') - # Create our config dirs - common.run_os_command( - '/bin/mkdir --parents {}/networks'.format( - config['nft_dynamic_directory'] - ) - ) - common.run_os_command( - '/bin/mkdir --parents {}/static'.format( - config['nft_dynamic_directory'] - ) - ) - common.run_os_command( - '/bin/mkdir --parents {}'.format( - config['nft_dynamic_directory'] - ) - ) + d_node = dict() + node_list = list() + d_network = dict() + network_list = list() + sriov_pf_list = list() + d_sriov_vf = dict() + sriov_vf_list = list() + d_domain = dict() + domain_list = list() + d_osd = dict() + osd_list = list() + d_pool = dict() + pool_list = list() + d_volume = dict() + volume_list = dict() - # Set up the basic features of the nftables firewall - nftables_base_rules = """# Base rules - flush 
ruleset - # Add the filter table and chains - add table inet filter - add chain inet filter forward {{type filter hook forward priority 0; }} - add chain inet filter input {{type filter hook input priority 0; }} - # Include static rules and network rules - include "{rulesdir}/static/*" - include "{rulesdir}/networks/*" - """.format( - rulesdir=config['nft_dynamic_directory'] - ) - - # Write the basic firewall config - nftables_base_filename = '{}/base.nft'.format(config['nft_dynamic_directory']) - with open(nftables_base_filename, 'w') as nfbasefile: - nfbasefile.write(nftables_base_rules) - common.reload_firewall_rules(nftables_base_filename, logger=logger) - -############################################################################### -# PHASE 7d - Ensure DNSMASQ is not running -############################################################################### - -common.run_os_command('systemctl stop dnsmasq.service') - -############################################################################### -# PHASE 8 - Set up our objects -############################################################################### - -logger.out('Setting up objects', state='i') - -d_node = dict() -d_network = dict() -d_sriov_vf = dict() -d_domain = dict() -d_osd = dict() -d_pool = dict() -d_volume = dict() # Dict of Dicts -node_list = [] -network_list = [] -sriov_pf_list = [] -sriov_vf_list = [] -domain_list = [] -osd_list = [] -pool_list = [] -volume_list = dict() # Dict of Lists - -if enable_networking: - # Create an instance of the DNS Aggregator and Metadata API if we're a coordinator - if config['daemon_mode'] == 'coordinator': + if config['enable_networking'] and config['daemon_mode'] == 'coordinator': + # Create an instance of the DNS Aggregator and Metadata API if we're a coordinator dns_aggregator = DNSAggregatorInstance.DNSAggregatorInstance(config, logger) metadata_api = MetadataAPIInstance.MetadataAPIInstance(zkhandler, config, logger) else: dns_aggregator = None 
metadata_api = None -else: - dns_aggregator = None - metadata_api = None + # + # Zookeeper watchers for objects + # -# Node objects -@zkhandler.zk_conn.ChildrenWatch(zkhandler.schema.path('base.node')) -def update_nodes(new_node_list): - global node_list, d_node + # Node objects + @zkhandler.zk_conn.ChildrenWatch(zkhandler.schema.path('base.node')) + def set_nodes(new_node_list): + nonlocal d_node, node_list - # Add any missing nodes to the list - for node in new_node_list: - if node not in node_list: - d_node[node] = NodeInstance.NodeInstance(node, myhostname, zkhandler, config, logger, d_node, d_network, d_domain, dns_aggregator, metadata_api) + # Add missing nodes to list + for node in [node for node in new_node_list if node not in node_list]: + d_node[node] = NodeInstance.NodeInstance(node, config['node_hostname'], zkhandler, config, logger, d_node, d_network, d_domain, dns_aggregator, metadata_api) - # Remove any deleted nodes from the list - for node in node_list: - if node not in new_node_list: - # Delete the object + # Remove deleted nodes from list + for node in [node for node in node_list if node not in new_node_list]: del(d_node[node]) - # Update and print new list - node_list = new_node_list - logger.out('{}Node list:{} {}'.format(fmt_blue, fmt_end, ' '.join(node_list)), state='i') - - # Update node objects' list - for node in d_node: - d_node[node].update_node_list(d_node) - - -# Alias for our local node (passed to network and domain objects) -this_node = d_node[myhostname] - - -# Maintenance mode -@zkhandler.zk_conn.DataWatch(zkhandler.schema.path('base.config.maintenance')) -def set_maintenance(_maintenance, stat, event=''): - global maintenance - try: - maintenance = bool(strtobool(_maintenance.decode('ascii'))) - except Exception: - maintenance = False - - -# Primary node -@zkhandler.zk_conn.DataWatch(zkhandler.schema.path('base.config.primary_node')) -def update_primary(new_primary, stat, event=''): - try: - new_primary = 
new_primary.decode('ascii') - except AttributeError: - new_primary = 'none' - key_version = stat.version - - if new_primary != this_node.primary_node: - if config['daemon_mode'] == 'coordinator': - # We're a coordinator and there is no primary - if new_primary == 'none': - if this_node.daemon_state == 'run' and this_node.router_state not in ['primary', 'takeover', 'relinquish']: - logger.out('Contending for primary coordinator state', state='i') - # Acquire an exclusive lock on the primary_node key - primary_lock = zkhandler.exclusivelock('base.config.primary_node') - try: - # This lock times out after 0.4s, which is 0.1s less than the pre-takeover - # timeout below, thus ensuring that a primary takeover will not deadlock - # against a node that failed the contention - primary_lock.acquire(timeout=0.4) - # Ensure when we get the lock that the versions are still consistent and that - # another node hasn't already acquired primary state - if key_version == zkhandler.zk_conn.get(zkhandler.schema.path('base.config.primary_node'))[1].version: - zkhandler.write([ - ('base.config.primary_node', myhostname) - ]) - # Cleanly release the lock - primary_lock.release() - # We timed out acquiring a lock, which means we failed contention, so just pass - except Exception: - pass - elif new_primary == myhostname: - if this_node.router_state == 'secondary': - time.sleep(0.5) - zkhandler.write([ - (('node.state.router', myhostname), 'takeover') - ]) - else: - if this_node.router_state == 'primary': - time.sleep(0.5) - zkhandler.write([ - (('node.state.router', myhostname), 'relinquish') - ]) - else: - zkhandler.write([ - (('node.state.router', myhostname), 'client') - ]) + node_list = new_node_list + logger.out(f'{logger.fmt_blue}Node list:{logger.fmt_end} {" ".join(node_list)}', state='i') + # Update node objects lists for node in d_node: - d_node[node].primary_node = new_primary + d_node[node].update_node_list(d_node) + # Create helpful alias for this node + this_node = 
d_node[config['node_hostname']] -if enable_networking: - # Network objects - @zkhandler.zk_conn.ChildrenWatch(zkhandler.schema.path('base.network')) - def update_networks(new_network_list): - global network_list, d_network + # Maintenance status + @zkhandler.zk_conn.DataWatch(zkhandler.schema.path('base.config.maintenance')) + def update_maintenance(_maintenance, stat): + try: + maintenance = bool(strtobool(_maintenance.decode('ascii'))) + except Exception: + maintenance = False - # Add any missing networks to the list - for network in new_network_list: - if network not in network_list: + this_node.maintenance = maintenance + + # Primary node + @zkhandler.zk_conn.DataWatch(zkhandler.schema.path('base.config.primary_node')) + def update_primary_node(new_primary, stat, event=''): + try: + new_primary = new_primary.decode('ascii') + except AttributeError: + new_primary = 'none' + key_version = stat.version + + # TODO: Move this to the Node structure + if new_primary != this_node.primary_node: + if config['daemon_mode'] == 'coordinator': + # We're a coordinator and there's no primary + if new_primary == 'none': + if this_node.daemon_state == 'run' and this_node.router_state not in ['primary', 'takeover', 'relinquish']: + logger.out('Contending for primary coordinator state', state='i') + # Acquire an exclusive lock on the primary_node key + primary_lock = zkhandler.exclusivelock('base.config.primary_node') + try: + # This lock times out after 0.4s, which is 0.1s less than the pre-takeover + # timeout beow. 
This ensures a primary takeover will not deadlock against + # a node which has failed the contention + primary_lock.acquire(timeout=0.4) + # Ensure that when we get the lock the versions are still consistent and + # that another node hasn't already acquired the primary state (maybe we're + # extremely slow to respond) + if key_version == zkhandler.zk_conn.get(zkhandler.schema.path('base.config.primary_node'))[1].version: + # Set the primary to us + logger.out('Acquiring primary coordinator state', state='o') + zkhandler.write([ + ('base.config.primary_node', config['node_hostname']) + ]) + # Cleanly release the lock + primary_lock.release() + # We timed out acquiring a lock, or failed to write, which means we failed the + # contention and should just log that + except Exception: + logger.out('Timed out contending for primary coordinator state', state='i') + elif new_primary == config['node_hostname']: + if this_node.router_state == 'secondary': + # Wait for 0.5s to ensure other contentions time out, then take over + sleep(0.5) + zkhandler.write([ + (('node.state.router', config['node_hostname']), 'takeover') + ]) + else: + if this_node.router_state == 'primary': + # Wait for 0.5s to ensure other contentions time out, then relinquish + sleep(0.5) + zkhandler.write([ + (('node.state.router', config['node_hostname']), 'relinquish') + ]) + else: + zkhandler.write([ + (('node.state.router', config['node_hostname']), 'client') + ]) + + # TODO: Turn this into a function like the others for clarity + for node in d_node: + d_node[node].primary_node = new_primary + + if config['enable_networking']: + # Network objects + @zkhandler.zk_conn.ChildrenWatch(zkhandler.schema.path('base.network')) + def update_networks(new_network_list): + nonlocal network_list, d_network + + # Add any missing networks to the list + for network in [network for network in new_network_list if network not in network_list]: d_network[network] = VXNetworkInstance.VXNetworkInstance(network, zkhandler, 
config, logger, this_node, dns_aggregator) + # TODO: Move this to the Network structure if config['daemon_mode'] == 'coordinator' and d_network[network].nettype == 'managed': try: dns_aggregator.add_network(d_network[network]) except Exception as e: - logger.out('Failed to create DNS Aggregator for network {}: {}'.format(network, e), 'w') + logger.out(f'Failed to create DNS Aggregator for network {network}: {e}', state='w') # Start primary functionality if this_node.router_state == 'primary' and d_network[network].nettype == 'managed': d_network[network].createGateways() d_network[network].startDHCPServer() - # Remove any deleted networks from the list - for network in network_list: - if network not in new_network_list: + # Remove any missing networks from the list + for network in [network for network in network_list if network not in new_network_list]: + # TODO: Move this to the Network structure if d_network[network].nettype == 'managed': # Stop primary functionality if this_node.router_state == 'primary': d_network[network].stopDHCPServer() d_network[network].removeGateways() dns_aggregator.remove_network(d_network[network]) - # Stop general functionality + # Stop firewalling d_network[network].removeFirewall() + # Delete the network d_network[network].removeNetwork() - # Delete the object del(d_network[network]) - # Update and print new list - network_list = new_network_list - logger.out('{}Network list:{} {}'.format(fmt_blue, fmt_end, ' '.join(network_list)), state='i') + # Update the new list + network_list = new_network_list + logger.out(f'{logger.fmt_blue}Network list:{logger.fmt_end} {" ".join(network_list)}', state='i') - # Update node objects' list - for node in d_node: - d_node[node].update_network_list(d_network) + # Update node objects list + for node in d_node: + d_node[node].update_network_list(d_network) - # Add the SR-IOV PFs and VFs to Zookeeper - # These do not behave like the objects; they are not dynamic (the API cannot change them), and they 
- # exist for the lifetime of this Node instance. The objects are set here in Zookeeper on a per-node - # basis, under the Node configuration tree. - # MIGRATION: The schema.schema.get ensures that the current active Schema contains the required keys - if enable_sriov and zkhandler.schema.schema.get('sriov_pf', None) is not None: - vf_list = list() - for device in config['sriov_device']: - pf = device['phy'] - vfcount = device['vfcount'] - if device.get('mtu', None) is None: - mtu = 1500 - else: - mtu = device['mtu'] + # Add the SR-IOV PFs and VFs to Zookeeper + # These do not behave like the objects; they are not dynamic (the API cannot change them), and they + # exist for the lifetime of this Node instance. The objects are set here in Zookeeper on a per-node + # basis, under the Node configuration tree. + # MIGRATION: The schema.schema.get ensures that the current active Schema contains the required keys + if config['enable_sriov'] and zkhandler.schema.schema.get('sriov_pf', None) is not None: + vf_list = list() + for device in config['sriov_device']: + pf = device['phy'] + vfcount = device['vfcount'] + if device.get('mtu', None) is None: + mtu = 1500 + else: + mtu = device['mtu'] - # Create the PF device in Zookeeper - zkhandler.write([ - (('node.sriov.pf', myhostname, 'sriov_pf', pf), ''), - (('node.sriov.pf', myhostname, 'sriov_pf.mtu', pf), mtu), - (('node.sriov.pf', myhostname, 'sriov_pf.vfcount', pf), vfcount), - ]) - # Append the device to the list of PFs - sriov_pf_list.append(pf) + # Create the PF device in Zookeeper + zkhandler.write([ + (('node.sriov.pf', config['node_hostname'], 'sriov_pf', pf), ''), + (('node.sriov.pf', config['node_hostname'], 'sriov_pf.mtu', pf), mtu), + (('node.sriov.pf', config['node_hostname'], 'sriov_pf.vfcount', pf), vfcount), + ]) + # Append the device to the list of PFs + sriov_pf_list.append(pf) - # Get the list of VFs from `ip link show` - vf_list = json.loads(common.run_os_command('ip --json link show 
{}'.format(pf))[1])[0].get('vfinfo_list', []) - for vf in vf_list: - # { - # 'vf': 3, - # 'link_type': 'ether', - # 'address': '00:00:00:00:00:00', - # 'broadcast': 'ff:ff:ff:ff:ff:ff', - # 'vlan_list': [{'vlan': 101, 'qos': 2}], - # 'rate': {'max_tx': 0, 'min_tx': 0}, - # 'spoofchk': True, - # 'link_state': 'auto', - # 'trust': False, - # 'query_rss_en': False - # } - vfphy = '{}v{}'.format(pf, vf['vf']) + # Get the list of VFs from `ip link show` + vf_list = json.loads(common.run_os_command(f'ip --json link show {pf}')[1])[0].get('vfinfo_list', []) + for vf in vf_list: + # { + # 'vf': 3, + # 'link_type': 'ether', + # 'address': '00:00:00:00:00:00', + # 'broadcast': 'ff:ff:ff:ff:ff:ff', + # 'vlan_list': [{'vlan': 101, 'qos': 2}], + # 'rate': {'max_tx': 0, 'min_tx': 0}, + # 'spoofchk': True, + # 'link_state': 'auto', + # 'trust': False, + # 'query_rss_en': False + # } + vfphy = f'{pf}v{vf["vf"]}' - # Get the PCIe bus information - dev_pcie_path = None - try: - with open('/sys/class/net/{}/device/uevent'.format(vfphy)) as vfh: - dev_uevent = vfh.readlines() - for line in dev_uevent: - if re.match(r'^PCI_SLOT_NAME=.*', line): - dev_pcie_path = line.rstrip().split('=')[-1] - except FileNotFoundError: - # Something must already be using the PCIe device - pass + # Get the PCIe bus information + dev_pcie_path = None + try: + with open(f'/sys/class/net/{vfphy}/device/uevent') as vfh: + dev_uevent = vfh.readlines() + for line in dev_uevent: + if re.match(r'^PCI_SLOT_NAME=.*', line): + dev_pcie_path = line.rstrip().split('=')[-1] + except FileNotFoundError: + # Something must already be using the PCIe device + pass - # Add the VF to Zookeeper if it does not yet exist - if not zkhandler.exists(('node.sriov.vf', myhostname, 'sriov_vf', vfphy)): - if dev_pcie_path is not None: - pcie_domain, pcie_bus, pcie_slot, pcie_function = re.split(r':|\.', dev_pcie_path) - else: - # We can't add the device - for some reason we can't get any information on its PCIe bus path, - # so just 
ignore this one, and continue. - # This shouldn't happen under any real circumstances, unless the admin tries to attach a non-existent - # VF to a VM manually, then goes ahead and adds that VF to the system with the VM running. - continue + # Add the VF to Zookeeper if it does not yet exist + if not zkhandler.exists(('node.sriov.vf', config['node_hostname'], 'sriov_vf', vfphy)): + if dev_pcie_path is not None: + pcie_domain, pcie_bus, pcie_slot, pcie_function = re.split(r':|\.', dev_pcie_path) + else: + # We can't add the device - for some reason we can't get any information on its PCIe bus path, + # so just ignore this one, and continue. + # This shouldn't happen under any real circumstances, unless the admin tries to attach a non-existent + # VF to a VM manually, then goes ahead and adds that VF to the system with the VM running. + continue - zkhandler.write([ - (('node.sriov.vf', myhostname, 'sriov_vf', vfphy), ''), - (('node.sriov.vf', myhostname, 'sriov_vf.pf', vfphy), pf), - (('node.sriov.vf', myhostname, 'sriov_vf.mtu', vfphy), mtu), - (('node.sriov.vf', myhostname, 'sriov_vf.mac', vfphy), vf['address']), - (('node.sriov.vf', myhostname, 'sriov_vf.phy_mac', vfphy), vf['address']), - (('node.sriov.vf', myhostname, 'sriov_vf.config', vfphy), ''), - (('node.sriov.vf', myhostname, 'sriov_vf.config.vlan_id', vfphy), vf['vlan_list'][0].get('vlan', '0')), - (('node.sriov.vf', myhostname, 'sriov_vf.config.vlan_qos', vfphy), vf['vlan_list'][0].get('qos', '0')), - (('node.sriov.vf', myhostname, 'sriov_vf.config.tx_rate_min', vfphy), vf['rate']['min_tx']), - (('node.sriov.vf', myhostname, 'sriov_vf.config.tx_rate_max', vfphy), vf['rate']['max_tx']), - (('node.sriov.vf', myhostname, 'sriov_vf.config.spoof_check', vfphy), vf['spoofchk']), - (('node.sriov.vf', myhostname, 'sriov_vf.config.link_state', vfphy), vf['link_state']), - (('node.sriov.vf', myhostname, 'sriov_vf.config.trust', vfphy), vf['trust']), - (('node.sriov.vf', myhostname, 'sriov_vf.config.query_rss', 
vfphy), vf['query_rss_en']), - (('node.sriov.vf', myhostname, 'sriov_vf.pci', vfphy), ''), - (('node.sriov.vf', myhostname, 'sriov_vf.pci.domain', vfphy), pcie_domain), - (('node.sriov.vf', myhostname, 'sriov_vf.pci.bus', vfphy), pcie_bus), - (('node.sriov.vf', myhostname, 'sriov_vf.pci.slot', vfphy), pcie_slot), - (('node.sriov.vf', myhostname, 'sriov_vf.pci.function', vfphy), pcie_function), - (('node.sriov.vf', myhostname, 'sriov_vf.used', vfphy), False), - (('node.sriov.vf', myhostname, 'sriov_vf.used_by', vfphy), ''), + zkhandler.write([ + (('node.sriov.vf', config['node_hostname'], 'sriov_vf', vfphy), ''), + (('node.sriov.vf', config['node_hostname'], 'sriov_vf.pf', vfphy), pf), + (('node.sriov.vf', config['node_hostname'], 'sriov_vf.mtu', vfphy), mtu), + (('node.sriov.vf', config['node_hostname'], 'sriov_vf.mac', vfphy), vf['address']), + (('node.sriov.vf', config['node_hostname'], 'sriov_vf.phy_mac', vfphy), vf['address']), + (('node.sriov.vf', config['node_hostname'], 'sriov_vf.config', vfphy), ''), + (('node.sriov.vf', config['node_hostname'], 'sriov_vf.config.vlan_id', vfphy), vf['vlan_list'][0].get('vlan', '0')), + (('node.sriov.vf', config['node_hostname'], 'sriov_vf.config.vlan_qos', vfphy), vf['vlan_list'][0].get('qos', '0')), + (('node.sriov.vf', config['node_hostname'], 'sriov_vf.config.tx_rate_min', vfphy), vf['rate']['min_tx']), + (('node.sriov.vf', config['node_hostname'], 'sriov_vf.config.tx_rate_max', vfphy), vf['rate']['max_tx']), + (('node.sriov.vf', config['node_hostname'], 'sriov_vf.config.spoof_check', vfphy), vf['spoofchk']), + (('node.sriov.vf', config['node_hostname'], 'sriov_vf.config.link_state', vfphy), vf['link_state']), + (('node.sriov.vf', config['node_hostname'], 'sriov_vf.config.trust', vfphy), vf['trust']), + (('node.sriov.vf', config['node_hostname'], 'sriov_vf.config.query_rss', vfphy), vf['query_rss_en']), + (('node.sriov.vf', config['node_hostname'], 'sriov_vf.pci', vfphy), ''), + (('node.sriov.vf', 
config['node_hostname'], 'sriov_vf.pci.domain', vfphy), pcie_domain), + (('node.sriov.vf', config['node_hostname'], 'sriov_vf.pci.bus', vfphy), pcie_bus), + (('node.sriov.vf', config['node_hostname'], 'sriov_vf.pci.slot', vfphy), pcie_slot), + (('node.sriov.vf', config['node_hostname'], 'sriov_vf.pci.function', vfphy), pcie_function), + (('node.sriov.vf', config['node_hostname'], 'sriov_vf.used', vfphy), False), + (('node.sriov.vf', config['node_hostname'], 'sriov_vf.used_by', vfphy), ''), + ]) + + # Append the device to the list of VFs + sriov_vf_list.append(vfphy) + + # Remove any obsolete PFs from Zookeeper if they go away + for pf in zkhandler.children(('node.sriov.pf', config['node_hostname'])): + if pf not in sriov_pf_list: + zkhandler.delete([ + ('node.sriov.pf', config['node_hostname'], 'sriov_pf', pf) + ]) + # Remove any obsolete VFs from Zookeeper if their PF goes away + for vf in zkhandler.children(('node.sriov.vf', config['node_hostname'])): + vf_pf = zkhandler.read(('node.sriov.vf', config['node_hostname'], 'sriov_vf.pf', vf)) + if vf_pf not in sriov_pf_list: + zkhandler.delete([ + ('node.sriov.vf', config['node_hostname'], 'sriov_vf', vf) ]) - # Append the device to the list of VFs - sriov_vf_list.append(vfphy) + # SR-IOV VF objects + # This is a ChildrenWatch just for consistency; the list never changes at runtime + @zkhandler.zk_conn.ChildrenWatch(zkhandler.schema.path('node.sriov.vf', config['node_hostname'])) + def update_sriov_vfs(new_sriov_vf_list): + nonlocal sriov_vf_list, d_sriov_vf - # Remove any obsolete PFs from Zookeeper if they go away - for pf in zkhandler.children(('node.sriov.pf', myhostname)): - if pf not in sriov_pf_list: - zkhandler.delete([ - ('node.sriov.pf', myhostname, 'sriov_pf', pf) - ]) - # Remove any obsolete VFs from Zookeeper if their PF goes away - for vf in zkhandler.children(('node.sriov.vf', myhostname)): - vf_pf = zkhandler.read(('node.sriov.vf', myhostname, 'sriov_vf.pf', vf)) - if vf_pf not in sriov_pf_list: - 
zkhandler.delete([ - ('node.sriov.vf', myhostname, 'sriov_vf', vf) - ]) + # Add VFs to the list + for vf in common.sortInterfaceNames(new_sriov_vf_list): + d_sriov_vf[vf] = SRIOVVFInstance.SRIOVVFInstance(vf, zkhandler, config, logger, this_node) - # SR-IOV VF objects - # This is a ChildrenWatch just for consistency; the list never changes at runtime - @zkhandler.zk_conn.ChildrenWatch(zkhandler.schema.path('node.sriov.vf', myhostname)) - def update_sriov_vfs(new_sriov_vf_list): - global sriov_vf_list, d_sriov_vf + sriov_vf_list = sorted(new_sriov_vf_list) + logger.out(f'{logger.fmt_blue}SR-IOV VF list:{logger.fmt_end} {" ".join(sriov_vf_list)}', state='i') - # Add VFs to the list - for vf in common.sortInterfaceNames(new_sriov_vf_list): - d_sriov_vf[vf] = SRIOVVFInstance.SRIOVVFInstance(vf, zkhandler, config, logger, this_node) + if config['enable_hypervisor']: + # VM command pipeline key + @zkhandler.zk_conn.DataWatch(zkhandler.schema.path('base.cmd.domain')) + def run_domain_command(data, stat, event=''): + if data: + VMInstance.vm_command(zkhandler, logger, this_node, data.decode('ascii')) - sriov_vf_list = sorted(new_sriov_vf_list) - logger.out('{}SR-IOV VF list:{} {}'.format(fmt_blue, fmt_end, ' '.join(sriov_vf_list)), state='i') + # VM domain objects + @zkhandler.zk_conn.ChildrenWatch(zkhandler.schema.path('base.domain')) + def update_domains(new_domain_list): + nonlocal domain_list, d_domain -if enable_hypervisor: - # VM command pipeline key - @zkhandler.zk_conn.DataWatch(zkhandler.schema.path('base.cmd.domain')) - def cmd_domains(data, stat, event=''): - if data: - VMInstance.run_command(zkhandler, logger, this_node, data.decode('ascii')) - - # VM domain objects - @zkhandler.zk_conn.ChildrenWatch(zkhandler.schema.path('base.domain')) - def update_domains(new_domain_list): - global domain_list, d_domain - - # Add any missing domains to the list - for domain in new_domain_list: - if domain not in domain_list: + # Add missing domains to the list + for domain 
in [domain for domain in new_domain_list if domain not in domain_list]: d_domain[domain] = VMInstance.VMInstance(domain, zkhandler, config, logger, this_node) - # Remove any deleted domains from the list - for domain in domain_list: - if domain not in new_domain_list: - # Delete the object + # Remove any deleted domains from the list + for domain in [domain for domain in domain_list if domain not in new_domain_list]: del(d_domain[domain]) - # Update and print new list - domain_list = new_domain_list - logger.out('{}VM list:{} {}'.format(fmt_blue, fmt_end, ' '.join(domain_list)), state='i') + # Update the new list + domain_list = new_domain_list + logger.out(f'{logger.fmt_blue}Domain list:{logger.fmt_end} {" ".join(domain_list)}', state='i') - # Update node objects' list - for node in d_node: - d_node[node].update_domain_list(d_domain) + # Update node objects' list + for node in d_node: + d_node[node].update_domain_list(d_domain) -if enable_storage: - # Ceph command pipeline key - @zkhandler.zk_conn.DataWatch(zkhandler.schema.path('base.cmd.ceph')) - def cmd_ceph(data, stat, event=''): - if data: - CephInstance.run_command(zkhandler, logger, this_node, data.decode('ascii'), d_osd) + if config['enable_storage']: + # Ceph command pipeline key + @zkhandler.zk_conn.DataWatch(zkhandler.schema.path('base.cmd.ceph')) + def run_ceph_command(data, stat, event=''): + if data: + CephInstance.ceph_command(zkhandler, logger, this_node, data.decode('ascii'), d_osd) - # OSD objects - @zkhandler.zk_conn.ChildrenWatch(zkhandler.schema.path('base.osd')) - def update_osds(new_osd_list): - global osd_list, d_osd + # OSD objects + @zkhandler.zk_conn.ChildrenWatch(zkhandler.schema.path('base.osd')) + def update_osds(new_osd_list): + nonlocal osd_list, d_osd - # Add any missing OSDs to the list - for osd in new_osd_list: - if osd not in osd_list: + # Add any missing OSDs to the list + for osd in [osd for osd in new_osd_list if osd not in osd_list]: d_osd[osd] = 
CephInstance.CephOSDInstance(zkhandler, this_node, osd) - # Remove any deleted OSDs from the list - for osd in osd_list: - if osd not in new_osd_list: - # Delete the object + # Remove any deleted OSDs from the list + for osd in [osd for osd in osd_list if osd not in new_osd_list]: del(d_osd[osd]) - # Update and print new list - osd_list = new_osd_list - logger.out('{}OSD list:{} {}'.format(fmt_blue, fmt_end, ' '.join(osd_list)), state='i') + # Update the new list + osd_list = new_osd_list + logger.out(f'{logger.fmt_blue}OSD list:{logger.fmt_end} {" ".join(osd_list)}', state='i') - # Pool objects - @zkhandler.zk_conn.ChildrenWatch(zkhandler.schema.path('base.pool')) - def update_pools(new_pool_list): - global pool_list, d_pool + # Pool objects + @zkhandler.zk_conn.ChildrenWatch(zkhandler.schema.path('base.pool')) + def update_pools(new_pool_list): + nonlocal pool_list, d_pool, volume_list, d_volume - # Add any missing Pools to the list - for pool in new_pool_list: - if pool not in pool_list: + # Add any missing pools to the list + for pool in [pool for pool in new_pool_list if pool not in pool_list]: d_pool[pool] = CephInstance.CephPoolInstance(zkhandler, this_node, pool) + # Prepare the volume components for this pool + volume_list[pool] = list() d_volume[pool] = dict() - volume_list[pool] = [] - # Remove any deleted Pools from the list - for pool in pool_list: - if pool not in new_pool_list: - # Delete the object + # Remove any deleted pools from the list + for pool in [pool for pool in pool_list if pool not in new_pool_list]: del(d_pool[pool]) - # Update and print new list - pool_list = new_pool_list - logger.out('{}Pool list:{} {}'.format(fmt_blue, fmt_end, ' '.join(pool_list)), state='i') + # Update the new list + pool_list = new_pool_list + logger.out(f'{logger.fmt_blue}Pool list:{logger.fmt_end} {" ".join(pool_list)}', state='i') - # Volume objects in each pool - for pool in pool_list: - @zkhandler.zk_conn.ChildrenWatch(zkhandler.schema.path('volume', pool)) 
- def update_volumes(new_volume_list): - global volume_list, d_volume + # Volume objects (in each pool) + for pool in pool_list: + @zkhandler.zk_conn.ChildrenWatch(zkhandler.schema.path('volume', pool)) + def update_volumes(new_volume_list): + nonlocal volume_list, d_volume - # Add any missing Volumes to the list - for volume in new_volume_list: - if volume not in volume_list[pool]: + # Add any missing volumes to the list + for volume in [volume for volume in new_volume_list if volume not in volume_list[pool]]: d_volume[pool][volume] = CephInstance.CephVolumeInstance(zkhandler, this_node, pool, volume) - # Remove any deleted Volumes from the list - for volume in volume_list[pool]: - if volume not in new_volume_list: - # Delete the object + # Remove any deleted volumes from the list + for volume in [volume for volume in volume_list[pool] if volume not in new_volume_list]: del(d_volume[pool][volume]) - # Update and print new list - volume_list[pool] = new_volume_list - logger.out('{}Volume list [{pool}]:{} {plist}'.format(fmt_blue, fmt_end, pool=pool, plist=' '.join(volume_list[pool])), state='i') + # Update the new list + volume_list[pool] = new_volume_list + logger.out(f'{logger.fmt_blue}Volume list [{pool}:{logger.fmt_end} {" ".join(volume_list[pool])}', state='i') + # Start keepalived thread + keepalive_timer = pvcnoded.util.keepalive.start_keepalive_timer(logger, config, zkhandler, this_node) -############################################################################### -# PHASE 9 - Run the daemon -############################################################################### - -# Ceph stats update function -def collect_ceph_stats(queue): - if debug: - logger.out("Thread starting", state='d', prefix='ceph-thread') - - # Connect to the Ceph cluster - try: - ceph_conn = Rados(conffile=config['ceph_config_file'], conf=dict(keyring=config['ceph_admin_keyring'])) - if debug: - logger.out("Connecting to cluster", state='d', prefix='ceph-thread') - 
ceph_conn.connect(timeout=1) - except Exception as e: - logger.out('Failed to open connection to Ceph cluster: {}'.format(e), state='e') - return - - if debug: - logger.out("Getting health stats from monitor", state='d', prefix='ceph-thread') - - # Get Ceph cluster health for local status output - command = {"prefix": "health", "format": "json"} - try: - health_status = json.loads(ceph_conn.mon_command(json.dumps(command), b'', timeout=1)[1]) - ceph_health = health_status['status'] - except Exception as e: - logger.out('Failed to obtain Ceph health data: {}'.format(e), state='e') - ceph_health = 'HEALTH_UNKN' - - if ceph_health in ['HEALTH_OK']: - ceph_health_colour = fmt_green - elif ceph_health in ['HEALTH_UNKN']: - ceph_health_colour = fmt_cyan - elif ceph_health in ['HEALTH_WARN']: - ceph_health_colour = fmt_yellow - else: - ceph_health_colour = fmt_red - - # Primary-only functions - if this_node.router_state == 'primary': - if debug: - logger.out("Set ceph health information in zookeeper (primary only)", state='d', prefix='ceph-thread') - - command = {"prefix": "status", "format": "pretty"} - ceph_status = ceph_conn.mon_command(json.dumps(command), b'', timeout=1)[1].decode('ascii') + # Tick loop; does nothing since everything is async + while True: try: - zkhandler.write([ - ('base.storage', str(ceph_status)) - ]) - except Exception as e: - logger.out('Failed to set Ceph status data: {}'.format(e), state='e') - - if debug: - logger.out("Set ceph rados df information in zookeeper (primary only)", state='d', prefix='ceph-thread') - - # Get rados df info - command = {"prefix": "df", "format": "pretty"} - ceph_df = ceph_conn.mon_command(json.dumps(command), b'', timeout=1)[1].decode('ascii') - try: - zkhandler.write([ - ('base.storage.util', str(ceph_df)) - ]) - except Exception as e: - logger.out('Failed to set Ceph utilization data: {}'.format(e), state='e') - - if debug: - logger.out("Set pool information in zookeeper (primary only)", state='d', 
prefix='ceph-thread') - - # Get pool info - command = {"prefix": "df", "format": "json"} - ceph_df_output = ceph_conn.mon_command(json.dumps(command), b'', timeout=1)[1].decode('ascii') - try: - ceph_pool_df_raw = json.loads(ceph_df_output)['pools'] - except Exception as e: - logger.out('Failed to obtain Pool data (ceph df): {}'.format(e), state='w') - ceph_pool_df_raw = [] - - retcode, stdout, stderr = common.run_os_command('rados df --format json', timeout=1) - try: - rados_pool_df_raw = json.loads(stdout)['pools'] - except Exception as e: - logger.out('Failed to obtain Pool data (rados df): {}'.format(e), state='w') - rados_pool_df_raw = [] - - pool_count = len(ceph_pool_df_raw) - if debug: - logger.out("Getting info for {} pools".format(pool_count), state='d', prefix='ceph-thread') - for pool_idx in range(0, pool_count): - try: - # Combine all the data for this pool - ceph_pool_df = ceph_pool_df_raw[pool_idx] - rados_pool_df = rados_pool_df_raw[pool_idx] - pool = ceph_pool_df - pool.update(rados_pool_df) - - # Ignore any pools that aren't in our pool list - if pool['name'] not in pool_list: - if debug: - logger.out("Pool {} not in pool list {}".format(pool['name'], pool_list), state='d', prefix='ceph-thread') - continue - else: - if debug: - logger.out("Parsing data for pool {}".format(pool['name']), state='d', prefix='ceph-thread') - - # Assemble a useful data structure - pool_df = { - 'id': pool['id'], - 'stored_bytes': pool['stats']['stored'], - 'free_bytes': pool['stats']['max_avail'], - 'used_bytes': pool['stats']['bytes_used'], - 'used_percent': pool['stats']['percent_used'], - 'num_objects': pool['stats']['objects'], - 'num_object_clones': pool['num_object_clones'], - 'num_object_copies': pool['num_object_copies'], - 'num_objects_missing_on_primary': pool['num_objects_missing_on_primary'], - 'num_objects_unfound': pool['num_objects_unfound'], - 'num_objects_degraded': pool['num_objects_degraded'], - 'read_ops': pool['read_ops'], - 'read_bytes': 
pool['read_bytes'], - 'write_ops': pool['write_ops'], - 'write_bytes': pool['write_bytes'] - } - - # Write the pool data to Zookeeper - zkhandler.write([ - (('pool.stats', pool['name']), str(json.dumps(pool_df))) - ]) - except Exception as e: - # One or more of the status commands timed out, just continue - logger.out('Failed to format and send pool data: {}'.format(e), state='w') - pass - - # Only grab OSD stats if there are OSDs to grab (otherwise `ceph osd df` hangs) - osds_this_node = 0 - if len(osd_list) > 0: - # Get data from Ceph OSDs - if debug: - logger.out("Get data from Ceph OSDs", state='d', prefix='ceph-thread') - - # Parse the dump data - osd_dump = dict() - - command = {"prefix": "osd dump", "format": "json"} - osd_dump_output = ceph_conn.mon_command(json.dumps(command), b'', timeout=1)[1].decode('ascii') - try: - osd_dump_raw = json.loads(osd_dump_output)['osds'] - except Exception as e: - logger.out('Failed to obtain OSD data: {}'.format(e), state='w') - osd_dump_raw = [] - - if debug: - logger.out("Loop through OSD dump", state='d', prefix='ceph-thread') - for osd in osd_dump_raw: - osd_dump.update({ - str(osd['osd']): { - 'uuid': osd['uuid'], - 'up': osd['up'], - 'in': osd['in'], - 'primary_affinity': osd['primary_affinity'] - } - }) - - # Parse the df data - if debug: - logger.out("Parse the OSD df data", state='d', prefix='ceph-thread') - - osd_df = dict() - - command = {"prefix": "osd df", "format": "json"} - try: - osd_df_raw = json.loads(ceph_conn.mon_command(json.dumps(command), b'', timeout=1)[1])['nodes'] - except Exception as e: - logger.out('Failed to obtain OSD data: {}'.format(e), state='w') - osd_df_raw = [] - - if debug: - logger.out("Loop through OSD df", state='d', prefix='ceph-thread') - for osd in osd_df_raw: - osd_df.update({ - str(osd['id']): { - 'utilization': osd['utilization'], - 'var': osd['var'], - 'pgs': osd['pgs'], - 'kb': osd['kb'], - 'weight': osd['crush_weight'], - 'reweight': osd['reweight'], - } - }) - - # Parse 
the status data - if debug: - logger.out("Parse the OSD status data", state='d', prefix='ceph-thread') - - osd_status = dict() - - command = {"prefix": "osd status", "format": "pretty"} - try: - osd_status_raw = ceph_conn.mon_command(json.dumps(command), b'', timeout=1)[1].decode('ascii') - except Exception as e: - logger.out('Failed to obtain OSD status data: {}'.format(e), state='w') - osd_status_raw = [] - - if debug: - logger.out("Loop through OSD status data", state='d', prefix='ceph-thread') - - for line in osd_status_raw.split('\n'): - # Strip off colour - line = re.sub(r'\x1b(\[.*?[@-~]|\].*?(\x07|\x1b\\))', '', line) - # Split it for parsing - line = line.split() - if len(line) > 1 and line[1].isdigit(): - # This is an OSD line so parse it - osd_id = line[1] - node = line[3].split('.')[0] - used = line[5] - avail = line[7] - wr_ops = line[9] - wr_data = line[11] - rd_ops = line[13] - rd_data = line[15] - state = line[17] - osd_status.update({ - str(osd_id): { - 'node': node, - 'used': used, - 'avail': avail, - 'wr_ops': wr_ops, - 'wr_data': wr_data, - 'rd_ops': rd_ops, - 'rd_data': rd_data, - 'state': state - } - }) - - # Merge them together into a single meaningful dict - if debug: - logger.out("Merge OSD data together", state='d', prefix='ceph-thread') - - osd_stats = dict() - - for osd in osd_list: - if d_osd[osd].node == myhostname: - osds_this_node += 1 - try: - this_dump = osd_dump[osd] - this_dump.update(osd_df[osd]) - this_dump.update(osd_status[osd]) - osd_stats[osd] = this_dump - except KeyError as e: - # One or more of the status commands timed out, just continue - logger.out('Failed to parse OSD stats into dictionary: {}'.format(e), state='w') - - # Upload OSD data for the cluster (primary-only) - if this_node.router_state == 'primary': - if debug: - logger.out("Trigger updates for each OSD", state='d', prefix='ceph-thread') - - for osd in osd_list: - try: - stats = json.dumps(osd_stats[osd]) - zkhandler.write([ - (('osd.stats', osd), 
str(stats)) - ]) - except KeyError as e: - # One or more of the status commands timed out, just continue - logger.out('Failed to upload OSD stats from dictionary: {}'.format(e), state='w') - - ceph_conn.shutdown() - - queue.put(ceph_health_colour) - queue.put(ceph_health) - queue.put(osds_this_node) - - if debug: - logger.out("Thread finished", state='d', prefix='ceph-thread') - - -# State table for pretty stats -libvirt_vm_states = { - 0: "NOSTATE", - 1: "RUNNING", - 2: "BLOCKED", - 3: "PAUSED", - 4: "SHUTDOWN", - 5: "SHUTOFF", - 6: "CRASHED", - 7: "PMSUSPENDED" -} - - -# VM stats update function -def collect_vm_stats(queue): - if debug: - logger.out("Thread starting", state='d', prefix='vm-thread') - - # Connect to libvirt - libvirt_name = "qemu:///system" - if debug: - logger.out("Connecting to libvirt", state='d', prefix='vm-thread') - lv_conn = libvirt.open(libvirt_name) - if lv_conn is None: - logger.out('Failed to open connection to "{}"'.format(libvirt_name), state='e') - - memalloc = 0 - memprov = 0 - vcpualloc = 0 - # Toggle state management of dead VMs to restart them - if debug: - logger.out("Toggle state management of dead VMs to restart them", state='d', prefix='vm-thread') - # Make a copy of the d_domain; if not, and it changes in flight, this can fail - fixed_d_domain = this_node.d_domain.copy() - for domain, instance in fixed_d_domain.items(): - if domain in this_node.domain_list: - # Add the allocated memory to our memalloc value - memalloc += instance.getmemory() - memprov += instance.getmemory() - vcpualloc += instance.getvcpus() - if instance.getstate() == 'start' and instance.getnode() == this_node.name: - if instance.getdom() is not None: - try: - if instance.getdom().state()[0] != libvirt.VIR_DOMAIN_RUNNING: - logger.out("VM {} has failed".format(instance.domname), state='w', prefix='vm-thread') - raise - except Exception: - # Toggle a state "change" - logger.out("Resetting state to {} for VM {}".format(instance.getstate(), 
instance.domname), state='i', prefix='vm-thread') - zkhandler.write([ - (('domain.state', domain), instance.getstate()) - ]) - elif instance.getnode() == this_node.name: - memprov += instance.getmemory() - - # Get list of running domains from Libvirt - running_domains = lv_conn.listAllDomains(libvirt.VIR_CONNECT_LIST_DOMAINS_ACTIVE) - - # Get statistics from any running VMs - for domain in running_domains: - try: - # Get basic information about the VM - tree = ElementTree.fromstring(domain.XMLDesc()) - domain_uuid = domain.UUIDString() - domain_name = domain.name() - - # Get all the raw information about the VM - if debug: - logger.out("Getting general statistics for VM {}".format(domain_name), state='d', prefix='vm-thread') - domain_state, domain_maxmem, domain_mem, domain_vcpus, domain_cputime = domain.info() - # We can't properly gather stats from a non-running VMs so continue - if domain_state != libvirt.VIR_DOMAIN_RUNNING: - continue - domain_memory_stats = domain.memoryStats() - domain_cpu_stats = domain.getCPUStats(True)[0] - except Exception as e: - logger.out("Failed getting VM information for {}: {}".format(domain.name(), e), state='w', prefix='vm-thread') - continue - - # Ensure VM is present in the domain_list - if domain_uuid not in this_node.domain_list: - this_node.domain_list.append(domain_uuid) - - if debug: - logger.out("Getting disk statistics for VM {}".format(domain_name), state='d', prefix='vm-thread') - domain_disk_stats = [] - try: - for disk in tree.findall('devices/disk'): - disk_name = disk.find('source').get('name') - if not disk_name: - disk_name = disk.find('source').get('file') - disk_stats = domain.blockStats(disk.find('target').get('dev')) - domain_disk_stats.append({ - "name": disk_name, - "rd_req": disk_stats[0], - "rd_bytes": disk_stats[1], - "wr_req": disk_stats[2], - "wr_bytes": disk_stats[3], - "err": disk_stats[4] - }) - except Exception as e: - logger.out("Failed to get disk stats for VM {}: {}".format(domain_name, e), 
state='w', prefix='vm-thread') - continue - - if debug: - logger.out("Getting network statistics for VM {}".format(domain_name), state='d', prefix='vm-thread') - domain_network_stats = [] - try: - for interface in tree.findall('devices/interface'): - interface_type = interface.get('type') - if interface_type not in ['bridge']: - continue - interface_name = interface.find('target').get('dev') - interface_bridge = interface.find('source').get('bridge') - interface_stats = domain.interfaceStats(interface_name) - domain_network_stats.append({ - "name": interface_name, - "bridge": interface_bridge, - "rd_bytes": interface_stats[0], - "rd_packets": interface_stats[1], - "rd_errors": interface_stats[2], - "rd_drops": interface_stats[3], - "wr_bytes": interface_stats[4], - "wr_packets": interface_stats[5], - "wr_errors": interface_stats[6], - "wr_drops": interface_stats[7] - }) - except Exception as e: - logger.out("Failed to get network stats for VM {}: {}".format(domain_name, e), state='w', prefix='vm-thread') - continue - - # Create the final dictionary - domain_stats = { - "state": libvirt_vm_states[domain_state], - "maxmem": domain_maxmem, - "livemem": domain_mem, - "cpus": domain_vcpus, - "cputime": domain_cputime, - "mem_stats": domain_memory_stats, - "cpu_stats": domain_cpu_stats, - "disk_stats": domain_disk_stats, - "net_stats": domain_network_stats - } - - if debug: - logger.out("Writing statistics for VM {} to Zookeeper".format(domain_name), state='d', prefix='vm-thread') - - try: - zkhandler.write([ - (('domain.stats', domain_uuid), str(json.dumps(domain_stats))) - ]) - except Exception as e: - if debug: - logger.out("{}".format(e), state='d', prefix='vm-thread') - - # Close the Libvirt connection - lv_conn.close() - - queue.put(len(running_domains)) - queue.put(memalloc) - queue.put(memprov) - queue.put(vcpualloc) - - if debug: - logger.out("Thread finished", state='d', prefix='vm-thread') - - -# Keepalive update function -@common.Profiler(config) -def 
node_keepalive(): - if debug: - logger.out("Keepalive starting", state='d', prefix='main-thread') - - # Set the migration selector in Zookeeper for clients to read - if config['enable_hypervisor']: - if this_node.router_state == 'primary': - try: - if zkhandler.read('base.config.migration_target_selector') != config['migration_target_selector']: - raise - except Exception: - zkhandler.write([ - ('base.config.migration_target_selector', config['migration_target_selector']) - ]) - - # Set the upstream IP in Zookeeper for clients to read - if config['enable_networking']: - if this_node.router_state == 'primary': - try: - if zkhandler.read('base.config.upstream_ip') != config['upstream_floating_ip']: - raise - except Exception: - zkhandler.write([ - ('base.config.upstream_ip', config['upstream_floating_ip']) - ]) - - # Get past state and update if needed - if debug: - logger.out("Get past state and update if needed", state='d', prefix='main-thread') - - past_state = zkhandler.read(('node.state.daemon', this_node.name)) - if past_state != 'run' and past_state != 'shutdown': - this_node.daemon_state = 'run' - zkhandler.write([ - (('node.state.daemon', this_node.name), 'run') - ]) - else: - this_node.daemon_state = 'run' - - # Ensure the primary key is properly set - if debug: - logger.out("Ensure the primary key is properly set", state='d', prefix='main-thread') - if this_node.router_state == 'primary': - if zkhandler.read('base.config.primary_node') != this_node.name: - zkhandler.write([ - ('base.config.primary_node', this_node.name) - ]) - - # Run VM statistics collection in separate thread for parallelization - if enable_hypervisor: - vm_thread_queue = Queue() - vm_stats_thread = Thread(target=collect_vm_stats, args=(vm_thread_queue,), kwargs={}) - vm_stats_thread.start() - - # Run Ceph status collection in separate thread for parallelization - if enable_storage: - ceph_thread_queue = Queue() - ceph_stats_thread = Thread(target=collect_ceph_stats, 
args=(ceph_thread_queue,), kwargs={}) - ceph_stats_thread.start() - - # Get node performance statistics - this_node.memtotal = int(psutil.virtual_memory().total / 1024 / 1024) - this_node.memused = int(psutil.virtual_memory().used / 1024 / 1024) - this_node.memfree = int(psutil.virtual_memory().free / 1024 / 1024) - this_node.cpuload = os.getloadavg()[0] - - # Join against running threads - if enable_hypervisor: - vm_stats_thread.join(timeout=4.0) - if vm_stats_thread.is_alive(): - logger.out('VM stats gathering exceeded 4s timeout, continuing', state='w') - if enable_storage: - ceph_stats_thread.join(timeout=4.0) - if ceph_stats_thread.is_alive(): - logger.out('Ceph stats gathering exceeded 4s timeout, continuing', state='w') - - # Get information from thread queues - if enable_hypervisor: - try: - this_node.domains_count = vm_thread_queue.get() - this_node.memalloc = vm_thread_queue.get() - this_node.memprov = vm_thread_queue.get() - this_node.vcpualloc = vm_thread_queue.get() + sleep(1) except Exception: - pass - else: - this_node.domains_count = 0 - this_node.memalloc = 0 - this_node.memprov = 0 - this_node.vcpualloc = 0 - - if enable_storage: - try: - ceph_health_colour = ceph_thread_queue.get() - ceph_health = ceph_thread_queue.get() - osds_this_node = ceph_thread_queue.get() - except Exception: - ceph_health_colour = fmt_cyan - ceph_health = 'UNKNOWN' - osds_this_node = '?' 
- - # Set our information in zookeeper - keepalive_time = int(time.time()) - if debug: - logger.out("Set our information in zookeeper", state='d', prefix='main-thread') - try: - zkhandler.write([ - (('node.memory.total', this_node.name), str(this_node.memtotal)), - (('node.memory.used', this_node.name), str(this_node.memused)), - (('node.memory.free', this_node.name), str(this_node.memfree)), - (('node.memory.allocated', this_node.name), str(this_node.memalloc)), - (('node.memory.provisioned', this_node.name), str(this_node.memprov)), - (('node.vcpu.allocated', this_node.name), str(this_node.vcpualloc)), - (('node.cpu.load', this_node.name), str(this_node.cpuload)), - (('node.count.provisioned_domains', this_node.name), str(this_node.domains_count)), - (('node.running_domains', this_node.name), ' '.join(this_node.domain_list)), - (('node.keepalive', this_node.name), str(keepalive_time)), - ]) - except Exception: - logger.out('Failed to set keepalive data', state='e') - - # Display node information to the terminal - if config['log_keepalives']: - if this_node.router_state == 'primary': - cst_colour = fmt_green - elif this_node.router_state == 'secondary': - cst_colour = fmt_blue - else: - cst_colour = fmt_cyan - logger.out( - '{}{} keepalive @ {}{} [{}{}{}]'.format( - fmt_purple, - myhostname, - datetime.now(), - fmt_end, - fmt_bold + cst_colour, - this_node.router_state, - fmt_end - ), - state='t' - ) - if config['log_keepalive_cluster_details']: - logger.out( - '{bold}Maintenance:{nofmt} {maint} ' - '{bold}Active VMs:{nofmt} {domcount} ' - '{bold}Networks:{nofmt} {netcount} ' - '{bold}Load:{nofmt} {load} ' - '{bold}Memory [MiB]: VMs:{nofmt} {allocmem} ' - '{bold}Used:{nofmt} {usedmem} ' - '{bold}Free:{nofmt} {freemem}'.format( - bold=fmt_bold, - nofmt=fmt_end, - maint=maintenance, - domcount=this_node.domains_count, - netcount=len(network_list), - load=this_node.cpuload, - freemem=this_node.memfree, - usedmem=this_node.memused, - allocmem=this_node.memalloc, - ), 
- state='t' - ) - if enable_storage and config['log_keepalive_storage_details']: - logger.out( - '{bold}Ceph cluster status:{nofmt} {health_colour}{health}{nofmt} ' - '{bold}Total OSDs:{nofmt} {total_osds} ' - '{bold}Node OSDs:{nofmt} {node_osds} ' - '{bold}Pools:{nofmt} {total_pools} '.format( - bold=fmt_bold, - health_colour=ceph_health_colour, - nofmt=fmt_end, - health=ceph_health, - total_osds=len(osd_list), - node_osds=osds_this_node, - total_pools=len(pool_list) - ), - state='t' - ) - - # Look for dead nodes and fence them - if not maintenance: - if debug: - logger.out("Look for dead nodes and fence them", state='d', prefix='main-thread') - if config['daemon_mode'] == 'coordinator': - for node_name in d_node: - try: - node_daemon_state = zkhandler.read(('node.state.daemon', node_name)) - node_keepalive = int(zkhandler.read(('node.keepalive', node_name))) - except Exception: - node_daemon_state = 'unknown' - node_keepalive = 0 - - # Handle deadtime and fencng if needed - # (A node is considered dead when its keepalive timer is >6*keepalive_interval seconds - # out-of-date while in 'start' state) - node_deadtime = int(time.time()) - (int(config['keepalive_interval']) * int(config['fence_intervals'])) - if node_keepalive < node_deadtime and node_daemon_state == 'run': - logger.out('Node {} seems dead - starting monitor for fencing'.format(node_name), state='w') - zk_lock = zkhandler.writelock(('node.state.daemon', node_name)) - with zk_lock: - # Ensures that, if we lost the lock race and come out of waiting, - # we won't try to trigger our own fence thread. 
- if zkhandler.read(('node.state.daemon', node_name)) != 'dead': - fence_thread = Thread(target=fencing.fenceNode, args=(node_name, zkhandler, config, logger), kwargs={}) - fence_thread.start() - # Write the updated data after we start the fence thread - zkhandler.write([ - (('node.state.daemon', node_name), 'dead') - ]) - - if debug: - logger.out("Keepalive finished", state='d', prefix='main-thread') - - -# Start keepalive thread -update_timer = startKeepaliveTimer() - -# Tick loop; does nothing since everything else is async -while True: - try: - time.sleep(1) - except Exception: - break + break diff --git a/node-daemon/pvcnoded/dnsmasq-zookeeper-leases.py b/node-daemon/pvcnoded/dnsmasq-zookeeper-leases.py index 4ada3d89..4a643077 100755 --- a/node-daemon/pvcnoded/dnsmasq-zookeeper-leases.py +++ b/node-daemon/pvcnoded/dnsmasq-zookeeper-leases.py @@ -1,4 +1,4 @@ -#!/usr/bin/python3 +#!/usr/bin/env python3 # dnsmasq-zookeeper-leases.py - DNSMASQ leases script for Zookeeper # Part of the Parallel Virtual Cluster (PVC) system diff --git a/node-daemon/pvcnoded/objects/CephInstance.py b/node-daemon/pvcnoded/objects/CephInstance.py new file mode 100644 index 00000000..89a4c5c9 --- /dev/null +++ b/node-daemon/pvcnoded/objects/CephInstance.py @@ -0,0 +1,428 @@ +#!/usr/bin/env python3 + +# CephInstance.py - Class implementing a PVC node Ceph instance +# Part of the Parallel Virtual Cluster (PVC) system +# +# Copyright (C) 2018-2021 Joshua M. Boniface +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, version 3. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. 
+# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . +# +############################################################################### + +import time +import json +import psutil + +import daemon_lib.common as common + + +class CephOSDInstance(object): + def __init__(self, zkhandler, this_node, osd_id): + self.zkhandler = zkhandler + self.this_node = this_node + self.osd_id = osd_id + self.node = None + self.size = None + self.stats = dict() + + @self.zkhandler.zk_conn.DataWatch(self.zkhandler.schema.path('osd.node', self.osd_id)) + def watch_osd_node(data, stat, event=''): + if event and event.type == 'DELETED': + # The key has been deleted after existing before; terminate this watcher + # because this class instance is about to be reaped in Daemon.py + return False + + try: + data = data.decode('ascii') + except AttributeError: + data = '' + + if data and data != self.node: + self.node = data + + @self.zkhandler.zk_conn.DataWatch(self.zkhandler.schema.path('osd.stats', self.osd_id)) + def watch_osd_stats(data, stat, event=''): + if event and event.type == 'DELETED': + # The key has been deleted after existing before; terminate this watcher + # because this class instance is about to be reaped in Daemon.py + return False + + try: + data = data.decode('ascii') + except AttributeError: + data = '' + + if data and data != self.stats: + self.stats = json.loads(data) + + @staticmethod + def add_osd(zkhandler, logger, node, device, weight): + # We are ready to create a new OSD on this node + logger.out('Creating new OSD disk on block device {}'.format(device), state='i') + try: + # 1. Create an OSD; we do this so we know what ID will be gen'd + retcode, stdout, stderr = common.run_os_command('ceph osd create') + if retcode: + print('ceph osd create') + print(stdout) + print(stderr) + raise + osd_id = stdout.rstrip() + + # 2. 
Remove that newly-created OSD + retcode, stdout, stderr = common.run_os_command('ceph osd rm {}'.format(osd_id)) + if retcode: + print('ceph osd rm') + print(stdout) + print(stderr) + raise + + # 3a. Zap the disk to ensure it is ready to go + logger.out('Zapping disk {}'.format(device), state='i') + retcode, stdout, stderr = common.run_os_command('ceph-volume lvm zap --destroy {}'.format(device)) + if retcode: + print('ceph-volume lvm zap') + print(stdout) + print(stderr) + raise + + # 3b. Create the OSD for real + logger.out('Preparing LVM for new OSD disk with ID {} on {}'.format(osd_id, device), state='i') + retcode, stdout, stderr = common.run_os_command( + 'ceph-volume lvm prepare --bluestore --data {device}'.format( + osdid=osd_id, + device=device + ) + ) + if retcode: + print('ceph-volume lvm prepare') + print(stdout) + print(stderr) + raise + + # 4a. Get OSD FSID + logger.out('Getting OSD FSID for ID {} on {}'.format(osd_id, device), state='i') + retcode, stdout, stderr = common.run_os_command( + 'ceph-volume lvm list {device}'.format( + osdid=osd_id, + device=device + ) + ) + for line in stdout.split('\n'): + if 'osd fsid' in line: + osd_fsid = line.split()[-1] + + if not osd_fsid: + print('ceph-volume lvm list') + print('Could not find OSD fsid in data:') + print(stdout) + print(stderr) + raise + + # 4b. Activate the OSD + logger.out('Activating new OSD disk with ID {}'.format(osd_id, device), state='i') + retcode, stdout, stderr = common.run_os_command( + 'ceph-volume lvm activate --bluestore {osdid} {osdfsid}'.format( + osdid=osd_id, + osdfsid=osd_fsid + ) + ) + if retcode: + print('ceph-volume lvm activate') + print(stdout) + print(stderr) + raise + + # 5. 
Add it to the crush map + logger.out('Adding new OSD disk with ID {} to CRUSH map'.format(osd_id), state='i') + retcode, stdout, stderr = common.run_os_command( + 'ceph osd crush add osd.{osdid} {weight} root=default host={node}'.format( + osdid=osd_id, + weight=weight, + node=node + ) + ) + if retcode: + print('ceph osd crush add') + print(stdout) + print(stderr) + raise + time.sleep(0.5) + + # 6. Verify it started + retcode, stdout, stderr = common.run_os_command( + 'systemctl status ceph-osd@{osdid}'.format( + osdid=osd_id + ) + ) + if retcode: + print('systemctl status') + print(stdout) + print(stderr) + raise + + # 7. Add the new OSD to the list + logger.out('Adding new OSD disk with ID {} to Zookeeper'.format(osd_id), state='i') + zkhandler.write([ + (('osd', osd_id), ''), + (('osd.node', osd_id), node), + (('osd.device', osd_id), device), + (('osd.stats', osd_id), '{}'), + ]) + + # Log it + logger.out('Created new OSD disk with ID {}'.format(osd_id), state='o') + return True + except Exception as e: + # Log it + logger.out('Failed to create new OSD disk: {}'.format(e), state='e') + return False + + @staticmethod + def remove_osd(zkhandler, logger, osd_id, osd_obj): + logger.out('Removing OSD disk {}'.format(osd_id), state='i') + try: + # 1. Verify the OSD is present + retcode, stdout, stderr = common.run_os_command('ceph osd ls') + osd_list = stdout.split('\n') + if osd_id not in osd_list: + logger.out('Could not find OSD {} in the cluster'.format(osd_id), state='e') + return True + + # 1. Set the OSD out so it will flush + logger.out('Setting out OSD disk with ID {}'.format(osd_id), state='i') + retcode, stdout, stderr = common.run_os_command('ceph osd out {}'.format(osd_id)) + if retcode: + print('ceph osd out') + print(stdout) + print(stderr) + raise + + # 2. 
Wait for the OSD to flush + logger.out('Flushing OSD disk with ID {}'.format(osd_id), state='i') + osd_string = str() + while True: + try: + retcode, stdout, stderr = common.run_os_command('ceph pg dump osds --format json') + dump_string = json.loads(stdout) + for osd in dump_string: + if str(osd['osd']) == osd_id: + osd_string = osd + num_pgs = osd_string['num_pgs'] + if num_pgs > 0: + time.sleep(5) + else: + raise + except Exception: + break + + # 3. Stop the OSD process and wait for it to be terminated + logger.out('Stopping OSD disk with ID {}'.format(osd_id), state='i') + retcode, stdout, stderr = common.run_os_command('systemctl stop ceph-osd@{}'.format(osd_id)) + if retcode: + print('systemctl stop') + print(stdout) + print(stderr) + raise + + # FIXME: There has to be a better way to do this /shrug + while True: + is_osd_up = False + # Find if there is a process named ceph-osd with arg '--id {id}' + for p in psutil.process_iter(attrs=['name', 'cmdline']): + if 'ceph-osd' == p.info['name'] and '--id {}'.format(osd_id) in ' '.join(p.info['cmdline']): + is_osd_up = True + # If there isn't, continue + if not is_osd_up: + break + + # 4. Determine the block devices + retcode, stdout, stderr = common.run_os_command('readlink /var/lib/ceph/osd/ceph-{}/block'.format(osd_id)) + vg_name = stdout.split('/')[-2] # e.g. /dev/ceph-/osd-block- + retcode, stdout, stderr = common.run_os_command('vgs --separator , --noheadings -o pv_name {}'.format(vg_name)) + pv_block = stdout.strip() + + # 5. Zap the volumes + logger.out('Zapping OSD disk with ID {} on {}'.format(osd_id, pv_block), state='i') + retcode, stdout, stderr = common.run_os_command('ceph-volume lvm zap --destroy {}'.format(pv_block)) + if retcode: + print('ceph-volume lvm zap') + print(stdout) + print(stderr) + raise + + # 6. 
Purge the OSD from Ceph + logger.out('Purging OSD disk with ID {}'.format(osd_id), state='i') + retcode, stdout, stderr = common.run_os_command('ceph osd purge {} --yes-i-really-mean-it'.format(osd_id)) + if retcode: + print('ceph osd purge') + print(stdout) + print(stderr) + raise + + # 7. Delete OSD from ZK + logger.out('Deleting OSD disk with ID {} from Zookeeper'.format(osd_id), state='i') + zkhandler.delete(('osd', osd_id), recursive=True) + + # Log it + logger.out('Removed OSD disk with ID {}'.format(osd_id), state='o') + return True + except Exception as e: + # Log it + logger.out('Failed to purge OSD disk with ID {}: {}'.format(osd_id, e), state='e') + return False + + +class CephPoolInstance(object): + def __init__(self, zkhandler, this_node, name): + self.zkhandler = zkhandler + self.this_node = this_node + self.name = name + self.pgs = '' + self.stats = dict() + + @self.zkhandler.zk_conn.DataWatch(self.zkhandler.schema.path('pool.pgs', self.name)) + def watch_pool_node(data, stat, event=''): + if event and event.type == 'DELETED': + # The key has been deleted after existing before; terminate this watcher + # because this class instance is about to be reaped in Daemon.py + return False + + try: + data = data.decode('ascii') + except AttributeError: + data = '' + + if data and data != self.pgs: + self.pgs = data + + @self.zkhandler.zk_conn.DataWatch(self.zkhandler.schema.path('pool.stats', self.name)) + def watch_pool_stats(data, stat, event=''): + if event and event.type == 'DELETED': + # The key has been deleted after existing before; terminate this watcher + # because this class instance is about to be reaped in Daemon.py + return False + + try: + data = data.decode('ascii') + except AttributeError: + data = '' + + if data and data != self.stats: + self.stats = json.loads(data) + + +class CephVolumeInstance(object): + def __init__(self, zkhandler, this_node, pool, name): + self.zkhandler = zkhandler + self.this_node = this_node + self.pool = pool + 
self.name = name + self.stats = dict() + + @self.zkhandler.zk_conn.DataWatch(self.zkhandler.schema.path('volume.stats', f'{self.pool}/{self.name}')) + def watch_volume_stats(data, stat, event=''): + if event and event.type == 'DELETED': + # The key has been deleted after existing before; terminate this watcher + # because this class instance is about to be reaped in Daemon.py + return False + + try: + data = data.decode('ascii') + except AttributeError: + data = '' + + if data and data != self.stats: + self.stats = json.loads(data) + + +class CephSnapshotInstance(object): + def __init__(self, zkhandler, this_node, pool, volume, name): + self.zkhandler = zkhandler + self.this_node = this_node + self.pool = pool + self.volume = volume + self.name = name + self.stats = dict() + + @self.zkhandler.zk_conn.DataWatch(self.zkhandler.schema.path('snapshot.stats', f'{self.pool}/{self.volume}/{self.name}')) + def watch_snapshot_stats(data, stat, event=''): + if event and event.type == 'DELETED': + # The key has been deleted after existing before; terminate this watcher + # because this class instance is about to be reaped in Daemon.py + return False + + try: + data = data.decode('ascii') + except AttributeError: + data = '' + + if data and data != self.stats: + self.stats = json.loads(data) + + +# Primary command function +# This command pipe is only used for OSD adds and removes +def ceph_command(zkhandler, logger, this_node, data, d_osd): + # Get the command and args + command, args = data.split() + + # Adding a new OSD + if command == 'osd_add': + node, device, weight = args.split(',') + if node == this_node.name: + # Lock the command queue + zk_lock = zkhandler.writelock('base.cmd.ceph') + with zk_lock: + # Add the OSD + result = CephOSDInstance.add_osd(zkhandler, logger, node, device, weight) + # Command succeeded + if result: + # Update the command queue + zkhandler.write([ + ('base.cmd.ceph', 'success-{}'.format(data)) + ]) + # Command failed + else: + # Update the 
command queue + zkhandler.write([ + ('base.cmd.ceph', 'failure-{}'.format(data)) + ]) + # Wait 1 seconds before we free the lock, to ensure the client hits the lock + time.sleep(1) + + # Removing an OSD + elif command == 'osd_remove': + osd_id = args + + # Verify osd_id is in the list + if d_osd[osd_id] and d_osd[osd_id].node == this_node.name: + # Lock the command queue + zk_lock = zkhandler.writelock('base.cmd.ceph') + with zk_lock: + # Remove the OSD + result = CephOSDInstance.remove_osd(zkhandler, logger, osd_id, d_osd[osd_id]) + # Command succeeded + if result: + # Update the command queue + zkhandler.write([ + ('base.cmd.ceph', 'success-{}'.format(data)) + ]) + # Command failed + else: + # Update the command queue + zkhandler.write([ + ('base.cmd.ceph', 'failure-{}'.format(data)) + ]) + # Wait 1 seconds before we free the lock, to ensure the client hits the lock + time.sleep(1) diff --git a/node-daemon/pvcnoded/DNSAggregatorInstance.py b/node-daemon/pvcnoded/objects/DNSAggregatorInstance.py similarity index 98% rename from node-daemon/pvcnoded/DNSAggregatorInstance.py rename to node-daemon/pvcnoded/objects/DNSAggregatorInstance.py index 9a2f94d0..e3e3bc8e 100644 --- a/node-daemon/pvcnoded/DNSAggregatorInstance.py +++ b/node-daemon/pvcnoded/objects/DNSAggregatorInstance.py @@ -74,7 +74,7 @@ class PowerDNSInstance(object): self.dns_server_daemon = None # Floating upstreams - self.vni_floatingipaddr, self.vni_cidrnetmask = self.config['vni_floating_ip'].split('/') + self.cluster_floatingipaddr, self.cluster_cidrnetmask = self.config['cluster_floating_ip'].split('/') self.upstream_floatingipaddr, self.upstream_cidrnetmask = self.config['upstream_floating_ip'].split('/') def start(self): @@ -91,7 +91,7 @@ class PowerDNSInstance(object): '--disable-syslog=yes', # Log only to stdout (which is then captured) '--disable-axfr=no', # Allow AXFRs '--allow-axfr-ips=0.0.0.0/0', # Allow AXFRs to anywhere - '--local-address={},{}'.format(self.vni_floatingipaddr, 
self.upstream_floatingipaddr), # Listen on floating IPs + '--local-address={},{}'.format(self.cluster_floatingipaddr, self.upstream_floatingipaddr), # Listen on floating IPs '--local-port=53', # On port 53 '--log-dns-details=on', # Log details '--loglevel=3', # Log info diff --git a/node-daemon/pvcnoded/MetadataAPIInstance.py b/node-daemon/pvcnoded/objects/MetadataAPIInstance.py similarity index 100% rename from node-daemon/pvcnoded/MetadataAPIInstance.py rename to node-daemon/pvcnoded/objects/MetadataAPIInstance.py diff --git a/node-daemon/pvcnoded/NodeInstance.py b/node-daemon/pvcnoded/objects/NodeInstance.py similarity index 97% rename from node-daemon/pvcnoded/NodeInstance.py rename to node-daemon/pvcnoded/objects/NodeInstance.py index 43c4df41..72f4f50f 100644 --- a/node-daemon/pvcnoded/NodeInstance.py +++ b/node-daemon/pvcnoded/objects/NodeInstance.py @@ -65,9 +65,9 @@ class NodeInstance(object): self.upstream_dev = self.config['upstream_dev'] self.upstream_floatingipaddr = self.config['upstream_floating_ip'].split('/')[0] self.upstream_ipaddr, self.upstream_cidrnetmask = self.config['upstream_dev_ip'].split('/') - self.vni_dev = self.config['vni_dev'] - self.vni_floatingipaddr = self.config['vni_floating_ip'].split('/')[0] - self.vni_ipaddr, self.vni_cidrnetmask = self.config['vni_dev_ip'].split('/') + self.cluster_dev = self.config['cluster_dev'] + self.cluster_floatingipaddr = self.config['cluster_floating_ip'].split('/')[0] + self.cluster_ipaddr, self.cluster_cidrnetmask = self.config['cluster_dev_ip'].split('/') self.storage_dev = self.config['storage_dev'] self.storage_floatingipaddr = self.config['storage_floating_ip'].split('/')[0] self.storage_ipaddr, self.storage_cidrnetmask = self.config['storage_dev_ip'].split('/') @@ -76,10 +76,10 @@ class NodeInstance(object): self.upstream_floatingipaddr = None self.upstream_ipaddr = None self.upstream_cidrnetmask = None - self.vni_dev = None - self.vni_floatingipaddr = None - self.vni_ipaddr = None - 
self.vni_cidrnetmask = None + self.cluster_dev = None + self.cluster_floatingipaddr = None + self.cluster_ipaddr = None + self.cluster_cidrnetmask = None self.storage_dev = None self.storage_floatingipaddr = None self.storage_ipaddr = None @@ -387,13 +387,13 @@ class NodeInstance(object): # 2. Add Cluster & Storage floating IP self.logger.out( 'Creating floating management IP {}/{} on interface {}'.format( - self.vni_floatingipaddr, - self.vni_cidrnetmask, + self.cluster_floatingipaddr, + self.cluster_cidrnetmask, 'brcluster' ), state='o' ) - common.createIPAddress(self.vni_floatingipaddr, self.vni_cidrnetmask, 'brcluster') + common.createIPAddress(self.cluster_floatingipaddr, self.cluster_cidrnetmask, 'brcluster') self.logger.out( 'Creating floating storage IP {}/{} on interface {}'.format( self.storage_floatingipaddr, @@ -599,13 +599,13 @@ class NodeInstance(object): # 6. Remove Cluster & Storage floating IP self.logger.out( 'Removing floating management IP {}/{} from interface {}'.format( - self.vni_floatingipaddr, - self.vni_cidrnetmask, + self.cluster_floatingipaddr, + self.cluster_cidrnetmask, 'brcluster' ), state='o' ) - common.removeIPAddress(self.vni_floatingipaddr, self.vni_cidrnetmask, 'brcluster') + common.removeIPAddress(self.cluster_floatingipaddr, self.cluster_cidrnetmask, 'brcluster') self.logger.out( 'Removing floating storage IP {}/{} from interface {}'.format( self.storage_floatingipaddr, diff --git a/node-daemon/pvcnoded/SRIOVVFInstance.py b/node-daemon/pvcnoded/objects/SRIOVVFInstance.py similarity index 100% rename from node-daemon/pvcnoded/SRIOVVFInstance.py rename to node-daemon/pvcnoded/objects/SRIOVVFInstance.py diff --git a/node-daemon/pvcnoded/VMConsoleWatcherInstance.py b/node-daemon/pvcnoded/objects/VMConsoleWatcherInstance.py similarity index 100% rename from node-daemon/pvcnoded/VMConsoleWatcherInstance.py rename to node-daemon/pvcnoded/objects/VMConsoleWatcherInstance.py diff --git a/node-daemon/pvcnoded/VMInstance.py 
b/node-daemon/pvcnoded/objects/VMInstance.py similarity index 92% rename from node-daemon/pvcnoded/VMInstance.py rename to node-daemon/pvcnoded/objects/VMInstance.py index 7a30fe65..0a56fbfa 100644 --- a/node-daemon/pvcnoded/VMInstance.py +++ b/node-daemon/pvcnoded/objects/VMInstance.py @@ -30,86 +30,11 @@ from xml.etree import ElementTree import daemon_lib.common as common -import pvcnoded.VMConsoleWatcherInstance as VMConsoleWatcherInstance +import pvcnoded.objects.VMConsoleWatcherInstance as VMConsoleWatcherInstance import daemon_lib.common as daemon_common -def flush_locks(zkhandler, logger, dom_uuid, this_node=None): - logger.out('Flushing RBD locks for VM "{}"'.format(dom_uuid), state='i') - # Get the list of RBD images - rbd_list = zkhandler.read(('domain.storage.volumes', dom_uuid)).split(',') - - for rbd in rbd_list: - # Check if a lock exists - lock_list_retcode, lock_list_stdout, lock_list_stderr = common.run_os_command('rbd lock list --format json {}'.format(rbd)) - if lock_list_retcode != 0: - logger.out('Failed to obtain lock list for volume "{}"'.format(rbd), state='e') - continue - - try: - lock_list = json.loads(lock_list_stdout) - except Exception as e: - logger.out('Failed to parse lock list for volume "{}": {}'.format(rbd, e), state='e') - continue - - # If there's at least one lock - if lock_list: - # Loop through the locks - for lock in lock_list: - if this_node is not None and zkhandler.read(('domain.state', dom_uuid)) != 'stop' and lock['address'].split(':')[0] != this_node.storage_ipaddr: - logger.out('RBD lock does not belong to this host (lock owner: {}): freeing this lock would be unsafe, aborting'.format(lock['address'].split(':')[0], state='e')) - zkhandler.write([ - (('domain.state', dom_uuid), 'fail'), - (('domain.failed_reason', dom_uuid), 'Could not safely free RBD lock {} ({}) on volume {}; stop VM and flush locks manually'.format(lock['id'], lock['address'], rbd)), - ]) - break - # Free the lock - lock_remove_retcode, 
lock_remove_stdout, lock_remove_stderr = common.run_os_command('rbd lock remove {} "{}" "{}"'.format(rbd, lock['id'], lock['locker'])) - if lock_remove_retcode != 0: - logger.out('Failed to free RBD lock "{}" on volume "{}": {}'.format(lock['id'], rbd, lock_remove_stderr), state='e') - zkhandler.write([ - (('domain.state', dom_uuid), 'fail'), - (('domain.failed_reason', dom_uuid), 'Could not free RBD lock {} ({}) on volume {}: {}'.format(lock['id'], lock['address'], rbd, lock_remove_stderr)), - ]) - break - logger.out('Freed RBD lock "{}" on volume "{}"'.format(lock['id'], rbd), state='o') - - return True - - -# Primary command function -def run_command(zkhandler, logger, this_node, data): - # Get the command and args - command, args = data.split() - - # Flushing VM RBD locks - if command == 'flush_locks': - dom_uuid = args - - # Verify that the VM is set to run on this node - if this_node.d_domain[dom_uuid].getnode() == this_node.name: - # Lock the command queue - zk_lock = zkhandler.writelock('base.cmd.domain') - with zk_lock: - # Flush the lock - result = flush_locks(zkhandler, logger, dom_uuid, this_node) - # Command succeeded - if result: - # Update the command queue - zkhandler.write([ - ('base.cmd.domain', 'success-{}'.format(data)) - ]) - # Command failed - else: - # Update the command queue - zkhandler.write([ - ('base.cmd.domain', 'failure-{}'.format(data)) - ]) - # Wait 1 seconds before we free the lock, to ensure the client hits the lock - time.sleep(1) - - class VMInstance(object): # Initialization function def __init__(self, domuuid, zkhandler, config, logger, this_node): @@ -265,7 +190,7 @@ class VMInstance(object): if self.getdom() is None or self.getdom().state()[0] != libvirt.VIR_DOMAIN_RUNNING: # Flush locks self.logger.out('Flushing RBD locks', state='i', prefix='Domain {}'.format(self.domuuid)) - flush_locks(self.zkhandler, self.logger, self.domuuid, self.this_node) + VMInstance.flush_locks(self.zkhandler, self.logger, self.domuuid, 
self.this_node) if self.zkhandler.read(('domain.state', self.domuuid)) == 'fail': lv_conn.close() self.dom = None @@ -877,3 +802,79 @@ class VMInstance(object): # Return the dom object (or None) return dom + + # Flush the locks of a VM based on UUID + @staticmethod + def flush_locks(zkhandler, logger, dom_uuid, this_node=None): + logger.out('Flushing RBD locks for VM "{}"'.format(dom_uuid), state='i') + # Get the list of RBD images + rbd_list = zkhandler.read(('domain.storage.volumes', dom_uuid)).split(',') + + for rbd in rbd_list: + # Check if a lock exists + lock_list_retcode, lock_list_stdout, lock_list_stderr = common.run_os_command('rbd lock list --format json {}'.format(rbd)) + if lock_list_retcode != 0: + logger.out('Failed to obtain lock list for volume "{}"'.format(rbd), state='e') + continue + + try: + lock_list = json.loads(lock_list_stdout) + except Exception as e: + logger.out('Failed to parse lock list for volume "{}": {}'.format(rbd, e), state='e') + continue + + # If there's at least one lock + if lock_list: + # Loop through the locks + for lock in lock_list: + if this_node is not None and zkhandler.read(('domain.state', dom_uuid)) != 'stop' and lock['address'].split(':')[0] != this_node.storage_ipaddr: + logger.out('RBD lock does not belong to this host (lock owner: {}): freeing this lock would be unsafe, aborting'.format(lock['address'].split(':')[0], state='e')) + zkhandler.write([ + (('domain.state', dom_uuid), 'fail'), + (('domain.failed_reason', dom_uuid), 'Could not safely free RBD lock {} ({}) on volume {}; stop VM and flush locks manually'.format(lock['id'], lock['address'], rbd)), + ]) + break + # Free the lock + lock_remove_retcode, lock_remove_stdout, lock_remove_stderr = common.run_os_command('rbd lock remove {} "{}" "{}"'.format(rbd, lock['id'], lock['locker'])) + if lock_remove_retcode != 0: + logger.out('Failed to free RBD lock "{}" on volume "{}": {}'.format(lock['id'], rbd, lock_remove_stderr), state='e') + zkhandler.write([ + 
(('domain.state', dom_uuid), 'fail'), + (('domain.failed_reason', dom_uuid), 'Could not free RBD lock {} ({}) on volume {}: {}'.format(lock['id'], lock['address'], rbd, lock_remove_stderr)), + ]) + break + logger.out('Freed RBD lock "{}" on volume "{}"'.format(lock['id'], rbd), state='o') + + return True + + +# Primary command function +def vm_command(zkhandler, logger, this_node, data): + # Get the command and args + command, args = data.split() + + # Flushing VM RBD locks + if command == 'flush_locks': + dom_uuid = args + + # Verify that the VM is set to run on this node + if this_node.d_domain[dom_uuid].getnode() == this_node.name: + # Lock the command queue + zk_lock = zkhandler.writelock('base.cmd.domain') + with zk_lock: + # Flush the lock + result = VMInstance.flush_locks(zkhandler, logger, dom_uuid, this_node) + # Command succeeded + if result: + # Update the command queue + zkhandler.write([ + ('base.cmd.domain', 'success-{}'.format(data)) + ]) + # Command failed + else: + # Update the command queue + zkhandler.write([ + ('base.cmd.domain', 'failure-{}'.format(data)) + ]) + # Wait 1 seconds before we free the lock, to ensure the client hits the lock + time.sleep(1) diff --git a/node-daemon/pvcnoded/VXNetworkInstance.py b/node-daemon/pvcnoded/objects/VXNetworkInstance.py similarity index 99% rename from node-daemon/pvcnoded/VXNetworkInstance.py rename to node-daemon/pvcnoded/objects/VXNetworkInstance.py index 5af83173..35680d99 100644 --- a/node-daemon/pvcnoded/VXNetworkInstance.py +++ b/node-daemon/pvcnoded/objects/VXNetworkInstance.py @@ -36,8 +36,8 @@ class VXNetworkInstance(object): self.logger = logger self.this_node = this_node self.dns_aggregator = dns_aggregator - self.vni_dev = config['vni_dev'] - self.vni_mtu = config['vni_mtu'] + self.cluster_dev = config['cluster_dev'] + self.cluster_mtu = config['cluster_mtu'] self.bridge_dev = config['bridge_dev'] self.nettype = self.zkhandler.read(('network.type', self.vni)) @@ -481,7 +481,7 @@ add rule inet 
filter forward ip6 saddr {netaddr6} counter jump {vxlannic}-out ) # Set MTU of vLAN and bridge NICs - vx_mtu = self.vni_mtu + vx_mtu = self.cluster_mtu common.run_os_command( 'ip link set {} mtu {} up'.format( self.vlan_nic, @@ -521,7 +521,7 @@ add rule inet filter forward ip6 saddr {netaddr6} counter jump {vxlannic}-out def createNetworkManaged(self): self.logger.out( 'Creating VXLAN device on interface {}'.format( - self.vni_dev + self.cluster_dev ), prefix='VNI {}'.format(self.vni), state='o' @@ -532,7 +532,7 @@ add rule inet filter forward ip6 saddr {netaddr6} counter jump {vxlannic}-out 'ip link add {} type vxlan id {} dstport 4789 dev {}'.format( self.vxlan_nic, self.vni, - self.vni_dev + self.cluster_dev ) ) # Create bridge interface @@ -543,7 +543,7 @@ add rule inet filter forward ip6 saddr {netaddr6} counter jump {vxlannic}-out ) # Set MTU of VXLAN and bridge NICs - vx_mtu = self.vni_mtu - 50 + vx_mtu = self.cluster_mtu - 50 common.run_os_command( 'ip link set {} mtu {} up'.format( self.vxlan_nic, @@ -716,7 +716,7 @@ add rule inet filter forward ip6 saddr {netaddr6} counter jump {vxlannic}-out def removeNetworkBridged(self): self.logger.out( 'Removing VNI device on interface {}'.format( - self.vni_dev + self.cluster_dev ), prefix='VNI {}'.format(self.vni), state='o' @@ -752,7 +752,7 @@ add rule inet filter forward ip6 saddr {netaddr6} counter jump {vxlannic}-out def removeNetworkManaged(self): self.logger.out( 'Removing VNI device on interface {}'.format( - self.vni_dev + self.cluster_dev ), prefix='VNI {}'.format(self.vni), state='o' diff --git a/node-daemon/pvcnoded/objects/__init__.py b/node-daemon/pvcnoded/objects/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/node-daemon/pvcnoded/util/__init__.py b/node-daemon/pvcnoded/util/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/node-daemon/pvcnoded/util/config.py b/node-daemon/pvcnoded/util/config.py new file mode 100644 index 00000000..8561cd5b --- /dev/null 
+++ b/node-daemon/pvcnoded/util/config.py @@ -0,0 +1,384 @@ +#!/usr/bin/env python3 + +# config.py - Utility functions for pvcnoded configuration parsing +# Part of the Parallel Virtual Cluster (PVC) system +# +# Copyright (C) 2018-2021 Joshua M. Boniface +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, version 3. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . +# +############################################################################### + +import os +import subprocess +import yaml +from socket import gethostname +from re import findall +from psutil import cpu_count +from ipaddress import ip_address, ip_network + + +class MalformedConfigurationError(Exception): + """ + An except when parsing the PVC Node daemon configuration file + """ + def __init__(self, error=None): + self.msg = f'ERROR: Configuration file is malformed: {error}' + + def __str__(self): + return str(self.msg) + + +def get_static_data(): + """ + Data that is obtained once at node startup for use later + """ + staticdata = list() + staticdata.append(str(cpu_count())) # CPU count + staticdata.append( + subprocess.run( + ['uname', '-r'], stdout=subprocess.PIPE + ).stdout.decode('ascii').strip() + ) + staticdata.append( + subprocess.run( + ['uname', '-o'], stdout=subprocess.PIPE + ).stdout.decode('ascii').strip() + ) + staticdata.append( + subprocess.run( + ['uname', '-m'], stdout=subprocess.PIPE + ).stdout.decode('ascii').strip() + ) + + return staticdata + + +def get_configuration_path(): + try: + return os.environ['PVCD_CONFIG_FILE'] + except KeyError: + 
print('ERROR: The "PVCD_CONFIG_FILE" environment variable must be set.') + os._exit(1) + + +def get_hostname(): + node_fqdn = gethostname() + node_hostname = node_fqdn.split('.', 1)[0] + node_domain = ''.join(node_fqdn.split('.', 1)[1:]) + try: + node_id = findall(r'\d+', node_hostname)[-1] + except IndexError: + node_id = 0 + + return node_fqdn, node_hostname, node_domain, node_id + + +def validate_floating_ip(config, network): + if network not in ['cluster', 'storage', 'upstream']: + return False, f'Specified network type "{network}" is not valid' + + floating_key = f'{network}_floating_ip' + network_key = f'{network}_network' + + # Verify the network provided is valid + try: + network = ip_network(config[network_key]) + except Exception: + return False, f'Network address {config[network_key]} for {network_key} is not valid' + + # Verify that the floating IP is valid (and in the network) + try: + floating_address = ip_address(config[floating_key].split('/')[0]) + if floating_address not in list(network.hosts()): + raise + except Exception: + return False, f'Floating address {config[floating_key]} for {floating_key} is not valid' + + return True, '' + + +def get_configuration(): + """ + Parse the configuration of the node daemon. 
+ """ + pvcnoded_config_file = get_configuration_path() + + print('Loading configuration from file "{}"'.format(pvcnoded_config_file)) + + with open(pvcnoded_config_file, 'r') as cfgfile: + try: + o_config = yaml.load(cfgfile, Loader=yaml.SafeLoader) + except Exception as e: + print('ERROR: Failed to parse configuration file: {}'.format(e)) + os._exit(1) + + node_fqdn, node_hostname, node_domain, node_id = get_hostname() + + # Create the configuration dictionary + config = dict() + + # Get the initial base configuration + try: + o_base = o_config['pvc'] + o_cluster = o_config['pvc']['cluster'] + except Exception as e: + raise MalformedConfigurationError(e) + + config_general = { + 'node': o_base.get('node', node_hostname), + 'node_hostname': node_hostname, + 'node_fqdn': node_fqdn, + 'node_domain': node_domain, + 'node_id': node_id, + 'coordinators': o_cluster.get('coordinators', list()), + 'debug': o_base.get('debug', False), + } + + config = {**config, **config_general} + + # Get the functions configuration + try: + o_functions = o_config['pvc']['functions'] + except Exception as e: + raise MalformedConfigurationError(e) + + config_functions = { + 'enable_hypervisor': o_functions.get('enable_hypervisor', False), + 'enable_networking': o_functions.get('enable_networking', False), + 'enable_storage': o_functions.get('enable_storage', False), + 'enable_api': o_functions.get('enable_api', False), + } + + config = {**config, **config_functions} + + # Get the directory configuration + try: + o_directories = o_config['pvc']['system']['configuration']['directories'] + except Exception as e: + raise MalformedConfigurationError(e) + + config_directories = { + 'dynamic_directory': o_directories.get('dynamic_directory', None), + 'log_directory': o_directories.get('log_directory', None), + 'console_log_directory': o_directories.get('console_log_directory', None), + } + + # Define our dynamic directory schema + config_directories['dnsmasq_dynamic_directory'] = 
config_directories['dynamic_directory'] + '/dnsmasq' + config_directories['pdns_dynamic_directory'] = config_directories['dynamic_directory'] + '/pdns' + config_directories['nft_dynamic_directory'] = config_directories['dynamic_directory'] + '/nft' + + # Define our log directory schema + config_directories['dnsmasq_log_directory'] = config_directories['log_directory'] + '/dnsmasq' + config_directories['pdns_log_directory'] = config_directories['log_directory'] + '/pdns' + config_directories['nft_log_directory'] = config_directories['log_directory'] + '/nft' + + config = {**config, **config_directories} + + # Get the logging configuration + try: + o_logging = o_config['pvc']['system']['configuration']['logging'] + except Exception as e: + raise MalformedConfigurationError(e) + + config_logging = { + 'file_logging': o_logging.get('file_logging', False), + 'stdout_logging': o_logging.get('stdout_logging', False), + 'zookeeper_logging': o_logging.get('zookeeper_logging', False), + 'log_colours': o_logging.get('log_colours', False), + 'log_dates': o_logging.get('log_dates', False), + 'log_keepalives': o_logging.get('log_keepalives', False), + 'log_keepalive_cluster_details': o_logging.get('log_keepalive_cluster_details', False), + 'log_keepalive_storage_details': o_logging.get('log_keepalive_storage_details', False), + 'console_log_lines': o_logging.get('console_log_lines', False), + 'node_log_lines': o_logging.get('node_log_lines', False), + } + + config = {**config, **config_logging} + + # Get the interval configuration + try: + o_intervals = o_config['pvc']['system']['intervals'] + except Exception as e: + raise MalformedConfigurationError(e) + + config_intervals = { + 'vm_shutdown_timeout': int(o_intervals.get('vm_shutdown_timeout', 60)), + 'keepalive_interval': int(o_intervals.get('keepalive_interval', 5)), + 'fence_intervals': int(o_intervals.get('fence_intervals', 6)), + 'suicide_intervals': int(o_intervals.get('suicide_interval', 0)), + } + + config = {**config, 
**config_intervals} + + # Get the fencing configuration + try: + o_fencing = o_config['pvc']['system']['fencing'] + o_fencing_actions = o_fencing['actions'] + o_fencing_ipmi = o_fencing['ipmi'] + except Exception as e: + raise MalformedConfigurationError(e) + + config_fencing = { + 'successful_fence': o_fencing_actions.get('successful_fence', None), + 'failed_fence': o_fencing_actions.get('failed_fence', None), + 'ipmi_hostname': o_fencing_ipmi.get('host', f'{node_hostname}-lom.{node_domain}'), + 'ipmi_username': o_fencing_ipmi.get('user', 'null'), + 'ipmi_password': o_fencing_ipmi.get('pass', 'null'), + } + + config = {**config, **config_fencing} + + # Get the migration configuration + try: + o_migration = o_config['pvc']['system']['migration'] + except Exception as e: + raise MalformedConfigurationError(e) + + config_migration = { + 'migration_target_selector': o_migration.get('target_selector', 'mem'), + } + + config = {**config, **config_migration} + + if config['enable_networking']: + # Get the node networks configuration + try: + o_networks = o_config['pvc']['cluster']['networks'] + o_network_cluster = o_networks['cluster'] + o_network_storage = o_networks['storage'] + o_network_upstream = o_networks['upstream'] + o_sysnetworks = o_config['pvc']['system']['configuration']['networking'] + o_sysnetwork_cluster = o_sysnetworks['cluster'] + o_sysnetwork_storage = o_sysnetworks['storage'] + o_sysnetwork_upstream = o_sysnetworks['upstream'] + except Exception as e: + raise MalformedConfigurationError(e) + + config_networks = { + 'cluster_domain': o_network_cluster.get('domain', None), + 'cluster_network': o_network_cluster.get('network', None), + 'cluster_floating_ip': o_network_cluster.get('floating_ip', None), + 'cluster_dev': o_sysnetwork_cluster.get('device', None), + 'cluster_mtu': o_sysnetwork_cluster.get('mtu', None), + 'cluster_dev_ip': o_sysnetwork_cluster.get('address', None), + 'storage_domain': o_network_storage.get('domain', None), + 'storage_network': 
o_network_storage.get('network', None), + 'storage_floating_ip': o_network_storage.get('floating_ip', None), + 'storage_dev': o_sysnetwork_storage.get('device', None), + 'storage_mtu': o_sysnetwork_storage.get('mtu', None), + 'storage_dev_ip': o_sysnetwork_storage.get('address', None), + 'upstream_domain': o_network_upstream.get('domain', None), + 'upstream_network': o_network_upstream.get('network', None), + 'upstream_floating_ip': o_network_upstream.get('floating_ip', None), + 'upstream_gateway': o_network_upstream.get('gateway', None), + 'upstream_dev': o_sysnetwork_upstream.get('device', None), + 'upstream_mtu': o_sysnetwork_upstream.get('mtu', None), + 'upstream_dev_ip': o_sysnetwork_upstream.get('address', None), + 'bridge_dev': o_sysnetworks.get('bridge_device', None), + 'enable_sriov': o_sysnetworks.get('sriov_enable', False), + 'sriov_device': o_sysnetworks.get('sriov_device', list()) + } + + config = {**config, **config_networks} + + for network_type in ['cluster', 'storage', 'upstream']: + result, msg = validate_floating_ip(config, network_type) + if not result: + raise MalformedConfigurationError(msg) + + address_key = '{}_dev_ip'.format(network_type) + network_key = f'{network_type}_network' + network = ip_network(config[network_key]) + # With autoselection of addresses, construct an IP from the relevant network + if config[address_key] == 'by-id': + # The NodeID starts at 1, but indexes start at 0 + address_id = int(config['node_id']) - 1 + # Grab the nth address from the network + config[address_key] = '{}/{}'.format(list(network.hosts())[address_id], network.prefixlen) + # Validate the provided IP instead + else: + try: + address = ip_address(config[address_key].split('/')[0]) + if address not in list(network.hosts()): + raise + except Exception: + raise MalformedConfigurationError( + f'IP address {config[address_key]} for {address_key} is not valid' + ) + + # Get the PowerDNS aggregator database configuration + try: + o_pdnsdb = 
o_config['pvc']['coordinator']['dns']['database'] + except Exception as e: + raise MalformedConfigurationError(e) + + config_pdnsdb = { + 'pdns_postgresql_host': o_pdnsdb.get('host', None), + 'pdns_postgresql_port': o_pdnsdb.get('port', None), + 'pdns_postgresql_dbname': o_pdnsdb.get('name', None), + 'pdns_postgresql_user': o_pdnsdb.get('user', None), + 'pdns_postgresql_password': o_pdnsdb.get('pass', None), + } + + config = {**config, **config_pdnsdb} + + # Get the Cloud-Init Metadata database configuration + try: + o_metadatadb = o_config['pvc']['coordinator']['metadata']['database'] + except Exception as e: + raise MalformedConfigurationError(e) + + config_metadatadb = { + 'metadata_postgresql_host': o_metadatadb.get('host', None), + 'metadata_postgresql_port': o_metadatadb.get('port', None), + 'metadata_postgresql_dbname': o_metadatadb.get('name', None), + 'metadata_postgresql_user': o_metadatadb.get('user', None), + 'metadata_postgresql_password': o_metadatadb.get('pass', None), + } + + config = {**config, **config_metadatadb} + + if config['enable_storage']: + # Get the storage configuration + try: + o_storage = o_config['pvc']['system']['configuration']['storage'] + except Exception as e: + raise MalformedConfigurationError(e) + + config_storage = { + 'ceph_config_file': o_storage.get('ceph_config_file', None), + 'ceph_admin_keyring': o_storage.get('ceph_admin_keyring', None), + } + + config = {**config, **config_storage} + + # Add our node static data to the config + config['static_data'] = get_static_data() + + return config + + +def validate_directories(config): + if not os.path.exists(config['dynamic_directory']): + os.makedirs(config['dynamic_directory']) + os.makedirs(config['dnsmasq_dynamic_directory']) + os.makedirs(config['pdns_dynamic_directory']) + os.makedirs(config['nft_dynamic_directory']) + + if not os.path.exists(config['log_directory']): + os.makedirs(config['log_directory']) + os.makedirs(config['dnsmasq_log_directory']) + 
os.makedirs(config['pdns_log_directory']) + os.makedirs(config['nft_log_directory']) diff --git a/node-daemon/pvcnoded/fencing.py b/node-daemon/pvcnoded/util/fencing.py similarity index 89% rename from node-daemon/pvcnoded/fencing.py rename to node-daemon/pvcnoded/util/fencing.py index c04050ee..b519956f 100644 --- a/node-daemon/pvcnoded/fencing.py +++ b/node-daemon/pvcnoded/util/fencing.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 -# fencing.py - PVC daemon function library, node fencing functions +# fencing.py - Utility functions for pvcnoded fencing # Part of the Parallel Virtual Cluster (PVC) system # # Copyright (C) 2018-2021 Joshua M. Boniface @@ -22,13 +22,14 @@ import time import daemon_lib.common as common -import pvcnoded.VMInstance as VMInstance + +from pvcnoded.objects.VMInstance import VMInstance # # Fence thread entry function # -def fenceNode(node_name, zkhandler, config, logger): +def fence_node(node_name, zkhandler, config, logger): # We allow exactly 6 saving throws (30 seconds) for the host to come back online or we kill it failcount_limit = 6 failcount = 0 @@ -40,13 +41,13 @@ def fenceNode(node_name, zkhandler, config, logger): # Is it still 'dead' if node_daemon_state == 'dead': failcount += 1 - logger.out('Node "{}" failed {}/{} saving throws'.format(node_name, failcount, failcount_limit), state='w') + logger.out('Node "{}" failed {}/{} saving throws'.format(node_name, failcount, failcount_limit), state='s') # It changed back to something else so it must be alive else: logger.out('Node "{}" passed a saving throw; canceling fence'.format(node_name), state='o') return - logger.out('Fencing node "{}" via IPMI reboot signal'.format(node_name), state='w') + logger.out('Fencing node "{}" via IPMI reboot signal'.format(node_name), state='s') # Get IPMI information ipmi_hostname = zkhandler.read(('node.ipmi.hostname', node_name)) @@ -54,7 +55,7 @@ def fenceNode(node_name, zkhandler, config, logger): ipmi_password = zkhandler.read(('node.ipmi.password', 
node_name)) # Shoot it in the head - fence_status = rebootViaIPMI(ipmi_hostname, ipmi_username, ipmi_password, logger) + fence_status = reboot_via_ipmi(ipmi_hostname, ipmi_username, ipmi_password, logger) # Hold to ensure the fence takes effect and system stabilizes time.sleep(config['keepalive_interval'] * 2) @@ -123,7 +124,7 @@ def migrateFromFencedNode(zkhandler, node_name, config, logger): # # Perform an IPMI fence # -def rebootViaIPMI(ipmi_hostname, ipmi_user, ipmi_password, logger): +def reboot_via_ipmi(ipmi_hostname, ipmi_user, ipmi_password, logger): # Forcibly reboot the node ipmi_command_reset = '/usr/bin/ipmitool -I lanplus -H {} -U {} -P {} chassis power reset'.format( ipmi_hostname, ipmi_user, ipmi_password @@ -131,8 +132,7 @@ def rebootViaIPMI(ipmi_hostname, ipmi_user, ipmi_password, logger): ipmi_reset_retcode, ipmi_reset_stdout, ipmi_reset_stderr = common.run_os_command(ipmi_command_reset) if ipmi_reset_retcode != 0: - logger.out('Failed to reboot dead node', state='e') - print(ipmi_reset_stderr) + logger.out(f'Failed to reboot dead node: {ipmi_reset_stderr}', state='e') time.sleep(1) @@ -178,12 +178,10 @@ def rebootViaIPMI(ipmi_hostname, ipmi_user, ipmi_password, logger): # # Verify that IPMI connectivity to this host exists (used during node init) # -def verifyIPMI(ipmi_hostname, ipmi_user, ipmi_password): - ipmi_command_status = '/usr/bin/ipmitool -I lanplus -H {} -U {} -P {} chassis power status'.format( - ipmi_hostname, ipmi_user, ipmi_password - ) - ipmi_status_retcode, ipmi_status_stdout, ipmi_status_stderr = common.run_os_command(ipmi_command_status, timeout=2) - if ipmi_status_retcode == 0 and ipmi_status_stdout != "Chassis Power is on": +def verify_ipmi(ipmi_hostname, ipmi_user, ipmi_password): + ipmi_command = f'/usr/bin/ipmitool -I lanplus -H {ipmi_hostname} -U {ipmi_user} -P {ipmi_password} chassis power status' + retcode, stdout, stderr = common.run_os_command(ipmi_command, timeout=2) + if retcode == 0 and stdout != "Chassis Power is 
on": return True else: return False diff --git a/node-daemon/pvcnoded/util/keepalive.py b/node-daemon/pvcnoded/util/keepalive.py new file mode 100644 index 00000000..fd2168e0 --- /dev/null +++ b/node-daemon/pvcnoded/util/keepalive.py @@ -0,0 +1,718 @@ +#!/usr/bin/env python3 + +# keepalive.py - Utility functions for pvcnoded Keepalives +# Part of the Parallel Virtual Cluster (PVC) system +# +# Copyright (C) 2018-2021 Joshua M. Boniface +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, version 3. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . +# +############################################################################### + +import pvcnoded.util.fencing + +import daemon_lib.common as common + +from apscheduler.schedulers.background import BackgroundScheduler +from rados import Rados +from xml.etree import ElementTree +from queue import Queue +from threading import Thread +from datetime import datetime + +import json +import re +import libvirt +import psutil +import os +import time + + +# State table for pretty stats +libvirt_vm_states = { + 0: "NOSTATE", + 1: "RUNNING", + 2: "BLOCKED", + 3: "PAUSED", + 4: "SHUTDOWN", + 5: "SHUTOFF", + 6: "CRASHED", + 7: "PMSUSPENDED" +} + + +def start_keepalive_timer(logger, config, zkhandler, this_node): + keepalive_interval = config['keepalive_interval'] + logger.out(f'Starting keepalive timer ({keepalive_interval} second interval)', state='s') + keepalive_timer = BackgroundScheduler() + keepalive_timer.add_job( + node_keepalive, + args=(logger, config, zkhandler, this_node), + 
trigger='interval', + seconds=keepalive_interval) + keepalive_timer.start() + return keepalive_timer + + +def stop_keepalive_timer(logger, keepalive_timer): + try: + keepalive_timer.shutdown() + logger.out('Stopping keepalive timer', state='s') + except Exception: + logger.out('Failed to stop keepalive timer', state='w') + + +# Ceph stats update function +def collect_ceph_stats(logger, config, zkhandler, this_node, queue): + pool_list = zkhandler.children('base.pool') + osd_list = zkhandler.children('base.osd') + + debug = config['debug'] + if debug: + logger.out("Thread starting", state='d', prefix='ceph-thread') + + # Connect to the Ceph cluster + try: + ceph_conn = Rados(conffile=config['ceph_config_file'], conf=dict(keyring=config['ceph_admin_keyring'])) + if debug: + logger.out("Connecting to cluster", state='d', prefix='ceph-thread') + ceph_conn.connect(timeout=1) + except Exception as e: + logger.out('Failed to open connection to Ceph cluster: {}'.format(e), state='e') + return + + if debug: + logger.out("Getting health stats from monitor", state='d', prefix='ceph-thread') + + # Get Ceph cluster health for local status output + command = {"prefix": "health", "format": "json"} + try: + health_status = json.loads(ceph_conn.mon_command(json.dumps(command), b'', timeout=1)[1]) + ceph_health = health_status['status'] + except Exception as e: + logger.out('Failed to obtain Ceph health data: {}'.format(e), state='e') + ceph_health = 'HEALTH_UNKN' + + if ceph_health in ['HEALTH_OK']: + ceph_health_colour = logger.fmt_green + elif ceph_health in ['HEALTH_UNKN']: + ceph_health_colour = logger.fmt_cyan + elif ceph_health in ['HEALTH_WARN']: + ceph_health_colour = logger.fmt_yellow + else: + ceph_health_colour = logger.fmt_red + + # Primary-only functions + if this_node.router_state == 'primary': + if debug: + logger.out("Set ceph health information in zookeeper (primary only)", state='d', prefix='ceph-thread') + + command = {"prefix": "status", "format": "pretty"} + 
ceph_status = ceph_conn.mon_command(json.dumps(command), b'', timeout=1)[1].decode('ascii') + try: + zkhandler.write([ + ('base.storage', str(ceph_status)) + ]) + except Exception as e: + logger.out('Failed to set Ceph status data: {}'.format(e), state='e') + + if debug: + logger.out("Set ceph rados df information in zookeeper (primary only)", state='d', prefix='ceph-thread') + + # Get rados df info + command = {"prefix": "df", "format": "pretty"} + ceph_df = ceph_conn.mon_command(json.dumps(command), b'', timeout=1)[1].decode('ascii') + try: + zkhandler.write([ + ('base.storage.util', str(ceph_df)) + ]) + except Exception as e: + logger.out('Failed to set Ceph utilization data: {}'.format(e), state='e') + + if debug: + logger.out("Set pool information in zookeeper (primary only)", state='d', prefix='ceph-thread') + + # Get pool info + command = {"prefix": "df", "format": "json"} + ceph_df_output = ceph_conn.mon_command(json.dumps(command), b'', timeout=1)[1].decode('ascii') + try: + ceph_pool_df_raw = json.loads(ceph_df_output)['pools'] + except Exception as e: + logger.out('Failed to obtain Pool data (ceph df): {}'.format(e), state='w') + ceph_pool_df_raw = [] + + retcode, stdout, stderr = common.run_os_command('rados df --format json', timeout=1) + try: + rados_pool_df_raw = json.loads(stdout)['pools'] + except Exception as e: + logger.out('Failed to obtain Pool data (rados df): {}'.format(e), state='w') + rados_pool_df_raw = [] + + pool_count = len(ceph_pool_df_raw) + if debug: + logger.out("Getting info for {} pools".format(pool_count), state='d', prefix='ceph-thread') + for pool_idx in range(0, pool_count): + try: + # Combine all the data for this pool + ceph_pool_df = ceph_pool_df_raw[pool_idx] + rados_pool_df = rados_pool_df_raw[pool_idx] + pool = ceph_pool_df + pool.update(rados_pool_df) + + # Ignore any pools that aren't in our pool list + if pool['name'] not in pool_list: + if debug: + logger.out("Pool {} not in pool list {}".format(pool['name'], 
pool_list), state='d', prefix='ceph-thread') + continue + else: + if debug: + logger.out("Parsing data for pool {}".format(pool['name']), state='d', prefix='ceph-thread') + + # Assemble a useful data structure + pool_df = { + 'id': pool['id'], + 'stored_bytes': pool['stats']['stored'], + 'free_bytes': pool['stats']['max_avail'], + 'used_bytes': pool['stats']['bytes_used'], + 'used_percent': pool['stats']['percent_used'], + 'num_objects': pool['stats']['objects'], + 'num_object_clones': pool['num_object_clones'], + 'num_object_copies': pool['num_object_copies'], + 'num_objects_missing_on_primary': pool['num_objects_missing_on_primary'], + 'num_objects_unfound': pool['num_objects_unfound'], + 'num_objects_degraded': pool['num_objects_degraded'], + 'read_ops': pool['read_ops'], + 'read_bytes': pool['read_bytes'], + 'write_ops': pool['write_ops'], + 'write_bytes': pool['write_bytes'] + } + + # Write the pool data to Zookeeper + zkhandler.write([ + (('pool.stats', pool['name']), str(json.dumps(pool_df))) + ]) + except Exception as e: + # One or more of the status commands timed out, just continue + logger.out('Failed to format and send pool data: {}'.format(e), state='w') + pass + + # Only grab OSD stats if there are OSDs to grab (otherwise `ceph osd df` hangs) + osds_this_node = 0 + if len(osd_list) > 0: + # Get data from Ceph OSDs + if debug: + logger.out("Get data from Ceph OSDs", state='d', prefix='ceph-thread') + + # Parse the dump data + osd_dump = dict() + + command = {"prefix": "osd dump", "format": "json"} + osd_dump_output = ceph_conn.mon_command(json.dumps(command), b'', timeout=1)[1].decode('ascii') + try: + osd_dump_raw = json.loads(osd_dump_output)['osds'] + except Exception as e: + logger.out('Failed to obtain OSD data: {}'.format(e), state='w') + osd_dump_raw = [] + + if debug: + logger.out("Loop through OSD dump", state='d', prefix='ceph-thread') + for osd in osd_dump_raw: + osd_dump.update({ + str(osd['osd']): { + 'uuid': osd['uuid'], + 'up': 
osd['up'], + 'in': osd['in'], + 'primary_affinity': osd['primary_affinity'] + } + }) + + # Parse the df data + if debug: + logger.out("Parse the OSD df data", state='d', prefix='ceph-thread') + + osd_df = dict() + + command = {"prefix": "osd df", "format": "json"} + try: + osd_df_raw = json.loads(ceph_conn.mon_command(json.dumps(command), b'', timeout=1)[1])['nodes'] + except Exception as e: + logger.out('Failed to obtain OSD data: {}'.format(e), state='w') + osd_df_raw = [] + + if debug: + logger.out("Loop through OSD df", state='d', prefix='ceph-thread') + for osd in osd_df_raw: + osd_df.update({ + str(osd['id']): { + 'utilization': osd['utilization'], + 'var': osd['var'], + 'pgs': osd['pgs'], + 'kb': osd['kb'], + 'weight': osd['crush_weight'], + 'reweight': osd['reweight'], + } + }) + + # Parse the status data + if debug: + logger.out("Parse the OSD status data", state='d', prefix='ceph-thread') + + osd_status = dict() + + command = {"prefix": "osd status", "format": "pretty"} + try: + osd_status_raw = ceph_conn.mon_command(json.dumps(command), b'', timeout=1)[1].decode('ascii') + except Exception as e: + logger.out('Failed to obtain OSD status data: {}'.format(e), state='w') + osd_status_raw = [] + + if debug: + logger.out("Loop through OSD status data", state='d', prefix='ceph-thread') + + for line in osd_status_raw.split('\n'): + # Strip off colour + line = re.sub(r'\x1b(\[.*?[@-~]|\].*?(\x07|\x1b\\))', '', line) + # Split it for parsing + line = line.split() + if len(line) > 1 and line[1].isdigit(): + # This is an OSD line so parse it + osd_id = line[1] + node = line[3].split('.')[0] + used = line[5] + avail = line[7] + wr_ops = line[9] + wr_data = line[11] + rd_ops = line[13] + rd_data = line[15] + state = line[17] + osd_status.update({ + str(osd_id): { + 'node': node, + 'used': used, + 'avail': avail, + 'wr_ops': wr_ops, + 'wr_data': wr_data, + 'rd_ops': rd_ops, + 'rd_data': rd_data, + 'state': state + } + }) + + # Merge them together into a single 
meaningful dict + if debug: + logger.out("Merge OSD data together", state='d', prefix='ceph-thread') + + osd_stats = dict() + + for osd in osd_list: + if zkhandler.read(('osd.node', osd)) == config['node_hostname']: + osds_this_node += 1 + try: + this_dump = osd_dump[osd] + this_dump.update(osd_df[osd]) + this_dump.update(osd_status[osd]) + osd_stats[osd] = this_dump + except KeyError as e: + # One or more of the status commands timed out, just continue + logger.out('Failed to parse OSD stats into dictionary: {}'.format(e), state='w') + + # Upload OSD data for the cluster (primary-only) + if this_node.router_state == 'primary': + if debug: + logger.out("Trigger updates for each OSD", state='d', prefix='ceph-thread') + + for osd in osd_list: + try: + stats = json.dumps(osd_stats[osd]) + zkhandler.write([ + (('osd.stats', osd), str(stats)) + ]) + except KeyError as e: + # One or more of the status commands timed out, just continue + logger.out('Failed to upload OSD stats from dictionary: {}'.format(e), state='w') + + ceph_conn.shutdown() + + queue.put(ceph_health_colour) + queue.put(ceph_health) + queue.put(osds_this_node) + + if debug: + logger.out("Thread finished", state='d', prefix='ceph-thread') + + +# VM stats update function +def collect_vm_stats(logger, config, zkhandler, this_node, queue): + debug = config['debug'] + if debug: + logger.out("Thread starting", state='d', prefix='vm-thread') + + # Connect to libvirt + libvirt_name = "qemu:///system" + if debug: + logger.out("Connecting to libvirt", state='d', prefix='vm-thread') + lv_conn = libvirt.open(libvirt_name) + if lv_conn is None: + logger.out('Failed to open connection to "{}"'.format(libvirt_name), state='e') + + memalloc = 0 + memprov = 0 + vcpualloc = 0 + # Toggle state management of dead VMs to restart them + if debug: + logger.out("Toggle state management of dead VMs to restart them", state='d', prefix='vm-thread') + # Make a copy of the d_domain; if not, and it changes in flight, this can fail + 
fixed_d_domain = this_node.d_domain.copy() + for domain, instance in fixed_d_domain.items(): + if domain in this_node.domain_list: + # Add the allocated memory to our memalloc value + memalloc += instance.getmemory() + memprov += instance.getmemory() + vcpualloc += instance.getvcpus() + if instance.getstate() == 'start' and instance.getnode() == this_node.name: + if instance.getdom() is not None: + try: + if instance.getdom().state()[0] != libvirt.VIR_DOMAIN_RUNNING: + logger.out("VM {} has failed".format(instance.domname), state='w', prefix='vm-thread') + raise + except Exception: + # Toggle a state "change" + logger.out("Resetting state to {} for VM {}".format(instance.getstate(), instance.domname), state='i', prefix='vm-thread') + zkhandler.write([ + (('domain.state', domain), instance.getstate()) + ]) + elif instance.getnode() == this_node.name: + memprov += instance.getmemory() + + # Get list of running domains from Libvirt + running_domains = lv_conn.listAllDomains(libvirt.VIR_CONNECT_LIST_DOMAINS_ACTIVE) + + # Get statistics from any running VMs + for domain in running_domains: + try: + # Get basic information about the VM + tree = ElementTree.fromstring(domain.XMLDesc()) + domain_uuid = domain.UUIDString() + domain_name = domain.name() + + # Get all the raw information about the VM + if debug: + logger.out("Getting general statistics for VM {}".format(domain_name), state='d', prefix='vm-thread') + domain_state, domain_maxmem, domain_mem, domain_vcpus, domain_cputime = domain.info() + # We can't properly gather stats from a non-running VMs so continue + if domain_state != libvirt.VIR_DOMAIN_RUNNING: + continue + domain_memory_stats = domain.memoryStats() + domain_cpu_stats = domain.getCPUStats(True)[0] + except Exception as e: + if debug: + try: + logger.out("Failed getting VM information for {}: {}".format(domain.name(), e), state='d', prefix='vm-thread') + except Exception: + pass + continue + + # Ensure VM is present in the domain_list + if domain_uuid 
not in this_node.domain_list: + this_node.domain_list.append(domain_uuid) + + if debug: + logger.out("Getting disk statistics for VM {}".format(domain_name), state='d', prefix='vm-thread') + domain_disk_stats = [] + for disk in tree.findall('devices/disk'): + disk_name = disk.find('source').get('name') + if not disk_name: + disk_name = disk.find('source').get('file') + disk_stats = domain.blockStats(disk.find('target').get('dev')) + domain_disk_stats.append({ + "name": disk_name, + "rd_req": disk_stats[0], + "rd_bytes": disk_stats[1], + "wr_req": disk_stats[2], + "wr_bytes": disk_stats[3], + "err": disk_stats[4] + }) + + if debug: + logger.out("Getting network statistics for VM {}".format(domain_name), state='d', prefix='vm-thread') + domain_network_stats = [] + for interface in tree.findall('devices/interface'): + interface_type = interface.get('type') + if interface_type not in ['bridge']: + continue + interface_name = interface.find('target').get('dev') + interface_bridge = interface.find('source').get('bridge') + interface_stats = domain.interfaceStats(interface_name) + domain_network_stats.append({ + "name": interface_name, + "bridge": interface_bridge, + "rd_bytes": interface_stats[0], + "rd_packets": interface_stats[1], + "rd_errors": interface_stats[2], + "rd_drops": interface_stats[3], + "wr_bytes": interface_stats[4], + "wr_packets": interface_stats[5], + "wr_errors": interface_stats[6], + "wr_drops": interface_stats[7] + }) + + # Create the final dictionary + domain_stats = { + "state": libvirt_vm_states[domain_state], + "maxmem": domain_maxmem, + "livemem": domain_mem, + "cpus": domain_vcpus, + "cputime": domain_cputime, + "mem_stats": domain_memory_stats, + "cpu_stats": domain_cpu_stats, + "disk_stats": domain_disk_stats, + "net_stats": domain_network_stats + } + + if debug: + logger.out("Writing statistics for VM {} to Zookeeper".format(domain_name), state='d', prefix='vm-thread') + + try: + zkhandler.write([ + (('domain.stats', domain_uuid), 
str(json.dumps(domain_stats))) + ]) + except Exception as e: + if debug: + logger.out("{}".format(e), state='d', prefix='vm-thread') + + # Close the Libvirt connection + lv_conn.close() + + queue.put(len(running_domains)) + queue.put(memalloc) + queue.put(memprov) + queue.put(vcpualloc) + + if debug: + logger.out("Thread finished", state='d', prefix='vm-thread') + + +# Keepalive update function +def node_keepalive(logger, config, zkhandler, this_node): + debug = config['debug'] + if debug: + logger.out("Keepalive starting", state='d', prefix='main-thread') + + # Set the migration selector in Zookeeper for clients to read + if config['enable_hypervisor']: + if this_node.router_state == 'primary': + try: + if zkhandler.read('base.config.migration_target_selector') != config['migration_target_selector']: + raise + except Exception: + zkhandler.write([ + ('base.config.migration_target_selector', config['migration_target_selector']) + ]) + + # Set the upstream IP in Zookeeper for clients to read + if config['enable_networking']: + if this_node.router_state == 'primary': + try: + if zkhandler.read('base.config.upstream_ip') != config['upstream_floating_ip']: + raise + except Exception: + zkhandler.write([ + ('base.config.upstream_ip', config['upstream_floating_ip']) + ]) + + # Get past state and update if needed + if debug: + logger.out("Get past state and update if needed", state='d', prefix='main-thread') + + past_state = zkhandler.read(('node.state.daemon', this_node.name)) + if past_state != 'run' and past_state != 'shutdown': + this_node.daemon_state = 'run' + zkhandler.write([ + (('node.state.daemon', this_node.name), 'run') + ]) + else: + this_node.daemon_state = 'run' + + # Ensure the primary key is properly set + if debug: + logger.out("Ensure the primary key is properly set", state='d', prefix='main-thread') + if this_node.router_state == 'primary': + if zkhandler.read('base.config.primary_node') != this_node.name: + zkhandler.write([ + 
('base.config.primary_node', this_node.name) + ]) + + # Run VM statistics collection in separate thread for parallelization + if config['enable_hypervisor']: + vm_thread_queue = Queue() + vm_stats_thread = Thread(target=collect_vm_stats, args=(logger, config, zkhandler, this_node, vm_thread_queue), kwargs={}) + vm_stats_thread.start() + + # Run Ceph status collection in separate thread for parallelization + if config['enable_storage']: + ceph_thread_queue = Queue() + ceph_stats_thread = Thread(target=collect_ceph_stats, args=(logger, config, zkhandler, this_node, ceph_thread_queue), kwargs={}) + ceph_stats_thread.start() + + # Get node performance statistics + this_node.memtotal = int(psutil.virtual_memory().total / 1024 / 1024) + this_node.memused = int(psutil.virtual_memory().used / 1024 / 1024) + this_node.memfree = int(psutil.virtual_memory().free / 1024 / 1024) + this_node.cpuload = os.getloadavg()[0] + + # Join against running threads + if config['enable_hypervisor']: + vm_stats_thread.join(timeout=4.0) + if vm_stats_thread.is_alive(): + logger.out('VM stats gathering exceeded 4s timeout, continuing', state='w') + if config['enable_storage']: + ceph_stats_thread.join(timeout=4.0) + if ceph_stats_thread.is_alive(): + logger.out('Ceph stats gathering exceeded 4s timeout, continuing', state='w') + + # Get information from thread queues + if config['enable_hypervisor']: + try: + this_node.domains_count = vm_thread_queue.get() + this_node.memalloc = vm_thread_queue.get() + this_node.memprov = vm_thread_queue.get() + this_node.vcpualloc = vm_thread_queue.get() + except Exception: + pass + else: + this_node.domains_count = 0 + this_node.memalloc = 0 + this_node.memprov = 0 + this_node.vcpualloc = 0 + + if config['enable_storage']: + try: + ceph_health_colour = ceph_thread_queue.get() + ceph_health = ceph_thread_queue.get() + osds_this_node = ceph_thread_queue.get() + except Exception: + ceph_health_colour = logger.fmt_cyan + ceph_health = 'UNKNOWN' + osds_this_node 
= '?' + + # Set our information in zookeeper + keepalive_time = int(time.time()) + if debug: + logger.out("Set our information in zookeeper", state='d', prefix='main-thread') + try: + zkhandler.write([ + (('node.memory.total', this_node.name), str(this_node.memtotal)), + (('node.memory.used', this_node.name), str(this_node.memused)), + (('node.memory.free', this_node.name), str(this_node.memfree)), + (('node.memory.allocated', this_node.name), str(this_node.memalloc)), + (('node.memory.provisioned', this_node.name), str(this_node.memprov)), + (('node.vcpu.allocated', this_node.name), str(this_node.vcpualloc)), + (('node.cpu.load', this_node.name), str(this_node.cpuload)), + (('node.count.provisioned_domains', this_node.name), str(this_node.domains_count)), + (('node.running_domains', this_node.name), ' '.join(this_node.domain_list)), + (('node.keepalive', this_node.name), str(keepalive_time)), + ]) + except Exception: + logger.out('Failed to set keepalive data', state='e') + + # Display node information to the terminal + if config['log_keepalives']: + if this_node.router_state == 'primary': + cst_colour = logger.fmt_green + elif this_node.router_state == 'secondary': + cst_colour = logger.fmt_blue + else: + cst_colour = logger.fmt_cyan + logger.out( + '{}{} keepalive @ {}{} [{}{}{}]'.format( + logger.fmt_purple, + config['node_hostname'], + datetime.now(), + logger.fmt_end, + logger.fmt_bold + cst_colour, + this_node.router_state, + logger.fmt_end + ), + state='t' + ) + if config['log_keepalive_cluster_details']: + logger.out( + '{bold}Maintenance:{nofmt} {maint} ' + '{bold}Active VMs:{nofmt} {domcount} ' + '{bold}Networks:{nofmt} {netcount} ' + '{bold}Load:{nofmt} {load} ' + '{bold}Memory [MiB]: VMs:{nofmt} {allocmem} ' + '{bold}Used:{nofmt} {usedmem} ' + '{bold}Free:{nofmt} {freemem}'.format( + bold=logger.fmt_bold, + nofmt=logger.fmt_end, + maint=this_node.maintenance, + domcount=this_node.domains_count, + netcount=len(zkhandler.children('base.network')), + 
load=this_node.cpuload, + freemem=this_node.memfree, + usedmem=this_node.memused, + allocmem=this_node.memalloc, + ), + state='t' + ) + if config['enable_storage'] and config['log_keepalive_storage_details']: + logger.out( + '{bold}Ceph cluster status:{nofmt} {health_colour}{health}{nofmt} ' + '{bold}Total OSDs:{nofmt} {total_osds} ' + '{bold}Node OSDs:{nofmt} {node_osds} ' + '{bold}Pools:{nofmt} {total_pools} '.format( + bold=logger.fmt_bold, + health_colour=ceph_health_colour, + nofmt=logger.fmt_end, + health=ceph_health, + total_osds=len(zkhandler.children('base.osd')), + node_osds=osds_this_node, + total_pools=len(zkhandler.children('base.pool')) + ), + state='t' + ) + + # Look for dead nodes and fence them + if not this_node.maintenance: + if debug: + logger.out("Look for dead nodes and fence them", state='d', prefix='main-thread') + if config['daemon_mode'] == 'coordinator': + for node_name in zkhandler.children('base.node'): + try: + node_daemon_state = zkhandler.read(('node.state.daemon', node_name)) + node_keepalive = int(zkhandler.read(('node.keepalive', node_name))) + except Exception: + node_daemon_state = 'unknown' + node_keepalive = 0 + + # Handle deadtime and fencng if needed + # (A node is considered dead when its keepalive timer is >6*keepalive_interval seconds + # out-of-date while in 'start' state) + node_deadtime = int(time.time()) - (int(config['keepalive_interval']) * int(config['fence_intervals'])) + if node_keepalive < node_deadtime and node_daemon_state == 'run': + logger.out('Node {} seems dead - starting monitor for fencing'.format(node_name), state='w') + zk_lock = zkhandler.writelock(('node.state.daemon', node_name)) + with zk_lock: + # Ensures that, if we lost the lock race and come out of waiting, + # we won't try to trigger our own fence thread. 
+ if zkhandler.read(('node.state.daemon', node_name)) != 'dead': + fence_thread = Thread(target=pvcnoded.util.fencing.fence_node, args=(node_name, zkhandler, config, logger), kwargs={}) + fence_thread.start() + # Write the updated data after we start the fence thread + zkhandler.write([ + (('node.state.daemon', node_name), 'dead') + ]) + + if debug: + logger.out("Keepalive finished", state='d', prefix='main-thread') diff --git a/node-daemon/pvcnoded/util/libvirt.py b/node-daemon/pvcnoded/util/libvirt.py new file mode 100644 index 00000000..f6572b58 --- /dev/null +++ b/node-daemon/pvcnoded/util/libvirt.py @@ -0,0 +1,36 @@ +#!/usr/bin/env python3 + +# libvirt.py - Utility functions for pvcnoded libvirt +# Part of the Parallel Virtual Cluster (PVC) system +# +# Copyright (C) 2018-2021 Joshua M. Boniface +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, version 3. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . 
+# +############################################################################### + +import libvirt + + +def validate_libvirtd(logger, config): + if config['enable_hypervisor']: + libvirt_check_name = f'qemu+tcp://{config["node_hostname"]}/system' + logger.out(f'Connecting to Libvirt daemon at {libvirt_check_name}', state='i') + try: + lv_conn = libvirt.open(libvirt_check_name) + lv_conn.close() + except Exception as e: + logger.out(f'Failed to connect to Libvirt daemon: {e}', state='e') + return False + + return True diff --git a/node-daemon/pvcnoded/util/networking.py b/node-daemon/pvcnoded/util/networking.py new file mode 100644 index 00000000..4a70371b --- /dev/null +++ b/node-daemon/pvcnoded/util/networking.py @@ -0,0 +1,181 @@ +#!/usr/bin/env python3 + +# networking.py - Utility functions for pvcnoded networking +# Part of the Parallel Virtual Cluster (PVC) system +# +# Copyright (C) 2018-2021 Joshua M. Boniface +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, version 3. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . 
+# +############################################################################### + +import daemon_lib.common as common + +from time import sleep +from os import makedirs + + +def setup_sriov(logger, config): + logger.out('Setting up SR-IOV device support', state='i') + + # Enable unsafe interrupts for the vfio_iommu_type1 kernel module + try: + common.run_os_command('modprobe vfio_iommu_type1 allow_unsafe_interrupts=1') + with open('/sys/module/vfio_iommu_type1/parameters/allow_unsafe_interrupts', 'w') as mfh: + mfh.write('Y') + except Exception: + logger.out('Failed to enable vfio_iommu_type1 kernel module; SR-IOV may fail', state='w') + + # Loop through our SR-IOV NICs and enable the numvfs for each + for device in config['sriov_device']: + logger.out(f'Preparing SR-IOV PF {device["phy"]} with {device["vfcount"]} VFs', state='i') + try: + with open(f'/sys/class/net/{device["phy"]}/device/sriov_numvfs', 'r') as vfh: + current_vf_count = vfh.read().strip() + with open(f'/sys/class/net/{device["phy"]}/device/sriov_numvfs', 'w') as vfh: + vfh.write(str(device['vfcount'])) + except FileNotFoundError: + logger.out(f'Failed to open SR-IOV configuration for PF {device["phy"]}; device may not support SR-IOV', state='w') + except OSError: + logger.out(f'Failed to set SR-IOV VF count for PF {device["phy"]} to {device["vfcount"]}; already set to {current_vf_count}', state='w') + + if device.get('mtu', None) is not None: + logger.out(f'Setting SR-IOV PF {device["phy"]} to MTU {device["mtu"]}', state='i') + common.run_os_command(f'ip link set {device["phy"]} mtu {device["mtu"]} up') + + +def setup_interfaces(logger, config): + # Set up the Cluster interface + cluster_dev = config['cluster_dev'] + cluster_mtu = config['cluster_mtu'] + cluster_dev_ip = config['cluster_dev_ip'] + + logger.out(f'Setting up Cluster network interface {cluster_dev} with MTU {cluster_mtu}', state='i') + + common.run_os_command(f'ip link set {cluster_dev} mtu {cluster_mtu} up') + + 
logger.out(f'Setting up Cluster network bridge on interface {cluster_dev} with IP {cluster_dev_ip}', state='i') + + common.run_os_command(f'brctl addbr brcluster') + common.run_os_command(f'brctl addif brcluster {cluster_dev}') + common.run_os_command(f'ip link set brcluster mtu {cluster_mtu} up') + common.run_os_command(f'ip address add {cluster_dev_ip} dev brcluster') + + # Set up the Storage interface + storage_dev = config['storage_dev'] + storage_mtu = config['storage_mtu'] + storage_dev_ip = config['storage_dev_ip'] + + logger.out(f'Setting up Storage network interface {storage_dev} with MTU {storage_mtu}', state='i') + + common.run_os_command(f'ip link set {storage_dev} mtu {storage_mtu} up') + + if storage_dev == cluster_dev: + if storage_dev_ip != cluster_dev_ip: + logger.out(f'Setting up Storage network on Cluster network bridge with IP {storage_dev_ip}', state='i') + + common.run_os_command(f'ip address add {storage_dev_ip} dev brcluster') + else: + logger.out(f'Setting up Storage network bridge on interface {storage_dev} with IP {storage_dev_ip}', state='i') + + common.run_os_command(f'brctl addbr brstorage') + common.run_os_command(f'brctl addif brstorage {storage_dev}') + common.run_os_command(f'ip link set brstorage mtu {storage_mtu} up') + common.run_os_command(f'ip address add {storage_dev_ip} dev brstorage') + + # Set up the Upstream interface + upstream_dev = config['upstream_dev'] + upstream_mtu = config['upstream_mtu'] + upstream_dev_ip = config['upstream_dev_ip'] + + logger.out(f'Setting up Upstream network interface {upstream_dev} with MTU {upstream_mtu}', state='i') + + if upstream_dev == cluster_dev: + if upstream_dev_ip != cluster_dev_ip: + logger.out(f'Setting up Upstream network on Cluster network bridge with IP {upstream_dev_ip}', state='i') + + common.run_os_command(f'ip address add {upstream_dev_ip} dev brcluster') + else: + logger.out(f'Setting up Upstream network bridge on interface {upstream_dev} with IP {upstream_dev_ip}', 
state='i') + + common.run_os_command(f'brctl addbr brupstream') + common.run_os_command(f'brctl addif brupstream {upstream_dev}') + common.run_os_command(f'ip link set brupstream mtu {upstream_mtu} up') + common.run_os_command(f'ip address add {upstream_dev_ip} dev brupstream') + + upstream_gateway = config['upstream_gateway'] + if upstream_gateway is not None: + logger.out(f'Setting up Upstream networok default gateway IP {upstream_gateway}', state='i') + if upstream_dev == cluster_dev: + common.run_os_command(f'ip route add default via {upstream_gateway} dev brcluster') + else: + common.run_os_command(f'ip route add default via {upstream_gateway} dev brupstream') + + # Set up sysctl tweaks to optimize networking + # Enable routing functions + common.run_os_command('sysctl net.ipv4.ip_forward=1') + common.run_os_command('sysctl net.ipv6.ip_forward=1') + # Enable send redirects + common.run_os_command('sysctl net.ipv4.conf.all.send_redirects=1') + common.run_os_command('sysctl net.ipv4.conf.default.send_redirects=1') + common.run_os_command('sysctl net.ipv6.conf.all.send_redirects=1') + common.run_os_command('sysctl net.ipv6.conf.default.send_redirects=1') + # Accept source routes + common.run_os_command('sysctl net.ipv4.conf.all.accept_source_route=1') + common.run_os_command('sysctl net.ipv4.conf.default.accept_source_route=1') + common.run_os_command('sysctl net.ipv6.conf.all.accept_source_route=1') + common.run_os_command('sysctl net.ipv6.conf.default.accept_source_route=1') + # Disable RP filtering on Cluster and Upstream interfaces (to allow traffic pivoting) + common.run_os_command(f'sysctl net.ipv4.conf.{cluster_dev}.rp_filter=0') + common.run_os_command(f'sysctl net.ipv4.conf.brcluster.rp_filter=0') + common.run_os_command(f'sysctl net.ipv4.conf.{upstream_dev}.rp_filter=0') + common.run_os_command(f'sysctl net.ipv4.conf.brupstream.rp_filter=0') + common.run_os_command(f'sysctl net.ipv6.conf.{cluster_dev}.rp_filter=0') + common.run_os_command(f'sysctl 
net.ipv6.conf.brcluster.rp_filter=0') + common.run_os_command(f'sysctl net.ipv6.conf.{upstream_dev}.rp_filter=0') + common.run_os_command(f'sysctl net.ipv6.conf.brupstream.rp_filter=0') + + # Stop DNSMasq if it is running + common.run_os_command('systemctl stop dnsmasq.service') + + logger.out('Waiting 3 seconds for networking to come up', state='s') + sleep(3) + + +def create_nft_configuration(logger, config): + if config['enable_networking']: + logger.out('Creating NFT firewall configuration', state='i') + + dynamic_directory = config['nft_dynamic_directory'] + + # Create directories + makedirs(f'{dynamic_directory}/networks', exist_ok=True) + makedirs(f'{dynamic_directory}/static', exist_ok=True) + + # Set up the base rules + nftables_base_rules = f"""# Base rules + flush ruleset + # Add the filter table and chains + add table inet filter + add chain inet filter forward {{ type filter hook forward priority 0; }} + add chain inet filter input {{ type filter hook input priority 0; }} + # Include static rules and network rules + include "{dynamic_directory}/static/*" + include "{dynamic_directory}/networks/*" + """ + + # Write the base firewall config + nftables_base_filename = f'{dynamic_directory}/base.nft' + with open(nftables_base_filename, 'w') as nftfh: + nftfh.write(nftables_base_rules) + common.reload_firewall_rules(nftables_base_filename, logger) diff --git a/node-daemon/pvcnoded/util/services.py b/node-daemon/pvcnoded/util/services.py new file mode 100644 index 00000000..c7574a27 --- /dev/null +++ b/node-daemon/pvcnoded/util/services.py @@ -0,0 +1,77 @@ +#!/usr/bin/env python3 + +# services.py - Utility functions for pvcnoded external services +# Part of the Parallel Virtual Cluster (PVC) system +# +# Copyright (C) 2018-2021 Joshua M. Boniface +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, version 3. 
+# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . +# +############################################################################### + +import daemon_lib.common as common +from time import sleep + + +def start_zookeeper(logger, config): + if config['daemon_mode'] == 'coordinator': + logger.out('Starting Zookeeper daemon', state='i') + # TODO: Move our handling out of Systemd and integrate it directly as a subprocess? + common.run_os_command('systemctl start zookeeper.service') + + +def start_libvirtd(logger, config): + if config['enable_hypervisor']: + logger.out('Starting Libvirt daemon', state='i') + # TODO: Move our handling out of Systemd and integrate it directly as a subprocess? + common.run_os_command('systemctl start libvirtd.service') + + +def start_patroni(logger, config): + if config['enable_networking'] and config['daemon_mode'] == 'coordinator': + logger.out('Starting Patroni daemon', state='i') + # TODO: Move our handling out of Systemd and integrate it directly as a subprocess? + common.run_os_command('systemctl start patroni.service') + + +def start_frrouting(logger, config): + if config['enable_networking'] and config['daemon_mode'] == 'coordinator': + logger.out('Starting FRRouting daemon', state='i') + # TODO: Move our handling out of Systemd and integrate it directly as a subprocess? + common.run_os_command('systemctl start frr.service') + + +def start_ceph_mon(logger, config): + if config['enable_storage'] and config['daemon_mode'] == 'coordinator': + logger.out('Starting Ceph Monitor daemon', state='i') + # TODO: Move our handling out of Systemd and integrate it directly as a subprocess? 
+ common.run_os_command(f'systemctl start ceph-mon@{config["node_hostname"]}.service') + + +def start_ceph_mgr(logger, config): + if config['enable_storage'] and config['daemon_mode'] == 'coordinator': + logger.out('Starting Ceph Manager daemon', state='i') + # TODO: Move our handling out of Systemd and integrate it directly as a subprocess? + common.run_os_command(f'systemctl start ceph-mgr@{config["node_hostname"]}.service') + + +def start_system_services(logger, config): + start_zookeeper(logger, config) + start_libvirtd(logger, config) + start_patroni(logger, config) + start_frrouting(logger, config) + start_ceph_mon(logger, config) + start_ceph_mgr(logger, config) + + logger.out('Waiting 3 seconds for daemons to start', state='s') + sleep(3) diff --git a/node-daemon/pvcnoded/util/zookeeper.py b/node-daemon/pvcnoded/util/zookeeper.py new file mode 100644 index 00000000..933b70da --- /dev/null +++ b/node-daemon/pvcnoded/util/zookeeper.py @@ -0,0 +1,132 @@ +#!/usr/bin/env python3 + +# - +# zookeeper.py - Utility functions for pvcnoded Zookeeper connections +# Part of the Parallel Virtual Cluster (PVC) system +# +# Copyright (C) 2018-2021 Joshua M. Boniface +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, version 3. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . 
+# +############################################################################## + +from daemon_lib.zkhandler import ZKHandler + +import os +import time + + +def connect(logger, config): + # Create an instance of the handler + zkhandler = ZKHandler(config, logger) + + try: + logger.out('Connecting to Zookeeper on coordinator nodes {}'.format(config['coordinators']), state='i') + # Start connection + zkhandler.connect(persistent=True) + except Exception as e: + logger.out('ERROR: Failed to connect to Zookeeper cluster: {}'.format(e), state='e') + os._exit(1) + + logger.out('Validating Zookeeper schema', state='i') + + try: + node_schema_version = int(zkhandler.read(('node.data.active_schema', config['node_hostname']))) + except Exception: + node_schema_version = int(zkhandler.read('base.schema.version')) + zkhandler.write([ + (('node.data.active_schema', config['node_hostname']), node_schema_version) + ]) + + # Load in the current node schema version + zkhandler.schema.load(node_schema_version) + + # Record the latest installed schema version + latest_schema_version = zkhandler.schema.find_latest() + logger.out('Latest installed schema is {}'.format(latest_schema_version), state='i') + zkhandler.write([ + (('node.data.latest_schema', config['node_hostname']), latest_schema_version) + ]) + + # If we are the last node to get a schema update, fire the master update + if latest_schema_version > node_schema_version: + node_latest_schema_version = list() + for node in zkhandler.children('base.node'): + node_latest_schema_version.append(int(zkhandler.read(('node.data.latest_schema', node)))) + + # This is true if all elements of the latest schema version are identical to the latest version, + # i.e. they have all had the latest schema installed and ready to load. 
+ if node_latest_schema_version.count(latest_schema_version) == len(node_latest_schema_version): + zkhandler.write([ + ('base.schema.version', latest_schema_version) + ]) + + return zkhandler, node_schema_version + + +def validate_schema(logger, zkhandler): + # Validate our schema against the active version + if not zkhandler.schema.validate(zkhandler, logger): + logger.out('Found schema violations, applying', state='i') + zkhandler.schema.apply(zkhandler) + else: + logger.out('Schema successfully validated', state='o') + + +def setup_node(logger, config, zkhandler): + # Check if our node exists in Zookeeper, and create it if not + if config['daemon_mode'] == 'coordinator': + init_routerstate = 'secondary' + else: + init_routerstate = 'client' + + if zkhandler.exists(('node', config['node_hostname'])): + logger.out(f'Node is {logger.fmt_green}present{logger.fmt_end} in Zookeeper', state='i') + # Update static data just in case it's changed + zkhandler.write([ + (('node', config['node_hostname']), config['daemon_mode']), + (('node.mode', config['node_hostname']), config['daemon_mode']), + (('node.state.daemon', config['node_hostname']), 'init'), + (('node.state.router', config['node_hostname']), init_routerstate), + (('node.data.static', config['node_hostname']), ' '.join(config['static_data'])), + (('node.data.pvc_version', config['node_hostname']), config['pvcnoded_version']), + (('node.ipmi.hostname', config['node_hostname']), config['ipmi_hostname']), + (('node.ipmi.username', config['node_hostname']), config['ipmi_username']), + (('node.ipmi.password', config['node_hostname']), config['ipmi_password']), + ]) + else: + logger.out(f'Node is {logger.fmt_red}absent{logger.fmt_end} in Zookeeper; adding new node', state='i') + keepalive_time = int(time.time()) + zkhandler.write([ + (('node', config['node_hostname']), config['daemon_mode']), + (('node.keepalive', config['node_hostname']), str(keepalive_time)), + (('node.mode', config['node_hostname']), 
config['daemon_mode']), + (('node.state.daemon', config['node_hostname']), 'init'), + (('node.state.domain', config['node_hostname']), 'flushed'), + (('node.state.router', config['node_hostname']), init_routerstate), + (('node.data.static', config['node_hostname']), ' '.join(config['static_data'])), + (('node.data.pvc_version', config['node_hostname']), config['pvcnoded_version']), + (('node.ipmi.hostname', config['node_hostname']), config['ipmi_hostname']), + (('node.ipmi.username', config['node_hostname']), config['ipmi_username']), + (('node.ipmi.password', config['node_hostname']), config['ipmi_password']), + (('node.memory.total', config['node_hostname']), '0'), + (('node.memory.used', config['node_hostname']), '0'), + (('node.memory.free', config['node_hostname']), '0'), + (('node.memory.allocated', config['node_hostname']), '0'), + (('node.memory.provisioned', config['node_hostname']), '0'), + (('node.vcpu.allocated', config['node_hostname']), '0'), + (('node.cpu.load', config['node_hostname']), '0.0'), + (('node.running_domains', config['node_hostname']), '0'), + (('node.count.provisioned_domains', config['node_hostname']), '0'), + (('node.count.networks', config['node_hostname']), '0'), + ]) diff --git a/test-cluster.sh b/test-cluster.sh index d9c9988c..2f847f03 100755 --- a/test-cluster.sh +++ b/test-cluster.sh @@ -26,44 +26,44 @@ rm ${backup_tmp} || true # Provisioner tests _pvc provisioner profile list test -_pvc provisioner create --wait testX test +_pvc provisioner create --wait testx test sleep 30 # VM tests vm_tmp=$(mktemp) -_pvc vm dump testX --file ${vm_tmp} -_pvc vm shutdown --yes --wait testX -_pvc vm start testX +_pvc vm dump testx --file ${vm_tmp} +_pvc vm shutdown --yes --wait testx +_pvc vm start testx sleep 30 -_pvc vm stop --yes testX -_pvc vm disable testX -_pvc vm undefine --yes testX +_pvc vm stop --yes testx +_pvc vm disable testx +_pvc vm undefine --yes testx _pvc vm define --target hv3 --tag pvc-test ${vm_tmp} -_pvc vm start testX 
+_pvc vm start testx sleep 30 -_pvc vm restart --yes --wait testX +_pvc vm restart --yes --wait testx sleep 30 -_pvc vm migrate --wait testX +_pvc vm migrate --wait testx sleep 5 -_pvc vm unmigrate --wait testX +_pvc vm unmigrate --wait testx sleep 5 -_pvc vm move --wait --target hv1 testX +_pvc vm move --wait --target hv1 testx sleep 5 -_pvc vm meta testX --limit hv1 --selector vms --method live --profile test --no-autostart -_pvc vm tag add testX mytag -_pvc vm tag get testX +_pvc vm meta testx --limit hv1 --selector vms --method live --profile test --no-autostart +_pvc vm tag add testx mytag +_pvc vm tag get testx _pvc vm list --tag mytag -_pvc vm tag remove testX mytag -_pvc vm network get testX -_pvc vm vcpu set testX 4 -_pvc vm vcpu get testX -_pvc vm memory set testX 4096 -_pvc vm memory get testX -_pvc vm vcpu set testX 2 -_pvc vm memory set testX 2048 --restart --yes +_pvc vm tag remove testx mytag +_pvc vm network get testx +_pvc vm vcpu set testx 4 +_pvc vm vcpu get testx +_pvc vm memory set testx 4096 +_pvc vm memory get testx +_pvc vm vcpu set testx 2 +_pvc vm memory set testx 2048 --restart --yes sleep 5 -_pvc vm list testX -_pvc vm info --long testX +_pvc vm list testx +_pvc vm info --long testx rm ${vm_tmp} || true # Node tests @@ -81,9 +81,9 @@ _pvc node info hv1 # Network tests _pvc network add 10001 --description testing --type managed --domain testing.local --ipnet 10.100.100.0/24 --gateway 10.100.100.1 --dhcp --dhcp-start 10.100.100.100 --dhcp-end 10.100.100.199 sleep 5 -_pvc vm network add --restart --yes testX 10001 +_pvc vm network add --restart --yes testx 10001 sleep 30 -_pvc vm network remove --restart --yes testX 10001 +_pvc vm network remove --restart --yes testx 10001 sleep 5 _pvc network acl add 10001 --in --description test-acl --order 0 --rule "'ip daddr 10.0.0.0/8 counter'" @@ -98,10 +98,10 @@ _pvc network list _pvc network info --long 10001 # Network-VM interaction tests -_pvc vm network add testX 10001 --model virtio --restart 
--yes +_pvc vm network add testx 10001 --model virtio --restart --yes sleep 30 -_pvc vm network get testX -_pvc vm network remove testX 10001 --restart --yes +_pvc vm network get testx +_pvc vm network remove testx 10001 --restart --yes sleep 5 _pvc network remove --yes 10001 @@ -117,9 +117,9 @@ _pvc storage osd list _pvc storage pool add testing 64 --replcfg "copies=3,mincopies=2" sleep 5 _pvc storage pool list -_pvc storage volume add testing testX 1G -_pvc storage volume resize testing testX 2G -_pvc storage volume rename testing testX testerX +_pvc storage volume add testing testx 1G +_pvc storage volume resize testing testx 2G +_pvc storage volume rename testing testx testerX _pvc storage volume clone testing testerX testerY _pvc storage volume list --pool testing _pvc storage volume snapshot add testing testerX asnapshotX @@ -128,10 +128,10 @@ _pvc storage volume snapshot list _pvc storage volume snapshot remove --yes testing testerX asnapshotY # Storage-VM interaction tests -_pvc vm volume add testX --type rbd --disk-id sdh --bus scsi testing/testerY --restart --yes +_pvc vm volume add testx --type rbd --disk-id sdh --bus scsi testing/testerY --restart --yes sleep 30 -_pvc vm volume get testX -_pvc vm volume remove testX testing/testerY --restart --yes +_pvc vm volume get testx +_pvc vm volume remove testx testing/testerY --restart --yes sleep 5 _pvc storage volume remove --yes testing testerY @@ -139,8 +139,8 @@ _pvc storage volume remove --yes testing testerX _pvc storage pool remove --yes testing # Remove the VM -_pvc vm stop --yes testX -_pvc vm remove --yes testX +_pvc vm stop --yes testx +_pvc vm remove --yes testx time_end=$(date +%s)