From d90fb072404b8d534b4698eabc582c6b5b7263a8 Mon Sep 17 00:00:00 2001 From: Joshua Boniface Date: Mon, 11 Mar 2019 01:44:26 -0400 Subject: [PATCH] Move to YAML config and allow split functions 1. Move to a YAML-based configuration format instead of the original INI-based configuration to facilitate better organization and readability. 2. Modify the daemon to be able to operate in several modes based on configuration flags. Either networking or storage functions can be disabled using the configuration, allowing the PVC system to be used only for hypervisor management if required. --- debian/pvc-client-cli.install | 1 + debian/pvc-daemon.install | 2 +- debian/rules | 2 + debian/source/format | 2 +- node-daemon/pvcd.conf.sample | 98 ---- node-daemon/pvcd.sample.yaml | 14 +- node-daemon/pvcd.service | 2 +- node-daemon/pvcd/Daemon.py | 927 +++++++++++++++++-------------- node-daemon/pvcd/NodeInstance.py | 52 +- node-daemon/pvcd/fixrbdlocks | 13 + node-daemon/pvcd/log.py | 2 + 11 files changed, 570 insertions(+), 545 deletions(-) delete mode 100644 node-daemon/pvcd.conf.sample create mode 100755 node-daemon/pvcd/fixrbdlocks diff --git a/debian/pvc-client-cli.install b/debian/pvc-client-cli.install index b2ee9a60..91df889f 100644 --- a/debian/pvc-client-cli.install +++ b/debian/pvc-client-cli.install @@ -1 +1,2 @@ client-cli/pvc.py usr/share/pvc +client-cli/pvc_init.py usr/share/pvc diff --git a/debian/pvc-daemon.install b/debian/pvc-daemon.install index 067abc15..e7183947 100644 --- a/debian/pvc-daemon.install +++ b/debian/pvc-daemon.install @@ -1,4 +1,4 @@ node-daemon/pvcd.py usr/share/pvc node-daemon/pvcd.service lib/systemd/system -node-daemon/pvcd.conf.sample etc/pvc +node-daemon/pvcd.sample.yaml etc/pvc node-daemon/pvcd usr/share/pvc diff --git a/debian/rules b/debian/rules index da4dcf81..c155d70f 100755 --- a/debian/rules +++ b/debian/rules @@ -6,6 +6,8 @@ %: dh $@ +override_dh_auto_clean: + find . -name "__pycache__" -exec rm -r {} \; || true # If you need to rebuild the Sphinx documentation # Add spinxdoc to the dh --with line diff --git a/debian/source/format b/debian/source/format index 163aaf8d..d3827e75 100644 --- a/debian/source/format +++ b/debian/source/format @@ -1 +1 @@ -3.0 (quilt) +1.0 diff --git a/node-daemon/pvcd.conf.sample b/node-daemon/pvcd.conf.sample deleted file mode 100644 index d6646ddb..00000000 --- a/node-daemon/pvcd.conf.sample +++ /dev/null @@ -1,98 +0,0 @@ -# pvcd cluster configuration file example -# -# This configuration file specifies details for this node in PVC. Multiple node -# blocks can be added but only the one matching the current system nodename will -# be used by the local daemon. Default values are not supported; the values in -# this sample configuration are considered defaults and, with adjustment of the -# nodename section and coordinators list, can be used as-is on a Debian system. 
-# -# The following values are required for each node or in a default section: -# coordinators: a CSV list of the short hostnames of the coordinator nodes; these nodes become -# members of the Zookeeper cluster, can act as routers, and perform additional -# special functions in a cluster; ideally there are 3 coordinators, though 5 -# coordinators are supported -# cluster_domain: the node cluster domain, set during bootstrap -# storage_domain: the node storage domain, set during bootstrap -# dynamic_directory: the ramdisk directory for PVC to store its dynamic configurations, -# usually under /run or /var/run -# log_directory: the logging directory, usually under /var/log -# file_logging = whether to log daemon to a file (pvc.log under log_directory) in addition to -# normal stdout printing -# keepalive_interval: the interval between keepalives and for dead node timeout (defaults to 5) -# fence_intervals: the number of keepalive_intervals without Zookeeper contact before this node -# will consider another node dead and fence it (defaults to 6, i.e. 30s) -# suicide_intervals: the number of keepalive_intervals without Zookeeper contact before this -# node will consider itself failed and terminate all running VMs (defaults -# to 0, i.e. disabled); should be less than "fence_intervals" -# successful_fence: the action to take on a successful fencing operation; can be "none" or -# "migrate" (defaults to "migrate") -# failed_fence: the action to take on a failed fencing operation; can be "none" or "migrate" -# (defaults to "none"); "migrate" requires "suicide_intervals" to be set) -# NOTE: POTENTIALLY DANGEROUS - see README for details -# migration_target_selector: the method to use to select target nodes during a virtual machine -# flush action; can be "mem", "load", "vcpus", or "vms" (defaults -# to "mem"); the best choice based on this field is selected for -# each VM to be migrated -# pdns_mysql_host: the host address (usually "localhost") of the PowerDNS zone aggregator -# backend database -# pdns_mysql_port: the port (usually "3306") of the PowerDNS zone aggregator backend database -# pdns_mysql_dbname: the database name (usually "pvcdns") of the PowerDNS zone aggregator -# backend database -# pdns_mysql_user: the client username (usually "pvcdns") of the PowerDNS zone aggregator -# backend database -# pdns_mysql_password: the client user password (randomly generated at cluster bootstrap) -# of the PowerDNS zone aggregator backend database -# vni_floating_ip: the IP address (in CIDR format) for the floating IP on the VNI network, -# used to provide a consistent view of the dynamic primary node to other -# machines in the VNI network, e.g. for slaving DNS or sending in routes. -# upstream_floating_ip: the IP address (in CIDR format) for the floating IP on the upstream -# network, used to provide a consistent view of the dynamic primary -# node to machines in the upstream network, e.g. for slaving DNS or -# sending in routes. -# The following values are required for each node specifically (usually node-unique): -# vni_dev: the lower-level network device to bind VNI traffic to -# vni_dev_ip: the IP address (in CIDR format) of the lower-level network device, used by frr -# to communicate between nodes and pass routes between them. -# storage_dev: the lower-level network device to bind storage traffic to -# storage_dev_ip: the IP address (in CIDR format) of the lower-level network device, used by -# Ceph for storage traffic (both monitor and OSD). 
-# upstream_dev: the lower-level network device to bind coordinator upstream traffic to -# upstream_dev_ip: the IP address (in CIDR format) of the upstream network device, used by -# the system for upstream traffic flow. -# ipmi_hostname: the IPMI hostname for fencing (defaults to -lom.) -# ipmi_username: username to connect to IPMI -# ipmi_password: password to connect to IPMI -# -# Copy this example to /etc/pvc/pvcd.conf and edit to your needs - -[default] -coordinators = pvc-hv1,pvc-hv2,pvc-hv3 -cluster_domain = i.bonilan.net -storage_domain = sx.bonilan.net -dynamic_directory = /run/pvc -log_directory = /var/log/pvc -file_logging = True -keepalive_interval = 5 -fence_intervals = 6 -suicide_intervals = 0 -successful_fence = migrate -failed_fence = none -migration_target_selector = mem -pdns_mysql_host = localhost -pdns_mysql_port = 3306 -pdns_mysql_dbname = pvcdns -pdns_mysql_user = pvcdns -pdns_mysql_password = pvcdns -vni_floating_ip = 10.255.0.254/24 -upstream_floating_ip = 10.101.0.30/24 - -[pvc-hv1] -vni_dev = ens4 -vni_dev_ip = 10.255.0.1/24 -storage_dev = ens4 -storage_dev_ip = 10.254.0.1/24 -upstream_dev = ens2 -upstream_dev_ip = 10.101.0.31/24 -ipmi_username = admin -ipmi_password = Passw0rd -ipmi_hostname = pvc-hv1-lom diff --git a/node-daemon/pvcd.sample.yaml b/node-daemon/pvcd.sample.yaml index 64654da3..9a4bf83b 100644 --- a/node-daemon/pvcd.sample.yaml +++ b/node-daemon/pvcd.sample.yaml @@ -11,6 +11,15 @@ pvc: # node: The (short) hostname of the node, set during provisioning node: pvc-hv1 + # functions: The daemon functions to enable + functions: + # enable_hypervisor: Enable or disable hypervisor functionality + # This should never be False except in very advanced usecases + enable_hypervisor: True + # enable_networking: Enable or disable virtual networking and routing functionality + enable_networking: True + # enable_storage: Enable or disable Ceph storage management functionality + enable_storage: True # cluster: Cluster-level configuration cluster: # coordinators: The list of cluster coordinator hostnames @@ -19,6 +28,7 @@ pvc: - pvc-hv2 - pvc-hv3 # networks: Cluster-level network configuration + # OPTIONAL if enable_networking: False networks: # upstream: Upstream routed network for in- and out-bound upstream networking upstream: @@ -43,8 +53,9 @@ pvc: # network: Cluster storage network block network: "10.254.0.0/24" # floating_ip: Cluster storage floating IP address for the primary coordinator - floating_ip: "10.255.0.254/24" + floating_ip: "10.254.0.254/24" # coordinator: Coordinator-specific configuration + # OPTIONAL if enable_networking: False coordinator: # dns: DNS aggregator subsystem dns: @@ -105,6 +116,7 @@ pvc: # stdout_logging: Enable or disable logging to stdout (i.e. 
journald) stdout_logging: True # networking: PVC networking configuration + # OPTIONAL if enable_networking: False networking: # devices: Interface devices configuration devices: diff --git a/node-daemon/pvcd.service b/node-daemon/pvcd.service index f0769caf..8c2fbc27 100644 --- a/node-daemon/pvcd.service +++ b/node-daemon/pvcd.service @@ -7,7 +7,7 @@ After = network-online.target libvirtd.service zookeeper.service mariadb.service Type = simple WorkingDirectory = /usr/share/pvc Environment = PYTHONUNBUFFERED=true -Environment = PVCD_CONFIG_FILE=/etc/pvc/pvcd.conf +Environment = PVCD_CONFIG_FILE=/etc/pvc/pvcd.yaml ExecStart = /usr/share/pvc/pvcd.py Restart = never diff --git a/node-daemon/pvcd/Daemon.py b/node-daemon/pvcd/Daemon.py index 4174b331..461582bf 100644 --- a/node-daemon/pvcd/Daemon.py +++ b/node-daemon/pvcd/Daemon.py @@ -39,7 +39,9 @@ import time import re import configparser import threading +import yaml import json +import ipaddress import apscheduler.schedulers.background import pvcd.log as log @@ -105,7 +107,10 @@ myfqdn = socket.gethostname() #myfqdn = 'pvc-hv1.domain.net' myhostname = myfqdn.split('.', 1)[0] mydomainname = ''.join(myfqdn.split('.', 1)[1:]) -mynodeid = re.findall(r'\d+', myhostname)[-1] +try: + mynodeid = re.findall(r'\d+', myhostname)[-1] +except IndexError: + mynodeid = 1 # Gather useful data about our host # Static data format: 'cpu_count', 'arch', 'os', 'kernel' @@ -115,64 +120,116 @@ staticdata.append(subprocess.run(['uname', '-r'], stdout=subprocess.PIPE).stdout staticdata.append(subprocess.run(['uname', '-o'], stdout=subprocess.PIPE).stdout.decode('ascii').strip()) staticdata.append(subprocess.run(['uname', '-m'], stdout=subprocess.PIPE).stdout.decode('ascii').strip()) -# Config values dictionary -config_values = [ - 'coordinators', - 'cluster_domain', - 'storage_domain', - 'dynamic_directory', - 'log_directory', - 'file_logging', - 'keepalive_interval', - 'fence_intervals', - 'suicide_intervals', - 'successful_fence', - 'failed_fence', - 'migration_target_selector', - 'pdns_mysql_host',# = 'localhost' - 'pdns_mysql_port',# = 3306 - 'pdns_mysql_dbname',# = 'pvcdns' - 'pdns_mysql_user',# = 'pvcdns' - 'pdns_mysql_password',# = 'pvcdns' - 'vni_dev', - 'vni_dev_ip', - 'vni_floating_ip', - 'storage_dev', - 'storage_dev_ip', - 'upstream_dev', - 'upstream_dev_ip', - 'upstream_floating_ip', - 'ipmi_hostname', - 'ipmi_username', - 'ipmi_password' -] - # Read and parse the config file def readConfig(pvcd_config_file, myhostname): print('Loading configuration from file "{}"'.format(pvcd_config_file)) - o_config = configparser.ConfigParser() - o_config.read(pvcd_config_file) - config = {} - - try: - entries = o_config[myhostname] - except: + with open(pvcd_config_file, 'r') as cfgfile: try: - entries = o_config['default'] + o_config = yaml.load(cfgfile) except Exception as e: - print('ERROR: Config file is not valid!') + print('ERROR: Failed to parse configuration file: {}'.format(e)) exit(1) - for entry in config_values: + # Handle the basic config (hypervisor-only) + try: + config_general = { + 'coordinators': o_config['pvc']['cluster']['coordinators'], + 'enable_hypervisor': o_config['pvc']['functions']['enable_hypervisor'], + 'enable_networking': o_config['pvc']['functions']['enable_networking'], + 'enable_storage': o_config['pvc']['functions']['enable_storage'], + 'dynamic_directory': o_config['pvc']['system']['configuration']['directories']['dynamic_directory'], + 'log_directory': o_config['pvc']['system']['configuration']['directories']['log_directory'], + 
'file_logging': o_config['pvc']['system']['configuration']['logging']['file_logging'], + 'stdout_logging': o_config['pvc']['system']['configuration']['logging']['stdout_logging'], + 'keepalive_interval': o_config['pvc']['system']['fencing']['intervals']['keepalive_interval'], + 'fence_intervals': o_config['pvc']['system']['fencing']['intervals']['fence_intervals'], + 'suicide_intervals': o_config['pvc']['system']['fencing']['intervals']['suicide_intervals'], + 'successful_fence': o_config['pvc']['system']['fencing']['actions']['successful_fence'], + 'failed_fence': o_config['pvc']['system']['fencing']['actions']['failed_fence'], + 'migration_target_selector': o_config['pvc']['system']['migration']['target_selector'], + 'ipmi_hostname': o_config['pvc']['system']['fencing']['ipmi']['host'], + 'ipmi_username': o_config['pvc']['system']['fencing']['ipmi']['user'], + 'ipmi_password': o_config['pvc']['system']['fencing']['ipmi']['pass'] + } + except Exception as e: + print('ERROR: {}!'.format(e)) + exit(1) + config = config_general + + # Handle the networking config + if config['enable_networking']: try: - config[entry] = entries[entry] - except: + config_networking = { + 'cluster_domain': o_config['pvc']['cluster']['networks']['cluster']['domain'], + 'vni_floating_ip': o_config['pvc']['cluster']['networks']['cluster']['floating_ip'], + 'vni_network': o_config['pvc']['cluster']['networks']['cluster']['network'], + 'storage_domain': o_config['pvc']['cluster']['networks']['storage']['domain'], + 'storage_floating_ip': o_config['pvc']['cluster']['networks']['storage']['floating_ip'], + 'storage_network': o_config['pvc']['cluster']['networks']['storage']['network'], + 'upstream_domain': o_config['pvc']['cluster']['networks']['upstream']['domain'], + 'upstream_floating_ip': o_config['pvc']['cluster']['networks']['upstream']['floating_ip'], + 'upstream_network': o_config['pvc']['cluster']['networks']['upstream']['network'], + 'pdns_mysql_host': o_config['pvc']['coordinator']['dns']['database']['host'], + 'pdns_mysql_port': o_config['pvc']['coordinator']['dns']['database']['port'], + 'pdns_mysql_dbname': o_config['pvc']['coordinator']['dns']['database']['name'], + 'pdns_mysql_user': o_config['pvc']['coordinator']['dns']['database']['user'], + 'pdns_mysql_password': o_config['pvc']['coordinator']['dns']['database']['pass'], + 'vni_dev': o_config['pvc']['system']['configuration']['networking']['devices']['cluster'], + 'vni_dev_ip': o_config['pvc']['system']['configuration']['networking']['addresses']['cluster'], + 'storage_dev': o_config['pvc']['system']['configuration']['networking']['devices']['storage'], + 'storage_dev_ip': o_config['pvc']['system']['configuration']['networking']['addresses']['storage'], + 'upstream_dev': o_config['pvc']['system']['configuration']['networking']['devices']['upstream'], + 'upstream_dev_ip': o_config['pvc']['system']['configuration']['networking']['addresses']['upstream'], + } + except Exception as e: + print('ERROR: {}!'.format(e)) + exit(1) + config = {**config, **config_networking} + + # Create the by-id address entries + for net in [ 'vni', + 'storage', + 'upstream' ]: + address_key = '{}_dev_ip'.format(net) + floating_key = '{}_floating_ip'.format(net) + network_key = '{}_network'.format(net) + + # Verify the network provided is valid try: - config[entry] = o_config['default'][entry] - except: - print('ERROR: Config file missing required value "{}" for this host!'.format(entry)) + network = ipaddress.ip_network(config[network_key]) + except Exception as e: + 
print('ERROR: Network address {} for {} is not valid!'.format(config[network_key], network_key)) exit(1) + + # If we should be autoselected + if config[address_key] == 'by-id': + # Construct an IP from the relevant network + # The NodeID starts at 1, but indexes start at 0 + address_id = int(mynodeid) - 1 + # Grab the nth address from the network + config[address_key] = list(network.hosts())[address_id] + + # Verify that the floating IP is valid + + try: + # Set the ipaddr + floating_addr = ipaddress.ip_address(config[floating_key].split('/')[0]) + # Verify we're in the network + if not floating_addr in list(network.hosts()): + raise + except Exception as e: + print('ERROR: Floating address {} for {} is not valid!'.format(config[floating_key], floating_key)) + exit(1) + + # Handle the storage config + if config['enable_storage']: + try: + config_storage = dict() + except Exception as e: + print('ERROR: {}!'.format(e)) + exit(1) + config = {**config, **config_storage} # Handle an empty ipmi_hostname if config['ipmi_hostname'] == '': @@ -182,6 +239,11 @@ def readConfig(pvcd_config_file, myhostname): # Get the config object from readConfig() config = readConfig(pvcd_config_file, myhostname) + +# Handle the enable values +enable_hypervisor = config['enable_hypervisor'] +enable_networking = config['enable_networking'] +enable_storage = config['enable_storage'] ############################################################################### # PHASE 1b - Prepare filesystem directories @@ -242,32 +304,33 @@ logger.out('Starting pvcd on host {}'.format(myfqdn), state='s') # PHASE 1d - Prepare sysctl for pvcd ############################################################################### -# Enable routing functions -common.run_os_command('sysctl net.ipv4.ip_forward=1') -common.run_os_command('sysctl net.ipv6.ip_forward=1') - -# Send redirects -common.run_os_command('sysctl net.ipv4.conf.all.send_redirects=1') -common.run_os_command('sysctl net.ipv4.conf.default.send_redirects=1') -common.run_os_command('sysctl net.ipv6.conf.all.send_redirects=1') -common.run_os_command('sysctl net.ipv6.conf.default.send_redirects=1') - -# Accept source routes -common.run_os_command('sysctl net.ipv4.conf.all.accept_source_route=1') -common.run_os_command('sysctl net.ipv4.conf.default.accept_source_route=1') -common.run_os_command('sysctl net.ipv6.conf.all.accept_source_route=1') -common.run_os_command('sysctl net.ipv6.conf.default.accept_source_route=1') - -# Disable RP filtering on the VNI dev interface (to allow traffic pivoting from primary) -common.run_os_command('sysctl net.ipv4.conf.{}.rp_filter=0'.format(config['vni_dev'])) -common.run_os_command('sysctl net.ipv6.conf.{}.rp_filter=0'.format(config['vni_dev'])) +if enable_networking: + # Enable routing functions + common.run_os_command('sysctl net.ipv4.ip_forward=1') + common.run_os_command('sysctl net.ipv6.ip_forward=1') + + # Send redirects + common.run_os_command('sysctl net.ipv4.conf.all.send_redirects=1') + common.run_os_command('sysctl net.ipv4.conf.default.send_redirects=1') + common.run_os_command('sysctl net.ipv6.conf.all.send_redirects=1') + common.run_os_command('sysctl net.ipv6.conf.default.send_redirects=1') + + # Accept source routes + common.run_os_command('sysctl net.ipv4.conf.all.accept_source_route=1') + common.run_os_command('sysctl net.ipv4.conf.default.accept_source_route=1') + common.run_os_command('sysctl net.ipv6.conf.all.accept_source_route=1') + common.run_os_command('sysctl net.ipv6.conf.default.accept_source_route=1') + + # Disable RP 
filtering on the VNI dev interface (to allow traffic pivoting from primary) + common.run_os_command('sysctl net.ipv4.conf.{}.rp_filter=0'.format(config['vni_dev'])) + common.run_os_command('sysctl net.ipv6.conf.{}.rp_filter=0'.format(config['vni_dev'])) ############################################################################### # PHASE 2 - Determine coordinator mode and start Zookeeper on coordinators ############################################################################### # What is the list of coordinator hosts -coordinator_nodes = config['coordinators'].split(',') +coordinator_nodes = config['coordinators'] if myhostname in coordinator_nodes: # We are indeed a coordinator host @@ -412,7 +475,11 @@ else: transaction.commit() # Check that the primary key exists, and create it with us as master if not -current_primary = zkhandler.readdata(zk_conn, '/primary_node') +try: + current_primary = zkhandler.readdata(zk_conn, '/primary_node') +except kazoo.exceptions.NoNodeError: + current_primary = 'none' + if current_primary and current_primary != 'none': logger.out('Current primary node is {}{}{}.'.format(logger.fmt_blue, current_primary, logger.fmt_end), state='i') else: @@ -424,105 +491,109 @@ else: # PHASE 6 - Create local IP addresses for static networks ############################################################################### -# VNI configuration -vni_dev = config['vni_dev'] -vni_dev_ip = config['vni_dev_ip'] -logger.out('Setting up VNI network interface {}'.format(vni_dev, vni_dev_ip), state='i') -common.run_os_command('ip link set {} mtu 9000 up'.format(vni_dev)) +if enable_networking: + # VNI configuration + vni_dev = config['vni_dev'] + vni_dev_ip = config['vni_dev_ip'] + logger.out('Setting up VNI network interface {}'.format(vni_dev, vni_dev_ip), state='i') + common.run_os_command('ip link set {} mtu 9000 up'.format(vni_dev)) -# Cluster bridge configuration -logger.out('Setting up cluster network bridge on interface {} with IP {}'.format(vni_dev, vni_dev_ip), state='i') -common.run_os_command('brctl addbr brcluster') -common.run_os_command('brctl addif brcluster {}'.format(vni_dev)) -common.run_os_command('ip link set brcluster mtu 9000 up') -common.run_os_command('ip address add {} dev {}'.format(vni_dev_ip, 'brcluster')) + # Cluster bridge configuration + logger.out('Setting up cluster network bridge on interface {} with IP {}'.format(vni_dev, vni_dev_ip), state='i') + common.run_os_command('brctl addbr brcluster') + common.run_os_command('brctl addif brcluster {}'.format(vni_dev)) + common.run_os_command('ip link set brcluster mtu 9000 up') + common.run_os_command('ip address add {} dev {}'.format(vni_dev_ip, 'brcluster')) -# Storage configuration -storage_dev = config['storage_dev'] -if storage_dev == vni_dev: - storage_dev = 'brcluster' -storage_dev_ip = config['storage_dev_ip'] -logger.out('Setting up Storage network on interface {} with IP {}'.format(storage_dev, storage_dev_ip), state='i') -common.run_os_command('ip link set {} mtu 9000 up'.format(storage_dev)) -common.run_os_command('ip address add {} dev {}'.format(storage_dev_ip, storage_dev)) + # Storage configuration + storage_dev = config['storage_dev'] + if storage_dev == vni_dev: + storage_dev = 'brcluster' + storage_dev_ip = config['storage_dev_ip'] + logger.out('Setting up Storage network on interface {} with IP {}'.format(storage_dev, storage_dev_ip), state='i') + common.run_os_command('ip link set {} mtu 9000 up'.format(storage_dev)) + common.run_os_command('ip address add {} dev 
{}'.format(storage_dev_ip, storage_dev)) -# Upstream configuration -if config['daemon_mode'] == 'coordinator': - upstream_dev = config['upstream_dev'] - upstream_dev_ip = config['upstream_dev_ip'] - logger.out('Setting up Upstream network on interface {} with IP {}'.format(upstream_dev, upstream_dev_ip), state='i') - common.run_os_command('ip link set {} up'.format(upstream_dev)) - common.run_os_command('ip address add {} dev {}'.format(upstream_dev_ip, upstream_dev)) + # Upstream configuration + if config['daemon_mode'] == 'coordinator': + upstream_dev = config['upstream_dev'] + upstream_dev_ip = config['upstream_dev_ip'] + logger.out('Setting up Upstream network on interface {} with IP {}'.format(upstream_dev, upstream_dev_ip), state='i') + common.run_os_command('ip link set {} up'.format(upstream_dev)) + common.run_os_command('ip address add {} dev {}'.format(upstream_dev_ip, upstream_dev)) ############################################################################### # PHASE 7a - Ensure Libvirt is running on the local host ############################################################################### -# Start the zookeeper service using systemctl -logger.out('Starting Libvirt daemon', state='i') -common.run_os_command('systemctl start libvirtd.service') -time.sleep(1) +if enable_hypervisor: + # Start the zookeeper service using systemctl + logger.out('Starting Libvirt daemon', state='i') + common.run_os_command('systemctl start libvirtd.service') + time.sleep(1) -# Check that libvirtd is listening TCP -libvirt_check_name = "qemu+tcp://127.0.0.1:16509/system" -logger.out('Connecting to Libvirt daemon at {}'.format(libvirt_check_name), state='i') -try: - lv_conn = libvirt.open(libvirt_check_name) - lv_conn.close() -except Exception as e: - logger.out('ERROR: Failed to connect to Libvirt daemon: {}'.format(e), state='e') - exit(1) + # Check that libvirtd is listening TCP + libvirt_check_name = "qemu+tcp://127.0.0.1:16509/system" + logger.out('Connecting to Libvirt daemon at {}'.format(libvirt_check_name), state='i') + try: + lv_conn = libvirt.open(libvirt_check_name) + lv_conn.close() + except Exception as e: + logger.out('ERROR: Failed to connect to Libvirt daemon: {}'.format(e), state='e') + exit(1) ############################################################################### # PHASE 7b - Ensure Ceph is running on the local host ############################################################################### -if config['daemon_mode'] == 'coordinator': - common.run_os_command('systemctl start ceph-mon@{}'.format(myhostname)) - common.run_os_command('systemctl start ceph-mgr@{}'.format(myhostname)) +if enable_storage: + if config['daemon_mode'] == 'coordinator': + common.run_os_command('systemctl start ceph-mon@{}'.format(myhostname)) + common.run_os_command('systemctl start ceph-mgr@{}'.format(myhostname)) ############################################################################### # PHASE 7c - Ensure NFT is running on the local host ############################################################################### -logger.out("Creating NFT firewall configuration", state='i') +if enable_networking: + logger.out("Creating NFT firewall configuration", state='i') -# Create our config dirs -common.run_os_command( - '/bin/mkdir --parents {}/networks'.format( - config['nft_dynamic_directory'] - ) -) -common.run_os_command( - '/bin/mkdir --parents {}/static'.format( - config['nft_dynamic_directory'] - ) -) -common.run_os_command( - '/bin/mkdir --parents {}'.format( - 
config['nft_dynamic_directory'] - ) -) + # Create our config dirs + common.run_os_command( + '/bin/mkdir --parents {}/networks'.format( + config['nft_dynamic_directory'] + ) + ) + common.run_os_command( + '/bin/mkdir --parents {}/static'.format( + config['nft_dynamic_directory'] + ) + ) + common.run_os_command( + '/bin/mkdir --parents {}'.format( + config['nft_dynamic_directory'] + ) + ) -# Set up the basic features of the nftables firewall -nftables_base_rules = """# Base rules -flush ruleset -# Add the filter table and chains -add table inet filter -add chain inet filter forward {{ type filter hook forward priority 0; }} -add chain inet filter input {{ type filter hook input priority 0; }} -# Include static rules and network rules -include "{rulesdir}/static/*" -include "{rulesdir}/networks/*" -""".format( - rulesdir=config['nft_dynamic_directory'] -) + # Set up the basic features of the nftables firewall + nftables_base_rules = """# Base rules + flush ruleset + # Add the filter table and chains + add table inet filter + add chain inet filter forward {{ type filter hook forward priority 0; }} + add chain inet filter input {{ type filter hook input priority 0; }} + # Include static rules and network rules + include "{rulesdir}/static/*" + include "{rulesdir}/networks/*" + """.format( + rulesdir=config['nft_dynamic_directory'] + ) -# Write the basic firewall config -nftables_base_filename = '{}/base.nft'.format(config['nft_dynamic_directory']) -with open(nftables_base_filename, 'w') as nfbasefile: - nfbasefile.write(nftables_base_rules) -common.reload_firewall_rules(logger, nftables_base_filename) + # Write the basic firewall config + nftables_base_filename = '{}/base.nft'.format(config['nft_dynamic_directory']) + with open(nftables_base_filename, 'w') as nfbasefile: + nfbasefile.write(nftables_base_rules) + common.reload_firewall_rules(logger, nftables_base_filename) ############################################################################### # PHASE 7d - Ensure DNSMASQ is not running @@ -547,9 +618,12 @@ domain_list = [] osd_list = [] pool_list = [] -# Create an instance of the DNS Aggregator if we're a coordinator -if config['daemon_mode'] == 'coordinator': - dns_aggregator = DNSAggregatorInstance.DNSAggregatorInstance(zk_conn, config, logger) +if enable_networking: + # Create an instance of the DNS Aggregator if we're a coordinator + if config['daemon_mode'] == 'coordinator': + dns_aggregator = DNSAggregatorInstance.DNSAggregatorInstance(zk_conn, config, logger) + else: + dns_aggregator = None else: dns_aggregator = None @@ -605,119 +679,122 @@ def update_primary(new_primary, stat, event=''): for node in d_node: d_node[node].primary_node = new_primary -# Network objects -@zk_conn.ChildrenWatch('/networks') -def update_networks(new_network_list): - global network_list, d_network +if enable_networking: + # Network objects + @zk_conn.ChildrenWatch('/networks') + def update_networks(new_network_list): + global network_list, d_network - # Add any missing networks to the list - for network in new_network_list: - if not network in network_list: - d_network[network] = VXNetworkInstance.VXNetworkInstance(network, zk_conn, config, logger, this_node) - print(network) - if config['daemon_mode'] == 'coordinator': - dns_aggregator.add_network(d_network[network]) - # Start primary functionality - if this_node.router_state == 'primary': - d_network[network].createGateways() - d_network[network].startDHCPServer() + # Add any missing networks to the list + for network in new_network_list: + if not 
network in network_list: + d_network[network] = VXNetworkInstance.VXNetworkInstance(network, zk_conn, config, logger, this_node) + print(network) + if config['daemon_mode'] == 'coordinator': + dns_aggregator.add_network(d_network[network]) + # Start primary functionality + if this_node.router_state == 'primary': + d_network[network].createGateways() + d_network[network].startDHCPServer() - # Remove any deleted networks from the list - for network in network_list: - if not network in new_network_list: - # Stop primary functionality - if this_node.router_state == 'primary': - d_network[network].stopDHCPServer() - d_network[network].removeGateways() - dns_aggregator.remove_network(d_network[network]) - # Stop general functionality - d_network[network].removeFirewall() - d_network[network].removeNetwork() - # Delete the object - del(d_network[network]) + # Remove any deleted networks from the list + for network in network_list: + if not network in new_network_list: + # Stop primary functionality + if this_node.router_state == 'primary': + d_network[network].stopDHCPServer() + d_network[network].removeGateways() + dns_aggregator.remove_network(d_network[network]) + # Stop general functionality + d_network[network].removeFirewall() + d_network[network].removeNetwork() + # Delete the object + del(d_network[network]) - # Update and print new list - network_list = new_network_list - logger.out('{}Network list:{} {}'.format(logger.fmt_blue, logger.fmt_end, ' '.join(network_list)), state='i') + # Update and print new list + network_list = new_network_list + logger.out('{}Network list:{} {}'.format(logger.fmt_blue, logger.fmt_end, ' '.join(network_list)), state='i') - # Update node objects' list - for node in d_node: - d_node[node].update_network_list(d_network) + # Update node objects' list + for node in d_node: + d_node[node].update_network_list(d_network) -# VM domain objects -@zk_conn.ChildrenWatch('/domains') -def update_domains(new_domain_list): - global domain_list, d_domain +if enable_hypervisor: + # VM domain objects + @zk_conn.ChildrenWatch('/domains') + def update_domains(new_domain_list): + global domain_list, d_domain - # Add any missing domains to the list - for domain in new_domain_list: - if not domain in domain_list: - d_domain[domain] = DomainInstance.DomainInstance(domain, zk_conn, config, logger, this_node) + # Add any missing domains to the list + for domain in new_domain_list: + if not domain in domain_list: + d_domain[domain] = DomainInstance.DomainInstance(domain, zk_conn, config, logger, this_node) - # Remove any deleted domains from the list - for domain in domain_list: - if not domain in new_domain_list: - # Delete the object - del(d_domain[domain]) + # Remove any deleted domains from the list + for domain in domain_list: + if not domain in new_domain_list: + # Delete the object + del(d_domain[domain]) - # Update and print new list - domain_list = new_domain_list - logger.out('{}Domain list:{} {}'.format(logger.fmt_blue, logger.fmt_end, ' '.join(domain_list)), state='i') + # Update and print new list + domain_list = new_domain_list + logger.out('{}Domain list:{} {}'.format(logger.fmt_blue, logger.fmt_end, ' '.join(domain_list)), state='i') - # Update node objects' list - for node in d_node: - d_node[node].update_domain_list(d_domain) + # Update node objects' list + for node in d_node: + d_node[node].update_domain_list(d_domain) -# Ceph OSD provisioning key -@zk_conn.DataWatch('/ceph/cmd') -def cmd(data, stat, event=''): - if data: - data = data.decode('ascii') - else: - data 
= '' +if enable_storage: + # Ceph OSD provisioning key + @zk_conn.DataWatch('/ceph/cmd') + def cmd(data, stat, event=''): + if data: + data = data.decode('ascii') + else: + data = '' - if data: - CephInstance.run_command(zk_conn, data, d_osd) + if data: + CephInstance.run_command(zk_conn, data, d_osd) -# OSD objects -@zk_conn.ChildrenWatch('/ceph/osds') -def update_osds(new_osd_list): - global osd_list, d_osd + # OSD objects + @zk_conn.ChildrenWatch('/ceph/osds') + def update_osds(new_osd_list): + global osd_list, d_osd - # Add any missing OSDs to the list - for osd in new_osd_list: - if not osd in osd_list: - d_osd[osd] = CephInstance.CephOSDInstance(zk_conn, this_node, osd) + # Add any missing OSDs to the list + for osd in new_osd_list: + if not osd in osd_list: + d_osd[osd] = CephInstance.CephOSDInstance(zk_conn, this_node, osd) - # Remove any deleted OSDs from the list - for osd in osd_list: - if not osd in new_osd_list: - # Delete the object - del(d_osd[osd]) + # Remove any deleted OSDs from the list + for osd in osd_list: + if not osd in new_osd_list: + # Delete the object + del(d_osd[osd]) - # Update and print new list - osd_list = new_osd_list - logger.out('{}OSD list:{} {}'.format(logger.fmt_blue, logger.fmt_end, ' '.join(osd_list)), state='i') + # Update and print new list + osd_list = new_osd_list + logger.out('{}OSD list:{} {}'.format(logger.fmt_blue, logger.fmt_end, ' '.join(osd_list)), state='i') -# Pool objects -@zk_conn.ChildrenWatch('/ceph/pools') -def update_pools(new_pool_list): - global pool_list, d_pool + # Pool objects + @zk_conn.ChildrenWatch('/ceph/pools') + def update_pools(new_pool_list): + global pool_list, d_pool - # Add any missing Pools to the list - for pool in new_pool_list: - if not pool in pool_list: - d_pool[pool] = CephInstance.CephPoolInstance(zk_conn, this_node, pool) + # Add any missing Pools to the list + for pool in new_pool_list: + if not pool in pool_list: + d_pool[pool] = CephInstance.CephPoolInstance(zk_conn, this_node, pool) - # Remove any deleted Pools from the list - for pool in pool_list: - if not pool in new_pool_list: - # Delete the object - del(d_pool[pool]) + # Remove any deleted Pools from the list + for pool in pool_list: + if not pool in new_pool_list: + # Delete the object + del(d_pool[pool]) - # Update and print new list - pool_list = new_pool_list - logger.out('{}Pool list:{} {}'.format(logger.fmt_blue, logger.fmt_end, ' '.join(pool_list)), state='i') + # Update and print new list + pool_list = new_pool_list + logger.out('{}Pool list:{} {}'.format(logger.fmt_blue, logger.fmt_end, ' '.join(pool_list)), state='i') ############################################################################### # PHASE 9 - Run the daemon @@ -742,183 +819,184 @@ def update_zookeeper(): if zkhandler.readdata(zk_conn, '/primary_node') != this_node.name: zkhandler.writedata(zk_conn, {'/primary_node': this_node.name}) - # Get Ceph cluster health (for local printing) - if debug: - print("Get Ceph cluster health (for local printing)") - retcode, stdout, stderr = common.run_os_command('ceph health') - ceph_health = stdout.rstrip() - if 'HEALTH_OK' in ceph_health: - ceph_health_colour = logger.fmt_green - elif 'HEALTH_WARN' in ceph_health: - ceph_health_colour = logger.fmt_yellow - else: - ceph_health_colour = logger.fmt_red - - # Set ceph health information in zookeeper (primary only) - if this_node.router_state == 'primary': + if enable_storage: + # Get Ceph cluster health (for local printing) if debug: - print("Set ceph health information in zookeeper 
(primary only)") - # Get status info - retcode, stdout, stderr = common.run_os_command('ceph status') - ceph_status = stdout - try: - zkhandler.writedata(zk_conn, { - '/ceph': str(ceph_status) - }) - except: - logger.out('Failed to set Ceph status data', state='e') - return + print("Get Ceph cluster health (for local printing)") + retcode, stdout, stderr = common.run_os_command('ceph health') + ceph_health = stdout.rstrip() + if 'HEALTH_OK' in ceph_health: + ceph_health_colour = logger.fmt_green + elif 'HEALTH_WARN' in ceph_health: + ceph_health_colour = logger.fmt_yellow + else: + ceph_health_colour = logger.fmt_red - # Set pool information in zookeeper (primary only) - if this_node.router_state == 'primary': - if debug: - print("Set pool information in zookeeper (primary only)") - # Get pool info - pool_df = dict() - retcode, stdout, stderr = common.run_os_command('rados df --format json') - pool_df_raw = json.loads(stdout)['pools'] - for pool in pool_df_raw: - pool_df.update({ - str(pool['name']): { - 'id': pool['id'], - 'size_bytes': pool['size_bytes'], - 'num_objects': pool['num_objects'], - 'num_object_clones': pool['num_object_clones'], - 'num_object_copies': pool['num_object_copies'], - 'num_objects_missing_on_primary': pool['num_objects_missing_on_primary'], - 'num_objects_unfound': pool['num_objects_unfound'], - 'num_objects_degraded': pool['num_objects_degraded'], - 'read_ops': pool['read_ops'], - 'read_bytes': pool['read_bytes'], - 'write_ops': pool['write_ops'], - 'write_bytes': pool['write_bytes'] - } - }) + # Set ceph health information in zookeeper (primary only) + if this_node.router_state == 'primary': + if debug: + print("Set ceph health information in zookeeper (primary only)") + # Get status info + retcode, stdout, stderr = common.run_os_command('ceph status') + ceph_status = stdout + try: + zkhandler.writedata(zk_conn, { + '/ceph': str(ceph_status) + }) + except: + logger.out('Failed to set Ceph status data', state='e') + return + + # Set pool information in zookeeper (primary only) + if this_node.router_state == 'primary': + if debug: + print("Set pool information in zookeeper (primary only)") + # Get pool info + pool_df = dict() + retcode, stdout, stderr = common.run_os_command('rados df --format json') + pool_df_raw = json.loads(stdout)['pools'] + for pool in pool_df_raw: + pool_df.update({ + str(pool['name']): { + 'id': pool['id'], + 'size_bytes': pool['size_bytes'], + 'num_objects': pool['num_objects'], + 'num_object_clones': pool['num_object_clones'], + 'num_object_copies': pool['num_object_copies'], + 'num_objects_missing_on_primary': pool['num_objects_missing_on_primary'], + 'num_objects_unfound': pool['num_objects_unfound'], + 'num_objects_degraded': pool['num_objects_degraded'], + 'read_ops': pool['read_ops'], + 'read_bytes': pool['read_bytes'], + 'write_ops': pool['write_ops'], + 'write_bytes': pool['write_bytes'] + } + }) # Trigger updates for each OSD on this node for pool in pool_list: zkhandler.writedata(zk_conn, { '/ceph/pools/{}/stats'.format(pool): str(json.dumps(pool_df[pool])) }) - - # Get data from Ceph OSDs - if debug: - print("Get data from Ceph OSDs") - # Parse the dump data - osd_dump = dict() - retcode, stdout, stderr = common.run_os_command('ceph osd dump --format json') - osd_dump_raw = json.loads(stdout)['osds'] - for osd in osd_dump_raw: - osd_dump.update({ - str(osd['osd']): { - 'uuid': osd['uuid'], - 'up': osd['up'], - 'in': osd['in'], - 'primary_affinity': osd['primary_affinity'] - } - }) - # Parse the df data - osd_df = dict() - 
retcode, stdout, stderr = common.run_os_command('ceph osd df --format json') - osd_df_raw = json.loads(stdout)['nodes'] - for osd in osd_df_raw: - osd_df.update({ - str(osd['id']): { - 'utilization': osd['utilization'], - 'var': osd['var'], - 'pgs': osd['pgs'], - 'kb': osd['kb'], - 'weight': osd['crush_weight'], - 'reweight': osd['reweight'], - } - }) - # Parse the status data - osd_status = dict() - retcode, stdout, stderr = common.run_os_command('ceph osd status') - for line in stderr.split('\n'): - # Strip off colour - line = re.sub(r'\x1b(\[.*?[@-~]|\].*?(\x07|\x1b\\))', '', line) - # Split it for parsing - line = line.split() - if len(line) > 1 and line[1].isdigit(): - # This is an OSD line so parse it - osd_id = line[1] - node = line[3].split('.')[0] - used = line[5] - avail = line[7] - wr_ops = line[9] - wr_data = line[11] - rd_ops = line[13] - rd_data = line[15] - state = line[17] - osd_status.update({ - str(osd_id): { - 'node': node, - 'used': used, - 'avail': avail, - 'wr_ops': wr_ops, - 'wr_data': wr_data, - 'rd_ops': rd_ops, - 'rd_data': rd_data, - 'state': state + + # Get data from Ceph OSDs + if debug: + print("Get data from Ceph OSDs") + # Parse the dump data + osd_dump = dict() + retcode, stdout, stderr = common.run_os_command('ceph osd dump --format json') + osd_dump_raw = json.loads(stdout)['osds'] + for osd in osd_dump_raw: + osd_dump.update({ + str(osd['osd']): { + 'uuid': osd['uuid'], + 'up': osd['up'], + 'in': osd['in'], + 'primary_affinity': osd['primary_affinity'] } }) - # Merge them together into a single meaningful dict - osd_stats = dict() - for osd in osd_list: - this_dump = osd_dump[osd] - this_dump.update(osd_df[osd]) - this_dump.update(osd_status[osd]) - osd_stats[osd] = this_dump - - # Trigger updates for each OSD on this node - if debug: - print("Trigger updates for each OSD on this node") - osds_this_node = 0 - for osd in osd_list: - if d_osd[osd].node == myhostname: - zkhandler.writedata(zk_conn, { - '/ceph/osds/{}/stats'.format(osd): str(json.dumps(osd_stats[osd])) + # Parse the df data + osd_df = dict() + retcode, stdout, stderr = common.run_os_command('ceph osd df --format json') + osd_df_raw = json.loads(stdout)['nodes'] + for osd in osd_df_raw: + osd_df.update({ + str(osd['id']): { + 'utilization': osd['utilization'], + 'var': osd['var'], + 'pgs': osd['pgs'], + 'kb': osd['kb'], + 'weight': osd['crush_weight'], + 'reweight': osd['reweight'], + } }) - osds_this_node += 1 + # Parse the status data + osd_status = dict() + retcode, stdout, stderr = common.run_os_command('ceph osd status') + for line in stderr.split('\n'): + # Strip off colour + line = re.sub(r'\x1b(\[.*?[@-~]|\].*?(\x07|\x1b\\))', '', line) + # Split it for parsing + line = line.split() + if len(line) > 1 and line[1].isdigit(): + # This is an OSD line so parse it + osd_id = line[1] + node = line[3].split('.')[0] + used = line[5] + avail = line[7] + wr_ops = line[9] + wr_data = line[11] + rd_ops = line[13] + rd_data = line[15] + state = line[17] + osd_status.update({ + str(osd_id): { + 'node': node, + 'used': used, + 'avail': avail, + 'wr_ops': wr_ops, + 'wr_data': wr_data, + 'rd_ops': rd_ops, + 'rd_data': rd_data, + 'state': state + } + }) + # Merge them together into a single meaningful dict + osd_stats = dict() + for osd in osd_list: + this_dump = osd_dump[osd] + this_dump.update(osd_df[osd]) + this_dump.update(osd_status[osd]) + osd_stats[osd] = this_dump + # Trigger updates for each OSD on this node + if debug: + print("Trigger updates for each OSD on this node") + osds_this_node = 0 
+ for osd in osd_list: + if d_osd[osd].node == myhostname: + zkhandler.writedata(zk_conn, { + '/ceph/osds/{}/stats'.format(osd): str(json.dumps(osd_stats[osd])) + }) + osds_this_node += 1 - # Toggle state management of dead VMs to restart them - if debug: - print("Toggle state management of dead VMs to restart them") memalloc = 0 vcpualloc = 0 - for domain, instance in this_node.d_domain.items(): - if domain in this_node.domain_list: - # Add the allocated memory to our memalloc value - memalloc += instance.getmemory() - vcpualloc += instance.getvcpus() - if instance.getstate() == 'start' and instance.getnode() == this_node.name: - if instance.getdom() != None: - try: - if instance.getdom().state()[0] != libvirt.VIR_DOMAIN_RUNNING: - raise - except Exception as e: - # Toggle a state "change" - zkhandler.writedata(zk_conn, { '/domains/{}/state'.format(domain): instance.getstate() }) - - # Connect to libvirt - if debug: - print("Connect to libvirt") - libvirt_name = "qemu:///system" - lv_conn = libvirt.open(libvirt_name) - if lv_conn == None: - logger.out('Failed to open connection to "{}"'.format(libvirt_name), state='e') - return - - # Ensure that any running VMs are readded to the domain_list - if debug: - print("Ensure that any running VMs are readded to the domain_list") - running_domains = lv_conn.listAllDomains(libvirt.VIR_CONNECT_LIST_DOMAINS_ACTIVE) - for domain in running_domains: - domain_uuid = domain.UUIDString() - if domain_uuid not in this_node.domain_list: - this_node.domain_list.append(domain_uuid) + if enable_hypervisor: + # Toggle state management of dead VMs to restart them + if debug: + print("Toggle state management of dead VMs to restart them") + for domain, instance in this_node.d_domain.items(): + if domain in this_node.domain_list: + # Add the allocated memory to our memalloc value + memalloc += instance.getmemory() + vcpualloc += instance.getvcpus() + if instance.getstate() == 'start' and instance.getnode() == this_node.name: + if instance.getdom() != None: + try: + if instance.getdom().state()[0] != libvirt.VIR_DOMAIN_RUNNING: + raise + except Exception as e: + # Toggle a state "change" + zkhandler.writedata(zk_conn, { '/domains/{}/state'.format(domain): instance.getstate() }) + + # Connect to libvirt + if debug: + print("Connect to libvirt") + libvirt_name = "qemu:///system" + lv_conn = libvirt.open(libvirt_name) + if lv_conn == None: + logger.out('Failed to open connection to "{}"'.format(libvirt_name), state='e') + return + + # Ensure that any running VMs are readded to the domain_list + if debug: + print("Ensure that any running VMs are readded to the domain_list") + running_domains = lv_conn.listAllDomains(libvirt.VIR_CONNECT_LIST_DOMAINS_ACTIVE) + for domain in running_domains: + domain_uuid = domain.UUIDString() + if domain_uuid not in this_node.domain_list: + this_node.domain_list.append(domain_uuid) # Set our information in zookeeper if debug: @@ -929,7 +1007,10 @@ def update_zookeeper(): this_node.memalloc = memalloc this_node.vcpualloc = vcpualloc this_node.cpuload = os.getloadavg()[0] - this_node.domains_count = len(lv_conn.listDomainsID()) + if enable_hypervisor: + this_node.domains_count = len(lv_conn.listDomainsID()) + else: + this_node.domains_count = 0 keepalive_time = int(time.time()) try: zkhandler.writedata(zk_conn, { @@ -946,8 +1027,9 @@ def update_zookeeper(): logger.out('Failed to set keepalive data', state='e') return - # Close the Libvirt connection - lv_conn.close() + if enable_hypervisor: + # Close the Libvirt connection + lv_conn.close() 
# Look for dead nodes and fence them if debug: @@ -1005,20 +1087,21 @@ def update_zookeeper(): netcount=len(network_list) ), ) - logger.out( - '{bold}Ceph cluster status:{nofmt} {health_colour}{health}{nofmt} ' - '{bold}Total OSDs:{nofmt} {total_osds} ' - '{bold}Node OSDs:{nofmt} {node_osds} ' - '{bold}Pools:{nofmt} {total_pools} '.format( - bold=logger.fmt_bold, - health_colour=ceph_health_colour, - nofmt=logger.fmt_end, - health=ceph_health, - total_osds=len(osd_list), - node_osds=osds_this_node, - total_pools=len(pool_list) - ), - ) + if enable_storage: + logger.out( + '{bold}Ceph cluster status:{nofmt} {health_colour}{health}{nofmt} ' + '{bold}Total OSDs:{nofmt} {total_osds} ' + '{bold}Node OSDs:{nofmt} {node_osds} ' + '{bold}Pools:{nofmt} {total_pools} '.format( + bold=logger.fmt_bold, + health_colour=ceph_health_colour, + nofmt=logger.fmt_end, + health=ceph_health, + total_osds=len(osd_list), + node_osds=osds_this_node, + total_pools=len(pool_list) + ), + ) # Start keepalive thread and immediately update Zookeeper diff --git a/node-daemon/pvcd/NodeInstance.py b/node-daemon/pvcd/NodeInstance.py index 60b0064d..d4b98f16 100644 --- a/node-daemon/pvcd/NodeInstance.py +++ b/node-daemon/pvcd/NodeInstance.py @@ -66,10 +66,18 @@ class NodeInstance(object): self.memalloc = 0 self.vcpualloc = 0 # Floating upstreams - self.vni_dev = self.config['vni_dev'] - self.vni_ipaddr, self.vni_cidrnetmask = self.config['vni_floating_ip'].split('/') - self.upstream_dev = self.config['upstream_dev'] - self.upstream_ipaddr, self.upstream_cidrnetmask = self.config['upstream_floating_ip'].split('/') + if self.config['enable_networking']: + self.vni_dev = self.config['vni_dev'] + self.vni_ipaddr, self.vni_cidrnetmask = self.config['vni_floating_ip'].split('/') + self.upstream_dev = self.config['upstream_dev'] + self.upstream_ipaddr, self.upstream_cidrnetmask = self.config['upstream_floating_ip'].split('/') + else: + self.vni_dev = None + self.vni_ipaddr = None + self.vni_cidrnetmask = None + self.upstream_dev = None + self.upstream_ipaddr = None + self.upstream_cidrnetmask = None # Flags self.inflush = False @@ -240,25 +248,27 @@ class NodeInstance(object): # Routing primary/secondary states def become_secondary(self): - self.logger.out('Setting router {} to secondary state'.format(self.name), state='i') - self.logger.out('Network list: {}'.format(', '.join(self.network_list))) - time.sleep(1) - for network in self.d_network: - self.d_network[network].stopDHCPServer() - self.d_network[network].removeGateways() - self.removeFloatingAddresses() - self.dns_aggregator.stop_aggregator() + if self.config['enable_networking']: + self.logger.out('Setting router {} to secondary state'.format(self.name), state='i') + self.logger.out('Network list: {}'.format(', '.join(self.network_list))) + time.sleep(1) + for network in self.d_network: + self.d_network[network].stopDHCPServer() + self.d_network[network].removeGateways() + self.removeFloatingAddresses() + self.dns_aggregator.stop_aggregator() def become_primary(self): - self.logger.out('Setting router {} to primary state'.format(self.name), state='i') - self.logger.out('Network list: {}'.format(', '.join(self.network_list))) - self.createFloatingAddresses() - # Start up the gateways and DHCP servers - for network in self.d_network: - self.d_network[network].createGateways() - self.d_network[network].startDHCPServer() - time.sleep(0.5) - self.dns_aggregator.start_aggregator() + if self.config['enable_networking']: + self.logger.out('Setting router {} to primary 
state'.format(self.name), state='i') + self.logger.out('Network list: {}'.format(', '.join(self.network_list))) + self.createFloatingAddresses() + # Start up the gateways and DHCP servers + for network in self.d_network: + self.d_network[network].createGateways() + self.d_network[network].startDHCPServer() + time.sleep(0.5) + self.dns_aggregator.start_aggregator() def createFloatingAddresses(self): # VNI floating IP diff --git a/node-daemon/pvcd/fixrbdlocks b/node-daemon/pvcd/fixrbdlocks new file mode 100755 index 00000000..ad9b9da5 --- /dev/null +++ b/node-daemon/pvcd/fixrbdlocks @@ -0,0 +1,13 @@ +#!/bin/bash + +for disk in $( sudo rbd list ${BLSE_STORAGE_POOL_VM} | grep "^${vm}" ); do + echo -e " Disk: $disk" + locks="$( sudo rbd lock list ${BLSE_STORAGE_POOL_VM}/${disk} | grep '^client' )" + echo "${locks}" + if [[ -n "${locks}" ]]; then + echo -e " LOCK FOUND! Clearing." + locker="$( awk '{ print $1 }' <<<"${locks}" )" + id="$( awk '{ print $2" "$3 }' <<<"${locks}" )" + sudo rbd lock remove ${BLSE_STORAGE_POOL_VM}/${disk} "${id}" "${locker}" + fi +done diff --git a/node-daemon/pvcd/log.py b/node-daemon/pvcd/log.py index 7ba930f7..fc49402f 100644 --- a/node-daemon/pvcd/log.py +++ b/node-daemon/pvcd/log.py @@ -45,6 +45,8 @@ class Logger(object): # We open the logfile for the duration of our session, but have a hup function self.writer = open(self.logfile, 'a', buffering=1) self.last_colour = self.fmt_cyan + else: + self.last_colour = "" # Provide a hup function to close and reopen the writer def hup(self):
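
For readers following the Daemon.py changes above, the two new mechanisms this patch introduces — YAML parsing with flag-gated subsystems, and "by-id" address selection from a cluster network block — reduce to the short sketch below. This is an illustration only, not code from the patch: the helper names (load_config, address_by_id) and the .get() defaults are assumptions, the real logic lives inline in readConfig(), and safe_load is used here in place of the patch's yaml.load().

#!/usr/bin/env python3
# Minimal sketch of the split-function configuration pattern from this patch:
# parse pvcd.yaml, read the function flags, and skip whole subsystems when
# they are disabled. Helper names are illustrative only.

import os
import ipaddress
import yaml  # PyYAML


def load_config(path):
    """Parse the YAML config and return the function flags plus the raw tree."""
    with open(path, 'r') as cfgfile:
        # The patch calls yaml.load(); safe_load is the hedged equivalent here.
        o_config = yaml.safe_load(cfgfile)

    functions = o_config['pvc']['functions']
    return {
        # The daemon itself requires these keys; the defaults are an assumption
        # added for this sketch.
        'enable_hypervisor': functions.get('enable_hypervisor', True),
        'enable_networking': functions.get('enable_networking', True),
        'enable_storage': functions.get('enable_storage', True),
        'raw': o_config,
    }


def address_by_id(network_cidr, node_id):
    """Mirror the 'by-id' selection in readConfig(): node ID 1 receives the
    first host address of the network block, node ID 2 the second, and so on.
    The daemon assigns the raw address object; a string is returned here."""
    network = ipaddress.ip_network(network_cidr)
    return str(list(network.hosts())[node_id - 1])


if __name__ == '__main__':
    config = load_config(os.environ.get('PVCD_CONFIG_FILE', '/etc/pvc/pvcd.yaml'))

    if config['enable_networking']:
        print('networking subsystem would start here (sysctl, bridges, nftables)')
        # e.g. address_by_id('10.254.0.0/24', 1) -> '10.254.0.1'
    if config['enable_storage']:
        print('storage subsystem would start here (ceph-mon/ceph-mgr, OSD watches)')
    if config['enable_hypervisor']:
        print('hypervisor subsystem would start here (libvirtd, domain watches)')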