diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 144e7c0f..d66bae78 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -7,7 +7,7 @@ build: before_script: - git submodule update --init script: - - /usr/local/bin/build-package 0.3 + - /usr/local/bin/build-package 0.4 - /usr/local/bin/deploy-package artifacts: paths: diff --git a/README.md b/README.md index ed67f70c..c6120c6d 100644 --- a/README.md +++ b/README.md @@ -2,65 +2,71 @@ #### NOTICE FOR GITHUB -This software is still incomplete, and should be considered pre-alpha and not suitable for production use! Not all features described below are implemented, and I will be committing directly to master until they are. If you wish to test out PVC, the following table details the currently-working features, but be warned that functionality may change regularly. Use the tag `v0.3` for a stable implementation of the working features. - -* Working features: pvcvd, cli-client (for VM and hypervisor management) -* In progress features: pvcrd, pvcnd, cli-client support for the aforementioned -* Unstarted features: pvcpd, api-client, web-client +This software is still incomplete, and should be considered pre-alpha and not suitable for production use! Not all features described below are implemented, and I will be committing directly to master until they are (version 1.0). [![pipeline status](https://git.bonifacelabs.ca/bonifacelabs/pvc/badges/master/pipeline.svg)](https://git.bonifacelabs.ca/bonifacelabs/pvc/commits/master) ![Logo](https://git.bonifacelabs.ca/uploads/-/system/project/avatar/135/pvc_logo.png) -PVC is a suite of Python 3 tools to manage virtualized clusters. It provides a fully-functional private cloud based on the priciple that "PVC is not hyperscale". It is designed to be administrator-friendly while powerful, but without the feature bloat and complexity of tools like OpenStack that are designed to support public clouds. 
With PVC, an administrator can provision, manage, and update a cluster of dozens or more hypervisors running thousands of VMs using a simple CLI tool, HTTP API, or web interface. PVC is based entirely on Debian GNU/Linux and Free-and-Open-Source tools, providing the glue to provision and manage the cluster. +PVC is a suite of Python 3 tools to manage virtualized clusters. It provides a fully-functional private cloud based on the principle that "PVC is not hyperscale". It is designed to be administrator-friendly while powerful, but without the feature bloat and complexity of tools like OpenStack that are designed to support public clouds. With PVC, an administrator can provision, manage, and update a cluster of dozens or more hypervisors running thousands of VMs using a simple CLI tool, HTTP API, or web interface. PVC is based entirely on Debian GNU/Linux and Free-and-Open-Source tools, providing the glue to bootstrap, provision and manage the cluster. Just add physical servers. ## Architecture overview -A PVC deployment ("cluster") consists of a standard physical layout and suite of daemons to manage the physical elements. The cluster is backed by a Zookeeper instance running on a subset of the machines and which all daemons communicate with to coordinate state. +A PVC deployment ("cluster") consists of a cluster of hosts which share duties using a single daemon. The cluster is backed by a Zookeeper instance running on a subset of the machines and which all daemons communicate with to coordinate state. ### Physical infrastructure -A cluster consists of two main kinds of physical servers - routers and hypervisors. A cluster will normally have two routers in a failover pair, and at least three hypervisors. +The PVC system depends on a cluster of 3 or more physical servers. Each server must have the capability to run storage, client networks, and VMs, and a subset of these servers are configured at install time to also act as routers for the cluster. 
-Router nodes may be less powerful than full hypervisors; they act primarily as the gateway for VM networks and handles inter-network ACLs. While they are not strictly required, a proper deployment with all functionality will require them. +The underlying networking is left up to the administrator; the only requirement is that all routers and hypervisors must be reachable by each other. In the simplest deployment, all physical nodes may be connected to a single dumb switch. All inter-VM networking is handled dynamically via software-defined networking within the cluster itself and is handled transparently above the underlying network layer. More advanced configurations may be specified during cluster initialization, including upstream networks, storage networks, and advanced node-level network configuration (vLANs, bonds, etc.) -Hypervisor nodes should be scaled at the administrator's discretion; they may be low-power and scaled out, or high-power and scaled up. PVC provides a straightforward automated provisioning system to expand the cluster as required. +The coordinator hosts [see below] require an additional upstream network. These hosts advertise BGP routes to the cluster networks on their upstream interface, and accept traffic destined to the clients; they route between themselves to reach VMs out the primary gateway node, so all coordinators are valid route targets. The router components of the daemon make no effort to perform NAT or Internet gateway functions; an upstream router should be configured for this purpose. -The underlying networking is left up to the administrator; the only requirement is that all routers and hypervisors must be reachable by each other. In the simplest deployment, all physical nodes may be connected to a single dumb switch. All inter-VM networking is handled dynamically via software-defined networking within the cluster itself and is handled transparently above the underlying network layer. 
More advanced configurations may be specified during cluster initalization. +PVC supports fencing of nodes when they do not update the Zookeeper database in a fixed, configurable time, to provide automated recovery from node failures. This feature requires IPMI networked BMC support, and credentials should be specified in the configuration. Preparing IPMI for PVC's use is left to the administrator. ### Software infrastructure -The core functionality of PVC is obtained via Zookeeper. During cluster initalization, the administrator must set either 3 or 5 hypervisors to act as the Zookeeper coordination subcluster. These hypervisors are special in the cluster and should not be removed after creation. This configuration prevents Zookeeper cluster size bloat as the cluster grows while still providing adequate redundancy for Zookeeper. +The PVC server-side infrastructure consists of a single daemon, `pvcd`, which manages each node based on connectivity to the Zookeeper cluster. All nodes are capable of running virtual machines, Ceph storage OSDs, and passing traffic to virtual machines via configured networks. -All daemons communicate with Zookeeper to obtain state, and update Zookeper as required, providing a high degree of self-management. Most major failure conditions are handled transparently by the cluster. +A subset of the nodes are designated at install time to act as "coordinator" hosts for the cluster. By default, 3 or 5 nodes can be designated as coordinators; 3 is ideal for small deployments (<30 hypervisors) while 5 allows for much larger scaling. These coordinators run additional functions for the cluster beyond VMs and storage, mainly: -FRRouting is used to manage virtual networking via BGP EVPN, and Libvirt is used to manage virtual machines. +* running Zookeeper itself, acting as the central database for the cluster. +* running FRRouting in BGP server mode, performing route reflector and upstream routing functionality. 
+* running Ceph monitor and manager daemons for the storage cluster. +* acting as client network gateways, DHCP, and DNS servers. +* acting as provisioning servers for nodes and VMs. -PVC itself is composed of four daemons: +A single coordinator elects itself "primary" to perform this duty at startup, and passes it off on shutdown; this can be modified manually by the administrator. The primary coordinator handles provisioning and client network functionality (gateway, DHCP, DNS) for the whole cluster, which the "secondary" coordinators can take over automatically if needed. While this architecture can suffer from tromboning when there is a larger inter-network traffic flow, it preserves a consistent and simple layer-2 model inside each client network for administrative simplicity. -* Virtualization -* Network -* Router -* Provisioning +New nodes can be added dynamically; once running, the cluster supports the PXE booting of additional hypervisors which are then self-configured and added to the cluster via the provisioning framework. This framework also allows for the quick deployment of VMs based off Ceph-stored images and templates. -#### Virtualization +The core external components are: -The virtualization daemon (`pvcvd`, package `pvc-virtualization-daemon`) manages QEMU/KVM virtual machines on hypervisor nodes. Domain configurations are stored in Zookeeper and VMs are dynamically created on hypervisor nodes based on Zookeeper configuration values. The virtualization daemon handles all stages of the VM lifecycle, including triggering startup, restart, graceful ACPI shutdown, and forceful termination. +#### Zookeeper -By default, each VM lives on a particular "home" node, and can be live migrated away either temporarily (`migrate`) or permanently (`move`). During provisioning and normal `migrate`/`move` commands, the selection of the target hypervisor is dynamic, based on administator-configurable variables. 
+Zookeeper is the primary database of the cluster, running on the coordinator nodes. All activity in the cluster is mediated by Zookeeper: clients read and write data to it, and daemons determine and update object configuration and state from it. The bootstrap tool initializes the cluster on the initial set of coordinator hosts, and once configured requires manual administrative action to modify; future versions using Zookeeper 3.5 may offer self-managing functionality. -#### Network +Coordinator hosts automatically attempt to start the Zookeeper daemon when they start up, if it has been shut down. If the Zookeeper cluster connection is lost, all clients will pause state update operations while waiting to reconnect. Note that fencing may be triggered if only one node loses Zookeeper connectivity, as the paused operations will prevent keepalives from being sent to the cluster. Take care when rebooting coordinator nodes so that the Zookeeper cluster continues to function normally. -The network daemon (`pvcnd`, package `pvc-network-daemon`) manages the hypervisor-side virtual networking for the cluster. It is responsible for provisioning VXLAN devices on hypervisor nodes for VM network access. +#### FRRouting -#### Router +FRRouting is used to provide BGP for management of client networks. It makes use of BGP EVPN to allow dynamic, software-defined VXLAN client networks presenting as simple layer-2 networks. VMs inside a particular client network can communicate directly as if they shared a switch. FRRouting also provides upstream BGP, allowing routes to the dynamic client networks to be learned by upstream routers. -The router daemon (`pvcrd`, package `pvc-router-daemon`) manages the router-side virtual networking for the cluster. It includes functionality for managing the gateways of each virtual network, as well as providing network ACLs and IP forwarding to an upsteam, and DHCP for client networks. 
+#### dnsmasq -#### Provisioning +dnsmasq is used by the coordinator nodes to provide DHCP and DNS support for client networks. An individual instance is started on the primary coordinator for each network, handling that network specifically. -The provisioning daemon (`pvcpd`, package `pvc-provisioning-daemon`) manages the setup and creation of new physical nodes, new virtual machines, as well as handling updates of the cluster. The provisioning daemon can be run on any nodes, but is normally run on the routers to simplify administration. +#### PowerDNS +PowerDNS is used by the coordinator nodes to aggregate client DNS records from the dnsmasq instances and present a complete picture of the cluster DNS to clients and the outside world. An instance runs on the primary coordinator aggregating dnsmasq entries, which can then be sent to other DNS servers via AXFR, including the in-cluster DNS servers usable by clients, which also make use of PowerDNS. + +#### Libvirt + +Libvirt is used to manage virtual machines in the cluster. It uses the TCP communication mode to perform live migrations between nodes and must be listening on daemon startup. + +#### Ceph + +Ceph provides the storage infrastructure to the cluster using RBD block devices. OSDs live in each node and VM disks are stored in copies of 3 across the cluster, ensuring a high degree of resiliency. The monitor and manager functions run on the coordinator nodes for scalability. ### Client interfaces @@ -91,6 +97,10 @@ While not specifically an interface, the Python functions used by the above inte ## Changelog +#### 0.4 + +* Recombination of daemons and expansion of functionality into client network management and routing. + #### 0.3 * Major revisions to expand functionality. 
diff --git a/build-deb.sh b/build-deb.sh index 66e9e8b1..c36ee750 100755 --- a/build-deb.sh +++ b/build-deb.sh @@ -1,5 +1,5 @@ #!/bin/sh -ver="0.3" +ver="0.4" git pull rm ../pvc_* dh_make -p pvc_${ver} --createorig --single --yes diff --git a/cli-client/client_lib b/cli-client/client_lib deleted file mode 120000 index 8f6be249..00000000 --- a/cli-client/client_lib +++ /dev/null @@ -1 +0,0 @@ -../client-common/client_lib \ No newline at end of file diff --git a/client-cli/client_lib b/client-cli/client_lib new file mode 120000 index 00000000..37daac79 --- /dev/null +++ b/client-cli/client_lib @@ -0,0 +1 @@ +../client-common \ No newline at end of file diff --git a/cli-client/pvc.py b/client-cli/pvc.py similarity index 87% rename from cli-client/pvc.py rename to client-cli/pvc.py index 922f46ab..d0b685a0 100755 --- a/cli-client/pvc.py +++ b/client-cli/pvc.py @@ -31,7 +31,6 @@ import colorama import client_lib.common as pvc_common import client_lib.node as pvc_node -import client_lib.router as pvc_router import client_lib.vm as pvc_vm import client_lib.network as pvc_network @@ -54,13 +53,44 @@ def cleanup(retcode, retmsg, zk_conn): ############################################################################### # pvc node ############################################################################### -@click.group(name='node', short_help='Manage a PVC hypervisor node.', context_settings=CONTEXT_SETTINGS) +@click.group(name='node', short_help='Manage a PVC node.', context_settings=CONTEXT_SETTINGS) def cli_node(): """ Manage the state of a node in the PVC cluster. """ pass +############################################################################### +# pvc node secondary +############################################################################### +@click.command(name='secondary', short_help='Set a node in secondary node status.') +@click.argument( + 'node' +) +def node_secondary(node): + """ + Take NODE out of primary router mode. 
+ """ + + zk_conn = pvc_common.startZKConnection(zk_host) + retcode, retmsg = pvc_node.secondary_node(zk_conn, node) + cleanup(retcode, retmsg, zk_conn) + +############################################################################### +# pvc node primary +############################################################################### +@click.command(name='primary', short_help='Set a node in primary status.') +@click.argument( + 'node' +) +def node_primary(node): + """ + Put NODE into primary router mode. + """ + + zk_conn = pvc_common.startZKConnection(zk_host) + retcode, retmsg = pvc_node.primary_node(zk_conn, node) + cleanup(retcode, retmsg, zk_conn) ############################################################################### # pvc node flush @@ -140,92 +170,13 @@ def node_info(node, long_output): ) def node_list(limit): """ - List all hypervisor nodes in the cluster; optionally only match names matching regex LIMIT. + List all nodes in the cluster; optionally only match names matching regex LIMIT. """ zk_conn = pvc_common.startZKConnection(zk_host) retcode, retmsg = pvc_node.get_list(zk_conn, limit) cleanup(retcode, retmsg, zk_conn) -############################################################################### -# pvc router -############################################################################### -@click.group(name='router', short_help='Manage a PVC router.', context_settings=CONTEXT_SETTINGS) -def cli_router(): - """ - Manage the state of a router in the PVC cluster. - """ - pass - - -############################################################################### -# pvc router secondary -############################################################################### -@click.command(name='secondary', short_help='Set a router in secondary status.') -@click.argument( - 'router' -) -def router_secondary(router): - """ - Take ROUTER out of primary mode handling gateways and into secondary mode. 
- """ - - zk_conn = pvc_common.startZKConnection(zk_host) - retcode, retmsg = pvc_router.secondary_router(zk_conn, router) - cleanup(retcode, retmsg, zk_conn) - -############################################################################### -# pvc router primary -############################################################################### -@click.command(name='primary', short_help='Set a router in primary status.') -@click.argument( - 'router' -) -def router_primary(router): - """ - Put ROUTER into primary mode handling gateways. - """ - - zk_conn = pvc_common.startZKConnection(zk_host) - retcode, retmsg = pvc_router.primary_router(zk_conn, router) - cleanup(retcode, retmsg, zk_conn) - -############################################################################### -# pvc router info -############################################################################### -@click.command(name='info', short_help='Show details of a router object.') -@click.argument( - 'router' -) -@click.option( - '-l', '--long', 'long_output', is_flag=True, default=False, - help='Display more detailed information.' -) -def router_info(router, long_output): - """ - Show information about router ROUTER. - """ - - zk_conn = pvc_common.startZKConnection(zk_host) - retcode, retmsg = pvc_router.get_info(zk_conn, router, long_output) - cleanup(retcode, retmsg, zk_conn) - -############################################################################### -# pvc router list -############################################################################### -@click.command(name='list', short_help='List all router objects.') -@click.argument( - 'limit', default=None, required=False -) -def router_list(limit): - """ - List all routers in the cluster; optionally only match names matching regex LIMIT. 
- """ - - zk_conn = pvc_common.startZKConnection(zk_host) - retcode, retmsg = pvc_router.get_list(zk_conn, limit) - cleanup(retcode, retmsg, zk_conn) - ############################################################################### # pvc vm ############################################################################### @@ -241,18 +192,18 @@ def cli_vm(): ############################################################################### @click.command(name='define', short_help='Define a new virtual machine from a Libvirt XML file.') @click.option( - '-t', '--hypervisor', 'target_hypervisor', - help='Home hypervisor for this domain; autodetect if unspecified.' + '-n', '--node', 'target_node', + help='Home node for this domain; autodetect if unspecified.' ) @click.option( '-s', '--selector', 'selector', default='mem', show_default=True, type=click.Choice(['mem','load','vcpus','vms']), - help='Method to determine optimal target hypervisor during autodetect.' + help='Method to determine optimal target node during autodetect.' ) @click.argument( 'config', type=click.File() ) -def vm_define(config, target_hypervisor, selector): +def vm_define(config, target_node, selector): """ Define a new virtual machine from Libvirt XML configuration file CONFIG. """ @@ -262,7 +213,7 @@ def vm_define(config, target_hypervisor, selector): config.close() zk_conn = pvc_common.startZKConnection(zk_host) - retcode, retmsg = pvc_vm.define_vm(zk_conn, config_data, target_hypervisor, selector) + retcode, retmsg = pvc_vm.define_vm(zk_conn, config_data, target_node, selector) cleanup(retcode, retmsg, zk_conn) ############################################################################### @@ -391,7 +342,7 @@ def vm_undefine(domain): ) def vm_start(domain): """ - Start virtual machine DOMAIN on its configured hypervisor. DOMAIN may be a UUID or name. + Start virtual machine DOMAIN on its configured node. DOMAIN may be a UUID or name. 
""" # Open a Zookeeper connection @@ -458,22 +409,22 @@ def vm_stop(domain): 'domain' ) @click.option( - '-t', '--hypervisor', 'target_hypervisor', default=None, - help='Target hypervisor to migrate to; autodetect if unspecified.' + '-n', '--node', 'target_node', default=None, + help='Target node to migrate to; autodetect if unspecified.' ) @click.option( '-s', '--selector', 'selector', default='mem', show_default=True, type=click.Choice(['mem','load','vcpus','vms']), - help='Method to determine optimal target hypervisor during autodetect.' + help='Method to determine optimal target node during autodetect.' ) -def vm_move(domain, target_hypervisor, selector): +def vm_move(domain, target_node, selector): """ - Permanently move virtual machine DOMAIN, via live migration if running and possible, to another hypervisor node. DOMAIN may be a UUID or name. + Permanently move virtual machine DOMAIN, via live migration if running and possible, to another node. DOMAIN may be a UUID or name. """ # Open a Zookeeper connection zk_conn = pvc_common.startZKConnection(zk_host) - retcode, retmsg = pvc_vm.move_vm(zk_conn, domain, target_hypervisor, selector) + retcode, retmsg = pvc_vm.move_vm(zk_conn, domain, target_node, selector) cleanup(retcode, retmsg, zk_conn) ############################################################################### @@ -484,26 +435,26 @@ def vm_move(domain, target_hypervisor, selector): 'domain' ) @click.option( - '-t', '--hypervisor', 'target_hypervisor', default=None, - help='Target hypervisor to migrate to; autodetect if unspecified.' + '-n', '--node', 'target_node', default=None, + help='Target node to migrate to; autodetect if unspecified.' ) @click.option( '-s', '--selector', 'selector', default='mem', show_default=True, type=click.Choice(['mem','load','vcpus','vms']), - help='Method to determine optimal target hypervisor during autodetect.' + help='Method to determine optimal target node during autodetect.' 
) @click.option( '-f', '--force', 'force_migrate', is_flag=True, default=False, help='Force migrate an already migrated VM.' ) -def vm_migrate(domain, target_hypervisor, selector, force_migrate): +def vm_migrate(domain, target_node, selector, force_migrate): """ - Temporarily migrate running virtual machine DOMAIN, via live migration if possible, to another hypervisor node. DOMAIN may be a UUID or name. If DOMAIN is not running, it will be started on the target node. + Temporarily migrate running virtual machine DOMAIN, via live migration if possible, to another node. DOMAIN may be a UUID or name. If DOMAIN is not running, it will be started on the target node. """ # Open a Zookeeper connection zk_conn = pvc_common.startZKConnection(zk_host) - retcode, retmsg = pvc_vm.migrate_vm(zk_conn, domain, target_hypervisor, selector, force_migrate) + retcode, retmsg = pvc_vm.migrate_vm(zk_conn, domain, target_node, selector, force_migrate) cleanup(retcode, retmsg, zk_conn) ############################################################################### @@ -515,7 +466,7 @@ def vm_migrate(domain, target_hypervisor, selector, force_migrate): ) def vm_unmigrate(domain): """ - Restore previously migrated virtual machine DOMAIN, via live migration if possible, to its original hypervisor node. DOMAIN may be a UUID or name. If DOMAIN is not running, it will be started on the target node. + Restore previously migrated virtual machine DOMAIN, via live migration if possible, to its original node. DOMAIN may be a UUID or name. If DOMAIN is not running, it will be started on the target node. """ # Open a Zookeeper connection @@ -552,16 +503,16 @@ def vm_info(domain, long_output): 'limit', default=None, required=False ) @click.option( - '-t', '--hypervisor', 'hypervisor', default=None, - help='Limit list to this hypervisor.' + '-n', '--node', 'node', default=None, + help='Limit list to this node.' 
) -def vm_list(hypervisor, limit): +def vm_list(node, limit): """ List all virtual machines in the cluster; optionally only match names matching regex LIMIT. """ zk_conn = pvc_common.startZKConnection(zk_host) - retcode, retmsg = pvc_vm.get_list(zk_conn, hypervisor, limit) + retcode, retmsg = pvc_vm.get_list(zk_conn, node, limit) cleanup(retcode, retmsg, zk_conn) ############################################################################### @@ -877,17 +828,19 @@ def init_cluster(): # Destroy the existing data try: + zk_conn.delete('/networks', recursive=True) zk_conn.delete('/domains', recursive=True) - zk_conn.delete('nodes', recursive=True) + zk_conn.delete('/nodes', recursive=True) + zk_conn.delete('/primary_node', recursive=True) except: pass # Create the root keys transaction = zk_conn.transaction() + transaction.create('/networks', ''.encode('ascii')) transaction.create('/domains', ''.encode('ascii')) transaction.create('/nodes', ''.encode('ascii')) - transaction.create('/routers', ''.encode('ascii')) - transaction.create('/networks', ''.encode('ascii')) + transaction.create('/primary_node', 'none'.encode('ascii')) transaction.commit() # Close the Zookeeper connection @@ -920,17 +873,14 @@ def cli(_zk_host): # # Click command tree # +cli_node.add_command(node_secondary) +cli_node.add_command(node_primary) cli_node.add_command(node_flush) cli_node.add_command(node_ready) cli_node.add_command(node_unflush) cli_node.add_command(node_info) cli_node.add_command(node_list) -cli_router.add_command(router_secondary) -cli_router.add_command(router_primary) -cli_router.add_command(router_info) -cli_router.add_command(router_list) - cli_vm.add_command(vm_define) cli_vm.add_command(vm_modify) cli_vm.add_command(vm_undefine) @@ -960,7 +910,6 @@ net_dhcp_static.add_command(net_dhcp_static_remove) net_dhcp_static.add_command(net_dhcp_static_list) cli.add_command(cli_node) -cli.add_command(cli_router) cli.add_command(cli_vm) cli.add_command(cli_network) 
cli.add_command(init_cluster) diff --git a/client-common/client_lib/ansiiprint.py b/client-common/ansiiprint.py similarity index 100% rename from client-common/client_lib/ansiiprint.py rename to client-common/ansiiprint.py diff --git a/client-common/client_lib/router.py b/client-common/client_lib/router.py deleted file mode 100644 index a50bea03..00000000 --- a/client-common/client_lib/router.py +++ /dev/null @@ -1,232 +0,0 @@ -#!/usr/bin/env python3 - -# router.py - PVC client function library, router management -# Part of the Parallel Virtual Cluster (PVC) system -# -# Copyright (C) 2018 Joshua M. Boniface -# -# This program is free software: you can redistribute it and/or modify -# it under the terms of the GNU General Public License as published by -# the Free Software Foundation, either version 3 of the License, or -# (at your option) any later version. -# -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. -# -# You should have received a copy of the GNU General Public License -# along with this program. If not, see . 
-# -############################################################################### - -import os -import socket -import time -import uuid -import re -import tempfile -import subprocess -import difflib -import colorama -import click -import lxml.objectify -import configparser -import kazoo.client - -import client_lib.ansiiprint as ansiiprint -import client_lib.zkhandler as zkhandler -import client_lib.common as common - -def getInformationFromRouter(zk_conn, router_name, long_output): - router_daemon_state = zk_conn.get('/routers/{}/daemonstate'.format(router_name))[0].decode('ascii') - router_network_state = zk_conn.get('/routers/{}/networkstate'.format(router_name))[0].decode('ascii') - router_cpu_count = zk_conn.get('/routers/{}/staticdata'.format(router_name))[0].decode('ascii').split()[0] - router_cpu_load = zk_conn.get('/routers/{}/cpuload'.format(router_name))[0].decode('ascii').split()[0] - router_kernel = zk_conn.get('/routers/{}/staticdata'.format(router_name))[0].decode('ascii').split()[1] - router_os = zk_conn.get('/routers/{}/staticdata'.format(router_name))[0].decode('ascii').split()[2] - router_arch = zk_conn.get('/routers/{}/staticdata'.format(router_name))[0].decode('ascii').split()[3] - - if router_daemon_state == 'run': - daemon_state_colour = ansiiprint.green() - elif router_daemon_state == 'stop': - daemon_state_colour = ansiiprint.red() - elif router_daemon_state == 'init': - daemon_state_colour = ansiiprint.yellow() - elif router_daemon_state == 'dead': - daemon_state_colour = ansiiprint.red() + ansiiprint.bold() - else: - daemon_state_colour = ansiiprint.blue() - - if router_network_state == 'primary': - network_state_colour = ansiiprint.green() - else: - network_state_colour = ansiiprint.blue() - - # Format a nice output; do this line-by-line then concat the elements at the end - ainformation = [] - ainformation.append('{}Router information:{}'.format(ansiiprint.bold(), ansiiprint.end())) - ainformation.append('') - # Basic information - 
ainformation.append('{}Name:{} {}'.format(ansiiprint.purple(), ansiiprint.end(), router_name)) - ainformation.append('{}Daemon State:{} {}{}{}'.format(ansiiprint.purple(), ansiiprint.end(), daemon_state_colour, router_daemon_state, ansiiprint.end())) - ainformation.append('{}Network State:{} {}{}{}'.format(ansiiprint.purple(), ansiiprint.end(), network_state_colour, router_network_state, ansiiprint.end())) - ainformation.append('{}CPUs:{} {}'.format(ansiiprint.purple(), ansiiprint.end(), router_cpu_count)) - ainformation.append('{}Load:{} {}'.format(ansiiprint.purple(), ansiiprint.end(), router_cpu_load)) - if long_output == True: - ainformation.append('') - ainformation.append('{}Architecture:{} {}'.format(ansiiprint.purple(), ansiiprint.end(), router_arch)) - ainformation.append('{}Operating System:{} {}'.format(ansiiprint.purple(), ansiiprint.end(), router_os)) - ainformation.append('{}Kernel Version:{} {}'.format(ansiiprint.purple(), ansiiprint.end(), router_kernel)) - - # Join it all together - information = '\n'.join(ainformation) - return information - -# -# Direct Functions -# -def secondary_router(zk_conn, router): - # Verify router is valid - if not common.verifyRouter(zk_conn, router): - return False, 'ERROR: No router named "{}" is present in the cluster.'.format(router) - - # Get current state - current_state = zkhandler.readdata(zk_conn, '/routers/{}/networkstate'.format(router)) - if current_state == 'primary': - click.echo('Setting router {} in secondary mode.'.format(router)) - zkhandler.writedata(zk_conn, { - '/routers': 'none' - }) - else: - click.echo('Router {} is already in secondary mode.'.format(router)) - - return True, '' - -def primary_router(zk_conn, router): - # Verify router is valid - if not common.verifyRouter(zk_conn, router): - return False, 'ERROR: No router named "{}" is present in the cluster.'.format(router) - - # Get current state - current_state = zkhandler.readdata(zk_conn, '/routers/{}/networkstate'.format(router)) - if 
current_state == 'secondary': - click.echo('Setting router {} in primary mode.'.format(router)) - zkhandler.writedata(zk_conn, { - '/routers': router - }) - else: - click.echo('Router {} is already in primary mode.'.format(router)) - - return True, '' - -def get_info(zk_conn, router, long_output): - # Verify router is valid - if not common.verifyRouter(zk_conn, router): - return False, 'ERROR: No router named "{}" is present in the cluster.'.format(router) - - # Get information about router in a pretty format - information = getInformationFromRouter(zk_conn, router, long_output) - click.echo(information) - return True, '' - -def get_list(zk_conn, limit): - # Match our limit - router_list = [] - full_router_list = zk_conn.get_children('/routers') - for router in full_router_list: - if limit != None: - try: - # Implcitly assume fuzzy limits - if re.match('\^.*', limit) == None: - limit = '.*' + limit - if re.match('.*\$', limit) == None: - limit = limit + '.*' - - if re.match(limit, router) != None: - router_list.append(router) - except Exception as e: - return False, 'Regex Error: {}'.format(e) - else: - router_list.append(router) - - router_list_output = [] - router_daemon_state = {} - router_network_state = {} - router_cpu_count = {} - router_cpu_load = {} - - # Gather information for printing - for router_name in router_list: - router_daemon_state[router_name] = zk_conn.get('/routers/{}/daemonstate'.format(router_name))[0].decode('ascii') - router_network_state[router_name] = zk_conn.get('/routers/{}/networkstate'.format(router_name))[0].decode('ascii') - router_cpu_count[router_name] = zk_conn.get('/routers/{}/staticdata'.format(router_name))[0].decode('ascii').split()[0] - router_cpu_load[router_name] = zk_conn.get('/routers/{}/cpuload'.format(router_name))[0].decode('ascii').split()[0] - - # Determine optimal column widths - # Dynamic columns: router_name - router_name_length = 0 - for router_name in router_list: - # router_name column - _router_name_length = 
len(router_name) + 1 - if _router_name_length > router_name_length: - router_name_length = _router_name_length - - # Format the string (header) - router_list_output.append( - '{bold}{router_name: <{router_name_length}} \ -State: {daemon_state_colour}{router_daemon_state: <7}{end_colour} {network_state_colour}{router_network_state: <10}{end_colour} \ -Resources: {router_cpu_count: <5} {router_cpu_load: <6}{end_bold}'.format( - router_name_length=router_name_length, - bold=ansiiprint.bold(), - end_bold=ansiiprint.end(), - daemon_state_colour='', - network_state_colour='', - end_colour='', - router_name='Name', - router_daemon_state='Daemon', - router_network_state='Network', - router_cpu_count='CPUs', - router_cpu_load='Load' - ) - ) - - # Format the string (elements) - for router_name in router_list: - if router_daemon_state[router_name] == 'run': - daemon_state_colour = ansiiprint.green() - elif router_daemon_state[router_name] == 'stop': - daemon_state_colour = ansiiprint.red() - elif router_daemon_state[router_name] == 'init': - daemon_state_colour = ansiiprint.yellow() - elif router_daemon_state[router_name] == 'dead': - daemon_state_colour = ansiiprint.red() + ansiiprint.bold() - else: - daemon_state_colour = ansiiprint.blue() - - if router_network_state[router_name] == 'primary': - network_state_colour = ansiiprint.green() - else: - network_state_colour = ansiiprint.blue() - - router_list_output.append( - '{bold}{router_name: <{router_name_length}} \ - {daemon_state_colour}{router_daemon_state: <7}{end_colour} {network_state_colour}{router_network_state: <10}{end_colour} \ - {router_cpu_count: <5} {router_cpu_load: <6}{end_bold}'.format( - router_name_length=router_name_length, - bold='', - end_bold='', - daemon_state_colour=daemon_state_colour, - network_state_colour=network_state_colour, - end_colour=ansiiprint.end(), - router_name=router_name, - router_daemon_state=router_daemon_state[router_name], - router_network_state=router_network_state[router_name], - 
router_cpu_count=router_cpu_count[router_name], - router_cpu_load=router_cpu_load[router_name] - ) - ) - - click.echo('\n'.join(sorted(router_list_output))) - - return True, '' diff --git a/client-common/client_lib/common.py b/client-common/common.py similarity index 72% rename from client-common/client_lib/common.py rename to client-common/common.py index 1fd14077..8c562ecb 100644 --- a/client-common/client_lib/common.py +++ b/client-common/common.py @@ -78,9 +78,9 @@ def getDomainMainDetails(parsed_xml): dmemory = str(parsed_xml.memory) dmemory_unit = str(parsed_xml.memory.attrib['unit']) if dmemory_unit == 'KiB': - dmemory = str(int(dmemory) * 1024) + dmemory = int(int(dmemory) / 1024) elif dmemory_unit == 'GiB': - dmemory = str(int(dmemory) / 1024) + dmemory = int(int(dmemory) * 1024) dvcpu = str(parsed_xml.vcpu) try: dvcputopo = '{}/{}/{}'.format(parsed_xml.cpu.topology.attrib['sockets'], parsed_xml.cpu.topology.attrib['cores'], parsed_xml.cpu.topology.attrib['threads']) @@ -185,106 +185,106 @@ def verifyRouter(zk_conn, router): # -# Get the list of valid target hypervisors +# Get the list of valid target nodes # -def getHypervisors(zk_conn, dom_uuid): - valid_hypervisor_list = [] - full_hypervisor_list = zk_conn.get_children('/nodes') +def getNodes(zk_conn, dom_uuid): + valid_node_list = [] + full_node_list = zk_conn.get_children('/nodes') try: - current_hypervisor = zk_conn.get('/domains/{}/hypervisor'.format(dom_uuid))[0].decode('ascii') + current_node = zk_conn.get('/domains/{}/node'.format(dom_uuid))[0].decode('ascii') except: - current_hypervisor = None + current_node = None - for hypervisor in full_hypervisor_list: - daemon_state = zk_conn.get('/nodes/{}/daemonstate'.format(hypervisor))[0].decode('ascii') - domain_state = zk_conn.get('/nodes/{}/domainstate'.format(hypervisor))[0].decode('ascii') + for node in full_node_list: + daemon_state = zk_conn.get('/nodes/{}/daemonstate'.format(node))[0].decode('ascii') + domain_state = 
zk_conn.get('/nodes/{}/domainstate'.format(node))[0].decode('ascii') - if hypervisor == current_hypervisor: + if node == current_node: continue if daemon_state != 'run' or domain_state != 'ready': continue - valid_hypervisor_list.append(hypervisor) + valid_node_list.append(node) - return valid_hypervisor_list + return valid_node_list # # Find a migration target # -def findTargetHypervisor(zk_conn, search_field, dom_uuid): +def findTargetNode(zk_conn, search_field, dom_uuid): if search_field == 'mem': - return findTargetHypervisorMem(zk_conn, dom_uuid) + return findTargetNodeMem(zk_conn, dom_uuid) if search_field == 'load': - return findTargetHypervisorLoad(zk_conn, dom_uuid) + return findTargetNodeLoad(zk_conn, dom_uuid) if search_field == 'vcpus': - return findTargetHypervisorVCPUs(zk_conn, dom_uuid) + return findTargetNodeVCPUs(zk_conn, dom_uuid) if search_field == 'vms': - return findTargetHypervisorVMs(zk_conn, dom_uuid) + return findTargetNodeVMs(zk_conn, dom_uuid) return None # via free memory (relative to allocated memory) -def findTargetHypervisorMem(zk_conn, dom_uuid): +def findTargetNodeMem(zk_conn, dom_uuid): most_allocfree = 0 - target_hypervisor = None + target_node = None - hypervisor_list = getHypervisors(zk_conn, dom_uuid) - for hypervisor in hypervisor_list: - memalloc = int(zk_conn.get('/nodes/{}/memalloc'.format(hypervisor))[0].decode('ascii')) - memused = int(zk_conn.get('/nodes/{}/memused'.format(hypervisor))[0].decode('ascii')) - memfree = int(zk_conn.get('/nodes/{}/memfree'.format(hypervisor))[0].decode('ascii')) + node_list = getNodes(zk_conn, dom_uuid) + for node in node_list: + memalloc = int(zk_conn.get('/nodes/{}/memalloc'.format(node))[0].decode('ascii')) + memused = int(zk_conn.get('/nodes/{}/memused'.format(node))[0].decode('ascii')) + memfree = int(zk_conn.get('/nodes/{}/memfree'.format(node))[0].decode('ascii')) memtotal = memused + memfree allocfree = memtotal - memalloc if allocfree > most_allocfree: most_allocfree = allocfree - 
target_hypervisor = hypervisor + target_node = node - return target_hypervisor + return target_node # via load average -def findTargetHypervisorLoad(zk_conn, dom_uuid): +def findTargetNodeLoad(zk_conn, dom_uuid): least_load = 9999 - target_hypervisor = None + target_node = None - hypervisor_list = getHypervisors(zk_conn, dom_uuid) - for hypervisor in hypervisor_list: - load = float(zk_conn.get('/nodes/{}/cpuload'.format(hypervisor))[0].decode('ascii')) + node_list = getNodes(zk_conn, dom_uuid) + for node in node_list: + load = float(zk_conn.get('/nodes/{}/cpuload'.format(node))[0].decode('ascii')) if load < least_load: least_load = load - target_hypervisor = hypervisor + target_node = node - return target_hypervisor + return target_node # via total vCPUs -def findTargetHypervisorVCPUs(zk_conn, dom_uuid): +def findTargetNodeVCPUs(zk_conn, dom_uuid): least_vcpus = 9999 - target_hypervisor = None + target_node = None - hypervisor_list = getHypervisors(zk_conn, dom_uuid) - for hypervisor in hypervisor_list: - vcpus = int(zk_conn.get('/nodes/{}/vcpualloc'.format(hypervisor))[0].decode('ascii')) + node_list = getNodes(zk_conn, dom_uuid) + for node in node_list: + vcpus = int(zk_conn.get('/nodes/{}/vcpualloc'.format(node))[0].decode('ascii')) if vcpus < least_vcpus: least_vcpus = vcpus - target_hypervisor = hypervisor + target_node = node - return target_hypervisor + return target_node # via total VMs -def findTargetHypervisorVMs(zk_conn, dom_uuid): +def findTargetNodeVMs(zk_conn, dom_uuid): least_vms = 9999 - target_hypervisor = None + target_node = None - hypervisor_list = getHypervisors(zk_conn, dom_uuid) - for hypervisor in hypervisor_list: - vms = int(zk_conn.get('/nodes/{}/domainscount'.format(hypervisor))[0].decode('ascii')) + node_list = getNodes(zk_conn, dom_uuid) + for node in node_list: + vms = int(zk_conn.get('/nodes/{}/domainscount'.format(node))[0].decode('ascii')) if vms < least_vms: least_vms = vms - target_hypervisor = hypervisor + target_node = node - 
return target_hypervisor + return target_node diff --git a/client-common/client_lib/network.py b/client-common/network.py similarity index 100% rename from client-common/client_lib/network.py rename to client-common/network.py diff --git a/client-common/client_lib/node.py b/client-common/node.py similarity index 57% rename from client-common/client_lib/node.py rename to client-common/node.py index 3878244d..d4e9fccb 100644 --- a/client-common/client_lib/node.py +++ b/client-common/node.py @@ -35,30 +35,26 @@ import configparser import kazoo.client import client_lib.ansiiprint as ansiiprint +import client_lib.zkhandler as zkhandler import client_lib.common as common import client_lib.vm as pvc_vm def getInformationFromNode(zk_conn, node_name, long_output): - node_daemon_state = zk_conn.get('/nodes/{}/daemonstate'.format(node_name))[0].decode('ascii') - node_domain_state = zk_conn.get('/nodes/{}/domainstate'.format(node_name))[0].decode('ascii') - node_cpu_count = zk_conn.get('/nodes/{}/staticdata'.format(node_name))[0].decode('ascii').split()[0] - node_kernel = zk_conn.get('/nodes/{}/staticdata'.format(node_name))[0].decode('ascii').split()[1] - node_os = zk_conn.get('/nodes/{}/staticdata'.format(node_name))[0].decode('ascii').split()[2] - node_arch = zk_conn.get('/nodes/{}/staticdata'.format(node_name))[0].decode('ascii').split()[3] - node_mem_used = zk_conn.get('/nodes/{}/memused'.format(node_name))[0].decode('ascii') - node_mem_free = zk_conn.get('/nodes/{}/memfree'.format(node_name))[0].decode('ascii') - node_mem_total = int(node_mem_used) + int(node_mem_free) - node_load = zk_conn.get('/nodes/{}/cpuload'.format(node_name))[0].decode('ascii') - node_domains_count = zk_conn.get('/nodes/{}/domainscount'.format(node_name))[0].decode('ascii') - node_running_domains = zk_conn.get('/nodes/{}/runningdomains'.format(node_name))[0].decode('ascii').split() - node_mem_allocated = 0 - for domain in node_running_domains: - try: - parsed_xml = common.getDomainXML(zk_conn, 
domain) - duuid, dname, ddescription, dmemory, dvcpu, dvcputopo = common.getDomainMainDetails(parsed_xml) - node_mem_allocated += int(dmemory) - except AttributeError: - click.echo('Error: Domain {} does not exist.'.format(domain)) + node_daemon_state = zkhandler.readdata(zk_conn, '/nodes/{}/daemonstate'.format(node_name)) + node_router_state = zkhandler.readdata(zk_conn, '/nodes/{}/routerstate'.format(node_name)) + node_domain_state = zkhandler.readdata(zk_conn, '/nodes/{}/domainstate'.format(node_name)) + node_static_data = zkhandler.readdata(zk_conn, '/nodes/{}/staticdata'.format(node_name)).split() + node_cpu_count = node_static_data[0] + node_kernel = node_static_data[1] + node_os = node_static_data[2] + node_arch = node_static_data[3] + node_mem_allocated = int(zkhandler.readdata(zk_conn, '/nodes/{}/memalloc'.format(node_name))) + node_mem_used = int(zkhandler.readdata(zk_conn, '/nodes/{}/memused'.format(node_name))) + node_mem_free = int(zkhandler.readdata(zk_conn, '/nodes/{}/memfree'.format(node_name))) + node_mem_total = node_mem_used + node_mem_free + node_load = zkhandler.readdata(zk_conn, '/nodes/{}/cpuload'.format(node_name)) + node_domains_count = zkhandler.readdata(zk_conn, '/nodes/{}/domainscount'.format(node_name)) + node_running_domains = zkhandler.readdata(zk_conn, '/nodes/{}/runningdomains'.format(node_name)).split() if node_daemon_state == 'run': daemon_state_colour = ansiiprint.green() @@ -71,6 +67,13 @@ def getInformationFromNode(zk_conn, node_name, long_output): else: daemon_state_colour = ansiiprint.blue() + if node_router_state == 'primary': + router_state_colour = ansiiprint.green() + elif node_router_state == 'secondary': + router_state_colour = ansiiprint.blue() + else: + router_state_colour = ansiiprint.purple() + if node_domain_state == 'ready': domain_state_colour = ansiiprint.green() else: @@ -83,6 +86,7 @@ def getInformationFromNode(zk_conn, node_name, long_output): # Basic information ainformation.append('{}Name:{} 
{}'.format(ansiiprint.purple(), ansiiprint.end(), node_name)) ainformation.append('{}Daemon State:{} {}{}{}'.format(ansiiprint.purple(), ansiiprint.end(), daemon_state_colour, node_daemon_state, ansiiprint.end())) + ainformation.append('{}Router State:{} {}{}{}'.format(ansiiprint.purple(), ansiiprint.end(), router_state_colour, node_router_state, ansiiprint.end())) ainformation.append('{}Domain State:{} {}{}{}'.format(ansiiprint.purple(), ansiiprint.end(), domain_state_colour, node_domain_state, ansiiprint.end())) ainformation.append('{}Active VM Count:{} {}'.format(ansiiprint.purple(), ansiiprint.end(), node_domains_count)) if long_output == True: @@ -105,6 +109,50 @@ def getInformationFromNode(zk_conn, node_name, long_output): # # Direct Functions # +def secondary_node(zk_conn, node): + # Verify node is valid + if not common.verifyNode(zk_conn, node): + return False, 'ERROR: No node named "{}" is present in the cluster.'.format(node) + + # Ensure node is a coordinator + daemon_mode = zkhandler.readdata(zk_conn, '/nodes/{}/daemonmode'.format(node)) + if daemon_mode == 'hypervisor': + return False, 'ERROR: Cannot change router mode on non-coordinator node "{}"'.format(node) + + # Get current state + current_state = zkhandler.readdata(zk_conn, '/nodes/{}/routerstate'.format(node)) + if current_state == 'primary': + click.echo('Setting node {} in secondary router mode.'.format(node)) + zkhandler.writedata(zk_conn, { + '/primary_node': 'none' + }) + else: + click.echo('Node {} is already in secondary router mode.'.format(node)) + + return True, '' + +def primary_node(zk_conn, node): + # Verify node is valid + if not common.verifyNode(zk_conn, node): + return False, 'ERROR: No node named "{}" is present in the cluster.'.format(node) + + # Ensure node is a coordinator + daemon_mode = zkhandler.readdata(zk_conn, '/nodes/{}/daemonmode'.format(node)) + if daemon_mode == 'hypervisor': + return False, 'ERROR: Cannot change router mode on non-coordinator node 
"{}"'.format(node) + + # Get current state + current_state = zkhandler.readdata(zk_conn, '/nodes/{}/routerstate'.format(node)) + if current_state == 'secondary': + click.echo('Setting node {} in primary router mode.'.format(node)) + zkhandler.writedata(zk_conn, { + '/primary_node': node + }) + else: + click.echo('Node {} is already in primary router mode.'.format(node)) + + return True, '' + def flush_node(zk_conn, node, wait): # Verify node is valid if not common.verifyNode(zk_conn, node): @@ -113,14 +161,14 @@ def flush_node(zk_conn, node, wait): click.echo('Flushing hypervisor {} of running VMs.'.format(node)) # Add the new domain to Zookeeper - transaction = zk_conn.transaction() - transaction.set_data('/nodes/{}/domainstate'.format(node), 'flush'.encode('ascii')) - results = transaction.commit() + zkhandler.writedata(zk_conn, { + '/nodes/{}/domainstate'.format(node): 'flush' + }) if wait == True: while True: time.sleep(1) - node_state = zk_conn.get('/nodes/{}/domainstate'.format(node))[0].decode('ascii') + node_state = zkhandler.readdata(zk_conn, '/nodes/{}/domainstate'.format(node)) if node_state == "flushed": break @@ -134,9 +182,9 @@ def ready_node(zk_conn, node): click.echo('Restoring hypervisor {} to active service.'.format(node)) # Add the new domain to Zookeeper - transaction = zk_conn.transaction() - transaction.set_data('/nodes/{}/domainstate'.format(node), 'unflush'.encode('ascii')) - results = transaction.commit() + zkhandler.writedata(zk_conn, { + '/nodes/{}/domainstate'.format(node): 'unflush' + }) return True, '' @@ -186,6 +234,7 @@ def get_list(zk_conn, limit): node_list_output = [] node_daemon_state = {} + node_router_state = {} node_domain_state = {} node_cpu_count = {} node_mem_used = {} @@ -198,55 +247,69 @@ def get_list(zk_conn, limit): # Gather information for printing for node_name in node_list: - node_daemon_state[node_name] = zk_conn.get('/nodes/{}/daemonstate'.format(node_name))[0].decode('ascii') - node_domain_state[node_name] = 
zk_conn.get('/nodes/{}/domainstate'.format(node_name))[0].decode('ascii') - node_cpu_count[node_name] = zk_conn.get('/nodes/{}/staticdata'.format(node_name))[0].decode('ascii').split()[0] - node_mem_used[node_name] = zk_conn.get('/nodes/{}/memused'.format(node_name))[0].decode('ascii') - node_mem_free[node_name] = zk_conn.get('/nodes/{}/memfree'.format(node_name))[0].decode('ascii') - node_mem_total[node_name] = int(node_mem_used[node_name]) + int(node_mem_free[node_name]) - node_load[node_name] = zk_conn.get('/nodes/{}/cpuload'.format(node_name))[0].decode('ascii') - node_domains_count[node_name] = zk_conn.get('/nodes/{}/domainscount'.format(node_name))[0].decode('ascii') - node_running_domains[node_name] = zk_conn.get('/nodes/{}/runningdomains'.format(node_name))[0].decode('ascii').split() - node_mem_allocated[node_name] = 0 - for domain in node_running_domains[node_name]: - try: - parsed_xml = common.getDomainXML(zk_conn, domain) - duuid, dname, ddescription, dmemory, dvcpu, dvcputopo = common.getDomainMainDetails(parsed_xml) - node_mem_allocated[node_name] += int(dmemory) - except AttributeError: - click.echo('Error: Domain {} does not exist.'.format(domain)) + node_daemon_state[node_name] = zkhandler.readdata(zk_conn, '/nodes/{}/daemonstate'.format(node_name)) + node_router_state[node_name] = zkhandler.readdata(zk_conn, '/nodes/{}/routerstate'.format(node_name)) + node_domain_state[node_name] = zkhandler.readdata(zk_conn, '/nodes/{}/domainstate'.format(node_name)) + node_cpu_count[node_name] = zkhandler.readdata(zk_conn, '/nodes/{}/staticdata'.format(node_name)).split()[0] + node_mem_allocated[node_name] = int(zkhandler.readdata(zk_conn, '/nodes/{}/memalloc'.format(node_name))) + node_mem_used[node_name] = int(zkhandler.readdata(zk_conn, '/nodes/{}/memused'.format(node_name))) + node_mem_free[node_name] = int(zkhandler.readdata(zk_conn, '/nodes/{}/memfree'.format(node_name))) + node_mem_total[node_name] = node_mem_used[node_name] + node_mem_free[node_name] + 
node_load[node_name] = zkhandler.readdata(zk_conn, '/nodes/{}/cpuload'.format(node_name)) + node_domains_count[node_name] = zkhandler.readdata(zk_conn, '/nodes/{}/domainscount'.format(node_name)) + node_running_domains[node_name] = zkhandler.readdata(zk_conn, '/nodes/{}/runningdomains'.format(node_name)).split() # Determine optimal column widths - # Dynamic columns: node_name, hypervisor, migrated - node_name_length = 0 + # Dynamic columns: node_name, daemon_state, network_state, domain_state, load + node_name_length = 5 + daemon_state_length = 7 + router_state_length = 7 + domain_state_length = 7 for node_name in node_list: # node_name column _node_name_length = len(node_name) + 1 if _node_name_length > node_name_length: node_name_length = _node_name_length + # daemon_state column + _daemon_state_length = len(node_daemon_state[node_name]) + 1 + if _daemon_state_length > daemon_state_length: + daemon_state_length = _daemon_state_length + # router_state column + _router_state_length = len(node_router_state[node_name]) + 1 + if _router_state_length > router_state_length: + router_state_length = _router_state_length + # domain_state column + _domain_state_length = len(node_domain_state[node_name]) + 1 + if _domain_state_length > domain_state_length: + domain_state_length = _domain_state_length # Format the string (header) node_list_output.append( '{bold}{node_name: <{node_name_length}} \ -State: {daemon_state_colour}{node_daemon_state: <7}{end_colour} {domain_state_colour}{node_domain_state: <8}{end_colour} \ +State: {daemon_state_colour}{node_daemon_state: <{daemon_state_length}}{end_colour} {router_state_colour}{node_router_state: <{router_state_length}}{end_colour} {domain_state_colour}{node_domain_state: <{domain_state_length}}{end_colour} \ Resources: {node_domains_count: <4} {node_cpu_count: <5} {node_load: <6} \ RAM (MiB): {node_mem_total: <6} {node_mem_used: <6} {node_mem_free: <6} {node_mem_allocated: <6}{end_bold}'.format( node_name_length=node_name_length, 
+ daemon_state_length=daemon_state_length, + router_state_length=router_state_length, + domain_state_length=domain_state_length, bold=ansiiprint.bold(), end_bold=ansiiprint.end(), daemon_state_colour='', + router_state_colour='', domain_state_colour='', end_colour='', node_name='Name', node_daemon_state='Daemon', - node_domain_state='Domains', + node_router_state='Router', + node_domain_state='Domain', node_domains_count='VMs', node_cpu_count='CPUs', node_load='Load', node_mem_total='Total', node_mem_used='Used', node_mem_free='Free', - node_mem_allocated='VMs', + node_mem_allocated='VMs' ) ) @@ -263,7 +326,14 @@ RAM (MiB): {node_mem_total: <6} {node_mem_used: <6} {node_mem_free: <6} {node_me else: daemon_state_colour = ansiiprint.blue() - if node_mem_allocated[node_name] >= node_mem_total[node_name]: + if node_router_state[node_name] == 'primary': + router_state_colour = ansiiprint.green() + elif node_router_state[node_name] == 'secondary': + router_state_colour = ansiiprint.blue() + else: + router_state_colour = ansiiprint.purple() + + if node_mem_allocated[node_name] != 0 and node_mem_allocated[node_name] >= node_mem_total[node_name]: node_domain_state[node_name] = 'overprov' domain_state_colour = ansiiprint.yellow() elif node_domain_state[node_name] == 'ready': @@ -273,17 +343,22 @@ RAM (MiB): {node_mem_total: <6} {node_mem_used: <6} {node_mem_free: <6} {node_me node_list_output.append( '{bold}{node_name: <{node_name_length}} \ - {daemon_state_colour}{node_daemon_state: <7}{end_colour} {domain_state_colour}{node_domain_state: <8}{end_colour} \ + {daemon_state_colour}{node_daemon_state: <{daemon_state_length}}{end_colour} {router_state_colour}{node_router_state: <{router_state_length}}{end_colour} {domain_state_colour}{node_domain_state: <{domain_state_length}}{end_colour} \ {node_domains_count: <4} {node_cpu_count: <5} {node_load: <6} \ {node_mem_total: <6} {node_mem_used: <6} {node_mem_free: <6} {node_mem_allocated: <6}{end_bold}'.format( 
node_name_length=node_name_length, + daemon_state_length=daemon_state_length, + router_state_length=router_state_length, + domain_state_length=domain_state_length, bold='', end_bold='', daemon_state_colour=daemon_state_colour, + router_state_colour=router_state_colour, domain_state_colour=domain_state_colour, end_colour=ansiiprint.end(), node_name=node_name, node_daemon_state=node_daemon_state[node_name], + node_router_state=node_router_state[node_name], node_domain_state=node_domain_state[node_name], node_domains_count=node_domains_count[node_name], node_cpu_count=node_cpu_count[node_name], diff --git a/client-common/client_lib/vm.py b/client-common/vm.py similarity index 78% rename from client-common/client_lib/vm.py rename to client-common/vm.py index c6f14aea..157e5082 100644 --- a/client-common/client_lib/vm.py +++ b/client-common/vm.py @@ -44,25 +44,22 @@ def getInformationFromXML(zk_conn, uuid, long_output): # Obtain the contents of the XML from Zookeeper try: dstate = zk_conn.get('/domains/{}/state'.format(uuid))[0].decode('ascii') - dhypervisor = zk_conn.get('/domains/{}/hypervisor'.format(uuid))[0].decode('ascii') - dlasthypervisor = zk_conn.get('/domains/{}/lasthypervisor'.format(uuid))[0].decode('ascii') + dnode = zk_conn.get('/domains/{}/node'.format(uuid))[0].decode('ascii') + dlastnode = zk_conn.get('/domains/{}/lastnode'.format(uuid))[0].decode('ascii') except: return None - if dlasthypervisor == '': - dlasthypervisor = 'N/A' + if dlastnode == '': + dlastnode = 'N/A' - try: - parsed_xml = common.getDomainXML(zk_conn, uuid) - duuid, dname, ddescription, dmemory, dvcpu, dvcputopo = common.getDomainMainDetails(parsed_xml) - except AttributeError: - click.echo('Error: Domain {} does not exist.'.format(domain)) + parsed_xml = common.getDomainXML(zk_conn, uuid) + duuid, dname, ddescription, dmemory, dvcpu, dvcputopo = common.getDomainMainDetails(parsed_xml) + dnets = common.getDomainNetworks(parsed_xml) if long_output == True: dtype, darch, dmachine, 
dconsole, demulator = common.getDomainExtraDetails(parsed_xml) dfeatures = common.getDomainCPUFeatures(parsed_xml) ddisks = common.getDomainDisks(parsed_xml) - dnets = common.getDomainNetworks(parsed_xml) dcontrollers = common.getDomainControllers(parsed_xml) # Format a nice output; do this line-by-line then concat the elements at the end @@ -98,8 +95,17 @@ def getInformationFromXML(zk_conn, uuid, long_output): 'unmigrate': ansiiprint.blue() } ainformation.append('{}State:{} {}{}{}'.format(ansiiprint.purple(), ansiiprint.end(), dstate_colour[dstate], dstate, ansiiprint.end())) - ainformation.append('{}Active Hypervisor:{} {}'.format(ansiiprint.purple(), ansiiprint.end(), dhypervisor)) - ainformation.append('{}Last Hypervisor:{} {}'.format(ansiiprint.purple(), ansiiprint.end(), dlasthypervisor)) + ainformation.append('{}Current Node:{} {}'.format(ansiiprint.purple(), ansiiprint.end(), dnode)) + ainformation.append('{}Previous Node:{} {}'.format(ansiiprint.purple(), ansiiprint.end(), dlastnode)) + + # Network list + net_list = [] + for net in dnets: + # Split out just the numerical (VNI) part of the brXXXX name + net_vni = re.findall(r'\d+', net['source'])[0] + net_list.append(net_vni) + ainformation.append('') + ainformation.append('{}Networks:{} {}'.format(ansiiprint.purple(), ansiiprint.end(), ', '.join(net_list))) if long_output == True: # Disk list @@ -112,7 +118,6 @@ def getInformationFromXML(zk_conn, uuid, long_output): ainformation.append('{0}Disks:{1} {2}ID Type {3: <{width}} Dev Bus{4}'.format(ansiiprint.purple(), ansiiprint.end(), ansiiprint.bold(), 'Name', ansiiprint.end(), width=name_length)) for disk in ddisks: ainformation.append(' {0: <3} {1: <5} {2: <{width}} {3: <4} {4: <5}'.format(ddisks.index(disk), disk['type'], disk['name'], disk['dev'], disk['bus'], width=name_length)) - # Network list ainformation.append('') ainformation.append('{}Interfaces:{} {}ID Type Source Model MAC{}'.format(ansiiprint.purple(), ansiiprint.end(), ansiiprint.bold(), 
ansiiprint.end())) for net in dnets: @@ -193,25 +198,25 @@ def getDomainName(zk_conn, domain): # # Direct functions # -def define_vm(zk_conn, config_data, target_hypervisor, selector): +def define_vm(zk_conn, config_data, target_node, selector): # Parse the XML data parsed_xml = lxml.objectify.fromstring(config_data) dom_uuid = parsed_xml.uuid.text dom_name = parsed_xml.name.text click.echo('Adding new VM with Name "{}" and UUID "{}" to database.'.format(dom_name, dom_uuid)) - if target_hypervisor == None: - target_hypervisor = common.findTargetHypervisor(zk_conn, selector, dom_uuid) + if target_node == None: + target_node = common.findTargetNode(zk_conn, selector, dom_uuid) # Verify node is valid - common.verifyNode(zk_conn, target_hypervisor) + common.verifyNode(zk_conn, target_node) # Add the new domain to Zookeeper transaction = zk_conn.transaction() transaction.create('/domains/{}'.format(dom_uuid), dom_name.encode('ascii')) transaction.create('/domains/{}/state'.format(dom_uuid), 'stop'.encode('ascii')) - transaction.create('/domains/{}/hypervisor'.format(dom_uuid), target_hypervisor.encode('ascii')) - transaction.create('/domains/{}/lasthypervisor'.format(dom_uuid), ''.encode('ascii')) + transaction.create('/domains/{}/node'.format(dom_uuid), target_node.encode('ascii')) + transaction.create('/domains/{}/lastnode'.format(dom_uuid), ''.encode('ascii')) transaction.create('/domains/{}/failedreason'.format(dom_uuid), ''.encode('ascii')) transaction.create('/domains/{}/xml'.format(dom_uuid), config_data.encode('ascii')) results = transaction.commit() @@ -251,7 +256,7 @@ def undefine_vm(zk_conn, domain): transaction.set_data('/domains/{}/state'.format(dom_uuid), 'stop'.encode('ascii')) transaction.commit() - # Wait for 3 seconds to allow state to flow to all hypervisors + # Wait for 3 seconds to allow state to flow to all nodes click.echo('Waiting for cluster to update.') time.sleep(1) except: @@ -343,43 +348,43 @@ def stop_vm(zk_conn, domain): return True, '' 
-def move_vm(zk_conn, domain, target_hypervisor, selector): +def move_vm(zk_conn, domain, target_node, selector): # Validate and obtain alternate passed value dom_uuid = getDomainUUID(zk_conn, domain) if dom_uuid == None: common.stopZKConnection(zk_conn) return False, 'ERROR: Could not find VM "{}" in the cluster!'.format(domain) - current_hypervisor = zk_conn.get('/domains/{}/hypervisor'.format(dom_uuid))[0].decode('ascii') + current_node = zk_conn.get('/domains/{}/node'.format(dom_uuid))[0].decode('ascii') - if target_hypervisor == None: - target_hypervisor = common.findTargetHypervisor(zk_conn, selector, dom_uuid) + if target_node == None: + target_node = common.findTargetNode(zk_conn, selector, dom_uuid) else: - if target_hypervisor == current_hypervisor: + if target_node == current_node: common.stopZKConnection(zk_conn) - return False, 'ERROR: VM "{}" is already running on hypervisor "{}".'.format(dom_uuid, current_hypervisor) + return False, 'ERROR: VM "{}" is already running on node "{}".'.format(dom_uuid, current_node) # Verify node is valid - common.verifyNode(zk_conn, target_hypervisor) + common.verifyNode(zk_conn, target_node) current_vm_state = zk_conn.get('/domains/{}/state'.format(dom_uuid))[0].decode('ascii') if current_vm_state == 'start': - click.echo('Permanently migrating VM "{}" to hypervisor "{}".'.format(dom_uuid, target_hypervisor)) + click.echo('Permanently migrating VM "{}" to node "{}".'.format(dom_uuid, target_node)) transaction = zk_conn.transaction() transaction.set_data('/domains/{}/state'.format(dom_uuid), 'migrate'.encode('ascii')) - transaction.set_data('/domains/{}/hypervisor'.format(dom_uuid), target_hypervisor.encode('ascii')) - transaction.set_data('/domains/{}/lasthypervisor'.format(dom_uuid), ''.encode('ascii')) + transaction.set_data('/domains/{}/node'.format(dom_uuid), target_node.encode('ascii')) + transaction.set_data('/domains/{}/lastnode'.format(dom_uuid), ''.encode('ascii')) transaction.commit() else: - 
click.echo('Permanently moving VM "{}" to hypervisor "{}".'.format(dom_uuid, target_hypervisor)) + click.echo('Permanently moving VM "{}" to node "{}".'.format(dom_uuid, target_node)) transaction = zk_conn.transaction() - transaction.set_data('/domains/{}/hypervisor'.format(dom_uuid), target_hypervisor.encode('ascii')) - transaction.set_data('/domains/{}/lasthypervisor'.format(dom_uuid), ''.encode('ascii')) + transaction.set_data('/domains/{}/node'.format(dom_uuid), target_node.encode('ascii')) + transaction.set_data('/domains/{}/lastnode'.format(dom_uuid), ''.encode('ascii')) transaction.commit() return True, '' -def migrate_vm(zk_conn, domain, target_hypervisor, selector, force_migrate): +def migrate_vm(zk_conn, domain, target_node, selector, force_migrate): # Validate and obtain alternate passed value dom_uuid = getDomainUUID(zk_conn, domain) if dom_uuid == None: @@ -393,32 +398,32 @@ def migrate_vm(zk_conn, domain, target_hypervisor, selector, force_migrate): else: target_state = 'migrate' - current_hypervisor = zk_conn.get('/domains/{}/hypervisor'.format(dom_uuid))[0].decode('ascii') - last_hypervisor = zk_conn.get('/domains/{}/lasthypervisor'.format(dom_uuid))[0].decode('ascii') + current_node = zk_conn.get('/domains/{}/node'.format(dom_uuid))[0].decode('ascii') + last_node = zk_conn.get('/domains/{}/lastnode'.format(dom_uuid))[0].decode('ascii') - if last_hypervisor != '' and force_migrate != True: + if last_node != '' and force_migrate != True: click.echo('ERROR: VM "{}" has been previously migrated.'.format(dom_uuid)) - click.echo('> Last hypervisor: {}'.format(last_hypervisor)) - click.echo('> Current hypervisor: {}'.format(current_hypervisor)) - click.echo('Run `vm unmigrate` to restore the VM to its previous hypervisor, or use `--force` to override this check.') + click.echo('> Last node: {}'.format(last_node)) + click.echo('> Current node: {}'.format(current_node)) + click.echo('Run `vm unmigrate` to restore the VM to its previous node, or use 
`--force` to override this check.') common.stopZKConnection(zk_conn) return False, '' - if target_hypervisor == None: - target_hypervisor = findTargetHypervisor(zk_conn, selector, dom_uuid) + if target_node == None: + target_node = findTargetNode(zk_conn, selector, dom_uuid) else: - if target_hypervisor == current_hypervisor: + if target_node == current_node: common.stopZKConnection(zk_conn) - return False, 'ERROR: VM "{}" is already running on hypervisor "{}".'.format(dom_uuid, current_hypervisor) + return False, 'ERROR: VM "{}" is already running on node "{}".'.format(dom_uuid, current_node) # Verify node is valid - common.verifyNode(zk_conn, target_hypervisor) + common.verifyNode(zk_conn, target_node) - click.echo('Migrating VM "{}" to hypervisor "{}".'.format(dom_uuid, target_hypervisor)) + click.echo('Migrating VM "{}" to node "{}".'.format(dom_uuid, target_node)) transaction = zk_conn.transaction() transaction.set_data('/domains/{}/state'.format(dom_uuid), target_state.encode('ascii')) - transaction.set_data('/domains/{}/hypervisor'.format(dom_uuid), target_hypervisor.encode('ascii')) - transaction.set_data('/domains/{}/lasthypervisor'.format(dom_uuid), current_hypervisor.encode('ascii')) + transaction.set_data('/domains/{}/node'.format(dom_uuid), target_node.encode('ascii')) + transaction.set_data('/domains/{}/lastnode'.format(dom_uuid), current_node.encode('ascii')) transaction.commit() return True, '' @@ -437,17 +442,17 @@ def unmigrate_vm(zk_conn, domain): else: target_state = 'migrate' - target_hypervisor = zk_conn.get('/domains/{}/lasthypervisor'.format(dom_uuid))[0].decode('ascii') + target_node = zk_conn.get('/domains/{}/lastnode'.format(dom_uuid))[0].decode('ascii') - if target_hypervisor == '': + if target_node == '': common.stopZKConnection(zk_conn) return False, 'ERROR: VM "{}" has not been previously migrated.'.format(dom_uuid) - click.echo('Unmigrating VM "{}" back to hypervisor "{}".'.format(dom_uuid, target_hypervisor)) + 
click.echo('Unmigrating VM "{}" back to node "{}".'.format(dom_uuid, target_node)) transaction = zk_conn.transaction() transaction.set_data('/domains/{}/state'.format(dom_uuid), target_state.encode('ascii')) - transaction.set_data('/domains/{}/hypervisor'.format(dom_uuid), target_hypervisor.encode('ascii')) - transaction.set_data('/domains/{}/lasthypervisor'.format(dom_uuid), ''.encode('ascii')) + transaction.set_data('/domains/{}/node'.format(dom_uuid), target_node.encode('ascii')) + transaction.set_data('/domains/{}/lastnode'.format(dom_uuid), ''.encode('ascii')) transaction.commit() return True, '' @@ -473,16 +478,16 @@ def get_info(zk_conn, domain, long_output): return True, '' -def get_list(zk_conn, hypervisor, limit): - if hypervisor != None: +def get_list(zk_conn, node, limit): + if node != None: # Verify node is valid - common.verifyNode(zk_conn, hypervisor) + common.verifyNode(zk_conn, node) full_vm_list = zk_conn.get_children('/domains') vm_list = [] vm_list_output = [] - vm_hypervisor = {} + vm_node = {} vm_state = {} vm_migrated = {} vm_uuid = {} @@ -490,13 +495,14 @@ def get_list(zk_conn, hypervisor, limit): vm_description = {} vm_memory = {} vm_vcpu = {} + vm_nets = {} # If we're limited, remove other nodes' VMs for vm in full_vm_list: # Check we don't match the limit name = zkhandler.readdata(zk_conn, '/domains/{}'.format(vm)) - vm_hypervisor[vm] = zkhandler.readdata(zk_conn, '/domains/{}/hypervisor'.format(vm)) + vm_node[vm] = zkhandler.readdata(zk_conn, '/domains/{}/node'.format(vm)) if limit != None: try: # Implcitly assume fuzzy limits @@ -506,72 +512,85 @@ def get_list(zk_conn, hypervisor, limit): limit = limit + '.*' if re.match(limit, vm) != None: - if hypervisor == None: + if node == None: vm_list.append(vm) else: - if vm_hypervisor[vm] == hypervisor: + if vm_node[vm] == node: vm_list.append(vm) if re.match(limit, name) != None: - if hypervisor == None: + if node == None: vm_list.append(vm) else: - if vm_hypervisor[vm] == hypervisor: + if 
vm_node[vm] == node: vm_list.append(vm) except Exception as e: return False, 'Regex Error: {}'.format(e) else: - # Check hypervisor to avoid unneeded ZK calls - if hypervisor == None: + # Check node to avoid unneeded ZK calls + if node == None: vm_list.append(vm) else: - if vm_hypervisor[vm] == hypervisor: + if vm_node[vm] == node: vm_list.append(vm) # Gather information for printing for vm in vm_list: vm_state[vm] = zk_conn.get('/domains/{}/state'.format(vm))[0].decode('ascii') - vm_lasthypervisor = zk_conn.get('/domains/{}/lasthypervisor'.format(vm))[0].decode('ascii') - if vm_lasthypervisor != '': - vm_migrated[vm] = 'from {}'.format(vm_lasthypervisor) + vm_lastnode = zk_conn.get('/domains/{}/lastnode'.format(vm))[0].decode('ascii') + if vm_lastnode != '': + vm_migrated[vm] = 'from {}'.format(vm_lastnode) else: vm_migrated[vm] = 'no' try: vm_xml = common.getDomainXML(zk_conn, vm) vm_uuid[vm], vm_name[vm], vm_description[vm], vm_memory[vm], vm_vcpu[vm], vm_vcputopo = common.getDomainMainDetails(vm_xml) + dnets = common.getDomainNetworks(vm_xml) + net_list = [] + for net in dnets: + # Split out just the numerical (VNI) part of the brXXXX name + net_vni = re.findall(r'\d+', net['source'])[0] + net_list.append(net_vni) + vm_nets[vm] = ','.join(net_list) except AttributeError: click.echo('Error: Domain {} does not exist.'.format(domain)) # Determine optimal column widths - # Dynamic columns: node_name, hypervisor, migrated - vm_name_length = 0 - vm_hypervisor_length = 0 - vm_migrated_length = 0 + # Dynamic columns: node_name, node, migrated + vm_name_length = 10 + vm_node_length = 8 + vm_nets_length = 9 + vm_migrated_length = 10 for vm in vm_list: # vm_name column _vm_name_length = len(vm_name[vm]) + 1 if _vm_name_length > vm_name_length: vm_name_length = _vm_name_length - # vm_hypervisor column - _vm_hypervisor_length = len(vm_hypervisor[vm]) + 1 - if _vm_hypervisor_length > vm_hypervisor_length: - vm_hypervisor_length = _vm_hypervisor_length + # vm_node column + 
_vm_node_length = len(vm_node[vm]) + 1 + if _vm_node_length > vm_node_length: + vm_node_length = _vm_node_length + # vm_nets column + _vm_nets_length = len(vm_nets[vm]) + 1 + if _vm_nets_length > vm_nets_length: + vm_nets_length = _vm_nets_length # vm_migrated column _vm_migrated_length = len(vm_migrated[vm]) + 1 if _vm_migrated_length > vm_migrated_length: vm_migrated_length = _vm_migrated_length # Format the string (header) - vm_list_header = ansiiprint.bold() + 'Name UUID State RAM [MiB] vCPUs Hypervisor Migrated?' + ansiiprint.end() vm_list_output.append( '{bold}{vm_name: <{vm_name_length}} {vm_uuid: <37} \ {vm_state_colour}{vm_state: <8}{end_colour} \ +{vm_networks: <{vm_nets_length}} \ {vm_memory: <10} {vm_vcpu: <6} \ -{vm_hypervisor: <{vm_hypervisor_length}} \ +{vm_node: <{vm_node_length}} \ {vm_migrated: <{vm_migrated_length}}{end_bold}'.format( vm_name_length=vm_name_length, - vm_hypervisor_length=vm_hypervisor_length, + vm_node_length=vm_node_length, + vm_nets_length=vm_nets_length, vm_migrated_length=vm_migrated_length, bold=ansiiprint.bold(), end_bold=ansiiprint.end(), @@ -580,9 +599,10 @@ def get_list(zk_conn, hypervisor, limit): vm_name='Name', vm_uuid='UUID', vm_state='State', + vm_networks='Networks', vm_memory='RAM (MiB)', vm_vcpu='vCPUs', - vm_hypervisor='Hypervisor', + vm_node='Node', vm_migrated='Migrated' ) ) @@ -605,11 +625,13 @@ def get_list(zk_conn, hypervisor, limit): vm_list_output.append( '{bold}{vm_name: <{vm_name_length}} {vm_uuid: <37} \ {vm_state_colour}{vm_state: <8}{end_colour} \ +{vm_networks: <{vm_nets_length}} \ {vm_memory: <10} {vm_vcpu: <6} \ -{vm_hypervisor: <{vm_hypervisor_length}} \ +{vm_node: <{vm_node_length}} \ {vm_migrated: <{vm_migrated_length}}{end_bold}'.format( vm_name_length=vm_name_length, - vm_hypervisor_length=vm_hypervisor_length, + vm_node_length=vm_node_length, + vm_nets_length=vm_nets_length, vm_migrated_length=vm_migrated_length, bold='', end_bold='', @@ -618,9 +640,10 @@ def get_list(zk_conn, hypervisor, 
limit): vm_name=vm_name[vm], vm_uuid=vm_uuid[vm], vm_state=vm_state[vm], + vm_networks=vm_nets[vm], vm_memory=vm_memory[vm], vm_vcpu=vm_vcpu[vm], - vm_hypervisor=vm_hypervisor[vm], + vm_node=vm_node[vm], vm_migrated=vm_migrated[vm] ) ) diff --git a/client-common/client_lib/zkhandler.py b/client-common/zkhandler.py similarity index 100% rename from client-common/client_lib/zkhandler.py rename to client-common/zkhandler.py diff --git a/debian/changelog b/debian/changelog index 49510291..1579992a 100644 --- a/debian/changelog +++ b/debian/changelog @@ -1,3 +1,10 @@ +pvc (0.4-0) unstable; urgency=medium + + * Unification of all daemons into node daemon + * Numerous client tweaks + + -- Joshua Boniface Sat, 13 Oct 2018 10:40:14 -0400 + pvc (0.3-1) unstable; urgency=medium * Code and package reorganization pending additional daemons diff --git a/debian/control b/debian/control index 8c0216a8..0f8b3703 100644 --- a/debian/control +++ b/debian/control @@ -6,40 +6,14 @@ Standards-Version: 3.9.8 Homepage: https://www.boniface.me X-Python3-Version: >= 3.2 -Package: pvc-daemon-common +Package: pvc-daemon Architecture: all -Depends: python3-kazoo, python3-psutil, python3-apscheduler -Suggests: pvc-virtualization-daemon, pvc-cli-client -Description: Parallel Virtual Cluster common daemon libraries (Python 3) - The Parallel Virtual Cluster provides a management solution for QEMU/KVM virtual clusters. - . - This package installs the common daemon libraries - -Package: pvc-virtualization-daemon -Architecture: all -Depends: pvc-daemon-common, python3-libvirt, ipmitool, libvirt-daemon-system -Suggests: pvc-cli-client +Depends: python3-kazoo, python3-psutil, python3-apscheduler, python3-libvirt, ipmitool, libvirt-daemon-system, arping, bridge-utils, dnsmasq +Suggests: pvc-client-cli Description: Parallel Virtual Cluster virtualization daemon (Python 3) The Parallel Virtual Cluster provides a management solution for QEMU/KVM virtual clusters. . 
- This package installs the PVC virtualization daemon - -Package: pvc-network-daemon -Architecture: all -Depends: pvc-daemon-common, python3-libvirt, ipmitool, libvirt-daemon-system -Suggests: pvc-cli-client, pvc-virtualization-daemon -Description: Parallel Virtual Cluster network daemon (Python 3) - The Parallel Virtual Cluster provides a management solution for QEMU/KVM virtual clusters. - . - This package installs the PVC network daemon - -Package: pvc-router-daemon -Architecture: all -Depends: pvc-daemon-common, arping -Description: Parallel Virtual Cluster router daemon (Python 3) - The Parallel Virtual Cluster provides a management solution for QEMU/KVM virtual clusters. - . - This package installs the PVC router daemon + This package installs the PVC node daemon Package: pvc-client-common Architecture: all @@ -49,7 +23,7 @@ Description: Parallel Virtual Cluster common client libraries (Python 3) . This package installs the common client libraries -Package: pvc-cli-client +Package: pvc-client-cli Architecture: all Depends: pvc-client-common Description: Parallel Virtual Cluster client (Python 3) diff --git a/debian/pvc-cli-client.install b/debian/pvc-cli-client.install deleted file mode 100644 index 319c8b32..00000000 --- a/debian/pvc-cli-client.install +++ /dev/null @@ -1 +0,0 @@ -cli-client/pvc.py usr/share/pvc diff --git a/debian/pvc-client-cli.install b/debian/pvc-client-cli.install new file mode 100644 index 00000000..b2ee9a60 --- /dev/null +++ b/debian/pvc-client-cli.install @@ -0,0 +1 @@ +client-cli/pvc.py usr/share/pvc diff --git a/debian/pvc-cli-client.postinst b/debian/pvc-client-cli.postinst similarity index 100% rename from debian/pvc-cli-client.postinst rename to debian/pvc-client-cli.postinst diff --git a/debian/pvc-cli-client.prerm b/debian/pvc-client-cli.prerm similarity index 100% rename from debian/pvc-cli-client.prerm rename to debian/pvc-client-cli.prerm diff --git a/debian/pvc-client-common.install b/debian/pvc-client-common.install index 
e47fb483..290683b2 100644 --- a/debian/pvc-client-common.install +++ b/debian/pvc-client-common.install @@ -1 +1 @@ -client-common/client_lib usr/share/pvc +client-common/* usr/share/pvc/client_lib diff --git a/debian/pvc-daemon.install b/debian/pvc-daemon.install new file mode 100644 index 00000000..067abc15 --- /dev/null +++ b/debian/pvc-daemon.install @@ -0,0 +1,4 @@ +node-daemon/pvcd.py usr/share/pvc +node-daemon/pvcd.service lib/systemd/system +node-daemon/pvcd.conf.sample etc/pvc +node-daemon/pvcd usr/share/pvc diff --git a/debian/pvc-daemon.postinst b/debian/pvc-daemon.postinst new file mode 100644 index 00000000..9ae80517 --- /dev/null +++ b/debian/pvc-daemon.postinst @@ -0,0 +1,10 @@ +#!/bin/sh + +# Enable the service +systemctl enable /lib/systemd/system/pvcd.service + +if systemctl is-active --quiet pvcd.service; then + echo "The PVC node daemon has not been restarted; this is up to the administrator." +else + echo "The PVC node daemon has not been started; create a config file at /etc/pvc/pvcd.conf then start it." 
+fi diff --git a/debian/pvc-virtualization-daemon.prerm b/debian/pvc-daemon.prerm similarity index 50% rename from debian/pvc-virtualization-daemon.prerm rename to debian/pvc-daemon.prerm index 5694171f..09aaa5c3 100644 --- a/debian/pvc-virtualization-daemon.prerm +++ b/debian/pvc-daemon.prerm @@ -1,5 +1,4 @@ #!/bin/sh # Disable the service -systemctl disable pvcvd.service - +systemctl disable pvcd.service diff --git a/debian/pvc-network-daemon.install b/debian/pvc-network-daemon.install deleted file mode 100644 index 24cb205b..00000000 --- a/debian/pvc-network-daemon.install +++ /dev/null @@ -1,4 +0,0 @@ -network-daemon/pvcnd.py usr/share/pvc -network-daemon/pvcnd.service lib/systemd/system -network-daemon/pvcnd.conf.sample etc/pvc -network-daemon/pvcnd usr/share/pvc diff --git a/debian/pvc-network-daemon.postinst b/debian/pvc-network-daemon.postinst deleted file mode 100644 index b40df819..00000000 --- a/debian/pvc-network-daemon.postinst +++ /dev/null @@ -1,6 +0,0 @@ -#!/bin/sh - -# Enable the servive -systemctl enable /lib/systemd/system/pvcnd.service - -echo "The PVC network daemon has not been started. Create a config file at /etc/pvc/pvcnd.conf then start it." 
diff --git a/debian/pvc-network-daemon.prerm b/debian/pvc-network-daemon.prerm deleted file mode 100644 index 3f4fd8cb..00000000 --- a/debian/pvc-network-daemon.prerm +++ /dev/null @@ -1,5 +0,0 @@ -#!/bin/sh - -# Disable the service -systemctl disable pvcnd.service - diff --git a/debian/pvc-router-daemon.install b/debian/pvc-router-daemon.install deleted file mode 100644 index eca573c1..00000000 --- a/debian/pvc-router-daemon.install +++ /dev/null @@ -1,4 +0,0 @@ -router-daemon/pvcrd.py usr/share/pvc -router-daemon/pvcrd.service lib/systemd/system -router-daemon/pvcrd.conf.sample etc/pvc -router-daemon/pvcrd usr/share/pvc diff --git a/debian/pvc-router-daemon.postinst b/debian/pvc-router-daemon.postinst deleted file mode 100644 index 2f5dd080..00000000 --- a/debian/pvc-router-daemon.postinst +++ /dev/null @@ -1,6 +0,0 @@ -#!/bin/sh - -# Enable the servive -systemctl enable /lib/systemd/system/pvcrd.service - -echo "The PVC router daemon has not been started. Create a config file at /etc/pvc/pvcrd.conf then start it." 
diff --git a/debian/pvc-router-daemon.prerm b/debian/pvc-router-daemon.prerm deleted file mode 100644 index 14c921a5..00000000 --- a/debian/pvc-router-daemon.prerm +++ /dev/null @@ -1,5 +0,0 @@ -#!/bin/sh - -# Disable the service -systemctl disable pvcrd.service - diff --git a/debian/pvc-virtualization-daemon.install b/debian/pvc-virtualization-daemon.install deleted file mode 100644 index c4b8dd02..00000000 --- a/debian/pvc-virtualization-daemon.install +++ /dev/null @@ -1,4 +0,0 @@ -virtualization-daemon/pvcvd.py usr/share/pvc -virtualization-daemon/pvcvd.service lib/systemd/system -virtualization-daemon/pvcvd.conf.sample etc/pvc -virtualization-daemon/pvcvd usr/share/pvc diff --git a/debian/pvc-virtualization-daemon.postinst b/debian/pvc-virtualization-daemon.postinst deleted file mode 100644 index 60865eb7..00000000 --- a/debian/pvc-virtualization-daemon.postinst +++ /dev/null @@ -1,6 +0,0 @@ -#!/bin/sh - -# Enable the servive -systemctl enable /lib/systemd/system/pvcvd.service - -echo "The PVC virtualization daemon has not been started. Create a config file at /etc/pvc/pvcvd.conf then start it." diff --git a/network-daemon/pvcnd.conf.sample b/network-daemon/pvcnd.conf.sample deleted file mode 100644 index 1cd17613..00000000 --- a/network-daemon/pvcnd.conf.sample +++ /dev/null @@ -1,22 +0,0 @@ -# pvcnd cluster configuration file example -# -# This configuration file specifies details for this node in PVC. Multiple host -# blocks can be added but only the one matching the current system hostname will -# be used by the local daemon. Default values apply to all hosts for any value -# not specifically overridden. 
-# -# The following values are required for each host or in a default section: -# zookeeper: the IP+port of the Zookeper instance (defaults to 127.0.0.1:2181) -# vni_dev: the lower-level network device to bind VNI to -# vni_dev_ip: the IP address (CIDR) of the lower-level network device, used -# by FRR to communicate with the route reflectors and pass routes -# for VNI interfaces -# -# Copy this example to /etc/pvc/pvcnd.conf and edit to your needs - -[default] -zookeeper = 127.0.0.1:2181 - -[myhost] -vni_dev = ens4 -vni_dev_ip = 10.255.0.3/24 diff --git a/network-daemon/pvcnd.py b/network-daemon/pvcnd.py deleted file mode 100755 index e35a036a..00000000 --- a/network-daemon/pvcnd.py +++ /dev/null @@ -1,23 +0,0 @@ -#!/usr/bin/env python3 - -# pvcnd.py - Network daemon startup stub -# Part of the Parallel Virtual Cluster (PVC) system -# -# Copyright (C) 2018 Joshua M. Boniface -# -# This program is free software: you can redistribute it and/or modify -# it under the terms of the GNU General Public License as published by -# the Free Software Foundation, either version 3 of the License, or -# (at your option) any later version. -# -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. -# -# You should have received a copy of the GNU General Public License -# along with this program. If not, see . 
-# -############################################################################### - -import pvcnd.Daemon diff --git a/network-daemon/pvcnd.service b/network-daemon/pvcnd.service deleted file mode 100644 index 435fa348..00000000 --- a/network-daemon/pvcnd.service +++ /dev/null @@ -1,16 +0,0 @@ -# Parallel Virtual Cluster network daemon unit file -[Unit] -Description = Parallel Virtual Cluster network daemon -After = network-online.target libvirtd.service zookeeper.service - -[Service] -Type = simple -WorkingDirectory = /usr/share/pvc -Environment = PYTHONUNBUFFERED=true -Environment = PVCND_CONFIG_FILE=/etc/pvc/pvcnd.conf -ExecStart = /usr/share/pvc/pvcnd.py -KillSignal = SIGINT -Restart = on-failure - -[Install] -WantedBy = multi-user.target diff --git a/network-daemon/pvcnd/Daemon.py b/network-daemon/pvcnd/Daemon.py deleted file mode 100644 index 340dbcaf..00000000 --- a/network-daemon/pvcnd/Daemon.py +++ /dev/null @@ -1,223 +0,0 @@ -#!/usr/bin/env python3 - -# Daemon.py - PVC hypervisor network daemon -# Part of the Parallel Virtual Cluster (PVC) system -# -# Copyright (C) 2018 Joshua M. Boniface -# -# This program is free software: you can redistribute it and/or modify -# it under the terms of the GNU General Public License as published by -# the Free Software Foundation, either version 3 of the License, or -# (at your option) any later version. -# -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. -# -# You should have received a copy of the GNU General Public License -# along with this program. If not, see . 
-# -############################################################################### - -import kazoo.client -import sys -import os -import signal -import socket -import psutil -import configparser -import time - -import daemon_lib.ansiiprint as ansiiprint -import daemon_lib.zkhandler as zkhandler - -import pvcnd.VXNetworkInstance as VXNetworkInstance - -print(ansiiprint.bold() + "pvcnd - Parallel Virtual Cluster network daemon" + ansiiprint.end()) - -# Get the config file variable from the environment -try: - pvcnd_config_file = os.environ['PVCND_CONFIG_FILE'] -except: - print('ERROR: The "PVCND_CONFIG_FILE" environment variable must be set before starting pvcnd.') - exit(1) - -myhostname = socket.gethostname() -myshorthostname = myhostname.split('.', 1)[0] -mydomainname = ''.join(myhostname.split('.', 1)[1:]) - -# Config values dictionary -config_values = [ - 'zookeeper', - 'vni_dev', - 'vni_dev_ip', -] -def readConfig(pvcnd_config_file, myhostname): - print('Loading configuration from file {}'.format(pvcnd_config_file)) - - o_config = configparser.ConfigParser() - o_config.read(pvcnd_config_file) - config = {} - - try: - entries = o_config[myhostname] - except: - try: - entries = o_config['default'] - except: - print('ERROR: Config file is not valid!') - exit(1) - - for entry in config_values: - try: - config[entry] = entries[entry] - except: - try: - config[entry] = o_config['default'][entry] - except: - print('ERROR: Config file missing required value "{}" for this host!'.format(entry)) - exit(1) - - return config - -config = readConfig(pvcnd_config_file, myhostname) - -zk_conn = kazoo.client.KazooClient(hosts=config['zookeeper']) -try: - print('Connecting to Zookeeper instance at {}'.format(config['zookeeper'])) - zk_conn.start() -except: - print('ERROR: Failed to connect to Zookeeper!') - exit(1) - -# Handle zookeeper failures gracefully -def zk_listener(state): - global zk_conn - if state == kazoo.client.KazooState.SUSPENDED: - ansiiprint.echo('Connection to 
Zookeeper list; retrying', '', 'e') - - while True: - _zk_conn = kazoo.client.KazooClient(hosts=config['zookeeper']) - try: - _zk_conn.start() - zk_conn = _zk_conn - break - except: - time.sleep(1) - elif state == kazoo.client.KazooState.CONNECTED: - ansiiprint.echo('Connection to Zookeeper started', '', 'o') - else: - pass - -zk_conn.add_listener(zk_listener) - -# Cleanup function -def cleanup(signum, frame): - ansiiprint.echo('Terminating daemon', '', 'e') - # Close the Zookeeper connection - try: - zk_conn.stop() - zk_conn.close() - except: - pass - # Exit - exit(0) - -# Handle signals with cleanup -signal.signal(signal.SIGTERM, cleanup) -signal.signal(signal.SIGINT, cleanup) -signal.signal(signal.SIGQUIT, cleanup) - -# What this daemon does: -# 1. Configure public networks dynamically on startup (e.g. bonding, vlans, etc.) from config -# * no /etc/network/interfaces config for these - just mgmt interface via DHCP! -# 2. Watch ZK /networks -# 3. Provision required network interfaces when a network is added -# a. create vxlan interface targeting local dev from config -# b. create bridge interface -# c. add vxlan to bridge -# d. set interfaces up -# 4. Remove network interfaces when network disapears - -# Zookeeper schema: -# networks/ -# / -# ipnet e.g. 10.101.0.0/24 -# gateway e.g. 10.101.0.1 [1] -# routers e.g. 10.101.0.2,10.101.0.3 [2] -# dhcp e.g. YES [3] -# reservations/ -# / -# address e.g. 10.101.0.30 -# mac e.g. ff:ff:fe:ab:cd:ef -# fwrules/ -# / -# description e.g. Allow HTTP from any to this net -# src e.g. any -# dest e.g. this -# port e.g. 
80 - -# Notes: -# [1] becomes a VIP between the pair of routers in multi-router envs -# [2] becomes real addrs on the pair of routers in multi-router envs -# [2] should match gateway in single-router envs for consistency -# [3] enables or disables a DHCP subnet definition for the network - - -# Prepare underlying interface -if config['vni_dev_ip'] == 'dhcp': - vni_dev = config['vni_dev'] - ansiiprint.echo('Configuring VNI parent device {} with DHCP IP'.format(vni_dev), '', 'o') - os.system( - 'ip link set {0} up'.format( - vni_dev - ) - ) - os.system( - 'dhclient {0}'.format( - vni_dev - ) - ) -else: - vni_dev = config['vni_dev'] - vni_dev_ip = config['vni_dev_ip'] - ansiiprint.echo('Configuring VNI parent device {} with IP {}'.format(vni_dev, vni_dev_ip), '', 'o') - os.system( - 'ip link set {0} up'.format( - vni_dev - ) - ) - os.system( - 'ip address add {0} dev {1}'.format( - vni_dev_ip, - vni_dev - ) - ) - -# Prepare VNI list -t_vni = dict() -vni_list = [] - -@zk_conn.ChildrenWatch('/networks') -def updatenetworks(new_vni_list): - global vni_list - print(ansiiprint.blue() + 'Network list: ' + ansiiprint.end() + '{}'.format(' '.join(new_vni_list))) - # Add new VNIs - for vni in new_vni_list: - if vni not in vni_list: - vni_list.append(vni) - t_vni[vni] = VXNetworkInstance.VXNetworkInstance(vni, zk_conn, config) - t_vni[vni].createNetwork() - # Remove deleted VNIs - for vni in vni_list: - if vni not in new_vni_list: - vni_list.remove(vni) - t_vni[vni].removeNetwork() - -# Tick loop -while True: - try: - time.sleep(0.1) - except: - break diff --git a/network-daemon/pvcnd/VXNetworkInstance.py b/network-daemon/pvcnd/VXNetworkInstance.py deleted file mode 100644 index 55602567..00000000 --- a/network-daemon/pvcnd/VXNetworkInstance.py +++ /dev/null @@ -1,91 +0,0 @@ -#!/usr/bin/env python3 - -# VXNetworkInstance.py - Class implementing a PVC VM network and run by pvcnd -# Part of the Parallel Virtual Cluster (PVC) system -# -# Copyright (C) 2018 Joshua M. 
Boniface -# -# This program is free software: you can redistribute it and/or modify -# it under the terms of the GNU General Public License as published by -# the Free Software Foundation, either version 3 of the License, or -# (at your option) any later version. -# -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. -# -# You should have received a copy of the GNU General Public License -# along with this program. If not, see . -# -############################################################################### - -import os -import sys - -import daemon_lib.ansiiprint as ansiiprint -import daemon_lib.zkhandler as zkhandler - -class VXNetworkInstance(): - # Initialization function - def __init__ (self, vni, zk_conn, config): - self.vni = vni - self.zk_conn = zk_conn - self.vni_dev = config['vni_dev'] - - def createNetwork(self): - ansiiprint.echo('Creating VNI {} device on interface {}'.format(self.vni, self.vni_dev), '', 'o') - os.system( - 'sudo ip link add vxlan{0} type vxlan id {0} dstport 4789 dev {1} nolearning'.format( - self.vni, - self.vni_dev - ) - ) - os.system( - 'sudo brctl addbr br{0}'.format( - self.vni - ) - ) - os.system( - 'sudo brctl addif br{0} vxlan{0}'.format( - self.vni - ) - ) - os.system( - 'sudo ip link set vxlan{0} up'.format( - self.vni - ) - ) - os.system( - 'sudo ip link set br{0} up'.format( - self.vni - ) - ) - - def removeNetwork(self): - ansiiprint.echo('Removing VNI {} device on interface {}'.format(self.vni, self.vni_dev), '', 'o') - os.system( - 'sudo ip link set br{0} down'.format( - self.vni - ) - ) - os.system( - 'sudo ip link set vxlan{0} down'.format( - self.vni - ) - ) - os.system( - 'sudo brctl delif br{0} vxlan{0}'.format( - self.vni - ) - ) - os.system( - 'sudo brctl delbr br{0}'.format( - self.vni - ) - ) - os.system( - 'sudo ip 
link delete vxlan{0}'.format( - self.vni - ) - ) diff --git a/node-daemon/pvcd.conf.sample b/node-daemon/pvcd.conf.sample new file mode 100644 index 00000000..5285160a --- /dev/null +++ b/node-daemon/pvcd.conf.sample @@ -0,0 +1,66 @@ +# pvcd cluster configuration file example +# +# This configuration file specifies details for this node in PVC. Multiple node +# blocks can be added but only the one matching the current system nodename will +# be used by the local daemon. Default values are not supported; the values in +# this sample configuration are considered defaults and, with adjustment of the +# nodename section and coordinators list, can be used as-is on a Debian system. +# +# The following values are required for each node or in a default section: +# coordinators: A CSV list of the short hostnames of the coordinator nodes; these nodes become +# members of the Zookeeper cluster, can act as routers, and perform additional +# special functions in a cluster; ideally there are 3 coordinators, though 5 +# coordinators are supported +# dynamic_directory: The ramdisk directory for PVC to store its dynamic configurations, +# usually under /run or /var/run +# log_directory: The logging directory, usually under /var/log +# file_logging: Whether to log daemon output to a file (pvc.log under log_directory) in addition to +# normal stdout printing +# keepalive_interval: the interval between keepalives and for dead node timeout (defaults to 5) +# fence_intervals: the number of keepalive_intervals without Zookeeper contact before this node +# will consider another node dead and fence it (defaults to 6, i.e. 30s) +# suicide_intervals: the number of keepalive_intervals without Zookeeper contact before this +# node will consider itself failed and terminate all running VMs (defaults +# to 0, i.e.
disabled); should be less than "fence_intervals" +# successful_fence: the action to take on a successful fencing operation; can be "none" or +# "migrate" (defaults to "migrate") +# failed_fence: the action to take on a failed fencing operation; can be "none" or "migrate" +# (defaults to "none"; "migrate" requires "suicide_intervals" to be set) +# NOTE: POTENTIALLY DANGEROUS - see README for details +# migration_target_selector: the method to use to select target nodes during a virtual machine +# flush action; can be "mem", "load", "vcpus", or "vms" (defaults +# to "mem"); the best choice based on this field is selected for +# each VM to be migrated +# The following values are required for each node specifically (usually node-unique): +# vni_dev: the lower-level network device to bind VNI traffic to +# vni_dev_ip: the IP address (in CIDR format) of the lower-level network device, used by frr +# to communicate between nodes and pass routes between them. +# storage_dev: the lower-level network device to bind storage traffic to +# storage_dev_ip: the IP address (in CIDR format) of the lower-level network device, used by +# Ceph for storage traffic (both monitor and OSD). +# ipmi_hostname: the IPMI hostname for fencing (defaults to <hostname>-lom.<domainname>)
+# ipmi_username: username to connect to IPMI +# ipmi_password: password to connect to IPMI +# +# Copy this example to /etc/pvc/pvcd.conf and edit to your needs + +[default] +coordinators = pvc-hv1,pvc-hv2,pvc-hv3 +dynamic_directory = /run/pvc +log_directory = /var/log/pvc +file_logging = True +keepalive_interval = 5 +fence_intervals = 6 +suicide_intervals = 0 +successful_fence = migrate +failed_fence = none +migration_target_selector = mem + +[pvc-hv1] +vni_dev = ens4 +vni_dev_ip = 10.255.0.1/24 +storage_dev = ens4 +storage_dev_ip = 10.254.0.1/24 +ipmi_username = admin +ipmi_password = Passw0rd +ipmi_hostname = pvc-hv1-lom diff --git a/router-daemon/pvcrd.py b/node-daemon/pvcd.py similarity index 93% rename from router-daemon/pvcrd.py rename to node-daemon/pvcd.py index 485c956d..9b3bcebd 100755 --- a/router-daemon/pvcrd.py +++ b/node-daemon/pvcd.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 -# pvcrd.py - Router daemon startup stub +# pvcd.py - Node daemon startup stub # Part of the Parallel Virtual Cluster (PVC) system # # Copyright (C) 2018 Joshua M. 
Boniface @@ -20,4 +20,4 @@ # ############################################################################### -import pvcrd.Daemon +import pvcd.Daemon diff --git a/virtualization-daemon/pvcvd.service b/node-daemon/pvcd.service similarity index 67% rename from virtualization-daemon/pvcvd.service rename to node-daemon/pvcd.service index eab19e2b..d8aa89f4 100644 --- a/virtualization-daemon/pvcvd.service +++ b/node-daemon/pvcd.service @@ -1,14 +1,14 @@ # Parallel Virtual Cluster virtualization daemon unit file [Unit] -Description = Parallel Virtual Cluster virtualization daemon +Description = Parallel Virtual Cluster node daemon After = network-online.target libvirtd.service zookeeper.service [Service] Type = simple WorkingDirectory = /usr/share/pvc Environment = PYTHONUNBUFFERED=true -Environment = PVCVD_CONFIG_FILE=/etc/pvc/pvcvd.conf -ExecStart = /usr/share/pvc/pvcvd.py +Environment = PVCD_CONFIG_FILE=/etc/pvc/pvcd.conf +ExecStart = /usr/share/pvc/pvcd.py KillSignal = SIGINT Restart = on-failure diff --git a/node-daemon/pvcd/Daemon.py b/node-daemon/pvcd/Daemon.py new file mode 100644 index 00000000..8037b59f --- /dev/null +++ b/node-daemon/pvcd/Daemon.py @@ -0,0 +1,571 @@ +#!/usr/bin/env python3 + +# Daemon.py - Node daemon +# Part of the Parallel Virtual Cluster (PVC) system +# +# Copyright (C) 2018 Joshua M. Boniface +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see <https://www.gnu.org/licenses/>.
+# +############################################################################### + +version = '0.4' + +import kazoo.client +import libvirt +import sys +import os +import signal +import atexit +import socket +import psutil +import subprocess +import uuid +import time +import re +import configparser +import apscheduler.schedulers.background + +import pvcd.log as log +import pvcd.zkhandler as zkhandler +import pvcd.common as common + +import pvcd.DomainInstance as DomainInstance +import pvcd.NodeInstance as NodeInstance +import pvcd.VXNetworkInstance as VXNetworkInstance + +############################################################################### +# PVCD - node daemon startup program +############################################################################### +# +# The PVC daemon starts a node and configures all the required components for +# the node to run. It determines which of the 3 daemon modes it should be in +# during initial setup based on hostname and the config file, and then starts +# any required services. 
The 3 daemon modes are: +# * leader: the cluster leader, follows the Zookeeper leader +# * coordinator: a Zookeeper cluster member +# * hypervisor: a hypervisor without any cluster intelligence +# +############################################################################### + +############################################################################### +# Daemon functions +############################################################################### + +# Create timer to update this node in Zookeeper +def startKeepaliveTimer(): + global update_timer + interval = int(config['keepalive_interval']) + logger.out('Starting keepalive timer ({} second interval)'.format(interval), state='s') + update_timer.add_job(update_zookeeper, 'interval', seconds=interval) + update_timer.start() + +def stopKeepaliveTimer(): + global update_timer + try: + update_timer.shutdown() + logger.out('Stopping keepalive timer', state='s') + except: + pass + +############################################################################### +# PHASE 1a - Configuration parsing +############################################################################### + +# Get the config file variable from the environment +try: + pvcvd_config_file = os.environ['PVCD_CONFIG_FILE'] +except: + print('ERROR: The "PVCD_CONFIG_FILE" environment variable must be set before starting pvcd.') + exit(1) + +# Set local hostname and domain variables +myfqdn = socket.gethostname() +#myfqdn = 'pvc-hv1.domain.net' +myhostname = myfqdn.split('.', 1)[0] +mydomainname = ''.join(myfqdn.split('.', 1)[1:]) +mynodeid = re.findall(r'\d+', myhostname)[-1] + +# Gather useful data about our host +# Static data format: 'cpu_count', 'kernel', 'os', 'arch' +staticdata = [] +staticdata.append(str(psutil.cpu_count())) +staticdata.append(subprocess.run(['uname', '-r'], stdout=subprocess.PIPE).stdout.decode('ascii').strip()) +staticdata.append(subprocess.run(['uname', '-o'], stdout=subprocess.PIPE).stdout.decode('ascii').strip())
+staticdata.append(subprocess.run(['uname', '-m'], stdout=subprocess.PIPE).stdout.decode('ascii').strip()) + +# Create our timer object +update_timer = apscheduler.schedulers.background.BackgroundScheduler() + +# Required config keys (read from the host's section, falling back to [default]) +config_values = [ + 'coordinators', + 'dynamic_directory', + 'log_directory', + 'file_logging', + 'keepalive_interval', + 'fence_intervals', + 'suicide_intervals', + 'successful_fence', + 'failed_fence', + 'migration_target_selector', + 'vni_dev', + 'vni_dev_ip', + 'storage_dev', + 'storage_dev_ip', + 'ipmi_hostname', + 'ipmi_username', + 'ipmi_password' +] + +# Read and parse the config file; exits the process if a required key is missing +def readConfig(pvcvd_config_file, myhostname): + print('Loading configuration from file "{}"'.format(pvcvd_config_file)) + + o_config = configparser.ConfigParser() + o_config.read(pvcvd_config_file) + config = {} + + try: + entries = o_config[myhostname] + except: + try: + entries = o_config['default'] + except Exception as e: + print('ERROR: Config file is not valid!') + exit(1) + + for entry in config_values: + try: + config[entry] = entries[entry] + except: + try: + config[entry] = o_config['default'][entry] + except: + print('ERROR: Config file missing required value "{}" for this host!'.format(entry)) + exit(1) + + # Handle an empty ipmi_hostname by deriving it from the short hostname and domain + if config['ipmi_hostname'] == '': + config['ipmi_hostname'] = myhostname + '-lom.'
+ mydomainname + + return config + +# Get the config object from readConfig() +config = readConfig(pvcvd_config_file, myhostname) + +############################################################################### +# PHASE 1b - Prepare filesystem directories +############################################################################### + +# Define our dynamic directory schema +# / +# dnsmasq/ +# pdns/ +# nft/ +config['dnsmasq_dynamic_directory'] = config['dynamic_directory'] + '/dnsmasq' +config['pdns_dynamic_directory'] = config['dynamic_directory'] + '/pdns' +config['nft_dynamic_directory'] = config['dynamic_directory'] + '/nft' + +# Create our dynamic directories if they don't exist +if not os.path.exists(config['dynamic_directory']): + os.makedirs(config['dynamic_directory']) + os.makedirs(config['dnsmasq_dynamic_directory']) + os.makedirs(config['pdns_dynamic_directory']) + os.makedirs(config['nft_dynamic_directory']) + +# Define our log directory schema +# / +# dnsmasq/ +# pdns/ +# nft/ +config['dnsmasq_log_directory'] = config['log_directory'] + '/dnsmasq' +config['pdns_log_directory'] = config['log_directory'] + '/pdns' +config['nft_log_directory'] = config['log_directory'] + '/nft' + +# Create our dynamic directories if they don't exist +if not os.path.exists(config['log_directory']): + os.makedirs(config['log_directory']) + os.makedirs(config['dnsmasq_log_directory']) + os.makedirs(config['pdns_log_directory']) + os.makedirs(config['nft_log_directory']) + +############################################################################### +# PHASE 1c - Set up logging +############################################################################### + +logger = log.Logger(config) + +# Print our startup messages +logger.out('Parallel Virtual Cluster node daemon v{}'.format(version)) +logger.out('FQDN: {}'.format(myfqdn)) +logger.out('Host: {}'.format(myhostname)) +logger.out('ID: {}'.format(mynodeid)) +logger.out('IPMI hostname: 
{}'.format(config['ipmi_hostname'])) +logger.out('Machine details:') +logger.out(' CPUs: {}'.format(staticdata[0])) +logger.out(' Arch: {}'.format(staticdata[3])) +logger.out(' OS: {}'.format(staticdata[2])) +logger.out(' Kernel: {}'.format(staticdata[1])) +logger.out('Starting pvcd on host {}'.format(myfqdn), state='s') + +############################################################################### +# PHASE 2 - Determine coordinator mode and start Zookeeper on coordinators +############################################################################### + +# What is the list of coordinator hosts +coordinator_hosts = config['coordinators'].split(',') + +if myhostname in coordinator_hosts: + # We are indeed a coordinator host + config['daemon_mode'] = 'coordinator' + # Start the zookeeper service using systemctl + logger.out('Node is a ' + logger.fmt_blue + 'coordinator' + logger.fmt_end +'; starting Zookeeper daemon', state='i') + common.run_os_command('systemctl start zookeeper.service') + time.sleep(1) +else: + config['daemon_mode'] = 'hypervisor' + +############################################################################### +# PHASE 3 - Attempt to connect to the coordinators and start zookeeper client +############################################################################### + +# Start the connection to the coordinators +zk_conn = kazoo.client.KazooClient(hosts=config['coordinators']) +try: + logger.out('Connecting to Zookeeper cluster hosts {}'.format(config['coordinators']), state='i') + # Start connection + zk_conn.start() +except Exception as e: + logger.out('ERROR: Failed to connect to Zookeeper cluster: {}'.format(e), state='e') + exit(1) + +# Handle zookeeper failures +def zk_listener(state): + global zk_conn, update_timer + if state == kazoo.client.KazooState.SUSPENDED: + logger.out('Connection to Zookeeper lost; retrying', state='w') + + # Stop keepalive thread + if update_timer: + stopKeepaliveTimer() + + while True: + try: + 
zk_conn.start() + break + except: + time.sleep(1) + elif state == kazoo.client.KazooState.CONNECTED: + logger.out('Connection to Zookeeper restarted', state='o') + + # Restart the keepalive timer (it is stopped when the connection is suspended) + if update_timer: + startKeepaliveTimer() + else: + pass +zk_conn.add_listener(zk_listener) + +############################################################################### +# PHASE 4 - Gracefully handle termination +############################################################################### + +# Cleanup function +def cleanup(): + global zk_conn, update_timer + + # Stop keepalive thread + stopKeepaliveTimer() + + logger.out('Terminating pvcd and cleaning up', state='s') + + # Set stop state in Zookeeper + zkhandler.writedata(zk_conn, { '/nodes/{}/daemonstate'.format(myhostname): 'stop' }) + + # Force into secondary network state if needed + if this_node.name == this_node.primary_node: + zkhandler.writedata(zk_conn, { '/primary_node': 'none' }) + + # Wait for things to flush + time.sleep(3) + + # Close the Zookeeper connection + try: + zk_conn.stop() + zk_conn.close() + except: + pass + +# Handle exit gracefully +atexit.register(cleanup) + +# Termination function +def term(signum='', frame=''): + # Exit + sys.exit(0) + +# Handle signals gracefully +signal.signal(signal.SIGTERM, term) +signal.signal(signal.SIGINT, term) +signal.signal(signal.SIGQUIT, term) + +############################################################################### +# PHASE 5 - Prepare host in Zookeeper +############################################################################### + +# Check if our node exists in Zookeeper, and create it if not +if zk_conn.exists('/nodes/{}'.format(myhostname)): + logger.out("Node is " + logger.fmt_green + "present" + logger.fmt_end + " in Zookeeper", state='i') + zkhandler.writedata(zk_conn, { '/nodes/{}/daemonstate'.format(myhostname): 'init' }) + # Update static data just in case it's changed + zkhandler.writedata(zk_conn, {
'/nodes/{}/staticdata'.format(myhostname): ' '.join(staticdata) }) +else: + logger.out("Node is " + logger.fmt_red + "absent" + logger.fmt_end + " in Zookeeper; adding new node", state='i') + keepalive_time = int(time.time()) + transaction = zk_conn.transaction() + transaction.create('/nodes/{}'.format(myhostname), config['daemon_mode'].encode('ascii')) + # Basic state information + transaction.create('/nodes/{}/daemonmode'.format(myhostname), config['daemon_mode'].encode('ascii')) + transaction.create('/nodes/{}/daemonstate'.format(myhostname), 'init'.encode('ascii')) + transaction.create('/nodes/{}/routerstate'.format(myhostname), 'client'.encode('ascii')) + transaction.create('/nodes/{}/domainstate'.format(myhostname), 'flushed'.encode('ascii')) + transaction.create('/nodes/{}/staticdata'.format(myhostname), ' '.join(staticdata).encode('ascii')) + transaction.create('/nodes/{}/memfree'.format(myhostname), '0'.encode('ascii')) + transaction.create('/nodes/{}/memused'.format(myhostname), '0'.encode('ascii')) + transaction.create('/nodes/{}/memalloc'.format(myhostname), '0'.encode('ascii')) + transaction.create('/nodes/{}/vcpualloc'.format(myhostname), '0'.encode('ascii')) + transaction.create('/nodes/{}/cpuload'.format(myhostname), '0.0'.encode('ascii')) + transaction.create('/nodes/{}/networkscount'.format(myhostname), '0'.encode('ascii')) + transaction.create('/nodes/{}/domainscount'.format(myhostname), '0'.encode('ascii')) + transaction.create('/nodes/{}/runningdomains'.format(myhostname), ''.encode('ascii')) + # Keepalives and fencing information + transaction.create('/nodes/{}/keepalive'.format(myhostname), str(keepalive_time).encode('ascii')) + transaction.create('/nodes/{}/ipmihostname'.format(myhostname), config['ipmi_hostname'].encode('ascii')) + transaction.create('/nodes/{}/ipmiusername'.format(myhostname), config['ipmi_username'].encode('ascii')) + transaction.create('/nodes/{}/ipmipassword'.format(myhostname), config['ipmi_password'].encode('ascii')) 
+ transaction.commit() + +# Check that the primary key exists, and create it with us as primary if not +current_primary = zkhandler.readdata(zk_conn, '/primary_node') +if current_primary and current_primary != 'none': + logger.out('Current primary node is "{}{}{}".'.format(logger.fmt_blue, current_primary, logger.fmt_end), state='i') +else: + logger.out('No primary node key found; creating with us as primary.', state='i') + zkhandler.writedata(zk_conn, { '/primary_node': myhostname }) + +############################################################################### +# PHASE 6 - Create local IP addresses for VNI and Storage networks +############################################################################### + +# VNI configuration +vni_dev = config['vni_dev'] +vni_dev_ip = config['vni_dev_ip'] +logger.out('Setting up VNI network on interface {} with IP {}'.format(vni_dev, vni_dev_ip), state='i') +common.run_os_command('ip link set {} up'.format(vni_dev)) +common.run_os_command('ip address add {} dev {}'.format(vni_dev_ip, vni_dev)) + +# Storage configuration +storage_dev = config['storage_dev'] +storage_dev_ip = config['storage_dev_ip'] +logger.out('Setting up Storage network on interface {} with IP {}'.format(storage_dev, storage_dev_ip), state='i') +common.run_os_command('ip link set {} up'.format(storage_dev)) +common.run_os_command('ip address add {} dev {}'.format(storage_dev_ip, storage_dev)) + +############################################################################### +# PHASE 7a - Ensure Libvirt is running on the local host +############################################################################### + +# Start the libvirtd service using systemctl +logger.out('Starting Libvirt daemon', state='i') +common.run_os_command('systemctl start libvirtd.service') +time.sleep(1) + +# Check that libvirtd is listening on TCP +libvirt_check_name = "qemu+tcp://127.0.0.1:16509/system" +logger.out('Connecting to Libvirt daemon at {}'.format(libvirt_check_name),
state='i') +try: + lv_conn = libvirt.open(libvirt_check_name) + lv_conn.close() +except Exception as e: + logger.out('ERROR: Failed to connect to Libvirt daemon: {}'.format(e), state='e') + exit(1) + +############################################################################### +# PHASE 7b - Ensure Ceph is running on the local host +############################################################################### + +# if coordinator, start ceph-mon +# if hypervisor or coordinator, start ceph-osds + +############################################################################### +# PHASE 7c - Ensure NFT is running on the local host +############################################################################### + +logger.out("Creating NFT firewall configuration", state='i') + +# Create our config dirs +common.run_os_command( + '/bin/mkdir --parents {}/networks'.format( + config['nft_dynamic_directory'] + ) +) +common.run_os_command( + '/bin/mkdir --parents {}/static'.format( + config['nft_dynamic_directory'] + ) +) +common.run_os_command( + '/bin/mkdir --parents {}'.format( + config['nft_dynamic_directory'] + ) +) + +# Set up the basic features of the nftables firewall +nftables_base_rules = """# Base rules +flush ruleset +# Add the filter table and chains +add table inet filter +add chain inet filter forward {{ type filter hook forward priority 0; }} +add chain inet filter input {{ type filter hook input priority 0; }} +# Include static rules and network rules +include "{rulesdir}/static/*" +include "{rulesdir}/networks/*" +""".format( + rulesdir=config['nft_dynamic_directory'] +) + +# Write the basic firewall config +nftables_base_filename = '{}/base.nft'.format(config['nft_dynamic_directory']) +nftables_update_filename = '{}/update'.format(config['nft_dynamic_directory']) +with open(nftables_base_filename, 'w') as nfbasefile: + nfbasefile.write(nftables_base_rules) + # Notify a reload of the firewall rules on next keepalive update + open(nftables_update_filename,
'a').close() + +############################################################################### +# PHASE 8 - Set up our objects +############################################################################### + +logger.out('Setting up objects', state='i') + +d_node = dict() +d_network = dict() +d_domain = dict() +node_list = [] +network_list = [] +domain_list = [] + +# Node objects +@zk_conn.ChildrenWatch('/nodes') +def update_nodes(new_node_list): + global node_list, d_node + + # Add any missing nodes to the list + for node in new_node_list: + if not node in node_list: + d_node[node] = NodeInstance.NodeInstance(node, myhostname, zk_conn, config, logger, d_node, d_network, d_domain) + + # Remove any deleted nodes from the list + for node in node_list: + if not node in new_node_list: + # Delete the object + del(d_node[node]) + + # Update and print new list + node_list = new_node_list + logger.out('{}Node list:{} {}'.format(logger.fmt_blue, logger.fmt_end, ' '.join(node_list)), state='i') + + # Update node objects' list + for node in d_node: + d_node[node].update_node_list(d_node) + +# Alias for our local node (passed to network and domain objects) +this_node = d_node[myhostname] + +# Network objects +@zk_conn.ChildrenWatch('/networks') +def update_networks(new_network_list): + global network_list, d_network + + # Add any missing networks to the list + for network in new_network_list: + if not network in network_list: + d_network[network] = VXNetworkInstance.VXNetworkInstance(network, zk_conn, config, logger, this_node) + # Start primary functionality + if this_node.router_state == 'primary': + d_network[network].createGatewayAddress() + d_network[network].startDHCPServer() + + # Remove any deleted networks from the list + for network in network_list: + if not network in new_network_list: + # Stop primary functionality (only if this node is the primary router) + if this_node.router_state == 'primary': + d_network[network].stopDHCPServer() + d_network[network].removeGatewayAddress() + # Stop general
functionality + d_network[network].removeFirewall() + d_network[network].removeNetwork() + # Delete the object + del(d_network[network]) + + # Update and print new list + network_list = new_network_list + logger.out('{}Network list:{} {}'.format(logger.fmt_blue, logger.fmt_end, ' '.join(network_list)), state='i') + + # Update node objects' list + for node in d_node: + d_node[node].update_network_list(d_network) + +# VM domain objects +@zk_conn.ChildrenWatch('/domains') +def update_domains(new_domain_list): + global domain_list, d_domain + + # Add any missing domains to the list + for domain in new_domain_list: + if not domain in domain_list: + d_domain[domain] = DomainInstance.DomainInstance(domain, zk_conn, config, logger, this_node); + + # Remove any deleted domains from the list + for domain in domain_list: + if not domain in new_domain_list: + # Delete the object + del(d_domain[domain]) + + # Update and print new list + domain_list = new_domain_list + logger.out('{}Domain list:{} {}'.format(logger.fmt_blue, logger.fmt_end, ' '.join(domain_list)), state='i') + + # Update node objects' list + for node in d_node: + d_node[node].update_domain_list(d_domain) + +############################################################################### +# PHASE 9 - Run the daemon +############################################################################### + +# Set up our update function +update_zookeeper = this_node.update_zookeeper + +# Start keepalive thread and immediately update Zookeeper +startKeepaliveTimer() +update_zookeeper() + +# Tick loop +while True: + try: + time.sleep(1) + except: + break diff --git a/virtualization-daemon/pvcvd/VMInstance.py b/node-daemon/pvcd/DomainInstance.py similarity index 75% rename from virtualization-daemon/pvcvd/VMInstance.py rename to node-daemon/pvcd/DomainInstance.py index d54b8d56..4a9d6bc9 100644 --- a/virtualization-daemon/pvcvd/VMInstance.py +++ b/node-daemon/pvcd/DomainInstance.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 -# 
VMInstance.py - Class implementing a PVC virtual machine and run by pvcvd +# DomainInstance.py - Class implementing a PVC virtual machine in pvcd # Part of the Parallel Virtual Cluster (PVC) system # # Copyright (C) 2018 Joshua M. Boniface @@ -29,20 +29,21 @@ import threading import libvirt import kazoo.client -import daemon_lib.ansiiprint as ansiiprint -import daemon_lib.zkhandler as zkhandler +import pvcd.log as log +import pvcd.zkhandler as zkhandler -class VMInstance: +class DomainInstance: # Initialization function - def __init__(self, domuuid, zk_conn, config, thishypervisor): + def __init__(self, domuuid, zk_conn, config, logger, this_node): # Passed-in variables on creation self.domuuid = domuuid self.zk_conn = zk_conn self.config = config - self.thishypervisor = thishypervisor + self.logger = logger + self.this_node = this_node # These will all be set later - self.hypervisor = None + self.node = None self.state = None self.instart = False self.inrestart = False @@ -72,8 +73,8 @@ class VMInstance: def getstate(self): return self.state - def gethypervisor(self): - return self.hypervisor + def getnode(self): + return self.node def getdom(self): return self.dom @@ -96,35 +97,35 @@ class VMInstance: # Manage local node domain_list def addDomainToList(self): - if not self.domuuid in self.thishypervisor.domain_list: + if not self.domuuid in self.this_node.domain_list: try: # Add the domain to the domain_list array - self.thishypervisor.domain_list.append(self.domuuid) + self.this_node.domain_list.append(self.domuuid) # Push the change up to Zookeeper - zkhandler.writedata(self.zk_conn, { '/nodes/{}/runningdomains'.format(self.thishypervisor.name): ' '.join(self.thishypervisor.domain_list) }) + zkhandler.writedata(self.zk_conn, { '/nodes/{}/runningdomains'.format(self.this_node.name): ' '.join(self.this_node.domain_list) }) except Exception as e: - ansiiprint.echo('Error adding domain to list: {}'.format(e), '', 'c') + self.logger.out('Error adding domain to list: 
{}'.format(e), state='c') def removeDomainFromList(self): - if self.domuuid in self.thishypervisor.domain_list: + if self.domuuid in self.this_node.domain_list: try: # Remove the domain from the domain_list array - self.thishypervisor.domain_list.remove(self.domuuid) + self.this_node.domain_list.remove(self.domuuid) # Push the change up to Zookeeper - zkhandler.writedata(self.zk_conn, { '/nodes/{}/runningdomains'.format(self.thishypervisor.name): ' '.join(self.thishypervisor.domain_list) }) + zkhandler.writedata(self.zk_conn, { '/nodes/{}/runningdomains'.format(self.this_node.name): ' '.join(self.this_node.domain_list) }) except Exception as e: - ansiiprint.echo('Error removing domain from list: {}'.format(e), '', 'c') + self.logger.out('Error removing domain from list: {}'.format(e), state='c') # Start up the VM def start_vm(self): - ansiiprint.echo('Starting VM', '{}:'.format(self.domuuid), 'i') + self.logger.out('Starting VM', state='i', prefix='Domain {}:'.format(self.domuuid)) self.instart = True # Start up a new Libvirt connection libvirt_name = "qemu:///system" lv_conn = libvirt.open(libvirt_name) if lv_conn == None: - ansiiprint.echo('Failed to open local libvirt connection', '{}:'.format(self.domuuid), 'e') + self.logger.out('Failed to open local libvirt connection', state='e', prefix='Domain {}:'.format(self.domuuid)) self.instart = False return @@ -146,11 +147,11 @@ class VMInstance: xmlconfig = zkhandler.readdata(self.zk_conn, '/domains/{}/xml'.format(self.domuuid)) dom = lv_conn.createXML(xmlconfig, 0) self.addDomainToList() - ansiiprint.echo('Successfully started VM', '{}:'.format(self.domuuid), 'o') + self.logger.out('Successfully started VM', state='o', prefix='Domain {}:'.format(self.domuuid)) self.dom = dom zkhandler.writedata(self.zk_conn, { '/domains/{}/failedreason'.format(self.domuuid): '' }) except libvirt.libvirtError as e: - ansiiprint.echo('Failed to create VM', '{}:'.format(self.domuuid), 'e') + self.logger.out('Failed to create VM', 
state='e', prefix='Domain {}:'.format(self.domuuid)) zkhandler.writedata(self.zk_conn, { '/domains/{}/state'.format(self.domuuid): 'failed' }) zkhandler.writedata(self.zk_conn, { '/domains/{}/failedreason'.format(self.domuuid): str(e) }) self.dom = None @@ -160,14 +161,14 @@ class VMInstance: # Restart the VM def restart_vm(self): - ansiiprint.echo('Restarting VM', '{}:'.format(self.domuuid), 'i') + self.logger.out('Restarting VM', state='i', prefix='Domain {}:'.format(self.domuuid)) self.inrestart = True # Start up a new Libvirt connection libvirt_name = "qemu:///system" lv_conn = libvirt.open(libvirt_name) if lv_conn == None: - ansiiprint.echo('Failed to open local libvirt connection', '{}:'.format(self.domuuid), 'e') + self.logger.out('Failed to open local libvirt connection', state='e', prefix='Domain {}:'.format(self.domuuid)) self.inrestart = False return @@ -181,37 +182,37 @@ class VMInstance: # Stop the VM forcibly without updating state def terminate_vm(self): - ansiiprint.echo('Terminating VM', '{}:'.format(self.domuuid), 'i') + self.logger.out('Terminating VM', state='i', prefix='Domain {}:'.format(self.domuuid)) self.instop = True try: self.dom.destroy() except AttributeError: - ansiiprint.echo('Failed to terminate VM', '{}:'.format(self.domuuid), 'e') + self.logger.out('Failed to terminate VM', state='e', prefix='Domain {}:'.format(self.domuuid)) self.removeDomainFromList() - ansiiprint.echo('Successfully terminated VM', '{}:'.format(self.domuuid), 'o') + self.logger.out('Successfully terminated VM', state='o', prefix='Domain {}:'.format(self.domuuid)) self.dom = None self.instop = False # Stop the VM forcibly def stop_vm(self): - ansiiprint.echo('Forcibly stopping VM', '{}:'.format(self.domuuid), 'i') + self.logger.out('Forcibly stopping VM', state='i', prefix='Domain {}:'.format(self.domuuid)) self.instop = True try: self.dom.destroy() except AttributeError: - ansiiprint.echo('Failed to stop VM', '{}:'.format(self.domuuid), 'e') + 
self.logger.out('Failed to stop VM', state='e', prefix='Domain {}:'.format(self.domuuid)) self.removeDomainFromList() if self.inrestart == False: zkhandler.writedata(self.zk_conn, { '/domains/{}/state'.format(self.domuuid): 'stop' }) - ansiiprint.echo('Successfully stopped VM', '{}:'.format(self.domuuid), 'o') + self.logger.out('Successfully stopped VM', state='o', prefix='Domain {}:'.format(self.domuuid)) self.dom = None self.instop = False # Shutdown the VM gracefully def shutdown_vm(self): - ansiiprint.echo('Gracefully stopping VM', '{}:'.format(self.domuuid), 'i') + self.logger.out('Gracefully stopping VM', state='i', prefix='Domain {}:'.format(self.domuuid)) self.inshutdown = True self.dom.shutdown() try: @@ -221,7 +222,7 @@ class VMInstance: time.sleep(0.5) if tick >= 60: - ansiiprint.echo('Shutdown timeout expired', '{}:'.format(self.domuuid), 'e') + self.logger.out('Shutdown timeout expired', state='e', prefix='Domain {}:'.format(self.domuuid)) self.stop_vm() self.inshutdown = False return @@ -233,24 +234,24 @@ class VMInstance: if self.inrestart == False: zkhandler.writedata(self.zk_conn, { '/domains/{}/state'.format(self.domuuid): 'stop' }) - ansiiprint.echo('Successfully shutdown VM', '{}:'.format(self.domuuid), 'o') + self.logger.out('Successfully shutdown VM', state='o', prefix='Domain {}:'.format(self.domuuid)) self.dom = None self.inshutdown = False - def live_migrate_vm(self, dest_hypervisor): + def live_migrate_vm(self, dest_node): try: - dest_lv_conn = libvirt.open('qemu+tcp://{}/system'.format(self.hypervisor)) + dest_lv_conn = libvirt.open('qemu+tcp://{}/system'.format(self.node)) if dest_lv_conn == None: raise except: - ansiiprint.echo('Failed to open connection to qemu+tcp://{}/system; aborting migration.'.format(self.hypervisor), '{}:'.format(self.domuuid), 'e') + self.logger.out('Failed to open connection to qemu+tcp://{}/system; aborting migration.'.format(self.node), state='e', prefix='Domain {}:'.format(self.domuuid)) return 1 try: 
target_dom = self.dom.migrate(dest_lv_conn, libvirt.VIR_MIGRATE_LIVE, None, None, 0) if target_dom == None: raise - ansiiprint.echo('Successfully migrated VM', '{}:'.format(self.domuuid), 'o') + self.logger.out('Successfully migrated VM', state='o', prefix='Domain {}:'.format(self.domuuid)) except: dest_lv_conn.close() @@ -262,15 +263,15 @@ class VMInstance: # Migrate the VM to a target host def migrate_vm(self): self.inmigrate = True - ansiiprint.echo('Migrating VM to hypervisor "{}"'.format(self.hypervisor), '{}:'.format(self.domuuid), 'i') + self.logger.out('Migrating VM to node "{}"'.format(self.node), state='i', prefix='Domain {}:'.format(self.domuuid)) try: - migrate_ret = self.live_migrate_vm(self.hypervisor) + migrate_ret = self.live_migrate_vm(self.node) except: migrate_ret = 0 if migrate_ret != 0: - ansiiprint.echo('Could not live migrate VM; shutting down to migrate instead', '{}:'.format(self.domuuid), 'e') + self.logger.out('Could not live migrate VM; shutting down to migrate instead', state='e', prefix='Domain {}:'.format(self.domuuid)) self.shutdown_vm() time.sleep(1) else: @@ -283,7 +284,7 @@ class VMInstance: # Receive the migration from another host (wait until VM is running) def receive_migrate(self): self.inreceive = True - ansiiprint.echo('Receiving migration', '{}:'.format(self.domuuid), 'i') + self.logger.out('Receiving migration', state='i', prefix='Domain {}:'.format(self.domuuid)) while True: time.sleep(0.5) self.state = zkhandler.readdata(self.zk_conn, '/domains/{}/state'.format(self.domuuid)) @@ -308,9 +309,9 @@ class VMInstance: if dom_state == libvirt.VIR_DOMAIN_RUNNING: self.addDomainToList() - ansiiprint.echo('Successfully received migrated VM', '{}:'.format(self.domuuid), 'o') + self.logger.out('Successfully received migrated VM', state='o', prefix='Domain {}:'.format(self.domuuid)) else: - ansiiprint.echo('Failed to receive migrated VM', '{}:'.format(self.domuuid), 'e') + self.logger.out('Failed to receive migrated VM', state='e', 
prefix='Domain {}:'.format(self.domuuid)) self.inreceive = False @@ -323,7 +324,7 @@ class VMInstance: # Get the current values from zookeeper (don't rely on the watch) self.state = zkhandler.readdata(self.zk_conn, '/domains/{}/state'.format(self.domuuid)) - self.hypervisor = zkhandler.readdata(self.zk_conn, '/domains/{}/hypervisor'.format(self.domuuid)) + self.node = zkhandler.readdata(self.zk_conn, '/domains/{}/node'.format(self.domuuid)) # Check the current state of the VM try: @@ -334,7 +335,7 @@ class VMInstance: except: running = libvirt.VIR_DOMAIN_NOSTATE - ansiiprint.echo('VM state change for "{}": {} {}'.format(self.domuuid, self.state, self.hypervisor), '', 'i') + self.logger.out('VM state change for "{}": {} {}'.format(self.domuuid, self.state, self.node), state='i') ####################### # Handle state changes @@ -353,9 +354,9 @@ class VMInstance: and self.inreceive == False \ and self.inshutdown == False \ and self.instop == False: - # Conditional pass two - Is this VM configured to run on this hypervisor - if self.hypervisor == self.thishypervisor.name: - # Conditional pass three - Is this VM currently running on this hypervisor + # Conditional pass two - Is this VM configured to run on this node + if self.node == self.this_node.name: + # Conditional pass three - Is this VM currently running on this node if running == libvirt.VIR_DOMAIN_RUNNING: # VM is already running and should be if self.state == "start": @@ -377,7 +378,7 @@ class VMInstance: # VM should be started if self.state == "start": self.start_vm() - # VM should be migrated to this hypervisor + # VM should be migrated to this node elif self.state == "migrate": self.receive_migrate() # VM should be restarted (i.e. 
started since it isn't running) @@ -391,9 +392,9 @@ class VMInstance: self.removeDomainFromList() else: - # Conditional pass three - Is this VM currently running on this hypervisor + # Conditional pass three - Is this VM currently running on this node if running == libvirt.VIR_DOMAIN_RUNNING: - # VM should be migrated away from this hypervisor + # VM should be migrated away from this node if self.state == "migrate": self.migrate_vm() # VM should be terminated @@ -417,7 +418,7 @@ class VMInstance: # Open a libvirt connection lv_conn = libvirt.open(libvirt_name) if lv_conn == None: - ansiiprint.echo('Failed to open local libvirt connection', '{}:'.format(self.domuuid), 'e') + self.logger.out('Failed to open local libvirt connection', state='e', prefix='Domain {}:'.format(self.domuuid)) return dom # Lookup the UUID diff --git a/virtualization-daemon/pvcvd/NodeInstance.py b/node-daemon/pvcd/NodeInstance.py similarity index 52% rename from virtualization-daemon/pvcvd/NodeInstance.py rename to node-daemon/pvcd/NodeInstance.py index 2fae127c..27f2922b 100644 --- a/virtualization-daemon/pvcvd/NodeInstance.py +++ b/node-daemon/pvcd/NodeInstance.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 -# NodeInstance.py - Class implementing a PVC node and run by pvcvd +# NodeInstance.py - Class implementing a PVC node in pvcd # Part of the Parallel Virtual Cluster (PVC) system # # Copyright (C) 2018 Joshua M. 
Boniface @@ -26,195 +26,301 @@ import psutil import socket import time import libvirt -import kazoo.client import threading import subprocess -import daemon_lib.ansiiprint as ansiiprint -import daemon_lib.zkhandler as zkhandler +import pvcd.log as log +import pvcd.zkhandler as zkhandler +import pvcd.common as common class NodeInstance(): # Initialization function - def __init__(self, this_node, name, t_node, s_domain, zk_conn, config): + def __init__(self, name, this_node, zk_conn, config, logger, d_node, d_network, d_domain): # Passed-in variables on creation + self.name = name + self.this_node = this_node self.zk_conn = zk_conn self.config = config - self.this_node = this_node - self.name = name + self.logger = logger + # The IPMI hostname for fencing + self.ipmi_hostname = self.config['ipmi_hostname'] + # Which node is primary + self.primary_node = None + # States + self.daemon_mode = zkhandler.readdata(self.zk_conn, '/nodes/{}/daemonmode'.format(self.name)) self.daemon_state = 'stop' + self.router_state = 'client' self.domain_state = 'ready' - self.t_node = t_node + # Object lists + self.d_node = d_node + self.d_network = d_network + self.d_domain = d_domain + # Printable lists self.active_node_list = [] self.flushed_node_list = [] self.inactive_node_list = [] - self.s_domain = s_domain + self.network_list = [] self.domain_list = [] - self.ipmi_hostname = self.config['ipmi_hostname'] + # Node resources + self.networks_count = 0 self.domains_count = 0 self.memused = 0 self.memfree = 0 self.memalloc = 0 self.vcpualloc = 0 + # Flags self.inflush = False # Zookeeper handlers for changed states @self.zk_conn.DataWatch('/nodes/{}/daemonstate'.format(self.name)) - def watch_hypervisor_daemonstate(data, stat, event=""): + def watch_node_daemonstate(data, stat, event=''): if event and event.type == 'DELETED': # The key has been deleted after existing before; terminate this watcher # because this class instance is about to be reaped in Daemon.py return False try: - 
self.daemon_state = data.decode('ascii') + data = data.decode('ascii') except AttributeError: - self.daemon_state = 'stop' + data = 'stop' + + if data != self.daemon_state: + self.daemon_state = data + + @self.zk_conn.DataWatch('/nodes/{}/routerstate'.format(self.name)) + def watch_node_routerstate(data, stat, event=''): + if event and event.type == 'DELETED': + # The key has been deleted after existing before; terminate this watcher + # because this class instance is about to be reaped in Daemon.py + return False + + try: + data = data.decode('ascii') + except AttributeError: + data = 'client' + + if self.name == self.this_node and self.daemon_mode == 'coordinator': + # We're a coordinator so we care about networking + if data != self.router_state: + self.router_state = data + if self.router_state == 'primary': + self.become_primary() + else: + self.become_secondary() @self.zk_conn.DataWatch('/nodes/{}/domainstate'.format(self.name)) - def watch_hypervisor_domainstate(data, stat, event=""): + def watch_node_domainstate(data, stat, event=''): if event and event.type == 'DELETED': # The key has been deleted after existing before; terminate this watcher # because this class instance is about to be reaped in Daemon.py return False try: - self.domain_state = data.decode('ascii') + data = data.decode('ascii') except AttributeError: - self.domain_state = 'unknown' + data = 'unknown' - # toggle state management of this node - if self.name == self.this_node: - if self.domain_state == 'flush' and self.inflush == False: - # Do flushing in a thread so it doesn't block the migrates out - flush_thread = threading.Thread(target=self.flush, args=(), kwargs={}) - flush_thread.start() - if self.domain_state == 'unflush' and self.inflush == False: - self.unflush() + if data != self.domain_state: + self.domain_state = data + + # toggle state management of this node + if self.name == self.this_node: + if self.domain_state == 'flush' and self.inflush == False: + # Do flushing in a 
thread so it doesn't block the migrates out + flush_thread = threading.Thread(target=self.flush, args=(), kwargs={}) + flush_thread.start() + if self.domain_state == 'unflush' and self.inflush == False: + self.unflush() + + @self.zk_conn.DataWatch('/primary_node') + def watch_primary_node(data, stat, event=''): + if event and event.type == 'DELETED': + # The key has been deleted after existing before; terminate this watcher + # because this class instance is about to be reaped in Daemon.py + return False + + try: + data = data.decode('ascii') + except AttributeError: + data = 'none' + + if data != self.primary_node: + if self.daemon_mode == 'coordinator': + # We're a coordinator so we care about networking + if data == 'none': + # Toggle state management of routing functions + if self.name == self.this_node: + if self.daemon_state == 'run' and self.router_state != 'primary': + # Contend for primary + self.logger.out('Contending for primary routing state', state='i') + zkhandler.writedata(self.zk_conn, {'/primary_node': self.name }) + elif data == self.this_node: + if self.name == self.this_node: + zkhandler.writedata(self.zk_conn, { '/nodes/{}/routerstate'.format(self.name): 'primary' }) + self.primary_node = data + else: + if self.name == self.this_node: + zkhandler.writedata(self.zk_conn, { '/nodes/{}/routerstate'.format(self.name): 'secondary' }) + self.primary_node = data + else: + self.primary_node = data + @self.zk_conn.DataWatch('/nodes/{}/memfree'.format(self.name)) - def watch_hypervisor_memfree(data, stat, event=""): + def watch_node_memfree(data, stat, event=''): if event and event.type == 'DELETED': # The key has been deleted after existing before; terminate this watcher # because this class instance is about to be reaped in Daemon.py return False try: - self.memfree = data.decode('ascii') + data = data.decode('ascii') except AttributeError: - self.memfree = 0 + data = 0 + + if data != self.memfree: + self.memfree = data 
@self.zk_conn.DataWatch('/nodes/{}/memused'.format(self.name)) - def watch_hypervisor_memused(data, stat, event=""): + def watch_node_memused(data, stat, event=''): if event and event.type == 'DELETED': # The key has been deleted after existing before; terminate this watcher # because this class instance is about to be reaped in Daemon.py return False try: - self.memused = data.decode('ascii') + data = data.decode('ascii') except AttributeError: - self.memused = 0 + data = 0 + + if data != self.memused: + self.memused = data @self.zk_conn.DataWatch('/nodes/{}/memalloc'.format(self.name)) - def watch_hypervisor_memalloc(data, stat, event=""): + def watch_node_memalloc(data, stat, event=''): if event and event.type == 'DELETED': # The key has been deleted after existing before; terminate this watcher # because this class instance is about to be reaped in Daemon.py return False try: - self.memalloc = data.decode('ascii') + data = data.decode('ascii') except AttributeError: - self.memalloc = 0 + data = 0 + + if data != self.memalloc: + self.memalloc = data @self.zk_conn.DataWatch('/nodes/{}/vcpualloc'.format(self.name)) - def watch_hypervisor_vcpualloc(data, stat, event=""): + def watch_node_vcpualloc(data, stat, event=''): if event and event.type == 'DELETED': # The key has been deleted after existing before; terminate this watcher # because this class instance is about to be reaped in Daemon.py return False try: - self.vcpualloc = data.decode('ascii') + data = data.decode('ascii') except AttributeError: - self.vcpualloc = 0 + data = 0 + + if data != self.vcpualloc: + self.vcpualloc = data @self.zk_conn.DataWatch('/nodes/{}/runningdomains'.format(self.name)) - def watch_hypervisor_runningdomains(data, stat, event=""): + def watch_node_runningdomains(data, stat, event=''): if event and event.type == 'DELETED': # The key has been deleted after existing before; terminate this watcher # because this class instance is about to be reaped in Daemon.py return False try: - 
self.domain_list = data.decode('ascii').split() + data = data.decode('ascii').split() except AttributeError: - self.domain_list = [] + data = [] - @self.zk_conn.DataWatch('/nodes/{}/domainscount'.format(self.name)) - def watch_hypervisor_domainscount(data, stat, event=""): + if data != self.domain_list: + self.domain_list = data + + @self.zk_conn.DataWatch('/nodes/{}/networkscount'.format(self.name)) + def watch_node_networkscount(data, stat, event=''): if event and event.type == 'DELETED': # The key has been deleted after existing before; terminate this watcher # because this class instance is about to be reaped in Daemon.py return False try: - self.domains_count = data.decode('ascii') + data = data.decode('ascii') except AttributeError: - self.domains_count = 0 + data = 0 + + if data != self.networks_count: + self.networks_count = data - # Get value functions - def getfreemem(self): - return self.memfree + @self.zk_conn.DataWatch('/nodes/{}/domainscount'.format(self.name)) + def watch_node_domainscount(data, stat, event=''): + if event and event.type == 'DELETED': + # The key has been deleted after existing before; terminate this watcher + # because this class instance is about to be reaped in Daemon.py + return False - def getallocmem(self): - return self.memalloc - - def getallocvcpu(self): - return self.vcpualloc - - def getcpuload(self): - return self.cpuload - - def getname(self): - return self.name - - def getdaemonstate(self): - return self.daemon_state - - def getdomainstate(self): - return self.domain_state - - def getdomainlist(self): - return self.domain_list + try: + data = data.decode('ascii') + except AttributeError: + data = 0 + if data != self.domains_count: + self.domains_count = data + # Update value functions - def updatenodelist(self, t_node): - self.t_node = t_node + def update_node_list(self, d_node): + self.d_node = d_node - def updatedomainlist(self, s_domain): - self.s_domain = s_domain + def update_network_list(self, d_network): + 
self.d_network = d_network + network_list = [] + for network in self.d_network: + network_list.append(d_network[network].vni) + self.network_list = network_list + + def update_domain_list(self, d_domain): + self.d_domain = d_domain + + # Routing primary/secondary states + def become_secondary(self): + self.logger.out('Setting router {} to secondary state'.format(self.name), state='i') + self.logger.out('Network list: {}'.format(', '.join(self.network_list))) + time.sleep(0.5) + for network in self.d_network: + self.d_network[network].stopDHCPServer() + self.d_network[network].removeGatewayAddress() + + def become_primary(self): + self.logger.out('Setting router {} to primary state.'.format(self.name), state='i') + self.logger.out('Network list: {}'.format(', '.join(self.network_list))) + for network in self.d_network: + self.d_network[network].createGatewayAddress() + self.d_network[network].startDHCPServer() # Flush all VMs on the host def flush(self): self.inflush = True - ansiiprint.echo('Flushing node "{}" of running VMs'.format(self.name), '', 'i') - ansiiprint.echo('Domain list: {}'.format(', '.join(self.domain_list)), '', 'c') + self.logger.out('Flushing node "{}" of running VMs'.format(self.name), state='i') + self.logger.out('Domain list: {}'.format(', '.join(self.domain_list))) fixed_domain_list = self.domain_list.copy() for dom_uuid in fixed_domain_list: - ansiiprint.echo('Selecting target to migrate VM "{}"'.format(dom_uuid), '', 'i') + self.logger.out('Selecting target to migrate VM "{}"'.format(dom_uuid), state='i') - current_hypervisor = zkhandler.readdata(self.zk_conn, '/domains/{}/hypervisor'.format(dom_uuid)) - target_hypervisor = findTargetHypervisor(self.zk_conn, 'mem', dom_uuid) - if target_hypervisor == None: - ansiiprint.echo('Failed to find migration target for VM "{}"; shutting down'.format(dom_uuid), '', 'e') + current_node = zkhandler.readdata(self.zk_conn, '/domains/{}/node'.format(dom_uuid)) + target_node = 
findTargetHypervisor(self.zk_conn, 'mem', dom_uuid) + if target_node == None: + self.logger.out('Failed to find migration target for VM "{}"; shutting down'.format(dom_uuid), state='e') zkhandler.writedata(self.zk_conn, { '/domains/{}/state'.format(dom_uuid): 'shutdown' }) else: - ansiiprint.echo('Migrating VM "{}" to hypervisor "{}"'.format(dom_uuid, target_hypervisor), '', 'i') + self.logger.out('Migrating VM "{}" to node "{}"'.format(dom_uuid, target_node), state='i') zkhandler.writedata(self.zk_conn, { '/domains/{}/state'.format(dom_uuid): 'migrate', - '/domains/{}/hypervisor'.format(dom_uuid): target_hypervisor, - '/domains/{}/lasthypervisor'.format(dom_uuid): current_hypervisor + '/domains/{}/node'.format(dom_uuid): target_node, + '/domains/{}/lastnode'.format(dom_uuid): current_node }) # Wait for the VM to migrate so the next VM's free RAM count is accurate (they migrate in serial anyways) @@ -230,23 +336,23 @@ class NodeInstance(): def unflush(self): self.inflush = True - ansiiprint.echo('Restoring node {} to active service.'.format(self.name), '', 'i') + self.logger.out('Restoring node {} to active service.'.format(self.name), state='i') zkhandler.writedata(self.zk_conn, { '/nodes/{}/domainstate'.format(self.name): 'ready' }) - fixed_domain_list = self.s_domain.copy() + fixed_domain_list = self.d_domain.copy() for dom_uuid in fixed_domain_list: try: - last_hypervisor = zkhandler.readdata(self.zk_conn, '/domains/{}/lasthypervisor'.format(dom_uuid)) + last_node = zkhandler.readdata(self.zk_conn, '/domains/{}/lastnode'.format(dom_uuid)) except: continue - if last_hypervisor != self.name: + if last_node != self.name: continue - ansiiprint.echo('Setting unmigration for VM "{}"'.format(dom_uuid), '', 'i') + self.logger.out('Setting unmigration for VM "{}"'.format(dom_uuid), state='i') zkhandler.writedata(self.zk_conn, { '/domains/{}/state'.format(dom_uuid): 'migrate', - '/domains/{}/hypervisor'.format(dom_uuid): self.name, - 
'/domains/{}/lasthypervisor'.format(dom_uuid): '' + '/domains/{}/node'.format(dom_uuid): self.name, + '/domains/{}/lastnode'.format(dom_uuid): '' }) self.inflush = False @@ -256,7 +362,7 @@ class NodeInstance(): libvirt_name = "qemu:///system" lv_conn = libvirt.open(libvirt_name) if lv_conn == None: - ansiiprint.echo('Failed to open connection to "{}"'.format(libvirt_name), '', 'e') + self.logger.out('Failed to open connection to "{}"'.format(libvirt_name), state='e') return # Get past state and update if needed @@ -267,15 +373,21 @@ class NodeInstance(): else: self.daemon_state = 'run' + # Ensure the primary key is properly set + if self.name == self.this_node: + if self.router_state == 'primary': + if zkhandler.readdata(self.zk_conn, '/primary_node') != self.name: + zkhandler.writedata(self.zk_conn, {'/primary_node': self.name}) + # Toggle state management of dead VMs to restart them memalloc = 0 vcpualloc = 0 - for domain, instance in self.s_domain.items(): - if instance.inshutdown == False and domain in self.domain_list: + for domain, instance in self.d_domain.items(): + if domain in self.domain_list: # Add the allocated memory to our memalloc value memalloc += instance.getmemory() vcpualloc += instance.getvcpus() - if instance.getstate() == 'start' and instance.gethypervisor() == self.name: + if instance.getstate() == 'start' and instance.getnode() == self.name: if instance.getdom() != None: try: if instance.getdom().state()[0] != libvirt.VIR_DOMAIN_RUNNING: @@ -292,7 +404,7 @@ class NodeInstance(): self.domain_list.append(domain_uuid) # Set our information in zookeeper - self.name = lv_conn.getHostname() + #self.name = lv_conn.getHostname() self.memused = int(psutil.virtual_memory().used / 1024 / 1024) self.memfree = int(psutil.virtual_memory().free / 1024 / 1024) self.memalloc = memalloc @@ -307,23 +419,40 @@ class NodeInstance(): '/nodes/{}/memalloc'.format(self.name): str(self.memalloc), '/nodes/{}/vcpualloc'.format(self.name): str(self.vcpualloc), 
'/nodes/{}/cpuload'.format(self.name): str(self.cpuload), - '/nodes/{}/runningdomains'.format(self.name): ' '.join(self.domain_list), + '/nodes/{}/networkscount'.format(self.name): str(self.networks_count), '/nodes/{}/domainscount'.format(self.name): str(self.domains_count), + '/nodes/{}/runningdomains'.format(self.name): ' '.join(self.domain_list), '/nodes/{}/keepalive'.format(self.name): str(keepalive_time) }) except: - ansiiprint.echo('Failed to set keepalive data', '', 'e') + self.logger.out('Failed to set keepalive data', state='e') return # Close the Libvirt connection lv_conn.close() # Display node information to the terminal - ansiiprint.echo('{}{} keepalive{}'.format(ansiiprint.purple(), self.name, ansiiprint.end()), '', 't') - ansiiprint.echo('{0}Active domains:{1} {2} {0}Allocated memory [MiB]:{1} {6} {0}Free memory [MiB]:{1} {3} {0}Used memory [MiB]:{1} {4} {0}Load:{1} {5}'.format(ansiiprint.bold(), ansiiprint.end(), self.domains_count, self.memfree, self.memused, self.cpuload, self.memalloc), '', 'c') + self.logger.out('{}{} keepalive{}'.format(self.logger.fmt_purple, self.name, self.logger.fmt_end), state='t') + self.logger.out( + '{bold}Domains:{nobold} {domcount} ' + '{bold}Networks:{nobold} {netcount} ' + '{bold}Allocated memory [MiB]:{nobold} {allocmem} ' + '{bold}Free memory [MiB]:{nobold} {freemem} ' + '{bold}Used memory [MiB]:{nobold} {usedmem} ' + '{bold}Load:{nobold} {load}'.format( + bold=self.logger.fmt_bold, + nobold=self.logger.fmt_end, + domcount=self.domains_count, + freemem=self.memfree, + usedmem=self.memused, + load=self.cpuload, + allocmem=self.memalloc, + netcount=self.networks_count + ), + ) # Update our local node lists - for node_name in self.t_node: + for node_name in self.d_node: try: node_daemon_state = zkhandler.readdata(self.zk_conn, '/nodes/{}/daemonstate'.format(node_name)) node_domain_state = zkhandler.readdata(self.zk_conn, '/nodes/{}/domainstate'.format(node_name)) @@ -338,9 +467,9 @@ class NodeInstance(): # 
out-of-date while in 'start' state) node_deadtime = int(time.time()) - ( int(self.config['keepalive_interval']) * int(self.config['fence_intervals']) ) if node_keepalive < node_deadtime and node_daemon_state == 'run': - ansiiprint.echo('Node {} seems dead - starting monitor for fencing'.format(node_name), '', 'w') + self.logger.out('Node {} seems dead - starting monitor for fencing'.format(node_name), state='w') zkhandler.writedata(self.zk_conn, { '/nodes/{}/daemonstate'.format(node_name): 'dead' }) - fence_thread = threading.Thread(target=fenceNode, args=(node_name, self.zk_conn, self.config), kwargs={}) + fence_thread = threading.Thread(target=fenceNode, args=(node_name, self.zk_conn, self.config, self.logger), kwargs={}) fence_thread.start() # Update the arrays @@ -374,12 +503,22 @@ class NodeInstance(): self.inactive_node_list.remove(node_name) except ValueError: pass - + + # List of the non-primary coordinators + secondary_node_list = self.config['coordinators'].split(',') + if secondary_node_list: + secondary_node_list.remove(self.primary_node) + for node in secondary_node_list: + if node in self.inactive_node_list: + secondary_node_list.remove(node) + # Display cluster information to the terminal - ansiiprint.echo('{}Cluster status{}'.format(ansiiprint.purple(), ansiiprint.end()), '', 't') - ansiiprint.echo('{}Active nodes:{} {}'.format(ansiiprint.bold(), ansiiprint.end(), ' '.join(self.active_node_list)), '', 'c') - ansiiprint.echo('{}Inactive nodes:{} {}'.format(ansiiprint.bold(), ansiiprint.end(), ' '.join(self.inactive_node_list)), '', 'c') - ansiiprint.echo('{}Flushed nodes:{} {}'.format(ansiiprint.bold(), ansiiprint.end(), ' '.join(self.flushed_node_list)), '', 'c') + self.logger.out('{}Cluster status{}'.format(self.logger.fmt_purple, self.logger.fmt_end), state='t') + self.logger.out('{}Primary coordinator:{} {}'.format(self.logger.fmt_bold, self.logger.fmt_end, self.primary_node)) + self.logger.out('{}Secondary coordinators:{} 
{}'.format(self.logger.fmt_bold, self.logger.fmt_end, ' '.join(secondary_node_list))) + self.logger.out('{}Active hypervisors:{} {}'.format(self.logger.fmt_bold, self.logger.fmt_end, ' '.join(self.active_node_list))) + self.logger.out('{}Flushed hypervisors:{} {}'.format(self.logger.fmt_bold, self.logger.fmt_end, ' '.join(self.flushed_node_list))) + self.logger.out('{}Inactive nodes:{} {}'.format(self.logger.fmt_bold, self.logger.fmt_end, ' '.join(self.inactive_node_list))) # # Find a migration target @@ -395,95 +534,95 @@ def findTargetHypervisor(zk_conn, search_field, dom_uuid): return findTargetHypervisorVMs(zk_conn, dom_uuid) return None -# Get the list of valid target hypervisors +# Get the list of valid target nodes def getHypervisors(zk_conn, dom_uuid): - valid_hypervisor_list = [] - full_hypervisor_list = zkhandler.listchildren(zk_conn, '/nodes') - current_hypervisor = zkhandler.readdata(zk_conn, '/domains/{}/hypervisor'.format(dom_uuid)) + valid_node_list = [] + full_node_list = zkhandler.listchildren(zk_conn, '/nodes') + current_node = zkhandler.readdata(zk_conn, '/domains/{}/node'.format(dom_uuid)) - for hypervisor in full_hypervisor_list: - daemon_state = zkhandler.readdata(zk_conn, '/nodes/{}/daemonstate'.format(hypervisor)) - domain_state = zkhandler.readdata(zk_conn, '/nodes/{}/domainstate'.format(hypervisor)) + for node in full_node_list: + daemon_state = zkhandler.readdata(zk_conn, '/nodes/{}/daemonstate'.format(node)) + domain_state = zkhandler.readdata(zk_conn, '/nodes/{}/domainstate'.format(node)) - if hypervisor == current_hypervisor: + if node == current_node: continue if daemon_state != 'run' or domain_state != 'ready': continue - valid_hypervisor_list.append(hypervisor) + valid_node_list.append(node) - return valid_hypervisor_list + return valid_node_list # via free memory (relative to allocated memory) def findTargetHypervisorMem(zk_conn, dom_uuid): most_allocfree = 0 - target_hypervisor = None + target_node = None - hypervisor_list = 
getHypervisors(zk_conn, dom_uuid) - for hypervisor in hypervisor_list: - memalloc = int(zkhandler.readdata(zk_conn, '/nodes/{}/memalloc'.format(hypervisor))) - memused = int(zkhandler.readdata(zk_conn, '/nodes/{}/memused'.format(hypervisor))) - memfree = int(zkhandler.readdata(zk_conn, '/nodes/{}/memfree'.format(hypervisor))) + node_list = getHypervisors(zk_conn, dom_uuid) + for node in node_list: + memalloc = int(zkhandler.readdata(zk_conn, '/nodes/{}/memalloc'.format(node))) + memused = int(zkhandler.readdata(zk_conn, '/nodes/{}/memused'.format(node))) + memfree = int(zkhandler.readdata(zk_conn, '/nodes/{}/memfree'.format(node))) memtotal = memused + memfree allocfree = memtotal - memalloc if allocfree > most_allocfree: most_allocfree = allocfree - target_hypervisor = hypervisor + target_node = node - return target_hypervisor + return target_node # via load average def findTargetHypervisorLoad(zk_conn, dom_uuid): least_load = 9999 - target_hypervisor = None + target_node = None - hypervisor_list = getHypervisors(zk_conn, dom_uuid) - for hypervisor in hypervisor_list: - load = int(zkhandler.readdata(zk_conn, '/nodes/{}/load'.format(hypervisor))) + node_list = getHypervisors(zk_conn, dom_uuid) + for node in node_list: + load = int(zkhandler.readdata(zk_conn, '/nodes/{}/load'.format(node))) if load < least_load: least_load = load - target_hypevisor = hypervisor + target_hypevisor = node - return target_hypervisor + return target_node # via total vCPUs def findTargetHypervisorVCPUs(zk_conn, dom_uuid): least_vcpus = 9999 - target_hypervisor = None + target_node = None - hypervisor_list = getHypervisors(zk_conn, dom_uuid) - for hypervisor in hypervisor_list: - vcpus = int(zkhandler.readdata(zk_conn, '/nodes/{}/vcpualloc'.format(hypervisor))) + node_list = getHypervisors(zk_conn, dom_uuid) + for node in node_list: + vcpus = int(zkhandler.readdata(zk_conn, '/nodes/{}/vcpualloc'.format(node))) if vcpus < least_vcpus: least_vcpus = vcpus - target_hypervisor = hypervisor + 
target_node = node - return target_hypervisor + return target_node # via total VMs def findTargetHypervisorVMs(zk_conn, dom_uuid): least_vms = 9999 - target_hypervisor = None + target_node = None - hypervisor_list = getHypervisors(zk_conn, dom_uuid) - for hypervisor in hypervisor_list: - vms = int(zkhandler.readdata(zk_conn, '/nodes/{}/domainscount'.format(hypervisor))) + node_list = getHypervisors(zk_conn, dom_uuid) + for node in node_list: + vms = int(zkhandler.readdata(zk_conn, '/nodes/{}/domainscount'.format(node))) if vms < least_vms: least_vms = vms - target_hypervisor = hypervisor + target_node = node - return target_hypervisor + return target_node # # Fence thread entry function # -def fenceNode(node_name, zk_conn, config): +def fenceNode(node_name, zk_conn, config, logger): failcount = 0 # We allow exactly 3 saving throws for the host to come back online while failcount < 3: @@ -494,13 +633,13 @@ def fenceNode(node_name, zk_conn, config): # Is it still 'dead' if node_daemon_state == 'dead': failcount += 1 - ansiiprint.echo('Node "{}" failed {} saving throws'.format(node_name, failcount), '', 'w') + logger.out('Node "{}" failed {} saving throws'.format(node_name, failcount), state='w') # It changed back to something else so it must be alive else: - ansiiprint.echo('Node "{}" passed a saving throw; canceling fence'.format(node_name), '', 'o') + logger.out('Node "{}" passed a saving throw; canceling fence'.format(node_name), state='o') return - ansiiprint.echo('Fencing node "{}" via IPMI reboot signal'.format(node_name), '', 'e') + logger.out('Fencing node "{}" via IPMI reboot signal'.format(node_name), state='e') # Get IPMI information ipmi_hostname = zkhandler.readdata(zk_conn, '/nodes/{}/ipmihostname'.format(node_name)) @@ -508,29 +647,35 @@ def fenceNode(node_name, zk_conn, config): ipmi_password = zkhandler.readdata(zk_conn, '/nodes/{}/ipmipassword'.format(node_name)) # Shoot it in the head - fence_status = rebootViaIPMI(ipmi_hostname, ipmi_username, 
ipmi_password) + fence_status = rebootViaIPMI(ipmi_hostname, ipmi_username, ipmi_password, logger) # Hold to ensure the fence takes effect time.sleep(3) + # Force into secondary network state if needed + if node_name in config['coordinators'].split(','): + zkhandler.writedata(zk_conn, { '/nodes/{}/routerstate'.format(node_name): 'secondary' }) + if zkhandler.readdata(zk_conn, '/primary_node') == node_name: + zkhandler.writedata(zk_conn, { '/primary_node': 'none' }) + # If the fence succeeded and successful_fence is migrate if fence_status == True and config['successful_fence'] == 'migrate': - migrateFromFencedHost(zk_conn, node_name) + migrateFromFencedNode(zk_conn, node_name, logger) # If the fence failed and failed_fence is migrate if fence_status == False and config['failed_fence'] == 'migrate' and config['suicide_intervals'] != '0': - migrateFromFencedHost(zk_conn, node_name) + migrateFromFencedNode(zk_conn, node_name, logger) # Migrate hosts away from a fenced node -def migrateFromFencedHost(zk_conn, node_name): - ansiiprint.echo('Moving VMs from dead hypervisor "{}" to new hosts'.format(node_name), '', 'i') +def migrateFromFencedNode(zk_conn, node_name, logger): + logger.out('Moving VMs from dead node "{}" to new hosts'.format(node_name), state='i') dead_node_running_domains = zkhandler.readdata(zk_conn, '/nodes/{}/runningdomains'.format(node_name)).split() for dom_uuid in dead_node_running_domains: - target_hypervisor = findTargetHypervisor(zk_conn, 'mem', dom_uuid) + target_node = findTargetHypervisor(zk_conn, 'mem', dom_uuid) - ansiiprint.echo('Moving VM "{}" to hypervisor "{}"'.format(dom_uuid, target_hypervisor), '', 'i') + logger.out('Moving VM "{}" to node "{}"'.format(dom_uuid, target_node), state='i') zkhandler.writedata(zk_conn, { '/domains/{}/state'.format(dom_uuid): 'start', - '/domains/{}/hypervisor'.format(dom_uuid): target_hypervisor, - '/domains/{}/lasthypervisor'.format(dom_uuid): node_name + '/domains/{}/node'.format(dom_uuid): target_node, 
+ '/domains/{}/lastnode'.format(dom_uuid): node_name }) # Set node in flushed state for easy remigrating when it comes back @@ -539,12 +684,12 @@ def migrateFromFencedHost(zk_conn, node_name): # # Perform an IPMI fence # -def rebootViaIPMI(ipmi_hostname, ipmi_user, ipmi_password): +def rebootViaIPMI(ipmi_hostname, ipmi_user, ipmi_password, logger): ipmi_command = ['/usr/bin/ipmitool', '-I', 'lanplus', '-H', ipmi_hostname, '-U', ipmi_user, '-P', ipmi_password, 'chassis', 'power', 'reset'] ipmi_command_output = subprocess.run(ipmi_command, stdout=subprocess.PIPE, stderr=subprocess.PIPE) if ipmi_command_output.returncode == 0: - ansiiprint.echo('Successfully rebooted dead node', '', 'o') + logger.out('Successfully rebooted dead node', state='o') return True else: - ansiiprint.echo('Failed to reboot dead node', '', 'e') + logger.out('Failed to reboot dead node', state='e') return False diff --git a/router-daemon/pvcrd/VXNetworkInstance.py b/node-daemon/pvcd/VXNetworkInstance.py similarity index 87% rename from router-daemon/pvcrd/VXNetworkInstance.py rename to node-daemon/pvcd/VXNetworkInstance.py index ffc7d219..0b4cd4a4 100644 --- a/router-daemon/pvcrd/VXNetworkInstance.py +++ b/node-daemon/pvcd/VXNetworkInstance.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 -# VXNetworkInstance.py - Class implementing a PVC VM network (router-side) and run by pvcrd +# VXNetworkInstance.py - Class implementing a PVC VM network and run by pvcd # Part of the Parallel Virtual Cluster (PVC) system # # Copyright (C) 2018 Joshua M. 
Boniface @@ -24,16 +24,18 @@ import os import sys from textwrap import dedent -import daemon_lib.ansiiprint as ansiiprint -import daemon_lib.zkhandler as zkhandler -import daemon_lib.common as common +import pvcd.log as log +import pvcd.zkhandler as zkhandler +import pvcd.common as common class VXNetworkInstance(): # Initialization function - def __init__ (self, vni, zk_conn, config, this_router): + def __init__ (self, vni, zk_conn, config, logger, this_node): self.vni = vni self.zk_conn = zk_conn - self.this_router = this_router + self.config = config + self.logger = logger + self.this_node = this_node self.vni_dev = config['vni_dev'] self.old_description = None @@ -49,12 +51,12 @@ class VXNetworkInstance(): self.vxlan_nic = 'vxlan{}'.format(self.vni) self.bridge_nic = 'br{}'.format(self.vni) - self.nftables_update_filename = '{}/update'.format(config['nftables_rules_dir']) - self.nftables_netconf_filename = '{}/networks/{}.nft'.format(config['nftables_rules_dir'], self.vni) + self.nftables_update_filename = '{}/update'.format(config['nft_dynamic_directory']) + self.nftables_netconf_filename = '{}/networks/{}.nft'.format(config['nft_dynamic_directory'], self.vni) self.firewall_rules = [] self.dhcp_server_daemon = None - self.dnsmasq_hostsdir = '{}/{}'.format(config['dnsmasq_hosts_dir'], self.vni) + self.dnsmasq_hostsdir = '{}/{}'.format(config['dnsmasq_dynamic_directory'], self.vni) self.dhcp_reservations = [] # Zookeper handlers for changed states @@ -102,7 +104,7 @@ class VXNetworkInstance(): if data and self.ip_gateway != data.decode('ascii'): orig_gateway = self.ip_gateway self.ip_gateway = data.decode('ascii') - if self.this_router.network_state == 'primary': + if self.this_node.router_state == 'primary': if orig_gateway: self.removeGatewayAddress() self.createGatewayAddress() @@ -116,9 +118,9 @@ class VXNetworkInstance(): if data and self.dhcp_flag != data.decode('ascii'): self.dhcp_flag = ( data.decode('ascii') == 'True' ) - if self.dhcp_flag and 
self.this_router.network_state == 'primary': + if self.dhcp_flag and self.this_node.router_state == 'primary': self.startDHCPServer() - elif self.this_router.network_state == 'primary': + elif self.this_node.router_state == 'primary': self.stopDHCPServer() @self.zk_conn.DataWatch('/networks/{}/dhcp_start'.format(self.vni)) @@ -209,13 +211,12 @@ class VXNetworkInstance(): pass def createNetwork(self): - ansiiprint.echo( - 'Creating VNI {} device on interface {}'.format( - self.vni, + self.logger.out( + 'Creating VXLAN device on interface {}'.format( self.vni_dev ), - '', - 'o' + prefix='VNI {}'.format(self.vni), + state='o' ) common.run_os_command( 'ip link add {} type vxlan id {} dstport 4789 dev {}'.format( @@ -275,15 +276,14 @@ add rule inet filter input meta iifname {bridgenic} counter drop pass def createGatewayAddress(self): - if self.this_router.getnetworkstate() == 'primary': - ansiiprint.echo( - 'Creating gateway {} on interface {} (VNI {})'.format( + if self.this_node.router_state == 'primary': + self.logger.out( + 'Creating gateway {} on interface {}'.format( self.ip_gateway, - self.bridge_nic, - self.vni + self.bridge_nic ), - '', - 'o' + prefix='VNI {}'.format(self.vni), + state='o' ) print('ip address add {}/{} dev {}'.format( self.ip_gateway, @@ -307,14 +307,13 @@ add rule inet filter input meta iifname {bridgenic} counter drop ) def startDHCPServer(self): - if self.this_router.getnetworkstate() == 'primary': - ansiiprint.echo( - 'Starting dnsmasq DHCP server on interface {} (VNI {})'.format( - self.bridge_nic, - self.vni + if self.this_node.router_state == 'primary': + self.logger.out( + 'Starting dnsmasq DHCP server on interface {}'.format( + self.bridge_nic ), - '', - 'o' + prefix='VNI {}'.format(self.vni), + state='o' ) # Create the network hostsdir common.run_os_command( @@ -323,10 +322,10 @@ add rule inet filter input meta iifname {bridgenic} counter drop ) ) # Recreate the environment we need for dnsmasq - pvcrd_config_file = 
os.environ['PVCRD_CONFIG_FILE'] + pvcd_config_file = os.environ['PVCD_CONFIG_FILE'] dhcp_environment = { 'DNSMASQ_INTERFACE': self.bridge_nic, - 'PVCRD_CONFIG_FILE': pvcrd_config_file + 'PVCD_CONFIG_FILE': pvcd_config_file } # Define the dnsmasq config dhcp_configuration = [ @@ -343,7 +342,7 @@ add rule inet filter input meta iifname {bridgenic} counter drop '--listen-address={}'.format(self.ip_gateway), '--bind-interfaces', '--leasefile-ro', - '--dhcp-script=/usr/share/pvc/pvcrd/dnsmasq-zookeeper-leases.py', + '--dhcp-script=/usr/share/pvc/pvcd/dnsmasq-zookeeper-leases.py', '--dhcp-range={},{},4h'.format(self.dhcp_start, self.dhcp_end), '--dhcp-lease-max=99', '--dhcp-hostsdir={}'.format(self.dnsmasq_hostsdir), @@ -356,18 +355,16 @@ add rule inet filter input meta iifname {bridgenic} counter drop '/usr/sbin/dnsmasq {}'.format( ' '.join(dhcp_configuration) ), - environment=dhcp_environment, - return_pid=True + environment=dhcp_environment ) def removeNetwork(self): - ansiiprint.echo( - 'Removing VNI {} device on interface {}'.format( - self.vni, + self.logger.out( + 'Removing VNI device on interface {}'.format( self.vni_dev ), - '', - 'o' + prefix='VNI {}'.format(self.vni), + state='o' ) common.run_os_command( 'ip link set {} down'.format( @@ -402,14 +399,13 @@ add rule inet filter input meta iifname {bridgenic} counter drop pass def removeGatewayAddress(self): - ansiiprint.echo( - 'Removing gateway {} from interface {} (VNI {})'.format( + self.logger.out( + 'Removing gateway {} from interface {}'.format( self.ip_gateway, - self.bridge_nic, - self.vni + self.bridge_nic ), - '', - 'o' + prefix='VNI {}'.format(self.vni), + state='o' ) common.run_os_command( 'ip address delete {}/{} dev {}'.format( @@ -421,12 +417,11 @@ add rule inet filter input meta iifname {bridgenic} counter drop def stopDHCPServer(self): if self.dhcp_server_daemon: - ansiiprint.echo( - 'Stopping dnsmasq DHCP server on interface {} (VNI {})'.format( - self.bridge_nic, - self.vni + self.logger.out( 
+ 'Stopping dnsmasq DHCP server on interface {}'.format( + self.bridge_nic ), - '', - 'o' + prefix='VNI {}'.format(self.vni), + state='o' ) self.dhcp_server_daemon.signal('term') diff --git a/network-daemon/pvcnd/__init__.py b/node-daemon/pvcd/__init__.py similarity index 100% rename from network-daemon/pvcnd/__init__.py rename to node-daemon/pvcd/__init__.py diff --git a/node-daemon/pvcd/common.py b/node-daemon/pvcd/common.py new file mode 100644 index 00000000..397b4793 --- /dev/null +++ b/node-daemon/pvcd/common.py @@ -0,0 +1,83 @@ +#!/usr/bin/env python3 + +# common.py - PVC daemon function library, common fuctions +# Part of the Parallel Virtual Cluster (PVC) system +# +# Copyright (C) 2018 Joshua M. Boniface +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . 
+# +############################################################################### + +import subprocess +import threading +import signal +import os +import time + +import pvcd.log as log + +class OSDaemon(object): + def __init__(self, command, environment): + self.proc = subprocess.Popen( + command, + env=environment, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + ) + + def signal(self, sent_signal): + signal_map = { + 'hup': signal.SIGHUP, + 'int': signal.SIGINT, + 'term': signal.SIGTERM + } + self.proc.send_signal(signal_map[sent_signal]) + +def run_os_daemon(command_string, environment=None): + command = command_string.split() + print(' '.join(command)) + daemon = OSDaemon(command, environment) + return daemon + +# Run a oneshot command, optionally without blocking +def run_os_command(command_string, background=False, environment=None): + command = command_string.split() + if background: + def runcmd(): + subprocess.run( + command, + env=environment, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + ) + thread = threading.Thread(target=runcmd, args=()) + thread.start() + return 0, None, None + else: + command_output = subprocess.run( + command, + env=environment, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + ) + return command_output.returncode, command_output.stdout.decode('ascii'), command_output.stderr.decode('ascii') + +# Reload the firewall rules of the system +def reload_firewall_rules(rules_dir): + log.echo('Updating firewall rules', '', 'o') + rules_file = '{}/base.nft'.format(rules_dir) + retcode, stdout, stderr = run_os_command('/usr/sbin/nft -f {}'.format(rules_file)) + if retcode != 0: + log.echo('Failed to reload rules: {}'.format(stderr), '', 'e') diff --git a/router-daemon/pvcrd/dnsmasq-zookeeper-leases.py b/node-daemon/pvcd/dnsmasq-zookeeper-leases.py similarity index 93% rename from router-daemon/pvcrd/dnsmasq-zookeeper-leases.py rename to node-daemon/pvcd/dnsmasq-zookeeper-leases.py index e6ce5b6a..123e4a3e 100755 --- 
a/router-daemon/pvcrd/dnsmasq-zookeeper-leases.py +++ b/node-daemon/pvcd/dnsmasq-zookeeper-leases.py @@ -41,19 +41,19 @@ def get_client_id(): def connect_zookeeper(): # We expect the environ to contain the config file try: - pvcrd_config_file = os.environ['PVCRD_CONFIG_FILE'] + pvcd_config_file = os.environ['PVCD_CONFIG_FILE'] except: # Default place - pvcrd_config_file = '/etc/pvc/pvcrd.conf' + pvcd_config_file = '/etc/pvc/pvcd.conf' o_config = configparser.ConfigParser() - o_config.read(pvcrd_config_file) + o_config.read(pvcd_config_file) try: - zk_host = o_config['default']['zookeeper'] + zk_host = o_config['default']['coordinators'] except: try: - zk_host = o_config[socket.gethostname()]['zookeeper'] + zk_host = o_config[socket.gethostname()]['coordinators'] except: exit(1) diff --git a/node-daemon/pvcd/log.py b/node-daemon/pvcd/log.py new file mode 100644 index 00000000..7ba930f7 --- /dev/null +++ b/node-daemon/pvcd/log.py @@ -0,0 +1,100 @@ +#!/usr/bin/env python3 + +# log.py - Output (stdout + logfile) functions +# Part of the Parallel Virtual Cluster (PVC) system +# +# Copyright (C) 2018 Joshua M. Boniface +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . 
+# +############################################################################### + +import datetime + +class Logger(object): + # Define a logger class for a daemon instance + # Keeps record of where to log, and is passed messages which are + # formatted in various ways based off secondary characteristics. + + # ANSII colours for output + fmt_red = '\033[91m' + fmt_blue = '\033[94m' + fmt_cyan = '\033[96m' + fmt_green = '\033[92m' + fmt_yellow = '\033[93m' + fmt_purple = '\033[95m' + fmt_bold = '\033[1m' + fmt_end = '\033[0m' + + # Initialization of instance + def __init__(self, config): + self.config = config + if self.config['file_logging'] == 'True': + self.logfile = self.config['log_directory'] + '/pvc.log' + # We open the logfile for the duration of our session, but have a hup function + self.writer = open(self.logfile, 'a', buffering=1) + self.last_colour = self.fmt_cyan + + # Provide a hup function to close and reopen the writer + def hup(self): + self.writer.close() + self.writer = open(self.logfile, 'a', buffering=0) + + # Output function + def out(self, message, state='', prefix=''): + + # Get the date + date = '{} - '.format(datetime.datetime.now().strftime('%Y/%m/%d %H:%M:%S.%f')) + endc = Logger.fmt_end + + # Determine the formatting + # OK + if state == 'o': + colour = Logger.fmt_green + prompt = '>>> ' + # Error + elif state == 'e': + colour = Logger.fmt_red + prompt = '>>> ' + # Warning + elif state == 'w': + colour = Logger.fmt_yellow + prompt = '>>> ' + # Tick + elif state == 't': + colour = Logger.fmt_purple + prompt = '>>> ' + # Information + elif state == 'i': + colour = Logger.fmt_blue + prompt = '>>> ' + # Startup + elif state == 's': + colour = Logger.fmt_cyan + prompt = '>>> ' + # Continuation + else: + date = '' + colour = self.last_colour + prompt = '>>> ' + + # Append space to prefix + if prefix != '': + prefix = prefix + ' - ' + + message = colour + prompt + endc + date + prefix + message + print(message) + if 
self.config['file_logging'] == 'True': + self.writer.write(message + '\n') + self.last_colour = colour diff --git a/node-daemon/pvcd/zkhandler.py b/node-daemon/pvcd/zkhandler.py new file mode 100644 index 00000000..91db9b76 --- /dev/null +++ b/node-daemon/pvcd/zkhandler.py @@ -0,0 +1,81 @@ +#!/usr/bin/env python3 + +# zkhandler.py - Secure versioned ZooKeeper updates +# Part of the Parallel Virtual Cluster (PVC) system +# +# Copyright (C) 2018 Joshua M. Boniface +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . 
+# +############################################################################### + +import kazoo.client +import pvcd.log as log + +# Child list function +def listchildren(zk_conn, key): + children = zk_conn.get_children(key) + return children + +# Key deletion function +def delete(zk_conn, key): + zk_conn.delete(key, recursive=True) + +# Data read function +def readdata(zk_conn, key): + data_raw = zk_conn.get(key) + data = data_raw[0].decode('ascii') + meta = data_raw[1] + return data + +# Data write function +def writedata(zk_conn, kv): + # Start up a transaction + zk_transaction = zk_conn.transaction() + + # Proceed one KV pair at a time + for key in sorted(kv): + data = kv[key] + if not data: + data = '' + + # Check if this key already exists or not + if not zk_conn.exists(key): + # We're creating a new key + zk_transaction.create(key, data.encode('ascii')) + else: + # We're updating a key with version validation + orig_data = zk_conn.get(key) + version = orig_data[1].version + + # Set what we expect the new version to be + new_version = version + 1 + + # Update the data + zk_transaction.set_data(key, data.encode('ascii')) + + # Set up the check + try: + zk_transaction.check(key, new_version) + except TypeError: + print('Zookeeper key "{}" does not match expected version'.format(key)) + return False + + # Commit the transaction + try: + zk_transaction.commit() + return True + except Exception: + return False + diff --git a/router-daemon/pvcrd.conf.sample b/router-daemon/pvcrd.conf.sample deleted file mode 100644 index 41475914..00000000 --- a/router-daemon/pvcrd.conf.sample +++ /dev/null @@ -1,33 +0,0 @@ -# pvcrd cluster configuration file example -# -# This configuration file specifies details for this node in PVC. Multiple host -# blocks can be added but only the one matching the current system hostname will -# be used by the local daemon. Default values apply to all hosts for any value -# not specifically overridden. 
-# -# The following values are required for each host or in a default section: -# zookeeper: the IP+port of the Zookeper instance (defaults to 127.0.0.1:2181) -# keepalive_interval: the interval between keepalives and for dead node timeout (defaults to 5) -# fence_intervals: the number of keepalive_intervals without Zookeeper contact before this node -# will consider another node dead and fence it (defaults to 6, i.e. 30s) -# vni_dev: the lower-level network device to bind VNI to -# vni_dev_ip: the IP address (CIDR) of the lower-level network device, used -# by FRR to communicate with the route reflectors and pass routes -# for VNI interfaces -# ipmi_hostname: the IPMI hostname for fencing (defaults to -lom.) -# ipmi_username: username to connect to IPMI -# ipmi_password: password to connect to IPMI -# -# Copy this example to /etc/pvc/pvcrd.conf and edit to your needs - -[default] -zookeeper = 127.0.0.1:2181 -keepalive_interval = 5 -fence_intervals = 6 - -[myhost] -vni_dev = ens4 -vni_dev_ip = 10.255.0.1/24 -ipmi_hostname = myhost-lom -ipmi_username = username -ipmi_password = password diff --git a/router-daemon/pvcrd.service b/router-daemon/pvcrd.service deleted file mode 100644 index 1cd3f408..00000000 --- a/router-daemon/pvcrd.service +++ /dev/null @@ -1,16 +0,0 @@ -# Parallel Virtual Cluster router daemon unit file -[Unit] -Description = Parallel Virtual Cluster router daemon -After = network-online.target frr.service - -[Service] -Type = simple -WorkingDirectory = /usr/share/pvc -Environment = PYTHONUNBUFFERED=true -Environment = PVCRD_CONFIG_FILE=/etc/pvc/pvcrd.conf -ExecStart = /usr/share/pvc/pvcrd.py -KillSignal = SIGINT -Restart = on-failure - -[Install] -WantedBy = multi-user.target diff --git a/router-daemon/pvcrd/Daemon.py b/router-daemon/pvcrd/Daemon.py deleted file mode 100644 index 7030eb2f..00000000 --- a/router-daemon/pvcrd/Daemon.py +++ /dev/null @@ -1,348 +0,0 @@ -#!/usr/bin/env python3 - -# Daemon.py - PVC hypervisor router daemon -# Part of the 
Parallel Virtual Cluster (PVC) system -# -# Copyright (C) 2018 Joshua M. Boniface -# -# This program is free software: you can redistribute it and/or modify -# it under the terms of the GNU General Public License as published by -# the Free Software Foundation, either version 3 of the License, or -# (at your option) any later version. -# -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. -# -# You should have received a copy of the GNU General Public License -# along with this program. If not, see . -# -############################################################################### - -import kazoo.client -import sys -import os -import socket -import psutil -import subprocess -import time -import configparser -import signal -import atexit -import apscheduler.schedulers.background - -import daemon_lib.ansiiprint as ansiiprint -import daemon_lib.zkhandler as zkhandler -import daemon_lib.common as common - -import pvcrd.RouterInstance as RouterInstance -import pvcrd.VXNetworkInstance as VXNetworkInstance - -print(ansiiprint.bold() + "pvcrd - Parallel Virtual Cluster router daemon" + ansiiprint.end()) - -# Set sysctl to enable routing before we do anything else -common.run_os_command('sysctl net.ipv4.ip_forward=1') -common.run_os_command('sysctl net.ipv4.conf.all.send_redirects=1') -common.run_os_command('sysctl net.ipv4.conf.all.rp_filter=0') -common.run_os_command('sysctl net.ipv4.conf.default.rp_filter=0') -common.run_os_command('sysctl net.ipv4.conf.all.accept_source_route=1') -common.run_os_command('sysctl net.ipv4.conf.all.accept_source_route=1') -common.run_os_command('sysctl net.ipv6.ip_forward=1') -common.run_os_command('sysctl net.ipv6.conf.all.rp_filter=0') -common.run_os_command('sysctl net.ipv6.conf.default.rp_filter=0') -common.run_os_command('sysctl 
net.ipv6.conf.all.send_redirects=1') -common.run_os_command('sysctl net.ipv6.conf.all.accept_source_route=1') - -# Get the config file variable from the environment -try: - pvcrd_config_file = os.environ['PVCRD_CONFIG_FILE'] -except: - print('ERROR: The "PVCRD_CONFIG_FILE" environment variable must be set before starting pvcrd.') - exit(1) - -myhostname = socket.gethostname() -myshorthostname = myhostname.split('.', 1)[0] -mynetworkname = ''.join(myhostname.split('.', 1)[1:]) - -# Config values dictionary -config_values = [ - 'zookeeper', - 'keepalive_interval', - 'keepalive_interval', - 'fence_intervals', - 'vni_dev', - 'vni_dev_ip', - 'ipmi_hostname', - 'ipmi_username', - 'ipmi_password' -] -def readConfig(pvcrd_config_file, myhostname): - print('Loading configuration from file {}'.format(pvcrd_config_file)) - - o_config = configparser.ConfigParser() - o_config.read(pvcrd_config_file) - config = {} - config['pvcrd_config_file'] = pvcrd_config_file - - try: - entries = o_config[myhostname] - except: - try: - entries = o_config['default'] - except Exception as e: - print('ERROR: Config file is not valid!') - exit(1) - - for entry in config_values: - try: - config[entry] = entries[entry] - except: - try: - config[entry] = o_config['default'][entry] - except: - print('ERROR: Config file missing required value "{}" for this host!'.format(entry)) - exit(1) - - # Handle an empty ipmi_hostname - if config['ipmi_hostname'] == '': - config['ipmi_hostname'] = myshorthostname + '-lom.' 
+ mynetworkname - - return config - -# Get config -config = readConfig(pvcrd_config_file, myhostname) - -# Add some static config elements -config['nftables_rules_dir'] = '/var/lib/pvc/nftables' -config['dnsmasq_hosts_dir'] = '/var/lib/pvc/dnsmasq' - -# Set up our VNI interface -vni_dev = config['vni_dev'] -vni_dev_ip = config['vni_dev_ip'] -print('Setting up VNI interface {} with IP {}'.format(vni_dev, vni_dev_ip)) -common.run_os_command('ip link set {} up'.format(vni_dev)) -common.run_os_command('ip address add {} dev {}'.format(vni_dev_ip, vni_dev)) - -# Connect to local zookeeper -zk_conn = kazoo.client.KazooClient(hosts=config['zookeeper']) -try: - print('Connecting to Zookeeper instance at {}'.format(config['zookeeper'])) - zk_conn.start() -except: - print('ERROR: Failed to connect to Zookeeper') - exit(1) - -# Handle zookeeper failures -def zk_listener(state): - global zk_conn, update_timer - if state == kazoo.client.KazooState.SUSPENDED: - ansiiprint.echo('Connection to Zookeeper lost; retrying', '', 'e') - - # Stop keepalive thread - stopKeepaliveTimer(update_timer) - - while True: - _zk_conn = kazoo.client.KazooClient(hosts=config['zookeeper']) - try: - _zk_conn.start() - zk_conn = _zk_conn - break - except: - time.sleep(1) - elif state == kazoo.client.KazooState.CONNECTED: - ansiiprint.echo('Connection to Zookeeper started', '', 'o') - - # Start keepalive thread - update_timer = createKeepaliveTimer() - else: - pass - -zk_conn.add_listener(zk_listener) - -# Cleanup function -def cleanup(): - ansiiprint.echo('Cleaning up', '', 'e') - - # Stop keepalive thread - stopKeepaliveTimer(update_timer) - - # Set stop state in Zookeeper - zkhandler.writedata(zk_conn, {'/routers/{}/daemonstate'.format(myhostname): 'stop'}) - if this_router.name == this_router.primary_router: - zkhandler.writedata(zk_conn, {'/routers': 'none'}) - - # Wait for everything to flush - time.sleep(3) - - # Close the Zookeeper connection - try: - zk_conn.stop() - zk_conn.close() - except: - 
pass - - ansiiprint.echo('Terminating daemon', '', 'e') - -atexit.register(cleanup) - -# Gather useful data about our host for staticdata -# Static data format: 'cpu_count', 'arch', 'os', 'kernel' -staticdata = [] -staticdata.append(str(psutil.cpu_count())) -staticdata.append(subprocess.run(['uname', '-r'], stdout=subprocess.PIPE).stdout.decode('ascii').strip()) -staticdata.append(subprocess.run(['uname', '-o'], stdout=subprocess.PIPE).stdout.decode('ascii').strip()) -staticdata.append(subprocess.run(['uname', '-m'], stdout=subprocess.PIPE).stdout.decode('ascii').strip()) -# Print static data on start - -print('{0}Router hostname:{1} {2}'.format(ansiiprint.bold(), ansiiprint.end(), myhostname)) -print('{0}IPMI hostname:{1} {2}'.format(ansiiprint.bold(), ansiiprint.end(), config['ipmi_hostname'])) -print('{0}Machine details:{1}'.format(ansiiprint.bold(), ansiiprint.end())) -print(' {0}CPUs:{1} {2}'.format(ansiiprint.bold(), ansiiprint.end(), staticdata[0])) -print(' {0}Arch:{1} {2}'.format(ansiiprint.bold(), ansiiprint.end(), staticdata[3])) -print(' {0}OS:{1} {2}'.format(ansiiprint.bold(), ansiiprint.end(), staticdata[2])) -print(' {0}Kernel:{1} {2}'.format(ansiiprint.bold(), ansiiprint.end(), staticdata[1])) - -# Check if our router exists in Zookeeper, and create it if not -if zk_conn.exists('/routers/{}'.format(myhostname)): - print("Router is " + ansiiprint.green() + "present" + ansiiprint.end() + " in Zookeeper") - # Update static data just in case it's changed - zkhandler.writedata(zk_conn, { '/routers/{}/staticdata'.format(myhostname): ' '.join(staticdata) }) -else: - print("Router is " + ansiiprint.red() + "absent" + ansiiprint.end() + " in Zookeeper; adding new router") - keepalive_time = int(time.time()) - transaction = zk_conn.transaction() - transaction.create('/routers/{}'.format(myhostname), 'hypervisor'.encode('ascii')) - # Basic state information - transaction.create('/routers/{}/daemonstate'.format(myhostname), 'stop'.encode('ascii')) - 
transaction.create('/routers/{}/networkstate'.format(myhostname), 'secondary'.encode('ascii')) - transaction.create('/routers/{}/staticdata'.format(myhostname), ' '.join(staticdata).encode('ascii')) - transaction.create('/routers/{}/cpuload'.format(myhostname), '0'.encode('ascii')) - # Keepalives and fencing information - transaction.create('/routers/{}/keepalive'.format(myhostname), str(keepalive_time).encode('ascii')) - transaction.create('/routers/{}/ipmihostname'.format(myhostname), config['ipmi_hostname'].encode('ascii')) - transaction.create('/routers/{}/ipmiusername'.format(myhostname), config['ipmi_username'].encode('ascii')) - transaction.create('/routers/{}/ipmipassword'.format(myhostname), config['ipmi_password'].encode('ascii')) - transaction.commit() - -# Check that the primary key exists, and create it with us as master if not -current_primary = zkhandler.readdata(zk_conn, '/routers') -if current_primary: - print('Current primary router is {}"{}"{}.'.format(ansiiprint.blue(), current_primary, ansiiprint.end())) -else: - print('No primary router key found; creating with us as primary.') - zkhandler.writedata(zk_conn, { '/routers': myhostname }) - -zkhandler.writedata(zk_conn, { '/routers/{}/daemonstate'.format(myhostname): 'init' }) - -t_router = dict() -s_network = dict() -router_list = [] -network_list = [] - -# Create our config dirs -common.run_os_command( - '/bin/mkdir --parents {}/networks'.format( - config['nftables_rules_dir'] - ) -) -common.run_os_command( - '/bin/mkdir --parents {}/static'.format( - config['nftables_rules_dir'] - ) -) -common.run_os_command( - '/bin/mkdir --parents {}'.format( - config['dnsmasq_hosts_dir'] - ) -) - -# Set up the basic features of the nftables firewall -nftables_base_rules = """# Base rules -flush ruleset -# Add the filter table and chains -add table inet filter -add chain inet filter forward {{ type filter hook forward priority 0; }} -add chain inet filter input {{ type filter hook input priority 0; }} -# 
Include static rules and network rules -include "{rulesdir}/static/*" -include "{rulesdir}/networks/*" -""".format( - rulesdir=config['nftables_rules_dir'] -) - -# Write the basic firewall config -print(nftables_base_rules) -nftables_base_filename = '{}/base.nft'.format(config['nftables_rules_dir']) -nftables_update_filename = '{}/update'.format(config['nftables_rules_dir']) -with open(nftables_base_filename, 'w') as nfbasefile: - nfbasefile.write(nftables_base_rules) - open(nftables_update_filename, 'a').close() - -# -# Router instances -# -@zk_conn.ChildrenWatch('/routers') -def updaterouters(new_router_list): - global router_list - router_list = new_router_list - print(ansiiprint.blue() + 'Router list: ' + ansiiprint.end() + '{}'.format(' '.join(router_list))) - for router in router_list: - if router in t_router: - t_router[router].updaterouterlist(t_router) - else: - t_router[router] = RouterInstance.RouterInstance(myhostname, router, t_router, s_network, zk_conn, config) - -# Set up our update function -this_router = t_router[myhostname] -update_zookeeper = this_router.update_zookeeper - -# -# Network instances -# -@zk_conn.ChildrenWatch('/networks') -def updatenetworks(new_network_list): - global network_list - for network in new_network_list: - if not network in s_network: - s_network[network] = VXNetworkInstance.VXNetworkInstance(network, zk_conn, config, t_router[myhostname]) - if this_router.network_state == 'primary': - s_network[network].createGatewayAddress() - s_network[network].startDHCPServer() - for network in network_list: - if not network in new_network_list: - if this_router.network_state == 'primary': - s_network[network].stopDHCPServer() - s_network[network].removeGatewayAddress() - s_network[network].removeFirewall() - s_network[network].removeNetwork() - del(s_network[network]) - network_list = new_network_list - for router in router_list: - if router in t_router: - t_router[router].updatenetworklist(s_network) - print(ansiiprint.blue() + 
'Network list: ' + ansiiprint.end() + '{}'.format(' '.join(network_list))) - -# Create timer to update this router in Zookeeper -def createKeepaliveTimer(): - interval = int(config['keepalive_interval']) - ansiiprint.echo('Starting keepalive timer ({} second interval)'.format(interval), '', 'o') - update_timer = apscheduler.schedulers.background.BackgroundScheduler() - update_timer.add_job(update_zookeeper, 'interval', seconds=interval) - update_timer.start() - return update_timer - -def stopKeepaliveTimer(update_timer): - ansiiprint.echo('Stopping keepalive timer', '', 'c') - update_timer.shutdown() - -# Start keepalive thread -update_timer = createKeepaliveTimer() -update_zookeeper() - -# Tick loop -while True: - try: - time.sleep(0.5) - except: - break diff --git a/router-daemon/pvcrd/RouterInstance.py b/router-daemon/pvcrd/RouterInstance.py deleted file mode 100644 index 7426eb19..00000000 --- a/router-daemon/pvcrd/RouterInstance.py +++ /dev/null @@ -1,291 +0,0 @@ -#!/usr/bin/env python3 - -# RouterInstance.py - Class implementing a PVC router and run by pvcrd -# Part of the Parallel Virtual Cluster (PVC) system -# -# Copyright (C) 2018 Joshua M. Boniface -# -# This program is free software: you can redistribute it and/or modify -# it under the terms of the GNU General Public License as published by -# the Free Software Foundation, either version 3 of the License, or -# (at your option) any later version. -# -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. -# -# You should have received a copy of the GNU General Public License -# along with this program. If not, see . 
-# -############################################################################### - -import os -import sys -import psutil -import socket -import time -import threading -import subprocess - -import daemon_lib.ansiiprint as ansiiprint -import daemon_lib.zkhandler as zkhandler -import daemon_lib.common as common - -class RouterInstance(): - # Initialization function - def __init__(self, this_router, name, t_router, s_network, zk_conn, config): - # Passed-in variables on creation - self.zk_conn = zk_conn - self.config = config - self.this_router = this_router - self.name = name - self.primary_router = None - self.daemon_state = 'stop' - self.network_state = 'secondary' - self.t_router = t_router - self.primary_router_list = [] - self.secondary_router_list = [] - self.inactive_router_list = [] - self.s_network = s_network - self.network_list = [] - self.ipmi_hostname = self.config['ipmi_hostname'] - - # Zookeeper handlers for changed states - @zk_conn.DataWatch('/routers/{}/daemonstate'.format(self.name)) - def watch_router_daemonstate(data, stat, event=''): - try: - data = data.decode('ascii') - except AttributeError: - data = 'stop' - - if data != self.daemon_state: - self.daemon_state = data - - @zk_conn.DataWatch('/routers/{}/networkstate'.format(self.name)) - def watch_router_networkstate(data, stat, event=''): - try: - data = data.decode('ascii') - except AttributeError: - data = 'secondary' - - if data != self.network_state: - self.network_state = data - if self.name == self.this_router: - if self.network_state == 'primary': - self.become_primary() - else: - self.become_secondary() - - @zk_conn.DataWatch('/routers') - def watch_primary_router(data, stat, event=''): - try: - data = data.decode('ascii') - except AttributeError: - data = 'none' - - # toggle state management of this router - if data != self.primary_router: - if data == 'none': - if self.name == self.this_router: - if self.daemon_state == 'run' and self.network_state != 'primary': - # Contend for 
primary - ansiiprint.echo('Contending for primary', '', 'i') - zkhandler.writedata(self.zk_conn, { - '/routers': self.name - }) - elif data == self.this_router: - if self.name == self.this_router: - zkhandler.writedata(self.zk_conn, { - '/routers/{}/networkstate'.format(self.name): 'primary', - }) - self.primary_router = data - else: - if self.name == self.this_router: - zkhandler.writedata(self.zk_conn, { - '/routers/{}/networkstate'.format(self.name): 'secondary', - }) - self.primary_router = data - - # Get value functions - def getname(self): - return self.name - - def getdaemonstate(self): - return self.daemon_state - - def getnetworkstate(self): - return self.network_state - - def getnetworklist(self): - return self.network_list - - # Update value functions - def updaterouterlist(self, t_router): - self.t_router = t_router - - def updatenetworklist(self, s_network): - self.s_network = s_network - network_list = [] - for network in s_network: - network_list.append(s_network[network].getvni()) - self.network_list = network_list - - def become_secondary(self): - ansiiprint.echo('Setting router {} to secondary state'.format(self.name), '', 'i') - ansiiprint.echo('Network list: {}'.format(', '.join(self.network_list)), '', 'c') - time.sleep(0.5) - for network in self.s_network: - self.s_network[network].stopDHCPServer() - self.s_network[network].removeGatewayAddress() - - def become_primary(self): - ansiiprint.echo('Setting router {} to primary state.'.format(self.name), '', 'i') - ansiiprint.echo('Network list: {}'.format(', '.join(self.network_list)), '', 'c') - for network in self.s_network: - self.s_network[network].createGatewayAddress() - self.s_network[network].startDHCPServer() - - def update_zookeeper(self): - # Get past state and update if needed - past_state = zkhandler.readdata(self.zk_conn, '/routers/{}/daemonstate'.format(self.name)) - if past_state != 'run': - self.daemon_state = 'run' - zkhandler.writedata(self.zk_conn, { 
'/routers/{}/daemonstate'.format(self.name): 'run' }) - else: - self.daemon_state = 'run' - - # Ensure the master key is properly set at a keepalive - if self.name == self.this_router: - if self.network_state == 'primary': - if zkhandler.readdata(self.zk_conn, '/routers') == 'none': - zkhandler.writedata(self.zk_conn, {'/routers': self.name}) - - # Set our information in zookeeper - cpuload = os.getloadavg()[0] - keepalive_time = int(time.time()) - try: - zkhandler.writedata(self.zk_conn, { - '/routers/{}/keepalive'.format(self.name): str(keepalive_time), - '/routers/{}/cpuload'.format(self.name): str(cpuload), - }) - except: - ansiiprint.echo('Failed to set keepalive data', '', 'e') - return - - # Display router information to the terminal - ansiiprint.echo('{}{} keepalive{}'.format(ansiiprint.purple(), self.name, ansiiprint.end()), '', 't') - ansiiprint.echo('{0}Networks count:{1} {2} {0}Load average:{1} {3}'.format(ansiiprint.bold(), ansiiprint.end(), len(self.network_list), cpuload), '', 'c') - - # Update our local router lists - for router_name in self.t_router: - try: - router_daemon_state = zkhandler.readdata(self.zk_conn, '/routers/{}/daemonstate'.format(router_name)) - router_network_state = zkhandler.readdata(self.zk_conn, '/routers/{}/networkstate'.format(router_name)) - router_keepalive = int(zkhandler.readdata(self.zk_conn, '/routers/{}/keepalive'.format(router_name))) - except: - router_daemon_state = 'unknown' - router_network_state = 'unknown' - router_keepalive = 0 - - # Handle deadtime and fencng if needed - # (A router is considered dead when its keepalive timer is >6*keepalive_interval seconds - # out-of-date while in 'start' state) - router_deadtime = int(time.time()) - ( int(self.config['keepalive_interval']) * int(self.config['fence_intervals']) ) - if router_keepalive < router_deadtime and router_daemon_state == 'run': - ansiiprint.echo('Router {} seems dead - starting monitor for fencing'.format(router_name), '', 'w') - 
zkhandler.writedata(self.zk_conn, { '/routers/{}/daemonstate'.format(router_name): 'dead' }) - fence_thread = threading.Thread(target=fenceRouter, args=(router_name, self.zk_conn, self.config), kwargs={}) - fence_thread.start() - - # Update the arrays - if router_daemon_state == 'run' and router_network_state == 'primary' and router_name not in self.primary_router_list: - self.primary_router_list.append(router_name) - try: - self.secondary_router_list.remove(router_name) - except ValueError: - pass - try: - self.inactive_router_list.remove(router_name) - except ValueError: - pass - if router_daemon_state == 'run' and router_network_state == 'secondary' and router_name not in self.secondary_router_list: - self.secondary_router_list.append(router_name) - try: - self.primary_router_list.remove(router_name) - except ValueError: - pass - try: - self.inactive_router_list.remove(router_name) - except ValueError: - pass - if router_daemon_state != 'run' and router_name not in self.inactive_router_list: - self.inactive_router_list.append(router_name) - try: - self.primary_router_list.remove(router_name) - except ValueError: - pass - try: - self.secondary_router_list.remove(router_name) - except ValueError: - pass - - # Display cluster information to the terminal - ansiiprint.echo('{}Cluster status{}'.format(ansiiprint.purple(), ansiiprint.end()), '', 't') - ansiiprint.echo('{}Primary router:{} {}'.format(ansiiprint.bold(), ansiiprint.end(), ' '.join(self.primary_router_list)), '', 'c') - ansiiprint.echo('{}Secondary router:{} {}'.format(ansiiprint.bold(), ansiiprint.end(), ' '.join(self.secondary_router_list)), '', 'c') - ansiiprint.echo('{}Inactive routers:{} {}'.format(ansiiprint.bold(), ansiiprint.end(), ' '.join(self.inactive_router_list)), '', 'c') - - # Reload firewall rules if needed - if os.path.isfile('{}/update'.format(self.config['nftables_rules_dir'])): - common.reload_firewall_rules(self.config['nftables_rules_dir']) - 
os.remove('{}/update'.format(self.config['nftables_rules_dir'])) - -# -# Fence thread entry function -# -def fenceRouter(router_name, zk_conn, config): - failcount = 0 - # We allow exactly 3 saving throws for the host to come back online - while failcount < 3: - # Wait 5 seconds - time.sleep(5) - # Get the state - router_daemon_state = zkhandler.readdata(zk_conn, '/routers/{}/daemonstate'.format(router_name)) - # Is it still 'dead' - if router_daemon_state == 'dead': - failcount += 1 - ansiiprint.echo('Router "{}" failed {} saving throws'.format(router_name, failcount), '', 'w') - # It changed back to something else so it must be alive - else: - ansiiprint.echo('Router "{}" passed a saving throw; canceling fence'.format(router_name), '', 'o') - return - - ansiiprint.echo('Fencing router "{}" via IPMI reboot signal'.format(router_name), '', 'e') - - # Get IPMI information - ipmi_hostname = zkhandler.readdata(zk_conn, '/routers/{}/ipmihostname'.format(router_name)) - ipmi_username = zkhandler.readdata(zk_conn, '/routers/{}/ipmiusername'.format(router_name)) - ipmi_password = zkhandler.readdata(zk_conn, '/routers/{}/ipmipassword'.format(router_name)) - - # Shoot it in the head - fence_status = rebootViaIPMI(ipmi_hostname, ipmi_username, ipmi_password) - # Hold to ensure the fence takes effect - time.sleep(3) - - # Set router in secondary state - zkhandler.writedata(zk_conn, { '/routers/{}/networkstate'.format(router_name): 'secondary' }) - -# -# Perform an IPMI fence -# -def rebootViaIPMI(ipmi_hostname, ipmi_user, ipmi_password): - retcode = common.run_os_command('ipmitool -I lanplus -H {} -U {} -P {} chassis power reset'.format( - ipmi_hostname, ipmi_user, ipmi_password - )) - if retcode == 0: - ansiiprint.echo('Successfully rebooted dead router', '', 'o') - return True - else: - ansiiprint.echo('Failed to reboot dead router', '', 'e') - return False diff --git a/router-daemon/pvcrd/__init__.py b/router-daemon/pvcrd/__init__.py deleted file mode 100644 index 
e69de29b..00000000 diff --git a/virtualization-daemon/pvcvd.conf.sample b/virtualization-daemon/pvcvd.conf.sample deleted file mode 100644 index c55930b6..00000000 --- a/virtualization-daemon/pvcvd.conf.sample +++ /dev/null @@ -1,43 +0,0 @@ -# pvcvd cluster configuration file example -# -# This configuration file specifies details for this node in PVC. Multiple host -# blocks can be added but only the one matching the current system hostname will -# be used by the local daemon. Default values apply to all hosts for any value -# not specifically overridden. -# -# The following values are required for each host or in a default section: -# zookeeper: the IP+port of the Zookeper instance (defaults to 127.0.0.1:2181) -# keepalive_interval: the interval between keepalives and for dead node timeout (defaults to 5) -# fence_intervals: the number of keepalive_intervals without Zookeeper contact before this node -# will consider another node dead and fence it (defaults to 6, i.e. 30s) -# suicide_intervals: the number of keepalive_intervals without Zookeeper contact before this -# node will consider itself failed and terminate all running VMs (defaults -# to 0, i.e. disabled); should be less than "fence_intervals" -# successful_fence: the action to take on a successful fencing operation; can be "none" or -# "migrate" (defaults to "migrate") -# failed_fence: the action to take on a failed fencing operation; can be "none" or "migrate" -# (defaults to "none"); "migrate" requires "suicide_intervals" to be set) -# NOTE: POTENTIALLY DANGEROUS - see README for details -# migration_target_selector: the method to use to select target hypervisor nodes during a -# flush action; can be "mem", "load", "vcpus", or "vms" (defaults -# to "mem"); the best choice based on this field is selected for -# each VM to be migrated -# ipmi_hostname: the IPMI hostname for fencing (defaults to -lom.) 
-# ipmi_username: username to connect to IPMI -# ipmi_password: password to connect to IPMI -# -# Copy this example to /etc/pvc/pvcd.conf and edit to your needs - -[default] -zookeeper = 127.0.0.1:2181 -keepalive_interval = 5 -fence_intervals = 6 -suicide_intervals = 0 -successful_fence = migrate -failed_fence = none -migration_target_selector = mem - -[myhost] -ipmi_username = admin -ipmi_password = admin -ipmi_hostname = myhost-lom diff --git a/virtualization-daemon/pvcvd.py b/virtualization-daemon/pvcvd.py deleted file mode 100755 index 27d6d7dc..00000000 --- a/virtualization-daemon/pvcvd.py +++ /dev/null @@ -1,23 +0,0 @@ -#!/usr/bin/env python3 - -# pvcvd.py - Virtualization daemon startup stub -# Part of the Parallel Virtual Cluster (PVC) system -# -# Copyright (C) 2018 Joshua M. Boniface -# -# This program is free software: you can redistribute it and/or modify -# it under the terms of the GNU General Public License as published by -# the Free Software Foundation, either version 3 of the License, or -# (at your option) any later version. -# -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. -# -# You should have received a copy of the GNU General Public License -# along with this program. If not, see . -# -############################################################################### - -import pvcvd.Daemon diff --git a/virtualization-daemon/pvcvd/Daemon.py b/virtualization-daemon/pvcvd/Daemon.py deleted file mode 100644 index 991593c5..00000000 --- a/virtualization-daemon/pvcvd/Daemon.py +++ /dev/null @@ -1,273 +0,0 @@ -#!/usr/bin/env python3 - -# Daemon.py - PVC hypervisor virtualization daemon -# Part of the Parallel Virtual Cluster (PVC) system -# -# Copyright (C) 2018 Joshua M. 
Boniface -# -# This program is free software: you can redistribute it and/or modify -# it under the terms of the GNU General Public License as published by -# the Free Software Foundation, either version 3 of the License, or -# (at your option) any later version. -# -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. -# -# You should have received a copy of the GNU General Public License -# along with this program. If not, see . -# -############################################################################### - -import kazoo.client -import libvirt -import sys -import os -import signal -import socket -import psutil -import subprocess -import uuid -import time -import configparser -import apscheduler.schedulers.background - -import daemon_lib.ansiiprint as ansiiprint -import daemon_lib.zkhandler as zkhandler - -import pvcvd.VMInstance as VMInstance -import pvcvd.NodeInstance as NodeInstance - -print(ansiiprint.bold() + "pvcvd - Parallel Virtual Cluster virtualization daemon" + ansiiprint.end()) - -# Get the config file variable from the environment -try: - pvcvd_config_file = os.environ['PVCVD_CONFIG_FILE'] -except: - print('ERROR: The "PVCVD_CONFIG_FILE" environment variable must be set before starting pvcvd.') - exit(1) - -myhostname = socket.gethostname() -myshorthostname = myhostname.split('.', 1)[0] -mydomainname = ''.join(myhostname.split('.', 1)[1:]) - -# Config values dictionary -config_values = [ - 'zookeeper', - 'keepalive_interval', - 'fence_intervals', - 'suicide_intervals', - 'successful_fence', - 'failed_fence', - 'migration_target_selector', - 'ipmi_hostname', - 'ipmi_username', - 'ipmi_password' -] -def readConfig(pvcvd_config_file, myhostname): - print('Loading configuration from file {}'.format(pvcvd_config_file)) - - o_config = configparser.ConfigParser() - 
o_config.read(pvcvd_config_file) - config = {} - - try: - entries = o_config[myhostname] - except: - try: - entries = o_config['default'] - except Exception as e: - print('ERROR: Config file is not valid!') - exit(1) - - for entry in config_values: - try: - config[entry] = entries[entry] - except: - try: - config[entry] = o_config['default'][entry] - except: - print('ERROR: Config file missing required value "{}" for this host!'.format(entry)) - exit(1) - - # Handle an empty ipmi_hostname - if config['ipmi_hostname'] == '': - config['ipmi_hostname'] = myshorthostname + '-lom.' + mydomainname - - return config - -# Get config -config = readConfig(pvcvd_config_file, myhostname) - -# Check that libvirtd is listening TCP -libvirt_check_name = "qemu+tcp://127.0.0.1:16509/system" -try: - print('Connecting to Libvirt instance at {}'.format(libvirt_check_name)) - lv_conn = libvirt.open(libvirt_check_name) - if lv_conn == None: - raise -except: - print('ERROR: Failed to open local libvirt connection via TCP; required for PVC!') - exit(1) -lv_conn.close() - -# Connect to local zookeeper -zk_conn = kazoo.client.KazooClient(hosts=config['zookeeper']) -try: - print('Connecting to Zookeeper instance at {}'.format(config['zookeeper'])) - zk_conn.start() -except: - print('ERROR: Failed to connect to Zookeeper') - exit(1) - -# Handle zookeeper failures -def zk_listener(state): - global zk_conn, update_timer - if state == kazoo.client.KazooState.SUSPENDED: - ansiiprint.echo('Connection to Zookeeper lost; retrying', '', 'e') - - # Stop keepalive thread - stopKeepaliveTimer(update_timer) - - while True: - _zk_conn = kazoo.client.KazooClient(hosts=config['zookeeper']) - try: - _zk_conn.start() - zk_conn = _zk_conn - break - except: - time.sleep(1) - elif state == kazoo.client.KazooState.CONNECTED: - ansiiprint.echo('Connection to Zookeeper started', '', 'o') - - # Start keepalive thread - update_timer = createKeepaliveTimer() - else: - pass - -zk_conn.add_listener(zk_listener) - -# 
Cleanup function -def cleanup(signum, frame): - ansiiprint.echo('Terminating daemon', '', 'e') - # Set stop state in Zookeeper - zkhandler.writedata(zk_conn, { '/nodes/{}/daemonstate'.format(myhostname): 'stop' }) - # Close the Zookeeper connection - try: - zk_conn.stop() - zk_conn.close() - except: - pass - # Stop keepalive thread - stopKeepaliveTimer(update_timer) - # Exit - sys.exit(0) - -# Handle signals gracefully -signal.signal(signal.SIGTERM, cleanup) -signal.signal(signal.SIGINT, cleanup) -signal.signal(signal.SIGQUIT, cleanup) - -# Gather useful data about our host for staticdata -# Static data format: 'cpu_count', 'arch', 'os', 'kernel' -staticdata = [] -staticdata.append(str(psutil.cpu_count())) -staticdata.append(subprocess.run(['uname', '-r'], stdout=subprocess.PIPE).stdout.decode('ascii').strip()) -staticdata.append(subprocess.run(['uname', '-o'], stdout=subprocess.PIPE).stdout.decode('ascii').strip()) -staticdata.append(subprocess.run(['uname', '-m'], stdout=subprocess.PIPE).stdout.decode('ascii').strip()) -# Print static data on start - -print('{0}Node hostname:{1} {2}'.format(ansiiprint.bold(), ansiiprint.end(), myhostname)) -print('{0}IPMI hostname:{1} {2}'.format(ansiiprint.bold(), ansiiprint.end(), config['ipmi_hostname'])) -print('{0}Machine details:{1}'.format(ansiiprint.bold(), ansiiprint.end())) -print(' {0}CPUs:{1} {2}'.format(ansiiprint.bold(), ansiiprint.end(), staticdata[0])) -print(' {0}Arch:{1} {2}'.format(ansiiprint.bold(), ansiiprint.end(), staticdata[3])) -print(' {0}OS:{1} {2}'.format(ansiiprint.bold(), ansiiprint.end(), staticdata[2])) -print(' {0}Kernel:{1} {2}'.format(ansiiprint.bold(), ansiiprint.end(), staticdata[1])) - -# Check if our node exists in Zookeeper, and create it if not -if zk_conn.exists('/nodes/{}'.format(myhostname)): - print("Node is " + ansiiprint.green() + "present" + ansiiprint.end() + " in Zookeeper") - # Update static data just in case it's changed - zkhandler.writedata(zk_conn, { 
'/nodes/{}/staticdata'.format(myhostname): ' '.join(staticdata) }) -else: - print("Node is " + ansiiprint.red() + "absent" + ansiiprint.end() + " in Zookeeper; adding new node") - keepalive_time = int(time.time()) - transaction = zk_conn.transaction() - transaction.create('/nodes/{}'.format(myhostname), 'hypervisor'.encode('ascii')) - # Basic state information - transaction.create('/nodes/{}/daemonstate'.format(myhostname), 'stop'.encode('ascii')) - transaction.create('/nodes/{}/domainstate'.format(myhostname), 'ready'.encode('ascii')) - transaction.create('/nodes/{}/staticdata'.format(myhostname), ' '.join(staticdata).encode('ascii')) - transaction.create('/nodes/{}/memfree'.format(myhostname), '0'.encode('ascii')) - transaction.create('/nodes/{}/memused'.format(myhostname), '0'.encode('ascii')) - transaction.create('/nodes/{}/memalloc'.format(myhostname), '0'.encode('ascii')) - transaction.create('/nodes/{}/vcpualloc'.format(myhostname), '0'.encode('ascii')) - transaction.create('/nodes/{}/cpuload'.format(myhostname), '0.0'.encode('ascii')) - transaction.create('/nodes/{}/runningdomains'.format(myhostname), ''.encode('ascii')) - transaction.create('/nodes/{}/domainscount'.format(myhostname), '0'.encode('ascii')) - # Keepalives and fencing information - transaction.create('/nodes/{}/keepalive'.format(myhostname), str(keepalive_time).encode('ascii')) - transaction.create('/nodes/{}/ipmihostname'.format(myhostname), config['ipmi_hostname'].encode('ascii')) - transaction.create('/nodes/{}/ipmiusername'.format(myhostname), config['ipmi_username'].encode('ascii')) - transaction.create('/nodes/{}/ipmipassword'.format(myhostname), config['ipmi_password'].encode('ascii')) - transaction.commit() - -zkhandler.writedata(zk_conn, { '/nodes/{}/daemonstate'.format(myhostname): 'init' }) - -t_node = dict() -s_domain = dict() -node_list = [] -domain_list = [] - -@zk_conn.ChildrenWatch('/nodes') -def updatenodes(new_node_list): - global node_list - node_list = new_node_list - 
print(ansiiprint.blue() + 'Node list: ' + ansiiprint.end() + '{}'.format(' '.join(node_list))) - for node in node_list: - if node in t_node: - t_node[node].updatenodelist(t_node) - else: - t_node[node] = NodeInstance.NodeInstance(myhostname, node, t_node, s_domain, zk_conn, config) - -@zk_conn.ChildrenWatch('/domains') -def updatedomains(new_domain_list): - global domain_list - domain_list = new_domain_list - print(ansiiprint.blue() + 'Domain list: ' + ansiiprint.end() + '{}'.format(' '.join(domain_list))) - for domain in domain_list: - if not domain in s_domain: - s_domain[domain] = VMInstance.VMInstance(domain, zk_conn, config, t_node[myhostname]); - for node in node_list: - if node in t_node: - t_node[node].updatedomainlist(s_domain) - -# Set up our update function -this_node = t_node[myhostname] -update_zookeeper = this_node.update_zookeeper - -# Create timer to update this node in Zookeeper -def createKeepaliveTimer(): - interval = int(config['keepalive_interval']) - ansiiprint.echo('Starting keepalive timer ({} second interval)'.format(interval), '', 'o') - update_timer = apscheduler.schedulers.background.BackgroundScheduler() - update_timer.add_job(update_zookeeper, 'interval', seconds=interval) - update_timer.start() - return update_timer - -def stopKeepaliveTimer(update_timer): - ansiiprint.echo('Stopping keepalive timer', '', 'c') - update_timer.shutdown() - -# Start keepalive thread -update_timer = createKeepaliveTimer() -update_zookeeper() - -# Tick loop -while True: - try: - time.sleep(0.1) - except: - break diff --git a/virtualization-daemon/pvcvd/__init__.py b/virtualization-daemon/pvcvd/__init__.py deleted file mode 100644 index e69de29b..00000000