Compare commits

..

28 Commits

Author SHA1 Message Date
2e9f6ac201 Bump version to 0.9.25 2021-07-11 23:19:09 -04:00
f09849bedf Don't overwrite shutdown state on termination
Just a minor quibble and not really impactful.
2021-07-11 23:18:14 -04:00
8c975e5c46 Add chroot context manager example to debootstrap
Closes #132
2021-07-11 23:10:41 -04:00
c76149141f Only log ZK connections when persistent
Prevents spam in the API logs.
2021-07-10 23:35:49 -04:00
f00c4d07f4 Add date output to keepalive
Helps track when there is a log follow in "-o cat" mode.
2021-07-10 23:24:59 -04:00
20b66c10e1 Move two more commands to Rados library 2021-07-10 17:28:42 -04:00
cfeba50b17 Revert "Return to all command-based Ceph gathering"
This reverts commit 65d14ccd92.

This was actually a bad idea. For inexplicable reasons, running these
Ceph commands manually (not even via Python, but in a normal shell)
takes roughly 700 times (7 × two orders of magnitude) longer than
running them with the Rados module, so long in fact that some basic
commands like "ceph health" would sometimes take longer than the
1 second timeout to complete. The Rados calls, by contrast, take about 1ms.

Despite the occasional issues when monitors drop out, the Rados module
is clearly far superior to the shell commands for any moderately-loaded
Ceph cluster. We can look into solving timeouts another way (perhaps
with Processes instead of Threads) at a later time.

Rados module "ceph health":
    b'{"checks":{},"status":"HEALTH_OK"}'
    0.001204 (s)
    b'{"checks":{},"status":"HEALTH_OK"}'
    0.001258 (s)
Command "ceph health":
    joshua@hv1.c.bonilan.net ~ $ time ceph health >/dev/null
    real    0m0.772s
    user    0m0.707s
    sys     0m0.046s
    joshua@hv1.c.bonilan.net ~ $ time ceph health >/dev/null
    real    0m0.796s
    user    0m0.728s
    sys     0m0.054s
2021-07-10 03:47:45 -04:00
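A minimal standalone sketch of the comparison above, using the python3-rados bindings; the configuration and keyring paths are assumed defaults for illustration and are not part of this changeset:

    import json
    import time

    import rados

    # Connect to the cluster as the admin client (paths are assumed defaults)
    conn = rados.Rados(conffile='/etc/ceph/ceph.conf',
                       conf=dict(keyring='/etc/ceph/ceph.client.admin.keyring'))
    conn.connect()

    # Issue "ceph health" as a mon command with a 1-second timeout, and time it
    command = json.dumps({"prefix": "health", "format": "json"})
    start = time.time()
    retcode, stdout, stderr = conn.mon_command(command, b'', timeout=1)
    print(stdout)
    print('{:.6f} (s)'.format(time.time() - start))

    conn.shutdown()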
0699c48d10 Fix bad schema path name 2021-07-09 16:47:09 -04:00
551bae2518 Bump version to 0.9.24 2021-07-09 15:58:36 -04:00
4832245d9c Handle non-RBD disks and non-RBD errors better 2021-07-09 15:48:57 -04:00
2138f2f59f Fail VM removal on disk removal failures
Prevents bad states where the VM is "removed" but some of its disks
remain due to e.g. stuck watchers.

Rearrange the sequence so it goes: stop, delete disks, then delete the
VM, and return a failure if any of the disks fail to remove, allowing
the task to be rerun after fixing the problem.
2021-07-09 15:39:06 -04:00
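A condensed, hypothetical sketch of the reordered flow described above; change_state, ceph.remove_volume and zkhandler stand in for the real PVC handlers shown in the full diff further down:

    def remove_vm(zkhandler, dom_uuid, domain, disks):
        # 1. Stop the VM first so its disks are no longer in use
        change_state(zkhandler, dom_uuid, 'stop')
        # 2. Remove each disk; fail here so the task can be rerun after
        #    fixing the problem (e.g. stuck watchers)
        for disk in disks:
            disk_pool, disk_name = disk.split('/')
            retcode, message = ceph.remove_volume(zkhandler, disk_pool, disk_name)
            if not retcode:
                return False, message
        # 3. Only now terminate the instances and delete the VM configuration
        change_state(zkhandler, dom_uuid, 'delete')
        zkhandler.delete([('domain', dom_uuid)])
        return True, 'Removed VM "{}" and its disks from the cluster.'.format(domain)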
d1d355a96b Avoid errors if stats data is None 2021-07-09 13:13:54 -04:00
2b5dc286ab Correct failure to get ceph_health data 2021-07-09 13:10:28 -04:00
c0c9327a7d Return an empty log if the value is None 2021-07-09 13:08:00 -04:00
5ffabcfef5 Avoid failing if we can't get the future data 2021-07-09 13:05:37 -04:00
330cf14638 Remove return statements in keepalive collectors
These seem to bork the keepalive timer process, so just remove them and
let it continue to press on.
2021-07-09 13:04:17 -04:00
9d0eb20197 Mention UUID matching in vm list help 2021-07-09 11:51:20 -04:00
3f5b7045a2 Allow raw listing of cluster names in CLI 2021-07-09 10:53:20 -04:00
80fe96b24d Add some additional docstrings 2021-07-07 12:28:08 -04:00
80f04ce8ee Remove connection renewal in state handler
Regenerating the ZK connection was fraught with issues, including
duplicate connections, strange failures to reconnect, and various other
wonkiness.

Instead let Kazoo handle states sensibly. Kazoo moves to SUSPENDED state
when it loses connectivity, and stays there indefinitely (based on
cursory tests). And Kazoo seems to always resume from this just fine on
its own. Thus all that hackery did nothing but complicate reconnection.

This therefore turns the listener into a purely informational function,
providing logs of when/why it failed, and we also add some additional
output messages during initial connection and final disconnection.
2021-07-07 11:55:12 -04:00
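A minimal standalone sketch of the purely informational listener described above (the ZooKeeper host list is a placeholder):

    from kazoo.client import KazooClient, KazooState

    def listener(state):
        # Informational only; Kazoo itself handles SUSPENDED and reconnection
        if state == KazooState.CONNECTED:
            print('Connection to Zookeeper resumed')
        else:
            print('Connection to Zookeeper lost with state {}'.format(state))

    zk_conn = KazooClient(hosts='10.0.0.1:2181,10.0.0.2:2181,10.0.0.3:2181')
    zk_conn.start()
    zk_conn.add_listener(listener)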
65d14ccd92 Return to all command-based Ceph gathering
Using the Rados module was very problematic, specifically because it had
no sensible timeout parameters and thus would hang for many seconds.
This has poor implications since it blocks further keepalives.

Instead, remove the Rados usage entirely and go back completely to using
manual OS commands to gather this information. While this may cause PID
exhaustion more quickly, it's worthwhile to avoid failure scenarios when
Ceph stats time out.

Closes #137
2021-07-06 11:30:45 -04:00
adc022f55d Add missing install of pvcapid-worker.sh 2021-07-06 09:40:42 -04:00
7082982a33 Bump version to 0.9.23 2021-07-05 23:40:32 -04:00
5b6ef71909 Ensure daemon mode is updated on startup
Fixes the side effect of the previous bug during deploys of 0.9.22.
2021-07-05 23:39:23 -04:00
a8c28786dd Better handle empty ipaths in schema
When trying to write to sub-item paths that don't yet exist, the
previous method would just blindly write to whatever the root key is,
which is never what we actually want.

Instead, check explicitly for a "base path" situation, and handle that.
Then, if we try to get a subpath that isn't valid, return None. Finally,
in the various functions, if the path is None, just continue (or return
false/None) and (try to) chug along.
2021-07-05 23:35:03 -04:00
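A condensed sketch of the caller-side handling described above, mirroring the zkhandler write() change further down in this diff (the wrapping helper name is hypothetical):

    def write_valid_pairs(zkhandler, kvpairs):
        # Skip any key whose schema path cannot be resolved, rather than
        # blindly writing to the root key
        for key, value in kvpairs:
            path = zkhandler.get_schema_path(key)
            if path is None:
                # Likely missing schema entries; skip this pair
                continue
            # Simplified: the real write() also handles creating missing nodes
            zkhandler.zk_conn.set(path, str(value).encode('utf-8'))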
be7b0be8ed Fix typo in schema path name 2021-07-05 23:23:23 -04:00
c45804e8c1 Revert "Return none if a schema path is not found"
This reverts commit b1fcf6a4a5.
2021-07-05 23:16:39 -04:00
b1fcf6a4a5 Return none if a schema path is not found
This can cause overwriting of unintended keys, so should not be
happening. Will have to find the bugs this causes.
2021-07-05 17:15:55 -04:00
15 changed files with 278 additions and 140 deletions

View File

@ -1 +1 @@
0.9.22
0.9.25

View File

@ -42,6 +42,27 @@ To get started with PVC, please see the [About](https://parallelvirtualcluster.r
## Changelog
#### v0.9.25
* [Node Daemon] Returns to Rados library calls for Ceph due to performance problems
* [Node Daemon] Adds a date output to keepalive messages
* [Daemons] Configures ZK connection logging only for persistent connections
* [API Provisioner] Add context manager-based chroot to Debootstrap example script
* [Node Daemon] Fixes a bug where shutdown daemon state was overwritten
#### v0.9.24
* [Node Daemon] Removes Rados module polling of Ceph cluster and returns to command-based polling for timeout purposes, and removes some flaky return statements
* [Node Daemon] Removes flaky Zookeeper connection renewals that caused problems
* [CLI Client] Allow raw lists of clusters from `pvc cluster list`
* [API Daemon] Fixes several issues when getting VM data without stats
* [API Daemon] Fixes issues with removing VMs while disks are still in use (failed provisioning, etc.)
#### v0.9.23
* [Daemons] Fixes a critical overwriting bug in zkhandler when schema paths are not yet valid
* [Node Daemon] Ensures the daemon mode is updated on every startup (fixes the side effect of the above bug in 0.9.22)
#### v0.9.22
* [API Daemon] Drastically improves performance when getting large lists (e.g. VMs)

View File

@ -34,6 +34,29 @@
# with that.
import os
from contextlib import contextmanager
# Create a chroot context manager
# This can be used later in the script to chroot to the destination directory
# for instance to run commands within the target.
@contextmanager
def chroot_target(destination):
try:
real_root = os.open("/", os.O_RDONLY)
os.chroot(destination)
fake_root = os.open("/", os.O_RDONLY)
os.fchdir(fake_root)
yield
finally:
os.fchdir(real_root)
os.chroot(".")
os.fchdir(real_root)
os.close(fake_root)
os.close(real_root)
del fake_root
del real_root
# Installation function - performs a debootstrap install of a Debian system
# Note that the only arguments are keyword arguments.
@ -193,13 +216,7 @@ GRUB_DISABLE_LINUX_UUID=false
fh.write(data)
# Chroot, do some in-root tasks, then exit the chroot
# EXITING THE CHROOT IS VERY IMPORTANT OR THE FOLLOWING STAGES OF THE PROVISIONER
# WILL FAIL IN UNEXPECTED WAYS! Keep this in mind when using chroot in your scripts.
real_root = os.open("/", os.O_RDONLY)
os.chroot(temporary_directory)
fake_root = os.open("/", os.O_RDONLY)
os.fchdir(fake_root)
with chroot_target(temporary_directory):
# Install and update GRUB
os.system(
"grub-install --force /dev/rbd/{}/{}_{}".format(root_disk['pool'], vm_name, root_disk['disk_id'])
@ -219,15 +236,6 @@ GRUB_DISABLE_LINUX_UUID=false
"systemctl enable cloud-init.target"
)
# Restore our original root/exit the chroot
# EXITING THE CHROOT IS VERY IMPORTANT OR THE FOLLOWING STAGES OF THE PROVISIONER
# WILL FAIL IN UNEXPECTED WAYS! Keep this in mind when using chroot in your scripts.
os.fchdir(real_root)
os.chroot(".")
os.fchdir(real_root)
os.close(fake_root)
os.close(real_root)
# Unmount the bound devfs
os.system(
"umount {}/dev".format(
@ -235,8 +243,4 @@ GRUB_DISABLE_LINUX_UUID=false
)
)
# Clean up file handles so paths can be unmounted
del fake_root
del real_root
# Everything else is done via cloud-init user-data

View File

@ -29,7 +29,7 @@
# This script will run under root privileges as the provisioner does. Be careful
# with that.
# Installation function - performs a debootstrap install of a Debian system
# Installation function - performs no actions then returns
# Note that the only arguments are keyword arguments.
def install(**kwargs):
# The provisioner has already mounted the disks on kwargs['temporary_directory'].

View File

@ -25,7 +25,7 @@ import yaml
from distutils.util import strtobool as dustrtobool
# Daemon version
version = '0.9.22'
version = '0.9.25'
# API version
API_VERSION = 1.0

View File

@ -251,7 +251,11 @@ def cluster_remove(name):
# pvc cluster list
###############################################################################
@click.command(name='list', short_help='List all available clusters.')
def cluster_list():
@click.option(
'-r', '--raw', 'raw', is_flag=True, default=False,
help='Display the raw list of cluster names only.'
)
def cluster_list(raw):
"""
List all the available PVC clusters configured in this CLI instance.
"""
@ -302,6 +306,7 @@ def cluster_list():
if _api_key_length > api_key_length:
api_key_length = _api_key_length
if not raw:
# Display the data nicely
click.echo("Available clusters:")
click.echo()
@ -341,6 +346,7 @@ def cluster_list():
if not api_key:
api_key = 'N/A'
if not raw:
click.echo(
'{bold}{name: <{name_length}} {description: <{description_length}} {address: <{address_length}} {port: <{port_length}} {scheme: <{scheme_length}} {api_key: <{api_key_length}}{end_bold}'.format(
bold='',
@ -359,6 +365,8 @@ def cluster_list():
api_key_length=api_key_length
)
)
else:
click.echo(cluster)
# Validate that the cluster is set for a given command
@ -1652,7 +1660,7 @@ def vm_dump(filename, domain):
@cluster_req
def vm_list(target_node, target_state, limit, raw):
"""
List all virtual machines; optionally only match names matching regex LIMIT.
List all virtual machines; optionally only match names or full UUIDs matching regex LIMIT.
NOTE: Red-coloured network lists indicate one or more configured networks are missing/invalid.
"""

View File

@ -2,7 +2,7 @@ from setuptools import setup
setup(
name='pvc',
version='0.9.22',
version='0.9.25',
packages=['pvc', 'pvc.cli_lib'],
install_requires=[
'Click',

View File

@ -343,8 +343,13 @@ def getInformationFromXML(zkhandler, uuid):
parsed_xml = getDomainXML(zkhandler, uuid)
stats_data = loads(zkhandler.read(('domain.stats', uuid)))
if stats_data is None:
stats_data = zkhandler.read(('domain.stats', uuid))
if stats_data is not None:
try:
stats_data = loads(stats_data)
except Exception:
stats_data = {}
else:
stats_data = {}
domain_uuid, domain_name, domain_description, domain_memory, domain_vcpu, domain_vcputopo = getDomainMainDetails(parsed_xml)

View File

@ -449,14 +449,6 @@ def remove_vm(zkhandler, domain):
if current_vm_state != 'stop':
change_state(zkhandler, dom_uuid, 'stop')
# Gracefully terminate the class instances
change_state(zkhandler, dom_uuid, 'delete')
# Delete the configurations
zkhandler.delete([
('domain', dom_uuid)
])
# Wait for 1 second to allow state to flow to all nodes
time.sleep(1)
@ -465,11 +457,28 @@ def remove_vm(zkhandler, domain):
# vmpool/vmname_volume
try:
disk_pool, disk_name = disk.split('/')
retcode, message = ceph.remove_volume(zkhandler, disk_pool, disk_name)
except ValueError:
continue
return True, 'Removed VM "{}" and disks from the cluster.'.format(domain)
retcode, message = ceph.remove_volume(zkhandler, disk_pool, disk_name)
if not retcode:
if re.match('^ERROR: No volume with name', message):
continue
else:
return False, message
# Gracefully terminate the class instances
change_state(zkhandler, dom_uuid, 'delete')
# Wait for 1/2 second to allow state to flow to all nodes
time.sleep(0.5)
# Delete the VM configuration from Zookeeper
zkhandler.delete([
('domain', dom_uuid)
])
return True, 'Removed VM "{}" and its disks from the cluster.'.format(domain)
def start_vm(zkhandler, domain):
@ -789,7 +798,10 @@ def get_console_log(zkhandler, domain, lines=1000):
return False, 'ERROR: Could not find VM "{}" in the cluster!'.format(domain)
# Get the data from ZK
console_log = zkhandler.read(('domain.log.console', dom_uuid))
console_log = zkhandler.read(('domain.console.log', dom_uuid))
if console_log is None:
return True, ''
# Shrink the log buffer to length lines
shrunk_log = console_log.split('\n')[-lines:]
@ -897,6 +909,9 @@ def get_list(zkhandler, node, state, limit, is_fuzzy=True):
for vm_uuid in vm_execute_list:
futures.append(executor.submit(common.getInformationFromXML, zkhandler, vm_uuid))
for future in futures:
try:
vm_data_list.append(future.result())
except Exception:
pass
return True, vm_data_list

View File

@ -124,37 +124,29 @@ class ZKHandler(object):
# State/connection management
#
def listener(self, state):
"""
Listen for KazooState changes and log accordingly.
This function does not do anything except for log the state, and Kazoo handles the rest.
"""
if state == KazooState.CONNECTED:
self.log('Connection to Zookeeper started', state='o')
self.log('Connection to Zookeeper resumed', state='o')
else:
self.log('Connection to Zookeeper lost', state='w')
while True:
time.sleep(0.5)
_zk_conn = KazooClient(hosts=self.coordinators)
try:
_zk_conn.start()
except Exception:
del _zk_conn
continue
self.zk_conn = _zk_conn
self.zk_conn.add_listener(self.listener)
break
self.log('Connection to Zookeeper lost with state {}'.format(state), state='w')
def connect(self, persistent=False):
"""
Start the zk_conn object and connect to the cluster, then load the current schema version
Start the zk_conn object and connect to the cluster
"""
try:
self.zk_conn.start()
if persistent:
self.log('Connection to Zookeeper started', state='o')
self.zk_conn.add_listener(self.listener)
except Exception as e:
raise ZKConnectionException(self, e)
def disconnect(self):
def disconnect(self, persistent=False):
"""
Stop and close the zk_conn object and disconnect from the cluster
@ -162,11 +154,27 @@ class ZKHandler(object):
"""
self.zk_conn.stop()
self.zk_conn.close()
if persistent:
self.log('Connection to Zookeeper terminated', state='o')
#
# Schema helper actions
#
def get_schema_path(self, key):
"""
Get the Zookeeper path for {key} from the current schema based on its format.
If {key} is a tuple of length 2, it's treated as a path plus an item instance of that path (e.g. a node, a VM, etc.).
If {key} is a tuple of length 4, it is treated as a path plus an item instance, as well as another item instance of the subpath.
If {key} is just a string, it's treated as a lone path (mostly used for the 'base' schema group).
Otherwise, returns None since this is not a valid key.
This function also handles the special case where a string that looks like an existing path (i.e. starts with '/') is passed;
in that case it will silently return the same path back. This was mostly migration functionality and is deprecated.
"""
if isinstance(key, tuple):
# This is a key tuple with both an ipath and an item
if len(key) == 2:
@ -201,6 +209,10 @@ class ZKHandler(object):
Check if a key exists
"""
path = self.get_schema_path(key)
if path is None:
# This path is invalid, this is likely due to missing schema entries, so return False
return False
stat = self.zk_conn.exists(path)
if stat:
return True
@ -213,11 +225,13 @@ class ZKHandler(object):
"""
try:
path = self.get_schema_path(key)
data = self.zk_conn.get(path)[0].decode(self.encoding)
except NoNodeError:
data = None
if path is None:
# This path is invalid; this is likely due to missing schema entries, so return None
return None
return data
return self.zk_conn.get(path)[0].decode(self.encoding)
except NoNodeError:
return None
def write(self, kvpairs):
"""
@ -238,6 +252,9 @@ class ZKHandler(object):
value = kvpair[1]
path = self.get_schema_path(key)
if path is None:
# This path is invalid; this is likely due to missing schema entries, so continue
continue
if not self.exists(key):
# Creating a new key
@ -276,9 +293,9 @@ class ZKHandler(object):
keys = [keys]
for key in keys:
path = self.get_schema_path(key)
if self.exists(key):
try:
path = self.get_schema_path(key)
self.zk_conn.delete(path, recursive=recursive)
except Exception as e:
self.log("ZKHandler error: Failed to delete key {}: {}".format(path, e), state='e')
@ -292,11 +309,13 @@ class ZKHandler(object):
"""
try:
path = self.get_schema_path(key)
children = self.zk_conn.get_children(path)
except NoNodeError:
children = None
if path is None:
# This path is invalid; this is likely due to missing schema entries, so return None
return None
return children
return self.zk_conn.get_children(path)
except NoNodeError:
return None
def rename(self, kkpairs):
"""
@ -327,13 +346,20 @@ class ZKHandler(object):
source_key = kkpair[0]
source_path = self.get_schema_path(source_key)
if source_path is None:
# This path is invalid; this is likely due to missing schema entries, so continue
continue
destination_key = kkpair[1]
destination_path = self.get_schema_path(destination_key)
if destination_path is None:
# This path is invalid; this is likely due to missing schema entries, so continue
continue
if not self.exists(source_key):
self.log("ZKHander error: Source key '{}' does not exist".format(source_path), state='e')
return False
if self.exists(destination_key):
self.log("ZKHander error: Destination key '{}' already exists".format(destination_path), state='e')
return False
@ -698,9 +724,16 @@ class ZKSchema(object):
if base_path is None:
# This should only really happen for second-layer key types where the helper functions join them together
base_path = ''
if not ipath:
# This is a root path
return f'{base_path}/{item}'
sub_path = self.schema.get(itype).get('.'.join(ipath))
if sub_path is None:
sub_path = ''
# We didn't find the path we're looking for, so we don't want to do anything
return None
return f'{base_path}/{item}{sub_path}'
# Get keys of a schema location

debian/changelog vendored
View File

@ -1,3 +1,30 @@
pvc (0.9.25-0) unstable; urgency=high
* [Node Daemon] Returns to Rados library calls for Ceph due to performance problems
* [Node Daemon] Adds a date output to keepalive messages
* [Daemons] Configures ZK connection logging only for persistent connections
* [API Provisioner] Add context manager-based chroot to Debootstrap example script
* [Node Daemon] Fixes a bug where shutdown daemon state was overwritten
-- Joshua M. Boniface <joshua@boniface.me> Sun, 11 Jul 2021 23:19:09 -0400
pvc (0.9.24-0) unstable; urgency=high
* [Node Daemon] Removes Rados module polling of Ceph cluster and returns to command-based polling for timeout purposes, and removes some flaky return statements
* [Node Daemon] Removes flaky Zookeeper connection renewals that caused problems
* [CLI Client] Allow raw lists of clusters from `pvc cluster list`
* [API Daemon] Fixes several issues when getting VM data without stats
* [API Daemon] Fixes issues with removing VMs while disks are still in use (failed provisioning, etc.)
-- Joshua M. Boniface <joshua@boniface.me> Fri, 09 Jul 2021 15:58:36 -0400
pvc (0.9.23-0) unstable; urgency=high
* [Daemons] Fixes a critical overwriting bug in zkhandler when schema paths are not yet valid
* [Node Daemon] Ensures the daemon mode is updated on every startup (fixes the side effect of the above bug in 0.9.22)
-- Joshua M. Boniface <joshua@boniface.me> Mon, 05 Jul 2021 23:40:32 -0400
pvc (0.9.22-0) unstable; urgency=high
* [API Daemon] Drastically improves performance when getting large lists (e.g. VMs)

View File

@ -5,5 +5,6 @@ api-daemon/pvcapid.sample.yaml etc/pvc
api-daemon/pvcapid usr/share/pvc
api-daemon/pvcapid.service lib/systemd/system
api-daemon/pvcapid-worker.service lib/systemd/system
api-daemon/pvcapid-worker.sh usr/share/pvc
api-daemon/provisioner usr/share/pvc
api-daemon/migrations usr/share/pvc

View File

@ -42,6 +42,27 @@ To get started with PVC, please see the [About](https://parallelvirtualcluster.r
## Changelog
#### v0.9.25
* [Node Daemon] Returns to Rados library calls for Ceph due to performance problems
* [Node Daemon] Adds a date output to keepalive messages
* [Daemons] Configures ZK connection logging only for persistent connections
* [API Provisioner] Add context manager-based chroot to Debootstrap example script
* [Node Daemon] Fixes a bug where shutdown daemon state was overwritten
#### v0.9.24
* [Node Daemon] Removes Rados module polling of Ceph cluster and returns to command-based polling for timeout purposes, and removes some flaky return statements
* [Node Daemon] Removes flaky Zookeeper connection renewals that caused problems
* [CLI Client] Allow raw lists of clusters from `pvc cluster list`
* [API Daemon] Fixes several issues when getting VM data without stats
* [API Daemon] Fixes issues with removing VMs while disks are still in use (failed provisioning, etc.)
#### v0.9.23
* [Daemons] Fixes a critical overwriting bug in zkhandler when schema paths are not yet valid
* [Node Daemon] Ensures the daemon mode is updated on every startup (fixes the side effect of the above bug in 0.9.22)
#### v0.9.22
* [API Daemon] Drastically improves performance when getting large lists (e.g. VMs)

View File

@ -32,6 +32,7 @@ import yaml
import json
from socket import gethostname
from datetime import datetime
from threading import Thread
from ipaddress import ip_address, ip_network
from apscheduler.schedulers.background import BackgroundScheduler
@ -55,7 +56,7 @@ import pvcnoded.CephInstance as CephInstance
import pvcnoded.MetadataAPIInstance as MetadataAPIInstance
# Version string for startup output
version = '0.9.22'
version = '0.9.25'
###############################################################################
# PVCD - node daemon startup program
@ -658,7 +659,7 @@ def update_schema(new_schema_version, stat, event=''):
# Restart ourselves with the new schema
logger.out('Reloading node daemon', state='s')
try:
zkhandler.disconnect()
zkhandler.disconnect(persistent=True)
del zkhandler
except Exception:
pass
@ -751,7 +752,7 @@ def cleanup():
# Close the Zookeeper connection
try:
zkhandler.disconnect()
zkhandler.disconnect(persistent=True)
del zkhandler
except Exception:
pass
@ -791,6 +792,7 @@ if zkhandler.exists(('node', myhostname)):
logger.out("Node is " + fmt_green + "present" + fmt_end + " in Zookeeper", state='i')
# Update static data just in case it's changed
zkhandler.write([
(('node', myhostname), config['daemon_mode']),
(('node.mode', myhostname), config['daemon_mode']),
(('node.state.daemon', myhostname), 'init'),
(('node.state.router', myhostname), init_routerstate),
@ -1333,11 +1335,13 @@ def collect_ceph_stats(queue):
ceph_health = health_status['status']
except Exception as e:
logger.out('Failed to obtain Ceph health data: {}'.format(e), state='e')
return
ceph_health = 'HEALTH_UNKN'
if ceph_health == 'HEALTH_OK':
if ceph_health in ['HEALTH_OK']:
ceph_health_colour = fmt_green
elif ceph_health == 'HEALTH_WARN':
elif ceph_health in ['HEALTH_UNKN']:
ceph_health_colour = fmt_cyan
elif ceph_health in ['HEALTH_WARN']:
ceph_health_colour = fmt_yellow
else:
ceph_health_colour = fmt_red
@ -1355,7 +1359,6 @@ def collect_ceph_stats(queue):
])
except Exception as e:
logger.out('Failed to set Ceph status data: {}'.format(e), state='e')
return
if debug:
logger.out("Set ceph rados df information in zookeeper (primary only)", state='d', prefix='ceph-thread')
@ -1369,15 +1372,15 @@ def collect_ceph_stats(queue):
])
except Exception as e:
logger.out('Failed to set Ceph utilization data: {}'.format(e), state='e')
return
if debug:
logger.out("Set pool information in zookeeper (primary only)", state='d', prefix='ceph-thread')
# Get pool info
retcode, stdout, stderr = common.run_os_command('ceph df --format json', timeout=1)
command = {"prefix": "df", "format": "json"}
ceph_df_output = ceph_conn.mon_command(json.dumps(command), b'', timeout=1)[1].decode('ascii')
try:
ceph_pool_df_raw = json.loads(stdout)['pools']
ceph_pool_df_raw = json.loads(ceph_df_output)['pools']
except Exception as e:
logger.out('Failed to obtain Pool data (ceph df): {}'.format(e), state='w')
ceph_pool_df_raw = []
@ -1448,9 +1451,9 @@ def collect_ceph_stats(queue):
osd_dump = dict()
command = {"prefix": "osd dump", "format": "json"}
osd_dump_output = ceph_conn.mon_command(json.dumps(command), b'', timeout=1)[1].decode('ascii')
try:
retcode, stdout, stderr = common.run_os_command('ceph osd dump --format json --connect-timeout 2', timeout=2)
osd_dump_raw = json.loads(stdout)['osds']
osd_dump_raw = json.loads(osd_dump_output)['osds']
except Exception as e:
logger.out('Failed to obtain OSD data: {}'.format(e), state='w')
osd_dump_raw = []
@ -1607,7 +1610,6 @@ def collect_vm_stats(queue):
lv_conn = libvirt.open(libvirt_name)
if lv_conn is None:
logger.out('Failed to open connection to "{}"'.format(libvirt_name), state='e')
return
memalloc = 0
memprov = 0
@ -1777,8 +1779,9 @@ def node_keepalive():
# Get past state and update if needed
if debug:
logger.out("Get past state and update if needed", state='d', prefix='main-thread')
past_state = zkhandler.read(('node.state.daemon', this_node.name))
if past_state != 'run':
if past_state != 'run' and past_state != 'shutdown':
this_node.daemon_state = 'run'
zkhandler.write([
(('node.state.daemon', this_node.name), 'run')
@ -1867,7 +1870,6 @@ def node_keepalive():
])
except Exception:
logger.out('Failed to set keepalive data', state='e')
return
# Display node information to the terminal
if config['log_keepalives']:
@ -1878,9 +1880,10 @@ def node_keepalive():
else:
cst_colour = fmt_cyan
logger.out(
'{}{} keepalive{} [{}{}{}]'.format(
'{}{} keepalive @ {}{} [{}{}{}]'.format(
fmt_purple,
myhostname,
datetime.now(),
fmt_end,
fmt_bold + cst_colour,
this_node.router_state,

View File

@ -246,7 +246,7 @@ class NodeInstance(object):
if data != self.domain_list:
self.domain_list = data
@self.zkhandler.zk_conn.DataWatch(self.zkhandler.schema.path('node.count.provisioned_domainss', self.name))
@self.zkhandler.zk_conn.DataWatch(self.zkhandler.schema.path('node.count.provisioned_domains', self.name))
def watch_node_domainscount(data, stat, event=''):
if event and event.type == 'DELETED':
# The key has been deleted after existing before; terminate this watcher