Compare commits

...

14 Commits

Author SHA1 Message Date
0699c48d10 Fix bad schema path name 2021-07-09 16:47:09 -04:00
551bae2518 Bump version to 0.9.24 2021-07-09 15:58:36 -04:00
4832245d9c Handle non-RBD disks and non-RBD errors better 2021-07-09 15:48:57 -04:00
2138f2f59f Fail VM removal on disk removal failures
Prevents bad states where the VM is "removed" but some of its disks
remain due to e.g. stuck watchers.

Rearrange the sequence so it goes stop, delete disks, then delete VM, and
return a failure if any of the disks fail to remove, allowing the task to
be rerun after fixing the problem.
2021-07-09 15:39:06 -04:00
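
A minimal sketch of the reordering this commit describes, using the same helpers that appear in the `remove_vm()` diff further down (the function name here is illustrative, not the actual code):

```python
# Sketch only: stop -> delete disks -> delete VM, failing early if a disk
# removal fails so the task can be rerun after the problem is fixed.
def remove_vm_sketch(zkhandler, dom_uuid, domain, disks):
    change_state(zkhandler, dom_uuid, 'stop')

    # Remove each disk first; if any removal fails, leave the VM definition
    # in place and report the error instead of half-removing the VM.
    for disk in disks:
        disk_pool, disk_name = disk.split('/')
        retcode, message = ceph.remove_volume(zkhandler, disk_pool, disk_name)
        if not retcode:
            return False, message

    # Only once all disks are gone, terminate the instance and delete the config.
    change_state(zkhandler, dom_uuid, 'delete')
    zkhandler.delete([('domain', dom_uuid)])
    return True, 'Removed VM "{}" and its disks from the cluster.'.format(domain)
```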
d1d355a96b Avoid errors if stats data is None 2021-07-09 13:13:54 -04:00
2b5dc286ab Correct failure to get ceph_health data 2021-07-09 13:10:28 -04:00
c0c9327a7d Return an empty log if the value is None 2021-07-09 13:08:00 -04:00
5ffabcfef5 Avoid failing if we can't get the future data 2021-07-09 13:05:37 -04:00
330cf14638 Remove return statements in keepalive collectors
These seem to bork the keepalive timer process, so just remove them and
let it continue to press on.
2021-07-09 13:04:17 -04:00
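
A hedged illustration of the idea (not the daemon's actual collectors): on failure, log and fall back to a default instead of returning, so the keepalive thread keeps running end to end.

```python
# Illustrative only: a keepalive-style collector that logs failures and presses
# on with a default value rather than returning early out of the timer thread.
def collect_example_stats(queue, logger):
    try:
        stats = gather_stats()  # hypothetical gathering helper
    except Exception as e:
        logger.out('Failed to gather stats: {}'.format(e), state='w')
        stats = {}  # fall back instead of `return`
    queue.put(stats)
```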
9d0eb20197 Mention UUID matching in vm list help 2021-07-09 11:51:20 -04:00
3f5b7045a2 Allow raw listing of cluster names in CLI 2021-07-09 10:53:20 -04:00
80fe96b24d Add some additional docstrings 2021-07-07 12:28:08 -04:00
80f04ce8ee Remove connection renewal in state handler
Regenerating the ZK connection was fraught with issues, including
duplicate connections, strange failures to reconnect, and various other
wonkiness.

Instead let Kazoo handle states sensibly. Kazoo moves to SUSPENDED state
when it loses connectivity, and stays there indefinitely (based on
cursory tests). And Kazoo seems to always resume from this just fine on
its own. Thus all that hackery did nothing but complicate reconnection.

This therefore turns the listener into a purely informational function,
providing logs of when and why the connection failed, and adds some
additional output messages during initial connection and final disconnection.
2021-07-07 11:55:12 -04:00
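
A minimal sketch of the resulting listener pattern with Kazoo (a standalone example, not the daemon's ZKHandler class):

```python
from kazoo.client import KazooClient, KazooState

# Purely informational listener: log transitions and let Kazoo manage the
# SUSPENDED/LOST/CONNECTED states and reconnection on its own.
def listener(state):
    if state == KazooState.CONNECTED:
        print('Connection to Zookeeper resumed')
    else:
        print('Connection to Zookeeper lost with state {}'.format(state))

zk = KazooClient(hosts='127.0.0.1:2181')
zk.start()
zk.add_listener(listener)
```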
65d14ccd92 Return to all command-based Ceph gathering
Using the Rados module was very problematic, specifically because it had
no sensible timeout parameters and thus would hang for many seconds. This
is a serious problem since it blocks further keepalives.

Instead, remove the Rados usage entirely and go back to using manual OS
commands to gather this information. While this may cause PID exhaustion
more quickly, it's worthwhile to avoid failure scenarios when Ceph stats
time out.

Closes #137
2021-07-06 11:30:45 -04:00
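
A rough sketch of the command-based approach with a hard timeout (the `ceph` CLI invocation matches the diff below; the helper itself is illustrative, not the daemon's `run_os_command()`):

```python
import json
import subprocess

def get_ceph_health(timeout=1):
    # Shell out to the ceph CLI so a hung monitor can only stall us for
    # `timeout` seconds instead of blocking the keepalive indefinitely.
    try:
        proc = subprocess.run(
            ['ceph', 'health', '--format', 'json'],
            capture_output=True, timeout=timeout, check=True,
        )
        return json.loads(proc.stdout)['status']
    except (subprocess.TimeoutExpired, subprocess.CalledProcessError,
            ValueError, KeyError):
        return 'HEALTH_UNKN'
```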
12 changed files with 154 additions and 114 deletions

View File

@@ -1 +1 @@
0.9.23
0.9.24

View File

@@ -42,6 +42,14 @@ To get started with PVC, please see the [About](https://parallelvirtualcluster.r
## Changelog
#### v0.9.24
* [Node Daemon] Removes Rados module polling of Ceph cluster and returns to command-based polling for timeout purposes, and removes some flaky return statements
* [Node Daemon] Removes flaky Zookeeper connection renewals that caused problems
* [CLI Client] Allow raw lists of clusters from `pvc cluster list`
* [API Daemon] Fixes several issues when getting VM data without stats
* [API Daemon] Fixes issues with removing VMs while disks are still in use (failed provisioning, etc.)
#### v0.9.23
* [Daemons] Fixes a critical overwriting bug in zkhandler when schema paths are not yet valid

View File

@@ -25,7 +25,7 @@ import yaml
from distutils.util import strtobool as dustrtobool
# Daemon version
version = '0.9.23'
version = '0.9.24'
# API version
API_VERSION = 1.0

View File

@@ -251,7 +251,11 @@ def cluster_remove(name):
# pvc cluster list
###############################################################################
@click.command(name='list', short_help='List all available clusters.')
def cluster_list():
@click.option(
'-r', '--raw', 'raw', is_flag=True, default=False,
help='Display the raw list of cluster names only.'
)
def cluster_list(raw):
"""
List all the available PVC clusters configured in this CLI instance.
"""
@@ -302,6 +306,7 @@ def cluster_list():
if _api_key_length > api_key_length:
api_key_length = _api_key_length
if not raw:
# Display the data nicely
click.echo("Available clusters:")
click.echo()
@@ -341,6 +346,7 @@ def cluster_list():
if not api_key:
api_key = 'N/A'
if not raw:
click.echo(
'{bold}{name: <{name_length}} {description: <{description_length}} {address: <{address_length}} {port: <{port_length}} {scheme: <{scheme_length}} {api_key: <{api_key_length}}{end_bold}'.format(
bold='',
@@ -359,6 +365,8 @@ def cluster_list():
api_key_length=api_key_length
)
)
else:
click.echo(cluster)
# Validate that the cluster is set for a given command
@@ -1652,7 +1660,7 @@ def vm_dump(filename, domain):
@cluster_req
def vm_list(target_node, target_state, limit, raw):
"""
List all virtual machines; optionally only match names matching regex LIMIT.
List all virtual machines; optionally only match names or full UUIDs matching regex LIMIT.
NOTE: Red-coloured network lists indicate one or more configured networks are missing/invalid.
"""

View File

@@ -2,7 +2,7 @@ from setuptools import setup
setup(
name='pvc',
version='0.9.23',
version='0.9.24',
packages=['pvc', 'pvc.cli_lib'],
install_requires=[
'Click',

View File

@@ -343,8 +343,13 @@ def getInformationFromXML(zkhandler, uuid):
parsed_xml = getDomainXML(zkhandler, uuid)
stats_data = loads(zkhandler.read(('domain.stats', uuid)))
if stats_data is None:
stats_data = zkhandler.read(('domain.stats', uuid))
if stats_data is not None:
try:
stats_data = loads(stats_data)
except Exception:
stats_data = {}
else:
stats_data = {}
domain_uuid, domain_name, domain_description, domain_memory, domain_vcpu, domain_vcputopo = getDomainMainDetails(parsed_xml)

View File

@@ -449,14 +449,6 @@ def remove_vm(zkhandler, domain):
if current_vm_state != 'stop':
change_state(zkhandler, dom_uuid, 'stop')
# Gracefully terminate the class instances
change_state(zkhandler, dom_uuid, 'delete')
# Delete the configurations
zkhandler.delete([
('domain', dom_uuid)
])
# Wait for 1 second to allow state to flow to all nodes
time.sleep(1)
@@ -465,11 +457,28 @@ def remove_vm(zkhandler, domain):
# vmpool/vmname_volume
try:
disk_pool, disk_name = disk.split('/')
retcode, message = ceph.remove_volume(zkhandler, disk_pool, disk_name)
except ValueError:
continue
return True, 'Removed VM "{}" and disks from the cluster.'.format(domain)
retcode, message = ceph.remove_volume(zkhandler, disk_pool, disk_name)
if not retcode:
if re.match('^ERROR: No volume with name', message):
continue
else:
return False, message
# Gracefully terminate the class instances
change_state(zkhandler, dom_uuid, 'delete')
# Wait for 1/2 second to allow state to flow to all nodes
time.sleep(0.5)
# Delete the VM configuration from Zookeeper
zkhandler.delete([
('domain', dom_uuid)
])
return True, 'Removed VM "{}" and its disks from the cluster.'.format(domain)
def start_vm(zkhandler, domain):
@@ -789,7 +798,10 @@ def get_console_log(zkhandler, domain, lines=1000):
return False, 'ERROR: Could not find VM "{}" in the cluster!'.format(domain)
# Get the data from ZK
console_log = zkhandler.read(('domain.log.console', dom_uuid))
console_log = zkhandler.read(('domain.console.log', dom_uuid))
if console_log is None:
return True, ''
# Shrink the log buffer to length lines
shrunk_log = console_log.split('\n')[-lines:]
@@ -897,6 +909,9 @@ def get_list(zkhandler, node, state, limit, is_fuzzy=True):
for vm_uuid in vm_execute_list:
futures.append(executor.submit(common.getInformationFromXML, zkhandler, vm_uuid))
for future in futures:
try:
vm_data_list.append(future.result())
except Exception:
pass
return True, vm_data_list

View File

@@ -124,31 +124,23 @@ class ZKHandler(object):
# State/connection management
#
def listener(self, state):
"""
Listen for KazooState changes and log accordingly.
This function does nothing except log the state; Kazoo handles the rest.
"""
if state == KazooState.CONNECTED:
self.log('Connection to Zookeeper started', state='o')
self.log('Connection to Zookeeper resumed', state='o')
else:
self.log('Connection to Zookeeper lost', state='w')
while True:
time.sleep(0.5)
_zk_conn = KazooClient(hosts=self.coordinators)
try:
_zk_conn.start()
except Exception:
del _zk_conn
continue
self.zk_conn = _zk_conn
self.zk_conn.add_listener(self.listener)
break
self.log('Connection to Zookeeper lost with state {}'.format(state), state='w')
def connect(self, persistent=False):
"""
Start the zk_conn object and connect to the cluster, then load the current schema version
Start the zk_conn object and connect to the cluster
"""
try:
self.zk_conn.start()
self.log('Connection to Zookeeper started', state='o')
if persistent:
self.zk_conn.add_listener(self.listener)
except Exception as e:
@@ -162,11 +154,26 @@ class ZKHandler(object):
"""
self.zk_conn.stop()
self.zk_conn.close()
self.log('Connection to Zookeeper terminated', state='o')
#
# Schema helper actions
#
def get_schema_path(self, key):
"""
Get the Zookeeper path for {key} from the current schema based on its format.
If {key} is a tuple of length 2, it's treated as a path plus an item instance of that path (e.g. a node, a VM, etc.).
If {key} is a tuple of length 4, it is treated as a path plus an item instance, as well as another item instance of the subpath.
If {key} is just a string, it's treated as a lone path (mostly used for the 'base' schema group).
Otherwise, returns None since this is not a valid key.
This function also handles the special case where a string that looks like an existing path (i.e. starts with '/') is passed;
in that case it will silently return the same path back. This was mostly for migration purposes and is deprecated.
"""
if isinstance(key, tuple):
# This is a key tuple with both an ipath and an item
if len(key) == 2:
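
To make the docstring above concrete, here is a hedged sketch of how such keys might resolve, using an invented mapping rather than the real PVC schema (length-4 sub-item keys omitted for brevity):

```python
# Invented example schema; the real paths live in the ZKSchema definition.
BASE_PATHS = {'base.domain': '/domains'}
ITEM_PATHS = {'domain.state': '/domains/{}/state'}

def get_schema_path_sketch(key):
    if isinstance(key, str):
        if key.startswith('/'):
            return key                        # deprecated raw-path passthrough
        return BASE_PATHS.get(key)            # lone 'base'-group path
    if isinstance(key, tuple) and len(key) == 2:
        ipath, item = key                     # path plus one item instance
        return ITEM_PATHS[ipath].format(item)
    return None                               # not a valid key
```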

debian/changelog
View File

@@ -1,3 +1,13 @@
pvc (0.9.24-0) unstable; urgency=high
* [Node Daemon] Removes Rados module polling of Ceph cluster and returns to command-based polling for timeout purposes, and removes some flaky return statements
* [Node Daemon] Removes flaky Zookeeper connection renewals that caused problems
* [CLI Client] Allow raw lists of clusters from `pvc cluster list`
* [API Daemon] Fixes several issues when getting VM data without stats
* [API Daemon] Fixes issues with removing VMs while disks are still in use (failed provisioning, etc.)
-- Joshua M. Boniface <joshua@boniface.me> Fri, 09 Jul 2021 15:58:36 -0400
pvc (0.9.23-0) unstable; urgency=high
* [Daemons] Fixes a critical overwriting bug in zkhandler when schema paths are not yet valid

debian/control
View File

@@ -8,7 +8,7 @@ X-Python3-Version: >= 3.2
Package: pvc-daemon-node
Architecture: all
Depends: systemd, pvc-daemon-common, python3-kazoo, python3-psutil, python3-apscheduler, python3-libvirt, python3-psycopg2, python3-dnspython, python3-yaml, python3-distutils, python3-rados, python3-gevent, ipmitool, libvirt-daemon-system, arping, vlan, bridge-utils, dnsmasq, nftables, pdns-server, pdns-backend-pgsql
Depends: systemd, pvc-daemon-common, python3-kazoo, python3-psutil, python3-apscheduler, python3-libvirt, python3-psycopg2, python3-dnspython, python3-yaml, python3-distutils, python3-gevent, ipmitool, libvirt-daemon-system, arping, vlan, bridge-utils, dnsmasq, nftables, pdns-server, pdns-backend-pgsql
Suggests: pvc-client-api, pvc-client-cli
Description: Parallel Virtual Cluster node daemon (Python 3)
A KVM/Zookeeper/Ceph-based VM and private cloud manager

View File

@@ -42,6 +42,14 @@ To get started with PVC, please see the [About](https://parallelvirtualcluster.r
## Changelog
#### v0.9.24
* [Node Daemon] Removes Rados module polling of Ceph cluster and returns to command-based polling for timeout purposes, and removes some flaky return statements
* [Node Daemon] Removes flaky Zookeeper connection renewals that caused problems
* [CLI Client] Allow raw lists of clusters from `pvc cluster list`
* [API Daemon] Fixes several issues when getting VM data without stats
* [API Daemon] Fixes issues with removing VMs while disks are still in use (failed provisioning, etc.)
#### v0.9.23
* [Daemons] Fixes a critical overwriting bug in zkhandler when schema paths are not yet valid

View File

@@ -38,7 +38,6 @@ from apscheduler.schedulers.background import BackgroundScheduler
from distutils.util import strtobool
from queue import Queue
from xml.etree import ElementTree
from rados import Rados
from daemon_lib.zkhandler import ZKHandler
@@ -55,7 +54,7 @@ import pvcnoded.CephInstance as CephInstance
import pvcnoded.MetadataAPIInstance as MetadataAPIInstance
# Version string for startup output
version = '0.9.23'
version = '0.9.24'
###############################################################################
# PVCD - node daemon startup program
@@ -1314,31 +1313,22 @@ def collect_ceph_stats(queue):
if debug:
logger.out("Thread starting", state='d', prefix='ceph-thread')
# Connect to the Ceph cluster
try:
ceph_conn = Rados(conffile=config['ceph_config_file'], conf=dict(keyring=config['ceph_admin_keyring']))
if debug:
logger.out("Connecting to cluster", state='d', prefix='ceph-thread')
ceph_conn.connect(timeout=1)
except Exception as e:
logger.out('Failed to open connection to Ceph cluster: {}'.format(e), state='e')
return
if debug:
logger.out("Getting health stats from monitor", state='d', prefix='ceph-thread')
# Get Ceph cluster health for local status output
command = {"prefix": "health", "format": "json"}
_, stdout, _ = common.run_os_command('ceph health --format json', timeout=1)
try:
health_status = json.loads(ceph_conn.mon_command(json.dumps(command), b'', timeout=1)[1])
ceph_health = health_status['status']
ceph_health = json.loads(stdout)['status']
except Exception as e:
logger.out('Failed to obtain Ceph health data: {}'.format(e), state='e')
return
ceph_health = 'HEALTH_UNKN'
if ceph_health == 'HEALTH_OK':
if ceph_health in ['HEALTH_OK']:
ceph_health_colour = fmt_green
elif ceph_health == 'HEALTH_WARN':
elif ceph_health in ['HEALTH_UNKN']:
ceph_health_colour = fmt_cyan
elif ceph_health in ['HEALTH_WARN']:
ceph_health_colour = fmt_yellow
else:
ceph_health_colour = fmt_red
@@ -1348,42 +1338,38 @@ def collect_ceph_stats(queue):
if debug:
logger.out("Set ceph health information in zookeeper (primary only)", state='d', prefix='ceph-thread')
command = {"prefix": "status", "format": "pretty"}
ceph_status = ceph_conn.mon_command(json.dumps(command), b'', timeout=1)[1].decode('ascii')
_, ceph_status, _ = common.run_os_command('ceph status --format plain', timeout=1)
try:
zkhandler.write([
('base.storage', str(ceph_status))
])
except Exception as e:
logger.out('Failed to set Ceph status data: {}'.format(e), state='e')
return
if debug:
logger.out("Set ceph rados df information in zookeeper (primary only)", state='d', prefix='ceph-thread')
# Get rados df info
command = {"prefix": "df", "format": "pretty"}
ceph_df = ceph_conn.mon_command(json.dumps(command), b'', timeout=1)[1].decode('ascii')
_, ceph_df, _ = common.run_os_command('ceph df --format plain', timeout=1)
try:
zkhandler.write([
('base.storage.util', str(ceph_df))
])
except Exception as e:
logger.out('Failed to set Ceph utilization data: {}'.format(e), state='e')
return
if debug:
logger.out("Set pool information in zookeeper (primary only)", state='d', prefix='ceph-thread')
# Get pool info
retcode, stdout, stderr = common.run_os_command('ceph df --format json', timeout=1)
_, stdout, _ = common.run_os_command('ceph df --format json', timeout=1)
try:
ceph_pool_df_raw = json.loads(stdout)['pools']
except Exception as e:
logger.out('Failed to obtain Pool data (ceph df): {}'.format(e), state='w')
ceph_pool_df_raw = []
retcode, stdout, stderr = common.run_os_command('rados df --format json', timeout=1)
_, stdout, _ = common.run_os_command('rados df --format json', timeout=1)
try:
rados_pool_df_raw = json.loads(stdout)['pools']
except Exception as e:
@@ -1448,9 +1434,8 @@ def collect_ceph_stats(queue):
# Parse the dump data
osd_dump = dict()
command = {"prefix": "osd dump", "format": "json"}
_, stdout, _ = common.run_os_command('ceph osd dump --format json --connect-timeout 1', timeout=1)
try:
retcode, stdout, stderr = common.run_os_command('ceph osd dump --format json --connect-timeout 2', timeout=2)
osd_dump_raw = json.loads(stdout)['osds']
except Exception as e:
logger.out('Failed to obtain OSD data: {}'.format(e), state='w')
@@ -1474,9 +1459,9 @@ def collect_ceph_stats(queue):
osd_df = dict()
command = {"prefix": "osd df", "format": "json"}
_, osd_df_out, _ = common.run_os_command('ceph osd df --format json', timeout=1)
try:
osd_df_raw = json.loads(ceph_conn.mon_command(json.dumps(command), b'', timeout=1)[1])['nodes']
osd_df_raw = json.loads(osd_df_out)['nodes']
except Exception as e:
logger.out('Failed to obtain OSD data: {}'.format(e), state='w')
osd_df_raw = []
@@ -1501,12 +1486,10 @@ def collect_ceph_stats(queue):
osd_status = dict()
command = {"prefix": "osd status", "format": "pretty"}
try:
osd_status_raw = ceph_conn.mon_command(json.dumps(command), b'', timeout=1)[1].decode('ascii')
except Exception as e:
logger.out('Failed to obtain OSD status data: {}'.format(e), state='w')
osd_status_raw = []
retcode, osd_status_raw, stderr = common.run_os_command('ceph osd status --format plain', timeout=1)
if retcode != 0:
logger.out('Failed to obtain OSD status data: {}'.format(stderr), state='w')
osd_status_raw = ''
if debug:
logger.out("Loop through OSD status data", state='d', prefix='ceph-thread')
@@ -1573,8 +1556,6 @@ def collect_ceph_stats(queue):
# One or more of the status commands timed out, just continue
logger.out('Failed to upload OSD stats from dictionary: {}'.format(e), state='w')
ceph_conn.shutdown()
queue.put(ceph_health_colour)
queue.put(ceph_health)
queue.put(osds_this_node)
@@ -1608,7 +1589,6 @@ def collect_vm_stats(queue):
lv_conn = libvirt.open(libvirt_name)
if lv_conn is None:
logger.out('Failed to open connection to "{}"'.format(libvirt_name), state='e')
return
memalloc = 0
memprov = 0
@@ -1868,7 +1848,6 @@ def node_keepalive():
])
except Exception:
logger.out('Failed to set keepalive data', state='e')
return
# Display node information to the terminal
if config['log_keepalives']: