Compare commits

...

14 Commits

Author SHA1 Message Date
0699c48d10 Fix bad schema path name 2021-07-09 16:47:09 -04:00
551bae2518 Bump version to 0.9.24 2021-07-09 15:58:36 -04:00
4832245d9c Handle non-RBD disks and non-RBD errors better 2021-07-09 15:48:57 -04:00
2138f2f59f Fail VM removal on disk removal failures
Prevents bad states where the VM is "removed" but some of its disks
remain due to e.g. stuck watchers.

Rearrange the sequence so it goes stop, delete disks, then delete VM, and
return a failure if any of the disks fail to remove, allowing the task to
be rerun after fixing the problem.
2021-07-09 15:39:06 -04:00
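
A minimal sketch of the reordering this commit describes, using the same helpers that appear in the `remove_vm()` diff further down (the function name here is illustrative, not the actual code):

```python
# Sketch only: stop -> delete disks -> delete VM, failing early if a disk
# removal fails so the task can be rerun after the problem is fixed.
def remove_vm_sketch(zkhandler, dom_uuid, domain, disks):
    change_state(zkhandler, dom_uuid, 'stop')

    # Remove each disk first; if any removal fails, leave the VM definition
    # in place and report the error instead of half-removing the VM.
    for disk in disks:
        disk_pool, disk_name = disk.split('/')
        retcode, message = ceph.remove_volume(zkhandler, disk_pool, disk_name)
        if not retcode:
            return False, message

    # Only once all disks are gone, terminate the instance and delete the config.
    change_state(zkhandler, dom_uuid, 'delete')
    zkhandler.delete([('domain', dom_uuid)])
    return True, 'Removed VM "{}" and its disks from the cluster.'.format(domain)
```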
d1d355a96b Avoid errors if stats data is None 2021-07-09 13:13:54 -04:00
2b5dc286ab Correct failure to get ceph_health data 2021-07-09 13:10:28 -04:00
c0c9327a7d Return an empty log if the value is None 2021-07-09 13:08:00 -04:00
5ffabcfef5 Avoid failing if we can't get the future data 2021-07-09 13:05:37 -04:00
330cf14638 Remove return statements in keepalive collectors
These seem to bork the keepalive timer process, so just remove them and
let it continue to press on.
2021-07-09 13:04:17 -04:00
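
A hedged illustration of the idea (not the daemon's actual collectors): on failure, log and fall back to a default instead of returning, so the keepalive thread keeps running end to end.

```python
# Illustrative only: a keepalive-style collector that logs failures and presses
# on with a default value rather than returning early out of the timer thread.
def collect_example_stats(queue, logger):
    try:
        stats = gather_stats()  # hypothetical gathering helper
    except Exception as e:
        logger.out('Failed to gather stats: {}'.format(e), state='w')
        stats = {}  # fall back instead of `return`
    queue.put(stats)
```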
9d0eb20197 Mention UUID matching in vm list help 2021-07-09 11:51:20 -04:00
3f5b7045a2 Allow raw listing of cluster names in CLI 2021-07-09 10:53:20 -04:00
80fe96b24d Add some additional docstrings 2021-07-07 12:28:08 -04:00
80f04ce8ee Remove connection renewal in state handler
Regenerating the ZK connection was fraught with issues, including
duplicate connections, strange failures to reconnect, and various other
wonkiness.

Instead let Kazoo handle states sensibly. Kazoo moves to SUSPENDED state
when it loses connectivity, and stays there indefinitely (based on
cursory tests). And Kazoo seems to always resume from this just fine on
its own. Thus all that hackery did nothing but complicate reconnection.

This therefore turns the listener into a purely informational function,
providing logs of when and why the connection failed, and adds some
additional output messages during initial connection and final disconnection.
2021-07-07 11:55:12 -04:00
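
A minimal sketch of the resulting listener pattern with Kazoo (a standalone example, not the daemon's ZKHandler class):

```python
from kazoo.client import KazooClient, KazooState

# Purely informational listener: log transitions and let Kazoo manage the
# SUSPENDED/LOST/CONNECTED states and reconnection on its own.
def listener(state):
    if state == KazooState.CONNECTED:
        print('Connection to Zookeeper resumed')
    else:
        print('Connection to Zookeeper lost with state {}'.format(state))

zk = KazooClient(hosts='127.0.0.1:2181')
zk.start()
zk.add_listener(listener)
```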
65d14ccd92 Return to all command-based Ceph gathering
Using the Rados module was very problematic, specifically because it had
no sensible timeout parameters and thus would hang for many seconds. This
is a serious problem since it blocks further keepalives.

Instead, remove the Rados usage entirely and go back to using manual OS
commands to gather this information. While this may cause PID exhaustion
more quickly, it's worthwhile to avoid failure scenarios when Ceph stats
time out.

Closes #137
2021-07-06 11:30:45 -04:00
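
A rough sketch of the command-based approach with a hard timeout (the `ceph` CLI invocation matches the diff below; the helper itself is illustrative, not the daemon's `run_os_command()`):

```python
import json
import subprocess

def get_ceph_health(timeout=1):
    # Shell out to the ceph CLI so a hung monitor can only stall us for
    # `timeout` seconds instead of blocking the keepalive indefinitely.
    try:
        proc = subprocess.run(
            ['ceph', 'health', '--format', 'json'],
            capture_output=True, timeout=timeout, check=True,
        )
        return json.loads(proc.stdout)['status']
    except (subprocess.TimeoutExpired, subprocess.CalledProcessError,
            ValueError, KeyError):
        return 'HEALTH_UNKN'
```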
12 changed files with 154 additions and 114 deletions

View File

@@ -1 +1 @@
0.9.23
0.9.24

View File

@@ -42,6 +42,14 @@ To get started with PVC, please see the [About](https://parallelvirtualcluster.r
## Changelog
#### v0.9.24
* [Node Daemon] Removes Rados module polling of Ceph cluster and returns to command-based polling for timeout purposes, and removes some flaky return statements
* [Node Daemon] Removes flaky Zookeeper connection renewals that caused problems
* [CLI Client] Allow raw lists of clusters from `pvc cluster list`
* [API Daemon] Fixes several issues when getting VM data without stats
* [API Daemon] Fixes issues with removing VMs while disks are still in use (failed provisioning, etc.)
#### v0.9.23
* [Daemons] Fixes a critical overwriting bug in zkhandler when schema paths are not yet valid

View File

@@ -25,7 +25,7 @@ import yaml
from distutils.util import strtobool as dustrtobool
# Daemon version
version = '0.9.23'
version = '0.9.24'
# API version
API_VERSION = 1.0

View File

@@ -251,7 +251,11 @@ def cluster_remove(name):
# pvc cluster list
###############################################################################
@click.command(name='list', short_help='List all available clusters.')
def cluster_list():
@click.option(
'-r', '--raw', 'raw', is_flag=True, default=False,
help='Display the raw list of cluster names only.'
)
def cluster_list(raw):
"""
List all the available PVC clusters configured in this CLI instance.
"""
@@ -302,6 +306,7 @@ def cluster_list():
if _api_key_length > api_key_length:
api_key_length = _api_key_length
if not raw:
# Display the data nicely
click.echo("Available clusters:")
click.echo()
@@ -341,6 +346,7 @@ def cluster_list():
if not api_key:
api_key = 'N/A'
if not raw:
click.echo(
'{bold}{name: <{name_length}} {description: <{description_length}} {address: <{address_length}} {port: <{port_length}} {scheme: <{scheme_length}} {api_key: <{api_key_length}}{end_bold}'.format(
bold='',
@@ -359,6 +365,8 @@ def cluster_list():
api_key_length=api_key_length
)
)
else:
click.echo(cluster)
# Validate that the cluster is set for a given command
@@ -1652,7 +1660,7 @@ def vm_dump(filename, domain):
@cluster_req
def vm_list(target_node, target_state, limit, raw):
"""
List all virtual machines; optionally only match names matching regex LIMIT.
List all virtual machines; optionally only match names or full UUIDs matching regex LIMIT.
NOTE: Red-coloured network lists indicate one or more configured networks are missing/invalid.
"""

View File

@@ -2,7 +2,7 @@ from setuptools import setup
setup(
name='pvc',
version='0.9.23',
version='0.9.24',
packages=['pvc', 'pvc.cli_lib'],
install_requires=[
'Click',

View File

@@ -343,8 +343,13 @@ def getInformationFromXML(zkhandler, uuid):
parsed_xml = getDomainXML(zkhandler, uuid)
stats_data = loads(zkhandler.read(('domain.stats', uuid)))
if stats_data is None:
stats_data = zkhandler.read(('domain.stats', uuid))
if stats_data is not None:
try:
stats_data = loads(stats_data)
except Exception:
stats_data = {}
else:
stats_data = {}
domain_uuid, domain_name, domain_description, domain_memory, domain_vcpu, domain_vcputopo = getDomainMainDetails(parsed_xml)

View File

@@ -449,14 +449,6 @@ def remove_vm(zkhandler, domain):
if current_vm_state != 'stop':
change_state(zkhandler, dom_uuid, 'stop')
# Gracefully terminate the class instances
change_state(zkhandler, dom_uuid, 'delete')
# Delete the configurations
zkhandler.delete([
('domain', dom_uuid)
])
# Wait for 1 second to allow state to flow to all nodes
time.sleep(1)
@@ -465,11 +457,28 @@ def remove_vm(zkhandler, domain):
# vmpool/vmname_volume
try:
disk_pool, disk_name = disk.split('/')
retcode, message = ceph.remove_volume(zkhandler, disk_pool, disk_name)
except ValueError:
continue
return True, 'Removed VM "{}" and disks from the cluster.'.format(domain)
retcode, message = ceph.remove_volume(zkhandler, disk_pool, disk_name)
if not retcode:
if re.match('^ERROR: No volume with name', message):
continue
else:
return False, message
# Gracefully terminate the class instances
change_state(zkhandler, dom_uuid, 'delete')
# Wait for 1/2 second to allow state to flow to all nodes
time.sleep(0.5)
# Delete the VM configuration from Zookeeper
zkhandler.delete([
('domain', dom_uuid)
])
return True, 'Removed VM "{}" and its disks from the cluster.'.format(domain)
def start_vm(zkhandler, domain):
@@ -789,7 +798,10 @@ def get_console_log(zkhandler, domain, lines=1000):
return False, 'ERROR: Could not find VM "{}" in the cluster!'.format(domain)
# Get the data from ZK
console_log = zkhandler.read(('domain.log.console', dom_uuid))
console_log = zkhandler.read(('domain.console.log', dom_uuid))
if console_log is None:
return True, ''
# Shrink the log buffer to length lines
shrunk_log = console_log.split('\n')[-lines:]
@@ -897,6 +909,9 @@ def get_list(zkhandler, node, state, limit, is_fuzzy=True):
for vm_uuid in vm_execute_list:
futures.append(executor.submit(common.getInformationFromXML, zkhandler, vm_uuid))
for future in futures:
try:
vm_data_list.append(future.result())
except Exception:
pass
return True, vm_data_list

View File

@@ -124,31 +124,23 @@ class ZKHandler(object):
# State/connection management
#
def listener(self, state):
"""
Listen for KazooState changes and log accordingly.
This function does nothing except log the state; Kazoo handles the rest.
"""
if state == KazooState.CONNECTED:
self.log('Connection to Zookeeper started', state='o')
self.log('Connection to Zookeeper resumed', state='o')
else:
self.log('Connection to Zookeeper lost', state='w')
while True:
time.sleep(0.5)
_zk_conn = KazooClient(hosts=self.coordinators)
try:
_zk_conn.start()
except Exception:
del _zk_conn
continue
self.zk_conn = _zk_conn
self.zk_conn.add_listener(self.listener)
break
self.log('Connection to Zookeeper lost with state {}'.format(state), state='w')
def connect(self, persistent=False):
"""
Start the zk_conn object and connect to the cluster, then load the current schema version
Start the zk_conn object and connect to the cluster
"""
try:
self.zk_conn.start()
self.log('Connection to Zookeeper started', state='o')
if persistent:
self.zk_conn.add_listener(self.listener)
except Exception as e:
@@ -162,11 +154,26 @@ class ZKHandler(object):
"""
self.zk_conn.stop()
self.zk_conn.close()
self.log('Connection to Zookeeper terminated', state='o')
#
# Schema helper actions
#
def get_schema_path(self, key):
"""
Get the Zookeeper path for {key} from the current schema based on its format.
If {key} is a tuple of length 2, it's treated as a path plus an item instance of that path (e.g. a node, a VM, etc.).
If {key} is a tuple of length 4, it is treated as a path plus an item instance, as well as another item instance of the subpath.
If {key} is just a string, it's treated as a lone path (mostly used for the 'base' schema group).
Otherwise, returns None since this is not a valid key.
This function also handles the special case where a string that looks like an existing path (i.e. starts with '/') is passed;
in that case it will silently return the same path back. This was mostly for migration purposes and is deprecated.
"""
if isinstance(key, tuple):
# This is a key tuple with both an ipath and an item
if len(key) == 2:
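
To make the docstring above concrete, here is a hedged sketch of how such keys might resolve, using an invented mapping rather than the real PVC schema (length-4 sub-item keys omitted for brevity):

```python
# Invented example schema; the real paths live in the ZKSchema definition.
BASE_PATHS = {'base.domain': '/domains'}
ITEM_PATHS = {'domain.state': '/domains/{}/state'}

def get_schema_path_sketch(key):
    if isinstance(key, str):
        if key.startswith('/'):
            return key                        # deprecated raw-path passthrough
        return BASE_PATHS.get(key)            # lone 'base'-group path
    if isinstance(key, tuple) and len(key) == 2:
        ipath, item = key                     # path plus one item instance
        return ITEM_PATHS[ipath].format(item)
    return None                               # not a valid key
```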

debian/changelog
View File

@@ -1,3 +1,13 @@
pvc (0.9.24-0) unstable; urgency=high
* [Node Daemon] Removes Rados module polling of Ceph cluster and returns to command-based polling for timeout purposes, and removes some flaky return statements
* [Node Daemon] Removes flaky Zookeeper connection renewals that caused problems
* [CLI Client] Allow raw lists of clusters from `pvc cluster list`
* [API Daemon] Fixes several issues when getting VM data without stats
* [API Daemon] Fixes issues with removing VMs while disks are still in use (failed provisioning, etc.)
-- Joshua M. Boniface <joshua@boniface.me> Fri, 09 Jul 2021 15:58:36 -0400
pvc (0.9.23-0) unstable; urgency=high
* [Daemons] Fixes a critical overwriting bug in zkhandler when schema paths are not yet valid

debian/control
View File

@@ -8,7 +8,7 @@ X-Python3-Version: >= 3.2
Package: pvc-daemon-node
Architecture: all
Depends: systemd, pvc-daemon-common, python3-kazoo, python3-psutil, python3-apscheduler, python3-libvirt, python3-psycopg2, python3-dnspython, python3-yaml, python3-distutils, python3-rados, python3-gevent, ipmitool, libvirt-daemon-system, arping, vlan, bridge-utils, dnsmasq, nftables, pdns-server, pdns-backend-pgsql
Depends: systemd, pvc-daemon-common, python3-kazoo, python3-psutil, python3-apscheduler, python3-libvirt, python3-psycopg2, python3-dnspython, python3-yaml, python3-distutils, python3-gevent, ipmitool, libvirt-daemon-system, arping, vlan, bridge-utils, dnsmasq, nftables, pdns-server, pdns-backend-pgsql
Suggests: pvc-client-api, pvc-client-cli
Description: Parallel Virtual Cluster node daemon (Python 3)
A KVM/Zookeeper/Ceph-based VM and private cloud manager

View File

@@ -42,6 +42,14 @@ To get started with PVC, please see the [About](https://parallelvirtualcluster.r
## Changelog
#### v0.9.24
* [Node Daemon] Removes Rados module polling of Ceph cluster and returns to command-based polling for timeout purposes, and removes some flaky return statements
* [Node Daemon] Removes flaky Zookeeper connection renewals that caused problems
* [CLI Client] Allow raw lists of clusters from `pvc cluster list`
* [API Daemon] Fixes several issues when getting VM data without stats
* [API Daemon] Fixes issues with removing VMs while disks are still in use (failed provisioning, etc.)
#### v0.9.23
* [Daemons] Fixes a critical overwriting bug in zkhandler when schema paths are not yet valid

View File

@@ -38,7 +38,6 @@ from apscheduler.schedulers.background import BackgroundScheduler
from distutils.util import strtobool
from queue import Queue
from xml.etree import ElementTree
from rados import Rados
from daemon_lib.zkhandler import ZKHandler
@@ -55,7 +54,7 @@ import pvcnoded.CephInstance as CephInstance
import pvcnoded.MetadataAPIInstance as MetadataAPIInstance
# Version string for startup output
version = '0.9.23'
version = '0.9.24'
###############################################################################
# PVCD - node daemon startup program
@@ -1314,31 +1313,22 @@ def collect_ceph_stats(queue):
if debug:
logger.out("Thread starting", state='d', prefix='ceph-thread')
# Connect to the Ceph cluster
try:
ceph_conn = Rados(conffile=config['ceph_config_file'], conf=dict(keyring=config['ceph_admin_keyring']))
if debug:
logger.out("Connecting to cluster", state='d', prefix='ceph-thread')
ceph_conn.connect(timeout=1)
except Exception as e:
logger.out('Failed to open connection to Ceph cluster: {}'.format(e), state='e')
return
if debug:
logger.out("Getting health stats from monitor", state='d', prefix='ceph-thread')
# Get Ceph cluster health for local status output
command = {"prefix": "health", "format": "json"}
_, stdout, _ = common.run_os_command('ceph health --format json', timeout=1)
try:
health_status = json.loads(ceph_conn.mon_command(json.dumps(command), b'', timeout=1)[1])
ceph_health = health_status['status']
ceph_health = json.loads(stdout)['status']
except Exception as e:
logger.out('Failed to obtain Ceph health data: {}'.format(e), state='e')
return
ceph_health = 'HEALTH_UNKN'
if ceph_health == 'HEALTH_OK':
if ceph_health in ['HEALTH_OK']:
ceph_health_colour = fmt_green
elif ceph_health == 'HEALTH_WARN':
elif ceph_health in ['HEALTH_UNKN']:
ceph_health_colour = fmt_cyan
elif ceph_health in ['HEALTH_WARN']:
ceph_health_colour = fmt_yellow
else:
ceph_health_colour = fmt_red
@@ -1348,42 +1338,38 @@ def collect_ceph_stats(queue):
if debug:
logger.out("Set ceph health information in zookeeper (primary only)", state='d', prefix='ceph-thread')
command = {"prefix": "status", "format": "pretty"}
ceph_status = ceph_conn.mon_command(json.dumps(command), b'', timeout=1)[1].decode('ascii')
_, ceph_status, _ = common.run_os_command('ceph status --format plain', timeout=1)
try:
zkhandler.write([
('base.storage', str(ceph_status))
])
except Exception as e:
logger.out('Failed to set Ceph status data: {}'.format(e), state='e')
return
if debug:
logger.out("Set ceph rados df information in zookeeper (primary only)", state='d', prefix='ceph-thread')
# Get rados df info
command = {"prefix": "df", "format": "pretty"}
ceph_df = ceph_conn.mon_command(json.dumps(command), b'', timeout=1)[1].decode('ascii')
_, ceph_df, _ = common.run_os_command('ceph df --format plain', timeout=1)
try:
zkhandler.write([
('base.storage.util', str(ceph_df))
])
except Exception as e:
logger.out('Failed to set Ceph utilization data: {}'.format(e), state='e')
return
if debug:
logger.out("Set pool information in zookeeper (primary only)", state='d', prefix='ceph-thread')
# Get pool info
retcode, stdout, stderr = common.run_os_command('ceph df --format json', timeout=1)
_, stdout, _ = common.run_os_command('ceph df --format json', timeout=1)
try:
ceph_pool_df_raw = json.loads(stdout)['pools']
except Exception as e:
logger.out('Failed to obtain Pool data (ceph df): {}'.format(e), state='w')
ceph_pool_df_raw = []
retcode, stdout, stderr = common.run_os_command('rados df --format json', timeout=1)
_, stdout, _ = common.run_os_command('rados df --format json', timeout=1)
try:
rados_pool_df_raw = json.loads(stdout)['pools']
except Exception as e:
@@ -1448,9 +1434,8 @@ def collect_ceph_stats(queue):
# Parse the dump data
osd_dump = dict()
command = {"prefix": "osd dump", "format": "json"}
_, stdout, _ = common.run_os_command('ceph osd dump --format json --connect-timeout 1', timeout=1)
try:
retcode, stdout, stderr = common.run_os_command('ceph osd dump --format json --connect-timeout 2', timeout=2)
osd_dump_raw = json.loads(stdout)['osds']
except Exception as e:
logger.out('Failed to obtain OSD data: {}'.format(e), state='w')
@@ -1474,9 +1459,9 @@ def collect_ceph_stats(queue):
osd_df = dict()
command = {"prefix": "osd df", "format": "json"}
_, osd_df_out, _ = common.run_os_command('ceph osd df --format json', timeout=1)
try:
osd_df_raw = json.loads(ceph_conn.mon_command(json.dumps(command), b'', timeout=1)[1])['nodes']
osd_df_raw = json.loads(osd_df_out)['nodes']
except Exception as e:
logger.out('Failed to obtain OSD data: {}'.format(e), state='w')
osd_df_raw = []
@@ -1501,12 +1486,10 @@ def collect_ceph_stats(queue):
osd_status = dict()
command = {"prefix": "osd status", "format": "pretty"}
try:
osd_status_raw = ceph_conn.mon_command(json.dumps(command), b'', timeout=1)[1].decode('ascii')
except Exception as e:
logger.out('Failed to obtain OSD status data: {}'.format(e), state='w')
osd_status_raw = []
retcode, osd_status_raw, stderr = common.run_os_command('ceph osd status --format plain', timeout=1)
if retcode != 0:
logger.out('Failed to obtain OSD status data: {}'.format(stderr), state='w')
osd_status_raw = ''
if debug:
logger.out("Loop through OSD status data", state='d', prefix='ceph-thread')
@@ -1573,8 +1556,6 @@ def collect_ceph_stats(queue):
# One or more of the status commands timed out, just continue
logger.out('Failed to upload OSD stats from dictionary: {}'.format(e), state='w')
ceph_conn.shutdown()
queue.put(ceph_health_colour)
queue.put(ceph_health)
queue.put(osds_this_node)
@@ -1608,7 +1589,6 @@ def collect_vm_stats(queue):
lv_conn = libvirt.open(libvirt_name)
if lv_conn is None:
logger.out('Failed to open connection to "{}"'.format(libvirt_name), state='e')
return
memalloc = 0
memprov = 0
@@ -1868,7 +1848,6 @@ def node_keepalive():
])
except Exception:
logger.out('Failed to set keepalive data', state='e')
return
# Display node information to the terminal
if config['log_keepalives']: