Return to all command-based Ceph gathering

Using the Rados module was very problematic, specifically because it had
no sensible timeout parameters and thus would hang for many seconds.
This has poor implications, since the hang blocks further keepalives.

Instead, remove the Rados usage entirely and go back completely to using
manual OS commands to gather this information. While this may cause PID
exhaustion more quickly, it's worthwhile to avoid failure scenarios when
Ceph stats time out.

Closes #137
This commit is contained in:
Joshua Boniface 2021-07-06 11:19:22 -04:00
parent adc022f55d
commit 65d14ccd92
2 changed files with 14 additions and 33 deletions

2
debian/control vendored
View File

@ -8,7 +8,7 @@ X-Python3-Version: >= 3.2
Package: pvc-daemon-node
Architecture: all
Depends: systemd, pvc-daemon-common, python3-kazoo, python3-psutil, python3-apscheduler, python3-libvirt, python3-psycopg2, python3-dnspython, python3-yaml, python3-distutils, python3-rados, python3-gevent, ipmitool, libvirt-daemon-system, arping, vlan, bridge-utils, dnsmasq, nftables, pdns-server, pdns-backend-pgsql
Depends: systemd, pvc-daemon-common, python3-kazoo, python3-psutil, python3-apscheduler, python3-libvirt, python3-psycopg2, python3-dnspython, python3-yaml, python3-distutils, python3-gevent, ipmitool, libvirt-daemon-system, arping, vlan, bridge-utils, dnsmasq, nftables, pdns-server, pdns-backend-pgsql
Suggests: pvc-client-api, pvc-client-cli
Description: Parallel Virtual Cluster node daemon (Python 3)
A KVM/Zookeeper/Ceph-based VM and private cloud manager

View File

@ -38,7 +38,6 @@ from apscheduler.schedulers.background import BackgroundScheduler
from distutils.util import strtobool
from queue import Queue
from xml.etree import ElementTree
from rados import Rados
from daemon_lib.zkhandler import ZKHandler
@ -1314,24 +1313,13 @@ def collect_ceph_stats(queue):
if debug:
logger.out("Thread starting", state='d', prefix='ceph-thread')
# Connect to the Ceph cluster
try:
ceph_conn = Rados(conffile=config['ceph_config_file'], conf=dict(keyring=config['ceph_admin_keyring']))
if debug:
logger.out("Connecting to cluster", state='d', prefix='ceph-thread')
ceph_conn.connect(timeout=1)
except Exception as e:
logger.out('Failed to open connection to Ceph cluster: {}'.format(e), state='e')
return
if debug:
logger.out("Getting health stats from monitor", state='d', prefix='ceph-thread')
# Get Ceph cluster health for local status output
command = {"prefix": "health", "format": "json"}
_, stdout, _ = common.run_os_command('ceph health --format json', timeout=1)
try:
health_status = json.loads(ceph_conn.mon_command(json.dumps(command), b'', timeout=1)[1])
ceph_health = health_status['status']
ceph_health = json.loads(stdout)['status']
except Exception as e:
logger.out('Failed to obtain Ceph health data: {}'.format(e), state='e')
return
@ -1348,8 +1336,7 @@ def collect_ceph_stats(queue):
if debug:
logger.out("Set ceph health information in zookeeper (primary only)", state='d', prefix='ceph-thread')
command = {"prefix": "status", "format": "pretty"}
ceph_status = ceph_conn.mon_command(json.dumps(command), b'', timeout=1)[1].decode('ascii')
_, ceph_status, _ = common.run_os_command('ceph status --format plain', timeout=1)
try:
zkhandler.write([
('base.storage', str(ceph_status))
@ -1362,8 +1349,7 @@ def collect_ceph_stats(queue):
logger.out("Set ceph rados df information in zookeeper (primary only)", state='d', prefix='ceph-thread')
# Get rados df info
command = {"prefix": "df", "format": "pretty"}
ceph_df = ceph_conn.mon_command(json.dumps(command), b'', timeout=1)[1].decode('ascii')
_, ceph_df, _ = common.run_os_command('ceph df --format plain', timeout=1)
try:
zkhandler.write([
('base.storage.util', str(ceph_df))
@ -1376,14 +1362,14 @@ def collect_ceph_stats(queue):
logger.out("Set pool information in zookeeper (primary only)", state='d', prefix='ceph-thread')
# Get pool info
retcode, stdout, stderr = common.run_os_command('ceph df --format json', timeout=1)
_, stdout, _ = common.run_os_command('ceph df --format json', timeout=1)
try:
ceph_pool_df_raw = json.loads(stdout)['pools']
except Exception as e:
logger.out('Failed to obtain Pool data (ceph df): {}'.format(e), state='w')
ceph_pool_df_raw = []
retcode, stdout, stderr = common.run_os_command('rados df --format json', timeout=1)
_, stdout, _ = common.run_os_command('rados df --format json', timeout=1)
try:
rados_pool_df_raw = json.loads(stdout)['pools']
except Exception as e:
@ -1448,9 +1434,8 @@ def collect_ceph_stats(queue):
# Parse the dump data
osd_dump = dict()
command = {"prefix": "osd dump", "format": "json"}
_, stdout, _ = common.run_os_command('ceph osd dump --format json --connect-timeout 1', timeout=1)
try:
retcode, stdout, stderr = common.run_os_command('ceph osd dump --format json --connect-timeout 2', timeout=2)
osd_dump_raw = json.loads(stdout)['osds']
except Exception as e:
logger.out('Failed to obtain OSD data: {}'.format(e), state='w')
@ -1474,9 +1459,9 @@ def collect_ceph_stats(queue):
osd_df = dict()
command = {"prefix": "osd df", "format": "json"}
_, osd_df_out, _ = common.run_os_command('ceph osd df --format json', timeout=1)
try:
osd_df_raw = json.loads(ceph_conn.mon_command(json.dumps(command), b'', timeout=1)[1])['nodes']
osd_df_raw = json.loads(osd_df_out)['nodes']
except Exception as e:
logger.out('Failed to obtain OSD data: {}'.format(e), state='w')
osd_df_raw = []
@ -1501,12 +1486,10 @@ def collect_ceph_stats(queue):
osd_status = dict()
command = {"prefix": "osd status", "format": "pretty"}
try:
osd_status_raw = ceph_conn.mon_command(json.dumps(command), b'', timeout=1)[1].decode('ascii')
except Exception as e:
logger.out('Failed to obtain OSD status data: {}'.format(e), state='w')
osd_status_raw = []
retcode, osd_status_raw, stderr = common.run_os_command('ceph osd status --format plain', timeout=1)
if retcode != 0:
logger.out('Failed to obtain OSD status data: {}'.format(stderr), state='w')
osd_status_raw = ''
if debug:
logger.out("Loop through OSD status data", state='d', prefix='ceph-thread')
@ -1573,8 +1556,6 @@ def collect_ceph_stats(queue):
# One or more of the status commands timed out, just continue
logger.out('Failed to upload OSD stats from dictionary: {}'.format(e), state='w')
ceph_conn.shutdown()
queue.put(ceph_health_colour)
queue.put(ceph_health)
queue.put(osds_this_node)