Return to all command-based Ceph gathering
Using the Rados module was very problematic, specifically because it has no sensible timeout parameters and would therefore hang for many seconds. This has serious implications, since such a hang blocks further keepalives. Instead, remove the Rados usage entirely and go back to using manual OS commands exclusively to gather this information. While this may cause PID exhaustion more quickly, it is worthwhile to avoid failure scenarios when Ceph stats time out. Closes #137
This commit is contained in:
parent
adc022f55d
commit
65d14ccd92
|
@ -8,7 +8,7 @@ X-Python3-Version: >= 3.2
|
||||||
|
|
||||||
Package: pvc-daemon-node
|
Package: pvc-daemon-node
|
||||||
Architecture: all
|
Architecture: all
|
||||||
Depends: systemd, pvc-daemon-common, python3-kazoo, python3-psutil, python3-apscheduler, python3-libvirt, python3-psycopg2, python3-dnspython, python3-yaml, python3-distutils, python3-rados, python3-gevent, ipmitool, libvirt-daemon-system, arping, vlan, bridge-utils, dnsmasq, nftables, pdns-server, pdns-backend-pgsql
|
Depends: systemd, pvc-daemon-common, python3-kazoo, python3-psutil, python3-apscheduler, python3-libvirt, python3-psycopg2, python3-dnspython, python3-yaml, python3-distutils, python3-gevent, ipmitool, libvirt-daemon-system, arping, vlan, bridge-utils, dnsmasq, nftables, pdns-server, pdns-backend-pgsql
|
||||||
Suggests: pvc-client-api, pvc-client-cli
|
Suggests: pvc-client-api, pvc-client-cli
|
||||||
Description: Parallel Virtual Cluster node daemon (Python 3)
|
Description: Parallel Virtual Cluster node daemon (Python 3)
|
||||||
A KVM/Zookeeper/Ceph-based VM and private cloud manager
|
A KVM/Zookeeper/Ceph-based VM and private cloud manager
|
||||||
|
|
|
@ -38,7 +38,6 @@ from apscheduler.schedulers.background import BackgroundScheduler
|
||||||
from distutils.util import strtobool
|
from distutils.util import strtobool
|
||||||
from queue import Queue
|
from queue import Queue
|
||||||
from xml.etree import ElementTree
|
from xml.etree import ElementTree
|
||||||
from rados import Rados
|
|
||||||
|
|
||||||
from daemon_lib.zkhandler import ZKHandler
|
from daemon_lib.zkhandler import ZKHandler
|
||||||
|
|
||||||
|
@ -1314,24 +1313,13 @@ def collect_ceph_stats(queue):
|
||||||
if debug:
|
if debug:
|
||||||
logger.out("Thread starting", state='d', prefix='ceph-thread')
|
logger.out("Thread starting", state='d', prefix='ceph-thread')
|
||||||
|
|
||||||
# Connect to the Ceph cluster
|
|
||||||
try:
|
|
||||||
ceph_conn = Rados(conffile=config['ceph_config_file'], conf=dict(keyring=config['ceph_admin_keyring']))
|
|
||||||
if debug:
|
|
||||||
logger.out("Connecting to cluster", state='d', prefix='ceph-thread')
|
|
||||||
ceph_conn.connect(timeout=1)
|
|
||||||
except Exception as e:
|
|
||||||
logger.out('Failed to open connection to Ceph cluster: {}'.format(e), state='e')
|
|
||||||
return
|
|
||||||
|
|
||||||
if debug:
|
if debug:
|
||||||
logger.out("Getting health stats from monitor", state='d', prefix='ceph-thread')
|
logger.out("Getting health stats from monitor", state='d', prefix='ceph-thread')
|
||||||
|
|
||||||
# Get Ceph cluster health for local status output
|
# Get Ceph cluster health for local status output
|
||||||
command = {"prefix": "health", "format": "json"}
|
_, stdout, _ = common.run_os_command('ceph health --format json', timeout=1)
|
||||||
try:
|
try:
|
||||||
health_status = json.loads(ceph_conn.mon_command(json.dumps(command), b'', timeout=1)[1])
|
ceph_health = json.loads(stdout)['status']
|
||||||
ceph_health = health_status['status']
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.out('Failed to obtain Ceph health data: {}'.format(e), state='e')
|
logger.out('Failed to obtain Ceph health data: {}'.format(e), state='e')
|
||||||
return
|
return
|
||||||
|
@ -1348,8 +1336,7 @@ def collect_ceph_stats(queue):
|
||||||
if debug:
|
if debug:
|
||||||
logger.out("Set ceph health information in zookeeper (primary only)", state='d', prefix='ceph-thread')
|
logger.out("Set ceph health information in zookeeper (primary only)", state='d', prefix='ceph-thread')
|
||||||
|
|
||||||
command = {"prefix": "status", "format": "pretty"}
|
_, ceph_status, _ = common.run_os_command('ceph status --format plain', timeout=1)
|
||||||
ceph_status = ceph_conn.mon_command(json.dumps(command), b'', timeout=1)[1].decode('ascii')
|
|
||||||
try:
|
try:
|
||||||
zkhandler.write([
|
zkhandler.write([
|
||||||
('base.storage', str(ceph_status))
|
('base.storage', str(ceph_status))
|
||||||
|
@ -1362,8 +1349,7 @@ def collect_ceph_stats(queue):
|
||||||
logger.out("Set ceph rados df information in zookeeper (primary only)", state='d', prefix='ceph-thread')
|
logger.out("Set ceph rados df information in zookeeper (primary only)", state='d', prefix='ceph-thread')
|
||||||
|
|
||||||
# Get rados df info
|
# Get rados df info
|
||||||
command = {"prefix": "df", "format": "pretty"}
|
_, ceph_df, _ = common.run_os_command('ceph df --format plain', timeout=1)
|
||||||
ceph_df = ceph_conn.mon_command(json.dumps(command), b'', timeout=1)[1].decode('ascii')
|
|
||||||
try:
|
try:
|
||||||
zkhandler.write([
|
zkhandler.write([
|
||||||
('base.storage.util', str(ceph_df))
|
('base.storage.util', str(ceph_df))
|
||||||
|
@ -1376,14 +1362,14 @@ def collect_ceph_stats(queue):
|
||||||
logger.out("Set pool information in zookeeper (primary only)", state='d', prefix='ceph-thread')
|
logger.out("Set pool information in zookeeper (primary only)", state='d', prefix='ceph-thread')
|
||||||
|
|
||||||
# Get pool info
|
# Get pool info
|
||||||
retcode, stdout, stderr = common.run_os_command('ceph df --format json', timeout=1)
|
_, stdout, _ = common.run_os_command('ceph df --format json', timeout=1)
|
||||||
try:
|
try:
|
||||||
ceph_pool_df_raw = json.loads(stdout)['pools']
|
ceph_pool_df_raw = json.loads(stdout)['pools']
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.out('Failed to obtain Pool data (ceph df): {}'.format(e), state='w')
|
logger.out('Failed to obtain Pool data (ceph df): {}'.format(e), state='w')
|
||||||
ceph_pool_df_raw = []
|
ceph_pool_df_raw = []
|
||||||
|
|
||||||
retcode, stdout, stderr = common.run_os_command('rados df --format json', timeout=1)
|
_, stdout, _ = common.run_os_command('rados df --format json', timeout=1)
|
||||||
try:
|
try:
|
||||||
rados_pool_df_raw = json.loads(stdout)['pools']
|
rados_pool_df_raw = json.loads(stdout)['pools']
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
|
@ -1448,9 +1434,8 @@ def collect_ceph_stats(queue):
|
||||||
# Parse the dump data
|
# Parse the dump data
|
||||||
osd_dump = dict()
|
osd_dump = dict()
|
||||||
|
|
||||||
command = {"prefix": "osd dump", "format": "json"}
|
_, stdout, _ = common.run_os_command('ceph osd dump --format json --connect-timeout 1', timeout=1)
|
||||||
try:
|
try:
|
||||||
retcode, stdout, stderr = common.run_os_command('ceph osd dump --format json --connect-timeout 2', timeout=2)
|
|
||||||
osd_dump_raw = json.loads(stdout)['osds']
|
osd_dump_raw = json.loads(stdout)['osds']
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.out('Failed to obtain OSD data: {}'.format(e), state='w')
|
logger.out('Failed to obtain OSD data: {}'.format(e), state='w')
|
||||||
|
@ -1474,9 +1459,9 @@ def collect_ceph_stats(queue):
|
||||||
|
|
||||||
osd_df = dict()
|
osd_df = dict()
|
||||||
|
|
||||||
command = {"prefix": "osd df", "format": "json"}
|
_, osd_df_out, _ = common.run_os_command('ceph osd df --format json', timeout=1)
|
||||||
try:
|
try:
|
||||||
osd_df_raw = json.loads(ceph_conn.mon_command(json.dumps(command), b'', timeout=1)[1])['nodes']
|
osd_df_raw = json.loads(osd_df_out)['nodes']
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.out('Failed to obtain OSD data: {}'.format(e), state='w')
|
logger.out('Failed to obtain OSD data: {}'.format(e), state='w')
|
||||||
osd_df_raw = []
|
osd_df_raw = []
|
||||||
|
@ -1501,12 +1486,10 @@ def collect_ceph_stats(queue):
|
||||||
|
|
||||||
osd_status = dict()
|
osd_status = dict()
|
||||||
|
|
||||||
command = {"prefix": "osd status", "format": "pretty"}
|
retcode, osd_status_raw, stderr = common.run_os_command('ceph osd status --format plain', timeout=1)
|
||||||
try:
|
if retcode != 0:
|
||||||
osd_status_raw = ceph_conn.mon_command(json.dumps(command), b'', timeout=1)[1].decode('ascii')
|
logger.out('Failed to obtain OSD status data: {}'.format(stderr), state='w')
|
||||||
except Exception as e:
|
osd_status_raw = ''
|
||||||
logger.out('Failed to obtain OSD status data: {}'.format(e), state='w')
|
|
||||||
osd_status_raw = []
|
|
||||||
|
|
||||||
if debug:
|
if debug:
|
||||||
logger.out("Loop through OSD status data", state='d', prefix='ceph-thread')
|
logger.out("Loop through OSD status data", state='d', prefix='ceph-thread')
|
||||||
|
@ -1573,8 +1556,6 @@ def collect_ceph_stats(queue):
|
||||||
# One or more of the status commands timed out, just continue
|
# One or more of the status commands timed out, just continue
|
||||||
logger.out('Failed to upload OSD stats from dictionary: {}'.format(e), state='w')
|
logger.out('Failed to upload OSD stats from dictionary: {}'.format(e), state='w')
|
||||||
|
|
||||||
ceph_conn.shutdown()
|
|
||||||
|
|
||||||
queue.put(ceph_health_colour)
|
queue.put(ceph_health_colour)
|
||||||
queue.put(ceph_health)
|
queue.put(ceph_health)
|
||||||
queue.put(osds_this_node)
|
queue.put(osds_this_node)
|
||||||
|
|
Loading…
Reference in New Issue