Revert "Return to all command-based Ceph gathering"
This reverts commit 65d14ccd92.
This was actually a bad idea. For inexplicable reasons, running these
Ceph commands manually (not even via Python, but in a normal shell)
takes roughly 600-700 times longer (between two and three orders of
magnitude) than running them with the Rados module; so long, in fact,
that some basic commands like "ceph health" would sometimes take longer
than the 1 second timeout to complete. The Rados commands, however, take
about 1ms instead.
Despite the occasional issues when monitors drop out, the Rados module
is clearly far superior to the shell commands for any moderately-loaded
Ceph cluster. We can look into solving timeouts another way (perhaps
with Processes instead of Threads) at a later time.
Rados module "ceph health":
b'{"checks":{},"status":"HEALTH_OK"}'
0.001204 (s)
b'{"checks":{},"status":"HEALTH_OK"}'
0.001258 (s)
Command "ceph health":
joshua@hv1.c.bonilan.net ~ $ time ceph health >/dev/null
real 0m0.772s
user 0m0.707s
sys 0m0.046s
joshua@hv1.c.bonilan.net ~ $ time ceph health >/dev/null
real 0m0.796s
user 0m0.728s
sys 0m0.054s
This commit is contained in:
parent
0699c48d10
commit
cfeba50b17
|
@ -8,7 +8,7 @@ X-Python3-Version: >= 3.2
|
|||
|
||||
Package: pvc-daemon-node
|
||||
Architecture: all
|
||||
Depends: systemd, pvc-daemon-common, python3-kazoo, python3-psutil, python3-apscheduler, python3-libvirt, python3-psycopg2, python3-dnspython, python3-yaml, python3-distutils, python3-gevent, ipmitool, libvirt-daemon-system, arping, vlan, bridge-utils, dnsmasq, nftables, pdns-server, pdns-backend-pgsql
|
||||
Depends: systemd, pvc-daemon-common, python3-kazoo, python3-psutil, python3-apscheduler, python3-libvirt, python3-psycopg2, python3-dnspython, python3-yaml, python3-distutils, python3-rados, python3-gevent, ipmitool, libvirt-daemon-system, arping, vlan, bridge-utils, dnsmasq, nftables, pdns-server, pdns-backend-pgsql
|
||||
Suggests: pvc-client-api, pvc-client-cli
|
||||
Description: Parallel Virtual Cluster node daemon (Python 3)
|
||||
A KVM/Zookeeper/Ceph-based VM and private cloud manager
|
||||
|
|
|
@ -38,6 +38,7 @@ from apscheduler.schedulers.background import BackgroundScheduler
|
|||
from distutils.util import strtobool
|
||||
from queue import Queue
|
||||
from xml.etree import ElementTree
|
||||
from rados import Rados
|
||||
|
||||
from daemon_lib.zkhandler import ZKHandler
|
||||
|
||||
|
@ -1313,13 +1314,24 @@ def collect_ceph_stats(queue):
|
|||
if debug:
|
||||
logger.out("Thread starting", state='d', prefix='ceph-thread')
|
||||
|
||||
# Connect to the Ceph cluster
|
||||
try:
|
||||
ceph_conn = Rados(conffile=config['ceph_config_file'], conf=dict(keyring=config['ceph_admin_keyring']))
|
||||
if debug:
|
||||
logger.out("Connecting to cluster", state='d', prefix='ceph-thread')
|
||||
ceph_conn.connect(timeout=1)
|
||||
except Exception as e:
|
||||
logger.out('Failed to open connection to Ceph cluster: {}'.format(e), state='e')
|
||||
return
|
||||
|
||||
if debug:
|
||||
logger.out("Getting health stats from monitor", state='d', prefix='ceph-thread')
|
||||
|
||||
# Get Ceph cluster health for local status output
|
||||
_, stdout, _ = common.run_os_command('ceph health --format json', timeout=1)
|
||||
command = {"prefix": "health", "format": "json"}
|
||||
try:
|
||||
ceph_health = json.loads(stdout)['status']
|
||||
health_status = json.loads(ceph_conn.mon_command(json.dumps(command), b'', timeout=1)[1])
|
||||
ceph_health = health_status['status']
|
||||
except Exception as e:
|
||||
logger.out('Failed to obtain Ceph health data: {}'.format(e), state='e')
|
||||
ceph_health = 'HEALTH_UNKN'
|
||||
|
@ -1338,7 +1350,8 @@ def collect_ceph_stats(queue):
|
|||
if debug:
|
||||
logger.out("Set ceph health information in zookeeper (primary only)", state='d', prefix='ceph-thread')
|
||||
|
||||
_, ceph_status, _ = common.run_os_command('ceph status --format plain', timeout=1)
|
||||
command = {"prefix": "status", "format": "pretty"}
|
||||
ceph_status = ceph_conn.mon_command(json.dumps(command), b'', timeout=1)[1].decode('ascii')
|
||||
try:
|
||||
zkhandler.write([
|
||||
('base.storage', str(ceph_status))
|
||||
|
@ -1350,7 +1363,8 @@ def collect_ceph_stats(queue):
|
|||
logger.out("Set ceph rados df information in zookeeper (primary only)", state='d', prefix='ceph-thread')
|
||||
|
||||
# Get rados df info
|
||||
_, ceph_df, _ = common.run_os_command('ceph df --format plain', timeout=1)
|
||||
command = {"prefix": "df", "format": "pretty"}
|
||||
ceph_df = ceph_conn.mon_command(json.dumps(command), b'', timeout=1)[1].decode('ascii')
|
||||
try:
|
||||
zkhandler.write([
|
||||
('base.storage.util', str(ceph_df))
|
||||
|
@ -1362,14 +1376,14 @@ def collect_ceph_stats(queue):
|
|||
logger.out("Set pool information in zookeeper (primary only)", state='d', prefix='ceph-thread')
|
||||
|
||||
# Get pool info
|
||||
_, stdout, _ = common.run_os_command('ceph df --format json', timeout=1)
|
||||
retcode, stdout, stderr = common.run_os_command('ceph df --format json', timeout=1)
|
||||
try:
|
||||
ceph_pool_df_raw = json.loads(stdout)['pools']
|
||||
except Exception as e:
|
||||
logger.out('Failed to obtain Pool data (ceph df): {}'.format(e), state='w')
|
||||
ceph_pool_df_raw = []
|
||||
|
||||
_, stdout, _ = common.run_os_command('rados df --format json', timeout=1)
|
||||
retcode, stdout, stderr = common.run_os_command('rados df --format json', timeout=1)
|
||||
try:
|
||||
rados_pool_df_raw = json.loads(stdout)['pools']
|
||||
except Exception as e:
|
||||
|
@ -1434,8 +1448,9 @@ def collect_ceph_stats(queue):
|
|||
# Parse the dump data
|
||||
osd_dump = dict()
|
||||
|
||||
_, stdout, _ = common.run_os_command('ceph osd dump --format json --connect-timeout 1', timeout=1)
|
||||
command = {"prefix": "osd dump", "format": "json"}
|
||||
try:
|
||||
retcode, stdout, stderr = common.run_os_command('ceph osd dump --format json --connect-timeout 2', timeout=2)
|
||||
osd_dump_raw = json.loads(stdout)['osds']
|
||||
except Exception as e:
|
||||
logger.out('Failed to obtain OSD data: {}'.format(e), state='w')
|
||||
|
@ -1459,9 +1474,9 @@ def collect_ceph_stats(queue):
|
|||
|
||||
osd_df = dict()
|
||||
|
||||
_, osd_df_out, _ = common.run_os_command('ceph osd df --format json', timeout=1)
|
||||
command = {"prefix": "osd df", "format": "json"}
|
||||
try:
|
||||
osd_df_raw = json.loads(osd_df_out)['nodes']
|
||||
osd_df_raw = json.loads(ceph_conn.mon_command(json.dumps(command), b'', timeout=1)[1])['nodes']
|
||||
except Exception as e:
|
||||
logger.out('Failed to obtain OSD data: {}'.format(e), state='w')
|
||||
osd_df_raw = []
|
||||
|
@ -1486,10 +1501,12 @@ def collect_ceph_stats(queue):
|
|||
|
||||
osd_status = dict()
|
||||
|
||||
retcode, osd_status_raw, stderr = common.run_os_command('ceph osd status --format plain', timeout=1)
|
||||
if retcode != 0:
|
||||
logger.out('Failed to obtain OSD status data: {}'.format(stderr), state='w')
|
||||
osd_status_raw = ''
|
||||
command = {"prefix": "osd status", "format": "pretty"}
|
||||
try:
|
||||
osd_status_raw = ceph_conn.mon_command(json.dumps(command), b'', timeout=1)[1].decode('ascii')
|
||||
except Exception as e:
|
||||
logger.out('Failed to obtain OSD status data: {}'.format(e), state='w')
|
||||
osd_status_raw = []
|
||||
|
||||
if debug:
|
||||
logger.out("Loop through OSD status data", state='d', prefix='ceph-thread')
|
||||
|
@ -1556,6 +1573,8 @@ def collect_ceph_stats(queue):
|
|||
# One or more of the status commands timed out, just continue
|
||||
logger.out('Failed to upload OSD stats from dictionary: {}'.format(e), state='w')
|
||||
|
||||
ceph_conn.shutdown()
|
||||
|
||||
queue.put(ceph_health_colour)
|
||||
queue.put(ceph_health)
|
||||
queue.put(osds_this_node)
|
||||
|
|
Loading…
Reference in New Issue