Go back to manual command for OSD stats
Using the Ceph library was a disaster here; it had no timeout and no way to force it to continue, so keepalives would become stuck and trigger fence storms. Go back to the manual osd dump command with a 2s timeout, which is far more reliable and can be adequately terminated if it runs long.
This commit is contained in:
parent
42f2dedf6d
commit
0587bcbd67
|
@@ -1149,7 +1149,8 @@ def collect_ceph_stats(queue):
|
||||||
|
|
||||||
command = { "prefix": "osd dump", "format": "json" }
|
command = { "prefix": "osd dump", "format": "json" }
|
||||||
try:
|
try:
|
||||||
osd_dump_raw = json.loads(ceph_conn.mon_command(json.dumps(command), b'', timeout=1)[1])['osds']
|
retcode, stdout, stderr = common.run_os_command('ceph osd dump --format json --connect-timeout 2', timeout=2)
|
||||||
|
osd_dump_raw = json.loads(stdout)['osds']
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.out('Failed to obtain OSD data: {}'.format(e), state='w')
|
logger.out('Failed to obtain OSD data: {}'.format(e), state='w')
|
||||||
osd_dump_raw = []
|
osd_dump_raw = []
|
||||||
|
|
Loading…
Reference in New Issue