Handle failures of Ceph commands gracefully

If these commands fail, catch the error, log a warning, and fall back to
empty lists. Also handle the resulting missing entries during later data parsing.
This commit is contained in:
Joshua Boniface 2019-07-09 16:20:56 -04:00
parent 1a8e7509f7
commit 47f86475f8
1 changed file with 37 additions and 12 deletions

View File

@ -956,7 +956,12 @@ def update_zookeeper():
# Get pool info # Get pool info
pool_df = dict() pool_df = dict()
retcode, stdout, stderr = common.run_os_command('rados df --format json', timeout=1) retcode, stdout, stderr = common.run_os_command('rados df --format json', timeout=1)
try:
pool_df_raw = json.loads(stdout)['pools'] pool_df_raw = json.loads(stdout)['pools']
except json.decoder.JSONDecodeError:
logger.out('Failed to obtain Pool data', state='w')
pool_df_raw = []
for pool in pool_df_raw: for pool in pool_df_raw:
pool_df.update({ pool_df.update({
str(pool['name']): { str(pool['name']): {
@ -977,9 +982,14 @@ def update_zookeeper():
# Trigger updates for each pool on this node # Trigger updates for each pool on this node
for pool in pool_list: for pool in pool_list:
try:
stats = json.dumps(pool_df[pool])
zkhandler.writedata(zk_conn, { zkhandler.writedata(zk_conn, {
'/ceph/pools/{}/stats'.format(pool): str(json.dumps(pool_df[pool])) '/ceph/pools/{}/stats'.format(pool): str(stats)
}) })
except KeyError:
# One or more of the status commands timed out, just continue
pass
# Only grab OSD stats if there are OSDs to grab (otherwise `ceph osd df` hangs) # Only grab OSD stats if there are OSDs to grab (otherwise `ceph osd df` hangs)
osds_this_node = 0 osds_this_node = 0
@ -990,7 +1000,12 @@ def update_zookeeper():
# Parse the dump data # Parse the dump data
osd_dump = dict() osd_dump = dict()
retcode, stdout, stderr = common.run_os_command('ceph osd dump --format json', timeout=1) retcode, stdout, stderr = common.run_os_command('ceph osd dump --format json', timeout=1)
try:
osd_dump_raw = json.loads(stdout)['osds'] osd_dump_raw = json.loads(stdout)['osds']
except json.decoder.JSONDecodeError:
logger.out('Failed to obtain OSD data', state='w')
osd_dump_raw = []
if debug: if debug:
print("Loop through OSD dump") print("Loop through OSD dump")
for osd in osd_dump_raw: for osd in osd_dump_raw:
@ -1012,6 +1027,7 @@ def update_zookeeper():
osd_df_raw = json.loads(stdout)['nodes'] osd_df_raw = json.loads(stdout)['nodes']
except: except:
logger.out('Failed to parse OSD list', state='w') logger.out('Failed to parse OSD list', state='w')
osd_df_raw = []
if debug: if debug:
print("Loop through OSD df") print("Loop through OSD df")
@ -1066,19 +1082,28 @@ def update_zookeeper():
print("Merge OSD data together") print("Merge OSD data together")
osd_stats = dict() osd_stats = dict()
for osd in osd_list: for osd in osd_list:
try:
this_dump = osd_dump[osd] this_dump = osd_dump[osd]
this_dump.update(osd_df[osd]) this_dump.update(osd_df[osd])
this_dump.update(osd_status[osd]) this_dump.update(osd_status[osd])
osd_stats[osd] = this_dump osd_stats[osd] = this_dump
except KeyError:
# One or more of the status commands timed out, just continue
pass
# Trigger updates for each OSD on this node # Trigger updates for each OSD on this node
if debug: if debug:
print("Trigger updates for each OSD on this node") print("Trigger updates for each OSD on this node")
for osd in osd_list: for osd in osd_list:
if d_osd[osd].node == myhostname: if d_osd[osd].node == myhostname:
try:
stats = json.dumps(osd_stats[osd])
zkhandler.writedata(zk_conn, { zkhandler.writedata(zk_conn, {
'/ceph/osds/{}/stats'.format(osd): str(json.dumps(osd_stats[osd])) '/ceph/osds/{}/stats'.format(osd): str(stats)
}) })
except KeyError:
# One or more of the status commands timed out, just continue
pass
osds_this_node += 1 osds_this_node += 1
memalloc = 0 memalloc = 0