Skip OSD stat collection if the OSD count is zero

Otherwise, `ceph osd df` will hang indefinitely while trying to get data
when there are zero OSDs.
This commit is contained in:
Joshua Boniface 2019-06-18 12:36:53 -04:00
parent 5a327dc41a
commit 8d9007f697
1 changed file with 108 additions and 101 deletions

View File

@ -58,7 +58,7 @@ import pvcd.CephInstance as CephInstance
############################################################################### ###############################################################################
# PVCD - node daemon startup program # PVCD - node daemon startup program
############################################################################### ###############################################################################
# #
# The PVC daemon starts a node and configures all the required components for # The PVC daemon starts a node and configures all the required components for
# the node to run. It determines which of the 3 daemon modes it should be in # the node to run. It determines which of the 3 daemon modes it should be in
# during initial setup based on hostname and the config file, and then starts # during initial setup based on hostname and the config file, and then starts
@ -201,14 +201,14 @@ def readConfig(pvcd_config_file, myhostname):
address_key = '{}_dev_ip'.format(net) address_key = '{}_dev_ip'.format(net)
floating_key = '{}_floating_ip'.format(net) floating_key = '{}_floating_ip'.format(net)
network_key = '{}_network'.format(net) network_key = '{}_network'.format(net)
# Verify the network provided is valid # Verify the network provided is valid
try: try:
network = ipaddress.ip_network(config[network_key]) network = ipaddress.ip_network(config[network_key])
except Exception as e: except Exception as e:
print('ERROR: Network address {} for {} is not valid!'.format(config[network_key], network_key)) print('ERROR: Network address {} for {} is not valid!'.format(config[network_key], network_key))
exit(1) exit(1)
# If we should be autoselected # If we should be autoselected
if config[address_key] == 'by-id': if config[address_key] == 'by-id':
# Construct an IP from the relevant network # Construct an IP from the relevant network
@ -216,9 +216,9 @@ def readConfig(pvcd_config_file, myhostname):
address_id = int(mynodeid) - 1 address_id = int(mynodeid) - 1
# Grab the nth address from the network # Grab the nth address from the network
config[address_key] = '{}/{}'.format(list(network.hosts())[address_id], network.prefixlen) config[address_key] = '{}/{}'.format(list(network.hosts())[address_id], network.prefixlen)
# Verify that the floating IP is valid # Verify that the floating IP is valid
try: try:
# Set the ipaddr # Set the ipaddr
floating_addr = ipaddress.ip_address(config[floating_key].split('/')[0]) floating_addr = ipaddress.ip_address(config[floating_key].split('/')[0])
@ -228,7 +228,7 @@ def readConfig(pvcd_config_file, myhostname):
except Exception as e: except Exception as e:
print('ERROR: Floating address {} for {} is not valid!'.format(config[floating_key], floating_key)) print('ERROR: Floating address {} for {} is not valid!'.format(config[floating_key], floating_key))
exit(1) exit(1)
# Handle the storage config # Handle the storage config
if config['enable_storage']: if config['enable_storage']:
try: try:
@ -246,7 +246,7 @@ def readConfig(pvcd_config_file, myhostname):
# Get the config object from readConfig() # Get the config object from readConfig()
config = readConfig(pvcd_config_file, myhostname) config = readConfig(pvcd_config_file, myhostname)
# Handle the enable values # Handle the enable values
enable_hypervisor = config['enable_hypervisor'] enable_hypervisor = config['enable_hypervisor']
enable_networking = config['enable_networking'] enable_networking = config['enable_networking']
@ -357,19 +357,19 @@ if enable_networking:
# Enable routing functions # Enable routing functions
common.run_os_command('sysctl net.ipv4.ip_forward=1') common.run_os_command('sysctl net.ipv4.ip_forward=1')
common.run_os_command('sysctl net.ipv6.ip_forward=1') common.run_os_command('sysctl net.ipv6.ip_forward=1')
# Send redirects # Send redirects
common.run_os_command('sysctl net.ipv4.conf.all.send_redirects=1') common.run_os_command('sysctl net.ipv4.conf.all.send_redirects=1')
common.run_os_command('sysctl net.ipv4.conf.default.send_redirects=1') common.run_os_command('sysctl net.ipv4.conf.default.send_redirects=1')
common.run_os_command('sysctl net.ipv6.conf.all.send_redirects=1') common.run_os_command('sysctl net.ipv6.conf.all.send_redirects=1')
common.run_os_command('sysctl net.ipv6.conf.default.send_redirects=1') common.run_os_command('sysctl net.ipv6.conf.default.send_redirects=1')
# Accept source routes # Accept source routes
common.run_os_command('sysctl net.ipv4.conf.all.accept_source_route=1') common.run_os_command('sysctl net.ipv4.conf.all.accept_source_route=1')
common.run_os_command('sysctl net.ipv4.conf.default.accept_source_route=1') common.run_os_command('sysctl net.ipv4.conf.default.accept_source_route=1')
common.run_os_command('sysctl net.ipv6.conf.all.accept_source_route=1') common.run_os_command('sysctl net.ipv6.conf.all.accept_source_route=1')
common.run_os_command('sysctl net.ipv6.conf.default.accept_source_route=1') common.run_os_command('sysctl net.ipv6.conf.default.accept_source_route=1')
# Disable RP filtering on the VNI dev and bridge interfaces (to allow traffic pivoting) # Disable RP filtering on the VNI dev and bridge interfaces (to allow traffic pivoting)
common.run_os_command('sysctl net.ipv4.conf.{}.rp_filter=0'.format(config['vni_dev'])) common.run_os_command('sysctl net.ipv4.conf.{}.rp_filter=0'.format(config['vni_dev']))
common.run_os_command('sysctl net.ipv4.conf.{}.rp_filter=0'.format(config['upstream_dev'])) common.run_os_command('sysctl net.ipv4.conf.{}.rp_filter=0'.format(config['upstream_dev']))
@ -912,104 +912,111 @@ def update_zookeeper():
} }
}) })
# Trigger updates for each OSD on this node # Trigger updates for each pool on this node
for pool in pool_list: for pool in pool_list:
zkhandler.writedata(zk_conn, { zkhandler.writedata(zk_conn, {
'/ceph/pools/{}/stats'.format(pool): str(json.dumps(pool_df[pool])) '/ceph/pools/{}/stats'.format(pool): str(json.dumps(pool_df[pool]))
}) })
# Get data from Ceph OSDs # Only grab OSD stats if there are OSDs to grab (otherwise `ceph osd df` hangs)
if debug: osds_this_node = 0
print("Get data from Ceph OSDs") if len(osd_list) > 0:
# Parse the dump data # Get data from Ceph OSDs
osd_dump = dict() if debug:
retcode, stdout, stderr = common.run_os_command('ceph osd dump --format json') print("Get data from Ceph OSDs")
osd_dump_raw = json.loads(stdout)['osds'] # Parse the dump data
if debug: osd_dump = dict()
print("Loop through OSD dump") retcode, stdout, stderr = common.run_os_command('ceph osd dump --format json')
for osd in osd_dump_raw: osd_dump_raw = json.loads(stdout)['osds']
osd_dump.update({ if debug:
str(osd['osd']): { print("Loop through OSD dump")
'uuid': osd['uuid'], for osd in osd_dump_raw:
'up': osd['up'], osd_dump.update({
'in': osd['in'], str(osd['osd']): {
'primary_affinity': osd['primary_affinity'] 'uuid': osd['uuid'],
} 'up': osd['up'],
}) 'in': osd['in'],
# Parse the df data 'primary_affinity': osd['primary_affinity']
if debug:
print("Parse the OSD df data")
osd_df = dict()
retcode, stdout, stderr = common.run_os_command('ceph osd df --format json')
osd_df_raw = json.loads(stdout)['nodes']
if debug:
print("Loop through OSD df")
for osd in osd_df_raw:
osd_df.update({
str(osd['id']): {
'utilization': osd['utilization'],
'var': osd['var'],
'pgs': osd['pgs'],
'kb': osd['kb'],
'weight': osd['crush_weight'],
'reweight': osd['reweight'],
}
})
# Parse the status data
if debug:
print("Parse the OSD status data")
osd_status = dict()
retcode, stdout, stderr = common.run_os_command('ceph osd status')
if debug:
print("Loop through OSD status data")
for line in stderr.split('\n'):
# Strip off colour
line = re.sub(r'\x1b(\[.*?[@-~]|\].*?(\x07|\x1b\\))', '', line)
# Split it for parsing
line = line.split()
if len(line) > 1 and line[1].isdigit():
# This is an OSD line so parse it
osd_id = line[1]
node = line[3].split('.')[0]
used = line[5]
avail = line[7]
wr_ops = line[9]
wr_data = line[11]
rd_ops = line[13]
rd_data = line[15]
state = line[17]
osd_status.update({
str(osd_id): {
'node': node,
'used': used,
'avail': avail,
'wr_ops': wr_ops,
'wr_data': wr_data,
'rd_ops': rd_ops,
'rd_data': rd_data,
'state': state
} }
}) })
# Merge them together into a single meaningful dict
if debug:
print("Merge OSD data together")
osd_stats = dict()
for osd in osd_list:
this_dump = osd_dump[osd]
this_dump.update(osd_df[osd])
this_dump.update(osd_status[osd])
osd_stats[osd] = this_dump
# Trigger updates for each OSD on this node # Parse the df data
if debug: if debug:
print("Trigger updates for each OSD on this node") print("Parse the OSD df data")
osds_this_node = 0 osd_df = dict()
for osd in osd_list: retcode, stdout, stderr = common.run_os_command('ceph osd df --format json')
if d_osd[osd].node == myhostname: try:
zkhandler.writedata(zk_conn, { osd_df_raw = json.loads(stdout)['nodes']
'/ceph/osds/{}/stats'.format(osd): str(json.dumps(osd_stats[osd])) except:
logger.out('Failed to parse OSD list', state='w')
if debug:
print("Loop through OSD df")
for osd in osd_df_raw:
osd_df.update({
str(osd['id']): {
'utilization': osd['utilization'],
'var': osd['var'],
'pgs': osd['pgs'],
'kb': osd['kb'],
'weight': osd['crush_weight'],
'reweight': osd['reweight'],
}
}) })
osds_this_node += 1 # Parse the status data
if debug:
print("Parse the OSD status data")
osd_status = dict()
retcode, stdout, stderr = common.run_os_command('ceph osd status')
if debug:
print("Loop through OSD status data")
for line in stderr.split('\n'):
# Strip off colour
line = re.sub(r'\x1b(\[.*?[@-~]|\].*?(\x07|\x1b\\))', '', line)
# Split it for parsing
line = line.split()
if len(line) > 1 and line[1].isdigit():
# This is an OSD line so parse it
osd_id = line[1]
node = line[3].split('.')[0]
used = line[5]
avail = line[7]
wr_ops = line[9]
wr_data = line[11]
rd_ops = line[13]
rd_data = line[15]
state = line[17]
osd_status.update({
str(osd_id): {
'node': node,
'used': used,
'avail': avail,
'wr_ops': wr_ops,
'wr_data': wr_data,
'rd_ops': rd_ops,
'rd_data': rd_data,
'state': state
}
})
# Merge them together into a single meaningful dict
if debug:
print("Merge OSD data together")
osd_stats = dict()
for osd in osd_list:
this_dump = osd_dump[osd]
this_dump.update(osd_df[osd])
this_dump.update(osd_status[osd])
osd_stats[osd] = this_dump
# Trigger updates for each OSD on this node
if debug:
print("Trigger updates for each OSD on this node")
for osd in osd_list:
if d_osd[osd].node == myhostname:
zkhandler.writedata(zk_conn, {
'/ceph/osds/{}/stats'.format(osd): str(json.dumps(osd_stats[osd]))
})
osds_this_node += 1
memalloc = 0 memalloc = 0
vcpualloc = 0 vcpualloc = 0
@ -1030,7 +1037,7 @@ def update_zookeeper():
except Exception as e: except Exception as e:
# Toggle a state "change" # Toggle a state "change"
zkhandler.writedata(zk_conn, { '/domains/{}/state'.format(domain): instance.getstate() }) zkhandler.writedata(zk_conn, { '/domains/{}/state'.format(domain): instance.getstate() })
# Connect to libvirt # Connect to libvirt
if debug: if debug:
print("Connect to libvirt") print("Connect to libvirt")
@ -1039,7 +1046,7 @@ def update_zookeeper():
if lv_conn == None: if lv_conn == None:
logger.out('Failed to open connection to "{}"'.format(libvirt_name), state='e') logger.out('Failed to open connection to "{}"'.format(libvirt_name), state='e')
return return
# Ensure that any running VMs are readded to the domain_list # Ensure that any running VMs are readded to the domain_list
if debug: if debug:
print("Ensure that any running VMs are readded to the domain_list") print("Ensure that any running VMs are readded to the domain_list")