From 15e78aa9f07e1d85b637c24e499d13b8d5685c4a Mon Sep 17 00:00:00 2001 From: "Joshua M. Boniface" Date: Fri, 14 Aug 2020 12:27:13 -0400 Subject: [PATCH] Add status information in cluster status Provide textual explanations for the degraded status, including specific node/VM/OSD issues as well as detailed Ceph health. "Single pane of glass" mentality. --- client-cli/cli_lib/cluster.py | 10 ++++++++++ daemon-common/cluster.py | 33 ++++++++++++++++++++++++++++----- 2 files changed, 38 insertions(+), 5 deletions(-) diff --git a/client-cli/cli_lib/cluster.py b/client-cli/cli_lib/cluster.py index f7f58657..54d4459d 100644 --- a/client-cli/cli_lib/cluster.py +++ b/client-cli/cli_lib/cluster.py @@ -103,7 +103,17 @@ def format_info(cluster_information, oformat): ainformation.append('{}PVC cluster status:{}'.format(ansiprint.bold(), ansiprint.end())) ainformation.append('') ainformation.append('{}Cluster health:{} {}{}{}'.format(ansiprint.purple(), ansiprint.end(), health_colour, cluster_information['health'], ansiprint.end())) + if cluster_information['health_msg']: + for line in cluster_information['health_msg']: + ainformation.append( + ' > {}'.format(line) + ) ainformation.append('{}Storage health:{} {}{}{}'.format(ansiprint.purple(), ansiprint.end(), storage_health_colour, cluster_information['storage_health'], ansiprint.end())) + if cluster_information['storage_health_msg']: + for line in cluster_information['storage_health_msg']: + ainformation.append( + ' > {}'.format(line) + ) ainformation.append('') ainformation.append('{}Primary node:{} {}'.format(ansiprint.purple(), ansiprint.end(), cluster_information['primary_node'])) ainformation.append('{}Cluster upstream IP:{} {}'.format(ansiprint.purple(), ansiprint.end(), cluster_information['upstream_ip'])) diff --git a/daemon-common/cluster.py b/daemon-common/cluster.py index df083880..709884bf 100644 --- a/daemon-common/cluster.py +++ b/daemon-common/cluster.py @@ -21,6 +21,7 @@ ############################################################################### import json +import re from distutils.util import strtobool @@ -50,6 +51,10 @@ def getClusterInformation(zk_conn): except: maint_state = 'false' + # List of messages to display to the clients + cluster_health_msg = [] + storage_health_msg = [] + # Get node information object list retcode, node_list = pvc_node.get_list(zk_conn, None) @@ -82,6 +87,7 @@ def getClusterInformation(zk_conn): domain_state = node['domain_state'] if daemon_state != 'run' and domain_state != 'ready': node_healthy_status[index] = False + cluster_health_msg.append("Node '{}' in {},{} state".format(node['name'], daemon_state, domain_state)) else: node_healthy_status[index] = True node_report_status[index] = daemon_state + ',' + domain_state @@ -93,6 +99,7 @@ def getClusterInformation(zk_conn): vm_state = vm['state'] if vm_state not in ['start', 'disable', 'migrate', 'unmigrate', 'provision']: vm_healthy_status[index] = False + cluster_health_msg.append("VM '{}' in {} state".format(vm['name'], vm_state)) else: vm_healthy_status[index] = True vm_report_status[index] = vm_state @@ -111,12 +118,14 @@ def getClusterInformation(zk_conn): except KeyError: ceph_osd_in = 0 - if not ceph_osd_up or not ceph_osd_in: - ceph_osd_healthy_status[index] = False - else: - ceph_osd_healthy_status[index] = True up_texts = { 1: 'up', 0: 'down' } in_texts = { 1: 'in', 0: 'out' } + + if not ceph_osd_up or not ceph_osd_in: + ceph_osd_healthy_status[index] = False + cluster_health_msg.append('OSD {} in {},{} state'.format(ceph_osd['id'], up_texts[ceph_osd_up], in_texts[ceph_osd_in])) + else: + ceph_osd_healthy_status[index] = True ceph_osd_report_status[index] = up_texts[ceph_osd_up] + ',' + in_texts[ceph_osd_in] # Find out the overall cluster health; if any element of a healthy_status is false, it's unhealthy @@ -128,7 +137,19 @@ def getClusterInformation(zk_conn): cluster_health = 'Optimal' # Find out our storage health from Ceph - ceph_health = zkhandler.readdata(zk_conn, '/ceph').split('\n')[2].split()[-1] + ceph_status = zkhandler.readdata(zk_conn, '/ceph').split('\n') + ceph_health = ceph_status[2].split()[-1] + + # Parse the status output to get the health indicators + line_record = False + for index, line in enumerate(ceph_status): + if re.search('services:', line): + line_record = False + if line_record and len(line.strip()) > 0: + storage_health_msg.append(line.strip()) + if re.search('health:', line): + line_record = True + if maint_state == 'true': storage_health = 'Maintenance' elif ceph_health != 'HEALTH_OK': @@ -183,7 +204,9 @@ def getClusterInformation(zk_conn): # Format the status data cluster_information = { 'health': cluster_health, + 'health_msg': cluster_health_msg, 'storage_health': storage_health, + 'storage_health_msg': storage_health_msg, 'primary_node': common.getPrimaryNode(zk_conn), 'upstream_ip': zkhandler.readdata(zk_conn, '/upstream_ip'), 'nodes': formatted_node_states,