Modify cluster health to use new values

parent f4eef30770
commit 00586074cf
@@ -125,82 +125,42 @@ def format_info(cluster_information, oformat):
         return json.dumps(cluster_information, indent=4)
 
     # Plain formatting, i.e. human-readable
-    if cluster_information["health"] == "Optimal":
-        health_colour = ansiprint.green()
-    elif cluster_information["health"] == "Maintenance":
+    if cluster_information["maintenance"] == "True":
         health_colour = ansiprint.blue()
-    else:
+    elif cluster_information["health"] > 90:
+        health_colour = ansiprint.green()
+    elif cluster_information["health"] > 50:
         health_colour = ansiprint.yellow()
-
-    if cluster_information["storage_health"] == "Optimal":
-        storage_health_colour = ansiprint.green()
-    elif cluster_information["storage_health"] == "Maintenance":
-        storage_health_colour = ansiprint.blue()
     else:
-        storage_health_colour = ansiprint.yellow()
+        health_colour = ansiprint.red()
 
     ainformation = []
 
-    if oformat == "short":
-        ainformation.append(
-            "{}PVC cluster status:{}".format(ansiprint.bold(), ansiprint.end())
-        )
-        ainformation.append(
-            "{}Cluster health:{} {}{}{}".format(
-                ansiprint.purple(),
-                ansiprint.end(),
-                health_colour,
-                cluster_information["health"],
-                ansiprint.end(),
-            )
-        )
-        if cluster_information["health_msg"]:
-            for line in cluster_information["health_msg"]:
-                ainformation.append(" > {}".format(line))
-        ainformation.append(
-            "{}Storage health:{} {}{}{}".format(
-                ansiprint.purple(),
-                ansiprint.end(),
-                storage_health_colour,
-                cluster_information["storage_health"],
-                ansiprint.end(),
-            )
-        )
-        if cluster_information["storage_health_msg"]:
-            for line in cluster_information["storage_health_msg"]:
-                ainformation.append(" > {}".format(line))
-
-        return "\n".join(ainformation)
-
     ainformation.append(
         "{}PVC cluster status:{}".format(ansiprint.bold(), ansiprint.end())
     )
     ainformation.append("")
 
+    health_text = f"{cluster_information['health']}%"
+    if cluster_information["maintenance"] == "True":
+        health_text += " (maintenance on)"
+
     ainformation.append(
         "{}Cluster health:{} {}{}{}".format(
             ansiprint.purple(),
             ansiprint.end(),
             health_colour,
-            cluster_information["health"],
+            health_text,
             ansiprint.end(),
         )
     )
-    if cluster_information["health_msg"]:
-        for line in cluster_information["health_msg"]:
+    if cluster_information["health_messages"]:
+        for line in cluster_information["health_messages"]:
             ainformation.append(" > {}".format(line))
-    ainformation.append(
-        "{}Storage health:{} {}{}{}".format(
-            ansiprint.purple(),
-            ansiprint.end(),
-            storage_health_colour,
-            cluster_information["storage_health"],
-            ansiprint.end(),
-        )
-    )
-    if cluster_information["storage_health_msg"]:
-        for line in cluster_information["storage_health_msg"]:
-            ainformation.append(" > {}".format(line))
+
+    if oformat == "short":
+        return "\n".join(ainformation)
 
     ainformation.append("")
     ainformation.append(
         "{}Primary node:{} {}".format(
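The hunk above replaces the old Optimal/Maintenance/Degraded health strings with a numeric 0-100 percentage. For reference, a minimal standalone sketch of the new colour mapping, with the same branch order; the raw ANSI codes below are illustrative stand-ins for the real ansiprint helpers, not part of the commit:

def pick_health_colour(health, maintenance):
    # Stand-ins for ansiprint.blue()/green()/yellow()/red()
    BLUE, GREEN, YELLOW, RED = "\033[94m", "\033[92m", "\033[93m", "\033[91m"
    if maintenance == "True":
        return BLUE    # maintenance overrides the numeric score
    elif health > 90:
        return GREEN   # healthy: 91-100%
    elif health > 50:
        return YELLOW  # degraded: 51-90%
    else:
        return RED     # critical: 50% and below

assert pick_health_colour(75, "False") == "\033[93m"  # 51-90% renders yellow
assert pick_health_colour(75, "True") == "\033[94m"   # maintenance wins regardless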
@@ -19,7 +19,7 @@
 #
 ###############################################################################
 
-import re
+from json import loads
 
 import daemon_lib.common as common
 import daemon_lib.vm as pvc_vm
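The `re` import is dropped because storage health is no longer scraped out of the plain-text `ceph status` output; the daemon now reads a JSON health document from Zookeeper and parses it with `loads`. A small sketch of that parse, using a hypothetical document shaped like the one the new code reads (only the "status" and "checks" keys are actually consumed):

from json import loads

# Hypothetical example of the JSON stored at the Zookeeper key
# "base.storage.health"; the check names are made up for illustration.
raw = '{"status": "HEALTH_WARN", "checks": {"OSDMAP_FLAGS": {}, "PG_DEGRADED": {}}}'

ceph_health = loads(raw)
ceph_health_status = ceph_health["status"]          # HEALTH_OK / HEALTH_WARN / HEALTH_ERR
ceph_health_entries = ceph_health["checks"].keys()  # names of the active checks

print(ceph_health_status, ", ".join(ceph_health_entries))
# HEALTH_WARN OSDMAP_FLAGS, PG_DEGRADED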
@@ -44,13 +44,99 @@ def set_maintenance(zkhandler, maint_state):
         return True, "Successfully set cluster in normal mode"
 
 
+def getClusterHealth(zkhandler, node_list, vm_list, ceph_osd_list):
+    health_delta_map = {
+        'node_stopped': 50,
+        'node_flushed': 10,
+        'vm_stopped': 10,
+        'osd_out': 50,
+        'osd_down': 10,
+        'memory_overprovisioned': 50,
+        'ceph_err': 50,
+        'ceph_warn': 10,
+    }
+
+    # Generate total cluster health numbers
+    cluster_health = 100
+    messages = list()
+
+    for index, node in enumerate(node_list):
+        # Apply node health values to total health number
+        cluster_health -= 100 - node['health']
+        for entry in node['health_details']:
+            if entry['health_delta'] > 0:
+                messages.append(f"{node['name']}: plugin {entry['plugin_name']}: {entry['message']}")
+
+        # Handle unhealthy node states
+        if node['daemon_state'] not in ['run']:
+            cluster_health -= health_delta_map['node_stopped']
+            messages.append(f"cluster: {node['name']} in {node['daemon_state']} daemon state")
+        elif node['domain_state'] not in ['ready']:
+            cluster_health -= health_delta_map['node_flushed']
+            messages.append(f"cluster: {node['name']} in {node['domain_state']} domain state")
+
+    for index, vm in enumerate(vm_list):
+        # Handle unhealthy VM states
+        if vm['state'] not in ["start", "disable", "migrate", "unmigrate", "provision"]:
+            cluster_health -= health_delta_map['vm_stopped']
+            messages.append(f"cluster: {vm['name']} in {vm['state']} state")
+
+    for index, ceph_osd in enumerate(ceph_osd_list):
+        in_texts = {1: "in", 0: "out"}
+        up_texts = {1: "up", 0: "down"}
+
+        # Handle unhealthy OSD states
+        if in_texts[ceph_osd["stats"]["in"]] not in ["in"]:
+            cluster_health -= health_delta_map['osd_out']
+            messages.append(f"cluster: OSD {ceph_osd['id']} in {in_texts[ceph_osd['stats']['in']]} state")
+        elif up_texts[ceph_osd["stats"]["up"]] not in ['up']:
+            cluster_health -= health_delta_map['osd_down']
+            messages.append(f"cluster: OSD {ceph_osd['id']} in {up_texts[ceph_osd['stats']['up']]} state")
+
+    # Check for (n-1) overprovisioning
+    # Assume X nodes. If the total VM memory allocation (counting only running VMs) is greater than
+    # the total memory of the (n-1) smallest nodes, trigger this warning.
+    n_minus_1_total = 0
+    alloc_total = 0
+    node_largest_index = None
+    node_largest_count = 0
+    for index, node in enumerate(node_list):
+        node_mem_total = node["memory"]["total"]
+        node_mem_alloc = node["memory"]["allocated"]
+        alloc_total += node_mem_alloc
+        # Determine if this node is the largest seen so far
+        if node_mem_total > node_largest_count:
+            node_largest_index = index
+            node_largest_count = node_mem_total
+    n_minus_1_node_list = list()
+    for index, node in enumerate(node_list):
+        if index == node_largest_index:
+            continue
+        n_minus_1_node_list.append(node)
+    for index, node in enumerate(n_minus_1_node_list):
+        n_minus_1_total += node["memory"]["total"]
+    if alloc_total > n_minus_1_total:
+        cluster_health -= health_delta_map['memory_overprovisioned']
+        messages.append(f"cluster: Total VM memory is overprovisioned ({alloc_total} > {n_minus_1_total} n-1)")
+
+    # Check Ceph cluster health
+    ceph_health = loads(zkhandler.read("base.storage.health"))
+    ceph_health_status = ceph_health["status"]
+    ceph_health_entries = ceph_health["checks"].keys()
+
+    if ceph_health_status == 'HEALTH_ERR':
+        cluster_health -= health_delta_map['ceph_err']
+        messages.append(f"cluster: Ceph cluster in ERROR state: {', '.join(ceph_health_entries)}")
+    elif ceph_health_status == 'HEALTH_WARN':
+        cluster_health -= health_delta_map['ceph_warn']
+        messages.append(f"cluster: Ceph cluster in WARNING state: {', '.join(ceph_health_entries)}")
+
+    return cluster_health, messages
+
+
 def getClusterInformation(zkhandler):
     # Get cluster maintenance state
-    maint_state = zkhandler.read("base.config.maintenance")
-
-    # List of messages to display to the clients
-    cluster_health_msg = []
-    storage_health_msg = []
+    maintenance_state = zkhandler.read("base.config.maintenance")
 
     # Get node information object list
     retcode, node_list = pvc_node.get_list(zkhandler, None)
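A worked example (with hypothetical inputs) of how getClusterHealth composes its score: it starts at 100, subtracts each node's plugin-health shortfall, then subtracts a fixed delta from health_delta_map for every unhealthy state it finds:

# Hypothetical cluster: one node reporting 75% plugin health, one node
# flushed, one VM stopped, and Ceph in HEALTH_WARN.
health_delta_map = {
    "node_stopped": 50,
    "node_flushed": 10,
    "vm_stopped": 10,
    "osd_out": 50,
    "osd_down": 10,
    "memory_overprovisioned": 50,
    "ceph_err": 50,
    "ceph_warn": 10,
}

cluster_health = 100
cluster_health -= 100 - 75                          # node health shortfall: -25
cluster_health -= health_delta_map["node_flushed"]  # flushed domain state: -10
cluster_health -= health_delta_map["vm_stopped"]    # VM outside a known-good state: -10
cluster_health -= health_delta_map["ceph_warn"]     # Ceph HEALTH_WARN: -10

print(cluster_health)  # 45, which the CLI thresholds above render red

Note that, as written in this hunk, nothing clamps the result at zero, so a badly degraded cluster can report a negative health percentage.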
@@ -78,135 +164,6 @@ def getClusterInformation(zkhandler):
     ceph_volume_count = len(ceph_volume_list)
     ceph_snapshot_count = len(ceph_snapshot_list)
 
-    # Determinations for general cluster health
-    cluster_healthy_status = True
-    # Check for (n-1) overprovisioning
-    # Assume X nodes. If the total VM memory allocation (counting only running VMs) is greater than
-    # the total memory of the (n-1) smallest nodes, trigger this warning.
-    n_minus_1_total = 0
-    alloc_total = 0
-
-    node_largest_index = None
-    node_largest_count = 0
-    for index, node in enumerate(node_list):
-        node_mem_total = node["memory"]["total"]
-        node_mem_alloc = node["memory"]["allocated"]
-        alloc_total += node_mem_alloc
-
-        # Determine if this node is the largest seen so far
-        if node_mem_total > node_largest_count:
-            node_largest_index = index
-            node_largest_count = node_mem_total
-    n_minus_1_node_list = list()
-    for index, node in enumerate(node_list):
-        if index == node_largest_index:
-            continue
-        n_minus_1_node_list.append(node)
-    for index, node in enumerate(n_minus_1_node_list):
-        n_minus_1_total += node["memory"]["total"]
-    if alloc_total > n_minus_1_total:
-        cluster_healthy_status = False
-        cluster_health_msg.append(
-            "Total VM memory ({}) is overprovisioned (max {}) for (n-1) failure scenarios".format(
-                alloc_total, n_minus_1_total
-            )
-        )
-
-    # Determinations for node health
-    node_healthy_status = list(range(0, node_count))
-    node_report_status = list(range(0, node_count))
-    for index, node in enumerate(node_list):
-        daemon_state = node["daemon_state"]
-        domain_state = node["domain_state"]
-        if daemon_state != "run" and domain_state != "ready":
-            node_healthy_status[index] = False
-            cluster_health_msg.append(
-                "Node '{}' in {},{} state".format(
-                    node["name"], daemon_state, domain_state
-                )
-            )
-        else:
-            node_healthy_status[index] = True
-        node_report_status[index] = daemon_state + "," + domain_state
-
-    # Determinations for VM health
-    vm_healthy_status = list(range(0, vm_count))
-    vm_report_status = list(range(0, vm_count))
-    for index, vm in enumerate(vm_list):
-        vm_state = vm["state"]
-        if vm_state not in ["start", "disable", "migrate", "unmigrate", "provision"]:
-            vm_healthy_status[index] = False
-            cluster_health_msg.append(
-                "VM '{}' in {} state".format(vm["name"], vm_state)
-            )
-        else:
-            vm_healthy_status[index] = True
-        vm_report_status[index] = vm_state
-
-    # Determinations for OSD health
-    ceph_osd_healthy_status = list(range(0, ceph_osd_count))
-    ceph_osd_report_status = list(range(0, ceph_osd_count))
-    for index, ceph_osd in enumerate(ceph_osd_list):
-        try:
-            ceph_osd_up = ceph_osd["stats"]["up"]
-        except KeyError:
-            ceph_osd_up = 0
-
-        try:
-            ceph_osd_in = ceph_osd["stats"]["in"]
-        except KeyError:
-            ceph_osd_in = 0
-
-        up_texts = {1: "up", 0: "down"}
-        in_texts = {1: "in", 0: "out"}
-
-        if not ceph_osd_up or not ceph_osd_in:
-            ceph_osd_healthy_status[index] = False
-            cluster_health_msg.append(
-                "OSD {} in {},{} state".format(
-                    ceph_osd["id"], up_texts[ceph_osd_up], in_texts[ceph_osd_in]
-                )
-            )
-        else:
-            ceph_osd_healthy_status[index] = True
-        ceph_osd_report_status[index] = (
-            up_texts[ceph_osd_up] + "," + in_texts[ceph_osd_in]
-        )
-
-    # Find out the overall cluster health; if any element of a healthy_status is false, it's unhealthy
-    if maint_state == "true":
-        cluster_health = "Maintenance"
-    elif (
-        cluster_healthy_status is False
-        or False in node_healthy_status
-        or False in vm_healthy_status
-        or False in ceph_osd_healthy_status
-    ):
-        cluster_health = "Degraded"
-    else:
-        cluster_health = "Optimal"
-
-    # Find out our storage health from Ceph
-    ceph_status = zkhandler.read("base.storage").split("\n")
-    ceph_health = ceph_status[2].split()[-1]
-
-    # Parse the status output to get the health indicators
-    line_record = False
-    for index, line in enumerate(ceph_status):
-        if re.search("services:", line):
-            line_record = False
-        if line_record and len(line.strip()) > 0:
-            storage_health_msg.append(line.strip())
-        if re.search("health:", line):
-            line_record = True
-
-    if maint_state == "true":
-        storage_health = "Maintenance"
-    elif ceph_health != "HEALTH_OK":
-        storage_health = "Degraded"
-    else:
-        storage_health = "Optimal"
-
     # State lists
     node_state_combinations = [
         "run,ready",
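This hunk removes the old boolean health determinations wholesale; the (n-1) overprovisioning test survives, moved into getClusterHealth above. A worked example of that test with three hypothetical nodes (memory figures in MB, made up for illustration): drop the single largest node, then compare the total VM memory allocation against what the remaining nodes could hold.

node_list = [
    {"memory": {"total": 65536, "allocated": 30000}},
    {"memory": {"total": 131072, "allocated": 60000}},  # largest node; excluded
    {"memory": {"total": 65536, "allocated": 45000}},
]

largest = max(range(len(node_list)), key=lambda i: node_list[i]["memory"]["total"])
alloc_total = sum(n["memory"]["allocated"] for n in node_list)          # 135000
n_minus_1_total = sum(
    n["memory"]["total"] for i, n in enumerate(node_list) if i != largest
)                                                                        # 131072

# 135000 > 131072: the cluster could not absorb losing its largest node,
# so getClusterHealth subtracts 50 and appends a warning message.
print(alloc_total > n_minus_1_total)  # True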
@@ -237,13 +194,19 @@ def getClusterInformation(zkhandler):
         "unmigrate",
         "provision",
     ]
-    ceph_osd_state_combinations = ["up,in", "up,out", "down,in", "down,out"]
+    ceph_osd_state_combinations = [
+        "up,in",
+        "up,out",
+        "down,in",
+        "down,out",
+    ]
 
     # Format the Node states
     formatted_node_states = {"total": node_count}
     for state in node_state_combinations:
         state_count = 0
-        for node_state in node_report_status:
+        for node in node_list:
+            node_state = f"{node['daemon_state']},{node['domain_state']}"
             if node_state == state:
                 state_count += 1
         if state_count > 0:
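With the intermediate node_report_status list gone, the state string is now derived inline per node. A small sketch of the counting with hypothetical nodes; only "run,ready" appears verbatim in this diff, so the other combination entries below are assumed:

node_list = [
    {"daemon_state": "run", "domain_state": "ready"},
    {"daemon_state": "run", "domain_state": "ready"},
    {"daemon_state": "run", "domain_state": "flushed"},
]
node_state_combinations = ["run,ready", "run,flushed", "stop,flushed"]  # assumed subset

formatted_node_states = {"total": len(node_list)}
for state in node_state_combinations:
    state_count = 0
    for node in node_list:
        # Derive the "daemon,domain" state string on the fly per node
        node_state = f"{node['daemon_state']},{node['domain_state']}"
        if node_state == state:
            state_count += 1
    if state_count > 0:  # only states that actually occur are reported
        formatted_node_states[state] = state_count

print(formatted_node_states)  # {'total': 3, 'run,ready': 2, 'run,flushed': 1}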
@@ -253,28 +216,33 @@ def getClusterInformation(zkhandler):
     formatted_vm_states = {"total": vm_count}
     for state in vm_state_combinations:
         state_count = 0
-        for vm_state in vm_report_status:
-            if vm_state == state:
+        for vm in vm_list:
+            if vm["state"] == state:
                 state_count += 1
         if state_count > 0:
             formatted_vm_states[state] = state_count
 
     # Format the OSD states
+    up_texts = {1: "up", 0: "down"}
+    in_texts = {1: "in", 0: "out"}
     formatted_osd_states = {"total": ceph_osd_count}
     for state in ceph_osd_state_combinations:
         state_count = 0
-        for ceph_osd_state in ceph_osd_report_status:
+        for ceph_osd in ceph_osd_list:
+            ceph_osd_state = f"{up_texts[ceph_osd['stats']['up']]},{in_texts[ceph_osd['stats']['in']]}"
             if ceph_osd_state == state:
                 state_count += 1
         if state_count > 0:
             formatted_osd_states[state] = state_count
 
+    # Get cluster health data
+    cluster_health, cluster_health_messages = getClusterHealth(zkhandler, node_list, vm_list, ceph_osd_list)
+
     # Format the status data
     cluster_information = {
         "health": cluster_health,
-        "health_msg": cluster_health_msg,
-        "storage_health": storage_health,
-        "storage_health_msg": storage_health_msg,
+        "health_messages": cluster_health_messages,
+        "maintenance": maintenance_state,
         "primary_node": common.getPrimaryNode(zkhandler),
         "upstream_ip": zkhandler.read("base.config.upstream_ip"),
         "nodes": formatted_node_states,
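Taken together, the commit changes the shape of the returned cluster_information: "health" becomes a 0-100 integer instead of an Optimal/Maintenance/Degraded string, the storage_health and storage_health_msg fields are folded into the overall score and messages, and the maintenance flag is exposed directly. An illustrative example of the new shape; all values below are made up, not captured from a live cluster:

cluster_information = {
    "health": 90,
    "health_messages": [
        "cluster: node2 in flushed domain state",
    ],
    "maintenance": "false",
    "primary_node": "node1",      # fields from here down are unchanged by this commit
    "upstream_ip": "10.0.0.254",
    "nodes": {"total": 3, "run,ready": 2, "run,flushed": 1},
}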