Add cluster overprovision determination

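Adds a check for (n-1) memory overprovisioning. The (n-1) configuration is
the cluster with its "largest" node (the one with the most total memory)
excluded. The cluster will report as degraded when the total memory allocated
to VMs exceeds the total memory of that (n-1) configuration.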
This commit is contained in:
Joshua Boniface 2020-10-18 14:46:32 -04:00
parent c6e34c7dc6
commit e7ab1bfddd
1 changed files with 31 additions and 1 deletions

View File

@ -79,6 +79,36 @@ def getClusterInformation(zk_conn):
ceph_volume_count = len(ceph_volume_list) ceph_volume_count = len(ceph_volume_list)
ceph_snapshot_count = len(ceph_snapshot_list) ceph_snapshot_count = len(ceph_snapshot_list)
# Determinations for general cluster health
cluster_healthy_status = True
# Check for (n-1) overprovisioning
# Assume X nodes. If the total VM memory allocation (counting only running VMs) is greater than
# the total memory of the (n-1) smallest nodes, trigger this warning.
n_minus_1_total = 0
alloc_total = 0
node_largest_index = None
node_largest_count = 0
for index, node in enumerate(node_list):
node_mem_total = node['memory']['total']
node_mem_alloc = node['memory']['allocated']
alloc_total += node_mem_alloc
# Determine if this node is the largest seen so far
if node_mem_total > node_largest_count:
node_largest_index = index
node_largest_count = node_mem_total
n_minus_1_node_list = list()
for index, node in enumerate(node_list):
if index == node_largest_index:
continue
n_minus_1_node_list.append(node)
for index, node in enumerate(n_minus_1_node_list):
n_minus_1_total += node['memory']['total']
if alloc_total > n_minus_1_total:
cluster_healthy_status = False
cluster_health_msg.append("Total VM memory ({}) is overprovisioned (max {}) for (n-1) failure scenarios".format(alloc_total, n_minus_1_total))
# Determinations for node health # Determinations for node health
node_healthy_status = list(range(0, node_count)) node_healthy_status = list(range(0, node_count))
node_report_status = list(range(0, node_count)) node_report_status = list(range(0, node_count))
@ -131,7 +161,7 @@ def getClusterInformation(zk_conn):
# Find out the overall cluster health; if any element of a healthy_status is false, it's unhealthy # Find out the overall cluster health; if any element of a healthy_status is false, it's unhealthy
if maint_state == 'true': if maint_state == 'true':
cluster_health = 'Maintenance' cluster_health = 'Maintenance'
elif False in node_healthy_status or False in vm_healthy_status or False in ceph_osd_healthy_status: elif cluster_healthy_status is False or False in node_healthy_status or False in vm_healthy_status or False in ceph_osd_healthy_status:
cluster_health = 'Degraded' cluster_health = 'Degraded'
else: else:
cluster_health = 'Optimal' cluster_health = 'Optimal'