Add cluster overprovision determination

Adds a check of (n-1) memory overprovisioning. (n-1) is considered to be the configuration that excludes the "largest" node. The cluster will report degraded when in this state.
2020-10-18 14:46:32 -04:00
parent c6e34c7dc6
commit e7ab1bfddd
1 changed files with 31 additions and 1 deletions
--- a/daemon-common/cluster.py
+++ b/daemon-common/cluster.py
@@ -79,6 +79,36 @@ def getClusterInformation(zk_conn):
    ceph_volume_count = len(ceph_volume_list)
    ceph_snapshot_count = len(ceph_snapshot_list)
    # Determinations for general cluster health
    cluster_healthy_status = True
    # Check for (n-1) overprovisioning
    #   Assume X nodes. If the total VM memory allocation (counting only running VMss) is greater than
    #   the total memory of the (n-1) smallest nodes, trigger this warning.
    n_minus_1_total = 0
    alloc_total = 0
    node_largest_index = None
    node_largest_count = 0
    for index, node in enumerate(node_list):
        node_mem_total = node['memory']['total']
        node_mem_alloc = node['memory']['allocated']
        alloc_total += node_mem_alloc
        # Determine if this node is the largest seen so far
        if node_mem_total > node_largest_count:
            node_largest_index = index
            node_largest_count = node_mem_total
    n_minus_1_node_list = list()
    for index, node in enumerate(node_list):
        if index == node_largest_index:
            continue
        n_minus_1_node_list.append(node)
    for index, node in enumerate(n_minus_1_node_list):
        n_minus_1_total += node['memory']['total']
    if alloc_total > n_minus_1_total:
        cluster_healthy_status = False
        cluster_health_msg.append("Total VM memory ({}) is overprovisioned (max {}) for (n-1) failure scenarios".format(alloc_total, n_minus_1_total))
    # Determinations for node health
    node_healthy_status = list(range(0, node_count))
    node_report_status = list(range(0, node_count))
@@ -131,7 +161,7 @@ def getClusterInformation(zk_conn):
    # Find out the overall cluster health; if any element of a healthy_status is false, it's unhealthy
    if maint_state == 'true':
        cluster_health = 'Maintenance'
-    elif False in node_healthy_status or False in vm_healthy_status or False in ceph_osd_healthy_status:
+    elif cluster_healthy_status is False or False in node_healthy_status or False in vm_healthy_status or False in ceph_osd_healthy_status:
        cluster_health = 'Degraded'
    else:
        cluster_health = 'Optimal'