Add cluster overprovision determination

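Adds a check for (n-1) memory overprovisioning. (n-1) is considered to be the configuration that excludes the "largest" node (the node with the most total memory). The cluster will report as degraded when the total allocated VM memory exceeds the total memory of the remaining nodes.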
2020-10-18 14:46:32 -04:00
parent c6e34c7dc6
commit e7ab1bfddd
1 changed files with 31 additions and 1 deletions
--- a/daemon-common/cluster.py
+++ b/daemon-common/cluster.py
@@ -79,6 +79,36 @@ def getClusterInformation(zk_conn):
    ceph_volume_count = len(ceph_volume_list)
    ceph_snapshot_count = len(ceph_snapshot_list)

+    # Determinations for general cluster health
+    cluster_healthy_status = True
+    # Check for (n-1) overprovisioning
+    #   Assume X nodes. If the total VM memory allocation (counting only running VMs) is greater than
+    #   the total memory of the (n-1) smallest nodes, trigger this warning.
+    n_minus_1_total = 0
+    alloc_total = 0
+
+    node_largest_index = None
+    node_largest_count = 0
+    for index, node in enumerate(node_list):
+        node_mem_total = node['memory']['total']
+        node_mem_alloc = node['memory']['allocated']
+        alloc_total += node_mem_alloc
+
+        # Determine if this node is the largest seen so far
+        if node_mem_total > node_largest_count:
+            node_largest_index = index
+            node_largest_count = node_mem_total
+    n_minus_1_node_list = list()
+    for index, node in enumerate(node_list):
+        if index == node_largest_index:
+            continue
+        n_minus_1_node_list.append(node)
+    for index, node in enumerate(n_minus_1_node_list):
+        n_minus_1_total += node['memory']['total']
+    if alloc_total > n_minus_1_total:
+        cluster_healthy_status = False
+        cluster_health_msg.append("Total VM memory ({}) is overprovisioned (max {}) for (n-1) failure scenarios".format(alloc_total, n_minus_1_total))
+
    # Determinations for node health
    node_healthy_status = list(range(0, node_count))
    node_report_status = list(range(0, node_count))
@@ -131,7 +161,7 @@ def getClusterInformation(zk_conn):
    # Find out the overall cluster health; if any element of a healthy_status is false, it's unhealthy
    if maint_state == 'true':
        cluster_health = 'Maintenance'
-    elif False in node_healthy_status or False in vm_healthy_status or False in ceph_osd_healthy_status:
+    elif cluster_healthy_status is False or False in node_healthy_status or False in vm_healthy_status or False in ceph_osd_healthy_status:
        cluster_health = 'Degraded'
    else:
        cluster_health = 'Optimal'