From 5d53a3e529b33a3a1842518115868bc46ea058cd Mon Sep 17 00:00:00 2001
From: "Joshua M. Boniface"
Date: Sun, 10 Dec 2023 17:24:21 -0500
Subject: [PATCH] Add state and faults detail to cluster information

We already parse this information out anyway, so might as well add it
to the API output JSON. This can be leveraged by the Prometheus
endpoint as well to avoid duplicate listings.
---
 daemon-common/cluster.py | 53 ++++++++++++++++++++++++++++++++++------
 1 file changed, 45 insertions(+), 8 deletions(-)

diff --git a/daemon-common/cluster.py b/daemon-common/cluster.py
index b2c2f5e8..c9812343 100644
--- a/daemon-common/cluster.py
+++ b/daemon-common/cluster.py
@@ -42,9 +42,7 @@ def set_maintenance(zkhandler, maint_state):
         return True, "Successfully set cluster in normal mode"
 
 
-def getClusterHealthFromFaults(zkhandler):
-    faults_list = faults.getAllFaults(zkhandler)
-
+def getClusterHealthFromFaults(zkhandler, faults_list):
     unacknowledged_faults = [fault for fault in faults_list if fault["status"] != "ack"]
 
     # Generate total cluster health numbers
@@ -277,13 +275,22 @@
         ]
     all_node_states = zkhandler.read_many(node_state_reads)
     # Parse out the Node states
+    node_data = list()
     formatted_node_states = {"total": node_count}
     for nidx, node in enumerate(node_list):
         # Split the large list of return values by the IDX of this node
         # Each node result is 2 fields long
         pos_start = nidx * 2
         pos_end = nidx * 2 + 2
-        node_state = ",".join(tuple(all_node_states[pos_start:pos_end]))
+        node_daemon_state, node_domain_state = tuple(all_node_states[pos_start:pos_end])
+        node_data.append(
+            {
+                "name": node,
+                "daemon_state": node_daemon_state,
+                "domain_state": node_domain_state,
+            }
+        )
+        node_state = f"{node_daemon_state},{node_domain_state}"
         # Add to the count for this node's state
         if node_state in common.node_state_combinations:
             if formatted_node_states.get(node_state) is not None:
@@ -298,15 +305,26 @@
     vm_state_reads = list()
     for vm in vm_list:
         vm_state_reads += [
+            ("domain", vm),
             ("domain.state", vm),
         ]
     all_vm_states = zkhandler.read_many(vm_state_reads)
     # Parse out the VM states
+    vm_data = list()
     formatted_vm_states = {"total": vm_count}
     for vidx, vm in enumerate(vm_list):
         # Split the large list of return values by the IDX of this VM
-        # Each VM result is 1 field long, so just use the IDX
-        vm_state = all_vm_states[vidx]
+        # Each VM result is 2 fields long
+        pos_start = vidx * 2
+        pos_end = vidx * 2 + 2
+        vm_name, vm_state = tuple(all_vm_states[pos_start:pos_end])
+        vm_data.append(
+            {
+                "uuid": vm,
+                "name": vm_name,
+                "state": vm_state,
+            }
+        )
         # Add to the count for this VM's state
         if vm_state in common.vm_state_combinations:
             if formatted_vm_states.get(vm_state) is not None:
@@ -324,6 +342,7 @@
         osd_stat_reads += [("osd.stats", osd)]
     all_osd_stats = zkhandler.read_many(osd_stat_reads)
     # Parse out the OSD states
+    osd_data = list()
     formatted_osd_states = {"total": ceph_osd_count}
     up_texts = {1: "up", 0: "down"}
     in_texts = {1: "in", 0: "out"}
@@ -334,7 +353,16 @@
         # We have to load this JSON object and get our up/in states from it
         osd_stats = loads(_osd_stats)
         # Get our states
-        osd_state = f"{up_texts[osd_stats['up']]},{in_texts[osd_stats['in']]}"
+        osd_up = up_texts[osd_stats['up']]
+        osd_in = in_texts[osd_stats['in']]
+        osd_data.append(
+            {
+                "id": osd,
+                "up": osd_up,
+                "in": osd_in,
+            }
+        )
+        osd_state = f"{osd_up},{osd_in}"
         # Add to the count for this OSD's state
         if osd_state in common.ceph_osd_state_combinations:
             if formatted_osd_states.get(osd_state) is not None:
@@ -358,9 +386,12 @@
     ceph_snapshot_list = zkhandler.children("base.snapshot")
     ceph_snapshot_count = len(ceph_snapshot_list)
 
+    # Get the list of faults
+    faults_data = faults.getAllFaults(zkhandler)
+
     # Format the status data
     cluster_information = {
-        "cluster_health": getClusterHealthFromFaults(zkhandler),
+        "cluster_health": getClusterHealthFromFaults(zkhandler, faults_data),
         "node_health": getNodeHealth(zkhandler, node_list),
         "maintenance": maintenance_state,
         "primary_node": primary_node,
@@ -373,6 +404,12 @@
         "pools": ceph_pool_count,
         "volumes": ceph_volume_count,
         "snapshots": ceph_snapshot_count,
+        "detail": {
+            "node": node_data,
+            "vm": vm_data,
+            "osd": osd_data,
+            "faults": faults_data,
+        }
     }
 
     return cluster_information
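
With this patch applied, the cluster information JSON gains a "detail"
key alongside the existing counts. A rough sketch of its shape, with
illustrative placeholder values (the faults entries keep whatever
structure faults.getAllFaults() returns):

    "detail": {
        "node": [
            {"name": "hv1", "daemon_state": "run", "domain_state": "ready"},
        ],
        "vm": [
            {"uuid": "<domain UUID>", "name": "testvm", "state": "start"},
        ],
        "osd": [
            {"id": "0", "up": "up", "in": "in"},
        ],
        "faults": [<entries from faults.getAllFaults()>],
    }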
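
And a minimal sketch of how the Prometheus endpoint mentioned above
could consume this block instead of re-reading the same state from
Zookeeper. The exporter helper and metric names here are hypothetical
illustrations, not part of this patch or of PVC's actual Prometheus
schema:

    # Hypothetical helper, not part of this patch: walks the new "detail"
    # block from getClusterInformation() (patched above) and emits one
    # labelled gauge per node/VM/OSD in Prometheus text exposition format.
    def format_detail_metrics(zkhandler):
        detail = getClusterInformation(zkhandler)["detail"]
        lines = []
        for node in detail["node"]:
            labels = (
                f'node="{node["name"]}",'
                f'daemon_state="{node["daemon_state"]}",'
                f'domain_state="{node["domain_state"]}"'
            )
            lines.append(f"pvc_node_state{{{labels}}} 1")
        for vm in detail["vm"]:
            lines.append(f'pvc_vm_state{{vm="{vm["name"]}",state="{vm["state"]}"}} 1')
        for osd in detail["osd"]:
            lines.append(f'pvc_osd_state{{osd="{osd["id"]}",up="{osd["up"]}",in="{osd["in"]}"}} 1')
        # One summary gauge for the fault count, since fault entries are free-form
        lines.append(f'pvc_faults_total {len(detail["faults"])}')
        return "\n".join(lines) + "\n"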