From b9fbfe2ed5ebae0f3f862c554d2b503111558595 Mon Sep 17 00:00:00 2001
From: "Joshua M. Boniface"
Date: Sat, 9 Dec 2023 16:48:14 -0500
Subject: [PATCH] Improve fault ID format

Instead of using random hex characters from an md5sum, use a nice name
in all-caps similar to how Ceph does. This further helps prevent dupes
but also permits a changing health delta within a single event (which
would really only ever apply to plugin faults).
---
 daemon-common/faults.py                       | 39 +++++++++----------
 .../pvchealthd/objects/MonitoringInstance.py  | 17 +++++---
 2 files changed, 30 insertions(+), 26 deletions(-)

diff --git a/daemon-common/faults.py b/daemon-common/faults.py
index 2aa6e31b..792d638a 100644
--- a/daemon-common/faults.py
+++ b/daemon-common/faults.py
@@ -20,7 +20,6 @@
 ###############################################################################
 
 from datetime import datetime
-from hashlib import md5
 
 
 def generate_fault(
@@ -32,10 +31,6 @@ def generate_fault(
     fault_message,
     fault_details=None,
 ):
-    # Generate a fault ID from the fault_name, fault_delta, and fault_message
-    fault_str = f"{fault_name} {fault_delta} {fault_message}"
-    fault_id = str(md5(fault_str.encode("utf-8")).hexdigest())[:8]
-
     # Strip the microseconds off of the fault time; we don't care about that precision
     fault_time = str(fault_time).split(".")[0]
 
@@ -45,47 +40,49 @@
     # If a fault already exists with this ID, just update the time
     if not zkhandler.exists("base.faults"):
         logger.out(
-            f"Skipping fault reporting for {fault_id} due to missing Zookeeper schemas",
+            f"Skipping fault reporting for {fault_name} due to missing Zookeeper schemas",
             state="w",
         )
         return
 
     existing_faults = zkhandler.children("base.faults")
 
-    if fault_id in existing_faults:
+    if fault_name in existing_faults:
         logger.out(
-            f"Updating fault {fault_id}: {fault_message} @ {fault_time}", state="i"
+            f"Updating fault {fault_name}: {fault_message} @ {fault_time}", state="i"
         )
     else:
         logger.out(
-            f"Generating fault {fault_id}: {fault_message} @ {fault_time}",
+            f"Generating fault {fault_name}: {fault_message} @ {fault_time}",
             state="i",
         )
 
     if zkhandler.read("base.config.maintenance") == "true":
         logger.out(
-            f"Skipping fault reporting for {fault_id} due to maintenance mode",
+            f"Skipping fault reporting for {fault_name} due to maintenance mode",
            state="w",
         )
         return
 
-    if fault_id in existing_faults:
+    # Update an existing fault
+    if fault_name in existing_faults:
         zkhandler.write(
             [
-                (("faults.last_time", fault_id), fault_time),
-                (("faults.message", fault_id), fault_message),
+                (("faults.last_time", fault_name), fault_time),
+                (("faults.delta", fault_name), fault_delta),
+                (("faults.message", fault_name), fault_message),
             ]
         )
-    # Otherwise, generate a new fault event
+    # Generate a new fault
     else:
         zkhandler.write(
             [
-                (("faults.id", fault_id), ""),
-                (("faults.first_time", fault_id), fault_time),
-                (("faults.last_time", fault_id), fault_time),
-                (("faults.ack_time", fault_id), ""),
-                (("faults.status", fault_id), "new"),
-                (("faults.delta", fault_id), fault_delta),
-                (("faults.message", fault_id), fault_message),
+                (("faults.id", fault_name), ""),
+                (("faults.first_time", fault_name), fault_time),
+                (("faults.last_time", fault_name), fault_time),
+                (("faults.ack_time", fault_name), ""),
+                (("faults.status", fault_name), "new"),
+                (("faults.delta", fault_name), fault_delta),
+                (("faults.message", fault_name), fault_message),
             ]
         )
diff --git a/health-daemon/pvchealthd/objects/MonitoringInstance.py b/health-daemon/pvchealthd/objects/MonitoringInstance.py
index f3dda18f..7034620a 100644
--- a/health-daemon/pvchealthd/objects/MonitoringInstance.py
+++ b/health-daemon/pvchealthd/objects/MonitoringInstance.py
@@ -228,7 +228,7 @@ class MonitoringInstance(object):
         def get_ceph_health_entries():
             ceph_health_entries = [
                 {
-                    "entry": f"{value['severity']} {key}",
+                    "entry": key,
                     "check": value["severity"],
                     "details": value["summary"]["message"],
                 }
@@ -281,36 +281,42 @@ class MonitoringInstance(object):
         # This is a list of all possible faults (cluster error messages) and their corresponding details
         self.cluster_faults_map = {
             "dead_or_fenced_node": {
+                "name": "DEAD_NODE_{entry}",
                 "entries": get_node_daemon_states,
                 "conditions": ["dead", "fenced"],
                 "delta": 50,
                 "message": "Node {entry} was dead and/or fenced",
             },
             "ceph_osd_out": {
+                "name": "CEPH_OSD_OUT_{entry}",
                 "entries": get_osd_in_states,
                 "conditions": ["0"],
                 "delta": 50,
                 "message": "OSD {entry} was marked out",
             },
             "ceph_warn": {
+                "name": "CEPH_WARN_{entry}",
                 "entries": get_ceph_health_entries,
                 "conditions": ["HEALTH_WARN"],
                 "delta": 10,
                 "message": "{entry} reported by Ceph cluster",
             },
             "ceph_err": {
+                "name": "CEPH_ERR_{entry}",
                 "entries": get_ceph_health_entries,
                 "conditions": ["HEALTH_ERR"],
                 "delta": 50,
                 "message": "{entry} reported by Ceph cluster",
             },
             "vm_failed": {
+                "name": "VM_FAILED_{entry}",
                 "entries": get_vm_states,
                 "conditions": ["fail"],
                 "delta": 10,
                 "message": "VM {entry} was failed",
             },
             "memory_overprovisioned": {
+                "name": "MEMORY_OVERPROVISIONED",
                 "entries": get_overprovisioned_memory,
                 "conditions": ["overprovisioned"],
                 "delta": 50,
@@ -531,11 +537,12 @@ class MonitoringInstance(object):
                     if str(condition) == str(check):
                         fault_time = datetime.now()
                         fault_delta = fault_data["delta"]
+                        fault_name = fault_data["name"].format(entry=entry)
                         fault_message = fault_data["message"].format(entry=entry)
                         generate_fault(
                             self.zkhandler,
                             self.logger,
-                            fault_type,
+                            fault_name,
                             fault_time,
                             fault_delta,
                             fault_message,
@@ -587,7 +594,7 @@ class MonitoringInstance(object):
 
             # Generate a cluster fault if the plugin is in a suboptimal state
             if result.health_delta > 0:
-                fault_type = f"plugin.{self.this_node.name}.{result.plugin_name}"
+                fault_name = f"NODE_PLUGIN_{result.plugin_name.upper()}_{self.this_node.name.upper()}"
                 fault_time = datetime.now()
 
                 # Map our check results to fault results
@@ -602,11 +609,11 @@
                 generate_fault(
                     self.zkhandler,
                     self.logger,
-                    fault_type,
+                    fault_name,
                     fault_time,
                     fault_delta,
                     fault_message,
-                    fault_detail=None,
+                    fault_details=None,
                 )
 
                 self.faults += 1
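
Review note (not part of the patch): a minimal sketch of how the new templated
fault names replace the old md5-based IDs. The map entry mirrors "ceph_osd_out"
from the diff above; the entry value "osd.4" is a hypothetical example of what a
checker function might return.

    # Hypothetical example entry; mirrors the "ceph_osd_out" map entry above.
    fault_data = {
        "name": "CEPH_OSD_OUT_{entry}",
        "delta": 50,
        "message": "OSD {entry} was marked out",
    }
    entry = "osd.4"  # hypothetical value from a checker function

    fault_name = fault_data["name"].format(entry=entry)        # "CEPH_OSD_OUT_osd.4"
    fault_message = fault_data["message"].format(entry=entry)  # "OSD osd.4 was marked out"

    # The readable name, rather than an md5 hash of name+delta+message, is used
    # as the child key under base.faults, so a repeat of the same event updates
    # the existing fault (including its delta) instead of creating a duplicate.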