From b9fbfe2ed5ebae0f3f862c554d2b503111558595 Mon Sep 17 00:00:00 2001
From: "Joshua M. Boniface"
Date: Sat, 9 Dec 2023 16:48:14 -0500
Subject: [PATCH] Improve fault ID format

Instead of using random hex characters from an md5sum, use a nice name
in all-caps similar to how Ceph does. This further helps prevent dupes
but also permits a changing health delta within a single event (which
would really only ever apply to plugin faults).
---
 daemon-common/faults.py                       | 39 +++++++++----------
 .../pvchealthd/objects/MonitoringInstance.py  | 17 +++++---
 2 files changed, 30 insertions(+), 26 deletions(-)

diff --git a/daemon-common/faults.py b/daemon-common/faults.py
index 2aa6e31b..792d638a 100644
--- a/daemon-common/faults.py
+++ b/daemon-common/faults.py
@@ -20,7 +20,6 @@
 ###############################################################################
 
 from datetime import datetime
-from hashlib import md5
 
 
 def generate_fault(
@@ -32,10 +31,6 @@ def generate_fault(
     fault_message,
     fault_details=None,
 ):
-    # Generate a fault ID from the fault_name, fault_delta, and fault_message
-    fault_str = f"{fault_name} {fault_delta} {fault_message}"
-    fault_id = str(md5(fault_str.encode("utf-8")).hexdigest())[:8]
-
     # Strip the microseconds off of the fault time; we don't care about that precision
     fault_time = str(fault_time).split(".")[0]
 
@@ -45,47 +40,49 @@
     # If a fault already exists with this ID, just update the time
     if not zkhandler.exists("base.faults"):
         logger.out(
-            f"Skipping fault reporting for {fault_id} due to missing Zookeeper schemas",
+            f"Skipping fault reporting for {fault_name} due to missing Zookeeper schemas",
             state="w",
         )
         return
 
     existing_faults = zkhandler.children("base.faults")
 
-    if fault_id in existing_faults:
+    if fault_name in existing_faults:
         logger.out(
-            f"Updating fault {fault_id}: {fault_message} @ {fault_time}", state="i"
+            f"Updating fault {fault_name}: {fault_message} @ {fault_time}", state="i"
         )
     else:
         logger.out(
-            f"Generating fault {fault_id}: {fault_message} @ {fault_time}",
+            f"Generating fault {fault_name}: {fault_message} @ {fault_time}",
             state="i",
         )
 
     if zkhandler.read("base.config.maintenance") == "true":
         logger.out(
-            f"Skipping fault reporting for {fault_id} due to maintenance mode",
+            f"Skipping fault reporting for {fault_name} due to maintenance mode",
            state="w",
         )
         return
 
-    if fault_id in existing_faults:
+    # Update an existing fault
+    if fault_name in existing_faults:
         zkhandler.write(
             [
-                (("faults.last_time", fault_id), fault_time),
-                (("faults.message", fault_id), fault_message),
+                (("faults.last_time", fault_name), fault_time),
+                (("faults.delta", fault_name), fault_delta),
+                (("faults.message", fault_name), fault_message),
             ]
         )
-    # Otherwise, generate a new fault event
+    # Generate a new fault
     else:
         zkhandler.write(
             [
-                (("faults.id", fault_id), ""),
-                (("faults.first_time", fault_id), fault_time),
-                (("faults.last_time", fault_id), fault_time),
-                (("faults.ack_time", fault_id), ""),
-                (("faults.status", fault_id), "new"),
-                (("faults.delta", fault_id), fault_delta),
-                (("faults.message", fault_id), fault_message),
+                (("faults.id", fault_name), ""),
+                (("faults.first_time", fault_name), fault_time),
+                (("faults.last_time", fault_name), fault_time),
+                (("faults.ack_time", fault_name), ""),
+                (("faults.status", fault_name), "new"),
+                (("faults.delta", fault_name), fault_delta),
+                (("faults.message", fault_name), fault_message),
             ]
         )
diff --git a/health-daemon/pvchealthd/objects/MonitoringInstance.py b/health-daemon/pvchealthd/objects/MonitoringInstance.py
index f3dda18f..7034620a 100644
--- a/health-daemon/pvchealthd/objects/MonitoringInstance.py
+++ b/health-daemon/pvchealthd/objects/MonitoringInstance.py
@@ -228,7 +228,7 @@ class MonitoringInstance(object):
         def get_ceph_health_entries():
             ceph_health_entries = [
                 {
-                    "entry": f"{value['severity']} {key}",
+                    "entry": key,
                     "check": value["severity"],
                     "details": value["summary"]["message"],
                 }
@@ -281,36 +281,42 @@ class MonitoringInstance(object):
         # This is a list of all possible faults (cluster error messages) and their corresponding details
         self.cluster_faults_map = {
             "dead_or_fenced_node": {
+                "name": "DEAD_NODE_{entry}",
                 "entries": get_node_daemon_states,
                 "conditions": ["dead", "fenced"],
                 "delta": 50,
                 "message": "Node {entry} was dead and/or fenced",
             },
             "ceph_osd_out": {
+                "name": "CEPH_OSD_OUT_{entry}",
                 "entries": get_osd_in_states,
                 "conditions": ["0"],
                 "delta": 50,
                 "message": "OSD {entry} was marked out",
             },
             "ceph_warn": {
+                "name": "CEPH_WARN_{entry}",
                 "entries": get_ceph_health_entries,
                 "conditions": ["HEALTH_WARN"],
                 "delta": 10,
                 "message": "{entry} reported by Ceph cluster",
             },
             "ceph_err": {
+                "name": "CEPH_ERR_{entry}",
                 "entries": get_ceph_health_entries,
                 "conditions": ["HEALTH_ERR"],
                 "delta": 50,
                 "message": "{entry} reported by Ceph cluster",
             },
             "vm_failed": {
+                "name": "VM_FAILED_{entry}",
                 "entries": get_vm_states,
                 "conditions": ["fail"],
                 "delta": 10,
                 "message": "VM {entry} was failed",
             },
             "memory_overprovisioned": {
+                "name": "MEMORY_OVERPROVISIONED",
                 "entries": get_overprovisioned_memory,
                 "conditions": ["overprovisioned"],
                 "delta": 50,
@@ -531,11 +537,12 @@ class MonitoringInstance(object):
                     if str(condition) == str(check):
                         fault_time = datetime.now()
                         fault_delta = fault_data["delta"]
+                        fault_name = fault_data["name"].format(entry=entry)
                         fault_message = fault_data["message"].format(entry=entry)
                         generate_fault(
                             self.zkhandler,
                             self.logger,
-                            fault_type,
+                            fault_name,
                             fault_time,
                             fault_delta,
                             fault_message,
@@ -587,7 +594,7 @@ class MonitoringInstance(object):
 
             # Generate a cluster fault if the plugin is in a suboptimal state
             if result.health_delta > 0:
-                fault_type = f"plugin.{self.this_node.name}.{result.plugin_name}"
+                fault_name = f"NODE_PLUGIN_{result.plugin_name.upper()}_{self.this_node.name.upper()}"
                 fault_time = datetime.now()
 
                 # Map our check results to fault results
@@ -602,11 +609,11 @@
                 generate_fault(
                     self.zkhandler,
                     self.logger,
-                    fault_type,
+                    fault_name,
                     fault_time,
                     fault_delta,
                     fault_message,
-                    fault_detail=None,
+                    fault_details=None,
                 )
 
                 self.faults += 1
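
Review note (not part of the patch): a minimal sketch of how the new templated
fault names replace the old md5-based IDs. The map entry mirrors "ceph_osd_out"
from the diff above; the entry value "osd.4" is a hypothetical example of what a
checker function might return.

    # Hypothetical example entry; mirrors the "ceph_osd_out" map entry above.
    fault_data = {
        "name": "CEPH_OSD_OUT_{entry}",
        "delta": 50,
        "message": "OSD {entry} was marked out",
    }
    entry = "osd.4"  # hypothetical value from a checker function

    fault_name = fault_data["name"].format(entry=entry)        # "CEPH_OSD_OUT_osd.4"
    fault_message = fault_data["message"].format(entry=entry)  # "OSD osd.4 was marked out"

    # The readable name, rather than an md5 hash of name+delta+message, is used
    # as the child key under base.faults, so a repeat of the same event updates
    # the existing fault (including its delta) instead of creating a duplicate.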