diff --git a/daemon-common/faults.py b/daemon-common/faults.py
index b3b166bc..2aa6e31b 100644
--- a/daemon-common/faults.py
+++ b/daemon-common/faults.py
@@ -21,21 +21,28 @@
 
 from datetime import datetime
 from hashlib import md5
-from re import sub
 
 
 def generate_fault(
-    zkhandler, logger, fault_name, fault_time, fault_delta, fault_message
+    zkhandler,
+    logger,
+    fault_name,
+    fault_time,
+    fault_delta,
+    fault_message,
+    fault_details=None,
 ):
-    # Strip off any "extra" data from the message (things in brackets)
-    fault_core_message = sub(r"[\(\[].*?[\)\]]", "", fault_message).strip()
-    # Generate a fault ID from the fault_name, fault_delta, and fault_core_message
-    fault_str = f"{fault_name} {fault_delta} {fault_core_message}"
+    # Generate a fault ID from the fault_name, fault_delta, and fault_message
+    fault_str = f"{fault_name} {fault_delta} {fault_message}"
     fault_id = str(md5(fault_str.encode("utf-8")).hexdigest())[:8]
 
     # Strip the microseconds off of the fault time; we don't care about that precision
     fault_time = str(fault_time).split(".")[0]
 
+    # Append the details only after the ID is hashed, so differing details keep the same fault ID
+    if fault_details is not None:
+        fault_message = f"{fault_message}: {fault_details}"
+
     # If a fault already exists with this ID, just update the time
     if not zkhandler.exists("base.faults"):
         logger.out(
diff --git a/health-daemon/pvchealthd/objects/MonitoringInstance.py b/health-daemon/pvchealthd/objects/MonitoringInstance.py
index 5f26c889..f3dda18f 100644
--- a/health-daemon/pvchealthd/objects/MonitoringInstance.py
+++ b/health-daemon/pvchealthd/objects/MonitoringInstance.py
@@ -206,7 +206,7 @@ class MonitoringInstance(object):
                 {
                     "entry": node,
                     "check": self.zkhandler.read(("node.state.daemon", node)),
-                    "details": "",
+                    "details": None,
                 }
                 for node in self.zkhandler.children("base.node")
             ]
@@ -219,7 +219,7 @@ class MonitoringInstance(object):
                     "check": loads(self.zkhandler.read(("osd.stats", osd))).get(
                         "in", 0
                     ),
-                    "details": "",
+                    "details": None,
                 }
                 for osd in self.zkhandler.children("base.osd")
             ]
@@ -271,9 +271,9 @@ class MonitoringInstance(object):
                 op_str = "ok"
             overprovisioned_memory = [
                 {
-                    "entry": f"{current_memory_provisioned}MB > {available_node_memory}MB (N-1)",
+                    "entry": "Cluster memory was overprovisioned",
                     "check": op_str,
-                    "details": "",
+                    "details": f"{current_memory_provisioned}MB > {available_node_memory}MB (N-1)",
                 }
             ]
             return overprovisioned_memory
@@ -296,25 +296,25 @@ class MonitoringInstance(object):
                 "entries": get_ceph_health_entries,
                 "conditions": ["HEALTH_WARN"],
                 "delta": 10,
-                "message": "{entry} reported by Ceph ({details})",
+                "message": "{entry} reported by Ceph cluster",
             },
             "ceph_err": {
                 "entries": get_ceph_health_entries,
                 "conditions": ["HEALTH_ERR"],
                 "delta": 50,
-                "message": "{entry} reported by Ceph ({details})",
+                "message": "{entry} reported by Ceph cluster",
             },
             "vm_failed": {
                 "entries": get_vm_states,
                 "conditions": ["fail"],
                 "delta": 10,
-                "message": "VM {entry} was failed ({details})",
+                "message": "VM {entry} was failed",
             },
             "memory_overprovisioned": {
                 "entries": get_overprovisioned_memory,
                 "conditions": ["overprovisioned"],
                 "delta": 50,
-                "message": "Cluster memory was overprovisioned {entry}",
+                "message": "{entry}",
             },
         }
 
@@ -507,7 +507,7 @@ class MonitoringInstance(object):
             )
 
         for fault_type in self.cluster_faults_map.keys():
-            fault_details = self.cluster_faults_map[fault_type]
+            fault_data = self.cluster_faults_map[fault_type]
 
             if self.config["log_monitoring_details"] or self.config["debug"]:
                 self.logger.out(
@@ -515,7 +515,7 @@ class MonitoringInstance(object):
                     state="t",
                 )
 
-            entries = fault_details["entries"]()
+            entries = fault_data["entries"]()
 
             if self.config["debug"]:
                 self.logger.out(
@@ -527,13 +527,11 @@ class MonitoringInstance(object):
                 entry = _entry["entry"]
                 check = _entry["check"]
                 details = _entry["details"]
-                for condition in fault_details["conditions"]:
+                for condition in fault_data["conditions"]:
                     if str(condition) == str(check):
                         fault_time = datetime.now()
-                        fault_delta = fault_details["delta"]
-                        fault_message = fault_details["message"].format(
-                            entry=entry, details=details
-                        )
+                        fault_delta = fault_data["delta"]
+                        fault_message = fault_data["message"].format(entry=entry)
                         generate_fault(
                             self.zkhandler,
                             self.logger,
@@ -541,6 +539,7 @@ class MonitoringInstance(object):
                             fault_time,
                             fault_delta,
                             fault_message,
+                            fault_details=details,
                         )
                         self.faults += 1
 
@@ -607,6 +606,7 @@ class MonitoringInstance(object):
                     fault_time,
                     fault_delta,
                     fault_message,
+                    fault_details=None,
                 )
                 self.faults += 1
 