Improve fault detail handling further

Since we already had a "details" field, simply move where it gets appended
to the message: do it later, inside generate_fault, after the main message
value has been used to generate the fault ID. This keeps variable details
out of the ID hash while still showing them in the stored message.
Joshua Boniface 2023-12-09 16:13:36 -05:00
parent 4ca2381077
commit 7e6d922877
2 changed files with 27 additions and 21 deletions
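
The net effect in generate_fault is a reordering: the fault ID is computed
from the bare message first, and the details are appended to the message only
afterwards. A minimal sketch of that ordering (fault_id_and_message is a
hypothetical standalone helper; the real function also takes the zkhandler
and logger and writes the fault into Zookeeper):

    from hashlib import md5

    def fault_id_and_message(fault_name, fault_delta, fault_message, fault_details=None):
        # The ID is derived from the bare message, before any details are attached...
        fault_str = f"{fault_name} {fault_delta} {fault_message}"
        fault_id = md5(fault_str.encode("utf-8")).hexdigest()[:8]

        # ...so variable details change only the stored text, never the identity
        if fault_details is not None:
            fault_message = f"{fault_message}: {fault_details}"

        return fault_id, fault_message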


@@ -21,21 +21,27 @@
 from datetime import datetime
 from hashlib import md5
-from re import sub
 
 
 def generate_fault(
-    zkhandler, logger, fault_name, fault_time, fault_delta, fault_message
+    zkhandler,
+    logger,
+    fault_name,
+    fault_time,
+    fault_delta,
+    fault_message,
+    fault_details=None,
 ):
-    # Strip off any "extra" data from the message (things in brackets)
-    fault_core_message = sub(r"[\(\[].*?[\)\]]", "", fault_message).strip()
-    # Generate a fault ID from the fault_name, fault_delta, and fault_core_message
-    fault_str = f"{fault_name} {fault_delta} {fault_core_message}"
+    # Generate a fault ID from the fault_name, fault_delta, and fault_message
+    fault_str = f"{fault_name} {fault_delta} {fault_message}"
     fault_id = str(md5(fault_str.encode("utf-8")).hexdigest())[:8]
 
     # Strip the microseconds off of the fault time; we don't care about that precision
     fault_time = str(fault_time).split(".")[0]
 
+    if fault_details is not None:
+        fault_message = f"{fault_message}: {fault_details}"
+
     # If a fault already exists with this ID, just update the time
     if not zkhandler.exists("base.faults"):
         logger.out(
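
One consequence worth noting: the old code had to regex-strip bracketed text
out of the message so that two reports of the same fault with different
details would hash to the same ID; with details kept out of the message
entirely, that stability now holds by construction. A rough illustration
(the message strings here are invented for the example):

    from hashlib import md5
    from re import sub

    def fault_id(name, delta, message):
        return md5(f"{name} {delta} {message}".encode("utf-8")).hexdigest()[:8]

    # Old approach: details lived inside the message, so brackets had to be
    # stripped before hashing to keep the ID stable across occurrences
    old_a = sub(r"[\(\[].*?[\)\]]", "", "HEALTH_WARN reported by Ceph (1 osds down)").strip()
    old_b = sub(r"[\(\[].*?[\)\]]", "", "HEALTH_WARN reported by Ceph (3 osds down)").strip()
    assert fault_id("ceph_warn", 10, old_a) == fault_id("ceph_warn", 10, old_b)

    # New approach: the hashed message never contains the details; they are
    # appended for display only after the ID already exists
    new_message = "HEALTH_WARN reported by Ceph cluster"
    stored_message = f"{new_message}: 1 osds down"

The remaining hunks, in the second changed file, update the checks in
MonitoringInstance to match: entries now report their details separately
instead of interpolating them into the message.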


@@ -206,7 +206,7 @@ class MonitoringInstance(object):
             {
                 "entry": node,
                 "check": self.zkhandler.read(("node.state.daemon", node)),
-                "details": "",
+                "details": None,
             }
             for node in self.zkhandler.children("base.node")
         ]
@@ -219,7 +219,7 @@ class MonitoringInstance(object):
                 "check": loads(self.zkhandler.read(("osd.stats", osd))).get(
                     "in", 0
                 ),
-                "details": "",
+                "details": None,
             }
             for osd in self.zkhandler.children("base.osd")
         ]
@@ -271,9 +271,9 @@ class MonitoringInstance(object):
             op_str = "ok"
         overprovisioned_memory = [
             {
-                "entry": f"{current_memory_provisioned}MB > {available_node_memory}MB (N-1)",
+                "entry": "Cluster memory was overprovisioned",
                 "check": op_str,
-                "details": "",
+                "details": f"{current_memory_provisioned}MB > {available_node_memory}MB (N-1)",
             }
         ]
         return overprovisioned_memory
@ -296,25 +296,25 @@ class MonitoringInstance(object):
"entries": get_ceph_health_entries, "entries": get_ceph_health_entries,
"conditions": ["HEALTH_WARN"], "conditions": ["HEALTH_WARN"],
"delta": 10, "delta": 10,
"message": "{entry} reported by Ceph ({details})", "message": "{entry} reported by Ceph cluster",
}, },
"ceph_err": { "ceph_err": {
"entries": get_ceph_health_entries, "entries": get_ceph_health_entries,
"conditions": ["HEALTH_ERR"], "conditions": ["HEALTH_ERR"],
"delta": 50, "delta": 50,
"message": "{entry} reported by Ceph ({details})", "message": "{entry} reported by Ceph cluster",
}, },
"vm_failed": { "vm_failed": {
"entries": get_vm_states, "entries": get_vm_states,
"conditions": ["fail"], "conditions": ["fail"],
"delta": 10, "delta": 10,
"message": "VM {entry} was failed ({details})", "message": "VM {entry} was failed",
}, },
"memory_overprovisioned": { "memory_overprovisioned": {
"entries": get_overprovisioned_memory, "entries": get_overprovisioned_memory,
"conditions": ["overprovisioned"], "conditions": ["overprovisioned"],
"delta": 50, "delta": 50,
"message": "Cluster memory was overprovisioned {entry}", "message": "{entry}",
}, },
} }
@@ -507,7 +507,7 @@ class MonitoringInstance(object):
         )
 
         for fault_type in self.cluster_faults_map.keys():
-            fault_details = self.cluster_faults_map[fault_type]
+            fault_data = self.cluster_faults_map[fault_type]
 
             if self.config["log_monitoring_details"] or self.config["debug"]:
                 self.logger.out(
@ -515,7 +515,7 @@ class MonitoringInstance(object):
state="t", state="t",
) )
entries = fault_details["entries"]() entries = fault_data["entries"]()
if self.config["debug"]: if self.config["debug"]:
self.logger.out( self.logger.out(
@@ -527,13 +527,11 @@ class MonitoringInstance(object):
                 entry = _entry["entry"]
                 check = _entry["check"]
                 details = _entry["details"]
-                for condition in fault_details["conditions"]:
+                for condition in fault_data["conditions"]:
                     if str(condition) == str(check):
                         fault_time = datetime.now()
-                        fault_delta = fault_details["delta"]
-                        fault_message = fault_details["message"].format(
-                            entry=entry, details=details
-                        )
+                        fault_delta = fault_data["delta"]
+                        fault_message = fault_data["message"].format(entry=entry)
                         generate_fault(
                             self.zkhandler,
                             self.logger,
@@ -541,6 +539,7 @@ class MonitoringInstance(object):
                             fault_time,
                             fault_delta,
                             fault_message,
+                            fault_details=details,
                         )
                         self.faults += 1
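
Putting the two halves together: the message template in the faults map now
renders only the entry, and the details captured by the check are handed to
generate_fault separately, so the stored fault text ends up the same as
before while the ID stays stable. A hypothetical trace with invented values:

    # Mirrors the ceph_warn entry in the map above (values invented)
    fault_data = {
        "conditions": ["HEALTH_WARN"],
        "delta": 10,
        "message": "{entry} reported by Ceph cluster",
    }
    entry, details = "HEALTH_WARN", "1 osds down"

    fault_message = fault_data["message"].format(entry=entry)
    # "HEALTH_WARN reported by Ceph cluster"  <- what the ID is hashed from

    # Inside generate_fault, after the ID has been computed:
    fault_message = f"{fault_message}: {details}"
    # "HEALTH_WARN reported by Ceph cluster: 1 osds down"  <- what gets stored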
@@ -607,6 +606,7 @@ class MonitoringInstance(object):
                     fault_time,
                     fault_delta,
                     fault_message,
+                    fault_details=None,
                 )
                 self.faults += 1