Improve fault ID format

Instead of using random hex characters from an md5sum, use a descriptive
all-caps name, similar to how Ceph does. This still helps prevent duplicates,
but also permits a changing health delta within a single event (which would
really only ever apply to plugin faults).
Joshua Boniface 2023-12-09 16:48:14 -05:00
parent 764e3e3722
commit b9fbfe2ed5
2 changed files with 30 additions and 26 deletions
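
As a rough sketch of what the change means in practice: the old code hashed the fault's name, delta, and message into an opaque hex ID, while the new code uses a readable all-caps name template. The OSD entry "osd.3" below is made up for illustration; the fault key, delta, and message template are taken from the fault map in this commit.

    from hashlib import md5

    fault_name = "ceph_osd_out"
    fault_delta = 50
    fault_message = "OSD osd.3 was marked out"

    # Old scheme: an opaque 8-character slice of an md5 hex digest, so any change
    # to the delta or message produced a brand-new fault entry.
    fault_str = f"{fault_name} {fault_delta} {fault_message}"
    old_fault_id = str(md5(fault_str.encode("utf-8")).hexdigest())[:8]

    # New scheme: a readable all-caps name keyed only on the fault type and its
    # subject, formatted from a template such as "CEPH_OSD_OUT_{entry}".
    new_fault_id = "CEPH_OSD_OUT_{entry}".format(entry="osd.3")  # "CEPH_OSD_OUT_osd.3"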

View File

@@ -20,7 +20,6 @@
 ###############################################################################
 
 from datetime import datetime
-from hashlib import md5
 
 
 def generate_fault(
@@ -32,10 +31,6 @@ def generate_fault(
     fault_message,
     fault_details=None,
 ):
-    # Generate a fault ID from the fault_name, fault_delta, and fault_message
-    fault_str = f"{fault_name} {fault_delta} {fault_message}"
-    fault_id = str(md5(fault_str.encode("utf-8")).hexdigest())[:8]
-
     # Strip the microseconds off of the fault time; we don't care about that precision
     fault_time = str(fault_time).split(".")[0]
@@ -45,47 +40,49 @@ def generate_fault(
     # If a fault already exists with this ID, just update the time
     if not zkhandler.exists("base.faults"):
         logger.out(
-            f"Skipping fault reporting for {fault_id} due to missing Zookeeper schemas",
+            f"Skipping fault reporting for {fault_name} due to missing Zookeeper schemas",
             state="w",
         )
         return
 
     existing_faults = zkhandler.children("base.faults")
-    if fault_id in existing_faults:
+    if fault_name in existing_faults:
         logger.out(
-            f"Updating fault {fault_id}: {fault_message} @ {fault_time}", state="i"
+            f"Updating fault {fault_name}: {fault_message} @ {fault_time}", state="i"
         )
     else:
         logger.out(
-            f"Generating fault {fault_id}: {fault_message} @ {fault_time}",
+            f"Generating fault {fault_name}: {fault_message} @ {fault_time}",
             state="i",
         )
 
     if zkhandler.read("base.config.maintenance") == "true":
         logger.out(
-            f"Skipping fault reporting for {fault_id} due to maintenance mode",
+            f"Skipping fault reporting for {fault_name} due to maintenance mode",
            state="w",
         )
         return
 
-    if fault_id in existing_faults:
+    # Update an existing fault
+    if fault_name in existing_faults:
         zkhandler.write(
             [
-                (("faults.last_time", fault_id), fault_time),
-                (("faults.message", fault_id), fault_message),
+                (("faults.last_time", fault_name), fault_time),
+                (("faults.delta", fault_name), fault_delta),
+                (("faults.message", fault_name), fault_message),
             ]
         )
-    # Otherwise, generate a new fault event
+    # Generate a new fault
     else:
         zkhandler.write(
             [
-                (("faults.id", fault_id), ""),
-                (("faults.first_time", fault_id), fault_time),
-                (("faults.last_time", fault_id), fault_time),
-                (("faults.ack_time", fault_id), ""),
-                (("faults.status", fault_id), "new"),
-                (("faults.delta", fault_id), fault_delta),
-                (("faults.message", fault_id), fault_message),
+                (("faults.id", fault_name), ""),
+                (("faults.first_time", fault_name), fault_time),
+                (("faults.last_time", fault_name), fault_time),
+                (("faults.ack_time", fault_name), ""),
+                (("faults.status", fault_name), "new"),
+                (("faults.delta", fault_name), fault_delta),
+                (("faults.message", fault_name), fault_message),
             ]
         )
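
With the ID derivation removed, a caller now passes the fault name directly; a minimal usage sketch, assuming a connected zkhandler and logger are already in scope and using an illustrative fault name and message:

    from datetime import datetime

    generate_fault(
        zkhandler,
        logger,
        "CEPH_OSD_OUT_osd.3",        # fault_name: readable all-caps ID, not an md5 slice
        datetime.now(),              # fault_time
        50,                          # fault_delta
        "OSD osd.3 was marked out",  # fault_message
    )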

View File

@@ -228,7 +228,7 @@ class MonitoringInstance(object):
         def get_ceph_health_entries():
             ceph_health_entries = [
                 {
-                    "entry": f"{value['severity']} {key}",
+                    "entry": key,
                     "check": value["severity"],
                     "details": value["summary"]["message"],
                 }
@@ -281,36 +281,42 @@ class MonitoringInstance(object):
         # This is a list of all possible faults (cluster error messages) and their corresponding details
         self.cluster_faults_map = {
             "dead_or_fenced_node": {
+                "name": "DEAD_NODE_{entry}",
                 "entries": get_node_daemon_states,
                 "conditions": ["dead", "fenced"],
                 "delta": 50,
                 "message": "Node {entry} was dead and/or fenced",
             },
             "ceph_osd_out": {
+                "name": "CEPH_OSD_OUT_{entry}",
                 "entries": get_osd_in_states,
                 "conditions": ["0"],
                 "delta": 50,
                 "message": "OSD {entry} was marked out",
             },
             "ceph_warn": {
+                "name": "CEPH_WARN_{entry}",
                 "entries": get_ceph_health_entries,
                 "conditions": ["HEALTH_WARN"],
                 "delta": 10,
                 "message": "{entry} reported by Ceph cluster",
             },
             "ceph_err": {
+                "name": "CEPH_ERR_{entry}",
                 "entries": get_ceph_health_entries,
                 "conditions": ["HEALTH_ERR"],
                 "delta": 50,
                 "message": "{entry} reported by Ceph cluster",
             },
             "vm_failed": {
+                "name": "VM_FAILED_{entry}",
                 "entries": get_vm_states,
                 "conditions": ["fail"],
                 "delta": 10,
                 "message": "VM {entry} was failed",
             },
             "memory_overprovisioned": {
+                "name": "MEMORY_OVERPROVISIONED",
                 "entries": get_overprovisioned_memory,
                 "conditions": ["overprovisioned"],
                 "delta": 50,
@@ -531,11 +537,12 @@ class MonitoringInstance(object):
                 if str(condition) == str(check):
                     fault_time = datetime.now()
                     fault_delta = fault_data["delta"]
+                    fault_name = fault_data["name"].format(entry=entry)
                     fault_message = fault_data["message"].format(entry=entry)
                     generate_fault(
                         self.zkhandler,
                         self.logger,
-                        fault_type,
+                        fault_name,
                         fault_time,
                         fault_delta,
                         fault_message,
@@ -587,7 +594,7 @@ class MonitoringInstance(object):
             # Generate a cluster fault if the plugin is in a suboptimal state
             if result.health_delta > 0:
-                fault_type = f"plugin.{self.this_node.name}.{result.plugin_name}"
+                fault_name = f"NODE_PLUGIN_{result.plugin_name.upper()}_{self.this_node.name.upper()}"
                 fault_time = datetime.now()
 
                 # Map our check results to fault results
@@ -602,11 +609,11 @@ class MonitoringInstance(object):
                 generate_fault(
                     self.zkhandler,
                     self.logger,
-                    fault_type,
+                    fault_name,
                     fault_time,
                     fault_delta,
                     fault_message,
-                    fault_detail=None,
+                    fault_details=None,
                 )
 
                 self.faults += 1
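
For plugin faults, the same scheme produces names like the following (the plugin and node names here are hypothetical); because the name no longer encodes the health delta, a plugin whose delta changes later simply updates the existing fault entry instead of creating a duplicate:

    plugin_name = "psur"  # hypothetical plugin name
    node_name = "hv1"     # hypothetical node name

    fault_name = f"NODE_PLUGIN_{plugin_name.upper()}_{node_name.upper()}"
    # -> "NODE_PLUGIN_PSUR_HV1"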