Improve fault detail handling further
Since we already had a "details" field, simply defer the point at which it is appended to the message: this now happens inside generate_fault, after the main message value has been used to generate the fault ID.
This commit is contained in:
parent
4ca2381077
commit
7e6d922877
|
@ -21,21 +21,27 @@
|
||||||
|
|
||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
from hashlib import md5
|
from hashlib import md5
|
||||||
from re import sub
|
|
||||||
|
|
||||||
|
|
||||||
def generate_fault(
|
def generate_fault(
|
||||||
zkhandler, logger, fault_name, fault_time, fault_delta, fault_message
|
zkhandler,
|
||||||
|
logger,
|
||||||
|
fault_name,
|
||||||
|
fault_time,
|
||||||
|
fault_delta,
|
||||||
|
fault_message,
|
||||||
|
fault_details=None,
|
||||||
):
|
):
|
||||||
# Strip off any "extra" data from the message (things in brackets)
|
# Generate a fault ID from the fault_name, fault_delta, and fault_message
|
||||||
fault_core_message = sub(r"[\(\[].*?[\)\]]", "", fault_message).strip()
|
fault_str = f"{fault_name} {fault_delta} {fault_message}"
|
||||||
# Generate a fault ID from the fault_name, fault_delta, and fault_core_message
|
|
||||||
fault_str = f"{fault_name} {fault_delta} {fault_core_message}"
|
|
||||||
fault_id = str(md5(fault_str.encode("utf-8")).hexdigest())[:8]
|
fault_id = str(md5(fault_str.encode("utf-8")).hexdigest())[:8]
|
||||||
|
|
||||||
# Strip the microseconds off of the fault time; we don't care about that precision
|
# Strip the microseconds off of the fault time; we don't care about that precision
|
||||||
fault_time = str(fault_time).split(".")[0]
|
fault_time = str(fault_time).split(".")[0]
|
||||||
|
|
||||||
|
if fault_details is not None:
|
||||||
|
fault_message = f"{fault_message}: {fault_details}"
|
||||||
|
|
||||||
# If a fault already exists with this ID, just update the time
|
# If a fault already exists with this ID, just update the time
|
||||||
if not zkhandler.exists("base.faults"):
|
if not zkhandler.exists("base.faults"):
|
||||||
logger.out(
|
logger.out(
|
||||||
|
|
|
@ -206,7 +206,7 @@ class MonitoringInstance(object):
|
||||||
{
|
{
|
||||||
"entry": node,
|
"entry": node,
|
||||||
"check": self.zkhandler.read(("node.state.daemon", node)),
|
"check": self.zkhandler.read(("node.state.daemon", node)),
|
||||||
"details": "",
|
"details": None,
|
||||||
}
|
}
|
||||||
for node in self.zkhandler.children("base.node")
|
for node in self.zkhandler.children("base.node")
|
||||||
]
|
]
|
||||||
|
@ -219,7 +219,7 @@ class MonitoringInstance(object):
|
||||||
"check": loads(self.zkhandler.read(("osd.stats", osd))).get(
|
"check": loads(self.zkhandler.read(("osd.stats", osd))).get(
|
||||||
"in", 0
|
"in", 0
|
||||||
),
|
),
|
||||||
"details": "",
|
"details": None,
|
||||||
}
|
}
|
||||||
for osd in self.zkhandler.children("base.osd")
|
for osd in self.zkhandler.children("base.osd")
|
||||||
]
|
]
|
||||||
|
@ -271,9 +271,9 @@ class MonitoringInstance(object):
|
||||||
op_str = "ok"
|
op_str = "ok"
|
||||||
overprovisioned_memory = [
|
overprovisioned_memory = [
|
||||||
{
|
{
|
||||||
"entry": f"{current_memory_provisioned}MB > {available_node_memory}MB (N-1)",
|
"entry": "Cluster memory was overprovisioned",
|
||||||
"check": op_str,
|
"check": op_str,
|
||||||
"details": "",
|
"details": f"{current_memory_provisioned}MB > {available_node_memory}MB (N-1)",
|
||||||
}
|
}
|
||||||
]
|
]
|
||||||
return overprovisioned_memory
|
return overprovisioned_memory
|
||||||
|
@ -296,25 +296,25 @@ class MonitoringInstance(object):
|
||||||
"entries": get_ceph_health_entries,
|
"entries": get_ceph_health_entries,
|
||||||
"conditions": ["HEALTH_WARN"],
|
"conditions": ["HEALTH_WARN"],
|
||||||
"delta": 10,
|
"delta": 10,
|
||||||
"message": "{entry} reported by Ceph ({details})",
|
"message": "{entry} reported by Ceph cluster",
|
||||||
},
|
},
|
||||||
"ceph_err": {
|
"ceph_err": {
|
||||||
"entries": get_ceph_health_entries,
|
"entries": get_ceph_health_entries,
|
||||||
"conditions": ["HEALTH_ERR"],
|
"conditions": ["HEALTH_ERR"],
|
||||||
"delta": 50,
|
"delta": 50,
|
||||||
"message": "{entry} reported by Ceph ({details})",
|
"message": "{entry} reported by Ceph cluster",
|
||||||
},
|
},
|
||||||
"vm_failed": {
|
"vm_failed": {
|
||||||
"entries": get_vm_states,
|
"entries": get_vm_states,
|
||||||
"conditions": ["fail"],
|
"conditions": ["fail"],
|
||||||
"delta": 10,
|
"delta": 10,
|
||||||
"message": "VM {entry} was failed ({details})",
|
"message": "VM {entry} was failed",
|
||||||
},
|
},
|
||||||
"memory_overprovisioned": {
|
"memory_overprovisioned": {
|
||||||
"entries": get_overprovisioned_memory,
|
"entries": get_overprovisioned_memory,
|
||||||
"conditions": ["overprovisioned"],
|
"conditions": ["overprovisioned"],
|
||||||
"delta": 50,
|
"delta": 50,
|
||||||
"message": "Cluster memory was overprovisioned {entry}",
|
"message": "{entry}",
|
||||||
},
|
},
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -507,7 +507,7 @@ class MonitoringInstance(object):
|
||||||
)
|
)
|
||||||
|
|
||||||
for fault_type in self.cluster_faults_map.keys():
|
for fault_type in self.cluster_faults_map.keys():
|
||||||
fault_details = self.cluster_faults_map[fault_type]
|
fault_data = self.cluster_faults_map[fault_type]
|
||||||
|
|
||||||
if self.config["log_monitoring_details"] or self.config["debug"]:
|
if self.config["log_monitoring_details"] or self.config["debug"]:
|
||||||
self.logger.out(
|
self.logger.out(
|
||||||
|
@ -515,7 +515,7 @@ class MonitoringInstance(object):
|
||||||
state="t",
|
state="t",
|
||||||
)
|
)
|
||||||
|
|
||||||
entries = fault_details["entries"]()
|
entries = fault_data["entries"]()
|
||||||
|
|
||||||
if self.config["debug"]:
|
if self.config["debug"]:
|
||||||
self.logger.out(
|
self.logger.out(
|
||||||
|
@ -527,13 +527,11 @@ class MonitoringInstance(object):
|
||||||
entry = _entry["entry"]
|
entry = _entry["entry"]
|
||||||
check = _entry["check"]
|
check = _entry["check"]
|
||||||
details = _entry["details"]
|
details = _entry["details"]
|
||||||
for condition in fault_details["conditions"]:
|
for condition in fault_data["conditions"]:
|
||||||
if str(condition) == str(check):
|
if str(condition) == str(check):
|
||||||
fault_time = datetime.now()
|
fault_time = datetime.now()
|
||||||
fault_delta = fault_details["delta"]
|
fault_delta = fault_data["delta"]
|
||||||
fault_message = fault_details["message"].format(
|
fault_message = fault_data["message"].format(entry=entry)
|
||||||
entry=entry, details=details
|
|
||||||
)
|
|
||||||
generate_fault(
|
generate_fault(
|
||||||
self.zkhandler,
|
self.zkhandler,
|
||||||
self.logger,
|
self.logger,
|
||||||
|
@ -541,6 +539,7 @@ class MonitoringInstance(object):
|
||||||
fault_time,
|
fault_time,
|
||||||
fault_delta,
|
fault_delta,
|
||||||
fault_message,
|
fault_message,
|
||||||
|
fault_details=details,
|
||||||
)
|
)
|
||||||
self.faults += 1
|
self.faults += 1
|
||||||
|
|
||||||
|
@ -607,6 +606,7 @@ class MonitoringInstance(object):
|
||||||
fault_time,
|
fault_time,
|
||||||
fault_delta,
|
fault_delta,
|
||||||
fault_message,
|
fault_message,
|
||||||
|
fault_detail=None,
|
||||||
)
|
)
|
||||||
self.faults += 1
|
self.faults += 1
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue