Improve logging and handling of fault entries

This commit is contained in:
Joshua Boniface 2023-12-01 16:43:19 -05:00
parent 4c3f235e05
commit b59f743690
1 changed file with 75 additions and 33 deletions

View File

@ -200,7 +200,7 @@ class MonitoringInstance(object):
# Create functions for each fault type # Create functions for each fault type
def get_node_health_states(): def get_node_health_states():
node_entries = list() node_health_states = list()
for node in self.zkhandler.children("base.node"): for node in self.zkhandler.children("base.node"):
node_health = self.zkhandler.read(("node.monitoring.health", node)) node_health = self.zkhandler.read(("node.monitoring.health", node))
node_faulty_plugins = list() node_faulty_plugins = list()
@ -217,43 +217,62 @@ class MonitoringInstance(object):
if int(plugin_delta) > 0: if int(plugin_delta) > 0:
node_faulty_plugins.append(f"{plugin}@-{plugin_delta}%") node_faulty_plugins.append(f"{plugin}@-{plugin_delta}%")
node_entries.append( node_health_states.append(
( {
f"{node} was at {node_health}% ({', '.join(node_faulty_plugins)})", "entry": f"{node} was at {node_health}% ({', '.join(node_faulty_plugins)})",
node_health, "check": node_health,
) "details": "",
}
) )
return node_health_states
return node_entries
def get_node_daemon_states(): def get_node_daemon_states():
return [ node_daemon_states = [
(node, self.zkhandler.read(("node.state.daemon", node))) {
"entry": node,
"check": self.zkhandler.read(("node.state.daemon", node)),
"details": "",
}
for node in self.zkhandler.children("base.node") for node in self.zkhandler.children("base.node")
] ]
return node_daemon_states
def get_osd_out_states(): def get_osd_in_states():
return [ osd_in_states = [
(osd, loads(self.zkhandler.read(("osd.stats", osd))).get("out", 0)) {
"entry": osd,
"check": loads(self.zkhandler.read(("osd.stats", osd))).get(
"in", 0
),
"details": "",
}
for osd in self.zkhandler.children("base.osd") for osd in self.zkhandler.children("base.osd")
] ]
return osd_in_states
def get_ceph_health_entries(): def get_ceph_health_entries():
return [ ceph_health_entries = [
(value, key) {
"entry": f"{value['severity']} {key}",
"check": value["severity"],
"details": value["summary"]["message"],
}
for key, value in loads(zkhandler.read("base.storage.health"))[ for key, value in loads(zkhandler.read("base.storage.health"))[
"checks" "checks"
].items() ].items()
] ]
return ceph_health_entries
def get_vm_states(): def get_vm_states():
return [ vm_states = [
( {
self.zkhandler.read(("domain.name", domain)), "entry": self.zkhandler.read(("domain.name", domain)),
self.zkhandler.read(("domain.state", domain)), "check": self.zkhandler.read(("domain.state", domain)),
) "details": self.zkhandler.read(("domain.failed_reason", domain)),
}
for domain in self.zkhandler.children("base.domain") for domain in self.zkhandler.children("base.domain")
] ]
return vm_states
def get_overprovisioned_memory(): def get_overprovisioned_memory():
all_nodes = self.zkhandler.children("base.node") all_nodes = self.zkhandler.children("base.node")
@ -275,12 +294,14 @@ class MonitoringInstance(object):
op_str = "overprovisioned" op_str = "overprovisioned"
else: else:
op_str = "ok" op_str = "ok"
return [ overprovisioned_memory = [
( {
f"{current_memory_provisioned}MB > {available_node_memory}MB (N-1)", "entry": f"{current_memory_provisioned}MB > {available_node_memory}MB (N-1)",
op_str, "check": op_str,
) "details": "",
}
] ]
return overprovisioned_memory
# This is a list of all possible faults (cluster error messages) and their corresponding details # This is a list of all possible faults (cluster error messages) and their corresponding details
self.cluster_faults_map = { self.cluster_faults_map = {
@ -297,22 +318,22 @@ class MonitoringInstance(object):
"message": "Node {entry} was dead and/or fenced", "message": "Node {entry} was dead and/or fenced",
}, },
"ceph_osd_out": { "ceph_osd_out": {
"entries": get_osd_out_states, "entries": get_osd_in_states,
"conditions": ["1"], "conditions": ["0"],
"delta": 25, "delta": 25,
"message": "OSD {entry} was marked out", "message": "OSD {entry} was marked out",
}, },
"ceph_err": { "ceph_err": {
"entries": get_ceph_health_entries, "entries": get_ceph_health_entries,
"conditions": ["HEALTH_ERR"], "conditions": ["HEALTH_ERR", "HEALTH_WARN"],
"delta": 50, "delta": 50,
"message": "HEALTH_ERR {entry} reported by Ceph", "message": "{entry} reported by Ceph ({details})",
}, },
"vm_failed": { "vm_failed": {
"entries": get_vm_states, "entries": get_vm_states,
"conditions": ["fail"], "conditions": ["fail"],
"delta": 10, "delta": 10,
"message": "VM {entry} was failed", "message": "VM {entry} was failed ({details})",
}, },
"memory_overprovisioned": { "memory_overprovisioned": {
"entries": get_overprovisioned_memory, "entries": get_overprovisioned_memory,
@ -578,15 +599,36 @@ class MonitoringInstance(object):
for fault_type in self.cluster_faults_map.keys(): for fault_type in self.cluster_faults_map.keys():
fault_details = self.cluster_faults_map[fault_type] fault_details = self.cluster_faults_map[fault_type]
if self.config["log_monitoring_details"] or self.config["debug"]:
self.logger.out(
f"Running fault check {fault_type}",
state="t",
)
entries = fault_details["entries"]() entries = fault_details["entries"]()
if self.config["debug"]:
self.logger.out(
f"Entries for fault check {fault_type}:",
state="d",
)
for line in dumps(entries, indent=2).split("\n"):
self.logger.out(
line,
state="d",
)
for _entry in entries: for _entry in entries:
entry = _entry[0] entry = _entry["entry"]
check = _entry[1] check = _entry["check"]
details = _entry["details"]
for condition in fault_details["conditions"]: for condition in fault_details["conditions"]:
if str(condition) == str(check): if str(condition) == str(check):
fault_time = datetime.now() fault_time = datetime.now()
fault_delta = fault_details["delta"] fault_delta = fault_details["delta"]
fault_message = fault_details["message"].format(entry=entry) fault_message = fault_details["message"].format(
entry=entry, details=details
)
fault_count += 1 fault_count += 1
self.generate_fault( self.generate_fault(
fault_type, fault_time, fault_delta, fault_message fault_type, fault_time, fault_delta, fault_message