Improve logging and handling of fault entries
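Fault entry functions now return dicts with explicit "entry", "check",
and "details" keys instead of positional tuples; fault messages gain a
{details} placeholder; the OSD check keys off the "in" flag rather than
"out"; Ceph HEALTH_WARN is treated as a fault alongside HEALTH_ERR; and
the fault scan logs its checks and entries when monitoring detail
logging or debug mode is enabled.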
parent 4c3f235e05
commit b59f743690
@@ -200,7 +200,7 @@ class MonitoringInstance(object):
         # Create functions for each fault type
         def get_node_health_states():
-            node_entries = list()
+            node_health_states = list()
             for node in self.zkhandler.children("base.node"):
                 node_health = self.zkhandler.read(("node.monitoring.health", node))
                 node_faulty_plugins = list()
@@ -217,43 +217,62 @@ class MonitoringInstance(object):
                     if int(plugin_delta) > 0:
                         node_faulty_plugins.append(f"{plugin}@-{plugin_delta}%")
 
-                node_entries.append(
-                    (
-                        f"{node} was at {node_health}% ({', '.join(node_faulty_plugins)})",
-                        node_health,
-                    )
+                node_health_states.append(
+                    {
+                        "entry": f"{node} was at {node_health}% ({', '.join(node_faulty_plugins)})",
+                        "check": node_health,
+                        "details": "",
+                    }
                 )
 
-            return node_entries
+            return node_health_states
 
         def get_node_daemon_states():
-            return [
-                (node, self.zkhandler.read(("node.state.daemon", node)))
+            node_daemon_states = [
+                {
+                    "entry": node,
+                    "check": self.zkhandler.read(("node.state.daemon", node)),
+                    "details": "",
+                }
                 for node in self.zkhandler.children("base.node")
             ]
 
-        def get_osd_out_states():
-            return [
-                (osd, loads(self.zkhandler.read(("osd.stats", osd))).get("out", 0))
+            return node_daemon_states
+
+        def get_osd_in_states():
+            osd_in_states = [
+                {
+                    "entry": osd,
+                    "check": loads(self.zkhandler.read(("osd.stats", osd))).get(
+                        "in", 0
+                    ),
+                    "details": "",
+                }
                 for osd in self.zkhandler.children("base.osd")
             ]
 
+            return osd_in_states
+
         def get_ceph_health_entries():
-            return [
-                (value, key)
+            ceph_health_entries = [
+                {
+                    "entry": f"{value['severity']} {key}",
+                    "check": value["severity"],
+                    "details": value["summary"]["message"],
+                }
                 for key, value in loads(zkhandler.read("base.storage.health"))[
                     "checks"
                 ].items()
             ]
 
+            return ceph_health_entries
+
         def get_vm_states():
-            return [
-                (
-                    self.zkhandler.read(("domain.name", domain)),
-                    self.zkhandler.read(("domain.state", domain)),
-                )
+            vm_states = [
+                {
+                    "entry": self.zkhandler.read(("domain.name", domain)),
+                    "check": self.zkhandler.read(("domain.state", domain)),
+                    "details": self.zkhandler.read(("domain.failed_reason", domain)),
+                }
                 for domain in self.zkhandler.children("base.domain")
             ]
 
+            return vm_states
+
         def get_overprovisioned_memory():
             all_nodes = self.zkhandler.children("base.node")
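A minimal sketch of the data-shape change shared by all of these entry
functions; the values below are placeholders, not real zkhandler reads:

# Before: entry functions returned positional tuples.
old_entry = ("node1", "stop")  # (entry, check)

# After: entry functions return keyed dicts; "details" may be empty.
new_entry = {
    "entry": "node1",  # name interpolated into the fault message
    "check": "stop",   # value compared against the fault's "conditions"
    "details": "",     # extra context for messages that use {details}
}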
@@ -275,12 +294,14 @@ class MonitoringInstance(object):
                 op_str = "overprovisioned"
             else:
                 op_str = "ok"
-            return [
-                (
-                    f"{current_memory_provisioned}MB > {available_node_memory}MB (N-1)",
-                    op_str,
-                )
+            overprovisioned_memory = [
+                {
+                    "entry": f"{current_memory_provisioned}MB > {available_node_memory}MB (N-1)",
+                    "check": op_str,
+                    "details": "",
+                }
             ]
 
+            return overprovisioned_memory
+
         # This is a list of all possible faults (cluster error messages) and their corresponding details
         self.cluster_faults_map = {
@@ -297,22 +318,22 @@ class MonitoringInstance(object):
                 "message": "Node {entry} was dead and/or fenced",
             },
             "ceph_osd_out": {
-                "entries": get_osd_out_states,
-                "conditions": ["1"],
+                "entries": get_osd_in_states,
+                "conditions": ["0"],
                 "delta": 25,
                 "message": "OSD {entry} was marked out",
             },
             "ceph_err": {
                 "entries": get_ceph_health_entries,
-                "conditions": ["HEALTH_ERR"],
+                "conditions": ["HEALTH_ERR", "HEALTH_WARN"],
                 "delta": 50,
-                "message": "HEALTH_ERR {entry} reported by Ceph",
+                "message": "{entry} reported by Ceph ({details})",
             },
             "vm_failed": {
                 "entries": get_vm_states,
                 "conditions": ["fail"],
                 "delta": 10,
-                "message": "VM {entry} was failed",
+                "message": "VM {entry} was failed ({details})",
             },
             "memory_overprovisioned": {
                 "entries": get_overprovisioned_memory,
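The ceph_osd_out check above flips from matching out == 1 to matching
in == 0; because .get("in", 0) defaults to 0, an OSD whose stats lack
the flag entirely is also flagged. A small sketch with fabricated stats
blobs standing in for real "osd.stats" reads:

from json import loads

# Fabricated OSD stats; real values come from zkhandler "osd.stats" reads.
osd_stats = {
    "osd.0": '{"in": 1, "out": 0}',  # healthy: in the cluster
    "osd.1": '{"in": 0, "out": 1}',  # marked out
    "osd.2": "{}",                   # flags missing entirely
}

for osd, raw in osd_stats.items():
    check = loads(raw).get("in", 0)
    # The condition list is now ["0"]: fault when the OSD is not "in".
    if str(check) == "0":
        print(f"OSD {osd} was marked out")  # fires for osd.1 and osd.2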
@@ -578,15 +599,36 @@ class MonitoringInstance(object):
         for fault_type in self.cluster_faults_map.keys():
             fault_details = self.cluster_faults_map[fault_type]
 
+            if self.config["log_monitoring_details"] or self.config["debug"]:
+                self.logger.out(
+                    f"Running fault check {fault_type}",
+                    state="t",
+                )
+
             entries = fault_details["entries"]()
 
+            if self.config["debug"]:
+                self.logger.out(
+                    f"Entries for fault check {fault_type}:",
+                    state="d",
+                )
+                for line in dumps(entries, indent=2).split("\n"):
+                    self.logger.out(
+                        line,
+                        state="d",
+                    )
+
             for _entry in entries:
-                entry = _entry[0]
-                check = _entry[1]
+                entry = _entry["entry"]
+                check = _entry["check"]
+                details = _entry["details"]
                 for condition in fault_details["conditions"]:
                     if str(condition) == str(check):
                         fault_time = datetime.now()
                         fault_delta = fault_details["delta"]
-                        fault_message = fault_details["message"].format(entry=entry)
+                        fault_message = fault_details["message"].format(
+                            entry=entry, details=details
+                        )
                         fault_count += 1
                         self.generate_fault(
                             fault_type, fault_time, fault_delta, fault_message
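Putting the pieces together, a self-contained sketch of the rewritten
fault scan: the map and entry function below are trimmed stand-ins for
the real self.cluster_faults_map and its zkhandler-backed getters, with
logging and generate_fault replaced by prints.

from datetime import datetime
from json import dumps

# Stand-in entry function; the real ones read cluster state via zkhandler.
def get_vm_states():
    return [
        {"entry": "vm1", "check": "fail", "details": "migration timed out"},
        {"entry": "vm2", "check": "start", "details": ""},
    ]

# Trimmed stand-in for self.cluster_faults_map, one fault type only.
cluster_faults_map = {
    "vm_failed": {
        "entries": get_vm_states,
        "conditions": ["fail"],
        "delta": 10,
        "message": "VM {entry} was failed ({details})",
    },
}

fault_count = 0
for fault_type, fault_details in cluster_faults_map.items():
    entries = fault_details["entries"]()
    # Debug mode now dumps each check's entries as indented JSON.
    for line in dumps(entries, indent=2).split("\n"):
        print(f"[d] {line}")
    for _entry in entries:
        # Entries are consumed by key rather than by tuple position.
        entry = _entry["entry"]
        check = _entry["check"]
        details = _entry["details"]
        for condition in fault_details["conditions"]:
            if str(condition) == str(check):
                fault_time = datetime.now()
                fault_delta = fault_details["delta"]
                # Messages may interpolate {details} as well as {entry}.
                fault_message = fault_details["message"].format(
                    entry=entry, details=details
                )
                fault_count += 1
                print(f"{fault_time} delta={fault_delta}%: {fault_message}")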