Add node health to fault states
Adjusts ordering and ensures that node health states are included in faults when node health is at or below 50%. Also adjusts fault ID generation and runs fault checks only on coordinator nodes to avoid excessive runs.
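The reworked fault ID is a stable hash of the fault's name, delta, and message, so repeated detections of the same condition update one fault record instead of creating duplicates. A minimal sketch of the derivation, using the same md5 call the diff switches to (the helper name below is illustrative, not part of the codebase):

from hashlib import md5

def derive_fault_id(fault_name, fault_delta, fault_message):
    # Same construction as the new generate_fault(): name + delta + message, hashed with md5
    fault_str = f"{fault_name} {fault_delta} {fault_message}"
    return str(md5(fault_str.encode("utf-8")).hexdigest())

# Identical inputs always map to the same ID, so re-raising a fault becomes an update, not a new entry
print(derive_fault_id("unhealthy_node", 0, "Node hv1 <= 50% health"))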
parent 8594eb697f
commit 9c2b1b29ee
@@ -328,19 +328,19 @@ class ZKHandler(object):
         return True

-    def children(self, key, retval=None):
+    def children(self, key):
         """
         Lists all children of a key
         """
         try:
             path = self.get_schema_path(key)
             if path is None:
-                # This path is invalid; this is likely due to missing schema entries, so return None
-                return retval
+                raise NoNodeError

             return self.zk_conn.get_children(path)
         except NoNodeError:
-            return retval
+            # This path is invalid; this is likely due to missing schema entries, so return None
+            return None

     def rename(self, kkpairs):
         """
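With the retval parameter gone, children() now resolves every missing or invalid path to None, so callers test membership against a possibly-None result instead of passing a default. A standalone sketch of that contract (the stub classes are illustrative; the real method catches NoNodeError from the Kazoo client):

class NoNodeError(Exception):
    # Stand-in for the Kazoo NoNodeError used by the real handler
    pass


class StubZK:
    # Minimal stand-in for the Zookeeper connection, for illustration only
    def __init__(self, tree):
        self.tree = tree

    def get_children(self, path):
        if path not in self.tree:
            raise NoNodeError
        return self.tree[path]


def children(zk_conn, path):
    # Mirrors the rewritten method: any invalid path falls through to None
    try:
        if path is None:
            raise NoNodeError
        return zk_conn.get_children(path)
    except NoNodeError:
        return None


zk = StubZK({"/faults": ["5d41402abc"]})
print(children(zk, "/faults"))   # ['5d41402abc']
print(children(zk, "/missing"))  # None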
@@ -25,7 +25,7 @@ import importlib.util

 from os import walk
 from datetime import datetime
-from hashlib import sha1
+from hashlib import md5
 from json import dumps, loads
 from apscheduler.schedulers.background import BackgroundScheduler

@@ -199,6 +199,33 @@ class MonitoringInstance(object):
         self.this_node = this_node

         # Create functions for each fault type
+        def get_node_health_states():
+            node_entries = list()
+            for node in self.zkhandler.children("base.node"):
+                node_health = self.zkhandler.read(("node.monitoring.health", node))
+                node_faulty_plugins = list()
+                all_plugins = self.zkhandler.children(("node.monitoring.data", node))
+                for plugin in all_plugins:
+                    plugin_delta = self.zkhandler.read(
+                        (
+                            "node.monitoring.data",
+                            node,
+                            "monitoring_plugin.health_delta",
+                            plugin,
+                        )
+                    )
+                    if int(plugin_delta) > 0:
+                        node_faulty_plugins.append(f"{plugin}@-{plugin_delta}%")
+
+                node_entries.append(
+                    (
+                        f"{node} was at {node_health}% ({', '.join(node_faulty_plugins)})",
+                        node_health,
+                    )
+                )
+
+            return node_entries
+
         def get_node_daemon_states():
             return [
                 (node, self.zkhandler.read(("node.state.daemon", node)))
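get_node_health_states() returns one (message, health) tuple per node, where the message lists every plugin with a non-zero health delta. A rough sketch of the formatting it produces, with hypothetical plugin readings:

def format_node_entry(node, node_health, plugin_deltas):
    # plugin_deltas: mapping of plugin name -> health delta, as read per plugin in the diff
    faulty = [
        f"{plugin}@-{delta}%" for plugin, delta in plugin_deltas.items() if int(delta) > 0
    ]
    return (f"{node} was at {node_health}% ({', '.join(faulty)})", node_health)


# Hypothetical readings for a node at 40% health
print(format_node_entry("hv1", 40, {"dpkg": 0, "ceph": 10, "load": 50}))
# -> ('hv1 was at 40% (ceph@-10%, load@-50%)', 40)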
@@ -256,30 +283,36 @@ class MonitoringInstance(object):
         ]

         # This is a list of all possible faults (cluster error messages) and their corresponding details
-        self.faults_map = {
+        self.cluster_faults_map = {
+            "unhealthy_node": {
+                "entries": get_node_health_states,
+                "conditions": range(50, 0, -1),
+                "delta": 0,
+                "message": "Node {entry} <= 50% health",
+            },
             "dead_or_fenced_node": {
                 "entries": get_node_daemon_states,
                 "conditions": ["dead", "fenced"],
                 "delta": 50,
-                "message": "Node {entry} was dead and/or fenced.",
+                "message": "Node {entry} was dead and/or fenced",
             },
             "ceph_osd_out": {
                 "entries": get_osd_out_states,
                 "conditions": ["1"],
                 "delta": 25,
-                "message": "OSD {entry} was out.",
+                "message": "OSD {entry} was marked out",
             },
             "ceph_err": {
                 "entries": get_ceph_health_entries,
                 "conditions": ["HEALTH_ERR"],
                 "delta": 50,
-                "message": "Ceph cluster reported ERR: {entry}",
+                "message": "HEALTH_ERR {entry} reported by Ceph",
             },
             "vm_failed": {
                 "entries": get_vm_states,
                 "conditions": ["fail"],
                 "delta": 10,
-                "message": "VM {entry} was failed.",
+                "message": "VM {entry} was failed",
             },
             "memory_overprovisioned": {
                 "entries": get_overprovisioned_memory,
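The new unhealthy_node entry uses range(50, 0, -1) as its conditions, and run_faults() compares each condition to the entry's health value by string equality, so the fault fires only for integer health readings from 1 through 50; 0 and anything above 50 never match. A quick sketch of that matching, with illustrative names:

def matches_unhealthy_node(node_health):
    # Same test run_faults() applies: string equality against each condition value
    return any(str(condition) == str(node_health) for condition in range(50, 0, -1))


for health in ("100", "51", "50", "25", "0"):
    print(health, matches_unhealthy_node(health))
# -> 100 False, 51 False, 50 True, 25 True, 0 False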
@@ -462,8 +495,7 @@ class MonitoringInstance(object):
         )
         self.timer.start()

-        self.run_faults()
-        self.run_plugins()
+        self.run_checks()

     def stop_timer(self):
         try:
@@ -472,23 +504,25 @@ class MonitoringInstance(object):
         except Exception:
             self.logger.out("Failed to stop monitoring check timer", state="w")

-    def generate_fault(self, fault_time, fault_delta, fault_message):
+    def generate_fault(self, fault_name, fault_time, fault_delta, fault_message):
         # Generate a fault ID from the fault_message and fault_delta
-        fault_str = f"{fault_delta} {fault_message}"
-        fault_id = int(sha1(fault_str.encode("utf-8")).hexdigest(), 16) % (10**8)
-
-        self.logger.out(
-            f"Generating fault {fault_id}: {fault_message} @ {fault_time}", state="i"
-        )
+        fault_str = f"{fault_name} {fault_delta} {fault_message}"
+        fault_id = str(md5(fault_str.encode("utf-8")).hexdigest())

         # If a fault already exists with this ID, just update the time
         if not self.zkhandler.exists("base.faults"):
             self.logger.out(
-                "Skipping fault reporting due to missing Zookeeper schemas", state="w"
+                "Skipping fault reporting for {fault_id} due to missing Zookeeper schemas",
+                state="w",
             )
             return

-        if fault_id in self.zkhandler.children("base.faults", retval=[]):
+        existing_faults = self.zkhandler.children("base.faults")
+        if fault_id in existing_faults:
+            self.logger.out(
+                f"Updating fault {fault_id}: {fault_message} @ {fault_time}", state="i"
+            )
+
             self.zkhandler.write(
                 [
                     (("faults.last_time", fault_id), str(fault_time)),
@@ -496,6 +530,10 @@ class MonitoringInstance(object):
             )
         # Otherwise, generate a new fault event
         else:
+            self.logger.out(
+                f"Generating fault {fault_id}: {fault_message} @ {fault_time}",
+                state="i",
+            )
             self.zkhandler.write(
                 [
                     (("faults.id", fault_id), ""),
@@ -509,14 +547,17 @@ class MonitoringInstance(object):
             )

     def run_faults(self):
-        if self.this_node.coordinator_state == "primary":
+        coordinator_state = self.this_node.coordinator_state
+
+        if coordinator_state == "primary":
             cst_colour = self.logger.fmt_green
-        elif self.this_node.coordinator_state == "secondary":
+        elif coordinator_state == "secondary":
             cst_colour = self.logger.fmt_blue
         else:
             cst_colour = self.logger.fmt_cyan

-        active_coordinator_state = self.this_node.coordinator_state
+        if coordinator_state not in ["primary", "secondary", "takeover", "relinquish"]:
+            return

         runtime_start = datetime.now()
         self.logger.out(
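run_faults() now reads the coordinator state once and returns early unless the node is in a coordinator state, which is how the fault checks end up running only on coordinator nodes. A minimal sketch of that gate (the state list mirrors the diff; the non-coordinator value below is just an example):

def should_run_faults(coordinator_state):
    # Mirrors the early-return gate added to run_faults()
    return coordinator_state in ["primary", "secondary", "takeover", "relinquish"]


for state in ("primary", "secondary", "takeover", "relinquish", "client"):
    print(state, should_run_faults(state))
# Only the four coordinator states pass; other nodes skip the fault run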
@@ -525,20 +566,22 @@ class MonitoringInstance(object):
         )

         fault_count = 0
-        for fault_type in self.faults_map.keys():
-            fault_details = self.faults_map[fault_type]
+        for fault_type in self.cluster_faults_map.keys():
+            fault_details = self.cluster_faults_map[fault_type]

             entries = fault_details["entries"]()
             for _entry in entries:
                 entry = _entry[0]
-                detail = _entry[1]
+                check = _entry[1]
                 for condition in fault_details["conditions"]:
-                    if str(condition) in str(detail):
+                    if str(condition) == str(check):
                         fault_time = datetime.now()
                         fault_delta = fault_details["delta"]
                         fault_message = fault_details["message"].format(entry=entry)
                         fault_count += 1
-                        self.generate_fault(fault_time, fault_delta, fault_message)
+                        self.generate_fault(
+                            fault_type, fault_time, fault_delta, fault_message
+                        )

         runtime_end = datetime.now()
         runtime_delta = runtime_end - runtime_start
@@ -556,7 +599,7 @@ class MonitoringInstance(object):
                 nofmt=self.logger.fmt_end,
                 hostname=self.config["node_hostname"],
                 starttime=runtime_start,
-                costate=active_coordinator_state,
+                costate=coordinator_state,
                 fault_count=fault_count,
                 runtime=runtime,
             ),
@@ -582,15 +625,15 @@ class MonitoringInstance(object):
         return result

     def run_plugins(self):
-        if self.this_node.coordinator_state == "primary":
+        coordinator_state = self.this_node.coordinator_state
+
+        if coordinator_state == "primary":
             cst_colour = self.logger.fmt_green
-        elif self.this_node.coordinator_state == "secondary":
+        elif coordinator_state == "secondary":
             cst_colour = self.logger.fmt_blue
         else:
             cst_colour = self.logger.fmt_cyan

-        active_coordinator_state = self.this_node.coordinator_state
-
         runtime_start = datetime.now()
         self.logger.out(
             "Starting monitoring plugin check run",
@@ -614,6 +657,15 @@ class MonitoringInstance(object):
                 state="t",
                 prefix=f"{result.plugin_name} ({result.runtime}s)",
             )
+            # Leaving this code if we ever want plugins to directly generate faults
+            # if result.health_delta >= 25:
+            #     fault_type = f"plugin.{self.this_node.name}.{result.plugin_name}"
+            #     fault_time = datetime.now()
+            #     fault_delta = result.health_delta
+            #     fault_message = (
+            #         f"{self.this_node.name} {result.plugin_name} {result.message}"
+            #     )
+            #     self.generate_fault(fault_type, fault_time, fault_delta, fault_message)
             total_health -= result.health_delta

         if total_health < 0:
@@ -653,7 +705,7 @@ class MonitoringInstance(object):
                 nofmt=self.logger.fmt_end,
                 hostname=self.config["node_hostname"],
                 starttime=runtime_start,
-                costate=active_coordinator_state,
+                costate=coordinator_state,
                 health=health_text,
                 runtime=runtime,
             ),
@@ -683,5 +735,5 @@ class MonitoringInstance(object):
         )

     def run_checks(self):
-        self.run_faults()
         self.run_plugins()
+        self.run_faults()