Add node health to fault states

Adjusts check ordering and ensures that node health states are included in
faults when a node's health is at or below 50%.

Also adjusts fault ID generation and runs fault checks only on coordinator
nodes to avoid excessive runs.
Joshua Boniface 2023-12-01 15:50:11 -05:00
parent 8594eb697f
commit 9c2b1b29ee
2 changed files with 88 additions and 36 deletions
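
For reference, the new fault ID scheme derives a deterministic MD5 hex digest from the fault name, delta, and message, replacing the previous truncated SHA-1 integer. A minimal sketch of the scheme as it appears in the diff below (`fault_id_for` is an illustrative helper, not a function in the codebase):

    from hashlib import md5

    def fault_id_for(fault_name, fault_delta, fault_message):
        # Identical faults always hash to the same ID, so a re-detection
        # updates the existing fault entry instead of creating a duplicate.
        fault_str = f"{fault_name} {fault_delta} {fault_message}"
        return md5(fault_str.encode("utf-8")).hexdigest()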


@@ -328,19 +328,19 @@ class ZKHandler(object):
         return True

-    def children(self, key, retval=None):
+    def children(self, key):
         """
         Lists all children of a key
         """
         try:
             path = self.get_schema_path(key)
             if path is None:
-                # This path is invalid; this is likely due to missing schema entries, so return None
-                return retval
+                raise NoNodeError

             return self.zk_conn.get_children(path)
         except NoNodeError:
-            return retval
+            # This path is invalid; this is likely due to missing schema entries, so return None
+            return None

     def rename(self, kkpairs):
         """


@@ -25,7 +25,7 @@ import importlib.util

 from os import walk
 from datetime import datetime
-from hashlib import sha1
+from hashlib import md5
 from json import dumps, loads

 from apscheduler.schedulers.background import BackgroundScheduler
@@ -199,6 +199,33 @@ class MonitoringInstance(object):
         self.this_node = this_node

         # Create functions for each fault type
+        def get_node_health_states():
+            node_entries = list()
+            for node in self.zkhandler.children("base.node"):
+                node_health = self.zkhandler.read(("node.monitoring.health", node))
+                node_faulty_plugins = list()
+                all_plugins = self.zkhandler.children(("node.monitoring.data", node))
+                for plugin in all_plugins:
+                    plugin_delta = self.zkhandler.read(
+                        (
+                            "node.monitoring.data",
+                            node,
+                            "monitoring_plugin.health_delta",
+                            plugin,
+                        )
+                    )
+                    if int(plugin_delta) > 0:
+                        node_faulty_plugins.append(f"{plugin}@-{plugin_delta}%")
+
+                node_entries.append(
+                    (
+                        f"{node} was at {node_health}% ({', '.join(node_faulty_plugins)})",
+                        node_health,
+                    )
+                )
+
+            return node_entries
+
         def get_node_daemon_states():
             return [
                 (node, self.zkhandler.read(("node.state.daemon", node)))
@@ -256,30 +283,36 @@ class MonitoringInstance(object):
         ]

         # This is a list of all possible faults (cluster error messages) and their corresponding details
-        self.faults_map = {
+        self.cluster_faults_map = {
+            "unhealthy_node": {
+                "entries": get_node_health_states,
+                "conditions": range(50, 0, -1),
+                "delta": 0,
+                "message": "Node {entry} <= 50% health",
+            },
             "dead_or_fenced_node": {
                 "entries": get_node_daemon_states,
                 "conditions": ["dead", "fenced"],
                 "delta": 50,
-                "message": "Node {entry} was dead and/or fenced.",
+                "message": "Node {entry} was dead and/or fenced",
             },
             "ceph_osd_out": {
                 "entries": get_osd_out_states,
                 "conditions": ["1"],
                 "delta": 25,
-                "message": "OSD {entry} was out.",
+                "message": "OSD {entry} was marked out",
             },
             "ceph_err": {
                 "entries": get_ceph_health_entries,
                 "conditions": ["HEALTH_ERR"],
                 "delta": 50,
-                "message": "Ceph cluster reported ERR: {entry}",
+                "message": "HEALTH_ERR {entry} reported by Ceph",
             },
             "vm_failed": {
                 "entries": get_vm_states,
                 "conditions": ["fail"],
                 "delta": 10,
-                "message": "VM {entry} was failed.",
+                "message": "VM {entry} was failed",
             },
             "memory_overprovisioned": {
                 "entries": get_overprovisioned_memory,
@@ -462,8 +495,7 @@ class MonitoringInstance(object):
         )
         self.timer.start()

-        self.run_faults()
-        self.run_plugins()
+        self.run_checks()

     def stop_timer(self):
         try:
@ -472,23 +504,25 @@ class MonitoringInstance(object):
except Exception: except Exception:
self.logger.out("Failed to stop monitoring check timer", state="w") self.logger.out("Failed to stop monitoring check timer", state="w")
def generate_fault(self, fault_time, fault_delta, fault_message): def generate_fault(self, fault_name, fault_time, fault_delta, fault_message):
# Generate a fault ID from the fault_message and fault_delta # Generate a fault ID from the fault_message and fault_delta
fault_str = f"{fault_delta} {fault_message}" fault_str = f"{fault_name} {fault_delta} {fault_message}"
fault_id = int(sha1(fault_str.encode("utf-8")).hexdigest(), 16) % (10**8) fault_id = str(md5(fault_str.encode("utf-8")).hexdigest())
self.logger.out(
f"Generating fault {fault_id}: {fault_message} @ {fault_time}", state="i"
)
# If a fault already exists with this ID, just update the time # If a fault already exists with this ID, just update the time
if not self.zkhandler.exists("base.faults"): if not self.zkhandler.exists("base.faults"):
self.logger.out( self.logger.out(
"Skipping fault reporting due to missing Zookeeper schemas", state="w" "Skipping fault reporting for {fault_id} due to missing Zookeeper schemas",
state="w",
) )
return return
if fault_id in self.zkhandler.children("base.faults", retval=[]): existing_faults = self.zkhandler.children("base.faults")
if fault_id in existing_faults:
self.logger.out(
f"Updating fault {fault_id}: {fault_message} @ {fault_time}", state="i"
)
self.zkhandler.write( self.zkhandler.write(
[ [
(("faults.last_time", fault_id), str(fault_time)), (("faults.last_time", fault_id), str(fault_time)),
@@ -496,6 +530,10 @@ class MonitoringInstance(object):
             )
         # Otherwise, generate a new fault event
         else:
+            self.logger.out(
+                f"Generating fault {fault_id}: {fault_message} @ {fault_time}",
+                state="i",
+            )
             self.zkhandler.write(
                 [
                     (("faults.id", fault_id), ""),
@@ -509,14 +547,17 @@ class MonitoringInstance(object):
         )

     def run_faults(self):
-        if self.this_node.coordinator_state == "primary":
+        coordinator_state = self.this_node.coordinator_state
+
+        if coordinator_state == "primary":
             cst_colour = self.logger.fmt_green
-        elif self.this_node.coordinator_state == "secondary":
+        elif coordinator_state == "secondary":
             cst_colour = self.logger.fmt_blue
         else:
             cst_colour = self.logger.fmt_cyan

-        active_coordinator_state = self.this_node.coordinator_state
+        if coordinator_state not in ["primary", "secondary", "takeover", "relinquish"]:
+            return

         runtime_start = datetime.now()
         self.logger.out(
@@ -525,20 +566,22 @@ class MonitoringInstance(object):
         )

         fault_count = 0
-        for fault_type in self.faults_map.keys():
-            fault_details = self.faults_map[fault_type]
+        for fault_type in self.cluster_faults_map.keys():
+            fault_details = self.cluster_faults_map[fault_type]

             entries = fault_details["entries"]()

             for _entry in entries:
                 entry = _entry[0]
-                detail = _entry[1]
+                check = _entry[1]
                 for condition in fault_details["conditions"]:
-                    if str(condition) in str(detail):
+                    if str(condition) == str(check):
                         fault_time = datetime.now()
                         fault_delta = fault_details["delta"]
                         fault_message = fault_details["message"].format(entry=entry)
                         fault_count += 1
-                        self.generate_fault(fault_time, fault_delta, fault_message)
+                        self.generate_fault(
+                            fault_type, fault_time, fault_delta, fault_message
+                        )

         runtime_end = datetime.now()
         runtime_delta = runtime_end - runtime_start
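
Also worth noting in this hunk: condition matching is tightened from substring containment to exact string equality, which is what lets the unhealthy_node conditions range(50, 0, -1) act as a numeric health threshold (and prevents, e.g., a condition "1" from matching inside an unrelated "10"). Roughly, for a node reporting 45% health (values illustrative):

    # Sketch of the equality-based check
    entry = "node1 was at 45% (nic@-10%, psql@-45%)"
    check = "45"  # node_health as read from Zookeeper

    for condition in range(50, 0, -1):  # the "unhealthy_node" conditions
        if str(condition) == str(check):  # exact match, not substring
            print(f"fault: Node {entry} <= 50% health")
            break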
@@ -556,7 +599,7 @@ class MonitoringInstance(object):
                 nofmt=self.logger.fmt_end,
                 hostname=self.config["node_hostname"],
                 starttime=runtime_start,
-                costate=active_coordinator_state,
+                costate=coordinator_state,
                 fault_count=fault_count,
                 runtime=runtime,
             ),
@ -582,15 +625,15 @@ class MonitoringInstance(object):
return result return result
def run_plugins(self): def run_plugins(self):
if self.this_node.coordinator_state == "primary": coordinator_state = self.this_node.coordinator_state
if coordinator_state == "primary":
cst_colour = self.logger.fmt_green cst_colour = self.logger.fmt_green
elif self.this_node.coordinator_state == "secondary": elif coordinator_state == "secondary":
cst_colour = self.logger.fmt_blue cst_colour = self.logger.fmt_blue
else: else:
cst_colour = self.logger.fmt_cyan cst_colour = self.logger.fmt_cyan
active_coordinator_state = self.this_node.coordinator_state
runtime_start = datetime.now() runtime_start = datetime.now()
self.logger.out( self.logger.out(
"Starting monitoring plugin check run", "Starting monitoring plugin check run",
@@ -614,6 +657,15 @@ class MonitoringInstance(object):
                     state="t",
                     prefix=f"{result.plugin_name} ({result.runtime}s)",
                 )
+                # Leaving this code if we ever want plugins to directly generate faults
+                # if result.health_delta >= 25:
+                #     fault_type = f"plugin.{self.this_node.name}.{result.plugin_name}"
+                #     fault_time = datetime.now()
+                #     fault_delta = result.health_delta
+                #     fault_message = (
+                #         f"{self.this_node.name} {result.plugin_name} {result.message}"
+                #     )
+                #     self.generate_fault(fault_type, fault_time, fault_delta, fault_message)
                 total_health -= result.health_delta

         if total_health < 0:
@@ -653,7 +705,7 @@ class MonitoringInstance(object):
                 nofmt=self.logger.fmt_end,
                 hostname=self.config["node_hostname"],
                 starttime=runtime_start,
-                costate=active_coordinator_state,
+                costate=coordinator_state,
                 health=health_text,
                 runtime=runtime,
             ),
@@ -683,5 +735,5 @@ class MonitoringInstance(object):
         )

     def run_checks(self):
-        self.run_faults()
         self.run_plugins()
+        self.run_faults()