Move back to per-plugin fault reporting

This commit is contained in:
Joshua Boniface 2023-12-07 11:13:56 -05:00
parent 61b39d0739
commit 9dbadfdd6e
1 changed files with 41 additions and 21 deletions

View File

@ -306,18 +306,18 @@ class MonitoringInstance(object):
# This is a list of all possible faults (cluster error messages) and their corresponding details # This is a list of all possible faults (cluster error messages) and their corresponding details
self.cluster_faults_map = { self.cluster_faults_map = {
"unhealthy_node": { # "unhealthy_node": {
"entries": get_node_health_states, # "entries": get_node_health_states,
"conditions": range(90, 51, -1), # "conditions": range(90, 51, -1),
"delta": 10, # "delta": 10,
"message": "Node {entry} health", # "message": "Node {entry} health",
}, # },
"very_unhealthy_node": { # "very_unhealthy_node": {
"entries": get_node_health_states, # "entries": get_node_health_states,
"conditions": range(50, 0, -1), # "conditions": range(50, 0, -1),
"delta": 50, # "delta": 50,
"message": "Node {entry} health", # "message": "Node {entry} health",
}, # },
"dead_or_fenced_node": { "dead_or_fenced_node": {
"entries": get_node_daemon_states, "entries": get_node_daemon_states,
"conditions": ["dead", "fenced"], "conditions": ["dead", "fenced"],
@ -675,15 +675,35 @@ class MonitoringInstance(object):
state="t", state="t",
prefix=f"{result.plugin_name} ({result.runtime}s)", prefix=f"{result.plugin_name} ({result.runtime}s)",
) )
# Leaving this code if we ever want plugins to directly generate faults
# if result.health_delta >= 25: # Generate a cluster fault if the plugin is in a suboptimal state
# fault_type = f"plugin.{self.this_node.name}.{result.plugin_name}" if result.health_delta > 0:
# fault_time = datetime.now() fault_type = f"plugin.{self.this_node.name}.{result.plugin_name}"
# fault_delta = result.health_delta fault_time = datetime.now()
# fault_message = (
# f"{self.this_node.name} {result.plugin_name} {result.message}" # Map our check results to fault results
# ) # These are not 1-to-1, as faults are cluster-wide.
# generate_fault(self.zkhandler, self.logger, fault_type, fault_time, fault_delta, fault_message) # So a single node even with a severe fault is not alone enough to make the
# whole cluster report a serious fault. But 2 is, so 25 is chosen here.
if result.health_delta < 10:
fault_delta = 0
elif result.health_delta < 50:
fault_delta = 10
else:
fault_delta = 25
fault_message = (
f"{self.this_node.name} {result.plugin_name} {result.message}"
)
generate_fault(
self.zkhandler,
self.logger,
fault_type,
fault_time,
fault_delta,
fault_message,
)
total_health -= result.health_delta total_health -= result.health_delta
if total_health < 0: if total_health < 0: