Combine pvchealthd output into single log message
This commit is contained in:
parent
157b8c20bf
commit
9e2e749c55
|
@ -198,35 +198,9 @@ class MonitoringInstance(object):
|
||||||
self.config = config
|
self.config = config
|
||||||
self.logger = logger
|
self.logger = logger
|
||||||
self.this_node = this_node
|
self.this_node = this_node
|
||||||
|
self.faults = 0
|
||||||
|
|
||||||
# Create functions for each fault type
|
# Create functions for each fault type
|
||||||
def get_node_health_states():
|
|
||||||
node_health_states = list()
|
|
||||||
for node in self.zkhandler.children("base.node"):
|
|
||||||
node_health = self.zkhandler.read(("node.monitoring.health", node))
|
|
||||||
node_faulty_plugins = list()
|
|
||||||
all_plugins = self.zkhandler.children(("node.monitoring.data", node))
|
|
||||||
for plugin in all_plugins:
|
|
||||||
plugin_delta = self.zkhandler.read(
|
|
||||||
(
|
|
||||||
"node.monitoring.data",
|
|
||||||
node,
|
|
||||||
"monitoring_plugin.health_delta",
|
|
||||||
plugin,
|
|
||||||
)
|
|
||||||
)
|
|
||||||
if int(plugin_delta) > 0:
|
|
||||||
node_faulty_plugins.append(f"{plugin}@-{plugin_delta}%")
|
|
||||||
|
|
||||||
node_health_states.append(
|
|
||||||
{
|
|
||||||
"entry": f"{node} was at {node_health}% ({', '.join(node_faulty_plugins)})",
|
|
||||||
"check": node_health,
|
|
||||||
"details": "",
|
|
||||||
}
|
|
||||||
)
|
|
||||||
return node_health_states
|
|
||||||
|
|
||||||
def get_node_daemon_states():
|
def get_node_daemon_states():
|
||||||
node_daemon_states = [
|
node_daemon_states = [
|
||||||
{
|
{
|
||||||
|
@ -306,18 +280,6 @@ class MonitoringInstance(object):
|
||||||
|
|
||||||
# This is a list of all possible faults (cluster error messages) and their corresponding details
|
# This is a list of all possible faults (cluster error messages) and their corresponding details
|
||||||
self.cluster_faults_map = {
|
self.cluster_faults_map = {
|
||||||
# "unhealthy_node": {
|
|
||||||
# "entries": get_node_health_states,
|
|
||||||
# "conditions": range(90, 51, -1),
|
|
||||||
# "delta": 10,
|
|
||||||
# "message": "Node {entry} health",
|
|
||||||
# },
|
|
||||||
# "very_unhealthy_node": {
|
|
||||||
# "entries": get_node_health_states,
|
|
||||||
# "conditions": range(50, 0, -1),
|
|
||||||
# "delta": 50,
|
|
||||||
# "message": "Node {entry} health",
|
|
||||||
# },
|
|
||||||
"dead_or_fenced_node": {
|
"dead_or_fenced_node": {
|
||||||
"entries": get_node_daemon_states,
|
"entries": get_node_daemon_states,
|
||||||
"conditions": ["dead", "fenced"],
|
"conditions": ["dead", "fenced"],
|
||||||
|
@ -538,26 +500,12 @@ class MonitoringInstance(object):
|
||||||
except Exception:
|
except Exception:
|
||||||
self.logger.out("Failed to stop monitoring check timer", state="w")
|
self.logger.out("Failed to stop monitoring check timer", state="w")
|
||||||
|
|
||||||
def run_faults(self):
|
def run_faults(self, coordinator_state=None):
|
||||||
coordinator_state = self.this_node.coordinator_state
|
|
||||||
|
|
||||||
if coordinator_state == "primary":
|
|
||||||
cst_colour = self.logger.fmt_green
|
|
||||||
elif coordinator_state == "secondary":
|
|
||||||
cst_colour = self.logger.fmt_blue
|
|
||||||
else:
|
|
||||||
cst_colour = self.logger.fmt_cyan
|
|
||||||
|
|
||||||
if coordinator_state not in ["primary", "secondary", "takeover", "relinquish"]:
|
|
||||||
return
|
|
||||||
|
|
||||||
runtime_start = datetime.now()
|
|
||||||
self.logger.out(
|
self.logger.out(
|
||||||
"Starting monitoring fault check run",
|
f"Starting cluster fault check run at {datetime.now()}",
|
||||||
state="t",
|
state="t",
|
||||||
)
|
)
|
||||||
|
|
||||||
fault_count = 0
|
|
||||||
for fault_type in self.cluster_faults_map.keys():
|
for fault_type in self.cluster_faults_map.keys():
|
||||||
fault_details = self.cluster_faults_map[fault_type]
|
fault_details = self.cluster_faults_map[fault_type]
|
||||||
|
|
||||||
|
@ -586,7 +534,6 @@ class MonitoringInstance(object):
|
||||||
fault_message = fault_details["message"].format(
|
fault_message = fault_details["message"].format(
|
||||||
entry=entry, details=details
|
entry=entry, details=details
|
||||||
)
|
)
|
||||||
fault_count += 1
|
|
||||||
generate_fault(
|
generate_fault(
|
||||||
self.zkhandler,
|
self.zkhandler,
|
||||||
self.logger,
|
self.logger,
|
||||||
|
@ -595,29 +542,7 @@ class MonitoringInstance(object):
|
||||||
fault_delta,
|
fault_delta,
|
||||||
fault_message,
|
fault_message,
|
||||||
)
|
)
|
||||||
|
self.faults += 1
|
||||||
runtime_end = datetime.now()
|
|
||||||
runtime_delta = runtime_end - runtime_start
|
|
||||||
runtime = "{:0.02f}".format(runtime_delta.total_seconds())
|
|
||||||
if fault_count > 0:
|
|
||||||
fault_colour = self.logger.fmt_red
|
|
||||||
else:
|
|
||||||
fault_colour = self.logger.fmt_green
|
|
||||||
|
|
||||||
self.logger.out(
|
|
||||||
"{start_colour}{hostname} fault check @ {starttime}{nofmt} [{cst_colour}{costate}{nofmt}] result is {fault_colour}{fault_count} faults{nofmt} in {runtime} seconds".format(
|
|
||||||
start_colour=self.logger.fmt_purple,
|
|
||||||
cst_colour=self.logger.fmt_bold + cst_colour,
|
|
||||||
fault_colour=fault_colour,
|
|
||||||
nofmt=self.logger.fmt_end,
|
|
||||||
hostname=self.config["node_hostname"],
|
|
||||||
starttime=runtime_start,
|
|
||||||
costate=coordinator_state,
|
|
||||||
fault_count=fault_count,
|
|
||||||
runtime=runtime,
|
|
||||||
),
|
|
||||||
state="t",
|
|
||||||
)
|
|
||||||
|
|
||||||
def run_plugin(self, plugin):
|
def run_plugin(self, plugin):
|
||||||
time_start = datetime.now()
|
time_start = datetime.now()
|
||||||
|
@ -637,19 +562,9 @@ class MonitoringInstance(object):
|
||||||
result.to_zookeeper()
|
result.to_zookeeper()
|
||||||
return result
|
return result
|
||||||
|
|
||||||
def run_plugins(self):
|
def run_plugins(self, coordinator_state=None):
|
||||||
coordinator_state = self.this_node.coordinator_state
|
|
||||||
|
|
||||||
if coordinator_state == "primary":
|
|
||||||
cst_colour = self.logger.fmt_green
|
|
||||||
elif coordinator_state == "secondary":
|
|
||||||
cst_colour = self.logger.fmt_blue
|
|
||||||
else:
|
|
||||||
cst_colour = self.logger.fmt_cyan
|
|
||||||
|
|
||||||
runtime_start = datetime.now()
|
|
||||||
self.logger.out(
|
self.logger.out(
|
||||||
"Starting monitoring plugin check run",
|
f"Starting node plugin check run at {datetime.now()}",
|
||||||
state="t",
|
state="t",
|
||||||
)
|
)
|
||||||
|
|
||||||
|
@ -693,8 +608,9 @@ class MonitoringInstance(object):
|
||||||
fault_delta,
|
fault_delta,
|
||||||
fault_message,
|
fault_message,
|
||||||
)
|
)
|
||||||
|
self.faults += 1
|
||||||
|
|
||||||
total_health -= result.health_delta
|
total_health -= result.health_delta
|
||||||
|
|
||||||
if total_health < 0:
|
if total_health < 0:
|
||||||
total_health = 0
|
total_health = 0
|
||||||
|
@ -708,38 +624,6 @@ class MonitoringInstance(object):
|
||||||
]
|
]
|
||||||
)
|
)
|
||||||
|
|
||||||
runtime_end = datetime.now()
|
|
||||||
runtime_delta = runtime_end - runtime_start
|
|
||||||
runtime = "{:0.02f}".format(runtime_delta.total_seconds())
|
|
||||||
time.sleep(0.2)
|
|
||||||
|
|
||||||
if isinstance(self.this_node.health, int):
|
|
||||||
if self.this_node.health > 90:
|
|
||||||
health_colour = self.logger.fmt_green
|
|
||||||
elif self.this_node.health > 50:
|
|
||||||
health_colour = self.logger.fmt_yellow
|
|
||||||
else:
|
|
||||||
health_colour = self.logger.fmt_red
|
|
||||||
health_text = str(self.this_node.health) + "%"
|
|
||||||
else:
|
|
||||||
health_colour = self.logger.fmt_blue
|
|
||||||
health_text = "N/A"
|
|
||||||
|
|
||||||
self.logger.out(
|
|
||||||
"{start_colour}{hostname} plugin check @ {starttime}{nofmt} [{cst_colour}{costate}{nofmt}] result is {health_colour}{health}{nofmt} in {runtime} seconds".format(
|
|
||||||
start_colour=self.logger.fmt_purple,
|
|
||||||
cst_colour=self.logger.fmt_bold + cst_colour,
|
|
||||||
health_colour=health_colour,
|
|
||||||
nofmt=self.logger.fmt_end,
|
|
||||||
hostname=self.config["node_hostname"],
|
|
||||||
starttime=runtime_start,
|
|
||||||
costate=coordinator_state,
|
|
||||||
health=health_text,
|
|
||||||
runtime=runtime,
|
|
||||||
),
|
|
||||||
state="t",
|
|
||||||
)
|
|
||||||
|
|
||||||
def run_cleanup(self, plugin):
|
def run_cleanup(self, plugin):
|
||||||
return plugin.cleanup()
|
return plugin.cleanup()
|
||||||
|
|
||||||
|
@ -763,5 +647,64 @@ class MonitoringInstance(object):
|
||||||
)
|
)
|
||||||
|
|
||||||
def run_checks(self):
    """Run one full health check pass and emit a single combined log line.

    Resets ``self.faults``, runs the node plugin checks, and, only when this
    node's coordinator state is one of primary/secondary/takeover/relinquish,
    also runs the cluster fault checks. The final log message reports the
    fault count (coordinator states only), the node health, and the total
    runtime of the pass.

    Reads ``self.this_node.coordinator_state``, ``self.this_node.health``,
    ``self.config["node_hostname"]`` and the ``fmt_*`` attributes of
    ``self.logger``; returns nothing.
    """
    # Coordinator states in which the cluster fault checks are run and the
    # fault count is included in the summary.
    fault_check_states = ("primary", "secondary", "takeover", "relinquish")

    # The counter is reset here and read back below after the sub-runs;
    # presumably run_plugins()/run_faults() increment it per generated fault.
    self.faults = 0
    runtime_start = datetime.now()

    coordinator_state = self.this_node.coordinator_state

    if coordinator_state == "primary":
        cst_colour = self.logger.fmt_green
    elif coordinator_state == "secondary":
        cst_colour = self.logger.fmt_blue
    else:
        cst_colour = self.logger.fmt_cyan

    self.run_plugins(coordinator_state=coordinator_state)

    runs_fault_checks = coordinator_state in fault_check_states
    if runs_fault_checks:
        self.run_faults(coordinator_state=coordinator_state)

    runtime_end = datetime.now()
    runtime_delta = runtime_end - runtime_start
    runtime = "{:0.02f}".format(runtime_delta.total_seconds())

    result_text = []

    if runs_fault_checks:
        if self.faults > 0:
            fault_colour = self.logger.fmt_red
        else:
            fault_colour = self.logger.fmt_green
        # Pluralize everything except exactly one fault
        s = "s" if self.faults != 1 else ""
        result_text.append(
            f"{fault_colour}{self.faults}{self.logger.fmt_end} fault{s}"
        )

    if isinstance(self.this_node.health, int):
        if self.this_node.health > 90:
            health_colour = self.logger.fmt_green
        elif self.this_node.health > 50:
            health_colour = self.logger.fmt_yellow
        else:
            health_colour = self.logger.fmt_red
        health_text = f"{health_colour}{self.this_node.health}%{self.logger.fmt_end} node health"
    else:
        # Health is not an integer: report N/A. This must be an f-string,
        # otherwise the colour placeholders are logged as literal text.
        health_text = f"{self.logger.fmt_blue}N/A{self.logger.fmt_end} node health"
    result_text.append(health_text)

    self.logger.out(
        "{start_colour}{hostname} health check @ {starttime}{nofmt} [{cst_colour}{costate}{nofmt}] result is {result_text} in {runtime} seconds".format(
            start_colour=self.logger.fmt_purple,
            cst_colour=self.logger.fmt_bold + cst_colour,
            nofmt=self.logger.fmt_end,
            hostname=self.config["node_hostname"],
            starttime=runtime_start,
            costate=coordinator_state,
            runtime=runtime,
            result_text=", ".join(result_text),
        ),
        state="t",
    )
|
||||||
|
|
Loading…
Reference in New Issue