From 40b7d688533980f222e1ddd1543ac01354c02881 Mon Sep 17 00:00:00 2001 From: "Joshua M. Boniface" Date: Fri, 15 Sep 2023 16:27:41 -0400 Subject: [PATCH] Separate monitoring and move to 60s interval Removes the dependency of the monitoring subsystem on the node keepalives, and runs the monitoring plugins at a 60s interval to avoid excessive backups if a plugin takes too long. Adds its own logs and related items as required. Finally, adds a new required argument to the run() of plugins, the coordinator state, which can be used by a plugin to determine actions based on whether the node is a primary, secondary, or non-coordinator. --- node-daemon/plugins/disk | 4 +- node-daemon/plugins/dpkg | 4 +- node-daemon/plugins/edac | 4 +- node-daemon/plugins/ipmi | 4 +- node-daemon/plugins/lbvt | 4 +- node-daemon/plugins/load | 4 +- node-daemon/plugins/nics | 4 +- node-daemon/plugins/psql | 4 +- node-daemon/plugins/zkpr | 4 +- node-daemon/pvcnoded/Daemon.py | 6 +- .../pvcnoded/objects/MonitoringInstance.py | 90 +++++++++++++++++-- node-daemon/pvcnoded/util/keepalive.py | 5 -- 12 files changed, 113 insertions(+), 24 deletions(-) diff --git a/node-daemon/plugins/disk b/node-daemon/plugins/disk index cd1fac9d..f585e6fd 100644 --- a/node-daemon/plugins/disk +++ b/node-daemon/plugins/disk @@ -100,9 +100,11 @@ class MonitoringPluginScript(MonitoringPlugin): self.disk_details = disk_details - def run(self): + def run(self, coordinator_state=None): """ run(): Perform the check actions and return a PluginResult object + + The {coordinator_state} can be used to check if this is a "primary" coordinator, "secondary" coordinator, or "hypervisor" (non-coordinator) """ # Re-run setup each time to ensure the disk details are current diff --git a/node-daemon/plugins/dpkg b/node-daemon/plugins/dpkg index 1501253b..1562ebc9 100644 --- a/node-daemon/plugins/dpkg +++ b/node-daemon/plugins/dpkg @@ -62,9 +62,11 @@ class MonitoringPluginScript(MonitoringPlugin): pass - def run(self): + def run(self,
coordinator_state=None): """ run(): Perform the check actions and return a PluginResult object + + The {coordinator_state} can be used to check if this is a "primary" coordinator, "secondary" coordinator, or "hypervisor" (non-coordinator) """ # Run any imports first diff --git a/node-daemon/plugins/edac b/node-daemon/plugins/edac index 877f1424..c996b76b 100644 --- a/node-daemon/plugins/edac +++ b/node-daemon/plugins/edac @@ -60,9 +60,11 @@ class MonitoringPluginScript(MonitoringPlugin): pass - def run(self): + def run(self, coordinator_state=None): """ run(): Perform the check actions and return a PluginResult object + + The {coordinator_state} can be used to check if this is a "primary" coordinator, "secondary" coordinator, or "hypervisor" (non-coordinator) """ # Run any imports first diff --git a/node-daemon/plugins/ipmi b/node-daemon/plugins/ipmi index 0d864593..1d9f2ff0 100644 --- a/node-daemon/plugins/ipmi +++ b/node-daemon/plugins/ipmi @@ -60,9 +60,11 @@ class MonitoringPluginScript(MonitoringPlugin): pass - def run(self): + def run(self, coordinator_state=None): """ run(): Perform the check actions and return a PluginResult object + + The {coordinator_state} can be used to check if this is a "primary" coordinator, "secondary" coordinator, or "hypervisor" (non-coordinator) """ # Run any imports first diff --git a/node-daemon/plugins/lbvt b/node-daemon/plugins/lbvt index 9333b71f..6f0707c8 100644 --- a/node-daemon/plugins/lbvt +++ b/node-daemon/plugins/lbvt @@ -60,9 +60,11 @@ class MonitoringPluginScript(MonitoringPlugin): pass - def run(self): + def run(self, coordinator_state=None): """ run(): Perform the check actions and return a PluginResult object + + The {coordinator_state} can be used to check if this is a "primary" coordinator, "secondary" coordinator, or "hypervisor" (non-coordinator) """ # Run any imports first diff --git a/node-daemon/plugins/load b/node-daemon/plugins/load index a2014535..c521a614 100644 --- a/node-daemon/plugins/load +++ 
b/node-daemon/plugins/load @@ -60,9 +60,11 @@ class MonitoringPluginScript(MonitoringPlugin): pass - def run(self): + def run(self, coordinator_state=None): """ run(): Perform the check actions and return a PluginResult object + + The {coordinator_state} can be used to check if this is a "primary" coordinator, "secondary" coordinator, or "hypervisor" (non-coordinator) """ # Run any imports first diff --git a/node-daemon/plugins/nics b/node-daemon/plugins/nics index 0d2f1198..b46e0400 100644 --- a/node-daemon/plugins/nics +++ b/node-daemon/plugins/nics @@ -61,9 +61,11 @@ class MonitoringPluginScript(MonitoringPlugin): pass - def run(self): + def run(self, coordinator_state=None): """ run(): Perform the check actions and return a PluginResult object + + The {coordinator_state} can be used to check if this is a "primary" coordinator, "secondary" coordinator, or "hypervisor" (non-coordinator) """ # Run any imports first diff --git a/node-daemon/plugins/psql b/node-daemon/plugins/psql index bd4e9e69..856ea7a8 100644 --- a/node-daemon/plugins/psql +++ b/node-daemon/plugins/psql @@ -57,9 +57,11 @@ class MonitoringPluginScript(MonitoringPlugin): pass - def run(self): + def run(self, coordinator_state=None): """ run(): Perform the check actions and return a PluginResult object + + The {coordinator_state} can be used to check if this is a "primary" coordinator, "secondary" coordinator, or "hypervisor" (non-coordinator) """ # Run any imports first diff --git a/node-daemon/plugins/zkpr b/node-daemon/plugins/zkpr index 61e06423..b0064b12 100644 --- a/node-daemon/plugins/zkpr +++ b/node-daemon/plugins/zkpr @@ -60,9 +60,11 @@ class MonitoringPluginScript(MonitoringPlugin): pass - def run(self): + def run(self, coordinator_state=None): """ run(): Perform the check actions and return a PluginResult object + + The {coordinator_state} can be used to check if this is a "primary" coordinator, "secondary" coordinator, or "hypervisor" (non-coordinator) """ # Run any imports first diff 
--git a/node-daemon/pvcnoded/Daemon.py b/node-daemon/pvcnoded/Daemon.py index b0945140..9ba83816 100644 --- a/node-daemon/pvcnoded/Daemon.py +++ b/node-daemon/pvcnoded/Daemon.py @@ -255,10 +255,10 @@ def entrypoint(): except Exception: pass - # Clean up any monitoring plugins that have cleanup + # Shut down the monitoring system try: - logger.out("Performing monitoring plugin cleanup", state="s") - monitoring_instance.run_cleanups() + logger.out("Shutting down monitoring subsystem", state="s") + monitoring_instance.shutdown() except Exception: pass diff --git a/node-daemon/pvcnoded/objects/MonitoringInstance.py b/node-daemon/pvcnoded/objects/MonitoringInstance.py index 5a131baa..dc4c5532 100644 --- a/node-daemon/pvcnoded/objects/MonitoringInstance.py +++ b/node-daemon/pvcnoded/objects/MonitoringInstance.py @@ -26,6 +26,7 @@ import importlib.util from os import walk from datetime import datetime from json import dumps +from apscheduler.schedulers.background import BackgroundScheduler class PluginError(Exception): @@ -173,9 +174,11 @@ class MonitoringPlugin(object): """ pass - def run(self): + def run(self, coordinator_state=None): """ run(): Run the plugin, returning a PluginResult object + + The {coordinator_state} can be used to check if this is a "primary" coordinator, "secondary" coordinator, or "hypervisor" (non-coordinator) """ return self.plugin_result @@ -332,10 +335,40 @@ class MonitoringInstance(object): ) ) + self.start_check_timer() + + def __del__(self): + self.shutdown() + + def shutdown(self): + self.stop_check_timer() + self.run_cleanups() + + def start_check_timer(self): + check_interval = 60 + self.logger.out( + f"Starting monitoring check timer ({check_interval} second interval)", + state="s", + ) + self.check_timer = BackgroundScheduler() + self.check_timer.add_job( + self.run_plugins, + trigger="interval", + seconds=check_interval, + ) + self.check_timer.start() + + def stop_check_timer(self): + try: + self.check_timer.shutdown() + 
self.logger.out("Stopping monitoring check timer", state="s") + except Exception: + self.logger.out("Failed to stop monitoring check timer", state="w") + def run_plugin(self, plugin): time_start = datetime.now() try: - result = plugin.run() + result = plugin.run(coordinator_state=self.this_node.router_state) except Exception as e: self.logger.out( f"Monitoring plugin {plugin.plugin_name} failed: {type(e).__name__}: {e}", @@ -351,12 +384,28 @@ class MonitoringInstance(object): return result def run_plugins(self): + if self.this_node.router_state == "primary": + cst_colour = self.logger.fmt_green + elif self.this_node.router_state == "secondary": + cst_colour = self.logger.fmt_blue + else: + cst_colour = self.logger.fmt_cyan + + self.logger.out( + "{}{} healthcheck @ {}{} [{}{}{}]".format( + self.logger.fmt_purple, + self.config["node_hostname"], + datetime.now(), + self.logger.fmt_end, + self.logger.fmt_bold + cst_colour, + self.this_node.router_state, + self.logger.fmt_end, + ), + state="t", + ) + + runtime_start = datetime.now() total_health = 100 - if self.config["log_keepalive_plugin_details"]: - self.logger.out( - f"Running monitoring plugins: {', '.join([x.plugin_name for x in self.all_plugins])}", - state="t", - ) plugin_results = list() with concurrent.futures.ThreadPoolExecutor(max_workers=99) as executor: to_future_plugin_results = { @@ -390,6 +439,33 @@ class MonitoringInstance(object): ] ) + runtime_end = datetime.now() + runtime_delta = runtime_end - runtime_start + runtime = "{:0.02f}".format(runtime_delta.total_seconds()) + time.sleep(0.2) + + if isinstance(self.this_node.health, int): + if self.this_node.health > 90: + health_colour = self.logger.fmt_green + elif self.this_node.health > 50: + health_colour = self.logger.fmt_yellow + else: + health_colour = self.logger.fmt_red + health_text = str(self.this_node.health) + "%" + else: + health_colour = self.logger.fmt_blue + health_text = "N/A" + + self.logger.out( + "Node health at 
{health_colour}{health}{nofmt}, checked in {runtime} seconds".format( + health_colour=health_colour, + nofmt=self.logger.fmt_end, + health=health_text, + runtime=runtime, + ), + state="t", + ) + def run_cleanup(self, plugin): return plugin.cleanup() diff --git a/node-daemon/pvcnoded/util/keepalive.py b/node-daemon/pvcnoded/util/keepalive.py index dedd49d2..99056c59 100644 --- a/node-daemon/pvcnoded/util/keepalive.py +++ b/node-daemon/pvcnoded/util/keepalive.py @@ -859,11 +859,6 @@ def node_keepalive(logger, config, zkhandler, this_node, monitoring_instance): except Exception: logger.out("Failed to set keepalive data", state="e") - # Run this here since monitoring plugins output directly - monitoring_instance.run_plugins() - # Allow the health value to update in the Node instance - time.sleep(0.1) - if config["log_keepalives"]: if this_node.maintenance is True: maintenance_colour = logger.fmt_blue