Separate monitoring and move to 60s interval

Removes the dependency of the monitoring subsystem on the node
keepalives, and runs the monitoring plugins on their own 60s interval
to avoid excessive backlogs if a plugin takes too long.

Gives the monitoring subsystem its own log output and related items as
required.

Finally, adds a new argument, the coordinator state, which the run() of
every plugin must now accept (it is passed by keyword on each run). A
plugin can use it to determine actions based on whether the node is a
primary, secondary, or non-coordinator.
This commit is contained in:
Joshua Boniface 2023-09-15 16:27:41 -04:00
parent 79d871ebc6
commit 40b7d68853
12 changed files with 113 additions and 24 deletions

View File

@ -100,9 +100,11 @@ class MonitoringPluginScript(MonitoringPlugin):
self.disk_details = disk_details self.disk_details = disk_details
def run(self): def run(self, coordinator_state=None):
""" """
run(): Perform the check actions and return a PluginResult object run(): Perform the check actions and return a PluginResult object
The {coordinator_state} can be used to check if this is a "primary" coordinator, "secondary" coordinator, or "hypervisor" (non-coordinator)
""" """
# Re-run setup each time to ensure the disk details are current # Re-run setup each time to ensure the disk details are current

View File

@ -62,9 +62,11 @@ class MonitoringPluginScript(MonitoringPlugin):
pass pass
def run(self): def run(self, coordinator_state=None):
""" """
run(): Perform the check actions and return a PluginResult object run(): Perform the check actions and return a PluginResult object
The {coordinator_state} can be used to check if this is a "primary" coordinator, "secondary" coordinator, or "hypervisor" (non-coordinator)
""" """
# Run any imports first # Run any imports first

View File

@ -60,9 +60,11 @@ class MonitoringPluginScript(MonitoringPlugin):
pass pass
def run(self): def run(self, coordinator_state=None):
""" """
run(): Perform the check actions and return a PluginResult object run(): Perform the check actions and return a PluginResult object
The {coordinator_state} can be used to check if this is a "primary" coordinator, "secondary" coordinator, or "hypervisor" (non-coordinator)
""" """
# Run any imports first # Run any imports first

View File

@ -60,9 +60,11 @@ class MonitoringPluginScript(MonitoringPlugin):
pass pass
def run(self): def run(self, coordinator_state=None):
""" """
run(): Perform the check actions and return a PluginResult object run(): Perform the check actions and return a PluginResult object
The {coordinator_state} can be used to check if this is a "primary" coordinator, "secondary" coordinator, or "hypervisor" (non-coordinator)
""" """
# Run any imports first # Run any imports first

View File

@ -60,9 +60,11 @@ class MonitoringPluginScript(MonitoringPlugin):
pass pass
def run(self): def run(self, coordinator_state=None):
""" """
run(): Perform the check actions and return a PluginResult object run(): Perform the check actions and return a PluginResult object
The {coordinator_state} can be used to check if this is a "primary" coordinator, "secondary" coordinator, or "hypervisor" (non-coordinator)
""" """
# Run any imports first # Run any imports first

View File

@ -60,9 +60,11 @@ class MonitoringPluginScript(MonitoringPlugin):
pass pass
def run(self): def run(self, coordinator_state=None):
""" """
run(): Perform the check actions and return a PluginResult object run(): Perform the check actions and return a PluginResult object
The {coordinator_state} can be used to check if this is a "primary" coordinator, "secondary" coordinator, or "hypervisor" (non-coordinator)
""" """
# Run any imports first # Run any imports first

View File

@ -61,9 +61,11 @@ class MonitoringPluginScript(MonitoringPlugin):
pass pass
def run(self): def run(self, coordinator_state=None):
""" """
run(): Perform the check actions and return a PluginResult object run(): Perform the check actions and return a PluginResult object
The {coordinator_state} can be used to check if this is a "primary" coordinator, "secondary" coordinator, or "hypervisor" (non-coordinator)
""" """
# Run any imports first # Run any imports first

View File

@ -57,9 +57,11 @@ class MonitoringPluginScript(MonitoringPlugin):
pass pass
def run(self): def run(self, coordinator_state=None):
""" """
run(): Perform the check actions and return a PluginResult object run(): Perform the check actions and return a PluginResult object
The {coordinator_state} can be used to check if this is a "primary" coordinator, "secondary" coordinator, or "hypervisor" (non-coordinator)
""" """
# Run any imports first # Run any imports first

View File

@ -60,9 +60,11 @@ class MonitoringPluginScript(MonitoringPlugin):
pass pass
def run(self): def run(self, coordinator_state=None):
""" """
run(): Perform the check actions and return a PluginResult object run(): Perform the check actions and return a PluginResult object
The {coordinator_state} can be used to check if this is a "primary" coordinator, "secondary" coordinator, or "hypervisor" (non-coordinator)
""" """
# Run any imports first # Run any imports first

View File

@ -255,10 +255,10 @@ def entrypoint():
except Exception: except Exception:
pass pass
# Clean up any monitoring plugins that have cleanup # Shut down the monitoring system
try: try:
logger.out("Performing monitoring plugin cleanup", state="s") logger.out("Shutting down monitoring subsystem", state="s")
monitoring_instance.run_cleanups() monitoring_instance.shutdown()
except Exception: except Exception:
pass pass

View File

@ -26,6 +26,7 @@ import importlib.util
from os import walk from os import walk
from datetime import datetime from datetime import datetime
from json import dumps from json import dumps
from apscheduler.schedulers.background import BackgroundScheduler
class PluginError(Exception): class PluginError(Exception):
@ -173,9 +174,11 @@ class MonitoringPlugin(object):
""" """
pass pass
def run(self): def run(self, coordinator_state=None):
""" """
run(): Run the plugin, returning a PluginResult object run(): Run the plugin, returning a PluginResult object
The {coordinator_state} can be used to check if this is a "primary" coordinator, "secondary" coordinator, or "hypervisor" (non-coordinator)
""" """
return self.plugin_result return self.plugin_result
@ -332,10 +335,40 @@ class MonitoringInstance(object):
) )
) )
self.start_check_timer()
def __del__(self):
self.shutdown()
def shutdown(self):
self.stop_check_timer()
self.run_cleanups()
def start_check_timer(self):
check_interval = 60
self.logger.out(
f"Starting monitoring check timer ({check_interval} second interval)",
state="s",
)
self.check_timer = BackgroundScheduler()
self.check_timer.add_job(
self.run_plugins,
trigger="interval",
seconds=check_interval,
)
self.check_timer.start()
def stop_check_timer(self):
try:
self.check_timer.shutdown()
self.logger.out("Stopping monitoring check timer", state="s")
except Exception:
self.logger.out("Failed to stop monitoring check timer", state="w")
def run_plugin(self, plugin): def run_plugin(self, plugin):
time_start = datetime.now() time_start = datetime.now()
try: try:
result = plugin.run() result = plugin.run(coordinator_state=self.this_node.router_state)
except Exception as e: except Exception as e:
self.logger.out( self.logger.out(
f"Monitoring plugin {plugin.plugin_name} failed: {type(e).__name__}: {e}", f"Monitoring plugin {plugin.plugin_name} failed: {type(e).__name__}: {e}",
@ -351,12 +384,28 @@ class MonitoringInstance(object):
return result return result
def run_plugins(self): def run_plugins(self):
if self.this_node.router_state == "primary":
cst_colour = self.logger.fmt_green
elif self.this_node.router_state == "secondary":
cst_colour = self.logger.fmt_blue
else:
cst_colour = self.logger.fmt_cyan
self.logger.out(
"{}{} healthcheck @ {}{} [{}{}{}]".format(
self.logger.fmt_purple,
self.config["node_hostname"],
datetime.now(),
self.logger.fmt_end,
self.logger.fmt_bold + cst_colour,
self.this_node.router_state,
self.logger.fmt_end,
),
state="t",
)
runtime_start = datetime.now()
total_health = 100 total_health = 100
if self.config["log_keepalive_plugin_details"]:
self.logger.out(
f"Running monitoring plugins: {', '.join([x.plugin_name for x in self.all_plugins])}",
state="t",
)
plugin_results = list() plugin_results = list()
with concurrent.futures.ThreadPoolExecutor(max_workers=99) as executor: with concurrent.futures.ThreadPoolExecutor(max_workers=99) as executor:
to_future_plugin_results = { to_future_plugin_results = {
@ -390,6 +439,33 @@ class MonitoringInstance(object):
] ]
) )
runtime_end = datetime.now()
runtime_delta = runtime_end - runtime_start
runtime = "{:0.02f}".format(runtime_delta.total_seconds())
time.sleep(0.2)
if isinstance(self.this_node.health, int):
if self.this_node.health > 90:
health_colour = self.logger.fmt_green
elif self.this_node.health > 50:
health_colour = self.logger.fmt_yellow
else:
health_colour = self.logger.fmt_red
health_text = str(self.this_node.health) + "%"
else:
health_colour = self.logger.fmt_blue
health_text = "N/A"
self.logger.out(
"Node health at {health_colour}{health}{nofmt}, checked in {runtime} seconds".format(
health_colour=health_colour,
nofmt=self.logger.fmt_end,
health=health_text,
runtime=runtime,
),
state="t",
)
def run_cleanup(self, plugin): def run_cleanup(self, plugin):
return plugin.cleanup() return plugin.cleanup()

View File

@ -859,11 +859,6 @@ def node_keepalive(logger, config, zkhandler, this_node, monitoring_instance):
except Exception: except Exception:
logger.out("Failed to set keepalive data", state="e") logger.out("Failed to set keepalive data", state="e")
# Run this here since monitoring plugins output directly
monitoring_instance.run_plugins()
# Allow the health value to update in the Node instance
time.sleep(0.1)
if config["log_keepalives"]: if config["log_keepalives"]:
if this_node.maintenance is True: if this_node.maintenance is True:
maintenance_colour = logger.fmt_blue maintenance_colour = logger.fmt_blue