Separate monitoring and move to 60s interval

Removes the dependency of the monitoring subsystem from the node
keepalives, and runs them at a 60s interval to avoid excessive backups
if a plugin takes too long.

Adds its own logs and related items as required.

Finally adds a new required argument to the run() of plugins, the
coordinator state, which can be used by a plugin to determine actions
based on whether the node is a primary, secondary, or non-coordinator.
This commit is contained in:
Joshua Boniface 2023-09-15 16:27:41 -04:00
parent 79d871ebc6
commit 40b7d68853
12 changed files with 113 additions and 24 deletions

View File

@ -100,9 +100,11 @@ class MonitoringPluginScript(MonitoringPlugin):
self.disk_details = disk_details
def run(self):
def run(self, coordinator_state=None):
"""
run(): Perform the check actions and return a PluginResult object
The {coordinator_state} can be used to check if this is a "primary" coordinator, "secondary" coordinator, or "hypervisor" (non-coordinator)
"""
# Re-run setup each time to ensure the disk details are current

View File

@ -62,9 +62,11 @@ class MonitoringPluginScript(MonitoringPlugin):
pass
def run(self):
def run(self, coordinator_state=None):
"""
run(): Perform the check actions and return a PluginResult object
The {coordinator_state} can be used to check if this is a "primary" coordinator, "secondary" coordinator, or "hypervisor" (non-coordinator)
"""
# Run any imports first

View File

@ -60,9 +60,11 @@ class MonitoringPluginScript(MonitoringPlugin):
pass
def run(self):
def run(self, coordinator_state=None):
"""
run(): Perform the check actions and return a PluginResult object
The {coordinator_state} can be used to check if this is a "primary" coordinator, "secondary" coordinator, or "hypervisor" (non-coordinator)
"""
# Run any imports first

View File

@ -60,9 +60,11 @@ class MonitoringPluginScript(MonitoringPlugin):
pass
def run(self):
def run(self, coordinator_state=None):
"""
run(): Perform the check actions and return a PluginResult object
The {coordinator_state} can be used to check if this is a "primary" coordinator, "secondary" coordinator, or "hypervisor" (non-coordinator)
"""
# Run any imports first

View File

@ -60,9 +60,11 @@ class MonitoringPluginScript(MonitoringPlugin):
pass
def run(self):
def run(self, coordinator_state=None):
"""
run(): Perform the check actions and return a PluginResult object
The {coordinator_state} can be used to check if this is a "primary" coordinator, "secondary" coordinator, or "hypervisor" (non-coordinator)
"""
# Run any imports first

View File

@ -60,9 +60,11 @@ class MonitoringPluginScript(MonitoringPlugin):
pass
def run(self):
def run(self, coordinator_state=None):
"""
run(): Perform the check actions and return a PluginResult object
The {coordinator_state} can be used to check if this is a "primary" coordinator, "secondary" coordinator, or "hypervisor" (non-coordinator)
"""
# Run any imports first

View File

@ -61,9 +61,11 @@ class MonitoringPluginScript(MonitoringPlugin):
pass
def run(self):
def run(self, coordinator_state=None):
"""
run(): Perform the check actions and return a PluginResult object
The {coordinator_state} can be used to check if this is a "primary" coordinator, "secondary" coordinator, or "hypervisor" (non-coordinator)
"""
# Run any imports first

View File

@ -57,9 +57,11 @@ class MonitoringPluginScript(MonitoringPlugin):
pass
def run(self):
def run(self, coordinator_state=None):
"""
run(): Perform the check actions and return a PluginResult object
The {coordinator_state} can be used to check if this is a "primary" coordinator, "secondary" coordinator, or "hypervisor" (non-coordinator)
"""
# Run any imports first

View File

@ -60,9 +60,11 @@ class MonitoringPluginScript(MonitoringPlugin):
pass
def run(self):
def run(self, coordinator_state=None):
"""
run(): Perform the check actions and return a PluginResult object
The {coordinator_state} can be used to check if this is a "primary" coordinator, "secondary" coordinator, or "hypervisor" (non-coordinator)
"""
# Run any imports first

View File

@ -255,10 +255,10 @@ def entrypoint():
except Exception:
pass
# Clean up any monitoring plugins that have cleanup
# Shut down the monitoring system
try:
logger.out("Performing monitoring plugin cleanup", state="s")
monitoring_instance.run_cleanups()
logger.out("Shutting down monitoring subsystem", state="s")
monitoring_instance.shutdown()
except Exception:
pass

View File

@ -26,6 +26,7 @@ import importlib.util
from os import walk
from datetime import datetime
from json import dumps
from apscheduler.schedulers.background import BackgroundScheduler
class PluginError(Exception):
@ -173,9 +174,11 @@ class MonitoringPlugin(object):
"""
pass
def run(self):
def run(self, coordinator_state=None):
"""
run(): Run the plugin, returning a PluginResult object
The {coordinator_state} can be used to check if this is a "primary" coordinator, "secondary" coordinator, or "hypervisor" (non-coordinator)
"""
return self.plugin_result
@ -332,10 +335,40 @@ class MonitoringInstance(object):
)
)
self.start_check_timer()
def __del__(self):
self.shutdown()
def shutdown(self):
self.stop_check_timer()
self.run_cleanups()
def start_check_timer(self):
check_interval = 60
self.logger.out(
f"Starting monitoring check timer ({check_interval} second interval)",
state="s",
)
self.check_timer = BackgroundScheduler()
self.check_timer.add_job(
self.run_plugins,
trigger="interval",
seconds=check_interval,
)
self.check_timer.start()
def stop_check_timer(self):
try:
self.check_timer.shutdown()
self.logger.out("Stopping monitoring check timer", state="s")
except Exception:
self.logger.out("Failed to stop monitoring check timer", state="w")
def run_plugin(self, plugin):
time_start = datetime.now()
try:
result = plugin.run()
result = plugin.run(coordinator_state=self.this_node.router_state)
except Exception as e:
self.logger.out(
f"Monitoring plugin {plugin.plugin_name} failed: {type(e).__name__}: {e}",
@ -351,12 +384,28 @@ class MonitoringInstance(object):
return result
def run_plugins(self):
total_health = 100
if self.config["log_keepalive_plugin_details"]:
if self.this_node.router_state == "primary":
cst_colour = self.logger.fmt_green
elif self.this_node.router_state == "secondary":
cst_colour = self.logger.fmt_blue
else:
cst_colour = self.logger.fmt_cyan
self.logger.out(
f"Running monitoring plugins: {', '.join([x.plugin_name for x in self.all_plugins])}",
"{}{} healthcheck @ {}{} [{}{}{}]".format(
self.logger.fmt_purple,
self.config["node_hostname"],
datetime.now(),
self.logger.fmt_end,
self.logger.fmt_bold + cst_colour,
self.this_node.router_state,
self.logger.fmt_end,
),
state="t",
)
runtime_start = datetime.now()
total_health = 100
plugin_results = list()
with concurrent.futures.ThreadPoolExecutor(max_workers=99) as executor:
to_future_plugin_results = {
@ -390,6 +439,33 @@ class MonitoringInstance(object):
]
)
runtime_end = datetime.now()
runtime_delta = runtime_end - runtime_start
runtime = "{:0.02f}".format(runtime_delta.total_seconds())
time.sleep(0.2)
if isinstance(self.this_node.health, int):
if self.this_node.health > 90:
health_colour = self.logger.fmt_green
elif self.this_node.health > 50:
health_colour = self.logger.fmt_yellow
else:
health_colour = self.logger.fmt_red
health_text = str(self.this_node.health) + "%"
else:
health_colour = self.logger.fmt_blue
health_text = "N/A"
self.logger.out(
"Node health at {health_colour}{health}{nofmt}, checked in {runtime} seconds".format(
health_colour=health_colour,
nofmt=self.logger.fmt_end,
health=health_text,
runtime=runtime,
),
state="t",
)
def run_cleanup(self, plugin):
return plugin.cleanup()

View File

@ -859,11 +859,6 @@ def node_keepalive(logger, config, zkhandler, this_node, monitoring_instance):
except Exception:
logger.out("Failed to set keepalive data", state="e")
# Run this here since monitoring plugins output directly
monitoring_instance.run_plugins()
# Allow the health value to update in the Node instance
time.sleep(0.1)
if config["log_keepalives"]:
if this_node.maintenance is True:
maintenance_colour = logger.fmt_blue