Separate monitoring and move to 60s interval
Removes the monitoring subsystem's dependency on the node keepalives, and runs the monitoring plugins on their own 60s interval so that checks do not back up excessively if a plugin takes too long. Adds the monitoring subsystem's own logs and related items as required. Finally, adds a new argument that every plugin's run() is required to accept, the coordinator state (coordinator_state), which a plugin can use to determine its actions based on whether the node is a primary coordinator, a secondary coordinator, or a non-coordinator (hypervisor).
This commit is contained in:
parent
79d871ebc6
commit
40b7d68853
|
@ -100,9 +100,11 @@ class MonitoringPluginScript(MonitoringPlugin):
|
||||||
self.disk_details = disk_details
|
self.disk_details = disk_details
|
||||||
|
|
||||||
|
|
||||||
def run(self):
|
def run(self, coordinator_state=None):
|
||||||
"""
|
"""
|
||||||
run(): Perform the check actions and return a PluginResult object
|
run(): Perform the check actions and return a PluginResult object
|
||||||
|
|
||||||
|
The {coordinator_state} can be used to check if this is a "primary" coordinator, "secondary" coordinator, or "hypervisor" (non-coordinator)
|
||||||
"""
|
"""
|
||||||
|
|
||||||
# Re-run setup each time to ensure the disk details are current
|
# Re-run setup each time to ensure the disk details are current
|
||||||
|
|
|
@ -62,9 +62,11 @@ class MonitoringPluginScript(MonitoringPlugin):
|
||||||
|
|
||||||
pass
|
pass
|
||||||
|
|
||||||
def run(self):
|
def run(self, coordinator_state=None):
|
||||||
"""
|
"""
|
||||||
run(): Perform the check actions and return a PluginResult object
|
run(): Perform the check actions and return a PluginResult object
|
||||||
|
|
||||||
|
The {coordinator_state} can be used to check if this is a "primary" coordinator, "secondary" coordinator, or "hypervisor" (non-coordinator)
|
||||||
"""
|
"""
|
||||||
|
|
||||||
# Run any imports first
|
# Run any imports first
|
||||||
|
|
|
@ -60,9 +60,11 @@ class MonitoringPluginScript(MonitoringPlugin):
|
||||||
|
|
||||||
pass
|
pass
|
||||||
|
|
||||||
def run(self):
|
def run(self, coordinator_state=None):
|
||||||
"""
|
"""
|
||||||
run(): Perform the check actions and return a PluginResult object
|
run(): Perform the check actions and return a PluginResult object
|
||||||
|
|
||||||
|
The {coordinator_state} can be used to check if this is a "primary" coordinator, "secondary" coordinator, or "hypervisor" (non-coordinator)
|
||||||
"""
|
"""
|
||||||
|
|
||||||
# Run any imports first
|
# Run any imports first
|
||||||
|
|
|
@ -60,9 +60,11 @@ class MonitoringPluginScript(MonitoringPlugin):
|
||||||
|
|
||||||
pass
|
pass
|
||||||
|
|
||||||
def run(self):
|
def run(self, coordinator_state=None):
|
||||||
"""
|
"""
|
||||||
run(): Perform the check actions and return a PluginResult object
|
run(): Perform the check actions and return a PluginResult object
|
||||||
|
|
||||||
|
The {coordinator_state} can be used to check if this is a "primary" coordinator, "secondary" coordinator, or "hypervisor" (non-coordinator)
|
||||||
"""
|
"""
|
||||||
|
|
||||||
# Run any imports first
|
# Run any imports first
|
||||||
|
|
|
@ -60,9 +60,11 @@ class MonitoringPluginScript(MonitoringPlugin):
|
||||||
|
|
||||||
pass
|
pass
|
||||||
|
|
||||||
def run(self):
|
def run(self, coordinator_state=None):
|
||||||
"""
|
"""
|
||||||
run(): Perform the check actions and return a PluginResult object
|
run(): Perform the check actions and return a PluginResult object
|
||||||
|
|
||||||
|
The {coordinator_state} can be used to check if this is a "primary" coordinator, "secondary" coordinator, or "hypervisor" (non-coordinator)
|
||||||
"""
|
"""
|
||||||
|
|
||||||
# Run any imports first
|
# Run any imports first
|
||||||
|
|
|
@ -60,9 +60,11 @@ class MonitoringPluginScript(MonitoringPlugin):
|
||||||
|
|
||||||
pass
|
pass
|
||||||
|
|
||||||
def run(self):
|
def run(self, coordinator_state=None):
|
||||||
"""
|
"""
|
||||||
run(): Perform the check actions and return a PluginResult object
|
run(): Perform the check actions and return a PluginResult object
|
||||||
|
|
||||||
|
The {coordinator_state} can be used to check if this is a "primary" coordinator, "secondary" coordinator, or "hypervisor" (non-coordinator)
|
||||||
"""
|
"""
|
||||||
|
|
||||||
# Run any imports first
|
# Run any imports first
|
||||||
|
|
|
@ -61,9 +61,11 @@ class MonitoringPluginScript(MonitoringPlugin):
|
||||||
|
|
||||||
pass
|
pass
|
||||||
|
|
||||||
def run(self):
|
def run(self, coordinator_state=None):
|
||||||
"""
|
"""
|
||||||
run(): Perform the check actions and return a PluginResult object
|
run(): Perform the check actions and return a PluginResult object
|
||||||
|
|
||||||
|
The {coordinator_state} can be used to check if this is a "primary" coordinator, "secondary" coordinator, or "hypervisor" (non-coordinator)
|
||||||
"""
|
"""
|
||||||
|
|
||||||
# Run any imports first
|
# Run any imports first
|
||||||
|
|
|
@ -57,9 +57,11 @@ class MonitoringPluginScript(MonitoringPlugin):
|
||||||
|
|
||||||
pass
|
pass
|
||||||
|
|
||||||
def run(self):
|
def run(self, coordinator_state=None):
|
||||||
"""
|
"""
|
||||||
run(): Perform the check actions and return a PluginResult object
|
run(): Perform the check actions and return a PluginResult object
|
||||||
|
|
||||||
|
The {coordinator_state} can be used to check if this is a "primary" coordinator, "secondary" coordinator, or "hypervisor" (non-coordinator)
|
||||||
"""
|
"""
|
||||||
|
|
||||||
# Run any imports first
|
# Run any imports first
|
||||||
|
|
|
@ -60,9 +60,11 @@ class MonitoringPluginScript(MonitoringPlugin):
|
||||||
|
|
||||||
pass
|
pass
|
||||||
|
|
||||||
def run(self):
|
def run(self, coordinator_state=None):
|
||||||
"""
|
"""
|
||||||
run(): Perform the check actions and return a PluginResult object
|
run(): Perform the check actions and return a PluginResult object
|
||||||
|
|
||||||
|
The {coordinator_state} can be used to check if this is a "primary" coordinator, "secondary" coordinator, or "hypervisor" (non-coordinator)
|
||||||
"""
|
"""
|
||||||
|
|
||||||
# Run any imports first
|
# Run any imports first
|
||||||
|
|
|
@ -255,10 +255,10 @@ def entrypoint():
|
||||||
except Exception:
|
except Exception:
|
||||||
pass
|
pass
|
||||||
|
|
||||||
# Clean up any monitoring plugins that have cleanup
|
# Shut down the monitoring system
|
||||||
try:
|
try:
|
||||||
logger.out("Performing monitoring plugin cleanup", state="s")
|
logger.out("Shutting down monitoring subsystem", state="s")
|
||||||
monitoring_instance.run_cleanups()
|
monitoring_instance.shutdown()
|
||||||
except Exception:
|
except Exception:
|
||||||
pass
|
pass
|
||||||
|
|
||||||
|
|
|
@ -26,6 +26,7 @@ import importlib.util
|
||||||
from os import walk
|
from os import walk
|
||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
from json import dumps
|
from json import dumps
|
||||||
|
from apscheduler.schedulers.background import BackgroundScheduler
|
||||||
|
|
||||||
|
|
||||||
class PluginError(Exception):
|
class PluginError(Exception):
|
||||||
|
@ -173,9 +174,11 @@ class MonitoringPlugin(object):
|
||||||
"""
|
"""
|
||||||
pass
|
pass
|
||||||
|
|
||||||
def run(self):
|
def run(self, coordinator_state=None):
|
||||||
"""
|
"""
|
||||||
run(): Run the plugin, returning a PluginResult object
|
run(): Run the plugin, returning a PluginResult object
|
||||||
|
|
||||||
|
The {coordinator_state} can be used to check if this is a "primary" coordinator, "secondary" coordinator, or "hypervisor" (non-coordinator)
|
||||||
"""
|
"""
|
||||||
return self.plugin_result
|
return self.plugin_result
|
||||||
|
|
||||||
|
@ -332,10 +335,40 @@ class MonitoringInstance(object):
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
|
|
||||||
|
self.start_check_timer()
|
||||||
|
|
||||||
|
def __del__(self):
|
||||||
|
self.shutdown()
|
||||||
|
|
||||||
|
def shutdown(self):
|
||||||
|
self.stop_check_timer()
|
||||||
|
self.run_cleanups()
|
||||||
|
|
||||||
|
def start_check_timer(self):
|
||||||
|
check_interval = 60
|
||||||
|
self.logger.out(
|
||||||
|
f"Starting monitoring check timer ({check_interval} second interval)",
|
||||||
|
state="s",
|
||||||
|
)
|
||||||
|
self.check_timer = BackgroundScheduler()
|
||||||
|
self.check_timer.add_job(
|
||||||
|
self.run_plugins,
|
||||||
|
trigger="interval",
|
||||||
|
seconds=check_interval,
|
||||||
|
)
|
||||||
|
self.check_timer.start()
|
||||||
|
|
||||||
|
def stop_check_timer(self):
|
||||||
|
try:
|
||||||
|
self.check_timer.shutdown()
|
||||||
|
self.logger.out("Stopping monitoring check timer", state="s")
|
||||||
|
except Exception:
|
||||||
|
self.logger.out("Failed to stop monitoring check timer", state="w")
|
||||||
|
|
||||||
def run_plugin(self, plugin):
|
def run_plugin(self, plugin):
|
||||||
time_start = datetime.now()
|
time_start = datetime.now()
|
||||||
try:
|
try:
|
||||||
result = plugin.run()
|
result = plugin.run(coordinator_state=self.this_node.router_state)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
self.logger.out(
|
self.logger.out(
|
||||||
f"Monitoring plugin {plugin.plugin_name} failed: {type(e).__name__}: {e}",
|
f"Monitoring plugin {plugin.plugin_name} failed: {type(e).__name__}: {e}",
|
||||||
|
@ -351,12 +384,28 @@ class MonitoringInstance(object):
|
||||||
return result
|
return result
|
||||||
|
|
||||||
def run_plugins(self):
|
def run_plugins(self):
|
||||||
total_health = 100
|
if self.this_node.router_state == "primary":
|
||||||
if self.config["log_keepalive_plugin_details"]:
|
cst_colour = self.logger.fmt_green
|
||||||
|
elif self.this_node.router_state == "secondary":
|
||||||
|
cst_colour = self.logger.fmt_blue
|
||||||
|
else:
|
||||||
|
cst_colour = self.logger.fmt_cyan
|
||||||
|
|
||||||
self.logger.out(
|
self.logger.out(
|
||||||
f"Running monitoring plugins: {', '.join([x.plugin_name for x in self.all_plugins])}",
|
"{}{} healthcheck @ {}{} [{}{}{}]".format(
|
||||||
|
self.logger.fmt_purple,
|
||||||
|
self.config["node_hostname"],
|
||||||
|
datetime.now(),
|
||||||
|
self.logger.fmt_end,
|
||||||
|
self.logger.fmt_bold + cst_colour,
|
||||||
|
self.this_node.router_state,
|
||||||
|
self.logger.fmt_end,
|
||||||
|
),
|
||||||
state="t",
|
state="t",
|
||||||
)
|
)
|
||||||
|
|
||||||
|
runtime_start = datetime.now()
|
||||||
|
total_health = 100
|
||||||
plugin_results = list()
|
plugin_results = list()
|
||||||
with concurrent.futures.ThreadPoolExecutor(max_workers=99) as executor:
|
with concurrent.futures.ThreadPoolExecutor(max_workers=99) as executor:
|
||||||
to_future_plugin_results = {
|
to_future_plugin_results = {
|
||||||
|
@ -390,6 +439,33 @@ class MonitoringInstance(object):
|
||||||
]
|
]
|
||||||
)
|
)
|
||||||
|
|
||||||
|
runtime_end = datetime.now()
|
||||||
|
runtime_delta = runtime_end - runtime_start
|
||||||
|
runtime = "{:0.02f}".format(runtime_delta.total_seconds())
|
||||||
|
time.sleep(0.2)
|
||||||
|
|
||||||
|
if isinstance(self.this_node.health, int):
|
||||||
|
if self.this_node.health > 90:
|
||||||
|
health_colour = self.logger.fmt_green
|
||||||
|
elif self.this_node.health > 50:
|
||||||
|
health_colour = self.logger.fmt_yellow
|
||||||
|
else:
|
||||||
|
health_colour = self.logger.fmt_red
|
||||||
|
health_text = str(self.this_node.health) + "%"
|
||||||
|
else:
|
||||||
|
health_colour = self.logger.fmt_blue
|
||||||
|
health_text = "N/A"
|
||||||
|
|
||||||
|
self.logger.out(
|
||||||
|
"Node health at {health_colour}{health}{nofmt}, checked in {runtime} seconds".format(
|
||||||
|
health_colour=health_colour,
|
||||||
|
nofmt=self.logger.fmt_end,
|
||||||
|
health=health_text,
|
||||||
|
runtime=runtime,
|
||||||
|
),
|
||||||
|
state="t",
|
||||||
|
)
|
||||||
|
|
||||||
def run_cleanup(self, plugin):
|
def run_cleanup(self, plugin):
|
||||||
return plugin.cleanup()
|
return plugin.cleanup()
|
||||||
|
|
||||||
|
|
|
@ -859,11 +859,6 @@ def node_keepalive(logger, config, zkhandler, this_node, monitoring_instance):
|
||||||
except Exception:
|
except Exception:
|
||||||
logger.out("Failed to set keepalive data", state="e")
|
logger.out("Failed to set keepalive data", state="e")
|
||||||
|
|
||||||
# Run this here since monitoring plugins output directly
|
|
||||||
monitoring_instance.run_plugins()
|
|
||||||
# Allow the health value to update in the Node instance
|
|
||||||
time.sleep(0.1)
|
|
||||||
|
|
||||||
if config["log_keepalives"]:
|
if config["log_keepalives"]:
|
||||||
if this_node.maintenance is True:
|
if this_node.maintenance is True:
|
||||||
maintenance_colour = logger.fmt_blue
|
maintenance_colour = logger.fmt_blue
|
||||||
|
|
Loading…
Reference in New Issue