diff --git a/node-daemon/pvcnoded/objects/MonitoringInstance.py b/node-daemon/pvcnoded/objects/MonitoringInstance.py
index 0c76a0a7..d3ad4aee 100644
--- a/node-daemon/pvcnoded/objects/MonitoringInstance.py
+++ b/node-daemon/pvcnoded/objects/MonitoringInstance.py
@@ -365,7 +365,10 @@ class MonitoringInstance(object):
                 plugin_results.append(future.result())
 
         for result in sorted(plugin_results, key=lambda x: x.plugin_name):
-            if self.config["log_keepalive_plugin_details"]:
+            if (
+                self.config["log_keepalives"]
+                and self.config["log_keepalive_plugin_details"]
+            ):
                 self.logger.out(
                     result.message + f" [-{result.health_delta}]",
                     state="t",
@@ -376,13 +379,6 @@ class MonitoringInstance(object):
         if total_health < 0:
             total_health = 0
 
-        if total_health > 90:
-            health_colour = self.logger.fmt_green
-        elif total_health > 50:
-            health_colour = self.logger.fmt_yellow
-        else:
-            health_colour = self.logger.fmt_red
-
         self.zkhandler.write(
             [
                 (
@@ -391,10 +387,6 @@ class MonitoringInstance(object):
                 ),
             ]
         )
-        self.logger.out(
-            f"Node health: {health_colour}{total_health}%{self.logger.fmt_end}",
-            state="t",
-        )
 
     def run_cleanup(self, plugin):
         return plugin.cleanup()
diff --git a/node-daemon/pvcnoded/objects/NodeInstance.py b/node-daemon/pvcnoded/objects/NodeInstance.py
index 72d25bd4..a8dc5d11 100644
--- a/node-daemon/pvcnoded/objects/NodeInstance.py
+++ b/node-daemon/pvcnoded/objects/NodeInstance.py
@@ -67,6 +67,7 @@ class NodeInstance(object):
         self.network_list = []
         self.domain_list = []
         # Node resources
+        self.health = 100
         self.domains_count = 0
         self.memused = 0
         self.memfree = 0
@@ -224,6 +225,28 @@ class NodeInstance(object):
                 )
                 self.flush_thread.start()
 
+        @self.zkhandler.zk_conn.DataWatch(
+            self.zkhandler.schema.path("node.monitoring.health", self.name)
+        )
+        def watch_node_health(data, stat, event=""):
+            if event and event.type == "DELETED":
+                # The key has been deleted after existing before; terminate this watcher
+                # because this class instance is about to be reaped in Daemon.py
+                return False
+
+            try:
+                data = data.decode("ascii")
+            except AttributeError:
+                data = 100
+
+            try:
+                data = int(data)
+            except ValueError:
+                pass
+
+            if data != self.health:
+                self.health = data
+
         @self.zkhandler.zk_conn.DataWatch(
             self.zkhandler.schema.path("node.memory.free", self.name)
         )
diff --git a/node-daemon/pvcnoded/util/keepalive.py b/node-daemon/pvcnoded/util/keepalive.py
index c75a0e23..5504382a 100644
--- a/node-daemon/pvcnoded/util/keepalive.py
+++ b/node-daemon/pvcnoded/util/keepalive.py
@@ -644,8 +644,27 @@ def collect_vm_stats(logger, config, zkhandler, this_node, queue):
 # Keepalive update function
 def node_keepalive(logger, config, zkhandler, this_node, monitoring_instance):
     debug = config["debug"]
-    if debug:
-        logger.out("Keepalive starting", state="d", prefix="main-thread")
+
+    # Display node information to the terminal
+    if config["log_keepalives"]:
+        if this_node.router_state == "primary":
+            cst_colour = logger.fmt_green
+        elif this_node.router_state == "secondary":
+            cst_colour = logger.fmt_blue
+        else:
+            cst_colour = logger.fmt_cyan
+        logger.out(
+            "{}{} keepalive @ {}{} [{}{}{}]".format(
+                logger.fmt_purple,
+                config["node_hostname"],
+                datetime.now(),
+                logger.fmt_end,
+                logger.fmt_bold + cst_colour,
+                this_node.router_state,
+                logger.fmt_end,
+            ),
+            state="t",
+        )
 
     # Set the migration selector in Zookeeper for clients to read
     if config["enable_hypervisor"]:
@@ -808,44 +827,51 @@ def node_keepalive(logger, config, zkhandler, this_node, monitoring_instance):
     except Exception:
         logger.out("Failed to set keepalive data", state="e")
 
-    # Display node information to the terminal
+    # Run this here since monitoring plugins output directly
+    monitoring_instance.run_plugins()
+    # Allow the health value to update in the Node instance
+    time.sleep(0.1)
+
     if config["log_keepalives"]:
-        if this_node.router_state == "primary":
-            cst_colour = logger.fmt_green
-        elif this_node.router_state == "secondary":
-            cst_colour = logger.fmt_blue
+        if this_node.maintenance is True:
+            maintenance_colour = logger.fmt_blue
         else:
-            cst_colour = logger.fmt_cyan
-        logger.out(
-            "{}{} keepalive @ {}{} [{}{}{}]".format(
-                logger.fmt_purple,
-                config["node_hostname"],
-                datetime.now(),
-                logger.fmt_end,
-                logger.fmt_bold + cst_colour,
-                this_node.router_state,
-                logger.fmt_end,
-            ),
-            state="t",
-        )
+            maintenance_colour = logger.fmt_green
+
+        if isinstance(this_node.health, int):
+            if this_node.health > 90:
+                health_colour = logger.fmt_green
+            elif this_node.health > 50:
+                health_colour = logger.fmt_yellow
+            else:
+                health_colour = logger.fmt_red
+            health_text = str(this_node.health) + "%"
+
+        else:
+            health_colour = logger.fmt_blue
+            health_text = "N/A"
+
         if config["log_keepalive_cluster_details"]:
             logger.out(
-                "{bold}Maintenance:{nofmt} {maint} "
-                "{bold}Node VMs:{nofmt} {domcount} "
-                "{bold}Node OSDs:{nofmt} {osdcount} "
+                "{bold}Maintenance:{nofmt} {maintenance_colour}{maintenance}{nofmt} "
+                "{bold}Health:{nofmt} {health_colour}{health}{nofmt} "
+                "{bold}VMs:{nofmt} {domcount} "
+                "{bold}OSDs:{nofmt} {osdcount} "
                 "{bold}Load:{nofmt} {load} "
-                "{bold}Memory [MiB]: VMs:{nofmt} {allocmem} "
+                "{bold}Memory [MiB]: "
                 "{bold}Used:{nofmt} {usedmem} "
                 "{bold}Free:{nofmt} {freemem}".format(
                     bold=logger.fmt_bold,
+                    maintenance_colour=maintenance_colour,
+                    health_colour=health_colour,
                     nofmt=logger.fmt_end,
-                    maint=this_node.maintenance,
+                    maintenance=this_node.maintenance,
+                    health=health_text,
                     domcount=this_node.domains_count,
                     osdcount=osds_this_node,
                     load=this_node.cpuload,
                     freemem=this_node.memfree,
                     usedmem=this_node.memused,
-                    allocmem=this_node.memalloc,
                 ),
                 state="t",
             )
@@ -893,8 +919,3 @@ def node_keepalive(logger, config, zkhandler, this_node, monitoring_instance):
                         zkhandler.write(
                             [(("node.state.daemon", node_name), "dead")]
                         )
-
-    monitoring_instance.run_plugins()
-
-    if debug:
-        logger.out("Keepalive finished", state="d", prefix="main-thread")