From 3ad6ff2d9c19723db3be0811d29826d513393e14 Mon Sep 17 00:00:00 2001 From: "Joshua M. Boniface" Date: Mon, 13 Feb 2023 03:06:06 -0500 Subject: [PATCH 01/55] Initial implementation of monitoring plugin system --- daemon-common/migrations/versions/9.json | 1 + daemon-common/zkhandler.py | 20 +- debian/pvc-daemon-node.install | 1 + node-daemon/plugins/dpkg | 158 ++++++++ node-daemon/plugins/load | 105 ++++++ node-daemon/pvcnoded.sample.yaml | 2 + node-daemon/pvcnoded/Daemon.py | 18 +- .../pvcnoded/objects/MonitoringInstance.py | 357 ++++++++++++++++++ node-daemon/pvcnoded/util/config.py | 3 + node-daemon/pvcnoded/util/keepalive.py | 8 +- 10 files changed, 664 insertions(+), 9 deletions(-) create mode 100644 daemon-common/migrations/versions/9.json create mode 100644 node-daemon/plugins/dpkg create mode 100644 node-daemon/plugins/load create mode 100644 node-daemon/pvcnoded/objects/MonitoringInstance.py diff --git a/daemon-common/migrations/versions/9.json b/daemon-common/migrations/versions/9.json new file mode 100644 index 00000000..84ea8ac1 --- /dev/null +++ b/daemon-common/migrations/versions/9.json @@ -0,0 +1 @@ +{"version": "9", "root": "", "base": {"root": "", "schema": "/schema", "schema.version": "/schema/version", "config": "/config", "config.maintenance": "/config/maintenance", "config.primary_node": "/config/primary_node", "config.primary_node.sync_lock": "/config/primary_node/sync_lock", "config.upstream_ip": "/config/upstream_ip", "config.migration_target_selector": "/config/migration_target_selector", "cmd": "/cmd", "cmd.node": "/cmd/nodes", "cmd.domain": "/cmd/domains", "cmd.ceph": "/cmd/ceph", "logs": "/logs", "node": "/nodes", "domain": "/domains", "network": "/networks", "storage": "/ceph", "storage.util": "/ceph/util", "osd": "/ceph/osds", "pool": "/ceph/pools", "volume": "/ceph/volumes", "snapshot": "/ceph/snapshots"}, "logs": {"node": "", "messages": "/messages"}, "node": {"name": "", "keepalive": "/keepalive", "mode": "/daemonmode", "data.active_schema": "/activeschema", "data.latest_schema": "/latestschema", "data.static": "/staticdata", "data.pvc_version": "/pvcversion", "running_domains": "/runningdomains", "count.provisioned_domains": "/domainscount", "count.networks": "/networkscount", "state.daemon": "/daemonstate", "state.router": "/routerstate", "state.domain": "/domainstate", "cpu.load": "/cpuload", "vcpu.allocated": "/vcpualloc", "memory.total": "/memtotal", "memory.used": "/memused", "memory.free": "/memfree", "memory.allocated": "/memalloc", "memory.provisioned": "/memprov", "ipmi.hostname": "/ipmihostname", "ipmi.username": "/ipmiusername", "ipmi.password": "/ipmipassword", "sriov": "/sriov", "sriov.pf": "/sriov/pf", "sriov.vf": "/sriov/vf", "monitoring.plugins": "/monitoring_plugins", "monitoring.data": "/monitoring_data"}, "monitoring_plugin": {"name": "", "last_run": "/last_run", "health_delta": "/health_delta", "message": "/message", "data": "/data", "runtime": "/runtime"}, "sriov_pf": {"phy": "", "mtu": "/mtu", "vfcount": "/vfcount"}, "sriov_vf": {"phy": "", "pf": "/pf", "mtu": "/mtu", "mac": "/mac", "phy_mac": "/phy_mac", "config": "/config", "config.vlan_id": "/config/vlan_id", "config.vlan_qos": "/config/vlan_qos", "config.tx_rate_min": "/config/tx_rate_min", "config.tx_rate_max": "/config/tx_rate_max", "config.spoof_check": "/config/spoof_check", "config.link_state": "/config/link_state", "config.trust": "/config/trust", "config.query_rss": "/config/query_rss", "pci": "/pci", "pci.domain": "/pci/domain", "pci.bus": "/pci/bus", "pci.slot": 
"/pci/slot", "pci.function": "/pci/function", "used": "/used", "used_by": "/used_by"}, "domain": {"name": "", "xml": "/xml", "state": "/state", "profile": "/profile", "stats": "/stats", "node": "/node", "last_node": "/lastnode", "failed_reason": "/failedreason", "storage.volumes": "/rbdlist", "console.log": "/consolelog", "console.vnc": "/vnc", "meta.autostart": "/node_autostart", "meta.migrate_method": "/migration_method", "meta.node_selector": "/node_selector", "meta.node_limit": "/node_limit", "meta.tags": "/tags", "migrate.sync_lock": "/migrate_sync_lock"}, "tag": {"name": "", "type": "/type", "protected": "/protected"}, "network": {"vni": "", "type": "/nettype", "mtu": "/mtu", "rule": "/firewall_rules", "rule.in": "/firewall_rules/in", "rule.out": "/firewall_rules/out", "nameservers": "/name_servers", "domain": "/domain", "reservation": "/dhcp4_reservations", "lease": "/dhcp4_leases", "ip4.gateway": "/ip4_gateway", "ip4.network": "/ip4_network", "ip4.dhcp": "/dhcp4_flag", "ip4.dhcp_start": "/dhcp4_start", "ip4.dhcp_end": "/dhcp4_end", "ip6.gateway": "/ip6_gateway", "ip6.network": "/ip6_network", "ip6.dhcp": "/dhcp6_flag"}, "reservation": {"mac": "", "ip": "/ipaddr", "hostname": "/hostname"}, "lease": {"mac": "", "ip": "/ipaddr", "hostname": "/hostname", "expiry": "/expiry", "client_id": "/clientid"}, "rule": {"description": "", "rule": "/rule", "order": "/order"}, "osd": {"id": "", "node": "/node", "device": "/device", "db_device": "/db_device", "fsid": "/fsid", "ofsid": "/fsid/osd", "cfsid": "/fsid/cluster", "lvm": "/lvm", "vg": "/lvm/vg", "lv": "/lvm/lv", "stats": "/stats"}, "pool": {"name": "", "pgs": "/pgs", "tier": "/tier", "stats": "/stats"}, "volume": {"name": "", "stats": "/stats"}, "snapshot": {"name": "", "stats": "/stats"}} \ No newline at end of file diff --git a/daemon-common/zkhandler.py b/daemon-common/zkhandler.py index 8e42d01e..5ec58b83 100644 --- a/daemon-common/zkhandler.py +++ b/daemon-common/zkhandler.py @@ -540,7 +540,7 @@ class ZKHandler(object): # class ZKSchema(object): # Current version - _version = 8 + _version = 9 # Root for doing nested keys _schema_root = "" @@ -608,6 +608,17 @@ class ZKSchema(object): "sriov": "/sriov", "sriov.pf": "/sriov/pf", "sriov.vf": "/sriov/vf", + "monitoring.plugins": "/monitoring_plugins", + "monitoring.data": "/monitoring_data", + }, + # The schema of an individual monitoring plugin data entry (/nodes/{node_name}/monitoring_data/{plugin}) + "monitoring_plugin": { + "name": "", # The root key + "last_run": "/last_run", + "health_delta": "/health_delta", + "message": "/message", + "data": "/data", + "runtime": "/runtime", }, # The schema of an individual SR-IOV PF entry (/nodes/{node_name}/sriov/pf/{pf}) "sriov_pf": {"phy": "", "mtu": "/mtu", "vfcount": "/vfcount"}, # The root key @@ -874,9 +885,10 @@ class ZKSchema(object): if not zkhandler.zk_conn.exists(nkipath): result = False - # One might expect child keys under node (specifically, sriov.pf and sriov.vf) to be - # managed here as well, but those are created automatically every time pvcnoded starts - # and thus never need to be validated or applied. + # One might expect child keys under node (specifically, sriov.pf, sriov.vf, + # monitoring.data) to be managed here as well, but those are created + # automatically every time pvcnoded started and thus never need to be validated + # or applied. 
# These two have several children layers that must be parsed through for elem in ["volume"]: diff --git a/debian/pvc-daemon-node.install b/debian/pvc-daemon-node.install index 4b85c0e1..f428e6c1 100644 --- a/debian/pvc-daemon-node.install +++ b/debian/pvc-daemon-node.install @@ -5,3 +5,4 @@ node-daemon/pvcnoded.service lib/systemd/system node-daemon/pvc.target lib/systemd/system node-daemon/pvcautoready.service lib/systemd/system node-daemon/monitoring usr/share/pvc +node-daemon/plugins usr/share/pvc diff --git a/node-daemon/plugins/dpkg b/node-daemon/plugins/dpkg new file mode 100644 index 00000000..74d00789 --- /dev/null +++ b/node-daemon/plugins/dpkg @@ -0,0 +1,158 @@ +#!/usr/bin/env python3 + +# dpkg.py - PVC Monitoring example plugin for dpkg status +# Part of the Parallel Virtual Cluster (PVC) system +# +# Copyright (C) 2018-2022 Joshua M. Boniface +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, version 3. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . +# +############################################################################### + +# This script provides an example of a PVC monitoring plugin script. It will create +# a simple plugin to check that the system dpkg status is as expected, with no invalid +# packages or obsolete configuration files, and will return a 1 health delta for each +# category of flaw found: inconsistent packages, upgradable packages, and obsolete config files. + +# This script can thus be used as an example or reference implementation of a +# PVC monitoring plugin script and expanded upon as required. + +# A monitoring plugin script must implement the class "MonitoringPluginScript" which +# extends "MonitoringPlugin", providing the 3 functions indicated. Detailed explanation +# of the role of each function is provided in context of the example; see the other +# examples for more potential uses. + +# WARNING: +# +# This script will run in the context of the node daemon keepalives as root. +# DO NOT install untrusted, unvetted plugins under any circumstances. + + +# This import is always required here, as MonitoringPlugin is used by the +# MonitoringPluginScript class +from pvcnoded.objects.MonitoringInstance import MonitoringPlugin + + +# A monitoring plugin script must always expose its nice name, which must be identical to +# the file name +PLUGIN_NAME = "dpkg" + + +# The MonitoringPluginScript class must be named as such, and extend MonitoringPlugin. +class MonitoringPluginScript(MonitoringPlugin): + def setup(self): + """ + setup(): Perform special setup steps during node daemon startup + + This step is optional and should be used sparingly.
+ """ + + pass + + def run(self): + """ + run(): Perform the check actions and return a PluginResult object + """ + + # Run any imports first + from re import match + from json import dumps + import daemon_lib.common as pvc_common + + # Get Debian version + with open('/etc/debian_version', 'r') as fh: + debian_version = fh.read().strip() + + # Get a list of dpkg packages for analysis + retcode, stdout, stderr = pvc_common.run_os_command("/usr/bin/dpkg --list") + + # Get a list of installed packages and states + packages = list() + for dpkg_line in stdout.split('\n'): + if match('^[a-z][a-z] ', dpkg_line): + line_split = dpkg_line.split() + package_state = line_split[0] + package_name = line_split[1] + packages.append((package_name, package_state)) + + count_ok = 0 + count_inconsistent = 0 + list_inconsistent = list() + + for package in packages: + if package[1] == "ii": + count_ok += 1 + else: + count_inconsistent += 1 + list_inconsistent.append(package[0]) + + # Get upgradable packages + retcode, stdout, stderr = pvc_common.run_os_command("/usr/bin/apt list --upgradable") + + list_upgradable = list() + for apt_line in stdout.split('\n'): + if match('^[a-z][a-z] ', apt_line): + line_split = apt_line.split('/') + package_name = line_split[0] + list_upgradable.append(package_name) + + count_upgradable = len(list_upgradable) + + # Get obsolete config files (dpkg-* or ucf-* under /etc) + retcode, stdout, stderr = pvc_common.run_os_command("/usr/bin/find /etc -type f -a \( -name '*.dpkg-*' -o -name '*.ucf-*' \)") + + obsolete_conffiles = list() + for conffile_line in stdout.split('\n'): + if conffile_line: + obsolete_conffiles.append(conffile_line) + + count_obsolete_conffiles = len(obsolete_conffiles) + + # Set health_delta based on the results + health_delta = 0 + if count_inconsistent > 0: + health_delta += 1 + if count_upgradable > 0: + health_delta += 1 + if count_obsolete_conffiles > 0: + health_delta += 1 + + # Set the health delta in our local PluginResult object + self.plugin_result.set_health_delta(health_delta) + + # Craft the message + message = f"Debian {debian_version}; Obsolete conffiles: {count_obsolete_conffiles}; Packages valid: {count_ok}, inconsistent: {count_inconsistent}, upgradable: {count_upgradable}" + + # Set the message in our local PluginResult object + self.plugin_result.set_message(message) + + # Set the detailed data in our local PluginResult object + detailed_data = { + "debian_version": debian_version, + "obsolete_conffiles": obsolete_conffiles, + "inconsistent_packages": list_inconsistent, + "upgradable_packages": list_upgradable, + } + self.plugin_result.set_data(dumps(detailed_data)) + + # Return our local PluginResult object + return self.plugin_result + + def cleanup(self): + """ + cleanup(): Perform special cleanup steps during node daemon termination + + This step is optional and should be used sparingly. + """ + + pass diff --git a/node-daemon/plugins/load b/node-daemon/plugins/load new file mode 100644 index 00000000..f3e4fb39 --- /dev/null +++ b/node-daemon/plugins/load @@ -0,0 +1,105 @@ +#!/usr/bin/env python3 + +# load.py - PVC Monitoring example plugin for load +# Part of the Parallel Virtual Cluster (PVC) system +# +# Copyright (C) 2018-2022 Joshua M. Boniface +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, version 3. 
+# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . +# +############################################################################### + +# This script provides an example of a PVC monitoring plugin script. It will create +# a simple plugin to check the system load against the total number of CPU cores, +# and return a 10 health delta (100 -> 90) if the load average exceeds that number. + +# This script can thus be used as an example or reference implementation of a +# PVC monitoring plugin script and expanded upon as required. + +# A monitoring plugin script must implement the class "MonitoringPluginScript" which +# extends "MonitoringPlugin", providing the 3 functions indicated. Detailed explanation +# of the role of each function is provided in context of the example; see the other +# examples for more potential uses. + +# WARNING: +# +# This script will run in the context of the node daemon keepalives as root. +# DO NOT install untrusted, unvetted plugins under any circumstances. + + +# This import is always required here, as MonitoringPlugin is used by the +# MonitoringPluginScript class +from pvcnoded.objects.MonitoringInstance import MonitoringPlugin + + +# A monitoring plugin script must always expose its nice name, which must be identical to +# the file name +PLUGIN_NAME = "load" + + +# The MonitoringPluginScript class must be named as such, and extend MonitoringPlugin. +class MonitoringPluginScript(MonitoringPlugin): + def setup(self): + """ + setup(): Perform special setup steps during node daemon startup + + This step is optional and should be used sparingly. + """ + + pass + + def run(self): + """ + run(): Perform the check actions and return a PluginResult object + """ + + # Run any imports first + from os import getloadavg + from psutil import cpu_count + + # Get the current 1-minute system load average + load_average = getloadavg()[0] + + # Get the number of CPU cores + cpu_cores = cpu_count() + + # Check if the load average is greater than the number of CPU cores + if load_average > float(cpu_cores): + # Set the health delta to 10 (subtract 10 from the total of 100) + health_delta = 10 + # Craft a message that can be used by the clients + message = f"Current load is {load_average} out of {cpu_cores} CPU cores" + + else: + # Set the health delta to 0 (no change) + health_delta = 0 + # Craft a message that can be used by the clients + message = f"Current load is {load_average} out of {cpu_cores} CPU cores" + + # Set the health delta in our local PluginResult object + self.plugin_result.set_health_delta(health_delta) + + # Set the message in our local PluginResult object + self.plugin_result.set_message(message) + + # Return our local PluginResult object + return self.plugin_result + + def cleanup(self): + """ + cleanup(): Perform special cleanup steps during node daemon termination + + This step is optional and should be used sparingly.
+ """ + + pass diff --git a/node-daemon/pvcnoded.sample.yaml b/node-daemon/pvcnoded.sample.yaml index de22a69d..ee36ee50 100644 --- a/node-daemon/pvcnoded.sample.yaml +++ b/node-daemon/pvcnoded.sample.yaml @@ -128,6 +128,8 @@ pvc: configuration: # directories: PVC system directories directories: + # plugin_directory: Directory containing node monitoring plugins + plugin_directory: "/usr/share/pvc/plugins" # dynamic_directory: Temporary in-memory directory for active configurations dynamic_directory: "/run/pvc" # log_directory: Logging directory diff --git a/node-daemon/pvcnoded/Daemon.py b/node-daemon/pvcnoded/Daemon.py index 46b89afa..a7974237 100644 --- a/node-daemon/pvcnoded/Daemon.py +++ b/node-daemon/pvcnoded/Daemon.py @@ -27,6 +27,7 @@ import pvcnoded.util.services import pvcnoded.util.libvirt import pvcnoded.util.zookeeper +import pvcnoded.objects.MonitoringInstance as MonitoringInstance import pvcnoded.objects.DNSAggregatorInstance as DNSAggregatorInstance import pvcnoded.objects.MetadataAPIInstance as MetadataAPIInstance import pvcnoded.objects.VMInstance as VMInstance @@ -58,6 +59,7 @@ version = "0.9.61" def entrypoint(): keepalive_timer = None + monitoring_instance = None # Get our configuration config = pvcnoded.util.config.get_configuration() @@ -204,7 +206,7 @@ def entrypoint(): # Define a cleanup function def cleanup(failure=False): - nonlocal logger, zkhandler, keepalive_timer, d_domain + nonlocal logger, zkhandler, keepalive_timer, d_domain, monitoring_instance logger.out("Terminating pvcnoded and cleaning up", state="s") @@ -253,6 +255,13 @@ def entrypoint(): except Exception: pass + # Clean up any monitoring plugins that have cleanup + try: + logger.out("Performing monitoring plugin cleanup", state="s") + monitoring_instance.run_cleanups() + except Exception: + pass + # Set stop state in Zookeeper zkhandler.write([(("node.state.daemon", config["node_hostname"]), "stop")]) @@ -1015,9 +1024,14 @@ def entrypoint(): state="i", ) + # Set up the node monitoring instance + monitoring_instance = MonitoringInstance.MonitoringInstance( + zkhandler, config, logger, this_node + ) + # Start keepalived thread keepalive_timer = pvcnoded.util.keepalive.start_keepalive_timer( - logger, config, zkhandler, this_node + logger, config, zkhandler, this_node, monitoring_instance ) # Tick loop; does nothing since everything is async diff --git a/node-daemon/pvcnoded/objects/MonitoringInstance.py b/node-daemon/pvcnoded/objects/MonitoringInstance.py new file mode 100644 index 00000000..7b79e1fc --- /dev/null +++ b/node-daemon/pvcnoded/objects/MonitoringInstance.py @@ -0,0 +1,357 @@ +#!/usr/bin/env python3 + +# PluginInstance.py - Class implementing a PVC monitoring instance +# Part of the Parallel Virtual Cluster (PVC) system +# +# Copyright (C) 2018-2022 Joshua M. Boniface +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, version 3. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . 
+# +############################################################################### + +import concurrent.futures +import time +import importlib.util + +from os import walk +from datetime import datetime + + +class PluginResult(object): + def __init__(self, zkhandler, config, logger, this_node, plugin_name): + self.zkhandler = zkhandler + self.config = config + self.logger = logger + self.this_node = this_node + self.plugin_name = plugin_name + self.current_time = int(time.time()) + self.health_delta = 0 + self.message = None + self.data = None + self.runtime = "0.00" + + def set_health_delta(self, new_delta): + self.health_delta = new_delta + + def set_message(self, new_message): + self.message = new_message + + def set_data(self, new_data): + self.data = new_data + + def set_runtime(self, new_runtime): + self.runtime = new_runtime + + def to_zookeeper(self): + self.zkhandler.write( + [ + ( + ( + "node.monitoring.data", + self.this_node.name, + "monitoring_plugin.name", + self.plugin_name, + ), + self.plugin_name, + ), + ( + ( + "node.monitoring.data", + self.this_node.name, + "monitoring_plugin.last_run", + self.plugin_name, + ), + self.current_time, + ), + ( + ( + "node.monitoring.data", + self.this_node.name, + "monitoring_plugin.health_delta", + self.plugin_name, + ), + self.health_delta, + ), + ( + ( + "node.monitoring.data", + self.this_node.name, + "monitoring_plugin.message", + self.plugin_name, + ), + self.message, + ), + ( + ( + "node.monitoring.data", + self.this_node.name, + "monitoring_plugin.data", + self.plugin_name, + ), + self.data, + ), + ( + ( + "node.monitoring.data", + self.this_node.name, + "monitoring_plugin.runtime", + self.plugin_name, + ), + self.runtime, + ), + ] + ) + + +class MonitoringPlugin(object): + def __init__(self, zkhandler, config, logger, this_node, plugin_name): + self.zkhandler = zkhandler + self.config = config + self.logger = logger + self.this_node = this_node + self.plugin_name = plugin_name + + self.plugin_result = PluginResult( + self.zkhandler, + self.config, + self.logger, + self.this_node, + self.plugin_name, + ) + + # + # Helper functions; exposed to child MonitoringPluginScript instances + # + def log(self, message, state="d"): + """ + Log a message to the PVC logger instance using the plugin name as a prefix + Takes "state" values as defined by the PVC logger instance, defaulting to debug: + "d": debug + "i": informational + "t": tick/keepalive + "w": warning + "e": error + """ + if state == "d" and not self.config["debug"]: + return + + self.logger.out(message, state=state, prefix=self.plugin_name) + + # + # Primary class functions; implemented by the individual plugins + # + def setup(self): + """ + setup(): Perform setup of the plugin; run once during daemon startup + OPTIONAL + """ + pass + + def run(self): + """ + run(): Run the plugin, returning a PluginResult object + """ + return self.plugin_result + + def cleanup(self): + """ + cleanup(): Clean up after the plugin; run once during daemon shutdown + OPTIONAL + """ + pass + + +class MonitoringInstance(object): + def __init__(self, zkhandler, config, logger, this_node): + self.zkhandler = zkhandler + self.config = config + self.logger = logger + self.this_node = this_node + + # Get a list of plugins from the plugin_directory + plugin_files = next(walk(self.config["plugin_directory"]), (None, None, []))[ + 2 + ] # [] if no file + + self.all_plugins = list() + self.all_plugin_names = list() + + # Load each plugin file into the all_plugins list + for plugin_file in 
sorted(plugin_files): + try: + self.logger.out( + f"Loading monitoring plugin from {self.config['plugin_directory']}/{plugin_file}", + state="i", + ) + loader = importlib.machinery.SourceFileLoader( + "plugin_script", f"{self.config['plugin_directory']}/{plugin_file}" + ) + spec = importlib.util.spec_from_loader(loader.name, loader) + plugin_script = importlib.util.module_from_spec(spec) + spec.loader.exec_module(plugin_script) + + plugin = plugin_script.MonitoringPluginScript( + self.zkhandler, + self.config, + self.logger, + self.this_node, + plugin_script.PLUGIN_NAME, + ) + self.all_plugins.append(plugin) + self.all_plugin_names.append(plugin.plugin_name) + + # Create plugin key + self.zkhandler.write( + [ + ( + ( + "node.monitoring.data", + self.this_node.name, + "monitoring_plugin.name", + plugin.plugin_name, + ), + plugin.plugin_name, + ), + ( + ( + "node.monitoring.data", + self.this_node.name, + "monitoring_plugin.last_run", + plugin.plugin_name, + ), + "0", + ), + ( + ( + "node.monitoring.data", + self.this_node.name, + "monitoring_plugin.health_delta", + plugin.plugin_name, + ), + "0", + ), + ( + ( + "node.monitoring.data", + self.this_node.name, + "monitoring_plugin.message", + plugin.plugin_name, + ), + "Initializing", + ), + ( + ( + "node.monitoring.data", + self.this_node.name, + "monitoring_plugin.data", + plugin.plugin_name, + ), + None, + ), + ( + ( + "node.monitoring.data", + self.this_node.name, + "monitoring_plugin.runtime", + plugin.plugin_name, + ), + "0.00", + ), + ] + ) + self.logger.out( + f"Successfully loaded monitoring plugin '{plugin.plugin_name}'", + state="o", + ) + except Exception as e: + self.logger.out( + f"Failed to load monitoring plugin: {e}", + state="w", + ) + + self.zkhandler.write( + [ + ( + ("node.monitoring.plugins", self.this_node.name), + self.all_plugin_names, + ), + ] + ) + + # Clean up any old plugin data for which a plugin file no longer exists + for plugin_key in self.zkhandler.children( + ("node.monitoring.data", self.this_node.name) + ): + if plugin_key not in self.all_plugin_names: + self.zkhandler.delete( + ( + "node.monitoring.data", + self.this_node.name, + "monitoring_plugin", + plugin_key, + ) + ) + + def run_plugin(self, plugin): + time_start = datetime.now() + result = plugin.run() + time_end = datetime.now() + time_delta = time_end - time_start + runtime = "{:0.02f}".format(time_delta.total_seconds()) + result.set_runtime(runtime) + self.logger.out( + result.message, state="t", prefix=f"{plugin.plugin_name} ({runtime}s)" + ) + result.to_zookeeper() + return result + + def run_plugins(self): + total_health = 100 + self.logger.out("Running monitoring plugins:", state="t") + plugin_results = list() + with concurrent.futures.ThreadPoolExecutor(max_workers=99) as executor: + to_future_plugin_results = { + executor.submit(self.run_plugin, plugin): plugin + for plugin in self.all_plugins + } + for future in concurrent.futures.as_completed(to_future_plugin_results): + plugin_results.append(future.result()) + + for result in plugin_results: + if result is not None: + total_health -= result.health_delta + + if total_health > 90: + health_colour = self.logger.fmt_green + elif total_health > 50: + health_colour = self.logger.fmt_yellow + else: + health_colour = self.logger.fmt_red + + self.logger.out( + f"System health: {health_colour}{total_health}/100{self.logger.fmt_end}", + state="t", + ) + + def run_cleanup(self, plugin): + return plugin.cleanup() + + def run_cleanups(self): + with concurrent.futures.ThreadPoolExecutor(max_workers=99) 
as executor: + to_future_plugin_results = { + executor.submit(self.run_cleanup, plugin): plugin + for plugin in self.all_plugins + } + for future in concurrent.futures.as_completed(to_future_plugin_results): + # This doesn't do anything, just lets us wait for them all to complete + pass diff --git a/node-daemon/pvcnoded/util/config.py b/node-daemon/pvcnoded/util/config.py index d4dd24c7..29543407 100644 --- a/node-daemon/pvcnoded/util/config.py +++ b/node-daemon/pvcnoded/util/config.py @@ -180,6 +180,9 @@ def get_configuration(): raise MalformedConfigurationError(e) config_directories = { + "plugin_directory": o_directories.get( + "plugin_directory", "/usr/share/pvc/plugins" + ), "dynamic_directory": o_directories.get("dynamic_directory", None), "log_directory": o_directories.get("log_directory", None), "console_log_directory": o_directories.get("console_log_directory", None), diff --git a/node-daemon/pvcnoded/util/keepalive.py b/node-daemon/pvcnoded/util/keepalive.py index 2fb07284..22f85dc1 100644 --- a/node-daemon/pvcnoded/util/keepalive.py +++ b/node-daemon/pvcnoded/util/keepalive.py @@ -51,7 +51,7 @@ libvirt_vm_states = { } -def start_keepalive_timer(logger, config, zkhandler, this_node): +def start_keepalive_timer(logger, config, zkhandler, this_node, monitoring_instance): keepalive_interval = config["keepalive_interval"] logger.out( f"Starting keepalive timer ({keepalive_interval} second interval)", state="s" @@ -59,7 +59,7 @@ def start_keepalive_timer(logger, config, zkhandler, this_node): keepalive_timer = BackgroundScheduler() keepalive_timer.add_job( node_keepalive, - args=(logger, config, zkhandler, this_node), + args=(logger, config, zkhandler, this_node, monitoring_instance), trigger="interval", seconds=keepalive_interval, ) @@ -648,7 +648,7 @@ def collect_vm_stats(logger, config, zkhandler, this_node, queue): # Keepalive update function -def node_keepalive(logger, config, zkhandler, this_node): +def node_keepalive(logger, config, zkhandler, this_node, monitoring_instance): debug = config["debug"] if debug: logger.out("Keepalive starting", state="d", prefix="main-thread") @@ -918,5 +918,7 @@ def node_keepalive(logger, config, zkhandler, this_node): [(("node.state.daemon", node_name), "dead")] ) + monitoring_instance.run_plugins() + if debug: logger.out("Keepalive finished", state="d", prefix="main-thread") From 25f3faa08f395067f8b4eb7b3af00cd95193831c Mon Sep 17 00:00:00 2001 From: "Joshua M. Boniface" Date: Mon, 13 Feb 2023 12:13:56 -0500 Subject: [PATCH 02/55] Move Ceph cluster health reporting to plugin Also removes several outputs from the normal keepalive that were superfluous/static so that the main output fits on one line. --- node-daemon/plugins/ceph | 126 +++++++++++++++++++++++++ node-daemon/pvcnoded/util/keepalive.py | 57 ++--------- 2 files changed, 133 insertions(+), 50 deletions(-) create mode 100644 node-daemon/plugins/ceph diff --git a/node-daemon/plugins/ceph b/node-daemon/plugins/ceph new file mode 100644 index 00000000..dc0bf8e5 --- /dev/null +++ b/node-daemon/plugins/ceph @@ -0,0 +1,126 @@ +#!/usr/bin/env python3 + +# ceph.py - PVC Monitoring example plugin for ceph status +# Part of the Parallel Virtual Cluster (PVC) system +# +# Copyright (C) 2018-2022 Joshua M. Boniface +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, version 3. 
+# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . +# +############################################################################### + +# This script provides an example of a PVC monitoring plugin script. It will create +# a simple plugin to check the Ceph cluster health for anomalies, and return a health +# delta reflective of the overall Ceph status (HEALTH_WARN = 10, HEALTH_ERR = 50). + +# This script can thus be used as an example or reference implementation of a +# PVC monitoring plugin script and expanded upon as required. + +# A monitoring plugin script must implement the class "MonitoringPluginScript" which +# extends "MonitoringPlugin", providing the 3 functions indicated. Detailed explanation +# of the role of each function is provided in context of the example; see the other +# examples for more potential uses. + +# WARNING: +# +# This script will run in the context of the node daemon keepalives as root. +# DO NOT install untrusted, unvetted plugins under any circumstances. + + +# This import is always required here, as MonitoringPlugin is used by the +# MonitoringPluginScript class
from pvcnoded.objects.MonitoringInstance import MonitoringPlugin + + +# A monitoring plugin script must always expose its nice name, which must be identical to +# the file name +PLUGIN_NAME = "ceph" + + +# The MonitoringPluginScript class must be named as such, and extend MonitoringPlugin. +class MonitoringPluginScript(MonitoringPlugin): + def setup(self): + """ + setup(): Perform special setup steps during node daemon startup + + This step is optional and should be used sparingly.
+ """ + + pass + + def run(self): + """ + run(): Perform the check actions and return a PluginResult object + """ + + # Run any imports first + from rados import Rados + from json import loads, dumps + + # Connect to the Ceph cluster + try: + ceph_conn = Rados( + conffile=self.config["ceph_config_file"], + conf=dict(keyring=self.config["ceph_admin_keyring"]), + ) + ceph_conn.connect(timeout=1) + except Exception as e: + self.log(f"Failed to connect to Ceph cluster: {e}", state="e") + return self.plugin_result + + # Get the Ceph cluster health + try: + health_status = loads( + ceph_conn.mon_command(dumps({"prefix": "health", "format": "json"}), b"", timeout=1)[1] + ) + ceph_health = health_status["status"] + except Exception as e: + self.log(f"Failed to get health data from Ceph cluster: {e}", state="e") + return self.plugin_result + finally: + ceph_conn.shutdown() + + # Get a list of error entries in the health status output + error_entries = health_status["checks"].keys() + + # Set the health delta based on the errors presented + if ceph_health == "HEALTH_ERR": + health_delta = 50 + message = f"Ceph cluster in ERROR state: {', '.join(error_entries)}" + elif ceph_health == "HEALTH_WARN": + health_delta = 10 + message = f"Ceph cluster in WARNING state: {', '.join(error_entries)}" + else: + health_delta = 0 + message = "Ceph cluster in OK state" + + # Set the health delta in our local PluginResult object + self.plugin_result.set_health_delta(health_delta) + + # Set the message in our local PluginResult object + self.plugin_result.set_message(message) + + # Set the detailed data in our local PluginResult object + self.plugin_result.set_data(dumps(health_status)) + + # Return our local PluginResult object + return self.plugin_result + + def cleanup(self): + """ + cleanup(): Perform special cleanup steps during node daemon termination + + This step is optional and should be used sparingly. 
+ """ + + pass diff --git a/node-daemon/pvcnoded/util/keepalive.py b/node-daemon/pvcnoded/util/keepalive.py index 22f85dc1..dc6e6a33 100644 --- a/node-daemon/pvcnoded/util/keepalive.py +++ b/node-daemon/pvcnoded/util/keepalive.py @@ -97,29 +97,6 @@ def collect_ceph_stats(logger, config, zkhandler, this_node, queue): logger.out("Failed to open connection to Ceph cluster: {}".format(e), state="e") return - if debug: - logger.out("Getting health stats from monitor", state="d", prefix="ceph-thread") - - # Get Ceph cluster health for local status output - command = {"prefix": "health", "format": "json"} - try: - health_status = json.loads( - ceph_conn.mon_command(json.dumps(command), b"", timeout=1)[1] - ) - ceph_health = health_status["status"] - except Exception as e: - logger.out("Failed to obtain Ceph health data: {}".format(e), state="e") - ceph_health = "HEALTH_UNKN" - - if ceph_health in ["HEALTH_OK"]: - ceph_health_colour = logger.fmt_green - elif ceph_health in ["HEALTH_UNKN"]: - ceph_health_colour = logger.fmt_cyan - elif ceph_health in ["HEALTH_WARN"]: - ceph_health_colour = logger.fmt_yellow - else: - ceph_health_colour = logger.fmt_red - # Primary-only functions if this_node.router_state == "primary": if debug: @@ -408,8 +385,6 @@ def collect_ceph_stats(logger, config, zkhandler, this_node, queue): ceph_conn.shutdown() - queue.put(ceph_health_colour) - queue.put(ceph_health) queue.put(osds_this_node) if debug: @@ -777,16 +752,14 @@ def node_keepalive(logger, config, zkhandler, this_node, monitoring_instance): if config["enable_storage"]: try: - ceph_health_colour = ceph_thread_queue.get( - timeout=config["keepalive_interval"] + osds_this_node = ceph_thread_queue.get( + timeout=(config["keepalive_interval"] - 1) ) - ceph_health = ceph_thread_queue.get(timeout=config["keepalive_interval"]) - osds_this_node = ceph_thread_queue.get(timeout=config["keepalive_interval"]) except Exception: logger.out("Ceph stats queue get exceeded timeout, continuing", state="w") - ceph_health_colour = logger.fmt_cyan - ceph_health = "UNKNOWN" osds_this_node = "?" 
+ else: + osds_this_node = "0" # Set our information in zookeeper keepalive_time = int(time.time()) @@ -839,8 +812,8 @@ def node_keepalive(logger, config, zkhandler, this_node, monitoring_instance): if config["log_keepalive_cluster_details"]: logger.out( "{bold}Maintenance:{nofmt} {maint} " - "{bold}Active VMs:{nofmt} {domcount} " - "{bold}Networks:{nofmt} {netcount} " + "{bold}Node VMs:{nofmt} {domcount} " + "{bold}Node OSDs:{nofmt} {osdcount} " "{bold}Load:{nofmt} {load} " "{bold}Memory [MiB]: VMs:{nofmt} {allocmem} " "{bold}Used:{nofmt} {usedmem} " @@ -849,7 +822,7 @@ def node_keepalive(logger, config, zkhandler, this_node, monitoring_instance): nofmt=logger.fmt_end, maint=this_node.maintenance, domcount=this_node.domains_count, - netcount=len(zkhandler.children("base.network")), + osdcount=osds_this_node, load=this_node.cpuload, freemem=this_node.memfree, usedmem=this_node.memused, @@ -857,22 +830,6 @@ def node_keepalive(logger, config, zkhandler, this_node, monitoring_instance): ), state="t", ) - if config["enable_storage"] and config["log_keepalive_storage_details"]: - logger.out( - "{bold}Ceph cluster status:{nofmt} {health_colour}{health}{nofmt} " - "{bold}Total OSDs:{nofmt} {total_osds} " - "{bold}Node OSDs:{nofmt} {node_osds} " - "{bold}Pools:{nofmt} {total_pools} ".format( - bold=logger.fmt_bold, - health_colour=ceph_health_colour, - nofmt=logger.fmt_end, - health=ceph_health, - total_osds=len(zkhandler.children("base.osd")), - node_osds=osds_this_node, - total_pools=len(zkhandler.children("base.pool")), - ), - state="t", - ) # Look for dead nodes and fence them if not this_node.maintenance: From 4d786c11e375a7059eaf91328104039cbb7f6f4a Mon Sep 17 00:00:00 2001 From: "Joshua M. Boniface" Date: Mon, 13 Feb 2023 12:13:56 -0500 Subject: [PATCH 03/55] Move Ceph cluster health reporting to plugin Also removes several outputs from the normal keepalive that were superfluous/static so that the main output fits on one line. --- node-daemon/plugins/ceph-cluster | 126 ++++++++++++++++++ .../pvcnoded/objects/MonitoringInstance.py | 18 ++- 2 files changed, 139 insertions(+), 5 deletions(-) create mode 100644 node-daemon/plugins/ceph-cluster diff --git a/node-daemon/plugins/ceph-cluster b/node-daemon/plugins/ceph-cluster new file mode 100644 index 00000000..48788925 --- /dev/null +++ b/node-daemon/plugins/ceph-cluster @@ -0,0 +1,126 @@ +#!/usr/bin/env python3 + +# ceph-cluster.py - PVC Monitoring example plugin for Ceph status +# Part of the Parallel Virtual Cluster (PVC) system +# +# Copyright (C) 2018-2022 Joshua M. Boniface +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, version 3. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . +# +############################################################################### + +# This script provides an example of a PVC monitoring plugin script. It will create +# a simple plugin to check the Ceph cluster health for anomalies, and return a health +# delta reflective of the overall Ceph status (HEALTH_WARN = 10, HEALTH_ERR = 50). 
+ +# This script can thus be used as an example or reference implementation of a +# PVC monitoring pluginscript and expanded upon as required. + +# A monitoring plugin script must implement the class "MonitoringPluginScript" which +# extends "MonitoringPlugin", providing the 3 functions indicated. Detailed explanation +# of the role of each function is provided in context of the example; see the other +# examples for more potential uses. + +# WARNING: +# +# This script will run in the context of the node daemon keepalives as root. +# DO NOT install untrusted, unvetted plugins under any circumstances. + + +# This import is always required here, as MonitoringPlugin is used by the +# MonitoringPluginScript class +from pvcnoded.objects.MonitoringInstance import MonitoringPlugin + + +# A monitoring plugin script must always expose its nice name, which must be identical to +# the file name +PLUGIN_NAME = "ceph-cluster" + + +# The MonitoringPluginScript class must be named as such, and extend MonitoringPlugin. +class MonitoringPluginScript(MonitoringPlugin): + def setup(self): + """ + setup(): Perform special setup steps during node daemon startup + + This step is optional and should be used sparingly. + """ + + pass + + def run(self): + """ + run(): Perform the check actions and return a PluginResult object + """ + + # Run any imports first + from rados import Rados + from json import loads, dumps + + # Connect to the Ceph cluster + try: + ceph_conn = Rados( + conffile=self.config["ceph_config_file"], + conf=dict(keyring=self.config["ceph_admin_keyring"]), + ) + ceph_conn.connect(timeout=1) + except Exception as e: + self.log(f"Failed to connect to Ceph cluster: {e}", state="e") + return self.plugin_result + + # Get the Ceph cluster health + try: + health_status = loads( + ceph_conn.mon_command(dumps({"prefix": "health", "format": "json"}), b"", timeout=1)[1] + ) + ceph_health = health_status["status"] + except Exception as e: + self.log(f"Failed to get health data from Ceph cluster: {e}", state="e") + return self.plugin_result + finally: + ceph_conn.shutdown() + + # Get a list of error entries in the health status output + error_entries = health_status["checks"].keys() + + # Set the health delta based on the errors presented + if ceph_health == "HEALTH_ERR": + health_delta = 50 + message = f"Ceph cluster in ERROR state: {', '.join(error_entries)}" + elif ceph_health == "HEALTH_WARN": + health_delta = 10 + message = f"Ceph cluster in WARNING state: {', '.join(error_entries)}" + else: + health_delta = 0 + message = "Ceph cluster in OK state" + + # Set the health delta in our local PluginResult object + self.plugin_result.set_health_delta(health_delta) + + # Set the message in our local PluginResult object + self.plugin_result.set_message(message) + + # Set the detailed data in our local PluginResult object + self.plugin_result.set_data(dumps(health_status)) + + # Return our local PluginResult object + return self.plugin_result + + def cleanup(self): + """ + cleanup(): Perform special cleanup steps during node daemon termination + + This step is optional and should be used sparingly. 
+ """ + + pass diff --git a/node-daemon/pvcnoded/objects/MonitoringInstance.py b/node-daemon/pvcnoded/objects/MonitoringInstance.py index 7b79e1fc..c4d0f3f2 100644 --- a/node-daemon/pvcnoded/objects/MonitoringInstance.py +++ b/node-daemon/pvcnoded/objects/MonitoringInstance.py @@ -129,6 +129,9 @@ class MonitoringPlugin(object): self.plugin_name, ) + def __str__(self): + return self.plugin_name + # # Helper functions; exposed to child MonitoringPluginScript instances # @@ -309,15 +312,15 @@ class MonitoringInstance(object): time_delta = time_end - time_start runtime = "{:0.02f}".format(time_delta.total_seconds()) result.set_runtime(runtime) - self.logger.out( - result.message, state="t", prefix=f"{plugin.plugin_name} ({runtime}s)" - ) result.to_zookeeper() return result def run_plugins(self): total_health = 100 - self.logger.out("Running monitoring plugins:", state="t") + self.logger.out( + f"Running monitoring plugins: {', '.join([x.plugin_name for x in self.all_plugins])}", + state="t", + ) plugin_results = list() with concurrent.futures.ThreadPoolExecutor(max_workers=99) as executor: to_future_plugin_results = { @@ -327,7 +330,12 @@ class MonitoringInstance(object): for future in concurrent.futures.as_completed(to_future_plugin_results): plugin_results.append(future.result()) - for result in plugin_results: + for result in sorted(plugin_results, key=lambda x: x.plugin_name): + self.logger.out( + result.message, + state="t", + prefix=f"{result.plugin_name} ({result.runtime}s)", + ) if result is not None: total_health -= result.health_delta From edb3aea99060aad6c92b92177cb86cd665f30aee Mon Sep 17 00:00:00 2001 From: "Joshua M. Boniface" Date: Mon, 13 Feb 2023 14:37:44 -0500 Subject: [PATCH 04/55] Add node health value and send out API --- client-cli/pvc/cli_lib/node.py | 77 ++++++++++- daemon-common/migrations/versions/9.json | 2 +- daemon-common/node.py | 38 +++++- daemon-common/zkhandler.py | 1 + node-daemon/plugins/ceph | 4 +- node-daemon/plugins/ceph-cluster | 126 ------------------ node-daemon/plugins/dpkg | 3 +- .../pvcnoded/objects/MonitoringInstance.py | 17 ++- 8 files changed, 126 insertions(+), 142 deletions(-) delete mode 100644 node-daemon/plugins/ceph-cluster diff --git a/client-cli/pvc/cli_lib/node.py b/client-cli/pvc/cli_lib/node.py index 26f935ab..22325d20 100644 --- a/client-cli/pvc/cli_lib/node.py +++ b/client-cli/pvc/cli_lib/node.py @@ -215,6 +215,16 @@ def node_list( # Output display functions # def getOutputColours(node_information): + node_health = node_information.get("health", 999) + if node_health <= 50: + health_colour = ansiprint.red() + elif node_health <= 90: + health_colour = ansiprint.yellow() + elif node_health <= 100: + health_colour = ansiprint.green() + else: + health_colour = ansiprint.blue() + if node_information["daemon_state"] == "run": daemon_state_colour = ansiprint.green() elif node_information["daemon_state"] == "stop": @@ -251,6 +261,7 @@ def getOutputColours(node_information): mem_provisioned_colour = "" return ( + health_colour, daemon_state_colour, coordinator_state_colour, domain_state_colour, @@ -261,6 +272,7 @@ def getOutputColours(node_information): def format_info(node_information, long_output): ( + health_colour, daemon_state_colour, coordinator_state_colour, domain_state_colour, @@ -273,14 +285,34 @@ def format_info(node_information, long_output): # Basic information ainformation.append( "{}Name:{} {}".format( - ansiprint.purple(), ansiprint.end(), node_information["name"] + ansiprint.purple(), + ansiprint.end(), + 
node_information["name"], ) ) ainformation.append( "{}PVC Version:{} {}".format( - ansiprint.purple(), ansiprint.end(), node_information["pvc_version"] + ansiprint.purple(), + ansiprint.end(), + node_information["pvc_version"], ) ) + + node_health = node_information.get("health", "N/A") + if isinstance(node_health, int): + node_health_text = f"{node_health}%" + else: + node_health_text = node_health + ainformation.append( + "{}Health Value:{} {}{}{}".format( + ansiprint.purple(), + ansiprint.end(), + health_colour, + node_health_text, + ansiprint.end(), + ) + ) + ainformation.append( "{}Daemon State:{} {}{}{}".format( ansiprint.purple(), @@ -397,6 +429,7 @@ def format_list(node_list, raw): # Determine optimal column widths node_name_length = 5 pvc_version_length = 8 + health_length = 7 daemon_state_length = 7 coordinator_state_length = 12 domain_state_length = 7 @@ -417,6 +450,15 @@ def format_list(node_list, raw): _pvc_version_length = len(node_information.get("pvc_version", "N/A")) + 1 if _pvc_version_length > pvc_version_length: pvc_version_length = _pvc_version_length + # node_health column + node_health = node_information.get("health", "N/A") + if isinstance(node_health, int): + node_health_text = f"{node_health}%" + else: + node_health_text = node_health + _health_length = len(node_health_text) + 1 + if _health_length > health_length: + health_length = _health_length # daemon_state column _daemon_state_length = len(node_information["daemon_state"]) + 1 if _daemon_state_length > daemon_state_length: @@ -466,7 +508,10 @@ def format_list(node_list, raw): # Format the string (header) node_list_output.append( "{bold}{node_header: <{node_header_length}} {state_header: <{state_header_length}} {resource_header: <{resource_header_length}} {memory_header: <{memory_header_length}}{end_bold}".format( - node_header_length=node_name_length + pvc_version_length + 1, + node_header_length=node_name_length + + pvc_version_length + + health_length + + 2, state_header_length=daemon_state_length + coordinator_state_length + domain_state_length @@ -484,7 +529,14 @@ def format_list(node_list, raw): bold=ansiprint.bold(), end_bold=ansiprint.end(), node_header="Nodes " - + "".join(["-" for _ in range(6, node_name_length + pvc_version_length)]), + + "".join( + [ + "-" + for _ in range( + 6, node_name_length + pvc_version_length + health_length + 1 + ) + ] + ), state_header="States " + "".join( [ @@ -526,12 +578,13 @@ def format_list(node_list, raw): ) node_list_output.append( - "{bold}{node_name: <{node_name_length}} {node_pvc_version: <{pvc_version_length}} \ + "{bold}{node_name: <{node_name_length}} {node_pvc_version: <{pvc_version_length}} {node_health: <{health_length}} \ {daemon_state_colour}{node_daemon_state: <{daemon_state_length}}{end_colour} {coordinator_state_colour}{node_coordinator_state: <{coordinator_state_length}}{end_colour} {domain_state_colour}{node_domain_state: <{domain_state_length}}{end_colour} \ {node_domains_count: <{domains_count_length}} {node_cpu_count: <{cpu_count_length}} {node_load: <{load_length}} \ {node_mem_total: <{mem_total_length}} {node_mem_used: <{mem_used_length}} {node_mem_free: <{mem_free_length}} {node_mem_allocated: <{mem_alloc_length}} {node_mem_provisioned: <{mem_prov_length}}{end_bold}".format( node_name_length=node_name_length, pvc_version_length=pvc_version_length, + health_length=health_length, daemon_state_length=daemon_state_length, coordinator_state_length=coordinator_state_length, domain_state_length=domain_state_length, @@ -551,6 +604,7 @@ def 
format_list(node_list, raw): end_colour="", node_name="Name", node_pvc_version="Version", + node_health="Health", node_daemon_state="Daemon", node_coordinator_state="Coordinator", node_domain_state="Domain", @@ -568,19 +622,28 @@ def format_list(node_list, raw): # Format the string (elements) for node_information in sorted(node_list, key=lambda n: n["name"]): ( + health_colour, daemon_state_colour, coordinator_state_colour, domain_state_colour, mem_allocated_colour, mem_provisioned_colour, ) = getOutputColours(node_information) + + node_health = node_information.get("health", "N/A") + if isinstance(node_health, int): + node_health_text = f"{node_health}%" + else: + node_health_text = node_health + node_list_output.append( - "{bold}{node_name: <{node_name_length}} {node_pvc_version: <{pvc_version_length}} \ + "{bold}{node_name: <{node_name_length}} {node_pvc_version: <{pvc_version_length}} {health_colour}{node_health: <{health_length}}{end_colour} \ {daemon_state_colour}{node_daemon_state: <{daemon_state_length}}{end_colour} {coordinator_state_colour}{node_coordinator_state: <{coordinator_state_length}}{end_colour} {domain_state_colour}{node_domain_state: <{domain_state_length}}{end_colour} \ {node_domains_count: <{domains_count_length}} {node_cpu_count: <{cpu_count_length}} {node_load: <{load_length}} \ {node_mem_total: <{mem_total_length}} {node_mem_used: <{mem_used_length}} {node_mem_free: <{mem_free_length}} {mem_allocated_colour}{node_mem_allocated: <{mem_alloc_length}}{end_colour} {mem_provisioned_colour}{node_mem_provisioned: <{mem_prov_length}}{end_colour}{end_bold}".format( node_name_length=node_name_length, pvc_version_length=pvc_version_length, + health_length=health_length, daemon_state_length=daemon_state_length, coordinator_state_length=coordinator_state_length, domain_state_length=domain_state_length, @@ -594,6 +657,7 @@ def format_list(node_list, raw): mem_prov_length=mem_prov_length, bold="", end_bold="", + health_colour=health_colour, daemon_state_colour=daemon_state_colour, coordinator_state_colour=coordinator_state_colour, domain_state_colour=domain_state_colour, @@ -602,6 +666,7 @@ def format_list(node_list, raw): end_colour=ansiprint.end(), node_name=node_information["name"], node_pvc_version=node_information.get("pvc_version", "N/A"), + node_health=node_health_text, node_daemon_state=node_information["daemon_state"], node_coordinator_state=node_information["coordinator_state"], node_domain_state=node_information["domain_state"], diff --git a/daemon-common/migrations/versions/9.json b/daemon-common/migrations/versions/9.json index 84ea8ac1..0401b70f 100644 --- a/daemon-common/migrations/versions/9.json +++ b/daemon-common/migrations/versions/9.json @@ -1 +1 @@ -{"version": "9", "root": "", "base": {"root": "", "schema": "/schema", "schema.version": "/schema/version", "config": "/config", "config.maintenance": "/config/maintenance", "config.primary_node": "/config/primary_node", "config.primary_node.sync_lock": "/config/primary_node/sync_lock", "config.upstream_ip": "/config/upstream_ip", "config.migration_target_selector": "/config/migration_target_selector", "cmd": "/cmd", "cmd.node": "/cmd/nodes", "cmd.domain": "/cmd/domains", "cmd.ceph": "/cmd/ceph", "logs": "/logs", "node": "/nodes", "domain": "/domains", "network": "/networks", "storage": "/ceph", "storage.util": "/ceph/util", "osd": "/ceph/osds", "pool": "/ceph/pools", "volume": "/ceph/volumes", "snapshot": "/ceph/snapshots"}, "logs": {"node": "", "messages": "/messages"}, "node": {"name": "", "keepalive": 
"/keepalive", "mode": "/daemonmode", "data.active_schema": "/activeschema", "data.latest_schema": "/latestschema", "data.static": "/staticdata", "data.pvc_version": "/pvcversion", "running_domains": "/runningdomains", "count.provisioned_domains": "/domainscount", "count.networks": "/networkscount", "state.daemon": "/daemonstate", "state.router": "/routerstate", "state.domain": "/domainstate", "cpu.load": "/cpuload", "vcpu.allocated": "/vcpualloc", "memory.total": "/memtotal", "memory.used": "/memused", "memory.free": "/memfree", "memory.allocated": "/memalloc", "memory.provisioned": "/memprov", "ipmi.hostname": "/ipmihostname", "ipmi.username": "/ipmiusername", "ipmi.password": "/ipmipassword", "sriov": "/sriov", "sriov.pf": "/sriov/pf", "sriov.vf": "/sriov/vf", "monitoring.plugins": "/monitoring_plugins", "monitoring.data": "/monitoring_data"}, "monitoring_plugin": {"name": "", "last_run": "/last_run", "health_delta": "/health_delta", "message": "/message", "data": "/data", "runtime": "/runtime"}, "sriov_pf": {"phy": "", "mtu": "/mtu", "vfcount": "/vfcount"}, "sriov_vf": {"phy": "", "pf": "/pf", "mtu": "/mtu", "mac": "/mac", "phy_mac": "/phy_mac", "config": "/config", "config.vlan_id": "/config/vlan_id", "config.vlan_qos": "/config/vlan_qos", "config.tx_rate_min": "/config/tx_rate_min", "config.tx_rate_max": "/config/tx_rate_max", "config.spoof_check": "/config/spoof_check", "config.link_state": "/config/link_state", "config.trust": "/config/trust", "config.query_rss": "/config/query_rss", "pci": "/pci", "pci.domain": "/pci/domain", "pci.bus": "/pci/bus", "pci.slot": "/pci/slot", "pci.function": "/pci/function", "used": "/used", "used_by": "/used_by"}, "domain": {"name": "", "xml": "/xml", "state": "/state", "profile": "/profile", "stats": "/stats", "node": "/node", "last_node": "/lastnode", "failed_reason": "/failedreason", "storage.volumes": "/rbdlist", "console.log": "/consolelog", "console.vnc": "/vnc", "meta.autostart": "/node_autostart", "meta.migrate_method": "/migration_method", "meta.node_selector": "/node_selector", "meta.node_limit": "/node_limit", "meta.tags": "/tags", "migrate.sync_lock": "/migrate_sync_lock"}, "tag": {"name": "", "type": "/type", "protected": "/protected"}, "network": {"vni": "", "type": "/nettype", "mtu": "/mtu", "rule": "/firewall_rules", "rule.in": "/firewall_rules/in", "rule.out": "/firewall_rules/out", "nameservers": "/name_servers", "domain": "/domain", "reservation": "/dhcp4_reservations", "lease": "/dhcp4_leases", "ip4.gateway": "/ip4_gateway", "ip4.network": "/ip4_network", "ip4.dhcp": "/dhcp4_flag", "ip4.dhcp_start": "/dhcp4_start", "ip4.dhcp_end": "/dhcp4_end", "ip6.gateway": "/ip6_gateway", "ip6.network": "/ip6_network", "ip6.dhcp": "/dhcp6_flag"}, "reservation": {"mac": "", "ip": "/ipaddr", "hostname": "/hostname"}, "lease": {"mac": "", "ip": "/ipaddr", "hostname": "/hostname", "expiry": "/expiry", "client_id": "/clientid"}, "rule": {"description": "", "rule": "/rule", "order": "/order"}, "osd": {"id": "", "node": "/node", "device": "/device", "db_device": "/db_device", "fsid": "/fsid", "ofsid": "/fsid/osd", "cfsid": "/fsid/cluster", "lvm": "/lvm", "vg": "/lvm/vg", "lv": "/lvm/lv", "stats": "/stats"}, "pool": {"name": "", "pgs": "/pgs", "tier": "/tier", "stats": "/stats"}, "volume": {"name": "", "stats": "/stats"}, "snapshot": {"name": "", "stats": "/stats"}} \ No newline at end of file +{"version": "9", "root": "", "base": {"root": "", "schema": "/schema", "schema.version": "/schema/version", "config": "/config", "config.maintenance": 
"/config/maintenance", "config.primary_node": "/config/primary_node", "config.primary_node.sync_lock": "/config/primary_node/sync_lock", "config.upstream_ip": "/config/upstream_ip", "config.migration_target_selector": "/config/migration_target_selector", "cmd": "/cmd", "cmd.node": "/cmd/nodes", "cmd.domain": "/cmd/domains", "cmd.ceph": "/cmd/ceph", "logs": "/logs", "node": "/nodes", "domain": "/domains", "network": "/networks", "storage": "/ceph", "storage.util": "/ceph/util", "osd": "/ceph/osds", "pool": "/ceph/pools", "volume": "/ceph/volumes", "snapshot": "/ceph/snapshots"}, "logs": {"node": "", "messages": "/messages"}, "node": {"name": "", "keepalive": "/keepalive", "mode": "/daemonmode", "data.active_schema": "/activeschema", "data.latest_schema": "/latestschema", "data.static": "/staticdata", "data.pvc_version": "/pvcversion", "running_domains": "/runningdomains", "count.provisioned_domains": "/domainscount", "count.networks": "/networkscount", "state.daemon": "/daemonstate", "state.router": "/routerstate", "state.domain": "/domainstate", "cpu.load": "/cpuload", "vcpu.allocated": "/vcpualloc", "memory.total": "/memtotal", "memory.used": "/memused", "memory.free": "/memfree", "memory.allocated": "/memalloc", "memory.provisioned": "/memprov", "ipmi.hostname": "/ipmihostname", "ipmi.username": "/ipmiusername", "ipmi.password": "/ipmipassword", "sriov": "/sriov", "sriov.pf": "/sriov/pf", "sriov.vf": "/sriov/vf", "monitoring.plugins": "/monitoring_plugins", "monitoring.data": "/monitoring_data", "monitoring.health": "/monitoring_health"}, "monitoring_plugin": {"name": "", "last_run": "/last_run", "health_delta": "/health_delta", "message": "/message", "data": "/data", "runtime": "/runtime"}, "sriov_pf": {"phy": "", "mtu": "/mtu", "vfcount": "/vfcount"}, "sriov_vf": {"phy": "", "pf": "/pf", "mtu": "/mtu", "mac": "/mac", "phy_mac": "/phy_mac", "config": "/config", "config.vlan_id": "/config/vlan_id", "config.vlan_qos": "/config/vlan_qos", "config.tx_rate_min": "/config/tx_rate_min", "config.tx_rate_max": "/config/tx_rate_max", "config.spoof_check": "/config/spoof_check", "config.link_state": "/config/link_state", "config.trust": "/config/trust", "config.query_rss": "/config/query_rss", "pci": "/pci", "pci.domain": "/pci/domain", "pci.bus": "/pci/bus", "pci.slot": "/pci/slot", "pci.function": "/pci/function", "used": "/used", "used_by": "/used_by"}, "domain": {"name": "", "xml": "/xml", "state": "/state", "profile": "/profile", "stats": "/stats", "node": "/node", "last_node": "/lastnode", "failed_reason": "/failedreason", "storage.volumes": "/rbdlist", "console.log": "/consolelog", "console.vnc": "/vnc", "meta.autostart": "/node_autostart", "meta.migrate_method": "/migration_method", "meta.node_selector": "/node_selector", "meta.node_limit": "/node_limit", "meta.tags": "/tags", "migrate.sync_lock": "/migrate_sync_lock"}, "tag": {"name": "", "type": "/type", "protected": "/protected"}, "network": {"vni": "", "type": "/nettype", "mtu": "/mtu", "rule": "/firewall_rules", "rule.in": "/firewall_rules/in", "rule.out": "/firewall_rules/out", "nameservers": "/name_servers", "domain": "/domain", "reservation": "/dhcp4_reservations", "lease": "/dhcp4_leases", "ip4.gateway": "/ip4_gateway", "ip4.network": "/ip4_network", "ip4.dhcp": "/dhcp4_flag", "ip4.dhcp_start": "/dhcp4_start", "ip4.dhcp_end": "/dhcp4_end", "ip6.gateway": "/ip6_gateway", "ip6.network": "/ip6_network", "ip6.dhcp": "/dhcp6_flag"}, "reservation": {"mac": "", "ip": "/ipaddr", "hostname": "/hostname"}, "lease": {"mac": "", "ip": 
"/ipaddr", "hostname": "/hostname", "expiry": "/expiry", "client_id": "/clientid"}, "rule": {"description": "", "rule": "/rule", "order": "/order"}, "osd": {"id": "", "node": "/node", "device": "/device", "db_device": "/db_device", "fsid": "/fsid", "ofsid": "/fsid/osd", "cfsid": "/fsid/cluster", "lvm": "/lvm", "vg": "/lvm/vg", "lv": "/lvm/lv", "stats": "/stats"}, "pool": {"name": "", "pgs": "/pgs", "tier": "/tier", "stats": "/stats"}, "volume": {"name": "", "stats": "/stats"}, "snapshot": {"name": "", "stats": "/stats"}} \ No newline at end of file diff --git a/daemon-common/node.py b/daemon-common/node.py index ae5f7e5d..58962f2e 100644 --- a/daemon-common/node.py +++ b/daemon-common/node.py @@ -21,6 +21,7 @@ import time import re +import json import daemon_lib.common as common @@ -49,6 +50,35 @@ def getNodeInformation(zkhandler, node_name): zkhandler.read(("node.count.provisioned_domains", node_name)) ) node_running_domains = zkhandler.read(("node.running_domains", node_name)).split() + node_health = int(zkhandler.read(("node.monitoring.health", node_name))) + node_health_plugins = zkhandler.read(("node.monitoring.plugins", node_name)).split() + node_health_details = list() + for plugin in node_health_plugins: + plugin_last_run = zkhandler.read( + ("node.monitoring.data", node_name, "monitoring_plugin.last_run", plugin) + ) + plugin_health_delta = zkhandler.read( + ( + "node.monitoring.data", + node_name, + "monitoring_plugin.health_delta", + plugin, + ) + ) + plugin_message = zkhandler.read( + ("node.monitoring.data", node_name, "monitoring_plugin.message", plugin) + ) + plugin_data = zkhandler.read( + ("node.monitoring.data", node_name, "monitoring_plugin.data", plugin) + ) + plugin_output = { + "name": plugin, + "last_run": int(plugin_last_run), + "health_delta": int(plugin_health_delta), + "message": plugin_message, + "data": json.loads(plugin_data), + } + node_health_details.append(plugin_output) # Construct a data structure to represent the data node_information = { @@ -61,10 +91,16 @@ def getNodeInformation(zkhandler, node_name): "kernel": node_kernel, "os": node_os, "arch": node_arch, + "health": node_health, + "health_plugins": node_health_plugins, + "health_details": node_health_details, "load": node_load, "domains_count": node_domains_count, "running_domains": node_running_domains, - "vcpu": {"total": node_cpu_count, "allocated": node_vcpu_allocated}, + "vcpu": { + "total": node_cpu_count, + "allocated": node_vcpu_allocated, + }, "memory": { "total": node_mem_total, "allocated": node_mem_allocated, diff --git a/daemon-common/zkhandler.py b/daemon-common/zkhandler.py index 5ec58b83..a52c4ec1 100644 --- a/daemon-common/zkhandler.py +++ b/daemon-common/zkhandler.py @@ -610,6 +610,7 @@ class ZKSchema(object): "sriov.vf": "/sriov/vf", "monitoring.plugins": "/monitoring_plugins", "monitoring.data": "/monitoring_data", + "monitoring.health": "/monitoring_health", }, # The schema of an individual monitoring plugin data entry (/nodes/{node_name}/monitoring_data/{plugin}) "monitoring_plugin": { diff --git a/node-daemon/plugins/ceph b/node-daemon/plugins/ceph index dc0bf8e5..31fc7551 100644 --- a/node-daemon/plugins/ceph +++ b/node-daemon/plugins/ceph @@ -1,6 +1,6 @@ #!/usr/bin/env python3 -# ceph.py - PVC Monitoring example plugin for ceph status +# ceph.py - PVC Monitoring example plugin for Ceph status # Part of the Parallel Virtual Cluster (PVC) system # # Copyright (C) 2018-2022 Joshua M. 
Boniface @@ -111,7 +111,7 @@ class MonitoringPluginScript(MonitoringPlugin): self.plugin_result.set_message(message) # Set the detailed data in our local PluginResult object - self.plugin_result.set_data(dumps(health_status)) + self.plugin_result.set_data(health_status) # Return our local PluginResult object return self.plugin_result diff --git a/node-daemon/plugins/ceph-cluster b/node-daemon/plugins/ceph-cluster deleted file mode 100644 index 48788925..00000000 --- a/node-daemon/plugins/ceph-cluster +++ /dev/null @@ -1,126 +0,0 @@ -#!/usr/bin/env python3 - -# ceph-cluster.py - PVC Monitoring example plugin for Ceph status -# Part of the Parallel Virtual Cluster (PVC) system -# -# Copyright (C) 2018-2022 Joshua M. Boniface -# -# This program is free software: you can redistribute it and/or modify -# it under the terms of the GNU General Public License as published by -# the Free Software Foundation, version 3. -# -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. -# -# You should have received a copy of the GNU General Public License -# along with this program. If not, see . -# -############################################################################### - -# This script provides an example of a PVC monitoring plugin script. It will create -# a simple plugin to check the Ceph cluster health for anomalies, and return a health -# delta reflective of the overall Ceph status (HEALTH_WARN = 10, HEALTH_ERR = 50). - -# This script can thus be used as an example or reference implementation of a -# PVC monitoring pluginscript and expanded upon as required. - -# A monitoring plugin script must implement the class "MonitoringPluginScript" which -# extends "MonitoringPlugin", providing the 3 functions indicated. Detailed explanation -# of the role of each function is provided in context of the example; see the other -# examples for more potential uses. - -# WARNING: -# -# This script will run in the context of the node daemon keepalives as root. -# DO NOT install untrusted, unvetted plugins under any circumstances. - - -# This import is always required here, as MonitoringPlugin is used by the -# MonitoringPluginScript class -from pvcnoded.objects.MonitoringInstance import MonitoringPlugin - - -# A monitoring plugin script must always expose its nice name, which must be identical to -# the file name -PLUGIN_NAME = "ceph-cluster" - - -# The MonitoringPluginScript class must be named as such, and extend MonitoringPlugin. -class MonitoringPluginScript(MonitoringPlugin): - def setup(self): - """ - setup(): Perform special setup steps during node daemon startup - - This step is optional and should be used sparingly. 
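(Editorial aside, not part of the patch: the hunk above, together with the PluginResult change later in this patch, means a plugin now hands a plain Python dict to set_data() and the JSON serialization happens inside PluginResult when the result is written to Zookeeper. A minimal, hypothetical fragment illustrating that usage, assuming only the MonitoringPlugin/PluginResult interface shown in these patches; the payload is a stand-in, not real Ceph output:)

    from pvcnoded.objects.MonitoringInstance import MonitoringPlugin

    class MonitoringPluginScript(MonitoringPlugin):
        def run(self):
            # Stand-in check result for illustration only
            status = {"status": "HEALTH_OK", "checks": {}}
            self.plugin_result.set_health_delta(0)
            self.plugin_result.set_message("Ceph cluster in OK state")
            # Pass the dict directly; PluginResult dumps it to JSON on write
            self.plugin_result.set_data(status)
            return self.plugin_result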
- """ - - pass - - def run(self): - """ - run(): Perform the check actions and return a PluginResult object - """ - - # Run any imports first - from rados import Rados - from json import loads, dumps - - # Connect to the Ceph cluster - try: - ceph_conn = Rados( - conffile=self.config["ceph_config_file"], - conf=dict(keyring=self.config["ceph_admin_keyring"]), - ) - ceph_conn.connect(timeout=1) - except Exception as e: - self.log(f"Failed to connect to Ceph cluster: {e}", state="e") - return self.plugin_result - - # Get the Ceph cluster health - try: - health_status = loads( - ceph_conn.mon_command(dumps({"prefix": "health", "format": "json"}), b"", timeout=1)[1] - ) - ceph_health = health_status["status"] - except Exception as e: - self.log(f"Failed to get health data from Ceph cluster: {e}", state="e") - return self.plugin_result - finally: - ceph_conn.shutdown() - - # Get a list of error entries in the health status output - error_entries = health_status["checks"].keys() - - # Set the health delta based on the errors presented - if ceph_health == "HEALTH_ERR": - health_delta = 50 - message = f"Ceph cluster in ERROR state: {', '.join(error_entries)}" - elif ceph_health == "HEALTH_WARN": - health_delta = 10 - message = f"Ceph cluster in WARNING state: {', '.join(error_entries)}" - else: - health_delta = 0 - message = "Ceph cluster in OK state" - - # Set the health delta in our local PluginResult object - self.plugin_result.set_health_delta(health_delta) - - # Set the message in our local PluginResult object - self.plugin_result.set_message(message) - - # Set the detailed data in our local PluginResult object - self.plugin_result.set_data(dumps(health_status)) - - # Return our local PluginResult object - return self.plugin_result - - def cleanup(self): - """ - cleanup(): Perform special cleanup steps during node daemon termination - - This step is optional and should be used sparingly. 
- """ - - pass diff --git a/node-daemon/plugins/dpkg b/node-daemon/plugins/dpkg index 74d00789..40b6990b 100644 --- a/node-daemon/plugins/dpkg +++ b/node-daemon/plugins/dpkg @@ -66,7 +66,6 @@ class MonitoringPluginScript(MonitoringPlugin): # Run any imports first from re import match - from json import dumps import daemon_lib.common as pvc_common # Get Debian version @@ -143,7 +142,7 @@ class MonitoringPluginScript(MonitoringPlugin): "inconsistent_packages": list_inconsistent, "upgradable_packages": list_upgradable, } - self.plugin_result.set_data(dumps(detailed_data)) + self.plugin_result.set_data(detailed_data) # Return our local PluginResult object return self.plugin_result diff --git a/node-daemon/pvcnoded/objects/MonitoringInstance.py b/node-daemon/pvcnoded/objects/MonitoringInstance.py index c4d0f3f2..5bdc8162 100644 --- a/node-daemon/pvcnoded/objects/MonitoringInstance.py +++ b/node-daemon/pvcnoded/objects/MonitoringInstance.py @@ -25,6 +25,7 @@ import importlib.util from os import walk from datetime import datetime +from json import dumps class PluginResult(object): @@ -37,7 +38,7 @@ class PluginResult(object): self.current_time = int(time.time()) self.health_delta = 0 self.message = None - self.data = None + self.data = {} self.runtime = "0.00" def set_health_delta(self, new_delta): @@ -98,7 +99,7 @@ class PluginResult(object): "monitoring_plugin.data", self.plugin_name, ), - self.data, + dumps(self.data), ), ( ( @@ -259,7 +260,7 @@ class MonitoringInstance(object): "monitoring_plugin.data", plugin.plugin_name, ), - None, + dumps({}), ), ( ( @@ -286,7 +287,7 @@ class MonitoringInstance(object): [ ( ("node.monitoring.plugins", self.this_node.name), - self.all_plugin_names, + " ".join(self.all_plugin_names), ), ] ) @@ -346,6 +347,14 @@ class MonitoringInstance(object): else: health_colour = self.logger.fmt_red + self.zkhandler.write( + [ + ( + ("node.monitoring.health", self.this_node.name), + total_health, + ), + ] + ) self.logger.out( f"System health: {health_colour}{total_health}/100{self.logger.fmt_end}", state="t", From af436a93cceee28069368d0b78bd0e432bb3aeed Mon Sep 17 00:00:00 2001 From: "Joshua M. Boniface" Date: Mon, 13 Feb 2023 15:54:46 -0500 Subject: [PATCH 05/55] Set node health to None when restarting --- node-daemon/pvcnoded/objects/MonitoringInstance.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/node-daemon/pvcnoded/objects/MonitoringInstance.py b/node-daemon/pvcnoded/objects/MonitoringInstance.py index 5bdc8162..7ae29f5b 100644 --- a/node-daemon/pvcnoded/objects/MonitoringInstance.py +++ b/node-daemon/pvcnoded/objects/MonitoringInstance.py @@ -372,3 +372,12 @@ class MonitoringInstance(object): for future in concurrent.futures.as_completed(to_future_plugin_results): # This doesn't do anything, just lets us wait for them all to complete pass + # Set the node health to None as no previous checks are now valid + self.zkhandler.write( + [ + ( + ("node.monitoring.health", self.this_node.name), + None, + ), + ] + ) From 8df189aa222640a6c3c6411656e6f9fddb2136c8 Mon Sep 17 00:00:00 2001 From: "Joshua M. 
Boniface" Date: Mon, 13 Feb 2023 16:36:15 -0500 Subject: [PATCH 06/55] Fix several bugs and optimize output --- client-cli/pvc/cli_lib/node.py | 51 +++++++++++++++++++++++++--------- daemon-common/node.py | 5 +++- 2 files changed, 42 insertions(+), 14 deletions(-) diff --git a/client-cli/pvc/cli_lib/node.py b/client-cli/pvc/cli_lib/node.py index 22325d20..1755b96a 100644 --- a/client-cli/pvc/cli_lib/node.py +++ b/client-cli/pvc/cli_lib/node.py @@ -215,13 +215,16 @@ def node_list( # Output display functions # def getOutputColours(node_information): - node_health = node_information.get("health", 999) - if node_health <= 50: - health_colour = ansiprint.red() - elif node_health <= 90: - health_colour = ansiprint.yellow() - elif node_health <= 100: - health_colour = ansiprint.green() + node_health = node_information.get("health", "N/A") + if isinstance(node_health, int): + if node_health <= 50: + health_colour = ansiprint.red() + elif node_health <= 90: + health_colour = ansiprint.yellow() + elif node_health <= 100: + health_colour = ansiprint.green() + else: + health_colour = ansiprint.blue() else: health_colour = ansiprint.blue() @@ -304,7 +307,7 @@ def format_info(node_information, long_output): else: node_health_text = node_health ainformation.append( - "{}Health Value:{} {}{}{}".format( + "{}Health:{} {}{}{}".format( ansiprint.purple(), ansiprint.end(), health_colour, @@ -313,6 +316,28 @@ def format_info(node_information, long_output): ) ) + node_health_details = node_information.get("health_details", []) + if long_output: + node_health_messages = "\n ".join( + [f"{plugin['name']}: {plugin['message']}" for plugin in node_health_details] + ) + else: + node_health_messages = "\n ".join( + [ + f"{plugin['name']}: {plugin['message']}" + for plugin in node_health_details + if int(plugin.get("health_delta", 0)) > 0 + ] + ) + + if len(node_health_messages) > 0: + ainformation.append( + "{}Health Plugin Details:{} {}".format( + ansiprint.purple(), ansiprint.end(), node_health_messages + ) + ) + ainformation.append("") + ainformation.append( "{}Daemon State:{} {}{}{}".format( ansiprint.purple(), @@ -340,11 +365,6 @@ def format_info(node_information, long_output): ansiprint.end(), ) ) - ainformation.append( - "{}Active VM Count:{} {}".format( - ansiprint.purple(), ansiprint.end(), node_information["domains_count"] - ) - ) if long_output: ainformation.append("") ainformation.append( @@ -363,6 +383,11 @@ def format_info(node_information, long_output): ) ) ainformation.append("") + ainformation.append( + "{}Active VM Count:{} {}".format( + ansiprint.purple(), ansiprint.end(), node_information["domains_count"] + ) + ) ainformation.append( "{}Host CPUs:{} {}".format( ansiprint.purple(), ansiprint.end(), node_information["vcpu"]["total"] diff --git a/daemon-common/node.py b/daemon-common/node.py index 58962f2e..ee97a267 100644 --- a/daemon-common/node.py +++ b/daemon-common/node.py @@ -50,7 +50,10 @@ def getNodeInformation(zkhandler, node_name): zkhandler.read(("node.count.provisioned_domains", node_name)) ) node_running_domains = zkhandler.read(("node.running_domains", node_name)).split() - node_health = int(zkhandler.read(("node.monitoring.health", node_name))) + try: + node_health = int(zkhandler.read(("node.monitoring.health", node_name))) + except ValueError: + node_health = "N/A" node_health_plugins = zkhandler.read(("node.monitoring.plugins", node_name)).split() node_health_details = list() for plugin in node_health_plugins: From 7378affcb547d7a09103bc359646ea8b79ed6ba7 Mon Sep 17 00:00:00 2001 
From: "Joshua M. Boniface" Date: Mon, 13 Feb 2023 21:43:13 -0500 Subject: [PATCH 07/55] Add EDAC check plugin --- node-daemon/plugins/edac | 103 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 103 insertions(+) create mode 100644 node-daemon/plugins/edac diff --git a/node-daemon/plugins/edac b/node-daemon/plugins/edac new file mode 100644 index 00000000..edf1be8b --- /dev/null +++ b/node-daemon/plugins/edac @@ -0,0 +1,103 @@ +#!/usr/bin/env python3 + +# edac.py - PVC Monitoring example plugin for EDAC +# Part of the Parallel Virtual Cluster (PVC) system +# +# Copyright (C) 2018-2022 Joshua M. Boniface +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, version 3. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . +# +############################################################################### + +# This script provides an example of a PVC monitoring plugin script. It will create +# a simple plugin to check the system's EDAC registers and report any failures. + +# This script can thus be used as an example or reference implementation of a +# PVC monitoring pluginscript and expanded upon as required. + +# A monitoring plugin script must implement the class "MonitoringPluginScript" which +# extends "MonitoringPlugin", providing the 3 functions indicated. Detailed explanation +# of the role of each function is provided in context of the example; see the other +# examples for more potential uses. + +# WARNING: +# +# This script will run in the context of the node daemon keepalives as root. +# DO NOT install untrusted, unvetted plugins under any circumstances. + + +# This import is always required here, as MonitoringPlugin is used by the +# MonitoringPluginScript class +from pvcnoded.objects.MonitoringInstance import MonitoringPlugin + + +# A monitoring plugin script must always expose its nice name, which must be identical to +# the file name +PLUGIN_NAME = "edac" + + +# The MonitoringPluginScript class must be named as such, and extend MonitoringPlugin. +class MonitoringPluginScript(MonitoringPlugin): + def setup(self): + """ + setup(): Perform special setup steps during node daemon startup + + This step is optional and should be used sparingly. 
+ """ + + pass + + def run(self): + """ + run(): Perform the check actions and return a PluginResult object + """ + + # Run any imports first + import daemon_lib.common as common + from re import match, search + + # Get edac-util output + retcode, stdout, stderr = common.run_os_command('/usr/bin/edac-util') + + # If there's no errors, we're OK + if match(r'^edac-util: No errors to report.', stdout): + health_delta = 0 + message = "EDAC reports no errors" + else: + health_delta = 0 + message = "EDAC reports errors: " + errors = list() + for line in stdout.split('\n'): + if match(r'^mc[0-9]: csrow', line): + if 'Corrected' not in line: + health_delta = 10 + errors.append(' '.join(line.split()[2:])) + message += ', '.join(errors) + + # Set the health delta in our local PluginResult object + self.plugin_result.set_health_delta(health_delta) + + # Set the message in our local PluginResult object + self.plugin_result.set_message(message) + + # Return our local PluginResult object + return self.plugin_result + + def cleanup(self): + """ + cleanup(): Perform special cleanup steps during node daemon termination + + This step is optional and should be used sparingly. + """ + + pass From 54373c5becf8187a7198fda09e089aa20c1fb900 Mon Sep 17 00:00:00 2001 From: "Joshua M. Boniface" Date: Mon, 13 Feb 2023 21:45:33 -0500 Subject: [PATCH 08/55] Fix bugs if plugins fail to load --- daemon-common/node.py | 10 ++++++++-- node-daemon/pvcnoded/objects/MonitoringInstance.py | 12 ++++++++++-- 2 files changed, 18 insertions(+), 4 deletions(-) diff --git a/daemon-common/node.py b/daemon-common/node.py index ee97a267..9136d908 100644 --- a/daemon-common/node.py +++ b/daemon-common/node.py @@ -52,9 +52,15 @@ def getNodeInformation(zkhandler, node_name): node_running_domains = zkhandler.read(("node.running_domains", node_name)).split() try: node_health = int(zkhandler.read(("node.monitoring.health", node_name))) - except ValueError: + except Exception: node_health = "N/A" - node_health_plugins = zkhandler.read(("node.monitoring.plugins", node_name)).split() + try: + node_health_plugins = zkhandler.read( + ("node.monitoring.plugins", node_name) + ).split() + except Exception: + node_health_plugins = list() + node_health_details = list() for plugin in node_health_plugins: plugin_last_run = zkhandler.read( diff --git a/node-daemon/pvcnoded/objects/MonitoringInstance.py b/node-daemon/pvcnoded/objects/MonitoringInstance.py index 7ae29f5b..db807ef1 100644 --- a/node-daemon/pvcnoded/objects/MonitoringInstance.py +++ b/node-daemon/pvcnoded/objects/MonitoringInstance.py @@ -190,6 +190,8 @@ class MonitoringInstance(object): self.all_plugins = list() self.all_plugin_names = list() + successful_plugins = 0 + # Load each plugin file into the all_plugins list for plugin_file in sorted(plugin_files): try: @@ -211,8 +213,6 @@ class MonitoringInstance(object): self.this_node, plugin_script.PLUGIN_NAME, ) - self.all_plugins.append(plugin) - self.all_plugin_names.append(plugin.plugin_name) # Create plugin key self.zkhandler.write( @@ -273,6 +273,11 @@ class MonitoringInstance(object): ), ] ) + + self.all_plugins.append(plugin) + self.all_plugin_names.append(plugin.plugin_name) + successful_plugins += 1 + self.logger.out( f"Successfully loaded monitoring plugin '{plugin.plugin_name}'", state="o", @@ -292,6 +297,9 @@ class MonitoringInstance(object): ] ) + if successful_plugins < 1: + return + # Clean up any old plugin data for which a plugin file no longer exists for plugin_key in self.zkhandler.children( ("node.monitoring.data", 
self.this_node.name) From 134f59f9eee990925ad14b08338b6b394e3c5d8c Mon Sep 17 00:00:00 2001 From: "Joshua M. Boniface" Date: Mon, 13 Feb 2023 21:58:56 -0500 Subject: [PATCH 09/55] Flip condition in EDAC check --- node-daemon/plugins/edac | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/node-daemon/plugins/edac b/node-daemon/plugins/edac index edf1be8b..44e361eb 100644 --- a/node-daemon/plugins/edac +++ b/node-daemon/plugins/edac @@ -79,7 +79,7 @@ class MonitoringPluginScript(MonitoringPlugin): errors = list() for line in stdout.split('\n'): if match(r'^mc[0-9]: csrow', line): - if 'Corrected' not in line: + if 'Uncorrected' in line: health_delta = 10 errors.append(' '.join(line.split()[2:])) message += ', '.join(errors) From 96d3aff7adfd6b79f559e5d03090800eda2c771c Mon Sep 17 00:00:00 2001 From: "Joshua M. Boniface" Date: Mon, 13 Feb 2023 22:02:21 -0500 Subject: [PATCH 10/55] Add logging flag for montioring plugin output --- node-daemon/pvcnoded.sample.yaml | 4 ++-- .../pvcnoded/objects/MonitoringInstance.py | 20 ++++++++++--------- node-daemon/pvcnoded/util/config.py | 4 ++-- 3 files changed, 15 insertions(+), 13 deletions(-) diff --git a/node-daemon/pvcnoded.sample.yaml b/node-daemon/pvcnoded.sample.yaml index ee36ee50..0fd93981 100644 --- a/node-daemon/pvcnoded.sample.yaml +++ b/node-daemon/pvcnoded.sample.yaml @@ -152,8 +152,8 @@ pvc: log_keepalives: True # log_keepalive_cluster_details: Enable or disable node status logging during keepalive log_keepalive_cluster_details: True - # log_keepalive_storage_details: Enable or disable node storage logging during keepalive - log_keepalive_storage_details: True + # log_keepalive_plugin_details: Enable or disable node health plugin logging during keepalive + log_keepalive_plugin_details: True # console_log_lines: Number of console log lines to store in Zookeeper per VM console_log_lines: 1000 # node_log_lines: Number of node log lines to store in Zookeeper per node diff --git a/node-daemon/pvcnoded/objects/MonitoringInstance.py b/node-daemon/pvcnoded/objects/MonitoringInstance.py index db807ef1..9b9a04a6 100644 --- a/node-daemon/pvcnoded/objects/MonitoringInstance.py +++ b/node-daemon/pvcnoded/objects/MonitoringInstance.py @@ -326,10 +326,11 @@ class MonitoringInstance(object): def run_plugins(self): total_health = 100 - self.logger.out( - f"Running monitoring plugins: {', '.join([x.plugin_name for x in self.all_plugins])}", - state="t", - ) + if self.config["log_keepalive_plugin_details"]: + self.logger.out( + f"Running monitoring plugins: {', '.join([x.plugin_name for x in self.all_plugins])}", + state="t", + ) plugin_results = list() with concurrent.futures.ThreadPoolExecutor(max_workers=99) as executor: to_future_plugin_results = { @@ -340,11 +341,12 @@ class MonitoringInstance(object): plugin_results.append(future.result()) for result in sorted(plugin_results, key=lambda x: x.plugin_name): - self.logger.out( - result.message, - state="t", - prefix=f"{result.plugin_name} ({result.runtime}s)", - ) + if self.config["log_keepalive_plugin_details"]: + self.logger.out( + result.message, + state="t", + prefix=f"{result.plugin_name} ({result.runtime}s)", + ) if result is not None: total_health -= result.health_delta diff --git a/node-daemon/pvcnoded/util/config.py b/node-daemon/pvcnoded/util/config.py index 29543407..e74277a7 100644 --- a/node-daemon/pvcnoded/util/config.py +++ b/node-daemon/pvcnoded/util/config.py @@ -228,8 +228,8 @@ def get_configuration(): "log_keepalive_cluster_details": o_logging.get( 
"log_keepalive_cluster_details", False ), - "log_keepalive_storage_details": o_logging.get( - "log_keepalive_storage_details", False + "log_keepalive_plugin_details": o_logging.get( + "log_keepalive_plugin_details", False ), "console_log_lines": o_logging.get("console_log_lines", False), "node_log_lines": o_logging.get("node_log_lines", False), From 8e6632bf106a272869c1ec45fef479a433f2af0f Mon Sep 17 00:00:00 2001 From: "Joshua M. Boniface" Date: Mon, 13 Feb 2023 22:21:23 -0500 Subject: [PATCH 11/55] Adjust text on log message --- node-daemon/pvcnoded/objects/MonitoringInstance.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/node-daemon/pvcnoded/objects/MonitoringInstance.py b/node-daemon/pvcnoded/objects/MonitoringInstance.py index 9b9a04a6..c7bd4108 100644 --- a/node-daemon/pvcnoded/objects/MonitoringInstance.py +++ b/node-daemon/pvcnoded/objects/MonitoringInstance.py @@ -366,7 +366,7 @@ class MonitoringInstance(object): ] ) self.logger.out( - f"System health: {health_colour}{total_health}/100{self.logger.fmt_end}", + f"Node health: {health_colour}{total_health}/100{self.logger.fmt_end}", state="t", ) From c6a8c6d39b34a9b322781594056e70e33c94c7e9 Mon Sep 17 00:00:00 2001 From: "Joshua M. Boniface" Date: Tue, 14 Feb 2023 15:43:52 -0500 Subject: [PATCH 12/55] Add NIC monitoring plugin --- node-daemon/plugins/nics | 197 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 197 insertions(+) create mode 100644 node-daemon/plugins/nics diff --git a/node-daemon/plugins/nics b/node-daemon/plugins/nics new file mode 100644 index 00000000..3edc4dec --- /dev/null +++ b/node-daemon/plugins/nics @@ -0,0 +1,197 @@ +#!/usr/bin/env python3 + +# nics.py - PVC Monitoring example plugin for NIC interfaces +# Part of the Parallel Virtual Cluster (PVC) system +# +# Copyright (C) 2018-2022 Joshua M. Boniface +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, version 3. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . +# +############################################################################### + +# This script provides an example of a PVC monitoring plugin script. It will create +# a simple plugin to check the network interfaces of the host, specifically for speed +# and 802.3ad status (if applicable). + +# This script can thus be used as an example or reference implementation of a +# PVC monitoring pluginscript and expanded upon as required. + +# A monitoring plugin script must implement the class "MonitoringPluginScript" which +# extends "MonitoringPlugin", providing the 3 functions indicated. Detailed explanation +# of the role of each function is provided in context of the example; see the other +# examples for more potential uses. + +# WARNING: +# +# This script will run in the context of the node daemon keepalives as root. +# DO NOT install untrusted, unvetted plugins under any circumstances. 
+ + +# This import is always required here, as MonitoringPlugin is used by the +# MonitoringPluginScript class +from pvcnoded.objects.MonitoringInstance import MonitoringPlugin + + +# A monitoring plugin script must always expose its nice name, which must be identical to +# the file name +PLUGIN_NAME = "nics" + +# Set a minimum link speed variable used below +# For PVC at least 10 Gbps is required for proper operation of a cluster +MINIMUM_LINKSPEED = 10000 + + +# The MonitoringPluginScript class must be named as such, and extend MonitoringPlugin. +class MonitoringPluginScript(MonitoringPlugin): + def setup(self): + """ + setup(): Perform special setup steps during node daemon startup + + This step is optional and should be used sparingly. + """ + + pass + + def run(self): + """ + run(): Perform the check actions and return a PluginResult object + """ + + # Run any imports first + import daemon_lib.common as common + from re import match, search + + messages = list() + health_delta = 0 + + # Get a list of the various underlying devices + _core_nics = set() + + for dev in [ + self.config['bridge_dev'], + self.config['upstream_dev'], + self.config['cluster_dev'], + self.config['storage_dev'], + ]: + with open(f'/sys/class/net/{dev}/uevent', 'r') as uevent: + _devtype = uevent.readlines()[0].split('=')[-1].strip() + + if _devtype == 'vlan': + with open(f"/proc/net/vlan/{dev}") as devfh: + vlan_info = devfh.read().split('\n') + for line in vlan_info: + if match(r'^Device:', line): + dev = line.split()[-1] + + _core_nics.add(dev) + + core_nics = sorted(list(_core_nics)) + + for dev in core_nics: + with open(f'/sys/class/net/{dev}/uevent', 'r') as uevent: + _devtype = uevent.readlines()[0].split('=')[-1].strip() + + if _devtype == "bond": + syspath = f"/proc/net/bonding/{dev}" + + with open(syspath) as devfh: + bonding_stats = devfh.read() + + _, _mode, _info, *_slaves = bonding_stats.split('\n\n') + + slave_interfaces = list() + for slavedev in _slaves: + lines = slavedev.split('\n') + for line in lines: + if match(r'^Slave Interface:', line): + interface_name = line.split()[-1] + if match(r'^MII Status:', line): + interface_status = line.split()[-1] + if match(r'^Speed:', line): + try: + interface_speed_mbps = int(line.split()[-2]) + except Exception: + interface_speed_mbps = 0 + if match(r'^Duplex:', line): + interface_duplex = line.split()[-1] + slave_interfaces.append((interface_name, interface_status, interface_speed_mbps, interface_duplex)) + + # Ensure at least 2 slave interfaces are up + slave_interface_up_count = 0 + for slave_interface in slave_interfaces: + if slave_interface[1] == 'up': + slave_interface_up_count += 1 + if slave_interface_up_count < 2: + messages.append(f"{dev} DEGRADED with {slave_interface_up_count} active slaves") + health_delta += 10 + else: + messages.append(f"{dev} OK with {slave_interface_up_count} active slaves") + + # Get ethtool supported speeds for slave interfaces + supported_link_speeds = set() + for slave_interface in slave_interfaces: + slave_dev = slave_interface[0] + _, ethtool_stdout, _ = common.run_os_command(f"ethtool {slave_dev}") + in_modes = False + for line in ethtool_stdout.split('\n'): + if search('Supported link modes:', line): + in_modes = True + if search('Supported pause frame use:', line): + in_modes = False + break + if in_modes: + speed = int(line.split()[-1].replace('baseT', '').split('/')[0]) + supported_link_speeds.add(speed) + else: + # Get ethtool supported speeds for interface + supported_link_speeds = set() + _, 
ethtool_stdout, _ = common.run_os_command(f"ethtool {dev}") + in_modes = False + for line in ethtool_stdout.split('\n'): + if search('Supported link modes:', line): + in_modes = True + if search('Supported pause frame use:', line): + in_modes = False + break + if in_modes: + speed = int(line.split()[-1].replace('baseT', '').split('/')[0]) + supported_link_speeds.add(speed) + + max_supported_link_speed = sorted(list(supported_link_speeds))[-1] + + # Ensure interface is running at MINIMUM_LINKSPEED + with open(f"/sys/class/net/{dev}/speed") as devfh: + dev_speed = int(devfh.read()) + if dev_speed < max_supported_link_speed: + messages.append(f"{dev} DEGRADED at {dev_speed} Mbps") + health_delta += 10 + else: + messages.append(f"{dev} OK at {dev_speed} Mbps") + + # Set the health delta in our local PluginResult object + self.plugin_result.set_health_delta(health_delta) + + # Set the message in our local PluginResult object + self.plugin_result.set_message(', '.join(messages)) + + # Return our local PluginResult object + return self.plugin_result + + def cleanup(self): + """ + cleanup(): Perform special cleanup steps during node daemon termination + + This step is optional and should be used sparingly. + """ + + pass From 265e1e29d7761ed12a2dddc9c70fe41855a84410 Mon Sep 17 00:00:00 2001 From: "Joshua M. Boniface" Date: Tue, 14 Feb 2023 15:49:58 -0500 Subject: [PATCH 13/55] Improve ethtool parsing speeds --- node-daemon/plugins/nics | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/node-daemon/plugins/nics b/node-daemon/plugins/nics index 3edc4dec..70840245 100644 --- a/node-daemon/plugins/nics +++ b/node-daemon/plugins/nics @@ -69,7 +69,7 @@ class MonitoringPluginScript(MonitoringPlugin): # Run any imports first import daemon_lib.common as common - from re import match, search + from re import match, search, findall messages = list() health_delta = 0 @@ -150,7 +150,7 @@ class MonitoringPluginScript(MonitoringPlugin): in_modes = False break if in_modes: - speed = int(line.split()[-1].replace('baseT', '').split('/')[0]) + speed = int(findall(r'\d+', line.split()[-1])[0]) supported_link_speeds.add(speed) else: # Get ethtool supported speeds for interface From 8aa74aae6269072e652a506c253f8835e651591d Mon Sep 17 00:00:00 2001 From: "Joshua M. Boniface" Date: Wed, 15 Feb 2023 01:56:02 -0500 Subject: [PATCH 14/55] Use percentage in keepalie output --- node-daemon/pvcnoded/objects/MonitoringInstance.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/node-daemon/pvcnoded/objects/MonitoringInstance.py b/node-daemon/pvcnoded/objects/MonitoringInstance.py index c7bd4108..5651b79f 100644 --- a/node-daemon/pvcnoded/objects/MonitoringInstance.py +++ b/node-daemon/pvcnoded/objects/MonitoringInstance.py @@ -366,7 +366,7 @@ class MonitoringInstance(object): ] ) self.logger.out( - f"Node health: {health_colour}{total_health}/100{self.logger.fmt_end}", + f"Node health: {health_colour}{total_health}%{self.logger.fmt_end}", state="t", ) From fc16e26f23ac0dd0857584051bf9f69e5c829b79 Mon Sep 17 00:00:00 2001 From: "Joshua M. 
Boniface" Date: Wed, 15 Feb 2023 10:11:38 -0500 Subject: [PATCH 15/55] Run setup during plugin loads --- .../pvcnoded/objects/MonitoringInstance.py | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/node-daemon/pvcnoded/objects/MonitoringInstance.py b/node-daemon/pvcnoded/objects/MonitoringInstance.py index 5651b79f..4ccc620c 100644 --- a/node-daemon/pvcnoded/objects/MonitoringInstance.py +++ b/node-daemon/pvcnoded/objects/MonitoringInstance.py @@ -28,6 +28,14 @@ from datetime import datetime from json import dumps +class PluginError(Exception): + """ + An exception that results from a plugin failing setup + """ + + pass + + class PluginResult(object): def __init__(self, zkhandler, config, logger, this_node, plugin_name): self.zkhandler = zkhandler @@ -157,7 +165,11 @@ class MonitoringPlugin(object): def setup(self): """ setup(): Perform setup of the plugin; run once during daemon startup - OPTIONAL + + This step is optional and should be used sparingly. + + If you wish for the plugin to not load in certain conditions, do any checks here + and return a non-None failure message to indicate the error. """ pass @@ -214,6 +226,10 @@ class MonitoringInstance(object): plugin_script.PLUGIN_NAME, ) + failed_setup = plugin.setup() + if failed_setup is not None: + raise PluginError(f"{failed_setup}") + # Create plugin key self.zkhandler.write( [ From f6bea50a0a91f5993e52c763a6ecc15928dd029f Mon Sep 17 00:00:00 2001 From: "Joshua M. Boniface" Date: Wed, 15 Feb 2023 11:28:39 -0500 Subject: [PATCH 16/55] Add disk monitoring plugin --- node-daemon/plugins/ceph | 3 + node-daemon/plugins/disk | 167 +++++++++++++++++++++++++++++++++++++++ node-daemon/plugins/dpkg | 3 + node-daemon/plugins/edac | 3 + node-daemon/plugins/load | 3 + node-daemon/plugins/nics | 3 + 6 files changed, 182 insertions(+) create mode 100644 node-daemon/plugins/disk diff --git a/node-daemon/plugins/ceph b/node-daemon/plugins/ceph index 31fc7551..aaeab8b4 100644 --- a/node-daemon/plugins/ceph +++ b/node-daemon/plugins/ceph @@ -54,6 +54,9 @@ class MonitoringPluginScript(MonitoringPlugin): setup(): Perform special setup steps during node daemon startup This step is optional and should be used sparingly. + + If you wish for the plugin to not load in certain conditions, do any checks here + and return a non-None failure message to indicate the error. """ pass diff --git a/node-daemon/plugins/disk b/node-daemon/plugins/disk new file mode 100644 index 00000000..77543162 --- /dev/null +++ b/node-daemon/plugins/disk @@ -0,0 +1,167 @@ +#!/usr/bin/env python3 + +# disk.py - PVC Monitoring example plugin for disk (system + OSD) +# Part of the Parallel Virtual Cluster (PVC) system +# +# Copyright (C) 2018-2022 Joshua M. Boniface +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, version 3. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . +# +############################################################################### + +# This script provides an example of a PVC monitoring plugin script. 
It will create +# a simple plugin to check the system and OSD disks for errors and faults and return +# a health delta corresponding to severity. + +# This script can thus be used as an example or reference implementation of a +# PVC monitoring pluginscript and expanded upon as required. + +# A monitoring plugin script must implement the class "MonitoringPluginScript" which +# extends "MonitoringPlugin", providing the 3 functions indicated. Detailed explanation +# of the role of each function is provided in context of the example; see the other +# examples for more potential uses. + +# WARNING: +# +# This script will run in the context of the node daemon keepalives as root. +# DO NOT install untrusted, unvetted plugins under any circumstances. + + +# This import is always required here, as MonitoringPlugin is used by the +# MonitoringPluginScript class +from pvcnoded.objects.MonitoringInstance import MonitoringPlugin + + +# A monitoring plugin script must always expose its nice name, which must be identical to +# the file name +PLUGIN_NAME = "disk" + + +# The MonitoringPluginScript class must be named as such, and extend MonitoringPlugin. +class MonitoringPluginScript(MonitoringPlugin): + def setup(self): + """ + setup(): Perform special setup steps during node daemon startup + + This step is optional and should be used sparingly. + + If you wish for the plugin to not load in certain conditions, do any checks here + and return a non-None failure message to indicate the error. + """ + + from daemon_lib.common import run_os_command + from json import loads + + _, _all_disks, _ = run_os_command("lsblk --json --paths --include 8,259") + try: + all_disks = loads(_all_disks) + except Exception as e: + return f"Error loading lsblk JSON: {e}" + + disk_details = list() + + def get_smartinfo(disk, extra_opt=""): + _, _smart_info, _ = run_os_command(f"smartctl --info --json {extra_opt} {disk}") + try: + smart_info = loads(_smart_info) + except Exception as e: + return None + + return smart_info + + for disk in [disk["name"] for disk in all_disks['blockdevices']]: + extra_opt = "" + smart_info = get_smartinfo(disk) + if smart_info is None or smart_info["smartctl"]["exit_status"] > 1: + continue + elif smart_info["smartctl"]["exit_status"] == 1: + if "requires option" in smart_info["smartctl"]["messages"][0]["string"]: + extra_opt = smart_info["smartctl"]["messages"][0]["string"].split("'")[1].replace('N','0') + smart_info = get_smartinfo(disk, extra_opt) + if smart_info is None or smart_info["smartctl"]["exit_status"] > 0: + continue + else: + continue + + disk_type = smart_info["device"]["type"] + + disk_details.append((disk, extra_opt, disk_type)) + + self.disk_details = disk_details + + + def run(self): + """ + run(): Perform the check actions and return a PluginResult object + """ + + # Re-run setup each time to ensure the disk details are current + self.setup() + + # Run any imports first + from daemon_lib.common import run_os_command + from json import loads + + health_delta = 0 + messages = list() + + for _disk in self.disk_details: + disk = _disk[0] + extra_opt = _disk[1] + disk_type = _disk[2] + + _, _smart_info, _ = run_os_command(f"smartctl --all --json {extra_opt} {disk}") + try: + smart_info = loads(_smart_info) + except Exception as e: + health_delta += 10 + messages.append(f"{disk} failed to load SMART data") + continue + + if disk_type == 'nvme': + for attribute in smart_info['nvme_smart_health_information_log'].items(): + if attribute[0] == "critical_warning" and attribute[1] > 0: + 
health_delta += 10 + messages.append(f"{disk} critical warning value {attribute[1]}") + if attribute[0] == "media_errors" and attribute[1] > 0: + health_delta += 10 + messages.append(f"{disk} media errors value {attribute[1]}") + if attribute[0] == "percentage_used" and attribute[1] > 90: + health_delta += 10 + messages.append(f"{disk} percentage used value {attribute[1]}%") + else: + for attribute in smart_info['ata_smart_attributes']['table']: + if attribute["when_failed"]: + health_delta += 10 + messages.append(f"{disk} attribute {attribute['name']} value {attribute['raw']['value']}") + + if len(messages) < 1: + messages.append(f"All {len(self.disk_details)} checked disks report OK: {', '.join([disk[0] for disk in self.disk_details])}") + + # Set the health delta in our local PluginResult object + self.plugin_result.set_health_delta(health_delta) + + # Set the message in our local PluginResult object + self.plugin_result.set_message(', '.join(messages)) + + # Return our local PluginResult object + return self.plugin_result + + def cleanup(self): + """ + cleanup(): Perform special cleanup steps during node daemon termination + + This step is optional and should be used sparingly. + """ + + pass diff --git a/node-daemon/plugins/dpkg b/node-daemon/plugins/dpkg index 40b6990b..f27e35ae 100644 --- a/node-daemon/plugins/dpkg +++ b/node-daemon/plugins/dpkg @@ -55,6 +55,9 @@ class MonitoringPluginScript(MonitoringPlugin): setup(): Perform special setup steps during node daemon startup This step is optional and should be used sparingly. + + If you wish for the plugin to not load in certain conditions, do any checks here + and return a non-None failure message to indicate the error. """ pass diff --git a/node-daemon/plugins/edac b/node-daemon/plugins/edac index 44e361eb..dd2293ba 100644 --- a/node-daemon/plugins/edac +++ b/node-daemon/plugins/edac @@ -53,6 +53,9 @@ class MonitoringPluginScript(MonitoringPlugin): setup(): Perform special setup steps during node daemon startup This step is optional and should be used sparingly. + + If you wish for the plugin to not load in certain conditions, do any checks here + and return a non-None failure message to indicate the error. """ pass diff --git a/node-daemon/plugins/load b/node-daemon/plugins/load index f3e4fb39..8b0cd2bb 100644 --- a/node-daemon/plugins/load +++ b/node-daemon/plugins/load @@ -54,6 +54,9 @@ class MonitoringPluginScript(MonitoringPlugin): setup(): Perform special setup steps during node daemon startup This step is optional and should be used sparingly. + + If you wish for the plugin to not load in certain conditions, do any checks here + and return a non-None failure message to indicate the error. """ pass diff --git a/node-daemon/plugins/nics b/node-daemon/plugins/nics index 70840245..707078fc 100644 --- a/node-daemon/plugins/nics +++ b/node-daemon/plugins/nics @@ -58,6 +58,9 @@ class MonitoringPluginScript(MonitoringPlugin): setup(): Perform special setup steps during node daemon startup This step is optional and should be used sparingly. + + If you wish for the plugin to not load in certain conditions, do any checks here + and return a non-None failure message to indicate the error. """ pass From 3a1b8f0e7a59b6d69304994fe7674450f719b9c2 Mon Sep 17 00:00:00 2001 From: "Joshua M. 
Boniface" Date: Wed, 15 Feb 2023 15:16:02 -0500 Subject: [PATCH 17/55] Add JSON health to cluster data --- daemon-common/ceph.py | 13 +++++++++++++ daemon-common/migrations/versions/9.json | 2 +- daemon-common/zkhandler.py | 1 + node-daemon/pvcnoded/util/keepalive.py | 23 +++++++++++++++++++++-- 4 files changed, 36 insertions(+), 3 deletions(-) diff --git a/daemon-common/ceph.py b/daemon-common/ceph.py index d269d19e..fbd43553 100644 --- a/daemon-common/ceph.py +++ b/daemon-common/ceph.py @@ -158,6 +158,19 @@ def get_status(zkhandler): return True, status_data +def get_health(zkhandler): + primary_node = zkhandler.read("base.config.primary_node") + ceph_health = zkhandler.read("base.storage.health").rstrip() + + # Create a data structure for the information + status_data = { + "type": "health", + "primary_node": primary_node, + "ceph_data": ceph_health, + } + return True, status_data + + def get_util(zkhandler): primary_node = zkhandler.read("base.config.primary_node") ceph_df = zkhandler.read("base.storage.util").rstrip() diff --git a/daemon-common/migrations/versions/9.json b/daemon-common/migrations/versions/9.json index 0401b70f..fcf5c15b 100644 --- a/daemon-common/migrations/versions/9.json +++ b/daemon-common/migrations/versions/9.json @@ -1 +1 @@ -{"version": "9", "root": "", "base": {"root": "", "schema": "/schema", "schema.version": "/schema/version", "config": "/config", "config.maintenance": "/config/maintenance", "config.primary_node": "/config/primary_node", "config.primary_node.sync_lock": "/config/primary_node/sync_lock", "config.upstream_ip": "/config/upstream_ip", "config.migration_target_selector": "/config/migration_target_selector", "cmd": "/cmd", "cmd.node": "/cmd/nodes", "cmd.domain": "/cmd/domains", "cmd.ceph": "/cmd/ceph", "logs": "/logs", "node": "/nodes", "domain": "/domains", "network": "/networks", "storage": "/ceph", "storage.util": "/ceph/util", "osd": "/ceph/osds", "pool": "/ceph/pools", "volume": "/ceph/volumes", "snapshot": "/ceph/snapshots"}, "logs": {"node": "", "messages": "/messages"}, "node": {"name": "", "keepalive": "/keepalive", "mode": "/daemonmode", "data.active_schema": "/activeschema", "data.latest_schema": "/latestschema", "data.static": "/staticdata", "data.pvc_version": "/pvcversion", "running_domains": "/runningdomains", "count.provisioned_domains": "/domainscount", "count.networks": "/networkscount", "state.daemon": "/daemonstate", "state.router": "/routerstate", "state.domain": "/domainstate", "cpu.load": "/cpuload", "vcpu.allocated": "/vcpualloc", "memory.total": "/memtotal", "memory.used": "/memused", "memory.free": "/memfree", "memory.allocated": "/memalloc", "memory.provisioned": "/memprov", "ipmi.hostname": "/ipmihostname", "ipmi.username": "/ipmiusername", "ipmi.password": "/ipmipassword", "sriov": "/sriov", "sriov.pf": "/sriov/pf", "sriov.vf": "/sriov/vf", "monitoring.plugins": "/monitoring_plugins", "monitoring.data": "/monitoring_data", "monitoring.health": "/monitoring_health"}, "monitoring_plugin": {"name": "", "last_run": "/last_run", "health_delta": "/health_delta", "message": "/message", "data": "/data", "runtime": "/runtime"}, "sriov_pf": {"phy": "", "mtu": "/mtu", "vfcount": "/vfcount"}, "sriov_vf": {"phy": "", "pf": "/pf", "mtu": "/mtu", "mac": "/mac", "phy_mac": "/phy_mac", "config": "/config", "config.vlan_id": "/config/vlan_id", "config.vlan_qos": "/config/vlan_qos", "config.tx_rate_min": "/config/tx_rate_min", "config.tx_rate_max": "/config/tx_rate_max", "config.spoof_check": "/config/spoof_check", "config.link_state": 
"/config/link_state", "config.trust": "/config/trust", "config.query_rss": "/config/query_rss", "pci": "/pci", "pci.domain": "/pci/domain", "pci.bus": "/pci/bus", "pci.slot": "/pci/slot", "pci.function": "/pci/function", "used": "/used", "used_by": "/used_by"}, "domain": {"name": "", "xml": "/xml", "state": "/state", "profile": "/profile", "stats": "/stats", "node": "/node", "last_node": "/lastnode", "failed_reason": "/failedreason", "storage.volumes": "/rbdlist", "console.log": "/consolelog", "console.vnc": "/vnc", "meta.autostart": "/node_autostart", "meta.migrate_method": "/migration_method", "meta.node_selector": "/node_selector", "meta.node_limit": "/node_limit", "meta.tags": "/tags", "migrate.sync_lock": "/migrate_sync_lock"}, "tag": {"name": "", "type": "/type", "protected": "/protected"}, "network": {"vni": "", "type": "/nettype", "mtu": "/mtu", "rule": "/firewall_rules", "rule.in": "/firewall_rules/in", "rule.out": "/firewall_rules/out", "nameservers": "/name_servers", "domain": "/domain", "reservation": "/dhcp4_reservations", "lease": "/dhcp4_leases", "ip4.gateway": "/ip4_gateway", "ip4.network": "/ip4_network", "ip4.dhcp": "/dhcp4_flag", "ip4.dhcp_start": "/dhcp4_start", "ip4.dhcp_end": "/dhcp4_end", "ip6.gateway": "/ip6_gateway", "ip6.network": "/ip6_network", "ip6.dhcp": "/dhcp6_flag"}, "reservation": {"mac": "", "ip": "/ipaddr", "hostname": "/hostname"}, "lease": {"mac": "", "ip": "/ipaddr", "hostname": "/hostname", "expiry": "/expiry", "client_id": "/clientid"}, "rule": {"description": "", "rule": "/rule", "order": "/order"}, "osd": {"id": "", "node": "/node", "device": "/device", "db_device": "/db_device", "fsid": "/fsid", "ofsid": "/fsid/osd", "cfsid": "/fsid/cluster", "lvm": "/lvm", "vg": "/lvm/vg", "lv": "/lvm/lv", "stats": "/stats"}, "pool": {"name": "", "pgs": "/pgs", "tier": "/tier", "stats": "/stats"}, "volume": {"name": "", "stats": "/stats"}, "snapshot": {"name": "", "stats": "/stats"}} \ No newline at end of file +{"version": "9", "root": "", "base": {"root": "", "schema": "/schema", "schema.version": "/schema/version", "config": "/config", "config.maintenance": "/config/maintenance", "config.primary_node": "/config/primary_node", "config.primary_node.sync_lock": "/config/primary_node/sync_lock", "config.upstream_ip": "/config/upstream_ip", "config.migration_target_selector": "/config/migration_target_selector", "cmd": "/cmd", "cmd.node": "/cmd/nodes", "cmd.domain": "/cmd/domains", "cmd.ceph": "/cmd/ceph", "logs": "/logs", "node": "/nodes", "domain": "/domains", "network": "/networks", "storage": "/ceph", "storage.health": "/ceph/health", "storage.util": "/ceph/util", "osd": "/ceph/osds", "pool": "/ceph/pools", "volume": "/ceph/volumes", "snapshot": "/ceph/snapshots"}, "logs": {"node": "", "messages": "/messages"}, "node": {"name": "", "keepalive": "/keepalive", "mode": "/daemonmode", "data.active_schema": "/activeschema", "data.latest_schema": "/latestschema", "data.static": "/staticdata", "data.pvc_version": "/pvcversion", "running_domains": "/runningdomains", "count.provisioned_domains": "/domainscount", "count.networks": "/networkscount", "state.daemon": "/daemonstate", "state.router": "/routerstate", "state.domain": "/domainstate", "cpu.load": "/cpuload", "vcpu.allocated": "/vcpualloc", "memory.total": "/memtotal", "memory.used": "/memused", "memory.free": "/memfree", "memory.allocated": "/memalloc", "memory.provisioned": "/memprov", "ipmi.hostname": "/ipmihostname", "ipmi.username": "/ipmiusername", "ipmi.password": "/ipmipassword", "sriov": "/sriov", 
"sriov.pf": "/sriov/pf", "sriov.vf": "/sriov/vf", "monitoring.plugins": "/monitoring_plugins", "monitoring.data": "/monitoring_data", "monitoring.health": "/monitoring_health"}, "monitoring_plugin": {"name": "", "last_run": "/last_run", "health_delta": "/health_delta", "message": "/message", "data": "/data", "runtime": "/runtime"}, "sriov_pf": {"phy": "", "mtu": "/mtu", "vfcount": "/vfcount"}, "sriov_vf": {"phy": "", "pf": "/pf", "mtu": "/mtu", "mac": "/mac", "phy_mac": "/phy_mac", "config": "/config", "config.vlan_id": "/config/vlan_id", "config.vlan_qos": "/config/vlan_qos", "config.tx_rate_min": "/config/tx_rate_min", "config.tx_rate_max": "/config/tx_rate_max", "config.spoof_check": "/config/spoof_check", "config.link_state": "/config/link_state", "config.trust": "/config/trust", "config.query_rss": "/config/query_rss", "pci": "/pci", "pci.domain": "/pci/domain", "pci.bus": "/pci/bus", "pci.slot": "/pci/slot", "pci.function": "/pci/function", "used": "/used", "used_by": "/used_by"}, "domain": {"name": "", "xml": "/xml", "state": "/state", "profile": "/profile", "stats": "/stats", "node": "/node", "last_node": "/lastnode", "failed_reason": "/failedreason", "storage.volumes": "/rbdlist", "console.log": "/consolelog", "console.vnc": "/vnc", "meta.autostart": "/node_autostart", "meta.migrate_method": "/migration_method", "meta.node_selector": "/node_selector", "meta.node_limit": "/node_limit", "meta.tags": "/tags", "migrate.sync_lock": "/migrate_sync_lock"}, "tag": {"name": "", "type": "/type", "protected": "/protected"}, "network": {"vni": "", "type": "/nettype", "mtu": "/mtu", "rule": "/firewall_rules", "rule.in": "/firewall_rules/in", "rule.out": "/firewall_rules/out", "nameservers": "/name_servers", "domain": "/domain", "reservation": "/dhcp4_reservations", "lease": "/dhcp4_leases", "ip4.gateway": "/ip4_gateway", "ip4.network": "/ip4_network", "ip4.dhcp": "/dhcp4_flag", "ip4.dhcp_start": "/dhcp4_start", "ip4.dhcp_end": "/dhcp4_end", "ip6.gateway": "/ip6_gateway", "ip6.network": "/ip6_network", "ip6.dhcp": "/dhcp6_flag"}, "reservation": {"mac": "", "ip": "/ipaddr", "hostname": "/hostname"}, "lease": {"mac": "", "ip": "/ipaddr", "hostname": "/hostname", "expiry": "/expiry", "client_id": "/clientid"}, "rule": {"description": "", "rule": "/rule", "order": "/order"}, "osd": {"id": "", "node": "/node", "device": "/device", "db_device": "/db_device", "fsid": "/fsid", "ofsid": "/fsid/osd", "cfsid": "/fsid/cluster", "lvm": "/lvm", "vg": "/lvm/vg", "lv": "/lvm/lv", "stats": "/stats"}, "pool": {"name": "", "pgs": "/pgs", "tier": "/tier", "stats": "/stats"}, "volume": {"name": "", "stats": "/stats"}, "snapshot": {"name": "", "stats": "/stats"}} \ No newline at end of file diff --git a/daemon-common/zkhandler.py b/daemon-common/zkhandler.py index a52c4ec1..d5494ed4 100644 --- a/daemon-common/zkhandler.py +++ b/daemon-common/zkhandler.py @@ -569,6 +569,7 @@ class ZKSchema(object): "domain": f"{_schema_root}/domains", "network": f"{_schema_root}/networks", "storage": f"{_schema_root}/ceph", + "storage.health": f"{_schema_root}/ceph/health", "storage.util": f"{_schema_root}/ceph/util", "osd": f"{_schema_root}/ceph/osds", "pool": f"{_schema_root}/ceph/pools", diff --git a/node-daemon/pvcnoded/util/keepalive.py b/node-daemon/pvcnoded/util/keepalive.py index dc6e6a33..c75a0e23 100644 --- a/node-daemon/pvcnoded/util/keepalive.py +++ b/node-daemon/pvcnoded/util/keepalive.py @@ -99,9 +99,10 @@ def collect_ceph_stats(logger, config, zkhandler, this_node, queue): # Primary-only functions if 
this_node.router_state == "primary": + # Get Ceph status information (pretty) if debug: logger.out( - "Set ceph health information in zookeeper (primary only)", + "Set Ceph status information in zookeeper (primary only)", state="d", prefix="ceph-thread", ) @@ -115,9 +116,27 @@ def collect_ceph_stats(logger, config, zkhandler, this_node, queue): except Exception as e: logger.out("Failed to set Ceph status data: {}".format(e), state="e") + # Get Ceph health information (JSON) if debug: logger.out( - "Set ceph rados df information in zookeeper (primary only)", + "Set Ceph health information in zookeeper (primary only)", + state="d", + prefix="ceph-thread", + ) + + command = {"prefix": "health", "format": "json"} + ceph_health = ceph_conn.mon_command(json.dumps(command), b"", timeout=1)[ + 1 + ].decode("ascii") + try: + zkhandler.write([("base.storage.health", str(ceph_health))]) + except Exception as e: + logger.out("Failed to set Ceph health data: {}".format(e), state="e") + + # Get Ceph df information (pretty) + if debug: + logger.out( + "Set Ceph rados df information in zookeeper (primary only)", state="d", prefix="ceph-thread", ) From 6ac5b0d02f3c52168f64906976273377dd9f41b2 Mon Sep 17 00:00:00 2001 From: "Joshua M. Boniface" Date: Wed, 15 Feb 2023 15:45:43 -0500 Subject: [PATCH 18/55] Modify cluster health to use new values --- client-cli/pvc/cli_lib/cluster.py | 72 ++------- daemon-common/cluster.py | 254 +++++++++++++----------------- 2 files changed, 127 insertions(+), 199 deletions(-) diff --git a/client-cli/pvc/cli_lib/cluster.py b/client-cli/pvc/cli_lib/cluster.py index af3e54e6..33180c68 100644 --- a/client-cli/pvc/cli_lib/cluster.py +++ b/client-cli/pvc/cli_lib/cluster.py @@ -125,82 +125,42 @@ def format_info(cluster_information, oformat): return json.dumps(cluster_information, indent=4) # Plain formatting, i.e. 
human-readable - if cluster_information["health"] == "Optimal": - health_colour = ansiprint.green() - elif cluster_information["health"] == "Maintenance": + if cluster_information["maintenance"] == "True": health_colour = ansiprint.blue() - else: + elif cluster_information["health"] > 90: + health_colour = ansiprint.green() + elif cluster_information["health"] > 50: health_colour = ansiprint.yellow() - - if cluster_information["storage_health"] == "Optimal": - storage_health_colour = ansiprint.green() - elif cluster_information["storage_health"] == "Maintenance": - storage_health_colour = ansiprint.blue() else: - storage_health_colour = ansiprint.yellow() + health_colour = ansiprint.red() ainformation = [] - if oformat == "short": - ainformation.append( - "{}PVC cluster status:{}".format(ansiprint.bold(), ansiprint.end()) - ) - ainformation.append( - "{}Cluster health:{} {}{}{}".format( - ansiprint.purple(), - ansiprint.end(), - health_colour, - cluster_information["health"], - ansiprint.end(), - ) - ) - if cluster_information["health_msg"]: - for line in cluster_information["health_msg"]: - ainformation.append(" > {}".format(line)) - ainformation.append( - "{}Storage health:{} {}{}{}".format( - ansiprint.purple(), - ansiprint.end(), - storage_health_colour, - cluster_information["storage_health"], - ansiprint.end(), - ) - ) - if cluster_information["storage_health_msg"]: - for line in cluster_information["storage_health_msg"]: - ainformation.append(" > {}".format(line)) - - return "\n".join(ainformation) - ainformation.append( "{}PVC cluster status:{}".format(ansiprint.bold(), ansiprint.end()) ) ainformation.append("") + + health_text = f"{cluster_information['health']}%" + if cluster_information["maintenance"] == "True": + health_text += " (maintenance on)" + ainformation.append( "{}Cluster health:{} {}{}{}".format( ansiprint.purple(), ansiprint.end(), health_colour, - cluster_information["health"], + health_text, ansiprint.end(), ) ) - if cluster_information["health_msg"]: - for line in cluster_information["health_msg"]: - ainformation.append(" > {}".format(line)) - ainformation.append( - "{}Storage health:{} {}{}{}".format( - ansiprint.purple(), - ansiprint.end(), - storage_health_colour, - cluster_information["storage_health"], - ansiprint.end(), - ) - ) - if cluster_information["storage_health_msg"]: - for line in cluster_information["storage_health_msg"]: + if cluster_information["health_messages"]: + for line in cluster_information["health_messages"]: ainformation.append(" > {}".format(line)) + if oformat == "short": + return "\n".join(ainformation) + ainformation.append("") ainformation.append( "{}Primary node:{} {}".format( diff --git a/daemon-common/cluster.py b/daemon-common/cluster.py index 6fe1cdaa..6424c2c8 100644 --- a/daemon-common/cluster.py +++ b/daemon-common/cluster.py @@ -19,7 +19,7 @@ # ############################################################################### -import re +from json import loads import daemon_lib.common as common import daemon_lib.vm as pvc_vm @@ -44,13 +44,99 @@ def set_maintenance(zkhandler, maint_state): return True, "Successfully set cluster in normal mode" +def getClusterHealth(zkhandler, node_list, vm_list, ceph_osd_list): + health_delta_map = { + 'node_stopped': 50, + 'node_flushed': 10, + 'vm_stopped': 10, + 'osd_out': 50, + 'osd_down': 10, + 'memory_overprovisioned': 50, + 'ceph_err': 50, + 'ceph_warn': 10, + } + + # Generate total cluster health numbers + cluster_health = 100 + messages = list() + + for index, node in 
enumerate(node_list): + # Apply node health values to total health number + cluster_health -= 100 - node['health'] + for entry in node['health_details']: + if entry['health_delta'] > 0: + messages.append(f"{node['name']}: plugin {entry['plugin_name']}: {entry['message']}") + + # Handle unhealthy node states + if node['daemon_state'] not in ['run']: + cluster_health -= health_delta_map['node_stopped'] + messages.append(f"cluster: {node['name']} in {node['daemon_state']} daemon state") + elif node['domain_state'] not in ['ready']: + cluster_health -= health_delta_map['node_flushed'] + messages.append(f"cluster: {node['name']} in {node['domain_state']} domain state") + + for index, vm in enumerate(vm_list): + # Handle unhealthy VM states + if vm['state'] not in ["start", "disable", "migrate", "unmigrate", "provision"]: + cluster_health -= health_delta_map['vm_stopped'] + messages.append(f"cluster: {vm['name']} in {vm['state']} state") + + for index, ceph_osd in enumerate(ceph_osd_list): + in_texts = {1: "in", 0: "out"} + up_texts = {1: "up", 0: "down"} + + # Handle unhealthy OSD states + if in_texts[ceph_osd["stats"]["in"]] not in ["in"]: + cluster_health -= health_delta_map['osd_out'] + messages.append(f"cluster: OSD {ceph_osd['id']} in {in_texts[ceph_osd['stats']['in']]} state") + elif up_texts[ceph_osd["stats"]["up"]] not in ['up']: + cluster_health -= health_delta_map['osd_down'] + messages.append(f"cluster: OSD {ceph_osd['id']} in {up_texts[ceph_osd['stats']['up']]} state") + + # Check for (n-1) overprovisioning + # Assume X nodes. If the total VM memory allocation (counting only running VMss) is greater than + # the total memory of the (n-1) smallest nodes, trigger this warning. + n_minus_1_total = 0 + alloc_total = 0 + node_largest_index = None + node_largest_count = 0 + for index, node in enumerate(node_list): + node_mem_total = node["memory"]["total"] + node_mem_alloc = node["memory"]["allocated"] + alloc_total += node_mem_alloc + # Determine if this node is the largest seen so far + if node_mem_total > node_largest_count: + node_largest_index = index + node_largest_count = node_mem_total + n_minus_1_node_list = list() + for index, node in enumerate(node_list): + if index == node_largest_index: + continue + n_minus_1_node_list.append(node) + for index, node in enumerate(n_minus_1_node_list): + n_minus_1_total += node["memory"]["total"] + if alloc_total > n_minus_1_total: + cluster_health -= health_delta_map['memory_overprovisioned'] + messages.append(f"cluster: Total VM memory is overprovisioned ({alloc_total} > {n_minus_1_total} n-1)") + + # Check Ceph cluster health + ceph_health = loads(zkhandler.read("base.storage.health")) + ceph_health_status = ceph_health["status"] + ceph_health_entries = ceph_health["checks"].keys() + + if ceph_health_status == 'HEALTH_ERR': + cluster_health -= health_delta_map['ceph_err'] + messages.append(f"cluster: Ceph cluster in ERROR state: {', '.join(ceph_health_entries)}") + elif ceph_health_status == 'HEALTH_WARN': + cluster_health -= health_delta_map['ceph_warn'] + messages.append(f"cluster: Ceph cluster in WARNING state: {', '.join(ceph_health_entries)}") + + return cluster_health, messages + + def getClusterInformation(zkhandler): # Get cluster maintenance state - maint_state = zkhandler.read("base.config.maintenance") - - # List of messages to display to the clients - cluster_health_msg = [] - storage_health_msg = [] + maintenance_state = zkhandler.read("base.config.maintenance") # Get node information object list retcode, node_list = 
pvc_node.get_list(zkhandler, None) @@ -78,135 +164,6 @@ def getClusterInformation(zkhandler): ceph_volume_count = len(ceph_volume_list) ceph_snapshot_count = len(ceph_snapshot_list) - # Determinations for general cluster health - cluster_healthy_status = True - # Check for (n-1) overprovisioning - # Assume X nodes. If the total VM memory allocation (counting only running VMss) is greater than - # the total memory of the (n-1) smallest nodes, trigger this warning. - n_minus_1_total = 0 - alloc_total = 0 - - node_largest_index = None - node_largest_count = 0 - for index, node in enumerate(node_list): - node_mem_total = node["memory"]["total"] - node_mem_alloc = node["memory"]["allocated"] - alloc_total += node_mem_alloc - - # Determine if this node is the largest seen so far - if node_mem_total > node_largest_count: - node_largest_index = index - node_largest_count = node_mem_total - n_minus_1_node_list = list() - for index, node in enumerate(node_list): - if index == node_largest_index: - continue - n_minus_1_node_list.append(node) - for index, node in enumerate(n_minus_1_node_list): - n_minus_1_total += node["memory"]["total"] - if alloc_total > n_minus_1_total: - cluster_healthy_status = False - cluster_health_msg.append( - "Total VM memory ({}) is overprovisioned (max {}) for (n-1) failure scenarios".format( - alloc_total, n_minus_1_total - ) - ) - - # Determinations for node health - node_healthy_status = list(range(0, node_count)) - node_report_status = list(range(0, node_count)) - for index, node in enumerate(node_list): - daemon_state = node["daemon_state"] - domain_state = node["domain_state"] - if daemon_state != "run" and domain_state != "ready": - node_healthy_status[index] = False - cluster_health_msg.append( - "Node '{}' in {},{} state".format( - node["name"], daemon_state, domain_state - ) - ) - else: - node_healthy_status[index] = True - node_report_status[index] = daemon_state + "," + domain_state - - # Determinations for VM health - vm_healthy_status = list(range(0, vm_count)) - vm_report_status = list(range(0, vm_count)) - for index, vm in enumerate(vm_list): - vm_state = vm["state"] - if vm_state not in ["start", "disable", "migrate", "unmigrate", "provision"]: - vm_healthy_status[index] = False - cluster_health_msg.append( - "VM '{}' in {} state".format(vm["name"], vm_state) - ) - else: - vm_healthy_status[index] = True - vm_report_status[index] = vm_state - - # Determinations for OSD health - ceph_osd_healthy_status = list(range(0, ceph_osd_count)) - ceph_osd_report_status = list(range(0, ceph_osd_count)) - for index, ceph_osd in enumerate(ceph_osd_list): - try: - ceph_osd_up = ceph_osd["stats"]["up"] - except KeyError: - ceph_osd_up = 0 - - try: - ceph_osd_in = ceph_osd["stats"]["in"] - except KeyError: - ceph_osd_in = 0 - - up_texts = {1: "up", 0: "down"} - in_texts = {1: "in", 0: "out"} - - if not ceph_osd_up or not ceph_osd_in: - ceph_osd_healthy_status[index] = False - cluster_health_msg.append( - "OSD {} in {},{} state".format( - ceph_osd["id"], up_texts[ceph_osd_up], in_texts[ceph_osd_in] - ) - ) - else: - ceph_osd_healthy_status[index] = True - ceph_osd_report_status[index] = ( - up_texts[ceph_osd_up] + "," + in_texts[ceph_osd_in] - ) - - # Find out the overall cluster health; if any element of a healthy_status is false, it's unhealthy - if maint_state == "true": - cluster_health = "Maintenance" - elif ( - cluster_healthy_status is False - or False in node_healthy_status - or False in vm_healthy_status - or False in ceph_osd_healthy_status - ): - cluster_health 
= "Degraded" - else: - cluster_health = "Optimal" - - # Find out our storage health from Ceph - ceph_status = zkhandler.read("base.storage").split("\n") - ceph_health = ceph_status[2].split()[-1] - - # Parse the status output to get the health indicators - line_record = False - for index, line in enumerate(ceph_status): - if re.search("services:", line): - line_record = False - if line_record and len(line.strip()) > 0: - storage_health_msg.append(line.strip()) - if re.search("health:", line): - line_record = True - - if maint_state == "true": - storage_health = "Maintenance" - elif ceph_health != "HEALTH_OK": - storage_health = "Degraded" - else: - storage_health = "Optimal" - # State lists node_state_combinations = [ "run,ready", @@ -237,13 +194,19 @@ def getClusterInformation(zkhandler): "unmigrate", "provision", ] - ceph_osd_state_combinations = ["up,in", "up,out", "down,in", "down,out"] + ceph_osd_state_combinations = [ + "up,in", + "up,out", + "down,in", + "down,out", + ] # Format the Node states formatted_node_states = {"total": node_count} for state in node_state_combinations: state_count = 0 - for node_state in node_report_status: + for node in node_list: + node_state = f"{node['daemon_state']},{node['domain_state']}" if node_state == state: state_count += 1 if state_count > 0: @@ -253,28 +216,33 @@ def getClusterInformation(zkhandler): formatted_vm_states = {"total": vm_count} for state in vm_state_combinations: state_count = 0 - for vm_state in vm_report_status: - if vm_state == state: + for vm in vm_list: + if vm["state"] == state: state_count += 1 if state_count > 0: formatted_vm_states[state] = state_count # Format the OSD states + up_texts = {1: "up", 0: "down"} + in_texts = {1: "in", 0: "out"} formatted_osd_states = {"total": ceph_osd_count} for state in ceph_osd_state_combinations: state_count = 0 - for ceph_osd_state in ceph_osd_report_status: + for ceph_osd in ceph_osd_list: + ceph_osd_state = f"{up_texts[ceph_osd['stats']['up']]},{in_texts[ceph_osd['stats']['in']]}" if ceph_osd_state == state: state_count += 1 if state_count > 0: formatted_osd_states[state] = state_count + # Get cluster health data + cluster_health, cluster_health_messages = getClusterHealth(zkhandler, node_list, vm_list, ceph_osd_list) + # Format the status data cluster_information = { "health": cluster_health, - "health_msg": cluster_health_msg, - "storage_health": storage_health, - "storage_health_msg": storage_health_msg, + "health_messages": cluster_health_messages, + "maintenance": maintenance_state, "primary_node": common.getPrimaryNode(zkhandler), "upstream_ip": zkhandler.read("base.config.upstream_ip"), "nodes": formatted_node_states, From 8b5011c266b944015161c82962b5f935cf127109 Mon Sep 17 00:00:00 2001 From: "Joshua M. Boniface" Date: Wed, 15 Feb 2023 15:46:13 -0500 Subject: [PATCH 19/55] Move Ceph health to global cluster health --- node-daemon/plugins/ceph | 129 --------------------------------------- 1 file changed, 129 deletions(-) delete mode 100644 node-daemon/plugins/ceph diff --git a/node-daemon/plugins/ceph b/node-daemon/plugins/ceph deleted file mode 100644 index aaeab8b4..00000000 --- a/node-daemon/plugins/ceph +++ /dev/null @@ -1,129 +0,0 @@ -#!/usr/bin/env python3 - -# ceph.py - PVC Monitoring example plugin for Ceph status -# Part of the Parallel Virtual Cluster (PVC) system -# -# Copyright (C) 2018-2022 Joshua M. 
Boniface -# -# This program is free software: you can redistribute it and/or modify -# it under the terms of the GNU General Public License as published by -# the Free Software Foundation, version 3. -# -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. -# -# You should have received a copy of the GNU General Public License -# along with this program. If not, see . -# -############################################################################### - -# This script provides an example of a PVC monitoring plugin script. It will create -# a simple plugin to check the Ceph cluster health for anomalies, and return a health -# delta reflective of the overall Ceph status (HEALTH_WARN = 10, HEALTH_ERR = 50). - -# This script can thus be used as an example or reference implementation of a -# PVC monitoring pluginscript and expanded upon as required. - -# A monitoring plugin script must implement the class "MonitoringPluginScript" which -# extends "MonitoringPlugin", providing the 3 functions indicated. Detailed explanation -# of the role of each function is provided in context of the example; see the other -# examples for more potential uses. - -# WARNING: -# -# This script will run in the context of the node daemon keepalives as root. -# DO NOT install untrusted, unvetted plugins under any circumstances. - - -# This import is always required here, as MonitoringPlugin is used by the -# MonitoringPluginScript class -from pvcnoded.objects.MonitoringInstance import MonitoringPlugin - - -# A monitoring plugin script must always expose its nice name, which must be identical to -# the file name -PLUGIN_NAME = "ceph" - - -# The MonitoringPluginScript class must be named as such, and extend MonitoringPlugin. -class MonitoringPluginScript(MonitoringPlugin): - def setup(self): - """ - setup(): Perform special setup steps during node daemon startup - - This step is optional and should be used sparingly. - - If you wish for the plugin to not load in certain conditions, do any checks here - and return a non-None failure message to indicate the error. 
- """ - - pass - - def run(self): - """ - run(): Perform the check actions and return a PluginResult object - """ - - # Run any imports first - from rados import Rados - from json import loads, dumps - - # Connect to the Ceph cluster - try: - ceph_conn = Rados( - conffile=self.config["ceph_config_file"], - conf=dict(keyring=self.config["ceph_admin_keyring"]), - ) - ceph_conn.connect(timeout=1) - except Exception as e: - self.log(f"Failed to connect to Ceph cluster: {e}", state="e") - return self.plugin_result - - # Get the Ceph cluster health - try: - health_status = loads( - ceph_conn.mon_command(dumps({"prefix": "health", "format": "json"}), b"", timeout=1)[1] - ) - ceph_health = health_status["status"] - except Exception as e: - self.log(f"Failed to get health data from Ceph cluster: {e}", state="e") - return self.plugin_result - finally: - ceph_conn.shutdown() - - # Get a list of error entries in the health status output - error_entries = health_status["checks"].keys() - - # Set the health delta based on the errors presented - if ceph_health == "HEALTH_ERR": - health_delta = 50 - message = f"Ceph cluster in ERROR state: {', '.join(error_entries)}" - elif ceph_health == "HEALTH_WARN": - health_delta = 10 - message = f"Ceph cluster in WARNING state: {', '.join(error_entries)}" - else: - health_delta = 0 - message = "Ceph cluster in OK state" - - # Set the health delta in our local PluginResult object - self.plugin_result.set_health_delta(health_delta) - - # Set the message in our local PluginResult object - self.plugin_result.set_message(message) - - # Set the detailed data in our local PluginResult object - self.plugin_result.set_data(health_status) - - # Return our local PluginResult object - return self.plugin_result - - def cleanup(self): - """ - cleanup(): Perform special cleanup steps during node daemon termination - - This step is optional and should be used sparingly. - """ - - pass From 0ae77d7e7756fa148cff5494078e6bf7f1d39450 Mon Sep 17 00:00:00 2001 From: "Joshua M. 
Boniface" Date: Wed, 15 Feb 2023 15:48:31 -0500 Subject: [PATCH 20/55] Fix linting of cluster.py file --- daemon-common/cluster.py | 86 ++++++++++++++++++++++++---------------- 1 file changed, 52 insertions(+), 34 deletions(-) diff --git a/daemon-common/cluster.py b/daemon-common/cluster.py index 6424c2c8..ed285f92 100644 --- a/daemon-common/cluster.py +++ b/daemon-common/cluster.py @@ -46,14 +46,14 @@ def set_maintenance(zkhandler, maint_state): def getClusterHealth(zkhandler, node_list, vm_list, ceph_osd_list): health_delta_map = { - 'node_stopped': 50, - 'node_flushed': 10, - 'vm_stopped': 10, - 'osd_out': 50, - 'osd_down': 10, - 'memory_overprovisioned': 50, - 'ceph_err': 50, - 'ceph_warn': 10, + "node_stopped": 50, + "node_flushed": 10, + "vm_stopped": 10, + "osd_out": 50, + "osd_down": 10, + "memory_overprovisioned": 50, + "ceph_err": 50, + "ceph_warn": 10, } # Generate total cluster health numbers @@ -62,23 +62,29 @@ def getClusterHealth(zkhandler, node_list, vm_list, ceph_osd_list): for index, node in enumerate(node_list): # Apply node health values to total health number - cluster_health -= 100 - node['health'] - for entry in node['health_details']: - if entry['health_delta'] > 0: - messages.append(f"{node['name']}: plugin {entry['plugin_name']}: {entry['message']}") + cluster_health -= 100 - node["health"] + for entry in node["health_details"]: + if entry["health_delta"] > 0: + messages.append( + f"{node['name']}: plugin {entry['plugin_name']}: {entry['message']}" + ) # Handle unhealthy node states - if node['daemon_state'] not in ['run']: - cluster_health -= health_delta_map['node_stopped'] - messages.append(f"cluster: {node['name']} in {node['daemon_state']} daemon state") - elif node['domain_state'] not in ['ready']: - cluster_health -= health_delta_map['node_flushed'] - messages.append(f"cluster: {node['name']} in {node['domain_state']} domain state") + if node["daemon_state"] not in ["run"]: + cluster_health -= health_delta_map["node_stopped"] + messages.append( + f"cluster: {node['name']} in {node['daemon_state']} daemon state" + ) + elif node["domain_state"] not in ["ready"]: + cluster_health -= health_delta_map["node_flushed"] + messages.append( + f"cluster: {node['name']} in {node['domain_state']} domain state" + ) for index, vm in enumerate(vm_list): # Handle unhealthy VM states - if vm['state'] not in ["start", "disable", "migrate", "unmigrate", "provision"]: - cluster_health -= health_delta_map['vm_stopped'] + if vm["state"] not in ["start", "disable", "migrate", "unmigrate", "provision"]: + cluster_health -= health_delta_map["vm_stopped"] messages.append(f"cluster: {vm['name']} in {vm['state']} state") for index, ceph_osd in enumerate(ceph_osd_list): @@ -87,11 +93,15 @@ def getClusterHealth(zkhandler, node_list, vm_list, ceph_osd_list): # Handle unhealthy OSD states if in_texts[ceph_osd["stats"]["in"]] not in ["in"]: - cluster_health -= health_delta_map['osd_out'] - messages.append(f"cluster: OSD {ceph_osd['id']} in {in_texts[ceph_osd['stats']['in']]} state") - elif up_texts[ceph_osd["stats"]["up"]] not in ['up']: - cluster_health -= health_delta_map['osd_down'] - messages.append(f"cluster: OSD {ceph_osd['id']} in {up_texts[ceph_osd['stats']['up']]} state") + cluster_health -= health_delta_map["osd_out"] + messages.append( + f"cluster: OSD {ceph_osd['id']} in {in_texts[ceph_osd['stats']['in']]} state" + ) + elif up_texts[ceph_osd["stats"]["up"]] not in ["up"]: + cluster_health -= health_delta_map["osd_down"] + messages.append( + f"cluster: OSD {ceph_osd['id']} in 
{up_texts[ceph_osd['stats']['up']]} state" + ) # Check for (n-1) overprovisioning # Assume X nodes. If the total VM memory allocation (counting only running VMss) is greater than @@ -116,20 +126,26 @@ def getClusterHealth(zkhandler, node_list, vm_list, ceph_osd_list): for index, node in enumerate(n_minus_1_node_list): n_minus_1_total += node["memory"]["total"] if alloc_total > n_minus_1_total: - cluster_health -= health_delta_map['memory_overprovisioned'] - messages.append(f"cluster: Total VM memory is overprovisioned ({alloc_total} > {n_minus_1_total} n-1)") + cluster_health -= health_delta_map["memory_overprovisioned"] + messages.append( + f"cluster: Total VM memory is overprovisioned ({alloc_total} > {n_minus_1_total} n-1)" + ) # Check Ceph cluster health ceph_health = loads(zkhandler.read("base.storage.health")) ceph_health_status = ceph_health["status"] ceph_health_entries = ceph_health["checks"].keys() - if ceph_health_status == 'HEALTH_ERR': - cluster_health -= health_delta_map['ceph_err'] - messages.append(f"cluster: Ceph cluster in ERROR state: {', '.join(ceph_health_entries)}") - elif ceph_health_status == 'HEALTH_WARN': - cluster_health -= health_delta_map['ceph_warn'] - messages.append(f"cluster: Ceph cluster in WARNING state: {', '.join(ceph_health_entries)}") + if ceph_health_status == "HEALTH_ERR": + cluster_health -= health_delta_map["ceph_err"] + messages.append( + f"cluster: Ceph cluster in ERROR state: {', '.join(ceph_health_entries)}" + ) + elif ceph_health_status == "HEALTH_WARN": + cluster_health -= health_delta_map["ceph_warn"] + messages.append( + f"cluster: Ceph cluster in WARNING state: {', '.join(ceph_health_entries)}" + ) return cluster_health, messages @@ -236,7 +252,9 @@ def getClusterInformation(zkhandler): formatted_osd_states[state] = state_count # Get cluster health data - cluster_health, cluster_health_messages = getClusterHealth(zkhandler, node_list, vm_list, ceph_osd_list) + cluster_health, cluster_health_messages = getClusterHealth( + zkhandler, node_list, vm_list, ceph_osd_list + ) # Format the status data cluster_information = { From b236127dba6fdc6291e4285f31a10debfba7599d Mon Sep 17 00:00:00 2001 From: "Joshua M. Boniface" Date: Wed, 15 Feb 2023 16:28:41 -0500 Subject: [PATCH 21/55] Remove extra text from packages plugin --- node-daemon/plugins/dpkg | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/node-daemon/plugins/dpkg b/node-daemon/plugins/dpkg index f27e35ae..f083318b 100644 --- a/node-daemon/plugins/dpkg +++ b/node-daemon/plugins/dpkg @@ -133,7 +133,7 @@ class MonitoringPluginScript(MonitoringPlugin): self.plugin_result.set_health_delta(health_delta) # Craft the message - message = f"Debian {debian_version}; Obsolete conffiles: {count_obsolete_conffiles}; Packages valid: {count_ok}, inconsistent: {count_inconsistent}, upgradable: {count_upgradable}" + message = f"Debian {debian_version}; Obsolete conffiles: {count_obsolete_conffiles}; Packages inconsistent: {count_inconsistent}, upgradable: {count_upgradable}" # Set the message in our local PluginResult object self.plugin_result.set_message(message) From fa900f6212f23e4a87362fb52b0a00ef7a6ad110 Mon Sep 17 00:00:00 2001 From: "Joshua M. 
Boniface" Date: Wed, 15 Feb 2023 16:28:56 -0500 Subject: [PATCH 22/55] Fix bugs and formatting of health messages --- client-cli/pvc/cli_lib/cluster.py | 21 +++++++++++++++--- daemon-common/cluster.py | 36 +++++++++++++++++++------------ 2 files changed, 40 insertions(+), 17 deletions(-) diff --git a/client-cli/pvc/cli_lib/cluster.py b/client-cli/pvc/cli_lib/cluster.py index 33180c68..0e2cbf22 100644 --- a/client-cli/pvc/cli_lib/cluster.py +++ b/client-cli/pvc/cli_lib/cluster.py @@ -146,7 +146,7 @@ def format_info(cluster_information, oformat): health_text += " (maintenance on)" ainformation.append( - "{}Cluster health:{} {}{}{}".format( + "{}Cluster health:{} {}{}{}".format( ansiprint.purple(), ansiprint.end(), health_colour, @@ -155,8 +155,23 @@ def format_info(cluster_information, oformat): ) ) if cluster_information["health_messages"]: - for line in cluster_information["health_messages"]: - ainformation.append(" > {}".format(line)) + health_messages = "\n > ".join( + sorted(cluster_information["health_messages"]) + ) + ainformation.append( + "{}Health messages:{} > {}".format( + ansiprint.purple(), + ansiprint.end(), + health_messages, + ) + ) + else: + ainformation.append( + "{}Health messages:{} N/A".format( + ansiprint.purple(), + ansiprint.end(), + ) + ) if oformat == "short": return "\n".join(ainformation) diff --git a/daemon-common/cluster.py b/daemon-common/cluster.py index ed285f92..57565c93 100644 --- a/daemon-common/cluster.py +++ b/daemon-common/cluster.py @@ -62,30 +62,35 @@ def getClusterHealth(zkhandler, node_list, vm_list, ceph_osd_list): for index, node in enumerate(node_list): # Apply node health values to total health number - cluster_health -= 100 - node["health"] + try: + node_health_int = int(node["health"]) + except Exception: + node_health_int = 100 + cluster_health -= 100 - node_health_int + for entry in node["health_details"]: if entry["health_delta"] > 0: messages.append( - f"{node['name']}: plugin {entry['plugin_name']}: {entry['message']}" + f"{node['name']}: plugin '{entry['name']}': {entry['message']}" ) # Handle unhealthy node states if node["daemon_state"] not in ["run"]: cluster_health -= health_delta_map["node_stopped"] messages.append( - f"cluster: {node['name']} in {node['daemon_state']} daemon state" + f"cluster: Node {node['name']} in {node['daemon_state'].upper()} daemon state" ) elif node["domain_state"] not in ["ready"]: cluster_health -= health_delta_map["node_flushed"] messages.append( - f"cluster: {node['name']} in {node['domain_state']} domain state" + f"cluster: Node {node['name']} in {node['domain_state'].upper()} domain state" ) for index, vm in enumerate(vm_list): # Handle unhealthy VM states if vm["state"] not in ["start", "disable", "migrate", "unmigrate", "provision"]: cluster_health -= health_delta_map["vm_stopped"] - messages.append(f"cluster: {vm['name']} in {vm['state']} state") + messages.append(f"cluster: VM {vm['name']} in {vm['state'].upper()} state") for index, ceph_osd in enumerate(ceph_osd_list): in_texts = {1: "in", 0: "out"} @@ -95,12 +100,12 @@ def getClusterHealth(zkhandler, node_list, vm_list, ceph_osd_list): if in_texts[ceph_osd["stats"]["in"]] not in ["in"]: cluster_health -= health_delta_map["osd_out"] messages.append( - f"cluster: OSD {ceph_osd['id']} in {in_texts[ceph_osd['stats']['in']]} state" + f"cluster: Ceph OSD {ceph_osd['id']} in {in_texts[ceph_osd['stats']['in']].upper()} state" ) elif up_texts[ceph_osd["stats"]["up"]] not in ["up"]: cluster_health -= health_delta_map["osd_down"] messages.append( - 
f"cluster: OSD {ceph_osd['id']} in {up_texts[ceph_osd['stats']['up']]} state" + f"cluster: Ceph OSD {ceph_osd['id']} in {up_texts[ceph_osd['stats']['up']].upper()} state" ) # Check for (n-1) overprovisioning @@ -128,7 +133,7 @@ def getClusterHealth(zkhandler, node_list, vm_list, ceph_osd_list): if alloc_total > n_minus_1_total: cluster_health -= health_delta_map["memory_overprovisioned"] messages.append( - f"cluster: Total VM memory is overprovisioned ({alloc_total} > {n_minus_1_total} n-1)" + f"cluster: Total memory is OVERPROVISIONED ({alloc_total} > {n_minus_1_total} @ N-1)" ) # Check Ceph cluster health @@ -136,16 +141,19 @@ def getClusterHealth(zkhandler, node_list, vm_list, ceph_osd_list): ceph_health_status = ceph_health["status"] ceph_health_entries = ceph_health["checks"].keys() + ceph_health_status_map = { + "HEALTH_ERR": "ERROR", + "HEALTH_WARN": "WARNING", + } + for entry in ceph_health_entries: + messages.append( + f"cluster: Ceph {ceph_health_status_map[ceph_health['checks'][entry]['severity']]} {entry}: {ceph_health['checks'][entry]['summary']['message']}" + ) + if ceph_health_status == "HEALTH_ERR": cluster_health -= health_delta_map["ceph_err"] - messages.append( - f"cluster: Ceph cluster in ERROR state: {', '.join(ceph_health_entries)}" - ) elif ceph_health_status == "HEALTH_WARN": cluster_health -= health_delta_map["ceph_warn"] - messages.append( - f"cluster: Ceph cluster in WARNING state: {', '.join(ceph_health_entries)}" - ) return cluster_health, messages From 3408e27355f42ec4680bb926b1193b2174891f29 Mon Sep 17 00:00:00 2001 From: "Joshua M. Boniface" Date: Wed, 15 Feb 2023 16:42:42 -0500 Subject: [PATCH 23/55] Add per-node health entries for 3rd party checks --- daemon-common/cluster.py | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/daemon-common/cluster.py b/daemon-common/cluster.py index 57565c93..d5dd9ca5 100644 --- a/daemon-common/cluster.py +++ b/daemon-common/cluster.py @@ -158,6 +158,25 @@ def getClusterHealth(zkhandler, node_list, vm_list, ceph_osd_list): return cluster_health, messages +def getNodeHealth(zkhandler, node_list): + node_health = dict() + for index, node in enumerate(node_list): + node_health_messages = list() + node_health_value = node["health"] + for entry in node["health_details"]: + if entry["health_delta"] > 0: + node_health_messages.append(f"'{entry['name']}': {entry['message']}") + + node_health_entry = { + "health": node_health_value, + "messages": node_health_messages, + } + + node_health[node["name"]] = node_health_entry + + return node_health + + def getClusterInformation(zkhandler): # Get cluster maintenance state maintenance_state = zkhandler.read("base.config.maintenance") @@ -268,6 +287,7 @@ def getClusterInformation(zkhandler): cluster_information = { "health": cluster_health, "health_messages": cluster_health_messages, + "node_health": getNodeHealth(zkhandler, node_list), "maintenance": maintenance_state, "primary_node": common.getPrimaryNode(zkhandler), "upstream_ip": zkhandler.read("base.config.upstream_ip"), From 21965d280c2669117bf208ee6082a459b8c474ac Mon Sep 17 00:00:00 2001 From: "Joshua M. 
Boniface" Date: Wed, 15 Feb 2023 16:46:27 -0500 Subject: [PATCH 24/55] Fix comparison in maintenance check --- client-cli/pvc/cli_lib/cluster.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/client-cli/pvc/cli_lib/cluster.py b/client-cli/pvc/cli_lib/cluster.py index 0e2cbf22..a20bdf77 100644 --- a/client-cli/pvc/cli_lib/cluster.py +++ b/client-cli/pvc/cli_lib/cluster.py @@ -125,7 +125,7 @@ def format_info(cluster_information, oformat): return json.dumps(cluster_information, indent=4) # Plain formatting, i.e. human-readable - if cluster_information["maintenance"] == "True": + if cluster_information["maintenance"] == "true": health_colour = ansiprint.blue() elif cluster_information["health"] > 90: health_colour = ansiprint.green() @@ -142,7 +142,7 @@ def format_info(cluster_information, oformat): ainformation.append("") health_text = f"{cluster_information['health']}%" - if cluster_information["maintenance"] == "True": + if cluster_information["maintenance"] == "true": health_text += " (maintenance on)" ainformation.append( From 4ab0bdd9e83fde3b3ce1ba455d512dbbd924ed57 Mon Sep 17 00:00:00 2001 From: "Joshua M. Boniface" Date: Wed, 15 Feb 2023 16:49:12 -0500 Subject: [PATCH 25/55] Disallow health less than 0 --- daemon-common/cluster.py | 3 +++ node-daemon/pvcnoded/objects/MonitoringInstance.py | 3 +++ 2 files changed, 6 insertions(+) diff --git a/daemon-common/cluster.py b/daemon-common/cluster.py index d5dd9ca5..8bf0aec1 100644 --- a/daemon-common/cluster.py +++ b/daemon-common/cluster.py @@ -155,6 +155,9 @@ def getClusterHealth(zkhandler, node_list, vm_list, ceph_osd_list): elif ceph_health_status == "HEALTH_WARN": cluster_health -= health_delta_map["ceph_warn"] + if cluster_health < 0: + cluster_health = 0 + return cluster_health, messages diff --git a/node-daemon/pvcnoded/objects/MonitoringInstance.py b/node-daemon/pvcnoded/objects/MonitoringInstance.py index 4ccc620c..81b0ccd4 100644 --- a/node-daemon/pvcnoded/objects/MonitoringInstance.py +++ b/node-daemon/pvcnoded/objects/MonitoringInstance.py @@ -366,6 +366,9 @@ class MonitoringInstance(object): if result is not None: total_health -= result.health_delta + if total_health < 0: + total_health = 0 + if total_health > 90: health_colour = self.logger.fmt_green elif total_health > 50: From 25d0fde5e4990993bc96aabd3b37b7297d9c4d85 Mon Sep 17 00:00:00 2001 From: "Joshua M. Boniface" Date: Wed, 15 Feb 2023 21:35:44 -0500 Subject: [PATCH 26/55] Add JSON output format for node info --- client-cli/pvc/pvc.py | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/client-cli/pvc/pvc.py b/client-cli/pvc/pvc.py index ec13fa9e..f17f5d50 100755 --- a/client-cli/pvc/pvc.py +++ b/client-cli/pvc/pvc.py @@ -697,15 +697,29 @@ def node_log(node, lines, follow): default=False, help="Display more detailed information.", ) +@click.option( + "-f", + "--format", + "oformat", + default="plain", + show_default=True, + type=click.Choice(["plain", "json", "json-pretty"]), + help="Output format of node status information.", +) @cluster_req -def node_info(node, long_output): +def node_info(node, long_output, oformat): """ Show information about node NODE. If unspecified, defaults to this host. 
""" retcode, retdata = pvc_node.node_info(config, node) if retcode: - retdata = pvc_node.format_info(retdata, long_output) + if oformat == "json": + retdata = json.dumps(retdata) + elif oformat == "json-pretty": + retdata = json.dumps(retdata, indent=4) + else: + retdata = pvc_node.format_info(retdata, long_output) cleanup(retcode, retdata) From 3c6c33a3263659004ef7830babf5dfc3d793dea0 Mon Sep 17 00:00:00 2001 From: "Joshua M. Boniface" Date: Thu, 16 Feb 2023 12:33:18 -0500 Subject: [PATCH 27/55] Exclude monitoring examples from flake8 --- .flake8 | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.flake8 b/.flake8 index 810a8500..2cc50667 100644 --- a/.flake8 +++ b/.flake8 @@ -6,7 +6,7 @@ ignore = W503, E501 extend-ignore = E203 # We exclude the Debian, migrations, and provisioner examples -exclude = debian,api-daemon/migrations/versions,api-daemon/provisioner/examples +exclude = debian,api-daemon/migrations/versions,api-daemon/provisioner/examples,node-daemon/monitoring # Set the max line length to 88 for Black max-line-length = 88 From 75639c17d943d5189434b18b0b7e2140096f50ef Mon Sep 17 00:00:00 2001 From: "Joshua M. Boniface" Date: Thu, 16 Feb 2023 12:33:36 -0500 Subject: [PATCH 28/55] Format cluster health like node healths Make a cleaner construct here. --- client-cli/pvc/cli_lib/cluster.py | 10 ++--- daemon-common/cluster.py | 61 ++++++++++++++++--------------- 2 files changed, 37 insertions(+), 34 deletions(-) diff --git a/client-cli/pvc/cli_lib/cluster.py b/client-cli/pvc/cli_lib/cluster.py index a20bdf77..dab99c37 100644 --- a/client-cli/pvc/cli_lib/cluster.py +++ b/client-cli/pvc/cli_lib/cluster.py @@ -127,9 +127,9 @@ def format_info(cluster_information, oformat): # Plain formatting, i.e. human-readable if cluster_information["maintenance"] == "true": health_colour = ansiprint.blue() - elif cluster_information["health"] > 90: + elif cluster_information["cluster_health"]["health"] > 90: health_colour = ansiprint.green() - elif cluster_information["health"] > 50: + elif cluster_information["cluster_health"]["health"] > 50: health_colour = ansiprint.yellow() else: health_colour = ansiprint.red() @@ -141,7 +141,7 @@ def format_info(cluster_information, oformat): ) ainformation.append("") - health_text = f"{cluster_information['health']}%" + health_text = f"{cluster_information['cluster_health']['health']}%" if cluster_information["maintenance"] == "true": health_text += " (maintenance on)" @@ -154,9 +154,9 @@ def format_info(cluster_information, oformat): ansiprint.end(), ) ) - if cluster_information["health_messages"]: + if cluster_information["cluster_health"]["messages"]: health_messages = "\n > ".join( - sorted(cluster_information["health_messages"]) + sorted(cluster_information["cluster_health"]["messages"]) ) ainformation.append( "{}Health messages:{} > {}".format( diff --git a/daemon-common/cluster.py b/daemon-common/cluster.py index 8bf0aec1..131761af 100644 --- a/daemon-common/cluster.py +++ b/daemon-common/cluster.py @@ -57,8 +57,8 @@ def getClusterHealth(zkhandler, node_list, vm_list, ceph_osd_list): } # Generate total cluster health numbers - cluster_health = 100 - messages = list() + cluster_health_value = 100 + cluster_health_messages = list() for index, node in enumerate(node_list): # Apply node health values to total health number @@ -66,31 +66,33 @@ def getClusterHealth(zkhandler, node_list, vm_list, ceph_osd_list): node_health_int = int(node["health"]) except Exception: node_health_int = 100 - cluster_health -= 100 - node_health_int + 
cluster_health_value -= 100 - node_health_int for entry in node["health_details"]: if entry["health_delta"] > 0: - messages.append( + cluster_health_messages.append( f"{node['name']}: plugin '{entry['name']}': {entry['message']}" ) # Handle unhealthy node states if node["daemon_state"] not in ["run"]: - cluster_health -= health_delta_map["node_stopped"] - messages.append( + cluster_health_value -= health_delta_map["node_stopped"] + cluster_health_messages.append( f"cluster: Node {node['name']} in {node['daemon_state'].upper()} daemon state" ) elif node["domain_state"] not in ["ready"]: - cluster_health -= health_delta_map["node_flushed"] - messages.append( + cluster_health_value -= health_delta_map["node_flushed"] + cluster_health_messages.append( f"cluster: Node {node['name']} in {node['domain_state'].upper()} domain state" ) for index, vm in enumerate(vm_list): # Handle unhealthy VM states if vm["state"] not in ["start", "disable", "migrate", "unmigrate", "provision"]: - cluster_health -= health_delta_map["vm_stopped"] - messages.append(f"cluster: VM {vm['name']} in {vm['state'].upper()} state") + cluster_health_value -= health_delta_map["vm_stopped"] + cluster_health_messages.append( + f"cluster: VM {vm['name']} in {vm['state'].upper()} state" + ) for index, ceph_osd in enumerate(ceph_osd_list): in_texts = {1: "in", 0: "out"} @@ -98,13 +100,13 @@ def getClusterHealth(zkhandler, node_list, vm_list, ceph_osd_list): # Handle unhealthy OSD states if in_texts[ceph_osd["stats"]["in"]] not in ["in"]: - cluster_health -= health_delta_map["osd_out"] - messages.append( + cluster_health_value -= health_delta_map["osd_out"] + cluster_health_messages.append( f"cluster: Ceph OSD {ceph_osd['id']} in {in_texts[ceph_osd['stats']['in']].upper()} state" ) elif up_texts[ceph_osd["stats"]["up"]] not in ["up"]: - cluster_health -= health_delta_map["osd_down"] - messages.append( + cluster_health_value -= health_delta_map["osd_down"] + cluster_health_messages.append( f"cluster: Ceph OSD {ceph_osd['id']} in {up_texts[ceph_osd['stats']['up']].upper()} state" ) @@ -131,8 +133,8 @@ def getClusterHealth(zkhandler, node_list, vm_list, ceph_osd_list): for index, node in enumerate(n_minus_1_node_list): n_minus_1_total += node["memory"]["total"] if alloc_total > n_minus_1_total: - cluster_health -= health_delta_map["memory_overprovisioned"] - messages.append( + cluster_health_value -= health_delta_map["memory_overprovisioned"] + cluster_health_messages.append( f"cluster: Total memory is OVERPROVISIONED ({alloc_total} > {n_minus_1_total} @ N-1)" ) @@ -146,19 +148,24 @@ def getClusterHealth(zkhandler, node_list, vm_list, ceph_osd_list): "HEALTH_WARN": "WARNING", } for entry in ceph_health_entries: - messages.append( + cluster_health_messages.append( f"cluster: Ceph {ceph_health_status_map[ceph_health['checks'][entry]['severity']]} {entry}: {ceph_health['checks'][entry]['summary']['message']}" ) if ceph_health_status == "HEALTH_ERR": - cluster_health -= health_delta_map["ceph_err"] + cluster_health_value -= health_delta_map["ceph_err"] elif ceph_health_status == "HEALTH_WARN": - cluster_health -= health_delta_map["ceph_warn"] + cluster_health_value -= health_delta_map["ceph_warn"] - if cluster_health < 0: - cluster_health = 0 + if cluster_health_value < 0: + cluster_health_value = 0 - return cluster_health, messages + cluster_health = { + "health": cluster_health_value, + "messages": cluster_health_messages, + } + + return cluster_health def getNodeHealth(zkhandler, node_list): @@ -281,15 +288,11 @@ def 
getClusterInformation(zkhandler): if state_count > 0: formatted_osd_states[state] = state_count - # Get cluster health data - cluster_health, cluster_health_messages = getClusterHealth( - zkhandler, node_list, vm_list, ceph_osd_list - ) - # Format the status data cluster_information = { - "health": cluster_health, - "health_messages": cluster_health_messages, + "cluster_health": getClusterHealth( + zkhandler, node_list, vm_list, ceph_osd_list + ), "node_health": getNodeHealth(zkhandler, node_list), "maintenance": maintenance_state, "primary_node": common.getPrimaryNode(zkhandler), From 529e6d6878f89dd45a991af6908aa4b60387b143 Mon Sep 17 00:00:00 2001 From: "Joshua M. Boniface" Date: Thu, 16 Feb 2023 13:06:35 -0500 Subject: [PATCH 29/55] Add CheckMK monitoring example plugins --- node-daemon/monitoring/README.md | 30 ++++++++- node-daemon/monitoring/checkmk/pvc | 6 ++ node-daemon/monitoring/checkmk/pvc.py | 95 +++++++++++++++++++++++++++ 3 files changed, 128 insertions(+), 3 deletions(-) create mode 100755 node-daemon/monitoring/checkmk/pvc create mode 100644 node-daemon/monitoring/checkmk/pvc.py diff --git a/node-daemon/monitoring/README.md b/node-daemon/monitoring/README.md index 5d786a79..3845c239 100644 --- a/node-daemon/monitoring/README.md +++ b/node-daemon/monitoring/README.md @@ -2,9 +2,9 @@ This directory contains several monitoring resources that can be used with various monitoring systems to track and alert on a PVC cluster system. -### Munin +## Munin -The included munin plugin can be activated by linking to it from `/etc/munin/plugins/pvc`. By default, this plugin triggers a CRITICAL state when either the PVC or Storage cluster becomes Degraded, and is otherwise OK. The overall health is graphed numerically (Optimal is 0, Maintenance is 1, Degraded is 2) so that the cluster health can be tracked over time. +The included Munin plugin can be activated by linking to it from `/etc/munin/plugins/pvc`. By default, this plugin triggers a CRITICAL state when either the PVC or Storage cluster becomes Degraded, and is otherwise OK. The overall health is graphed numerically (Optimal is 0, Maintenance is 1, Degraded is 2) so that the cluster health can be tracked over time. When using this plugin, it might be useful to adjust the thresholds with a plugin configuration. For instance, one could adjust the Degraded value from CRITICAL to WARNING by adjusting the critical threshold to a value higher than 1.99 (e.g. 3, 10, etc.) so that only the WARNING threshold will be hit. Alternatively one could instead make Maintenance mode trigger a WARNING by lowering the threshold to 0.99. @@ -21,4 +21,28 @@ env.pvc_storage_warning 0.99 env.pvc_storage_critical 1.99 ``` -### Check_MK +## CheckMK + +The included CheckMK plugin is divided into two parts: the agent plugin, and the monitoring server plugin, and can be activated as follows: + +### Agent plugin: `pvc` + +Place this file in the `/usr/lib/check_mk_agent/plugins/` directory on each node. + +### Server plugin: `pvc.py` + +This monitoring server plugin requires CheckMK version 2.0 or higher. + +Place this file in the `~/local/lib/python3/cmk/base/plugins/agent_based/` directory for each monitoring site. + +### Output + +With both the agent and server plugins installed, you can then run `cmk -II ` (or use WATO) to inventory each node, which should produce two new checks: + +* `PVC Cluster`: Provides the cluster-wide health. Note that this will be identical for all nodes in the cluster (i.e. 
if the cluster health drops, all nodes in the cluster will alert this check). + +* `PVC Node `: Provides the per-node health. + +The "Summary" text, shown in the check lists, will be simplistic, only showing the current health percentage. + +The "Details" text, found in the specific check details, will show the full list of problem(s) the check finds, as shown by `pvc status` itself. diff --git a/node-daemon/monitoring/checkmk/pvc b/node-daemon/monitoring/checkmk/pvc new file mode 100755 index 00000000..cdddd8fd --- /dev/null +++ b/node-daemon/monitoring/checkmk/pvc @@ -0,0 +1,6 @@ +#!/bin/bash + +# PVC cluster status check for Check_MK (agent-side) + +echo "<<<pvc>>>" +pvc --quiet status --format json diff --git a/node-daemon/monitoring/checkmk/pvc.py b/node-daemon/monitoring/checkmk/pvc.py new file mode 100644 index 00000000..d93e6135 --- /dev/null +++ b/node-daemon/monitoring/checkmk/pvc.py @@ -0,0 +1,95 @@ +#!/usr/bin/env python3 +# +# Check_MK PVC plugin +# +# Copyright 2017-2021, Joshua Boniface +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see <https://www.gnu.org/licenses/>. + +from .agent_based_api.v1 import * +from cmk.base.check_api import host_name +from time import time +from json import loads + + +def discover_pvc(section): + my_node = host_name().split(".")[0] + yield Service(item=f"PVC Node {my_node}") + yield Service(item="PVC Cluster") + + +def check_pvc(item, params, section): + state = State.OK + summary = "Stuff" + details = None + data = loads(" ".join(section[0])) + my_node = host_name().split(".")[0] + + maintenance_map = { + "true": "on", + "false": "off", + } + maintenance = maintenance_map[data["maintenance"]] + + # Node check + if item == f"PVC Node {my_node}": + my_node = host_name().split(".")[0] + node_health = data["node_health"][my_node]["health"] + node_messages = data["node_health"][my_node]["messages"] + + summary = f"Node health is {node_health}% (maintenance {maintenance})" + + if len(node_messages) > 0: + details = ", ".join(node_messages) + + if node_health <= 50 and maintenance == "off": + state = State.CRIT + elif node_health <= 90 and maintenance == "off": + state = State.WARN + else: + state = State.OK + + yield Metric(name="node-health", value=node_health) + + # Cluster check + elif item == "PVC Cluster": + cluster_health = data["cluster_health"]["health"] + cluster_messages = data["cluster_health"]["messages"] + + summary = f"Cluster health is {cluster_health}% (maintenance {maintenance})" + + if len(cluster_messages) > 0: + details = ", ".join(cluster_messages) + + if cluster_health <= 50 and maintenance == "off": + state = State.CRIT + elif cluster_health <= 90 and maintenance == "off": + state = State.WARN + else: + state = State.OK + + yield Metric(name="cluster-health", value=cluster_health) + + yield Result(state=state, summary=summary, details=details) + return + + +register.check_plugin( + name="pvc", + service_name="%s", + check_ruleset_name="pvc", + discovery_function=discover_pvc, + check_function=check_pvc,
check_default_parameters={}, +) From 396f424f808e2f3dcd2ad6e005a8cf0c20023ba8 Mon Sep 17 00:00:00 2001 From: "Joshua M. Boniface" Date: Thu, 16 Feb 2023 16:06:00 -0500 Subject: [PATCH 30/55] Update Munin plugin example --- node-daemon/monitoring/README.md | 33 +++----- node-daemon/monitoring/munin/pvc | 132 +++++++++++++------------------ 2 files changed, 63 insertions(+), 102 deletions(-) diff --git a/node-daemon/monitoring/README.md b/node-daemon/monitoring/README.md index 3845c239..04ab005c 100644 --- a/node-daemon/monitoring/README.md +++ b/node-daemon/monitoring/README.md @@ -4,38 +4,25 @@ This directory contains several monitoring resources that can be used with vario ## Munin -The included Munin plugin can be activated by linking to it from `/etc/munin/plugins/pvc`. By default, this plugin triggers a CRITICAL state when either the PVC or Storage cluster becomes Degraded, and is otherwise OK. The overall health is graphed numerically (Optimal is 0, Maintenance is 1, Degraded is 2) so that the cluster health can be tracked over time. +The included Munin plugins can be activated by linking to them from `/etc/munin/plugins/`. Two plugins are provided: -When using this plugin, it might be useful to adjust the thresholds with a plugin configuration. For instance, one could adjust the Degraded value from CRITICAL to WARNING by adjusting the critical threshold to a value higher than 1.99 (e.g. 3, 10, etc.) so that only the WARNING threshold will be hit. Alternatively one could instead make Maintenance mode trigger a WARNING by lowering the threshold to 0.99. +* `pvc`: Checks the PVC cluster and node health, providing two graphs, one for each. -Example plugin configuration: +* `ceph_utilization`: Checks the Ceph cluster statistics, providing multiple graphs. Note that this plugin is independent of PVC itself, and makes local calls to various Ceph commands itself. -``` -[pvc] -# Make cluster warn on maintenance -env.pvc_cluster_warning 0.99 -# Disable critical threshold (>2) -env.pvc_cluster_critical 3 -# Make storage warn on maintenance, crit on degraded (latter is default) -env.pvc_storage_warning 0.99 -env.pvc_storage_critical 1.99 -``` +The `pvc` plugin provides no configuration; the status is hardcoded such that <=90% health is warning, <=50% health is critical, and maintenance state forces OK. + +The `ceph_utilization` plugin provides no configuration; only the cluster utilization graph alerts such that >80% used is warning and >90% used is critical. Ceph itself begins warning above 80% as well. ## CheckMK -The included CheckMK plugin is divided into two parts: the agent plugin, and the monitoring server plugin, and can be activated as follows: +The included CheckMK plugin is divided into two parts: the agent plugin, and the monitoring server plugin. This monitoring server plugin requires CheckMK version 2.0 or higher. The two parts can be installed as follows: -### Agent plugin: `pvc` +* `pvc`: Place this file in the `/usr/lib/check_mk_agent/plugins/` directory on each node. -Place this file in the `/usr/lib/check_mk_agent/plugins/` directory on each node. +* `pvc.py`: Place this file in the `~/local/lib/python3/cmk/base/plugins/agent_based/` directory on the CheckMK monitoring host for each monitoring site. -### Server plugin: `pvc.py` - -This monitoring server plugin requires CheckMK version 2.0 or higher. - -Place this file in the `~/local/lib/python3/cmk/base/plugins/agent_based/` directory for each monitoring site. 
- -### Output +The plugin provides no configuration: the status is hardcoded such that <=90% health is warning, <=50% health is critical, and maintenance state forces OK. With both the agent and server plugins installed, you can then run `cmk -II ` (or use WATO) to inventory each node, which should produce two new checks: diff --git a/node-daemon/monitoring/munin/pvc b/node-daemon/monitoring/munin/pvc index ac025890..da3c7fa6 100755 --- a/node-daemon/monitoring/munin/pvc +++ b/node-daemon/monitoring/munin/pvc @@ -7,23 +7,6 @@ pvc - Plugin to monitor a PVC cluster. -=head1 CONFIGURATION - -Note that due to how Munin thresholds work, these values must always be slightly less than 1 or 2 respectively, -or the alerts will never be triggered. - -Defaults (no config required): - -[pvc] -env.warning 1.99 -env.critical 1.99 - -Make degraded cluster WARN only (max value is 2, so 3 effectively disables): - -[pvc] -env.pvc_cluster_warning 1.99 -env.pvc_cluster_critical 3 - =head1 AUTHOR Joshua Boniface @@ -45,24 +28,14 @@ GPLv3 . "$MUNIN_LIBDIR/plugins/plugin.sh" -warning=1.99 -critical=1.99 +warning=1 +critical=2 export PVC_CLIENT_DIR="/run/shm/munin-pvc" PVC_CMD="/usr/bin/pvc --quiet --cluster local status --format json-pretty" JQ_CMD="/usr/bin/jq" output_usage() { - echo "This plugin outputs numerical values based on the health of the PVC cluster." - echo - echo "There are separate outputs for both the PVC cluster itself as well as the Ceph storage cluster." - echo "In normal operation, i.e. when both clusters are in 'Optimal' state, the plugin returns 0 for" - echo "each cluster. When the cluster is placed into 'Maintenance' mode,the plugin returns 1 for each" - echo "cluster, and goes into WARN state (limit 0.99); this can be adjusted by overriding the WARNING" - echo "threshold of the plugin to something other than 0.99 - note that due to Munin's alerting design," - echo "the warning value must always be very slightly below the whole number. When either cluster" - echo "element becomes 'Degraded', the plugin returns 2 for the relevant cluster, which is treated as a" - echo "critical. Like the WARNING threshold, this can be overridden, and with the same caveat about limit." exit 0 } @@ -84,72 +57,73 @@ output_autoconf() { } output_config() { - echo 'graph_title PVC Clusters' - echo 'graph_args --base 1000' + echo 'graph_title PVC CHealth' + echo 'graph_args --base 100' echo 'graph_vlabel Count' echo 'graph_category pvc' echo 'graph_period second' - echo 'graph_info This graph shows the nodes in the PVC cluster.' + echo 'graph_info These graphs show the health of the PVC cluster and specific node.' - echo 'pvc_cluster.label Cluster Degradation' + echo 'pvc_cluster.label Cluster Health' echo 'pvc_cluster.type GAUGE' - echo 'pvc_cluster.max 2' - echo 'pvc_cluster.info Whether the PVC cluster is in a degraded state.' - print_warning pvc_cluster - print_critical pvc_cluster + echo 'pvc_cluster.max 100' + echo 'pvc_cluster.info Health of the PVC cluster in %.' - echo 'pvc_storage.label Storage Degradation' - echo 'pvc_storage.type GAUGE' - echo 'pvc_storage.max 2' - echo 'pvc_storage.info Whether the storage cluster is in a degraded state.' 
- print_warning pvc_storage - print_critical pvc_storage + echo 'pvc_cluster_alert.label Cluster Health State' + echo 'pvc_cluster_alert.type GAUGE' + echo 'pvc_cluster_alert.max 2', + echo 'pvc_cluster_alert.info Alerting state of the PVC cluster health' + print_warning pvc_cluster_alert + print_critical pvc_cluster_alert + + echo 'pvc_node.label Node Health' + echo 'pvc_node.type GAUGE' + echo 'pvc_node.max 100' + echo 'pvc_node.info Health of the PVC node in %.' + + echo 'pvc_node_alert.label Node Health State' + echo 'pvc_node_alert.type GAUGE' + echo 'pvc_node_alert.max 2', + echo 'pvc_node_alert.info Alerting state of the PVC node health' + print_warning pvc_node_alert + print_critical pvc_node_alert exit 0 } output_values() { PVC_OUTPUT="$( $PVC_CMD )" + HOST="$( hostname --short )" - cluster_health="$( $JQ_CMD '.health' <<<"${PVC_OUTPUT}" | tr -d '"' )" - cluster_failed_reason="$( $JQ_CMD -r '.health_msg | @csv' <<<"${PVC_OUTPUT}" | tr -d '"' | sed 's/,/, /g' )" - case $cluster_health in - "Optimal") - cluster_value="0" - ;; - "Maintenance") - cluster_value="1" - ;; - "Degraded") - cluster_value="2" - esac + in_maintenance="$( $JQ_CMD ".maintenance" <<<"${PVC_OUTPUT}" | tr -d '"' )" - storage_health="$( $JQ_CMD '.storage_health' <<<"${PVC_OUTPUT}" | tr -d '"' )" - storage_failed_reason="$( $JQ_CMD -r '.storage_health_msg | @csv' <<<"${PVC_OUTPUT}" | tr -d '"' | sed 's/,/, /g' )" - case $storage_health in - "Optimal") - storage_value="0" - ;; - "Maintenance") - storage_value="1" - ;; - "Degraded") - storage_value="2" - esac + cluster_health="$( $JQ_CMD ".cluster_health.health" <<<"${PVC_OUTPUT}" | tr -d '"' )" + cluster_health_messages="$( $JQ_CMD -r ".cluster_health.messages | @csv" <<<"${PVC_OUTPUT}" | tr -d '"' | sed 's/,/, /g' )" + echo "pvc_cluster.value ${cluster_health}" + echo "pvc_cluster.extinfo ${cluster_health_messages}" + if [[ ${cluster_health} -le 50 && ${is_maintenance} == "false" ]]; then + cluster_health_alert=2 + elif [[ ${cluster_health} -le 90 && ${is_maintenance} == "false" ]]; then + cluster_health_alert=1 + else + cluster_health_alert=0 + fi + echo "pvc_cluster_alert.value ${cluster_health_alert}" - echo "pvc_cluster.value $cluster_value" - if [[ $cluster_value -eq 1 ]]; then - echo "pvc_cluster.extinfo Cluster in maintenance mode" - elif [[ $cluster_value -eq 2 ]]; then - echo "pvc_cluster.extinfo ${cluster_failed_reason}" - fi - echo "pvc_storage.value $storage_value" - if [[ $storage_value -eq 1 ]]; then - echo "pvc_storage.extinfo Cluster in maintenance mode" - elif [[ $storage_value -eq 2 ]]; then - echo "pvc_storage.extinfo ${storage_failed_reason}" - fi + node_health="$( $JQ_CMD ".node_health.${HOST}.health" <<<"${PVC_OUTPUT}" | tr -d '"' )" + node_health_messages="$( $JQ_CMD -r ".node_health.${HOST}.messages | @csv" <<<"${PVC_OUTPUT}" | tr -d '"' | sed 's/,/, /g' )" + echo "pvc_node.value ${node_health}" + echo "pvc_node.extinfo ${node_health_messages}" + + if [[ ${node_health} -le 50 && ${is_maintenance} == "false" ]]; then + node_health_alert=2 + elif [[ ${node_health} -le 90 && ${is_maintenance} == "false" ]]; then + node_health_alert=1 + else + node_health_alert=0 + fi + echo "pvc_node_alert.value ${node_health_alert}" } case $# in From 3f9c1c735be5a586eca3540372eaf077399a1c2d Mon Sep 17 00:00:00 2001 From: "Joshua M. Boniface" Date: Thu, 16 Feb 2023 20:32:33 -0500 Subject: [PATCH 31/55] Flip VM state condition to remove shutdown Don't cause health degredation for shutdown state, and flip the list around to make it clearer. 
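For reference, a minimal sketch of the flipped check this commit describes (a hypothetical standalone helper for illustration only; the real logic sits inline in `getClusterHealth()` as shown in the diff below, using its `health_delta_map`):

```
# Hypothetical sketch of the flipped VM-state condition from this commit.
UNHEALTHY_VM_STATES = ["stop", "fail"]

def vm_health_delta(vm, health_delta_map):
    # Old form (negated allow-list, which also penalized "shutdown"):
    #   if vm["state"] not in ["start", "disable", "migrate", "unmigrate", "provision"]:
    # New form: only explicitly unhealthy states reduce cluster health.
    if vm["state"] in UNHEALTHY_VM_STATES:
        return health_delta_map["vm_stopped"]
    return 0
```
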
--- daemon-common/cluster.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/daemon-common/cluster.py b/daemon-common/cluster.py index 131761af..a5139997 100644 --- a/daemon-common/cluster.py +++ b/daemon-common/cluster.py @@ -88,7 +88,7 @@ def getClusterHealth(zkhandler, node_list, vm_list, ceph_osd_list): for index, vm in enumerate(vm_list): # Handle unhealthy VM states - if vm["state"] not in ["start", "disable", "migrate", "unmigrate", "provision"]: + if vm["state"] in ["stop", "fail"]: cluster_health_value -= health_delta_map["vm_stopped"] cluster_health_messages.append( f"cluster: VM {vm['name']} in {vm['state'].upper()} state" From f04f816e1b0b348d48d03dad9b471ec46f6e0e29 Mon Sep 17 00:00:00 2001 From: "Joshua M. Boniface" Date: Fri, 17 Feb 2023 13:18:46 -0500 Subject: [PATCH 32/55] Fix various issues with PVC Munin plugin --- node-daemon/monitoring/munin/pvc | 70 +++++++++++++++++++++++--------- 1 file changed, 51 insertions(+), 19 deletions(-) diff --git a/node-daemon/monitoring/munin/pvc b/node-daemon/monitoring/munin/pvc index da3c7fa6..a8be5ccc 100755 --- a/node-daemon/monitoring/munin/pvc +++ b/node-daemon/monitoring/munin/pvc @@ -28,6 +28,8 @@ GPLv3 . "$MUNIN_LIBDIR/plugins/plugin.sh" +is_multigraph + warning=1 critical=2 @@ -36,6 +38,7 @@ PVC_CMD="/usr/bin/pvc --quiet --cluster local status --format json-pretty" JQ_CMD="/usr/bin/jq" output_usage() { + echo "This plugin outputs information about a PVC cluster and node" exit 0 } @@ -57,33 +60,58 @@ output_autoconf() { } output_config() { - echo 'graph_title PVC CHealth' - echo 'graph_args --base 100' - echo 'graph_vlabel Count' + echo 'multigraph pvc_cluster_health' + echo 'graph_title PVC Cluster Health' + echo 'graph_args --base 1000' + echo 'graph_vlabel Health%' echo 'graph_category pvc' - echo 'graph_period second' - echo 'graph_info These graphs show the health of the PVC cluster and specific node.' + echo 'graph_info Health of the PVC cluster' - echo 'pvc_cluster.label Cluster Health' - echo 'pvc_cluster.type GAUGE' - echo 'pvc_cluster.max 100' - echo 'pvc_cluster.info Health of the PVC cluster in %.' + echo 'pvc_cluster_health.label Cluster Health' + echo 'pvc_cluster_health.type GAUGE' + echo 'pvc_cluster_health.max 100' + echo 'pvc_cluster_health.min 0' + echo 'pvc_cluster_health.info Health of the PVC cluster in %' + + echo 'multigraph pvc_cluster_alert' + echo 'graph_title PVC Cluster Alerting' + echo 'graph_args --base 1000' + echo 'graph_vlabel State' + echo 'graph_category pvc' + echo 'graph_info Alerting state of the PVC cluster health' echo 'pvc_cluster_alert.label Cluster Health State' echo 'pvc_cluster_alert.type GAUGE' - echo 'pvc_cluster_alert.max 2', + echo 'pvc_cluster_alert.max 2' + echo 'pvc_cluster_alert.min 0' echo 'pvc_cluster_alert.info Alerting state of the PVC cluster health' print_warning pvc_cluster_alert print_critical pvc_cluster_alert - echo 'pvc_node.label Node Health' - echo 'pvc_node.type GAUGE' - echo 'pvc_node.max 100' - echo 'pvc_node.info Health of the PVC node in %.' 
+ echo 'multigraph pvc_node_health' + echo 'graph_title PVC Node Health' + echo 'graph_args --base 1000' + echo 'graph_vlabel Health%' + echo 'graph_category pvc' + echo 'graph_info Health of the PVC node' + + echo 'pvc_node_health.label Node Health' + echo 'pvc_node_health.type GAUGE' + echo 'pvc_node_health.max 100' + echo 'pvc_node_health.min 0' + echo 'pvc_node_health.info Health of the PVC node in %' + + echo 'multigraph pvc_node_alert' + echo 'graph_title PVC Node Alerting' + echo 'graph_args --base 1000' + echo 'graph_vlabel State' + echo 'graph_category pvc' + echo 'graph_info Alerting state of the PVC node health' echo 'pvc_node_alert.label Node Health State' echo 'pvc_node_alert.type GAUGE' - echo 'pvc_node_alert.max 2', + echo 'pvc_node_alert.max 2' + echo 'pvc_node_alert.min 0' echo 'pvc_node_alert.info Alerting state of the PVC node health' print_warning pvc_node_alert print_critical pvc_node_alert @@ -99,8 +127,9 @@ output_values() { cluster_health="$( $JQ_CMD ".cluster_health.health" <<<"${PVC_OUTPUT}" | tr -d '"' )" cluster_health_messages="$( $JQ_CMD -r ".cluster_health.messages | @csv" <<<"${PVC_OUTPUT}" | tr -d '"' | sed 's/,/, /g' )" - echo "pvc_cluster.value ${cluster_health}" - echo "pvc_cluster.extinfo ${cluster_health_messages}" + echo 'multigraph pvc_cluster_health' + echo "pvc_cluster_health.value ${cluster_health}" + echo "pvc_cluster_health.extinfo ${cluster_health_messages}" if [[ ${cluster_health} -le 50 && ${is_maintenance} == "false" ]]; then cluster_health_alert=2 @@ -109,12 +138,14 @@ output_values() { else cluster_health_alert=0 fi + echo 'multigraph pvc_cluster_alert' echo "pvc_cluster_alert.value ${cluster_health_alert}" node_health="$( $JQ_CMD ".node_health.${HOST}.health" <<<"${PVC_OUTPUT}" | tr -d '"' )" node_health_messages="$( $JQ_CMD -r ".node_health.${HOST}.messages | @csv" <<<"${PVC_OUTPUT}" | tr -d '"' | sed 's/,/, /g' )" - echo "pvc_node.value ${node_health}" - echo "pvc_node.extinfo ${node_health_messages}" + echo 'multigraph pvc_node_health' + echo "pvc_node_health.value ${node_health}" + echo "pvc_node_health.extinfo ${node_health_messages}" if [[ ${node_health} -le 50 && ${is_maintenance} == "false" ]]; then node_health_alert=2 @@ -123,6 +154,7 @@ output_values() { else node_health_alert=0 fi + echo 'multigraph pvc_node_alert' echo "pvc_node_alert.value ${node_health_alert}" } From 55f0aae2a7f26e24617e7ea4e28bd1572285666c Mon Sep 17 00:00:00 2001 From: "Joshua M. 
Boniface" Date: Fri, 17 Feb 2023 16:17:46 -0500 Subject: [PATCH 33/55] Fix typo in var and flip conditional --- node-daemon/monitoring/munin/pvc | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/node-daemon/monitoring/munin/pvc b/node-daemon/monitoring/munin/pvc index a8be5ccc..59c277e9 100755 --- a/node-daemon/monitoring/munin/pvc +++ b/node-daemon/monitoring/munin/pvc @@ -123,7 +123,7 @@ output_values() { PVC_OUTPUT="$( $PVC_CMD )" HOST="$( hostname --short )" - in_maintenance="$( $JQ_CMD ".maintenance" <<<"${PVC_OUTPUT}" | tr -d '"' )" + is_maintenance="$( $JQ_CMD ".maintenance" <<<"${PVC_OUTPUT}" | tr -d '"' )" cluster_health="$( $JQ_CMD ".cluster_health.health" <<<"${PVC_OUTPUT}" | tr -d '"' )" cluster_health_messages="$( $JQ_CMD -r ".cluster_health.messages | @csv" <<<"${PVC_OUTPUT}" | tr -d '"' | sed 's/,/, /g' )" @@ -147,9 +147,9 @@ output_values() { echo "pvc_node_health.value ${node_health}" echo "pvc_node_health.extinfo ${node_health_messages}" - if [[ ${node_health} -le 50 && ${is_maintenance} == "false" ]]; then + if [[ ${node_health} -le 50 && ${is_maintenance} != "true" ]]; then node_health_alert=2 - elif [[ ${node_health} -le 90 && ${is_maintenance} == "false" ]]; then + elif [[ ${node_health} -le 90 && ${is_maintenance} != "true" ]]; then node_health_alert=1 else node_health_alert=0 From a40de4b7f89fd2bcfef26e0f4fbaecb8866279ef Mon Sep 17 00:00:00 2001 From: "Joshua M. Boniface" Date: Sat, 18 Feb 2023 00:00:04 -0500 Subject: [PATCH 34/55] Update readme for Munin plugin --- node-daemon/monitoring/README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/node-daemon/monitoring/README.md b/node-daemon/monitoring/README.md index 04ab005c..9f28bc1e 100644 --- a/node-daemon/monitoring/README.md +++ b/node-daemon/monitoring/README.md @@ -6,11 +6,11 @@ This directory contains several monitoring resources that can be used with vario The included Munin plugins can be activated by linking to them from `/etc/munin/plugins/`. Two plugins are provided: -* `pvc`: Checks the PVC cluster and node health, providing two graphs, one for each. +* `pvc`: Checks the PVC cluster and node health, as well as their status (OK/Warning/Critical, based on maintenance status), providing 4 graphs. * `ceph_utilization`: Checks the Ceph cluster statistics, providing multiple graphs. Note that this plugin is independent of PVC itself, and makes local calls to various Ceph commands itself. -The `pvc` plugin provides no configuration; the status is hardcoded such that <=90% health is warning, <=50% health is critical, and maintenance state forces OK. +The `pvc` plugin provides no configuration; the status is hardcoded such that <=90% health is warning, <=50% health is critical, and maintenance state forces OK. The alerting is provided by two separate graphs from the health graph so that actual health state is logged regardless of alerting. The `ceph_utilization` plugin provides no configuration; only the cluster utilization graph alerts such that >80% used is warning and >90% used is critical. Ceph itself begins warning above 80% as well. From c834a3e9c8094c0594d1fe1afbaf6b3095509443 Mon Sep 17 00:00:00 2001 From: "Joshua M. 
Boniface" Date: Wed, 22 Feb 2023 00:06:52 -0500 Subject: [PATCH 35/55] Update API specification --- api-daemon/pvcapid/flaskapi.py | 66 +++++++++++++++++++++--- docs/manuals/swagger.json | 93 ++++++++++++++++++++++++++++++---- 2 files changed, 141 insertions(+), 18 deletions(-) diff --git a/api-daemon/pvcapid/flaskapi.py b/api-daemon/pvcapid/flaskapi.py index 1628a080..68dfb443 100755 --- a/api-daemon/pvcapid/flaskapi.py +++ b/api-daemon/pvcapid/flaskapi.py @@ -448,14 +448,40 @@ class API_Status(Resource): type: object id: ClusterStatus properties: - health: + cluster_health: + type: object + properties: + health: + type: integer + description: The overall health (%) of the cluster + example: 100 + messages: + type: array + description: A list of health event strings + items: + type: string + example: "hv1: plugin 'nics': bond0 DEGRADED with 1 active slaves, bond0 OK at 10000 Mbps" + node_health: + type: object + properties: + hvX: + type: object + description: A node entry for per-node health details, one per node in the cluster + properties: + health: + type: integer + description: The health (%) of the node + example: 100 + messages: + type: array + description: A list of health event strings + items: + type: string + example: "'nics': bond0 DEGRADED with 1 active slaves, bond0 OK at 10000 Mbps" + maintenance: type: string - description: The overall cluster health - example: Optimal - storage_health: - type: string - description: The overall storage cluster health - example: Optimal + description: Whether the cluster is in maintenance mode or not (string boolean) + example: true primary_node: type: string description: The current primary coordinator node @@ -605,6 +631,32 @@ class API_Node_Root(Resource): arch: type: string description: The architecture of the CPU + health: + type: integer + description: The overall health (%) of the node + example: 100 + health_details: + type: array + description: A list of health plugin results + items: + type: object + properties: + name: + type: string + description: The name of the health plugin + example: nics + last_run: + type: integer + description: The UNIX timestamp (s) of the last plugin run + example: 1676786078 + health_delta: + type: integer + description: The health delta (negatively applied to the health percentage) of the plugin's current state + example: 10 + message: + type: string + description: The output message of the plugin + example: "bond0 DEGRADED with 1 active slaves, bond0 OK at 10000 Mbps" load: type: number format: float diff --git a/docs/manuals/swagger.json b/docs/manuals/swagger.json index 03001f4d..2212b596 100644 --- a/docs/manuals/swagger.json +++ b/docs/manuals/swagger.json @@ -15,15 +15,57 @@ }, "ClusterStatus": { "properties": { - "health": { - "description": "The overall cluster health", - "example": "Optimal", + "cluster_health": { + "properties": { + "health": { + "description": "The overall health (%) of the cluster", + "example": 100, + "type": "integer" + }, + "messages": { + "description": "A list of health event strings", + "items": { + "example": "hv1: plugin 'nics': bond0 DEGRADED with 1 active slaves, bond0 OK at 10000 Mbps", + "type": "string" + }, + "type": "array" + } + }, + "type": "object" + }, + "maintenance": { + "description": "Whether the cluster is in maintenance mode or not (string boolean)", + "example": true, "type": "string" }, "networks": { "description": "The total number of networks in the cluster", "type": "integer" }, + "node_health": { + "properties": { + "hvX": { + "description": 
"A node entry for per-node health details, one per node in the cluster", + "properties": { + "health": { + "description": "The health (%) of the node", + "example": 100, + "type": "integer" + }, + "messages": { + "description": "A list of health event strings", + "items": { + "example": "'nics': bond0 DEGRADED with 1 active slaves, bond0 OK at 10000 Mbps", + "type": "string" + }, + "type": "array" + } + }, + "type": "object" + } + }, + "type": "object" + }, "nodes": { "properties": { "state-combination": { @@ -65,11 +107,6 @@ "description": "The total number of snapshots in the storage cluster", "type": "integer" }, - "storage_health": { - "description": "The overall storage cluster health", - "example": "Optimal", - "type": "string" - }, "upstream_ip": { "description": "The cluster upstream IP address in CIDR format", "example": "10.0.0.254/24", @@ -456,6 +493,40 @@ "description": "The number of running domains (VMs)", "type": "integer" }, + "health": { + "description": "The overall health (%) of the node", + "example": 100, + "type": "integer" + }, + "health_details": { + "description": "A list of health plugin results", + "items": { + "properties": { + "health_delta": { + "description": "The health delta (negatively applied to the health percentage) of the plugin's current state", + "example": 10, + "type": "integer" + }, + "last_run": { + "description": "The UNIX timestamp (s) of the last plugin run", + "example": 1676786078, + "type": "integer" + }, + "message": { + "description": "The output message of the plugin", + "example": "bond0 DEGRADED with 1 active slaves, bond0 OK at 10000 Mbps", + "type": "string" + }, + "name": { + "description": "The name of the health plugin", + "example": "nics", + "type": "string" + } + }, + "type": "object" + }, + "type": "array" + }, "kernel": { "desription": "The running kernel version from uname", "type": "string" @@ -6177,7 +6248,7 @@ "description": "The selector used to determine candidate nodes during migration; see 'target_selector' in the node daemon configuration reference", "enum": [ "mem", - "memfree", + "memprov", "vcpus", "load", "vms", @@ -6336,7 +6407,7 @@ "description": "The selector used to determine candidate nodes during migration; see 'target_selector' in the node daemon configuration reference", "enum": [ "mem", - "memfree", + "memprov", "vcpus", "load", "vms", @@ -6597,7 +6668,7 @@ "description": "The selector used to determine candidate nodes during migration; see 'target_selector' in the node daemon configuration reference", "enum": [ "mem", - "memfree", + "memprov", "vcpus", "load", "vms", From 6f5aecfa226169a1f26fe6f001741a79781ba0e9 Mon Sep 17 00:00:00 2001 From: "Joshua M. 
Boniface" Date: Wed, 22 Feb 2023 00:19:05 -0500 Subject: [PATCH 36/55] Add plugin directory and plugin details log fields --- docs/manuals/daemon.md | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/docs/manuals/daemon.md b/docs/manuals/daemon.md index 69732491..5e831a24 100644 --- a/docs/manuals/daemon.md +++ b/docs/manuals/daemon.md @@ -132,6 +132,7 @@ pvc: target_selector: mem configuration: directories: + plugin_directory: "/usr/share/pvc/plugins" dynamic_directory: "/run/pvc" log_directory: "/var/log/pvc" console_log_directory: "/var/log/libvirt" @@ -142,7 +143,7 @@ pvc: log_dates: True log_keepalives: True log_keepalive_cluster_details: True - log_keepalive_storage_details: True + log_keepalive_plugin_details: True console_log_lines: 1000 networking: bridge_device: ens4 @@ -367,6 +368,12 @@ For most clusters, `mem` should be sufficient, but others may be used based on t * `memprov` looks at the provisioned memory, not the allocated memory; thus, stopped or disabled VMs are counted towards a node's memory for this selector, even though their memory is not actively in use. * `load` looks at the system load of the node in general, ignoring load in any particular VMs; if any VM's CPU usage changes, this value would be affected. This might be preferable on clusters with some very CPU intensive VMs. +#### `system` → `configuration` → `directories` → `plugin_directory` + +* *optional* + +The directory to load node health plugins from. Defaults to `/usr/share/pvc/plugins` if unset as per default packaging; should only be overridden by advanced users. + #### `system` → `configuration` → `directories` → `dynamic_directory` * *required* @@ -421,11 +428,11 @@ Whether to log keepalive messages or not. Whether to log node status information during keepalives or not. -#### `system` → `configuration` → `logging` → `log_keepalive_storage_details` +#### `system` → `configuration` → `logging` → `log_keepalive_plugin_details` * *required* -Whether to log storage cluster status information during keepalives or not. +Whether to log node health plugin status information during keepalives or not. #### `system` → `configuration` → `logging` → `console_log_lines` From 73e04ad2aa047592b7c4d8d44443b34b2c38fd99 Mon Sep 17 00:00:00 2001 From: "Joshua M. 
Boniface" Date: Wed, 22 Feb 2023 00:25:27 -0500 Subject: [PATCH 37/55] Add last item to swagger doc --- api-daemon/pvcapid/flaskapi.py | 6 ++++++ docs/manuals/swagger.json | 8 ++++++++ 2 files changed, 14 insertions(+) diff --git a/api-daemon/pvcapid/flaskapi.py b/api-daemon/pvcapid/flaskapi.py index 68dfb443..84931555 100755 --- a/api-daemon/pvcapid/flaskapi.py +++ b/api-daemon/pvcapid/flaskapi.py @@ -635,6 +635,12 @@ class API_Node_Root(Resource): type: integer description: The overall health (%) of the node example: 100 + health_plugins: + type: array + description: A list of health plugin names currently loaded on the node + items: + type: string + example: "nics" health_details: type: array description: A list of health plugin results diff --git a/docs/manuals/swagger.json b/docs/manuals/swagger.json index 2212b596..dd8cce8d 100644 --- a/docs/manuals/swagger.json +++ b/docs/manuals/swagger.json @@ -527,6 +527,14 @@ }, "type": "array" }, + "health_plugins": { + "description": "A list of health plugin names currently loaded on the node", + "items": { + "example": "nics", + "type": "string" + }, + "type": "array" + }, "kernel": { "desription": "The running kernel version from uname", "type": "string" From 8896c6914ced0cde8237a0cf916f15ad79d27ab8 Mon Sep 17 00:00:00 2001 From: "Joshua M. Boniface" Date: Wed, 22 Feb 2023 01:01:54 -0500 Subject: [PATCH 38/55] Adjust health delta of EDAC Uncorrected to 50 This is a very bad situation and should be critical. --- node-daemon/plugins/edac | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/node-daemon/plugins/edac b/node-daemon/plugins/edac index dd2293ba..877f1424 100644 --- a/node-daemon/plugins/edac +++ b/node-daemon/plugins/edac @@ -83,7 +83,7 @@ class MonitoringPluginScript(MonitoringPlugin): for line in stdout.split('\n'): if match(r'^mc[0-9]: csrow', line): if 'Uncorrected' in line: - health_delta = 10 + health_delta = 50 errors.append(' '.join(line.split()[2:])) message += ', '.join(errors) From ba6cb1371e1f2f39e75b117bef2db82f3f529cd9 Mon Sep 17 00:00:00 2001 From: "Joshua M. Boniface" Date: Wed, 22 Feb 2023 01:03:12 -0500 Subject: [PATCH 39/55] Adjust health delta of load to 50 This is a very bad situation and should be critical. --- node-daemon/plugins/load | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/node-daemon/plugins/load b/node-daemon/plugins/load index 8b0cd2bb..8ab6847a 100644 --- a/node-daemon/plugins/load +++ b/node-daemon/plugins/load @@ -79,7 +79,7 @@ class MonitoringPluginScript(MonitoringPlugin): # Check that the load average is greater or equal to the cpu count if load_average > float(cpu_cores): # Set the health delta to 10 (subtract 10 from the total of 100) - health_delta = 10 + health_delta = 50 # Craft a message that can be used by the clients message = f"Current load is {load_average} out of {cpu_cores} CPU cores" From 109654ba779a5e94d01c4c0a55507333a2ce8f77 Mon Sep 17 00:00:00 2001 From: "Joshua M. 
Boniface" Date: Wed, 22 Feb 2023 01:04:25 -0500 Subject: [PATCH 40/55] Remove obsolete LINKSPEED variable --- node-daemon/plugins/nics | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/node-daemon/plugins/nics b/node-daemon/plugins/nics index 707078fc..0d2f1198 100644 --- a/node-daemon/plugins/nics +++ b/node-daemon/plugins/nics @@ -46,10 +46,6 @@ from pvcnoded.objects.MonitoringInstance import MonitoringPlugin # the file name PLUGIN_NAME = "nics" -# Set a minimum link speed variable used below -# For PVC at least 10 Gbps is required for proper operation of a cluster -MINIMUM_LINKSPEED = 10000 - # The MonitoringPluginScript class must be named as such, and extend MonitoringPlugin. class MonitoringPluginScript(MonitoringPlugin): @@ -172,7 +168,7 @@ class MonitoringPluginScript(MonitoringPlugin): max_supported_link_speed = sorted(list(supported_link_speeds))[-1] - # Ensure interface is running at MINIMUM_LINKSPEED + # Ensure interface is running at its maximum speed with open(f"/sys/class/net/{dev}/speed") as devfh: dev_speed = int(devfh.read()) if dev_speed < max_supported_link_speed: From 8699c291acdbcd31275ed762fae186123fae617f Mon Sep 17 00:00:00 2001 From: "Joshua M. Boniface" Date: Wed, 22 Feb 2023 01:37:41 -0500 Subject: [PATCH 41/55] Add documentation about new health and plugins --- docs/about.md | 2 + docs/manuals/daemon.md | 6 +- docs/manuals/health-plugins.md | 178 +++++++++++++++++++++++++++++++++ 3 files changed, 184 insertions(+), 2 deletions(-) create mode 100644 docs/manuals/health-plugins.md diff --git a/docs/about.md b/docs/about.md index d59e7b8f..4fa34c11 100644 --- a/docs/about.md +++ b/docs/about.md @@ -71,6 +71,8 @@ Nodes are networked together via a set of statically-configured, simple layer-2 Further information about the general cluster architecture, including important considerations for node specifications/sizing and network configuration, [can be found at the cluster architecture page](/cluster-architecture). It is imperative that potential PVC administrators read this document thoroughly to understand the specific requirements of PVC and avoid potential missteps in obtaining and deploying their cluster. +More information about the node daemon can be found at the [Node Daemon manual page](/manuals/daemon) and details about the health system and health plugins for nodes can be found at the [health plugin manual page](/manuals/health-plugins). + ## Clients ### API Client diff --git a/docs/manuals/daemon.md b/docs/manuals/daemon.md index 5e831a24..bd0f98b4 100644 --- a/docs/manuals/daemon.md +++ b/docs/manuals/daemon.md @@ -52,9 +52,11 @@ The daemon startup sequence is documented below. The main daemon entry-point is 0. The node activates its keepalived timer and begins sending keepalive updates to the cluster. The daemon state transitions from `init` to `run` and the system has started fully. -# PVC Node Daemon manual +## Node health plugins -The PVC node daemon ins build with Python 3 and is run directly on nodes. For details of the startup sequence and general layout, see the [architecture document](/architecture/daemon). +The PVC node daemon includes a node health plugin system. These plugins are run during keepalives to check various aspects of node health and adjust the overall node and cluster health accordingly. For example, a plugin might check that all configured network interfaces are online and operating at their correct speed, or that all operating system packages are up-to-date. 
+ +For the full details of the health and node health plugin system, see the [node health plugin manual](/manuals/health-plugins). ## Configuration diff --git a/docs/manuals/health-plugins.md b/docs/manuals/health-plugins.md new file mode 100644 index 00000000..e09a7398 --- /dev/null +++ b/docs/manuals/health-plugins.md @@ -0,0 +1,178 @@ +# Node health plugins + +The PVC node daemon includes a node health plugin system. These plugins are run during keepalives to check various aspects of node health and adjust the overall node and cluster health accordingly. For example, a plugin might check that all configured network interfaces are online and operating at their correct speed, or that all operating system packages are up-to-date. + +## Configuration + +### Plugin Directory + +The PVC node configuration includes a configuration option at `system` → `configuration` → `directories` → `plugin_directory` to configure the location of health plugin files on the system. By default if unset, this directory is `/usr/share/pvc/plugins`. An administrator can override this directory if they wish, though custom plugins can be installed to this directory without problems, and thus it is not recommended that it be changed. + +### Plugin Logging + +Plugin output is logged by default during keepalive messages. This is controlled by the node configuration option at `system` → `configuration` → `logging` → `log_keepalive_plugin_details`. Regardless of this setting, the overall node health is logged at the end of the plugin run. + +### Disabling Node Plugins + +Node plugins cannot be disabled; at best, a suite of zero plugins can be specified by pointing the above plugin directory to an empty folder. This will effectively render the node at a permanent 100% health. Note however that overall cluster health will still be affected by cluster-wide events (e.g. nodes or VMs being stopped, OSDs going out, etc.). + +## Health Plugin Architecture + +### Node and Cluster Health + +A core concept leveraged by the PVC system is that of node and cluster health. Starting with PVC version 0.9.61, these two health statistics are represented as percentages, with 100% representing optimal health, 51-90% representing a "warning" degraded state, and 0-50% representing a "critical" degraded state. + +While a cluster is in maintenance mode (set via `pvc maintenance on` and unset via `pvc maintenance off`), the health values continue to aggregate, but the value is ignored for the purposes of "health" output, i.e. its output colour will not change, and the reference monitoring plugins (for CheckMK and Munin) will not trigger alerting. This allows the administrator to specify that abnormal conditions are OK for some amount of time without triggering upstream alerting. Additionally, while a node is not in `run` Daemon state, its health will be reported as `N/A`, which is treated as 100% but displayed as such to make clear that the node has not initialized and run its health check plugins (yet). + +The node health is affected primarily by health plugins as discussed in this manual. Any plugin that adjusts node health lowers the node's health by its `health_delta` value, as well as the cluster health by its `health_delta` value. For example, a plugin might have a `health_delta` in a current state of `10`, which reduces its own node's health value to 90%, and the overall cluster health value to 90%. + +In addition, cluster health is affected by several fixed states within the PVC system. 
These are: + +* A node in `flushed` Domain state lowers the cluster health by 10; a node in `stop` Daemon state lowers the cluster health by 50. + +* A VM in `stop` state lowers the cluster health by 10 (hint: use `disable` state to avoid this). + +* An OSD in `down` state lowers the cluster health by 10; an OSD in `out` state lowers the cluster health by 50. + +* Memory overprovisioning (total provisioned and running guest memory allocation exceeds the total N-1 cluster memory availability) lowers the cluster health by 50. + +* Each Ceph health check message lowers the cluster health by 10 for a `HEALTH_WARN` severity or by 50 for a `HEALTH_ERR` severity. For example, the `OSDMAP_FLAGS` check (reporting, e.g. `noout` state) reports as a `HEALTH_WARN` severity and will thus decrease the cluster health by 10; if an additional `PG_DEGRADED` check fires (also reporting as `HEALTH_WARN` severity), this will decrease the cluster health by a further 10, or 20 total for both. This cumulative effect ensures that multiple simultaneous Ceph issues escalate in severity. For a full list of possible Ceph health check messages, [please see the Ceph documentation](https://docs.ceph.com/en/nautilus/rados/operations/health-checks/). + +### Built-in Health Plugins + +PVC ships with several node health plugins installed and loaded by default, to ensure several common aspects of node operation are validated and checked. The following plugins are included: + +#### `disk` + +This plugin checks all SATA/SAS and NVMe block devices for SMART health, if available, and reports any errors. + +For SATA/SAS disks reporting standard ATA SMART attributes, a health delta of 10 is raised for each SMART error on each disk, based on the `when_failed` value being set to true. Note that due to this design, several disks with multiple errors can quickly escalate to a critical condition, quickly alerting the administrator of possible major faults. + +For NVMe disks, only 3 specific NVMe health information messages are checked: `critical_warning`, `media_errors`, and `percentage_used` at > 90. Each check can only be reported once per disk and each raises a health delta of 10. + +#### `dpkg` + +This plugin checks for Debian package updates, invalid package states (i.e. not `ii` state), and obsolete configuration files that require cleanup. It will raise a health delta of 1 for each type of inconsistency, for a maximum of 3. It will thus never, on its own, trigger a node or cluster to be in a warning or critical state, but will show the errors for administrator analysis, as an example of a more "configuration anomaly"-type plugin. + +#### `edac` + +This plugin checks the EDAC utility for messages about errors, primarily in the ECC memory subsystem. It will raise a health delta of 50 if any `Uncorrected` EDAC errors are detected, possibly indicating failing memory. + +#### `load` + +This plugin checks the current 1-minute system load (as reported during keepalives) against the number of total CPU threads available on the node. If the load average is greater, i.e. the node is overloaded, it raises a health delta of 50. + +#### `nics` + +This plugin checks that all NICs underlying PVC networks and bridges are operating correctly, specifically that bond interfaces have at least 2 active slaves and that all physical NICs are operating at their maximum possible speed. It takes into account several possible options to determine this. 
+ +* For each device defined (`bridge_dev`, `upstream_dev`, `cluster_dev`, and `storage_dev`), it determines the type of device. If it is a vLAN, it obtains the underlying device; otherwise, it uses the specified device. It then adds this device to a list of core NICs. Ideally, this list will contain either bonding interfaces or actual ethernet NICs. + +* For each core NIC, it checks its type. If it is a `bond` device, it checks the bonding state to ensure that at least 2 slave interfaces are up and operating. If there are not, it raises a health delta of 10. It then performs the following step for each slave NIC. + +* For each core NIC or bond slave device, it checks its maximum possible speed as reported by `ethtool` as well as the current active speed. If the NIC is operating at less than its maximum possible speed, it raises a health delta of 10. + +Note that this check may pose problems in some deployment scenarios (e.g. running 25GbE NICs at 10GbE by design). Currently the plugin logic cannot handle this and manual modifications may be required. This is left to the administrator if applicable. + +### Custom Health Plugins + +In addition to the included health plugins, the plugin architecture allows administrators to write their own plugins as required to check specific node details that might not be checked by the default plugins. While the author has endeavoured to cover as many important aspects as possible with the default plugins, there is always the possibility that some other condition becomes important and thus the system is flexible to this need. That said, we would welcome pull requests of new plugins to future version of PVC should they be widely applicable. + +As a warning, health plugins are run in a `root` context by PVC. They must therefore be carefully vetted to avoid damaging the system. DO NOT run untrusted health plugins. + +To create a health plugin, first reference the existing health plugins and create a base template. + +Each health plugin consists of three main parts: + +* An import, which must at least include the `MonitoringPlugin` class from the `pvcnoded.objects.MonitoringInstance` library. You can also load additional imports here, or import them within the functions (which is recommended for namespace simplicity). + +``` +# This import is always required here, as MonitoringPlugin is used by the MonitoringPluginScript class +from pvcnoded.objects.MonitoringInstance import MonitoringPlugin +``` + + +* A `PLUGIN_NAME` variable which defines the name of the plugin. This must match the filename. + +``` +# A monitoring plugin script must always expose its nice name, which must be identical to the file name +PLUGIN_NAME = "nics" +``` + +* An instance of a `MonitoringPluginScript` class which extends the `MonitoringPlugin` class. + +``` +# The MonitoringPluginScript class must be named as such, and extend MonitoringPlugin. +class MonitoringPluginScript(MonitoringPlugin): + ... +``` + +Within the `MonitoringPluginScript` class must be 3 primary functions as detailed below. While it is possible to do nothing except `pass` in these functions, or even exclude them (the parent includes empty defaults), all 3 should be included for consistency. + +#### `def setup(self):` + +This function is run once during the node daemon startup, when the plugin is loaded. It can be used to get one-time setup information, populate plugin instance variables, etc. + +The function must take no arguments except `self` and anything returned is ignored. 
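As a small, hypothetical illustration (not one of the shipped plugins), a setup() function might cache a value once at startup for reuse by every subsequent run():

```
# Hypothetical example plugin: cache the CPU thread count once at startup so that
# run() does not need to recompute it on every keepalive.
from pvcnoded.objects.MonitoringInstance import MonitoringPlugin

PLUGIN_NAME = "example"

class MonitoringPluginScript(MonitoringPlugin):
    def setup(self):
        from psutil import cpu_count
        self.cpu_cores = cpu_count()
```

Doing such one-time work here keeps the per-keepalive run() function as quick as possible, as recommended below.
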
+ +A plugin can also be disabled live in the setup function by throwing any `Exception`. Such exceptions will be caught and the plugin will not be loaded in such a case. + +#### `def cleanup(self):` + +This function mirrors the setup function, and is run once during the node daemon shutdown process. It can be used to clean up any lingering items (e.g. temporary files) created by the setup or run functions, if required; generally plugins do not need to do any cleanup. + +#### `def run(self):` + +This function is run each time the plugin is called during a keepalive. It performs the main work of the plugin before returning the end result in a specific format. + +Note that this function runs once for each keepalive, which by default is every 5 seconds. It is thus important to keep the runtime as short as possible and avoid doing complex calculations, file I/O, etc. during the plugin run. Do as much as possible in the setup function to keep the run function as quick as possible. + +What happens during the run function is of course completely up to the plugin, but it must return a standardized set of details upon completing the run. + +An instance of the `PluginResult` object is helpfully created by the caller and passed in via `self.plugin_result`. This can be used to set the results as follows: + +* The `self.plugin_result.set_health_delta()` function can be used to set the current health delta of the result. This should be `0` unless the plugin detects a fault, at which point it can be any integer value below 100, and affects the node and cluster health as detailed above. + +* The `self.plugin_result.set_message()` function can be used to set the message text of the result, explaining in a short but human-readable way what the plugin result is. This will be shown in several places, including the node logs (if enabled), the node info output, and for results that have a health delta above 0, in the cluster status output. + +Finally, the `PluginResult` instance stored as `self.plugin_result` must be returned by the run function to the caller upon completion so that it can be added to the node state. + +### Example Health Plugin + +This is a terse example of the `load` plugin, which is an extremely simple example that shows all the above requirements clearly. Comments are omitted here for simplicity, but these can be seen in the actual plugin file (at `/usr/share/pvc/plugins/load` on any node). + +``` +#!/usr/bin/env python3 + +# load.py: PVC monitoring plugin example + +from pvcnoded.objects.MonitoringInstance import MonitoringPlugin + +PLUGIN_NAME = "load" + +class MonitoringPluginScript(MonitoringPlugin): + def setup(self): + pass + + def cleanup(self): + pass + + def run(self): + from os import getloadavg + from psutil import cpu_count + + load_average = getloadavg()[0] + cpu_cores = cpu_count() + + if load_average > float(cpu_cores): + health_delta = 50 + else: + health_delta = 0 + + message = f"Current load is {load_average} out pf {cpu_cores} CPU cores" + + self.plugin_result.set_health_delta(health_delta) + self.plugin_result.set_message(message) + + return self.plugin_result +``` From da7394a8de58cc1292ff41401c8f8f36eea427c8 Mon Sep 17 00:00:00 2001 From: "Joshua M. 
Boniface" Date: Wed, 22 Feb 2023 10:42:20 -0500 Subject: [PATCH 42/55] Adjust Munin threshold values --- node-daemon/monitoring/munin/pvc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/node-daemon/monitoring/munin/pvc b/node-daemon/monitoring/munin/pvc index 59c277e9..80d7448e 100755 --- a/node-daemon/monitoring/munin/pvc +++ b/node-daemon/monitoring/munin/pvc @@ -30,8 +30,8 @@ GPLv3 is_multigraph -warning=1 -critical=2 +warning=0.99 +critical=1.99 export PVC_CLIENT_DIR="/run/shm/munin-pvc" PVC_CMD="/usr/bin/pvc --quiet --cluster local status --format json-pretty" From dcd7ac066c31ec3b1e0b740b9ca6eabbf037d8c1 Mon Sep 17 00:00:00 2001 From: "Joshua M. Boniface" Date: Wed, 22 Feb 2023 12:21:29 -0500 Subject: [PATCH 43/55] Correct lint error E741 --- client-cli/pvc/cli_lib/network.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/client-cli/pvc/cli_lib/network.py b/client-cli/pvc/cli_lib/network.py index 49eea416..e148582b 100644 --- a/client-cli/pvc/cli_lib/network.py +++ b/client-cli/pvc/cli_lib/network.py @@ -961,7 +961,9 @@ def format_list_dhcp(dhcp_lease_list): ) ) - for dhcp_lease_information in sorted(dhcp_lease_list, key=lambda l: l["hostname"]): + for dhcp_lease_information in sorted( + dhcp_lease_list, key=lambda lease: lease["hostname"] + ): dhcp_lease_list_output.append( "{bold}\ {lease_hostname: <{lease_hostname_length}} \ @@ -1059,7 +1061,7 @@ def format_list_acl(acl_list): ) for acl_information in sorted( - acl_list, key=lambda l: l["direction"] + str(l["order"]) + acl_list, key=lambda acl: acl["direction"] + str(acl["order"]) ): acl_list_output.append( "{bold}\ From 6fd341501b9a1cb26a615a8a9e564e6554aecf4c Mon Sep 17 00:00:00 2001 From: "Joshua M. Boniface" Date: Wed, 22 Feb 2023 12:44:34 -0500 Subject: [PATCH 44/55] Adjust comment message --- node-daemon/plugins/load | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/node-daemon/plugins/load b/node-daemon/plugins/load index 8ab6847a..a2014535 100644 --- a/node-daemon/plugins/load +++ b/node-daemon/plugins/load @@ -20,8 +20,7 @@ ############################################################################### # This script provides an example of a PVC monitoring plugin script. It will create -# a simple plugin to check the system load against the total number of CPU cores, -# and return a 10 health delta (100 -> 90) if the load average is > 1/2 that number. +# a simple plugin to check the system load against the total number of CPU cores. # This script can thus be used as an example or reference implementation of a # PVC monitoring pluginscript and expanded upon as required. From e9e9d50ff639efbfb569c2534fcf1fe33b198c26 Mon Sep 17 00:00:00 2001 From: "Joshua M. Boniface" Date: Wed, 22 Feb 2023 13:08:54 -0500 Subject: [PATCH 45/55] Add PostgreSQL monitoring check --- node-daemon/plugins/psql | 139 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 139 insertions(+) create mode 100644 node-daemon/plugins/psql diff --git a/node-daemon/plugins/psql b/node-daemon/plugins/psql new file mode 100644 index 00000000..ec37a4e3 --- /dev/null +++ b/node-daemon/plugins/psql @@ -0,0 +1,139 @@ +#!/usr/bin/env python3 + +# psql.py - PVC Monitoring example plugin for Postgres/Patroni +# Part of the Parallel Virtual Cluster (PVC) system +# +# Copyright (C) 2018-2022 Joshua M. Boniface +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, version 3. 
+# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . +# +############################################################################### + +# This script provides an example of a PVC monitoring plugin script. It will create +# a simple plugin to check the Patroni PostgreSQL instance on the node for operation. + +# This script can thus be used as an example or reference implementation of a +# PVC monitoring pluginscript and expanded upon as required. + +# A monitoring plugin script must implement the class "MonitoringPluginScript" which +# extends "MonitoringPlugin", providing the 3 functions indicated. Detailed explanation +# of the role of each function is provided in context of the example; see the other +# examples for more potential uses. + +# WARNING: +# +# This script will run in the context of the node daemon keepalives as root. +# DO NOT install untrusted, unvetted plugins under any circumstances. + + +# This import is always required here, as MonitoringPlugin is used by the +# MonitoringPluginScript class +from pvcnoded.objects.MonitoringInstance import MonitoringPlugin + + +# A monitoring plugin script must always expose its nice name, which must be identical to +# the file name +PLUGIN_NAME = "psql" + + +# The MonitoringPluginScript class must be named as such, and extend MonitoringPlugin. +class MonitoringPluginScript(MonitoringPlugin): + def setup(self): + """ + setup(): Perform special setup steps during node daemon startup + + This step is optional and should be used sparingly. 
+ """ + + pass + + def run(self): + """ + run(): Perform the check actions and return a PluginResult object + """ + + # Run any imports first + from psycopg2 import connect + + conn_metadata = None + cur_metadata = None + conn_dns = None + cur_dns = None + + # Set the health delta to 0 (no change) + health_delta = 0 + # Craft a message that can be used by the clients + message = "Successfully connected to PostgreSQL databases on localhost" + + # Check the Metadata database (primary) + try: + conn_metadata = connect( + host='127.0.0.1', + port=self.config["metadata_postgresql_port"], + dbname=self.config["metadata_postgresql_dbname"], + user=self.config["metadata_postgresql_user"], + password=self.config["metadata_postgresql_password"], + ) + cur_metadata = conn_metadata.cursor() + cur_metadata.execute("""SELECT * FROM alembic_version""") + data = cur_metadata.fetchone() + except Exception as e: + health_delta = 50 + err = str(e).split('\n')[0] + message = f"Failed to connect to PostgreSQL database {self.config['metadata_postgresql_dbname']}: {err}" + finally: + if cur_metadata is not None: + cur_metadata.close() + if conn_metadata is not None: + conn_metadata.close() + + if health_delta == 0: + # Check the PowerDNS database (secondary) + try: + conn_pdns = connect( + host='127.0.0.1', + port=self.config["pdns_postgresql_port"], + dbname=self.config["pdns_postgresql_dbname"], + user=self.config["pdns_postgresql_user"], + password=self.config["pdns_postgresql_password"], + ) + cur_pdns = conn_pdns.cursor() + cur_pdns.execute("""SELECT * FROM supermasters""") + data = cur_pdns.fetchone() + except Exception as e: + health_delta = 50 + err = str(e).split('\n')[0] + message = f"Failed to connect to PostgreSQL database {self.config['pdns_postgresql_dbname']}: {err}" + finally: + if cur_pdns is not None: + cur_pdns.close() + if conn_pdns is not None: + conn_pdns.close() + + # Set the health delta in our local PluginResult object + self.plugin_result.set_health_delta(health_delta) + + # Set the message in our local PluginResult object + self.plugin_result.set_message(message) + + # Return our local PluginResult object + return self.plugin_result + + def cleanup(self): + """ + cleanup(): Perform special cleanup steps during node daemon termination + + This step is optional and should be used sparingly. + """ + + pass From e15b4f14ec16387a04fff8e679354739038f8b20 Mon Sep 17 00:00:00 2001 From: "Joshua M. Boniface" Date: Wed, 22 Feb 2023 13:24:39 -0500 Subject: [PATCH 46/55] Add Zookeeper monitoring check --- node-daemon/plugins/zkpr | 107 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 107 insertions(+) create mode 100644 node-daemon/plugins/zkpr diff --git a/node-daemon/plugins/zkpr b/node-daemon/plugins/zkpr new file mode 100644 index 00000000..6cf3ee53 --- /dev/null +++ b/node-daemon/plugins/zkpr @@ -0,0 +1,107 @@ +#!/usr/bin/env python3 + +# zkpr.py - PVC Monitoring example plugin for Zookeeper +# Part of the Parallel Virtual Cluster (PVC) system +# +# Copyright (C) 2018-2022 Joshua M. Boniface +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, version 3. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. 
+# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . +# +############################################################################### + +# This script provides an example of a PVC monitoring plugin script. It will create +# a simple plugin to check the Zookeeper instance on the node for operation. + +# This script can thus be used as an example or reference implementation of a +# PVC monitoring pluginscript and expanded upon as required. + +# A monitoring plugin script must implement the class "MonitoringPluginScript" which +# extends "MonitoringPlugin", providing the 3 functions indicated. Detailed explanation +# of the role of each function is provided in context of the example; see the other +# examples for more potential uses. + +# WARNING: +# +# This script will run in the context of the node daemon keepalives as root. +# DO NOT install untrusted, unvetted plugins under any circumstances. + + +# This import is always required here, as MonitoringPlugin is used by the +# MonitoringPluginScript class +from pvcnoded.objects.MonitoringInstance import MonitoringPlugin + + +# A monitoring plugin script must always expose its nice name, which must be identical to +# the file name +PLUGIN_NAME = "zkpr" + + +# The MonitoringPluginScript class must be named as such, and extend MonitoringPlugin. +class MonitoringPluginScript(MonitoringPlugin): + def setup(self): + """ + setup(): Perform special setup steps during node daemon startup + + This step is optional and should be used sparingly. + + If you wish for the plugin to not zkpr in certain conditions, do any checks here + and return a non-None failure message to indicate the error. + """ + + pass + + def run(self): + """ + run(): Perform the check actions and return a PluginResult object + """ + + # Run any imports first + from kazoo.client import KazooClient, KazooState + + zk_conn = None + + # Set the health delta to 0 (no change) + health_delta = 0 + # Craft a message that can be used by the clients + message = "Successfully connected to Zookeeper on localhost" + + # Check the Zookeeper connection + try: + zk_conn = KazooClient(hosts=[f"{self.this_node.name}:2181"], timeout=1, read_only=True) + zk_conn.start(timeout=1) + data = zk_conn.get('/primary_node') + except Exception as e: + health_delta = 50 + message = f"Failed to connect to Zookeeper: {e}" + finally: + if zk_conn is not None: + zk_conn.stop() + zk_conn.close() + + # Set the health delta in our local PluginResult object + self.plugin_result.set_health_delta(health_delta) + + # Set the message in our local PluginResult object + self.plugin_result.set_message(message) + + # Return our local PluginResult object + return self.plugin_result + + def cleanup(self): + """ + cleanup(): Perform special cleanup steps during node daemon termination + + This step is optional and should be used sparingly. + """ + + pass From 137b3010f234414515993b31efc9486588d016dd Mon Sep 17 00:00:00 2001 From: "Joshua M. 
Boniface" Date: Wed, 22 Feb 2023 13:41:33 -0500 Subject: [PATCH 47/55] Add Libvirtd monitoring check --- node-daemon/plugins/lbvt | 105 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 105 insertions(+) create mode 100644 node-daemon/plugins/lbvt diff --git a/node-daemon/plugins/lbvt b/node-daemon/plugins/lbvt new file mode 100644 index 00000000..9333b71f --- /dev/null +++ b/node-daemon/plugins/lbvt @@ -0,0 +1,105 @@ +#!/usr/bin/env python3 + +# lbvt.py - PVC Monitoring example plugin for Libvirtd +# Part of the Parallel Virtual Cluster (PVC) system +# +# Copyright (C) 2018-2022 Joshua M. Boniface +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, version 3. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . +# +############################################################################### + +# This script provides an example of a PVC monitoring plugin script. It will create +# a simple plugin to check the Libvirt daemon instance on the node for operation. + +# This script can thus be used as an example or reference implementation of a +# PVC monitoring pluginscript and expanded upon as required. + +# A monitoring plugin script must implement the class "MonitoringPluginScript" which +# extends "MonitoringPlugin", providing the 3 functions indicated. Detailed explanation +# of the role of each function is provided in context of the example; see the other +# examples for more potential uses. + +# WARNING: +# +# This script will run in the context of the node daemon keepalives as root. +# DO NOT install untrusted, unvetted plugins under any circumstances. + + +# This import is always required here, as MonitoringPlugin is used by the +# MonitoringPluginScript class +from pvcnoded.objects.MonitoringInstance import MonitoringPlugin + + +# A monitoring plugin script must always expose its nice name, which must be identical to +# the file name +PLUGIN_NAME = "lbvt" + + +# The MonitoringPluginScript class must be named as such, and extend MonitoringPlugin. +class MonitoringPluginScript(MonitoringPlugin): + def setup(self): + """ + setup(): Perform special setup steps during node daemon startup + + This step is optional and should be used sparingly. + + If you wish for the plugin to not lbvt in certain conditions, do any checks here + and return a non-None failure message to indicate the error. 
+ """ + + pass + + def run(self): + """ + run(): Perform the check actions and return a PluginResult object + """ + + # Run any imports first + from libvirt import openReadOnly as lvopen + + lv_conn = None + + # Set the health delta to 0 (no change) + health_delta = 0 + # Craft a message that can be used by the clients + message = "Successfully connected to Libvirtd on localhost" + + # Check the Zookeeper connection + try: + lv_conn = lvopen(f"qemu+tcp://{self.this_node.name}/system") + data = lv_conn.getHostname() + except Exception as e: + health_delta = 50 + message = f"Failed to connect to Libvirtd: {e}" + finally: + if lv_conn is not None: + lv_conn.close() + + # Set the health delta in our local PluginResult object + self.plugin_result.set_health_delta(health_delta) + + # Set the message in our local PluginResult object + self.plugin_result.set_message(message) + + # Return our local PluginResult object + return self.plugin_result + + def cleanup(self): + """ + cleanup(): Perform special cleanup steps during node daemon termination + + This step is optional and should be used sparingly. + """ + + pass From 1451c480dc5ea5db4e1686e8c26e804c9aed21b3 Mon Sep 17 00:00:00 2001 From: "Joshua M. Boniface" Date: Wed, 22 Feb 2023 13:45:21 -0500 Subject: [PATCH 48/55] Use consistent connection with other checks --- node-daemon/plugins/psql | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/node-daemon/plugins/psql b/node-daemon/plugins/psql index ec37a4e3..bd4e9e69 100644 --- a/node-daemon/plugins/psql +++ b/node-daemon/plugins/psql @@ -78,7 +78,7 @@ class MonitoringPluginScript(MonitoringPlugin): # Check the Metadata database (primary) try: conn_metadata = connect( - host='127.0.0.1', + host=self.this_node.name, port=self.config["metadata_postgresql_port"], dbname=self.config["metadata_postgresql_dbname"], user=self.config["metadata_postgresql_user"], @@ -101,7 +101,7 @@ class MonitoringPluginScript(MonitoringPlugin): # Check the PowerDNS database (secondary) try: conn_pdns = connect( - host='127.0.0.1', + host=self.this_node.name, port=self.config["pdns_postgresql_port"], dbname=self.config["pdns_postgresql_dbname"], user=self.config["pdns_postgresql_user"], From 0f3cd13da11d85119e7bac9f656ebdce7995193e Mon Sep 17 00:00:00 2001 From: "Joshua M. Boniface" Date: Wed, 22 Feb 2023 14:33:41 -0500 Subject: [PATCH 49/55] Fix bad string value for message --- node-daemon/pvcnoded/objects/MonitoringInstance.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/node-daemon/pvcnoded/objects/MonitoringInstance.py b/node-daemon/pvcnoded/objects/MonitoringInstance.py index 81b0ccd4..2a44fcc1 100644 --- a/node-daemon/pvcnoded/objects/MonitoringInstance.py +++ b/node-daemon/pvcnoded/objects/MonitoringInstance.py @@ -45,7 +45,7 @@ class PluginResult(object): self.plugin_name = plugin_name self.current_time = int(time.time()) self.health_delta = 0 - self.message = None + self.message = "N/A" self.data = {} self.runtime = "0.00" From a9e7713abf2d0bf7cf601cf35a16d30280cbd74f Mon Sep 17 00:00:00 2001 From: "Joshua M. 
Boniface" Date: Wed, 22 Feb 2023 14:57:45 -0500 Subject: [PATCH 50/55] Add health delta change to message output --- node-daemon/pvcnoded/objects/MonitoringInstance.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/node-daemon/pvcnoded/objects/MonitoringInstance.py b/node-daemon/pvcnoded/objects/MonitoringInstance.py index 2a44fcc1..58cde66b 100644 --- a/node-daemon/pvcnoded/objects/MonitoringInstance.py +++ b/node-daemon/pvcnoded/objects/MonitoringInstance.py @@ -359,7 +359,7 @@ class MonitoringInstance(object): for result in sorted(plugin_results, key=lambda x: x.plugin_name): if self.config["log_keepalive_plugin_details"]: self.logger.out( - result.message, + result.message + f" [-{result.health_delta}]", state="t", prefix=f"{result.plugin_name} ({result.runtime}s)", ) From 8a403e6a2048550f40019b5c4a8abc86afbbeadb Mon Sep 17 00:00:00 2001 From: "Joshua M. Boniface" Date: Wed, 22 Feb 2023 15:01:20 -0500 Subject: [PATCH 51/55] Add IPMI monitoring check --- node-daemon/plugins/ipmi | 106 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 106 insertions(+) create mode 100644 node-daemon/plugins/ipmi diff --git a/node-daemon/plugins/ipmi b/node-daemon/plugins/ipmi new file mode 100644 index 00000000..bbc2402c --- /dev/null +++ b/node-daemon/plugins/ipmi @@ -0,0 +1,106 @@ +#!/usr/bin/env python3 + +# ipmi.py - PVC Monitoring example plugin for IPMI +# Part of the Parallel Virtual Cluster (PVC) system +# +# Copyright (C) 2018-2022 Joshua M. Boniface +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, version 3. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . +# +############################################################################### + +# This script provides an example of a PVC monitoring plugin script. It will create +# a simple plugin to check whether the system IPMI is reachable. + +# This script can thus be used as an example or reference implementation of a +# PVC monitoring pluginscript and expanded upon as required. + +# A monitoring plugin script must implement the class "MonitoringPluginScript" which +# extends "MonitoringPlugin", providing the 3 functions indicated. Detailed explanation +# of the role of each function is provided in context of the example; see the other +# examples for more potential uses. + +# WARNING: +# +# This script will run in the context of the node daemon keepalives as root. +# DO NOT install untrusted, unvetted plugins under any circumstances. + + +# This import is always required here, as MonitoringPlugin is used by the +# MonitoringPluginScript class +from pvcnoded.objects.MonitoringInstance import MonitoringPlugin + + +# A monitoring plugin script must always expose its nice name, which must be identical to +# the file name +PLUGIN_NAME = "ipmi" + + +# The MonitoringPluginScript class must be named as such, and extend MonitoringPlugin. +class MonitoringPluginScript(MonitoringPlugin): + def setup(self): + """ + setup(): Perform special setup steps during node daemon startup + + This step is optional and should be used sparingly. 
+ + If you wish for the plugin to not load in certain conditions, do any checks here + and return a non-None failure message to indicate the error. + """ + + pass + + def run(self): + """ + run(): Perform the check actions and return a PluginResult object + """ + + # Run any imports first + from daemon_lib.common import run_os_command + + # Check the node's IPMI interface + ipmi_hostname = self.config["ipmi_hostname"] + ipmi_username = self.config["ipmi_username"] + ipmi_password = self.config["ipmi_password"] + retcode, _, _ = run_os_command( + f"/usr/bin/ipmitool -I lanplus -H {ipmi_hostname} -U {ipmi_username} -P {ipmi_password} chassis power status" + ) + + if retcode > 0: + # Set the health delta to 10 (subtract 10 from the total of 100) + health_delta = 10 + # Craft a message that can be used by the clients + message = f"IPMI via {ipmi_username}@{ipmi_hostname} is NOT responding" + else: + # Set the health delta to 0 (no change) + health_delta = 0 + # Craft a message that can be used by the clients + message = f"IPMI via {ipmi_username}@{ipmi_hostname} is responding" + + # Set the health delta in our local PluginResult object + self.plugin_result.set_health_delta(health_delta) + + # Set the message in our local PluginResult object + self.plugin_result.set_message(message) + + # Return our local PluginResult object + return self.plugin_result + + def cleanup(self): + """ + cleanup(): Perform special cleanup steps during node daemon termination + + This step is optional and should be used sparingly. + """ + + pass From 74f894913da7650d9f1ed9a3148fda084708ab45 Mon Sep 17 00:00:00 2001 From: "Joshua M. Boniface" Date: Wed, 22 Feb 2023 15:01:55 -0500 Subject: [PATCH 52/55] Add additional plugins to manual --- docs/manuals/health-plugins.md | 22 +++++++++++++++++++--- 1 file changed, 19 insertions(+), 3 deletions(-) diff --git a/docs/manuals/health-plugins.md b/docs/manuals/health-plugins.md index e09a7398..002fa7a2 100644 --- a/docs/manuals/health-plugins.md +++ b/docs/manuals/health-plugins.md @@ -58,6 +58,14 @@ This plugin checks for Debian package updates, invalid package states (i.e. not This plugin checks the EDAC utility for messages about errors, primarily in the ECC memory subsystem. It will raise a health delta of 50 if any `Uncorrected` EDAC errors are detected, possibly indicating failing memory. +#### `ipmi` + +This plugin checks whether the daemon can reach its own IPMI address and connect. If it cannot, it raises a health delta of 10. + +#### `lbvt` + +This plugin checks whether the daemon can connect to the local Libvirt daemon instance. If it cannot, it raises a health delta of 50. + #### `load` This plugin checks the current 1-minute system load (as reported during keepalives) against the number of total CPU threads available on the node. If the load average is greater, i.e. the node is overloaded, it raises a health delta of 50. @@ -68,12 +76,20 @@ This plugin checks that all NICs underlying PVC networks and bridges are operati * For each device defined (`bridge_dev`, `upstream_dev`, `cluster_dev`, and `storage_dev`), it determines the type of device. If it is a vLAN, it obtains the underlying device; otherwise, it uses the specified device. It then adds this device to a list of core NICs. Ideally, this list will contain either bonding interfaces or actual ethernet NICs. -* For each core NIC, it checks its type. If it is a `bond` device, it checks the bonding state to ensure that at least 2 slave interfaces are up and operating.
If there are not, it raises a health delta of 10. It then performs the following step for each slave NIC. +* For each core NIC, it checks its type. If it is a `bond` device, it checks the bonding state to ensure that at least 2 slave interfaces are up and operating. If there are not, it raises a health delta of 10. -* For each core NIC or bond slave device, it checks its maximum possible speed as reported by `ethtool` as well as the current active speed. If the NIC is operating at less than its maximum possible speed, it raises a health delta of 10. +* For each core NIC, it checks its maximum possible speed as reported by `ethtool` as well as the current active speed. If the NIC is operating at less than its maximum possible speed, it raises a health delta of 10. Note that this check may pose problems in some deployment scenarios (e.g. running 25GbE NICs at 10GbE by design). Currently the plugin logic cannot handle this and manual modifications may be required. This is left to the administrator if applicable. +#### `psql` + +This plugin checks whether the daemon can connect to the local PostgreSQL/Patroni daemon instance. If it cannot, it raises a health delta of 50. + +#### `zkpr` + +This plugin checks whether the daemon can connect to the local Zookeeper daemon instance. If it cannot, it raises a health delta of 50. + ### Custom Health Plugins In addition to the included health plugins, the plugin architecture allows administrators to write their own plugins as required to check specific node details that might not be checked by the default plugins. While the author has endeavoured to cover as many important aspects as possible with the default plugins, there is always the possibility that some other condition becomes important and thus the system is flexible to this need. That said, we would welcome pull requests of new plugins to future version of PVC should they be widely applicable. @@ -92,7 +108,7 @@ from pvcnoded.objects.MonitoringInstance import MonitoringPlugin ``` -* A `PLUGIN_NAME` variable which defines the name of the plugin. This must match the filename. +* A `PLUGIN_NAME` variable which defines the name of the plugin. This must match the filename. Generally, a plugin name will be 4 characters, but this is purely a convention and not a requirement. ``` # A monitoring plugin script must always expose its nice name, which must be identical to the file name From 879a844f280968d5712a291b7432287e28a7b3bf Mon Sep 17 00:00:00 2001 From: "Joshua M. 
Boniface" Date: Wed, 22 Feb 2023 16:05:28 -0500 Subject: [PATCH 53/55] Add PVC version to cluster status output --- api-daemon/pvcapid/flaskapi.py | 4 ++++ client-cli/pvc/cli_lib/cluster.py | 7 +++++++ daemon-common/cluster.py | 12 +++++++++++- docs/manuals/swagger.json | 5 +++++ 4 files changed, 27 insertions(+), 1 deletion(-) diff --git a/api-daemon/pvcapid/flaskapi.py b/api-daemon/pvcapid/flaskapi.py index 84931555..01e0b79b 100755 --- a/api-daemon/pvcapid/flaskapi.py +++ b/api-daemon/pvcapid/flaskapi.py @@ -486,6 +486,10 @@ class API_Status(Resource): type: string description: The current primary coordinator node example: pvchv1 + pvc_version: + type: string + description: The PVC version of the current primary coordinator node + example: 0.9.61 upstream_ip: type: string description: The cluster upstream IP address in CIDR format diff --git a/client-cli/pvc/cli_lib/cluster.py b/client-cli/pvc/cli_lib/cluster.py index dab99c37..d0ce968a 100644 --- a/client-cli/pvc/cli_lib/cluster.py +++ b/client-cli/pvc/cli_lib/cluster.py @@ -182,6 +182,13 @@ def format_info(cluster_information, oformat): ansiprint.purple(), ansiprint.end(), cluster_information["primary_node"] ) ) + ainformation.append( + "{}PVC Version:{} {}".format( + ansiprint.purple(), + ansiprint.end(), + cluster_information.get("pvc_version", "N/A"), + ) + ) ainformation.append( "{}Cluster upstream IP:{} {}".format( ansiprint.purple(), ansiprint.end(), cluster_information["upstream_ip"] diff --git a/daemon-common/cluster.py b/daemon-common/cluster.py index a5139997..645dc165 100644 --- a/daemon-common/cluster.py +++ b/daemon-common/cluster.py @@ -194,6 +194,15 @@ def getClusterInformation(zkhandler): # Get node information object list retcode, node_list = pvc_node.get_list(zkhandler, None) + # Get primary node + primary_node = common.getPrimaryNode(zkhandler) + + # Get PVC version of primary node + pvc_version = "0.0.0" + for node in node_list: + if node["name"] == primary_node: + pvc_version = node["pvc_version"] + # Get vm information object list retcode, vm_list = pvc_vm.get_list(zkhandler, None, None, None, None) @@ -295,7 +304,8 @@ def getClusterInformation(zkhandler): ), "node_health": getNodeHealth(zkhandler, node_list), "maintenance": maintenance_state, - "primary_node": common.getPrimaryNode(zkhandler), + "primary_node": primary_node, + "pvc_version": pvc_version, "upstream_ip": zkhandler.read("base.config.upstream_ip"), "nodes": formatted_node_states, "vms": formatted_vm_states, diff --git a/docs/manuals/swagger.json b/docs/manuals/swagger.json index dd8cce8d..9096fb79 100644 --- a/docs/manuals/swagger.json +++ b/docs/manuals/swagger.json @@ -103,6 +103,11 @@ "example": "pvchv1", "type": "string" }, + "pvc_version": { + "description": "The PVC version of the current primary coordinator node", + "example": "0.9.61", + "type": "string" + }, "snapshots": { "description": "The total number of snapshots in the storage cluster", "type": "integer" From 0614e133fedbdf00cc4e2382c7574ded2b31dcd7 Mon Sep 17 00:00:00 2001 From: "Joshua M. 
Boniface" Date: Wed, 22 Feb 2023 18:09:01 -0500 Subject: [PATCH 54/55] Lower default connect timeout to 1s --- client-cli/pvc/cli_lib/common.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/client-cli/pvc/cli_lib/common.py b/client-cli/pvc/cli_lib/common.py index 44cf74b2..c8da62ee 100644 --- a/client-cli/pvc/cli_lib/common.py +++ b/client-cli/pvc/cli_lib/common.py @@ -124,8 +124,8 @@ def call_api( data=None, files=None, ): - # Set the connect timeout to 3 seconds but extremely long (48 hour) data timeout - timeout = (3.05, 172800) + # Set the connect timeout to 1 seconds but extremely long (48 hour) data timeout + timeout = (1.05, 172800) # Craft the URI uri = "{}://{}{}{}".format( From 6561ca6f759e0a0b303f03c0cbf39bfcb821e0bc Mon Sep 17 00:00:00 2001 From: "Joshua M. Boniface" Date: Wed, 22 Feb 2023 18:09:11 -0500 Subject: [PATCH 55/55] Add cluster detail list Adds a command to show a list of details including health and item counts for all configured clusters in the client. --- client-cli/pvc/cli_lib/cluster.py | 2 +- client-cli/pvc/pvc.py | 237 +++++++++++++++++++++++++++--- 2 files changed, 218 insertions(+), 21 deletions(-) diff --git a/client-cli/pvc/cli_lib/cluster.py b/client-cli/pvc/cli_lib/cluster.py index d0ce968a..64e620c2 100644 --- a/client-cli/pvc/cli_lib/cluster.py +++ b/client-cli/pvc/cli_lib/cluster.py @@ -183,7 +183,7 @@ def format_info(cluster_information, oformat): ) ) ainformation.append( - "{}PVC Version:{} {}".format( + "{}PVC version:{} {}".format( ansiprint.purple(), ansiprint.end(), cluster_information.get("pvc_version", "N/A"), diff --git a/client-cli/pvc/pvc.py b/client-cli/pvc/pvc.py index f17f5d50..64ed459d 100755 --- a/client-cli/pvc/pvc.py +++ b/client-cli/pvc/pvc.py @@ -134,7 +134,7 @@ def get_config(store_data, cluster=None): config = dict() config["debug"] = False config["cluster"] = cluster - config["desctription"] = description + config["description"] = description config["api_host"] = "{}:{}".format(host, port) config["api_scheme"] = scheme config["api_key"] = api_key @@ -382,8 +382,6 @@ def cluster_list(raw): if not raw: # Display the data nicely - echo("Available clusters:") - echo("") echo( "{bold}{name: <{name_length}} {description: <{description_length}} {address: <{address_length}} {port: <{port_length}} {scheme: <{scheme_length}} {api_key: <{api_key_length}}{end_bold}".format( bold=ansiprint.bold(), @@ -443,6 +441,202 @@ def cluster_list(raw): echo(cluster) +############################################################################### +# pvc cluster detail +############################################################################### +@click.command(name="detail", short_help="Show details of all available clusters.") +def cluster_detail(): + """ + Show quick details of all PVC clusters configured in this CLI instance. + """ + + # Get the existing data + clusters = get_store(store_path) + + cluster_details_list = list() + + echo("Gathering information from clusters... 
", nl=False) + + for cluster in clusters: + _store_data = get_store(store_path) + cluster_config = get_config(_store_data, cluster=cluster) + retcode, retdata = pvc_cluster.get_info(cluster_config) + if retcode == 0: + retdata = None + cluster_details = {"config": cluster_config, "data": retdata} + cluster_details_list.append(cluster_details) + + echo("done.") + echo("") + + # Find the lengths of each column + name_length = 5 + description_length = 12 + health_length = 7 + primary_node_length = 8 + pvc_version_length = 8 + nodes_length = 6 + vms_length = 4 + networks_length = 9 + osds_length = 5 + pools_length = 6 + volumes_length = 8 + snapshots_length = 10 + + for cluster_details in cluster_details_list: + _name_length = len(cluster_details["config"]["cluster"]) + 1 + if _name_length > name_length: + name_length = _name_length + + _description_length = len(cluster_details["config"]["description"]) + 1 + if _description_length > description_length: + description_length = _description_length + + if cluster_details["data"] is None: + continue + + _health_length = ( + len(str(cluster_details["data"]["cluster_health"]["health"]) + "%") + 1 + ) + if _health_length > health_length: + health_length = _health_length + + _primary_node_length = len(cluster_details["data"]["primary_node"]) + 1 + if _primary_node_length > primary_node_length: + primary_node_length = _primary_node_length + + _pvc_version_length = len(cluster_details["data"]["pvc_version"]) + 1 + if _pvc_version_length > pvc_version_length: + pvc_version_length = _pvc_version_length + + _nodes_length = len(str(cluster_details["data"]["nodes"]["total"])) + 1 + if _nodes_length > nodes_length: + nodes_length = _nodes_length + + _vms_length = len(str(cluster_details["data"]["vms"]["total"])) + 1 + if _vms_length > vms_length: + vms_length = _vms_length + + _networks_length = len(str(cluster_details["data"]["networks"])) + 1 + if _networks_length > networks_length: + networks_length = _networks_length + + _osds_length = len(str(cluster_details["data"]["osds"]["total"])) + 1 + if _osds_length > osds_length: + osds_length = _osds_length + + _pools_length = len(str(cluster_details["data"]["pools"])) + 1 + if _pools_length > pools_length: + pools_length = _pools_length + + _volumes_length = len(str(cluster_details["data"]["volumes"])) + 1 + if _volumes_length > volumes_length: + volumes_length = _volumes_length + + _snapshots_length = len(str(cluster_details["data"]["snapshots"])) + 1 + if _snapshots_length > snapshots_length: + snapshots_length = _snapshots_length + + # Display the data nicely + echo( + "{bold}{name: <{name_length}} {description: <{description_length}} {health: <{health_length}} {primary_node: <{primary_node_length}} {pvc_version: <{pvc_version_length}} {nodes: <{nodes_length}} {vms: <{vms_length}} {networks: <{networks_length}} {osds: <{osds_length}} {pools: <{pools_length}} {volumes: <{volumes_length}} {snapshots: <{snapshots_length}}{end_bold}".format( + bold=ansiprint.bold(), + end_bold=ansiprint.end(), + name="Name", + name_length=name_length, + description="Description", + description_length=description_length, + health="Health", + health_length=health_length, + primary_node="Primary", + primary_node_length=primary_node_length, + pvc_version="Version", + pvc_version_length=pvc_version_length, + nodes="Nodes", + nodes_length=nodes_length, + vms="VMs", + vms_length=vms_length, + networks="Networks", + networks_length=networks_length, + osds="OSDs", + osds_length=osds_length, + pools="Pools", + pools_length=pools_length, + 
volumes="Volumes", + volumes_length=volumes_length, + snapshots="Snapshots", + snapshots_length=snapshots_length, + ) + ) + + for cluster_details in cluster_details_list: + if cluster_details["data"] is None: + health_colour = ansiprint.blue() + name = cluster_details["config"]["cluster"] + description = cluster_details["config"]["description"] + health = "N/A" + primary_node = "N/A" + pvc_version = "N/A" + nodes = "N/A" + vms = "N/A" + networks = "N/A" + osds = "N/A" + pools = "N/A" + volumes = "N/A" + snapshots = "N/A" + else: + if cluster_details["data"]["cluster_health"]["health"] > 90: + health_colour = ansiprint.green() + elif cluster_details["data"]["cluster_health"]["health"] > 50: + health_colour = ansiprint.yellow() + else: + health_colour = ansiprint.red() + + name = cluster_details["config"]["cluster"] + description = cluster_details["config"]["description"] + health = str(cluster_details["data"]["cluster_health"]["health"]) + "%" + primary_node = cluster_details["data"]["primary_node"] + pvc_version = cluster_details["data"]["pvc_version"] + nodes = str(cluster_details["data"]["nodes"]["total"]) + vms = str(cluster_details["data"]["vms"]["total"]) + networks = str(cluster_details["data"]["networks"]) + osds = str(cluster_details["data"]["osds"]["total"]) + pools = str(cluster_details["data"]["pools"]) + volumes = str(cluster_details["data"]["volumes"]) + snapshots = str(cluster_details["data"]["snapshots"]) + + echo( + "{name: <{name_length}} {description: <{description_length}} {health_colour}{health: <{health_length}}{end_colour} {primary_node: <{primary_node_length}} {pvc_version: <{pvc_version_length}} {nodes: <{nodes_length}} {vms: <{vms_length}} {networks: <{networks_length}} {osds: <{osds_length}} {pools: <{pools_length}} {volumes: <{volumes_length}} {snapshots: <{snapshots_length}}".format( + health_colour=health_colour, + end_colour=ansiprint.end(), + name=name, + name_length=name_length, + description=description, + description_length=description_length, + health=health, + health_length=health_length, + primary_node=primary_node, + primary_node_length=primary_node_length, + pvc_version=pvc_version, + pvc_version_length=pvc_version_length, + nodes=nodes, + nodes_length=nodes_length, + vms=vms, + vms_length=vms_length, + networks=networks, + networks_length=networks_length, + osds=osds, + osds_length=osds_length, + pools=pools, + pools_length=pools_length, + volumes=volumes, + volumes_length=volumes_length, + snapshots=snapshots, + snapshots_length=snapshots_length, + ) + ) + + # Validate that the cluster is set for a given command def cluster_req(function): @wraps(function) @@ -452,6 +646,24 @@ def cluster_req(function): 'No cluster specified and no local pvcapid.yaml configuration found. Use "pvc cluster" to add a cluster API to connect to.' 
) exit(1) + + if not config["quiet"]: + if config["api_scheme"] == "https" and not config["verify_ssl"]: + ssl_unverified_msg = " (unverified)" + else: + ssl_unverified_msg = "" + echo( + 'Using cluster "{}" - Host: "{}" Scheme: "{}{}" Prefix: "{}"'.format( + config["cluster"], + config["api_host"], + config["api_scheme"], + ssl_unverified_msg, + config["api_prefix"], + ), + err=True, + ) + echo("", err=True) + return function(*args, **kwargs) return validate_cluster @@ -5896,23 +6108,7 @@ def cli(_cluster, _debug, _quiet, _unsafe, _colour): config["debug"] = _debug config["unsafe"] = _unsafe config["colour"] = _colour - - if not _quiet: - if config["api_scheme"] == "https" and not config["verify_ssl"]: - ssl_unverified_msg = " (unverified)" - else: - ssl_unverified_msg = "" - echo( - 'Using cluster "{}" - Host: "{}" Scheme: "{}{}" Prefix: "{}"'.format( - config["cluster"], - config["api_host"], - config["api_scheme"], - ssl_unverified_msg, - config["api_prefix"], - ), - err=True, - ) - echo("", err=True) + config["quiet"] = _quiet audit() @@ -5923,6 +6119,7 @@ def cli(_cluster, _debug, _quiet, _unsafe, _colour): cli_cluster.add_command(cluster_add) cli_cluster.add_command(cluster_remove) cli_cluster.add_command(cluster_list) +cli_cluster.add_command(cluster_detail) cli_node.add_command(node_secondary) cli_node.add_command(node_primary)
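The `lbvt` and `ipmi` plugins added in this series follow the same required structure described in the manual: a `PLUGIN_NAME` constant matching the filename, and a `MonitoringPluginScript` class extending `MonitoringPlugin` that implements `setup()`, `run()`, and `cleanup()`. For administrators writing a custom check, a minimal sketch of that structure might look as follows; the plugin name `example`, the checked path, and the health delta of 10 are illustrative assumptions, not part of this patch series.

```
#!/usr/bin/env python3

# example - hypothetical minimal PVC monitoring plugin (illustrative sketch only)

# This import is always required, as in the bundled plugins
from pvcnoded.objects.MonitoringInstance import MonitoringPlugin

# Must match the plugin's filename under node-daemon/plugins/
PLUGIN_NAME = "example"


class MonitoringPluginScript(MonitoringPlugin):
    def setup(self):
        # Optional startup checks; return a non-None message to report a failure
        pass

    def run(self):
        # Perform the actual check; this condition is a made-up placeholder
        from os import path

        if path.exists("/run/some-required-socket"):  # hypothetical path
            health_delta = 0
            message = "Required socket is present"
        else:
            health_delta = 10  # subtract 10 from the node's total health of 100
            message = "Required socket is missing"

        # Record the result and hand it back to the keepalive, as the bundled plugins do
        self.plugin_result.set_health_delta(health_delta)
        self.plugin_result.set_message(message)
        return self.plugin_result

    def cleanup(self):
        # Optional shutdown steps
        pass
```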
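The timeout change in patch 54 relies on the two-element timeout tuple supported by the Python `requests` library: the first value bounds establishing the TCP connection, the second bounds waiting for response data once connected. A small standalone illustration of the distinction follows; the URL and port are placeholders, not the actual PVC API address.

```
import requests

# (connect timeout, read timeout): fail fast on unreachable hosts, but allow
# very long-running API operations once a connection is established
timeout = (1.05, 172800)

try:
    # Placeholder URL; the CLI builds the real one from its cluster configuration
    response = requests.get("http://pvc.example.com:7370/api/v1", timeout=timeout)
    print(response.status_code)
except requests.exceptions.ConnectTimeout:
    print("Could not connect within about 1 second")
except requests.exceptions.ReadTimeout:
    print("Connected, but no response within 48 hours")
```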
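The `pvc cluster detail` table in patch 55 is formatted by first growing each column width to fit its widest value and then left-padding every field using `str.format` with nested width specifiers. The same idiom, reduced to a self-contained sketch with made-up cluster data:

```
# Made-up rows standing in for the per-cluster details gathered by the command
rows = [
    {"name": "cluster1", "health": "100%", "nodes": "3"},
    {"name": "long-cluster-name", "health": "64%", "nodes": "5"},
]
headers = {"name": "Name", "health": "Health", "nodes": "Nodes"}

# Start each width at the header length, then grow it to fit the widest value
widths = {key: len(title) + 1 for key, title in headers.items()}
for row in rows:
    for key, value in row.items():
        widths[key] = max(widths[key], len(value) + 1)

template = "{name: <{name_w}} {health: <{health_w}} {nodes: <{nodes_w}}"
print(template.format(
    name=headers["name"], name_w=widths["name"],
    health=headers["health"], health_w=widths["health"],
    nodes=headers["nodes"], nodes_w=widths["nodes"],
))
for row in rows:
    print(template.format(
        name=row["name"], name_w=widths["name"],
        health=row["health"], health_w=widths["health"],
        nodes=row["nodes"], nodes_w=widths["nodes"],
    ))
```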
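Patch 55 also moves the "Using cluster" banner out of the top-level `cli()` group and into the `cluster_req` decorator, so it is only printed for commands that actually talk to a cluster. The general shape of such a `functools.wraps`-based guard is sketched below as a standalone example; the `config` dictionary, messages, and `node_list` command are illustrative stand-ins.

```
from functools import wraps

# Illustrative stand-in for the CLI's global configuration dictionary
config = {"cluster": "cluster1", "quiet": False}


def cluster_req(function):
    @wraps(function)
    def validate_cluster(*args, **kwargs):
        # Refuse to run the wrapped command if no cluster is configured
        if config.get("cluster") is None:
            print("No cluster specified.")
            raise SystemExit(1)

        # Emit the connection banner once, only for cluster-backed commands
        if not config.get("quiet"):
            print(f'Using cluster "{config["cluster"]}"')

        return function(*args, **kwargs)

    return validate_cluster


@cluster_req
def node_list():
    print("...listing nodes...")


node_list()
```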