From 9c14d84bfc0874a5eb6f2201e1353b0e1b83f760 Mon Sep 17 00:00:00 2001 From: "Joshua M. Boniface" Date: Mon, 13 Feb 2023 14:37:44 -0500 Subject: [PATCH] Add node health value and send out API --- client-cli/pvc/cli_lib/node.py | 77 ++++++++++- daemon-common/migrations/versions/9.json | 2 +- daemon-common/node.py | 38 +++++- daemon-common/zkhandler.py | 1 + node-daemon/plugins/ceph | 4 +- node-daemon/plugins/ceph-cluster | 126 ------------------ node-daemon/plugins/dpkg | 3 +- .../pvcnoded/objects/MonitoringInstance.py | 17 ++- 8 files changed, 126 insertions(+), 142 deletions(-) delete mode 100644 node-daemon/plugins/ceph-cluster diff --git a/client-cli/pvc/cli_lib/node.py b/client-cli/pvc/cli_lib/node.py index 26f935ab..22325d20 100644 --- a/client-cli/pvc/cli_lib/node.py +++ b/client-cli/pvc/cli_lib/node.py @@ -215,6 +215,16 @@ def node_list( # Output display functions # def getOutputColours(node_information): + node_health = node_information.get("health", 999) + if node_health <= 50: + health_colour = ansiprint.red() + elif node_health <= 90: + health_colour = ansiprint.yellow() + elif node_health <= 100: + health_colour = ansiprint.green() + else: + health_colour = ansiprint.blue() + if node_information["daemon_state"] == "run": daemon_state_colour = ansiprint.green() elif node_information["daemon_state"] == "stop": @@ -251,6 +261,7 @@ def getOutputColours(node_information): mem_provisioned_colour = "" return ( + health_colour, daemon_state_colour, coordinator_state_colour, domain_state_colour, @@ -261,6 +272,7 @@ def getOutputColours(node_information): def format_info(node_information, long_output): ( + health_colour, daemon_state_colour, coordinator_state_colour, domain_state_colour, @@ -273,14 +285,34 @@ def format_info(node_information, long_output): # Basic information ainformation.append( "{}Name:{} {}".format( - ansiprint.purple(), ansiprint.end(), node_information["name"] + ansiprint.purple(), + ansiprint.end(), + node_information["name"], ) ) ainformation.append( "{}PVC Version:{} {}".format( - ansiprint.purple(), ansiprint.end(), node_information["pvc_version"] + ansiprint.purple(), + ansiprint.end(), + node_information["pvc_version"], ) ) + + node_health = node_information.get("health", "N/A") + if isinstance(node_health, int): + node_health_text = f"{node_health}%" + else: + node_health_text = node_health + ainformation.append( + "{}Health Value:{} {}{}{}".format( + ansiprint.purple(), + ansiprint.end(), + health_colour, + node_health_text, + ansiprint.end(), + ) + ) + ainformation.append( "{}Daemon State:{} {}{}{}".format( ansiprint.purple(), @@ -397,6 +429,7 @@ def format_list(node_list, raw): # Determine optimal column widths node_name_length = 5 pvc_version_length = 8 + health_length = 7 daemon_state_length = 7 coordinator_state_length = 12 domain_state_length = 7 @@ -417,6 +450,15 @@ def format_list(node_list, raw): _pvc_version_length = len(node_information.get("pvc_version", "N/A")) + 1 if _pvc_version_length > pvc_version_length: pvc_version_length = _pvc_version_length + # node_health column + node_health = node_information.get("health", "N/A") + if isinstance(node_health, int): + node_health_text = f"{node_health}%" + else: + node_health_text = node_health + _health_length = len(node_health_text) + 1 + if _health_length > health_length: + health_length = _health_length # daemon_state column _daemon_state_length = len(node_information["daemon_state"]) + 1 if _daemon_state_length > daemon_state_length: @@ -466,7 +508,10 @@ def format_list(node_list, raw): # Format the string (header) node_list_output.append( "{bold}{node_header: <{node_header_length}} {state_header: <{state_header_length}} {resource_header: <{resource_header_length}} {memory_header: <{memory_header_length}}{end_bold}".format( - node_header_length=node_name_length + pvc_version_length + 1, + node_header_length=node_name_length + + pvc_version_length + + health_length + + 2, state_header_length=daemon_state_length + coordinator_state_length + domain_state_length @@ -484,7 +529,14 @@ def format_list(node_list, raw): bold=ansiprint.bold(), end_bold=ansiprint.end(), node_header="Nodes " - + "".join(["-" for _ in range(6, node_name_length + pvc_version_length)]), + + "".join( + [ + "-" + for _ in range( + 6, node_name_length + pvc_version_length + health_length + 1 + ) + ] + ), state_header="States " + "".join( [ @@ -526,12 +578,13 @@ def format_list(node_list, raw): ) node_list_output.append( - "{bold}{node_name: <{node_name_length}} {node_pvc_version: <{pvc_version_length}} \ + "{bold}{node_name: <{node_name_length}} {node_pvc_version: <{pvc_version_length}} {node_health: <{health_length}} \ {daemon_state_colour}{node_daemon_state: <{daemon_state_length}}{end_colour} {coordinator_state_colour}{node_coordinator_state: <{coordinator_state_length}}{end_colour} {domain_state_colour}{node_domain_state: <{domain_state_length}}{end_colour} \ {node_domains_count: <{domains_count_length}} {node_cpu_count: <{cpu_count_length}} {node_load: <{load_length}} \ {node_mem_total: <{mem_total_length}} {node_mem_used: <{mem_used_length}} {node_mem_free: <{mem_free_length}} {node_mem_allocated: <{mem_alloc_length}} {node_mem_provisioned: <{mem_prov_length}}{end_bold}".format( node_name_length=node_name_length, pvc_version_length=pvc_version_length, + health_length=health_length, daemon_state_length=daemon_state_length, coordinator_state_length=coordinator_state_length, domain_state_length=domain_state_length, @@ -551,6 +604,7 @@ def format_list(node_list, raw): end_colour="", node_name="Name", node_pvc_version="Version", + node_health="Health", node_daemon_state="Daemon", node_coordinator_state="Coordinator", node_domain_state="Domain", @@ -568,19 +622,28 @@ def format_list(node_list, raw): # Format the string (elements) for node_information in sorted(node_list, key=lambda n: n["name"]): ( + health_colour, daemon_state_colour, coordinator_state_colour, domain_state_colour, mem_allocated_colour, mem_provisioned_colour, ) = getOutputColours(node_information) + + node_health = node_information.get("health", "N/A") + if isinstance(node_health, int): + node_health_text = f"{node_health}%" + else: + node_health_text = node_health + node_list_output.append( - "{bold}{node_name: <{node_name_length}} {node_pvc_version: <{pvc_version_length}} \ + "{bold}{node_name: <{node_name_length}} {node_pvc_version: <{pvc_version_length}} {health_colour}{node_health: <{health_length}}{end_colour} \ {daemon_state_colour}{node_daemon_state: <{daemon_state_length}}{end_colour} {coordinator_state_colour}{node_coordinator_state: <{coordinator_state_length}}{end_colour} {domain_state_colour}{node_domain_state: <{domain_state_length}}{end_colour} \ {node_domains_count: <{domains_count_length}} {node_cpu_count: <{cpu_count_length}} {node_load: <{load_length}} \ {node_mem_total: <{mem_total_length}} {node_mem_used: <{mem_used_length}} {node_mem_free: <{mem_free_length}} {mem_allocated_colour}{node_mem_allocated: <{mem_alloc_length}}{end_colour} {mem_provisioned_colour}{node_mem_provisioned: <{mem_prov_length}}{end_colour}{end_bold}".format( node_name_length=node_name_length, pvc_version_length=pvc_version_length, + health_length=health_length, daemon_state_length=daemon_state_length, coordinator_state_length=coordinator_state_length, domain_state_length=domain_state_length, @@ -594,6 +657,7 @@ def format_list(node_list, raw): mem_prov_length=mem_prov_length, bold="", end_bold="", + health_colour=health_colour, daemon_state_colour=daemon_state_colour, coordinator_state_colour=coordinator_state_colour, domain_state_colour=domain_state_colour, @@ -602,6 +666,7 @@ def format_list(node_list, raw): end_colour=ansiprint.end(), node_name=node_information["name"], node_pvc_version=node_information.get("pvc_version", "N/A"), + node_health=node_health_text, node_daemon_state=node_information["daemon_state"], node_coordinator_state=node_information["coordinator_state"], node_domain_state=node_information["domain_state"], diff --git a/daemon-common/migrations/versions/9.json b/daemon-common/migrations/versions/9.json index 84ea8ac1..0401b70f 100644 --- a/daemon-common/migrations/versions/9.json +++ b/daemon-common/migrations/versions/9.json @@ -1 +1 @@ -{"version": "9", "root": "", "base": {"root": "", "schema": "/schema", "schema.version": "/schema/version", "config": "/config", "config.maintenance": "/config/maintenance", "config.primary_node": "/config/primary_node", "config.primary_node.sync_lock": "/config/primary_node/sync_lock", "config.upstream_ip": "/config/upstream_ip", "config.migration_target_selector": "/config/migration_target_selector", "cmd": "/cmd", "cmd.node": "/cmd/nodes", "cmd.domain": "/cmd/domains", "cmd.ceph": "/cmd/ceph", "logs": "/logs", "node": "/nodes", "domain": "/domains", "network": "/networks", "storage": "/ceph", "storage.util": "/ceph/util", "osd": "/ceph/osds", "pool": "/ceph/pools", "volume": "/ceph/volumes", "snapshot": "/ceph/snapshots"}, "logs": {"node": "", "messages": "/messages"}, "node": {"name": "", "keepalive": "/keepalive", "mode": "/daemonmode", "data.active_schema": "/activeschema", "data.latest_schema": "/latestschema", "data.static": "/staticdata", "data.pvc_version": "/pvcversion", "running_domains": "/runningdomains", "count.provisioned_domains": "/domainscount", "count.networks": "/networkscount", "state.daemon": "/daemonstate", "state.router": "/routerstate", "state.domain": "/domainstate", "cpu.load": "/cpuload", "vcpu.allocated": "/vcpualloc", "memory.total": "/memtotal", "memory.used": "/memused", "memory.free": "/memfree", "memory.allocated": "/memalloc", "memory.provisioned": "/memprov", "ipmi.hostname": "/ipmihostname", "ipmi.username": "/ipmiusername", "ipmi.password": "/ipmipassword", "sriov": "/sriov", "sriov.pf": "/sriov/pf", "sriov.vf": "/sriov/vf", "monitoring.plugins": "/monitoring_plugins", "monitoring.data": "/monitoring_data"}, "monitoring_plugin": {"name": "", "last_run": "/last_run", "health_delta": "/health_delta", "message": "/message", "data": "/data", "runtime": "/runtime"}, "sriov_pf": {"phy": "", "mtu": "/mtu", "vfcount": "/vfcount"}, "sriov_vf": {"phy": "", "pf": "/pf", "mtu": "/mtu", "mac": "/mac", "phy_mac": "/phy_mac", "config": "/config", "config.vlan_id": "/config/vlan_id", "config.vlan_qos": "/config/vlan_qos", "config.tx_rate_min": "/config/tx_rate_min", "config.tx_rate_max": "/config/tx_rate_max", "config.spoof_check": "/config/spoof_check", "config.link_state": "/config/link_state", "config.trust": "/config/trust", "config.query_rss": "/config/query_rss", "pci": "/pci", "pci.domain": "/pci/domain", "pci.bus": "/pci/bus", "pci.slot": "/pci/slot", "pci.function": "/pci/function", "used": "/used", "used_by": "/used_by"}, "domain": {"name": "", "xml": "/xml", "state": "/state", "profile": "/profile", "stats": "/stats", "node": "/node", "last_node": "/lastnode", "failed_reason": "/failedreason", "storage.volumes": "/rbdlist", "console.log": "/consolelog", "console.vnc": "/vnc", "meta.autostart": "/node_autostart", "meta.migrate_method": "/migration_method", "meta.node_selector": "/node_selector", "meta.node_limit": "/node_limit", "meta.tags": "/tags", "migrate.sync_lock": "/migrate_sync_lock"}, "tag": {"name": "", "type": "/type", "protected": "/protected"}, "network": {"vni": "", "type": "/nettype", "mtu": "/mtu", "rule": "/firewall_rules", "rule.in": "/firewall_rules/in", "rule.out": "/firewall_rules/out", "nameservers": "/name_servers", "domain": "/domain", "reservation": "/dhcp4_reservations", "lease": "/dhcp4_leases", "ip4.gateway": "/ip4_gateway", "ip4.network": "/ip4_network", "ip4.dhcp": "/dhcp4_flag", "ip4.dhcp_start": "/dhcp4_start", "ip4.dhcp_end": "/dhcp4_end", "ip6.gateway": "/ip6_gateway", "ip6.network": "/ip6_network", "ip6.dhcp": "/dhcp6_flag"}, "reservation": {"mac": "", "ip": "/ipaddr", "hostname": "/hostname"}, "lease": {"mac": "", "ip": "/ipaddr", "hostname": "/hostname", "expiry": "/expiry", "client_id": "/clientid"}, "rule": {"description": "", "rule": "/rule", "order": "/order"}, "osd": {"id": "", "node": "/node", "device": "/device", "db_device": "/db_device", "fsid": "/fsid", "ofsid": "/fsid/osd", "cfsid": "/fsid/cluster", "lvm": "/lvm", "vg": "/lvm/vg", "lv": "/lvm/lv", "stats": "/stats"}, "pool": {"name": "", "pgs": "/pgs", "tier": "/tier", "stats": "/stats"}, "volume": {"name": "", "stats": "/stats"}, "snapshot": {"name": "", "stats": "/stats"}} \ No newline at end of file +{"version": "9", "root": "", "base": {"root": "", "schema": "/schema", "schema.version": "/schema/version", "config": "/config", "config.maintenance": "/config/maintenance", "config.primary_node": "/config/primary_node", "config.primary_node.sync_lock": "/config/primary_node/sync_lock", "config.upstream_ip": "/config/upstream_ip", "config.migration_target_selector": "/config/migration_target_selector", "cmd": "/cmd", "cmd.node": "/cmd/nodes", "cmd.domain": "/cmd/domains", "cmd.ceph": "/cmd/ceph", "logs": "/logs", "node": "/nodes", "domain": "/domains", "network": "/networks", "storage": "/ceph", "storage.util": "/ceph/util", "osd": "/ceph/osds", "pool": "/ceph/pools", "volume": "/ceph/volumes", "snapshot": "/ceph/snapshots"}, "logs": {"node": "", "messages": "/messages"}, "node": {"name": "", "keepalive": "/keepalive", "mode": "/daemonmode", "data.active_schema": "/activeschema", "data.latest_schema": "/latestschema", "data.static": "/staticdata", "data.pvc_version": "/pvcversion", "running_domains": "/runningdomains", "count.provisioned_domains": "/domainscount", "count.networks": "/networkscount", "state.daemon": "/daemonstate", "state.router": "/routerstate", "state.domain": "/domainstate", "cpu.load": "/cpuload", "vcpu.allocated": "/vcpualloc", "memory.total": "/memtotal", "memory.used": "/memused", "memory.free": "/memfree", "memory.allocated": "/memalloc", "memory.provisioned": "/memprov", "ipmi.hostname": "/ipmihostname", "ipmi.username": "/ipmiusername", "ipmi.password": "/ipmipassword", "sriov": "/sriov", "sriov.pf": "/sriov/pf", "sriov.vf": "/sriov/vf", "monitoring.plugins": "/monitoring_plugins", "monitoring.data": "/monitoring_data", "monitoring.health": "/monitoring_health"}, "monitoring_plugin": {"name": "", "last_run": "/last_run", "health_delta": "/health_delta", "message": "/message", "data": "/data", "runtime": "/runtime"}, "sriov_pf": {"phy": "", "mtu": "/mtu", "vfcount": "/vfcount"}, "sriov_vf": {"phy": "", "pf": "/pf", "mtu": "/mtu", "mac": "/mac", "phy_mac": "/phy_mac", "config": "/config", "config.vlan_id": "/config/vlan_id", "config.vlan_qos": "/config/vlan_qos", "config.tx_rate_min": "/config/tx_rate_min", "config.tx_rate_max": "/config/tx_rate_max", "config.spoof_check": "/config/spoof_check", "config.link_state": "/config/link_state", "config.trust": "/config/trust", "config.query_rss": "/config/query_rss", "pci": "/pci", "pci.domain": "/pci/domain", "pci.bus": "/pci/bus", "pci.slot": "/pci/slot", "pci.function": "/pci/function", "used": "/used", "used_by": "/used_by"}, "domain": {"name": "", "xml": "/xml", "state": "/state", "profile": "/profile", "stats": "/stats", "node": "/node", "last_node": "/lastnode", "failed_reason": "/failedreason", "storage.volumes": "/rbdlist", "console.log": "/consolelog", "console.vnc": "/vnc", "meta.autostart": "/node_autostart", "meta.migrate_method": "/migration_method", "meta.node_selector": "/node_selector", "meta.node_limit": "/node_limit", "meta.tags": "/tags", "migrate.sync_lock": "/migrate_sync_lock"}, "tag": {"name": "", "type": "/type", "protected": "/protected"}, "network": {"vni": "", "type": "/nettype", "mtu": "/mtu", "rule": "/firewall_rules", "rule.in": "/firewall_rules/in", "rule.out": "/firewall_rules/out", "nameservers": "/name_servers", "domain": "/domain", "reservation": "/dhcp4_reservations", "lease": "/dhcp4_leases", "ip4.gateway": "/ip4_gateway", "ip4.network": "/ip4_network", "ip4.dhcp": "/dhcp4_flag", "ip4.dhcp_start": "/dhcp4_start", "ip4.dhcp_end": "/dhcp4_end", "ip6.gateway": "/ip6_gateway", "ip6.network": "/ip6_network", "ip6.dhcp": "/dhcp6_flag"}, "reservation": {"mac": "", "ip": "/ipaddr", "hostname": "/hostname"}, "lease": {"mac": "", "ip": "/ipaddr", "hostname": "/hostname", "expiry": "/expiry", "client_id": "/clientid"}, "rule": {"description": "", "rule": "/rule", "order": "/order"}, "osd": {"id": "", "node": "/node", "device": "/device", "db_device": "/db_device", "fsid": "/fsid", "ofsid": "/fsid/osd", "cfsid": "/fsid/cluster", "lvm": "/lvm", "vg": "/lvm/vg", "lv": "/lvm/lv", "stats": "/stats"}, "pool": {"name": "", "pgs": "/pgs", "tier": "/tier", "stats": "/stats"}, "volume": {"name": "", "stats": "/stats"}, "snapshot": {"name": "", "stats": "/stats"}} \ No newline at end of file diff --git a/daemon-common/node.py b/daemon-common/node.py index ae5f7e5d..58962f2e 100644 --- a/daemon-common/node.py +++ b/daemon-common/node.py @@ -21,6 +21,7 @@ import time import re +import json import daemon_lib.common as common @@ -49,6 +50,35 @@ def getNodeInformation(zkhandler, node_name): zkhandler.read(("node.count.provisioned_domains", node_name)) ) node_running_domains = zkhandler.read(("node.running_domains", node_name)).split() + node_health = int(zkhandler.read(("node.monitoring.health", node_name))) + node_health_plugins = zkhandler.read(("node.monitoring.plugins", node_name)).split() + node_health_details = list() + for plugin in node_health_plugins: + plugin_last_run = zkhandler.read( + ("node.monitoring.data", node_name, "monitoring_plugin.last_run", plugin) + ) + plugin_health_delta = zkhandler.read( + ( + "node.monitoring.data", + node_name, + "monitoring_plugin.health_delta", + plugin, + ) + ) + plugin_message = zkhandler.read( + ("node.monitoring.data", node_name, "monitoring_plugin.message", plugin) + ) + plugin_data = zkhandler.read( + ("node.monitoring.data", node_name, "monitoring_plugin.data", plugin) + ) + plugin_output = { + "name": plugin, + "last_run": int(plugin_last_run), + "health_delta": int(plugin_health_delta), + "message": plugin_message, + "data": json.loads(plugin_data), + } + node_health_details.append(plugin_output) # Construct a data structure to represent the data node_information = { @@ -61,10 +91,16 @@ def getNodeInformation(zkhandler, node_name): "kernel": node_kernel, "os": node_os, "arch": node_arch, + "health": node_health, + "health_plugins": node_health_plugins, + "health_details": node_health_details, "load": node_load, "domains_count": node_domains_count, "running_domains": node_running_domains, - "vcpu": {"total": node_cpu_count, "allocated": node_vcpu_allocated}, + "vcpu": { + "total": node_cpu_count, + "allocated": node_vcpu_allocated, + }, "memory": { "total": node_mem_total, "allocated": node_mem_allocated, diff --git a/daemon-common/zkhandler.py b/daemon-common/zkhandler.py index 5ec58b83..a52c4ec1 100644 --- a/daemon-common/zkhandler.py +++ b/daemon-common/zkhandler.py @@ -610,6 +610,7 @@ class ZKSchema(object): "sriov.vf": "/sriov/vf", "monitoring.plugins": "/monitoring_plugins", "monitoring.data": "/monitoring_data", + "monitoring.health": "/monitoring_health", }, # The schema of an individual monitoring plugin data entry (/nodes/{node_name}/monitoring_data/{plugin}) "monitoring_plugin": { diff --git a/node-daemon/plugins/ceph b/node-daemon/plugins/ceph index dc0bf8e5..31fc7551 100644 --- a/node-daemon/plugins/ceph +++ b/node-daemon/plugins/ceph @@ -1,6 +1,6 @@ #!/usr/bin/env python3 -# ceph.py - PVC Monitoring example plugin for ceph status +# ceph.py - PVC Monitoring example plugin for Ceph status # Part of the Parallel Virtual Cluster (PVC) system # # Copyright (C) 2018-2022 Joshua M. Boniface @@ -111,7 +111,7 @@ class MonitoringPluginScript(MonitoringPlugin): self.plugin_result.set_message(message) # Set the detailed data in our local PluginResult object - self.plugin_result.set_data(dumps(health_status)) + self.plugin_result.set_data(health_status) # Return our local PluginResult object return self.plugin_result diff --git a/node-daemon/plugins/ceph-cluster b/node-daemon/plugins/ceph-cluster deleted file mode 100644 index 48788925..00000000 --- a/node-daemon/plugins/ceph-cluster +++ /dev/null @@ -1,126 +0,0 @@ -#!/usr/bin/env python3 - -# ceph-cluster.py - PVC Monitoring example plugin for Ceph status -# Part of the Parallel Virtual Cluster (PVC) system -# -# Copyright (C) 2018-2022 Joshua M. Boniface -# -# This program is free software: you can redistribute it and/or modify -# it under the terms of the GNU General Public License as published by -# the Free Software Foundation, version 3. -# -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. -# -# You should have received a copy of the GNU General Public License -# along with this program. If not, see . -# -############################################################################### - -# This script provides an example of a PVC monitoring plugin script. It will create -# a simple plugin to check the Ceph cluster health for anomalies, and return a health -# delta reflective of the overall Ceph status (HEALTH_WARN = 10, HEALTH_ERR = 50). - -# This script can thus be used as an example or reference implementation of a -# PVC monitoring pluginscript and expanded upon as required. - -# A monitoring plugin script must implement the class "MonitoringPluginScript" which -# extends "MonitoringPlugin", providing the 3 functions indicated. Detailed explanation -# of the role of each function is provided in context of the example; see the other -# examples for more potential uses. - -# WARNING: -# -# This script will run in the context of the node daemon keepalives as root. -# DO NOT install untrusted, unvetted plugins under any circumstances. - - -# This import is always required here, as MonitoringPlugin is used by the -# MonitoringPluginScript class -from pvcnoded.objects.MonitoringInstance import MonitoringPlugin - - -# A monitoring plugin script must always expose its nice name, which must be identical to -# the file name -PLUGIN_NAME = "ceph-cluster" - - -# The MonitoringPluginScript class must be named as such, and extend MonitoringPlugin. -class MonitoringPluginScript(MonitoringPlugin): - def setup(self): - """ - setup(): Perform special setup steps during node daemon startup - - This step is optional and should be used sparingly. - """ - - pass - - def run(self): - """ - run(): Perform the check actions and return a PluginResult object - """ - - # Run any imports first - from rados import Rados - from json import loads, dumps - - # Connect to the Ceph cluster - try: - ceph_conn = Rados( - conffile=self.config["ceph_config_file"], - conf=dict(keyring=self.config["ceph_admin_keyring"]), - ) - ceph_conn.connect(timeout=1) - except Exception as e: - self.log(f"Failed to connect to Ceph cluster: {e}", state="e") - return self.plugin_result - - # Get the Ceph cluster health - try: - health_status = loads( - ceph_conn.mon_command(dumps({"prefix": "health", "format": "json"}), b"", timeout=1)[1] - ) - ceph_health = health_status["status"] - except Exception as e: - self.log(f"Failed to get health data from Ceph cluster: {e}", state="e") - return self.plugin_result - finally: - ceph_conn.shutdown() - - # Get a list of error entries in the health status output - error_entries = health_status["checks"].keys() - - # Set the health delta based on the errors presented - if ceph_health == "HEALTH_ERR": - health_delta = 50 - message = f"Ceph cluster in ERROR state: {', '.join(error_entries)}" - elif ceph_health == "HEALTH_WARN": - health_delta = 10 - message = f"Ceph cluster in WARNING state: {', '.join(error_entries)}" - else: - health_delta = 0 - message = "Ceph cluster in OK state" - - # Set the health delta in our local PluginResult object - self.plugin_result.set_health_delta(health_delta) - - # Set the message in our local PluginResult object - self.plugin_result.set_message(message) - - # Set the detailed data in our local PluginResult object - self.plugin_result.set_data(dumps(health_status)) - - # Return our local PluginResult object - return self.plugin_result - - def cleanup(self): - """ - cleanup(): Perform special cleanup steps during node daemon termination - - This step is optional and should be used sparingly. - """ - - pass diff --git a/node-daemon/plugins/dpkg b/node-daemon/plugins/dpkg index 74d00789..40b6990b 100644 --- a/node-daemon/plugins/dpkg +++ b/node-daemon/plugins/dpkg @@ -66,7 +66,6 @@ class MonitoringPluginScript(MonitoringPlugin): # Run any imports first from re import match - from json import dumps import daemon_lib.common as pvc_common # Get Debian version @@ -143,7 +142,7 @@ class MonitoringPluginScript(MonitoringPlugin): "inconsistent_packages": list_inconsistent, "upgradable_packages": list_upgradable, } - self.plugin_result.set_data(dumps(detailed_data)) + self.plugin_result.set_data(detailed_data) # Return our local PluginResult object return self.plugin_result diff --git a/node-daemon/pvcnoded/objects/MonitoringInstance.py b/node-daemon/pvcnoded/objects/MonitoringInstance.py index c4d0f3f2..5bdc8162 100644 --- a/node-daemon/pvcnoded/objects/MonitoringInstance.py +++ b/node-daemon/pvcnoded/objects/MonitoringInstance.py @@ -25,6 +25,7 @@ import importlib.util from os import walk from datetime import datetime +from json import dumps class PluginResult(object): @@ -37,7 +38,7 @@ class PluginResult(object): self.current_time = int(time.time()) self.health_delta = 0 self.message = None - self.data = None + self.data = {} self.runtime = "0.00" def set_health_delta(self, new_delta): @@ -98,7 +99,7 @@ class PluginResult(object): "monitoring_plugin.data", self.plugin_name, ), - self.data, + dumps(self.data), ), ( ( @@ -259,7 +260,7 @@ class MonitoringInstance(object): "monitoring_plugin.data", plugin.plugin_name, ), - None, + dumps({}), ), ( ( @@ -286,7 +287,7 @@ class MonitoringInstance(object): [ ( ("node.monitoring.plugins", self.this_node.name), - self.all_plugin_names, + " ".join(self.all_plugin_names), ), ] ) @@ -346,6 +347,14 @@ class MonitoringInstance(object): else: health_colour = self.logger.fmt_red + self.zkhandler.write( + [ + ( + ("node.monitoring.health", self.this_node.name), + total_health, + ), + ] + ) self.logger.out( f"System health: {health_colour}{total_health}/100{self.logger.fmt_end}", state="t",