Compare commits


329 Commits

Author SHA1 Message Date
8896c6914c Adjust health delta of EDAC Uncorrected to 50
This is a very bad situation and should be critical.
2023-02-22 01:01:54 -05:00
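
The idea, roughly: monitoring plugins report a health delta that is deducted from the node's 100% health score, so a delta of 50 on its own drops the node into a critical range. A minimal sketch of the logic with hypothetical names (the real PVC plugin API wraps this in a plugin class and result object):

```python
def check_edac(uncorrected_errors: int):
    # Uncorrected (UE) EDAC errors indicate failing memory: critical.
    if uncorrected_errors > 0:
        return 50, f"{uncorrected_errors} uncorrected EDAC error(s) detected"
    return 0, "No EDAC errors detected"

health_delta, message = check_edac(uncorrected_errors=1)
```
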
73e04ad2aa Add last item to swagger doc 2023-02-22 00:25:27 -05:00
6f5aecfa22 Add plugin directory and plugin details log fields 2023-02-22 00:19:05 -05:00
c834a3e9c8 Update API specification 2023-02-22 00:06:52 -05:00
a40de4b7f8 Update readme for Munin plugin 2023-02-18 00:00:04 -05:00
55f0aae2a7 Fix typo in var and flip conditional 2023-02-17 16:18:42 -05:00
f04f816e1b Fix various issues with PVC Munin plugin 2023-02-17 15:41:16 -05:00
3f9c1c735b Flip VM state condition to remove shutdown
Don't cause health degradation for shutdown state, and flip the list
around to make it clearer.
2023-02-16 20:32:33 -05:00
396f424f80 Update Munin plugin example 2023-02-16 16:06:00 -05:00
529e6d6878 Add CheckMK monitoring example plugins 2023-02-16 16:05:47 -05:00
75639c17d9 Format cluster health like node healths
Make a cleaner construct here.
2023-02-16 12:33:36 -05:00
3c6c33a326 Exclude monitoring examples from flake8 2023-02-16 12:33:18 -05:00
25d0fde5e4 Add JSON output format for node info 2023-02-15 21:35:44 -05:00
4ab0bdd9e8 Disallow health less than 0 2023-02-15 16:50:24 -05:00
21965d280c Fix comparison in maintenance check 2023-02-15 16:47:31 -05:00
3408e27355 Add per-node health entries for 3rd party checks 2023-02-15 16:44:49 -05:00
fa900f6212 Fix bugs and formatting of health messages 2023-02-15 16:28:56 -05:00
b236127dba Remove extra text from packages plugin 2023-02-15 16:28:41 -05:00
0ae77d7e77 Fix linting of cluster.py file 2023-02-15 15:48:31 -05:00
8b5011c266 Move Ceph health to global cluster health 2023-02-15 15:46:13 -05:00
6ac5b0d02f Modify cluster health to use new values 2023-02-15 15:45:43 -05:00
3a1b8f0e7a Add JSON health to cluster data 2023-02-15 15:26:57 -05:00
f6bea50a0a Add disk monitoring plugin 2023-02-15 11:30:49 -05:00
fc16e26f23 Run setup during plugin loads 2023-02-15 10:11:38 -05:00
8aa74aae62 Use percentage in keepalive output 2023-02-15 01:56:02 -05:00
265e1e29d7 Improve ethtool parsing speeds 2023-02-14 15:49:58 -05:00
c6a8c6d39b Add NIC monitoring plugin 2023-02-14 15:43:52 -05:00
8e6632bf10 Adjust text on log message 2023-02-13 22:21:23 -05:00
96d3aff7ad Add logging flag for monitoring plugin output 2023-02-13 22:04:39 -05:00
134f59f9ee Flip condition in EDAC check 2023-02-13 21:58:56 -05:00
54373c5bec Fix bugs if plugins fail to load 2023-02-13 21:51:48 -05:00
7378affcb5 Add EDAC check plugin 2023-02-13 21:43:13 -05:00
8df189aa22 Fix several bugs and optimize output 2023-02-13 16:36:15 -05:00
af436a93cc Set node health to None when restarting 2023-02-13 15:54:46 -05:00
edb3aea990 Add node health value and send out API 2023-02-13 15:53:39 -05:00
4d786c11e3 Move Ceph cluster health reporting to plugin
Also removes several outputs from the normal keepalive that were
superfluous/static so that the main output fits on one line.
2023-02-13 13:29:40 -05:00
25f3faa08f Move Ceph cluster health reporting to plugin
Also removes several outputs from the normal keepalive that were
superfluous/static so that the main output fits on one line.
2023-02-13 12:13:56 -05:00
3ad6ff2d9c Initial implementation of monitoring plugin system 2023-02-13 12:06:26 -05:00
c7c47d9f86 Bump version to 0.9.61 2023-02-08 10:08:05 -05:00
3c5a5f08bc Allow rename in disable state 2023-01-30 11:48:43 -05:00
59b2dbeb5e Remove bad casting to int in string compare 2023-01-01 13:55:10 -05:00
0b8d26081b Bump version to 0.9.60 2022-12-06 15:42:55 -05:00
f076554b15 Disable RBD caching by default
Results in a massive (~2x) performance boost for random block I/O inside
VMs, and thus a worthwhile default change.
2022-12-05 17:56:59 -05:00
35f5219916 Fix bad ref in example scripts 2022-11-18 12:54:28 -05:00
f7eaa11a5f Update description 2022-11-16 22:48:40 -05:00
924a0b22ec Fix up remaining bugs in Rinse test script 2022-11-16 13:32:24 -05:00
6a5f54d169 Ensure transient dirs are cleaned up 2022-11-16 13:01:15 -05:00
7741400370 Ensure swap is skipped during cleanup too 2022-11-16 12:52:24 -05:00
5eafa475b9 Skip swap volumes during mounting 2022-11-16 12:42:28 -05:00
f3ba4b6294 Bump version to 0.9.59 2022-11-15 15:50:15 -05:00
faf9cc537f Flip behaviour of memory selectors
It didn't make any sense to me for mem(prov) to be the default selector,
since this has too many caveats versus mem(free). Switch to using
mem(free) as the default (i.e. "mem") and make memprov the alternative.
2022-11-15 15:45:59 -05:00
a28df75a5d Bump version to 0.9.58 2022-11-07 12:27:48 -05:00
13dab7a285 Remove extra lower() call where not needed 2022-11-07 12:26:50 -05:00
f89dbe802e Ensure equality of none and None for selector 2022-11-07 11:59:53 -05:00
d63e80675a Bump version to 0.9.57 2022-11-06 01:39:50 -04:00
263f3570ab Add module tag for daemon lib 2022-11-04 03:47:18 -04:00
90f9336041 Make benchmarker function as a module
1. Move the test_matrix, volume name, and size to module-level variables
so they can be accessed externally if this is imported.
2. Separate the volume creation and volume cleanup into functions.
3. Separate the individual benchmark runs into a function.

This should enable easier calling of the various subcomponents
externally, e.g. for external benchmark scripts.
2022-11-03 21:33:32 -04:00
5415985ed2 Better handle invalid nets in VMs
1. Error out when trying to add a new network to a VM if the network
doesn't exist on the cluster.
2. When showing the VM list, only show invalid networks in red, not the
whole list.
2022-11-01 10:24:24 -04:00
3384f24ef5 Remove VXLAN ref where it isn't correct 2022-11-01 09:40:13 -04:00
ef3c22d793 Bump version to 0.9.56 2022-10-27 14:21:04 -04:00
078f85b431 Add node autoready oneshot unit
This replicates some of the more important functionality of the defunct
pvc-flush.service unit. On presence of a trigger file (i.e.
/etc/pvc/autoready), it will trigger a "node ready" on boot. It does
nothing on shutdown as this must be handled by other mechanisms, though
a similar autoflush could be added as well.
2022-10-27 14:09:14 -04:00
bfb363c459 Ensure None filesystem is valid 2022-10-21 15:13:52 -04:00
13e6a0f0bd Move /dev umount to cleanup step 2022-10-21 14:47:48 -04:00
c1302cf8b6 Adjust help message text 2022-10-21 14:22:15 -04:00
9358949991 Add ova as valid name in addition to default_ova 2022-10-21 14:13:40 -04:00
cd0b8c23e6 Fix console config and domain argument 2022-10-21 14:04:17 -04:00
fb30263a41 Add cloud-init configuration to debootstrap script
Prevents errors trying to find the cloud-init metadata source.
2022-10-21 14:03:34 -04:00
172e3627d4 Add pfsense example provisioner script 2022-10-21 13:35:48 -04:00
53ffe6cd55 Include /proc in chroot mounts 2022-10-20 15:00:10 -04:00
df6e11ae7a Properly handle missing source_volume from OVAs 2022-10-19 13:18:12 -04:00
de2135db42 Add missing ceph import 2022-10-19 13:10:40 -04:00
72e093c2c4 Move conversion to install() step
Seems more clear to me than doing it in prepare()
2022-10-19 13:09:29 -04:00
60e32f7795 Add missing imports 2022-10-19 13:07:34 -04:00
23e7d84f53 Add output messages during OVA prepare 2022-10-19 12:58:11 -04:00
dd81594f26 Fix bad comparison 2022-10-19 12:46:15 -04:00
0d09f5d089 Remove reference to automatic upload of OVA script 2022-10-19 03:37:12 -04:00
365c70e873 Add missing flag 2022-10-19 03:34:37 -04:00
4f7e2fe146 Fix wording of initial script paragraphs 2022-10-19 03:27:14 -04:00
77f49654b9 Fix missing f-string marker 2022-10-15 16:26:47 -04:00
c158e4e0f5 Use own domain for docs links 2022-10-08 21:12:59 -04:00
31a5c8801f Add rinse example configuration
Provisions Rocky Linux 8 and 9 systems, and potentially older
CentOS/Fedora/Scientific Linux/SuSE systems. Depends on a custom build
of rinse (3.7.1) with Rocky 9 support.
2022-10-07 19:55:56 -04:00
0a4e4c7048 Add host-model to CPU config in VMs 2022-10-07 09:36:22 -04:00
de97f2f476 Add output message to debootstrap install 2022-10-07 02:27:20 -04:00
165ce15dfe Fix braces in fstring example 2022-10-06 15:57:31 -04:00
a81d419a2e Update copyright header year 2022-10-06 11:55:27 -04:00
85a7088e5a Fix titles 2022-10-06 11:54:36 -04:00
b58fa06f67 Add OVA script support
1. Ensure that system_template and script are not nullable in the DB.
2. Ensure that the CLI and API enforce the above and clean up CLI
arguments for profile add.
3. Ensure that, before uploading OVAs, a 'default_ova' provisioning
script is present.
4. Use the 'default_ova' script for new OVA uploads.
5. Ensure that OVA details are properly added to the vm_data dict in the
provisioner vmbuilder.
2022-10-06 10:48:12 -04:00
3b3d2e7f7e Reverse numbering of example scripts 2022-10-06 10:14:37 -04:00
72a5de800c Complete OVA provisioning script 2022-10-06 10:14:04 -04:00
f450d1d313 Remove lingering OVA references 2022-10-06 00:13:36 -04:00
2db58488a2 Update documentation to reflect script changes 2022-10-06 00:06:02 -04:00
1bbf8f6bf6 Reorganize and add more comments to examples 2022-10-05 23:35:53 -04:00
191f8780c9 Fix remaining bugs in example scripts 2022-10-05 22:37:11 -04:00
80c1f78864 Ensure inner cleanup and end message response 2022-10-05 22:36:42 -04:00
c8c0987fe7 Fix bad variable reference 2022-10-05 17:43:23 -04:00
67560c6457 Add additional import for config 2022-10-05 17:41:37 -04:00
79c9eba28c Add better exception handling with ctxtmgrs 2022-10-05 17:35:05 -04:00
36e924d339 Add additional missing import in examples 2022-10-05 17:29:34 -04:00
aeb1443410 Improve error messages 2022-10-05 17:26:09 -04:00
eccd2a98b2 Fix bad ref in examples 2022-10-05 17:25:56 -04:00
6e2c1fb45e Add proper imports to examples 2022-10-05 17:22:04 -04:00
b14ba9172c Better handle cleanups and fix chroot bug 2022-10-05 17:21:30 -04:00
e9235a627c Implement new provisioner setup 2022-10-05 16:03:05 -04:00
c84ee0f4f1 Bump version to 0.9.55 2022-10-04 13:21:40 -04:00
76c51460b0 Avoid raise/handle deadlocks
Can cause log flooding in some edge cases and isn't really needed any
longer. Use a proper conditional followed by an actual error handler.
2022-10-03 14:04:12 -04:00
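
The shape of that change, as a generic sketch (hypothetical example, not the actual PVC code): stop raising an exception for an expected condition only to handle it immediately, and instead test the condition directly, keeping the handler for genuine errors.

```python
import logging

def read_mtu(config: dict):
    # Expected case handled with a plain conditional, not a raised exception
    if "mtu" not in config:
        return None
    try:
        return int(config["mtu"])
    except ValueError:  # an actual error worth handling and logging
        logging.error("Invalid MTU value: %r", config["mtu"])
        return None

print(read_mtu({"mtu": "9000"}))
```
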
6ed37f5b4a Try a literal eval first
This is a breakage between the older version of Celery (Deb10) and
newer. The hard removal broke Deb10 instances.

So try that first, and on failure, assume newer Celery format.
2022-09-06 10:34:50 -04:00
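
A sketch of that fallback, assuming (as the message describes) that older Celery hands back a Python-literal string while newer versions hand back data that fails such parsing (helper name hypothetical):

```python
import ast

def normalize_task_args(raw):
    # Older Celery (Debian 10) returns arguments as a Python-literal string;
    # newer releases return them already decoded. Try the legacy parse first
    # and fall back to the newer format on failure.
    try:
        return ast.literal_eval(raw)
    except (ValueError, SyntaxError):
        return raw

print(normalize_task_args("['vm1', 'pvchv1']"))  # legacy: parsed to a list
print(normalize_task_args("plain string"))       # newer: returned as-is
```
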
4b41ee2817 Bump version to 0.9.54 2022-08-23 11:01:05 -04:00
dc36c40690 Use proper SSLContext and enable TLSv1
It's bad, but sometimes you need to access the API from a very old
software version. So just enable it for now and clean it up later.
2022-08-23 10:58:47 -04:00
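
The API daemon imports `SSLContext` and `TLSVersion` from `ssl` (visible in a diff further down), which allows the minimum accepted version to be pinned down to TLSv1. A minimal sketch, omitting the wiring into the web server:

```python
from ssl import SSLContext, TLSVersion

# Deliberately permit ancient TLSv1 clients; weak by modern standards,
# but sometimes required to reach the API from very old software.
context = SSLContext()
context.minimum_version = TLSVersion.TLSv1
```
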
459b16386b Fix bad variable name 2022-08-18 11:37:57 -04:00
6146b062d6 Bump version to 0.9.53 2022-08-12 17:47:11 -04:00
74193c7e2a Actually fix VM sorting
Due to the executor the previous attempt did not work.
2022-08-12 17:46:29 -04:00
73c1ac732e Bump version to 0.9.52 2022-08-12 11:09:25 -04:00
58dd5830eb Add additional kb_ values to OSD stats
Allows for easier parsing later to get e.g. % values and more details on
the used amounts.
2022-08-11 11:06:36 -04:00
90e515c46f Always sort VM list
Same justification as previous commit.
2022-08-09 12:05:40 -04:00
a6a5f71226 Ensure the node list is sorted
Otherwise the node entries could come back in an arbitrary order; since
this is an ordered list of dictionaries, that might not be expected by
API consumers, so ensure it's always sorted.
2022-08-09 12:03:49 -04:00
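
The fix pattern is just an explicit sort before returning; the sort key here is assumed to be the node name:

```python
node_list = [{"name": "pvchv3"}, {"name": "pvchv1"}, {"name": "pvchv2"}]
node_list = sorted(node_list, key=lambda node: node["name"])
# API consumers now always receive pvchv1, pvchv2, pvchv3 in that order
```
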
60a3ef1604 Add reference to bootstrap in index 2022-08-03 20:22:16 -04:00
95807b23eb Add missing cluster_req for vm modify 2022-08-02 10:02:26 -04:00
5ae430e1c5 Bump version to 0.9.51 2022-07-25 23:25:41 -04:00
4731faa2f0 Remove pvc-flush service
This service caused more headaches than it was worth, so remove it.

The original goal was to cleanly flush nodes on shutdown and unflush
them on startup, but this is tightly controlled by Ansible playbooks at
this point, and this is something best left to the Administrator and
their particular situation anyways.
2022-07-25 23:21:34 -04:00
42f4907dec Add confirmation to disable command 2022-07-21 16:43:37 -04:00
02168a5ecf Remove faulty literal_eval 2022-07-18 13:35:15 -04:00
8cfcd02ac2 Fix bad changelog entries 2022-07-06 16:57:55 -04:00
e464dcb483 Bump version to 0.9.50 2022-07-06 16:01:14 -04:00
27214c8190 Fix bug with space-containing detect strings 2022-07-06 15:58:57 -04:00
f78669a175 Add selector help and adjust flag name
1. Add documentation on the node selector flags. In the API, reference
the daemon configuration manual which now includes details in this
section; in the CLI, provide the help in "pvc vm define" in detail and
then reference that command's help in the other commands that use this
field.

2. Ensure the naming is consistent in the CLI, using the flag name
"--node-selector" everywhere (was "--selector" for "pvc vm" commands and
"--node-selector" for "pvc provisioner" commands).
2022-06-10 02:42:06 -04:00
00a4a01517 Add memfree to selector and use proper defaults 2022-06-10 02:03:12 -04:00
a40a69816d Add migration selector via free memory
Closes #152
2022-05-18 03:47:16 -04:00
baf5a132ff Bump version to 0.9.49 2022-05-06 15:49:39 -04:00
584cb95b8d Use consistent language for primary mode
I didn't call it "router" anywhere else, but the state in the list is
called "coordinator", so call it "coordinator mode".
2022-05-06 15:40:52 -04:00
21bbb0393f Add support for replacing/refreshing OSDs
Adds commands to both replace an OSD disk, and refresh (reimport) an
existing OSD disk on a new node. This handles the cases where an OSD
disk should be replaced (either due to upgrades or failures) or where a
node is rebuilt in-place and an existing OSD must be re-imported to it.

This should avoid the need to do a full remove/add sequence for either
case.

Also cleans up some aspects of OSD removal that are identical between
methods (e.g. using safe-to-destroy and sleeping after stopping) and
fixes a bug if an OSD does not truly exist when the daemon starts up.
2022-05-06 15:32:06 -04:00
d18e009b00 Improve handling of rounded values 2022-05-02 15:29:30 -04:00
1f8f3252a6 Fix bug with initial JSON for stats 2022-05-02 13:28:19 -04:00
b47c9832b7 Refactor OSD removal to use new ZK data
With the OSD LVM information stored in Zookeeper, we can use this to
determine the actual block device to zap rather than relying on runtime
determination and guesstimation.
2022-05-02 12:52:22 -04:00
d2757004db Store additional OSD information in ZK
Ensures that information like the FSIDs and the OSD LVM volume are
stored in Zookeeper at creation time and updated at daemon start time
(to ensure the data is populated at least once, or if the /dev/sdX
path changes).

This will allow safer operation of OSD removals and the potential
implementation of re-activation after node replacements.
2022-05-02 12:11:39 -04:00
7323269775 Ensure initial OSD stats is populated
Values are all invalid but this ensures the client won't error out when
trying to show an OSD that has never checked in yet.
2022-04-29 16:50:30 -04:00
85463f9aec Bump version to 0.9.48 2022-04-29 15:03:52 -04:00
19c37c3ed5 Fix bugs with forced removal 2022-04-29 14:03:07 -04:00
7d2ea494e7 Ensure unresponsive OSDs still display in list
It is still useful to see such dead OSDs even if they've never checked
in or have not checked in for quite some time.
2022-04-29 12:11:52 -04:00
cb50eee2a9 Add OSD removal force option
Ensures a removal can continue even in situations where some step(s)
might fail, for instance removing an obsolete OSD from a replaced node.
2022-04-29 11:16:33 -04:00
f3f4eaadf1 Use a singular configured cluster by default
If there is...
  1. No '--cluster' passed, and
  2. No 'local' cluster, and
  3. There is exactly one cluster configured
...then use that cluster by default in the CLI.
2022-01-13 18:36:20 -05:00
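
That fallback chain can be sketched as follows (function name and data shape hypothetical):

```python
def pick_default_cluster(passed_cluster, configured_clusters):
    # Honour an explicit --cluster, then a "local" entry, then fall back
    # to the sole configured cluster if there is exactly one.
    if passed_cluster is not None:
        return passed_cluster
    if "local" in configured_clusters:
        return "local"
    if len(configured_clusters) == 1:
        return next(iter(configured_clusters))
    return None  # ambiguous; the caller must error out

print(pick_default_cluster(None, {"prod": {...}}))  # -> "prod"
```
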
313a5d1c7d Bump version to 0.9.47 2021-12-28 22:03:08 -05:00
b6d689b769 Add pool PGs count modification
Allows an administrator to adjust the PG count of a given pool. This can
be used to increase the PGs (for example after adding more OSDs) or
decrease it (to remove OSDs, reduce CPU load, etc.).
2021-12-28 21:53:29 -05:00
a0fccf83f7 Add PGs count to pool list 2021-12-28 21:12:02 -05:00
46896c593e Fix issue if pool stats have not updated yet 2021-12-28 21:03:10 -05:00
02138974fa Add device class tiers to Ceph pools
Allows specifying a particular device class ("tier") for a given pool,
for instance SSD-only or NVMe-only. This is implemented with Crush
rules on the Ceph side, and via an additional new key in the pool
Zookeeper schema which is defaulted to "default".
2021-12-28 20:58:15 -05:00
c3d255be65 Bump version to 0.9.46 2021-12-28 15:02:14 -05:00
45fc8a47a3 Allow single-node clusters to restart and timeout
Prevents a daemon from waiting forever to terminate if it is primary,
and avoids this entirely if there is only a single node in the cluster.
2021-12-28 03:06:03 -05:00
07f2006f68 Fix bug when removing OSDs
Ensure the OSD is down as well as out or purge might fail.
2021-12-28 03:05:34 -05:00
f4c7fdffb8 Handle detect strings as arguments for blockdevs
Allows specifying blockdevs in the OSD and OSD-DB addition commands as
detect strings rather than actual block device paths. This provides
greater flexibility for automation with pvcbootstrapd (which originates
the concept of detect strings) and in general usage as well.
2021-12-28 02:53:02 -05:00
be1b67b8f0 Allow bypassing confirm message for benchmarks 2021-12-23 21:00:42 -05:00
d68f6a945e Add auditing to local syslog from PVC client
This ensures that any client command is logged by the local system.
Helps ensure Accounting for users of the CLI. Currently logs the full
command executed along with the $USER environment variable contents.
2021-12-10 16:17:33 -05:00
c776aba8b3 Standardize fuzzy matching and use fullmatch
Solves two problems:

1. Match fuzziness was applied very inconsistently; make it uniform,
i.e. "if is_fuzzy and limit, apply .* to both sides".

2. Use re.fullmatch instead of re.match to ensure exact matching of the
regex to the value. Without fuzziness, this would sometimes cause
inconsistent behavior, for instance if a limit was non-fuzzy "vm",
expecting to match the actual "vm", but also matching "vm1" too.
2021-12-06 16:35:29 -05:00
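
A sketch of the standardized behaviour described in the message (helper name hypothetical):

```python
import re

def limit_matches(limit, value, is_fuzzy=True):
    # Fuzzy limits get ".*" applied to both sides, and fullmatch anchors
    # the whole value so a non-fuzzy "vm" matches only "vm", never "vm1".
    if is_fuzzy and limit:
        limit = ".*" + limit + ".*"
    return re.fullmatch(limit, value) is not None

print(limit_matches("vm", "vm1", is_fuzzy=False))  # False with fullmatch
print(limit_matches("vm", "vm1", is_fuzzy=True))   # True: .*vm.* matches
```
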
2461941421 Remove "and started" from message text
This is not necessarily the case.
2021-11-29 16:42:26 -05:00
68954a79ec Fix bug with cloned image sizes 2021-11-29 14:56:50 -05:00
a2fa6ed450 Fix bugs with legacy benchmark format 2021-11-26 11:42:35 -05:00
02a2f6a27a Bump version to 0.9.45 2021-11-25 09:34:20 -05:00
a75b951605 Ensure echo always has an argument 2021-11-25 09:33:26 -05:00
658e80350f Fix ordering of pvcnoded unit
We want to be after network.target and want network-online.target
2021-11-18 16:56:49 -05:00
3aa20fbaa3 Bump version to 0.9.44 2021-11-11 16:20:38 -05:00
6d101df1ff Add Munin plugin for Ceph utilization 2021-11-08 15:21:09 -05:00
be6a3992c1 Add 0.05s to connection timeout
This is recommended by the Python Requests documentation:

> It’s a good practice to set connect timeouts to slightly larger than a
  multiple of 3, which is the default TCP packet retransmission window.
2021-11-08 03:11:41 -05:00
d76da0f25a Use separate connect and data timeouts
This allows us to keep a very low connect timeout of 3 seconds, but also
ensure that long commands (e.g. --wait or VM disable) can take as long
as the API requires to complete.

Avoids having to explicitly set very long single-instance timeouts for
other functions which would block forever on an unreachable API.
2021-11-08 03:10:09 -05:00
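
Requests accepts a `(connect, read)` tuple for its `timeout` parameter, which is how a diff further down expresses this (`timeout = (3.05, 172800)`); a sketch with an illustrative URL:

```python
import requests

# Fail fast (seconds) if the API is unreachable, but allow a long-running
# request such as --wait or a VM disable to stream for up to 48 hours.
timeout = (3.05, 172800)
response = requests.get("http://pvc.local:7370/api/v1", timeout=timeout)
```
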
bc722ce9b8 Fix quote in sed for unstable deb build 2021-11-08 02:54:27 -05:00
7890c32c59 Add sudo to deploy-package task 2021-11-08 02:41:10 -05:00
6febcfdd97 Bump version to 0.9.43 2021-11-08 02:29:17 -05:00
11d8ce70cd Fix sed commands after Black formatting change 2021-11-08 02:29:05 -05:00
a17d9439c0 Remove references to Ansible manual 2021-11-08 00:29:47 -05:00
9cd02eb148 Remove Ansible and Testing manuals
The Ansible manual can't keep up with the other repo, so it should live
there instead (eventually, after significant rewrites).

The Testing page is obsoleted by the "test-cluster" script.
2021-11-08 00:25:27 -05:00
459485c202 Allow American spelling for compatibility 2021-11-08 00:09:59 -05:00
9f92d5d822 Shorten help messages slightly to fit 2021-11-08 00:07:21 -05:00
947ac561c8 Add forced colour support
Allows preserving colour within e.g. watch, where Click would normally
determine that it is "not a terminal". This is done via the wrapper echo
which filters via the local config.
2021-11-08 00:04:20 -05:00
ca143c1968 Add funding configuration 2021-11-06 18:05:17 -04:00
6e110b178c Add start delineators to command output 2021-11-06 13:35:30 -04:00
d07d37d08e Revamp formatting and linting on commit
Remove the prepare script, and run the two stages manually. Better
handle Black reformatting by doing a check (for the errcode) then
reformat and abort commit to review.
2021-11-06 13:34:33 -04:00
0639b16c86 Apply more granular timeout formatting
We don't need to wait forever if the state change isn't wait-flagged or
disable (which does a shutdown before returning).
2021-11-06 13:34:03 -04:00
1cf8706a52 Up timeout when setting VM state
Ensures the API won't time out immediately especially during a
wait-flagged or disable action.
2021-11-06 04:15:10 -04:00
dd8f07526f Use positive check rather than negative
Ensure the VM is started before doing shutdown/stop, rather than being
stopped. Prevents overwrite of existing disable state and other
weirdness.
2021-11-06 04:08:33 -04:00
5a5e5da663 Add disable forcing to CLI
References #148
2021-11-06 04:02:50 -04:00
739b60b91e Perform automatic shutdown/stop on VM disable
Instead of requiring the VM to already be stopped, instead allow disable
state changes to perform a shutdown first. Also add a force option which
will do a hard stop instead of a shutdown.

References #148
2021-11-06 03:57:24 -04:00
16544227eb Reformat recent changes with Black 2021-11-06 03:27:07 -04:00
73e3746885 Fix linting error F541 f-string placeholders 2021-11-06 03:26:03 -04:00
66230ce971 Fix linting errors F522/F523 unused args 2021-11-06 03:24:50 -04:00
fbfbd70461 Rename build-deb.sh to build-stable-deb.sh
Unifies the naming with the other build-unstable-deb.sh script.
2021-11-06 03:18:58 -04:00
2506098223 Remove obsolete gitlab-ci config 2021-11-06 03:18:22 -04:00
83e887c4ee Ensure all helper scripts pushd/popd
Make sure all of these move to the root of the repository first, then
return to where they were afterwards, using pushd/popd. This allows them
to be executed from anywhere in the repo.
2021-11-06 03:17:47 -04:00
4eb0f3bb8a Unify formatting and linting
Ensures optimal formatting in addition to linting during manual deploys
and during pre-commit actions.
2021-11-06 03:10:17 -04:00
adc767e32f Add newline to start of lint 2021-11-06 03:04:14 -04:00
2083fd824a Reformat code with Black code formatter
Unify the code style along PEP and Black principles using the tool.
2021-11-06 03:02:43 -04:00
3aa74a3940 Add safe mode to Black 2021-11-06 02:59:54 -04:00
71d94bbeab Move Flake configuration into dedicated file
Avoid passing arguments in the script.
2021-11-06 02:55:37 -04:00
718f689df9 Clean up linter after Black add (pass two) 2021-11-06 02:51:14 -04:00
268b5c0b86 Exclude Alembic migrations from Black
These files are autogenerated with their own formats, so we don't want
to override that.
2021-11-06 02:46:06 -04:00
b016b9bf3d Clean up linter after Black add (pass one) 2021-11-06 02:44:24 -04:00
7604b9611f Add black formatter to project root 2021-11-06 02:44:05 -04:00
b21278fd80 Add Basic Builder configuration
Configuration for my new CI system under Gitea.
2021-10-31 00:09:55 -04:00
3b02034b70 Add some delay and additional tries to fencing 2021-10-27 16:24:17 -04:00
c7a5b41b1e Fix ordering to show correct message 2021-10-27 13:37:52 -04:00
48b0091d3e Support adding the same network to a VM again
This is a supported configuration for some edge cases and should be
allowed.
2021-10-27 13:33:27 -04:00
2e94516ee2 Reorder linting on build-and-deploy 2021-10-27 13:25:14 -04:00
d7f26b27ea More gracefully handle restart + live
Instead of erroring, just use the implication that restarting a VM does
not want a live modification, and proceed from there. Update the help
text to match.
2021-10-27 13:23:39 -04:00
872f35a7ee Support removing VM interfaces by MAC
Provides a way to handle multiple interfaces in the same network
gracefully, while making the previous behaviour explicit.
2021-10-27 13:20:05 -04:00
52c3e8ced3 Fix bad test in postinst 2021-10-19 00:27:12 -04:00
1d7acf62bf Fix bad location of config sets 2021-10-12 17:23:04 -04:00
c790c331a7 Also validate on failures 2021-10-12 17:11:03 -04:00
23165482df Bump version to 0.9.42 2021-10-12 15:25:42 -04:00
057071a7b7 Go back to passing if exception
Validation already happened and the set happens again later.
2021-10-12 14:21:52 -04:00
554fa9f412 Use current live value for bridge_mtu
This will ensure that upgrading without the bridge_mtu config key set
will keep things as they are.
2021-10-12 12:24:03 -04:00
5a5f924268 Use power off in fence instead of reset
Use a power off (and then make the power on a requirement) during a node
fence. Removes some potential ambiguity in the power state, since we
will know for certain if it is off.
2021-10-12 11:04:27 -04:00
cc309fc021 Validate network MTU after initial read 2021-10-12 10:53:17 -04:00
5f783f1663 Make cluster example images clickable 2021-10-12 03:15:04 -04:00
bc89bb5b68 Mention fencing only in run state 2021-10-12 03:05:01 -04:00
eb233ef588 Adjust more wording and fix typos 2021-10-12 03:00:21 -04:00
d3efb54cb4 Adjust some wording 2021-10-12 02:54:16 -04:00
da15357c8a Remove codeql setup
I don't use this for anything useful, so disable it since a run takes
ages.
2021-10-12 02:51:19 -04:00
b6939a28c0 Fix formatting of subsection 2021-10-12 02:49:40 -04:00
a1da479a4c Add reference to Ansible manual 2021-10-12 02:48:47 -04:00
ace4082820 Fix spelling errors 2021-10-12 02:47:31 -04:00
4036af6045 Fix link to cluster architecture docs 2021-10-12 02:41:22 -04:00
f96de97861 Adjust getting started docs
Update the docs with the current information on setting up a cluster,
including simplifying the Ansible configuration to use the new
create-local-repo.sh script, and simplifying some other sections.
2021-10-12 02:39:25 -04:00
04cad46305 Default to removing build artifacts in b-a-d.sh 2021-10-11 16:41:00 -04:00
e9dea4d2d1 Add explicit 3 second timeout to requests 2021-10-11 16:31:18 -04:00
39fd85fcc3 Add version function support to CLI 2021-10-11 15:34:41 -04:00
cbbab46b55 Add new configs for Ansible 2021-10-11 14:44:18 -04:00
d1f2ce0b0a Bump version to 0.9.41 2021-10-09 19:39:21 -04:00
2f01edca14 Add bridge_mtu config to docs 2021-10-09 19:28:50 -04:00
12a3a3a6a6 Adjust log type of object setup message 2021-10-09 19:23:12 -04:00
c44732be83 Avoid duplicate runs of MTU set
It wasn't the validator duplicating, but the update duplicating, so
properly avoid that happening this time.
2021-10-09 19:21:47 -04:00
a8b68e0968 Revert "Avoid duplicate runs of MTU validator"
This reverts commit 56021c443a.
2021-10-09 19:11:42 -04:00
e59152afee Set all log messages to information state
None of these were "success" messages and thus shouldn't have been ok
state.
2021-10-09 19:09:38 -04:00
56021c443a Avoid duplicate runs of MTU validator 2021-10-09 19:07:41 -04:00
ebdea165f1 Use correct isinstance instead of type 2021-10-09 19:03:31 -04:00
fb0651fb05 Move MTU validation to function
Prevents code duplication and ensures validation runs when an MTU is
updated, not just on network creation.
2021-10-09 19:01:45 -04:00
35e7e11403 Add logger message when setting MTU 2021-10-09 18:56:18 -04:00
b7555468eb Ensure vx_mtu is always an int() 2021-10-09 18:52:50 -04:00
f1b4ee02ba Fix bad header length in network list 2021-10-09 18:50:32 -04:00
4698edc98e Add MTU value checking and log messages
Ensures that if a specified MTU is more than the maximum it is set to
the maximum instead, and adds warning messages for both situations.
2021-10-09 18:48:56 -04:00
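
The clamping logic can be sketched as follows (names hypothetical; PVC's own logger differs):

```python
import logging

def clamp_mtu(requested_mtu: int, max_mtu: int) -> int:
    # Values above the device maximum are reduced to it, with a warning.
    if requested_mtu > max_mtu:
        logging.warning(
            "MTU %d exceeds maximum %d; using %d instead",
            requested_mtu, max_mtu, max_mtu,
        )
        return max_mtu
    return requested_mtu

print(clamp_mtu(9200, 9000))  # -> 9000, plus a warning
```
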
40e7e04aad Fix invalid schema key
Addresses #144
2021-10-09 18:42:33 -04:00
7f074847c4 Add MTU support to network add/modify commands
Addresses #144
2021-10-09 18:06:21 -04:00
b0b0b75605 Have VXNetworkInstance set MTU if unset
Makes this explicit in Zookeeper if a network is unset, post-migration
(schema version 6).

Addresses #144
2021-10-09 17:52:57 -04:00
89f62318bd Add MTU to network creation/modification
Addresses #144
2021-10-09 17:51:32 -04:00
925141ed65 Fix migration bugs and invalid vx_mtu
Addresses #144
2021-10-09 17:35:10 -04:00
f7a826bf52 Add handlers for client network MTUs
Refactors some of the code in VXNetworkInterface to handle MTUs in a
more streamlined fashion. Also fixes a bug whereby bridge client
networks were being explicitly given the cluster dev MTU which might not
be correct. Now adds support for this option explicitly in the configs,
and defaults to 1500 for safety (the standard Ethernet MTU).

Addresses #144
2021-10-09 17:02:27 -04:00
e176f3b2f6 Make n-1 values clearer 2021-10-07 18:11:15 -04:00
b339d5e641 Correct levels in TOC 2021-10-07 18:08:28 -04:00
d476b13cc0 Correct spelling errors 2021-10-07 18:07:06 -04:00
ce8b2c22cc Add documentation sections on IPMI and fencing 2021-10-07 18:05:47 -04:00
feab5d3479 Correct flawed conditional in verify_ipmi 2021-10-07 15:11:19 -04:00
ee348593c9 Bump version to 0.9.40 2021-10-07 14:42:04 -04:00
e403146bcf Correct bad stop_keepalive_timer call 2021-10-07 14:41:12 -04:00
bde684dd3a Remove redundant wording from header 2021-10-07 12:20:04 -04:00
992e003500 Replace headers with links in CHANGELOG.md 2021-10-07 12:17:44 -04:00
eaeb860a83 Add missing period to changelog sentence 2021-10-07 12:10:35 -04:00
1198ca9f5c Move changelog into dedicated file
The changelog was getting far too long for the README/docs index to
support, so move it into CHANGELOG.md and link to it instead.
2021-10-07 12:09:26 -04:00
e79d200244 Bump version to 0.9.39 2021-10-07 11:52:38 -04:00
5b3bb9f306 Add linting to build-and-deploy
Ensures that bad code isn't deployed during testing.
2021-10-07 11:51:05 -04:00
5501586a47 Add limit negation to VM list
When using the "state", "node", or "tag" arguments to a VM list, add
support for a "negate" flag to look for all VMs *not in* the state,
node, or tag state.
2021-10-07 11:50:52 -04:00
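
A sketch of limit negation (names hypothetical): with the negate flag set, keep the VMs that are *not in* the given state/node/tag values.

```python
def vm_in_limit(vm, field, values, negate=False):
    matched = vm[field] in values
    return not matched if negate else matched

vms = [{"state": "start"}, {"state": "stop"}, {"state": "disable"}]
print([v for v in vms if vm_in_limit(v, "state", ["start"], negate=True)])
```
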
c160648c5c Add note about fencing at remote sites 2021-10-04 19:58:08 -04:00
fa37227127 Correct TOC in architecture page 2021-10-04 01:54:22 -04:00
2cac98963c Correct spelling errors 2021-10-04 01:51:58 -04:00
8e50428707 Double image sizes for example clusters 2021-10-04 01:47:35 -04:00
a4953bc6ef Adjust toc_depth for RTD theme 2021-10-04 01:45:05 -04:00
3c10d57148 Revamp about and architecture docs
Makes these a little simpler to follow and provides some more up-to-date
information based on recent tests and developments.
2021-10-04 01:42:08 -04:00
26d8551388 Adjust bump-version changelog heading level 2021-10-04 01:41:48 -04:00
57342541dd Move changelog headers down one more level 2021-10-04 01:41:22 -04:00
50f8afd749 Adjust indent of index/README versions 2021-10-04 00:33:24 -04:00
3449069e3d Bump version to 0.9.38 2021-10-03 22:32:41 -04:00
cb66b16045 Correct latency units and format name 2021-10-03 17:06:34 -04:00
8edce74b85 Revamp test result display
Instead of showing CLAT percentiles, which are very hard to interpret
and understand, instead use the main latency buckets.
2021-10-03 15:49:01 -04:00
e9b69c4124 Revamp postinst for the API daemon
Ensures that the worker is always restarted and makes the NOTE
conditional more specific.
2021-10-03 15:15:26 -04:00
3948206225 Tweak fio tests for benchmarks
1. Remove ramp_time as this was giving very strange results.

2. Up the runtime to 75 seconds to compensate.

3. Print the fio command to the console to validate.
2021-10-03 15:06:18 -04:00
a09578fcf5 Add benchmark format to list 2021-10-03 15:05:58 -04:00
73be807b84 Adjust ETA for benchmarks 2021-10-02 04:51:01 -04:00
4a9805578e Add format parsing for format 1 storage benchmarks 2021-10-02 04:46:44 -04:00
f70f052df1 Add version 2 benchmark list formatting 2021-10-02 02:47:17 -04:00
1e8841ce69 Handle benchmark running state properly 2021-10-02 01:54:51 -04:00
9c7d39d523 Fix missing argument in database insert 2021-10-02 01:49:47 -04:00
011490bcca Update to storage benchmark format 1
1. Runs `fio` with the `--format=json` option and removes all terse
format parsing from the results.

2. Adds a 15-second ramp time to minimize wonky ramp-up results.

3. Sets group_reporting, which isn't necessary with only a single job,
but is here for consistency.
2021-10-02 01:41:08 -04:00
8de63b2785 Fix handling of array of information
With benchmark info we only ever want to show a single test, so pass only that to
the formatter. Simplifies the format function.
2021-10-02 01:28:39 -04:00
8f8f00b2e9 Avoid versioning benchmark lists
This wouldn't work since each individual test is versioned. Instead add
a placeholder for later once additional format(s) are defined.
2021-10-02 01:25:18 -04:00
1daab49b50 Add format option to benchmark info
Allows specifying of raw json or json-pretty formats in addition to the
"pretty" formatted option.
2021-10-02 01:13:50 -04:00
9f6041b9cf Add benchmark format function support
Allows choosing different list and info functions based on the benchmark
version found. Currently only implements "legacy" version 0 with more to
be added.
2021-10-02 01:07:25 -04:00
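
Version dispatch of this kind can be sketched as a mapping from format version to formatter (names hypothetical):

```python
def format_benchmark_info(benchmark):
    # Pick the formatter matching the test format version, defaulting
    # to the legacy format 0; future formats register here.
    formatters = {
        0: lambda b: f"legacy format: {b['id']}",
    }
    version = benchmark.get("test_format", 0)
    return formatters[version](benchmark)

print(format_benchmark_info({"id": "benchmark-1", "test_format": 0}))
```
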
5b27e438a9 Add test format versioning to storage benchmarks
Adds a test_format database column and a value in the API return for the
test format version, starting at 0 for the existing format as of 0.9.37.

References #143
2021-10-02 00:55:27 -04:00
3e8a85b029 Load benchmark results as JSON
Load the JSON at the API side instead of client side, because that's
what the API doc says it is and it just makes more sense.
2021-09-30 23:40:24 -04:00
19ac1e17c3 Bump version to 0.9.37 2021-09-30 02:08:14 -04:00
252175fb6f Revamp benchmark tests
1. Move to a time-based (60s) benchmark to avoid these taking an absurd
amount of time to show the same information.

2. Eliminate the 256k random benchmarks, since they don't really add
anything.

3. Add in a 4k single-queue benchmark as this might provide valuable
insight into latency.

4. Adjust the output to reflect the above changes.

While this does change the benchmarking, this should not invalidate any
existing benchmarks since most of the test suite is unchanged (especially
the most important 4M sequential and 4K random tests). It simply removes
an unused entry and adds a more helpful one. The time-based change
should not significantly affect the results either; it just reduces the
total runtime for long tests and increases the runtime for quick tests to
provide a better picture.
2021-09-29 20:51:30 -04:00
f39b041471 Add primary node to benchmark job name
Ensures tracking of the current primary node the job was run on, since
this may be relevant for performance reasons.
2021-09-28 09:58:22 -04:00
3b41759262 Add timeouts to queue gets and adjust
Ensure that all keepalive timeouts are set (prevent the queue.get()
actions from blocking forever) and set the thread timeouts to line up as
well. Everything here is thus limited to keepalive_interval seconds
(default 5s) to keep it uniform.
2021-09-27 16:10:27 -04:00
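
The pattern in the standard library, as a sketch using the 5-second default interval mentioned above:

```python
from queue import Queue, Empty

keepalive_interval = 5
q = Queue()
try:
    # Bound the wait so a producer thread that never delivers can no
    # longer block the keepalive forever.
    data = q.get(timeout=keepalive_interval)
except Empty:
    data = None  # give up for this keepalive round
```
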
e514eed414 Re-add success log output during migration 2021-09-27 11:50:55 -04:00
b81e70ec18 Fix missing character in log message 2021-09-27 00:49:43 -04:00
c2a473ed8b Simplify VM migration down to 3 steps
Remove two superfluous synchronization steps which are not needed here,
since the exclusive lock handles that situation anyways.

Still does not fix the weird flush->unflush lock timeout bug, but it is
now better worked around, since cancelling the other wait frees this up
to continue.
2021-09-27 00:03:20 -04:00
5355f6ff48 Work around synchronization lock issues
Make the block on stage C only wait for 900 seconds (15 minutes) to
prevent indefinite blocking.

The issue comes if a VM is being received, and the current unflush is
cancelled for a flush. When this happens, this lock acquisition seems to
block for no obvious reason, and no other changes seem to affect it.
This is certainly some sort of locking bug within Kazoo but I can't
diagnose it as-is. Leave a TODO to look into this again in the future.
2021-09-26 23:26:21 -04:00
bf7823deb5 Improve log messages during VM migration 2021-09-26 23:15:38 -04:00
8ba371723e Use event to non-block wait and fix inf wait 2021-09-26 22:55:39 -04:00
e10ac52116 Track status of VM state thread 2021-09-26 22:55:21 -04:00
341073521b Simplify locking process for VM migration
Rather than using a cumbersome and overly complex ping-pong of read and
write locks, instead move to a much simpler process using exclusive
locks.

Describing the process in ASCII or narrative is cumbersome, but the
process ping-pongs via a set of exclusive locks and wait timers, so that
the two sides are able to synchronize via blocking the exclusive lock.
The end result is a much more streamlined migration (takes about half
the time all things considered) which should be less error-prone.
2021-09-26 22:08:07 -04:00
16c38da5ef Fix failure to connect to libvirt in keepalive
This should be caught and abort the thread rather than failing and
holding up keepalives.
2021-09-26 20:42:01 -04:00
c8134d3a1c Fix several bugs in fence handling
1. Output from ipmitool was not being stripped, and stray newlines were
throwing off the comparisons. Fixes this.

2. Several stages were lacking meaningful messages. Adds these in so the
output is more clear about what is going on.

3. Reduce the sleep time after a fence to just 1x the
keepalive_interval, rather than 2x, because this seemed excessively
long even for slow IPMI interfaces, especially since we're checking the
power state now anyways.

4. Set the node daemon state to an explicit 'fenced' state after a
successful fence to indicate to users that the node was indeed fenced
successfully and not still 'dead'.
2021-09-26 20:07:30 -04:00
9f41373324 Ensure pvc-flush is after network-online 2021-09-26 17:40:42 -04:00
8e62d5b30b Fix typo in log message 2021-09-26 03:35:30 -04:00
7a8eee244a Tweak CLI helptext around OSD actions
Adds some more detail about OSD commands and their values.
2021-09-26 01:29:23 -04:00
7df5b8e52e Fix typo in sgdisk command options 2021-09-26 00:59:05 -04:00
6f96219023 Use re.search instead of re.match
Required since we're not matching the start of the string.
2021-09-26 00:55:29 -04:00
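
The distinction, with an illustrative string:

```python
import re

line = "Model number: SAMSUNG MZ7KH960HAJR"
print(re.match(r"SAMSUNG", line))   # None: match() is anchored at the start
print(re.search(r"SAMSUNG", line))  # a Match: search() scans the whole string
```
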
51967e164b Raise basic exceptions in CephInstance
Avoids having no exception to reraise on failures.
2021-09-26 00:50:10 -04:00
7a3a44d47c Fix OSD creation for partition paths and fix gdisk
The previous implementation did not work with /dev/nvme devices or any
/dev/disk/by-* devices due to some logical failures in the partition
naming scheme, so fix these, and be explicit about what is supported in
the PVC CLI command output.

The 'echo | gdisk' implementation of partition creation also did not
work due to limitations of subprocess.run; instead, use sgdisk which
allows these commands to be written out explicitly and is included in
the same package as gdisk.
2021-09-26 00:12:28 -04:00
44491dd988 Add support for configurable OSD DB ratios
The default of 0.05 (5%) is likely ideal in the initial implementation,
but allow this to be set explicitly for maximum flexibility in
space-constrained or performance-critical use-cases.
2021-09-24 01:06:39 -04:00
eba142f470 Bump version to 0.9.36 2021-09-23 14:01:38 -04:00
6cef68d157 Add separate OSD DB device support
Adds in three parts:

1. Create an API endpoint to create OSD DB volume groups on a device.
Passed through to the node via the same command pipeline as
creating/removing OSDs, and creates a volume group with a fixed name
(osd-db).

2. Adds API support for specifying whether or not to use this DB volume
group when creating a new OSD via the "ext_db" flag. Naming and sizing
is fixed for simplicity and based on Ceph recommendations (5% of OSD
size). The Zookeeper schema tracks the block device to use during
removal.

3. Adds CLI support for the new and modified API endpoints, as well as
displaying the block device and DB block device in the OSD list.

While I debated supporting adding a DB device to an existing OSD, in
practice this ended up being a very complex operation involving stopping
the OSD and setting some options, so this is not supported; this can be
specified during OSD creation only.

Closes #142
2021-09-23 13:59:49 -04:00
e8caf3369e Move console watcher stop try up
Could cause an exception if d_domain is not defined yet.
2021-09-22 16:02:04 -04:00
3e3776a25b Bump version to 0.9.35 2021-09-13 02:20:46 -04:00
6e0d0e264e Add memory and vCPU checks to VM define/modify
Ensures that a VM won't:

(a) Have provisioned more RAM than there is available on a given node.
Due to memory overprovisioning, this is simply a "is the VM memory count
more than the node count", and doesn't factor in free or used memory on
a node, total cluster usage, etc. So if a node has 64GB total RAM, the
VM limit is 64GB. It is up to an administrator to ensure sanity *below*
that value.

(b) Have provisioned more vCPUs than there are CPU cores on the node,
minus 2 to account for hypervisor/storage processes. Will ensure there
is no severe CPU contention caused by a single VM having more vCPUs than
there are actual execution threads available.

Closes #139
2021-09-13 01:51:21 -04:00
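
A sketch of the two checks described above (names and data shape hypothetical): memory is compared against each node's *total* RAM, and vCPUs against the node's core count minus 2 reserved for hypervisor/storage processes.

```python
def validate_vm_resources(vm_memory_mb, vm_vcpus, nodes):
    for node in nodes:
        if vm_memory_mb > node["memory_total_mb"]:
            return False, f"VM memory exceeds total RAM of node {node['name']}"
        if vm_vcpus > node["cpu_cores"] - 2:
            return False, f"VM vCPUs exceed usable cores on node {node['name']}"
    return True, "OK"

nodes = [{"name": "pvchv1", "memory_total_mb": 65536, "cpu_cores": 16}]
print(validate_vm_resources(16384, 14, nodes))
```
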
1855d03a36 Add pool size check when resizing volumes
Closes #140
2021-09-12 19:54:51 -04:00
1a286dc8dd Increase build-and-deploy sleep 2021-09-12 19:50:58 -04:00
1b6d10e03a Handle VM disk/network stats gathering exceptions 2021-09-12 19:41:07 -04:00
73c96d1e93 Add VM device hot attach/detach support
Adds a new API endpoint to support hot attach/detach of devices, and the
corresponding client-side logic to use this endpoint when doing VM
network/storage add/remove actions.

The live attach is now the default behaviour for these types of
additions and removals, and can be disabled if needed.

Closes #141
2021-09-12 19:33:00 -04:00
5841c98a59 Adjust lint script for newer linter 2021-09-12 15:40:38 -04:00
bc6395c959 Don't crash cleanup if no this_node 2021-08-29 03:52:18 -04:00
d582f87472 Change default node object state to flushed 2021-08-29 03:34:08 -04:00
e9735113af Bump version to 0.9.34 2021-08-24 16:15:25 -04:00
722fd0a65d Properly handle =-separated fsargs 2021-08-24 11:40:22 -04:00
3b41beb0f3 Convert argument elements of task status to types 2021-08-23 14:28:12 -04:00
d3392c0282 Fix typo in output message 2021-08-23 00:39:19 -04:00
560c013e95 Bump version to 0.9.33 2021-08-21 03:28:48 -04:00
384c6320ef Avoid failing if no provisioner tasks 2021-08-21 03:25:16 -04:00
445dec1c38 Ensure pycache files are removed on deb creation 2021-08-21 03:19:18 -04:00
534c7cd7f0 Refactor pvcnoded to reduce Daemon.py size
This branch commit refactors the pvcnoded component to better adhere to
good programming practices. The previous Daemon.py was a massive file
which contained almost 2000 lines of direct, root-level code which was
directly imported. Not only was this poor practice, but this resulted
in a nigh-unmaintainable file which was hard even for me to understand.

This refactoring splits a large section of the code from Daemon.py into
separate small modules and functions in the `util/` directory. This will
hopefully make most of the functionality easy to find and modify without
having to dig through a single large file.

Further the existing subcomponents have been moved to the `objects/`
directory which clearly separates them.

Finally, the Daemon.py code has mostly been moved into a function,
`entrypoint()`, which is then called from the `pvcnoded.py` stub.

An additional item is that most format strings have been replaced by
f-strings to make use of the Python 3.6 features in Daemon.py and the
utility files.
2021-08-21 03:14:22 -04:00
4014ef7714 Bump version to 0.9.32 2021-08-19 12:37:58 -04:00
180f0445ac Properly handle exceptions getting VM stats 2021-08-19 12:36:31 -04:00
Joshua Boniface 074664d4c1 Fix image dimensions and size 2021-08-18 19:51:55 -04:00
Joshua Boniface 418ac23d40 Add screenshots to docs 2021-08-18 19:49:53 -04:00
27 changed files with 97 additions and 1084 deletions


@@ -1 +1 @@
-0.9.62
+0.9.61


@@ -1,10 +1,5 @@
 ## PVC Changelog

-###### [v0.9.62](https://github.com/parallelvirtualcluster/pvc/releases/tag/v0.9.62)
-
-* [all] Adds an enhanced health checking, monitoring, and reporting system for nodes and clusters
-* [cli] Adds a cluster detail command
-
 ###### [v0.9.61](https://github.com/parallelvirtualcluster/pvc/releases/tag/v0.9.61)

 * [provisioner] Fixes a bug in network comparison


@@ -27,7 +27,7 @@ from ssl import SSLContext, TLSVersion
 from distutils.util import strtobool as dustrtobool

 # Daemon version
-version = "0.9.62"
+version = "0.9.61"

 # API version
 API_VERSION = 1.0


@@ -486,10 +486,6 @@ class API_Status(Resource):
               type: string
               description: The current primary coordinator node
               example: pvchv1
-            pvc_version:
-              type: string
-              description: The PVC version of the current primary coordinator node
-              example: 0.9.61
             upstream_ip:
               type: string
               description: The cluster upstream IP address in CIDR format


@@ -125,14 +125,11 @@ def format_info(cluster_information, oformat):
         return json.dumps(cluster_information, indent=4)

     # Plain formatting, i.e. human-readable
-    if (
-        cluster_information.get("maintenance") == "true"
-        or cluster_information.get("cluster_health", {}).get("health", "N/A") == "N/A"
-    ):
+    if cluster_information["maintenance"] == "true":
         health_colour = ansiprint.blue()
-    elif cluster_information.get("cluster_health", {}).get("health", 100) > 90:
+    elif cluster_information["cluster_health"]["health"] > 90:
         health_colour = ansiprint.green()
-    elif cluster_information.get("cluster_health", {}).get("health", 100) > 50:
+    elif cluster_information["cluster_health"]["health"] > 50:
         health_colour = ansiprint.yellow()
     else:
         health_colour = ansiprint.red()
@@ -144,12 +141,8 @@ def format_info(cluster_information, oformat):
     )
     ainformation.append("")

-    health_text = (
-        f"{cluster_information.get('cluster_health', {}).get('health', 'N/A')}"
-    )
-    if health_text != "N/A":
-        health_text += "%"
-    if cluster_information.get("maintenance") == "true":
+    health_text = f"{cluster_information['cluster_health']['health']}%"
+    if cluster_information["maintenance"] == "true":
         health_text += " (maintenance on)"

     ainformation.append(
@@ -161,7 +154,7 @@ def format_info(cluster_information, oformat):
             ansiprint.end(),
         )
     )
-    if cluster_information.get("cluster_health", {}).get("messages"):
+    if cluster_information["cluster_health"]["messages"]:
         health_messages = "\n > ".join(
             sorted(cluster_information["cluster_health"]["messages"])
         )
@@ -189,13 +182,6 @@ def format_info(cluster_information, oformat):
             ansiprint.purple(), ansiprint.end(), cluster_information["primary_node"]
         )
     )
-    ainformation.append(
-        "{}PVC version:{} {}".format(
-            ansiprint.purple(),
-            ansiprint.end(),
-            cluster_information.get("pvc_version", "N/A"),
-        )
-    )
     ainformation.append(
         "{}Cluster upstream IP:{} {}".format(
             ansiprint.purple(), ansiprint.end(), cluster_information["upstream_ip"]


@@ -124,8 +124,8 @@ def call_api(
     data=None,
     files=None,
 ):
-    # Set the connect timeout to 1 seconds but extremely long (48 hour) data timeout
-    timeout = (1.05, 172800)
+    # Set the connect timeout to 3 seconds but extremely long (48 hour) data timeout
+    timeout = (3.05, 172800)

     # Craft the URI
     uri = "{}://{}{}{}".format(


@@ -961,9 +961,7 @@ def format_list_dhcp(dhcp_lease_list):
         )
     )

-    for dhcp_lease_information in sorted(
-        dhcp_lease_list, key=lambda lease: lease["hostname"]
-    ):
+    for dhcp_lease_information in sorted(dhcp_lease_list, key=lambda l: l["hostname"]):
         dhcp_lease_list_output.append(
             "{bold}\
 {lease_hostname: <{lease_hostname_length}} \
@@ -1061,7 +1059,7 @@ def format_list_acl(acl_list):
     )
     for acl_information in sorted(
-        acl_list, key=lambda acl: acl["direction"] + str(acl["order"])
+        acl_list, key=lambda l: l["direction"] + str(l["order"])
     ):
         acl_list_output.append(
             "{bold}\


@@ -134,7 +134,7 @@ def get_config(store_data, cluster=None):
     config = dict()
     config["debug"] = False
     config["cluster"] = cluster
-    config["description"] = description
+    config["desctription"] = description
     config["api_host"] = "{}:{}".format(host, port)
     config["api_scheme"] = scheme
     config["api_key"] = api_key
@@ -382,6 +382,8 @@ def cluster_list(raw):
     if not raw:
         # Display the data nicely
         echo("Available clusters:")
+        echo("")
+
         echo(
             "{bold}{name: <{name_length}} {description: <{description_length}} {address: <{address_length}} {port: <{port_length}} {scheme: <{scheme_length}} {api_key: <{api_key_length}}{end_bold}".format(
                 bold=ansiprint.bold(),
@@ -441,230 +443,6 @@
         echo(cluster)


-###############################################################################
-# pvc cluster detail
-###############################################################################
-@click.command(name="detail", short_help="Show details of all available clusters.")
-def cluster_detail():
-    """
-    Show quick details of all PVC clusters configured in this CLI instance.
-    """
-    # Get the existing data
-    clusters = get_store(store_path)
-
-    cluster_details_list = list()
-
-    echo("Gathering information from clusters... ", nl=False)
-
-    for cluster in clusters:
-        _store_data = get_store(store_path)
-        cluster_config = get_config(_store_data, cluster=cluster)
-        retcode, retdata = pvc_cluster.get_info(cluster_config)
-        if retcode == 0:
-            retdata = None
-        cluster_details = {"config": cluster_config, "data": retdata}
-        cluster_details_list.append(cluster_details)
-
-    echo("done.")
-    echo("")
-
-    # Find the lengths of each column
-    name_length = 5
-    description_length = 12
-    health_length = 7
-    primary_node_length = 8
-    pvc_version_length = 8
-    nodes_length = 6
-    vms_length = 4
-    networks_length = 9
-    osds_length = 5
-    pools_length = 6
-    volumes_length = 8
-    snapshots_length = 10
-
-    for cluster_details in cluster_details_list:
-        _name_length = len(cluster_details["config"]["cluster"]) + 1
-        if _name_length > name_length:
-            name_length = _name_length
-
-        _description_length = len(cluster_details["config"]["description"]) + 1
-        if _description_length > description_length:
-            description_length = _description_length
-
-        if cluster_details["data"] is None:
-            continue
-
-        _health_length = (
-            len(
-                str(
-                    cluster_details["data"]
-                    .get("cluster_health", {})
-                    .get("health", "N/A")
-                )
-                + "%"
-            )
-            + 1
-        )
-        if _health_length > health_length:
-            health_length = _health_length
-
-        _primary_node_length = len(cluster_details["data"]["primary_node"]) + 1
-        if _primary_node_length > primary_node_length:
-            primary_node_length = _primary_node_length
-
-        _pvc_version_length = (
-            len(cluster_details["data"].get("pvc_version", "< 0.9.62")) + 1
-        )
-        if _pvc_version_length > pvc_version_length:
-            pvc_version_length = _pvc_version_length
-
-        _nodes_length = len(str(cluster_details["data"]["nodes"]["total"])) + 1
-        if _nodes_length > nodes_length:
-            nodes_length = _nodes_length
-
-        _vms_length = len(str(cluster_details["data"]["vms"]["total"])) + 1
-        if _vms_length > vms_length:
-            vms_length = _vms_length
-
-        _networks_length = len(str(cluster_details["data"]["networks"])) + 1
-        if _networks_length > networks_length:
-            networks_length = _networks_length
-
-        _osds_length = len(str(cluster_details["data"]["osds"]["total"])) + 1
-        if _osds_length > osds_length:
-            osds_length = _osds_length
-
-        _pools_length = len(str(cluster_details["data"]["pools"])) + 1
-        if _pools_length > pools_length:
-            pools_length = _pools_length
-
-        _volumes_length = len(str(cluster_details["data"]["volumes"])) + 1
-        if _volumes_length > volumes_length:
-            volumes_length = _volumes_length
-
-        _snapshots_length = len(str(cluster_details["data"]["snapshots"])) + 1
-        if _snapshots_length > snapshots_length:
-            snapshots_length = _snapshots_length
-
-    # Display the data nicely
-    echo(
-        "{bold}{name: <{name_length}} {description: <{description_length}} {health: <{health_length}} {primary_node: <{primary_node_length}} {pvc_version: <{pvc_version_length}} {nodes: <{nodes_length}} {vms: <{vms_length}} {networks: <{networks_length}} {osds: <{osds_length}} {pools: <{pools_length}} {volumes: <{volumes_length}} {snapshots: <{snapshots_length}}{end_bold}".format(
-            bold=ansiprint.bold(),
-            end_bold=ansiprint.end(),
-            name="Name",
-            name_length=name_length,
-            description="Description",
-            description_length=description_length,
-            health="Health",
-            health_length=health_length,
-            primary_node="Primary",
-            primary_node_length=primary_node_length,
-            pvc_version="Version",
-            pvc_version_length=pvc_version_length,
-            nodes="Nodes",
-            nodes_length=nodes_length,
-            vms="VMs",
-            vms_length=vms_length,
-            networks="Networks",
-            networks_length=networks_length,
-            osds="OSDs",
-            osds_length=osds_length,
-            pools="Pools",
-            pools_length=pools_length,
-            volumes="Volumes",
-            volumes_length=volumes_length,
-            snapshots="Snapshots",
-            snapshots_length=snapshots_length,
-        )
-    )
-
-    for cluster_details in cluster_details_list:
-        if cluster_details["data"] is None:
-            health_colour = ansiprint.blue()
-            name = cluster_details["config"]["cluster"]
-            description = cluster_details["config"]["description"]
-            health = "N/A"
-            primary_node = "N/A"
-            pvc_version = "N/A"
-            nodes = "N/A"
-            vms = "N/A"
-            networks = "N/A"
-            osds = "N/A"
-            pools = "N/A"
-            volumes = "N/A"
-            snapshots = "N/A"
-        else:
-            if (
-                cluster_details["data"].get("maintenance") == "true"
-                or cluster_details["data"]
-                .get("cluster_health", {})
-                .get("health", "N/A")
-                == "N/A"
-            ):
-                health_colour = ansiprint.blue()
-            elif (
-                cluster_details["data"].get("cluster_health", {}).get("health", 100)
-                > 90
-            ):
-                health_colour = ansiprint.green()
-            elif (
-                cluster_details["data"].get("cluster_health", {}).get("health", 100)
-                > 50
-            ):
-                health_colour = ansiprint.yellow()
-            else:
-                health_colour = ansiprint.red()
-
-            name = cluster_details["config"]["cluster"]
-            description = cluster_details["config"]["description"]
-            health = str(
-                cluster_details["data"].get("cluster_health", {}).get("health", "N/A")
-            )
-            if health != "N/A":
-                health += "%"
-            primary_node = cluster_details["data"]["primary_node"]
-            pvc_version = cluster_details["data"].get("pvc_version", "< 0.9.62")
-            nodes = str(cluster_details["data"]["nodes"]["total"])
-            vms = str(cluster_details["data"]["vms"]["total"])
-            networks = str(cluster_details["data"]["networks"])
-            osds = str(cluster_details["data"]["osds"]["total"])
-            pools = str(cluster_details["data"]["pools"])
-            volumes = str(cluster_details["data"]["volumes"])
-            snapshots = str(cluster_details["data"]["snapshots"])
-
-        echo(
-            "{name: <{name_length}} {description: <{description_length}} {health_colour}{health: <{health_length}}{end_colour} {primary_node: <{primary_node_length}} {pvc_version: <{pvc_version_length}} {nodes: <{nodes_length}} {vms: <{vms_length}} {networks: <{networks_length}} {osds: <{osds_length}} {pools: <{pools_length}} {volumes: <{volumes_length}} {snapshots: <{snapshots_length}}".format(
-                health_colour=health_colour,
-                end_colour=ansiprint.end(),
-                name=name,
-                name_length=name_length,
-                description=description,
-                description_length=description_length,
-                health=health,
-                health_length=health_length,
-                primary_node=primary_node,
-                primary_node_length=primary_node_length,
-                pvc_version=pvc_version,
-                pvc_version_length=pvc_version_length,
-                nodes=nodes,
-                nodes_length=nodes_length,
-                vms=vms,
-                vms_length=vms_length,
-                networks=networks,
-                networks_length=networks_length,
-                osds=osds,
-                osds_length=osds_length,
-                pools=pools,
-                pools_length=pools_length,
-                volumes=volumes,
-                volumes_length=volumes_length,
-                snapshots=snapshots,
-                snapshots_length=snapshots_length,
-            )
-        )
-

 # Validate that the cluster is set for a given command
 def cluster_req(function):
     @wraps(function)
@@ -674,24 +452,6 @@ def cluster_req(function):
                 'No cluster specified and no local pvcapid.yaml configuration found. Use "pvc cluster" to add a cluster API to connect to.'
             )
             exit(1)

-        if not config["quiet"]:
-            if config["api_scheme"] == "https" and not config["verify_ssl"]:
-                ssl_unverified_msg = " (unverified)"
-            else:
-                ssl_unverified_msg = ""
-            echo(
-                'Using cluster "{}" - Host: "{}" Scheme: "{}{}" Prefix: "{}"'.format(
-                    config["cluster"],
-                    config["api_host"],
-                    config["api_scheme"],
-                    ssl_unverified_msg,
-                    config["api_prefix"],
-                ),
-                err=True,
-            )
-            echo("", err=True)
-
         return function(*args, **kwargs)

     return validate_cluster
@@ -6136,7 +5896,23 @@ def cli(_cluster, _debug, _quiet, _unsafe, _colour):
     config["debug"] = _debug
     config["unsafe"] = _unsafe
     config["colour"] = _colour
     config["quiet"] = _quiet
+
+    if not _quiet:
+        if config["api_scheme"] == "https" and not config["verify_ssl"]:
+            ssl_unverified_msg = " (unverified)"
+        else:
+            ssl_unverified_msg = ""
+        echo(
+            'Using cluster "{}" - Host: "{}" Scheme: "{}{}" Prefix: "{}"'.format(
+                config["cluster"],
+                config["api_host"],
+                config["api_scheme"],
+                ssl_unverified_msg,
+                config["api_prefix"],
+            ),
+            err=True,
+        )
+        echo("", err=True)

     audit()
@@ -6147,7 +5923,6 @@ def cli(_cluster, _debug, _quiet, _unsafe, _colour):
 cli_cluster.add_command(cluster_add)
 cli_cluster.add_command(cluster_remove)
 cli_cluster.add_command(cluster_list)
-cli_cluster.add_command(cluster_detail)

 cli_node.add_command(node_secondary)
 cli_node.add_command(node_primary)
View File

@@ -2,7 +2,7 @@ from setuptools import setup
setup(
name="pvc",
version="0.9.62",
version="0.9.61",
packages=["pvc", "pvc.cli_lib"],
install_requires=[
"Click",

View File

@@ -194,15 +194,6 @@ def getClusterInformation(zkhandler):
# Get node information object list
retcode, node_list = pvc_node.get_list(zkhandler, None)
# Get primary node
primary_node = common.getPrimaryNode(zkhandler)
# Get PVC version of primary node
pvc_version = "0.0.0"
for node in node_list:
if node["name"] == primary_node:
pvc_version = node["pvc_version"]
# Get vm information object list
retcode, vm_list = pvc_vm.get_list(zkhandler, None, None, None, None)
@@ -304,8 +295,7 @@ def getClusterInformation(zkhandler):
),
"node_health": getNodeHealth(zkhandler, node_list),
"maintenance": maintenance_state,
"primary_node": primary_node,
"pvc_version": pvc_version,
"primary_node": common.getPrimaryNode(zkhandler),
"upstream_ip": zkhandler.read("base.config.upstream_ip"),
"nodes": formatted_node_states,
"vms": formatted_vm_states,

debian/changelog
View File

@@ -1,10 +1,3 @@
pvc (0.9.62-0) unstable; urgency=high
* [all] Adds an enhanced health checking, monitoring, and reporting system for nodes and clusters
* [cli] Adds a cluster detail command
-- Joshua M. Boniface <joshua@boniface.me> Wed, 22 Feb 2023 18:13:45 -0500
pvc (0.9.61-0) unstable; urgency=high
* [provisioner] Fixes a bug in network comparison

View File

@@ -71,8 +71,6 @@ Nodes are networked together via a set of statically-configured, simple layer-2
Further information about the general cluster architecture, including important considerations for node specifications/sizing and network configuration, [can be found at the cluster architecture page](/cluster-architecture). It is imperative that potential PVC administrators read this document thoroughly to understand the specific requirements of PVC and avoid potential missteps in obtaining and deploying their cluster.
More information about the node daemon can be found at the [Node Daemon manual page](/manuals/daemon) and details about the health system and health plugins for nodes can be found at the [health plugin manual page](/manuals/health-plugins).
## Clients
### API Client

View File

@@ -52,11 +52,9 @@ The daemon startup sequence is documented below. The main daemon entry-point is
0. The node activates its keepalived timer and begins sending keepalive updates to the cluster. The daemon state transitions from `init` to `run` and the system has started fully.
## Node health plugins
# PVC Node Daemon manual
The PVC node daemon includes a node health plugin system. These plugins are run during keepalives to check various aspects of node health and adjust the overall node and cluster health accordingly. For example, a plugin might check that all configured network interfaces are online and operating at their correct speed, or that all operating system packages are up-to-date.
For the full details of the health and node health plugin system, see the [node health plugin manual](/manuals/health-plugins).
The PVC node daemon is built with Python 3 and runs directly on the nodes. For details of the startup sequence and general layout, see the [architecture document](/architecture/daemon).
## Configuration

View File

@@ -1,210 +0,0 @@
# Node health plugins
The PVC node daemon includes a node health plugin system. These plugins are run during keepalives to check various aspects of node health and adjust the overall node and cluster health accordingly. For example, a plugin might check that all configured network interfaces are online and operating at their correct speed, or that all operating system packages are up-to-date.
## Configuration
### Plugin Directory
The PVC node configuration includes a configuration option at `system` → `configuration` → `directories` → `plugin_directory` to configure the location of health plugin files on the system. By default, if unset, this directory is `/usr/share/pvc/plugins`. An administrator can override this directory if they wish, though since custom plugins can be installed to the default directory without problems, changing it is not recommended.
### Plugin Logging
Plugin output is logged by default during keepalive messages. This is controlled by the node configuration option at `system` → `configuration` → `logging` → `log_keepalive_plugin_details`. Regardless of this setting, the overall node health is logged at the end of the plugin run.
### Disabling Node Plugins
Node plugins cannot be disabled; at best, a suite of zero plugins can be specified by pointing the above plugin directory to an empty folder. This will effectively render the node at a permanent 100% health. Note however that overall cluster health will still be affected by cluster-wide events (e.g. nodes or VMs being stopped, OSDs going out, etc.).
## Health Plugin Architecture
### Node and Cluster Health
A core concept leveraged by the PVC system is that of node and cluster health. Starting with PVC version 0.9.62, these two health statistics are represented as percentages, with 100% indicating optimal health, 51-90% a "warning" degraded state, and 0-50% a "critical" degraded state.
While a cluster is in maintenance mode (set via `pvc maintenance on` and unset via `pvc maintenance off`), the health values continue to be calculated, but they are ignored for the purposes of "health" output: the output colour will not change, and the reference monitoring plugins (for CheckMK and Munin) will not trigger alerting. This allows the administrator to specify that abnormal conditions are OK for some amount of time without triggering upstream alerting. Additionally, while a node is not in `run` Daemon state, its health will be reported as `N/A`; this is treated as 100% but displayed as such to make clear that the node has not (yet) initialized and run its health check plugins.
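As a rough illustration, an external check in the spirit of those reference plugins might honour the maintenance flag like this (a minimal sketch only; the command follows the reference Munin plugin and the JSON field names mirror the CLI code, but both should be verified against your PVC version):
```
#!/usr/bin/env python3
# Hypothetical external monitoring check that respects maintenance mode
import json
import subprocess

status = json.loads(
    subprocess.run(
        ["/usr/bin/pvc", "--quiet", "--cluster", "local", "status", "--format", "json-pretty"],
        capture_output=True, text=True, check=True,
    ).stdout
)

health = status.get("cluster_health", {}).get("health", "N/A")
if status.get("maintenance") == "true" or health == "N/A":
    print("OK: cluster is in maintenance or has not yet initialized")
elif health > 90:
    print(f"OK: cluster health is {health}%")
elif health > 50:
    print(f"WARNING: cluster health is {health}%")
else:
    print(f"CRITICAL: cluster health is {health}%")
```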
The node health is affected primarily by health plugins as discussed in this manual. Any plugin that detects a fault lowers both the node's health and the cluster's health by its `health_delta` value. For example, a plugin whose current state yields a `health_delta` of `10` reduces its own node's health value to 90%, and the overall cluster health value to 90%.
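A minimal standalone sketch of this arithmetic (the real aggregation happens inside the node daemon keepalive; this merely mirrors it):
```
# Each plugin's health_delta is subtracted from a starting value of 100;
# the result is clamped so health can never go below 0
def aggregate_health(health_deltas):
    health = 100
    for delta in health_deltas:
        health -= delta
    return max(health, 0)

print(aggregate_health([10]))          # 90: one fault with a delta of 10
print(aggregate_health([10, 50, 50]))  # 0: clamped, never negative
```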
In addition, cluster health is affected by several fixed states within the PVC system, as illustrated in the sketch after this list. These are:
* A node in `flushed` Domain state lowers the cluster health by 10; a node in `stop` Daemon state lowers the cluster health by 50.
* A VM in `stop` state lowers the cluster health by 10 (hint: use `disable` state to avoid this).
* An OSD in `down` state lowers the cluster health by 10; an OSD in `out` state lowers the cluster health by 50.
* Memory overprovisioning (total provisioned and running guest memory allocation exceeds the total N-1 cluster memory availability) lowers the cluster health by 50.
* Each Ceph health check message lowers the cluster health by 10 for a `HEALTH_WARN` severity or by 50 for a `HEALTH_ERR` severity. For example, the `OSDMAP_FLAGS` check (reporting, e.g. `noout` state) reports as a `HEALTH_WARN` severity and will thus decrease the cluster health by 10; if an additional `PG_DEGRADED` check fires (also reporting as `HEALTH_WARN` severity), this will decrease the cluster health by a further 10, or 20 total for both. This cumulative effect ensures that multiple simultaneous Ceph issues escalate in severity. For a full list of possible Ceph health check messages, [please see the Ceph documentation](https://docs.ceph.com/en/nautilus/rados/operations/health-checks/).
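The cumulative effect of these fixed deltas can be illustrated the same way (a sketch only; the event labels here are illustrative, not the daemon's internal data structures):
```
# Fixed cluster health deltas from the list above, keyed by illustrative labels
FIXED_DELTAS = {
    ("node", "flushed"): 10,
    ("node", "stop"): 50,
    ("vm", "stop"): 10,
    ("osd", "down"): 10,
    ("osd", "out"): 50,
    ("memory", "overprovisioned"): 50,
}
CEPH_SEVERITY_DELTAS = {"HEALTH_WARN": 10, "HEALTH_ERR": 50}

def cluster_health(events, ceph_checks):
    health = 100
    for event in events:
        health -= FIXED_DELTAS.get(event, 0)
    for severity in ceph_checks:
        health -= CEPH_SEVERITY_DELTAS.get(severity, 0)
    return max(health, 0)

# One stopped VM plus two HEALTH_WARN checks (e.g. OSDMAP_FLAGS and
# PG_DEGRADED): 100 - 10 - 10 - 10 = 70%
print(cluster_health([("vm", "stop")], ["HEALTH_WARN", "HEALTH_WARN"]))
```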
### Built-in Health Plugins
PVC ships with several node health plugins installed and loaded by default, covering the most common aspects of node operation that should be validated and checked. The following plugins are included:
#### `disk`
This plugin checks all SATA/SAS and NVMe block devices for SMART health, if available, and reports any errors.
For SATA/SAS disks reporting standard ATA SMART attributes, a health delta of 10 is raised for each SMART error on each disk, based on the `when_failed` value being set to true. Note that due to this design, several disks with multiple errors can quickly escalate to a critical condition, promptly alerting the administrator to possible major faults.
For NVMe disks, only 3 specific NVMe health information messages are checked: `critical_warning`, `media_errors`, and `percentage_used` above 90. Each check can only be reported once per disk and each raises a health delta of 10.
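A standalone sketch of the NVMe half of this logic, assuming `smartctl` (smartmontools 7+) with JSON output; the shipped plugin discovers block devices itself rather than taking one as an argument:
```
# Check one NVMe device's health log as described above (sketch)
import json
import subprocess

def check_nvme(device):
    smart = json.loads(
        subprocess.run(
            ["/usr/sbin/smartctl", "--all", "--json", device],
            capture_output=True, text=True,
        ).stdout
    )
    health_delta, messages = 0, []
    for attribute, value in smart.get("nvme_smart_health_information_log", {}).items():
        if attribute == "critical_warning" and value > 0:
            health_delta += 10
            messages.append(f"{device} critical warning value {value}")
        if attribute == "media_errors" and value > 0:
            health_delta += 10
            messages.append(f"{device} media errors value {value}")
        if attribute == "percentage_used" and value > 90:
            health_delta += 10
            messages.append(f"{device} percentage used value {value}%")
    return health_delta, messages
```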
#### `dpkg`
This plugin checks for Debian package updates, invalid package states (i.e. not `ii` state), and obsolete configuration files that require cleanup. It will raise a health delta of 1 for each type of inconsistency, for a maximum of 3. It will thus never, on its own, trigger a node or cluster to be in a warning or critical state, but will show the errors for administrator analysis, as an example of a more "configuration anomaly"-type plugin.
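A sketch of the invalid-state check alone (the update and obsolete-config checks are omitted, and the exact commands the shipped plugin runs are an assumption):
```
# List packages whose dpkg state is anything other than the normal 'ii'
import subprocess

def packages_not_ii():
    out = subprocess.run(
        ["dpkg-query", "-W", "-f=${Package} ${db:Status-Abbrev}\n"],
        capture_output=True, text=True,
    ).stdout
    flagged = []
    for line in out.splitlines():
        fields = line.split()
        if len(fields) == 2 and fields[1] != "ii":
            flagged.append(fields[0])
    return flagged

# This category of inconsistency contributes a health delta of 1 if non-empty
health_delta = 1 if packages_not_ii() else 0
```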
#### `edac`
This plugin checks the EDAC utility for messages about errors, primarily in the ECC memory subsystem. It will raise a health delta of 50 if any `Uncorrected` EDAC errors are detected, possibly indicating failing memory.
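A sketch of this check, assuming `edac-util` from the edac-utils package; the precise parsing performed by the shipped plugin is an assumption:
```
# Raise a health delta of 50 if edac-util reports any Uncorrected (UE) errors
import subprocess

def check_edac():
    out = subprocess.run(["/usr/bin/edac-util"], capture_output=True, text=True).stdout
    if "Uncorrected" in out:
        return 50, "EDAC reports Uncorrected errors; memory may be failing"
    return 0, "No EDAC Uncorrected errors found"
```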
#### `ipmi`
This plugin checks whether the daemon can reach its own IPMI address and connect. If it cannot, it raises a health delta of 10.
#### `lbvt`
This plugin checks whether the daemon can connect to the local Libvirt daemon instance. If it cannot, it raises a health delta of 50.
#### `load`
This plugin checks the current 1-minute system load (as reported during keepalives) against the number of total CPU threads available on the node. If the load average is greater, i.e. the node is overloaded, it raises a health delta of 50.
#### `nics`
This plugin checks that all NICs underlying PVC networks and bridges are operating correctly, specifically that bond interfaces have at least 2 active slaves and that all physical NICs are operating at their maximum possible speed. It determines this from several configuration options:
* For each device defined (`bridge_dev`, `upstream_dev`, `cluster_dev`, and `storage_dev`), it determines the type of device. If it is a vLAN, it obtains the underlying device; otherwise, it uses the specified device. It then adds this device to a list of core NICs. Ideally, this list will contain either bonding interfaces or actual ethernet NICs.
* For each core NIC, it checks its type. If it is a `bond` device, it checks the bonding state to ensure that at least 2 slave interfaces are up and operating. If not, it raises a health delta of 10.
* For each core NIC, it checks its maximum possible speed as reported by `ethtool` as well as the current active speed. If the NIC is operating at less than its maximum possible speed, it raises a health delta of 10.
Note that this check may pose problems in some deployment scenarios (e.g. running 25GbE NICs at 10GbE by design). Currently the plugin logic cannot handle this and manual modifications may be required. This is left to the administrator if applicable.
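A standalone sketch of the two per-NIC checks using standard Linux sysfs paths; the shipped plugin derives its device list and maximum speeds from the PVC configuration and `ethtool` rather than taking them as arguments:
```
# Check one core NIC for bond slave count and link speed (sketch)
def check_nic(dev, max_supported_speed_mbps):
    health_delta, messages = 0, []
    # Bond check; the real plugin verifies the slaves are actually up,
    # while this simplification only counts configured slaves
    try:
        with open(f"/sys/class/net/{dev}/bonding/slaves") as fh:
            if len(fh.read().split()) < 2:
                health_delta += 10
                messages.append(f"{dev} has fewer than 2 bond slaves")
    except FileNotFoundError:
        pass  # Not a bond device
    # Speed check: the interface must run at its maximum supported speed
    with open(f"/sys/class/net/{dev}/speed") as fh:
        dev_speed = int(fh.read())
    if dev_speed < max_supported_speed_mbps:
        health_delta += 10
        messages.append(
            f"{dev} at {dev_speed} Mbps, expected {max_supported_speed_mbps} Mbps"
        )
    return health_delta, messages
```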
#### `psql`
This plugin checks whether the daemon can connect to the local PostgreSQL/Patroni daemon instance. If it cannot, it raises a health delta of 50.
#### `zkpr`
This plugin checks whether the daemon can connect to the local Zookeeper daemon instance. If it cannot, it raises a health delta of 50.
### Custom Health Plugins
In addition to the included health plugins, the plugin architecture allows administrators to write their own plugins as required to check specific node details that might not be covered by the default plugins. While the author has endeavoured to cover as many important aspects as possible with the default plugins, there is always the possibility that some other condition becomes important, so the system remains flexible to this need. That said, we would welcome pull requests of new plugins to future versions of PVC should they be widely applicable.
As a warning, health plugins are run in a `root` context by PVC. They must therefore be carefully vetted to avoid damaging the system. DO NOT run untrusted health plugins.
To create a health plugin, first reference the existing health plugins and create a base template.
Each health plugin consists of three main parts:
* An import, which must at least include the `MonitoringPlugin` class from the `pvcnoded.objects.MonitoringInstance` library. You can also load additional imports here, or import them within the functions (which is recommended for namespace simplicity).
```
# This import is always required here, as MonitoringPlugin is used by the MonitoringPluginScript class
from pvcnoded.objects.MonitoringInstance import MonitoringPlugin
```
* A `PLUGIN_NAME` variable which defines the name of the plugin. This must match the filename. Generally, a plugin name will be 4 characters, but this is purely a convention and not a requirement.
```
# A monitoring plugin script must always expose its nice name, which must be identical to the file name
PLUGIN_NAME = "nics"
```
* An instance of a `MonitoringPluginScript` class which extends the `MonitoringPlugin` class.
```
# The MonitoringPluginScript class must be named as such, and extend MonitoringPlugin.
class MonitoringPluginScript(MonitoringPlugin):
    ...
```
Within the `MonitoringPluginScript` class must be 3 primary functions as detailed below. While it is possible to do nothing except `pass` in these functions, or even exclude them (the parent includes empty defaults), all 3 should be included for consistency.
#### `def setup(self):`
This function is run once during the node daemon startup, when the plugin is loaded. It can be used to get one-time setup information, populate plugin instance variables, etc.
The function must take no arguments except `self` and anything returned is ignored.
A plugin can also disable itself at load time by raising any `Exception` in the setup function. Such exceptions are caught by the loader and the plugin is simply not loaded in such a case.
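For example (a hedged sketch; the tool checked for here is purely hypothetical):
```
from shutil import which
from pvcnoded.objects.MonitoringInstance import MonitoringPlugin

PLUGIN_NAME = "hypo"

class MonitoringPluginScript(MonitoringPlugin):
    def setup(self):
        # Raising any Exception here is caught by the loader and the
        # plugin is skipped rather than crashing the daemon
        if which("some-required-tool") is None:
            raise Exception("some-required-tool not found; disabling plugin")
```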
#### `def cleanup(self):`
This function mirrors the setup function, and is run once during the node daemon shutdown process. It can be used to clean up any lingering items (e.g. temporary files) created by the setup or run functions, if required; generally plugins do not need to do any cleanup.
#### `def run(self):`
This function is run each time the plugin is called during a keepalive. It performs the main work of the plugin before returning the end result in a specific format.
Note that this function runs once for each keepalive, which by default is every 5 seconds. It is thus important to keep the runtime as short as possible and avoid doing complex calculations, file I/O, etc. during the plugin run. Do as much as possible in the setup function to keep the run function as quick as possible. A good safe maximum time for any plugin (e.g. if implementing an internal timeout) is 2 seconds.
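One possible way to implement such an internal timeout is sketched below; note that nothing in the daemon enforces this for you, and a timed-out worker thread is abandoned rather than killed:
```
# Run a potentially slow check, giving up after `timeout` seconds (sketch)
from concurrent.futures import ThreadPoolExecutor, TimeoutError as FutureTimeout

def run_with_timeout(check_function, timeout=2.0):
    pool = ThreadPoolExecutor(max_workers=1)
    future = pool.submit(check_function)
    try:
        return future.result(timeout=timeout)
    except FutureTimeout:
        return None  # No result in time; the caller decides the health delta
    finally:
        pool.shutdown(wait=False)
```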
What happens during the run function is of course completely up to the plugin, but it must return a standardized set of details upon completing the run.
An instance of the `PluginResult` class is created by the caller and passed in via `self.plugin_result`. This can be used to set the results as follows:
* The `self.plugin_result.set_health_delta()` function can be used to set the current health delta of the result. This should be `0` unless the plugin detects a fault, at which point it can be any integer value below 100, and affects the node and cluster health as detailed above.
* The `self.plugin_result.set_message()` function can be used to set the message text of the result, explaining in a short but human-readable way what the plugin result is. This will be shown in several places, including the node logs (if enabled), the node info output, and for results that have a health delta above 0, in the cluster status output.
Finally, the `PluginResult` instance stored as `self.plugin_result` must be returned by the run function to the caller upon completion so that it can be added to the node state.
### Logging
The `MonitoringPlugin` class provides a helper logging method (usable as `self.log()`) to assist a plugin author in logging messages to the node daemon console log. This function takes one primary argument, a string message, and an optional `state` keyword argument for alternate states.
The default state is `d` for debug, e.g. `state="d"`. The possible states for log messages are:
* `"d"`: Debug, only printed when the administrator has debug logging enabled. Useful for detailed analysis of the plugin run state.
* `"i"`: Informational, printed at all times but with no intrinsic severity. Use these very sparingly if at all.
* `"t"`: Tick, matches the output of the keepalive itself. Use these very sparingly if at all.
* `"w"`: Warning, prints a warning message. Use these for non-fatal error conditions within the plugin.
* `"e"`: Error, prints an error message. Use these for fatal error conditions within the plugin.
None of the example plugins make use of the logging interface, but it is available for custom plugins should it be required.
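A custom plugin might use the helper like this (a fragment; `some_optional_check` is hypothetical):
```
def run(self):
    self.log("Starting detailed checks", state="d")  # only shown with debug logging
    if not some_optional_check():
        # Non-fatal: warn, but continue the plugin run
        self.log("Optional sub-check failed, skipping", state="w")
    ...
```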
The final output message of each plugin is automatically logged to the node daemon console log with `"t"` state at the completion of all plugins, if the `log_keepalive_plugin_details` configuration option is true. Otherwise, no final output is displayed. This setting does not affect messages printed from within a plugin.
### Example Health Plugin
This is a terse version of the `load` plugin, an extremely simple example that shows all the above requirements clearly. Comments are omitted here for simplicity, but these can be seen in the actual plugin file (at `/usr/share/pvc/plugins/load` on any node).
```
#!/usr/bin/env python3

# load.py: PVC monitoring plugin example

from pvcnoded.objects.MonitoringInstance import MonitoringPlugin

PLUGIN_NAME = "load"


class MonitoringPluginScript(MonitoringPlugin):
    def setup(self):
        pass

    def cleanup(self):
        pass

    def run(self):
        from os import getloadavg
        from psutil import cpu_count

        load_average = getloadavg()[0]
        cpu_cores = cpu_count()

        if load_average > float(cpu_cores):
            health_delta = 50
        else:
            health_delta = 0

        message = f"Current load is {load_average} out of {cpu_cores} CPU cores"

        self.plugin_result.set_health_delta(health_delta)
        self.plugin_result.set_message(message)

        return self.plugin_result
```

View File

@@ -103,11 +103,6 @@
"example": "pvchv1",
"type": "string"
},
"pvc_version": {
"description": "The PVC version of the current primary coordinator node",
"example": "0.9.61",
"type": "string"
},
"snapshots": {
"description": "The total number of snapshots in the storage cluster",
"type": "integer"

View File

@@ -30,8 +30,8 @@ GPLv3
is_multigraph
warning=0.99
critical=1.99
warning=1
critical=2
export PVC_CLIENT_DIR="/run/shm/munin-pvc"
PVC_CMD="/usr/bin/pvc --quiet --cluster local status --format json-pretty"

View File

@@ -129,7 +129,7 @@ class MonitoringPluginScript(MonitoringPlugin):
continue
if disk_type == 'nvme':
for attribute in smart_info.get('nvme_smart_health_information_log', {}).items():
for attribute in smart_info['nvme_smart_health_information_log'].items():
if attribute[0] == "critical_warning" and attribute[1] > 0:
health_delta += 10
messages.append(f"{disk} critical warning value {attribute[1]}")
@@ -140,7 +140,7 @@ class MonitoringPluginScript(MonitoringPlugin):
health_delta += 10
messages.append(f"{disk} percentage used value {attribute[1]}%")
else:
for attribute in smart_info.get('ata_smart_attributes', {}).get('table', []):
for attribute in smart_info['ata_smart_attributes']['table']:
if attribute["when_failed"]:
health_delta += 10
messages.append(f"{disk} attribute {attribute['name']} value {attribute['raw']['value']}")

View File

@@ -1,107 +0,0 @@
#!/usr/bin/env python3
# ipmi.py - PVC Monitoring example plugin for IPMI
# Part of the Parallel Virtual Cluster (PVC) system
#
# Copyright (C) 2018-2022 Joshua M. Boniface <joshua@boniface.me>
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, version 3.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.
#
###############################################################################
# This script provides an example of a PVC monitoring plugin script. It will create
# a simple plugin to check whether the system IPMI is reachable.
# This script can thus be used as an example or reference implementation of a
# PVC monitoring plugin script and expanded upon as required.
# A monitoring plugin script must implement the class "MonitoringPluginScript" which
# extends "MonitoringPlugin", providing the 3 functions indicated. Detailed explanation
# of the role of each function is provided in context of the example; see the other
# examples for more potential uses.
# WARNING:
#
# This script will run in the context of the node daemon keepalives as root.
# DO NOT install untrusted, unvetted plugins under any circumstances.
# This import is always required here, as MonitoringPlugin is used by the
# MonitoringPluginScript class
from pvcnoded.objects.MonitoringInstance import MonitoringPlugin
# A monitoring plugin script must always expose its nice name, which must be identical to
# the file name
PLUGIN_NAME = "ipmi"
# The MonitoringPluginScript class must be named as such, and extend MonitoringPlugin.
class MonitoringPluginScript(MonitoringPlugin):
def setup(self):
"""
setup(): Perform special setup steps during node daemon startup
This step is optional and should be used sparingly.
If you wish for the plugin to not load in certain conditions, do any checks here
and raise any Exception; the plugin will then be skipped at load time.
"""
pass
def run(self):
"""
run(): Perform the check actions and return a PluginResult object
"""
# Run any imports first
from daemon_lib.common import run_os_command
# Check the node's IPMI interface
ipmi_hostname = self.config["ipmi_hostname"]
ipmi_username = self.config["ipmi_username"]
ipmi_password = self.config["ipmi_password"]
retcode, _, _ = run_os_command(
f"/usr/bin/ipmitool -I lanplus -H {ipmi_hostname} -U {ipmi_username} -P {ipmi_password} chassis power status",
timeout=2
)
if retcode > 0:
# Set the health delta to 10 (subtract 10 from the total of 100)
health_delta = 10
# Craft a message that can be used by the clients
message = f"IPMI via {ipmi_username}@{ipmi_hostname} is NOT responding"
else:
# Set the health delta to 0 (no change)
health_delta = 0
# Craft a message that can be used by the clients
message = f"IPMI via {ipmi_username}@{ipmi_hostname} is responding"
# Set the health delta in our local PluginResult object
self.plugin_result.set_health_delta(health_delta)
# Set the message in our local PluginResult object
self.plugin_result.set_message(message)
# Return our local PluginResult object
return self.plugin_result
def cleanup(self):
"""
cleanup(): Perform special cleanup steps during node daemon termination
This step is optional and should be used sparingly.
"""
pass

View File

@@ -1,105 +0,0 @@
#!/usr/bin/env python3
# lbvt.py - PVC Monitoring example plugin for Libvirtd
# Part of the Parallel Virtual Cluster (PVC) system
#
# Copyright (C) 2018-2022 Joshua M. Boniface <joshua@boniface.me>
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, version 3.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.
#
###############################################################################
# This script provides an example of a PVC monitoring plugin script. It will create
# a simple plugin to check the Libvirt daemon instance on the node for operation.
# This script can thus be used as an example or reference implementation of a
# PVC monitoring plugin script and expanded upon as required.
# A monitoring plugin script must implement the class "MonitoringPluginScript" which
# extends "MonitoringPlugin", providing the 3 functions indicated. Detailed explanation
# of the role of each function is provided in context of the example; see the other
# examples for more potential uses.
# WARNING:
#
# This script will run in the context of the node daemon keepalives as root.
# DO NOT install untrusted, unvetted plugins under any circumstances.
# This import is always required here, as MonitoringPlugin is used by the
# MonitoringPluginScript class
from pvcnoded.objects.MonitoringInstance import MonitoringPlugin
# A monitoring plugin script must always expose its nice name, which must be identical to
# the file name
PLUGIN_NAME = "lbvt"
# The MonitoringPluginScript class must be named as such, and extend MonitoringPlugin.
class MonitoringPluginScript(MonitoringPlugin):
def setup(self):
"""
setup(): Perform special setup steps during node daemon startup
This step is optional and should be used sparingly.
If you wish for the plugin to not load in certain conditions, do any checks here
and raise any Exception; the plugin will then be skipped at load time.
"""
pass
def run(self):
"""
run(): Perform the check actions and return a PluginResult object
"""
# Run any imports first
from libvirt import openReadOnly as lvopen
lv_conn = None
# Set the health delta to 0 (no change)
health_delta = 0
# Craft a message that can be used by the clients
message = "Successfully connected to Libvirtd on localhost"
# Check the Zookeeper connection
try:
lv_conn = lvopen(f"qemu+tcp://{self.this_node.name}/system")
data = lv_conn.getHostname()
except Exception as e:
health_delta = 50
message = f"Failed to connect to Libvirtd: {e}"
finally:
if lv_conn is not None:
lv_conn.close()
# Set the health delta in our local PluginResult object
self.plugin_result.set_health_delta(health_delta)
# Set the message in our local PluginResult object
self.plugin_result.set_message(message)
# Return our local PluginResult object
return self.plugin_result
def cleanup(self):
"""
cleanup(): Perform special cleanup steps during node daemon termination
This step is optional and should be used sparingly.
"""
pass

View File

@@ -20,7 +20,8 @@
###############################################################################
# This script provides an example of a PVC monitoring plugin script. It will create
# a simple plugin to check the system load against the total number of CPU cores.
# a simple plugin to check the system load against the total number of CPU cores,
# and return a 10 health delta (100 -> 90) if the load average is > 1/2 that number.
# This script can thus be used as an example or reference implementation of a
# PVC monitoring plugin script and expanded upon as required.
@@ -78,7 +79,7 @@ class MonitoringPluginScript(MonitoringPlugin):
# Check that the load average is greater or equal to the cpu count
if load_average > float(cpu_cores):
# Set the health delta to 10 (subtract 10 from the total of 100)
health_delta = 50
health_delta = 10
# Craft a message that can be used by the clients
message = f"Current load is {load_average} out of {cpu_cores} CPU cores"

View File

@@ -46,6 +46,10 @@ from pvcnoded.objects.MonitoringInstance import MonitoringPlugin
# the file name
PLUGIN_NAME = "nics"
# Set a minimum link speed variable used below
# For PVC at least 10 Gbps is required for proper operation of a cluster
MINIMUM_LINKSPEED = 10000
# The MonitoringPluginScript class must be named as such, and extend MonitoringPlugin.
class MonitoringPluginScript(MonitoringPlugin):
@@ -168,7 +172,7 @@ class MonitoringPluginScript(MonitoringPlugin):
max_supported_link_speed = sorted(list(supported_link_speeds))[-1]
# Ensure interface is running at its maximum speed
# Ensure interface is running at MINIMUM_LINKSPEED
with open(f"/sys/class/net/{dev}/speed") as devfh:
dev_speed = int(devfh.read())
if dev_speed < max_supported_link_speed:

View File

@@ -1,139 +0,0 @@
#!/usr/bin/env python3
# psql.py - PVC Monitoring example plugin for Postgres/Patroni
# Part of the Parallel Virtual Cluster (PVC) system
#
# Copyright (C) 2018-2022 Joshua M. Boniface <joshua@boniface.me>
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, version 3.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.
#
###############################################################################
# This script provides an example of a PVC monitoring plugin script. It will create
# a simple plugin to check the Patroni PostgreSQL instance on the node for operation.
# This script can thus be used as an example or reference implementation of a
# PVC monitoring plugin script and expanded upon as required.
# A monitoring plugin script must implement the class "MonitoringPluginScript" which
# extends "MonitoringPlugin", providing the 3 functions indicated. Detailed explanation
# of the role of each function is provided in context of the example; see the other
# examples for more potential uses.
# WARNING:
#
# This script will run in the context of the node daemon keepalives as root.
# DO NOT install untrusted, unvetted plugins under any circumstances.
# This import is always required here, as MonitoringPlugin is used by the
# MonitoringPluginScript class
from pvcnoded.objects.MonitoringInstance import MonitoringPlugin
# A monitoring plugin script must always expose its nice name, which must be identical to
# the file name
PLUGIN_NAME = "psql"
# The MonitoringPluginScript class must be named as such, and extend MonitoringPlugin.
class MonitoringPluginScript(MonitoringPlugin):
def setup(self):
"""
setup(): Perform special setup steps during node daemon startup
This step is optional and should be used sparingly.
"""
pass
def run(self):
"""
run(): Perform the check actions and return a PluginResult object
"""
# Run any imports first
from psycopg2 import connect
conn_metadata = None
cur_metadata = None
conn_dns = None
cur_dns = None
# Set the health delta to 0 (no change)
health_delta = 0
# Craft a message that can be used by the clients
message = "Successfully connected to PostgreSQL databases on localhost"
# Check the Metadata database (primary)
try:
conn_metadata = connect(
host=self.this_node.name,
port=self.config["metadata_postgresql_port"],
dbname=self.config["metadata_postgresql_dbname"],
user=self.config["metadata_postgresql_user"],
password=self.config["metadata_postgresql_password"],
)
cur_metadata = conn_metadata.cursor()
cur_metadata.execute("""SELECT * FROM alembic_version""")
data = cur_metadata.fetchone()
except Exception as e:
health_delta = 50
err = str(e).split('\n')[0]
message = f"Failed to connect to PostgreSQL database {self.config['metadata_postgresql_dbname']}: {err}"
finally:
if cur_metadata is not None:
cur_metadata.close()
if conn_metadata is not None:
conn_metadata.close()
if health_delta == 0:
# Check the PowerDNS database (secondary)
try:
conn_pdns = connect(
host=self.this_node.name,
port=self.config["pdns_postgresql_port"],
dbname=self.config["pdns_postgresql_dbname"],
user=self.config["pdns_postgresql_user"],
password=self.config["pdns_postgresql_password"],
)
cur_pdns = conn_pdns.cursor()
cur_pdns.execute("""SELECT * FROM supermasters""")
data = cur_pdns.fetchone()
except Exception as e:
health_delta = 50
err = str(e).split('\n')[0]
message = f"Failed to connect to PostgreSQL database {self.config['pdns_postgresql_dbname']}: {err}"
finally:
if cur_pdns is not None:
cur_pdns.close()
if conn_pdns is not None:
conn_pdns.close()
# Set the health delta in our local PluginResult object
self.plugin_result.set_health_delta(health_delta)
# Set the message in our local PluginResult object
self.plugin_result.set_message(message)
# Return our local PluginResult object
return self.plugin_result
def cleanup(self):
"""
cleanup(): Perform special cleanup steps during node daemon termination
This step is optional and should be used sparingly.
"""
pass

View File

@@ -1,107 +0,0 @@
#!/usr/bin/env python3
# zkpr.py - PVC Monitoring example plugin for Zookeeper
# Part of the Parallel Virtual Cluster (PVC) system
#
# Copyright (C) 2018-2022 Joshua M. Boniface <joshua@boniface.me>
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, version 3.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.
#
###############################################################################
# This script provides an example of a PVC monitoring plugin script. It will create
# a simple plugin to check the Zookeeper instance on the node for operation.
# This script can thus be used as an example or reference implementation of a
# PVC monitoring plugin script and expanded upon as required.
# A monitoring plugin script must implement the class "MonitoringPluginScript" which
# extends "MonitoringPlugin", providing the 3 functions indicated. Detailed explanation
# of the role of each function is provided in context of the example; see the other
# examples for more potential uses.
# WARNING:
#
# This script will run in the context of the node daemon keepalives as root.
# DO NOT install untrusted, unvetted plugins under any circumstances.
# This import is always required here, as MonitoringPlugin is used by the
# MonitoringPluginScript class
from pvcnoded.objects.MonitoringInstance import MonitoringPlugin
# A monitoring plugin script must always expose its nice name, which must be identical to
# the file name
PLUGIN_NAME = "zkpr"
# The MonitoringPluginScript class must be named as such, and extend MonitoringPlugin.
class MonitoringPluginScript(MonitoringPlugin):
def setup(self):
"""
setup(): Perform special setup steps during node daemon startup
This step is optional and should be used sparingly.
If you wish for the plugin to not load in certain conditions, do any checks here
and raise any Exception; the plugin will then be skipped at load time.
"""
pass
def run(self):
"""
run(): Perform the check actions and return a PluginResult object
"""
# Run any imports first
from kazoo.client import KazooClient, KazooState
zk_conn = None
# Set the health delta to 0 (no change)
health_delta = 0
# Craft a message that can be used by the clients
message = "Successfully connected to Zookeeper on localhost"
# Check the Zookeeper connection
try:
zk_conn = KazooClient(hosts=[f"{self.this_node.name}:2181"], timeout=1, read_only=True)
zk_conn.start(timeout=1)
data = zk_conn.get('/schema/version')
except Exception as e:
health_delta = 50
message = f"Failed to connect to Zookeeper: {e}"
finally:
if zk_conn is not None:
zk_conn.stop()
zk_conn.close()
# Set the health delta in our local PluginResult object
self.plugin_result.set_health_delta(health_delta)
# Set the message in our local PluginResult object
self.plugin_result.set_message(message)
# Return our local PluginResult object
return self.plugin_result
def cleanup(self):
"""
cleanup(): Perform special cleanup steps during node daemon termination
This step is optional and should be used sparingly.
"""
pass

View File

@@ -49,7 +49,7 @@ import re
import json
# Daemon version
version = "0.9.62"
version = "0.9.61"
##########################################################

View File

@@ -45,7 +45,7 @@ class PluginResult(object):
self.plugin_name = plugin_name
self.current_time = int(time.time())
self.health_delta = 0
self.message = "N/A"
self.message = None
self.data = {}
self.runtime = "0.00"
@@ -332,15 +332,7 @@ class MonitoringInstance(object):
def run_plugin(self, plugin):
time_start = datetime.now()
try:
result = plugin.run()
except Exception as e:
self.logger.out(
f"Monitoring plugin {plugin.plugin_name} failed: {type(e).__name__}: {e}",
state="e",
)
# Whatever it had, we try to return
return plugin.plugin_result
result = plugin.run()
time_end = datetime.now()
time_delta = time_end - time_start
runtime = "{:0.02f}".format(time_delta.total_seconds())
@@ -365,20 +357,25 @@ class MonitoringInstance(object):
plugin_results.append(future.result())
for result in sorted(plugin_results, key=lambda x: x.plugin_name):
if (
self.config["log_keepalives"]
and self.config["log_keepalive_plugin_details"]
):
if self.config["log_keepalive_plugin_details"]:
self.logger.out(
result.message + f" [-{result.health_delta}]",
result.message,
state="t",
prefix=f"{result.plugin_name} ({result.runtime}s)",
)
total_health -= result.health_delta
if result is not None:
total_health -= result.health_delta
if total_health < 0:
total_health = 0
if total_health > 90:
health_colour = self.logger.fmt_green
elif total_health > 50:
health_colour = self.logger.fmt_yellow
else:
health_colour = self.logger.fmt_red
self.zkhandler.write(
[
(
@@ -387,6 +384,10 @@ class MonitoringInstance(object):
),
]
)
self.logger.out(
f"Node health: {health_colour}{total_health}%{self.logger.fmt_end}",
state="t",
)
def run_cleanup(self, plugin):
return plugin.cleanup()

View File

@@ -67,7 +67,6 @@ class NodeInstance(object):
self.network_list = []
self.domain_list = []
# Node resources
self.health = 100
self.domains_count = 0
self.memused = 0
self.memfree = 0
@@ -225,33 +224,6 @@ class NodeInstance(object):
)
self.flush_thread.start()
try:
@self.zkhandler.zk_conn.DataWatch(
self.zkhandler.schema.path("node.monitoring.health", self.name)
)
def watch_node_health(data, stat, event=""):
if event and event.type == "DELETED":
# The key has been deleted after existing before; terminate this watcher
# because this class instance is about to be reaped in Daemon.py
return False
try:
data = data.decode("ascii")
except AttributeError:
data = 100
try:
data = int(data)
except ValueError:
pass
if data != self.health:
self.health = data
except Exception:
pass
@self.zkhandler.zk_conn.DataWatch(
self.zkhandler.schema.path("node.memory.free", self.name)
)

View File

@@ -644,27 +644,8 @@ def collect_vm_stats(logger, config, zkhandler, this_node, queue):
# Keepalive update function
def node_keepalive(logger, config, zkhandler, this_node, monitoring_instance):
debug = config["debug"]
# Display node information to the terminal
if config["log_keepalives"]:
if this_node.router_state == "primary":
cst_colour = logger.fmt_green
elif this_node.router_state == "secondary":
cst_colour = logger.fmt_blue
else:
cst_colour = logger.fmt_cyan
logger.out(
"{}{} keepalive @ {}{} [{}{}{}]".format(
logger.fmt_purple,
config["node_hostname"],
datetime.now(),
logger.fmt_end,
logger.fmt_bold + cst_colour,
this_node.router_state,
logger.fmt_end,
),
state="t",
)
if debug:
logger.out("Keepalive starting", state="d", prefix="main-thread")
# Set the migration selector in Zookeeper for clients to read
if config["enable_hypervisor"]:
@@ -827,51 +808,44 @@ def node_keepalive(logger, config, zkhandler, this_node, monitoring_instance):
except Exception:
logger.out("Failed to set keepalive data", state="e")
# Run this here since monitoring plugins output directly
monitoring_instance.run_plugins()
# Allow the health value to update in the Node instance
time.sleep(0.1)
# Display node information to the terminal
if config["log_keepalives"]:
if this_node.maintenance is True:
maintenance_colour = logger.fmt_blue
if this_node.router_state == "primary":
cst_colour = logger.fmt_green
elif this_node.router_state == "secondary":
cst_colour = logger.fmt_blue
else:
maintenance_colour = logger.fmt_green
if isinstance(this_node.health, int):
if this_node.health > 90:
health_colour = logger.fmt_green
elif this_node.health > 50:
health_colour = logger.fmt_yellow
else:
health_colour = logger.fmt_red
health_text = str(this_node.health) + "%"
else:
health_colour = logger.fmt_blue
health_text = "N/A"
cst_colour = logger.fmt_cyan
logger.out(
"{}{} keepalive @ {}{} [{}{}{}]".format(
logger.fmt_purple,
config["node_hostname"],
datetime.now(),
logger.fmt_end,
logger.fmt_bold + cst_colour,
this_node.router_state,
logger.fmt_end,
),
state="t",
)
if config["log_keepalive_cluster_details"]:
logger.out(
"{bold}Maintenance:{nofmt} {maintenance_colour}{maintenance}{nofmt} "
"{bold}Health:{nofmt} {health_colour}{health}{nofmt} "
"{bold}VMs:{nofmt} {domcount} "
"{bold}OSDs:{nofmt} {osdcount} "
"{bold}Maintenance:{nofmt} {maint} "
"{bold}Node VMs:{nofmt} {domcount} "
"{bold}Node OSDs:{nofmt} {osdcount} "
"{bold}Load:{nofmt} {load} "
"{bold}Memory [MiB]: "
"{bold}Memory [MiB]: VMs:{nofmt} {allocmem} "
"{bold}Used:{nofmt} {usedmem} "
"{bold}Free:{nofmt} {freemem}".format(
bold=logger.fmt_bold,
maintenance_colour=maintenance_colour,
health_colour=health_colour,
nofmt=logger.fmt_end,
maintenance=this_node.maintenance,
health=health_text,
maint=this_node.maintenance,
domcount=this_node.domains_count,
osdcount=osds_this_node,
load=this_node.cpuload,
freemem=this_node.memfree,
usedmem=this_node.memused,
allocmem=this_node.memalloc,
),
state="t",
)
@@ -919,3 +893,8 @@ def node_keepalive(logger, config, zkhandler, this_node, monitoring_instance):
zkhandler.write(
[(("node.state.daemon", node_name), "dead")]
)
monitoring_instance.run_plugins()
if debug:
logger.out("Keepalive finished", state="d", prefix="main-thread")