Merge branch 'revamp-health'

Add detailed health checking, status reporting, and enhancements to the
PVC system.

Closes #161 #154 #159
Joshua Boniface 2023-02-22 18:11:27 -05:00
commit 07623fad1a
35 changed files with 2884 additions and 424 deletions

View File

@@ -6,7 +6,7 @@
 ignore = W503, E501
 extend-ignore = E203
 # We exclude the Debian, migrations, and provisioner examples
-exclude = debian,api-daemon/migrations/versions,api-daemon/provisioner/examples
+exclude = debian,api-daemon/migrations/versions,api-daemon/provisioner/examples,node-daemon/monitoring
 # Set the max line length to 88 for Black
 max-line-length = 88

View File

@@ -448,18 +448,48 @@ class API_Status(Resource):
             type: object
             id: ClusterStatus
             properties:
-              health:
+              cluster_health:
+                type: object
+                properties:
+                  health:
+                    type: integer
+                    description: The overall health (%) of the cluster
+                    example: 100
+                  messages:
+                    type: array
+                    description: A list of health event strings
+                    items:
+                      type: string
+                      example: "hv1: plugin 'nics': bond0 DEGRADED with 1 active slaves, bond0 OK at 10000 Mbps"
+              node_health:
+                type: object
+                properties:
+                  hvX:
+                    type: object
+                    description: A node entry for per-node health details, one per node in the cluster
+                    properties:
+                      health:
+                        type: integer
+                        description: The health (%) of the node
+                        example: 100
+                      messages:
+                        type: array
+                        description: A list of health event strings
+                        items:
+                          type: string
+                          example: "'nics': bond0 DEGRADED with 1 active slaves, bond0 OK at 10000 Mbps"
+              maintenance:
                 type: string
-                description: The overall cluster health
-                example: Optimal
-              storage_health:
-                type: string
-                description: The overall storage cluster health
-                example: Optimal
+                description: Whether the cluster is in maintenance mode or not (string boolean)
+                example: true
               primary_node:
                 type: string
                 description: The current primary coordinator node
                 example: pvchv1
+              pvc_version:
+                type: string
+                description: The PVC version of the current primary coordinator node
+                example: 0.9.61
               upstream_ip:
                 type: string
                 description: The cluster upstream IP address in CIDR format
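With the new schema, a successful `GET /api/v1/status` response looks roughly like the following (values are illustrative only, assembled from the field examples above; the unchanged node/VM/storage state counts are omitted):

```json
{
    "cluster_health": {
        "health": 90,
        "messages": [
            "hv1: plugin 'nics': bond0 DEGRADED with 1 active slaves, bond0 OK at 10000 Mbps"
        ]
    },
    "node_health": {
        "hv1": {
            "health": 90,
            "messages": [
                "'nics': bond0 DEGRADED with 1 active slaves, bond0 OK at 10000 Mbps"
            ]
        }
    },
    "maintenance": "false",
    "primary_node": "pvchv1",
    "pvc_version": "0.9.61",
    "upstream_ip": "10.0.0.254/24"
}
```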
@@ -605,6 +635,38 @@ class API_Node_Root(Resource):
               arch:
                 type: string
                 description: The architecture of the CPU
+              health:
+                type: integer
+                description: The overall health (%) of the node
+                example: 100
+              health_plugins:
+                type: array
+                description: A list of health plugin names currently loaded on the node
+                items:
+                  type: string
+                  example: "nics"
+              health_details:
+                type: array
+                description: A list of health plugin results
+                items:
+                  type: object
+                  properties:
+                    name:
+                      type: string
+                      description: The name of the health plugin
+                      example: nics
+                    last_run:
+                      type: integer
+                      description: The UNIX timestamp (s) of the last plugin run
+                      example: 1676786078
+                    health_delta:
+                      type: integer
+                      description: The health delta (negatively applied to the health percentage) of the plugin's current state
+                      example: 10
+                    message:
+                      type: string
+                      description: The output message of the plugin
+                      example: "bond0 DEGRADED with 1 active slaves, bond0 OK at 10000 Mbps"
               load:
                 type: number
                 format: float
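Taken together, a single `health_details` entry in a node object would look something like this (values drawn from the field examples above):

```json
{
    "name": "nics",
    "last_run": 1676786078,
    "health_delta": 10,
    "message": "bond0 DEGRADED with 1 active slaves, bond0 OK at 10000 Mbps"
}
```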

View File

@@ -125,81 +125,56 @@ def format_info(cluster_information, oformat):
         return json.dumps(cluster_information, indent=4)

     # Plain formatting, i.e. human-readable
-    if cluster_information["health"] == "Optimal":
-        health_colour = ansiprint.green()
-    elif cluster_information["health"] == "Maintenance":
+    if cluster_information["maintenance"] == "true":
         health_colour = ansiprint.blue()
-    else:
+    elif cluster_information["cluster_health"]["health"] > 90:
+        health_colour = ansiprint.green()
+    elif cluster_information["cluster_health"]["health"] > 50:
         health_colour = ansiprint.yellow()
-
-    if cluster_information["storage_health"] == "Optimal":
-        storage_health_colour = ansiprint.green()
-    elif cluster_information["storage_health"] == "Maintenance":
-        storage_health_colour = ansiprint.blue()
     else:
-        storage_health_colour = ansiprint.yellow()
+        health_colour = ansiprint.red()

     ainformation = []

-    if oformat == "short":
-        ainformation.append(
-            "{}PVC cluster status:{}".format(ansiprint.bold(), ansiprint.end())
-        )
-        ainformation.append(
-            "{}Cluster health:{} {}{}{}".format(
-                ansiprint.purple(),
-                ansiprint.end(),
-                health_colour,
-                cluster_information["health"],
-                ansiprint.end(),
-            )
-        )
-        if cluster_information["health_msg"]:
-            for line in cluster_information["health_msg"]:
-                ainformation.append(" > {}".format(line))
-        ainformation.append(
-            "{}Storage health:{} {}{}{}".format(
-                ansiprint.purple(),
-                ansiprint.end(),
-                storage_health_colour,
-                cluster_information["storage_health"],
-                ansiprint.end(),
-            )
-        )
-        if cluster_information["storage_health_msg"]:
-            for line in cluster_information["storage_health_msg"]:
-                ainformation.append(" > {}".format(line))
-        return "\n".join(ainformation)
-
     ainformation.append(
         "{}PVC cluster status:{}".format(ansiprint.bold(), ansiprint.end())
     )
     ainformation.append("")

+    health_text = f"{cluster_information['cluster_health']['health']}%"
+    if cluster_information["maintenance"] == "true":
+        health_text += " (maintenance on)"
+
     ainformation.append(
         "{}Cluster health:{} {}{}{}".format(
             ansiprint.purple(),
             ansiprint.end(),
             health_colour,
-            cluster_information["health"],
+            health_text,
             ansiprint.end(),
         )
     )
-    if cluster_information["health_msg"]:
-        for line in cluster_information["health_msg"]:
-            ainformation.append(" > {}".format(line))
-    ainformation.append(
-        "{}Storage health:{} {}{}{}".format(
-            ansiprint.purple(),
-            ansiprint.end(),
-            storage_health_colour,
-            cluster_information["storage_health"],
-            ansiprint.end(),
-        )
-    )
-    if cluster_information["storage_health_msg"]:
-        for line in cluster_information["storage_health_msg"]:
-            ainformation.append(" > {}".format(line))
+    if cluster_information["cluster_health"]["messages"]:
+        health_messages = "\n > ".join(
+            sorted(cluster_information["cluster_health"]["messages"])
+        )
+        ainformation.append(
+            "{}Health messages:{} > {}".format(
+                ansiprint.purple(),
+                ansiprint.end(),
+                health_messages,
+            )
+        )
+    else:
+        ainformation.append(
+            "{}Health messages:{} N/A".format(
+                ansiprint.purple(),
+                ansiprint.end(),
+            )
+        )
+
+    if oformat == "short":
+        return "\n".join(ainformation)

     ainformation.append("")
     ainformation.append(
@@ -207,6 +182,13 @@ def format_info(cluster_information, oformat):
             ansiprint.purple(), ansiprint.end(), cluster_information["primary_node"]
         )
     )
+    ainformation.append(
+        "{}PVC version:{} {}".format(
+            ansiprint.purple(),
+            ansiprint.end(),
+            cluster_information.get("pvc_version", "N/A"),
+        )
+    )
     ainformation.append(
         "{}Cluster upstream IP:{} {}".format(
             ansiprint.purple(), ansiprint.end(), cluster_information["upstream_ip"]
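With this change, `pvc status` output takes a shape along these lines (an illustrative sketch, not captured from a real cluster):

```
PVC cluster status:

Cluster health:      90%
Health messages:     > hv1: plugin 'nics': bond0 DEGRADED with 1 active slaves, bond0 OK at 10000 Mbps

Primary node:        pvchv1
PVC version:         0.9.61
Cluster upstream IP: 10.0.0.254/24
```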

View File

@ -124,8 +124,8 @@ def call_api(
data=None, data=None,
files=None, files=None,
): ):
# Set the connect timeout to 3 seconds but extremely long (48 hour) data timeout # Set the connect timeout to 1 seconds but extremely long (48 hour) data timeout
timeout = (3.05, 172800) timeout = (1.05, 172800)
# Craft the URI # Craft the URI
uri = "{}://{}{}{}".format( uri = "{}://{}{}{}".format(
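(For context: this tuple follows the `requests` convention of `(connect_timeout, read_timeout)` in seconds, so the change makes unreachable API hosts fail faster while leaving the read window effectively unlimited for long-running operations.)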

View File

@ -961,7 +961,9 @@ def format_list_dhcp(dhcp_lease_list):
) )
) )
for dhcp_lease_information in sorted(dhcp_lease_list, key=lambda l: l["hostname"]): for dhcp_lease_information in sorted(
dhcp_lease_list, key=lambda lease: lease["hostname"]
):
dhcp_lease_list_output.append( dhcp_lease_list_output.append(
"{bold}\ "{bold}\
{lease_hostname: <{lease_hostname_length}} \ {lease_hostname: <{lease_hostname_length}} \
@ -1059,7 +1061,7 @@ def format_list_acl(acl_list):
) )
for acl_information in sorted( for acl_information in sorted(
acl_list, key=lambda l: l["direction"] + str(l["order"]) acl_list, key=lambda acl: acl["direction"] + str(acl["order"])
): ):
acl_list_output.append( acl_list_output.append(
"{bold}\ "{bold}\

View File

@@ -215,6 +215,19 @@ def node_list(
 # Output display functions
 #
 def getOutputColours(node_information):
+    node_health = node_information.get("health", "N/A")
+    if isinstance(node_health, int):
+        if node_health <= 50:
+            health_colour = ansiprint.red()
+        elif node_health <= 90:
+            health_colour = ansiprint.yellow()
+        elif node_health <= 100:
+            health_colour = ansiprint.green()
+        else:
+            health_colour = ansiprint.blue()
+    else:
+        health_colour = ansiprint.blue()
+
     if node_information["daemon_state"] == "run":
         daemon_state_colour = ansiprint.green()
     elif node_information["daemon_state"] == "stop":
@@ -251,6 +264,7 @@ def getOutputColours(node_information):
         mem_provisioned_colour = ""

     return (
+        health_colour,
         daemon_state_colour,
         coordinator_state_colour,
         domain_state_colour,
@@ -261,6 +275,7 @@ def getOutputColours(node_information):

 def format_info(node_information, long_output):
     (
+        health_colour,
         daemon_state_colour,
         coordinator_state_colour,
         domain_state_colour,
@@ -273,14 +288,56 @@ def format_info(node_information, long_output):
     # Basic information
     ainformation.append(
         "{}Name:{} {}".format(
-            ansiprint.purple(), ansiprint.end(), node_information["name"]
+            ansiprint.purple(),
+            ansiprint.end(),
+            node_information["name"],
         )
     )
     ainformation.append(
         "{}PVC Version:{} {}".format(
-            ansiprint.purple(), ansiprint.end(), node_information["pvc_version"]
+            ansiprint.purple(),
+            ansiprint.end(),
+            node_information["pvc_version"],
         )
     )
+
+    node_health = node_information.get("health", "N/A")
+    if isinstance(node_health, int):
+        node_health_text = f"{node_health}%"
+    else:
+        node_health_text = node_health
+    ainformation.append(
+        "{}Health:{} {}{}{}".format(
+            ansiprint.purple(),
+            ansiprint.end(),
+            health_colour,
+            node_health_text,
+            ansiprint.end(),
+        )
+    )
+
+    node_health_details = node_information.get("health_details", [])
+    if long_output:
+        node_health_messages = "\n ".join(
+            [f"{plugin['name']}: {plugin['message']}" for plugin in node_health_details]
+        )
+    else:
+        node_health_messages = "\n ".join(
+            [
+                f"{plugin['name']}: {plugin['message']}"
+                for plugin in node_health_details
+                if int(plugin.get("health_delta", 0)) > 0
+            ]
+        )
+
+    if len(node_health_messages) > 0:
+        ainformation.append(
+            "{}Health Plugin Details:{} {}".format(
+                ansiprint.purple(), ansiprint.end(), node_health_messages
+            )
+        )
+    ainformation.append("")
+
     ainformation.append(
         "{}Daemon State:{} {}{}{}".format(
             ansiprint.purple(),
@@ -308,11 +365,6 @@ def format_info(node_information, long_output):
             ansiprint.end(),
         )
     )
-    ainformation.append(
-        "{}Active VM Count:{} {}".format(
-            ansiprint.purple(), ansiprint.end(), node_information["domains_count"]
-        )
-    )

     if long_output:
         ainformation.append("")
         ainformation.append(
@@ -331,6 +383,11 @@ def format_info(node_information, long_output):
         )
     )
     ainformation.append("")
+    ainformation.append(
+        "{}Active VM Count:{} {}".format(
+            ansiprint.purple(), ansiprint.end(), node_information["domains_count"]
+        )
+    )
     ainformation.append(
         "{}Host CPUs:{} {}".format(
             ansiprint.purple(), ansiprint.end(), node_information["vcpu"]["total"]
@@ -397,6 +454,7 @@ def format_list(node_list, raw):
     # Determine optimal column widths
     node_name_length = 5
     pvc_version_length = 8
+    health_length = 7
     daemon_state_length = 7
     coordinator_state_length = 12
     domain_state_length = 7
@@ -417,6 +475,15 @@ def format_list(node_list, raw):
         _pvc_version_length = len(node_information.get("pvc_version", "N/A")) + 1
         if _pvc_version_length > pvc_version_length:
             pvc_version_length = _pvc_version_length
+        # node_health column
+        node_health = node_information.get("health", "N/A")
+        if isinstance(node_health, int):
+            node_health_text = f"{node_health}%"
+        else:
+            node_health_text = node_health
+        _health_length = len(node_health_text) + 1
+        if _health_length > health_length:
+            health_length = _health_length
         # daemon_state column
         _daemon_state_length = len(node_information["daemon_state"]) + 1
         if _daemon_state_length > daemon_state_length:
@@ -466,7 +533,10 @@ def format_list(node_list, raw):
     # Format the string (header)
     node_list_output.append(
         "{bold}{node_header: <{node_header_length}} {state_header: <{state_header_length}} {resource_header: <{resource_header_length}} {memory_header: <{memory_header_length}}{end_bold}".format(
-            node_header_length=node_name_length + pvc_version_length + 1,
+            node_header_length=node_name_length
+            + pvc_version_length
+            + health_length
+            + 2,
             state_header_length=daemon_state_length
             + coordinator_state_length
             + domain_state_length
@@ -484,7 +554,14 @@ def format_list(node_list, raw):
             bold=ansiprint.bold(),
             end_bold=ansiprint.end(),
             node_header="Nodes "
-            + "".join(["-" for _ in range(6, node_name_length + pvc_version_length)]),
+            + "".join(
+                [
+                    "-"
+                    for _ in range(
+                        6, node_name_length + pvc_version_length + health_length + 1
+                    )
+                ]
+            ),
             state_header="States "
             + "".join(
                 [
@@ -526,12 +603,13 @@ def format_list(node_list, raw):
     )

     node_list_output.append(
-        "{bold}{node_name: <{node_name_length}} {node_pvc_version: <{pvc_version_length}} \
+        "{bold}{node_name: <{node_name_length}} {node_pvc_version: <{pvc_version_length}} {node_health: <{health_length}} \
 {daemon_state_colour}{node_daemon_state: <{daemon_state_length}}{end_colour} {coordinator_state_colour}{node_coordinator_state: <{coordinator_state_length}}{end_colour} {domain_state_colour}{node_domain_state: <{domain_state_length}}{end_colour} \
 {node_domains_count: <{domains_count_length}} {node_cpu_count: <{cpu_count_length}} {node_load: <{load_length}} \
 {node_mem_total: <{mem_total_length}} {node_mem_used: <{mem_used_length}} {node_mem_free: <{mem_free_length}} {node_mem_allocated: <{mem_alloc_length}} {node_mem_provisioned: <{mem_prov_length}}{end_bold}".format(
             node_name_length=node_name_length,
             pvc_version_length=pvc_version_length,
+            health_length=health_length,
             daemon_state_length=daemon_state_length,
             coordinator_state_length=coordinator_state_length,
             domain_state_length=domain_state_length,
@@ -551,6 +629,7 @@ def format_list(node_list, raw):
             end_colour="",
             node_name="Name",
             node_pvc_version="Version",
+            node_health="Health",
             node_daemon_state="Daemon",
             node_coordinator_state="Coordinator",
             node_domain_state="Domain",
@@ -568,19 +647,28 @@ def format_list(node_list, raw):
     # Format the string (elements)
     for node_information in sorted(node_list, key=lambda n: n["name"]):
         (
+            health_colour,
             daemon_state_colour,
             coordinator_state_colour,
             domain_state_colour,
             mem_allocated_colour,
             mem_provisioned_colour,
         ) = getOutputColours(node_information)
+
+        node_health = node_information.get("health", "N/A")
+        if isinstance(node_health, int):
+            node_health_text = f"{node_health}%"
+        else:
+            node_health_text = node_health
+
         node_list_output.append(
-            "{bold}{node_name: <{node_name_length}} {node_pvc_version: <{pvc_version_length}} \
+            "{bold}{node_name: <{node_name_length}} {node_pvc_version: <{pvc_version_length}} {health_colour}{node_health: <{health_length}}{end_colour} \
 {daemon_state_colour}{node_daemon_state: <{daemon_state_length}}{end_colour} {coordinator_state_colour}{node_coordinator_state: <{coordinator_state_length}}{end_colour} {domain_state_colour}{node_domain_state: <{domain_state_length}}{end_colour} \
 {node_domains_count: <{domains_count_length}} {node_cpu_count: <{cpu_count_length}} {node_load: <{load_length}} \
 {node_mem_total: <{mem_total_length}} {node_mem_used: <{mem_used_length}} {node_mem_free: <{mem_free_length}} {mem_allocated_colour}{node_mem_allocated: <{mem_alloc_length}}{end_colour} {mem_provisioned_colour}{node_mem_provisioned: <{mem_prov_length}}{end_colour}{end_bold}".format(
                 node_name_length=node_name_length,
                 pvc_version_length=pvc_version_length,
+                health_length=health_length,
                 daemon_state_length=daemon_state_length,
                 coordinator_state_length=coordinator_state_length,
                 domain_state_length=domain_state_length,
@@ -594,6 +682,7 @@ def format_list(node_list, raw):
                 mem_prov_length=mem_prov_length,
                 bold="",
                 end_bold="",
+                health_colour=health_colour,
                 daemon_state_colour=daemon_state_colour,
                 coordinator_state_colour=coordinator_state_colour,
                 domain_state_colour=domain_state_colour,
@@ -602,6 +691,7 @@ def format_list(node_list, raw):
                 end_colour=ansiprint.end(),
                 node_name=node_information["name"],
                 node_pvc_version=node_information.get("pvc_version", "N/A"),
+                node_health=node_health_text,
                 node_daemon_state=node_information["daemon_state"],
                 node_coordinator_state=node_information["coordinator_state"],
                 node_domain_state=node_information["domain_state"],

View File

@@ -134,7 +134,7 @@ def get_config(store_data, cluster=None):
     config = dict()
     config["debug"] = False
     config["cluster"] = cluster
-    config["desctription"] = description
+    config["description"] = description
     config["api_host"] = "{}:{}".format(host, port)
     config["api_scheme"] = scheme
     config["api_key"] = api_key
@@ -382,8 +382,6 @@ def cluster_list(raw):
     if not raw:
         # Display the data nicely
-        echo("Available clusters:")
-        echo("")
         echo(
             "{bold}{name: <{name_length}} {description: <{description_length}} {address: <{address_length}} {port: <{port_length}} {scheme: <{scheme_length}} {api_key: <{api_key_length}}{end_bold}".format(
                 bold=ansiprint.bold(),
@@ -443,6 +441,202 @@ def cluster_list(raw):
             echo(cluster)

+
+###############################################################################
+# pvc cluster detail
+###############################################################################
+@click.command(name="detail", short_help="Show details of all available clusters.")
+def cluster_detail():
+    """
+    Show quick details of all PVC clusters configured in this CLI instance.
+    """
+
+    # Get the existing data
+    clusters = get_store(store_path)
+
+    cluster_details_list = list()
+
+    echo("Gathering information from clusters... ", nl=False)
+
+    for cluster in clusters:
+        _store_data = get_store(store_path)
+        cluster_config = get_config(_store_data, cluster=cluster)
+        retcode, retdata = pvc_cluster.get_info(cluster_config)
+        if retcode == 0:
+            retdata = None
+        cluster_details = {"config": cluster_config, "data": retdata}
+        cluster_details_list.append(cluster_details)
+
+    echo("done.")
+    echo("")
+
+    # Find the lengths of each column
+    name_length = 5
+    description_length = 12
+    health_length = 7
+    primary_node_length = 8
+    pvc_version_length = 8
+    nodes_length = 6
+    vms_length = 4
+    networks_length = 9
+    osds_length = 5
+    pools_length = 6
+    volumes_length = 8
+    snapshots_length = 10
+
+    for cluster_details in cluster_details_list:
+        _name_length = len(cluster_details["config"]["cluster"]) + 1
+        if _name_length > name_length:
+            name_length = _name_length
+
+        _description_length = len(cluster_details["config"]["description"]) + 1
+        if _description_length > description_length:
+            description_length = _description_length
+
+        if cluster_details["data"] is None:
+            continue
+
+        _health_length = (
+            len(str(cluster_details["data"]["cluster_health"]["health"]) + "%") + 1
+        )
+        if _health_length > health_length:
+            health_length = _health_length
+
+        _primary_node_length = len(cluster_details["data"]["primary_node"]) + 1
+        if _primary_node_length > primary_node_length:
+            primary_node_length = _primary_node_length
+
+        _pvc_version_length = len(cluster_details["data"]["pvc_version"]) + 1
+        if _pvc_version_length > pvc_version_length:
+            pvc_version_length = _pvc_version_length
+
+        _nodes_length = len(str(cluster_details["data"]["nodes"]["total"])) + 1
+        if _nodes_length > nodes_length:
+            nodes_length = _nodes_length
+
+        _vms_length = len(str(cluster_details["data"]["vms"]["total"])) + 1
+        if _vms_length > vms_length:
+            vms_length = _vms_length
+
+        _networks_length = len(str(cluster_details["data"]["networks"])) + 1
+        if _networks_length > networks_length:
+            networks_length = _networks_length
+
+        _osds_length = len(str(cluster_details["data"]["osds"]["total"])) + 1
+        if _osds_length > osds_length:
+            osds_length = _osds_length
+
+        _pools_length = len(str(cluster_details["data"]["pools"])) + 1
+        if _pools_length > pools_length:
+            pools_length = _pools_length
+
+        _volumes_length = len(str(cluster_details["data"]["volumes"])) + 1
+        if _volumes_length > volumes_length:
+            volumes_length = _volumes_length
+
+        _snapshots_length = len(str(cluster_details["data"]["snapshots"])) + 1
+        if _snapshots_length > snapshots_length:
+            snapshots_length = _snapshots_length
+
+    # Display the data nicely
+    echo(
+        "{bold}{name: <{name_length}} {description: <{description_length}} {health: <{health_length}} {primary_node: <{primary_node_length}} {pvc_version: <{pvc_version_length}} {nodes: <{nodes_length}} {vms: <{vms_length}} {networks: <{networks_length}} {osds: <{osds_length}} {pools: <{pools_length}} {volumes: <{volumes_length}} {snapshots: <{snapshots_length}}{end_bold}".format(
+            bold=ansiprint.bold(),
+            end_bold=ansiprint.end(),
+            name="Name",
+            name_length=name_length,
+            description="Description",
+            description_length=description_length,
+            health="Health",
+            health_length=health_length,
+            primary_node="Primary",
+            primary_node_length=primary_node_length,
+            pvc_version="Version",
+            pvc_version_length=pvc_version_length,
+            nodes="Nodes",
+            nodes_length=nodes_length,
+            vms="VMs",
+            vms_length=vms_length,
+            networks="Networks",
+            networks_length=networks_length,
+            osds="OSDs",
+            osds_length=osds_length,
+            pools="Pools",
+            pools_length=pools_length,
+            volumes="Volumes",
+            volumes_length=volumes_length,
+            snapshots="Snapshots",
+            snapshots_length=snapshots_length,
+        )
+    )
+
+    for cluster_details in cluster_details_list:
+        if cluster_details["data"] is None:
+            health_colour = ansiprint.blue()
+            name = cluster_details["config"]["cluster"]
+            description = cluster_details["config"]["description"]
+            health = "N/A"
+            primary_node = "N/A"
+            pvc_version = "N/A"
+            nodes = "N/A"
+            vms = "N/A"
+            networks = "N/A"
+            osds = "N/A"
+            pools = "N/A"
+            volumes = "N/A"
+            snapshots = "N/A"
+        else:
+            if cluster_details["data"]["cluster_health"]["health"] > 90:
+                health_colour = ansiprint.green()
+            elif cluster_details["data"]["cluster_health"]["health"] > 50:
+                health_colour = ansiprint.yellow()
+            else:
+                health_colour = ansiprint.red()
+
+            name = cluster_details["config"]["cluster"]
+            description = cluster_details["config"]["description"]
+            health = str(cluster_details["data"]["cluster_health"]["health"]) + "%"
+            primary_node = cluster_details["data"]["primary_node"]
+            pvc_version = cluster_details["data"]["pvc_version"]
+            nodes = str(cluster_details["data"]["nodes"]["total"])
+            vms = str(cluster_details["data"]["vms"]["total"])
+            networks = str(cluster_details["data"]["networks"])
+            osds = str(cluster_details["data"]["osds"]["total"])
+            pools = str(cluster_details["data"]["pools"])
+            volumes = str(cluster_details["data"]["volumes"])
+            snapshots = str(cluster_details["data"]["snapshots"])
+
+        echo(
+            "{name: <{name_length}} {description: <{description_length}} {health_colour}{health: <{health_length}}{end_colour} {primary_node: <{primary_node_length}} {pvc_version: <{pvc_version_length}} {nodes: <{nodes_length}} {vms: <{vms_length}} {networks: <{networks_length}} {osds: <{osds_length}} {pools: <{pools_length}} {volumes: <{volumes_length}} {snapshots: <{snapshots_length}}".format(
+                health_colour=health_colour,
+                end_colour=ansiprint.end(),
+                name=name,
+                name_length=name_length,
+                description=description,
+                description_length=description_length,
+                health=health,
+                health_length=health_length,
+                primary_node=primary_node,
+                primary_node_length=primary_node_length,
+                pvc_version=pvc_version,
+                pvc_version_length=pvc_version_length,
+                nodes=nodes,
+                nodes_length=nodes_length,
+                vms=vms,
+                vms_length=vms_length,
+                networks=networks,
+                networks_length=networks_length,
+                osds=osds,
+                osds_length=osds_length,
+                pools=pools,
+                pools_length=pools_length,
+                volumes=volumes,
+                volumes_length=volumes_length,
+                snapshots=snapshots,
+                snapshots_length=snapshots_length,
+            )
+        )
+
+
 # Validate that the cluster is set for a given command
 def cluster_req(function):
     @wraps(function)
@@ -452,6 +646,24 @@ def cluster_req(function):
                 'No cluster specified and no local pvcapid.yaml configuration found. Use "pvc cluster" to add a cluster API to connect to.'
             )
             exit(1)
+
+        if not config["quiet"]:
+            if config["api_scheme"] == "https" and not config["verify_ssl"]:
+                ssl_unverified_msg = " (unverified)"
+            else:
+                ssl_unverified_msg = ""
+            echo(
+                'Using cluster "{}" - Host: "{}" Scheme: "{}{}" Prefix: "{}"'.format(
+                    config["cluster"],
+                    config["api_host"],
+                    config["api_scheme"],
+                    ssl_unverified_msg,
+                    config["api_prefix"],
+                ),
+                err=True,
+            )
+            echo("", err=True)
+
         return function(*args, **kwargs)

     return validate_cluster
@@ -697,15 +909,29 @@ def node_log(node, lines, follow):
     default=False,
     help="Display more detailed information.",
 )
+@click.option(
+    "-f",
+    "--format",
+    "oformat",
+    default="plain",
+    show_default=True,
+    type=click.Choice(["plain", "json", "json-pretty"]),
+    help="Output format of node status information.",
+)
 @cluster_req
-def node_info(node, long_output):
+def node_info(node, long_output, oformat):
     """
     Show information about node NODE. If unspecified, defaults to this host.
     """

     retcode, retdata = pvc_node.node_info(config, node)
     if retcode:
-        retdata = pvc_node.format_info(retdata, long_output)
+        if oformat == "json":
+            retdata = json.dumps(retdata)
+        elif oformat == "json-pretty":
+            retdata = json.dumps(retdata, indent=4)
+        else:
+            retdata = pvc_node.format_info(retdata, long_output)
     cleanup(retcode, retdata)
@@ -5882,23 +6108,7 @@ def cli(_cluster, _debug, _quiet, _unsafe, _colour):
     config["debug"] = _debug
     config["unsafe"] = _unsafe
     config["colour"] = _colour
-
-    if not _quiet:
-        if config["api_scheme"] == "https" and not config["verify_ssl"]:
-            ssl_unverified_msg = " (unverified)"
-        else:
-            ssl_unverified_msg = ""
-        echo(
-            'Using cluster "{}" - Host: "{}" Scheme: "{}{}" Prefix: "{}"'.format(
-                config["cluster"],
-                config["api_host"],
-                config["api_scheme"],
-                ssl_unverified_msg,
-                config["api_prefix"],
-            ),
-            err=True,
-        )
-        echo("", err=True)
+    config["quiet"] = _quiet

     audit()
@@ -5909,6 +6119,7 @@ def cli(_cluster, _debug, _quiet, _unsafe, _colour):
 cli_cluster.add_command(cluster_add)
 cli_cluster.add_command(cluster_remove)
 cli_cluster.add_command(cluster_list)
+cli_cluster.add_command(cluster_detail)

 cli_node.add_command(node_secondary)
 cli_node.add_command(node_primary)
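The new `pvc cluster detail` command renders one row per configured cluster, falling back to `N/A` columns for clusters it cannot reach; its output looks something like this (cluster names and values are illustrative):

```
Name  Description  Health  Primary  Version  Nodes  VMs  Networks  OSDs  Pools  Volumes  Snapshots
prod  Production   100%    pvchv1   0.9.61   3      24   2         6     2      30       0
lab   Test lab     N/A     N/A      N/A      N/A    N/A  N/A       N/A   N/A    N/A      N/A
```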

View File

@@ -158,6 +158,19 @@ def get_status(zkhandler):
     return True, status_data

+
+def get_health(zkhandler):
+    primary_node = zkhandler.read("base.config.primary_node")
+    ceph_health = zkhandler.read("base.storage.health").rstrip()
+
+    # Create a data structure for the information
+    status_data = {
+        "type": "health",
+        "primary_node": primary_node,
+        "ceph_data": ceph_health,
+    }
+
+    return True, status_data
+
+
 def get_util(zkhandler):
     primary_node = zkhandler.read("base.config.primary_node")
     ceph_df = zkhandler.read("base.storage.util").rstrip()

View File

@@ -19,7 +19,7 @@
 #
 ###############################################################################

-import re
+from json import loads

 import daemon_lib.common as common
 import daemon_lib.vm as pvc_vm
@@ -44,17 +44,165 @@ def set_maintenance(zkhandler, maint_state):
         return True, "Successfully set cluster in normal mode"

+
+def getClusterHealth(zkhandler, node_list, vm_list, ceph_osd_list):
+    health_delta_map = {
+        "node_stopped": 50,
+        "node_flushed": 10,
+        "vm_stopped": 10,
+        "osd_out": 50,
+        "osd_down": 10,
+        "memory_overprovisioned": 50,
+        "ceph_err": 50,
+        "ceph_warn": 10,
+    }
+
+    # Generate total cluster health numbers
+    cluster_health_value = 100
+    cluster_health_messages = list()
+
+    for index, node in enumerate(node_list):
+        # Apply node health values to total health number
+        try:
+            node_health_int = int(node["health"])
+        except Exception:
+            node_health_int = 100
+        cluster_health_value -= 100 - node_health_int
+
+        for entry in node["health_details"]:
+            if entry["health_delta"] > 0:
+                cluster_health_messages.append(
+                    f"{node['name']}: plugin '{entry['name']}': {entry['message']}"
+                )
+
+        # Handle unhealthy node states
+        if node["daemon_state"] not in ["run"]:
+            cluster_health_value -= health_delta_map["node_stopped"]
+            cluster_health_messages.append(
+                f"cluster: Node {node['name']} in {node['daemon_state'].upper()} daemon state"
+            )
+        elif node["domain_state"] not in ["ready"]:
+            cluster_health_value -= health_delta_map["node_flushed"]
+            cluster_health_messages.append(
+                f"cluster: Node {node['name']} in {node['domain_state'].upper()} domain state"
+            )
+
+    for index, vm in enumerate(vm_list):
+        # Handle unhealthy VM states
+        if vm["state"] in ["stop", "fail"]:
+            cluster_health_value -= health_delta_map["vm_stopped"]
+            cluster_health_messages.append(
+                f"cluster: VM {vm['name']} in {vm['state'].upper()} state"
+            )
+
+    for index, ceph_osd in enumerate(ceph_osd_list):
+        in_texts = {1: "in", 0: "out"}
+        up_texts = {1: "up", 0: "down"}
+
+        # Handle unhealthy OSD states
+        if in_texts[ceph_osd["stats"]["in"]] not in ["in"]:
+            cluster_health_value -= health_delta_map["osd_out"]
+            cluster_health_messages.append(
+                f"cluster: Ceph OSD {ceph_osd['id']} in {in_texts[ceph_osd['stats']['in']].upper()} state"
+            )
+        elif up_texts[ceph_osd["stats"]["up"]] not in ["up"]:
+            cluster_health_value -= health_delta_map["osd_down"]
+            cluster_health_messages.append(
+                f"cluster: Ceph OSD {ceph_osd['id']} in {up_texts[ceph_osd['stats']['up']].upper()} state"
+            )
+
+    # Check for (n-1) overprovisioning
+    # Assume X nodes. If the total VM memory allocation (counting only running VMs) is greater than
+    # the total memory of the (n-1) smallest nodes, trigger this warning.
+    n_minus_1_total = 0
+    alloc_total = 0
+    node_largest_index = None
+    node_largest_count = 0
+    for index, node in enumerate(node_list):
+        node_mem_total = node["memory"]["total"]
+        node_mem_alloc = node["memory"]["allocated"]
+        alloc_total += node_mem_alloc
+        # Determine if this node is the largest seen so far
+        if node_mem_total > node_largest_count:
+            node_largest_index = index
+            node_largest_count = node_mem_total
+    n_minus_1_node_list = list()
+    for index, node in enumerate(node_list):
+        if index == node_largest_index:
+            continue
+        n_minus_1_node_list.append(node)
+    for index, node in enumerate(n_minus_1_node_list):
+        n_minus_1_total += node["memory"]["total"]
+    if alloc_total > n_minus_1_total:
+        cluster_health_value -= health_delta_map["memory_overprovisioned"]
+        cluster_health_messages.append(
+            f"cluster: Total memory is OVERPROVISIONED ({alloc_total} > {n_minus_1_total} @ N-1)"
+        )
+
+    # Check Ceph cluster health
+    ceph_health = loads(zkhandler.read("base.storage.health"))
+    ceph_health_status = ceph_health["status"]
+    ceph_health_entries = ceph_health["checks"].keys()
+
+    ceph_health_status_map = {
+        "HEALTH_ERR": "ERROR",
+        "HEALTH_WARN": "WARNING",
+    }
+    for entry in ceph_health_entries:
+        cluster_health_messages.append(
+            f"cluster: Ceph {ceph_health_status_map[ceph_health['checks'][entry]['severity']]} {entry}: {ceph_health['checks'][entry]['summary']['message']}"
+        )
+
+    if ceph_health_status == "HEALTH_ERR":
+        cluster_health_value -= health_delta_map["ceph_err"]
+    elif ceph_health_status == "HEALTH_WARN":
+        cluster_health_value -= health_delta_map["ceph_warn"]
+
+    if cluster_health_value < 0:
+        cluster_health_value = 0
+
+    cluster_health = {
+        "health": cluster_health_value,
+        "messages": cluster_health_messages,
+    }
+
+    return cluster_health
+
+
+def getNodeHealth(zkhandler, node_list):
+    node_health = dict()
+    for index, node in enumerate(node_list):
+        node_health_messages = list()
+        node_health_value = node["health"]
+        for entry in node["health_details"]:
+            if entry["health_delta"] > 0:
+                node_health_messages.append(f"'{entry['name']}': {entry['message']}")
+
+        node_health_entry = {
+            "health": node_health_value,
+            "messages": node_health_messages,
+        }
+        node_health[node["name"]] = node_health_entry
+
+    return node_health
+
+
 def getClusterInformation(zkhandler):
     # Get cluster maintenance state
-    maint_state = zkhandler.read("base.config.maintenance")
-
-    # List of messages to display to the clients
-    cluster_health_msg = []
-    storage_health_msg = []
+    maintenance_state = zkhandler.read("base.config.maintenance")

     # Get node information object list
     retcode, node_list = pvc_node.get_list(zkhandler, None)

+    # Get primary node
+    primary_node = common.getPrimaryNode(zkhandler)
+
+    # Get PVC version of primary node
+    pvc_version = "0.0.0"
+    for node in node_list:
+        if node["name"] == primary_node:
+            pvc_version = node["pvc_version"]
+
     # Get vm information object list
     retcode, vm_list = pvc_vm.get_list(zkhandler, None, None, None, None)
@@ -78,135 +226,6 @@ def getClusterInformation(zkhandler):
     ceph_volume_count = len(ceph_volume_list)
     ceph_snapshot_count = len(ceph_snapshot_list)

-    # Determinations for general cluster health
-    cluster_healthy_status = True
-    # Check for (n-1) overprovisioning
-    # Assume X nodes. If the total VM memory allocation (counting only running VMss) is greater than
-    # the total memory of the (n-1) smallest nodes, trigger this warning.
-    n_minus_1_total = 0
-    alloc_total = 0
-    node_largest_index = None
-    node_largest_count = 0
-    for index, node in enumerate(node_list):
-        node_mem_total = node["memory"]["total"]
-        node_mem_alloc = node["memory"]["allocated"]
-        alloc_total += node_mem_alloc
-        # Determine if this node is the largest seen so far
-        if node_mem_total > node_largest_count:
-            node_largest_index = index
-            node_largest_count = node_mem_total
-    n_minus_1_node_list = list()
-    for index, node in enumerate(node_list):
-        if index == node_largest_index:
-            continue
-        n_minus_1_node_list.append(node)
-    for index, node in enumerate(n_minus_1_node_list):
-        n_minus_1_total += node["memory"]["total"]
-    if alloc_total > n_minus_1_total:
-        cluster_healthy_status = False
-        cluster_health_msg.append(
-            "Total VM memory ({}) is overprovisioned (max {}) for (n-1) failure scenarios".format(
-                alloc_total, n_minus_1_total
-            )
-        )
-
-    # Determinations for node health
-    node_healthy_status = list(range(0, node_count))
-    node_report_status = list(range(0, node_count))
-    for index, node in enumerate(node_list):
-        daemon_state = node["daemon_state"]
-        domain_state = node["domain_state"]
-        if daemon_state != "run" and domain_state != "ready":
-            node_healthy_status[index] = False
-            cluster_health_msg.append(
-                "Node '{}' in {},{} state".format(
-                    node["name"], daemon_state, domain_state
-                )
-            )
-        else:
-            node_healthy_status[index] = True
-        node_report_status[index] = daemon_state + "," + domain_state
-
-    # Determinations for VM health
-    vm_healthy_status = list(range(0, vm_count))
-    vm_report_status = list(range(0, vm_count))
-    for index, vm in enumerate(vm_list):
-        vm_state = vm["state"]
-        if vm_state not in ["start", "disable", "migrate", "unmigrate", "provision"]:
-            vm_healthy_status[index] = False
-            cluster_health_msg.append(
-                "VM '{}' in {} state".format(vm["name"], vm_state)
-            )
-        else:
-            vm_healthy_status[index] = True
-        vm_report_status[index] = vm_state
-
-    # Determinations for OSD health
-    ceph_osd_healthy_status = list(range(0, ceph_osd_count))
-    ceph_osd_report_status = list(range(0, ceph_osd_count))
-    for index, ceph_osd in enumerate(ceph_osd_list):
-        try:
-            ceph_osd_up = ceph_osd["stats"]["up"]
-        except KeyError:
-            ceph_osd_up = 0
-        try:
-            ceph_osd_in = ceph_osd["stats"]["in"]
-        except KeyError:
-            ceph_osd_in = 0
-        up_texts = {1: "up", 0: "down"}
-        in_texts = {1: "in", 0: "out"}
-        if not ceph_osd_up or not ceph_osd_in:
-            ceph_osd_healthy_status[index] = False
-            cluster_health_msg.append(
-                "OSD {} in {},{} state".format(
-                    ceph_osd["id"], up_texts[ceph_osd_up], in_texts[ceph_osd_in]
-                )
-            )
-        else:
-            ceph_osd_healthy_status[index] = True
-        ceph_osd_report_status[index] = (
-            up_texts[ceph_osd_up] + "," + in_texts[ceph_osd_in]
-        )
-
-    # Find out the overall cluster health; if any element of a healthy_status is false, it's unhealthy
-    if maint_state == "true":
-        cluster_health = "Maintenance"
-    elif (
-        cluster_healthy_status is False
-        or False in node_healthy_status
-        or False in vm_healthy_status
-        or False in ceph_osd_healthy_status
-    ):
-        cluster_health = "Degraded"
-    else:
-        cluster_health = "Optimal"
-
-    # Find out our storage health from Ceph
-    ceph_status = zkhandler.read("base.storage").split("\n")
-    ceph_health = ceph_status[2].split()[-1]
-
-    # Parse the status output to get the health indicators
-    line_record = False
-    for index, line in enumerate(ceph_status):
-        if re.search("services:", line):
-            line_record = False
-        if line_record and len(line.strip()) > 0:
-            storage_health_msg.append(line.strip())
-        if re.search("health:", line):
-            line_record = True
-
-    if maint_state == "true":
-        storage_health = "Maintenance"
-    elif ceph_health != "HEALTH_OK":
-        storage_health = "Degraded"
-    else:
-        storage_health = "Optimal"
-
     # State lists
     node_state_combinations = [
         "run,ready",
@@ -237,13 +256,19 @@ def getClusterInformation(zkhandler):
         "unmigrate",
         "provision",
     ]
-    ceph_osd_state_combinations = ["up,in", "up,out", "down,in", "down,out"]
+    ceph_osd_state_combinations = [
+        "up,in",
+        "up,out",
+        "down,in",
+        "down,out",
+    ]

     # Format the Node states
     formatted_node_states = {"total": node_count}
     for state in node_state_combinations:
         state_count = 0
-        for node_state in node_report_status:
+        for node in node_list:
+            node_state = f"{node['daemon_state']},{node['domain_state']}"
             if node_state == state:
                 state_count += 1
         if state_count > 0:
@@ -253,17 +278,20 @@ def getClusterInformation(zkhandler):
     formatted_vm_states = {"total": vm_count}
     for state in vm_state_combinations:
         state_count = 0
-        for vm_state in vm_report_status:
-            if vm_state == state:
+        for vm in vm_list:
+            if vm["state"] == state:
                 state_count += 1
         if state_count > 0:
             formatted_vm_states[state] = state_count

     # Format the OSD states
+    up_texts = {1: "up", 0: "down"}
+    in_texts = {1: "in", 0: "out"}
     formatted_osd_states = {"total": ceph_osd_count}
     for state in ceph_osd_state_combinations:
         state_count = 0
-        for ceph_osd_state in ceph_osd_report_status:
+        for ceph_osd in ceph_osd_list:
+            ceph_osd_state = f"{up_texts[ceph_osd['stats']['up']]},{in_texts[ceph_osd['stats']['in']]}"
             if ceph_osd_state == state:
                 state_count += 1
         if state_count > 0:
@@ -271,11 +299,13 @@ def getClusterInformation(zkhandler):

     # Format the status data
     cluster_information = {
-        "health": cluster_health,
-        "health_msg": cluster_health_msg,
-        "storage_health": storage_health,
-        "storage_health_msg": storage_health_msg,
-        "primary_node": common.getPrimaryNode(zkhandler),
+        "cluster_health": getClusterHealth(
+            zkhandler, node_list, vm_list, ceph_osd_list
+        ),
+        "node_health": getNodeHealth(zkhandler, node_list),
+        "maintenance": maintenance_state,
+        "primary_node": primary_node,
+        "pvc_version": pvc_version,
         "upstream_ip": zkhandler.read("base.config.upstream_ip"),
         "nodes": formatted_node_states,
         "vms": formatted_vm_states,

View File

@@ -0,0 +1 @@
+{"version": "9", "root": "", "base": {"root": "", "schema": "/schema", "schema.version": "/schema/version", "config": "/config", "config.maintenance": "/config/maintenance", "config.primary_node": "/config/primary_node", "config.primary_node.sync_lock": "/config/primary_node/sync_lock", "config.upstream_ip": "/config/upstream_ip", "config.migration_target_selector": "/config/migration_target_selector", "cmd": "/cmd", "cmd.node": "/cmd/nodes", "cmd.domain": "/cmd/domains", "cmd.ceph": "/cmd/ceph", "logs": "/logs", "node": "/nodes", "domain": "/domains", "network": "/networks", "storage": "/ceph", "storage.health": "/ceph/health", "storage.util": "/ceph/util", "osd": "/ceph/osds", "pool": "/ceph/pools", "volume": "/ceph/volumes", "snapshot": "/ceph/snapshots"}, "logs": {"node": "", "messages": "/messages"}, "node": {"name": "", "keepalive": "/keepalive", "mode": "/daemonmode", "data.active_schema": "/activeschema", "data.latest_schema": "/latestschema", "data.static": "/staticdata", "data.pvc_version": "/pvcversion", "running_domains": "/runningdomains", "count.provisioned_domains": "/domainscount", "count.networks": "/networkscount", "state.daemon": "/daemonstate", "state.router": "/routerstate", "state.domain": "/domainstate", "cpu.load": "/cpuload", "vcpu.allocated": "/vcpualloc", "memory.total": "/memtotal", "memory.used": "/memused", "memory.free": "/memfree", "memory.allocated": "/memalloc", "memory.provisioned": "/memprov", "ipmi.hostname": "/ipmihostname", "ipmi.username": "/ipmiusername", "ipmi.password": "/ipmipassword", "sriov": "/sriov", "sriov.pf": "/sriov/pf", "sriov.vf": "/sriov/vf", "monitoring.plugins": "/monitoring_plugins", "monitoring.data": "/monitoring_data", "monitoring.health": "/monitoring_health"}, "monitoring_plugin": {"name": "", "last_run": "/last_run", "health_delta": "/health_delta", "message": "/message", "data": "/data", "runtime": "/runtime"}, "sriov_pf": {"phy": "", "mtu": "/mtu", "vfcount": "/vfcount"}, "sriov_vf": {"phy": "", "pf": "/pf", "mtu": "/mtu", "mac": "/mac", "phy_mac": "/phy_mac", "config": "/config", "config.vlan_id": "/config/vlan_id", "config.vlan_qos": "/config/vlan_qos", "config.tx_rate_min": "/config/tx_rate_min", "config.tx_rate_max": "/config/tx_rate_max", "config.spoof_check": "/config/spoof_check", "config.link_state": "/config/link_state", "config.trust": "/config/trust", "config.query_rss": "/config/query_rss", "pci": "/pci", "pci.domain": "/pci/domain", "pci.bus": "/pci/bus", "pci.slot": "/pci/slot", "pci.function": "/pci/function", "used": "/used", "used_by": "/used_by"}, "domain": {"name": "", "xml": "/xml", "state": "/state", "profile": "/profile", "stats": "/stats", "node": "/node", "last_node": "/lastnode", "failed_reason": "/failedreason", "storage.volumes": "/rbdlist", "console.log": "/consolelog", "console.vnc": "/vnc", "meta.autostart": "/node_autostart", "meta.migrate_method": "/migration_method", "meta.node_selector": "/node_selector", "meta.node_limit": "/node_limit", "meta.tags": "/tags", "migrate.sync_lock": "/migrate_sync_lock"}, "tag": {"name": "", "type": "/type", "protected": "/protected"}, "network": {"vni": "", "type": "/nettype", "mtu": "/mtu", "rule": "/firewall_rules", "rule.in": "/firewall_rules/in", "rule.out": "/firewall_rules/out", "nameservers": "/name_servers", "domain": "/domain", "reservation": "/dhcp4_reservations", "lease": "/dhcp4_leases", "ip4.gateway": "/ip4_gateway", "ip4.network": "/ip4_network", "ip4.dhcp": "/dhcp4_flag", "ip4.dhcp_start": "/dhcp4_start", "ip4.dhcp_end": "/dhcp4_end", "ip6.gateway": "/ip6_gateway", "ip6.network": "/ip6_network", "ip6.dhcp": "/dhcp6_flag"}, "reservation": {"mac": "", "ip": "/ipaddr", "hostname": "/hostname"}, "lease": {"mac": "", "ip": "/ipaddr", "hostname": "/hostname", "expiry": "/expiry", "client_id": "/clientid"}, "rule": {"description": "", "rule": "/rule", "order": "/order"}, "osd": {"id": "", "node": "/node", "device": "/device", "db_device": "/db_device", "fsid": "/fsid", "ofsid": "/fsid/osd", "cfsid": "/fsid/cluster", "lvm": "/lvm", "vg": "/lvm/vg", "lv": "/lvm/lv", "stats": "/stats"}, "pool": {"name": "", "pgs": "/pgs", "tier": "/tier", "stats": "/stats"}, "volume": {"name": "", "stats": "/stats"}, "snapshot": {"name": "", "stats": "/stats"}}

View File

@@ -21,6 +21,7 @@

 import time
 import re
+import json

 import daemon_lib.common as common
@@ -49,6 +50,44 @@ def getNodeInformation(zkhandler, node_name):
         zkhandler.read(("node.count.provisioned_domains", node_name))
     )
     node_running_domains = zkhandler.read(("node.running_domains", node_name)).split()

+    try:
+        node_health = int(zkhandler.read(("node.monitoring.health", node_name)))
+    except Exception:
+        node_health = "N/A"
+
+    try:
+        node_health_plugins = zkhandler.read(
+            ("node.monitoring.plugins", node_name)
+        ).split()
+    except Exception:
+        node_health_plugins = list()
+
+    node_health_details = list()
+    for plugin in node_health_plugins:
+        plugin_last_run = zkhandler.read(
+            ("node.monitoring.data", node_name, "monitoring_plugin.last_run", plugin)
+        )
+        plugin_health_delta = zkhandler.read(
+            (
+                "node.monitoring.data",
+                node_name,
+                "monitoring_plugin.health_delta",
+                plugin,
+            )
+        )
+        plugin_message = zkhandler.read(
+            ("node.monitoring.data", node_name, "monitoring_plugin.message", plugin)
+        )
+        plugin_data = zkhandler.read(
+            ("node.monitoring.data", node_name, "monitoring_plugin.data", plugin)
+        )
+        plugin_output = {
+            "name": plugin,
+            "last_run": int(plugin_last_run),
+            "health_delta": int(plugin_health_delta),
+            "message": plugin_message,
+            "data": json.loads(plugin_data),
+        }
+        node_health_details.append(plugin_output)
+
     # Construct a data structure to represent the data
     node_information = {
@@ -61,10 +100,16 @@ def getNodeInformation(zkhandler, node_name):
         "kernel": node_kernel,
         "os": node_os,
         "arch": node_arch,
+        "health": node_health,
+        "health_plugins": node_health_plugins,
+        "health_details": node_health_details,
         "load": node_load,
         "domains_count": node_domains_count,
         "running_domains": node_running_domains,
-        "vcpu": {"total": node_cpu_count, "allocated": node_vcpu_allocated},
+        "vcpu": {
+            "total": node_cpu_count,
+            "allocated": node_vcpu_allocated,
+        },
         "memory": {
             "total": node_mem_total,
             "allocated": node_mem_allocated,

View File

@@ -540,7 +540,7 @@ class ZKHandler(object):
 #
 class ZKSchema(object):
     # Current version
-    _version = 8
+    _version = 9

     # Root for doing nested keys
     _schema_root = ""
@@ -569,6 +569,7 @@ class ZKSchema(object):
         "domain": f"{_schema_root}/domains",
         "network": f"{_schema_root}/networks",
         "storage": f"{_schema_root}/ceph",
+        "storage.health": f"{_schema_root}/ceph/health",
         "storage.util": f"{_schema_root}/ceph/util",
         "osd": f"{_schema_root}/ceph/osds",
         "pool": f"{_schema_root}/ceph/pools",
@@ -608,6 +609,18 @@ class ZKSchema(object):
         "sriov": "/sriov",
         "sriov.pf": "/sriov/pf",
         "sriov.vf": "/sriov/vf",
+        "monitoring.plugins": "/monitoring_plugins",
+        "monitoring.data": "/monitoring_data",
+        "monitoring.health": "/monitoring_health",
+    },
+    # The schema of an individual monitoring plugin data entry (/nodes/{node_name}/monitoring_data/{plugin})
+    "monitoring_plugin": {
+        "name": "",  # The root key
+        "last_run": "/last_run",
+        "health_delta": "/health_delta",
+        "message": "/message",
+        "data": "/data",
+        "runtime": "/runtime",
     },
     # The schema of an individual SR-IOV PF entry (/nodes/{node_name}/sriov/pf/{pf})
     "sriov_pf": {"phy": "", "mtu": "/mtu", "vfcount": "/vfcount"},  # The root key
@@ -874,9 +887,10 @@ class ZKSchema(object):
             if not zkhandler.zk_conn.exists(nkipath):
                 result = False

-        # One might expect child keys under node (specifically, sriov.pf and sriov.vf) to be
-        # managed here as well, but those are created automatically every time pvcnoded starts
-        # and thus never need to be validated or applied.
+        # One might expect child keys under node (specifically, sriov.pf, sriov.vf, and
+        # monitoring.data) to be managed here as well, but those are created
+        # automatically every time pvcnoded starts and thus never need to be validated
+        # or applied.

         # These two have several children layers that must be parsed through
         for elem in ["volume"]:
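Concretely, under schema version 9 the monitoring keys for a node land at Zookeeper paths like the following (node name `hv1` and plugin name `nics` are illustrative):

```
/nodes/hv1/monitoring_plugins            # space-separated list of loaded plugin names
/nodes/hv1/monitoring_health             # the node's aggregate health value
/nodes/hv1/monitoring_data/nics/last_run
/nodes/hv1/monitoring_data/nics/health_delta
/nodes/hv1/monitoring_data/nics/message
/nodes/hv1/monitoring_data/nics/data
/nodes/hv1/monitoring_data/nics/runtime
```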

View File

@@ -5,3 +5,4 @@ node-daemon/pvcnoded.service lib/systemd/system
 node-daemon/pvc.target lib/systemd/system
 node-daemon/pvcautoready.service lib/systemd/system
 node-daemon/monitoring usr/share/pvc
+node-daemon/plugins usr/share/pvc

View File

@@ -71,6 +71,8 @@ Nodes are networked together via a set of statically-configured, simple layer-2
 Further information about the general cluster architecture, including important considerations for node specifications/sizing and network configuration, [can be found at the cluster architecture page](/cluster-architecture). It is imperative that potential PVC administrators read this document thoroughly to understand the specific requirements of PVC and avoid potential missteps in obtaining and deploying their cluster.

+More information about the node daemon can be found at the [Node Daemon manual page](/manuals/daemon) and details about the health system and health plugins for nodes can be found at the [health plugin manual page](/manuals/health-plugins).
+
 ## Clients

 ### API Client

View File

@ -52,9 +52,11 @@ The daemon startup sequence is documented below. The main daemon entry-point is
0. The node activates its keepalived timer and begins sending keepalive updates to the cluster. The daemon state transitions from `init` to `run` and the system has started fully. 0. The node activates its keepalived timer and begins sending keepalive updates to the cluster. The daemon state transitions from `init` to `run` and the system has started fully.
# PVC Node Daemon manual ## Node health plugins
The PVC node daemon ins build with Python 3 and is run directly on nodes. For details of the startup sequence and general layout, see the [architecture document](/architecture/daemon). The PVC node daemon includes a node health plugin system. These plugins are run during keepalives to check various aspects of node health and adjust the overall node and cluster health accordingly. For example, a plugin might check that all configured network interfaces are online and operating at their correct speed, or that all operating system packages are up-to-date.
For the full details of the health and node health plugin system, see the [node health plugin manual](/manuals/health-plugins).
## Configuration ## Configuration
@ -132,6 +134,7 @@ pvc:
target_selector: mem target_selector: mem
configuration: configuration:
directories: directories:
plugin_directory: "/usr/share/pvc/plugins"
dynamic_directory: "/run/pvc" dynamic_directory: "/run/pvc"
log_directory: "/var/log/pvc" log_directory: "/var/log/pvc"
console_log_directory: "/var/log/libvirt" console_log_directory: "/var/log/libvirt"
@ -142,7 +145,7 @@ pvc:
log_dates: True log_dates: True
log_keepalives: True log_keepalives: True
log_keepalive_cluster_details: True log_keepalive_cluster_details: True
log_keepalive_storage_details: True log_keepalive_plugin_details: True
console_log_lines: 1000 console_log_lines: 1000
networking: networking:
bridge_device: ens4 bridge_device: ens4
@ -367,6 +370,12 @@ For most clusters, `mem` should be sufficient, but others may be used based on t
* `memprov` looks at the provisioned memory, not the allocated memory; thus, stopped or disabled VMs are counted towards a node's memory for this selector, even though their memory is not actively in use. * `memprov` looks at the provisioned memory, not the allocated memory; thus, stopped or disabled VMs are counted towards a node's memory for this selector, even though their memory is not actively in use.
* `load` looks at the system load of the node in general, ignoring load in any particular VMs; if any VM's CPU usage changes, this value would be affected. This might be preferable on clusters with some very CPU intensive VMs. * `load` looks at the system load of the node in general, ignoring load in any particular VMs; if any VM's CPU usage changes, this value would be affected. This might be preferable on clusters with some very CPU intensive VMs.
#### `system` → `configuration` → `directories` → `plugin_directory`
* *optional*
The directory to load node health plugins from. Defaults to `/usr/share/pvc/plugins` (as installed by the default packaging) if unset; it should only be overridden by advanced users.
#### `system` → `configuration` → `directories` → `dynamic_directory` #### `system` → `configuration` → `directories` → `dynamic_directory`
* *required* * *required*
@ -421,11 +430,11 @@ Whether to log keepalive messages or not.
Whether to log node status information during keepalives or not. Whether to log node status information during keepalives or not.
#### `system` → `configuration` → `logging` → `log_keepalive_storage_details` #### `system` → `configuration` → `logging` → `log_keepalive_plugin_details`
* *required* * *required*
Whether to log storage cluster status information during keepalives or not. Whether to log node health plugin status information during keepalives or not.
#### `system` → `configuration` → `logging` → `console_log_lines` #### `system` → `configuration` → `logging` → `console_log_lines`

View File

@ -0,0 +1,194 @@
# Node health plugins
The PVC node daemon includes a node health plugin system. These plugins are run during keepalives to check various aspects of node health and adjust the overall node and cluster health accordingly. For example, a plugin might check that all configured network interfaces are online and operating at their correct speed, or that all operating system packages are up-to-date.
## Configuration
### Plugin Directory
The PVC node configuration includes a configuration option at `system` → `configuration` → `directories` → `plugin_directory` to configure the location of health plugin files on the system. By default, if unset, this directory is `/usr/share/pvc/plugins`. An administrator can override this directory if they wish, but since custom plugins can be installed to the default directory without problems, changing it is not recommended.
### Plugin Logging
Plugin output is logged by default during keepalive messages. This is controlled by the node configuration option at `system` → `configuration` → `logging` → `log_keepalive_plugin_details`. Regardless of this setting, the overall node health is logged at the end of the plugin run.
### Disabling Node Plugins
Node plugins cannot be disabled; at best, a suite of zero plugins can be specified by pointing the above plugin directory to an empty folder. This will effectively render the node at a permanent 100% health. Note however that overall cluster health will still be affected by cluster-wide events (e.g. nodes or VMs being stopped, OSDs going out, etc.).
## Health Plugin Architecture
### Node and Cluster Health
A core concept leveraged by the PVC system is that of node and cluster health. Starting with PVC version 0.9.61, these two health statistics are represented as percentages, with 100% representing optimal health, 51-90% representing a "warning" degraded state, and 0-50% representing a "critical" degraded state.
While a cluster is in maintenance mode (set via `pvc maintenance on` and unset via `pvc maintenance off`), the health values continue to aggregate, but they are ignored for the purposes of "health" output, i.e. the output colour will not change, and the reference monitoring plugins (for CheckMK and Munin) will not trigger alerting. This allows the administrator to specify that abnormal conditions are OK for some amount of time without triggering upstream alerting. Additionally, while a node is not in `run` Daemon state, its health will be reported as `N/A`; this is treated as 100% but displayed as `N/A` to make clear that the node has not yet initialized and run its health check plugins.
The node health is affected primarily by health plugins as discussed in this manual. Any plugin that adjusts node health lowers the node's health by its `health_delta` value, and lowers the cluster health by the same amount. For example, a plugin whose current state has a `health_delta` of `10` reduces its own node's health value to 90%, and the overall cluster health value to 90%.
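To illustrate the arithmetic, here is a minimal sketch (hypothetical values only, not PVC code, and assuming health is clamped at 0%):

```
# Hypothetical results from three plugins on one node: one fault with a
# health_delta of 10, and two healthy plugins with a health_delta of 0.
plugin_deltas = [10, 0, 0]

# The node's health is 100% minus the sum of its plugin deltas; the same
# deltas are also subtracted from the overall cluster health.
node_health = max(0, 100 - sum(plugin_deltas))     # -> 90 (%)
cluster_health = max(0, 100 - sum(plugin_deltas))  # -> 90 (%)
```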
In addition, cluster health is affected by several fixed states within the PVC system. These are:
* A node in `flushed` Domain state lowers the cluster health by 10; a node in `stop` Daemon state lowers the cluster health by 50.
* A VM in `stop` state lowers the cluster health by 10 (hint: use `disable` state to avoid this).
* An OSD in `down` state lowers the cluster health by 10; an OSD in `out` state lowers the cluster health by 50.
* Memory overprovisioning (total provisioned and running guest memory allocation exceeds the total N-1 cluster memory availability) lowers the cluster health by 50.
* Each Ceph health check message lowers the cluster health by 10 for a `HEALTH_WARN` severity or by 50 for a `HEALTH_ERR` severity. For example, the `OSDMAP_FLAGS` check (reporting, e.g. `noout` state) reports as a `HEALTH_WARN` severity and will thus decrease the cluster health by 10; if an additional `PG_DEGRADED` check fires (also reporting as `HEALTH_WARN` severity), this will decrease the cluster health by a further 10, or 20 total for both. This cumulative effect ensures that multiple simultaneous Ceph issues escalate in severity. For a full list of possible Ceph health check messages, [please see the Ceph documentation](https://docs.ceph.com/en/nautilus/rados/operations/health-checks/). A short sketch of this cumulative arithmetic is shown below.
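The cumulative Ceph arithmetic from the example above can be sketched the same way (again with hypothetical values, not PVC code):

```
# Two simultaneous HEALTH_WARN checks, per the severity mapping above.
severity_delta = {"HEALTH_WARN": 10, "HEALTH_ERR": 50}
ceph_checks = {"OSDMAP_FLAGS": "HEALTH_WARN", "PG_DEGRADED": "HEALTH_WARN"}

total_delta = sum(severity_delta[s] for s in ceph_checks.values())  # -> 20
cluster_health = 100 - total_delta                                  # -> 80 (%)
```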
### Built-in Health Plugins
PVC ships with several node health plugins installed and loaded by default, to ensure several common aspects of node operation are validated and checked. The following plugins are included:
#### `disk`
This plugin checks all SATA/SAS and NVMe block devices for SMART health, if available, and reports any errors.
For SATA/SAS disks reporting standard ATA SMART attributes, a health delta of 10 is raised for each SMART error on each disk, based on the `when_failed` value being set to true. Note that due to this design, several disks with multiple errors can quickly escalate to a critical condition, promptly alerting the administrator of possible major faults.
For NVMe disks, only 3 specific NVMe health information messages are checked: `critical_warning`, `media_errors`, and `percentage_used` at > 90. Each check can only be reported once per disk and each raises a health delta of 10.
#### `dpkg`
This plugin checks for Debian package updates, invalid package states (i.e. not `ii` state), and obsolete configuration files that require cleanup. It will raise a health delta of 1 for each type of inconsistency, for a maximum of 3. It will thus never, on its own, trigger a node or cluster to be in a warning or critical state, but will show the errors for administrator analysis, as an example of a more "configuration anomaly"-type plugin.
#### `edac`
This plugin checks the EDAC utility for messages about errors, primarily in the ECC memory subsystem. It will raise a health delta of 50 if any `Uncorrected` EDAC errors are detected, possibly indicating failing memory.
#### `ipmi`
This plugin checks whether the daemon can reach its own IPMI address and connect. If it cannot, it raises a health delta of 10.
#### `lbvt`
This plugin checks whether the daemon can connect to the local Libvirt daemon instance. If it cannot, it raises a health delta of 50.
#### `load`
This plugin checks the current 1-minute system load (as reported during keepalives) against the number of total CPU threads available on the node. If the load average is greater, i.e. the node is overloaded, it raises a health delta of 50.
#### `nics`
This plugin checks that all NICs underlying PVC networks and bridges are operating correctly, specifically that bond interfaces have at least 2 active slaves and that all physical NICs are operating at their maximum possible speed. It takes several factors into account to determine this, as detailed in the list below; a rough sketch of the bond-slave check follows the list.
* For each device defined (`bridge_dev`, `upstream_dev`, `cluster_dev`, and `storage_dev`), it determines the type of device. If it is a vLAN, it obtains the underlying device; otherwise, it uses the specified device. It then adds this device to a list of core NICs. Ideally, this list will contain either bonding interfaces or actual ethernet NICs.
* For each core NIC, it checks its type. If it is a `bond` device, it checks the bonding state to ensure that at least 2 slave interfaces are up and operating. If there are not, it raises a health delta of 10.
* For each core NIC, it checks its maximum possible speed as reported by `ethtool` as well as the current active speed. If the NIC is operating at less than its maximum possible speed, it raises a health delta of 10.
Note that this check may pose problems in some deployment scenarios (e.g. running 25GbE NICs at 10GbE by design). Currently the plugin logic cannot handle this and manual modifications may be required. This is left to the administrator if applicable.
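As mentioned above, here is a rough sketch of the bond-slave check (this is not the plugin's actual code, and it assumes the standard `/proc/net/bonding/<bond>` layout):

```
def count_up_bond_slaves(bond):
    # Each slave stanza in /proc/net/bonding/<bond> reports "MII Status: up"
    # when that slave is operating; the first such line is the bond itself,
    # so it is skipped when counting slaves.
    with open(f"/proc/net/bonding/{bond}") as fh:
        statuses = [
            line.split(":", 1)[1].strip()
            for line in fh
            if line.startswith("MII Status")
        ]
    return sum(1 for status in statuses[1:] if status == "up")

# The plugin would raise a health delta of 10 if this returned less than 2.
```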
#### `psql`
This plugin checks whether the daemon can connect to the local PostgreSQL/Patroni daemon instance. If it cannot, it raises a health delta of 50.
#### `zkpr`
This plugin checks whether the daemon can connect to the local Zookeeper daemon instance. If it cannot, it raises a health delta of 50.
### Custom Health Plugins
In addition to the included health plugins, the plugin architecture allows administrators to write their own plugins as required to check specific node details that might not be covered by the default plugins. While the author has endeavoured to cover as many important aspects as possible with the default plugins, there is always the possibility that some other condition becomes important, and the system is flexible to this need. That said, we would welcome pull requests of new plugins to future versions of PVC should they be widely applicable.
As a warning, health plugins are run in a `root` context by PVC. They must therefore be carefully vetted to avoid damaging the system. DO NOT run untrusted health plugins.
To create a health plugin, first reference the existing health plugins and create a base template.
Each health plugin consists of three main parts:
* An import, which must at least include the `MonitoringPlugin` class from the `pvcnoded.objects.MonitoringInstance` library. You can also load additional imports here, or import them within the functions (which is recommended for namespace simplicity).
```
# This import is always required here, as MonitoringPlugin is used by the MonitoringPluginScript class
from pvcnoded.objects.MonitoringInstance import MonitoringPlugin
```
* A `PLUGIN_NAME` variable which defines the name of the plugin. This must match the filename. Generally, a plugin name will be 4 characters, but this is purely a convention and not a requirement.
```
# A monitoring plugin script must always expose its nice name, which must be identical to the file name
PLUGIN_NAME = "nics"
```
* An instance of a `MonitoringPluginScript` class which extends the `MonitoringPlugin` class.
```
# The MonitoringPluginScript class must be named as such, and extend MonitoringPlugin.
class MonitoringPluginScript(MonitoringPlugin):
...
```
The `MonitoringPluginScript` class must contain 3 primary functions, as detailed below. While it is possible to do nothing except `pass` in these functions, or even to exclude them (the parent class includes empty defaults), all 3 should be included for consistency.
#### `def setup(self):`
This function is run once during the node daemon startup, when the plugin is loaded. It can be used to get one-time setup information, populate plugin instance variables, etc.
The function must take no arguments except `self` and anything returned is ignored.
A plugin can also be disabled live in the setup function by throwing any `Exception`; such exceptions will be caught, and the plugin will not be loaded.
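For instance, a hypothetical setup function that disables its plugin on nodes without a configured IPMI interface might look like the following sketch (assuming the `self.config` dictionary used by the bundled `ipmi` plugin):

```
def setup(self):
    # Hypothetical check: refuse to load on nodes with no IPMI hostname
    # configured; raising any Exception here skips the plugin at load time.
    if not self.config.get("ipmi_hostname"):
        raise Exception("No IPMI hostname configured; not loading plugin")
```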
#### `def cleanup(self):`
This function mirrors the setup function, and is run once during the node daemon shutdown process. It can be used to clean up any lingering items (e.g. temporary files) created by the setup or run functions, if required; generally plugins do not need to do any cleanup.
#### `def run(self):`
This function is run each time the plugin is called during a keepalive. It performs the main work of the plugin before returning the end result in a specific format.
Note that this function runs once for each keepalive, which by default is every 5 seconds. It is thus important to keep the runtime as short as possible and avoid doing complex calculations, file I/O, etc. during the plugin run. Do as much as possible in the setup function to keep the run function as quick as possible.
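As a hypothetical restructuring of the bundled `load` plugin along these lines: the CPU count never changes at runtime, so it could be gathered once in the setup function rather than on every keepalive:

```
def setup(self):
    # One-time work: the CPU core count is static for the node's lifetime.
    from psutil import cpu_count
    self.cpu_cores = cpu_count()

def run(self):
    # Per-keepalive work is kept to a single cheap call.
    from os import getloadavg
    load_average = getloadavg()[0]
    ...
```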
What happens during the run function is of course completely up to the plugin, but it must return a standardized set of details upon completing the run.
An instance of the `PluginResult` object is helpfully created by the caller and passed in via `self.plugin_result`. This can be used to set the results as follows:
* The `self.plugin_result.set_health_delta()` function can be used to set the current health delta of the result. This should be `0` unless the plugin detects a fault, at which point it can be any integer value below 100, and affects the node and cluster health as detailed above.
* The `self.plugin_result.set_message()` function can be used to set the message text of the result, explaining in a short but human-readable way what the plugin result is. This will be shown in several places, including the node logs (if enabled), the node info output, and for results that have a health delta above 0, in the cluster status output.
Finally, the `PluginResult` instance stored as `self.plugin_result` must be returned by the run function to the caller upon completion so that it can be added to the node state.
### Example Health Plugin
This is a terse version of the `load` plugin, an extremely simple example that shows all the above requirements clearly. Comments are omitted here for simplicity, but these can be seen in the actual plugin file (at `/usr/share/pvc/plugins/load` on any node).
```
#!/usr/bin/env python3
# load.py: PVC monitoring plugin example
from pvcnoded.objects.MonitoringInstance import MonitoringPlugin
PLUGIN_NAME = "load"
class MonitoringPluginScript(MonitoringPlugin):
def setup(self):
pass
def cleanup(self):
pass
def run(self):
from os import getloadavg
from psutil import cpu_count
load_average = getloadavg()[0]
cpu_cores = cpu_count()
if load_average > float(cpu_cores):
health_delta = 50
else:
health_delta = 0
message = f"Current load is {load_average} out pf {cpu_cores} CPU cores"
self.plugin_result.set_health_delta(health_delta)
self.plugin_result.set_message(message)
return self.plugin_result
```

View File

@ -15,15 +15,57 @@
}, },
"ClusterStatus": { "ClusterStatus": {
"properties": { "properties": {
"health": { "cluster_health": {
"description": "The overall cluster health", "properties": {
"example": "Optimal", "health": {
"description": "The overall health (%) of the cluster",
"example": 100,
"type": "integer"
},
"messages": {
"description": "A list of health event strings",
"items": {
"example": "hv1: plugin 'nics': bond0 DEGRADED with 1 active slaves, bond0 OK at 10000 Mbps",
"type": "string"
},
"type": "array"
}
},
"type": "object"
},
"maintenance": {
"description": "Whether the cluster is in maintenance mode or not (string boolean)",
"example": true,
"type": "string" "type": "string"
}, },
"networks": { "networks": {
"description": "The total number of networks in the cluster", "description": "The total number of networks in the cluster",
"type": "integer" "type": "integer"
}, },
"node_health": {
"properties": {
"hvX": {
"description": "A node entry for per-node health details, one per node in the cluster",
"properties": {
"health": {
"description": "The health (%) of the node",
"example": 100,
"type": "integer"
},
"messages": {
"description": "A list of health event strings",
"items": {
"example": "'nics': bond0 DEGRADED with 1 active slaves, bond0 OK at 10000 Mbps",
"type": "string"
},
"type": "array"
}
},
"type": "object"
}
},
"type": "object"
},
"nodes": { "nodes": {
"properties": { "properties": {
"state-combination": { "state-combination": {
@ -61,15 +103,15 @@
"example": "pvchv1", "example": "pvchv1",
"type": "string" "type": "string"
}, },
"pvc_version": {
"description": "The PVC version of the current primary coordinator node",
"example": "0.9.61",
"type": "string"
},
"snapshots": { "snapshots": {
"description": "The total number of snapshots in the storage cluster", "description": "The total number of snapshots in the storage cluster",
"type": "integer" "type": "integer"
}, },
"storage_health": {
"description": "The overall storage cluster health",
"example": "Optimal",
"type": "string"
},
"upstream_ip": { "upstream_ip": {
"description": "The cluster upstream IP address in CIDR format", "description": "The cluster upstream IP address in CIDR format",
"example": "10.0.0.254/24", "example": "10.0.0.254/24",
@ -456,6 +498,48 @@
"description": "The number of running domains (VMs)", "description": "The number of running domains (VMs)",
"type": "integer" "type": "integer"
}, },
"health": {
"description": "The overall health (%) of the node",
"example": 100,
"type": "integer"
},
"health_details": {
"description": "A list of health plugin results",
"items": {
"properties": {
"health_delta": {
"description": "The health delta (negatively applied to the health percentage) of the plugin's current state",
"example": 10,
"type": "integer"
},
"last_run": {
"description": "The UNIX timestamp (s) of the last plugin run",
"example": 1676786078,
"type": "integer"
},
"message": {
"description": "The output message of the plugin",
"example": "bond0 DEGRADED with 1 active slaves, bond0 OK at 10000 Mbps",
"type": "string"
},
"name": {
"description": "The name of the health plugin",
"example": "nics",
"type": "string"
}
},
"type": "object"
},
"type": "array"
},
"health_plugins": {
"description": "A list of health plugin names currently loaded on the node",
"items": {
"example": "nics",
"type": "string"
},
"type": "array"
},
"kernel": { "kernel": {
"desription": "The running kernel version from uname", "desription": "The running kernel version from uname",
"type": "string" "type": "string"
@ -6177,7 +6261,7 @@
"description": "The selector used to determine candidate nodes during migration; see 'target_selector' in the node daemon configuration reference", "description": "The selector used to determine candidate nodes during migration; see 'target_selector' in the node daemon configuration reference",
"enum": [ "enum": [
"mem", "mem",
"memfree", "memprov",
"vcpus", "vcpus",
"load", "load",
"vms", "vms",
@ -6336,7 +6420,7 @@
"description": "The selector used to determine candidate nodes during migration; see 'target_selector' in the node daemon configuration reference", "description": "The selector used to determine candidate nodes during migration; see 'target_selector' in the node daemon configuration reference",
"enum": [ "enum": [
"mem", "mem",
"memfree", "memprov",
"vcpus", "vcpus",
"load", "load",
"vms", "vms",
@ -6597,7 +6681,7 @@
"description": "The selector used to determine candidate nodes during migration; see 'target_selector' in the node daemon configuration reference", "description": "The selector used to determine candidate nodes during migration; see 'target_selector' in the node daemon configuration reference",
"enum": [ "enum": [
"mem", "mem",
"memfree", "memprov",
"vcpus", "vcpus",
"load", "load",
"vms", "vms",

View File

@ -2,23 +2,34 @@
This directory contains several monitoring resources that can be used with various monitoring systems to track and alert on a PVC cluster system. This directory contains several monitoring resources that can be used with various monitoring systems to track and alert on a PVC cluster system.
### Munin ## Munin
The included munin plugin can be activated by linking to it from `/etc/munin/plugins/pvc`. By default, this plugin triggers a CRITICAL state when either the PVC or Storage cluster becomes Degraded, and is otherwise OK. The overall health is graphed numerically (Optimal is 0, Maintenance is 1, Degraded is 2) so that the cluster health can be tracked over time. The included Munin plugins can be activated by linking to them from `/etc/munin/plugins/`. Two plugins are provided:
When using this plugin, it might be useful to adjust the thresholds with a plugin configuration. For instance, one could adjust the Degraded value from CRITICAL to WARNING by adjusting the critical threshold to a value higher than 1.99 (e.g. 3, 10, etc.) so that only the WARNING threshold will be hit. Alternatively one could instead make Maintenance mode trigger a WARNING by lowering the threshold to 0.99. * `pvc`: Checks the PVC cluster and node health, as well as their status (OK/Warning/Critical, based on maintenance status), providing 4 graphs.
Example plugin configuration: * `ceph_utilization`: Checks the Ceph cluster statistics, providing multiple graphs. Note that this plugin is independent of PVC itself, and makes local calls to various Ceph commands itself.
The `pvc` plugin provides no configuration; the status is hardcoded such that <=90% health is warning, <=50% health is critical, and maintenance state forces OK. Alerting is provided by two graphs separate from the health graphs, so that the actual health state is logged regardless of alerting.
[pvc]
# Make cluster warn on maintenance
env.pvc_cluster_warning 0.99
# Disable critical threshold (>2)
env.pvc_cluster_critical 3
# Make storage warn on maintenance, crit on degraded (latter is default)
env.pvc_storage_warning 0.99
env.pvc_storage_critical 1.99
```
### Check_MK The `ceph_utilization` plugin provides no configuration; only the cluster utilization graph alerts such that >80% used is warning and >90% used is critical. Ceph itself begins warning above 80% as well.
## CheckMK
The included CheckMK plugin is divided into two parts: the agent plugin, and the monitoring server plugin. This monitoring server plugin requires CheckMK version 2.0 or higher. The two parts can be installed as follows:
* `pvc`: Place this file in the `/usr/lib/check_mk_agent/plugins/` directory on each node.
* `pvc.py`: Place this file in the `~/local/lib/python3/cmk/base/plugins/agent_based/` directory on the CheckMK monitoring host for each monitoring site.
The plugin provides no configuration: the status is hardcoded such that <=90% health is warning, <=50% health is critical, and maintenance state forces OK.
With both the agent and server plugins installed, you can then run `cmk -II <node>` (or use WATO) to inventory each node, which should produce two new checks:
* `PVC Cluster`: Provides the cluster-wide health. Note that this will be identical for all nodes in the cluster (i.e. if the cluster health drops, all nodes in the cluster will alert this check).
* `PVC Node <shortname>`: Provides the per-node health.
The "Summary" text, shown in the check lists, will be simplistic, only showing the current health percentage.
The "Details" text, found in the specific check details, will show the full list of problem(s) the check finds, as shown by `pvc status` itself.

View File

@ -0,0 +1,6 @@
#!/bin/bash
# PVC cluster status check for Check_MK (agent-side)
echo "<<<pvc>>>"
pvc --quiet status --format json

View File

@ -0,0 +1,95 @@
#!/usr/bin/env python3
#
# Check_MK PVC plugin
#
# Copyright 2017-2021, Joshua Boniface <joshua@boniface.me>
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
from .agent_based_api.v1 import *
from cmk.base.check_api import host_name
from time import time
from json import loads
def discover_pvc(section):
my_node = host_name().split(".")[0]
yield Service(item=f"PVC Node {my_node}")
yield Service(item="PVC Cluster")
def check_pvc(item, params, section):
state = State.OK
summary = "Stuff"
details = None
data = loads(" ".join(section[0]))
my_node = host_name().split(".")[0]
maintenance_map = {
"true": "on",
"false": "off",
}
maintenance = maintenance_map[data["maintenance"]]
# Node check
if item == f"PVC Node {my_node}":
my_node = host_name().split(".")[0]
node_health = data["node_health"][my_node]["health"]
node_messages = data["node_health"][my_node]["messages"]
summary = f"Node health is {node_health}% (maintenance {maintenance})"
if len(node_messages) > 0:
details = ", ".join(node_messages)
if node_health <= 50 and maintenance == "off":
state = State.CRIT
elif node_health <= 90 and maintenance == "off":
state = State.WARN
else:
state = State.OK
yield Metric(name="node-health", value=node_health)
# Cluster check
elif item == "PVC Cluster":
cluster_health = data["cluster_health"]["health"]
cluster_messages = data["cluster_health"]["messages"]
summary = f"Cluster health is {cluster_health}% (maintenance {maintenance})"
if len(cluster_messages) > 0:
details = ", ".join(cluster_messages)
if cluster_health <= 50 and maintenance == "off":
state = State.CRIT
elif cluster_health <= 90 and maintenance == "off":
state = State.WARN
else:
state = State.OK
yield Metric(name="cluster-health", value=cluster_health)
yield Result(state=state, summary=summary, details=details)
return
register.check_plugin(
name="pvc",
service_name="%s",
check_ruleset_name="pvc",
discovery_function=discover_pvc,
check_function=check_pvc,
check_default_parameters={},
)

View File

@ -7,23 +7,6 @@
pvc - Plugin to monitor a PVC cluster. pvc - Plugin to monitor a PVC cluster.
=head1 CONFIGURATION
Note that due to how Munin thresholds work, these values must always be slightly less than 1 or 2 respectively,
or the alerts will never be triggered.
Defaults (no config required):
[pvc]
env.warning 1.99
env.critical 1.99
Make degraded cluster WARN only (max value is 2, so 3 effectively disables):
[pvc]
env.pvc_cluster_warning 1.99
env.pvc_cluster_critical 3
=head1 AUTHOR =head1 AUTHOR
Joshua Boniface <joshua@boniface.me> Joshua Boniface <joshua@boniface.me>
@ -45,7 +28,9 @@ GPLv3
. "$MUNIN_LIBDIR/plugins/plugin.sh" . "$MUNIN_LIBDIR/plugins/plugin.sh"
warning=1.99 is_multigraph
warning=0.99
critical=1.99 critical=1.99
export PVC_CLIENT_DIR="/run/shm/munin-pvc" export PVC_CLIENT_DIR="/run/shm/munin-pvc"
@ -53,16 +38,7 @@ PVC_CMD="/usr/bin/pvc --quiet --cluster local status --format json-pretty"
JQ_CMD="/usr/bin/jq" JQ_CMD="/usr/bin/jq"
output_usage() { output_usage() {
echo "This plugin outputs numerical values based on the health of the PVC cluster." echo "This plugin outputs information about a PVC cluster and node"
echo
echo "There are separate outputs for both the PVC cluster itself as well as the Ceph storage cluster."
echo "In normal operation, i.e. when both clusters are in 'Optimal' state, the plugin returns 0 for"
echo "each cluster. When the cluster is placed into 'Maintenance' mode,the plugin returns 1 for each"
echo "cluster, and goes into WARN state (limit 0.99); this can be adjusted by overriding the WARNING"
echo "threshold of the plugin to something other than 0.99 - note that due to Munin's alerting design,"
echo "the warning value must always be very slightly below the whole number. When either cluster"
echo "element becomes 'Degraded', the plugin returns 2 for the relevant cluster, which is treated as a"
echo "critical. Like the WARNING threshold, this can be overridden, and with the same caveat about limit."
exit 0 exit 0
} }
@ -84,72 +60,102 @@ output_autoconf() {
} }
output_config() { output_config() {
echo 'graph_title PVC Clusters' echo 'multigraph pvc_cluster_health'
echo 'graph_title PVC Cluster Health'
echo 'graph_args --base 1000' echo 'graph_args --base 1000'
echo 'graph_vlabel Count' echo 'graph_vlabel Health%'
echo 'graph_category pvc' echo 'graph_category pvc'
echo 'graph_period second' echo 'graph_info Health of the PVC cluster'
echo 'graph_info This graph shows the nodes in the PVC cluster.'
echo 'pvc_cluster.label Cluster Degradation' echo 'pvc_cluster_health.label Cluster Health'
echo 'pvc_cluster.type GAUGE' echo 'pvc_cluster_health.type GAUGE'
echo 'pvc_cluster.max 2' echo 'pvc_cluster_health.max 100'
echo 'pvc_cluster.info Whether the PVC cluster is in a degraded state.' echo 'pvc_cluster_health.min 0'
print_warning pvc_cluster echo 'pvc_cluster_health.info Health of the PVC cluster in %'
print_critical pvc_cluster
echo 'pvc_storage.label Storage Degradation' echo 'multigraph pvc_cluster_alert'
echo 'pvc_storage.type GAUGE' echo 'graph_title PVC Cluster Alerting'
echo 'pvc_storage.max 2' echo 'graph_args --base 1000'
echo 'pvc_storage.info Whether the storage cluster is in a degraded state.' echo 'graph_vlabel State'
print_warning pvc_storage echo 'graph_category pvc'
print_critical pvc_storage echo 'graph_info Alerting state of the PVC cluster health'
echo 'pvc_cluster_alert.label Cluster Health State'
echo 'pvc_cluster_alert.type GAUGE'
echo 'pvc_cluster_alert.max 2'
echo 'pvc_cluster_alert.min 0'
echo 'pvc_cluster_alert.info Alerting state of the PVC cluster health'
print_warning pvc_cluster_alert
print_critical pvc_cluster_alert
echo 'multigraph pvc_node_health'
echo 'graph_title PVC Node Health'
echo 'graph_args --base 1000'
echo 'graph_vlabel Health%'
echo 'graph_category pvc'
echo 'graph_info Health of the PVC node'
echo 'pvc_node_health.label Node Health'
echo 'pvc_node_health.type GAUGE'
echo 'pvc_node_health.max 100'
echo 'pvc_node_health.min 0'
echo 'pvc_node_health.info Health of the PVC node in %'
echo 'multigraph pvc_node_alert'
echo 'graph_title PVC Node Alerting'
echo 'graph_args --base 1000'
echo 'graph_vlabel State'
echo 'graph_category pvc'
echo 'graph_info Alerting state of the PVC node health'
echo 'pvc_node_alert.label Node Health State'
echo 'pvc_node_alert.type GAUGE'
echo 'pvc_node_alert.max 2'
echo 'pvc_node_alert.min 0'
echo 'pvc_node_alert.info Alerting state of the PVC node health'
print_warning pvc_node_alert
print_critical pvc_node_alert
exit 0 exit 0
} }
output_values() { output_values() {
PVC_OUTPUT="$( $PVC_CMD )" PVC_OUTPUT="$( $PVC_CMD )"
HOST="$( hostname --short )"
cluster_health="$( $JQ_CMD '.health' <<<"${PVC_OUTPUT}" | tr -d '"' )" is_maintenance="$( $JQ_CMD ".maintenance" <<<"${PVC_OUTPUT}" | tr -d '"' )"
cluster_failed_reason="$( $JQ_CMD -r '.health_msg | @csv' <<<"${PVC_OUTPUT}" | tr -d '"' | sed 's/,/, /g' )"
case $cluster_health in
"Optimal")
cluster_value="0"
;;
"Maintenance")
cluster_value="1"
;;
"Degraded")
cluster_value="2"
esac
storage_health="$( $JQ_CMD '.storage_health' <<<"${PVC_OUTPUT}" | tr -d '"' )" cluster_health="$( $JQ_CMD ".cluster_health.health" <<<"${PVC_OUTPUT}" | tr -d '"' )"
storage_failed_reason="$( $JQ_CMD -r '.storage_health_msg | @csv' <<<"${PVC_OUTPUT}" | tr -d '"' | sed 's/,/, /g' )" cluster_health_messages="$( $JQ_CMD -r ".cluster_health.messages | @csv" <<<"${PVC_OUTPUT}" | tr -d '"' | sed 's/,/, /g' )"
case $storage_health in echo 'multigraph pvc_cluster_health'
"Optimal") echo "pvc_cluster_health.value ${cluster_health}"
storage_value="0" echo "pvc_cluster_health.extinfo ${cluster_health_messages}"
;;
"Maintenance")
storage_value="1"
;;
"Degraded")
storage_value="2"
esac
if [[ ${cluster_health} -le 50 && ${is_maintenance} == "false" ]]; then
cluster_health_alert=2
elif [[ ${cluster_health} -le 90 && ${is_maintenance} == "false" ]]; then
cluster_health_alert=1
else
cluster_health_alert=0
fi
echo 'multigraph pvc_cluster_alert'
echo "pvc_cluster_alert.value ${cluster_health_alert}"
echo "pvc_cluster.value $cluster_value" node_health="$( $JQ_CMD ".node_health.${HOST}.health" <<<"${PVC_OUTPUT}" | tr -d '"' )"
if [[ $cluster_value -eq 1 ]]; then node_health_messages="$( $JQ_CMD -r ".node_health.${HOST}.messages | @csv" <<<"${PVC_OUTPUT}" | tr -d '"' | sed 's/,/, /g' )"
echo "pvc_cluster.extinfo Cluster in maintenance mode" echo 'multigraph pvc_node_health'
elif [[ $cluster_value -eq 2 ]]; then echo "pvc_node_health.value ${node_health}"
echo "pvc_cluster.extinfo ${cluster_failed_reason}" echo "pvc_node_health.extinfo ${node_health_messages}"
fi
echo "pvc_storage.value $storage_value" if [[ ${node_health} -le 50 && ${is_maintenance} != "true" ]]; then
if [[ $storage_value -eq 1 ]]; then node_health_alert=2
echo "pvc_storage.extinfo Cluster in maintenance mode" elif [[ ${node_health} -le 90 && ${is_maintenance} != "true" ]]; then
elif [[ $storage_value -eq 2 ]]; then node_health_alert=1
echo "pvc_storage.extinfo ${storage_failed_reason}" else
fi node_health_alert=0
fi
echo 'multigraph pvc_node_alert'
echo "pvc_node_alert.value ${node_health_alert}"
} }
case $# in case $# in

167
node-daemon/plugins/disk Normal file
View File

@ -0,0 +1,167 @@
#!/usr/bin/env python3
# disk.py - PVC Monitoring example plugin for disk (system + OSD)
# Part of the Parallel Virtual Cluster (PVC) system
#
# Copyright (C) 2018-2022 Joshua M. Boniface <joshua@boniface.me>
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, version 3.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.
#
###############################################################################
# This script provides an example of a PVC monitoring plugin script. It will create
# a simple plugin to check the system and OSD disks for errors and faults and return
# a health delta corresponding to severity.
# This script can thus be used as an example or reference implementation of a
# PVC monitoring plugin script and expanded upon as required.
# A monitoring plugin script must implement the class "MonitoringPluginScript" which
# extends "MonitoringPlugin", providing the 3 functions indicated. Detailed explanation
# of the role of each function is provided in context of the example; see the other
# examples for more potential uses.
# WARNING:
#
# This script will run in the context of the node daemon keepalives as root.
# DO NOT install untrusted, unvetted plugins under any circumstances.
# This import is always required here, as MonitoringPlugin is used by the
# MonitoringPluginScript class
from pvcnoded.objects.MonitoringInstance import MonitoringPlugin
# A monitoring plugin script must always expose its nice name, which must be identical to
# the file name
PLUGIN_NAME = "disk"
# The MonitoringPluginScript class must be named as such, and extend MonitoringPlugin.
class MonitoringPluginScript(MonitoringPlugin):
def setup(self):
"""
setup(): Perform special setup steps during node daemon startup
This step is optional and should be used sparingly.
If you wish for the plugin to not load in certain conditions, do any checks here
and return a non-None failure message to indicate the error.
"""
from daemon_lib.common import run_os_command
from json import loads
_, _all_disks, _ = run_os_command("lsblk --json --paths --include 8,259")
try:
all_disks = loads(_all_disks)
except Exception as e:
return f"Error loading lsblk JSON: {e}"
disk_details = list()
def get_smartinfo(disk, extra_opt=""):
_, _smart_info, _ = run_os_command(f"smartctl --info --json {extra_opt} {disk}")
try:
smart_info = loads(_smart_info)
except Exception as e:
return None
return smart_info
for disk in [disk["name"] for disk in all_disks['blockdevices']]:
extra_opt = ""
smart_info = get_smartinfo(disk)
if smart_info is None or smart_info["smartctl"]["exit_status"] > 1:
continue
elif smart_info["smartctl"]["exit_status"] == 1:
if "requires option" in smart_info["smartctl"]["messages"][0]["string"]:
extra_opt = smart_info["smartctl"]["messages"][0]["string"].split("'")[1].replace('N','0')
smart_info = get_smartinfo(disk, extra_opt)
if smart_info is None or smart_info["smartctl"]["exit_status"] > 0:
continue
else:
continue
disk_type = smart_info["device"]["type"]
disk_details.append((disk, extra_opt, disk_type))
self.disk_details = disk_details
def run(self):
"""
run(): Perform the check actions and return a PluginResult object
"""
# Re-run setup each time to ensure the disk details are current
self.setup()
# Run any imports first
from daemon_lib.common import run_os_command
from json import loads
health_delta = 0
messages = list()
for _disk in self.disk_details:
disk = _disk[0]
extra_opt = _disk[1]
disk_type = _disk[2]
_, _smart_info, _ = run_os_command(f"smartctl --all --json {extra_opt} {disk}")
try:
smart_info = loads(_smart_info)
except Exception as e:
health_delta += 10
messages.append(f"{disk} failed to load SMART data")
continue
if disk_type == 'nvme':
for attribute in smart_info['nvme_smart_health_information_log'].items():
if attribute[0] == "critical_warning" and attribute[1] > 0:
health_delta += 10
messages.append(f"{disk} critical warning value {attribute[1]}")
if attribute[0] == "media_errors" and attribute[1] > 0:
health_delta += 10
messages.append(f"{disk} media errors value {attribute[1]}")
if attribute[0] == "percentage_used" and attribute[1] > 90:
health_delta += 10
messages.append(f"{disk} percentage used value {attribute[1]}%")
else:
for attribute in smart_info['ata_smart_attributes']['table']:
if attribute["when_failed"]:
health_delta += 10
messages.append(f"{disk} attribute {attribute['name']} value {attribute['raw']['value']}")
if len(messages) < 1:
messages.append(f"All {len(self.disk_details)} checked disks report OK: {', '.join([disk[0] for disk in self.disk_details])}")
# Set the health delta in our local PluginResult object
self.plugin_result.set_health_delta(health_delta)
# Set the message in our local PluginResult object
self.plugin_result.set_message(', '.join(messages))
# Return our local PluginResult object
return self.plugin_result
def cleanup(self):
"""
cleanup(): Perform special cleanup steps during node daemon termination
This step is optional and should be used sparingly.
"""
pass

160
node-daemon/plugins/dpkg Normal file
View File

@ -0,0 +1,160 @@
#!/usr/bin/env python3
# dpkg.py - PVC Monitoring example plugin for dpkg status
# Part of the Parallel Virtual Cluster (PVC) system
#
# Copyright (C) 2018-2022 Joshua M. Boniface <joshua@boniface.me>
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, version 3.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.
#
###############################################################################
# This script provides an example of a PVC monitoring plugin script. It will create
# a simple plugin to check the system dpkg status is as expected, with no invalid
# packages or obsolete configuration files, and will return a 1 health delta for each
# flaw in invalid packages, upgradable packages, and obsolete config files.
# This script can thus be used as an example or reference implementation of a
# PVC monitoring plugin script and expanded upon as required.
# A monitoring plugin script must implement the class "MonitoringPluginScript" which
# extends "MonitoringPlugin", providing the 3 functions indicated. Detailed explanation
# of the role of each function is provided in context of the example; see the other
# examples for more potential uses.
# WARNING:
#
# This script will run in the context of the node daemon keepalives as root.
# DO NOT install untrusted, unvetted plugins under any circumstances.
# This import is always required here, as MonitoringPlugin is used by the
# MonitoringPluginScript class
from pvcnoded.objects.MonitoringInstance import MonitoringPlugin
# A monitoring plugin script must always expose its nice name, which must be identical to
# the file name
PLUGIN_NAME = "dpkg"
# The MonitoringPluginScript class must be named as such, and extend MonitoringPlugin.
class MonitoringPluginScript(MonitoringPlugin):
def setup(self):
"""
setup(): Perform special setup steps during node daemon startup
This step is optional and should be used sparingly.
If you wish for the plugin to not load in certain conditions, do any checks here
and return a non-None failure message to indicate the error.
"""
pass
def run(self):
"""
run(): Perform the check actions and return a PluginResult object
"""
# Run any imports first
from re import match
import daemon_lib.common as pvc_common
# Get Debian version
with open('/etc/debian_version', 'r') as fh:
debian_version = fh.read().strip()
# Get a list of dpkg packages for analysis
retcode, stdout, stderr = pvc_common.run_os_command("/usr/bin/dpkg --list")
# Get a list of installed packages and states
packages = list()
for dpkg_line in stdout.split('\n'):
if match('^[a-z][a-z] ', dpkg_line):
line_split = dpkg_line.split()
package_state = line_split[0]
package_name = line_split[1]
packages.append((package_name, package_state))
count_ok = 0
count_inconsistent = 0
list_inconsistent = list()
for package in packages:
if package[1] == "ii":
count_ok += 1
else:
count_inconsistent += 1
list_inconsistent.append(package[0])
# Get upgradable packages
retcode, stdout, stderr = pvc_common.run_os_command("/usr/bin/apt list --upgradable")
list_upgradable = list()
for apt_line in stdout.split('\n'):
# Match package lines of the form "name/suite version arch [upgradable from: ...]"
if match('^[^ ]+/', apt_line):
line_split = apt_line.split('/')
package_name = line_split[0]
list_upgradable.append(package_name)
count_upgradable = len(list_upgradable)
# Get obsolete config files (dpkg-* or ucf-* under /etc)
retcode, stdout, stderr = pvc_common.run_os_command("/usr/bin/find /etc -type f -a \( -name '*.dpkg-*' -o -name '*.ucf-*' \)")
obsolete_conffiles = list()
for conffile_line in stdout.split('\n'):
if conffile_line:
obsolete_conffiles.append(conffile_line)
count_obsolete_conffiles = len(obsolete_conffiles)
# Set health_delta based on the results
health_delta = 0
if count_inconsistent > 0:
health_delta += 1
if count_upgradable > 0:
health_delta += 1
if count_obsolete_conffiles > 0:
health_delta += 1
# Set the health delta in our local PluginResult object
self.plugin_result.set_health_delta(health_delta)
# Craft the message
message = f"Debian {debian_version}; Obsolete conffiles: {count_obsolete_conffiles}; Packages inconsistent: {count_inconsistent}, upgradable: {count_upgradable}"
# Set the message in our local PluginResult object
self.plugin_result.set_message(message)
# Set the detailed data in our local PluginResult object
detailed_data = {
"debian_version": debian_version,
"obsolete_conffiles": obsolete_conffiles,
"inconsistent_packages": list_inconsistent,
"upgradable_packages": list_upgradable,
}
self.plugin_result.set_data(detailed_data)
# Return our local PluginResult object
return self.plugin_result
def cleanup(self):
"""
cleanup(): Perform special cleanup steps during node daemon termination
This step is optional and should be used sparingly.
"""
pass

106
node-daemon/plugins/edac Normal file
View File

@ -0,0 +1,106 @@
#!/usr/bin/env python3
# edac.py - PVC Monitoring example plugin for EDAC
# Part of the Parallel Virtual Cluster (PVC) system
#
# Copyright (C) 2018-2022 Joshua M. Boniface <joshua@boniface.me>
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, version 3.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.
#
###############################################################################
# This script provides an example of a PVC monitoring plugin script. It will create
# a simple plugin to check the system's EDAC registers and report any failures.
# This script can thus be used as an example or reference implementation of a
# PVC monitoring plugin script and expanded upon as required.
# A monitoring plugin script must implement the class "MonitoringPluginScript" which
# extends "MonitoringPlugin", providing the 3 functions indicated. Detailed explanation
# of the role of each function is provided in context of the example; see the other
# examples for more potential uses.
# WARNING:
#
# This script will run in the context of the node daemon keepalives as root.
# DO NOT install untrusted, unvetted plugins under any circumstances.
# This import is always required here, as MonitoringPlugin is used by the
# MonitoringPluginScript class
from pvcnoded.objects.MonitoringInstance import MonitoringPlugin
# A monitoring plugin script must always expose its nice name, which must be identical to
# the file name
PLUGIN_NAME = "edac"
# The MonitoringPluginScript class must be named as such, and extend MonitoringPlugin.
class MonitoringPluginScript(MonitoringPlugin):
def setup(self):
"""
setup(): Perform special setup steps during node daemon startup
This step is optional and should be used sparingly.
If you wish for the plugin to not load in certain conditions, do any checks here
and return a non-None failure message to indicate the error.
"""
pass
def run(self):
"""
run(): Perform the check actions and return a PluginResult object
"""
# Run any imports first
import daemon_lib.common as common
from re import match, search
# Get edac-util output
retcode, stdout, stderr = common.run_os_command('/usr/bin/edac-util')
# If there's no errors, we're OK
if match(r'^edac-util: No errors to report.', stdout):
health_delta = 0
message = "EDAC reports no errors"
else:
health_delta = 0
message = "EDAC reports errors: "
errors = list()
for line in stdout.split('\n'):
if match(r'^mc[0-9]: csrow', line):
if 'Uncorrected' in line:
health_delta = 50
errors.append(' '.join(line.split()[2:]))
message += ', '.join(errors)
# Set the health delta in our local PluginResult object
self.plugin_result.set_health_delta(health_delta)
# Set the message in our local PluginResult object
self.plugin_result.set_message(message)
# Return our local PluginResult object
return self.plugin_result
def cleanup(self):
"""
cleanup(): Perform special cleanup steps during node daemon termination
This step is optional and should be used sparingly.
"""
pass

106
node-daemon/plugins/ipmi Normal file
View File

@ -0,0 +1,106 @@
#!/usr/bin/env python3
# ipmi.py - PVC Monitoring example plugin for IPMI
# Part of the Parallel Virtual Cluster (PVC) system
#
# Copyright (C) 2018-2022 Joshua M. Boniface <joshua@boniface.me>
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, version 3.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.
#
###############################################################################
# This script provides an example of a PVC monitoring plugin script. It will create
# a simple plugin to check whether the system IPMI is reachable.
# This script can thus be used as an example or reference implementation of a
# PVC monitoring plugin script and expanded upon as required.
# A monitoring plugin script must implement the class "MonitoringPluginScript" which
# extends "MonitoringPlugin", providing the 3 functions indicated. Detailed explanation
# of the role of each function is provided in context of the example; see the other
# examples for more potential uses.
# WARNING:
#
# This script will run in the context of the node daemon keepalives as root.
# DO NOT install untrusted, unvetted plugins under any circumstances.
# This import is always required here, as MonitoringPlugin is used by the
# MonitoringPluginScript class
from pvcnoded.objects.MonitoringInstance import MonitoringPlugin
# A monitoring plugin script must always expose its nice name, which must be identical to
# the file name
PLUGIN_NAME = "ipmi"
# The MonitoringPluginScript class must be named as such, and extend MonitoringPlugin.
class MonitoringPluginScript(MonitoringPlugin):
def setup(self):
"""
setup(): Perform special setup steps during node daemon startup
This step is optional and should be used sparingly.
If you wish for the plugin to not load in certain conditions, do any checks here
and return a non-None failure message to indicate the error.
"""
pass
def run(self):
"""
run(): Perform the check actions and return a PluginResult object
"""
# Run any imports first
from daemon_lib.common import run_os_command
# Check the node's IPMI interface
ipmi_hostname = self.config["ipmi_hostname"]
ipmi_username = self.config["ipmi_username"]
ipmi_password = self.config["ipmi_password"]
retcode, _, _ = run_os_command(
f"/usr/bin/ipmitool -I lanplus -H {ipmi_hostname} -U {ipmi_username} -P {ipmi_password} chassis power status"
)
if retcode > 0:
# Set the health delta to 10 (subtract 10 from the total of 100)
health_delta = 10
# Craft a message that can be used by the clients
message = f"IPMI via {ipmi_username}@{ipmi_hostname} is NOT responding"
else:
# Set the health delta to 0 (no change)
health_delta = 0
# Craft a message that can be used by the clients
message = f"IPMI via {ipmi_username}@{ipmi_hostname} is responding"
# Set the health delta in our local PluginResult object
self.plugin_result.set_health_delta(health_delta)
# Set the message in our local PluginResult object
self.plugin_result.set_message(message)
# Return our local PluginResult object
return self.plugin_result
def cleanup(self):
"""
cleanup(): Perform special cleanup steps during node daemon termination
This step is optional and should be used sparingly.
"""
pass

105
node-daemon/plugins/lbvt Normal file
View File

@ -0,0 +1,105 @@
#!/usr/bin/env python3
# lbvt.py - PVC Monitoring example plugin for Libvirtd
# Part of the Parallel Virtual Cluster (PVC) system
#
# Copyright (C) 2018-2022 Joshua M. Boniface <joshua@boniface.me>
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, version 3.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.
#
###############################################################################
# This script provides an example of a PVC monitoring plugin script. It will create
# a simple plugin to check the Libvirt daemon instance on the node for operation.
# This script can thus be used as an example or reference implementation of a
# PVC monitoring plugin script and expanded upon as required.
# A monitoring plugin script must implement the class "MonitoringPluginScript" which
# extends "MonitoringPlugin", providing the 3 functions indicated. Detailed explanation
# of the role of each function is provided in context of the example; see the other
# examples for more potential uses.
# WARNING:
#
# This script will run in the context of the node daemon keepalives as root.
# DO NOT install untrusted, unvetted plugins under any circumstances.
# This import is always required here, as MonitoringPlugin is used by the
# MonitoringPluginScript class
from pvcnoded.objects.MonitoringInstance import MonitoringPlugin
# A monitoring plugin script must always expose its nice name, which must be identical to
# the file name
PLUGIN_NAME = "lbvt"
# The MonitoringPluginScript class must be named as such, and extend MonitoringPlugin.
class MonitoringPluginScript(MonitoringPlugin):
def setup(self):
"""
setup(): Perform special setup steps during node daemon startup
This step is optional and should be used sparingly.
If you wish for the plugin to not load in certain conditions, do any checks here
and return a non-None failure message to indicate the error.
"""
pass
def run(self):
"""
run(): Perform the check actions and return a PluginResult object
"""
# Run any imports first
from libvirt import openReadOnly as lvopen
lv_conn = None
# Set the health delta to 0 (no change)
health_delta = 0
# Craft a message that can be used by the clients
message = "Successfully connected to Libvirtd on localhost"
# Check the Libvirtd connection
try:
lv_conn = lvopen(f"qemu+tcp://{self.this_node.name}/system")
data = lv_conn.getHostname()
except Exception as e:
health_delta = 50
message = f"Failed to connect to Libvirtd: {e}"
finally:
if lv_conn is not None:
lv_conn.close()
# Set the health delta in our local PluginResult object
self.plugin_result.set_health_delta(health_delta)
# Set the message in our local PluginResult object
self.plugin_result.set_message(message)
# Return our local PluginResult object
return self.plugin_result
def cleanup(self):
"""
cleanup(): Perform special cleanup steps during node daemon termination
This step is optional and should be used sparingly.
"""
pass

107
node-daemon/plugins/load Normal file
View File

@ -0,0 +1,107 @@
#!/usr/bin/env python3
# load.py - PVC Monitoring example plugin for load
# Part of the Parallel Virtual Cluster (PVC) system
#
# Copyright (C) 2018-2022 Joshua M. Boniface <joshua@boniface.me>
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, version 3.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.
#
###############################################################################
# This script provides an example of a PVC monitoring plugin script. It will create
# a simple plugin to check the system load against the total number of CPU cores.
# This script can thus be used as an example or reference implementation of a
# PVC monitoring plugin script and expanded upon as required.
# A monitoring plugin script must implement the class "MonitoringPluginScript" which
# extends "MonitoringPlugin", providing the 3 functions indicated. Detailed explanation
# of the role of each function is provided in context of the example; see the other
# examples for more potential uses.
# WARNING:
#
# This script will run in the context of the node daemon keepalives as root.
# DO NOT install untrusted, unvetted plugins under any circumstances.
# This import is always required here, as MonitoringPlugin is used by the
# MonitoringPluginScript class
from pvcnoded.objects.MonitoringInstance import MonitoringPlugin
# A monitoring plugin script must always expose its nice name, which must be identical to
# the file name
PLUGIN_NAME = "load"
# The MonitoringPluginScript class must be named as such, and extend MonitoringPlugin.
class MonitoringPluginScript(MonitoringPlugin):
def setup(self):
"""
setup(): Perform special setup steps during node daemon startup
This step is optional and should be used sparingly.
If you wish for the plugin to not load in certain conditions, do any checks here
and return a non-None failure message to indicate the error.
"""
pass
def run(self):
"""
run(): Perform the check actions and return a PluginResult object
"""
# Run any imports first
from os import getloadavg
from psutil import cpu_count
# Get the current 1-minute system load average
load_average = getloadavg()[0]
# Get the number of CPU cores
cpu_cores = cpu_count()
        # Check whether the load average exceeds the number of CPU cores
if load_average > float(cpu_cores):
            # Set the health delta to 50 (subtract 50 from the total of 100)
health_delta = 50
# Craft a message that can be used by the clients
message = f"Current load is {load_average} out of {cpu_cores} CPU cores"
else:
# Set the health delta to 0 (no change)
health_delta = 0
# Craft a message that can be used by the clients
message = f"Current load is {load_average} out of {cpu_cores} CPU cores"
# Set the health delta in our local PluginResult object
self.plugin_result.set_health_delta(health_delta)
# Set the message in our local PluginResult object
self.plugin_result.set_message(message)
# Return our local PluginResult object
return self.plugin_result
def cleanup(self):
"""
cleanup(): Perform special cleanup steps during node daemon termination
This step is optional and should be used sparingly.
"""
pass
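
For context on how this delta is consumed: the node daemon's MonitoringInstance (included later in this commit) subtracts each plugin's health_delta from a baseline node health of 100% and clamps the result at 0, so the deltas of all loaded plugins combine into the node health percentage. A worked example of that arithmetic, with illustrative plugin names and values:

deltas = {"load": 50, "nics": 10, "psql": 0}  # per-plugin health deltas
node_health = max(0, 100 - sum(deltas.values()))  # -> 40 (%)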

196
node-daemon/plugins/nics Normal file

@ -0,0 +1,196 @@
#!/usr/bin/env python3
# nics.py - PVC Monitoring example plugin for NIC interfaces
# Part of the Parallel Virtual Cluster (PVC) system
#
# Copyright (C) 2018-2022 Joshua M. Boniface <joshua@boniface.me>
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, version 3.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.
#
###############################################################################
# This script provides an example of a PVC monitoring plugin script. It will create
# a simple plugin to check the network interfaces of the host, specifically for speed
# and 802.3ad status (if applicable).
# This script can thus be used as an example or reference implementation of a
# PVC monitoring plugin script and expanded upon as required.
# A monitoring plugin script must implement the class "MonitoringPluginScript" which
# extends "MonitoringPlugin", providing the 3 functions indicated. Detailed explanation
# of the role of each function is provided in context of the example; see the other
# examples for more potential uses.
# WARNING:
#
# This script will run in the context of the node daemon keepalives as root.
# DO NOT install untrusted, unvetted plugins under any circumstances.
# This import is always required here, as MonitoringPlugin is used by the
# MonitoringPluginScript class
from pvcnoded.objects.MonitoringInstance import MonitoringPlugin
# A monitoring plugin script must always expose its nice name, which must be identical to
# the file name
PLUGIN_NAME = "nics"
# The MonitoringPluginScript class must be named as such, and extend MonitoringPlugin.
class MonitoringPluginScript(MonitoringPlugin):
def setup(self):
"""
setup(): Perform special setup steps during node daemon startup
This step is optional and should be used sparingly.
If you wish for the plugin to not load in certain conditions, do any checks here
and return a non-None failure message to indicate the error.
"""
pass
def run(self):
"""
run(): Perform the check actions and return a PluginResult object
"""
# Run any imports first
import daemon_lib.common as common
from re import match, search, findall
messages = list()
health_delta = 0
# Get a list of the various underlying devices
_core_nics = set()
for dev in [
self.config['bridge_dev'],
self.config['upstream_dev'],
self.config['cluster_dev'],
self.config['storage_dev'],
]:
with open(f'/sys/class/net/{dev}/uevent', 'r') as uevent:
_devtype = uevent.readlines()[0].split('=')[-1].strip()
if _devtype == 'vlan':
with open(f"/proc/net/vlan/{dev}") as devfh:
vlan_info = devfh.read().split('\n')
for line in vlan_info:
if match(r'^Device:', line):
dev = line.split()[-1]
_core_nics.add(dev)
core_nics = sorted(list(_core_nics))
for dev in core_nics:
with open(f'/sys/class/net/{dev}/uevent', 'r') as uevent:
_devtype = uevent.readlines()[0].split('=')[-1].strip()
if _devtype == "bond":
syspath = f"/proc/net/bonding/{dev}"
with open(syspath) as devfh:
bonding_stats = devfh.read()
_, _mode, _info, *_slaves = bonding_stats.split('\n\n')
slave_interfaces = list()
for slavedev in _slaves:
lines = slavedev.split('\n')
for line in lines:
if match(r'^Slave Interface:', line):
interface_name = line.split()[-1]
if match(r'^MII Status:', line):
interface_status = line.split()[-1]
if match(r'^Speed:', line):
try:
interface_speed_mbps = int(line.split()[-2])
except Exception:
interface_speed_mbps = 0
if match(r'^Duplex:', line):
interface_duplex = line.split()[-1]
slave_interfaces.append((interface_name, interface_status, interface_speed_mbps, interface_duplex))
# Ensure at least 2 slave interfaces are up
slave_interface_up_count = 0
for slave_interface in slave_interfaces:
if slave_interface[1] == 'up':
slave_interface_up_count += 1
if slave_interface_up_count < 2:
messages.append(f"{dev} DEGRADED with {slave_interface_up_count} active slaves")
health_delta += 10
else:
messages.append(f"{dev} OK with {slave_interface_up_count} active slaves")
# Get ethtool supported speeds for slave interfaces
supported_link_speeds = set()
for slave_interface in slave_interfaces:
slave_dev = slave_interface[0]
_, ethtool_stdout, _ = common.run_os_command(f"ethtool {slave_dev}")
in_modes = False
for line in ethtool_stdout.split('\n'):
if search('Supported link modes:', line):
in_modes = True
if search('Supported pause frame use:', line):
in_modes = False
break
if in_modes:
speed = int(findall(r'\d+', line.split()[-1])[0])
supported_link_speeds.add(speed)
else:
# Get ethtool supported speeds for interface
supported_link_speeds = set()
_, ethtool_stdout, _ = common.run_os_command(f"ethtool {dev}")
in_modes = False
for line in ethtool_stdout.split('\n'):
if search('Supported link modes:', line):
in_modes = True
if search('Supported pause frame use:', line):
in_modes = False
break
if in_modes:
speed = int(line.split()[-1].replace('baseT', '').split('/')[0])
supported_link_speeds.add(speed)
max_supported_link_speed = sorted(list(supported_link_speeds))[-1]
# Ensure interface is running at its maximum speed
with open(f"/sys/class/net/{dev}/speed") as devfh:
dev_speed = int(devfh.read())
if dev_speed < max_supported_link_speed:
messages.append(f"{dev} DEGRADED at {dev_speed} Mbps")
health_delta += 10
else:
messages.append(f"{dev} OK at {dev_speed} Mbps")
# Set the health delta in our local PluginResult object
self.plugin_result.set_health_delta(health_delta)
# Set the message in our local PluginResult object
self.plugin_result.set_message(', '.join(messages))
# Return our local PluginResult object
return self.plugin_result
def cleanup(self):
"""
cleanup(): Perform special cleanup steps during node daemon termination
This step is optional and should be used sparingly.
"""
pass
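
The bond parsing above can be sanity-checked against a canned /proc/net/bonding excerpt rather than a live bond. A sketch only, suitable for appending to the bottom of the plugin file: the guard keeps it inert under the daemon, and the sample text is illustrative rather than captured from a real system.

if __name__ == "__main__":
    from re import match as _match

    sample = (
        "Ethernet Channel Bonding Driver: v5.10\n\n"
        "Bonding Mode: IEEE 802.3ad Dynamic link aggregation\n\n"
        "Slave Interface: eno1\nMII Status: up\nSpeed: 10000 Mbps\nDuplex: full\n\n"
        "Slave Interface: eno2\nMII Status: down\nSpeed: 10000 Mbps\nDuplex: full"
    )
    # Split off the driver and mode sections, keeping only the slave sections
    _, _mode, *slaves = sample.split("\n\n")
    up_count = 0
    for block in slaves:
        for line in block.split("\n"):
            if _match(r"^MII Status:", line) and line.split()[-1] == "up":
                up_count += 1
    # One of two slaves is up, which run() above would report as DEGRADED
    print(f"active slaves: {up_count}")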

139
node-daemon/plugins/psql Normal file

@ -0,0 +1,139 @@
#!/usr/bin/env python3
# psql.py - PVC Monitoring example plugin for Postgres/Patroni
# Part of the Parallel Virtual Cluster (PVC) system
#
# Copyright (C) 2018-2022 Joshua M. Boniface <joshua@boniface.me>
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, version 3.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.
#
###############################################################################
# This script provides an example of a PVC monitoring plugin script. It will create
# a simple plugin to check the Patroni PostgreSQL instance on the node for operation.
# This script can thus be used as an example or reference implementation of a
# PVC monitoring plugin script and expanded upon as required.
# A monitoring plugin script must implement the class "MonitoringPluginScript" which
# extends "MonitoringPlugin", providing the 3 functions indicated. Detailed explanation
# of the role of each function is provided in context of the example; see the other
# examples for more potential uses.
# WARNING:
#
# This script will run in the context of the node daemon keepalives as root.
# DO NOT install untrusted, unvetted plugins under any circumstances.
# This import is always required here, as MonitoringPlugin is used by the
# MonitoringPluginScript class
from pvcnoded.objects.MonitoringInstance import MonitoringPlugin
# A monitoring plugin script must always expose its nice name, which must be identical to
# the file name
PLUGIN_NAME = "psql"
# The MonitoringPluginScript class must be named as such, and extend MonitoringPlugin.
class MonitoringPluginScript(MonitoringPlugin):
def setup(self):
"""
setup(): Perform special setup steps during node daemon startup
This step is optional and should be used sparingly.
"""
pass
def run(self):
"""
run(): Perform the check actions and return a PluginResult object
"""
# Run any imports first
from psycopg2 import connect
conn_metadata = None
cur_metadata = None
        conn_pdns = None
        cur_pdns = None
# Set the health delta to 0 (no change)
health_delta = 0
# Craft a message that can be used by the clients
message = "Successfully connected to PostgreSQL databases on localhost"
# Check the Metadata database (primary)
try:
conn_metadata = connect(
host=self.this_node.name,
port=self.config["metadata_postgresql_port"],
dbname=self.config["metadata_postgresql_dbname"],
user=self.config["metadata_postgresql_user"],
password=self.config["metadata_postgresql_password"],
)
cur_metadata = conn_metadata.cursor()
cur_metadata.execute("""SELECT * FROM alembic_version""")
data = cur_metadata.fetchone()
except Exception as e:
health_delta = 50
err = str(e).split('\n')[0]
message = f"Failed to connect to PostgreSQL database {self.config['metadata_postgresql_dbname']}: {err}"
finally:
if cur_metadata is not None:
cur_metadata.close()
if conn_metadata is not None:
conn_metadata.close()
if health_delta == 0:
# Check the PowerDNS database (secondary)
try:
conn_pdns = connect(
host=self.this_node.name,
port=self.config["pdns_postgresql_port"],
dbname=self.config["pdns_postgresql_dbname"],
user=self.config["pdns_postgresql_user"],
password=self.config["pdns_postgresql_password"],
)
cur_pdns = conn_pdns.cursor()
cur_pdns.execute("""SELECT * FROM supermasters""")
data = cur_pdns.fetchone()
except Exception as e:
health_delta = 50
err = str(e).split('\n')[0]
message = f"Failed to connect to PostgreSQL database {self.config['pdns_postgresql_dbname']}: {err}"
finally:
if cur_pdns is not None:
cur_pdns.close()
if conn_pdns is not None:
conn_pdns.close()
# Set the health delta in our local PluginResult object
self.plugin_result.set_health_delta(health_delta)
# Set the message in our local PluginResult object
self.plugin_result.set_message(message)
# Return our local PluginResult object
return self.plugin_result
def cleanup(self):
"""
cleanup(): Perform special cleanup steps during node daemon termination
This step is optional and should be used sparingly.
"""
pass

107
node-daemon/plugins/zkpr Normal file
View File

@ -0,0 +1,107 @@
#!/usr/bin/env python3
# zkpr.py - PVC Monitoring example plugin for Zookeeper
# Part of the Parallel Virtual Cluster (PVC) system
#
# Copyright (C) 2018-2022 Joshua M. Boniface <joshua@boniface.me>
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, version 3.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.
#
###############################################################################
# This script provides an example of a PVC monitoring plugin script. It will create
# a simple plugin to check the Zookeeper instance on the node for operation.
# This script can thus be used as an example or reference implementation of a
# PVC monitoring plugin script and expanded upon as required.
# A monitoring plugin script must implement the class "MonitoringPluginScript" which
# extends "MonitoringPlugin", providing the 3 functions indicated. Detailed explanation
# of the role of each function is provided in context of the example; see the other
# examples for more potential uses.
# WARNING:
#
# This script will run in the context of the node daemon keepalives as root.
# DO NOT install untrusted, unvetted plugins under any circumstances.
# This import is always required here, as MonitoringPlugin is used by the
# MonitoringPluginScript class
from pvcnoded.objects.MonitoringInstance import MonitoringPlugin
# A monitoring plugin script must always expose its nice name, which must be identical to
# the file name
PLUGIN_NAME = "zkpr"
# The MonitoringPluginScript class must be named as such, and extend MonitoringPlugin.
class MonitoringPluginScript(MonitoringPlugin):
def setup(self):
"""
setup(): Perform special setup steps during node daemon startup
This step is optional and should be used sparingly.
        If you wish for the plugin to not load in certain conditions, do any checks here
and return a non-None failure message to indicate the error.
"""
pass
def run(self):
"""
run(): Perform the check actions and return a PluginResult object
"""
# Run any imports first
from kazoo.client import KazooClient, KazooState
zk_conn = None
# Set the health delta to 0 (no change)
health_delta = 0
# Craft a message that can be used by the clients
message = "Successfully connected to Zookeeper on localhost"
# Check the Zookeeper connection
try:
zk_conn = KazooClient(hosts=[f"{self.this_node.name}:2181"], timeout=1, read_only=True)
zk_conn.start(timeout=1)
data = zk_conn.get('/primary_node')
except Exception as e:
health_delta = 50
message = f"Failed to connect to Zookeeper: {e}"
finally:
if zk_conn is not None:
zk_conn.stop()
zk_conn.close()
# Set the health delta in our local PluginResult object
self.plugin_result.set_health_delta(health_delta)
# Set the message in our local PluginResult object
self.plugin_result.set_message(message)
# Return our local PluginResult object
return self.plugin_result
def cleanup(self):
"""
cleanup(): Perform special cleanup steps during node daemon termination
This step is optional and should be used sparingly.
"""
pass


@ -128,6 +128,8 @@ pvc:
     configuration:
       # directories: PVC system directories
       directories:
+        # plugin_directory: Directory containing node monitoring plugins
+        plugin_directory: "/usr/share/pvc/plugins"
         # dynamic_directory: Temporary in-memory directory for active configurations
         dynamic_directory: "/run/pvc"
         # log_directory: Logging directory
@ -150,8 +152,8 @@ pvc:
       log_keepalives: True
       # log_keepalive_cluster_details: Enable or disable node status logging during keepalive
       log_keepalive_cluster_details: True
-      # log_keepalive_storage_details: Enable or disable node storage logging during keepalive
-      log_keepalive_storage_details: True
+      # log_keepalive_plugin_details: Enable or disable node health plugin logging during keepalive
+      log_keepalive_plugin_details: True
       # console_log_lines: Number of console log lines to store in Zookeeper per VM
       console_log_lines: 1000
       # node_log_lines: Number of node log lines to store in Zookeeper per node


@ -27,6 +27,7 @@ import pvcnoded.util.services
 import pvcnoded.util.libvirt
 import pvcnoded.util.zookeeper
 
+import pvcnoded.objects.MonitoringInstance as MonitoringInstance
 import pvcnoded.objects.DNSAggregatorInstance as DNSAggregatorInstance
 import pvcnoded.objects.MetadataAPIInstance as MetadataAPIInstance
 import pvcnoded.objects.VMInstance as VMInstance
@ -58,6 +59,7 @@ version = "0.9.61"
 def entrypoint():
     keepalive_timer = None
+    monitoring_instance = None
 
     # Get our configuration
     config = pvcnoded.util.config.get_configuration()
@ -204,7 +206,7 @@ def entrypoint():
     # Define a cleanup function
     def cleanup(failure=False):
-        nonlocal logger, zkhandler, keepalive_timer, d_domain
+        nonlocal logger, zkhandler, keepalive_timer, d_domain, monitoring_instance
 
         logger.out("Terminating pvcnoded and cleaning up", state="s")
@ -253,6 +255,13 @@ def entrypoint():
         except Exception:
             pass
 
+        # Clean up any monitoring plugins that have cleanup
+        try:
+            logger.out("Performing monitoring plugin cleanup", state="s")
+            monitoring_instance.run_cleanups()
+        except Exception:
+            pass
+
         # Set stop state in Zookeeper
         zkhandler.write([(("node.state.daemon", config["node_hostname"]), "stop")])
@ -1015,9 +1024,14 @@ def entrypoint():
         state="i",
     )
 
+    # Set up the node monitoring instance
+    monitoring_instance = MonitoringInstance.MonitoringInstance(
+        zkhandler, config, logger, this_node
+    )
+
     # Start keepalived thread
     keepalive_timer = pvcnoded.util.keepalive.start_keepalive_timer(
-        logger, config, zkhandler, this_node
+        logger, config, zkhandler, this_node, monitoring_instance
     )
 
     # Tick loop; does nothing since everything is async


@ -0,0 +1,412 @@
#!/usr/bin/env python3
# MonitoringInstance.py - Class implementing a PVC monitoring instance
# Part of the Parallel Virtual Cluster (PVC) system
#
# Copyright (C) 2018-2022 Joshua M. Boniface <joshua@boniface.me>
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, version 3.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.
#
###############################################################################
import concurrent.futures
import time
import importlib.machinery
import importlib.util
from os import walk
from datetime import datetime
from json import dumps
class PluginError(Exception):
"""
An exception that results from a plugin failing setup
"""
pass
class PluginResult(object):
def __init__(self, zkhandler, config, logger, this_node, plugin_name):
self.zkhandler = zkhandler
self.config = config
self.logger = logger
self.this_node = this_node
self.plugin_name = plugin_name
self.current_time = int(time.time())
self.health_delta = 0
self.message = "N/A"
self.data = {}
self.runtime = "0.00"
def set_health_delta(self, new_delta):
self.health_delta = new_delta
def set_message(self, new_message):
self.message = new_message
def set_data(self, new_data):
self.data = new_data
def set_runtime(self, new_runtime):
self.runtime = new_runtime
def to_zookeeper(self):
self.zkhandler.write(
[
(
(
"node.monitoring.data",
self.this_node.name,
"monitoring_plugin.name",
self.plugin_name,
),
self.plugin_name,
),
(
(
"node.monitoring.data",
self.this_node.name,
"monitoring_plugin.last_run",
self.plugin_name,
),
self.current_time,
),
(
(
"node.monitoring.data",
self.this_node.name,
"monitoring_plugin.health_delta",
self.plugin_name,
),
self.health_delta,
),
(
(
"node.monitoring.data",
self.this_node.name,
"monitoring_plugin.message",
self.plugin_name,
),
self.message,
),
(
(
"node.monitoring.data",
self.this_node.name,
"monitoring_plugin.data",
self.plugin_name,
),
dumps(self.data),
),
(
(
"node.monitoring.data",
self.this_node.name,
"monitoring_plugin.runtime",
self.plugin_name,
),
self.runtime,
),
]
)
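
# For reference, the writes above land in a per-node, per-plugin hierarchy;
# rendered illustratively below (the exact paths are defined by the zkhandler
# schema, not by this class):
#
#   <node>/monitoring_plugin/<plugin>/name
#   <node>/monitoring_plugin/<plugin>/last_run
#   <node>/monitoring_plugin/<plugin>/health_delta
#   <node>/monitoring_plugin/<plugin>/message
#   <node>/monitoring_plugin/<plugin>/data
#   <node>/monitoring_plugin/<plugin>/runtime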
class MonitoringPlugin(object):
def __init__(self, zkhandler, config, logger, this_node, plugin_name):
self.zkhandler = zkhandler
self.config = config
self.logger = logger
self.this_node = this_node
self.plugin_name = plugin_name
self.plugin_result = PluginResult(
self.zkhandler,
self.config,
self.logger,
self.this_node,
self.plugin_name,
)
def __str__(self):
return self.plugin_name
#
# Helper functions; exposed to child MonitoringPluginScript instances
#
def log(self, message, state="d"):
"""
Log a message to the PVC logger instance using the plugin name as a prefix
Takes "state" values as defined by the PVC logger instance, defaulting to debug:
"d": debug
"i": informational
"t": tick/keepalive
"w": warning
"e": error
"""
if state == "d" and not self.config["debug"]:
return
self.logger.out(message, state=state, prefix=self.plugin_name)
#
# Primary class functions; implemented by the individual plugins
#
def setup(self):
"""
setup(): Perform setup of the plugin; run once during daemon startup
This step is optional and should be used sparingly.
If you wish for the plugin to not load in certain conditions, do any checks here
and return a non-None failure message to indicate the error.
"""
pass
def run(self):
"""
run(): Run the plugin, returning a PluginResult object
"""
return self.plugin_result
def cleanup(self):
"""
cleanup(): Clean up after the plugin; run once during daemon shutdown
OPTIONAL
"""
pass
class MonitoringInstance(object):
def __init__(self, zkhandler, config, logger, this_node):
self.zkhandler = zkhandler
self.config = config
self.logger = logger
self.this_node = this_node
# Get a list of plugins from the plugin_directory
plugin_files = next(walk(self.config["plugin_directory"]), (None, None, []))[
2
] # [] if no file
self.all_plugins = list()
self.all_plugin_names = list()
successful_plugins = 0
# Load each plugin file into the all_plugins list
for plugin_file in sorted(plugin_files):
try:
self.logger.out(
f"Loading monitoring plugin from {self.config['plugin_directory']}/{plugin_file}",
state="i",
)
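                # Dynamically load the plugin file as a Python module from its
                # full on-disk path using a SourceFileLoader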
loader = importlib.machinery.SourceFileLoader(
"plugin_script", f"{self.config['plugin_directory']}/{plugin_file}"
)
spec = importlib.util.spec_from_loader(loader.name, loader)
plugin_script = importlib.util.module_from_spec(spec)
spec.loader.exec_module(plugin_script)
plugin = plugin_script.MonitoringPluginScript(
self.zkhandler,
self.config,
self.logger,
self.this_node,
plugin_script.PLUGIN_NAME,
)
failed_setup = plugin.setup()
if failed_setup is not None:
raise PluginError(f"{failed_setup}")
# Create plugin key
self.zkhandler.write(
[
(
(
"node.monitoring.data",
self.this_node.name,
"monitoring_plugin.name",
plugin.plugin_name,
),
plugin.plugin_name,
),
(
(
"node.monitoring.data",
self.this_node.name,
"monitoring_plugin.last_run",
plugin.plugin_name,
),
"0",
),
(
(
"node.monitoring.data",
self.this_node.name,
"monitoring_plugin.health_delta",
plugin.plugin_name,
),
"0",
),
(
(
"node.monitoring.data",
self.this_node.name,
"monitoring_plugin.message",
plugin.plugin_name,
),
"Initializing",
),
(
(
"node.monitoring.data",
self.this_node.name,
"monitoring_plugin.data",
plugin.plugin_name,
),
dumps({}),
),
(
(
"node.monitoring.data",
self.this_node.name,
"monitoring_plugin.runtime",
plugin.plugin_name,
),
"0.00",
),
]
)
self.all_plugins.append(plugin)
self.all_plugin_names.append(plugin.plugin_name)
successful_plugins += 1
self.logger.out(
f"Successfully loaded monitoring plugin '{plugin.plugin_name}'",
state="o",
)
except Exception as e:
self.logger.out(
f"Failed to load monitoring plugin: {e}",
state="w",
)
self.zkhandler.write(
[
(
("node.monitoring.plugins", self.this_node.name),
" ".join(self.all_plugin_names),
),
]
)
if successful_plugins < 1:
return
# Clean up any old plugin data for which a plugin file no longer exists
for plugin_key in self.zkhandler.children(
("node.monitoring.data", self.this_node.name)
):
if plugin_key not in self.all_plugin_names:
self.zkhandler.delete(
(
"node.monitoring.data",
self.this_node.name,
"monitoring_plugin",
plugin_key,
)
)
def run_plugin(self, plugin):
time_start = datetime.now()
result = plugin.run()
time_end = datetime.now()
time_delta = time_end - time_start
runtime = "{:0.02f}".format(time_delta.total_seconds())
result.set_runtime(runtime)
result.to_zookeeper()
return result
def run_plugins(self):
total_health = 100
if self.config["log_keepalive_plugin_details"]:
self.logger.out(
f"Running monitoring plugins: {', '.join([x.plugin_name for x in self.all_plugins])}",
state="t",
)
plugin_results = list()
with concurrent.futures.ThreadPoolExecutor(max_workers=99) as executor:
to_future_plugin_results = {
executor.submit(self.run_plugin, plugin): plugin
for plugin in self.all_plugins
}
for future in concurrent.futures.as_completed(to_future_plugin_results):
plugin_results.append(future.result())
        for result in sorted(plugin_results, key=lambda x: x.plugin_name):
            # Guard against missing results before using them for logging or math
            if result is None:
                continue
            if self.config["log_keepalive_plugin_details"]:
                self.logger.out(
                    result.message + f" [-{result.health_delta}]",
                    state="t",
                    prefix=f"{result.plugin_name} ({result.runtime}s)",
                )
            total_health -= result.health_delta
if total_health < 0:
total_health = 0
if total_health > 90:
health_colour = self.logger.fmt_green
elif total_health > 50:
health_colour = self.logger.fmt_yellow
else:
health_colour = self.logger.fmt_red
self.zkhandler.write(
[
(
("node.monitoring.health", self.this_node.name),
total_health,
),
]
)
self.logger.out(
f"Node health: {health_colour}{total_health}%{self.logger.fmt_end}",
state="t",
)
def run_cleanup(self, plugin):
return plugin.cleanup()
def run_cleanups(self):
with concurrent.futures.ThreadPoolExecutor(max_workers=99) as executor:
to_future_plugin_results = {
executor.submit(self.run_cleanup, plugin): plugin
for plugin in self.all_plugins
}
for future in concurrent.futures.as_completed(to_future_plugin_results):
# This doesn't do anything, just lets us wait for them all to complete
pass
# Set the node health to None as no previous checks are now valid
self.zkhandler.write(
[
(
("node.monitoring.health", self.this_node.name),
None,
),
]
)
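
To tie the plugin contract together end to end, here is a complete hypothetical user plugin. The "hold" name, the dpkg check it performs, and its 2-points-per-package health policy are all invented for this example and are not shipped with PVC; a real plugin would be installed as a file named "hold" in the configured plugin_directory and picked up at the next daemon startup.

#!/usr/bin/env python3
# hold - hypothetical example plugin checking for held dpkg packages

from pvcnoded.objects.MonitoringInstance import MonitoringPlugin

PLUGIN_NAME = "hold"


class MonitoringPluginScript(MonitoringPlugin):
    def setup(self):
        # Nothing to verify at daemon startup for this check
        pass

    def run(self):
        # Run any imports first
        import daemon_lib.common as common

        # List package selection states; held packages end with "hold"
        _, stdout, _ = common.run_os_command("dpkg --get-selections")
        held = [x for x in stdout.split("\n") if x.strip().endswith("hold")]

        if held:
            # Deduct 2 health points per held package (assumed policy)
            self.plugin_result.set_health_delta(2 * len(held))
            self.plugin_result.set_message(f"{len(held)} package(s) on hold")
        else:
            self.plugin_result.set_health_delta(0)
            self.plugin_result.set_message("No packages on hold")

        return self.plugin_result

    def cleanup(self):
        pass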


@ -180,6 +180,9 @@ def get_configuration():
         raise MalformedConfigurationError(e)
 
     config_directories = {
+        "plugin_directory": o_directories.get(
+            "plugin_directory", "/usr/share/pvc/plugins"
+        ),
         "dynamic_directory": o_directories.get("dynamic_directory", None),
         "log_directory": o_directories.get("log_directory", None),
         "console_log_directory": o_directories.get("console_log_directory", None),
@ -225,8 +228,8 @@ def get_configuration():
         "log_keepalive_cluster_details": o_logging.get(
             "log_keepalive_cluster_details", False
         ),
-        "log_keepalive_storage_details": o_logging.get(
-            "log_keepalive_storage_details", False
-        ),
+        "log_keepalive_plugin_details": o_logging.get(
+            "log_keepalive_plugin_details", False
+        ),
         "console_log_lines": o_logging.get("console_log_lines", False),
         "node_log_lines": o_logging.get("node_log_lines", False),


@ -51,7 +51,7 @@ libvirt_vm_states = {
 }
 
-def start_keepalive_timer(logger, config, zkhandler, this_node):
+def start_keepalive_timer(logger, config, zkhandler, this_node, monitoring_instance):
     keepalive_interval = config["keepalive_interval"]
     logger.out(
         f"Starting keepalive timer ({keepalive_interval} second interval)", state="s"
@ -59,7 +59,7 @@ def start_keepalive_timer(logger, config, zkhandler, this_node):
     keepalive_timer = BackgroundScheduler()
     keepalive_timer.add_job(
         node_keepalive,
-        args=(logger, config, zkhandler, this_node),
+        args=(logger, config, zkhandler, this_node, monitoring_instance),
         trigger="interval",
         seconds=keepalive_interval,
     )
@ -97,34 +97,12 @@ def collect_ceph_stats(logger, config, zkhandler, this_node, queue):
         logger.out("Failed to open connection to Ceph cluster: {}".format(e), state="e")
         return
 
-    if debug:
-        logger.out("Getting health stats from monitor", state="d", prefix="ceph-thread")
-
-    # Get Ceph cluster health for local status output
-    command = {"prefix": "health", "format": "json"}
-    try:
-        health_status = json.loads(
-            ceph_conn.mon_command(json.dumps(command), b"", timeout=1)[1]
-        )
-        ceph_health = health_status["status"]
-    except Exception as e:
-        logger.out("Failed to obtain Ceph health data: {}".format(e), state="e")
-        ceph_health = "HEALTH_UNKN"
-
-    if ceph_health in ["HEALTH_OK"]:
-        ceph_health_colour = logger.fmt_green
-    elif ceph_health in ["HEALTH_UNKN"]:
-        ceph_health_colour = logger.fmt_cyan
-    elif ceph_health in ["HEALTH_WARN"]:
-        ceph_health_colour = logger.fmt_yellow
-    else:
-        ceph_health_colour = logger.fmt_red
-
     # Primary-only functions
     if this_node.router_state == "primary":
-        # Get Ceph status information (pretty)
         if debug:
             logger.out(
-                "Set ceph health information in zookeeper (primary only)",
+                "Set Ceph status information in zookeeper (primary only)",
                 state="d",
                 prefix="ceph-thread",
             )
@ -138,9 +116,27 @@ def collect_ceph_stats(logger, config, zkhandler, this_node, queue):
         except Exception as e:
             logger.out("Failed to set Ceph status data: {}".format(e), state="e")
 
+        # Get Ceph health information (JSON)
         if debug:
             logger.out(
-                "Set ceph rados df information in zookeeper (primary only)",
+                "Set Ceph health information in zookeeper (primary only)",
+                state="d",
+                prefix="ceph-thread",
+            )
+
+        command = {"prefix": "health", "format": "json"}
+        ceph_health = ceph_conn.mon_command(json.dumps(command), b"", timeout=1)[
+            1
+        ].decode("ascii")
+        try:
+            zkhandler.write([("base.storage.health", str(ceph_health))])
+        except Exception as e:
+            logger.out("Failed to set Ceph health data: {}".format(e), state="e")
+
+        # Get Ceph df information (pretty)
+        if debug:
+            logger.out(
+                "Set Ceph rados df information in zookeeper (primary only)",
                 state="d",
                 prefix="ceph-thread",
             )
@ -408,8 +404,6 @@ def collect_ceph_stats(logger, config, zkhandler, this_node, queue):
     ceph_conn.shutdown()
 
-    queue.put(ceph_health_colour)
-    queue.put(ceph_health)
     queue.put(osds_this_node)
 
     if debug:
@ -648,7 +642,7 @@ def collect_vm_stats(logger, config, zkhandler, this_node, queue):
 # Keepalive update function
-def node_keepalive(logger, config, zkhandler, this_node):
+def node_keepalive(logger, config, zkhandler, this_node, monitoring_instance):
     debug = config["debug"]
     if debug:
         logger.out("Keepalive starting", state="d", prefix="main-thread")
@ -777,16 +771,14 @@ def node_keepalive(logger, config, zkhandler, this_node):
     if config["enable_storage"]:
         try:
-            ceph_health_colour = ceph_thread_queue.get(
-                timeout=config["keepalive_interval"]
-            )
-            ceph_health = ceph_thread_queue.get(timeout=config["keepalive_interval"])
-            osds_this_node = ceph_thread_queue.get(timeout=config["keepalive_interval"])
+            osds_this_node = ceph_thread_queue.get(
+                timeout=(config["keepalive_interval"] - 1)
+            )
         except Exception:
             logger.out("Ceph stats queue get exceeded timeout, continuing", state="w")
-            ceph_health_colour = logger.fmt_cyan
-            ceph_health = "UNKNOWN"
             osds_this_node = "?"
+    else:
+        osds_this_node = "0"
 
     # Set our information in zookeeper
     keepalive_time = int(time.time())
@ -839,8 +831,8 @@ def node_keepalive(logger, config, zkhandler, this_node):
     if config["log_keepalive_cluster_details"]:
         logger.out(
             "{bold}Maintenance:{nofmt} {maint} "
-            "{bold}Active VMs:{nofmt} {domcount} "
-            "{bold}Networks:{nofmt} {netcount} "
+            "{bold}Node VMs:{nofmt} {domcount} "
+            "{bold}Node OSDs:{nofmt} {osdcount} "
             "{bold}Load:{nofmt} {load} "
             "{bold}Memory [MiB]: VMs:{nofmt} {allocmem} "
             "{bold}Used:{nofmt} {usedmem} "
@ -849,7 +841,7 @@ def node_keepalive(logger, config, zkhandler, this_node):
             nofmt=logger.fmt_end,
             maint=this_node.maintenance,
             domcount=this_node.domains_count,
-            netcount=len(zkhandler.children("base.network")),
+            osdcount=osds_this_node,
             load=this_node.cpuload,
             freemem=this_node.memfree,
             usedmem=this_node.memused,
@ -857,22 +849,6 @@ def node_keepalive(logger, config, zkhandler, this_node):
         ),
         state="t",
     )
 
-    if config["enable_storage"] and config["log_keepalive_storage_details"]:
-        logger.out(
-            "{bold}Ceph cluster status:{nofmt} {health_colour}{health}{nofmt} "
-            "{bold}Total OSDs:{nofmt} {total_osds} "
-            "{bold}Node OSDs:{nofmt} {node_osds} "
-            "{bold}Pools:{nofmt} {total_pools} ".format(
-                bold=logger.fmt_bold,
-                health_colour=ceph_health_colour,
-                nofmt=logger.fmt_end,
-                health=ceph_health,
-                total_osds=len(zkhandler.children("base.osd")),
-                node_osds=osds_this_node,
-                total_pools=len(zkhandler.children("base.pool")),
-            ),
-            state="t",
-        )
-
     # Look for dead nodes and fence them
     if not this_node.maintenance:
@ -918,5 +894,7 @@ def node_keepalive(logger, config, zkhandler, this_node):
                 [(("node.state.daemon", node_name), "dead")]
             )
 
+    monitoring_instance.run_plugins()
+
     if debug:
         logger.out("Keepalive finished", state="d", prefix="main-thread")