Compare commits
10 Commits
7e6d922877...v0.9.84

| Author | SHA1 | Date |
| --- | --- | --- |
| | 9aee2a9075 | |
| | 8f0ae3e2dd | |
| | 946d3eaf43 | |
| | 1f6347d24b | |
| | e8552b471b | |
| | fc443a323b | |
| | b0557edb76 | |
| | 47bd7bf2f5 | |
| | b9fbfe2ed5 | |
| | 764e3e3722 | |
CHANGELOG.md (10 lines changed)
@@ -1,5 +1,15 @@
## PVC Changelog

###### [v0.9.84](https://github.com/parallelvirtualcluster/pvc/releases/tag/v0.9.84)

**Breaking Changes:** This release features a major reconfiguration of how cluster health is monitored and reported. Node health plugins now report "faults" from within the Health daemon, as do several other issues which were previously checked for manually in the "cluster" daemon library for the "/status" endpoint. These faults are persistent: a given identifier is triggered once, and subsequent triggers simply update its "last reported" time. An additional set of API endpoints and commands are added to manage these faults, either by "ack"(nowledging) them (keeping the alert around to be further updated but setting its health delta to 0%) or "delete"ing them (completely removing the fault unless it retriggers), individually, (from the CLI) in groups, or all at once. Cluster health reporting is now based entirely on these faults, and the default interval for health checks is reduced to 15 seconds to accommodate this. In addition, Prometheus metrics have been added for the PVC cluster itself, along with an example Grafana dashboard, as well as a proxy to the Ceph cluster metrics. This release also fixes some bugs in the VM provisioner that were introduced in 0.9.83; these fixes require a **reimport or reconfiguration of any provisioner scripts**; reference the updated examples for details.

* [All] Adds persistent fault reporting to clusters, replacing the old cluster health calculations.
* [API Daemon] Adds cluster-level Prometheus metric exporting as well as a Ceph Prometheus proxy to the API.
* [CLI Client] Improves the formatting of "pvc cluster status" output.
* [Node Daemon] Fixes several bugs in, and improves the operation of, the psql health check plugin.
* [Worker Daemon] Fixes several bugs in the example provisioner scripts, and moves the libvirt_schema library into the daemon common libraries.
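As a rough sketch of the new fault-management endpoints described above (the API address, port, and fault ID below are placeholders, not values from this release, and any required authentication headers are omitted):

```python
import requests

API = "http://pvc.example.tld:7370/api/v1"  # placeholder API address and port
fault_id = "1ab3d21c"                       # placeholder fault ID

# "ack": keep the fault visible but set its health delta to 0%
requests.put(f"{API}/faults/{fault_id}")

# "delete": remove the fault entirely unless it retriggers
requests.delete(f"{API}/faults/{fault_id}")
```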

###### [v0.9.83](https://github.com/parallelvirtualcluster/pvc/releases/tag/v0.9.83)

**Breaking Changes:** This release features a breaking change for the daemon config. A new unified "pvc.conf" file is required for all daemons (and the CLI client for Autobackup and API-on-this-host functionality), which will be written by the "pvc" role in the PVC Ansible framework. Using the "update-pvc-daemons" oneshot playbook from PVC Ansible is **required** to update to this release, as it will ensure this file is written to the proper place before deploying the new package versions, and also that the old entries are cleaned up afterwards. In addition, this release fully splits the node worker and health subsystems into discrete daemons ("pvcworkerd" and "pvchealthd") and packages ("pvc-daemon-worker" and "pvc-daemon-health") respectively. The "pvc-daemon-node" package also now depends on both packages, and the "pvc-daemon-api" package can now be reliably used outside of the PVC nodes themselves (for instance, in a VM) without any strange cross-dependency issues.

@@ -27,7 +27,7 @@ from distutils.util import strtobool as dustrtobool
import daemon_lib.config as cfg

# Daemon version
version = "0.9.83"
version = "0.9.84"

# API version
API_VERSION = 1.0
@@ -538,14 +538,15 @@ def cli_cluster_fault_list(limit, format_function):
name="ack",
short_help="Acknowledge a cluster fault.",
)
@click.argument("fault_id")
@click.argument("fault_id", nargs=-1, required=True)
@connection_req
def cli_cluster_fault_acknowledge(fault_id):
"""
Acknowledge the cluster fault FAULT_ID.
Acknowledge the cluster fault FAULT_ID; multiple FAULT_IDs may be specified.
"""

retcode, retdata = pvc.lib.faults.acknowledge(CLI_CONFIG, fault_id)
faults = list(fault_id)
retcode, retdata = pvc.lib.faults.acknowledge(CLI_CONFIG, faults)
finish(retcode, retdata)
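For context on the `nargs=-1` change above: Click passes a variadic argument to the callback as a tuple of strings and, with `required=True`, demands at least one value, which is why the command now converts it with `list(fault_id)` before handing it to the library. A minimal standalone sketch (the command name is illustrative):

```python
import click

@click.command()
@click.argument("fault_id", nargs=-1, required=True)
def ack(fault_id):
    # fault_id arrives as a tuple, e.g. ("1ab3d21c", "8f2e77aa")
    click.echo(f"Would acknowledge: {list(fault_id)}")

if __name__ == "__main__":
    ack()
```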

@@ -574,14 +575,15 @@ def cli_cluster_fault_acknowledge_all():
name="delete",
short_help="Delete a cluster fault.",
)
@click.argument("fault_id")
@click.argument("fault_id", nargs=-1, required=True)
@connection_req
def cli_cluster_fault_delete(fault_id):
"""
Delete the cluster fault FAULT_ID.
Delete the cluster fault FAULT_ID; multiple FAULT_IDs may be specified.
"""

retcode, retdata = pvc.lib.faults.delete(CLI_CONFIG, fault_id)
faults = list(fault_id)
retcode, retdata = pvc.lib.faults.delete(CLI_CONFIG, faults)
finish(retcode, retdata)

@@ -388,22 +388,22 @@ def cli_cluster_fault_list_format_short(CLI_CONFIG, fault_data):
fault_id_length + fault_status_length + fault_health_delta_length + 2
)
detail_header_length = (
fault_health_delta_length
fault_id_length
+ fault_health_delta_length
+ fault_status_length
+ fault_last_reported_length
+ fault_message_length
+ 3
- meta_header_length
+ 8
)

# Format the string (header)
fault_list_output.append(
"{bold}Meta {meta_dashes} Fault {detail_dashes}{end_bold}".format(
"{bold}Meta {meta_dashes} Fault {detail_dashes}{end_bold}".format(
bold=ansii["bold"],
end_bold=ansii["end"],
meta_dashes="-" * (meta_header_length - len("Meta ")),
detail_dashes="-" * (detail_header_length - len("Fault ")),
meta_dashes="-" * (meta_header_length - len("Meta ")),
detail_dashes="-" * (detail_header_length - len("Fault ")),
)
)
@@ -45,20 +45,29 @@ def get_list(config, limit=None, sort_key="last_reported"):
return False, response.json().get("message", "")


def acknowledge(config, fault_id):
def acknowledge(config, faults):
"""
Acknowledge a PVC fault
Acknowledge one or more PVC faults

API endpoint: PUT /api/v1/faults/<fault_id>
API endpoint: PUT /api/v1/faults/<fault_id> for fault_id in faults
API arguments:
API schema: {json_message}
"""
response = call_api(config, "put", f"/faults/{fault_id}")
status_codes = list()
bad_msgs = list()
for fault_id in faults:
response = call_api(config, "put", f"/faults/{fault_id}")

if response.status_code == 200:
return True, response.json().get("message", "")
if response.status_code == 200:
status_codes.append(True)
else:
status_codes.append(False)
bad_msgs.append(response.json().get("message", ""))

if all(status_codes):
return True, f"Successfully acknowledged fault(s) {', '.join(faults)}"
else:
return False, response.json().get("message", "")
return False, ", ".join(bad_msgs)
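The reworked helper now collects a result per fault and only reports success if every call succeeded, rather than returning after the first response. A standalone sketch of that collect-then-summarize pattern, with a stub standing in for the real `call_api` (the stub and its data are illustrative, not PVC code):

```python
def acknowledge_many(fault_ids, ack_one):
    # ack_one(fault_id) -> (ok, message), standing in for one API call
    statuses, bad_msgs = [], []
    for fault_id in fault_ids:
        ok, msg = ack_one(fault_id)
        statuses.append(ok)
        if not ok:
            bad_msgs.append(msg)
    if all(statuses):
        return True, f"Successfully acknowledged fault(s) {', '.join(fault_ids)}"
    return False, ", ".join(bad_msgs)

# Illustrative stub results keyed by fault ID
results = {"1ab3d21c": (True, "OK"), "8f2e77aa": (False, "No fault with ID 8f2e77aa")}
print(acknowledge_many(["1ab3d21c", "8f2e77aa"], lambda fid: results[fid]))
```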

def acknowledge_all(config):

@@ -77,20 +86,29 @@ def acknowledge_all(config):
return False, response.json().get("message", "")


def delete(config, fault_id):
def delete(config, faults):
"""
Delete a PVC fault
Delete one or more PVC faults

API endpoint: DELETE /api/v1/faults/<fault_id>
API endpoint: DELETE /api/v1/faults/<fault_id> for fault_id in faults
API arguments:
API schema: {json_message}
"""
response = call_api(config, "delete", f"/faults/{fault_id}")
status_codes = list()
bad_msgs = list()
for fault_id in faults:
response = call_api(config, "delete", f"/faults/{fault_id}")

if response.status_code == 200:
return True, response.json().get("message", "")
if response.status_code == 200:
status_codes.append(True)
else:
status_codes.append(False)
bad_msgs.append(response.json().get("message", ""))

if all(status_codes):
return True, f"Successfully deleted fault(s) {', '.join(faults)}"
else:
return False, response.json().get("message", "")
return False, ", ".join(bad_msgs)


def delete_all(config):

@@ -2,7 +2,7 @@ from setuptools import setup

setup(
name="pvc",
version="0.9.83",
version="0.9.84",
packages=["pvc.cli", "pvc.lib"],
install_requires=[
"Click",
@@ -20,7 +20,6 @@
###############################################################################

from datetime import datetime
from hashlib import md5


def generate_fault(

@@ -32,10 +31,6 @@ def generate_fault(
fault_message,
fault_details=None,
):
# Generate a fault ID from the fault_name, fault_delta, and fault_message
fault_str = f"{fault_name} {fault_delta} {fault_message}"
fault_id = str(md5(fault_str.encode("utf-8")).hexdigest())[:8]

# Strip the microseconds off of the fault time; we don't care about that precision
fault_time = str(fault_time).split(".")[0]
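The microsecond strip above relies on the default string form of a `datetime`, which separates sub-second precision with a dot; for example:

```python
from datetime import datetime

fault_time = datetime(2023, 12, 9, 23, 5, 40, 123456)
print(str(fault_time).split(".")[0])  # 2023-12-09 23:05:40
```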

@@ -45,47 +40,49 @@ def generate_fault(
# If a fault already exists with this ID, just update the time
if not zkhandler.exists("base.faults"):
logger.out(
f"Skipping fault reporting for {fault_id} due to missing Zookeeper schemas",
f"Skipping fault reporting for {fault_name} due to missing Zookeeper schemas",
state="w",
)
return

existing_faults = zkhandler.children("base.faults")
if fault_id in existing_faults:
if fault_name in existing_faults:
logger.out(
f"Updating fault {fault_id}: {fault_message} @ {fault_time}", state="i"
f"Updating fault {fault_name}: {fault_message} @ {fault_time}", state="i"
)
else:
logger.out(
f"Generating fault {fault_id}: {fault_message} @ {fault_time}",
f"Generating fault {fault_name}: {fault_message} @ {fault_time}",
state="i",
)

if zkhandler.read("base.config.maintenance") == "true":
logger.out(
f"Skipping fault reporting for {fault_id} due to maintenance mode",
f"Skipping fault reporting for {fault_name} due to maintenance mode",
state="w",
)
return

if fault_id in existing_faults:
# Update an existing fault
if fault_name in existing_faults:
zkhandler.write(
[
(("faults.last_time", fault_id), fault_time),
(("faults.message", fault_id), fault_message),
(("faults.last_time", fault_name), fault_time),
(("faults.delta", fault_name), fault_delta),
(("faults.message", fault_name), fault_message),
]
)
# Otherwise, generate a new fault event
# Generate a new fault
else:
zkhandler.write(
[
(("faults.id", fault_id), ""),
(("faults.first_time", fault_id), fault_time),
(("faults.last_time", fault_id), fault_time),
(("faults.ack_time", fault_id), ""),
(("faults.status", fault_id), "new"),
(("faults.delta", fault_id), fault_delta),
(("faults.message", fault_id), fault_message),
(("faults.id", fault_name), ""),
(("faults.first_time", fault_name), fault_time),
(("faults.last_time", fault_name), fault_time),
(("faults.ack_time", fault_name), ""),
(("faults.status", fault_name), "new"),
(("faults.delta", fault_name), fault_delta),
(("faults.message", fault_name), fault_message),
]
)
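Taken together, the write above keeps one record per fault, now keyed by the fault's name rather than an md5-derived ID; loosely, each record carries a set of keys like the following (shown as a plain dict for illustration; the actual storage is a set of Zookeeper paths under `base.faults`, and the example values are made up):

```python
example_fault = {
    "id": "",                             # placeholder key created with the fault
    "first_time": "2023-12-09 23:05:40",  # first trigger
    "last_time": "2023-12-09 23:05:40",   # updated on each retrigger
    "ack_time": "",                       # empty until acknowledged (assumption)
    "status": "new",
    "delta": 50,                          # health delta applied while the fault is active
    "message": "OSD osd.5 was marked out",
}
```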

debian/changelog (12 lines changed, vendored)
@@ -1,3 +1,15 @@
pvc (0.9.84-0) unstable; urgency=high

**Breaking Changes:** This release features a major reconfiguration of how cluster health is monitored and reported. Node health plugins now report "faults" from within the Health daemon, as do several other issues which were previously checked for manually in the "cluster" daemon library for the "/status" endpoint. These faults are persistent: a given identifier is triggered once, and subsequent triggers simply update its "last reported" time. An additional set of API endpoints and commands are added to manage these faults, either by "ack"(nowledging) them (keeping the alert around to be further updated but setting its health delta to 0%) or "delete"ing them (completely removing the fault unless it retriggers), individually, (from the CLI) in groups, or all at once. Cluster health reporting is now based entirely on these faults, and the default interval for health checks is reduced to 15 seconds to accommodate this. In addition, Prometheus metrics have been added for the PVC cluster itself, along with an example Grafana dashboard, as well as a proxy to the Ceph cluster metrics. This release also fixes some bugs in the VM provisioner that were introduced in 0.9.83; these fixes require a **reimport or reconfiguration of any provisioner scripts**; reference the updated examples for details.

* [All] Adds persistent fault reporting to clusters, replacing the old cluster health calculations.
* [API Daemon] Adds cluster-level Prometheus metric exporting as well as a Ceph Prometheus proxy to the API.
* [CLI Client] Improves the formatting of "pvc cluster status" output.
* [Node Daemon] Fixes several bugs in, and improves the operation of, the psql health check plugin.
* [Worker Daemon] Fixes several bugs in the example provisioner scripts, and moves the libvirt_schema library into the daemon common libraries.

-- Joshua M. Boniface <joshua@boniface.me> Sat, 09 Dec 2023 23:05:40 -0500

pvc (0.9.83-0) unstable; urgency=high

**Breaking Changes:** This release features a breaking change for the daemon config. A new unified "pvc.conf" file is required for all daemons (and the CLI client for Autobackup and API-on-this-host functionality), which will be written by the "pvc" role in the PVC Ansible framework. Using the "update-pvc-daemons" oneshot playbook from PVC Ansible is **required** to update to this release, as it will ensure this file is written to the proper place before deploying the new package versions, and also that the old entries are cleaned up afterwards. In addition, this release fully splits the node worker and health subsystems into discrete daemons ("pvcworkerd" and "pvchealthd") and packages ("pvc-daemon-worker" and "pvc-daemon-health") respectively. The "pvc-daemon-node" package also now depends on both packages, and the "pvc-daemon-api" package can now be reliably used outside of the PVC nodes themselves (for instance, in a VM) without any strange cross-dependency issues.

@@ -6,7 +6,7 @@ VERSION="$( head -1 debian/changelog | awk -F'[()-]' '{ print $2 }' )"

pushd $( git rev-parse --show-toplevel ) &>/dev/null
pushd api-daemon &>/dev/null
export PVC_CONFIG_FILE="./pvcapid.sample.yaml"
export PVC_CONFIG_FILE="../pvc.sample.conf"
./pvcapid-manage_flask.py db migrate -m "PVC version ${VERSION}"
./pvcapid-manage_flask.py db upgrade
popd &>/dev/null

@@ -33,7 +33,7 @@ import os
import signal

# Daemon version
version = "0.9.83"
version = "0.9.84"


##########################################################

@@ -228,7 +228,7 @@ class MonitoringInstance(object):
def get_ceph_health_entries():
ceph_health_entries = [
{
"entry": f"{value['severity']} {key}",
"entry": key,
"check": value["severity"],
"details": value["summary"]["message"],
}

@@ -281,36 +281,42 @@ class MonitoringInstance(object):
# This is a list of all possible faults (cluster error messages) and their corresponding details
self.cluster_faults_map = {
"dead_or_fenced_node": {
"name": "DEAD_NODE_{entry}",
"entries": get_node_daemon_states,
"conditions": ["dead", "fenced"],
"delta": 50,
"message": "Node {entry} was dead and/or fenced",
},
"ceph_osd_out": {
"name": "CEPH_OSD_OUT_{entry}",
"entries": get_osd_in_states,
"conditions": ["0"],
"delta": 50,
"message": "OSD {entry} was marked out",
},
"ceph_warn": {
"name": "CEPH_WARN_{entry}",
"entries": get_ceph_health_entries,
"conditions": ["HEALTH_WARN"],
"delta": 10,
"message": "{entry} reported by Ceph cluster",
},
"ceph_err": {
"name": "CEPH_ERR_{entry}",
"entries": get_ceph_health_entries,
"conditions": ["HEALTH_ERR"],
"delta": 50,
"message": "{entry} reported by Ceph cluster",
},
"vm_failed": {
"name": "VM_FAILED_{entry}",
"entries": get_vm_states,
"conditions": ["fail"],
"delta": 10,
"message": "VM {entry} was failed",
},
"memory_overprovisioned": {
"name": "MEMORY_OVERPROVISIONED",
"entries": get_overprovisioned_memory,
"conditions": ["overprovisioned"],
"delta": 50,

@@ -531,11 +537,12 @@ class MonitoringInstance(object):
if str(condition) == str(check):
fault_time = datetime.now()
fault_delta = fault_data["delta"]
fault_name = fault_data["name"].format(entry=entry.upper())
fault_message = fault_data["message"].format(entry=entry)
generate_fault(
self.zkhandler,
self.logger,
fault_type,
fault_name,
fault_time,
fault_delta,
fault_message,
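Each entry in `cluster_faults_map` is a template: the `name` and `message` fields are formatted with the offending entry before `generate_fault()` is called, as the loop above shows. A small illustration using the `ceph_osd_out` entry (the OSD name is made up):

```python
fault_data = {
    "name": "CEPH_OSD_OUT_{entry}",
    "delta": 50,
    "message": "OSD {entry} was marked out",
}
entry = "osd.5"  # illustrative entry value

fault_name = fault_data["name"].format(entry=entry.upper())  # CEPH_OSD_OUT_OSD.5
fault_message = fault_data["message"].format(entry=entry)    # OSD osd.5 was marked out
print(fault_name, "->", fault_message)
```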

@@ -587,7 +594,7 @@ class MonitoringInstance(object):

# Generate a cluster fault if the plugin is in a suboptimal state
if result.health_delta > 0:
fault_type = f"plugin.{self.this_node.name}.{result.plugin_name}"
fault_name = f"NODE_PLUGIN_{result.plugin_name.upper()}_{self.this_node.name.upper()}"
fault_time = datetime.now()

# Map our check results to fault results

@@ -602,11 +609,11 @@ class MonitoringInstance(object):
generate_fault(
self.zkhandler,
self.logger,
fault_type,
fault_name,
fault_time,
fault_delta,
fault_message,
fault_detail=None,
fault_details=None,
)
self.faults += 1

@@ -661,7 +668,7 @@ class MonitoringInstance(object):

self.run_plugins(coordinator_state=coordinator_state)

if coordinator_state in ["primary", "secondary", "takeover", "relinquish"]:
if coordinator_state in ["primary", "takeover"]:
self.run_faults(coordinator_state=coordinator_state)

runtime_end = datetime.now()

@@ -2,6 +2,14 @@

This directory contains several monitoring resources that can be used with various monitoring systems to track and alert on a PVC cluster system.

## Prometheus + Grafana

The included example Prometheus configuration and Grafana dashboard can be used to query the PVC API for Prometheus data and display it with a consistent dashboard.

Note that the default configuration here also includes Ceph cluster information; a Ceph dashboard can be found externally.

Note too that this does not include node export examples from individual PVC nodes; those must be set up separately.

## Munin

The included Munin plugins can be activated by linking to them from `/etc/munin/plugins/`. Two plugins are provided:

node-daemon/monitoring/prometheus/grafana-pvc-dashboard.json (new file, 2597 lines; file diff suppressed because it is too large)

node-daemon/monitoring/prometheus/prometheus.yml (new file, 8 lines)
@@ -0,0 +1,8 @@
# Other configuration omitted
scrape_configs:
  - job_name: "pvc_cluster"
    metrics_path: /api/v1/metrics
    scheme: "http"
    file_sd_configs:
      - files:
        - 'targets-pvc_cluster.json'

node-daemon/monitoring/prometheus/targets-pvc_cluster.json (new file, 11 lines)
@@ -0,0 +1,11 @@
[
  {
    "targets": [
      "pvc.upstream.floating.address.tld:7370"
    ],
    "labels": {
      "cluster": "cluster1"
    }
  }
]
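Given the scrape configuration and target list above, the same endpoint can be checked by hand; a minimal sketch using the placeholder hostname from the example target file:

```python
import requests

# Prometheus will scrape this URL; the host below is the example placeholder
resp = requests.get("http://pvc.upstream.floating.address.tld:7370/api/v1/metrics")
resp.raise_for_status()
# The body is Prometheus exposition format, one sample or comment per line
print("\n".join(resp.text.splitlines()[:10]))
```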

@@ -48,7 +48,7 @@ import re
import json

# Daemon version
version = "0.9.83"
version = "0.9.84"


##########################################################

@@ -167,6 +167,7 @@ _pvc storage pool remove --yes testing

# Remove the VM
_pvc vm stop --yes testx
sleep 5
_pvc vm remove --yes testx

_pvc provisioner profile remove --yes test

@@ -44,7 +44,7 @@ from daemon_lib.vmbuilder import (
)

# Daemon version
version = "0.9.83"
version = "0.9.84"


config = cfg.get_configuration()