Compare commits

...

6 Commits

16 changed files with 2694 additions and 27 deletions


@@ -1 +1 @@
-0.9.83
+0.9.84


@@ -1,5 +1,15 @@
 ## PVC Changelog
+
+###### [v0.9.84](https://github.com/parallelvirtualcluster/pvc/releases/tag/v0.9.84)
+
+**Breaking Changes:** This release features a major reconfiguration of how cluster health monitoring and reporting works. Node health plugins now report "faults", as do several other issues which were previously checked for manually in the "cluster" daemon library for the "/status" endpoint; these are now raised from within the Health daemon. Faults are persistent: each identifier is triggered once, and subsequent triggers simply update the "last reported" time. A new set of API endpoints and commands manages these faults, either by "ack"(nowledging) them (keeping the alert around to be further updated, but setting its health delta to 0%) or "delete"ing them (removing the fault completely unless it retriggers), individually, in groups (from the CLI), or all at once. Cluster health reporting is now based solely on these faults, and the default interval for health checks is reduced to 15 seconds to accommodate this. In addition, Prometheus metrics have been added for the PVC cluster itself, along with an example Grafana dashboard, as well as a proxy to the Ceph cluster metrics. This release also fixes some bugs in the VM provisioner that were introduced in 0.9.83; these fixes require a **reimport or reconfiguration of any provisioner scripts**; reference the updated examples for details.
+
+* [All] Adds persistent fault reporting to clusters, replacing the old cluster health calculations.
+* [API Daemon] Adds cluster-level Prometheus metric exporting as well as a Ceph Prometheus proxy to the API.
+* [CLI Client] Improves formatting output of "pvc cluster status".
+* [Node Daemon] Fixes several bugs and enhances the working of the psql health check plugin.
+* [Worker Daemon] Fixes several bugs in the example provisioner scripts, and moves the libvirt_schema library into the daemon common libraries.
 
 ###### [v0.9.83](https://github.com/parallelvirtualcluster/pvc/releases/tag/v0.9.83)
 
 **Breaking Changes:** This release features a breaking change for the daemon config. A new unified "pvc.conf" file is required for all daemons (and the CLI client for Autobackup and API-on-this-host functionality), which will be written by the "pvc" role in the PVC Ansible framework. Using the "update-pvc-daemons" oneshot playbook from PVC Ansible is **required** to update to this release, as it will ensure this file is written to the proper place before deploying the new package versions, and also ensures that the old entries are cleaned up afterwards. In addition, this release fully splits the node worker and health subsystems into discrete daemons ("pvcworkerd" and "pvchealthd") and packages ("pvc-daemon-worker" and "pvc-daemon-health") respectively. The "pvc-daemon-node" package also now depends on both packages, and the "pvc-daemon-api" package can now be reliably used outside of the PVC nodes themselves (for instance, in a VM) without any strange cross-dependency issues.
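The fault semantics described in the v0.9.84 notes above (one persistent entry per identifier, a refreshed "last reported" time on retrigger, acknowledgement zeroing the health delta, deletion removing the fault until it retriggers) can be modeled in a few lines. This is an illustrative sketch only; the class and method names are invented, not PVC's actual implementation:

```python
from dataclasses import dataclass, field
from datetime import datetime, timezone


@dataclass
class Fault:
    fault_id: str
    message: str
    health_delta: int  # percentage points subtracted from cluster health
    acknowledged: bool = False
    last_reported: datetime = field(
        default_factory=lambda: datetime.now(timezone.utc)
    )


class FaultRegistry:
    """Toy model of PVC-style persistent fault tracking (names invented)."""

    def __init__(self):
        self.faults = {}

    def trigger(self, fault_id, message, health_delta):
        # Each identifier is created once; retriggering only refreshes
        # the "last reported" timestamp.
        if fault_id in self.faults:
            self.faults[fault_id].last_reported = datetime.now(timezone.utc)
        else:
            self.faults[fault_id] = Fault(fault_id, message, health_delta)

    def ack(self, fault_id):
        # Keep the fault around for further updates, but stop counting
        # it against cluster health (health delta effectively 0%).
        self.faults[fault_id].acknowledged = True

    def delete(self, fault_id):
        # Remove the fault entirely; it reappears only if retriggered.
        del self.faults[fault_id]

    def cluster_health(self):
        # Health is 100% minus the deltas of unacknowledged faults.
        return 100 - sum(
            f.health_delta for f in self.faults.values() if not f.acknowledged
        )
```

Acknowledged faults remain listed for further updates but no longer count against cluster health; deleted faults disappear entirely unless they retrigger.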


@@ -27,7 +27,7 @@ from distutils.util import strtobool as dustrtobool
 import daemon_lib.config as cfg
 
 # Daemon version
-version = "0.9.83"
+version = "0.9.84"
 
 # API version
 API_VERSION = 1.0


@@ -538,14 +538,15 @@ def cli_cluster_fault_list(limit, format_function):
     name="ack",
     short_help="Acknowledge a cluster fault.",
 )
-@click.argument("fault_id")
+@click.argument("fault_id", nargs=-1, required=True)
 @connection_req
 def cli_cluster_fault_acknowledge(fault_id):
     """
-    Acknowledge the cluster fault FAULT_ID.
+    Acknowledge the cluster fault FAULT_ID; multiple FAULT_IDs may be specified.
     """
 
-    retcode, retdata = pvc.lib.faults.acknowledge(CLI_CONFIG, fault_id)
+    faults = list(fault_id)
+    retcode, retdata = pvc.lib.faults.acknowledge(CLI_CONFIG, faults)
     finish(retcode, retdata)

@@ -574,14 +575,15 @@ def cli_cluster_fault_acknowledge_all():
     name="delete",
     short_help="Delete a cluster fault.",
 )
-@click.argument("fault_id")
+@click.argument("fault_id", nargs=-1, required=True)
 @connection_req
 def cli_cluster_fault_delete(fault_id):
     """
-    Delete the cluster fault FAULT_ID.
+    Delete the cluster fault FAULT_ID; multiple FAULT_IDs may be specified.
     """
 
-    retcode, retdata = pvc.lib.faults.delete(CLI_CONFIG, fault_id)
+    faults = list(fault_id)
+    retcode, retdata = pvc.lib.faults.delete(CLI_CONFIG, faults)
     finish(retcode, retdata)
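The key change above is Click's `@click.argument("fault_id", nargs=-1, required=True)`, which makes the argument variadic: the command now receives a tuple of one or more fault IDs instead of a single string (hence the `list(fault_id)` conversion). For illustration, the equivalent variadic behavior in the standard library's `argparse` looks like this (the program name here is made up):

```python
import argparse

# nargs="+" accepts one or more values and rejects zero, mirroring
# Click's nargs=-1 combined with required=True; argparse collects
# the values into a list rather than a tuple.
parser = argparse.ArgumentParser(prog="fault-demo")
parser.add_argument("fault_id", nargs="+")

args = parser.parse_args(["1a2b3c", "4d5e6f"])
faults = list(args.fault_id)
```

Passing no fault IDs at all would be rejected by the parser, just as Click rejects an empty invocation when `required=True` is set.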


@@ -45,20 +45,29 @@ def get_list(config, limit=None, sort_key="last_reported"):
         return False, response.json().get("message", "")


-def acknowledge(config, fault_id):
+def acknowledge(config, faults):
     """
-    Acknowledge a PVC fault
+    Acknowledge one or more PVC faults

-    API endpoint: PUT /api/v1/faults/<fault_id>
+    API endpoint: PUT /api/v1/faults/<fault_id> for fault_id in faults
     API arguments:
     API schema: {json_message}
     """
-    response = call_api(config, "put", f"/faults/{fault_id}")
+    status_codes = list()
+    bad_msgs = list()
+    for fault_id in faults:
+        response = call_api(config, "put", f"/faults/{fault_id}")

-    if response.status_code == 200:
-        return True, response.json().get("message", "")
+        if response.status_code == 200:
+            status_codes.append(True)
+        else:
+            status_codes.append(False)
+            bad_msgs.append(response.json().get("message", ""))
+
+    if all(status_codes):
+        return True, f"Successfully acknowledged fault(s) {', '.join(faults)}"
     else:
-        return False, response.json().get("message", "")
+        return False, ", ".join(bad_msgs)


 def acknowledge_all(config):

@@ -77,20 +86,29 @@ def acknowledge_all(config):
         return False, response.json().get("message", "")


-def delete(config, fault_id):
+def delete(config, faults):
     """
-    Delete a PVC fault
+    Delete one or more PVC faults

-    API endpoint: DELETE /api/v1/faults/<fault_id>
+    API endpoint: DELETE /api/v1/faults/<fault_id> for fault_id in faults
     API arguments:
     API schema: {json_message}
     """
-    response = call_api(config, "delete", f"/faults/{fault_id}")
+    status_codes = list()
+    bad_msgs = list()
+    for fault_id in faults:
+        response = call_api(config, "delete", f"/faults/{fault_id}")

-    if response.status_code == 200:
-        return True, response.json().get("message", "")
+        if response.status_code == 200:
+            status_codes.append(True)
+        else:
+            status_codes.append(False)
+            bad_msgs.append(response.json().get("message", ""))
+
+    if all(status_codes):
+        return True, f"Successfully deleted fault(s) {', '.join(faults)}"
     else:
-        return False, response.json().get("message", "")
+        return False, ", ".join(bad_msgs)


 def delete_all(config):
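Stripped of the HTTP specifics, both `acknowledge` and `delete` above follow the same accumulate-then-report pattern: perform the operation once per item, collect per-item results, and report overall success only if every call succeeded. A generic sketch of that pattern (illustrative only, not PVC code):

```python
def apply_to_all(items, operation):
    """Apply operation to each item; succeed only if all succeed.

    operation returns an (ok, message) pair per item, mirroring the
    per-request status handling in the fault functions above.
    """
    statuses = []
    bad_msgs = []
    for item in items:
        ok, message = operation(item)
        statuses.append(ok)
        if not ok:
            bad_msgs.append(message)

    if all(statuses):
        return True, f"Successfully processed {', '.join(items)}"
    return False, ", ".join(bad_msgs)
```

Note that, as in the diff, an empty input list yields success, since `all([])` is `True`.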


@@ -2,7 +2,7 @@ from setuptools import setup
 
 setup(
     name="pvc",
-    version="0.9.83",
+    version="0.9.84",
     packages=["pvc.cli", "pvc.lib"],
     install_requires=[
         "Click",

debian/changelog

@@ -1,3 +1,15 @@
+pvc (0.9.84-0) unstable; urgency=high
+
+  **Breaking Changes:** This release features a major reconfiguration of how cluster health monitoring and reporting works. Node health plugins now report "faults", as do several other issues which were previously checked for manually in the "cluster" daemon library for the "/status" endpoint; these are now raised from within the Health daemon. Faults are persistent: each identifier is triggered once, and subsequent triggers simply update the "last reported" time. A new set of API endpoints and commands manages these faults, either by "ack"(nowledging) them (keeping the alert around to be further updated, but setting its health delta to 0%) or "delete"ing them (removing the fault completely unless it retriggers), individually, in groups (from the CLI), or all at once. Cluster health reporting is now based solely on these faults, and the default interval for health checks is reduced to 15 seconds to accommodate this. In addition, Prometheus metrics have been added for the PVC cluster itself, along with an example Grafana dashboard, as well as a proxy to the Ceph cluster metrics. This release also fixes some bugs in the VM provisioner that were introduced in 0.9.83; these fixes require a **reimport or reconfiguration of any provisioner scripts**; reference the updated examples for details.
+
+  * [All] Adds persistent fault reporting to clusters, replacing the old cluster health calculations.
+  * [API Daemon] Adds cluster-level Prometheus metric exporting as well as a Ceph Prometheus proxy to the API.
+  * [CLI Client] Improves formatting output of "pvc cluster status".
+  * [Node Daemon] Fixes several bugs and enhances the working of the psql health check plugin.
+  * [Worker Daemon] Fixes several bugs in the example provisioner scripts, and moves the libvirt_schema library into the daemon common libraries.
+
+ -- Joshua M. Boniface <joshua@boniface.me>  Sat, 09 Dec 2023 23:05:40 -0500
+
 pvc (0.9.83-0) unstable; urgency=high
 
   **Breaking Changes:** This release features a breaking change for the daemon config. A new unified "pvc.conf" file is required for all daemons (and the CLI client for Autobackup and API-on-this-host functionality), which will be written by the "pvc" role in the PVC Ansible framework. Using the "update-pvc-daemons" oneshot playbook from PVC Ansible is **required** to update to this release, as it will ensure this file is written to the proper place before deploying the new package versions, and also ensures that the old entries are cleaned up afterwards. In addition, this release fully splits the node worker and health subsystems into discrete daemons ("pvcworkerd" and "pvchealthd") and packages ("pvc-daemon-worker" and "pvc-daemon-health") respectively. The "pvc-daemon-node" package also now depends on both packages, and the "pvc-daemon-api" package can now be reliably used outside of the PVC nodes themselves (for instance, in a VM) without any strange cross-dependency issues.


@@ -6,7 +6,7 @@ VERSION="$( head -1 debian/changelog | awk -F'[()-]' '{ print $2 }' )"
 pushd $( git rev-parse --show-toplevel ) &>/dev/null
 pushd api-daemon &>/dev/null
 
-export PVC_CONFIG_FILE="./pvcapid.sample.yaml"
+export PVC_CONFIG_FILE="../pvc.sample.conf"
 
 ./pvcapid-manage_flask.py db migrate -m "PVC version ${VERSION}"
 ./pvcapid-manage_flask.py db upgrade
 popd &>/dev/null


@@ -33,7 +33,7 @@ import os
 import signal
 
 # Daemon version
-version = "0.9.83"
+version = "0.9.84"
 
 ##########################################################


@@ -2,6 +2,14 @@
 
 This directory contains several monitoring resources that can be used with various monitoring systems to track and alert on a PVC cluster system.
 
+## Prometheus + Grafana
+
+The included example Prometheus configuration and Grafana dashboard can be used to query the PVC API for Prometheus data and display it with a consistent dashboard.
+
+Note that the default configuration here also includes Ceph cluster information; a Ceph dashboard can be found externally.
+
+Note too that this does not include node exporter examples from individual PVC nodes; those must be set up separately.
+
 ## Munin
 
 The included Munin plugins can be activated by linking to them from `/etc/munin/plugins/`. Two plugins are provided:

File diff suppressed because it is too large.


@@ -0,0 +1,8 @@
+# Other configuration omitted
+scrape_configs:
+  - job_name: "pvc_cluster"
+    metrics_path: /api/v1/metrics
+    scheme: "http"
+    file_sd_configs:
+      - files:
+        - 'targets-pvc_cluster.json'


@@ -0,0 +1,11 @@
+[
+  {
+    "targets": [
+      "pvc.upstream.floating.address.tld:7370"
+    ],
+    "labels": {
+      "cluster": "cluster1"
+    }
+  }
+]


@@ -48,7 +48,7 @@ import re
 import json
 
 # Daemon version
-version = "0.9.83"
+version = "0.9.84"
 
 ##########################################################


@@ -167,6 +167,7 @@ _pvc storage pool remove --yes testing
 
 # Remove the VM
 _pvc vm stop --yes testx
+sleep 5
 _pvc vm remove --yes testx
 
 _pvc provisioner profile remove --yes test


@@ -44,7 +44,7 @@ from daemon_lib.vmbuilder import (
 )
 
 # Daemon version
-version = "0.9.83"
+version = "0.9.84"
 
 config = cfg.get_configuration()