Compare commits
10 Commits
7e6d922877...v0.9.84

| Author | SHA1 | Date |
| --- | --- | --- |
| | 9aee2a9075 | |
| | 8f0ae3e2dd | |
| | 946d3eaf43 | |
| | 1f6347d24b | |
| | e8552b471b | |
| | fc443a323b | |
| | b0557edb76 | |
| | 47bd7bf2f5 | |
| | b9fbfe2ed5 | |
| | 764e3e3722 | |
CHANGELOG.md (10 lines changed)
@@ -1,5 +1,15 @@
## PVC Changelog

###### [v0.9.84](https://github.com/parallelvirtualcluster/pvc/releases/tag/v0.9.84)

**Breaking Changes:** This release features a major reconfiguration of how cluster health is monitored and reported. Node health plugins now report "faults" from within the Health daemon, as do several other issues which were previously checked for manually in the "cluster" daemon library for the "/status" endpoint. These faults are persistent: a given identifier is triggered once, and subsequent triggers simply update its "last reported" time. An additional set of API endpoints and commands are added to manage these faults, either by "ack"(nowledging) them (keeping the alert around to be further updated but setting its health delta to 0%) or "delete"ing them (completely removing the fault unless it retriggers), individually, (from the CLI) in groups, or all at once. Cluster health reporting is now based entirely on these faults, and the default interval for health checks is reduced to 15 seconds to accommodate this. In addition, Prometheus metrics have been added for the PVC cluster itself, along with an example Grafana dashboard, as well as a proxy to the Ceph cluster metrics. This release also fixes some bugs in the VM provisioner that were introduced in 0.9.83; these fixes require a **reimport or reconfiguration of any provisioner scripts**; reference the updated examples for details.

* [All] Adds persistent fault reporting to clusters, replacing the old cluster health calculations.
* [API Daemon] Adds cluster-level Prometheus metric exporting as well as a Ceph Prometheus proxy to the API.
* [CLI Client] Improves the formatting of "pvc cluster status" output.
* [Node Daemon] Fixes several bugs in, and improves the operation of, the psql health check plugin.
* [Worker Daemon] Fixes several bugs in the example provisioner scripts, and moves the libvirt_schema library into the daemon common libraries.
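As a rough sketch of the new fault-management endpoints described above (the API address, port, and fault ID below are placeholders, not values from this release, and any required authentication headers are omitted):

```python
import requests

API = "http://pvc.example.tld:7370/api/v1"  # placeholder API address and port
fault_id = "1ab3d21c"                       # placeholder fault ID

# "ack": keep the fault visible but set its health delta to 0%
requests.put(f"{API}/faults/{fault_id}")

# "delete": remove the fault entirely unless it retriggers
requests.delete(f"{API}/faults/{fault_id}")
```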

###### [v0.9.83](https://github.com/parallelvirtualcluster/pvc/releases/tag/v0.9.83)

**Breaking Changes:** This release features a breaking change for the daemon config. A new unified "pvc.conf" file is required for all daemons (and the CLI client for Autobackup and API-on-this-host functionality), which will be written by the "pvc" role in the PVC Ansible framework. Using the "update-pvc-daemons" oneshot playbook from PVC Ansible is **required** to update to this release, as it will ensure this file is written to the proper place before deploying the new package versions, and also that the old entries are cleaned up afterwards. In addition, this release fully splits the node worker and health subsystems into discrete daemons ("pvcworkerd" and "pvchealthd") and packages ("pvc-daemon-worker" and "pvc-daemon-health") respectively. The "pvc-daemon-node" package also now depends on both packages, and the "pvc-daemon-api" package can now be reliably used outside of the PVC nodes themselves (for instance, in a VM) without any strange cross-dependency issues.

@@ -27,7 +27,7 @@ from distutils.util import strtobool as dustrtobool
import daemon_lib.config as cfg

# Daemon version
version = "0.9.83"
version = "0.9.84"

# API version
API_VERSION = 1.0
@@ -538,14 +538,15 @@ def cli_cluster_fault_list(limit, format_function):
name="ack",
short_help="Acknowledge a cluster fault.",
)
@click.argument("fault_id")
@click.argument("fault_id", nargs=-1, required=True)
@connection_req
def cli_cluster_fault_acknowledge(fault_id):
"""
Acknowledge the cluster fault FAULT_ID.
Acknowledge the cluster fault FAULT_ID; multiple FAULT_IDs may be specified.
"""

retcode, retdata = pvc.lib.faults.acknowledge(CLI_CONFIG, fault_id)
faults = list(fault_id)
retcode, retdata = pvc.lib.faults.acknowledge(CLI_CONFIG, faults)
finish(retcode, retdata)
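For context on the `nargs=-1` change above: Click passes a variadic argument to the callback as a tuple of strings and, with `required=True`, demands at least one value, which is why the command now converts it with `list(fault_id)` before handing it to the library. A minimal standalone sketch (the command name is illustrative):

```python
import click

@click.command()
@click.argument("fault_id", nargs=-1, required=True)
def ack(fault_id):
    # fault_id arrives as a tuple, e.g. ("1ab3d21c", "8f2e77aa")
    click.echo(f"Would acknowledge: {list(fault_id)}")

if __name__ == "__main__":
    ack()
```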

@@ -574,14 +575,15 @@ def cli_cluster_fault_acknowledge_all():
name="delete",
short_help="Delete a cluster fault.",
)
@click.argument("fault_id")
@click.argument("fault_id", nargs=-1, required=True)
@connection_req
def cli_cluster_fault_delete(fault_id):
"""
Delete the cluster fault FAULT_ID.
Delete the cluster fault FAULT_ID; multiple FAULT_IDs may be specified.
"""

retcode, retdata = pvc.lib.faults.delete(CLI_CONFIG, fault_id)
faults = list(fault_id)
retcode, retdata = pvc.lib.faults.delete(CLI_CONFIG, faults)
finish(retcode, retdata)

@@ -388,22 +388,22 @@ def cli_cluster_fault_list_format_short(CLI_CONFIG, fault_data):
fault_id_length + fault_status_length + fault_health_delta_length + 2
)
detail_header_length = (
fault_health_delta_length
fault_id_length
+ fault_health_delta_length
+ fault_status_length
+ fault_last_reported_length
+ fault_message_length
+ 3
- meta_header_length
+ 8
)

# Format the string (header)
fault_list_output.append(
"{bold}Meta {meta_dashes} Fault {detail_dashes}{end_bold}".format(
"{bold}Meta {meta_dashes} Fault {detail_dashes}{end_bold}".format(
bold=ansii["bold"],
end_bold=ansii["end"],
meta_dashes="-" * (meta_header_length - len("Meta ")),
detail_dashes="-" * (detail_header_length - len("Fault ")),
meta_dashes="-" * (meta_header_length - len("Meta ")),
detail_dashes="-" * (detail_header_length - len("Fault ")),
)
)
@@ -45,20 +45,29 @@ def get_list(config, limit=None, sort_key="last_reported"):
return False, response.json().get("message", "")


def acknowledge(config, fault_id):
def acknowledge(config, faults):
"""
Acknowledge a PVC fault
Acknowledge one or more PVC faults

API endpoint: PUT /api/v1/faults/<fault_id>
API endpoint: PUT /api/v1/faults/<fault_id> for fault_id in faults
API arguments:
API schema: {json_message}
"""
response = call_api(config, "put", f"/faults/{fault_id}")
status_codes = list()
bad_msgs = list()
for fault_id in faults:
response = call_api(config, "put", f"/faults/{fault_id}")

if response.status_code == 200:
return True, response.json().get("message", "")
if response.status_code == 200:
status_codes.append(True)
else:
status_codes.append(False)
bad_msgs.append(response.json().get("message", ""))

if all(status_codes):
return True, f"Successfully acknowledged fault(s) {', '.join(faults)}"
else:
return False, response.json().get("message", "")
return False, ", ".join(bad_msgs)
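The reworked helper now collects a result per fault and only reports success if every call succeeded, rather than returning after the first response. A standalone sketch of that collect-then-summarize pattern, with a stub standing in for the real `call_api` (the stub and its data are illustrative, not PVC code):

```python
def acknowledge_many(fault_ids, ack_one):
    # ack_one(fault_id) -> (ok, message), standing in for one API call
    statuses, bad_msgs = [], []
    for fault_id in fault_ids:
        ok, msg = ack_one(fault_id)
        statuses.append(ok)
        if not ok:
            bad_msgs.append(msg)
    if all(statuses):
        return True, f"Successfully acknowledged fault(s) {', '.join(fault_ids)}"
    return False, ", ".join(bad_msgs)

# Illustrative stub results keyed by fault ID
results = {"1ab3d21c": (True, "OK"), "8f2e77aa": (False, "No fault with ID 8f2e77aa")}
print(acknowledge_many(["1ab3d21c", "8f2e77aa"], lambda fid: results[fid]))
```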

def acknowledge_all(config):

@@ -77,20 +86,29 @@ def acknowledge_all(config):
return False, response.json().get("message", "")


def delete(config, fault_id):
def delete(config, faults):
"""
Delete a PVC fault
Delete one or more PVC faults

API endpoint: DELETE /api/v1/faults/<fault_id>
API endpoint: DELETE /api/v1/faults/<fault_id> for fault_id in faults
API arguments:
API schema: {json_message}
"""
response = call_api(config, "delete", f"/faults/{fault_id}")
status_codes = list()
bad_msgs = list()
for fault_id in faults:
response = call_api(config, "delete", f"/faults/{fault_id}")

if response.status_code == 200:
return True, response.json().get("message", "")
if response.status_code == 200:
status_codes.append(True)
else:
status_codes.append(False)
bad_msgs.append(response.json().get("message", ""))

if all(status_codes):
return True, f"Successfully deleted fault(s) {', '.join(faults)}"
else:
return False, response.json().get("message", "")
return False, ", ".join(bad_msgs)


def delete_all(config):

@@ -2,7 +2,7 @@ from setuptools import setup

setup(
name="pvc",
version="0.9.83",
version="0.9.84",
packages=["pvc.cli", "pvc.lib"],
install_requires=[
"Click",
@@ -20,7 +20,6 @@
###############################################################################

from datetime import datetime
from hashlib import md5


def generate_fault(

@@ -32,10 +31,6 @@ def generate_fault(
fault_message,
fault_details=None,
):
# Generate a fault ID from the fault_name, fault_delta, and fault_message
fault_str = f"{fault_name} {fault_delta} {fault_message}"
fault_id = str(md5(fault_str.encode("utf-8")).hexdigest())[:8]

# Strip the microseconds off of the fault time; we don't care about that precision
fault_time = str(fault_time).split(".")[0]
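The microsecond strip above relies on the default string form of a `datetime`, which separates sub-second precision with a dot; for example:

```python
from datetime import datetime

fault_time = datetime(2023, 12, 9, 23, 5, 40, 123456)
print(str(fault_time).split(".")[0])  # 2023-12-09 23:05:40
```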

@@ -45,47 +40,49 @@ def generate_fault(
# If a fault already exists with this ID, just update the time
if not zkhandler.exists("base.faults"):
logger.out(
f"Skipping fault reporting for {fault_id} due to missing Zookeeper schemas",
f"Skipping fault reporting for {fault_name} due to missing Zookeeper schemas",
state="w",
)
return

existing_faults = zkhandler.children("base.faults")
if fault_id in existing_faults:
if fault_name in existing_faults:
logger.out(
f"Updating fault {fault_id}: {fault_message} @ {fault_time}", state="i"
f"Updating fault {fault_name}: {fault_message} @ {fault_time}", state="i"
)
else:
logger.out(
f"Generating fault {fault_id}: {fault_message} @ {fault_time}",
f"Generating fault {fault_name}: {fault_message} @ {fault_time}",
state="i",
)

if zkhandler.read("base.config.maintenance") == "true":
logger.out(
f"Skipping fault reporting for {fault_id} due to maintenance mode",
f"Skipping fault reporting for {fault_name} due to maintenance mode",
state="w",
)
return

if fault_id in existing_faults:
# Update an existing fault
if fault_name in existing_faults:
zkhandler.write(
[
(("faults.last_time", fault_id), fault_time),
(("faults.message", fault_id), fault_message),
(("faults.last_time", fault_name), fault_time),
(("faults.delta", fault_name), fault_delta),
(("faults.message", fault_name), fault_message),
]
)
# Otherwise, generate a new fault event
# Generate a new fault
else:
zkhandler.write(
[
(("faults.id", fault_id), ""),
(("faults.first_time", fault_id), fault_time),
(("faults.last_time", fault_id), fault_time),
(("faults.ack_time", fault_id), ""),
(("faults.status", fault_id), "new"),
(("faults.delta", fault_id), fault_delta),
(("faults.message", fault_id), fault_message),
(("faults.id", fault_name), ""),
(("faults.first_time", fault_name), fault_time),
(("faults.last_time", fault_name), fault_time),
(("faults.ack_time", fault_name), ""),
(("faults.status", fault_name), "new"),
(("faults.delta", fault_name), fault_delta),
(("faults.message", fault_name), fault_message),
]
)
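Taken together, the write above keeps one record per fault, now keyed by the fault's name rather than an md5-derived ID; loosely, each record carries a set of keys like the following (shown as a plain dict for illustration; the actual storage is a set of Zookeeper paths under `base.faults`, and the example values are made up):

```python
example_fault = {
    "id": "",                             # placeholder key created with the fault
    "first_time": "2023-12-09 23:05:40",  # first trigger
    "last_time": "2023-12-09 23:05:40",   # updated on each retrigger
    "ack_time": "",                       # empty until acknowledged (assumption)
    "status": "new",
    "delta": 50,                          # health delta applied while the fault is active
    "message": "OSD osd.5 was marked out",
}
```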

debian/changelog (12 lines changed, vendored)
@@ -1,3 +1,15 @@
pvc (0.9.84-0) unstable; urgency=high

**Breaking Changes:** This release features a major reconfiguration of how cluster health is monitored and reported. Node health plugins now report "faults" from within the Health daemon, as do several other issues which were previously checked for manually in the "cluster" daemon library for the "/status" endpoint. These faults are persistent: a given identifier is triggered once, and subsequent triggers simply update its "last reported" time. An additional set of API endpoints and commands are added to manage these faults, either by "ack"(nowledging) them (keeping the alert around to be further updated but setting its health delta to 0%) or "delete"ing them (completely removing the fault unless it retriggers), individually, (from the CLI) in groups, or all at once. Cluster health reporting is now based entirely on these faults, and the default interval for health checks is reduced to 15 seconds to accommodate this. In addition, Prometheus metrics have been added for the PVC cluster itself, along with an example Grafana dashboard, as well as a proxy to the Ceph cluster metrics. This release also fixes some bugs in the VM provisioner that were introduced in 0.9.83; these fixes require a **reimport or reconfiguration of any provisioner scripts**; reference the updated examples for details.

* [All] Adds persistent fault reporting to clusters, replacing the old cluster health calculations.
* [API Daemon] Adds cluster-level Prometheus metric exporting as well as a Ceph Prometheus proxy to the API.
* [CLI Client] Improves the formatting of "pvc cluster status" output.
* [Node Daemon] Fixes several bugs in, and improves the operation of, the psql health check plugin.
* [Worker Daemon] Fixes several bugs in the example provisioner scripts, and moves the libvirt_schema library into the daemon common libraries.

-- Joshua M. Boniface <joshua@boniface.me> Sat, 09 Dec 2023 23:05:40 -0500

pvc (0.9.83-0) unstable; urgency=high

**Breaking Changes:** This release features a breaking change for the daemon config. A new unified "pvc.conf" file is required for all daemons (and the CLI client for Autobackup and API-on-this-host functionality), which will be written by the "pvc" role in the PVC Ansible framework. Using the "update-pvc-daemons" oneshot playbook from PVC Ansible is **required** to update to this release, as it will ensure this file is written to the proper place before deploying the new package versions, and also that the old entries are cleaned up afterwards. In addition, this release fully splits the node worker and health subsystems into discrete daemons ("pvcworkerd" and "pvchealthd") and packages ("pvc-daemon-worker" and "pvc-daemon-health") respectively. The "pvc-daemon-node" package also now depends on both packages, and the "pvc-daemon-api" package can now be reliably used outside of the PVC nodes themselves (for instance, in a VM) without any strange cross-dependency issues.

@@ -6,7 +6,7 @@ VERSION="$( head -1 debian/changelog | awk -F'[()-]' '{ print $2 }' )"

pushd $( git rev-parse --show-toplevel ) &>/dev/null
pushd api-daemon &>/dev/null
export PVC_CONFIG_FILE="./pvcapid.sample.yaml"
export PVC_CONFIG_FILE="../pvc.sample.conf"
./pvcapid-manage_flask.py db migrate -m "PVC version ${VERSION}"
./pvcapid-manage_flask.py db upgrade
popd &>/dev/null

@@ -33,7 +33,7 @@ import os
import signal

# Daemon version
version = "0.9.83"
version = "0.9.84"


##########################################################

@@ -228,7 +228,7 @@ class MonitoringInstance(object):
def get_ceph_health_entries():
ceph_health_entries = [
{
"entry": f"{value['severity']} {key}",
"entry": key,
"check": value["severity"],
"details": value["summary"]["message"],
}

@@ -281,36 +281,42 @@ class MonitoringInstance(object):
# This is a list of all possible faults (cluster error messages) and their corresponding details
self.cluster_faults_map = {
"dead_or_fenced_node": {
"name": "DEAD_NODE_{entry}",
"entries": get_node_daemon_states,
"conditions": ["dead", "fenced"],
"delta": 50,
"message": "Node {entry} was dead and/or fenced",
},
"ceph_osd_out": {
"name": "CEPH_OSD_OUT_{entry}",
"entries": get_osd_in_states,
"conditions": ["0"],
"delta": 50,
"message": "OSD {entry} was marked out",
},
"ceph_warn": {
"name": "CEPH_WARN_{entry}",
"entries": get_ceph_health_entries,
"conditions": ["HEALTH_WARN"],
"delta": 10,
"message": "{entry} reported by Ceph cluster",
},
"ceph_err": {
"name": "CEPH_ERR_{entry}",
"entries": get_ceph_health_entries,
"conditions": ["HEALTH_ERR"],
"delta": 50,
"message": "{entry} reported by Ceph cluster",
},
"vm_failed": {
"name": "VM_FAILED_{entry}",
"entries": get_vm_states,
"conditions": ["fail"],
"delta": 10,
"message": "VM {entry} was failed",
},
"memory_overprovisioned": {
"name": "MEMORY_OVERPROVISIONED",
"entries": get_overprovisioned_memory,
"conditions": ["overprovisioned"],
"delta": 50,

@@ -531,11 +537,12 @@ class MonitoringInstance(object):
if str(condition) == str(check):
fault_time = datetime.now()
fault_delta = fault_data["delta"]
fault_name = fault_data["name"].format(entry=entry.upper())
fault_message = fault_data["message"].format(entry=entry)
generate_fault(
self.zkhandler,
self.logger,
fault_type,
fault_name,
fault_time,
fault_delta,
fault_message,
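Each entry in `cluster_faults_map` is a template: the `name` and `message` fields are formatted with the offending entry before `generate_fault()` is called, as the loop above shows. A small illustration using the `ceph_osd_out` entry (the OSD name is made up):

```python
fault_data = {
    "name": "CEPH_OSD_OUT_{entry}",
    "delta": 50,
    "message": "OSD {entry} was marked out",
}
entry = "osd.5"  # illustrative entry value

fault_name = fault_data["name"].format(entry=entry.upper())  # CEPH_OSD_OUT_OSD.5
fault_message = fault_data["message"].format(entry=entry)    # OSD osd.5 was marked out
print(fault_name, "->", fault_message)
```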

@@ -587,7 +594,7 @@ class MonitoringInstance(object):

# Generate a cluster fault if the plugin is in a suboptimal state
if result.health_delta > 0:
fault_type = f"plugin.{self.this_node.name}.{result.plugin_name}"
fault_name = f"NODE_PLUGIN_{result.plugin_name.upper()}_{self.this_node.name.upper()}"
fault_time = datetime.now()

# Map our check results to fault results

@@ -602,11 +609,11 @@ class MonitoringInstance(object):
generate_fault(
self.zkhandler,
self.logger,
fault_type,
fault_name,
fault_time,
fault_delta,
fault_message,
fault_detail=None,
fault_details=None,
)
self.faults += 1

@@ -661,7 +668,7 @@ class MonitoringInstance(object):

self.run_plugins(coordinator_state=coordinator_state)

if coordinator_state in ["primary", "secondary", "takeover", "relinquish"]:
if coordinator_state in ["primary", "takeover"]:
self.run_faults(coordinator_state=coordinator_state)

runtime_end = datetime.now()

@@ -2,6 +2,14 @@

This directory contains several monitoring resources that can be used with various monitoring systems to track and alert on a PVC cluster system.

## Prometheus + Grafana

The included example Prometheus configuration and Grafana dashboard can be used to query the PVC API for Prometheus data and display it with a consistent dashboard.

Note that the default configuration here also includes Ceph cluster information; a Ceph dashboard can be found externally.

Note too that this does not include node export examples from individual PVC nodes; those must be set up separately.

## Munin

The included Munin plugins can be activated by linking to them from `/etc/munin/plugins/`. Two plugins are provided:

node-daemon/monitoring/prometheus/grafana-pvc-dashboard.json (new file, 2597 lines; file diff suppressed because it is too large)

node-daemon/monitoring/prometheus/prometheus.yml (new file, 8 lines)
@@ -0,0 +1,8 @@
# Other configuration omitted
scrape_configs:
  - job_name: "pvc_cluster"
    metrics_path: /api/v1/metrics
    scheme: "http"
    file_sd_configs:
      - files:
        - 'targets-pvc_cluster.json'

node-daemon/monitoring/prometheus/targets-pvc_cluster.json (new file, 11 lines)
@@ -0,0 +1,11 @@
[
  {
    "targets": [
      "pvc.upstream.floating.address.tld:7370"
    ],
    "labels": {
      "cluster": "cluster1"
    }
  }
]
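Given the scrape configuration and target list above, the same endpoint can be checked by hand; a minimal sketch using the placeholder hostname from the example target file:

```python
import requests

# Prometheus will scrape this URL; the host below is the example placeholder
resp = requests.get("http://pvc.upstream.floating.address.tld:7370/api/v1/metrics")
resp.raise_for_status()
# The body is Prometheus exposition format, one sample or comment per line
print("\n".join(resp.text.splitlines()[:10]))
```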

@@ -48,7 +48,7 @@ import re
import json

# Daemon version
version = "0.9.83"
version = "0.9.84"


##########################################################

@@ -167,6 +167,7 @@ _pvc storage pool remove --yes testing

# Remove the VM
_pvc vm stop --yes testx
sleep 5
_pvc vm remove --yes testx

_pvc provisioner profile remove --yes test

@@ -44,7 +44,7 @@ from daemon_lib.vmbuilder import (
)

# Daemon version
version = "0.9.83"
version = "0.9.84"


config = cfg.get_configuration()