Compare commits

...

9 Commits

Author SHA1 Message Date
c64e888d30 Fix incorrect cast of None 2023-12-14 16:00:53 -05:00
f1249452e5 Fix bug if no nodes are present 2023-12-14 15:32:18 -05:00
0a93f526e0 Bump version to 0.9.86 2023-12-14 14:46:29 -05:00
7c9512fb22 Fix broken config file in API migration script 2023-12-14 14:45:58 -05:00
e88b97f3a9 Print fenced state in red 2023-12-13 15:02:18 -05:00
709c9cb73e Pause pvchealthd startup until node daemon is run
If the health daemon starts too soon during a node bootup, it will
generate tons of erroneous faults while the node starts up.
Adds a conditional wait for the current node daemon to be in "run"
state before the health daemon really starts up.
2023-12-13 14:53:54 -05:00
f41c5176be Ensure health value is an int properly 2023-12-13 14:34:02 -05:00
38e43b46c3 Update health detail messages format 2023-12-13 03:17:47 -05:00
ed9c37982a Move metric collection into daemon library 2023-12-11 19:20:30 -05:00
14 changed files with 199 additions and 158 deletions

View File

@ -1 +1 @@
0.9.85 0.9.86

View File

@ -1,5 +1,14 @@
## PVC Changelog ## PVC Changelog
###### [v0.9.86](https://github.com/parallelvirtualcluster/pvc/releases/tag/v0.9.86)
* [API Daemon] Significantly improves the performance of several commands via async Zookeeper calls and removal of superfluous backend calls.
* [Docs] Improves the project README and updates screenshot images to show the current output and more functionality.
* [API Daemon/CLI] Corrects some bugs in VM metainformation output.
* [Node Daemon] Fixes resource reporting bugs from 0.9.81 and properly clears node resource numbers on a fence.
* [Health Daemon] Adds a wait during pvchealthd startup until the node is in run state, to avoid erroneous faults during node bootup.
* [API Daemon] Fixes an incorrect reference to legacy pvcapid.yaml file in migration script.
###### [v0.9.85](https://github.com/parallelvirtualcluster/pvc/releases/tag/v0.9.85) ###### [v0.9.85](https://github.com/parallelvirtualcluster/pvc/releases/tag/v0.9.85)
* [Packaging] Fixes a dependency bug introduced in 0.9.84 * [Packaging] Fixes a dependency bug introduced in 0.9.84

View File

@ -3,7 +3,7 @@
# Apply PVC database migrations # Apply PVC database migrations
# Part of the Parallel Virtual Cluster (PVC) system # Part of the Parallel Virtual Cluster (PVC) system
export PVC_CONFIG_FILE="/etc/pvc/pvcapid.yaml" export PVC_CONFIG_FILE="/etc/pvc/pvc.conf"
if [[ ! -f ${PVC_CONFIG_FILE} ]]; then if [[ ! -f ${PVC_CONFIG_FILE} ]]; then
echo "Create a configuration file at ${PVC_CONFIG_FILE} before upgrading the database." echo "Create a configuration file at ${PVC_CONFIG_FILE} before upgrading the database."

View File

@ -27,7 +27,7 @@ from distutils.util import strtobool as dustrtobool
import daemon_lib.config as cfg import daemon_lib.config as cfg
# Daemon version # Daemon version
version = "0.9.85" version = "0.9.86"
# API version # API version
API_VERSION = 1.0 API_VERSION = 1.0

View File

@ -131,154 +131,12 @@ def cluster_metrics(zkhandler):
Format status data from cluster_status into Prometheus-compatible metrics Format status data from cluster_status into Prometheus-compatible metrics
""" """
# Get general cluster information retflag, retdata = pvc_cluster.get_metrics(zkhandler)
status_retflag, status_data = pvc_cluster.get_info(zkhandler) if retflag:
if not status_retflag: retcode = 200
return "Error: Status data threw error", 400
faults_data = status_data["detail"]["faults"]
node_data = status_data["detail"]["node"]
vm_data = status_data["detail"]["vm"]
osd_data = status_data["detail"]["osd"]
output_lines = list()
output_lines.append("# HELP pvc_info PVC cluster information")
output_lines.append("# TYPE pvc_info gauge")
output_lines.append(
f"pvc_info{{primary_node=\"{status_data['primary_node']}\", version=\"{status_data['pvc_version']}\", upstream_ip=\"{status_data['upstream_ip']}\"}} 1"
)
output_lines.append("# HELP pvc_cluster_maintenance PVC cluster maintenance state")
output_lines.append("# TYPE pvc_cluster_maintenance gauge")
output_lines.append(
f"pvc_cluster_maintenance {1 if bool(strtobool(status_data['maintenance'])) else 0}"
)
output_lines.append("# HELP pvc_cluster_health PVC cluster health status")
output_lines.append("# TYPE pvc_cluster_health gauge")
output_lines.append(f"pvc_cluster_health {status_data['cluster_health']['health']}")
output_lines.append("# HELP pvc_cluster_faults PVC cluster new faults")
output_lines.append("# TYPE pvc_cluster_faults gauge")
fault_map = dict()
for fault_type in pvc_common.fault_state_combinations:
fault_map[fault_type] = 0
for fault in faults_data:
fault_map[fault["status"]] += 1
for fault_type in fault_map:
output_lines.append(
f'pvc_cluster_faults{{status="{fault_type}"}} {fault_map[fault_type]}'
)
# output_lines.append("# HELP pvc_cluster_faults PVC cluster health faults")
# output_lines.append("# TYPE pvc_cluster_faults gauge")
# for fault_msg in status_data["cluster_health"]["messages"]:
# output_lines.append(
# f"pvc_cluster_faults{{id=\"{fault_msg['id']}\", message=\"{fault_msg['text']}\"}} {fault_msg['health_delta']}"
# )
output_lines.append("# HELP pvc_node_health PVC cluster node health status")
output_lines.append("# TYPE pvc_node_health gauge")
for node in status_data["node_health"]:
if isinstance(status_data["node_health"][node]["health"], int):
output_lines.append(
f"pvc_node_health{{node=\"{node}\"}} {status_data['node_health'][node]['health']}"
)
output_lines.append("# HELP pvc_node_daemon_states PVC Node daemon state counts")
output_lines.append("# TYPE pvc_node_daemon_states gauge")
node_daemon_state_map = dict()
for state in set([s.split(",")[0] for s in pvc_common.node_state_combinations]):
node_daemon_state_map[state] = 0
for node in node_data:
node_daemon_state_map[node["daemon_state"]] += 1
for state in node_daemon_state_map:
output_lines.append(
f'pvc_node_daemon_states{{state="{state}"}} {node_daemon_state_map[state]}'
)
output_lines.append("# HELP pvc_node_domain_states PVC Node domain state counts")
output_lines.append("# TYPE pvc_node_domain_states gauge")
node_domain_state_map = dict()
for state in set([s.split(",")[1] for s in pvc_common.node_state_combinations]):
node_domain_state_map[state] = 0
for node in node_data:
node_domain_state_map[node["domain_state"]] += 1
for state in node_domain_state_map:
output_lines.append(
f'pvc_node_domain_states{{state="{state}"}} {node_domain_state_map[state]}'
)
output_lines.append("# HELP pvc_vm_states PVC VM state counts")
output_lines.append("# TYPE pvc_vm_states gauge")
vm_state_map = dict()
for state in set(pvc_common.vm_state_combinations):
vm_state_map[state] = 0
for vm in vm_data:
vm_state_map[vm["state"]] += 1
for state in vm_state_map:
output_lines.append(f'pvc_vm_states{{state="{state}"}} {vm_state_map[state]}')
output_lines.append("# HELP pvc_osd_up_states PVC OSD up state counts")
output_lines.append("# TYPE pvc_osd_up_states gauge")
osd_up_state_map = dict()
for state in set([s.split(",")[0] for s in pvc_common.ceph_osd_state_combinations]):
osd_up_state_map[state] = 0
for osd in osd_data:
if osd["up"] == "up":
osd_up_state_map["up"] += 1
else: else:
osd_up_state_map["down"] += 1 retcode = 400
for state in osd_up_state_map: return retdata, retcode
output_lines.append(
f'pvc_osd_up_states{{state="{state}"}} {osd_up_state_map[state]}'
)
output_lines.append("# HELP pvc_osd_in_states PVC OSD in state counts")
output_lines.append("# TYPE pvc_osd_in_states gauge")
osd_in_state_map = dict()
for state in set([s.split(",")[1] for s in pvc_common.ceph_osd_state_combinations]):
osd_in_state_map[state] = 0
for osd in osd_data:
if osd["in"] == "in":
osd_in_state_map["in"] += 1
else:
osd_in_state_map["out"] += 1
for state in osd_in_state_map:
output_lines.append(
f'pvc_osd_in_states{{state="{state}"}} {osd_in_state_map[state]}'
)
output_lines.append("# HELP pvc_nodes PVC Node count")
output_lines.append("# TYPE pvc_nodes gauge")
output_lines.append(f"pvc_nodes {status_data['nodes']['total']}")
output_lines.append("# HELP pvc_vms PVC VM count")
output_lines.append("# TYPE pvc_vms gauge")
output_lines.append(f"pvc_vms {status_data['vms']['total']}")
output_lines.append("# HELP pvc_osds PVC OSD count")
output_lines.append("# TYPE pvc_osds gauge")
output_lines.append(f"pvc_osds {status_data['osds']['total']}")
output_lines.append("# HELP pvc_networks PVC Network count")
output_lines.append("# TYPE pvc_networks gauge")
output_lines.append(f"pvc_networks {status_data['networks']}")
output_lines.append("# HELP pvc_pools PVC Storage Pool count")
output_lines.append("# TYPE pvc_pools gauge")
output_lines.append(f"pvc_pools {status_data['pools']}")
output_lines.append("# HELP pvc_volumes PVC Storage Volume count")
output_lines.append("# TYPE pvc_volumes gauge")
output_lines.append(f"pvc_volumes {status_data['volumes']}")
output_lines.append("# HELP pvc_snapshots PVC Storage Snapshot count")
output_lines.append("# TYPE pvc_snapshots gauge")
output_lines.append(f"pvc_snapshots {status_data['snapshots']}")
return "\n".join(output_lines) + "\n", 200
@pvc_common.Profiler(config) @pvc_common.Profiler(config)

View File

@ -249,6 +249,8 @@ def getOutputColours(node_information):
daemon_state_colour = ansiprint.yellow() daemon_state_colour = ansiprint.yellow()
elif node_information["daemon_state"] == "dead": elif node_information["daemon_state"] == "dead":
daemon_state_colour = ansiprint.red() + ansiprint.bold() daemon_state_colour = ansiprint.red() + ansiprint.bold()
elif node_information["daemon_state"] == "fenced":
daemon_state_colour = ansiprint.red()
else: else:
daemon_state_colour = ansiprint.blue() daemon_state_colour = ansiprint.blue()

View File

@ -2,7 +2,7 @@ from setuptools import setup
setup( setup(
name="pvc", name="pvc",
version="0.9.85", version="0.9.86",
packages=["pvc.cli", "pvc.lib"], packages=["pvc.cli", "pvc.lib"],
install_requires=[ install_requires=[
"Click", "Click",

View File

@ -19,6 +19,7 @@
# #
############################################################################### ###############################################################################
from distutils.util import strtobool
from json import loads from json import loads
import daemon_lib.common as common import daemon_lib.common as common
@ -240,7 +241,9 @@ def getNodeHealth(zkhandler, node_list):
node_health_messages.append(f"'{entry['name']}': {entry['message']}") node_health_messages.append(f"'{entry['name']}': {entry['message']}")
node_health_entry = { node_health_entry = {
"health": int(node_health_value), "health": int(node_health_value)
if isinstance(node_health_value, int)
else node_health_value,
"messages": node_health_messages, "messages": node_health_messages,
} }
node_health[node] = node_health_entry node_health[node] = node_health_entry
@ -434,6 +437,157 @@ def get_info(zkhandler):
return False, "ERROR: Failed to obtain cluster information!" return False, "ERROR: Failed to obtain cluster information!"
def _gauge_header(name, description):
    """
    Return the Prometheus "# HELP" and "# TYPE" comment lines for a gauge metric
    """
    return [f"# HELP {name} {description}", f"# TYPE {name} gauge"]


def _count_states(known_states, observed_states):
    """
    Count how often each state occurs, starting every known state at zero

    States that are observed but missing from known_states are counted too
    instead of raising KeyError, so one unexpected state value cannot break
    the whole metrics output. Known states keep the order they were given in;
    unexpected states follow in the order they were first observed.
    """
    counts = {state: 0 for state in known_states}
    for state in observed_states:
        counts[state] = counts.get(state, 0) + 1
    return counts


def _state_gauge(name, description, counts, label="state"):
    """
    Return the header lines plus one sample line per entry of a count mapping
    """
    lines = _gauge_header(name, description)
    lines.extend(
        f'{name}{{{label}="{state}"}} {count}' for state, count in counts.items()
    )
    return lines


def get_metrics(zkhandler):
    """
    Format status data from get_info into Prometheus-compatible metrics

    Returns a (flag, data) tuple. If get_info reports failure, the result is
    (False, <error message string>); otherwise it is (True, <metrics text>),
    where the metrics text is newline-separated and ends with a newline.
    """
    # Get general cluster information
    status_retflag, status_data = get_info(zkhandler)
    if not status_retflag:
        return False, "Error: Status data threw error"

    faults_data = status_data["detail"]["faults"]
    node_data = status_data["detail"]["node"]
    vm_data = status_data["detail"]["vm"]
    osd_data = status_data["detail"]["osd"]

    output_lines = list()

    output_lines.extend(_gauge_header("pvc_info", "PVC cluster information"))
    output_lines.append(
        f"pvc_info{{primary_node=\"{status_data['primary_node']}\", version=\"{status_data['pvc_version']}\", upstream_ip=\"{status_data['upstream_ip']}\"}} 1"
    )

    output_lines.extend(
        _gauge_header("pvc_cluster_maintenance", "PVC cluster maintenance state")
    )
    # NOTE(review): strtobool is imported from distutils, which was removed from
    # the standard library in Python 3.12 (PEP 632) - confirm the target runtime.
    output_lines.append(
        f"pvc_cluster_maintenance {1 if bool(strtobool(status_data['maintenance'])) else 0}"
    )

    output_lines.extend(
        _gauge_header("pvc_cluster_health", "PVC cluster health status")
    )
    output_lines.append(f"pvc_cluster_health {status_data['cluster_health']['health']}")

    # One sample per fault status; statuses keep the order in which
    # common.fault_state_combinations lists them.
    # NOTE: a per-fault variant of this metric (labelled by id/message from the
    # cluster_health messages) used to be sketched here as commented-out code.
    output_lines.extend(
        _state_gauge(
            "pvc_cluster_faults",
            "PVC cluster new faults",
            _count_states(
                common.fault_state_combinations,
                (fault["status"] for fault in faults_data),
            ),
            label="status",
        )
    )

    output_lines.extend(
        _gauge_header("pvc_node_health", "PVC cluster node health status")
    )
    for node, node_health in status_data["node_health"].items():
        # Only integer health values are emitted; anything else is skipped
        if isinstance(node_health["health"], int):
            output_lines.append(
                f"pvc_node_health{{node=\"{node}\"}} {node_health['health']}"
            )

    # Node state combinations are "<daemon>,<domain>" strings. The distinct
    # halves are sorted so the sample order is stable between runs (iterating
    # a set of strings directly is subject to hash randomisation).
    daemon_states = sorted({s.split(",")[0] for s in common.node_state_combinations})
    output_lines.extend(
        _state_gauge(
            "pvc_node_daemon_states",
            "PVC Node daemon state counts",
            _count_states(daemon_states, (node["daemon_state"] for node in node_data)),
        )
    )

    domain_states = sorted({s.split(",")[1] for s in common.node_state_combinations})
    output_lines.extend(
        _state_gauge(
            "pvc_node_domain_states",
            "PVC Node domain state counts",
            _count_states(domain_states, (node["domain_state"] for node in node_data)),
        )
    )

    vm_states = sorted(set(common.vm_state_combinations))
    output_lines.extend(
        _state_gauge(
            "pvc_vm_states",
            "PVC VM state counts",
            _count_states(vm_states, (vm["state"] for vm in vm_data)),
        )
    )

    # OSD state combinations are "<up|down>,<in|out>" strings. Any OSD whose
    # "up" field is not exactly "up" is counted as "down", and likewise any
    # "in" field that is not exactly "in" is counted as "out".
    osd_up_states = sorted(
        {s.split(",")[0] for s in common.ceph_osd_state_combinations}
    )
    output_lines.extend(
        _state_gauge(
            "pvc_osd_up_states",
            "PVC OSD up state counts",
            _count_states(
                osd_up_states,
                ("up" if osd["up"] == "up" else "down" for osd in osd_data),
            ),
        )
    )

    osd_in_states = sorted(
        {s.split(",")[1] for s in common.ceph_osd_state_combinations}
    )
    output_lines.extend(
        _state_gauge(
            "pvc_osd_in_states",
            "PVC OSD in state counts",
            _count_states(
                osd_in_states,
                ("in" if osd["in"] == "in" else "out" for osd in osd_data),
            ),
        )
    )

    # Plain object counts taken directly from the status data
    for name, description, value in (
        ("pvc_nodes", "PVC Node count", status_data["nodes"]["total"]),
        ("pvc_vms", "PVC VM count", status_data["vms"]["total"]),
        ("pvc_osds", "PVC OSD count", status_data["osds"]["total"]),
        ("pvc_networks", "PVC Network count", status_data["networks"]),
        ("pvc_pools", "PVC Storage Pool count", status_data["pools"]),
        ("pvc_volumes", "PVC Storage Volume count", status_data["volumes"]),
        ("pvc_snapshots", "PVC Storage Snapshot count", status_data["snapshots"]),
    ):
        output_lines.extend(_gauge_header(name, description))
        output_lines.append(f"{name} {value}")

    return True, "\n".join(output_lines) + "\n"
def cluster_initialize(zkhandler, overwrite=False): def cluster_initialize(zkhandler, overwrite=False):
# Abort if we've initialized the cluster before # Abort if we've initialized the cluster before
if zkhandler.exists("base.config.primary_node") and not overwrite: if zkhandler.exists("base.config.primary_node") and not overwrite:

View File

@ -71,7 +71,7 @@ def getNodeHealthDetails(zkhandler, node_name, node_health_plugins):
) = tuple(all_plugin_data[pos_start:pos_end]) ) = tuple(all_plugin_data[pos_start:pos_end])
plugin_output = { plugin_output = {
"name": plugin, "name": plugin,
"last_run": int(plugin_last_run), "last_run": int(plugin_last_run) if plugin_last_run is not None else None,
"health_delta": int(plugin_health_delta), "health_delta": int(plugin_health_delta),
"message": plugin_message, "message": plugin_message,
"data": json.loads(plugin_data), "data": json.loads(plugin_data),
@ -334,6 +334,8 @@ def get_list(
): ):
node_list = [] node_list = []
full_node_list = zkhandler.children("base.node") full_node_list = zkhandler.children("base.node")
if full_node_list is None:
full_node_list = list()
full_node_list.sort() full_node_list.sort()
if is_fuzzy and limit: if is_fuzzy and limit:

11
debian/changelog vendored
View File

@ -1,3 +1,14 @@
pvc (0.9.86-0) unstable; urgency=high
* [API Daemon] Significantly improves the performance of several commands via async Zookeeper calls and removal of superfluous backend calls.
* [Docs] Improves the project README and updates screenshot images to show the current output and more functionality.
* [API Daemon/CLI] Corrects some bugs in VM metainformation output.
* [Node Daemon] Fixes resource reporting bugs from 0.9.81 and properly clears node resource numbers on a fence.
* [Health Daemon] Adds a wait during pvchealthd startup until the node is in run state, to avoid erroneous faults during node bootup.
* [API Daemon] Fixes an incorrect reference to legacy pvcapid.yaml file in migration script.
-- Joshua M. Boniface <joshua@boniface.me> Thu, 14 Dec 2023 14:46:29 -0500
pvc (0.9.85-0) unstable; urgency=high pvc (0.9.85-0) unstable; urgency=high
* [Packaging] Fixes a dependency bug introduced in 0.9.84 * [Packaging] Fixes a dependency bug introduced in 0.9.84

View File

@ -33,7 +33,7 @@ import os
import signal import signal
# Daemon version # Daemon version
version = "0.9.85" version = "0.9.86"
########################################################## ##########################################################
@ -80,6 +80,11 @@ def entrypoint():
# Connect to Zookeeper and return our handler and current schema version # Connect to Zookeeper and return our handler and current schema version
zkhandler, _ = pvchealthd.util.zookeeper.connect(logger, config) zkhandler, _ = pvchealthd.util.zookeeper.connect(logger, config)
logger.out("Waiting for node daemon to be operating", state="s")
while zkhandler.read(("node.state.daemon", config["node_hostname"])) != "run":
sleep(5)
logger.out("Node daemon in run state, continuing health daemon startup", state="s")
# Define a cleanup function # Define a cleanup function
def cleanup(failure=False): def cleanup(failure=False):
nonlocal logger, zkhandler, monitoring_instance nonlocal logger, zkhandler, monitoring_instance

View File

@ -70,7 +70,7 @@ def check_pvc(item, params, section):
summary = f"Cluster health is {cluster_health}% (maintenance {maintenance})" summary = f"Cluster health is {cluster_health}% (maintenance {maintenance})"
if len(cluster_messages) > 0: if len(cluster_messages) > 0:
details = ", ".join(cluster_messages) details = ", ".join([m["text"] for m in cluster_messages])
if cluster_health <= 50 and maintenance == "off": if cluster_health <= 50 and maintenance == "off":
state = State.CRIT state = State.CRIT

View File

@ -48,7 +48,7 @@ import re
import json import json
# Daemon version # Daemon version
version = "0.9.85" version = "0.9.86"
########################################################## ##########################################################

View File

@ -44,7 +44,7 @@ from daemon_lib.vmbuilder import (
) )
# Daemon version # Daemon version
version = "0.9.85" version = "0.9.86"
config = cfg.get_configuration() config = cfg.get_configuration()