Compare commits: 746416b8ed...v0.9.86 (24 commits)
| Author | SHA1 | Date |
| --- | --- | --- |
|  | c64e888d30 |  |
|  | f1249452e5 |  |
|  | 0a93f526e0 |  |
|  | 7c9512fb22 |  |
|  | e88b97f3a9 |  |
|  | 709c9cb73e |  |
|  | f41c5176be |  |
|  | 38e43b46c3 |  |
|  | ed9c37982a |  |
|  | 0f24184b78 |  |
|  | 1ba37fe33d |  |
|  | 1a05077b10 |  |
|  | 57c28376a6 |  |
|  | e781d742e6 |  |
|  | 6c6d1508a1 |  |
|  | 741dafb26b |  |
|  | 032d3ebf18 |  |
|  | 5d9e83e8ed |  |
|  | ad0bd8649f |  |
|  | 9b5e53e4b6 |  |
|  | 9617660342 |  |
|  | ab0a1e0946 |  |
|  | 7c116b2fbc |  |
|  | 1023c55087 |  |
@@ -1,5 +1,14 @@
 ## PVC Changelog
 
+###### [v0.9.86](https://github.com/parallelvirtualcluster/pvc/releases/tag/v0.9.86)
+
+* [API Daemon] Significantly improves the performance of several commands via async Zookeeper calls and removal of superfluous backend calls.
+* [Docs] Improves the project README and updates screenshot images to show the current output and more functionality.
+* [API Daemon/CLI] Corrects some bugs in VM metainformation output.
+* [Node Daemon] Fixes resource reporting bugs from 0.9.81 and properly clears node resource numbers on a fence.
+* [Health Daemon] Adds a wait during pvchealthd startup until the node is in run state, to avoid erroneous faults during node bootup.
+* [API Daemon] Fixes an incorrect reference to legacy pvcapid.yaml file in migration script.
+
 ###### [v0.9.85](https://github.com/parallelvirtualcluster/pvc/releases/tag/v0.9.85)
 
 * [Packaging] Fixes a dependency bug introduced in 0.9.84
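The first changelog bullet refers to batching Zookeeper reads: several hunks below replace runs of one-key `zkhandler.read()` calls with a single `zkhandler.read_many()` call. A minimal sketch of the underlying pattern, assuming kazoo (the Zookeeper client PVC's ZKHandler appears to build on) and hypothetical paths, not the real PVC schema keys:

```python
# Illustrative sketch only: batch N reads by firing them all asynchronously,
# then collecting the results in request order.
from kazoo.client import KazooClient


def read_many(zk, paths):
    # get_async() returns an IAsyncResult immediately without blocking;
    # .get() then blocks until that read completes and yields (data, stat).
    # Total latency is roughly one round of waiting instead of one full
    # round-trip per key.
    futures = [zk.get_async(path) for path in paths]
    return [future.get()[0] for future in futures]


zk = KazooClient(hosts="localhost:2181")  # assumed local test instance
zk.start()
# Hypothetical demo paths, created here so the reads succeed.
zk.ensure_path("/demo/state")
zk.set("/demo/state", b"start")
zk.ensure_path("/demo/node")
zk.set("/demo/node", b"hv1")
state, node = read_many(zk, ["/demo/state", "/demo/node"])
zk.stop()
```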
@@ -52,7 +52,7 @@ These screenshots show some of the available functionality of the PVC system and
 </p>
 
 <p><img alt="4. VM information" src="images/4-vm-information.png"/><br/>
-<i>PVC can show details about the VMs in the cluster, including their state, resource allocations</i>
+<i>PVC can show details about the VMs in the cluster, including their state, resource allocations, current hosting node, and metadata.</i>
 </p>
 
 <p><img alt="5. VM details" src="images/5-vm-details.png"/><br/>
@@ -78,3 +78,7 @@ These screenshots show some of the available functionality of the PVC system and
 <p><img alt="10. Provisioner" src="images/10-provisioner.png"/><br/>
 <i>PVC features an extensively customizable and configurable VM provisioner system, including EC2-compatible CloudInit support, allowing you to define flexible VM profiles and provision new VMs with a single command.</i>
 </p>
+
+<p><img alt="11. Prometheus and Grafana dashboard" src="images/11-prometheus-grafana.png"/><br/>
+<i>PVC features several monitoring integration examples under "node-daemon/monitoring", including CheckMK, Munin, and, most recently, Prometheus, including an example Grafana dashboard for cluster monitoring and alerting.</i>
+</p>
@@ -3,7 +3,7 @@
 # Apply PVC database migrations
 # Part of the Parallel Virtual Cluster (PVC) system
 
-export PVC_CONFIG_FILE="/etc/pvc/pvcapid.yaml"
+export PVC_CONFIG_FILE="/etc/pvc/pvc.conf"
 
 if [[ ! -f ${PVC_CONFIG_FILE} ]]; then
     echo "Create a configuration file at ${PVC_CONFIG_FILE} before upgrading the database."
@@ -27,7 +27,7 @@ from distutils.util import strtobool as dustrtobool
 import daemon_lib.config as cfg
 
 # Daemon version
-version = "0.9.85"
+version = "0.9.86"
 
 # API version
 API_VERSION = 1.0
@@ -131,154 +131,12 @@ def cluster_metrics(zkhandler):
     Format status data from cluster_status into Prometheus-compatible metrics
     """
 
-    # Get general cluster information
-    status_retflag, status_data = pvc_cluster.get_info(zkhandler)
-    if not status_retflag:
-        return "Error: Status data threw error", 400
-
-    faults_data = status_data["detail"]["faults"]
-    node_data = status_data["detail"]["node"]
-    vm_data = status_data["detail"]["vm"]
-    osd_data = status_data["detail"]["osd"]
-
-    output_lines = list()
-
-    output_lines.append("# HELP pvc_info PVC cluster information")
-    output_lines.append("# TYPE pvc_info gauge")
-    output_lines.append(
-        f"pvc_info{{primary_node=\"{status_data['primary_node']}\", version=\"{status_data['pvc_version']}\", upstream_ip=\"{status_data['upstream_ip']}\"}} 1"
-    )
-
-    output_lines.append("# HELP pvc_cluster_maintenance PVC cluster maintenance state")
-    output_lines.append("# TYPE pvc_cluster_maintenance gauge")
-    output_lines.append(
-        f"pvc_cluster_maintenance {1 if bool(strtobool(status_data['maintenance'])) else 0}"
-    )
-
-    output_lines.append("# HELP pvc_cluster_health PVC cluster health status")
-    output_lines.append("# TYPE pvc_cluster_health gauge")
-    output_lines.append(f"pvc_cluster_health {status_data['cluster_health']['health']}")
-
-    output_lines.append("# HELP pvc_cluster_faults PVC cluster new faults")
-    output_lines.append("# TYPE pvc_cluster_faults gauge")
-    fault_map = dict()
-    for fault_type in pvc_common.fault_state_combinations:
-        fault_map[fault_type] = 0
-    for fault in faults_data:
-        fault_map[fault["status"]] += 1
-    for fault_type in fault_map:
-        output_lines.append(
-            f'pvc_cluster_faults{{status="{fault_type}"}} {fault_map[fault_type]}'
-        )
-
-    # output_lines.append("# HELP pvc_cluster_faults PVC cluster health faults")
-    # output_lines.append("# TYPE pvc_cluster_faults gauge")
-    # for fault_msg in status_data["cluster_health"]["messages"]:
-    #     output_lines.append(
-    #         f"pvc_cluster_faults{{id=\"{fault_msg['id']}\", message=\"{fault_msg['text']}\"}} {fault_msg['health_delta']}"
-    #     )
-
-    output_lines.append("# HELP pvc_node_health PVC cluster node health status")
-    output_lines.append("# TYPE pvc_node_health gauge")
-    for node in status_data["node_health"]:
-        if isinstance(status_data["node_health"][node]["health"], int):
-            output_lines.append(
-                f"pvc_node_health{{node=\"{node}\"}} {status_data['node_health'][node]['health']}"
-            )
-
-    output_lines.append("# HELP pvc_node_daemon_states PVC Node daemon state counts")
-    output_lines.append("# TYPE pvc_node_daemon_states gauge")
-    node_daemon_state_map = dict()
-    for state in set([s.split(",")[0] for s in pvc_common.node_state_combinations]):
-        node_daemon_state_map[state] = 0
-    for node in node_data:
-        node_daemon_state_map[node["daemon_state"]] += 1
-    for state in node_daemon_state_map:
-        output_lines.append(
-            f'pvc_node_daemon_states{{state="{state}"}} {node_daemon_state_map[state]}'
-        )
-
-    output_lines.append("# HELP pvc_node_domain_states PVC Node domain state counts")
-    output_lines.append("# TYPE pvc_node_domain_states gauge")
-    node_domain_state_map = dict()
-    for state in set([s.split(",")[1] for s in pvc_common.node_state_combinations]):
-        node_domain_state_map[state] = 0
-    for node in node_data:
-        node_domain_state_map[node["domain_state"]] += 1
-    for state in node_domain_state_map:
-        output_lines.append(
-            f'pvc_node_domain_states{{state="{state}"}} {node_domain_state_map[state]}'
-        )
-
-    output_lines.append("# HELP pvc_vm_states PVC VM state counts")
-    output_lines.append("# TYPE pvc_vm_states gauge")
-    vm_state_map = dict()
-    for state in set(pvc_common.vm_state_combinations):
-        vm_state_map[state] = 0
-    for vm in vm_data:
-        vm_state_map[vm["state"]] += 1
-    for state in vm_state_map:
-        output_lines.append(f'pvc_vm_states{{state="{state}"}} {vm_state_map[state]}')
-
-    output_lines.append("# HELP pvc_osd_up_states PVC OSD up state counts")
-    output_lines.append("# TYPE pvc_osd_up_states gauge")
-    osd_up_state_map = dict()
-    for state in set([s.split(",")[0] for s in pvc_common.ceph_osd_state_combinations]):
-        osd_up_state_map[state] = 0
-    for osd in osd_data:
-        if osd["up"] == "up":
-            osd_up_state_map["up"] += 1
-        else:
-            osd_up_state_map["down"] += 1
-    for state in osd_up_state_map:
-        output_lines.append(
-            f'pvc_osd_up_states{{state="{state}"}} {osd_up_state_map[state]}'
-        )
-
-    output_lines.append("# HELP pvc_osd_in_states PVC OSD in state counts")
-    output_lines.append("# TYPE pvc_osd_in_states gauge")
-    osd_in_state_map = dict()
-    for state in set([s.split(",")[1] for s in pvc_common.ceph_osd_state_combinations]):
-        osd_in_state_map[state] = 0
-    for osd in osd_data:
-        if osd["in"] == "in":
-            osd_in_state_map["in"] += 1
-        else:
-            osd_in_state_map["out"] += 1
-    for state in osd_in_state_map:
-        output_lines.append(
-            f'pvc_osd_in_states{{state="{state}"}} {osd_in_state_map[state]}'
-        )
-
-    output_lines.append("# HELP pvc_nodes PVC Node count")
-    output_lines.append("# TYPE pvc_nodes gauge")
-    output_lines.append(f"pvc_nodes {status_data['nodes']['total']}")
-
-    output_lines.append("# HELP pvc_vms PVC VM count")
-    output_lines.append("# TYPE pvc_vms gauge")
-    output_lines.append(f"pvc_vms {status_data['vms']['total']}")
-
-    output_lines.append("# HELP pvc_osds PVC OSD count")
-    output_lines.append("# TYPE pvc_osds gauge")
-    output_lines.append(f"pvc_osds {status_data['osds']['total']}")
-
-    output_lines.append("# HELP pvc_networks PVC Network count")
-    output_lines.append("# TYPE pvc_networks gauge")
-    output_lines.append(f"pvc_networks {status_data['networks']}")
-
-    output_lines.append("# HELP pvc_pools PVC Storage Pool count")
-    output_lines.append("# TYPE pvc_pools gauge")
-    output_lines.append(f"pvc_pools {status_data['pools']}")
-
-    output_lines.append("# HELP pvc_volumes PVC Storage Volume count")
-    output_lines.append("# TYPE pvc_volumes gauge")
-    output_lines.append(f"pvc_volumes {status_data['volumes']}")
-
-    output_lines.append("# HELP pvc_snapshots PVC Storage Snapshot count")
-    output_lines.append("# TYPE pvc_snapshots gauge")
-    output_lines.append(f"pvc_snapshots {status_data['snapshots']}")
-
-    return "\n".join(output_lines) + "\n", 200
+    retflag, retdata = pvc_cluster.get_metrics(zkhandler)
+    if retflag:
+        retcode = 200
+    else:
+        retcode = 400
+    return retdata, retcode
 
 
 @pvc_common.Profiler(config)
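With the generator moved into the daemon library (see the matching `get_metrics()` addition later in this changeset), the API handler above reduces to a thin wrapper. A hedged sketch of consuming the endpoint; the host, port, and path here are assumptions, so check your deployment's API address and the route this handler is actually bound to:

```python
# Illustrative only: scrape the PVC Prometheus metrics endpoint and print
# the first few exposition lines. URL components are hypothetical.
import requests

resp = requests.get("http://pvc.local:7370/api/v1/metrics", timeout=5)
resp.raise_for_status()
for line in resp.text.splitlines()[:3]:
    print(line)  # e.g. "# HELP pvc_info PVC cluster information"
```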
@@ -249,6 +249,8 @@ def getOutputColours(node_information):
         daemon_state_colour = ansiprint.yellow()
     elif node_information["daemon_state"] == "dead":
         daemon_state_colour = ansiprint.red() + ansiprint.bold()
+    elif node_information["daemon_state"] == "fenced":
+        daemon_state_colour = ansiprint.red()
     else:
         daemon_state_colour = ansiprint.blue()
 
@@ -1659,24 +1659,26 @@ def format_info(config, domain_information, long_output):
     )
 
     if not domain_information.get("node_selector"):
-        formatted_node_selector = "False"
+        formatted_node_selector = "Default"
     else:
-        formatted_node_selector = domain_information["node_selector"]
+        formatted_node_selector = str(domain_information["node_selector"]).title()
 
     if not domain_information.get("node_limit"):
-        formatted_node_limit = "False"
+        formatted_node_limit = "Any"
     else:
         formatted_node_limit = ", ".join(domain_information["node_limit"])
 
     if not domain_information.get("node_autostart"):
+        autostart_colour = ansiprint.blue()
         formatted_node_autostart = "False"
     else:
-        formatted_node_autostart = domain_information["node_autostart"]
+        autostart_colour = ansiprint.green()
+        formatted_node_autostart = "True"
 
     if not domain_information.get("migration_method"):
-        formatted_migration_method = "any"
+        formatted_migration_method = "Any"
     else:
-        formatted_migration_method = domain_information["migration_method"]
+        formatted_migration_method = str(domain_information["migration_method"]).title()
 
     ainformation.append(
         "{}Migration selector:{} {}".format(
@@ -1689,8 +1691,12 @@ def format_info(config, domain_information, long_output):
         )
     )
     ainformation.append(
-        "{}Autostart:{} {}".format(
-            ansiprint.purple(), ansiprint.end(), formatted_node_autostart
+        "{}Autostart:{} {}{}{}".format(
+            ansiprint.purple(),
+            ansiprint.end(),
+            autostart_colour,
+            formatted_node_autostart,
+            ansiprint.end(),
         )
     )
     ainformation.append(
@@ -1736,13 +1742,17 @@ def format_info(config, domain_information, long_output):
             domain_information["tags"], key=lambda t: t["type"] + t["name"]
         ):
             ainformation.append(
-                " {tags_name: <{tags_name_length}} {tags_type: <{tags_type_length}} {tags_protected: <{tags_protected_length}}".format(
+                " {tags_name: <{tags_name_length}} {tags_type: <{tags_type_length}} {tags_protected_colour}{tags_protected: <{tags_protected_length}}{end}".format(
                     tags_name_length=tags_name_length,
                     tags_type_length=tags_type_length,
                     tags_protected_length=tags_protected_length,
                     tags_name=tag["name"],
                     tags_type=tag["type"],
                     tags_protected=str(tag["protected"]),
+                    tags_protected_colour=ansiprint.green()
+                    if tag["protected"]
+                    else ansiprint.blue(),
+                    end=ansiprint.end(),
                 )
             )
     else:
@@ -2,7 +2,7 @@ from setuptools import setup
 
 setup(
     name="pvc",
-    version="0.9.85",
+    version="0.9.86",
     packages=["pvc.cli", "pvc.lib"],
     install_requires=[
         "Click",
@@ -320,13 +320,18 @@ def get_list_osd(zkhandler, limit=None, is_fuzzy=True):
 #
 def getPoolInformation(zkhandler, pool):
     # Parse the stats data
-    pool_stats_raw = zkhandler.read(("pool.stats", pool))
+    (pool_stats_raw, tier, pgs,) = zkhandler.read_many(
+        [
+            ("pool.stats", pool),
+            ("pool.tier", pool),
+            ("pool.pgs", pool),
+        ]
+    )
+
     pool_stats = dict(json.loads(pool_stats_raw))
     volume_count = len(getCephVolumes(zkhandler, pool))
-    tier = zkhandler.read(("pool.tier", pool))
     if tier is None:
         tier = "default"
-    pgs = zkhandler.read(("pool.pgs", pool))
 
     pool_information = {
         "name": pool,
@@ -19,6 +19,7 @@
 #
 ###############################################################################
 
 from distutils.util import strtobool
+from json import loads
 
 import daemon_lib.common as common
@@ -240,7 +241,9 @@ def getNodeHealth(zkhandler, node_list):
             node_health_messages.append(f"'{entry['name']}': {entry['message']}")
 
         node_health_entry = {
-            "health": node_health_value,
+            "health": int(node_health_value)
+            if isinstance(node_health_value, int)
+            else node_health_value,
             "messages": node_health_messages,
         }
         node_health[node] = node_health_entry
@@ -315,8 +318,8 @@
         for vidx, vm in enumerate(vm_list):
             # Split the large list of return values by the IDX of this VM
             # Each VM result is 2 field long
-            pos_start = nidx * 2
-            pos_end = nidx * 2 + 2
+            pos_start = vidx * 2
+            pos_end = vidx * 2 + 2
             vm_name, vm_state = tuple(all_vm_states[pos_start:pos_end])
             vm_data.append(
                 {
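The fix above corrects an indexing bug: the batched read returns a flat list in request order with two fields per VM, so the slice offsets must come from the VM's own loop index (`vidx`), not the node loop's `nidx`. A self-contained illustration of the pairwise-slice pattern, with made-up values:

```python
# Hypothetical flat result of a batched read: name then state, per VM, in order.
all_vm_states = ["vm1", "start", "vm2", "stop", "vm3", "migrate"]
vm_list = ["vm1", "vm2", "vm3"]

for vidx, vm in enumerate(vm_list):
    # Each VM result is 2 fields long, so VM i owns slice [2*i : 2*i + 2];
    # indexing with a stale outer-loop counter pairs names and states wrongly.
    vm_name, vm_state = all_vm_states[vidx * 2 : vidx * 2 + 2]
    print(vm_name, vm_state)  # vm1 start / vm2 stop / vm3 migrate
```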
@@ -379,11 +382,21 @@
     ceph_pool_count = len(ceph_pool_list)
 
     # Get the list of Ceph volumes
-    ceph_volume_list = zkhandler.children("base.volume")
+    ceph_volume_list = list()
+    for pool in ceph_pool_list:
+        ceph_volume_list_pool = zkhandler.children(("volume", pool))
+        if ceph_volume_list_pool is not None:
+            ceph_volume_list += [f"{pool}/{volume}" for volume in ceph_volume_list_pool]
     ceph_volume_count = len(ceph_volume_list)
 
     # Get the list of Ceph snapshots
-    ceph_snapshot_list = zkhandler.children("base.snapshot")
+    ceph_snapshot_list = list()
+    for volume in ceph_volume_list:
+        ceph_snapshot_list_volume = zkhandler.children(("snapshot", volume))
+        if ceph_snapshot_list_volume is not None:
+            ceph_snapshot_list += [
+                f"{volume}@{snapshot}" for snapshot in ceph_snapshot_list_volume
+            ]
     ceph_snapshot_count = len(ceph_snapshot_list)
 
     # Get the list of faults
@@ -424,6 +437,157 @@ def get_info(zkhandler):
         return False, "ERROR: Failed to obtain cluster information!"
 
 
+def get_metrics(zkhandler):
+    # Get general cluster information
+    status_retflag, status_data = get_info(zkhandler)
+    if not status_retflag:
+        return False, "Error: Status data threw error"
+
+    faults_data = status_data["detail"]["faults"]
+    node_data = status_data["detail"]["node"]
+    vm_data = status_data["detail"]["vm"]
+    osd_data = status_data["detail"]["osd"]
+
+    output_lines = list()
+
+    output_lines.append("# HELP pvc_info PVC cluster information")
+    output_lines.append("# TYPE pvc_info gauge")
+    output_lines.append(
+        f"pvc_info{{primary_node=\"{status_data['primary_node']}\", version=\"{status_data['pvc_version']}\", upstream_ip=\"{status_data['upstream_ip']}\"}} 1"
+    )
+
+    output_lines.append("# HELP pvc_cluster_maintenance PVC cluster maintenance state")
+    output_lines.append("# TYPE pvc_cluster_maintenance gauge")
+    output_lines.append(
+        f"pvc_cluster_maintenance {1 if bool(strtobool(status_data['maintenance'])) else 0}"
+    )
+
+    output_lines.append("# HELP pvc_cluster_health PVC cluster health status")
+    output_lines.append("# TYPE pvc_cluster_health gauge")
+    output_lines.append(f"pvc_cluster_health {status_data['cluster_health']['health']}")
+
+    output_lines.append("# HELP pvc_cluster_faults PVC cluster new faults")
+    output_lines.append("# TYPE pvc_cluster_faults gauge")
+    fault_map = dict()
+    for fault_type in common.fault_state_combinations:
+        fault_map[fault_type] = 0
+    for fault in faults_data:
+        fault_map[fault["status"]] += 1
+    for fault_type in fault_map:
+        output_lines.append(
+            f'pvc_cluster_faults{{status="{fault_type}"}} {fault_map[fault_type]}'
+        )
+
+    # output_lines.append("# HELP pvc_cluster_faults PVC cluster health faults")
+    # output_lines.append("# TYPE pvc_cluster_faults gauge")
+    # for fault_msg in status_data["cluster_health"]["messages"]:
+    #     output_lines.append(
+    #         f"pvc_cluster_faults{{id=\"{fault_msg['id']}\", message=\"{fault_msg['text']}\"}} {fault_msg['health_delta']}"
+    #     )
+
+    output_lines.append("# HELP pvc_node_health PVC cluster node health status")
+    output_lines.append("# TYPE pvc_node_health gauge")
+    for node in status_data["node_health"]:
+        if isinstance(status_data["node_health"][node]["health"], int):
+            output_lines.append(
+                f"pvc_node_health{{node=\"{node}\"}} {status_data['node_health'][node]['health']}"
+            )
+
+    output_lines.append("# HELP pvc_node_daemon_states PVC Node daemon state counts")
+    output_lines.append("# TYPE pvc_node_daemon_states gauge")
+    node_daemon_state_map = dict()
+    for state in set([s.split(",")[0] for s in common.node_state_combinations]):
+        node_daemon_state_map[state] = 0
+    for node in node_data:
+        node_daemon_state_map[node["daemon_state"]] += 1
+    for state in node_daemon_state_map:
+        output_lines.append(
+            f'pvc_node_daemon_states{{state="{state}"}} {node_daemon_state_map[state]}'
+        )
+
+    output_lines.append("# HELP pvc_node_domain_states PVC Node domain state counts")
+    output_lines.append("# TYPE pvc_node_domain_states gauge")
+    node_domain_state_map = dict()
+    for state in set([s.split(",")[1] for s in common.node_state_combinations]):
+        node_domain_state_map[state] = 0
+    for node in node_data:
+        node_domain_state_map[node["domain_state"]] += 1
+    for state in node_domain_state_map:
+        output_lines.append(
+            f'pvc_node_domain_states{{state="{state}"}} {node_domain_state_map[state]}'
+        )
+
+    output_lines.append("# HELP pvc_vm_states PVC VM state counts")
+    output_lines.append("# TYPE pvc_vm_states gauge")
+    vm_state_map = dict()
+    for state in set(common.vm_state_combinations):
+        vm_state_map[state] = 0
+    for vm in vm_data:
+        vm_state_map[vm["state"]] += 1
+    for state in vm_state_map:
+        output_lines.append(f'pvc_vm_states{{state="{state}"}} {vm_state_map[state]}')
+
+    output_lines.append("# HELP pvc_osd_up_states PVC OSD up state counts")
+    output_lines.append("# TYPE pvc_osd_up_states gauge")
+    osd_up_state_map = dict()
+    for state in set([s.split(",")[0] for s in common.ceph_osd_state_combinations]):
+        osd_up_state_map[state] = 0
+    for osd in osd_data:
+        if osd["up"] == "up":
+            osd_up_state_map["up"] += 1
+        else:
+            osd_up_state_map["down"] += 1
+    for state in osd_up_state_map:
+        output_lines.append(
+            f'pvc_osd_up_states{{state="{state}"}} {osd_up_state_map[state]}'
+        )
+
+    output_lines.append("# HELP pvc_osd_in_states PVC OSD in state counts")
+    output_lines.append("# TYPE pvc_osd_in_states gauge")
+    osd_in_state_map = dict()
+    for state in set([s.split(",")[1] for s in common.ceph_osd_state_combinations]):
+        osd_in_state_map[state] = 0
+    for osd in osd_data:
+        if osd["in"] == "in":
+            osd_in_state_map["in"] += 1
+        else:
+            osd_in_state_map["out"] += 1
+    for state in osd_in_state_map:
+        output_lines.append(
+            f'pvc_osd_in_states{{state="{state}"}} {osd_in_state_map[state]}'
+        )
+
+    output_lines.append("# HELP pvc_nodes PVC Node count")
+    output_lines.append("# TYPE pvc_nodes gauge")
+    output_lines.append(f"pvc_nodes {status_data['nodes']['total']}")
+
+    output_lines.append("# HELP pvc_vms PVC VM count")
+    output_lines.append("# TYPE pvc_vms gauge")
+    output_lines.append(f"pvc_vms {status_data['vms']['total']}")
+
+    output_lines.append("# HELP pvc_osds PVC OSD count")
+    output_lines.append("# TYPE pvc_osds gauge")
+    output_lines.append(f"pvc_osds {status_data['osds']['total']}")
+
+    output_lines.append("# HELP pvc_networks PVC Network count")
+    output_lines.append("# TYPE pvc_networks gauge")
+    output_lines.append(f"pvc_networks {status_data['networks']}")
+
+    output_lines.append("# HELP pvc_pools PVC Storage Pool count")
+    output_lines.append("# TYPE pvc_pools gauge")
+    output_lines.append(f"pvc_pools {status_data['pools']}")
+
+    output_lines.append("# HELP pvc_volumes PVC Storage Volume count")
+    output_lines.append("# TYPE pvc_volumes gauge")
+    output_lines.append(f"pvc_volumes {status_data['volumes']}")
+
+    output_lines.append("# HELP pvc_snapshots PVC Storage Snapshot count")
+    output_lines.append("# TYPE pvc_snapshots gauge")
+    output_lines.append(f"pvc_snapshots {status_data['snapshots']}")
+
+    return True, "\n".join(output_lines) + "\n"
+
+
 def cluster_initialize(zkhandler, overwrite=False):
     # Abort if we've initialized the cluster before
     if zkhandler.exists("base.config.primary_node") and not overwrite:
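For reference, the generator above emits the standard Prometheus text exposition format. A hedged sample of what a scrape might return; the metric names and HELP/TYPE lines come from the code, but every label value and number below is invented for illustration:

```
# HELP pvc_info PVC cluster information
# TYPE pvc_info gauge
pvc_info{primary_node="hv1", version="0.9.86", upstream_ip="10.0.0.254"} 1
# HELP pvc_cluster_maintenance PVC cluster maintenance state
# TYPE pvc_cluster_maintenance gauge
pvc_cluster_maintenance 0
# HELP pvc_cluster_health PVC cluster health status
# TYPE pvc_cluster_health gauge
pvc_cluster_health 100
# HELP pvc_vm_states PVC VM state counts
# TYPE pvc_vm_states gauge
pvc_vm_states{state="start"} 10
```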
@@ -401,13 +401,23 @@ def getDomainTags(zkhandler, dom_uuid):
     """
     tags = list()
 
-    for tag in zkhandler.children(("domain.meta.tags", dom_uuid)):
-        tag_type = zkhandler.read(("domain.meta.tags", dom_uuid, "tag.type", tag))
-        protected = bool(
-            strtobool(
-                zkhandler.read(("domain.meta.tags", dom_uuid, "tag.protected", tag))
-            )
-        )
+    all_tags = zkhandler.children(("domain.meta.tags", dom_uuid))
+
+    tag_reads = list()
+    for tag in all_tags:
+        tag_reads += [
+            ("domain.meta.tags", dom_uuid, "tag.type", tag),
+            ("domain.meta.tags", dom_uuid, "tag.protected", tag),
+        ]
+    all_tag_data = zkhandler.read_many(tag_reads)
+
+    for tidx, tag in enumerate(all_tags):
+        # Split the large list of return values by the IDX of this tag
+        # Each tag result is 2 fields long
+        pos_start = tidx * 2
+        pos_end = tidx * 2 + 2
+        tag_type, protected = tuple(all_tag_data[pos_start:pos_end])
+        protected = bool(strtobool(protected))
         tags.append({"name": tag, "type": tag_type, "protected": protected})
 
     return tags
@@ -422,19 +432,34 @@
 
     The UUID must be validated before calling this function!
     """
-    domain_node_limit = zkhandler.read(("domain.meta.node_limit", dom_uuid))
-    domain_node_selector = zkhandler.read(("domain.meta.node_selector", dom_uuid))
-    domain_node_autostart = zkhandler.read(("domain.meta.autostart", dom_uuid))
-    domain_migration_method = zkhandler.read(("domain.meta.migrate_method", dom_uuid))
+    (
+        domain_node_limit,
+        domain_node_selector,
+        domain_node_autostart,
+        domain_migration_method,
+    ) = zkhandler.read_many(
+        [
+            ("domain.meta.node_limit", dom_uuid),
+            ("domain.meta.node_selector", dom_uuid),
+            ("domain.meta.autostart", dom_uuid),
+            ("domain.meta.migrate_method", dom_uuid),
+        ]
+    )
 
     if not domain_node_limit:
         domain_node_limit = None
     else:
         domain_node_limit = domain_node_limit.split(",")
 
     if not domain_node_selector or domain_node_selector == "none":
         domain_node_selector = None
 
     if not domain_node_autostart:
         domain_node_autostart = None
 
     if not domain_migration_method or domain_migration_method == "none":
         domain_migration_method = None
 
     return (
         domain_node_limit,
         domain_node_selector,
@@ -451,10 +476,25 @@
     Gather information about a VM from the Libvirt XML configuration in the Zookeper database
     and return a dict() containing it.
     """
-    domain_state = zkhandler.read(("domain.state", uuid))
-    domain_node = zkhandler.read(("domain.node", uuid))
-    domain_lastnode = zkhandler.read(("domain.last_node", uuid))
-    domain_failedreason = zkhandler.read(("domain.failed_reason", uuid))
+    (
+        domain_state,
+        domain_node,
+        domain_lastnode,
+        domain_failedreason,
+        domain_profile,
+        domain_vnc,
+        stats_data,
+    ) = zkhandler.read_many(
+        [
+            ("domain.state", uuid),
+            ("domain.node", uuid),
+            ("domain.last_node", uuid),
+            ("domain.failed_reason", uuid),
+            ("domain.profile", uuid),
+            ("domain.console.vnc", uuid),
+            ("domain.stats", uuid),
+        ]
+    )
 
     (
         domain_node_limit,
@@ -462,19 +502,17 @@
         domain_node_autostart,
         domain_migration_method,
     ) = getDomainMetadata(zkhandler, uuid)
-    domain_tags = getDomainTags(zkhandler, uuid)
-    domain_profile = zkhandler.read(("domain.profile", uuid))
 
-    domain_vnc = zkhandler.read(("domain.console.vnc", uuid))
+    domain_tags = getDomainTags(zkhandler, uuid)
+
     if domain_vnc:
         domain_vnc_listen, domain_vnc_port = domain_vnc.split(":")
     else:
-        domain_vnc_listen = "None"
-        domain_vnc_port = "None"
+        domain_vnc_listen = None
+        domain_vnc_port = None
 
     parsed_xml = getDomainXML(zkhandler, uuid)
 
-    stats_data = zkhandler.read(("domain.stats", uuid))
     if stats_data is not None:
         try:
             stats_data = loads(stats_data)
@@ -491,6 +529,7 @@
         domain_vcpu,
         domain_vcputopo,
     ) = getDomainMainDetails(parsed_xml)
 
+    domain_networks = getDomainNetworks(parsed_xml, stats_data)
 
     (
@@ -71,7 +71,7 @@ def getNodeHealthDetails(zkhandler, node_name, node_health_plugins):
     ) = tuple(all_plugin_data[pos_start:pos_end])
     plugin_output = {
         "name": plugin,
-        "last_run": int(plugin_last_run),
+        "last_run": int(plugin_last_run) if plugin_last_run is not None else None,
         "health_delta": int(plugin_health_delta),
         "message": plugin_message,
        "data": json.loads(plugin_data),
@@ -334,6 +334,8 @@ def get_list(
 ):
     node_list = []
     full_node_list = zkhandler.children("base.node")
+    if full_node_list is None:
+        full_node_list = list()
     full_node_list.sort()
 
     if is_fuzzy and limit:
debian/changelog (vendored; +11 lines)

@@ -1,3 +1,14 @@
+pvc (0.9.86-0) unstable; urgency=high
+
+  * [API Daemon] Significantly improves the performance of several commands via async Zookeeper calls and removal of superfluous backend calls.
+  * [Docs] Improves the project README and updates screenshot images to show the current output and more functionality.
+  * [API Daemon/CLI] Corrects some bugs in VM metainformation output.
+  * [Node Daemon] Fixes resource reporting bugs from 0.9.81 and properly clears node resource numbers on a fence.
+  * [Health Daemon] Adds a wait during pvchealthd startup until the node is in run state, to avoid erroneous faults during node bootup.
+  * [API Daemon] Fixes an incorrect reference to legacy pvcapid.yaml file in migration script.
+
+ -- Joshua M. Boniface <joshua@boniface.me>  Thu, 14 Dec 2023 14:46:29 -0500
+
 pvc (0.9.85-0) unstable; urgency=high
 
   * [Packaging] Fixes a dependency bug introduced in 0.9.84
@@ -33,7 +33,7 @@ import os
 import signal
 
 # Daemon version
-version = "0.9.85"
+version = "0.9.86"
 
 
 ##########################################################
@@ -80,6 +80,11 @@ def entrypoint():
     # Connect to Zookeeper and return our handler and current schema version
     zkhandler, _ = pvchealthd.util.zookeeper.connect(logger, config)
 
+    logger.out("Waiting for node daemon to be operating", state="s")
+    while zkhandler.read(("node.state.daemon", config["node_hostname"])) != "run":
+        sleep(5)
+    logger.out("Node daemon in run state, continuing health daemon startup", state="s")
+
     # Define a cleanup function
     def cleanup(failure=False):
         nonlocal logger, zkhandler, monitoring_instance
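This is the changelog's "wait during pvchealthd startup" item: the health daemon now polls the node daemon's state key every five seconds and blocks until it reads `run`, so health plugins cannot fire, and raise spurious faults, against a node that is still booting.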
Binary image changes:

* images/11-prometheus-grafana.png: new file (168 KiB)
* Three existing screenshot images updated (115 KiB → 140 KiB, 94 KiB → 109 KiB, 126 KiB → 136 KiB)
@@ -70,7 +70,7 @@ def check_pvc(item, params, section):
     summary = f"Cluster health is {cluster_health}% (maintenance {maintenance})"
 
     if len(cluster_messages) > 0:
-        details = ", ".join(cluster_messages)
+        details = ", ".join([m["text"] for m in cluster_messages])
 
     if cluster_health <= 50 and maintenance == "off":
         state = State.CRIT
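The fix above is needed because `cluster_messages` now carries structured fault dicts rather than plain strings, and joining dicts directly raises a `TypeError`. A quick illustration with a made-up message:

```python
# Values are invented; only the shape matters here.
cluster_messages = [
    {"id": "123abc", "text": "hv1: plugin 'disk' reports a degraded array"},
]

try:
    details = ", ".join(cluster_messages)  # old code
except TypeError as e:
    print(e)  # sequence item 0: expected str instance, dict found

details = ", ".join([m["text"] for m in cluster_messages])  # fixed code
print(details)
```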
@@ -2555,7 +2555,9 @@
   ],
   "refresh": "5s",
   "schemaVersion": 38,
-  "tags": [],
+  "tags": [
+    "pvc"
+  ],
   "templating": {
     "list": [
       {
@@ -2592,6 +2594,6 @@
   "timezone": "",
   "title": "PVC Cluster",
   "uid": "fbddd9f9-aadb-4c97-8aea-57c29e5de234",
-  "version": 56,
+  "version": 57,
   "weekStart": ""
 }
@@ -48,7 +48,7 @@ import re
 import json
 
 # Daemon version
-version = "0.9.85"
+version = "0.9.86"
 
 
 ##########################################################
@@ -115,6 +115,27 @@ def fence_node(node_name, zkhandler, config, logger):
     ):
         migrateFromFencedNode(zkhandler, node_name, config, logger)
 
+    # Reset all node resource values
+    logger.out(
+        f"Resetting all resource values for dead node {node_name} to zero",
+        state="i",
+        prefix=f"fencing {node_name}",
+    )
+    zkhandler.write(
+        [
+            (("node.running_domains", node_name), "0"),
+            (("node.count.provisioned_domains", node_name), "0"),
+            (("node.cpu.load", node_name), "0"),
+            (("node.vcpu.allocated", node_name), "0"),
+            (("node.memory.total", node_name), "0"),
+            (("node.memory.used", node_name), "0"),
+            (("node.memory.free", node_name), "0"),
+            (("node.memory.allocated", node_name), "0"),
+            (("node.memory.provisioned", node_name), "0"),
+            (("node.monitoring.health", node_name), None),
+        ]
+    )
+
 
 # Migrate hosts away from a fenced node
 def migrateFromFencedNode(zkhandler, node_name, config, logger):
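The reset above passes a list of (key, value) tuples to `zkhandler.write()`, the write-side counterpart of the batched reads used elsewhere in this changeset. A rough sketch of the same idea expressed directly on kazoo, assuming a kazoo-backed handler; the paths are hypothetical and must already exist, and whether the real handler uses a transaction internally is an implementation detail:

```python
# Illustrative only: apply several writes as one atomic Zookeeper transaction.
from kazoo.client import KazooClient

zk = KazooClient(hosts="localhost:2181")  # assumed local test instance
zk.start()

tx = zk.transaction()
for path, value in [
    ("/nodes/hv1/running_domains", "0"),  # hypothetical paths, not PVC's schema
    ("/nodes/hv1/cpu_load", "0"),
]:
    tx.set_data(path, value.encode())
results = tx.commit()  # all operations succeed or fail together

zk.stop()
```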
@@ -477,6 +477,10 @@ def collect_vm_stats(logger, config, zkhandler, this_node, queue):
     fixed_d_domain = this_node.d_domain.copy()
     for domain, instance in fixed_d_domain.items():
         if domain in this_node.domain_list:
+            # Add the allocated memory to our memalloc value
+            memalloc += instance.getmemory()
+            memprov += instance.getmemory()
+            vcpualloc += instance.getvcpus()
             if instance.getstate() == "start" and instance.getnode() == this_node.name:
                 if instance.getdom() is not None:
                     try:
@@ -532,11 +536,6 @@ def collect_vm_stats(logger, config, zkhandler, this_node, queue):
                     continue
                 domain_memory_stats = domain.memoryStats()
                 domain_cpu_stats = domain.getCPUStats(True)[0]
-
-                # Add the allocated memory to our memalloc value
-                memalloc += instance.getmemory()
-                memprov += instance.getmemory()
-                vcpualloc += instance.getvcpus()
             except Exception as e:
                 if debug:
                     try:
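Together with the previous hunk, this moves the memory and vCPU accounting out of the libvirt stats path: the counters are now incremented for every domain assigned to the node, not only for domains whose libvirt object could be queried. This appears to be the "[Node Daemon] Fixes resource reporting bugs from 0.9.81" item in the changelog.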
@@ -701,7 +700,7 @@ def node_keepalive(logger, config, zkhandler, this_node):
 
     runtime_start = datetime.now()
     logger.out(
-        "Starting node keepalive run at {datetime.now()}",
+        f"Starting node keepalive run at {datetime.now()}",
         state="t",
     )
 
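A one-character fix worth spelling out: without the `f` prefix, Python treats the braces as literal text instead of an expression to interpolate:

```python
from datetime import datetime

print("Starting node keepalive run at {datetime.now()}")
# -> Starting node keepalive run at {datetime.now()}    (literal braces)

print(f"Starting node keepalive run at {datetime.now()}")
# -> Starting node keepalive run at 2023-12-14 14:46:29.000000    (example value)
```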
@@ -44,7 +44,7 @@ from daemon_lib.vmbuilder import (
 )
 
 # Daemon version
-version = "0.9.85"
+version = "0.9.86"
 
 
 config = cfg.get_configuration()