Improve node utilization metrics and fix bugs

This commit is contained in:
Joshua Boniface 2023-12-25 02:47:41 -05:00
parent 3e4cc53fdd
commit 9604f655d0
1 changed file with 168 additions and 15 deletions

View File

@@ -498,8 +498,7 @@ def get_health_metrics(zkhandler):
output_lines.append("# TYPE pvc_node_health gauge") output_lines.append("# TYPE pvc_node_health gauge")
for node in status_data["node_health"]: for node in status_data["node_health"]:
node_health = status_data["node_health"][node]["health"] node_health = status_data["node_health"][node]["health"]
print(node_health) if isinstance(node_health, (int, float)):
if isinstance(node_health, int):
output_lines.append(f'pvc_node_health{{node="{node}"}} {node_health}') output_lines.append(f'pvc_node_health{{node="{node}"}} {node_health}')
output_lines.append("# HELP pvc_node_daemon_states PVC Node daemon state counts") output_lines.append("# HELP pvc_node_daemon_states PVC Node daemon state counts")
@@ -630,6 +629,7 @@ def get_resource_metrics(zkhandler):
all_total_speed = 0 all_total_speed = 0
all_total_util = 0 all_total_util = 0
all_total_count = 0 all_total_count = 0
per_node_network_utilization = dict()
for node in node_data: for node in node_data:
if node["daemon_state"] != "run": if node["daemon_state"] != "run":
continue continue
@@ -652,13 +652,17 @@ def get_resource_metrics(zkhandler):
if total_count > 0: if total_count > 0:
# Average the speed and util by the count # Average the speed and util by the count
avg_speed = int(total_speed / total_count) avg_speed = float(total_speed / total_count)
all_total_speed += avg_speed all_total_speed += avg_speed
avg_util = int(total_util / total_count) avg_util = float(total_util / total_count)
all_total_util += avg_util all_total_util += avg_util
all_total_count += 1 all_total_count += 1
per_node_network_utilization[node["name"]] = avg_util / avg_speed * 100
else:
per_node_network_utilization[node["name"]] = 0.0
if all_total_count > 0: if all_total_count > 0:
all_avg_speed = all_total_speed / all_total_count all_avg_speed = all_total_speed / all_total_count
all_avg_util = all_total_util / all_total_count all_avg_util = all_total_util / all_total_count
@@ -678,7 +682,7 @@ def get_resource_metrics(zkhandler):
n["cpu_count"] n["cpu_count"]
for n in sorted(node_data, key=lambda n: n["cpu_count"], reverse=False) for n in sorted(node_data, key=lambda n: n["cpu_count"], reverse=False)
] ]
total_cpu = sum(node_sorted_cpu[:-2]) total_cpu = sum(node_sorted_cpu[:-1])
used_cpu = sum([n["load"] for n in node_data]) used_cpu = sum([n["load"] for n in node_data])
used_cpu_percentage = used_cpu / total_cpu * 100 used_cpu_percentage = used_cpu / total_cpu * 100
output_lines.append(f"pvc_cluster_cpu_utilization {used_cpu_percentage:2.2f}") output_lines.append(f"pvc_cluster_cpu_utilization {used_cpu_percentage:2.2f}")
@@ -695,7 +699,7 @@ def get_resource_metrics(zkhandler):
n["memory"]["total"] n["memory"]["total"]
for n in sorted(node_data, key=lambda n: n["memory"]["total"], reverse=False) for n in sorted(node_data, key=lambda n: n["memory"]["total"], reverse=False)
] ]
total_memory = sum(node_sorted_memory[:-2]) total_memory = sum(node_sorted_memory[:-1])
used_memory = sum([n["memory"]["used"] for n in node_data]) used_memory = sum([n["memory"]["used"] for n in node_data])
used_memory_percentage = used_memory / total_memory * 100 used_memory_percentage = used_memory / total_memory * 100
@@ -750,7 +754,9 @@ def get_resource_metrics(zkhandler):
output_lines.append("# TYPE pvc_node_host_cpus gauge") output_lines.append("# TYPE pvc_node_host_cpus gauge")
for node in node_data: for node in node_data:
total_cpus = ( total_cpus = (
node["vcpu"]["total"] if isinstance(node["vcpu"]["total"], int) else 0 node["vcpu"]["total"]
if isinstance(node["vcpu"]["total"], (int, float))
else 0
) )
output_lines.append( output_lines.append(
f"pvc_node_host_cpus{{node=\"{node['name']}\"}} {total_cpus}" f"pvc_node_host_cpus{{node=\"{node['name']}\"}} {total_cpus}"
@@ -761,7 +767,7 @@ def get_resource_metrics(zkhandler):
for node in node_data: for node in node_data:
allocated_cpus = ( allocated_cpus = (
node["vcpu"]["allocated"] node["vcpu"]["allocated"]
if isinstance(node["vcpu"]["allocated"], int) if isinstance(node["vcpu"]["allocated"], (int, float))
else 0 else 0
) )
output_lines.append( output_lines.append(
@@ -771,16 +777,33 @@ def get_resource_metrics(zkhandler):
output_lines.append("# HELP pvc_node_load PVC node 1 minute load average") output_lines.append("# HELP pvc_node_load PVC node 1 minute load average")
output_lines.append("# TYPE pvc_node_load gauge") output_lines.append("# TYPE pvc_node_load gauge")
for node in node_data: for node in node_data:
load_average = node["load"] if isinstance(node["load"], float) else 0.0 load_average = node["load"] if isinstance(node["load"], (int, float)) else 0.0
output_lines.append( output_lines.append(
f"pvc_node_load_average{{node=\"{node['name']}\"}} {load_average}" f"pvc_node_load_average{{node=\"{node['name']}\"}} {load_average}"
) )
output_lines.append("# HELP pvc_node_cpu_utilization PVC node CPU utilization")
output_lines.append("# TYPE pvc_node_cpu_utilization gauge")
for node in node_data:
load_average = node["load"] if isinstance(node["load"], (int, float)) else 0.0
cpu_count = (
node["cpu_count"] if isinstance(node["cpu_count"], (int, float)) else 0
)
if cpu_count > 0:
used_cpu_percentage = load_average / cpu_count * 100
else:
used_cpu_percentage = 0.0
output_lines.append(
f"pvc_node_cpu_utilization{{node=\"{node['name']}\"}} {used_cpu_percentage:2.2f}"
)
output_lines.append("# HELP pvc_node_domains_count PVC node running domain count") output_lines.append("# HELP pvc_node_domains_count PVC node running domain count")
output_lines.append("# TYPE pvc_node_domains_count gauge") output_lines.append("# TYPE pvc_node_domains_count gauge")
for node in node_data: for node in node_data:
running_domains_count = ( running_domains_count = (
node["domains_count"] if isinstance(node["domains_count"], int) else 0 node["domains_count"]
if isinstance(node["domains_count"], (int, float))
else 0
) )
output_lines.append( output_lines.append(
f"pvc_node_domains_count{{node=\"{node['name']}\"}} {running_domains_count}" f"pvc_node_domains_count{{node=\"{node['name']}\"}} {running_domains_count}"
@@ -802,11 +825,71 @@ def get_resource_metrics(zkhandler):
f"pvc_node_kernel{{node=\"{node['name']}\",kernel=\"{kernel}\"}} 1" f"pvc_node_kernel{{node=\"{node['name']}\",kernel=\"{kernel}\"}} 1"
) )
output_lines.append(
"# HELP pvc_node_network_traffic_rx PVC node received network traffic"
)
output_lines.append("# TYPE pvc_node_network_traffic_rx gauge")
for node in node_data:
rx_bps = 0
for interface in node["interfaces"].keys():
rx_bps += node["interfaces"][interface]["rx_bps"]
output_lines.append(
f"pvc_node_network_traffic_rx{{node=\"{node['name']}\"}} {rx_bps:2.2f}"
)
output_lines.append(
"# HELP pvc_node_network_traffic_tx PVC node transmitted network traffic"
)
output_lines.append("# TYPE pvc_node_network_traffic_tx gauge")
for node in node_data:
tx_bps = 0
for interface in node["interfaces"].keys():
tx_bps += node["interfaces"][interface]["tx_bps"]
output_lines.append(
f"pvc_node_network_traffic_tx{{node=\"{node['name']}\"}} {tx_bps:2.2f}"
)
output_lines.append(
"# HELP pvc_node_network_packets_rx PVC node received network packets"
)
output_lines.append("# TYPE pvc_node_network_packets_rx gauge")
for node in node_data:
rx_pps = 0
for interface in node["interfaces"].keys():
rx_pps += node["interfaces"][interface]["rx_pps"]
output_lines.append(
f"pvc_node_network_packets_rx{{node=\"{node['name']}\"}} {rx_pps:2.2f}"
)
output_lines.append(
"# HELP pvc_node_network_packets_tx PVC node transmitted network packets"
)
output_lines.append("# TYPE pvc_node_network_packets_tx gauge")
for node in node_data:
tx_pps = 0
for interface in node["interfaces"].keys():
tx_pps += node["interfaces"][interface]["tx_pps"]
output_lines.append(
f"pvc_node_network_packets_tx{{node=\"{node['name']}\"}} {tx_pps:2.2f}"
)
output_lines.append(
"# HELP pvc_node_network_utilization PVC node network utilization percentage"
)
output_lines.append("# TYPE pvc_node_network_utilization gauge")
for node in node_data:
used_network_percentage = per_node_network_utilization[node["name"]]
output_lines.append(
f"pvc_node_network_utilization{{node=\"{node['name']}\"}} {used_network_percentage:2.2f}"
)
output_lines.append("# HELP pvc_node_total_memory PVC node total memory in MB") output_lines.append("# HELP pvc_node_total_memory PVC node total memory in MB")
output_lines.append("# TYPE pvc_node_total_memory gauge") output_lines.append("# TYPE pvc_node_total_memory gauge")
for node in node_data: for node in node_data:
total_memory = ( total_memory = (
node["memory"]["total"] if isinstance(node["memory"]["total"], int) else 0 node["memory"]["total"]
if isinstance(node["memory"]["total"], (int, float))
else 0
) )
output_lines.append( output_lines.append(
f"pvc_node_total_memory{{node=\"{node['name']}\"}} {total_memory}" f"pvc_node_total_memory{{node=\"{node['name']}\"}} {total_memory}"
@@ -819,13 +902,35 @@ def get_resource_metrics(zkhandler):
for node in node_data: for node in node_data:
allocated_memory = ( allocated_memory = (
node["memory"]["allocated"] node["memory"]["allocated"]
if isinstance(node["memory"]["allocated"], int) if isinstance(node["memory"]["allocated"], (int, float))
else 0 else 0
) )
output_lines.append( output_lines.append(
f"pvc_node_allocated_memory{{node=\"{node['name']}\"}} {allocated_memory}" f"pvc_node_allocated_memory{{node=\"{node['name']}\"}} {allocated_memory}"
) )
output_lines.append(
"# HELP pvc_node_allocated_memory_utilization PVC node allocated memory utilization"
)
output_lines.append("# TYPE pvc_node_allocated_memory_utilization gauge")
for node in node_data:
allocated_memory = (
node["memory"]["allocated"]
if isinstance(node["memory"]["allocated"], (int, float))
else 0
)
total_memory = (
node["memory"]["total"]
if isinstance(node["memory"]["total"], (int, float))
else 0
)
allocated_memory_utilization = (
(allocated_memory / total_memory * 100) if total_memory > 0 else 0.0
)
output_lines.append(
f"pvc_node_allocated_memory_utilization{{node=\"{node['name']}\"}} {allocated_memory_utilization}"
)
output_lines.append( output_lines.append(
"# HELP pvc_node_provisioned_memory PVC node provisioned memory in MB" "# HELP pvc_node_provisioned_memory PVC node provisioned memory in MB"
) )
@@ -833,28 +938,76 @@ def get_resource_metrics(zkhandler):
for node in node_data: for node in node_data:
provisioned_memory = ( provisioned_memory = (
node["memory"]["provisioned"] node["memory"]["provisioned"]
if isinstance(node["memory"]["provisioned"], int) if isinstance(node["memory"]["provisioned"], (int, float))
else 0 else 0
) )
output_lines.append( output_lines.append(
f"pvc_node_provisioned_memory{{node=\"{node['name']}\"}} {provisioned_memory}" f"pvc_node_provisioned_memory{{node=\"{node['name']}\"}} {provisioned_memory}"
) )
output_lines.append(
"# HELP pvc_node_provisioned_memory_utilization PVC node provisioned memory utilization"
)
output_lines.append("# TYPE pvc_node_provisioned_memory_utilization gauge")
for node in node_data:
provisioned_memory = (
node["memory"]["provisioned"]
if isinstance(node["memory"]["provisioned"], (int, float))
else 0
)
total_memory = (
node["memory"]["total"]
if isinstance(node["memory"]["total"], (int, float))
else 0
)
provisioned_memory_utilization = (
(provisioned_memory / total_memory * 100) if total_memory > 0 else 0.0
)
output_lines.append(
f"pvc_node_provisioned_memory_utilization{{node=\"{node['name']}\"}} {provisioned_memory_utilization}"
)
output_lines.append("# HELP pvc_node_used_memory PVC node used memory in MB") output_lines.append("# HELP pvc_node_used_memory PVC node used memory in MB")
output_lines.append("# TYPE pvc_node_used_memory gauge") output_lines.append("# TYPE pvc_node_used_memory gauge")
for node in node_data: for node in node_data:
used_memory = ( used_memory = (
node["memory"]["used"] if isinstance(node["memory"]["used"], int) else 0 node["memory"]["used"]
if isinstance(node["memory"]["used"], (int, float))
else 0
) )
output_lines.append( output_lines.append(
f"pvc_node_used_memory{{node=\"{node['name']}\"}} {used_memory}" f"pvc_node_used_memory{{node=\"{node['name']}\"}} {used_memory}"
) )
output_lines.append(
"# HELP pvc_node_used_memory_utilization PVC node used memory utilization"
)
output_lines.append("# TYPE pvc_node_used_memory_utilization gauge")
for node in node_data:
used_memory = (
node["memory"]["used"]
if isinstance(node["memory"]["used"], (int, float))
else 0
)
total_memory = (
node["memory"]["total"]
if isinstance(node["memory"]["total"], (int, float))
else 0
)
used_memory_utilization = (
(used_memory / total_memory * 100) if total_memory > 0 else 0.0
)
output_lines.append(
f"pvc_node_used_memory_utilization{{node=\"{node['name']}\"}} {used_memory_utilization}"
)
output_lines.append("# HELP pvc_node_free_memory PVC node free memory in MB") output_lines.append("# HELP pvc_node_free_memory PVC node free memory in MB")
output_lines.append("# TYPE pvc_node_free_memory gauge") output_lines.append("# TYPE pvc_node_free_memory gauge")
for node in node_data: for node in node_data:
free_memory = ( free_memory = (
node["memory"]["free"] if isinstance(node["memory"]["free"], int) else 0 node["memory"]["free"]
if isinstance(node["memory"]["free"], (int, float))
else 0
) )
output_lines.append( output_lines.append(
f"pvc_node_free_memory{{node=\"{node['name']}\"}} {free_memory}" f"pvc_node_free_memory{{node=\"{node['name']}\"}} {free_memory}"