Add total cluster utilization stats

Useful for evaluating the cluster resources as a whole.
2024-09-05 13:41:01 -04:00
parent 7543eb839d
commit 2de999c700
3 changed files with 203 additions and 2 deletions
--- a/api-daemon/pvcapid/flaskapi.py
+++ b/api-daemon/pvcapid/flaskapi.py
@@ -576,6 +576,63 @@ class API_Status(Resource):
                snapshots:
                  type: integer
                  description: The total number of snapshots in the storage cluster
                resources:
                  type: object
                  properties:
                    memory:
                      type: object
                      properties:
                        total:
                          type: integer
                          description: The total amount of RAM (all nodes) in MB
                        used:
                          type: integer
                          description: The total used RAM (all nodes) in MB
                        free:
                          type: integer
                          description: The total free RAM (all nodes) in MB
                        allocated:
                          type: integer
                          description: The total amount of RAM allocated to running domains in MB
                        provisioned:
                          type: integer
                          description: The total amount of RAM provisioned to all domains (regardless of state) in MB
                        utilization:
                          type: float
                          description: The memory utilization percentage (average) of the cluster
                    vcpu:
                      type: object
                      properties:
                        total:
                          type: integer
                          description: The total number of real CPU cores (all nodes)
                        load:
                          type: float
                          description: The current 5-minute CPU load (all nodes summed)
                        allocated:
                          type: integer
                          description: The total number of vCPUs allocated to running domains
                        provisioned:
                          type: integer
                          description: The total number of vCPUs provisioned to all domains (regardless of state)
                        utilization:
                          type: float
                          description: The CPU utilization percentage (average) of the cluster
                    disk:
                      type: object
                      properties:
                        total:
                          type: integer
                          description: The total size of all OSDs in KB
                        used:
                          type: integer
                          description: The total used size of all OSDs in KB
                        free:
                          type: integer
                          description: The total free size of all OSDs in KB
                        utilization:
                          type: float
                          description: The disk utilization percentage (average) of the cluster
          400:
            description: Bad request
        """
--- a/client-cli/pvc/cli/formatters.py
+++ b/client-cli/pvc/cli/formatters.py
@@ -83,6 +83,37 @@ def cli_cluster_status_format_pretty(CLI_CONFIG, data):
    total_volumes = data.get("volumes", 0)
    total_snapshots = data.get("snapshots", 0)
    total_cpu_total = data.get("resources", {}).get("cpu", {}).get("total", 0)
    total_cpu_load = data.get("resources", {}).get("cpu", {}).get("load", 0)
    total_cpu_utilization = (
        data.get("resources", {}).get("cpu", {}).get("utilization", 0)
    )
    total_cpu_str = (
        f"{total_cpu_utilization:.1f}% ({total_cpu_load:.1f} / {total_cpu_total})"
    )
    total_memory_total = (
        data.get("resources", {}).get("memory", {}).get("total", 0) / 1024
    )
    total_memory_used = (
        data.get("resources", {}).get("memory", {}).get("used", 0) / 1024
    )
    total_memory_utilization = (
        data.get("resources", {}).get("memory", {}).get("utilization", 0)
    )
    total_memory_str = f"{total_memory_utilization:.1f}% ({total_memory_used:.1f} GB / {total_memory_total:.1f} GB)"
    total_disk_total = (
        data.get("resources", {}).get("disk", {}).get("total", 0) / 1024 / 1024
    )
    total_disk_used = (
        data.get("resources", {}).get("disk", {}).get("used", 0) / 1024 / 1024
    )
    total_disk_utilization = round(
        data.get("resources", {}).get("disk", {}).get("utilization", 0)
    )
    total_disk_str = f"{total_disk_utilization:.1f}% ({total_disk_used:.1f} GB / {total_disk_total:.1f} GB)"
    if maintenance == "true" or health == -1:
        health_colour = ansii["blue"]
    elif health > 90:
@@ -234,6 +265,12 @@ def cli_cluster_status_format_pretty(CLI_CONFIG, data):
    output.append("")
    output.append(f"{ansii['purple']}CPU Usage:{ansii['end']}     {total_cpu_str}")
    output.append(f"{ansii['purple']}Memory Usage:{ansii['end']}  {total_memory_str}")
    output.append(f"{ansii['purple']}Disk Usage:{ansii['end']}    {total_disk_str}")
    return "\n".join(output)
--- a/daemon-common/cluster.py
+++ b/daemon-common/cluster.py
@@ -262,6 +262,22 @@ def getClusterInformation(zkhandler):
    # Get cluster maintenance state
    maintenance_state = zkhandler.read("base.config.maintenance")
    # Prepare cluster total values
    cluster_total_node_memory = 0
    cluster_total_used_memory = 0
    cluster_total_free_memory = 0
    cluster_total_allocated_memory = 0
    cluster_total_provisioned_memory = 0
    cluster_total_average_memory_utilization = 0
    cluster_total_cpu_cores = 0
    cluster_total_cpu_load = 0
    cluster_total_average_cpu_utilization = 0
    cluster_total_allocated_cores = 0
    cluster_total_osd_space = 0
    cluster_total_used_space = 0
    cluster_total_free_space = 0
    cluster_total_average_osd_utilization = 0
    # Get primary node
    maintenance_state, primary_node = zkhandler.read_many(
        [
@@ -276,19 +292,36 @@ def getClusterInformation(zkhandler):
    # Get the list of Nodes
    node_list = zkhandler.children("base.node")
    node_count = len(node_list)
-    # Get the daemon and domain states of all Nodes
+    # Get the information of all Nodes
    node_state_reads = list()
    node_memory_reads = list()
    node_cpu_reads = list()
    for node in node_list:
        node_state_reads += [
            ("node.state.daemon", node),
            ("node.state.domain", node),
        ]
        node_memory_reads += [
            ("node.memory.total", node),
            ("node.memory.used", node),
            ("node.memory.free", node),
            ("node.memory.allocated", node),
            ("node.memory.provisioned", node),
        ]
        node_cpu_reads += [
            ("node.data.static", node),
            ("node.vcpu.allocated", node),
            ("node.cpu.load", node),
        ]
    all_node_states = zkhandler.read_many(node_state_reads)
    all_node_memory = zkhandler.read_many(node_memory_reads)
    all_node_cpu = zkhandler.read_many(node_cpu_reads)
    # Parse out the Node states
    node_data = list()
    formatted_node_states = {"total": node_count}
    for nidx, node in enumerate(node_list):
-        # Split the large list of return values by the IDX of this node
+        # Split the large list of return values by the IDX of this node (states)
        # Each node result is 2 fields long
        pos_start = nidx * 2
        pos_end = nidx * 2 + 2
@@ -308,6 +341,46 @@ def getClusterInformation(zkhandler):
            else:
                formatted_node_states[node_state] = 1
        # Split the large list of return values by the IDX of this node (memory)
        # Each node result is 5 fields long
        pos_start = nidx * 5
        pos_end = nidx * 5 + 5
        (
            node_memory_total,
            node_memory_used,
            node_memory_free,
            node_memory_allocated,
            node_memory_provisioned,
        ) = tuple(all_node_memory[pos_start:pos_end])
        cluster_total_node_memory += int(node_memory_total)
        cluster_total_used_memory += int(node_memory_used)
        cluster_total_free_memory += int(node_memory_free)
        cluster_total_allocated_memory += int(node_memory_allocated)
        cluster_total_provisioned_memory += int(node_memory_provisioned)
        # Split the large list of return values by the IDX of this node (cpu)
        # Each nod result is 3 fields long
        pos_start = nidx * 3
        pos_end = nidx * 3 + 3
        node_static_data, node_vcpu_allocated, node_cpu_load = tuple(
            all_node_cpu[pos_start:pos_end]
        )
        cluster_total_cpu_cores += int(node_static_data.split()[0])
        cluster_total_cpu_load += round(float(node_cpu_load), 2)
        cluster_total_allocated_cores += int(node_vcpu_allocated)
    cluster_total_average_memory_utilization = (
        (round((cluster_total_used_memory / cluster_total_node_memory) * 100, 2))
        if cluster_total_node_memory > 0
        else 0.00
    )
    cluster_total_average_cpu_utilization = (
        (round((cluster_total_cpu_load / cluster_total_cpu_cores) * 100, 2))
        if cluster_total_cpu_cores > 0
        else 0.00
    )
    # Get the list of VMs
    vm_list = zkhandler.children("base.domain")
    vm_count = len(vm_list)
@@ -380,6 +453,18 @@ def getClusterInformation(zkhandler):
            else:
                formatted_osd_states[osd_state] = 1
        # Add the OSD utilization
        cluster_total_osd_space += int(osd_stats["kb"])
        cluster_total_used_space += int(osd_stats["kb_used"])
        cluster_total_free_space += int(osd_stats["kb_avail"])
        cluster_total_average_osd_utilization += float(osd_stats["utilization"])
    cluster_total_average_osd_utilization = (
        (round(cluster_total_average_osd_utilization / len(ceph_osd_list), 2))
        if ceph_osd_list
        else 0.00
    )
    # Get the list of Networks
    network_list = zkhandler.children("base.network")
    network_count = len(network_list)
@@ -424,6 +509,28 @@ def getClusterInformation(zkhandler):
        "pools": ceph_pool_count,
        "volumes": ceph_volume_count,
        "snapshots": ceph_snapshot_count,
        "resources": {
            "memory": {
                "total": cluster_total_node_memory,
                "free": cluster_total_free_memory,
                "used": cluster_total_used_memory,
                "allocated": cluster_total_allocated_memory,
                "provisioned": cluster_total_provisioned_memory,
                "utilization": cluster_total_average_memory_utilization,
            },
            "cpu": {
                "total": cluster_total_cpu_cores,
                "load": cluster_total_cpu_load,
                "allocated": cluster_total_allocated_cores,
                "utilization": cluster_total_average_cpu_utilization,
            },
            "disk": {
                "total": cluster_total_osd_space,
                "used": cluster_total_used_space,
                "free": cluster_total_free_space,
                "utilization": cluster_total_average_osd_utilization,
            },
        },
        "detail": {
            "node": node_data,
            "vm": vm_data,