Remove headers and add util to short output

Add total cluster utilization stats
Useful for evaluating the cluster resources as a whole.
2024-09-06 11:40:39 -04:00 · 2024-09-05 16:05:33 -04:00 · 2024-09-03 20:32:27 -04:00
5 changed files with 350 additions and 69 deletions
--- a/api-daemon/pvcapid/flaskapi.py
+++ b/api-daemon/pvcapid/flaskapi.py
@ -576,6 +576,63 @@ class API_Status(Resource):
                snapshots:
                  type: integer
                  description: The total number of snapshots in the storage cluster
+                resources:
+                  type: object
+                  properties:
+                    memory:
+                      type: object
+                      properties:
+                        total:
+                          type: integer
+                          description: The total amount of RAM (all nodes) in MB
+                        used:
+                          type: integer
+                          description: The total used RAM (all nodes) in MB
+                        free:
+                          type: integer
+                          description: The total free RAM (all nodes) in MB
+                        allocated:
+                          type: integer
+                          description: The total amount of RAM allocated to running domains in MB
+                        provisioned:
+                          type: integer
+                          description: The total amount of RAM provisioned to all domains (regardless of state) in MB
+                        utilization:
+                          type: float
+                          description: The memory utilization percentage (average) of the cluster
+                    cpu:
+                      type: object
+                      properties:
+                        total:
+                          type: integer
+                          description: The total number of real CPU cores (all nodes)
+                        load:
+                          type: float
+                          description: The current 5-minute CPU load (all nodes summed)
+                        allocated:
+                          type: integer
+                          description: The total number of vCPUs allocated to running domains
+                        provisioned:
+                          type: integer
+                          description: The total number of vCPUs provisioned to all domains (regardless of state)
+                        utilization:
+                          type: float
+                          description: The CPU utilization percentage (average) of the cluster
+                    disk:
+                      type: object
+                      properties:
+                        total:
+                          type: integer
+                          description: The total size of all OSDs in KB
+                        used:
+                          type: integer
+                          description: The total used size of all OSDs in KB
+                        free:
+                          type: integer
+                          description: The total free size of all OSDs in KB
+                        utilization:
+                          type: float
+                          description: The disk utilization percentage (average) of the cluster
          400:
            description: Bad request
        """
@ -6461,6 +6518,41 @@ api.add_resource(
 )


+# /storage/ceph/volume/<pool>/<volume>/scan
+class API_Storage_Ceph_Volume_Element_Scan(Resource):
+    # REST resource exposing a POST action that (re)scans the stats of a single
+    # Ceph volume, addressed by its pool and volume name taken from the URL.
+    # NOTE(review): the method docstring below contains a YAML section after
+    # "---" - presumably consumed by the API documentation generator, so keep
+    # it valid YAML when editing; confirm before reformatting.
+    @Authenticator
+    def post(self, pool, volume):
+        """
+        Scan a Ceph volume {volume} in pool {pool} for stats (after import)
+        ---
+        tags:
+          - storage / ceph
+        parameters:
+        responses:
+          200:
+            description: OK
+            schema:
+              type: object
+              id: Message
+          404:
+            description: Not found
+            schema:
+              type: object
+              id: Message
+          400:
+            description: Bad request
+            schema:
+              type: object
+              id: Message
+        """
+        # Thin delegation: the helper's return value is passed back unchanged.
+        # NOTE(review): a 404 response is documented above, but the helper
+        # ceph_volume_scan appears to produce only 200 or 400 - confirm which
+        # is intended.
+        return api_helper.ceph_volume_scan(pool, volume)
+
+
+# Register the resource under its URL route
+api.add_resource(
+    API_Storage_Ceph_Volume_Element_Scan, "/storage/ceph/volume/<pool>/<volume>/scan"
+)
+
+
 # /storage/ceph/volume/<pool>/<volume>/clone
 class API_Storage_Ceph_Volume_Element_Clone(Resource):
    @RequestParser(
--- a/api-daemon/pvcapid/helper.py
+++ b/api-daemon/pvcapid/helper.py
@ -1996,6 +1996,22 @@ def ceph_volume_list(zkhandler, pool=None, limit=None, is_fuzzy=True):
    return retdata, retcode


+@ZKConnection(config)
+def ceph_volume_scan(zkhandler, pool, name):
+    """
+    (Re)scan a Ceph RBD volume for stats in the PVC Ceph storage cluster.
+    """
+    retflag, retdata = pvc_ceph.scan_volume(zkhandler, pool, name)
+
+    if retflag:
+        retcode = 200
+    else:
+        retcode = 400
+
+    output = {"message": retdata.replace('"', "'")}
+    return output, retcode
+
+
@ZKConnection(config)
 def ceph_volume_add(zkhandler, pool, name, size, force_flag=False):
    """
--- a/client-cli/pvc/cli/formatters.py
+++ b/client-cli/pvc/cli/formatters.py
@ -83,6 +83,37 @@ def cli_cluster_status_format_pretty(CLI_CONFIG, data):
    total_volumes = data.get("volumes", 0)
    total_snapshots = data.get("snapshots", 0)

+    total_cpu_total = data.get("resources", {}).get("cpu", {}).get("total", 0)
+    total_cpu_load = data.get("resources", {}).get("cpu", {}).get("load", 0)
+    total_cpu_utilization = (
+        data.get("resources", {}).get("cpu", {}).get("utilization", 0)
+    )
+    total_cpu_string = (
+        f"{total_cpu_utilization:.1f}% ({total_cpu_load:.1f} / {total_cpu_total})"
+    )
+
+    total_memory_total = (
+        data.get("resources", {}).get("memory", {}).get("total", 0) / 1024
+    )
+    total_memory_used = (
+        data.get("resources", {}).get("memory", {}).get("used", 0) / 1024
+    )
+    total_memory_utilization = (
+        data.get("resources", {}).get("memory", {}).get("utilization", 0)
+    )
+    total_memory_string = f"{total_memory_utilization:.1f}% ({total_memory_used:.1f} GB / {total_memory_total:.1f} GB)"
+
+    total_disk_total = (
+        data.get("resources", {}).get("disk", {}).get("total", 0) / 1024 / 1024
+    )
+    total_disk_used = (
+        data.get("resources", {}).get("disk", {}).get("used", 0) / 1024 / 1024
+    )
+    total_disk_utilization = round(
+        data.get("resources", {}).get("disk", {}).get("utilization", 0)
+    )
+    total_disk_string = f"{total_disk_utilization:.1f}% ({total_disk_used:.1f} GB / {total_disk_total:.1f} GB)"
+
    if maintenance == "true" or health == -1:
        health_colour = ansii["blue"]
    elif health > 90:
@ -94,12 +125,9 @@ def cli_cluster_status_format_pretty(CLI_CONFIG, data):

    output = list()

-    output.append(f"{ansii['bold']}PVC cluster status:{ansii['end']}")
-    output.append("")
-
-    output.append(f"{ansii['purple']}Primary node:{ansii['end']}  {primary_node}")
-    output.append(f"{ansii['purple']}PVC version:{ansii['end']}   {pvc_version}")
-    output.append(f"{ansii['purple']}Upstream IP:{ansii['end']}   {upstream_ip}")
+    output.append(f"{ansii['purple']}Primary node:{ansii['end']}   {primary_node}")
+    output.append(f"{ansii['purple']}PVC version:{ansii['end']}    {pvc_version}")
+    output.append(f"{ansii['purple']}Upstream IP:{ansii['end']}    {upstream_ip}")
    output.append("")

    if health != "-1":
@ -111,7 +139,7 @@ def cli_cluster_status_format_pretty(CLI_CONFIG, data):
        health = f"{health} (maintenance on)"

    output.append(
-        f"{ansii['purple']}Health:{ansii['end']}        {health_colour}{health}{ansii['end']}"
+        f"{ansii['purple']}Health:{ansii['end']}         {health_colour}{health}{ansii['end']}"
    )

    if messages is not None and len(messages) > 0:
@ -136,7 +164,17 @@ def cli_cluster_status_format_pretty(CLI_CONFIG, data):
            )

        messages = "\n               ".join(message_list)
-        output.append(f"{ansii['purple']}Active Faults:{ansii['end']} {messages}")
+    else:
+        messages = "None"
+    output.append(f"{ansii['purple']}Active faults:{ansii['end']}  {messages}")
+
+    output.append(f"{ansii['purple']}Total CPU:{ansii['end']}      {total_cpu_string}")
+
+    output.append(
+        f"{ansii['purple']}Total memory:{ansii['end']}   {total_memory_string}"
+    )
+
+    output.append(f"{ansii['purple']}Total disk:{ansii['end']}     {total_disk_string}")

    output.append("")

@ -166,7 +204,7 @@ def cli_cluster_status_format_pretty(CLI_CONFIG, data):

    nodes_string = ", ".join(nodes_strings)

-    output.append(f"{ansii['purple']}Nodes:{ansii['end']}         {nodes_string}")
+    output.append(f"{ansii['purple']}Nodes:{ansii['end']}          {nodes_string}")

    vm_states = ["start", "disable"]
    vm_states.extend(
@ -196,7 +234,7 @@ def cli_cluster_status_format_pretty(CLI_CONFIG, data):

    vms_string = ", ".join(vms_strings)

-    output.append(f"{ansii['purple']}VMs:{ansii['end']}           {vms_string}")
+    output.append(f"{ansii['purple']}VMs:{ansii['end']}            {vms_string}")

    osd_states = ["up,in"]
    osd_states.extend(
@ -222,15 +260,15 @@ def cli_cluster_status_format_pretty(CLI_CONFIG, data):

    osds_string = " ".join(osds_strings)

-    output.append(f"{ansii['purple']}OSDs:{ansii['end']}          {osds_string}")
+    output.append(f"{ansii['purple']}OSDs:{ansii['end']}           {osds_string}")

-    output.append(f"{ansii['purple']}Pools:{ansii['end']}         {total_pools}")
+    output.append(f"{ansii['purple']}Pools:{ansii['end']}          {total_pools}")

-    output.append(f"{ansii['purple']}Volumes:{ansii['end']}       {total_volumes}")
+    output.append(f"{ansii['purple']}Volumes:{ansii['end']}        {total_volumes}")

-    output.append(f"{ansii['purple']}Snapshots:{ansii['end']}     {total_snapshots}")
+    output.append(f"{ansii['purple']}Snapshots:{ansii['end']}      {total_snapshots}")

-    output.append(f"{ansii['purple']}Networks:{ansii['end']}      {total_networks}")
+    output.append(f"{ansii['purple']}Networks:{ansii['end']}       {total_networks}")

    output.append("")

@ -258,9 +296,6 @@ def cli_cluster_status_format_short(CLI_CONFIG, data):

    output = list()

-    output.append(f"{ansii['bold']}PVC cluster status:{ansii['end']}")
-    output.append("")
-
    if health != "-1":
        health = f"{health}%"
    else:
@ -270,7 +305,7 @@ def cli_cluster_status_format_short(CLI_CONFIG, data):
        health = f"{health} (maintenance on)"

    output.append(
-        f"{ansii['purple']}Health:{ansii['end']}        {health_colour}{health}{ansii['end']}"
+        f"{ansii['purple']}Health:{ansii['end']}         {health_colour}{health}{ansii['end']}"
    )

    if messages is not None and len(messages) > 0:
@ -295,7 +330,48 @@ def cli_cluster_status_format_short(CLI_CONFIG, data):
            )

        messages = "\n               ".join(message_list)
-        output.append(f"{ansii['purple']}Active Faults:{ansii['end']} {messages}")
+    else:
+        messages = "None"
+    output.append(f"{ansii['purple']}Active faults:{ansii['end']}  {messages}")
+
+    total_cpu_total = data.get("resources", {}).get("cpu", {}).get("total", 0)
+    total_cpu_load = data.get("resources", {}).get("cpu", {}).get("load", 0)
+    total_cpu_utilization = (
+        data.get("resources", {}).get("cpu", {}).get("utilization", 0)
+    )
+    total_cpu_string = (
+        f"{total_cpu_utilization:.1f}% ({total_cpu_load:.1f} / {total_cpu_total})"
+    )
+
+    total_memory_total = (
+        data.get("resources", {}).get("memory", {}).get("total", 0) / 1024
+    )
+    total_memory_used = (
+        data.get("resources", {}).get("memory", {}).get("used", 0) / 1024
+    )
+    total_memory_utilization = (
+        data.get("resources", {}).get("memory", {}).get("utilization", 0)
+    )
+    total_memory_string = f"{total_memory_utilization:.1f}% ({total_memory_used:.1f} GB / {total_memory_total:.1f} GB)"
+
+    total_disk_total = (
+        data.get("resources", {}).get("disk", {}).get("total", 0) / 1024 / 1024
+    )
+    total_disk_used = (
+        data.get("resources", {}).get("disk", {}).get("used", 0) / 1024 / 1024
+    )
+    total_disk_utilization = round(
+        data.get("resources", {}).get("disk", {}).get("utilization", 0)
+    )
+    total_disk_string = f"{total_disk_utilization:.1f}% ({total_disk_used:.1f} GB / {total_disk_total:.1f} GB)"
+
+    output.append(f"{ansii['purple']}CPU usage:{ansii['end']}      {total_cpu_string}")
+
+    output.append(
+        f"{ansii['purple']}Memory usage:{ansii['end']}   {total_memory_string}"
+    )
+
+    output.append(f"{ansii['purple']}Disk usage:{ansii['end']}     {total_disk_string}")

    output.append("")

--- a/daemon-common/ceph.py
+++ b/daemon-common/ceph.py
@ -560,7 +560,21 @@ def getVolumeInformation(zkhandler, pool, volume):
    return volume_information


-def add_volume(zkhandler, pool, name, size, force_flag=False):
+def scan_volume(zkhandler, pool, name):
+    retcode, stdout, stderr = common.run_os_command(
+        "rbd info --format json {}/{}".format(pool, name)
+    )
+    volstats = stdout
+
+    # 3. Add the new volume to Zookeeper
+    zkhandler.write(
+        [
+            (("volume.stats", f"{pool}/{name}"), volstats),
+        ]
+    )
+
+
+def add_volume(zkhandler, pool, name, size, force_flag=False, zk_only=False):
    # 1. Verify the size of the volume
    pool_information = getPoolInformation(zkhandler, pool)
    size_bytes = format_bytes_fromhuman(size)
@ -592,27 +606,28 @@ def add_volume(zkhandler, pool, name, size, force_flag=False):
        )

    # 2. Create the volume
-    retcode, stdout, stderr = common.run_os_command(
-        "rbd create --size {}B {}/{}".format(size_bytes, pool, name)
-    )
-    if retcode:
-        return False, 'ERROR: Failed to create RBD volume "{}": {}'.format(name, stderr)
-
-    # 2. Get volume stats
-    retcode, stdout, stderr = common.run_os_command(
-        "rbd info --format json {}/{}".format(pool, name)
-    )
-    volstats = stdout
+    # zk_only flag skips actually creating the volume - this would be done by some other mechanism
+    if not zk_only:
+        retcode, stdout, stderr = common.run_os_command(
+            "rbd create --size {}B {}/{}".format(size_bytes, pool, name)
+        )
+        if retcode:
+            return False, 'ERROR: Failed to create RBD volume "{}": {}'.format(
+                name, stderr
+            )

    # 3. Add the new volume to Zookeeper
    zkhandler.write(
        [
            (("volume", f"{pool}/{name}"), ""),
-            (("volume.stats", f"{pool}/{name}"), volstats),
+            (("volume.stats", f"{pool}/{name}"), ""),
            (("snapshot", f"{pool}/{name}"), ""),
        ]
    )

+    # 4. Scan the volume stats
+    scan_volume(zkhandler, pool, name)
+
    return True, 'Created RBD volume "{}" of size "{}" in pool "{}".'.format(
        name, format_bytes_tohuman(size_bytes), pool
    )
@ -662,21 +677,18 @@ def clone_volume(zkhandler, pool, name_src, name_new, force_flag=False):
            ),
        )

-    # 3. Get volume stats
-    retcode, stdout, stderr = common.run_os_command(
-        "rbd info --format json {}/{}".format(pool, name_new)
-    )
-    volstats = stdout
-
-    # 4. Add the new volume to Zookeeper
+    # 3. Add the new volume to Zookeeper
    zkhandler.write(
        [
            (("volume", f"{pool}/{name_new}"), ""),
-            (("volume.stats", f"{pool}/{name_new}"), volstats),
+            (("volume.stats", f"{pool}/{name_new}"), ""),
            (("snapshot", f"{pool}/{name_new}"), ""),
        ]
    )

+    # 4. Scan the volume stats
+    scan_volume(zkhandler, pool, name_new)
+
    return True, 'Cloned RBD volume "{}" to "{}" in pool "{}"'.format(
        name_src, name_new, pool
    )
@ -761,20 +773,8 @@ def resize_volume(zkhandler, pool, name, size, force_flag=False):
        except Exception:
            pass

-    # 4. Get volume stats
-    retcode, stdout, stderr = common.run_os_command(
-        "rbd info --format json {}/{}".format(pool, name)
-    )
-    volstats = stdout
-
-    # 5. Update the volume in Zookeeper
-    zkhandler.write(
-        [
-            (("volume", f"{pool}/{name}"), ""),
-            (("volume.stats", f"{pool}/{name}"), volstats),
-            (("snapshot", f"{pool}/{name}"), ""),
-        ]
-    )
+    # 4. Scan the volume stats
+    scan_volume(zkhandler, pool, name)

    return True, 'Resized RBD volume "{}" to size "{}" in pool "{}".'.format(
        name, format_bytes_tohuman(size_bytes), pool
@ -807,18 +807,8 @@ def rename_volume(zkhandler, pool, name, new_name):
        ]
    )

-    # 3. Get volume stats
-    retcode, stdout, stderr = common.run_os_command(
-        "rbd info --format json {}/{}".format(pool, new_name)
-    )
-    volstats = stdout
-
-    # 4. Update the volume stats in Zookeeper
-    zkhandler.write(
-        [
-            (("volume.stats", f"{pool}/{new_name}"), volstats),
-        ]
-    )
+    # 3. Scan the volume stats
+    scan_volume(zkhandler, pool, new_name)

    return True, 'Renamed RBD volume "{}" to "{}" in pool "{}".'.format(
        name, new_name, pool
--- a/daemon-common/cluster.py
+++ b/daemon-common/cluster.py
@ -262,6 +262,22 @@ def getClusterInformation(zkhandler):
    # Get cluster maintenance state
    maintenance_state = zkhandler.read("base.config.maintenance")

+    # Prepare cluster total values
+    cluster_total_node_memory = 0
+    cluster_total_used_memory = 0
+    cluster_total_free_memory = 0
+    cluster_total_allocated_memory = 0
+    cluster_total_provisioned_memory = 0
+    cluster_total_average_memory_utilization = 0
+    cluster_total_cpu_cores = 0
+    cluster_total_cpu_load = 0
+    cluster_total_average_cpu_utilization = 0
+    cluster_total_allocated_cores = 0
+    cluster_total_osd_space = 0
+    cluster_total_used_space = 0
+    cluster_total_free_space = 0
+    cluster_total_average_osd_utilization = 0
+
    # Get primary node
    maintenance_state, primary_node = zkhandler.read_many(
        [
@ -276,19 +292,36 @@ def getClusterInformation(zkhandler):
    # Get the list of Nodes
    node_list = zkhandler.children("base.node")
    node_count = len(node_list)
-    # Get the daemon and domain states of all Nodes
+    # Get the information of all Nodes
    node_state_reads = list()
+    node_memory_reads = list()
+    node_cpu_reads = list()
    for node in node_list:
        node_state_reads += [
            ("node.state.daemon", node),
            ("node.state.domain", node),
        ]
+        node_memory_reads += [
+            ("node.memory.total", node),
+            ("node.memory.used", node),
+            ("node.memory.free", node),
+            ("node.memory.allocated", node),
+            ("node.memory.provisioned", node),
+        ]
+        node_cpu_reads += [
+            ("node.data.static", node),
+            ("node.vcpu.allocated", node),
+            ("node.cpu.load", node),
+        ]
    all_node_states = zkhandler.read_many(node_state_reads)
+    all_node_memory = zkhandler.read_many(node_memory_reads)
+    all_node_cpu = zkhandler.read_many(node_cpu_reads)
+
    # Parse out the Node states
    node_data = list()
    formatted_node_states = {"total": node_count}
    for nidx, node in enumerate(node_list):
-        # Split the large list of return values by the IDX of this node
+        # Split the large list of return values by the IDX of this node (states)
        # Each node result is 2 fields long
        pos_start = nidx * 2
        pos_end = nidx * 2 + 2
@ -308,6 +341,46 @@ def getClusterInformation(zkhandler):
            else:
                formatted_node_states[node_state] = 1

+        # Split the large list of return values by the IDX of this node (memory)
+        # Each node result is 5 fields long
+        pos_start = nidx * 5
+        pos_end = nidx * 5 + 5
+        (
+            node_memory_total,
+            node_memory_used,
+            node_memory_free,
+            node_memory_allocated,
+            node_memory_provisioned,
+        ) = tuple(all_node_memory[pos_start:pos_end])
+        cluster_total_node_memory += int(node_memory_total)
+        cluster_total_used_memory += int(node_memory_used)
+        cluster_total_free_memory += int(node_memory_free)
+        cluster_total_allocated_memory += int(node_memory_allocated)
+        cluster_total_provisioned_memory += int(node_memory_provisioned)
+
+        # Split the large list of return values by the IDX of this node (cpu)
+        # Each node result is 3 fields long
+        pos_start = nidx * 3
+        pos_end = nidx * 3 + 3
+        node_static_data, node_vcpu_allocated, node_cpu_load = tuple(
+            all_node_cpu[pos_start:pos_end]
+        )
+        cluster_total_cpu_cores += int(node_static_data.split()[0])
+        cluster_total_cpu_load += round(float(node_cpu_load), 2)
+        cluster_total_allocated_cores += int(node_vcpu_allocated)
+
+    cluster_total_average_memory_utilization = (
+        (round((cluster_total_used_memory / cluster_total_node_memory) * 100, 2))
+        if cluster_total_node_memory > 0
+        else 0.00
+    )
+
+    cluster_total_average_cpu_utilization = (
+        (round((cluster_total_cpu_load / cluster_total_cpu_cores) * 100, 2))
+        if cluster_total_cpu_cores > 0
+        else 0.00
+    )
+
    # Get the list of VMs
    vm_list = zkhandler.children("base.domain")
    vm_count = len(vm_list)
@ -380,6 +453,18 @@ def getClusterInformation(zkhandler):
            else:
                formatted_osd_states[osd_state] = 1

+        # Add the OSD utilization
+        cluster_total_osd_space += int(osd_stats["kb"])
+        cluster_total_used_space += int(osd_stats["kb_used"])
+        cluster_total_free_space += int(osd_stats["kb_avail"])
+        cluster_total_average_osd_utilization += float(osd_stats["utilization"])
+
+    cluster_total_average_osd_utilization = (
+        (round(cluster_total_average_osd_utilization / len(ceph_osd_list), 2))
+        if ceph_osd_list
+        else 0.00
+    )
+
    # Get the list of Networks
    network_list = zkhandler.children("base.network")
    network_count = len(network_list)
@ -424,6 +509,28 @@ def getClusterInformation(zkhandler):
        "pools": ceph_pool_count,
        "volumes": ceph_volume_count,
        "snapshots": ceph_snapshot_count,
+        "resources": {
+            "memory": {
+                "total": cluster_total_node_memory,
+                "free": cluster_total_free_memory,
+                "used": cluster_total_used_memory,
+                "allocated": cluster_total_allocated_memory,
+                "provisioned": cluster_total_provisioned_memory,
+                "utilization": cluster_total_average_memory_utilization,
+            },
+            "cpu": {
+                "total": cluster_total_cpu_cores,
+                "load": cluster_total_cpu_load,
+                "allocated": cluster_total_allocated_cores,
+                "utilization": cluster_total_average_cpu_utilization,
+            },
+            "disk": {
+                "total": cluster_total_osd_space,
+                "used": cluster_total_used_space,
+                "free": cluster_total_free_space,
+                "utilization": cluster_total_average_osd_utilization,
+            },
+        },
        "detail": {
            "node": node_data,
            "vm": vm_data,