Compare commits

...

3 Commits

Author SHA1 Message Date
Joshua Boniface 73c0834f85 Remove headers and add util to short output 2024-09-06 11:40:39 -04:00
Joshua Boniface 2de999c700 Add total cluster utilization stats
Useful for evaluating the cluster resources as a whole.
2024-09-05 16:05:33 -04:00
Joshua Boniface 7543eb839d Add dedicated volume scan endpoint
Allows an imported volume to be scanned for stats independently.

Designed to be used as part of a snapshot import via API, to allow the
"create" to happen before the real import (to check for available space,
etc.) and then run this import after when the RBD volume actually
exists.
2024-09-03 20:32:27 -04:00
5 changed files with 350 additions and 69 deletions

View File

@ -576,6 +576,63 @@ class API_Status(Resource):
snapshots:
type: integer
description: The total number of snapshots in the storage cluster
resources:
type: object
properties:
memory:
type: object
properties:
total:
type: integer
description: The total amount of RAM (all nodes) in MB
used:
type: integer
description: The total used RAM (all nodes) in MB
free:
type: integer
description: The total free RAM (all nodes) in MB
allocated:
type: integer
description: The total amount of RAM allocated to running domains in MB
provisioned:
type: integer
description: The total amount of RAM provisioned to all domains (regardless of state) in MB
utilization:
type: float
description: The memory utilization percentage (average) of the cluster
cpu:
type: object
properties:
total:
type: integer
description: The total number of real CPU cores (all nodes)
load:
type: float
description: The current 5-minute CPU load (all nodes summed)
allocated:
type: integer
description: The total number of vCPUs allocated to running domains
provisioned:
type: integer
description: The total number of vCPUs provisioned to all domains (regardless of state)
utilization:
type: float
description: The CPU utilization percentage (average) of the cluster
disk:
type: object
properties:
total:
type: integer
description: The total size of all OSDs in KB
used:
type: integer
description: The total used size of all OSDs in KB
free:
type: integer
description: The total free size of all OSDs in KB
utilization:
type: float
description: The disk utilization percentage (average) of the cluster
400:
description: Bad request
"""
@ -6461,6 +6518,41 @@ api.add_resource(
)
# /storage/ceph/volume/<pool>/<volume>/scan
# REST resource for the per-volume "scan" action; registered on the
# /storage/ceph/volume/<pool>/<volume>/scan route by the add_resource call
# that follows this class.
class API_Storage_Ceph_Volume_Element_Scan(Resource):
    # Authenticator is defined elsewhere in the file; presumably it enforces
    # API authentication before the handler runs - TODO confirm.
    @Authenticator
    def post(self, pool, volume):
        # NOTE(review): the text after "---" in the docstring looks like a
        # Swagger/OpenAPI spec that is parsed at runtime, so it is left
        # untouched here. Its "parameters:" key is empty and the "pool" and
        # "volume" path arguments are not described - confirm this is intended.
        """
        Scan a Ceph volume {volume} in pool {pool} for stats (after import)
        ---
        tags:
          - storage / ceph
        parameters:
        responses:
          200:
            description: OK
            schema:
              type: object
              id: Message
          404:
            description: Not found
            schema:
              type: object
              id: Message
          400:
            description: Bad request
            schema:
              type: object
              id: Message
        """
        # Pure delegation: the helper's return value is passed back unchanged.
        return api_helper.ceph_volume_scan(pool, volume)
api.add_resource(
API_Storage_Ceph_Volume_Element_Scan, "/storage/ceph/volume/<pool>/<volume>/scan"
)
# /storage/ceph/volume/<pool>/<volume>/clone
class API_Storage_Ceph_Volume_Element_Clone(Resource):
@RequestParser(

View File

@ -1996,6 +1996,22 @@ def ceph_volume_list(zkhandler, pool=None, limit=None, is_fuzzy=True):
return retdata, retcode
@ZKConnection(config)
def ceph_volume_scan(zkhandler, pool, name):
    """
    Trigger a stats (re)scan of one Ceph RBD volume and wrap the outcome for the API.

    Calls pvc_ceph.scan_volume() with the given pool and volume name and expects
    a (success flag, message) pair back. Returns a ({"message": ...}, status)
    tuple where the status is 200 when the flag is truthy and 400 otherwise.
    """
    success, message = pvc_ceph.scan_volume(zkhandler, pool, name)

    status = 200 if success else 400

    # Double quotes inside the message are swapped for single quotes before
    # the message is handed back to the caller.
    return {"message": message.replace('"', "'")}, status
@ZKConnection(config)
def ceph_volume_add(zkhandler, pool, name, size, force_flag=False):
"""

View File

@ -83,6 +83,37 @@ def cli_cluster_status_format_pretty(CLI_CONFIG, data):
total_volumes = data.get("volumes", 0)
total_snapshots = data.get("snapshots", 0)
total_cpu_total = data.get("resources", {}).get("cpu", {}).get("total", 0)
total_cpu_load = data.get("resources", {}).get("cpu", {}).get("load", 0)
total_cpu_utilization = (
data.get("resources", {}).get("cpu", {}).get("utilization", 0)
)
total_cpu_string = (
f"{total_cpu_utilization:.1f}% ({total_cpu_load:.1f} / {total_cpu_total})"
)
total_memory_total = (
data.get("resources", {}).get("memory", {}).get("total", 0) / 1024
)
total_memory_used = (
data.get("resources", {}).get("memory", {}).get("used", 0) / 1024
)
total_memory_utilization = (
data.get("resources", {}).get("memory", {}).get("utilization", 0)
)
total_memory_string = f"{total_memory_utilization:.1f}% ({total_memory_used:.1f} GB / {total_memory_total:.1f} GB)"
total_disk_total = (
data.get("resources", {}).get("disk", {}).get("total", 0) / 1024 / 1024
)
total_disk_used = (
data.get("resources", {}).get("disk", {}).get("used", 0) / 1024 / 1024
)
total_disk_utilization = round(
data.get("resources", {}).get("disk", {}).get("utilization", 0)
)
total_disk_string = f"{total_disk_utilization:.1f}% ({total_disk_used:.1f} GB / {total_disk_total:.1f} GB)"
if maintenance == "true" or health == -1:
health_colour = ansii["blue"]
elif health > 90:
@ -94,12 +125,9 @@ def cli_cluster_status_format_pretty(CLI_CONFIG, data):
output = list()
output.append(f"{ansii['bold']}PVC cluster status:{ansii['end']}")
output.append("")
output.append(f"{ansii['purple']}Primary node:{ansii['end']} {primary_node}")
output.append(f"{ansii['purple']}PVC version:{ansii['end']} {pvc_version}")
output.append(f"{ansii['purple']}Upstream IP:{ansii['end']} {upstream_ip}")
output.append(f"{ansii['purple']}Primary node:{ansii['end']} {primary_node}")
output.append(f"{ansii['purple']}PVC version:{ansii['end']} {pvc_version}")
output.append(f"{ansii['purple']}Upstream IP:{ansii['end']} {upstream_ip}")
output.append("")
if health != "-1":
@ -111,7 +139,7 @@ def cli_cluster_status_format_pretty(CLI_CONFIG, data):
health = f"{health} (maintenance on)"
output.append(
f"{ansii['purple']}Health:{ansii['end']} {health_colour}{health}{ansii['end']}"
f"{ansii['purple']}Health:{ansii['end']} {health_colour}{health}{ansii['end']}"
)
if messages is not None and len(messages) > 0:
@ -136,7 +164,17 @@ def cli_cluster_status_format_pretty(CLI_CONFIG, data):
)
messages = "\n ".join(message_list)
output.append(f"{ansii['purple']}Active Faults:{ansii['end']} {messages}")
else:
messages = "None"
output.append(f"{ansii['purple']}Active faults:{ansii['end']} {messages}")
output.append(f"{ansii['purple']}Total CPU:{ansii['end']} {total_cpu_string}")
output.append(
f"{ansii['purple']}Total memory:{ansii['end']} {total_memory_string}"
)
output.append(f"{ansii['purple']}Total disk:{ansii['end']} {total_disk_string}")
output.append("")
@ -166,7 +204,7 @@ def cli_cluster_status_format_pretty(CLI_CONFIG, data):
nodes_string = ", ".join(nodes_strings)
output.append(f"{ansii['purple']}Nodes:{ansii['end']} {nodes_string}")
output.append(f"{ansii['purple']}Nodes:{ansii['end']} {nodes_string}")
vm_states = ["start", "disable"]
vm_states.extend(
@ -196,7 +234,7 @@ def cli_cluster_status_format_pretty(CLI_CONFIG, data):
vms_string = ", ".join(vms_strings)
output.append(f"{ansii['purple']}VMs:{ansii['end']} {vms_string}")
output.append(f"{ansii['purple']}VMs:{ansii['end']} {vms_string}")
osd_states = ["up,in"]
osd_states.extend(
@ -222,15 +260,15 @@ def cli_cluster_status_format_pretty(CLI_CONFIG, data):
osds_string = " ".join(osds_strings)
output.append(f"{ansii['purple']}OSDs:{ansii['end']} {osds_string}")
output.append(f"{ansii['purple']}OSDs:{ansii['end']} {osds_string}")
output.append(f"{ansii['purple']}Pools:{ansii['end']} {total_pools}")
output.append(f"{ansii['purple']}Pools:{ansii['end']} {total_pools}")
output.append(f"{ansii['purple']}Volumes:{ansii['end']} {total_volumes}")
output.append(f"{ansii['purple']}Volumes:{ansii['end']} {total_volumes}")
output.append(f"{ansii['purple']}Snapshots:{ansii['end']} {total_snapshots}")
output.append(f"{ansii['purple']}Snapshots:{ansii['end']} {total_snapshots}")
output.append(f"{ansii['purple']}Networks:{ansii['end']} {total_networks}")
output.append(f"{ansii['purple']}Networks:{ansii['end']} {total_networks}")
output.append("")
@ -258,9 +296,6 @@ def cli_cluster_status_format_short(CLI_CONFIG, data):
output = list()
output.append(f"{ansii['bold']}PVC cluster status:{ansii['end']}")
output.append("")
if health != "-1":
health = f"{health}%"
else:
@ -270,7 +305,7 @@ def cli_cluster_status_format_short(CLI_CONFIG, data):
health = f"{health} (maintenance on)"
output.append(
f"{ansii['purple']}Health:{ansii['end']} {health_colour}{health}{ansii['end']}"
f"{ansii['purple']}Health:{ansii['end']} {health_colour}{health}{ansii['end']}"
)
if messages is not None and len(messages) > 0:
@ -295,7 +330,48 @@ def cli_cluster_status_format_short(CLI_CONFIG, data):
)
messages = "\n ".join(message_list)
output.append(f"{ansii['purple']}Active Faults:{ansii['end']} {messages}")
else:
messages = "None"
output.append(f"{ansii['purple']}Active faults:{ansii['end']} {messages}")
total_cpu_total = data.get("resources", {}).get("cpu", {}).get("total", 0)
total_cpu_load = data.get("resources", {}).get("cpu", {}).get("load", 0)
total_cpu_utilization = (
data.get("resources", {}).get("cpu", {}).get("utilization", 0)
)
total_cpu_string = (
f"{total_cpu_utilization:.1f}% ({total_cpu_load:.1f} / {total_cpu_total})"
)
total_memory_total = (
data.get("resources", {}).get("memory", {}).get("total", 0) / 1024
)
total_memory_used = (
data.get("resources", {}).get("memory", {}).get("used", 0) / 1024
)
total_memory_utilization = (
data.get("resources", {}).get("memory", {}).get("utilization", 0)
)
total_memory_string = f"{total_memory_utilization:.1f}% ({total_memory_used:.1f} GB / {total_memory_total:.1f} GB)"
total_disk_total = (
data.get("resources", {}).get("disk", {}).get("total", 0) / 1024 / 1024
)
total_disk_used = (
data.get("resources", {}).get("disk", {}).get("used", 0) / 1024 / 1024
)
total_disk_utilization = round(
data.get("resources", {}).get("disk", {}).get("utilization", 0)
)
total_disk_string = f"{total_disk_utilization:.1f}% ({total_disk_used:.1f} GB / {total_disk_total:.1f} GB)"
output.append(f"{ansii['purple']}CPU usage:{ansii['end']} {total_cpu_string}")
output.append(
f"{ansii['purple']}Memory usage:{ansii['end']} {total_memory_string}"
)
output.append(f"{ansii['purple']}Disk usage:{ansii['end']} {total_disk_string}")
output.append("")

View File

@ -560,7 +560,21 @@ def getVolumeInformation(zkhandler, pool, volume):
return volume_information
def add_volume(zkhandler, pool, name, size, force_flag=False):
def scan_volume(zkhandler, pool, name):
    """
    (Re)scan an RBD volume with "rbd info" and store the result as its stats.

    Runs "rbd info --format json <pool>/<name>" and, on success, writes the raw
    command output to the "volume.stats" key of <pool>/<name> in Zookeeper.

    Returns a (success, message) tuple so that callers which unpack the result
    (such as the API helper) get a status flag and a human-readable message;
    callers which ignore the return value are unaffected.
    """
    # 1. Get the volume stats
    retcode, stdout, stderr = common.run_os_command(
        "rbd info --format json {}/{}".format(pool, name)
    )
    if retcode:
        # Do not overwrite the stored stats with the output of a failed
        # command; report the failure instead.
        return (
            False,
            'ERROR: Failed to get information for RBD volume "{}" in pool "{}": {}'.format(
                name, pool, stderr
            ),
        )

    # 2. Update the volume stats in Zookeeper
    zkhandler.write(
        [
            (("volume.stats", f"{pool}/{name}"), stdout),
        ]
    )

    return True, 'Scanned RBD volume "{}" in pool "{}".'.format(name, pool)
def add_volume(zkhandler, pool, name, size, force_flag=False, zk_only=False):
# 1. Verify the size of the volume
pool_information = getPoolInformation(zkhandler, pool)
size_bytes = format_bytes_fromhuman(size)
@ -592,27 +606,28 @@ def add_volume(zkhandler, pool, name, size, force_flag=False):
)
# 2. Create the volume
retcode, stdout, stderr = common.run_os_command(
"rbd create --size {}B {}/{}".format(size_bytes, pool, name)
)
if retcode:
return False, 'ERROR: Failed to create RBD volume "{}": {}'.format(name, stderr)
# 2. Get volume stats
retcode, stdout, stderr = common.run_os_command(
"rbd info --format json {}/{}".format(pool, name)
)
volstats = stdout
# zk_only flag skips actually creating the volume - this would be done by some other mechanism
if not zk_only:
retcode, stdout, stderr = common.run_os_command(
"rbd create --size {}B {}/{}".format(size_bytes, pool, name)
)
if retcode:
return False, 'ERROR: Failed to create RBD volume "{}": {}'.format(
name, stderr
)
# 3. Add the new volume to Zookeeper
zkhandler.write(
[
(("volume", f"{pool}/{name}"), ""),
(("volume.stats", f"{pool}/{name}"), volstats),
(("volume.stats", f"{pool}/{name}"), ""),
(("snapshot", f"{pool}/{name}"), ""),
]
)
# 4. Scan the volume stats
scan_volume(zkhandler, pool, name)
return True, 'Created RBD volume "{}" of size "{}" in pool "{}".'.format(
name, format_bytes_tohuman(size_bytes), pool
)
@ -662,21 +677,18 @@ def clone_volume(zkhandler, pool, name_src, name_new, force_flag=False):
),
)
# 3. Get volume stats
retcode, stdout, stderr = common.run_os_command(
"rbd info --format json {}/{}".format(pool, name_new)
)
volstats = stdout
# 4. Add the new volume to Zookeeper
# 3. Add the new volume to Zookeeper
zkhandler.write(
[
(("volume", f"{pool}/{name_new}"), ""),
(("volume.stats", f"{pool}/{name_new}"), volstats),
(("volume.stats", f"{pool}/{name_new}"), ""),
(("snapshot", f"{pool}/{name_new}"), ""),
]
)
# 4. Scan the volume stats
scan_volume(zkhandler, pool, name_new)
return True, 'Cloned RBD volume "{}" to "{}" in pool "{}"'.format(
name_src, name_new, pool
)
@ -761,20 +773,8 @@ def resize_volume(zkhandler, pool, name, size, force_flag=False):
except Exception:
pass
# 4. Get volume stats
retcode, stdout, stderr = common.run_os_command(
"rbd info --format json {}/{}".format(pool, name)
)
volstats = stdout
# 5. Update the volume in Zookeeper
zkhandler.write(
[
(("volume", f"{pool}/{name}"), ""),
(("volume.stats", f"{pool}/{name}"), volstats),
(("snapshot", f"{pool}/{name}"), ""),
]
)
# 4. Scan the volume stats
scan_volume(zkhandler, pool, name)
return True, 'Resized RBD volume "{}" to size "{}" in pool "{}".'.format(
name, format_bytes_tohuman(size_bytes), pool
@ -807,18 +807,8 @@ def rename_volume(zkhandler, pool, name, new_name):
]
)
# 3. Get volume stats
retcode, stdout, stderr = common.run_os_command(
"rbd info --format json {}/{}".format(pool, new_name)
)
volstats = stdout
# 4. Update the volume stats in Zookeeper
zkhandler.write(
[
(("volume.stats", f"{pool}/{new_name}"), volstats),
]
)
# 3. Scan the volume stats
scan_volume(zkhandler, pool, new_name)
return True, 'Renamed RBD volume "{}" to "{}" in pool "{}".'.format(
name, new_name, pool

View File

@ -262,6 +262,22 @@ def getClusterInformation(zkhandler):
# Get cluster maintenance state
maintenance_state = zkhandler.read("base.config.maintenance")
# Prepare cluster total values
cluster_total_node_memory = 0
cluster_total_used_memory = 0
cluster_total_free_memory = 0
cluster_total_allocated_memory = 0
cluster_total_provisioned_memory = 0
cluster_total_average_memory_utilization = 0
cluster_total_cpu_cores = 0
cluster_total_cpu_load = 0
cluster_total_average_cpu_utilization = 0
cluster_total_allocated_cores = 0
cluster_total_osd_space = 0
cluster_total_used_space = 0
cluster_total_free_space = 0
cluster_total_average_osd_utilization = 0
# Get primary node
maintenance_state, primary_node = zkhandler.read_many(
[
@ -276,19 +292,36 @@ def getClusterInformation(zkhandler):
# Get the list of Nodes
node_list = zkhandler.children("base.node")
node_count = len(node_list)
# Get the daemon and domain states of all Nodes
# Get the information of all Nodes
node_state_reads = list()
node_memory_reads = list()
node_cpu_reads = list()
for node in node_list:
node_state_reads += [
("node.state.daemon", node),
("node.state.domain", node),
]
node_memory_reads += [
("node.memory.total", node),
("node.memory.used", node),
("node.memory.free", node),
("node.memory.allocated", node),
("node.memory.provisioned", node),
]
node_cpu_reads += [
("node.data.static", node),
("node.vcpu.allocated", node),
("node.cpu.load", node),
]
all_node_states = zkhandler.read_many(node_state_reads)
all_node_memory = zkhandler.read_many(node_memory_reads)
all_node_cpu = zkhandler.read_many(node_cpu_reads)
# Parse out the Node states
node_data = list()
formatted_node_states = {"total": node_count}
for nidx, node in enumerate(node_list):
# Split the large list of return values by the IDX of this node
# Split the large list of return values by the IDX of this node (states)
# Each node result is 2 fields long
pos_start = nidx * 2
pos_end = nidx * 2 + 2
@ -308,6 +341,46 @@ def getClusterInformation(zkhandler):
else:
formatted_node_states[node_state] = 1
# Split the large list of return values by the IDX of this node (memory)
# Each node result is 5 fields long
pos_start = nidx * 5
pos_end = nidx * 5 + 5
(
node_memory_total,
node_memory_used,
node_memory_free,
node_memory_allocated,
node_memory_provisioned,
) = tuple(all_node_memory[pos_start:pos_end])
cluster_total_node_memory += int(node_memory_total)
cluster_total_used_memory += int(node_memory_used)
cluster_total_free_memory += int(node_memory_free)
cluster_total_allocated_memory += int(node_memory_allocated)
cluster_total_provisioned_memory += int(node_memory_provisioned)
# Split the large list of return values by the IDX of this node (cpu)
# Each node result is 3 fields long
pos_start = nidx * 3
pos_end = nidx * 3 + 3
node_static_data, node_vcpu_allocated, node_cpu_load = tuple(
all_node_cpu[pos_start:pos_end]
)
cluster_total_cpu_cores += int(node_static_data.split()[0])
cluster_total_cpu_load += round(float(node_cpu_load), 2)
cluster_total_allocated_cores += int(node_vcpu_allocated)
cluster_total_average_memory_utilization = (
(round((cluster_total_used_memory / cluster_total_node_memory) * 100, 2))
if cluster_total_node_memory > 0
else 0.00
)
cluster_total_average_cpu_utilization = (
(round((cluster_total_cpu_load / cluster_total_cpu_cores) * 100, 2))
if cluster_total_cpu_cores > 0
else 0.00
)
# Get the list of VMs
vm_list = zkhandler.children("base.domain")
vm_count = len(vm_list)
@ -380,6 +453,18 @@ def getClusterInformation(zkhandler):
else:
formatted_osd_states[osd_state] = 1
# Add the OSD utilization
cluster_total_osd_space += int(osd_stats["kb"])
cluster_total_used_space += int(osd_stats["kb_used"])
cluster_total_free_space += int(osd_stats["kb_avail"])
cluster_total_average_osd_utilization += float(osd_stats["utilization"])
cluster_total_average_osd_utilization = (
(round(cluster_total_average_osd_utilization / len(ceph_osd_list), 2))
if ceph_osd_list
else 0.00
)
# Get the list of Networks
network_list = zkhandler.children("base.network")
network_count = len(network_list)
@ -424,6 +509,28 @@ def getClusterInformation(zkhandler):
"pools": ceph_pool_count,
"volumes": ceph_volume_count,
"snapshots": ceph_snapshot_count,
"resources": {
"memory": {
"total": cluster_total_node_memory,
"free": cluster_total_free_memory,
"used": cluster_total_used_memory,
"allocated": cluster_total_allocated_memory,
"provisioned": cluster_total_provisioned_memory,
"utilization": cluster_total_average_memory_utilization,
},
"cpu": {
"total": cluster_total_cpu_cores,
"load": cluster_total_cpu_load,
"allocated": cluster_total_allocated_cores,
"utilization": cluster_total_average_cpu_utilization,
},
"disk": {
"total": cluster_total_osd_space,
"used": cluster_total_used_space,
"free": cluster_total_free_space,
"utilization": cluster_total_average_osd_utilization,
},
},
"detail": {
"node": node_data,
"vm": vm_data,