diff --git a/api-daemon/pvcapid/flaskapi.py b/api-daemon/pvcapid/flaskapi.py
index 60b2df8b..9f27b213 100755
--- a/api-daemon/pvcapid/flaskapi.py
+++ b/api-daemon/pvcapid/flaskapi.py
@@ -5058,95 +5058,7 @@ class API_Storage_Ceph_Benchmark(Resource):
                 description: The PVC benchmark format of the results
               benchmark_result:
                 type: object
-                description: A format 0 test result
-                properties:
-                  test_name:
-                    type: object
-                    properties:
-                      overall:
-                        type: object
-                        properties:
-                          iosize:
-                            type: string (integer)
-                            description: The total size of the benchmark data
-                          bandwidth:
-                            type: string (integer)
-                            description: The average bandwidth (KiB/s)
-                          iops:
-                            type: string (integer)
-                            description: The average IOPS
-                          runtime:
-                            type: string (integer)
-                            description: The total test time in milliseconds
-                      latency:
-                        type: object
-                        properties:
-                          min:
-                            type: string (integer)
-                            description: The minimum latency measurement
-                          max:
-                            type: string (integer)
-                            description: The maximum latency measurement
-                          mean:
-                            type: string (float)
-                            description: The mean latency measurement
-                          stdev:
-                            type: string (float)
-                            description: The standard deviation of latency
-                      bandwidth:
-                        type: object
-                        properties:
-                          min:
-                            type: string (integer)
-                            description: The minimum bandwidth (KiB/s) measurement
-                          max:
-                            type: string (integer)
-                            description: The maximum bandwidth (KiB/s) measurement
-                          mean:
-                            type: string (float)
-                            description: The mean bandwidth (KiB/s) measurement
-                          stdev:
-                            type: string (float)
-                            description: The standard deviation of bandwidth
-                          numsamples:
-                            type: string (integer)
-                            description: The number of samples taken during the test
-                      iops:
-                        type: object
-                        properties:
-                          min:
-                            type: string (integer)
-                            description: The minimum IOPS measurement
-                          max:
-                            type: string (integer)
-                            description: The maximum IOPS measurement
-                          mean:
-                            type: string (float)
-                            description: The mean IOPS measurement
-                          stdev:
-                            type: string (float)
-                            description: The standard deviation of IOPS
-                          numsamples:
-                            type: string (integer)
-                            description: The number of samples taken during the test
-                      cpu:
-                        type: object
-                        properties:
-                          user:
-                            type: string (float percentage)
-                            description: The percentage of test time spent in user space
-                          system:
-                            type: string (float percentage)
-                            description: The percentage of test time spent in system (kernel) space
-                          ctxsw:
-                            type: string (integer)
-                            description: The number of context switches during the test
-                          majfault:
-                            type: string (integer)
-                            description: The number of major page faults during the test
-                          minfault:
-                            type: string (integer)
-                            description: The number of minor page faults during the test
+                description: A benchmark test result; format not documented due to complexity
         """
         return list_benchmarks(config, reqargs.get("job", None))
 
diff --git a/client-cli/pvc/lib/storage.py b/client-cli/pvc/lib/storage.py
index 7f3bcb95..1dbca65c 100644
--- a/client-cli/pvc/lib/storage.py
+++ b/client-cli/pvc/lib/storage.py
@@ -30,6 +30,7 @@ from requests_toolbelt.multipart.encoder import (
 
 import pvc.lib.ansiprint as ansiprint
 from pvc.lib.common import UploadProgressBar, call_api, get_wait_retdata
+from pvc.cli.helpers import MAX_CONTENT_WIDTH
 
 #
 # Supplemental functions
@@ -1804,7 +1805,7 @@ def get_benchmark_list_results(benchmark_format, benchmark_data):
         benchmark_bandwidth, benchmark_iops = get_benchmark_list_results_legacy(
             benchmark_data
         )
-    elif benchmark_format == 1:
+    elif benchmark_format == 1 or benchmark_format == 2:
         benchmark_bandwidth, benchmark_iops = get_benchmark_list_results_json(
             benchmark_data
         )
@@ -2006,6 +2007,7 @@ def format_info_benchmark(config, benchmark_information):
     benchmark_matrix = {
         0: format_info_benchmark_legacy,
         1: format_info_benchmark_json,
+        2: format_info_benchmark_json,
     }
 
     benchmark_version = benchmark_information[0]["test_format"]
@@ -2340,12 +2342,15 @@ def format_info_benchmark_json(config, benchmark_information):
     if benchmark_information["benchmark_result"] == "Running":
         return "Benchmark test is still running."
 
+    benchmark_format = benchmark_information["test_format"]
     benchmark_details = benchmark_information["benchmark_result"]
 
     # Format a nice output; do this line-by-line then concat the elements at the end
     ainformation = []
     ainformation.append(
-        "{}Storage Benchmark details:{}".format(ansiprint.bold(), ansiprint.end())
+        "{}Storage Benchmark details (format {}):{}".format(
+            ansiprint.bold(), benchmark_format, ansiprint.end()
+        )
     )
 
     nice_test_name_map = {
@@ -2393,7 +2398,7 @@ def format_info_benchmark_json(config, benchmark_information):
             if element[1] != 0:
                 useful_latency_tree.append(element)
 
-        max_rows = 9
+        max_rows = 5
         if len(useful_latency_tree) > 9:
             max_rows = len(useful_latency_tree)
         elif len(useful_latency_tree) < 9:
@@ -2402,15 +2407,10 @@
 
         # Format the static data
         overall_label = [
-            "Overall BW/s:",
-            "Overall IOPS:",
-            "Total I/O:",
-            "Runtime (s):",
-            "User CPU %:",
-            "System CPU %:",
-            "Ctx Switches:",
-            "Major Faults:",
-            "Minor Faults:",
+            "BW/s:",
+            "IOPS:",
+            "I/O:",
+            "Time:",
         ]
         while len(overall_label) < max_rows:
             overall_label.append("")
@@ -2419,68 +2419,149 @@
             format_bytes_tohuman(int(job_details[io_class]["bw_bytes"])),
             format_ops_tohuman(int(job_details[io_class]["iops"])),
             format_bytes_tohuman(int(job_details[io_class]["io_bytes"])),
-            job_details["job_runtime"] / 1000,
-            job_details["usr_cpu"],
-            job_details["sys_cpu"],
-            job_details["ctx"],
-            job_details["majf"],
-            job_details["minf"],
+            str(job_details["job_runtime"] / 1000) + "s",
         ]
         while len(overall_data) < max_rows:
             overall_data.append("")
 
+        cpu_label = [
+            "Total:",
+            "User:",
+            "Sys:",
+            "OSD:",
+            "MON:",
+        ]
+        while len(cpu_label) < max_rows:
+            cpu_label.append("")
+
+        cpu_data = [
+            (
+                benchmark_details[test]["avg_cpu_util_percent"]["total"]
+                if benchmark_format > 1
+                else "N/A"
+            ),
+            round(job_details["usr_cpu"], 2),
+            round(job_details["sys_cpu"], 2),
+            (
+                benchmark_details[test]["avg_cpu_util_percent"]["ceph-osd"]
+                if benchmark_format > 1
+                else "N/A"
+            ),
+            (
+                benchmark_details[test]["avg_cpu_util_percent"]["ceph-mon"]
+                if benchmark_format > 1
+                else "N/A"
+            ),
+        ]
+        while len(cpu_data) < max_rows:
+            cpu_data.append("")
+
+        memory_label = [
+            "Total:",
+            "OSD:",
+            "MON:",
+        ]
+        while len(memory_label) < max_rows:
+            memory_label.append("")
+
+        memory_data = [
+            (
+                benchmark_details[test]["avg_memory_util_percent"]["total"]
+                if benchmark_format > 1
+                else "N/A"
+            ),
+            (
+                benchmark_details[test]["avg_memory_util_percent"]["ceph-osd"]
+                if benchmark_format > 1
+                else "N/A"
+            ),
+            (
+                benchmark_details[test]["avg_memory_util_percent"]["ceph-mon"]
+                if benchmark_format > 1
+                else "N/A"
+            ),
+        ]
+        while len(memory_data) < max_rows:
+            memory_data.append("")
+
+        network_label = [
+            "Total:",
+            "Sent:",
+            "Recv:",
+        ]
+        while len(network_label) < max_rows:
+            network_label.append("")
+
+        network_data = [
+            (
+                format_bytes_tohuman(
+                    int(benchmark_details[test]["avg_network_util_bps"]["total"])
+                )
+                if benchmark_format > 1
+                else "N/A"
+            ),
+            (
+                format_bytes_tohuman(
+                    int(benchmark_details[test]["avg_network_util_bps"]["sent"])
+                )
+                if benchmark_format > 1
+                else "N/A"
+            ),
+            (
+                format_bytes_tohuman(
+                    int(benchmark_details[test]["avg_network_util_bps"]["recv"])
+                )
+                if benchmark_format > 1
+                else "N/A"
+            ),
+        ]
+        while len(network_data) < max_rows:
+            network_data.append("")
+
         bandwidth_label = [
             "Min:",
             "Max:",
             "Mean:",
             "StdDev:",
             "Samples:",
-            "",
-            "",
-            "",
-            "",
         ]
         while len(bandwidth_label) < max_rows:
             bandwidth_label.append("")
 
         bandwidth_data = [
-            format_bytes_tohuman(int(job_details[io_class]["bw_min"]) * 1024),
-            format_bytes_tohuman(int(job_details[io_class]["bw_max"]) * 1024),
-            format_bytes_tohuman(int(job_details[io_class]["bw_mean"]) * 1024),
-            format_bytes_tohuman(int(job_details[io_class]["bw_dev"]) * 1024),
-            job_details[io_class]["bw_samples"],
-            "",
-            "",
-            "",
-            "",
+            format_bytes_tohuman(int(job_details[io_class]["bw_min"]) * 1024)
+            + " / "
+            + format_ops_tohuman(int(job_details[io_class]["iops_min"])),
+            format_bytes_tohuman(int(job_details[io_class]["bw_max"]) * 1024)
+            + " / "
+            + format_ops_tohuman(int(job_details[io_class]["iops_max"])),
+            format_bytes_tohuman(int(job_details[io_class]["bw_mean"]) * 1024)
+            + " / "
+            + format_ops_tohuman(int(job_details[io_class]["iops_mean"])),
+            format_bytes_tohuman(int(job_details[io_class]["bw_dev"]) * 1024)
+            + " / "
+            + format_ops_tohuman(int(job_details[io_class]["iops_stddev"])),
+            str(job_details[io_class]["bw_samples"])
+            + " / "
+            + str(job_details[io_class]["iops_samples"]),
         ]
         while len(bandwidth_data) < max_rows:
             bandwidth_data.append("")
 
-        iops_data = [
-            format_ops_tohuman(int(job_details[io_class]["iops_min"])),
-            format_ops_tohuman(int(job_details[io_class]["iops_max"])),
-            format_ops_tohuman(int(job_details[io_class]["iops_mean"])),
-            format_ops_tohuman(int(job_details[io_class]["iops_stddev"])),
-            job_details[io_class]["iops_samples"],
-            "",
-            "",
-            "",
-            "",
+        lat_label = [
+            "Min:",
+            "Max:",
+            "Mean:",
+            "StdDev:",
         ]
-        while len(iops_data) < max_rows:
-            iops_data.append("")
+        while len(lat_label) < max_rows:
+            lat_label.append("")
 
         lat_data = [
             int(job_details[io_class]["lat_ns"]["min"]) / 1000,
             int(job_details[io_class]["lat_ns"]["max"]) / 1000,
             int(job_details[io_class]["lat_ns"]["mean"]) / 1000,
             int(job_details[io_class]["lat_ns"]["stddev"]) / 1000,
-            "",
-            "",
-            "",
-            "",
-            "",
         ]
         while len(lat_data) < max_rows:
             lat_data.append("")
@@ -2489,98 +2570,119 @@
         lat_bucket_label = list()
         lat_bucket_data = list()
         for element in useful_latency_tree:
-            lat_bucket_label.append(element[0])
-            lat_bucket_data.append(element[1])
+            lat_bucket_label.append(element[0] + ":" if element[0] else "")
+            lat_bucket_data.append(round(float(element[1]), 2) if element[1] else "")
+        while len(lat_bucket_label) < max_rows:
+            lat_bucket_label.append("")
+        while len(lat_bucket_data) < max_rows:
+            lat_bucket_data.append("")
 
         # Column default widths
-        overall_label_length = 0
+        overall_label_length = 5
         overall_column_length = 0
-        bandwidth_label_length = 0
-        bandwidth_column_length = 11
-        iops_column_length = 4
-        latency_column_length = 12
+        cpu_label_length = 6
+        cpu_column_length = 0
+        memory_label_length = 6
+        memory_column_length = 0
+        network_label_length = 6
+        network_column_length = 6
+        bandwidth_label_length = 8
+        bandwidth_column_length = 0
+        latency_label_length = 7
+        latency_column_length = 0
         latency_bucket_label_length = 0
+        latency_bucket_column_length = 0
 
         # Column layout:
-        #    General    Bandwidth    IOPS     Latency    Percentiles
-        #    ---------  ----------   -------- --------   ---------------
-        #    Size       Min          Min      Min        A
-        #    BW         Max          Max      Max        B
-        #    IOPS       Mean         Mean     Mean       ...
-        #    Runtime    StdDev       StdDev   StdDev     Z
-        #    UsrCPU     Samples      Samples
-        #    SysCPU
-        #    CtxSw
-        #    MajFault
-        #    MinFault
+        #    Overall    CPU      Memory   Network   Bandwidth/IOPS  Latency   Percentiles
+        #    ---------  -----    -------  --------  --------------  --------  ---------------
+        #    BW         Total    Total    Total     Min             Min       A
+        #    IOPS       User     OSD      Sent      Max             Max       B
+        #    I/O        Sys      MON      Recv      Mean            Mean      ...
+        #    Time       OSD                         StdDev          StdDev    Z
+        #               MON                         Samples
 
         # Set column widths
-        for item in overall_label:
-            _item_length = len(str(item))
-            if _item_length > overall_label_length:
-                overall_label_length = _item_length
-
         for item in overall_data:
             _item_length = len(str(item))
             if _item_length > overall_column_length:
                 overall_column_length = _item_length
 
-        test_name_length = len(nice_test_name_map[test])
-        if test_name_length > overall_label_length + overall_column_length:
-            _diff = test_name_length - (overall_label_length + overall_column_length)
-            overall_column_length += _diff
-
-        for item in bandwidth_label:
+        for item in cpu_data:
             _item_length = len(str(item))
-            if _item_length > bandwidth_label_length:
-                bandwidth_label_length = _item_length
+            if _item_length > cpu_column_length:
+                cpu_column_length = _item_length
+
+        for item in memory_data:
+            _item_length = len(str(item))
+            if _item_length > memory_column_length:
+                memory_column_length = _item_length
+
+        for item in network_data:
+            _item_length = len(str(item))
+            if _item_length > network_column_length:
+                network_column_length = _item_length
 
         for item in bandwidth_data:
            _item_length = len(str(item))
            if _item_length > bandwidth_column_length:
                bandwidth_column_length = _item_length
 
-        for item in iops_data:
-            _item_length = len(str(item))
-            if _item_length > iops_column_length:
-                iops_column_length = _item_length
-
         for item in lat_data:
             _item_length = len(str(item))
             if _item_length > latency_column_length:
                 latency_column_length = _item_length
 
-        for item in lat_bucket_label:
+        for item in lat_bucket_data:
             _item_length = len(str(item))
-            if _item_length > latency_bucket_label_length:
-                latency_bucket_label_length = _item_length
+            if _item_length > latency_bucket_column_length:
+                latency_bucket_column_length = _item_length
 
         # Top row (Headers)
         ainformation.append(
-            "{bold}\
-{overall_label: <{overall_label_length}} \
-{bandwidth_label: <{bandwidth_label_length}} \
-{bandwidth: <{bandwidth_length}} \
-{iops: <{iops_length}} \
-{latency: <{latency_length}} \
-{latency_bucket_label: <{latency_bucket_label_length}} \
-{latency_bucket} \
-{end_bold}".format(
+            "{bold}{overall_label: <{overall_label_length}} {header_fill}{end_bold}".format(
                 bold=ansiprint.bold(),
                 end_bold=ansiprint.end(),
                 overall_label=nice_test_name_map[test],
                 overall_label_length=overall_label_length,
-                bandwidth_label="",
-                bandwidth_label_length=bandwidth_label_length,
-                bandwidth="Bandwidth/s",
-                bandwidth_length=bandwidth_column_length,
-                iops="IOPS",
-                iops_length=iops_column_length,
-                latency="Latency (μs)",
-                latency_length=latency_column_length,
-                latency_bucket_label="Latency Buckets (μs/%)",
-                latency_bucket_label_length=latency_bucket_label_length,
-                latency_bucket="",
+                header_fill="-"
+                * (
+                    (MAX_CONTENT_WIDTH if MAX_CONTENT_WIDTH <= 120 else 120)
+                    - len(nice_test_name_map[test])
+                    - 4
+                ),
+            )
+        )
+
+        ainformation.append(
+            "{bold}\
+{overall_label: <{overall_label_length}} \
+{cpu_label: <{cpu_label_length}} \
+{memory_label: <{memory_label_length}} \
+{network_label: <{network_label_length}} \
+{bandwidth_label: <{bandwidth_label_length}} \
+{latency_label: <{latency_label_length}} \
+{latency_bucket_label: <{latency_bucket_label_length}}\
+{end_bold}".format(
+                bold=ansiprint.bold(),
+                end_bold=ansiprint.end(),
+                overall_label="Overall",
+                overall_label_length=overall_label_length + overall_column_length + 1,
+                cpu_label="CPU (%)",
+                cpu_label_length=cpu_label_length + cpu_column_length + 1,
+                memory_label="Memory (%)",
+                memory_label_length=memory_label_length + memory_column_length + 1,
+                network_label="Network (bps)",
+                network_label_length=network_label_length + network_column_length + 1,
+                bandwidth_label="Bandwidth / IOPS",
+                bandwidth_label_length=bandwidth_label_length
+                + bandwidth_column_length
+                + 1,
+                latency_label="Latency (μs)",
+                latency_label_length=latency_label_length + latency_column_length + 1,
+                latency_bucket_label="Buckets (μs/%)",
+                latency_bucket_label_length=latency_bucket_label_length
+                + latency_bucket_column_length,
             )
         )
 
@@ -2588,14 +2690,20 @@
         # Top row (Headers)
         ainformation.append(
             "{bold}\
-{overall_label: >{overall_label_length}} \
-{overall: <{overall_length}} \
-{bandwidth_label: >{bandwidth_label_length}} \
-{bandwidth: <{bandwidth_length}} \
-{iops: <{iops_length}} \
-{latency: <{latency_length}} \
-{latency_bucket_label: >{latency_bucket_label_length}} \
-{latency_bucket} \
+{overall_label: <{overall_label_length}} \
+{overall: <{overall_length}} \
+{cpu_label: <{cpu_label_length}} \
+{cpu: <{cpu_length}} \
+{memory_label: <{memory_label_length}} \
+{memory: <{memory_length}} \
+{network_label: <{network_label_length}} \
+{network: <{network_length}} \
+{bandwidth_label: <{bandwidth_label_length}} \
+{bandwidth: <{bandwidth_length}} \
+{latency_label: <{latency_label_length}} \
+{latency: <{latency_length}} \
+{latency_bucket_label: <{latency_bucket_label_length}} \
+{latency_bucket}\
 {end_bold}".format(
                 bold="",
                 end_bold="",
@@ -2603,12 +2711,24 @@
                 overall_label_length=overall_label_length,
                 overall=overall_data[idx],
                 overall_length=overall_column_length,
+                cpu_label=cpu_label[idx],
+                cpu_label_length=cpu_label_length,
+                cpu=cpu_data[idx],
+                cpu_length=cpu_column_length,
+                memory_label=memory_label[idx],
+                memory_label_length=memory_label_length,
+                memory=memory_data[idx],
+                memory_length=memory_column_length,
+                network_label=network_label[idx],
+                network_label_length=network_label_length,
+                network=network_data[idx],
+                network_length=network_column_length,
                 bandwidth_label=bandwidth_label[idx],
                 bandwidth_label_length=bandwidth_label_length,
                 bandwidth=bandwidth_data[idx],
                 bandwidth_length=bandwidth_column_length,
-                iops=iops_data[idx],
-                iops_length=iops_column_length,
+                latency_label=lat_label[idx],
+                latency_label_length=latency_label_length,
                 latency=lat_data[idx],
                 latency_length=latency_column_length,
                 latency_bucket_label=lat_bucket_label[idx],
@@ -2617,4 +2737,4 @@
             )
         )
 
-    return "\n".join(ainformation)
+    return "\n".join(ainformation) + "\n"
diff --git a/daemon-common/benchmark.py b/daemon-common/benchmark.py
index b01a8c36..1a359334 100644
--- a/daemon-common/benchmark.py
+++ b/daemon-common/benchmark.py
@@ -19,31 +19,34 @@
 #
 ###############################################################################
 
+import os
+import psutil
 import psycopg2
 import psycopg2.extras
+import subprocess
 
 from datetime import datetime
 from json import loads, dumps
+from time import sleep
 
 from daemon_lib.celery import start, fail, log_info, update, finish
 
-import daemon_lib.common as pvc_common
 import daemon_lib.ceph as pvc_ceph
 
 
 # Define the current test format
-TEST_FORMAT = 1
+TEST_FORMAT = 2
 
 
 # We run a total of 8 tests, to give a generalized idea of performance on the cluster:
-# 1. A sequential read test of 8GB with a 4M block size
-# 2. A sequential write test of 8GB with a 4M block size
-# 3. A random read test of 8GB with a 4M block size
-# 4. A random write test of 8GB with a 4M block size
-# 5. A random read test of 8GB with a 256k block size
-# 6. A random write test of 8GB with a 256k block size
-# 7. A random read test of 8GB with a 4k block size
-# 8. A random write test of 8GB with a 4k block size
+# 1. A sequential read test of 64GB with a 4M block size
+# 2. A sequential write test of 64GB with a 4M block size
+# 3. A random read test of 64GB with a 4M block size
+# 4. A random write test of 64GB with a 4M block size
+# 5. A random read test of 64GB with a 256k block size
+# 6. A random write test of 64GB with a 256k block size
+# 7. A random read test of 64GB with a 4k block size
+# 8. A random write test of 64GB with a 4k block size
 # Taken together, these 8 results should give a very good indication of the overall storage performance
 # for a variety of workloads.
 test_matrix = {
@@ -100,7 +103,7 @@ test_matrix = {
 
 # Specify the benchmark volume name and size
 benchmark_volume_name = "pvcbenchmark"
-benchmark_volume_size = "8G"
+benchmark_volume_size = "64G"
 
 
 #
@@ -226,7 +229,7 @@ def cleanup_benchmark_volume(
 
 
 def run_benchmark_job(
-    test, pool, job_name=None, db_conn=None, db_cur=None, zkhandler=None
+    config, test, pool, job_name=None, db_conn=None, db_cur=None, zkhandler=None
 ):
     test_spec = test_matrix[test]
     log_info(None, f"Running test '{test}'")
@@ -256,24 +259,155 @@ def run_benchmark_job(
     )
 
     log_info(None, "Running fio job: {}".format(" ".join(fio_cmd.split())))
-    retcode, stdout, stderr = pvc_common.run_os_command(fio_cmd)
+
+    # Run the fio command manually instead of using our run_os_command wrapper
+    # This will help us gather statistics about this node while it's running
+    process = subprocess.Popen(
+        fio_cmd.split(),
+        stdout=subprocess.PIPE,
+        stderr=subprocess.PIPE,
+        text=True,
+    )
+
+    # Wait 15 seconds for the test to start
+    log_info(None, "Waiting 15 seconds for test resource stabilization")
+    sleep(15)
+
+    # Set up function to get process CPU utilization by name
+    def get_cpu_utilization_by_name(process_name):
+        cpu_usage = 0
+        for proc in psutil.process_iter(["name", "cpu_percent"]):
+            if proc.info["name"] == process_name:
+                cpu_usage += proc.info["cpu_percent"]
+        return cpu_usage
+
+    # Set up function to get process memory utilization by name
+    def get_memory_utilization_by_name(process_name):
+        memory_usage = 0
+        for proc in psutil.process_iter(["name", "memory_percent"]):
+            if proc.info["name"] == process_name:
+                memory_usage += proc.info["memory_percent"]
+        return memory_usage
+
+    # Set up function to get network traffic utilization in bps
+    def get_network_traffic_bps(interface, duration=1):
+        # Get initial network counters
+        net_io_start = psutil.net_io_counters(pernic=True)
+        if interface not in net_io_start:
+            return None, None, None
+
+        stats_start = net_io_start[interface]
+        bytes_sent_start = stats_start.bytes_sent
+        bytes_recv_start = stats_start.bytes_recv
+
+        # Wait for the specified duration
+        sleep(duration)
+
+        # Get final network counters
+        net_io_end = psutil.net_io_counters(pernic=True)
+        stats_end = net_io_end[interface]
+        bytes_sent_end = stats_end.bytes_sent
+        bytes_recv_end = stats_end.bytes_recv
+
+        # Calculate bytes per second
+        bytes_sent_per_sec = (bytes_sent_end - bytes_sent_start) / duration
+        bytes_recv_per_sec = (bytes_recv_end - bytes_recv_start) / duration
+
+        # Convert to bits per second (bps)
+        bits_sent_per_sec = bytes_sent_per_sec * 8
+        bits_recv_per_sec = bytes_recv_per_sec * 8
+        bits_total_per_sec = bits_sent_per_sec + bits_recv_per_sec
+
+        return bits_sent_per_sec, bits_recv_per_sec, bits_total_per_sec
+
+    log_info(None, f"Starting system resource polling for test '{test}'")
+    storage_interface = config["storage_dev"]
+    total_cpus = psutil.cpu_count(logical=True)
+    ticks = 1
+    osd_cpu_utilization = 0
+    osd_memory_utilization = 0
+    mon_cpu_utilization = 0
+    mon_memory_utilization = 0
+    total_cpu_utilization = 0
+    total_memory_utilization = 0
+    storage_sent_bps = 0
+    storage_recv_bps = 0
+    storage_total_bps = 0
+
+    while process.poll() is None:
+        # Collect statistics like network bandwidth and CPU utilization
+        current_osd_cpu_utilization = get_cpu_utilization_by_name("ceph-osd")
+        current_osd_memory_utilization = get_memory_utilization_by_name("ceph-osd")
+        current_mon_cpu_utilization = get_cpu_utilization_by_name("ceph-mon")
+        current_mon_memory_utilization = get_memory_utilization_by_name("ceph-mon")
+        current_total_cpu_utilization = psutil.cpu_percent(interval=1)
+        current_total_memory_utilization = psutil.virtual_memory().percent
+        (
+            current_storage_sent_bps,
+            current_storage_recv_bps,
+            current_storage_total_bps,
+        ) = get_network_traffic_bps(storage_interface)
+        # Recheck whether the process is done; if it's not, add the values and increment the ticks
+        # This helps ensure that if the process finishes earlier than the longer polls above,
+        # this particular tick isn't counted, which would skew the average
+        if process.poll() is None:
+            osd_cpu_utilization += current_osd_cpu_utilization
+            osd_memory_utilization += current_osd_memory_utilization
+            mon_cpu_utilization += current_mon_cpu_utilization
+            mon_memory_utilization += current_mon_memory_utilization
+            total_cpu_utilization += current_total_cpu_utilization
+            total_memory_utilization += current_total_memory_utilization
+            storage_sent_bps += current_storage_sent_bps
+            storage_recv_bps += current_storage_recv_bps
+            storage_total_bps += current_storage_total_bps
+            ticks += 1
+
+    # Get the 1-minute load average, which covers the test duration
+    load1, _, _ = os.getloadavg()
+    load1 = round(load1, 2)
+
+    # Calculate the average resource utilization values over the runtime
+    # Divide the OSD and MON CPU utilization by the total number of CPU cores, so that
+    # they are on the same 0-100% scale as the system-wide total reported by psutil
+    avg_osd_cpu_utilization = round(osd_cpu_utilization / ticks / total_cpus, 2)
+    avg_osd_memory_utilization = round(osd_memory_utilization / ticks, 2)
+    avg_mon_cpu_utilization = round(mon_cpu_utilization / ticks / total_cpus, 2)
+    avg_mon_memory_utilization = round(mon_memory_utilization / ticks, 2)
+    avg_total_cpu_utilization = round(total_cpu_utilization / ticks, 2)
+    avg_total_memory_utilization = round(total_memory_utilization / ticks, 2)
+    avg_storage_sent_bps = round(storage_sent_bps / ticks, 2)
+    avg_storage_recv_bps = round(storage_recv_bps / ticks, 2)
+    avg_storage_total_bps = round(storage_total_bps / ticks, 2)
+
+    stdout, stderr = process.communicate()
+    retcode = process.returncode
+
+    resource_data = {
+        "avg_cpu_util_percent": {
+            "total": avg_total_cpu_utilization,
+            "ceph-mon": avg_mon_cpu_utilization,
+            "ceph-osd": avg_osd_cpu_utilization,
+        },
+        "avg_memory_util_percent": {
+            "total": avg_total_memory_utilization,
+            "ceph-mon": avg_mon_memory_utilization,
+            "ceph-osd": avg_osd_memory_utilization,
+        },
+        "avg_network_util_bps": {
+            "sent": avg_storage_sent_bps,
+            "recv": avg_storage_recv_bps,
+            "total": avg_storage_total_bps,
+        },
+    }
+
     try:
         jstdout = loads(stdout)
         if retcode:
             raise
     except Exception:
-        cleanup(
-            job_name,
-            db_conn=db_conn,
-            db_cur=db_cur,
-            zkhandler=zkhandler,
-        )
-        fail(
-            None,
-            f"Failed to run fio test '{test}': {stderr}",
-        )
+        return None, None
 
-    return jstdout
+    return resource_data, jstdout
 
 
 def worker_run_benchmark(zkhandler, celery, config, pool):
@@ -358,7 +492,8 @@ def worker_run_benchmark(zkhandler, celery, config, pool):
             total=total_stages,
         )
 
-        results[test] = run_benchmark_job(
+        resource_data, fio_data = run_benchmark_job(
+            config,
             test,
             pool,
             job_name=job_name,
@@ -366,6 +501,25 @@ def worker_run_benchmark(zkhandler, celery, config, pool):
             db_cur=db_cur,
             zkhandler=zkhandler,
         )
+        if resource_data is None or fio_data is None:
+            cleanup_benchmark_volume(
+                pool,
+                job_name=job_name,
+                db_conn=db_conn,
+                db_cur=db_cur,
+                zkhandler=zkhandler,
+            )
+            cleanup(
+                job_name,
+                db_conn=db_conn,
+                db_cur=db_cur,
+                zkhandler=zkhandler,
+            )
+            fail(
+                None,
+                f"Failed to run fio test '{test}'",
+            )
+        results[test] = {**resource_data, **fio_data}
 
     # Phase 3 - cleanup
     current_stage += 1
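
Reviewer note: the core addition in daemon-common/benchmark.py is a "poll while
it runs" pattern: launch fio with subprocess.Popen, sample psutil counters on
each loop iteration while poll() returns None, then average the accumulated
samples once the process exits. Below is a minimal standalone sketch of that
pattern for reference; the workload command, the run_and_sample name, and the
watched process default are illustrative placeholders and not part of this
patch.

    import subprocess

    import psutil

    def run_and_sample(cmd, watched_process_name="ceph-osd"):
        # Launch the workload without blocking, as the patch does with fio
        process = subprocess.Popen(
            cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True
        )

        # Start ticks at 1, mirroring the patch, so the divisor can never be
        # zero even if the workload exits before the first sample lands
        ticks = 1
        total_cpu = 0.0
        watched_cpu = 0.0

        while process.poll() is None:
            # cpu_percent(interval=1) blocks for one second and measures
            # utilization over that window
            current_total = psutil.cpu_percent(interval=1)
            # Sum the per-process figures across all processes of this name;
            # per-process cpu_percent is relative to a single core
            current_watched = sum(
                proc.info["cpu_percent"] or 0.0
                for proc in psutil.process_iter(["name", "cpu_percent"])
                if proc.info["name"] == watched_process_name
            )
            # Only count the sample if the workload is still running, so a
            # tick gathered after the workload exited cannot skew the average
            if process.poll() is None:
                total_cpu += current_total
                watched_cpu += current_watched
                ticks += 1

        stdout, stderr = process.communicate()
        ncpus = psutil.cpu_count(logical=True)

        return {
            # Divide the per-process sum by the core count to put it on the
            # same 0-100 scale as the system-wide figure
            "avg_total_cpu_percent": round(total_cpu / ticks, 2),
            "avg_watched_cpu_percent": round(watched_cpu / ticks / ncpus, 2),
            "returncode": process.returncode,
        }

    if __name__ == "__main__":
        print(run_and_sample(["sleep", "5"]))

The same structure extends to memory and network counters exactly as the patch
does with get_memory_utilization_by_name and get_network_traffic_bps.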