Update benchmarks to include resource utilization
Adds polled information on CPU, memory, and network bandwidth utilization for the node running the test. This should provide additional useful context for interpreting the results of the test. Also bumps the test format to 2 to ensure clients can handle the changes properly.
This commit is contained in:
parent
ecb812ccac
commit
736762901c
|
@ -5058,95 +5058,7 @@ class API_Storage_Ceph_Benchmark(Resource):
|
||||||
description: The PVC benchmark format of the results
|
description: The PVC benchmark format of the results
|
||||||
benchmark_result:
|
benchmark_result:
|
||||||
type: object
|
type: object
|
||||||
description: A format 0 test result
|
description: A benchmark test result; format not documented due to complexity
|
||||||
properties:
|
|
||||||
test_name:
|
|
||||||
type: object
|
|
||||||
properties:
|
|
||||||
overall:
|
|
||||||
type: object
|
|
||||||
properties:
|
|
||||||
iosize:
|
|
||||||
type: string (integer)
|
|
||||||
description: The total size of the benchmark data
|
|
||||||
bandwidth:
|
|
||||||
type: string (integer)
|
|
||||||
description: The average bandwidth (KiB/s)
|
|
||||||
iops:
|
|
||||||
type: string (integer)
|
|
||||||
description: The average IOPS
|
|
||||||
runtime:
|
|
||||||
type: string (integer)
|
|
||||||
description: The total test time in milliseconds
|
|
||||||
latency:
|
|
||||||
type: object
|
|
||||||
properties:
|
|
||||||
min:
|
|
||||||
type: string (integer)
|
|
||||||
description: The minimum latency measurement
|
|
||||||
max:
|
|
||||||
type: string (integer)
|
|
||||||
description: The maximum latency measurement
|
|
||||||
mean:
|
|
||||||
type: string (float)
|
|
||||||
description: The mean latency measurement
|
|
||||||
stdev:
|
|
||||||
type: string (float)
|
|
||||||
description: The standard deviation of latency
|
|
||||||
bandwidth:
|
|
||||||
type: object
|
|
||||||
properties:
|
|
||||||
min:
|
|
||||||
type: string (integer)
|
|
||||||
description: The minimum bandwidth (KiB/s) measurement
|
|
||||||
max:
|
|
||||||
type: string (integer)
|
|
||||||
description: The maximum bandwidth (KiB/s) measurement
|
|
||||||
mean:
|
|
||||||
type: string (float)
|
|
||||||
description: The mean bandwidth (KiB/s) measurement
|
|
||||||
stdev:
|
|
||||||
type: string (float)
|
|
||||||
description: The standard deviation of bandwidth
|
|
||||||
numsamples:
|
|
||||||
type: string (integer)
|
|
||||||
description: The number of samples taken during the test
|
|
||||||
iops:
|
|
||||||
type: object
|
|
||||||
properties:
|
|
||||||
min:
|
|
||||||
type: string (integer)
|
|
||||||
description: The minimum IOPS measurement
|
|
||||||
max:
|
|
||||||
type: string (integer)
|
|
||||||
description: The maximum IOPS measurement
|
|
||||||
mean:
|
|
||||||
type: string (float)
|
|
||||||
description: The mean IOPS measurement
|
|
||||||
stdev:
|
|
||||||
type: string (float)
|
|
||||||
description: The standard deviation of IOPS
|
|
||||||
numsamples:
|
|
||||||
type: string (integer)
|
|
||||||
description: The number of samples taken during the test
|
|
||||||
cpu:
|
|
||||||
type: object
|
|
||||||
properties:
|
|
||||||
user:
|
|
||||||
type: string (float percentage)
|
|
||||||
description: The percentage of test time spent in user space
|
|
||||||
system:
|
|
||||||
type: string (float percentage)
|
|
||||||
description: The percentage of test time spent in system (kernel) space
|
|
||||||
ctxsw:
|
|
||||||
type: string (integer)
|
|
||||||
description: The number of context switches during the test
|
|
||||||
majfault:
|
|
||||||
type: string (integer)
|
|
||||||
description: The number of major page faults during the test
|
|
||||||
minfault:
|
|
||||||
type: string (integer)
|
|
||||||
description: The number of minor page faults during the test
|
|
||||||
"""
|
"""
|
||||||
return list_benchmarks(config, reqargs.get("job", None))
|
return list_benchmarks(config, reqargs.get("job", None))
|
||||||
|
|
||||||
|
|
|
@ -30,6 +30,7 @@ from requests_toolbelt.multipart.encoder import (
|
||||||
|
|
||||||
import pvc.lib.ansiprint as ansiprint
|
import pvc.lib.ansiprint as ansiprint
|
||||||
from pvc.lib.common import UploadProgressBar, call_api, get_wait_retdata
|
from pvc.lib.common import UploadProgressBar, call_api, get_wait_retdata
|
||||||
|
from pvc.cli.helpers import MAX_CONTENT_WIDTH
|
||||||
|
|
||||||
#
|
#
|
||||||
# Supplemental functions
|
# Supplemental functions
|
||||||
|
@ -1804,7 +1805,7 @@ def get_benchmark_list_results(benchmark_format, benchmark_data):
|
||||||
benchmark_bandwidth, benchmark_iops = get_benchmark_list_results_legacy(
|
benchmark_bandwidth, benchmark_iops = get_benchmark_list_results_legacy(
|
||||||
benchmark_data
|
benchmark_data
|
||||||
)
|
)
|
||||||
elif benchmark_format == 1:
|
elif benchmark_format == 1 or benchmark_format == 2:
|
||||||
benchmark_bandwidth, benchmark_iops = get_benchmark_list_results_json(
|
benchmark_bandwidth, benchmark_iops = get_benchmark_list_results_json(
|
||||||
benchmark_data
|
benchmark_data
|
||||||
)
|
)
|
||||||
|
@ -2006,6 +2007,7 @@ def format_info_benchmark(config, benchmark_information):
|
||||||
benchmark_matrix = {
|
benchmark_matrix = {
|
||||||
0: format_info_benchmark_legacy,
|
0: format_info_benchmark_legacy,
|
||||||
1: format_info_benchmark_json,
|
1: format_info_benchmark_json,
|
||||||
|
2: format_info_benchmark_json,
|
||||||
}
|
}
|
||||||
|
|
||||||
benchmark_version = benchmark_information[0]["test_format"]
|
benchmark_version = benchmark_information[0]["test_format"]
|
||||||
|
@ -2340,12 +2342,15 @@ def format_info_benchmark_json(config, benchmark_information):
|
||||||
if benchmark_information["benchmark_result"] == "Running":
|
if benchmark_information["benchmark_result"] == "Running":
|
||||||
return "Benchmark test is still running."
|
return "Benchmark test is still running."
|
||||||
|
|
||||||
|
benchmark_format = benchmark_information["test_format"]
|
||||||
benchmark_details = benchmark_information["benchmark_result"]
|
benchmark_details = benchmark_information["benchmark_result"]
|
||||||
|
|
||||||
# Format a nice output; do this line-by-line then concat the elements at the end
|
# Format a nice output; do this line-by-line then concat the elements at the end
|
||||||
ainformation = []
|
ainformation = []
|
||||||
ainformation.append(
|
ainformation.append(
|
||||||
"{}Storage Benchmark details:{}".format(ansiprint.bold(), ansiprint.end())
|
"{}Storage Benchmark details (format {}):{}".format(
|
||||||
|
ansiprint.bold(), benchmark_format, ansiprint.end()
|
||||||
|
)
|
||||||
)
|
)
|
||||||
|
|
||||||
nice_test_name_map = {
|
nice_test_name_map = {
|
||||||
|
@ -2393,7 +2398,7 @@ def format_info_benchmark_json(config, benchmark_information):
|
||||||
if element[1] != 0:
|
if element[1] != 0:
|
||||||
useful_latency_tree.append(element)
|
useful_latency_tree.append(element)
|
||||||
|
|
||||||
max_rows = 9
|
max_rows = 5
|
||||||
if len(useful_latency_tree) > 9:
|
if len(useful_latency_tree) > 9:
|
||||||
max_rows = len(useful_latency_tree)
|
max_rows = len(useful_latency_tree)
|
||||||
elif len(useful_latency_tree) < 9:
|
elif len(useful_latency_tree) < 9:
|
||||||
|
@ -2402,15 +2407,10 @@ def format_info_benchmark_json(config, benchmark_information):
|
||||||
|
|
||||||
# Format the static data
|
# Format the static data
|
||||||
overall_label = [
|
overall_label = [
|
||||||
"Overall BW/s:",
|
"BW/s:",
|
||||||
"Overall IOPS:",
|
"IOPS:",
|
||||||
"Total I/O:",
|
"I/O:",
|
||||||
"Runtime (s):",
|
"Time:",
|
||||||
"User CPU %:",
|
|
||||||
"System CPU %:",
|
|
||||||
"Ctx Switches:",
|
|
||||||
"Major Faults:",
|
|
||||||
"Minor Faults:",
|
|
||||||
]
|
]
|
||||||
while len(overall_label) < max_rows:
|
while len(overall_label) < max_rows:
|
||||||
overall_label.append("")
|
overall_label.append("")
|
||||||
|
@ -2419,68 +2419,149 @@ def format_info_benchmark_json(config, benchmark_information):
|
||||||
format_bytes_tohuman(int(job_details[io_class]["bw_bytes"])),
|
format_bytes_tohuman(int(job_details[io_class]["bw_bytes"])),
|
||||||
format_ops_tohuman(int(job_details[io_class]["iops"])),
|
format_ops_tohuman(int(job_details[io_class]["iops"])),
|
||||||
format_bytes_tohuman(int(job_details[io_class]["io_bytes"])),
|
format_bytes_tohuman(int(job_details[io_class]["io_bytes"])),
|
||||||
job_details["job_runtime"] / 1000,
|
str(job_details["job_runtime"] / 1000) + "s",
|
||||||
job_details["usr_cpu"],
|
|
||||||
job_details["sys_cpu"],
|
|
||||||
job_details["ctx"],
|
|
||||||
job_details["majf"],
|
|
||||||
job_details["minf"],
|
|
||||||
]
|
]
|
||||||
while len(overall_data) < max_rows:
|
while len(overall_data) < max_rows:
|
||||||
overall_data.append("")
|
overall_data.append("")
|
||||||
|
|
||||||
|
cpu_label = [
|
||||||
|
"Total:",
|
||||||
|
"User:",
|
||||||
|
"Sys:",
|
||||||
|
"OSD:",
|
||||||
|
"MON:",
|
||||||
|
]
|
||||||
|
while len(cpu_label) < max_rows:
|
||||||
|
cpu_label.append("")
|
||||||
|
|
||||||
|
cpu_data = [
|
||||||
|
(
|
||||||
|
benchmark_details[test]["avg_cpu_util_percent"]["total"]
|
||||||
|
if benchmark_format > 1
|
||||||
|
else "N/A"
|
||||||
|
),
|
||||||
|
round(job_details["usr_cpu"], 2),
|
||||||
|
round(job_details["sys_cpu"], 2),
|
||||||
|
(
|
||||||
|
benchmark_details[test]["avg_cpu_util_percent"]["ceph-osd"]
|
||||||
|
if benchmark_format > 1
|
||||||
|
else "N/A"
|
||||||
|
),
|
||||||
|
(
|
||||||
|
benchmark_details[test]["avg_cpu_util_percent"]["ceph-mon"]
|
||||||
|
if benchmark_format > 1
|
||||||
|
else "N/A"
|
||||||
|
),
|
||||||
|
]
|
||||||
|
while len(cpu_data) < max_rows:
|
||||||
|
cpu_data.append("")
|
||||||
|
|
||||||
|
memory_label = [
|
||||||
|
"Total:",
|
||||||
|
"OSD:",
|
||||||
|
"MON:",
|
||||||
|
]
|
||||||
|
while len(memory_label) < max_rows:
|
||||||
|
memory_label.append("")
|
||||||
|
|
||||||
|
memory_data = [
|
||||||
|
(
|
||||||
|
benchmark_details[test]["avg_memory_util_percent"]["total"]
|
||||||
|
if benchmark_format > 1
|
||||||
|
else "N/A"
|
||||||
|
),
|
||||||
|
(
|
||||||
|
benchmark_details[test]["avg_memory_util_percent"]["ceph-osd"]
|
||||||
|
if benchmark_format > 1
|
||||||
|
else "N/A"
|
||||||
|
),
|
||||||
|
(
|
||||||
|
benchmark_details[test]["avg_memory_util_percent"]["ceph-mon"]
|
||||||
|
if benchmark_format > 1
|
||||||
|
else "N/A"
|
||||||
|
),
|
||||||
|
]
|
||||||
|
while len(memory_data) < max_rows:
|
||||||
|
memory_data.append("")
|
||||||
|
|
||||||
|
network_label = [
|
||||||
|
"Total:",
|
||||||
|
"Sent:",
|
||||||
|
"Recv:",
|
||||||
|
]
|
||||||
|
while len(network_label) < max_rows:
|
||||||
|
network_label.append("")
|
||||||
|
|
||||||
|
network_data = [
|
||||||
|
(
|
||||||
|
format_bytes_tohuman(
|
||||||
|
int(benchmark_details[test]["avg_network_util_bps"]["total"])
|
||||||
|
)
|
||||||
|
if benchmark_format > 1
|
||||||
|
else "N/A"
|
||||||
|
),
|
||||||
|
(
|
||||||
|
format_bytes_tohuman(
|
||||||
|
int(benchmark_details[test]["avg_network_util_bps"]["sent"])
|
||||||
|
)
|
||||||
|
if benchmark_format > 1
|
||||||
|
else "N/A"
|
||||||
|
),
|
||||||
|
(
|
||||||
|
format_bytes_tohuman(
|
||||||
|
int(benchmark_details[test]["avg_network_util_bps"]["recv"])
|
||||||
|
)
|
||||||
|
if benchmark_format > 1
|
||||||
|
else "N/A"
|
||||||
|
),
|
||||||
|
]
|
||||||
|
while len(network_data) < max_rows:
|
||||||
|
network_data.append("")
|
||||||
|
|
||||||
bandwidth_label = [
|
bandwidth_label = [
|
||||||
"Min:",
|
"Min:",
|
||||||
"Max:",
|
"Max:",
|
||||||
"Mean:",
|
"Mean:",
|
||||||
"StdDev:",
|
"StdDev:",
|
||||||
"Samples:",
|
"Samples:",
|
||||||
"",
|
|
||||||
"",
|
|
||||||
"",
|
|
||||||
"",
|
|
||||||
]
|
]
|
||||||
while len(bandwidth_label) < max_rows:
|
while len(bandwidth_label) < max_rows:
|
||||||
bandwidth_label.append("")
|
bandwidth_label.append("")
|
||||||
|
|
||||||
bandwidth_data = [
|
bandwidth_data = [
|
||||||
format_bytes_tohuman(int(job_details[io_class]["bw_min"]) * 1024),
|
format_bytes_tohuman(int(job_details[io_class]["bw_min"]) * 1024)
|
||||||
format_bytes_tohuman(int(job_details[io_class]["bw_max"]) * 1024),
|
+ " / "
|
||||||
format_bytes_tohuman(int(job_details[io_class]["bw_mean"]) * 1024),
|
+ format_ops_tohuman(int(job_details[io_class]["iops_min"])),
|
||||||
format_bytes_tohuman(int(job_details[io_class]["bw_dev"]) * 1024),
|
format_bytes_tohuman(int(job_details[io_class]["bw_max"]) * 1024)
|
||||||
job_details[io_class]["bw_samples"],
|
+ " / "
|
||||||
"",
|
+ format_ops_tohuman(int(job_details[io_class]["iops_max"])),
|
||||||
"",
|
format_bytes_tohuman(int(job_details[io_class]["bw_mean"]) * 1024)
|
||||||
"",
|
+ " / "
|
||||||
"",
|
+ format_ops_tohuman(int(job_details[io_class]["iops_mean"])),
|
||||||
|
format_bytes_tohuman(int(job_details[io_class]["bw_dev"]) * 1024)
|
||||||
|
+ " / "
|
||||||
|
+ format_ops_tohuman(int(job_details[io_class]["iops_stddev"])),
|
||||||
|
str(job_details[io_class]["bw_samples"])
|
||||||
|
+ " / "
|
||||||
|
+ str(job_details[io_class]["iops_samples"]),
|
||||||
]
|
]
|
||||||
while len(bandwidth_data) < max_rows:
|
while len(bandwidth_data) < max_rows:
|
||||||
bandwidth_data.append("")
|
bandwidth_data.append("")
|
||||||
|
|
||||||
iops_data = [
|
lat_label = [
|
||||||
format_ops_tohuman(int(job_details[io_class]["iops_min"])),
|
"Min:",
|
||||||
format_ops_tohuman(int(job_details[io_class]["iops_max"])),
|
"Max:",
|
||||||
format_ops_tohuman(int(job_details[io_class]["iops_mean"])),
|
"Mean:",
|
||||||
format_ops_tohuman(int(job_details[io_class]["iops_stddev"])),
|
"StdDev:",
|
||||||
job_details[io_class]["iops_samples"],
|
|
||||||
"",
|
|
||||||
"",
|
|
||||||
"",
|
|
||||||
"",
|
|
||||||
]
|
]
|
||||||
while len(iops_data) < max_rows:
|
while len(lat_label) < max_rows:
|
||||||
iops_data.append("")
|
lat_label.append("")
|
||||||
|
|
||||||
lat_data = [
|
lat_data = [
|
||||||
int(job_details[io_class]["lat_ns"]["min"]) / 1000,
|
int(job_details[io_class]["lat_ns"]["min"]) / 1000,
|
||||||
int(job_details[io_class]["lat_ns"]["max"]) / 1000,
|
int(job_details[io_class]["lat_ns"]["max"]) / 1000,
|
||||||
int(job_details[io_class]["lat_ns"]["mean"]) / 1000,
|
int(job_details[io_class]["lat_ns"]["mean"]) / 1000,
|
||||||
int(job_details[io_class]["lat_ns"]["stddev"]) / 1000,
|
int(job_details[io_class]["lat_ns"]["stddev"]) / 1000,
|
||||||
"",
|
|
||||||
"",
|
|
||||||
"",
|
|
||||||
"",
|
|
||||||
"",
|
|
||||||
]
|
]
|
||||||
while len(lat_data) < max_rows:
|
while len(lat_data) < max_rows:
|
||||||
lat_data.append("")
|
lat_data.append("")
|
||||||
|
@ -2489,98 +2570,119 @@ def format_info_benchmark_json(config, benchmark_information):
|
||||||
lat_bucket_label = list()
|
lat_bucket_label = list()
|
||||||
lat_bucket_data = list()
|
lat_bucket_data = list()
|
||||||
for element in useful_latency_tree:
|
for element in useful_latency_tree:
|
||||||
lat_bucket_label.append(element[0])
|
lat_bucket_label.append(element[0] + ":" if element[0] else "")
|
||||||
lat_bucket_data.append(element[1])
|
lat_bucket_data.append(round(float(element[1]), 2) if element[1] else "")
|
||||||
|
while len(lat_bucket_label) < max_rows:
|
||||||
|
lat_bucket_label.append("")
|
||||||
|
while len(lat_bucket_data) < max_rows:
|
||||||
|
lat_bucket_label.append("")
|
||||||
|
|
||||||
# Column default widths
|
# Column default widths
|
||||||
overall_label_length = 0
|
overall_label_length = 5
|
||||||
overall_column_length = 0
|
overall_column_length = 0
|
||||||
bandwidth_label_length = 0
|
cpu_label_length = 6
|
||||||
bandwidth_column_length = 11
|
cpu_column_length = 0
|
||||||
iops_column_length = 4
|
memory_label_length = 6
|
||||||
latency_column_length = 12
|
memory_column_length = 0
|
||||||
|
network_label_length = 6
|
||||||
|
network_column_length = 6
|
||||||
|
bandwidth_label_length = 8
|
||||||
|
bandwidth_column_length = 0
|
||||||
|
latency_label_length = 7
|
||||||
|
latency_column_length = 0
|
||||||
latency_bucket_label_length = 0
|
latency_bucket_label_length = 0
|
||||||
|
latency_bucket_column_length = 0
|
||||||
|
|
||||||
# Column layout:
|
# Column layout:
|
||||||
# General Bandwidth IOPS Latency Percentiles
|
# Overall CPU Memory Network Bandwidth/IOPS Latency Percentiles
|
||||||
# --------- ---------- -------- -------- ---------------
|
# --------- ----- ------- -------- -------------- -------- ---------------
|
||||||
# Size Min Min Min A
|
# BW Total Total Total Min Min A
|
||||||
# BW Max Max Max B
|
# IOPS Usr OSD Send Max Max B
|
||||||
# IOPS Mean Mean Mean ...
|
# Time Sys MON Recv Mean Mean ...
|
||||||
# Runtime StdDev StdDev StdDev Z
|
# Size OSD StdDev StdDev Z
|
||||||
# UsrCPU Samples Samples
|
# MON Samples
|
||||||
# SysCPU
|
|
||||||
# CtxSw
|
|
||||||
# MajFault
|
|
||||||
# MinFault
|
|
||||||
|
|
||||||
# Set column widths
|
# Set column widths
|
||||||
for item in overall_label:
|
|
||||||
_item_length = len(str(item))
|
|
||||||
if _item_length > overall_label_length:
|
|
||||||
overall_label_length = _item_length
|
|
||||||
|
|
||||||
for item in overall_data:
|
for item in overall_data:
|
||||||
_item_length = len(str(item))
|
_item_length = len(str(item))
|
||||||
if _item_length > overall_column_length:
|
if _item_length > overall_column_length:
|
||||||
overall_column_length = _item_length
|
overall_column_length = _item_length
|
||||||
|
|
||||||
test_name_length = len(nice_test_name_map[test])
|
for item in cpu_data:
|
||||||
if test_name_length > overall_label_length + overall_column_length:
|
|
||||||
_diff = test_name_length - (overall_label_length + overall_column_length)
|
|
||||||
overall_column_length += _diff
|
|
||||||
|
|
||||||
for item in bandwidth_label:
|
|
||||||
_item_length = len(str(item))
|
_item_length = len(str(item))
|
||||||
if _item_length > bandwidth_label_length:
|
if _item_length > cpu_column_length:
|
||||||
bandwidth_label_length = _item_length
|
cpu_column_length = _item_length
|
||||||
|
|
||||||
|
for item in memory_data:
|
||||||
|
_item_length = len(str(item))
|
||||||
|
if _item_length > memory_column_length:
|
||||||
|
memory_column_length = _item_length
|
||||||
|
|
||||||
|
for item in network_data:
|
||||||
|
_item_length = len(str(item))
|
||||||
|
if _item_length > network_column_length:
|
||||||
|
network_column_length = _item_length
|
||||||
|
|
||||||
for item in bandwidth_data:
|
for item in bandwidth_data:
|
||||||
_item_length = len(str(item))
|
_item_length = len(str(item))
|
||||||
if _item_length > bandwidth_column_length:
|
if _item_length > bandwidth_column_length:
|
||||||
bandwidth_column_length = _item_length
|
bandwidth_column_length = _item_length
|
||||||
|
|
||||||
for item in iops_data:
|
|
||||||
_item_length = len(str(item))
|
|
||||||
if _item_length > iops_column_length:
|
|
||||||
iops_column_length = _item_length
|
|
||||||
|
|
||||||
for item in lat_data:
|
for item in lat_data:
|
||||||
_item_length = len(str(item))
|
_item_length = len(str(item))
|
||||||
if _item_length > latency_column_length:
|
if _item_length > latency_column_length:
|
||||||
latency_column_length = _item_length
|
latency_column_length = _item_length
|
||||||
|
|
||||||
for item in lat_bucket_label:
|
for item in lat_bucket_data:
|
||||||
_item_length = len(str(item))
|
_item_length = len(str(item))
|
||||||
if _item_length > latency_bucket_label_length:
|
if _item_length > latency_bucket_column_length:
|
||||||
latency_bucket_label_length = _item_length
|
latency_bucket_column_length = _item_length
|
||||||
|
|
||||||
# Top row (Headers)
|
# Top row (Headers)
|
||||||
ainformation.append(
|
ainformation.append(
|
||||||
"{bold}\
|
"{bold}{overall_label: <{overall_label_length}} {header_fill}{end_bold}".format(
|
||||||
{overall_label: <{overall_label_length}} \
|
|
||||||
{bandwidth_label: <{bandwidth_label_length}} \
|
|
||||||
{bandwidth: <{bandwidth_length}} \
|
|
||||||
{iops: <{iops_length}} \
|
|
||||||
{latency: <{latency_length}} \
|
|
||||||
{latency_bucket_label: <{latency_bucket_label_length}} \
|
|
||||||
{latency_bucket} \
|
|
||||||
{end_bold}".format(
|
|
||||||
bold=ansiprint.bold(),
|
bold=ansiprint.bold(),
|
||||||
end_bold=ansiprint.end(),
|
end_bold=ansiprint.end(),
|
||||||
overall_label=nice_test_name_map[test],
|
overall_label=nice_test_name_map[test],
|
||||||
overall_label_length=overall_label_length,
|
overall_label_length=overall_label_length,
|
||||||
bandwidth_label="",
|
header_fill="-"
|
||||||
bandwidth_label_length=bandwidth_label_length,
|
* (
|
||||||
bandwidth="Bandwidth/s",
|
(MAX_CONTENT_WIDTH if MAX_CONTENT_WIDTH <= 120 else 120)
|
||||||
bandwidth_length=bandwidth_column_length,
|
- len(nice_test_name_map[test])
|
||||||
iops="IOPS",
|
- 4
|
||||||
iops_length=iops_column_length,
|
),
|
||||||
latency="Latency (μs)",
|
)
|
||||||
latency_length=latency_column_length,
|
)
|
||||||
latency_bucket_label="Latency Buckets (μs/%)",
|
|
||||||
latency_bucket_label_length=latency_bucket_label_length,
|
ainformation.append(
|
||||||
latency_bucket="",
|
"{bold}\
|
||||||
|
{overall_label: <{overall_label_length}} \
|
||||||
|
{cpu_label: <{cpu_label_length}} \
|
||||||
|
{memory_label: <{memory_label_length}} \
|
||||||
|
{network_label: <{network_label_length}} \
|
||||||
|
{bandwidth_label: <{bandwidth_label_length}} \
|
||||||
|
{latency_label: <{latency_label_length}} \
|
||||||
|
{latency_bucket_label: <{latency_bucket_label_length}}\
|
||||||
|
{end_bold}".format(
|
||||||
|
bold=ansiprint.bold(),
|
||||||
|
end_bold=ansiprint.end(),
|
||||||
|
overall_label="Overall",
|
||||||
|
overall_label_length=overall_label_length + overall_column_length + 1,
|
||||||
|
cpu_label="CPU (%)",
|
||||||
|
cpu_label_length=cpu_label_length + cpu_column_length + 1,
|
||||||
|
memory_label="Memory (%)",
|
||||||
|
memory_label_length=memory_label_length + memory_column_length + 1,
|
||||||
|
network_label="Network (bps)",
|
||||||
|
network_label_length=network_label_length + network_column_length + 1,
|
||||||
|
bandwidth_label="Bandwidth / IOPS",
|
||||||
|
bandwidth_label_length=bandwidth_label_length
|
||||||
|
+ bandwidth_column_length
|
||||||
|
+ 1,
|
||||||
|
latency_label="Latency (μs)",
|
||||||
|
latency_label_length=latency_label_length + latency_column_length + 1,
|
||||||
|
latency_bucket_label="Buckets (μs/%)",
|
||||||
|
latency_bucket_label_length=latency_bucket_label_length
|
||||||
|
+ latency_bucket_column_length,
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
|
|
||||||
|
@ -2588,14 +2690,20 @@ def format_info_benchmark_json(config, benchmark_information):
|
||||||
# Top row (Headers)
|
# Top row (Headers)
|
||||||
ainformation.append(
|
ainformation.append(
|
||||||
"{bold}\
|
"{bold}\
|
||||||
{overall_label: >{overall_label_length}} \
|
{overall_label: <{overall_label_length}} \
|
||||||
{overall: <{overall_length}} \
|
{overall: <{overall_length}} \
|
||||||
{bandwidth_label: >{bandwidth_label_length}} \
|
{cpu_label: <{cpu_label_length}} \
|
||||||
{bandwidth: <{bandwidth_length}} \
|
{cpu: <{cpu_length}} \
|
||||||
{iops: <{iops_length}} \
|
{memory_label: <{memory_label_length}} \
|
||||||
{latency: <{latency_length}} \
|
{memory: <{memory_length}} \
|
||||||
{latency_bucket_label: >{latency_bucket_label_length}} \
|
{network_label: <{network_label_length}} \
|
||||||
{latency_bucket} \
|
{network: <{network_length}} \
|
||||||
|
{bandwidth_label: <{bandwidth_label_length}} \
|
||||||
|
{bandwidth: <{bandwidth_length}} \
|
||||||
|
{latency_label: <{latency_label_length}} \
|
||||||
|
{latency: <{latency_length}} \
|
||||||
|
{latency_bucket_label: <{latency_bucket_label_length}} \
|
||||||
|
{latency_bucket}\
|
||||||
{end_bold}".format(
|
{end_bold}".format(
|
||||||
bold="",
|
bold="",
|
||||||
end_bold="",
|
end_bold="",
|
||||||
|
@ -2603,12 +2711,24 @@ def format_info_benchmark_json(config, benchmark_information):
|
||||||
overall_label_length=overall_label_length,
|
overall_label_length=overall_label_length,
|
||||||
overall=overall_data[idx],
|
overall=overall_data[idx],
|
||||||
overall_length=overall_column_length,
|
overall_length=overall_column_length,
|
||||||
|
cpu_label=cpu_label[idx],
|
||||||
|
cpu_label_length=cpu_label_length,
|
||||||
|
cpu=cpu_data[idx],
|
||||||
|
cpu_length=cpu_column_length,
|
||||||
|
memory_label=memory_label[idx],
|
||||||
|
memory_label_length=memory_label_length,
|
||||||
|
memory=memory_data[idx],
|
||||||
|
memory_length=memory_column_length,
|
||||||
|
network_label=network_label[idx],
|
||||||
|
network_label_length=network_label_length,
|
||||||
|
network=network_data[idx],
|
||||||
|
network_length=network_column_length,
|
||||||
bandwidth_label=bandwidth_label[idx],
|
bandwidth_label=bandwidth_label[idx],
|
||||||
bandwidth_label_length=bandwidth_label_length,
|
bandwidth_label_length=bandwidth_label_length,
|
||||||
bandwidth=bandwidth_data[idx],
|
bandwidth=bandwidth_data[idx],
|
||||||
bandwidth_length=bandwidth_column_length,
|
bandwidth_length=bandwidth_column_length,
|
||||||
iops=iops_data[idx],
|
latency_label=lat_label[idx],
|
||||||
iops_length=iops_column_length,
|
latency_label_length=latency_label_length,
|
||||||
latency=lat_data[idx],
|
latency=lat_data[idx],
|
||||||
latency_length=latency_column_length,
|
latency_length=latency_column_length,
|
||||||
latency_bucket_label=lat_bucket_label[idx],
|
latency_bucket_label=lat_bucket_label[idx],
|
||||||
|
@ -2617,4 +2737,4 @@ def format_info_benchmark_json(config, benchmark_information):
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
|
|
||||||
return "\n".join(ainformation)
|
return "\n".join(ainformation) + "\n"
|
||||||
|
|
|
@ -19,31 +19,34 @@
|
||||||
#
|
#
|
||||||
###############################################################################
|
###############################################################################
|
||||||
|
|
||||||
|
import os
|
||||||
|
import psutil
|
||||||
import psycopg2
|
import psycopg2
|
||||||
import psycopg2.extras
|
import psycopg2.extras
|
||||||
|
import subprocess
|
||||||
|
|
||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
from json import loads, dumps
|
from json import loads, dumps
|
||||||
|
from time import sleep
|
||||||
|
|
||||||
from daemon_lib.celery import start, fail, log_info, update, finish
|
from daemon_lib.celery import start, fail, log_info, update, finish
|
||||||
|
|
||||||
import daemon_lib.common as pvc_common
|
|
||||||
import daemon_lib.ceph as pvc_ceph
|
import daemon_lib.ceph as pvc_ceph
|
||||||
|
|
||||||
|
|
||||||
# Define the current test format
|
# Define the current test format
|
||||||
TEST_FORMAT = 1
|
TEST_FORMAT = 2
|
||||||
|
|
||||||
|
|
||||||
# We run a total of 8 tests, to give a generalized idea of performance on the cluster:
|
# We run a total of 8 tests, to give a generalized idea of performance on the cluster:
|
||||||
# 1. A sequential read test of 8GB with a 4M block size
|
# 1. A sequential read test of 64GB with a 4M block size
|
||||||
# 2. A sequential write test of 8GB with a 4M block size
|
# 2. A sequential write test of 64GB with a 4M block size
|
||||||
# 3. A random read test of 8GB with a 4M block size
|
# 3. A random read test of 64GB with a 4M block size
|
||||||
# 4. A random write test of 8GB with a 4M block size
|
# 4. A random write test of 64GB with a 4M block size
|
||||||
# 5. A random read test of 8GB with a 256k block size
|
# 5. A random read test of 64GB with a 256k block size
|
||||||
# 6. A random write test of 8GB with a 256k block size
|
# 6. A random write test of 64GB with a 256k block size
|
||||||
# 7. A random read test of 8GB with a 4k block size
|
# 7. A random read test of 64GB with a 4k block size
|
||||||
# 8. A random write test of 8GB with a 4k block size
|
# 8. A random write test of 64GB with a 4k block size
|
||||||
# Taken together, these 8 results should give a very good indication of the overall storage performance
|
# Taken together, these 8 results should give a very good indication of the overall storage performance
|
||||||
# for a variety of workloads.
|
# for a variety of workloads.
|
||||||
test_matrix = {
|
test_matrix = {
|
||||||
|
@ -100,7 +103,7 @@ test_matrix = {
|
||||||
|
|
||||||
# Specify the benchmark volume name and size
|
# Specify the benchmark volume name and size
|
||||||
benchmark_volume_name = "pvcbenchmark"
|
benchmark_volume_name = "pvcbenchmark"
|
||||||
benchmark_volume_size = "8G"
|
benchmark_volume_size = "64G"
|
||||||
|
|
||||||
|
|
||||||
#
|
#
|
||||||
|
@ -226,7 +229,7 @@ def cleanup_benchmark_volume(
|
||||||
|
|
||||||
|
|
||||||
def run_benchmark_job(
|
def run_benchmark_job(
|
||||||
test, pool, job_name=None, db_conn=None, db_cur=None, zkhandler=None
|
config, test, pool, job_name=None, db_conn=None, db_cur=None, zkhandler=None
|
||||||
):
|
):
|
||||||
test_spec = test_matrix[test]
|
test_spec = test_matrix[test]
|
||||||
log_info(None, f"Running test '{test}'")
|
log_info(None, f"Running test '{test}'")
|
||||||
|
@ -256,24 +259,155 @@ def run_benchmark_job(
|
||||||
)
|
)
|
||||||
|
|
||||||
log_info(None, "Running fio job: {}".format(" ".join(fio_cmd.split())))
|
log_info(None, "Running fio job: {}".format(" ".join(fio_cmd.split())))
|
||||||
retcode, stdout, stderr = pvc_common.run_os_command(fio_cmd)
|
|
||||||
|
# Run the fio command manually instead of using our run_os_command wrapper
|
||||||
|
# This will help us gather statistics about this node while it's running
|
||||||
|
process = subprocess.Popen(
|
||||||
|
fio_cmd.split(),
|
||||||
|
stdout=subprocess.PIPE,
|
||||||
|
stderr=subprocess.PIPE,
|
||||||
|
text=True,
|
||||||
|
)
|
||||||
|
|
||||||
|
# Wait 15 seconds for the test to start
|
||||||
|
log_info(None, "Waiting 15 seconds for test resource stabilization")
|
||||||
|
sleep(15)
|
||||||
|
|
||||||
|
# Set up function to get process CPU utilization by name
|
||||||
|
def get_cpu_utilization_by_name(process_name):
    """Return the summed ``cpu_percent`` of all processes named ``process_name``.

    Iterates every process visible to psutil and adds up the
    ``cpu_percent`` value of those whose name matches exactly.

    Args:
        process_name: Exact process name to match (e.g. ``"ceph-osd"``).

    Returns:
        The sum of the matching processes' ``cpu_percent`` values, or 0
        when no process matches.
    """
    cpu_usage = 0
    for proc in psutil.process_iter(["name", "cpu_percent"]):
        if proc.info["name"] != process_name:
            continue
        # psutil fills an attribute with None (its default ad_value) when
        # it is denied access to it; treat that as "no contribution"
        # rather than failing the whole poll with a TypeError.
        cpu_usage += proc.info["cpu_percent"] or 0
    return cpu_usage
|
||||||
|
|
||||||
|
# Set up function to get process memory utilization by name
|
||||||
|
def get_memory_utilization_by_name(process_name):
    """Return the summed ``memory_percent`` of all processes named ``process_name``.

    Iterates every process visible to psutil and adds up the
    ``memory_percent`` value of those whose name matches exactly.

    Args:
        process_name: Exact process name to match (e.g. ``"ceph-osd"``).

    Returns:
        The sum of the matching processes' ``memory_percent`` values, or 0
        when no process matches.
    """
    memory_usage = 0
    for proc in psutil.process_iter(["name", "memory_percent"]):
        if proc.info["name"] != process_name:
            continue
        # psutil fills an attribute with None (its default ad_value) when
        # it is denied access to it; treat that as "no contribution"
        # rather than failing the whole poll with a TypeError.
        memory_usage += proc.info["memory_percent"] or 0
    return memory_usage
|
||||||
|
|
||||||
|
# Set up function to get network traffic utilization in bps
|
||||||
|
def get_network_traffic_bps(interface, duration=1):
    """Measure the traffic rate of *interface* over *duration* seconds.

    Reads the per-NIC byte counters, sleeps for *duration* seconds, reads
    them again and converts the difference into bits per second.

    Returns a 3-tuple ``(sent_bps, recv_bps, total_bps)``. If *interface*
    is absent from either counter sample, ``(0, 0, 0)`` is returned so that
    callers which unpack three values and add them to running totals keep
    working (the previous 2-tuple of ``None`` broke that unpacking).
    """
    # Get initial network counters
    net_io_start = psutil.net_io_counters(pernic=True)
    if interface not in net_io_start:
        return 0, 0, 0
    stats_start = net_io_start[interface]

    # Wait for the specified duration
    sleep(duration)

    # Get final network counters; the interface may have disappeared while
    # we were sleeping, so look it up defensively
    stats_end = psutil.net_io_counters(pernic=True).get(interface)
    if stats_end is None:
        return 0, 0, 0

    # Calculate bytes per second
    bytes_sent_per_sec = (stats_end.bytes_sent - stats_start.bytes_sent) / duration
    bytes_recv_per_sec = (stats_end.bytes_recv - stats_start.bytes_recv) / duration

    # Convert to bits per second (bps)
    bits_sent_per_sec = bytes_sent_per_sec * 8
    bits_recv_per_sec = bytes_recv_per_sec * 8
    bits_total_per_sec = bits_sent_per_sec + bits_recv_per_sec

    return bits_sent_per_sec, bits_recv_per_sec, bits_total_per_sec
|
||||||
|
|
||||||
|
log_info(None, f"Starting system resource polling for test '{test}'")
|
||||||
|
storage_interface = config["storage_dev"]
|
||||||
|
total_cpus = psutil.cpu_count(logical=True)
|
||||||
|
ticks = 1
|
||||||
|
osd_cpu_utilization = 0
|
||||||
|
osd_memory_utilization = 0
|
||||||
|
mon_cpu_utilization = 0
|
||||||
|
mon_memory_utilization = 0
|
||||||
|
total_cpu_utilization = 0
|
||||||
|
total_memory_utilization = 0
|
||||||
|
storage_sent_bps = 0
|
||||||
|
storage_recv_bps = 0
|
||||||
|
storage_total_bps = 0
|
||||||
|
|
||||||
|
while process.poll() is None:
|
||||||
|
# Do collection of statistics like network bandwidth and cpu utilization
|
||||||
|
current_osd_cpu_utilization = get_cpu_utilization_by_name("ceph-osd")
|
||||||
|
current_osd_memory_utilization = get_memory_utilization_by_name("ceph-osd")
|
||||||
|
current_mon_cpu_utilization = get_cpu_utilization_by_name("ceph-mon")
|
||||||
|
current_mon_memory_utilization = get_memory_utilization_by_name("ceph-mon")
|
||||||
|
current_total_cpu_utilization = psutil.cpu_percent(interval=1)
|
||||||
|
current_total_memory_utilization = psutil.virtual_memory().percent
|
||||||
|
(
|
||||||
|
current_storage_sent_bps,
|
||||||
|
current_storage_recv_bps,
|
||||||
|
current_storage_total_bps,
|
||||||
|
) = get_network_traffic_bps(storage_interface)
|
||||||
|
# Recheck if the process is done yet; if it's not, we add the values and increase the ticks
|
||||||
|
# This helps ensure that if the process finishes earlier than the longer polls above,
|
||||||
|
# this particular tick isn't counted which can skew the average
|
||||||
|
if process.poll() is None:
|
||||||
|
osd_cpu_utilization += current_osd_cpu_utilization
|
||||||
|
osd_memory_utilization += current_osd_memory_utilization
|
||||||
|
mon_cpu_utilization += current_mon_cpu_utilization
|
||||||
|
mon_memory_utilization += current_mon_memory_utilization
|
||||||
|
total_cpu_utilization += current_total_cpu_utilization
|
||||||
|
total_memory_utilization += current_total_memory_utilization
|
||||||
|
storage_sent_bps += current_storage_sent_bps
|
||||||
|
storage_recv_bps += current_storage_recv_bps
|
||||||
|
storage_total_bps += current_storage_total_bps
|
||||||
|
ticks += 1
|
||||||
|
|
||||||
|
# Get the 1-minute load average and CPU utilization, which covers the test duration
|
||||||
|
load1, _, _ = os.getloadavg()
|
||||||
|
load1 = round(load1, 2)
|
||||||
|
|
||||||
|
# Calculate the average CPU utilization values over the runtime
|
||||||
|
# Divide the OSD and MON CPU utilization by the total number of CPU cores, because
|
||||||
|
# the total is divided this way
|
||||||
|
avg_osd_cpu_utilization = round(osd_cpu_utilization / ticks / total_cpus, 2)
|
||||||
|
avg_osd_memory_utilization = round(osd_memory_utilization / ticks, 2)
|
||||||
|
avg_mon_cpu_utilization = round(mon_cpu_utilization / ticks / total_cpus, 2)
|
||||||
|
avg_mon_memory_utilization = round(mon_memory_utilization / ticks, 2)
|
||||||
|
avg_total_cpu_utilization = round(total_cpu_utilization / ticks, 2)
|
||||||
|
avg_total_memory_utilization = round(total_memory_utilization / ticks, 2)
|
||||||
|
avg_storage_sent_bps = round(storage_sent_bps / ticks, 2)
|
||||||
|
avg_storage_recv_bps = round(storage_recv_bps / ticks, 2)
|
||||||
|
avg_storage_total_bps = round(storage_total_bps / ticks, 2)
|
||||||
|
|
||||||
|
stdout, stderr = process.communicate()
|
||||||
|
retcode = process.returncode
|
||||||
|
|
||||||
|
resource_data = {
|
||||||
|
"avg_cpu_util_percent": {
|
||||||
|
"total": avg_total_cpu_utilization,
|
||||||
|
"ceph-mon": avg_mon_cpu_utilization,
|
||||||
|
"ceph-osd": avg_osd_cpu_utilization,
|
||||||
|
},
|
||||||
|
"avg_memory_util_percent": {
|
||||||
|
"total": avg_total_memory_utilization,
|
||||||
|
"ceph-mon": avg_mon_memory_utilization,
|
||||||
|
"ceph-osd": avg_osd_memory_utilization,
|
||||||
|
},
|
||||||
|
"avg_network_util_bps": {
|
||||||
|
"sent": avg_storage_sent_bps,
|
||||||
|
"recv": avg_storage_recv_bps,
|
||||||
|
"total": avg_storage_total_bps,
|
||||||
|
},
|
||||||
|
}
|
||||||
|
|
||||||
try:
|
try:
|
||||||
jstdout = loads(stdout)
|
jstdout = loads(stdout)
|
||||||
if retcode:
|
if retcode:
|
||||||
raise
|
raise
|
||||||
except Exception:
|
except Exception:
|
||||||
cleanup(
|
return None, None
|
||||||
job_name,
|
|
||||||
db_conn=db_conn,
|
|
||||||
db_cur=db_cur,
|
|
||||||
zkhandler=zkhandler,
|
|
||||||
)
|
|
||||||
fail(
|
|
||||||
None,
|
|
||||||
f"Failed to run fio test '{test}': {stderr}",
|
|
||||||
)
|
|
||||||
|
|
||||||
return jstdout
|
return resource_data, jstdout
|
||||||
|
|
||||||
|
|
||||||
def worker_run_benchmark(zkhandler, celery, config, pool):
|
def worker_run_benchmark(zkhandler, celery, config, pool):
|
||||||
|
@ -358,7 +492,8 @@ def worker_run_benchmark(zkhandler, celery, config, pool):
|
||||||
total=total_stages,
|
total=total_stages,
|
||||||
)
|
)
|
||||||
|
|
||||||
results[test] = run_benchmark_job(
|
resource_data, fio_data = run_benchmark_job(
|
||||||
|
config,
|
||||||
test,
|
test,
|
||||||
pool,
|
pool,
|
||||||
job_name=job_name,
|
job_name=job_name,
|
||||||
|
@ -366,6 +501,25 @@ def worker_run_benchmark(zkhandler, celery, config, pool):
|
||||||
db_cur=db_cur,
|
db_cur=db_cur,
|
||||||
zkhandler=zkhandler,
|
zkhandler=zkhandler,
|
||||||
)
|
)
|
||||||
|
if resource_data is None or fio_data is None:
|
||||||
|
cleanup_benchmark_volume(
|
||||||
|
pool,
|
||||||
|
job_name=job_name,
|
||||||
|
db_conn=db_conn,
|
||||||
|
db_cur=db_cur,
|
||||||
|
zkhandler=zkhandler,
|
||||||
|
)
|
||||||
|
cleanup(
|
||||||
|
job_name,
|
||||||
|
db_conn=db_conn,
|
||||||
|
db_cur=db_cur,
|
||||||
|
zkhandler=zkhandler,
|
||||||
|
)
|
||||||
|
fail(
|
||||||
|
None,
|
||||||
|
f"Failed to run fio test '{test}'",
|
||||||
|
)
|
||||||
|
results[test] = {**resource_data, **fio_data}
|
||||||
|
|
||||||
# Phase 3 - cleanup
|
# Phase 3 - cleanup
|
||||||
current_stage += 1
|
current_stage += 1
|
||||||
|
|
Loading…
Reference in New Issue