pvc/api-daemon/pvcapid/benchmark.py

334 lines
9.9 KiB
Python
Raw Normal View History

2020-08-24 14:57:52 -04:00
#!/usr/bin/env python3
# benchmark.py - PVC API Benchmark functions
# Part of the Parallel Virtual Cluster (PVC) system
#
2022-10-06 11:55:27 -04:00
# Copyright (C) 2018-2022 Joshua M. Boniface <joshua@boniface.me>
2020-08-24 14:57:52 -04:00
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, version 3.
2020-08-24 14:57:52 -04:00
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.
#
###############################################################################
import psycopg2
import psycopg2.extras
from json import loads, dumps
from pvcapid.Daemon import config
2020-08-24 14:57:52 -04:00
from daemon_lib.zkhandler import ZKHandler
2021-05-29 00:24:53 -04:00
2020-08-24 14:57:52 -04:00
import daemon_lib.common as pvc_common
import daemon_lib.ceph as pvc_ceph
2020-08-24 14:57:52 -04:00
#
# Exceptions (used by Celery tasks)
#
class BenchmarkError(Exception):
    """
    An exception that results from the Benchmark job.

    When job_name is given, constructing the exception also cleans up after
    the failed job: the job's row is deleted from the storage_benchmarks
    table, the database handles are closed and the Zookeeper handler is
    disconnected. Every cleanup step is best-effort, so a cleanup failure is
    printed rather than raised and never hides the benchmark error itself.

    Arguments:
        message: description of the failure; returned by str().
        job_name: name of the benchmark job to clean up, or None for no
            cleanup at all.
        db_conn, db_cur: database connection and cursor used for the cleanup;
            the database cleanup is skipped unless both are given.
        zkhandler: handler to disconnect; skipped when None.
    """

    def __init__(
        self, message, job_name=None, db_conn=None, db_cur=None, zkhandler=None
    ):
        super().__init__(message)
        self.message = message

        # Without a job name there is nothing to clean up
        if job_name is None:
            return

        if db_conn is not None and db_cur is not None:
            removed = False
            try:
                # If the caller's last statement failed, the transaction is
                # aborted and refuses further statements until rolled back
                db_conn.rollback()
                # Clean up our dangling result
                query = "DELETE FROM storage_benchmarks WHERE job = %s;"
                args = (job_name,)
                db_cur.execute(query, args)
                db_conn.commit()
                removed = True
            except Exception as e:
                print(
                    "Failed to remove benchmark job '{}' from database: {}".format(
                        job_name, e
                    )
                )

            # Close the database connections cleanly; skip the extra commit
            # if the cleanup statement did not go through
            try:
                close_database(db_conn, db_cur, failed=not removed)
            except Exception as e:
                print("Failed to close database connection: {}".format(e))

        if zkhandler is not None:
            try:
                zkhandler.disconnect()
            except Exception as e:
                print("Failed to disconnect from Zookeeper: {}".format(e))

    def __str__(self):
        return str(self.message)
2020-08-24 14:57:52 -04:00
#
# Common functions
#
2020-08-24 14:57:52 -04:00
# Database connections
def open_database(config):
    """
    Open a PostgreSQL connection described by the "database_*" keys of config.

    Returns a (connection, cursor) tuple. The cursor is created with
    psycopg2's RealDictCursor factory.
    """
    connect_args = {
        "host": config["database_host"],
        "port": config["database_port"],
        "dbname": config["database_name"],
        "user": config["database_user"],
        "password": config["database_password"],
    }
    connection = psycopg2.connect(**connect_args)
    cursor = connection.cursor(cursor_factory=psycopg2.extras.RealDictCursor)
    return connection, cursor
2020-08-24 14:57:52 -04:00
def close_database(conn, cur, failed=False):
    """
    Close a cursor and its connection, committing first unless failed is set.

    Arguments:
        conn: the database connection; committed (unless failed) and closed.
        cur: the cursor belonging to conn; closed before the connection.
        failed: when True, skip the commit and only close the handles.

    The handles are closed even if the commit or the cursor close raises, so
    a failing commit cannot leak the connection; the exception still
    propagates to the caller afterwards.
    """
    try:
        if not failed:
            conn.commit()
    finally:
        try:
            cur.close()
        finally:
            conn.close()
2020-08-24 14:57:52 -04:00
def list_benchmarks(job=None):
    """
    List stored benchmark results, optionally limited to a single job.

    Arguments:
        job: name of the job to look up; when None, every row of the
            storage_benchmarks table is returned, highest id first.

    Returns:
        (data, 200) where data is a list of dicts with the keys "id", "job",
        "test_format" and "benchmark_result", or
        ({"message": "No benchmark found."}, 404) when no row matched.

    "benchmark_result" is the string "Running" for a row whose result is
    "Running", the decoded JSON result otherwise, or an empty dict when the
    stored result cannot be decoded.
    """
    if job is not None:
        query = "SELECT * FROM storage_benchmarks WHERE job = %s;"
        args = (job,)
    else:
        query = "SELECT * FROM storage_benchmarks ORDER BY id DESC;"
        args = ()

    conn, cur = open_database(config)
    try:
        cur.execute(query, args)
        rows = cur.fetchall()
    except Exception:
        # Do not leak the connection when the query fails
        close_database(conn, cur, failed=True)
        raise
    # fetchall() already returned every row, so the connection can go now
    close_database(conn, cur)

    data = []
    for row in rows:
        if row["result"] == "Running":
            benchmark_result = "Running"
        else:
            try:
                benchmark_result = loads(row["result"])
            except (TypeError, ValueError):
                # Undecodable result; report it as empty rather than failing
                benchmark_result = {}

        # Append the new data to our actual output structure
        data.append(
            {
                "id": row["id"],
                "job": row["job"],
                "test_format": row["test_format"],
                "benchmark_result": benchmark_result,
            }
        )

    if data:
        return data, 200
    return {"message": "No benchmark found."}, 404
2020-08-24 14:57:52 -04:00
def run_benchmark(self, pool):
    """
    Run the fio storage benchmark suite against a pool and store the results.

    Arguments:
        self: the task object; only its update_state() method is used, to
            report progress (the file's comments refer to Celery tasks).
        pool: name of the storage pool in which the temporary benchmark
            volume is created.

    The job is recorded in the storage_benchmarks table with the result
    "Running" while it runs; on success the row is updated with the JSON
    output of every fio test.

    Returns:
        A dict with "status", "current" and "total" keys describing the
        completed job.

    Raises:
        BenchmarkError: if a connection cannot be established or any phase
            fails. For failures after the job row was stored, the row is
            removed again; if fio fails, the benchmark volume is removed too.
    """
    # Runtime imports
    import time
    from datetime import datetime

    # Define the current test format
    TEST_FORMAT = 1

    time.sleep(2)

    # Phase 0 - connect to databases
    try:
        db_conn, db_cur = open_database(config)
    except Exception as e:
        print("FATAL - failed to connect to Postgres")
        raise BenchmarkError("Failed to connect to Postgres: {}".format(e)) from e

    try:
        zkhandler = ZKHandler(config)
        zkhandler.connect()
    except Exception as e:
        print("FATAL - failed to connect to Zookeeper")
        # Do not leak the Postgres connection opened just above
        try:
            close_database(db_conn, db_cur, failed=True)
        except Exception as close_error:
            print("Failed to close database connection: {}".format(close_error))
        raise BenchmarkError("Failed to connect to Zookeeper: {}".format(e)) from e

    cur_time = datetime.now().isoformat(timespec="seconds")
    cur_primary = zkhandler.read("base.config.primary_node")
    job_name = "{}_{}".format(cur_time, cur_primary)
    volume = "pvcbenchmark"

    def abort(message, remove_volume=False):
        """
        Build the BenchmarkError for a failed job.

        The volume has to be removed before the exception is constructed,
        because constructing it disconnects the Zookeeper handler.
        """
        if remove_volume:
            try:
                removed, remove_msg = pvc_ceph.remove_volume(zkhandler, pool, volume)
                if removed:
                    print(remove_msg)
                else:
                    print(
                        'Failed to remove volume "{}": {}'.format(volume, remove_msg)
                    )
            except Exception as e:
                print('Failed to remove volume "{}": {}'.format(volume, e))
        return BenchmarkError(
            message,
            job_name=job_name,
            db_conn=db_conn,
            db_cur=db_cur,
            zkhandler=zkhandler,
        )

    print("Starting storage benchmark '{}' on pool '{}'".format(job_name, pool))

    print("Storing running status for job '{}' in database".format(job_name))
    try:
        query = "INSERT INTO storage_benchmarks (job, test_format, result) VALUES (%s, %s, %s);"
        args = (
            job_name,
            TEST_FORMAT,
            "Running",
        )
        db_cur.execute(query, args)
        db_conn.commit()
    except Exception as e:
        raise abort("Failed to store running status: {}".format(e)) from e

    # Phase 1 - volume preparation
    self.update_state(
        state="RUNNING",
        meta={"current": 1, "total": 3, "status": "Creating benchmark volume"},
    )
    time.sleep(1)

    # Create the RBD volume
    retcode, retmsg = pvc_ceph.add_volume(zkhandler, pool, volume, "8G")
    if not retcode:
        raise abort('Failed to create volume "{}": {}'.format(volume, retmsg))
    print(retmsg)

    # Phase 2 - benchmark run
    self.update_state(
        state="RUNNING",
        meta={"current": 2, "total": 3, "status": "Running fio benchmarks on volume"},
    )
    time.sleep(1)

    # We run a total of 8 tests, to give a generalized idea of performance on the cluster.
    # Each test is time-based and runs for 75 seconds against the 8G benchmark volume:
    #   1. A sequential read test with a 4M block size and an I/O depth of 64
    #   2. A sequential write test with a 4M block size and an I/O depth of 64
    #   3. A random read test with a 4M block size and an I/O depth of 64
    #   4. A random write test with a 4M block size and an I/O depth of 64
    #   5. A random read test with a 4K block size and an I/O depth of 64
    #   6. A random write test with a 4K block size and an I/O depth of 64
    #   7. A random read test with a 4K block size and an I/O depth of 1
    #   8. A random write test with a 4K block size and an I/O depth of 1
    # Taken together, these 8 results should give a very good indication of the overall storage performance
    # for a variety of workloads.
    test_matrix = {
        "seq_read": {"direction": "read", "iodepth": "64", "bs": "4M", "rw": "read"},
        "seq_write": {"direction": "write", "iodepth": "64", "bs": "4M", "rw": "write"},
        "rand_read_4M": {
            "direction": "read",
            "iodepth": "64",
            "bs": "4M",
            "rw": "randread",
        },
        "rand_write_4M": {
            "direction": "write",
            "iodepth": "64",
            "bs": "4M",
            "rw": "randwrite",
        },
        "rand_read_4K": {
            "direction": "read",
            "iodepth": "64",
            "bs": "4K",
            "rw": "randread",
        },
        "rand_write_4K": {
            "direction": "write",
            "iodepth": "64",
            "bs": "4K",
            "rw": "randwrite",
        },
        "rand_read_4K_lowdepth": {
            "direction": "read",
            "iodepth": "1",
            "bs": "4K",
            "rw": "randread",
        },
        "rand_write_4K_lowdepth": {
            "direction": "write",
            "iodepth": "1",
            "bs": "4K",
            "rw": "randwrite",
        },
    }

    results = dict()
    for test, params in test_matrix.items():
        print("Running test '{}'".format(test))
        fio_cmd = """
            fio \
                --name={test} \
                --ioengine=rbd \
                --pool={pool} \
                --rbdname={volume} \
                --output-format=json \
                --direct=1 \
                --randrepeat=1 \
                --numjobs=1 \
                --time_based \
                --runtime=75 \
                --group_reporting \
                --iodepth={iodepth} \
                --bs={bs} \
                --readwrite={rw}
        """.format(
            test=test,
            pool=pool,
            volume=volume,
            iodepth=params["iodepth"],
            bs=params["bs"],
            rw=params["rw"],
        )

        print("Running fio job: {}".format(" ".join(fio_cmd.split())))
        retcode, stdout, stderr = pvc_common.run_os_command(fio_cmd)
        if retcode:
            # The volume exists at this point; remove it so it is not left behind
            raise abort(
                "Failed to run fio test: {}".format(stderr), remove_volume=True
            )

        try:
            results[test] = loads(stdout)
        except ValueError as e:
            raise abort(
                "Failed to parse fio output for test '{}': {}".format(test, e),
                remove_volume=True,
            ) from e

    # Phase 3 - cleanup
    self.update_state(
        state="RUNNING",
        meta={"current": 3, "total": 3, "status": "Cleaning up and storing results"},
    )
    time.sleep(1)

    # Remove the RBD volume
    retcode, retmsg = pvc_ceph.remove_volume(zkhandler, pool, volume)
    if not retcode:
        raise abort('Failed to remove volume "{}": {}'.format(volume, retmsg))
    print(retmsg)

    print("Storing result of tests for job '{}' in database".format(job_name))
    try:
        query = "UPDATE storage_benchmarks SET result = %s WHERE job = %s;"
        args = (dumps(results), job_name)
        db_cur.execute(query, args)
        db_conn.commit()
    except Exception as e:
        raise abort("Failed to store test results: {}".format(e)) from e

    close_database(db_conn, db_cur)
    zkhandler.disconnect()

    return {
        # Fill in the job name; the placeholder was previously returned as-is
        "status": "Storage benchmark '{}' completed successfully.".format(job_name),
        "current": 3,
        "total": 3,
    }