Convert benchmark to use new Celery step structure

This commit is contained in:
Joshua Boniface 2023-11-16 19:21:36 -05:00
parent 4d23d0419c
commit e8da3714c0
1 changed files with 131 additions and 90 deletions

View File

@ -22,16 +22,22 @@
import psycopg2
import psycopg2.extras
from datetime import datetime
from json import loads, dumps
from pvcapid.Daemon import config
from daemon_lib.zkhandler import ZKHandler
from daemon_lib.celery import start, fail, log_info, update, finish
import daemon_lib.common as pvc_common
import daemon_lib.ceph as pvc_ceph
# Define the current test format
TEST_FORMAT = 1
# We run a total of 8 tests, to give a generalized idea of performance on the cluster:
# 1. A sequential read test of 8GB with a 4M block size
# 2. A sequential write test of 8GB with a 4M block size
@ -104,27 +110,7 @@ benchmark_volume_size = "8G"
# Exceptions (used by Celery tasks)
#
class BenchmarkError(Exception):
    """
    An exception that results from the Benchmark job.

    When a job name is supplied together with database and/or Zookeeper
    handles, constructing the exception also tears down the job's state: the
    job's row in ``storage_benchmarks`` is deleted, the database handles are
    closed, and the Zookeeper handler is disconnected.

    Args:
        message: Description of the failure; returned by ``str()``.
        job_name: Name of the benchmark job whose result row is removed.
            When None, no teardown is performed at all.
        db_conn: Open database connection, or None to skip database teardown.
        db_cur: Cursor on ``db_conn``, or None to skip database teardown.
        zkhandler: Connected Zookeeper handler, or None to skip disconnecting.
    """

    def __init__(
        self, message, job_name=None, db_conn=None, db_cur=None, zkhandler=None
    ):
        # Populate Exception.args through the normal base-class path.
        super().__init__(message)
        self.message = message

        # Without a job name there is nothing to tear down.
        if job_name is None:
            return

        # The nested try/finally blocks guarantee that the handles are released
        # even if removing the dangling row fails; the database error (if any)
        # still propagates to the caller afterwards.
        try:
            if db_conn is not None and db_cur is not None:
                try:
                    # Clean up our dangling result
                    query = "DELETE FROM storage_benchmarks WHERE job = %s;"
                    args = (job_name,)
                    db_cur.execute(query, args)
                    db_conn.commit()
                finally:
                    # Close the database connections cleanly
                    close_database(db_conn, db_cur)
        finally:
            if zkhandler is not None:
                zkhandler.disconnect()

    def __str__(self):
        return str(self.message)
#
@ -132,6 +118,20 @@ class BenchmarkError(Exception):
#
def cleanup(job_name, db_conn=None, db_cur=None, zkhandler=None, final=False):
    """
    Release the resources held by a benchmark job.

    Args:
        job_name: Name of the benchmark job in ``storage_benchmarks``.
        db_conn: Open database connection, or None to skip database cleanup.
        db_cur: Cursor on ``db_conn``, or None to skip database cleanup.
        zkhandler: Connected Zookeeper handler, or None to skip disconnecting.
        final: When True, keep the job's row and only release the handles.
            Defaults to False, which deletes the row as a dangling result.

    NOTE(review): with the default ``final=False`` the job's row is always
    deleted, so a caller on the success path would also discard the results
    it has just stored; such callers presumably want ``final=True`` - confirm.
    """
    # The nested try/finally blocks guarantee that the handles are released
    # even if removing the dangling row fails; the database error (if any)
    # still propagates to the caller afterwards.
    try:
        if db_conn is not None and db_cur is not None:
            try:
                if not final:
                    # Clean up our dangling result
                    query = "DELETE FROM storage_benchmarks WHERE job = %s;"
                    args = (job_name,)
                    db_cur.execute(query, args)
                    db_conn.commit()
            finally:
                # Close the database connections cleanly
                close_database(db_conn, db_cur)
    finally:
        if zkhandler is not None:
            zkhandler.disconnect()
# Database connections
def open_database(config):
conn = psycopg2.connect(
@ -193,17 +193,18 @@ def prepare_benchmark_volume(
zkhandler, pool, benchmark_volume_name, benchmark_volume_size
)
if not retcode:
raise BenchmarkError(
'Failed to create volume "{}" on pool "{}": {}'.format(
benchmark_volume_name, pool, retmsg
),
job_name=job_name,
cleanup(
job_name,
db_conn=db_conn,
db_cur=db_cur,
zkhandler=zkhandler,
)
fail(
None,
f'Failed to create volume "{benchmark_volume_name}" on pool "{pool}": {retmsg}',
)
else:
print(retmsg)
log_info(None, retmsg)
def cleanup_benchmark_volume(
@ -212,24 +213,25 @@ def cleanup_benchmark_volume(
# Remove the RBD volume
retcode, retmsg = pvc_ceph.remove_volume(zkhandler, pool, benchmark_volume_name)
if not retcode:
raise BenchmarkError(
'Failed to remove volume "{}" on pool "{}": {}'.format(
benchmark_volume_name, pool, retmsg
),
job_name=job_name,
cleanup(
job_name,
db_conn=db_conn,
db_cur=db_cur,
zkhandler=zkhandler,
)
fail(
None,
f'Failed to remove volume "{benchmark_volume_name}" from pool "{pool}": {retmsg}',
)
else:
print(retmsg)
log_info(None, retmsg)
def run_benchmark_job(
test, pool, job_name=None, db_conn=None, db_cur=None, zkhandler=None
):
test_spec = test_matrix[test]
print("Running test '{}'".format(test))
log_info(None, f"Running test '{test}'")
fio_cmd = """
fio \
--name={test} \
@ -255,51 +257,73 @@ def run_benchmark_job(
rw=test_spec["rw"],
)
print("Running fio job: {}".format(" ".join(fio_cmd.split())))
log_info(None, "Running fio job: {}".format(" ".join(fio_cmd.split())))
retcode, stdout, stderr = pvc_common.run_os_command(fio_cmd)
if retcode:
raise BenchmarkError(
"Failed to run fio test: {}".format(stderr),
job_name=job_name,
try:
jstdout = loads(stdout)
if retcode:
raise
except Exception:
cleanup(
job_name,
db_conn=db_conn,
db_cur=db_cur,
zkhandler=zkhandler,
)
fail(
None,
f"Failed to run fio test '{test}': {stderr}",
)
return loads(stdout)
return jstdout
def run_benchmark(self, pool):
# Runtime imports
import time
from datetime import datetime
# Define the current test format
TEST_FORMAT = 1
time.sleep(2)
# Phase 0 - connect to databases
try:
db_conn, db_cur = open_database(config)
except Exception:
print("FATAL - failed to connect to Postgres")
raise Exception
try:
zkhandler = ZKHandler(config)
zkhandler.connect()
except Exception:
print("FATAL - failed to connect to Zookeeper")
raise Exception
fail(
self,
"Failed to connect to Zookeeper",
)
cur_time = datetime.now().isoformat(timespec="seconds")
cur_primary = zkhandler.read("base.config.primary_node")
job_name = "{}_{}".format(cur_time, cur_primary)
job_name = f"{cur_time}_{cur_primary}"
print("Starting storage benchmark '{}' on pool '{}'".format(job_name, pool))
current_stage = 0
total_stages = 13
start(
self,
f"Running storage benchmark '{job_name}' on pool '{pool}'",
current=current_stage,
total=total_stages,
)
try:
db_conn, db_cur = open_database(config)
except Exception:
cleanup(
job_name,
db_conn=None,
db_cur=None,
zkhandler=zkhandler,
)
fail(
self,
"Failed to connect to Postgres",
)
current_stage += 1
update(
self,
"Storing running status in database",
current=current_stage,
total=total_stages,
)
print("Storing running status for job '{}' in database".format(job_name))
try:
query = "INSERT INTO storage_benchmarks (job, test_format, result) VALUES (%s, %s, %s);"
args = (
@ -310,20 +334,21 @@ def run_benchmark(self, pool):
db_cur.execute(query, args)
db_conn.commit()
except Exception as e:
raise BenchmarkError(
"Failed to store running status: {}".format(e),
job_name=job_name,
cleanup(
job_name,
db_conn=db_conn,
db_cur=db_cur,
zkhandler=zkhandler,
)
fail(self, f"Failed to store running status: {e}", exception=BenchmarkError)
# Phase 1 - volume preparation
self.update_state(
state="RUNNING",
meta={"current": 1, "total": 3, "status": "Creating benchmark volume"},
current_stage += 1
update(
self,
"Creating benchmark volume",
current=current_stage,
total=total_stages,
)
time.sleep(1)
prepare_benchmark_volume(
pool,
@ -334,14 +359,16 @@ def run_benchmark(self, pool):
)
# Phase 2 - benchmark run
self.update_state(
state="RUNNING",
meta={"current": 2, "total": 3, "status": "Running fio benchmarks on volume"},
)
time.sleep(1)
results = dict()
for test in test_matrix:
current_stage += 1
update(
self,
f"Running benchmark job '{test}'",
current=current_stage,
total=total_stages,
)
results[test] = run_benchmark_job(
test,
pool,
@ -352,11 +379,13 @@ def run_benchmark(self, pool):
)
# Phase 3 - cleanup
self.update_state(
state="RUNNING",
meta={"current": 3, "total": 3, "status": "Cleaning up and storing results"},
current_stage += 1
# Report the cleanup stage to the task's progress tracking.
update(
    self,
    "Cleaning up benchmark volume",
    current=current_stage,
    total=total_stages,
)
time.sleep(1)
cleanup_benchmark_volume(
pool,
@ -366,27 +395,39 @@ def run_benchmark(self, pool):
zkhandler=zkhandler,
)
print("Storing result of tests for job '{}' in database".format(job_name))
current_stage += 1
update(
self,
"Storing results in database",
current=current_stage,
total=total_stages,
)
try:
query = "UPDATE storage_benchmarks SET result = %s WHERE job = %s;"
args = (dumps(results), job_name)
db_cur.execute(query, args)
db_conn.commit()
except Exception as e:
raise BenchmarkError(
"Failed to store test results: {}".format(e),
job_name=job_name,
cleanup(
job_name,
db_conn=db_conn,
db_cur=db_cur,
zkhandler=zkhandler,
)
fail(self, f"Failed to store test results: {e}", exception=BenchmarkError)
close_database(db_conn, db_cur)
zkhandler.disconnect()
del zkhandler
cleanup(
job_name,
db_conn=db_conn,
db_cur=db_cur,
zkhandler=zkhandler,
)
return {
"status": "Storage benchmark '{}' completed successfully.",
"current": 3,
"total": 3,
}
current_stage += 1
return finish(
self,
f"Storage benchmark {job_name} completed successfully.",
current=current_stage,
total=total_stages,
)