From e8da3714c05a80ff413a7dd4278d4910c223878e Mon Sep 17 00:00:00 2001 From: "Joshua M. Boniface" Date: Thu, 16 Nov 2023 19:21:36 -0500 Subject: [PATCH] Convert benchmark to use new Celery step structure --- api-daemon/pvcapid/benchmark.py | 221 +++++++++++++++++++------------- 1 file changed, 131 insertions(+), 90 deletions(-) diff --git a/api-daemon/pvcapid/benchmark.py b/api-daemon/pvcapid/benchmark.py index a419f972..d24b6fd4 100755 --- a/api-daemon/pvcapid/benchmark.py +++ b/api-daemon/pvcapid/benchmark.py @@ -22,16 +22,22 @@ import psycopg2 import psycopg2.extras +from datetime import datetime from json import loads, dumps from pvcapid.Daemon import config from daemon_lib.zkhandler import ZKHandler +from daemon_lib.celery import start, fail, log_info, update, finish import daemon_lib.common as pvc_common import daemon_lib.ceph as pvc_ceph +# Define the current test format +TEST_FORMAT = 1 + + # We run a total of 8 tests, to give a generalized idea of performance on the cluster: # 1. A sequential read test of 8GB with a 4M block size # 2. A sequential write test of 8GB with a 4M block size @@ -104,27 +110,7 @@ benchmark_volume_size = "8G" # Exceptions (used by Celery tasks) # class BenchmarkError(Exception): - """ - An exception that results from the Benchmark job. - """ - - def __init__( - self, message, job_name=None, db_conn=None, db_cur=None, zkhandler=None - ): - self.message = message - if job_name is not None and db_conn is not None and db_cur is not None: - # Clean up our dangling result - query = "DELETE FROM storage_benchmarks WHERE job = %s;" - args = (job_name,) - db_cur.execute(query, args) - db_conn.commit() - # Close the database connections cleanly - close_database(db_conn, db_cur) - if job_name is not None and zkhandler is not None: - zkhandler.disconnect() - - def __str__(self): - return str(self.message) + pass # @@ -132,6 +118,20 @@ class BenchmarkError(Exception): # +def cleanup(job_name, db_conn=None, db_cur=None, zkhandler=None): + if db_conn is not None and db_cur is not None: + # Clean up our dangling result + query = "DELETE FROM storage_benchmarks WHERE job = %s;" + args = (job_name,) + db_cur.execute(query, args) + db_conn.commit() + # Close the database connections cleanly + close_database(db_conn, db_cur) + if zkhandler is not None: + zkhandler.disconnect() + del zkhandler + + # Database connections def open_database(config): conn = psycopg2.connect( @@ -193,17 +193,18 @@ def prepare_benchmark_volume( zkhandler, pool, benchmark_volume_name, benchmark_volume_size ) if not retcode: - raise BenchmarkError( - 'Failed to create volume "{}" on pool "{}": {}'.format( - benchmark_volume_name, pool, retmsg - ), - job_name=job_name, + cleanup( + job_name, db_conn=db_conn, db_cur=db_cur, zkhandler=zkhandler, ) + fail( + None, + f'Failed to create volume "{benchmark_volume_name}" on pool "{pool}": {retmsg}', + ) else: - print(retmsg) + log_info(None, retmsg) def cleanup_benchmark_volume( @@ -212,24 +213,25 @@ def cleanup_benchmark_volume( # Remove the RBD volume retcode, retmsg = pvc_ceph.remove_volume(zkhandler, pool, benchmark_volume_name) if not retcode: - raise BenchmarkError( - 'Failed to remove volume "{}" on pool "{}": {}'.format( - benchmark_volume_name, pool, retmsg - ), - job_name=job_name, + cleanup( + job_name, db_conn=db_conn, db_cur=db_cur, zkhandler=zkhandler, ) + fail( + None, + f'Failed to remove volume "{benchmark_volume_name}" from pool "{pool}": {retmsg}', + ) else: - print(retmsg) + log_info(None, retmsg) def run_benchmark_job( test, pool, job_name=None, db_conn=None, db_cur=None, zkhandler=None ): test_spec = test_matrix[test] - print("Running test '{}'".format(test)) + log_info(None, f"Running test '{test}'") fio_cmd = """ fio \ --name={test} \ @@ -255,51 +257,73 @@ def run_benchmark_job( rw=test_spec["rw"], ) - print("Running fio job: {}".format(" ".join(fio_cmd.split()))) + log_info(None, "Running fio job: {}".format(" ".join(fio_cmd.split()))) retcode, stdout, stderr = pvc_common.run_os_command(fio_cmd) - if retcode: - raise BenchmarkError( - "Failed to run fio test: {}".format(stderr), - job_name=job_name, + try: + jstdout = loads(stdout) + if retcode: + raise + except Exception: + cleanup( + job_name, db_conn=db_conn, db_cur=db_cur, zkhandler=zkhandler, ) + fail( + None, + f"Failed to run fio test '{test}': {stderr}", + ) - return loads(stdout) + return jstdout def run_benchmark(self, pool): - # Runtime imports - import time - from datetime import datetime - - # Define the current test format - TEST_FORMAT = 1 - - time.sleep(2) - # Phase 0 - connect to databases - try: - db_conn, db_cur = open_database(config) - except Exception: - print("FATAL - failed to connect to Postgres") - raise Exception - try: zkhandler = ZKHandler(config) zkhandler.connect() except Exception: - print("FATAL - failed to connect to Zookeeper") - raise Exception + fail( + self, + "Failed to connect to Zookeeper", + ) cur_time = datetime.now().isoformat(timespec="seconds") cur_primary = zkhandler.read("base.config.primary_node") - job_name = "{}_{}".format(cur_time, cur_primary) + job_name = f"{cur_time}_{cur_primary}" - print("Starting storage benchmark '{}' on pool '{}'".format(job_name, pool)) + current_stage = 0 + total_stages = 13 + start( + self, + f"Running storage benchmark '{job_name}' on pool '{pool}'", + current=current_stage, + total=total_stages, + ) + + try: + db_conn, db_cur = open_database(config) + except Exception: + cleanup( + job_name, + db_conn=None, + db_cur=None, + zkhandler=zkhandler, + ) + fail( + self, + "Failed to connect to Postgres", + ) + + current_stage += 1 + update( + self, + "Storing running status in database", + current=current_stage, + total=total_stages, + ) - print("Storing running status for job '{}' in database".format(job_name)) try: query = "INSERT INTO storage_benchmarks (job, test_format, result) VALUES (%s, %s, %s);" args = ( @@ -310,20 +334,21 @@ def run_benchmark(self, pool): db_cur.execute(query, args) db_conn.commit() except Exception as e: - raise BenchmarkError( - "Failed to store running status: {}".format(e), - job_name=job_name, + cleanup( + job_name, db_conn=db_conn, db_cur=db_cur, zkhandler=zkhandler, ) + fail(self, f"Failed to store running status: {e}", exception=BenchmarkError) - # Phase 1 - volume preparation - self.update_state( - state="RUNNING", - meta={"current": 1, "total": 3, "status": "Creating benchmark volume"}, + current_stage += 1 + update( + self, + "Creating benchmark volume", + current=current_stage, + total=total_stages, ) - time.sleep(1) prepare_benchmark_volume( pool, @@ -334,14 +359,16 @@ def run_benchmark(self, pool): ) # Phase 2 - benchmark run - self.update_state( - state="RUNNING", - meta={"current": 2, "total": 3, "status": "Running fio benchmarks on volume"}, - ) - time.sleep(1) - results = dict() for test in test_matrix: + current_stage += 1 + update( + self, + f"Running benchmark job '{test}'", + current=current_stage, + total=total_stages, + ) + results[test] = run_benchmark_job( test, pool, @@ -352,11 +379,13 @@ def run_benchmark(self, pool): ) # Phase 3 - cleanup - self.update_state( - state="RUNNING", - meta={"current": 3, "total": 3, "status": "Cleaning up and storing results"}, + current_stage += 1 + update( + self, + "Cleaning up venchmark volume", + current=current_stage, + total=total_stages, ) - time.sleep(1) cleanup_benchmark_volume( pool, @@ -366,27 +395,39 @@ def run_benchmark(self, pool): zkhandler=zkhandler, ) - print("Storing result of tests for job '{}' in database".format(job_name)) + current_stage += 1 + update( + self, + "Storing results in database", + current=current_stage, + total=total_stages, + ) + try: query = "UPDATE storage_benchmarks SET result = %s WHERE job = %s;" args = (dumps(results), job_name) db_cur.execute(query, args) db_conn.commit() except Exception as e: - raise BenchmarkError( - "Failed to store test results: {}".format(e), - job_name=job_name, + cleanup( + job_name, db_conn=db_conn, db_cur=db_cur, zkhandler=zkhandler, ) + fail(self, f"Failed to store test results: {e}", exception=BenchmarkError) - close_database(db_conn, db_cur) - zkhandler.disconnect() - del zkhandler + cleanup( + job_name, + db_conn=db_conn, + db_cur=db_cur, + zkhandler=zkhandler, + ) - return { - "status": "Storage benchmark '{}' completed successfully.", - "current": 3, - "total": 3, - } + current_stage += 1 + return finish( + self, + f"Storage benchmark {job_name} completed successfully.", + current=current_stage, + total=total_stages, + )