From 4ef5fbdbe85a5ecb946ce5dd316f0132fbb5eaab Mon Sep 17 00:00:00 2001
From: "Joshua M. Boniface"
Date: Sun, 25 Aug 2024 17:04:43 -0400
Subject: [PATCH] Restore previous autobackup continue behaviour

With the original system, the failure of one VM's backups would not
trigger a total fault, thus allowing other backups to complete. Restore
that behaviour.
---
Note: this patch text was recovered from a copy whose line breaks and
leading whitespace had been collapsed. The line structure below agrees
with the hunk headers and the diffstat; the indentation is inferred and
should be confirmed against daemon-common/autobackup.py before applying.

 daemon-common/autobackup.py | 132 ++++++++++++++++++++----------------
 1 file changed, 72 insertions(+), 60 deletions(-)

diff --git a/daemon-common/autobackup.py b/daemon-common/autobackup.py
index 43bd491b..b9ea175b 100644
--- a/daemon-common/autobackup.py
+++ b/daemon-common/autobackup.py
@@ -118,7 +118,7 @@ def send_execution_summary_report(
 
     email.append("")
     email.append(
-        f"A PVC autobackup has been completed at {current_datetime} in {total_time}s."
+        f"A PVC autobackup has been completed at {current_datetime} in {total_time}."
     )
     email.append("")
     email.append(
@@ -462,6 +462,33 @@ def worker_cluster_autobackup(
         else:
             export_fileext = "rbdimg"
 
+        failure = False
+        export_files = None
+        export_files_size = 0
+
+        def write_backup_summary(success=False, message=""):
+            export_details = {
+                "type": export_type,
+                "result": success,
+                "message": message,
+                "datestring": datestring,
+                "snapshot_name": snapshot_name,
+                "incremental_parent": this_backup_incremental_parent,
+                "vm_detail": vm_detail,
+                "export_files": export_files,
+                "export_size_bytes": export_files_size,
+            }
+            try:
+                with open(
+                    f"{backup_suffixed_path}/{vm_name}/{snapshot_name}/snapshot.json",
+                    "w",
+                ) as fh:
+                    jdump(export_details, fh)
+            except Exception as e:
+                log_err(celery, f"Error exporting snapshot details: {e}")
+                return False, e
+            return True, ""
+
         snapshot_volumes = list()
         for rbdsnap in snap_list:
             pool, _volume = rbdsnap.split("/")
@@ -496,15 +523,9 @@ def worker_cluster_autobackup(
                     error_message = (
                         f"[{vm_name}] Failed to export snapshot for volume(s) '{snap_pool}/{snap_volume}'",
                     )
-                    log_err(celery, error_message)
-                    send_execution_failure_report(
-                        (celery, current_stage, total_stages),
-                        config,
-                        recipients=email_recipients,
-                        error=error_message,
-                    )
-                    fail(celery, error_message)
-                    return False
+                    write_backup_summary(message=error_message)
+                    failure = True
+                    break
                 else:
                     export_files.append(
                         (
@@ -520,15 +541,9 @@ def worker_cluster_autobackup(
                     error_message = (
                         f"[{vm_name}] Failed to export snapshot for volume(s) '{snap_pool}/{snap_volume}'",
                     )
-                    log_err(celery, error_message)
-                    send_execution_failure_report(
-                        (celery, current_stage, total_stages),
-                        config,
-                        recipients=email_recipients,
-                        error=error_message,
-                    )
-                    fail(celery, error_message)
-                    return False
+                    write_backup_summary(message=error_message)
+                    failure = True
+                    break
                 else:
                     export_files.append(
                         (
@@ -537,6 +552,18 @@ def worker_cluster_autobackup(
                         )
                     )
 
+        if failure:
+            current_stage += 6
+            if not this_backup_retain_snapshot:
+                current_stage += len(snap_list)
+            update(
+                celery,
+                f"[{vm_name}] Error in snapshot export, skipping",
+                current=current_stage,
+                total=total_stages,
+            )
+            continue
+
         current_stage += 1
         update(
             celery,
@@ -557,33 +584,22 @@ def worker_cluster_autobackup(
 
         export_files_size = get_dir_size(export_target_path)
 
-        export_details = {
-            "type": export_type,
-            "datestring": datestring,
-            "snapshot_name": snapshot_name,
-            "incremental_parent": this_backup_incremental_parent,
-            "vm_detail": vm_detail,
-            "export_files": export_files,
-            "export_size_bytes": export_files_size,
-        }
-        try:
-            with open(
-                f"{backup_suffixed_path}/{vm_name}/{snapshot_name}/snapshot.json", "w"
-            ) as fh:
-                jdump(export_details, fh)
-        except Exception as e:
+        ret, e = write_backup_summary(success=True)
+        if not ret:
             error_message = (
                 f"[{vm_name}] Failed to export configuration snapshot: {e}",
             )
-            log_err(celery, error_message)
-            send_execution_failure_report(
-                (celery, current_stage, total_stages),
-                config,
-                recipients=email_recipients,
-                error=error_message,
+            write_backup_summary(message=error_message)
+            current_stage += 5
+            if not this_backup_retain_snapshot:
+                current_stage += len(snap_list)
+            update(
+                celery,
+                error_message,
+                current=current_stage,
+                total=total_stages,
             )
-            fail(celery, error_message)
-            return False
+            continue
 
         # Clean up the snapshot (vm.vm_worker_remove_snapshot)
         if not this_backup_retain_snapshot:
@@ -601,15 +617,19 @@ def worker_cluster_autobackup(
                 ret, msg = ceph.remove_snapshot(zkhandler, pool, volume, name)
                 if not ret:
                     error_message = msg.replace("ERROR: ", "")
-                    log_err(celery, error_message)
-                    send_execution_failure_report(
-                        (celery, current_stage, total_stages),
-                        config,
-                        recipients=email_recipients,
-                        error=error_message,
-                    )
-                    fail(celery, error_message)
-                    return False
+                    write_backup_summary(message=error_message)
+                    failure = True
+                    break
+
+            if failure:
+                current_stage += 4
+                update(
+                    celery,
+                    f"[{vm_name}] Error in snapshot export, skipping",
+                    current=current_stage,
+                    total=total_stages,
+                )
+                continue
 
             current_stage += 1
             update(
@@ -624,17 +644,9 @@ def worker_cluster_autobackup(
             )
             if not ret:
                 error_message = (
-                    f"[{vm_name}] Failed to remove snapshot from Zookeeper",
+                    f"[{vm_name}] Failed to remove VM snapshot; continuing",
                 )
                 log_err(celery, error_message)
-                send_execution_failure_report(
-                    (celery, current_stage, total_stages),
-                    config,
-                    recipients=email_recipients,
-                    error=error_message,
-                )
-                fail(celery, error_message)
-                return False
 
             current_stage += 1
             update(