Restore previous autobackup continue behaviour
Under the original system, a failure in one VM's backup did not trigger a total fault, so the backups of the remaining VMs could still complete. Restore that behaviour.
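
In practice this means each VM's backup runs under a per-VM failure flag: when an export or cleanup step fails, the worker records the error in that VM's backup summary, sets the flag, and breaks out of that VM's remaining steps, then continues with the next VM rather than calling fail() and returning from the whole task. A minimal, self-contained sketch of that pattern, not the actual worker code (run_autobackups and backup_step are illustrative names):

# A minimal sketch of the restored continue-on-failure pattern, not the
# actual worker code: run_autobackups and backup_step are illustrative names.
from typing import Callable, Dict, List, Tuple


def run_autobackups(
    vm_list: List[str],
    backup_step: Callable[[str, str], Tuple[bool, str]],
) -> Dict[str, Tuple[bool, str]]:
    results: Dict[str, Tuple[bool, str]] = {}
    for vm_name in vm_list:
        failure = False
        for step in ("export_volumes", "export_config", "remove_snapshot"):
            ok, message = backup_step(vm_name, step)
            if not ok:
                # Record the failure for this VM and stop its remaining steps,
                # mirroring the write_backup_summary() / failure = True / break
                # sequence in the diff below.
                results[vm_name] = (False, message)
                failure = True
                break
        if failure:
            # Skip to the next VM instead of failing the whole autobackup task.
            continue
        results[vm_name] = (True, "")
    return results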
parent 8fa6bed736
commit 4ef5fbdbe8
@@ -118,7 +118,7 @@ def send_execution_summary_report(
     email.append("")
     email.append(
-        f"A PVC autobackup has been completed at {current_datetime} in {total_time}s."
+        f"A PVC autobackup has been completed at {current_datetime} in {total_time}."
     )
     email.append("")
     email.append(
@@ -462,6 +462,33 @@ def worker_cluster_autobackup(
         else:
             export_fileext = "rbdimg"

+        failure = False
+        export_files = None
+        export_files_size = 0
+
+        def write_backup_summary(success=False, message=""):
+            export_details = {
+                "type": export_type,
+                "result": success,
+                "message": message,
+                "datestring": datestring,
+                "snapshot_name": snapshot_name,
+                "incremental_parent": this_backup_incremental_parent,
+                "vm_detail": vm_detail,
+                "export_files": export_files,
+                "export_size_bytes": export_files_size,
+            }
+            try:
+                with open(
+                    f"{backup_suffixed_path}/{vm_name}/{snapshot_name}/snapshot.json",
+                    "w",
+                ) as fh:
+                    jdump(export_details, fh)
+            except Exception as e:
+                log_err(celery, f"Error exporting snapshot details: {e}")
+                return False, e
+            return True, ""
+
         snapshot_volumes = list()
         for rbdsnap in snap_list:
             pool, _volume = rbdsnap.split("/")
@@ -496,15 +523,9 @@ def worker_cluster_autobackup(
                 error_message = (
                     f"[{vm_name}] Failed to export snapshot for volume(s) '{snap_pool}/{snap_volume}'",
                 )
-                log_err(celery, error_message)
-                send_execution_failure_report(
-                    (celery, current_stage, total_stages),
-                    config,
-                    recipients=email_recipients,
-                    error=error_message,
-                )
-                fail(celery, error_message)
-                return False
+                write_backup_summary(message=error_message)
+                failure = True
+                break
             else:
                 export_files.append(
                     (
@@ -520,15 +541,9 @@ def worker_cluster_autobackup(
                 error_message = (
                     f"[{vm_name}] Failed to export snapshot for volume(s) '{snap_pool}/{snap_volume}'",
                 )
-                log_err(celery, error_message)
-                send_execution_failure_report(
-                    (celery, current_stage, total_stages),
-                    config,
-                    recipients=email_recipients,
-                    error=error_message,
-                )
-                fail(celery, error_message)
-                return False
+                write_backup_summary(message=error_message)
+                failure = True
+                break
             else:
                 export_files.append(
                     (
@@ -537,6 +552,18 @@ def worker_cluster_autobackup(
                     )
                 )

+        if failure:
+            current_stage += 6
+            if not this_backup_retain_snapshot:
+                current_stage += len(snap_list)
+            update(
+                celery,
+                f"[{vm_name}] Error in snapshot export, skipping",
+                current=current_stage,
+                total=total_stages,
+            )
+            continue
+
         current_stage += 1
         update(
             celery,
@@ -557,33 +584,22 @@ def worker_cluster_autobackup(
         export_files_size = get_dir_size(export_target_path)

-        export_details = {
-            "type": export_type,
-            "datestring": datestring,
-            "snapshot_name": snapshot_name,
-            "incremental_parent": this_backup_incremental_parent,
-            "vm_detail": vm_detail,
-            "export_files": export_files,
-            "export_size_bytes": export_files_size,
-        }
-        try:
-            with open(
-                f"{backup_suffixed_path}/{vm_name}/{snapshot_name}/snapshot.json", "w"
-            ) as fh:
-                jdump(export_details, fh)
-        except Exception as e:
+        ret, e = write_backup_summary(success=True)
+        if not ret:
             error_message = (
                 f"[{vm_name}] Failed to export configuration snapshot: {e}",
             )
-            log_err(celery, error_message)
-            send_execution_failure_report(
-                (celery, current_stage, total_stages),
-                config,
-                recipients=email_recipients,
-                error=error_message,
-            )
-            fail(celery, error_message)
-            return False
+            write_backup_summary(message=error_message)
+            current_stage += 5
+            if not this_backup_retain_snapshot:
+                current_stage += len(snap_list)
+            update(
+                celery,
+                error_message,
+                current=current_stage,
+                total=total_stages,
+            )
+            continue


         # Clean up the snapshot (vm.vm_worker_remove_snapshot)
         if not this_backup_retain_snapshot:
@@ -601,15 +617,19 @@ def worker_cluster_autobackup(
                 ret, msg = ceph.remove_snapshot(zkhandler, pool, volume, name)
                 if not ret:
                     error_message = msg.replace("ERROR: ", "")
-                    log_err(celery, error_message)
-                    send_execution_failure_report(
-                        (celery, current_stage, total_stages),
-                        config,
-                        recipients=email_recipients,
-                        error=error_message,
-                    )
-                    fail(celery, error_message)
-                    return False
+                    write_backup_summary(message=error_message)
+                    failure = True
+                    break
+
+            if failure:
+                current_stage += 4
+                update(
+                    celery,
+                    f"[{vm_name}] Error in snapshot export, skipping",
+                    current=current_stage,
+                    total=total_stages,
+                )
+                continue

             current_stage += 1
             update(
@@ -624,17 +644,9 @@ def worker_cluster_autobackup(
             )
             if not ret:
                 error_message = (
-                    f"[{vm_name}] Failed to remove snapshot from Zookeeper",
+                    f"[{vm_name}] Failed to remove VM snapshot; continuing",
                 )
                 log_err(celery, error_message)
-                send_execution_failure_report(
-                    (celery, current_stage, total_stages),
-                    config,
-                    recipients=email_recipients,
-                    error=error_message,
-                )
-                fail(celery, error_message)
-                return False

             current_stage += 1
             update(