Restore previous autobackup continue behaviour

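Under the original autobackup system, a failure while backing up one VM
did not abort the whole run, so the backups of the remaining VMs could
still complete. Restore that behaviour: on a per-VM failure, record the
error in that VM's backup summary and continue with the next VM.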
This commit is contained in:
Joshua Boniface 2024-08-25 17:04:43 -04:00
parent 8fa6bed736
commit 4ef5fbdbe8
1 changed file with 72 additions and 60 deletions

View File

@ -118,7 +118,7 @@ def send_execution_summary_report(
email.append("") email.append("")
email.append( email.append(
f"A PVC autobackup has been completed at {current_datetime} in {total_time}s." f"A PVC autobackup has been completed at {current_datetime} in {total_time}."
) )
email.append("") email.append("")
email.append( email.append(
@ -462,6 +462,33 @@ def worker_cluster_autobackup(
else: else:
export_fileext = "rbdimg" export_fileext = "rbdimg"
failure = False
export_files = None
export_files_size = 0
def write_backup_summary(success=False, message=""):
    """Write the summary of the current backup to its ``snapshot.json``.

    The summary combines the two arguments with values read from the
    enclosing scope at call time (export type, datestring, snapshot name,
    incremental parent, VM detail, exported files and their total size).

    Args:
        success: Stored under the ``result`` key of the summary.
        message: Stored under the ``message`` key of the summary.

    Returns:
        ``(True, "")`` when the file was written, otherwise
        ``(False, exc)`` where ``exc`` is the exception that was caught.
        The failure is also reported through ``log_err``.
    """
    summary = {
        "type": export_type,
        "result": success,
        "message": message,
        "datestring": datestring,
        "snapshot_name": snapshot_name,
        "incremental_parent": this_backup_incremental_parent,
        "vm_detail": vm_detail,
        "export_files": export_files,
        "export_size_bytes": export_files_size,
    }

    try:
        # The path is built inside the ``try`` so that any error while
        # formatting it is handled the same way as a write error.
        summary_path = (
            f"{backup_suffixed_path}/{vm_name}/{snapshot_name}/snapshot.json"
        )
        with open(summary_path, "w") as summary_fh:
            jdump(summary, summary_fh)
    except Exception as e:
        # Best effort: report the problem and let the caller decide.
        log_err(celery, f"Error exporting snapshot details: {e}")
        return False, e
    else:
        return True, ""
snapshot_volumes = list() snapshot_volumes = list()
for rbdsnap in snap_list: for rbdsnap in snap_list:
pool, _volume = rbdsnap.split("/") pool, _volume = rbdsnap.split("/")
@ -496,15 +523,9 @@ def worker_cluster_autobackup(
error_message = ( error_message = (
f"[{vm_name}] Failed to export snapshot for volume(s) '{snap_pool}/{snap_volume}'", f"[{vm_name}] Failed to export snapshot for volume(s) '{snap_pool}/{snap_volume}'",
) )
log_err(celery, error_message) write_backup_summary(message=error_message)
send_execution_failure_report( failure = True
(celery, current_stage, total_stages), break
config,
recipients=email_recipients,
error=error_message,
)
fail(celery, error_message)
return False
else: else:
export_files.append( export_files.append(
( (
@ -520,15 +541,9 @@ def worker_cluster_autobackup(
error_message = ( error_message = (
f"[{vm_name}] Failed to export snapshot for volume(s) '{snap_pool}/{snap_volume}'", f"[{vm_name}] Failed to export snapshot for volume(s) '{snap_pool}/{snap_volume}'",
) )
log_err(celery, error_message) write_backup_summary(message=error_message)
send_execution_failure_report( failure = True
(celery, current_stage, total_stages), break
config,
recipients=email_recipients,
error=error_message,
)
fail(celery, error_message)
return False
else: else:
export_files.append( export_files.append(
( (
@ -537,6 +552,18 @@ def worker_cluster_autobackup(
) )
) )
if failure:
current_stage += 6
if not this_backup_retain_snapshot:
current_stage += len(snap_list)
update(
celery,
f"[{vm_name}] Error in snapshot export, skipping",
current=current_stage,
total=total_stages,
)
continue
current_stage += 1 current_stage += 1
update( update(
celery, celery,
@ -557,33 +584,22 @@ def worker_cluster_autobackup(
export_files_size = get_dir_size(export_target_path) export_files_size = get_dir_size(export_target_path)
export_details = { ret, e = write_backup_summary(success=True)
"type": export_type, if not ret:
"datestring": datestring,
"snapshot_name": snapshot_name,
"incremental_parent": this_backup_incremental_parent,
"vm_detail": vm_detail,
"export_files": export_files,
"export_size_bytes": export_files_size,
}
try:
with open(
f"{backup_suffixed_path}/{vm_name}/{snapshot_name}/snapshot.json", "w"
) as fh:
jdump(export_details, fh)
except Exception as e:
error_message = ( error_message = (
f"[{vm_name}] Failed to export configuration snapshot: {e}", f"[{vm_name}] Failed to export configuration snapshot: {e}",
) )
log_err(celery, error_message) write_backup_summary(message=error_message)
send_execution_failure_report( current_stage += 5
(celery, current_stage, total_stages), if not this_backup_retain_snapshot:
config, current_stage += len(snap_list)
recipients=email_recipients, update(
error=error_message, celery,
error_message,
current=current_stage,
total=total_stages,
) )
fail(celery, error_message) continue
return False
# Clean up the snapshot (vm.vm_worker_remove_snapshot) # Clean up the snapshot (vm.vm_worker_remove_snapshot)
if not this_backup_retain_snapshot: if not this_backup_retain_snapshot:
@ -601,15 +617,19 @@ def worker_cluster_autobackup(
ret, msg = ceph.remove_snapshot(zkhandler, pool, volume, name) ret, msg = ceph.remove_snapshot(zkhandler, pool, volume, name)
if not ret: if not ret:
error_message = msg.replace("ERROR: ", "") error_message = msg.replace("ERROR: ", "")
log_err(celery, error_message) write_backup_summary(message=error_message)
send_execution_failure_report( failure = True
(celery, current_stage, total_stages), break
config,
recipients=email_recipients, if failure:
error=error_message, current_stage += 4
update(
celery,
f"[{vm_name}] Error in snapshot export, skipping",
current=current_stage,
total=total_stages,
) )
fail(celery, error_message) continue
return False
current_stage += 1 current_stage += 1
update( update(
@ -624,17 +644,9 @@ def worker_cluster_autobackup(
) )
if not ret: if not ret:
error_message = ( error_message = (
f"[{vm_name}] Failed to remove snapshot from Zookeeper", f"[{vm_name}] Failed to remove VM snapshot; continuing",
) )
log_err(celery, error_message) log_err(celery, error_message)
send_execution_failure_report(
(celery, current_stage, total_stages),
config,
recipients=email_recipients,
error=error_message,
)
fail(celery, error_message)
return False
current_stage += 1 current_stage += 1
update( update(