Add backup reporting and improve metrics

Major improvements to autobackup and backups, including additional
information/fields in the backup JSON itself, improved error handling,
and the ability to email reports of autobackups using a local sendmail
utility.
This commit is contained in:
Joshua Boniface 2024-01-10 14:18:44 -05:00
parent 8d74ee7273
commit 362edeed8c
3 changed files with 261 additions and 96 deletions

View File

@ -1895,6 +1895,12 @@ def cli_vm_backup_remove(domain, backup_datestring, backup_path):
show_default=True, show_default=True,
help="Override default config file location.", help="Override default config file location.",
) )
@click.option(
"--email-report",
"email_report",
default=None,
help="Email a backup summary report to the specified address(es), comma-separated.",
)
@click.option( @click.option(
"--force-full", "--force-full",
"force_full_flag", "force_full_flag",
@ -1909,7 +1915,7 @@ def cli_vm_backup_remove(domain, backup_datestring, backup_path):
is_flag=True, is_flag=True,
help="Cron mode; don't error exit if this isn't the primary coordinator.", help="Cron mode; don't error exit if this isn't the primary coordinator.",
) )
def cli_vm_autobackup(autobackup_cfgfile, force_full_flag, cron_flag): def cli_vm_autobackup(autobackup_cfgfile, email_report, force_full_flag, cron_flag):
""" """
Perform automated backups of VMs, with integrated cleanup and full/incremental scheduling. Perform automated backups of VMs, with integrated cleanup and full/incremental scheduling.
@ -1936,12 +1942,17 @@ def cli_vm_autobackup(autobackup_cfgfile, force_full_flag, cron_flag):
configuration file path if required by a particular run. For full details of the possible options, please configuration file path if required by a particular run. For full details of the possible options, please
see the example configuration file at "/usr/share/pvc/autobackup.sample.yaml". see the example configuration file at "/usr/share/pvc/autobackup.sample.yaml".
An optional report on all current backups can be emailed to one or more email addresses using the
"--email-report" flag. This report will include information on all current known backups.
The "--force-full" option can be used to force all configured VMs to perform a "full" level backup this run, The "--force-full" option can be used to force all configured VMs to perform a "full" level backup this run,
which can help synchronize the backups of existing VMs with new ones. which can help synchronize the backups of existing VMs with new ones.
""" """
# All work here is done in the helper function for portability; we don't even use "finish" # All work here is done in the helper function for portability; we don't even use "finish"
vm_autobackup(CLI_CONFIG, autobackup_cfgfile, force_full_flag, cron_flag) vm_autobackup(
CLI_CONFIG, autobackup_cfgfile, email_report, force_full_flag, cron_flag
)
############################################################################### ###############################################################################

View File

@ -26,7 +26,7 @@ from distutils.util import strtobool
from getpass import getuser from getpass import getuser
from json import load as jload from json import load as jload
from json import dump as jdump from json import dump as jdump
from os import chmod, environ, getpid, path, makedirs, get_terminal_size from os import chmod, environ, getpid, path, popen, makedirs, get_terminal_size
from re import findall from re import findall
from socket import gethostname from socket import gethostname
from subprocess import run, PIPE from subprocess import run, PIPE
@ -38,6 +38,7 @@ from yaml import SafeLoader
import pvc.lib.provisioner import pvc.lib.provisioner
import pvc.lib.vm import pvc.lib.vm
import pvc.lib.node import pvc.lib.node
import pvc.lib.storage
DEFAULT_STORE_DATA = {"cfgfile": "/etc/pvc/pvc.conf"} DEFAULT_STORE_DATA = {"cfgfile": "/etc/pvc/pvc.conf"}
@ -201,8 +202,8 @@ def get_autobackup_config(CLI_CONFIG, cfgfile):
try: try:
config = dict() config = dict()
with open(cfgfile) as fh: with open(cfgfile) as fh:
backup_config = yload(fh, Loader=SafeLoader)["autobackup"] full_config = yload(fh, Loader=SafeLoader)
backup_config = full_config["autobackup"]
config["backup_root_path"] = backup_config["backup_root_path"] config["backup_root_path"] = backup_config["backup_root_path"]
config["backup_root_suffix"] = backup_config["backup_root_suffix"] config["backup_root_suffix"] = backup_config["backup_root_suffix"]
config["backup_tags"] = backup_config["backup_tags"] config["backup_tags"] = backup_config["backup_tags"]
@ -226,13 +227,10 @@ def get_autobackup_config(CLI_CONFIG, cfgfile):
backup_root_path=backup_config["backup_root_path"] backup_root_path=backup_config["backup_root_path"]
) )
config["unmount_cmds"].append(_unmount_cmd) config["unmount_cmds"].append(_unmount_cmd)
except FileNotFoundError: except FileNotFoundError:
echo(CLI_CONFIG, "ERROR: Specified backup configuration does not exist!") return "Backup configuration does not exist!"
exit(1)
except KeyError as e: except KeyError as e:
echo(CLI_CONFIG, f"ERROR: Backup configuration is invalid: {e}") return f"Backup configuration is invalid: {e}"
exit(1)
return config return config
@ -240,6 +238,7 @@ def get_autobackup_config(CLI_CONFIG, cfgfile):
def vm_autobackup( def vm_autobackup(
CLI_CONFIG, CLI_CONFIG,
autobackup_cfgfile=DEFAULT_AUTOBACKUP_FILENAME, autobackup_cfgfile=DEFAULT_AUTOBACKUP_FILENAME,
email_report=None,
force_full_flag=False, force_full_flag=False,
cron_flag=False, cron_flag=False,
): ):
@ -247,6 +246,48 @@ def vm_autobackup(
Perform automatic backups of VMs based on an external config file. Perform automatic backups of VMs based on an external config file.
""" """
if email_report is not None:
from email.utils import formatdate
from socket import gethostname
try:
with open(autobackup_cfgfile) as fh:
tmp_config = yload(fh, Loader=SafeLoader)
cluster = tmp_config["cluster"]["name"]
except Exception:
cluster = "unknown"
def send_execution_failure_report(error=None):
    """
    Email a notice that this autobackup run aborted with an execution error.

    The message is addressed to every comma-separated address in the
    enclosing scope's "email_report" value and is piped to the local
    "/usr/sbin/sendmail -t" command. Sending is best-effort: a failure is
    echoed to the CLI and never raised.

    error: the reason for the failure; it is included verbatim in the
    message body.
    """
    echo(CLI_CONFIG, f"Sending email failure report to {email_report}")

    current_datetime = datetime.now()
    # timestamp() replaces strftime("%s"): "%s" is a platform-specific
    # extension that Python does not document, while timestamp() yields the
    # same local-time epoch value portably.
    email_datetime = formatdate(current_datetime.timestamp())

    # Tolerate "a@x, b@y" style lists and stray commas; an unstripped entry
    # would otherwise produce an invalid "< b@y>" address.
    recipients = [
        f"<{recipient.strip()}>"
        for recipient in email_report.split(",")
        if recipient.strip()
    ]

    email = [
        f"Date: {email_datetime}",
        f"Subject: PVC Autobackup execution failure for cluster {cluster}",
        f"To: {', '.join(recipients)}",
        f"From: PVC Autobackup System <pvc@{gethostname()}>",
        "",
        f"A PVC autobackup has FAILED at {current_datetime} due to an execution error.",
        "",
        "The reported error message is:",
        f" {error}",
    ]

    try:
        p = popen("/usr/sbin/sendmail -t", "w")
        p.write("\n".join(email))
        # popen() does not raise when the command is missing or fails; the
        # only signal is a non-None status returned by close().
        exit_status = p.close()
    except Exception as e:
        echo(CLI_CONFIG, f"Failed to send report email: {e}")
    else:
        if exit_status is not None:
            echo(
                CLI_CONFIG,
                f"Failed to send report email: sendmail exited with status {exit_status}",
            )
# Validate that we are running on the current primary coordinator of the 'local' cluster connection # Validate that we are running on the current primary coordinator of the 'local' cluster connection
real_connection = CLI_CONFIG["connection"] real_connection = CLI_CONFIG["connection"]
CLI_CONFIG["connection"] = "local" CLI_CONFIG["connection"] = "local"
@ -267,6 +308,10 @@ def vm_autobackup(
CLI_CONFIG, CLI_CONFIG,
"Autobackup MUST be run from the cluster active primary coordinator using the 'local' connection. See '-h'/'--help' for details.", "Autobackup MUST be run from the cluster active primary coordinator using the 'local' connection. See '-h'/'--help' for details.",
) )
if email_report is not None:
send_execution_failure_report(
error=f"Autobackup run attempted from non-local connection or non-primary coordinator; got connection '{real_connection}', host '{DEFAULT_NODE_HOSTNAME}'."
)
exit(1) exit(1)
# Ensure we're running as root, or show a warning & confirmation # Ensure we're running as root, or show a warning & confirmation
@ -279,6 +324,14 @@ def vm_autobackup(
# Load our YAML config # Load our YAML config
autobackup_config = get_autobackup_config(CLI_CONFIG, autobackup_cfgfile) autobackup_config = get_autobackup_config(CLI_CONFIG, autobackup_cfgfile)
if not isinstance(autobackup_config, dict):
echo(CLI_CONFIG, f"ERROR: {autobackup_config}")
if email_report is not None:
send_execution_failure_report(error=f"{autobackup_config}")
exit(1)
# Get the start time of this run
autobackup_start_time = datetime.now()
# Get a list of all VMs on the cluster # Get a list of all VMs on the cluster
# We don't do tag filtering here, because we could match an arbitrary number of tags; instead, we # We don't do tag filtering here, because we could match an arbitrary number of tags; instead, we
@ -286,6 +339,8 @@ def vm_autobackup(
retcode, retdata = pvc.lib.vm.vm_list(CLI_CONFIG, None, None, None, None, None) retcode, retdata = pvc.lib.vm.vm_list(CLI_CONFIG, None, None, None, None, None)
if not retcode: if not retcode:
echo(CLI_CONFIG, f"ERROR: Failed to fetch VM list: {retdata}") echo(CLI_CONFIG, f"ERROR: Failed to fetch VM list: {retdata}")
if email_report is not None:
send_execution_failure_report(error=f"Failed to fetch VM list: {retdata}")
exit(1) exit(1)
cluster_vms = retdata cluster_vms = retdata
@ -354,6 +409,8 @@ def vm_autobackup(
CLI_CONFIG, CLI_CONFIG,
f"Exiting; command reports: {ret.stderr.decode().strip()}", f"Exiting; command reports: {ret.stderr.decode().strip()}",
) )
if email_report is not None:
send_execution_failure_report(error=ret.stderr.decode().strip())
exit(1) exit(1)
else: else:
echo(CLI_CONFIG, f"done. [{ttot.seconds}s]") echo(CLI_CONFIG, f"done. [{ttot.seconds}s]")
@ -417,27 +474,26 @@ def vm_autobackup(
tend = datetime.now() tend = datetime.now()
ttot = tend - tstart ttot = tend - tstart
if not retcode: if not retcode:
backup_datestring = findall(r"[0-9]{14}", retdata)[0]
echo(CLI_CONFIG, f"failed. [{ttot.seconds}s]") echo(CLI_CONFIG, f"failed. [{ttot.seconds}s]")
echo(CLI_CONFIG, f"Skipping cleanups; command reports: {retdata}") echo(
continue CLI_CONFIG,
retdata.strip().replace(f"ERROR in backup {backup_datestring}: ", ""),
)
skip_cleanup = True
else: else:
backup_datestring = findall(r"[0-9]{14}", retdata)[0] backup_datestring = findall(r"[0-9]{14}", retdata)[0]
echo( echo(
CLI_CONFIG, CLI_CONFIG,
f"done. Backup '{backup_datestring}' created. [{ttot.seconds}s]", f"done. Backup '{backup_datestring}' created. [{ttot.seconds}s]",
) )
skip_cleanup = False
# Read backup file to get details # Read backup file to get details
backup_json_file = f"{backup_path}/{backup_datestring}/pvcbackup.json" backup_json_file = f"{backup_path}/{backup_datestring}/pvcbackup.json"
with open(backup_json_file) as fh: with open(backup_json_file) as fh:
backup_json = jload(fh) backup_json = jload(fh)
backup = { tracked_backups.insert(0, backup_json)
"datestring": backup_json["datestring"],
"type": backup_json["type"],
"parent": backup_json["incremental_parent"],
"retained_snapshot": backup_json["retained_snapshot"],
}
tracked_backups.insert(0, backup)
# Delete any full backups that are expired # Delete any full backups that are expired
marked_for_deletion = list() marked_for_deletion = list()
@ -450,37 +506,47 @@ def vm_autobackup(
# Delete any incremental backups that depend on marked parents # Delete any incremental backups that depend on marked parents
for backup in tracked_backups: for backup in tracked_backups:
if backup["type"] == "incremental" and backup["parent"] in [ if backup["type"] == "incremental" and backup["incremental_parent"] in [
b["datestring"] for b in marked_for_deletion b["datestring"] for b in marked_for_deletion
]: ]:
marked_for_deletion.append(backup) marked_for_deletion.append(backup)
# Execute deletes if len(marked_for_deletion) > 0:
for backup_to_delete in marked_for_deletion: if skip_cleanup:
echo(
CLI_CONFIG,
f"Removing old VM '{vm}' backup '{backup_to_delete['datestring']}' ({backup_to_delete['type']})... ",
newline=False,
)
tstart = datetime.now()
retcode, retdata = pvc.lib.vm.vm_remove_backup(
CLI_CONFIG,
vm,
backup_suffixed_path,
backup_to_delete["datestring"],
)
tend = datetime.now()
ttot = tend - tstart
if not retcode:
echo(CLI_CONFIG, f"failed. [{ttot.seconds}s]")
echo( echo(
CLI_CONFIG, CLI_CONFIG,
f"Skipping removal from tracked backups; command reports: {retdata}", f"Skipping cleanups for {len(marked_for_deletion)} aged-out backups due to backup failure.",
) )
continue
else: else:
tracked_backups.remove(backup_to_delete) echo(
echo(CLI_CONFIG, f"done. [{ttot.seconds}s]") CLI_CONFIG,
f"Running cleanups for {len(marked_for_deletion)} aged-out backups...",
)
# Execute deletes
for backup_to_delete in marked_for_deletion:
echo(
CLI_CONFIG,
f"Removing old VM '{vm}' backup '{backup_to_delete['datestring']}' ({backup_to_delete['type']})... ",
newline=False,
)
tstart = datetime.now()
retcode, retdata = pvc.lib.vm.vm_remove_backup(
CLI_CONFIG,
vm,
backup_suffixed_path,
backup_to_delete["datestring"],
)
tend = datetime.now()
ttot = tend - tstart
if not retcode:
echo(CLI_CONFIG, f"failed. [{ttot.seconds}s]")
echo(
CLI_CONFIG,
f"Skipping removal from tracked backups; command reports: {retdata}",
)
else:
tracked_backups.remove(backup_to_delete)
echo(CLI_CONFIG, f"done. [{ttot.seconds}s]")
# Update tracked state information # Update tracked state information
state_data["tracked_backups"] = tracked_backups state_data["tracked_backups"] = tracked_backups
@ -514,3 +580,78 @@ def vm_autobackup(
) )
else: else:
echo(CLI_CONFIG, f"done. [{ttot.seconds}s]") echo(CLI_CONFIG, f"done. [{ttot.seconds}s]")
autobackup_end_time = datetime.now()
autobackup_total_time = autobackup_end_time - autobackup_start_time
# Handle report emailing
if email_report is not None:
echo(CLI_CONFIG, "")
echo(CLI_CONFIG, f"Sending email summary report to {email_report}")
backup_summary = dict()
for vm in backup_vms:
backup_path = f"{backup_suffixed_path}/{vm}"
autobackup_state_file = f"{backup_path}/.autobackup.json"
if not path.exists(backup_path) or not path.exists(autobackup_state_file):
# There are no new backups so the list is empty
state_data = dict()
tracked_backups = list()
else:
with open(autobackup_state_file) as fh:
state_data = jload(fh)
tracked_backups = state_data["tracked_backups"]
backup_summary[vm] = tracked_backups
current_datetime = datetime.now()
email_datetime = formatdate(float(current_datetime.strftime("%s")))
email = list()
email.append(f"Date: {email_datetime}")
email.append(f"Subject: PVC Autobackup report for cluster {cluster}")
recipients = list()
for recipient in email_report.split(","):
recipients.append(f"<{recipient}>")
email.append(f"To: {', '.join(recipients)}")
email.append(f"From: PVC Autobackup System <pvc@{gethostname()}>")
email.append("")
email.append(
f"A PVC autobackup has been completed at {current_datetime} in {autobackup_total_time}."
)
email.append("")
email.append(
"The following is a summary of all current VM backups after cleanups, most recent first:"
)
email.append("")
for vm in backup_vms:
email.append(f"VM {vm}:")
for backup in backup_summary[vm]:
datestring = backup.get("datestring")
backup_date = datetime.strptime(datestring, "%Y%m%d%H%M%S")
if backup.get("result", False):
email.append(
f" {backup_date}: Success in {backup.get('runtime_secs', 0)} seconds, ID {datestring}, type {backup.get('type', 'unknown')}"
)
email.append(
f" Backup contains {len(backup.get('backup_files'))} files totaling {pvc.lib.storage.format_bytes_tohuman(backup.get('backup_size_bytes', 0))} ({backup.get('backup_size_bytes', 0)} bytes)"
)
else:
email.append(
f" {backup_date}: Failure in {backup.get('runtime_secs', 0)} seconds, ID {datestring}, type {backup.get('type', 'unknown')}"
)
email.append(
f" {backup.get('result_message')}"
)
try:
p = popen("/usr/sbin/sendmail -t", "w")
p.write("\n".join(email))
p.close()
except Exception as e:
echo(CLI_CONFIG, f"Failed to send report email: {e}")
echo(CLI_CONFIG, "")
echo(CLI_CONFIG, f"Autobackup completed in {autobackup_total_time}.")

View File

@ -32,6 +32,7 @@ from json import dump as jdump
from json import load as jload from json import load as jload
from json import loads as jloads from json import loads as jloads
from libvirt import open as lvopen from libvirt import open as lvopen
from os import scandir
from shutil import rmtree from shutil import rmtree
from socket import gethostname from socket import gethostname
from uuid import UUID from uuid import UUID
@ -1183,12 +1184,15 @@ def backup_vm(
if not re.match(r"^/", backup_path): if not re.match(r"^/", backup_path):
return ( return (
False, False,
f"ERROR: Target path {backup_path} is not a valid absolute path on the primary coordinator!", f"ERROR in backup {datestring}: Target path {backup_path} is not a valid absolute path on the primary coordinator!",
) )
# Ensure that backup_path (on this node) exists # Ensure that backup_path (on this node) exists
if not os.path.isdir(backup_path): if not os.path.isdir(backup_path):
return False, f"ERROR: Target path {backup_path} does not exist!" return (
False,
f"ERROR in backup {datestring}: Target path {backup_path} does not exist!",
)
# 1a. Create destination directory # 1a. Create destination directory
vm_target_root = f"{backup_path}/{domain}" vm_target_root = f"{backup_path}/{domain}"
@ -1197,7 +1201,10 @@ def backup_vm(
try: try:
os.makedirs(vm_target_backup) os.makedirs(vm_target_backup)
except Exception as e: except Exception as e:
return False, f"ERROR: Failed to create backup directory: {e}" return (
False,
f"ERROR in backup {datestring}: Failed to create backup directory: {e}",
)
tstart = time.time() tstart = time.time()
backup_type = "incremental" if incremental_parent is not None else "full" backup_type = "incremental" if incremental_parent is not None else "full"
@ -1222,7 +1229,7 @@ def backup_vm(
"retained_snapshot": retain_snapshot, "retained_snapshot": retain_snapshot,
"result": result, "result": result,
"result_message": result_message, "result_message": result_message,
"runtime_secs": ttot.seconds, "runtime_secs": ttot,
"vm_detail": vm_detail, "vm_detail": vm_detail,
"backup_files": backup_files, "backup_files": backup_files,
"backup_size_bytes": backup_files_size, "backup_size_bytes": backup_files_size,
@ -1233,28 +1240,26 @@ def backup_vm(
# 2. Validations part 2 # 2. Validations part 2
# Disallow retaining snapshots with an incremental parent # Disallow retaining snapshots with an incremental parent
if incremental_parent is not None and retain_snapshot: if incremental_parent is not None and retain_snapshot:
result_message = ( error_message = "Retaining snapshots of incremental backups is not supported!"
"ERROR: Retaining snapshots of incremental backups is not supported!" write_pvcbackup_json(result=False, result_message=f"ERROR: {error_message}")
)
write_pvcbackup_json(result=False, result_message=result_message)
return ( return (
False, False,
result_message, f"ERROR in backup {datestring}: {error_message}",
) )
# Validate that VM exists in cluster # Validate that VM exists in cluster
dom_uuid = getDomainUUID(zkhandler, domain) dom_uuid = getDomainUUID(zkhandler, domain)
if not dom_uuid: if not dom_uuid:
result_message = f'ERROR: Could not find VM "{domain}" in the cluster!' error_message = f'Could not find VM "{domain}" in the cluster!'
write_pvcbackup_json(result=False, result_message=result_message) write_pvcbackup_json(result=False, result_message=f"ERROR: {error_message}")
return False, result_message return False, f"ERROR in backup {datestring}: {error_message}"
# 3. Get information about VM # 3. Get information about VM
vm_detail = get_list(zkhandler, limit=dom_uuid, is_fuzzy=False)[1][0] vm_detail = get_list(zkhandler, limit=dom_uuid, is_fuzzy=False)[1][0]
if not isinstance(vm_detail, dict): if not isinstance(vm_detail, dict):
result_message = f"ERROR: VM listing returned invalid data: {vm_detail}" error_message = f"VM listing returned invalid data: {vm_detail}"
write_pvcbackup_json(result=False, result_message=result_message) write_pvcbackup_json(result=False, result_message=f"ERROR: {error_message}")
return False, result_message return False, f"ERROR in backup {datestring}: {error_message}"
vm_volumes = list() vm_volumes = list()
for disk in vm_detail["disks"]: for disk in vm_detail["disks"]:
@ -1270,39 +1275,47 @@ def backup_vm(
elif len(retdata) > 1: elif len(retdata) > 1:
retdata = "Multiple volumes returned." retdata = "Multiple volumes returned."
result_message = ( error_message = (
f"ERROR: Failed to get volume details for {pool}/{volume}: {retdata}" f"Failed to get volume details for {pool}/{volume}: {retdata}"
) )
write_pvcbackup_json( write_pvcbackup_json(
result=False, result_message=result_message, vm_detail=vm_detail result=False,
result_message=f"ERROR: {error_message}",
vm_detail=vm_detail,
) )
return ( return (
False, False,
result_message, f"ERROR in backup {datestring}: {error_message}",
) )
try: try:
size = retdata[0]["stats"]["size"] size = retdata[0]["stats"]["size"]
except Exception as e: except Exception as e:
return False, f"ERROR: Failed to get volume size for {pool}/{volume}: {e}" error_message = f"Failed to get volume size for {pool}/{volume}: {e}"
write_pvcbackup_json(
result=False,
result_message=f"ERROR: {error_message}",
vm_detail=vm_detail,
)
return (
False,
f"ERROR in backup {datestring}: {error_message}",
)
vm_volumes.append((pool, volume, size)) vm_volumes.append((pool, volume, size))
# 4a. Validate that all volumes exist (they should, but just in case) # 4a. Validate that all volumes exist (they should, but just in case)
for pool, volume, _ in vm_volumes: for pool, volume, _ in vm_volumes:
if not ceph.verifyVolume(zkhandler, pool, volume): if not ceph.verifyVolume(zkhandler, pool, volume):
result_message = ( error_message = f"VM defines a volume {pool}/{volume} which does not exist!"
f"ERROR: VM defines a volume {pool}/{volume} which does not exist!"
)
write_pvcbackup_json( write_pvcbackup_json(
result=False, result=False,
result_message=result_message, result_message=f"ERROR: {error_message}",
vm_detail=vm_detail, vm_detail=vm_detail,
vm_volumes=vm_volumes,
) )
return ( return (
False, False,
result_message, f"ERROR in backup {datestring}: {error_message}",
) )
# 4b. Validate that, if an incremental_parent is given, it is valid # 4b. Validate that, if an incremental_parent is given, it is valid
@ -1312,16 +1325,15 @@ def backup_vm(
if not ceph.verifySnapshot( if not ceph.verifySnapshot(
zkhandler, pool, volume, f"backup_{incremental_parent}" zkhandler, pool, volume, f"backup_{incremental_parent}"
): ):
result_message = f"ERROR: Incremental parent {incremental_parent} given, but no snapshots were found; cannot export an incremental backup." error_message = f"Incremental parent {incremental_parent} given, but no snapshots were found; cannot export an incremental backup."
write_pvcbackup_json( write_pvcbackup_json(
result=False, result=False,
result_message=result_message, result_message=f"ERROR: {error_message}",
vm_detail=vm_detail, vm_detail=vm_detail,
vm_volumes=vm_volumes,
) )
return ( return (
False, False,
result_message, f"ERROR in backup {datestring}: {error_message}",
) )
export_fileext = "rbddiff" export_fileext = "rbddiff"
@ -1334,35 +1346,31 @@ def backup_vm(
# 5. Take snapshot of each disk with the name @backup_{datestring} # 5. Take snapshot of each disk with the name @backup_{datestring}
is_snapshot_create_failed = False is_snapshot_create_failed = False
which_snapshot_create_failed = list() which_snapshot_create_failed = list()
msg_snapshot_create_failed = list()
for pool, volume, _ in vm_volumes: for pool, volume, _ in vm_volumes:
retcode, retmsg = ceph.add_snapshot(zkhandler, pool, volume, snapshot_name) retcode, retmsg = ceph.add_snapshot(zkhandler, pool, volume, snapshot_name)
if not retcode: if not retcode:
is_snapshot_create_failed = True is_snapshot_create_failed = True
which_snapshot_create_failed.append(f"{pool}/{volume}") which_snapshot_create_failed.append(f"{pool}/{volume}")
msg_snapshot_create_failed.append(retmsg)
if is_snapshot_create_failed: if is_snapshot_create_failed:
for pool, volume, _ in vm_volumes: for pool, volume, _ in vm_volumes:
if ceph.verifySnapshot(zkhandler, pool, volume, snapshot_name): if ceph.verifySnapshot(zkhandler, pool, volume, snapshot_name):
ceph.remove_snapshot(zkhandler, pool, volume, snapshot_name) ceph.remove_snapshot(zkhandler, pool, volume, snapshot_name)
result_message = f'ERROR: Failed to create snapshot for volume(s) {", ".join(which_snapshot_create_failed)}: {", ".join(msg_snapshot_create_failed)}' error_message = f'Failed to create snapshot for volume(s) {", ".join(which_snapshot_create_failed)}'
write_pvcbackup_json( write_pvcbackup_json(
result=False, result=False,
result_message=result_message, result_message=f"ERROR: {error_message}",
vm_detail=vm_detail, vm_detail=vm_detail,
vm_volumes=vm_volumes,
) )
return ( return (
False, False,
result_message, f"ERROR in backup {datestring}: {error_message}",
) )
# 6. Dump snapshot to folder with `rbd export` (full) or `rbd export-diff` (incremental) # 6. Dump snapshot to folder with `rbd export` (full) or `rbd export-diff` (incremental)
is_snapshot_export_failed = False is_snapshot_export_failed = False
which_snapshot_export_failed = list() which_snapshot_export_failed = list()
msg_snapshot_export_failed = list()
backup_files = list() backup_files = list()
for pool, volume, size in vm_volumes: for pool, volume, size in vm_volumes:
if incremental_parent is not None: if incremental_parent is not None:
@ -1373,7 +1381,6 @@ def backup_vm(
if retcode: if retcode:
is_snapshot_export_failed = True is_snapshot_export_failed = True
which_snapshot_export_failed.append(f"{pool}/{volume}") which_snapshot_export_failed.append(f"{pool}/{volume}")
msg_snapshot_export_failed.append(stderr)
else: else:
backup_files.append( backup_files.append(
(f"pvcdisks/{pool}.{volume}.{export_fileext}", size) (f"pvcdisks/{pool}.{volume}.{export_fileext}", size)
@ -1385,32 +1392,44 @@ def backup_vm(
if retcode: if retcode:
is_snapshot_export_failed = True is_snapshot_export_failed = True
which_snapshot_export_failed.append(f"{pool}/{volume}") which_snapshot_export_failed.append(f"{pool}/{volume}")
msg_snapshot_export_failed.append(stderr) else:
backup_files.append(
(f"pvcdisks/{pool}.{volume}.{export_fileext}", size)
)
backup_files_size = os.path.getsize(vm_target_backup) def get_dir_size(path):
total = 0
with scandir(path) as it:
for entry in it:
if entry.is_file():
total += entry.stat().st_size
elif entry.is_dir():
total += get_dir_size(entry.path)
return total
backup_files_size = get_dir_size(vm_target_backup)
if is_snapshot_export_failed: if is_snapshot_export_failed:
for pool, volume, _ in vm_volumes: for pool, volume, _ in vm_volumes:
if ceph.verifySnapshot(zkhandler, pool, volume, snapshot_name): if ceph.verifySnapshot(zkhandler, pool, volume, snapshot_name):
ceph.remove_snapshot(zkhandler, pool, volume, snapshot_name) ceph.remove_snapshot(zkhandler, pool, volume, snapshot_name)
result_message = f'ERROR: Failed to export snapshot for volume(s) {", ".join(which_snapshot_export_failed)}: {", ".join(msg_snapshot_export_failed)}' error_message = f'Failed to export snapshot for volume(s) {", ".join(which_snapshot_export_failed)}'
write_pvcbackup_json( write_pvcbackup_json(
result=False, result=False,
result_message=result_message, result_message=f"ERROR: {error_message}",
vm_detail=vm_detail, vm_detail=vm_detail,
backup_files=backup_files, backup_files=backup_files,
backup_files_size=backup_files_size, backup_files_size=backup_files_size,
) )
return ( return (
False, False,
result_message, f"ERROR in backup {datestring}: {error_message}",
) )
# 8. Remove snapshots if retain_snapshot is False # 8. Remove snapshots if retain_snapshot is False
is_snapshot_remove_failed = False is_snapshot_remove_failed = False
which_snapshot_remove_failed = list() which_snapshot_remove_failed = list()
msg_snapshot_remove_failed = list()
if not retain_snapshot: if not retain_snapshot:
for pool, volume, _ in vm_volumes: for pool, volume, _ in vm_volumes:
if ceph.verifySnapshot(zkhandler, pool, volume, snapshot_name): if ceph.verifySnapshot(zkhandler, pool, volume, snapshot_name):
@ -1420,7 +1439,6 @@ def backup_vm(
if not retcode: if not retcode:
is_snapshot_remove_failed = True is_snapshot_remove_failed = True
which_snapshot_remove_failed.append(f"{pool}/{volume}") which_snapshot_remove_failed.append(f"{pool}/{volume}")
msg_snapshot_remove_failed.append(retmsg)
tend = time.time() tend = time.time()
ttot = round(tend - tstart, 2) ttot = round(tend - tstart, 2)
@ -1429,7 +1447,7 @@ def backup_vm(
if is_snapshot_remove_failed: if is_snapshot_remove_failed:
retlines.append( retlines.append(
f"WARNING: Failed to remove snapshot(s) as requested for volume(s) {', '.join(which_snapshot_remove_failed)}: {', '.join(msg_snapshot_remove_failed)}" f"WARNING: Failed to remove snapshot(s) as requested for volume(s) {', '.join(which_snapshot_remove_failed)}"
) )
myhostname = gethostname().split(".")[0] myhostname = gethostname().split(".")[0]
@ -1437,7 +1455,7 @@ def backup_vm(
result_message = f"Successfully backed up VM '{domain}' ({backup_type}@{datestring}, snapshots retained) to '{myhostname}:{backup_path}' in {ttot}s." result_message = f"Successfully backed up VM '{domain}' ({backup_type}@{datestring}, snapshots retained) to '{myhostname}:{backup_path}' in {ttot}s."
else: else:
result_message = f"Successfully backed up VM '{domain}' ({backup_type}@{datestring}) to '{myhostname}:{backup_path}' in {ttot}s." result_message = f"Successfully backed up VM '{domain}' ({backup_type}@{datestring}) to '{myhostname}:{backup_path}' in {ttot}s."
retlines.appendr(result_message) retlines.append(result_message)
write_pvcbackup_json( write_pvcbackup_json(
result=True, result=True,
@ -1495,7 +1513,6 @@ def remove_backup(zkhandler, domain, backup_path, datestring):
# 2. Remove snapshots # 2. Remove snapshots
is_snapshot_remove_failed = False is_snapshot_remove_failed = False
which_snapshot_remove_failed = list() which_snapshot_remove_failed = list()
msg_snapshot_remove_failed = list()
if backup_source_details["retained_snapshot"]: if backup_source_details["retained_snapshot"]:
for volume_file, _ in backup_source_details.get("backup_files"): for volume_file, _ in backup_source_details.get("backup_files"):
pool, volume, _ = volume_file.split("/")[-1].split(".") pool, volume, _ = volume_file.split("/")[-1].split(".")
@ -1504,7 +1521,6 @@ def remove_backup(zkhandler, domain, backup_path, datestring):
if not retcode: if not retcode:
is_snapshot_remove_failed = True is_snapshot_remove_failed = True
which_snapshot_remove_failed.append(f"{pool}/{volume}") which_snapshot_remove_failed.append(f"{pool}/{volume}")
msg_snapshot_remove_failed.append(retmsg)
# 3. Remove files # 3. Remove files
is_files_remove_failed = False is_files_remove_failed = False
@ -1521,7 +1537,7 @@ def remove_backup(zkhandler, domain, backup_path, datestring):
if is_snapshot_remove_failed: if is_snapshot_remove_failed:
retlines.append( retlines.append(
f"WARNING: Failed to remove snapshot(s) as requested for volume(s) {', '.join(which_snapshot_remove_failed)}: {', '.join(msg_snapshot_remove_failed)}" f"WARNING: Failed to remove snapshot(s) as requested for volume(s) {', '.join(which_snapshot_remove_failed)}"
) )
if is_files_remove_failed: if is_files_remove_failed:
@ -1620,7 +1636,6 @@ def restore_vm(zkhandler, domain, backup_path, datestring, retain_snapshot=False
# 4. Import volumes # 4. Import volumes
is_snapshot_remove_failed = False is_snapshot_remove_failed = False
which_snapshot_remove_failed = list() which_snapshot_remove_failed = list()
msg_snapshot_remove_failed = list()
if incremental_parent is not None: if incremental_parent is not None:
for volume_file, volume_size in backup_source_details.get("backup_files"): for volume_file, volume_size in backup_source_details.get("backup_files"):
pool, volume, _ = volume_file.split("/")[-1].split(".") pool, volume, _ = volume_file.split("/")[-1].split(".")
@ -1696,14 +1711,12 @@ def restore_vm(zkhandler, domain, backup_path, datestring, retain_snapshot=False
if retcode: if retcode:
is_snapshot_remove_failed = True is_snapshot_remove_failed = True
which_snapshot_remove_failed.append(f"{pool}/{volume}") which_snapshot_remove_failed.append(f"{pool}/{volume}")
msg_snapshot_remove_failed.append(retmsg)
retcode, stdout, stderr = common.run_os_command( retcode, stdout, stderr = common.run_os_command(
f"rbd snap rm {pool}/{volume}@backup_{datestring}" f"rbd snap rm {pool}/{volume}@backup_{datestring}"
) )
if retcode: if retcode:
is_snapshot_remove_failed = True is_snapshot_remove_failed = True
which_snapshot_remove_failed.append(f"{pool}/{volume}") which_snapshot_remove_failed.append(f"{pool}/{volume}")
msg_snapshot_remove_failed.append(retmsg)
else: else:
for volume_file, volume_size in backup_source_details.get("backup_files"): for volume_file, volume_size in backup_source_details.get("backup_files"):
@ -1772,7 +1785,7 @@ def restore_vm(zkhandler, domain, backup_path, datestring, retain_snapshot=False
if is_snapshot_remove_failed: if is_snapshot_remove_failed:
retlines.append( retlines.append(
f"WARNING: Failed to remove hanging snapshot(s) as requested for volume(s) {', '.join(which_snapshot_remove_failed)}: {', '.join(msg_snapshot_remove_failed)}" f"WARNING: Failed to remove hanging snapshot(s) as requested for volume(s) {', '.join(which_snapshot_remove_failed)}"
) )
myhostname = gethostname().split(".")[0] myhostname = gethostname().split(".")[0]