diff --git a/api-daemon/pvcapid/flaskapi.py b/api-daemon/pvcapid/flaskapi.py index c5a61643..75e75500 100755 --- a/api-daemon/pvcapid/flaskapi.py +++ b/api-daemon/pvcapid/flaskapi.py @@ -3515,6 +3515,70 @@ class API_VM_Snapshot_Import(Resource): api.add_resource(API_VM_Snapshot_Import, "/vm//snapshot/import") +# /vm/autobackup +class API_VM_Autobackup_Root(Resource): + @RequestParser( + [ + {"name": "force_full"}, + {"name": "email_recipients"}, + ] + ) + @Authenticator + def post(self, reqargs): + """ + Trigger a cluster autobackup job + --- + tags: + - provisioner + parameters: + - in: query + name: force_full + type: boolean + required: false + description: If set and true, triggers a full autobackup regardless of schedule + - in: query + name: email_recipients + type: array + description: A list of email addresses to send failure and report emails to + items: + type: string + example: "user@domain.tld" + responses: + 200: + description: OK + schema: + type: object + properties: + task_id: + type: string + description: Task ID for the provisioner Celery worker + 400: + description: Bad request + schema: + type: object + id: Message + """ + + task = run_celery_task( + "cluster.autobackup", + force_full=reqargs.get("force_full", False), + email_recipients=reqargs.get("email_recipients", None), + run_on="primary", + ) + return ( + { + "task_id": task.id, + "task_name": "cluster.autobackup", + "run_on": f"{get_primary_node()} (primary)", + }, + 202, + {"Location": Api.url_for(api, API_Tasks_Element, task_id=task.id)}, + ) + + +api.add_resource(API_VM_Autobackup_Root, "/vm/autobackup") + + ########################################################## # Client API - Network ########################################################## @@ -5070,7 +5134,7 @@ class API_Storage_Ceph_Benchmark(Resource): { "task_id": task.id, "task_name": "storage.benchmark", - "run_on": get_primary_node(), + "run_on": f"{get_primary_node()} (primary)", }, 202, {"Location": Api.url_for(api, 
API_Tasks_Element, task_id=task.id)}, @@ -9326,7 +9390,7 @@ class API_Provisioner_Create_Root(Resource): { "task_id": task.id, "task_name": "provisioner.create", - "run_on": get_primary_node(), + "run_on": f"{get_primary_node()} (primary)", }, 202, {"Location": Api.url_for(api, API_Tasks_Element, task_id=task.id)}, diff --git a/client-cli/pvc/cli/cli.py b/client-cli/pvc/cli/cli.py index 50f64e20..4087262f 100644 --- a/client-cli/pvc/cli/cli.py +++ b/client-cli/pvc/cli/cli.py @@ -1749,7 +1749,7 @@ def cli_vm_unmigrate(domain, wait, force_live): is_flag=True, default=True, show_default=True, - help="Wait or don't wait for task to complete, showing progress", + help="Wait or don't wait for task to complete, showing progress if waiting", ) def cli_vm_flush_locks(domain, wait_flag): """ @@ -1793,7 +1793,7 @@ def cli_vm_snapshot(): is_flag=True, default=True, show_default=True, - help="Wait or don't wait for task to complete, showing progress", + help="Wait or don't wait for task to complete, showing progress if waiting", ) def cli_vm_snapshot_create(domain, snapshot_name, wait_flag): """ @@ -1827,7 +1827,7 @@ def cli_vm_snapshot_create(domain, snapshot_name, wait_flag): is_flag=True, default=True, show_default=True, - help="Wait or don't wait for task to complete, showing progress", + help="Wait or don't wait for task to complete, showing progress if waiting", ) @confirm_opt("Remove shapshot {snapshot_name} of VM {domain}") def cli_vm_snapshot_remove(domain, snapshot_name, wait_flag): @@ -1860,7 +1860,7 @@ def cli_vm_snapshot_remove(domain, snapshot_name, wait_flag): is_flag=True, default=True, show_default=True, - help="Wait or don't wait for task to complete, showing progress", + help="Wait or don't wait for task to complete, showing progress if waiting", ) @confirm_opt( "Roll back to snapshot {snapshot_name} of {domain} and lose all data and changes since this snapshot" @@ -1903,7 +1903,7 @@ def cli_vm_snapshot_rollback(domain, snapshot_name, wait_flag): 
is_flag=True, default=True, show_default=True, - help="Wait or don't wait for task to complete, showing progress", + help="Wait or don't wait for task to complete, showing progress if waiting", ) def cli_vm_snapshot_export( domain, snapshot_name, export_path, incremental_parent, wait_flag @@ -1957,7 +1957,7 @@ def cli_vm_snapshot_export( is_flag=True, default=True, show_default=True, - help="Wait or don't wait for task to complete, showing progress", + help="Wait or don't wait for task to complete, showing progress if waiting", ) def cli_vm_snapshot_import( domain, snapshot_name, import_path, retain_snapshot, wait_flag @@ -2149,15 +2149,6 @@ def cli_vm_backup_remove(domain, backup_datestring, backup_path): name="autobackup", short_help="Perform automatic virtual machine backups." ) @connection_req -@click.option( - "-f", - "--configuration", - "autobackup_cfgfile", - envvar="PVC_AUTOBACKUP_CFGFILE", - default=DEFAULT_AUTOBACKUP_FILENAME, - show_default=True, - help="Override default config file location.", -) @click.option( "--email-report", "email_report", @@ -2172,38 +2163,31 @@ def cli_vm_backup_remove(domain, backup_datestring, backup_path): help="Force all backups to be full backups this run.", ) @click.option( - "--cron", - "cron_flag", - default=False, + "--wait/--no-wait", + "wait_flag", is_flag=True, - help="Cron mode; don't error exit if this isn't the primary coordinator.", + default=True, + show_default=True, + help="Wait or don't wait for task to complete, showing progress if waiting", ) -def cli_vm_autobackup(autobackup_cfgfile, email_report, force_full_flag, cron_flag): +def cli_vm_autobackup(email_report, force_full_flag, wait_flag): """ Perform automated backups of VMs, with integrated cleanup and full/incremental scheduling. 
- This command enables automatic backup of PVC VMs at the block level, leveraging the various "pvc vm backup" + This command enables automatic backup of PVC VMs at the block level, leveraging the various "pvc vm snapshot" functions with an internal rentention and cleanup system as well as determination of full vs. incremental backups at different intervals. VMs are selected based on configured VM tags. The destination storage may either be local, or provided by a remote filesystem which is automatically mounted and unmounted during the backup run via a set of configured commands before and after the backup run. - NOTE: This command performs its tasks in a local context. It MUST be run from the cluster's active primary - coordinator using the "local" connection only; if either is not correct, the command will error. - - NOTE: This command should be run as the same user as the API daemon, usually "root" with "sudo -E" or in - a cronjob as "root", to ensure permissions are correct on the backup files. Failure to do so will still take - the backup, but the state update write will likely fail and the backup will become untracked. The command - will prompt for confirmation if it is found not to be running as "root" and this cannot be bypassed. - This command should be run from cron or a timer at a regular interval (e.g. daily, hourly, etc.) which defines how often backups are taken. Backup format (full/incremental) and retention is based only on the number of recorded backups, not on the time interval between them. Backups taken manually outside of the "autobackup" command are not counted towards the format or retention of autobackups. - The PVC_AUTOBACKUP_CFGFILE envvar or "-f"/"--configuration" option can be used to override the default - configuration file path if required by a particular run. For full details of the possible options, please - see the example configuration file at "/usr/share/pvc/autobackup.sample.yaml". 
+ The actual details of the autobackup, including retention policies, full-vs-incremental, pre- and post- run + mounting/unmounting commands, etc. are defined in the main PVC configuration file `/etc/pvc/pvc.conf`. See + the sample configuration for more details. An optional report on all current backups can be emailed to one or more email addresses using the "--email-report" flag. This report will include information on all current known backups. @@ -2212,11 +2196,17 @@ def cli_vm_autobackup(autobackup_cfgfile, email_report, force_full_flag, cron_fl which can help synchronize the backups of existing VMs with new ones. """ - # All work here is done in the helper function for portability; we don't even use "finish" - vm_autobackup( - CLI_CONFIG, autobackup_cfgfile, email_report, force_full_flag, cron_flag + retcode, retmsg = pvc.lib.vm.vm_autobackup( + CLI_CONFIG, + email_recipients=email_report, + force_full_flag=force_full_flag, + wait_flag=wait_flag, ) + if retcode and wait_flag: + retmsg = wait_for_celery_task(CLI_CONFIG, retmsg) + finish(retcode, retmsg) + ############################################################################### # > pvc vm tag @@ -3722,7 +3712,7 @@ def cli_storage_benchmark(): is_flag=True, default=True, show_default=True, - help="Wait or don't wait for task to complete, showing progress", + help="Wait or don't wait for task to complete, showing progress if waiting", ) @confirm_opt( "Storage benchmarks take approximately 10 minutes to run and generate significant load on the cluster; they should be run sparingly. 
Continue" @@ -3811,7 +3801,7 @@ def cli_storage_osd(): is_flag=True, default=True, show_default=True, - help="Wait or don't wait for task to complete, showing progress", + help="Wait or don't wait for task to complete, showing progress if waiting", ) @confirm_opt( "Destroy all data on and create a new OSD database volume group on node {node} device {device}" @@ -3886,7 +3876,7 @@ def cli_storage_osd_create_db_vg(node, device, wait_flag): is_flag=True, default=True, show_default=True, - help="Wait or don't wait for task to complete, showing progress", + help="Wait or don't wait for task to complete, showing progress if waiting", ) @confirm_opt("Destroy all data on and create new OSD(s) on node {node} device {device}") def cli_storage_osd_add( @@ -3969,7 +3959,7 @@ def cli_storage_osd_add( is_flag=True, default=True, show_default=True, - help="Wait or don't wait for task to complete, showing progress", + help="Wait or don't wait for task to complete, showing progress if waiting", ) @confirm_opt( "Destroy all data on and replace OSD {osdid} (and peer split OSDs) with new device {new_device}" @@ -4024,7 +4014,7 @@ def cli_storage_osd_replace( is_flag=True, default=True, show_default=True, - help="Wait or don't wait for task to complete, showing progress", + help="Wait or don't wait for task to complete, showing progress if waiting", ) @confirm_opt("Refresh OSD {osdid} (and peer split OSDs) on device {device}") def cli_storage_osd_refresh(osdid, device, wait_flag): @@ -4069,7 +4059,7 @@ def cli_storage_osd_refresh(osdid, device, wait_flag): is_flag=True, default=True, show_default=True, - help="Wait or don't wait for task to complete, showing progress", + help="Wait or don't wait for task to complete, showing progress if waiting", ) @confirm_opt("Remove and destroy data on OSD {osdid}") def cli_storage_osd_remove(osdid, force_flag, wait_flag): @@ -6121,7 +6111,7 @@ def cli_provisioner_profile_list(limit, format_function): is_flag=True, default=True, show_default=True, - 
help="Wait or don't wait for task to complete, showing progress", + help="Wait or don't wait for task to complete, showing progress if waiting", ) def cli_provisioner_create( name, profile, define_flag, start_flag, script_args, wait_flag diff --git a/client-cli/pvc/cli/helpers.py b/client-cli/pvc/cli/helpers.py index c8ce2137..9c72f623 100644 --- a/client-cli/pvc/cli/helpers.py +++ b/client-cli/pvc/cli/helpers.py @@ -20,26 +20,16 @@ ############################################################################### from click import echo as click_echo -from click import confirm -from datetime import datetime from distutils.util import strtobool -from getpass import getuser from json import load as jload from json import dump as jdump -from os import chmod, environ, getpid, path, popen, makedirs, get_terminal_size -from re import findall +from os import chmod, environ, getpid, path, get_terminal_size from socket import gethostname -from subprocess import run, PIPE from sys import argv from syslog import syslog, openlog, closelog, LOG_AUTH from yaml import load as yload from yaml import SafeLoader -import pvc.lib.provisioner -import pvc.lib.vm -import pvc.lib.node -import pvc.lib.storage - DEFAULT_STORE_DATA = {"cfgfile": "/etc/pvc/pvc.conf"} DEFAULT_STORE_FILENAME = "pvc.json" @@ -196,452 +186,3 @@ def update_store(store_path, store_data): with open(store_file, "w") as fh: jdump(store_data, fh, sort_keys=True, indent=4) - - -def get_autobackup_config(CLI_CONFIG, cfgfile): - try: - config = dict() - with open(cfgfile) as fh: - full_config = yload(fh, Loader=SafeLoader) - backup_config = full_config["autobackup"] - config["backup_root_path"] = backup_config["backup_root_path"] - config["backup_root_suffix"] = backup_config["backup_root_suffix"] - config["backup_tags"] = backup_config["backup_tags"] - config["backup_schedule"] = backup_config["backup_schedule"] - config["auto_mount_enabled"] = backup_config["auto_mount"]["enabled"] - if config["auto_mount_enabled"]: - 
config["mount_cmds"] = list() - _mount_cmds = backup_config["auto_mount"]["mount_cmds"] - for _mount_cmd in _mount_cmds: - if "{backup_root_path}" in _mount_cmd: - _mount_cmd = _mount_cmd.format( - backup_root_path=backup_config["backup_root_path"] - ) - config["mount_cmds"].append(_mount_cmd) - - config["unmount_cmds"] = list() - _unmount_cmds = backup_config["auto_mount"]["unmount_cmds"] - for _unmount_cmd in _unmount_cmds: - if "{backup_root_path}" in _unmount_cmd: - _unmount_cmd = _unmount_cmd.format( - backup_root_path=backup_config["backup_root_path"] - ) - config["unmount_cmds"].append(_unmount_cmd) - except FileNotFoundError: - return "Backup configuration does not exist!" - except KeyError as e: - return f"Backup configuration is invalid: {e}" - - return config - - -def vm_autobackup( - CLI_CONFIG, - autobackup_cfgfile=DEFAULT_AUTOBACKUP_FILENAME, - email_report=None, - force_full_flag=False, - cron_flag=False, -): - """ - Perform automatic backups of VMs based on an external config file. 
- """ - - backup_summary = dict() - - if email_report is not None: - from email.utils import formatdate - from socket import gethostname - - try: - with open(autobackup_cfgfile) as fh: - tmp_config = yload(fh, Loader=SafeLoader) - cluster = tmp_config["cluster"]["name"] - except Exception: - cluster = "unknown" - - def send_execution_failure_report(error=None): - echo(CLI_CONFIG, f"Sending email failure report to {email_report}") - - current_datetime = datetime.now() - email_datetime = formatdate(float(current_datetime.strftime("%s"))) - - email = list() - email.append(f"Date: {email_datetime}") - email.append(f"Subject: PVC Autobackup execution failure for cluster {cluster}") - - recipients = list() - for recipient in email_report.split(","): - recipients.append(f"<{recipient}>") - email.append(f"To: {', '.join(recipients)}") - email.append(f"From: PVC Autobackup System ") - email.append("") - - email.append( - f"A PVC autobackup has FAILED at {current_datetime} due to an execution error." - ) - email.append("") - email.append("The reported error message is:") - email.append(f" {error}") - - try: - p = popen("/usr/sbin/sendmail -t", "w") - p.write("\n".join(email)) - p.close() - except Exception as e: - echo(CLI_CONFIG, f"Failed to send report email: {e}") - - # Validate that we are running on the current primary coordinator of the 'local' cluster connection - real_connection = CLI_CONFIG["connection"] - CLI_CONFIG["connection"] = "local" - retcode, retdata = pvc.lib.node.node_info(CLI_CONFIG, DEFAULT_NODE_HOSTNAME) - if not retcode or retdata.get("coordinator_state") != "primary": - if cron_flag: - echo( - CLI_CONFIG, - "Current host is not the primary coordinator of the local cluster and running in cron mode. 
Exiting cleanly.", - ) - exit(0) - else: - echo( - CLI_CONFIG, - f"ERROR: Current host is not the primary coordinator of the local cluster; got connection '{real_connection}', host '{DEFAULT_NODE_HOSTNAME}'.", - ) - echo( - CLI_CONFIG, - "Autobackup MUST be run from the cluster active primary coordinator using the 'local' connection. See '-h'/'--help' for details.", - ) - if email_report is not None: - send_execution_failure_report( - error=f"Autobackup run attempted from non-local connection or non-primary coordinator; got connection '{real_connection}', host '{DEFAULT_NODE_HOSTNAME}'." - ) - exit(1) - - # Ensure we're running as root, or show a warning & confirmation - if getuser() != "root": - confirm( - "WARNING: You are not running this command as 'root'. This command should be run under the same user as the API daemon, which is usually 'root'. Are you sure you want to continue?", - prompt_suffix=" ", - abort=True, - ) - - # Load our YAML config - autobackup_config = get_autobackup_config(CLI_CONFIG, autobackup_cfgfile) - if not isinstance(autobackup_config, dict): - echo(CLI_CONFIG, f"ERROR: {autobackup_config}") - if email_report is not None: - send_execution_failure_report(error=f"{autobackup_config}") - exit(1) - - # Get the start time of this run - autobackup_start_time = datetime.now() - - # Get a list of all VMs on the cluster - # We don't do tag filtering here, because we could match an arbitrary number of tags; instead, we - # parse the list after - retcode, retdata = pvc.lib.vm.vm_list(CLI_CONFIG, None, None, None, None, None) - if not retcode: - echo(CLI_CONFIG, f"ERROR: Failed to fetch VM list: {retdata}") - if email_report is not None: - send_execution_failure_report(error=f"Failed to fetch VM list: {retdata}") - exit(1) - cluster_vms = retdata - - # Parse the list to match tags; too complex for list comprehension alas - backup_vms = list() - for vm in cluster_vms: - vm_tag_names = [t["name"] for t in vm["tags"]] - matching_tags = ( - True - if 
len( - set(vm_tag_names).intersection(set(autobackup_config["backup_tags"])) - ) - > 0 - else False - ) - if matching_tags: - backup_vms.append(vm["name"]) - - if len(backup_vms) < 1: - echo(CLI_CONFIG, "Found no suitable VMs for autobackup.") - exit(0) - - # Pretty print the names of the VMs we'll back up (to stderr) - maxnamelen = max([len(n) for n in backup_vms]) + 2 - cols = 1 - while (cols * maxnamelen + maxnamelen + 2) <= MAX_CONTENT_WIDTH: - cols += 1 - rows = len(backup_vms) // cols - vm_list_rows = list() - for row in range(0, rows + 1): - row_start = row * cols - row_end = (row * cols) + cols - row_str = "" - for x in range(row_start, row_end): - if x < len(backup_vms): - row_str += "{:<{}}".format(backup_vms[x], maxnamelen) - vm_list_rows.append(row_str) - - echo(CLI_CONFIG, f"Found {len(backup_vms)} suitable VM(s) for autobackup.") - echo(CLI_CONFIG, "Full VM list:", stderr=True) - echo(CLI_CONFIG, " {}".format("\n ".join(vm_list_rows)), stderr=True) - echo(CLI_CONFIG, "", stderr=True) - - if autobackup_config["auto_mount_enabled"]: - # Execute each mount_cmds command in sequence - for cmd in autobackup_config["mount_cmds"]: - echo( - CLI_CONFIG, - f"Executing mount command '{cmd.split()[0]}'... ", - newline=False, - ) - tstart = datetime.now() - ret = run( - cmd.split(), - stdout=PIPE, - stderr=PIPE, - ) - tend = datetime.now() - ttot = tend - tstart - if ret.returncode != 0: - echo( - CLI_CONFIG, - f"failed. [{ttot.seconds}s]", - ) - echo( - CLI_CONFIG, - f"Exiting; command reports: {ret.stderr.decode().strip()}", - ) - if email_report is not None: - send_execution_failure_report(error=ret.stderr.decode().strip()) - exit(1) - else: - echo(CLI_CONFIG, f"done. 
[{ttot.seconds}s]") - - # For each VM, perform the backup - for vm in backup_vms: - backup_suffixed_path = f"{autobackup_config['backup_root_path']}{autobackup_config['backup_root_suffix']}" - if not path.exists(backup_suffixed_path): - makedirs(backup_suffixed_path) - - backup_path = f"{backup_suffixed_path}/{vm}" - autobackup_state_file = f"{backup_path}/.autobackup.json" - if not path.exists(backup_path) or not path.exists(autobackup_state_file): - # There are no new backups so the list is empty - state_data = dict() - tracked_backups = list() - else: - with open(autobackup_state_file) as fh: - state_data = jload(fh) - tracked_backups = state_data["tracked_backups"] - - full_interval = autobackup_config["backup_schedule"]["full_interval"] - full_retention = autobackup_config["backup_schedule"]["full_retention"] - - full_backups = [b for b in tracked_backups if b["type"] == "full"] - if len(full_backups) > 0: - last_full_backup = full_backups[0] - last_full_backup_idx = tracked_backups.index(last_full_backup) - if force_full_flag: - this_backup_type = "forced-full" - this_backup_incremental_parent = None - this_backup_retain_snapshot = True - elif last_full_backup_idx >= full_interval - 1: - this_backup_type = "full" - this_backup_incremental_parent = None - this_backup_retain_snapshot = True - else: - this_backup_type = "incremental" - this_backup_incremental_parent = last_full_backup["datestring"] - this_backup_retain_snapshot = False - else: - # The very first backup must be full to start the tree - this_backup_type = "full" - this_backup_incremental_parent = None - this_backup_retain_snapshot = True - - # Perform the backup - echo( - CLI_CONFIG, - f"Backing up VM '{vm}' ({this_backup_type})... 
", - newline=False, - ) - tstart = datetime.now() - retcode, retdata = pvc.lib.vm.vm_backup( - CLI_CONFIG, - vm, - backup_suffixed_path, - incremental_parent=this_backup_incremental_parent, - retain_snapshot=this_backup_retain_snapshot, - ) - tend = datetime.now() - ttot = tend - tstart - if not retcode: - backup_datestring = findall(r"[0-9]{14}", retdata)[0] - echo(CLI_CONFIG, f"failed. [{ttot.seconds}s]") - echo( - CLI_CONFIG, - retdata.strip().replace(f"ERROR in backup {backup_datestring}: ", ""), - ) - skip_cleanup = True - else: - backup_datestring = findall(r"[0-9]{14}", retdata)[0] - echo( - CLI_CONFIG, - f"done. Backup '{backup_datestring}' created. [{ttot.seconds}s]", - ) - skip_cleanup = False - - # Read backup file to get details - backup_json_file = f"{backup_path}/{backup_datestring}/pvcbackup.json" - with open(backup_json_file) as fh: - backup_json = jload(fh) - tracked_backups.insert(0, backup_json) - - # Delete any full backups that are expired - marked_for_deletion = list() - found_full_count = 0 - for backup in tracked_backups: - if backup["type"] == "full": - found_full_count += 1 - if found_full_count > full_retention: - marked_for_deletion.append(backup) - - # Depete any incremental backups that depend on marked parents - for backup in tracked_backups: - if backup["type"] == "incremental" and backup["incremental_parent"] in [ - b["datestring"] for b in marked_for_deletion - ]: - marked_for_deletion.append(backup) - - if len(marked_for_deletion) > 0: - if skip_cleanup: - echo( - CLI_CONFIG, - f"Skipping cleanups for {len(marked_for_deletion)} aged-out backups due to backup failure.", - ) - else: - echo( - CLI_CONFIG, - f"Running cleanups for {len(marked_for_deletion)} aged-out backups...", - ) - # Execute deletes - for backup_to_delete in marked_for_deletion: - echo( - CLI_CONFIG, - f"Removing old VM '{vm}' backup '{backup_to_delete['datestring']}' ({backup_to_delete['type']})... 
", - newline=False, - ) - tstart = datetime.now() - retcode, retdata = pvc.lib.vm.vm_remove_backup( - CLI_CONFIG, - vm, - backup_suffixed_path, - backup_to_delete["datestring"], - ) - tend = datetime.now() - ttot = tend - tstart - if not retcode: - echo(CLI_CONFIG, f"failed. [{ttot.seconds}s]") - echo( - CLI_CONFIG, - f"Skipping removal from tracked backups; command reports: {retdata}", - ) - else: - tracked_backups.remove(backup_to_delete) - echo(CLI_CONFIG, f"done. [{ttot.seconds}s]") - - # Update tracked state information - state_data["tracked_backups"] = tracked_backups - with open(autobackup_state_file, "w") as fh: - jdump(state_data, fh) - - backup_summary[vm] = tracked_backups - - if autobackup_config["auto_mount_enabled"]: - # Execute each unmount_cmds command in sequence - for cmd in autobackup_config["unmount_cmds"]: - echo( - CLI_CONFIG, - f"Executing unmount command '{cmd.split()[0]}'... ", - newline=False, - ) - tstart = datetime.now() - ret = run( - cmd.split(), - stdout=PIPE, - stderr=PIPE, - ) - tend = datetime.now() - ttot = tend - tstart - if ret.returncode != 0: - echo( - CLI_CONFIG, - f"failed. [{ttot.seconds}s]", - ) - echo( - CLI_CONFIG, - f"Continuing; command reports: {ret.stderr.decode().strip()}", - ) - else: - echo(CLI_CONFIG, f"done. 
[{ttot.seconds}s]") - - autobackup_end_time = datetime.now() - autobackup_total_time = autobackup_end_time - autobackup_start_time - - # Handle report emailing - if email_report is not None: - echo(CLI_CONFIG, "") - echo(CLI_CONFIG, f"Sending email summary report to {email_report}") - - current_datetime = datetime.now() - email_datetime = formatdate(float(current_datetime.strftime("%s"))) - - email = list() - email.append(f"Date: {email_datetime}") - email.append(f"Subject: PVC Autobackup report for cluster {cluster}") - - recipients = list() - for recipient in email_report.split(","): - recipients.append(f"<{recipient}>") - email.append(f"To: {', '.join(recipients)}") - email.append(f"From: PVC Autobackup System ") - email.append("") - - email.append( - f"A PVC autobackup has been completed at {current_datetime} in {autobackup_total_time}." - ) - email.append("") - email.append( - "The following is a summary of all current VM backups after cleanups, most recent first:" - ) - email.append("") - - for vm in backup_vms: - email.append(f"VM {vm}:") - for backup in backup_summary[vm]: - datestring = backup.get("datestring") - backup_date = datetime.strptime(datestring, "%Y%m%d%H%M%S") - if backup.get("result", False): - email.append( - f" {backup_date}: Success in {backup.get('runtime_secs', 0)} seconds, ID {datestring}, type {backup.get('type', 'unknown')}" - ) - email.append( - f" Backup contains {len(backup.get('backup_files'))} files totaling {pvc.lib.storage.format_bytes_tohuman(backup.get('backup_size_bytes', 0))} ({backup.get('backup_size_bytes', 0)} bytes)" - ) - else: - email.append( - f" {backup_date}: Failure in {backup.get('runtime_secs', 0)} seconds, ID {datestring}, type {backup.get('type', 'unknown')}" - ) - email.append( - f" {backup.get('result_message')}" - ) - - try: - p = popen("/usr/sbin/sendmail -t", "w") - p.write("\n".join(email)) - p.close() - except Exception as e: - echo(CLI_CONFIG, f"Failed to send report email: {e}") - - echo(CLI_CONFIG, 
"") - echo(CLI_CONFIG, f"Autobackup completed in {autobackup_total_time}.") diff --git a/client-cli/pvc/lib/vm.py b/client-cli/pvc/lib/vm.py index c0602fb3..e8811817 100644 --- a/client-cli/pvc/lib/vm.py +++ b/client-cli/pvc/lib/vm.py @@ -595,6 +595,24 @@ def vm_import_snapshot( return get_wait_retdata(response, wait_flag) +def vm_autobackup(config, email_recipients=None, force_full_flag=False, wait_flag=True): + """ + Perform a cluster VM autobackup + + API endpoint: POST /vm/autobackup + API arguments: email_recipients=email_recipients, force_full=force_full_flag + API schema: {"message":"{data}"} + """ + params = { + "email_recipients": email_recipients, + "force_full": force_full_flag, + } + + response = call_api(config, "post", "/vm/autobackup", params=params) + + return get_wait_retdata(response, wait_flag) + + def vm_vcpus_set(config, vm, vcpus, topology, restart): """ Set the vCPU count of the VM with topology diff --git a/daemon-common/autobackup.py b/daemon-common/autobackup.py new file mode 100644 index 00000000..017103e8 --- /dev/null +++ b/daemon-common/autobackup.py @@ -0,0 +1,479 @@ +#!/usr/bin/env python3 + +# autobackup.py - PVC API Autobackup functions +# Part of the Parallel Virtual Cluster (PVC) system +# +# Copyright (C) 2018-2024 Joshua M. Boniface +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, version 3. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see <https://www.gnu.org/licenses/>.
+# +############################################################################### + +from datetime import datetime +from json import load as jload +from json import dump as jdump +from os import popen, makedirs, path +from shutil import rmtree +from subprocess import run, PIPE + +from daemon_lib.config import get_autobackup_configuration +from daemon_lib.celery import start, fail, log_info, log_err, update, finish + +import daemon_lib.ceph as pvc_ceph +import daemon_lib.vm as pvc_vm + + +def send_execution_failure_report( + celery_conf, config, recipients=None, total_time=0, error=None +): + if recipients is None: + return + + from email.utils import formatdate + from socket import gethostname + + log_message = f"Sending email failure report to {', '.join(recipients)}" + log_info(log_message) + update( + celery_conf[0], + log_message, + current=celery_conf[1] + 1, + total=celery_conf[2], + ) + + current_datetime = datetime.now() + email_datetime = formatdate(float(current_datetime.strftime("%s"))) + + email = list() + email.append(f"Date: {email_datetime}") + email.append( + f"Subject: PVC Autobackup execution failure for cluster '{config['cluster']}'" + ) + + email_to = list() + for recipient in recipients: + email_to.append(f"<{recipient}>") + + email.append(f"To: {', '.join(email_to)}") + email.append(f"From: PVC Autobackup System ") + email.append("") + + email.append( + f"A PVC autobackup has FAILED at {current_datetime} in {total_time}s due to an execution error." 
+ ) + email.append("") + email.append("The reported error message is:") + email.append(f" {error}") + + try: + with popen("/usr/sbin/sendmail -t", "w") as p: + p.write("\n".join(email)) + except Exception as e: + log_err(f"Failed to send report email: {e}") + + +def send_execution_summary_report( + celery_conf, config, recipients=None, total_time=0, summary=dict() +): + if recipients is None: + return + + from email.utils import formatdate + from socket import gethostname + + log_message = f"Sending email summary report to {', '.join(recipients)}" + log_info(log_message) + update( + celery_conf[0], + log_message, + current=celery_conf[1] + 1, + total=celery_conf[2], + ) + + current_datetime = datetime.now() + email_datetime = formatdate(float(current_datetime.strftime("%s"))) + + email = list() + email.append(f"Date: {email_datetime}") + email.append(f"Subject: PVC Autobackup report for cluster '{config['cluster']}'") + + email_to = list() + for recipient in recipients: + email_to.append(f"<{recipient}>") + + email.append(f"To: {', '.join(email_to)}") + email.append(f"From: PVC Autobackup System ") + email.append("") + + email.append( + f"A PVC autobackup has been completed at {current_datetime} in {total_time}s." 
def worker_cluster_autobackup(
    zkhandler, celery, force_full=False, email_recipients=None
):
    """
    Back up every VM that carries one of the configured autobackup tags.

    For each matching VM a snapshot named "autobackup_<datestring>" is created
    and exported below "<backup_root_path>/<backup_root_suffix>"; the snapshot
    is removed again unless this backup is a full one. Backups beyond the
    configured full-backup retention are then pruned and the per-VM
    ".autobackup.json" state file is rewritten. When auto-mount is enabled the
    configured mount commands run first and the unmount commands run last.

    Args:
        zkhandler: Handler passed through to the pvc_vm helpers.
        celery: Task handle passed to the progress and logging helpers.
        force_full: Take a full backup even when an incremental would be due.
        email_recipients: Addresses for failure and summary reports, or None.

    Returns:
        The result of finish() when the run completes or there is nothing to
        do; False on the failure paths that return.
    """
    config = get_autobackup_configuration()

    backup_summary = dict()

    current_stage = 0
    total_stages = 1
    if email_recipients is not None:
        # Sending the summary report counts as one extra stage
        total_stages += 1

    start(
        celery,
        f"Starting cluster '{config['cluster']}' VM autobackup",
        current=current_stage,
        total=total_stages,
    )

    if not config["autobackup_enabled"]:
        message = "Autobackups are not configured on this cluster."
        log_info(celery, message)
        return finish(
            celery,
            message,
            current=total_stages,
            total=total_stages,
        )

    autobackup_start_time = datetime.now()

    retcode, vm_list = pvc_vm.get_list(zkhandler)
    if not retcode:
        error_message = f"Failed to fetch VM list: {vm_list}"
        log_err(celery, error_message)
        send_execution_failure_report(
            (celery, current_stage, total_stages),
            config,
            recipients=email_recipients,
            error=error_message,
        )
        fail(celery, error_message)
        return False

    # Keep only the VMs that carry at least one of the configured backup tags
    backup_tags = set(config["backup_tags"])
    backup_vms = [
        vm for vm in vm_list if backup_tags.intersection(t["name"] for t in vm["tags"])
    ]

    if len(backup_vms) < 1:
        message = "Found no VMs tagged for autobackup."
        log_info(celery, message)
        return finish(
            celery,
            message,
            current=total_stages,
            total=total_stages,
        )

    if config["auto_mount_enabled"]:
        total_stages += len(config["mount_cmds"])
        total_stages += len(config["unmount_cmds"])
    # Per-VM stage budget for the progress counter, scaled by RBD disk count
    for vm in backup_vms:
        total_disks = len([d for d in vm["disks"] if d["type"] == "rbd"])
        total_stages += 2 + 1 + 2 + 2 + 3 * total_disks

    # The VM entries are dicts, so join their names rather than the entries
    log_info(
        celery,
        f"Found {len(backup_vms)} suitable VM(s) for autobackup: {', '.join(vm['name'] for vm in backup_vms)}",
    )

    # Handle automount mount commands
    if config["auto_mount_enabled"]:
        for cmd in config["mount_cmds"]:
            current_stage += 1
            update(
                celery,
                f"Executing mount command '{cmd.split()[0]}'",
                current=current_stage,
                total=total_stages,
            )

            ret = run(
                cmd.split(),
                stdout=PIPE,
                stderr=PIPE,
            )

            if ret.returncode != 0:
                error_message = f"Failed to execute mount command '{cmd.split()[0]}': {ret.stderr.decode().strip()}"
                log_err(celery, error_message)
                send_execution_failure_report(
                    (celery, current_stage, total_stages),
                    config,
                    recipients=email_recipients,
                    total_time=datetime.now() - autobackup_start_time,
                    error=error_message,
                )
                fail(celery, error_message)
                return False

    # Execute the backup: take a snapshot, then export the snapshot
    backup_suffixed_path = (
        f"{config['backup_root_path']}/{config['backup_root_suffix']}"
    )
    if not path.exists(backup_suffixed_path):
        makedirs(backup_suffixed_path)

    full_interval = config["backup_schedule"]["full_interval"]
    full_retention = config["backup_schedule"]["full_retention"]

    for vm in backup_vms:
        vm_name = vm["name"]
        vm_backup_path = f"{backup_suffixed_path}/{vm_name}"
        autobackup_state_file = f"{vm_backup_path}/.autobackup.json"
        if not path.exists(vm_backup_path) or not path.exists(autobackup_state_file):
            # There are no existing backups so the list is empty
            state_data = dict()
            tracked_backups = list()
        else:
            with open(autobackup_state_file) as fh:
                state_data = jload(fh)
            tracked_backups = state_data["tracked_backups"]

        # tracked_backups is kept newest-first (new entries are inserted at
        # index 0 below), so the first full backup found is the latest one and
        # its index is the number of backups taken since
        full_backups = [b for b in tracked_backups if b["type"] == "full"]
        if len(full_backups) > 0:
            last_full_backup = full_backups[0]
            last_full_backup_idx = tracked_backups.index(last_full_backup)
            if force_full:
                this_backup_incremental_parent = None
                this_backup_retain_snapshot = True
            elif last_full_backup_idx >= full_interval - 1:
                this_backup_incremental_parent = None
                this_backup_retain_snapshot = True
            else:
                this_backup_incremental_parent = last_full_backup["datestring"]
                this_backup_retain_snapshot = False
        else:
            # The very first backup must be full to start the tree
            this_backup_incremental_parent = None
            this_backup_retain_snapshot = True

        now = datetime.now()
        datestring = now.strftime("%Y%m%d%H%M%S")
        snapshot_name = f"autobackup_{datestring}"

        # NOTE(review): the three snapshot failure paths below return False
        # without calling fail(), unlike the mount/unmount/VM-list paths, and
        # they leave any auto-mounted target mounted - confirm this is intended

        # Take the snapshot
        ret = pvc_vm.vm_worker_create_snapshot(
            zkhandler,
            celery,
            vm_name,
            snapshot_name=snapshot_name,
            override_current_stage=current_stage,
            override_total_stages=total_stages,
        )
        if ret is False:
            error_message = f"Failed to create backup snapshot '{snapshot_name}'"
            log_err(celery, error_message)
            send_execution_failure_report(
                (celery, current_stage, total_stages),
                config,
                recipients=email_recipients,
                error=error_message,
            )
            return False

        # Export the snapshot
        ret = pvc_vm.vm_worker_export_snapshot(
            zkhandler,
            celery,
            vm_name,
            snapshot_name,
            backup_suffixed_path,
            incremental_parent=this_backup_incremental_parent,
            override_current_stage=current_stage,
            override_total_stages=total_stages,
        )
        if ret is False:
            error_message = f"Failed to export backup snapshot '{snapshot_name}'"
            log_err(celery, error_message)
            send_execution_failure_report(
                (celery, current_stage, total_stages),
                config,
                recipients=email_recipients,
                error=error_message,
            )
            return False

        # Clean up the snapshot
        if not this_backup_retain_snapshot:
            ret = pvc_vm.vm_worker_remove_snapshot(
                zkhandler,
                celery,
                vm_name,
                snapshot_name,
                override_current_stage=current_stage,
                override_total_stages=total_stages,
            )
            if ret is False:
                error_message = f"Failed to remove backup snapshot '{snapshot_name}'"
                log_err(celery, error_message)
                send_execution_failure_report(
                    (celery, current_stage, total_stages),
                    config,
                    recipients=email_recipients,
                    error=error_message,
                )
                return False
        else:
            # Snapshot is retained: advance the counter past the removal
            # stages - presumably 2 plus one per RBD disk; TODO confirm
            # against vm_worker_remove_snapshot
            total_disks = len([d for d in vm["disks"] if d["type"] == "rbd"])
            current_stage += 2 + total_disks

        current_stage += 1
        update(
            celery,
            f"Finding obsolete incremental backups for '{vm_name}'",
            current=current_stage,
            total=total_stages,
        )

        # Read export file to get details
        backup_json_file = f"{vm_backup_path}/{snapshot_name}/snapshot.json"
        with open(backup_json_file) as fh:
            backup_json = jload(fh)
        tracked_backups.insert(0, backup_json)

        marked_for_deletion = list()
        # Find any full backups that are expired
        found_full_count = 0
        for backup in tracked_backups:
            if backup["type"] == "full":
                found_full_count += 1
                if found_full_count > full_retention:
                    marked_for_deletion.append(backup)
        # Find any incremental backups that depend on marked parents
        for backup in tracked_backups:
            if backup["type"] == "incremental" and backup["incremental_parent"] in [
                b["datestring"] for b in marked_for_deletion
            ]:
                marked_for_deletion.append(backup)

        current_stage += 1
        if len(marked_for_deletion) > 0:
            update(
                celery,
                f"Cleaning up aged out backups for '{vm_name}'",
                current=current_stage,
                total=total_stages,
            )

            for backup_to_delete in marked_for_deletion:
                ret = pvc_vm.vm_worker_remove_snapshot(
                    zkhandler, None, vm_name, backup_to_delete["snapshot_name"]
                )
                if ret is False:
                    # Keep the entry tracked so a later run can retry the removal
                    error_message = f"Failed to remove obsolete backup snapshot '{backup_to_delete['snapshot_name']}', leaving in tracked backups"
                    log_err(celery, error_message)
                else:
                    rmtree(f"{vm_backup_path}/{backup_to_delete['snapshot_name']}")
                    tracked_backups.remove(backup_to_delete)

        current_stage += 1
        update(
            celery,
            "Updating tracked backups",
            current=current_stage,
            total=total_stages,
        )
        state_data["tracked_backups"] = tracked_backups
        with open(autobackup_state_file, "w") as fh:
            jdump(state_data, fh)

        # Key the summary by VM name: the VM entry itself is a dict and
        # therefore cannot be used as a dictionary key
        backup_summary[vm_name] = tracked_backups

    # Handle automount unmount commands
    if config["auto_mount_enabled"]:
        for cmd in config["unmount_cmds"]:
            current_stage += 1
            update(
                celery,
                f"Executing unmount command '{cmd.split()[0]}'",
                current=current_stage,
                total=total_stages,
            )

            ret = run(
                cmd.split(),
                stdout=PIPE,
                stderr=PIPE,
            )

            if ret.returncode != 0:
                error_message = f"Failed to execute unmount command '{cmd.split()[0]}': {ret.stderr.decode().strip()}"
                log_err(celery, error_message)
                send_execution_failure_report(
                    (celery, current_stage, total_stages),
                    config,
                    recipients=email_recipients,
                    total_time=datetime.now() - autobackup_start_time,
                    error=error_message,
                )
                fail(celery, error_message)
                return False

    autobackup_end_time = datetime.now()
    autobackup_total_time = autobackup_end_time - autobackup_start_time

    send_execution_summary_report(
        (celery, current_stage, total_stages),
        config,
        recipients=email_recipients,
        total_time=autobackup_total_time,
        summary=backup_summary,
    )

    current_stage += 1
    return finish(
        celery,
        f"Successfully completed cluster '{config['cluster']}' VM autobackup",
        current=current_stage,
        total=total_stages,
    )
def get_parsed_autobackup_configuration(config_file):
    """
    Load the autobackup settings from the main pvc.conf that the daemons read.

    Args:
        config_file: Path of the YAML configuration file to read.

    Returns:
        A dict that always holds "cluster" and "autobackup_enabled". When the
        "autobackup" section is absent or empty, "autobackup_enabled" is False
        and nothing else is set. Otherwise it also holds the backup root path,
        suffix, tags and schedule plus "auto_mount_enabled", and, when
        auto-mount is enabled, "mount_cmds" and "unmount_cmds" with any
        "{backup_root_path}" placeholder filled in.

    Raises:
        MalformedConfigurationError: If a required key is missing or the
            parsed document has an unexpected shape.

    Note:
        A file that cannot be parsed as YAML terminates the process through
        os._exit(1) rather than raising.
    """
    print('Loading configuration from file "{}"'.format(config_file))

    with open(config_file, "r") as cfgfh:
        try:
            o_config = yaml.load(cfgfh, Loader=yaml.SafeLoader)
        except Exception as e:
            print(f"ERROR: Failed to parse configuration file: {e}")
            os._exit(1)

    def _fill_root_path(command, backup_root_path):
        # Only commands that use the placeholder go through str.format, so
        # commands containing unrelated braces are passed through untouched
        if "{backup_root_path}" in command:
            return command.format(backup_root_path=backup_root_path)
        return command

    config = dict()

    try:
        o_cluster = o_config["cluster"]
        config_cluster = {
            "cluster": o_cluster["name"],
            "autobackup_enabled": True,
        }
        config = {**config, **config_cluster}

        # A missing section is treated like an empty one: autobackup is off
        o_autobackup = o_config.get("autobackup")
        if o_autobackup is None:
            config["autobackup_enabled"] = False
            return config

        config_autobackup = {
            "backup_root_path": o_autobackup["backup_root_path"],
            "backup_root_suffix": o_autobackup["backup_root_suffix"],
            "backup_tags": o_autobackup["backup_tags"],
            "backup_schedule": o_autobackup["backup_schedule"],
        }
        config = {**config, **config_autobackup}

        o_automount = o_autobackup["auto_mount"]
        config_automount = {
            "auto_mount_enabled": o_automount["enabled"],
        }
        config = {**config, **config_automount}
        if config["auto_mount_enabled"]:
            config["mount_cmds"] = [
                _fill_root_path(_mount_cmd, config["backup_root_path"])
                for _mount_cmd in o_automount["mount_cmds"]
            ]
            config["unmount_cmds"] = [
                _fill_root_path(_unmount_cmd, config["backup_root_path"])
                for _unmount_cmd in o_automount["unmount_cmds"]
            ]

    except Exception as e:
        # Chain the cause so the offending key shows up in the traceback
        raise MalformedConfigurationError(e) from e

    return config
def get_autobackup_configuration():
    """
    Return the parsed autobackup configuration from the PVC configuration file.

    The file path comes from get_configuration_path() and the parsing is done
    by get_parsed_autobackup_configuration().
    """
    return get_parsed_autobackup_configuration(get_configuration_path())


@celery.task(name="cluster.autobackup", bind=True, routing_key="run_on")
def cluster_autobackup(self, force_full=False, email_recipients=None, run_on="primary"):
    """
    Celery task "cluster.autobackup": run the cluster autobackup worker.

    The worker is invoked through a ZKConnection-wrapped inner function, which
    receives the handler as its first argument. run_on is not read in the
    body; presumably it is consumed by task routing - confirm.
    """

    @ZKConnection(config)
    def _autobackup_with_zk(zkhandler, task, force_full=False, email_recipients=None):
        result = worker_cluster_autobackup(
            zkhandler,
            task,
            force_full=force_full,
            email_recipients=email_recipients,
        )
        return result

    return _autobackup_with_zk(
        self,
        force_full=force_full,
        email_recipients=email_recipients,
    )