Compare commits

...

24 Commits

Author SHA1 Message Date
Joshua Boniface 26d0d08873 Add is-primary command
Used by the cron job to check whether the node firing an autobackup is the
primary node, so the job will not multi-fire from all coordinators.
2024-08-25 22:09:03 -04:00
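As a hedged illustration, a coordinator's cron entry might use this check as follows (the schedule, user, and recipient address are placeholders; the pvc CLI and a configured local connection are assumed):

# Hypothetical /etc/cron.d entry: installed on every coordinator, but only the
# current primary actually submits the autobackup job.
0 2 * * * root pvc node is-primary && pvc vm autobackup --cron --email-report backups@domain.tld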
Joshua Boniface f57b8d4a15 Simplify Celery event handling
It was far too cumbersome to report every possible stage here in a
consistent way. Realistically, this command will be run silently from
cron 99.95% of the time, so all this overcomplexity to handle individual
Celery state updates just isn't worth it.
2024-08-25 21:59:12 -04:00
Joshua Boniface 10de85cce3 Allow API-only builds and deploy 2024-08-25 20:45:52 -04:00
Joshua Boniface e938140414 Refactor autobackups to make more sense 2024-08-25 19:21:00 -04:00
Joshua Boniface fd87a28eb3 Fix bug in API parameters 2024-08-25 19:13:31 -04:00
Joshua Boniface 4ef5fbdbe8 Restore previous autobackup continue behaviour
With the original system, the failure of one VM's backups would not
trigger a total fault, thus allowing other backups to complete.
Restore that behaviour.
2024-08-25 17:04:43 -04:00
Joshua Boniface 8fa6bed736 Ensure cron flag triggers truly silent output 2024-08-25 16:35:24 -04:00
Joshua Boniface f7926726f2 Adjust snapshot name again 2024-08-25 16:20:59 -04:00
Joshua Boniface de58efdaa9 Ensure email_recipients is always a list 2024-08-25 16:18:19 -04:00
Joshua Boniface 8ca6976892 Re-add cron flag for autobackups 2024-08-25 16:17:41 -04:00
Joshua Boniface a957218976 Fix staging for summary report 2024-08-25 16:11:35 -04:00
Joshua Boniface 61365e6e01 Adjust autobackup snap name and output messages 2024-08-25 16:09:52 -04:00
Joshua Boniface 35fe16ce75 Revert "Adjust stage naming to reflect autobackup stages"
This reverts commit c1f320ede2.
2024-08-25 15:58:25 -04:00
Joshua Boniface c45e488958 Improve output of build-and-deploy 2024-08-25 15:57:07 -04:00
Joshua Boniface c1f320ede2 Adjust stage naming to reflect autobackup stages 2024-08-25 15:55:16 -04:00
Joshua Boniface 03db9604e1 Ensure recipients is a proper list 2024-08-25 15:55:00 -04:00
Joshua Boniface f1668bffcc Refactor autobackups to implement vm.worker defs
Avoid trying to subcall other Celery worker tasks, as this just gets
very screwy with the stages. Instead reimplement what is needed directly
here. While this does cause a fair bit of code duplication, I believe
the resulting clarity is worthwhile.
2024-08-25 15:54:03 -04:00
Joshua Boniface c0686fc5c7 Remove stage overrides
These aren't needed after pending refactor.
2024-08-25 15:17:46 -04:00
Joshua Boniface 7ecc05b413 Restart worker after becoming primary 2024-08-25 14:18:18 -04:00
Joshua Boniface 4b37c4fea3 Fix assignment bug 2024-08-25 14:10:59 -04:00
Joshua Boniface 0d918d66fe Port VM autobackups into pvcworkerd with snaps
Moves VM autobackups from being in-CLI to being handled by the
pvcworkerd system on the primary coordinator. Turns the CLI autobackup
command into an actual API client endpoint rather than having its logic
in the CLI.

In addition, modifies the new autobackup to leverage the new "pvc vm
snapshot" function set, just with special snapshot names. This helps
automate the process within the new snapshot scaffolding.
2024-08-23 17:23:06 -04:00
Joshua Boniface fd199f405b Add deprecation warning to pvc vm backup commands 2024-08-23 17:04:15 -04:00
Joshua Boniface f6c009beac Allow overriding stages in some commands
This allows them to be called by autobackup commands while still
preserving the current Celery report flow.
2024-08-23 11:21:02 -04:00
Joshua Boniface fc89f4f2f5 Fix error message contents 2024-08-23 10:23:51 -04:00
9 changed files with 999 additions and 518 deletions

View File

@ -3515,6 +3515,74 @@ class API_VM_Snapshot_Import(Resource):
api.add_resource(API_VM_Snapshot_Import, "/vm/<vm>/snapshot/import")
# /vm/autobackup
class API_VM_Autobackup_Root(Resource):
@RequestParser(
[
{"name": "force_full"},
{"name": "email_recipients"},
]
)
@Authenticator
def post(self, reqargs):
"""
Trigger a cluster autobackup job
---
tags:
- provisioner
parameters:
- in: query
name: force_full
type: boolean
required: false
description: If set and true, triggers a full autobackup regardless of schedule
- in: query
name: email_recipients
type: array
description: A list of email addresses to send failure and report emails to
items:
type: string
example: "user@domain.tld"
responses:
200:
description: OK
schema:
type: object
properties:
task_id:
type: string
description: Task ID for the provisioner Celery worker
400:
description: Bad request
schema:
type: object
id: Message
"""
email_recipients = reqargs.get("email_recipients", None)
if email_recipients is not None and not isinstance(email_recipients, list):
email_recipients = [email_recipients]
task = run_celery_task(
"cluster.autobackup",
force_full=bool(strtobool(reqargs.get("force_full", "false"))),
email_recipients=email_recipients,
run_on="primary",
)
return (
{
"task_id": task.id,
"task_name": "cluster.autobackup",
"run_on": f"{get_primary_node()} (primary)",
},
202,
{"Location": Api.url_for(api, API_Tasks_Element, task_id=task.id)},
)
api.add_resource(API_VM_Autobackup_Root, "/vm/autobackup")
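As a hedged illustration of calling this new endpoint, the request below assumes the API's prefix, port, and token-authentication header; all of these are placeholders and depend on the actual cluster configuration:

# Placeholder host, port, prefix, and auth header; force a full backup and send a report
curl -X POST \
  -H "X-Api-Key: ${PVC_API_KEY}" \
  "http://pvc.local:7370/api/v1/vm/autobackup?force_full=true&email_recipients=user@domain.tld"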
##########################################################
# Client API - Network
##########################################################
@ -5070,7 +5138,7 @@ class API_Storage_Ceph_Benchmark(Resource):
{
"task_id": task.id,
"task_name": "storage.benchmark",
"run_on": get_primary_node(),
"run_on": f"{get_primary_node()} (primary)",
},
202,
{"Location": Api.url_for(api, API_Tasks_Element, task_id=task.id)},
@ -9326,7 +9394,7 @@ class API_Provisioner_Create_Root(Resource):
{
"task_id": task.id,
"task_name": "provisioner.create",
"run_on": get_primary_node(),
"run_on": f"{get_primary_node()} (primary)",
},
202,
{"Location": Api.url_for(api, API_Tasks_Element, task_id=task.id)},

View File

@ -13,6 +13,7 @@ else
fi
KEEP_ARTIFACTS=""
API_ONLY=""
PRIMARY_NODE=""
if [[ -n ${1} ]]; then
for arg in ${@}; do
@ -21,6 +22,10 @@ if [[ -n ${1} ]]; then
KEEP_ARTIFACTS="y"
shift
;;
-a|--api-only)
API_ONLY="y"
shift
;;
-p=*|--become-primary=*)
PRIMARY_NODE=$( awk -F'=' '{ print $NF }' <<<"${arg}" )
shift
@ -75,6 +80,7 @@ for HOST in ${HOSTS[@]}; do
ssh $HOST $SUDO systemctl restart pvcapid &>/dev/null
sleep 2
ssh $HOST $SUDO systemctl restart pvcworkerd &>/dev/null
if [[ -z ${API_ONLY} ]]; then
sleep 2
ssh $HOST $SUDO systemctl restart pvchealthd &>/dev/null
sleep 2
@ -85,10 +91,13 @@ for HOST in ${HOSTS[@]}; do
sleep 5
echo -n "."
done
fi
echo " done."
if [[ -n ${PRIMARY_NODE} && ${PRIMARY_NODE} == ${HOST} ]]; then
echo -n ">>> "
echo -n ">>> Setting node $HOST to primary coordinator state... "
ssh $HOST pvc -q node primary
ssh $HOST pvc -q node primary --wait &>/dev/null
ssh $HOST $SUDO systemctl restart pvcworkerd &>/dev/null
echo "done."
fi
done
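A hedged usage sketch of the updated deploy script with the new flag (the script name and node name are illustrative):

# Rebuild and redeploy only the API and worker daemons, then promote hv1 to primary
./build-and-deploy.sh --api-only --become-primary=hv1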

View File

@ -724,6 +724,33 @@ def cli_node():
pass
###############################################################################
# > pvc node is-primary
###############################################################################
@click.command(
name="is-primary",
short_help="Check if this node is primary coordinator.",
)
@connection_req
@click.argument("node", default=DEFAULT_NODE_HOSTNAME)
def cli_node_is_primary(
node,
):
"""
Check if NODE (or this node if unset) is the current primary coordinator.
Designed for scripting; returns no visible data, but the return code is 0 if the node
is primary, and 1 if it is not.
"""
_, primary_node = pvc.lib.cluster.get_primary_node(CLI_CONFIG)
if primary_node == node:
exit(0)
else:
exit(1)
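A brief usage sketch of the new command in shell scripting (the node name is a placeholder; with no argument the local node is checked):

# The command prints nothing; its exit status carries the answer
if pvc node is-primary hv1; then
    echo "hv1 is the current primary coordinator"
fi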
###############################################################################
# > pvc node primary
###############################################################################
@ -1749,7 +1776,7 @@ def cli_vm_unmigrate(domain, wait, force_live):
is_flag=True,
default=True,
show_default=True,
help="Wait or don't wait for task to complete, showing progress",
help="Wait or don't wait for task to complete, showing progress if waiting",
)
def cli_vm_flush_locks(domain, wait_flag):
"""
@ -1793,7 +1820,7 @@ def cli_vm_snapshot():
is_flag=True,
default=True,
show_default=True,
help="Wait or don't wait for task to complete, showing progress",
help="Wait or don't wait for task to complete, showing progress if waiting",
)
def cli_vm_snapshot_create(domain, snapshot_name, wait_flag):
"""
@ -1827,7 +1854,7 @@ def cli_vm_snapshot_create(domain, snapshot_name, wait_flag):
is_flag=True,
default=True,
show_default=True,
help="Wait or don't wait for task to complete, showing progress",
help="Wait or don't wait for task to complete, showing progress if waiting",
)
@confirm_opt("Remove snapshot {snapshot_name} of VM {domain}")
def cli_vm_snapshot_remove(domain, snapshot_name, wait_flag):
@ -1860,7 +1887,7 @@ def cli_vm_snapshot_remove(domain, snapshot_name, wait_flag):
is_flag=True,
default=True,
show_default=True,
help="Wait or don't wait for task to complete, showing progress",
help="Wait or don't wait for task to complete, showing progress if waiting",
)
@confirm_opt(
"Roll back to snapshot {snapshot_name} of {domain} and lose all data and changes since this snapshot"
@ -1903,7 +1930,7 @@ def cli_vm_snapshot_rollback(domain, snapshot_name, wait_flag):
is_flag=True,
default=True,
show_default=True,
help="Wait or don't wait for task to complete, showing progress",
help="Wait or don't wait for task to complete, showing progress if waiting",
)
def cli_vm_snapshot_export(
domain, snapshot_name, export_path, incremental_parent, wait_flag
@ -1957,7 +1984,7 @@ def cli_vm_snapshot_export(
is_flag=True,
default=True,
show_default=True,
help="Wait or don't wait for task to complete, showing progress",
help="Wait or don't wait for task to complete, showing progress if waiting",
)
def cli_vm_snapshot_import(
domain, snapshot_name, import_path, retain_snapshot, wait_flag
@ -2001,6 +2028,8 @@ def cli_vm_snapshot_import(
)
def cli_vm_backup():
"""
DEPRECATED: Use 'pvc vm snapshot' commands instead. 'pvc vm backup' commands will be removed in PVC 0.9.100.
Manage backups of VMs in a PVC cluster.
"""
pass
@ -2030,6 +2059,8 @@ def cli_vm_backup():
)
def cli_vm_backup_create(domain, backup_path, incremental_parent, retain_snapshot):
"""
DEPRECATED: Use 'pvc vm snapshot' commands instead. 'pvc vm backup' commands will be removed in PVC 0.9.100.
Create a backup of virtual machine DOMAIN to BACKUP_PATH on the cluster primary coordinator. DOMAIN may be a UUID or name.
BACKUP_PATH must be a valid absolute directory path on the cluster "primary" coordinator (see "pvc node list") allowing writes from the API daemon (normally running as "root"). The BACKUP_PATH should be a large storage volume, ideally a remotely mounted filesystem (e.g. NFS, SSHFS, etc.) or non-Ceph-backed disk; PVC does not handle this path, that is up to the administrator to configure and manage.
@ -2076,6 +2107,8 @@ def cli_vm_backup_create(domain, backup_path, incremental_parent, retain_snapsho
)
def cli_vm_backup_restore(domain, backup_datestring, backup_path, retain_snapshot):
"""
DEPRECATED: Use 'pvc vm snapshot' commands instead. 'pvc vm backup' commands will be removed in PVC 0.9.100.
Restore the backup BACKUP_DATESTRING of virtual machine DOMAIN stored in BACKUP_PATH on the cluster primary coordinator. DOMAIN may be a UUID or name.
BACKUP_PATH must be a valid absolute directory path on the cluster "primary" coordinator (see "pvc node list") allowing reads from the API daemon (normally running as "root"). The BACKUP_PATH should be a large storage volume, ideally a remotely mounted filesystem (e.g. NFS, SSHFS, etc.) or non-Ceph-backed disk; PVC does not handle this path, that is up to the administrator to configure and manage.
@ -2114,6 +2147,8 @@ def cli_vm_backup_restore(domain, backup_datestring, backup_path, retain_snapsho
@click.argument("backup_path") @click.argument("backup_path")
def cli_vm_backup_remove(domain, backup_datestring, backup_path): def cli_vm_backup_remove(domain, backup_datestring, backup_path):
""" """
DEPRECATED: Use 'pvc vm snapshot' commands instead. 'pvc vm backup' commands will be removed in PVC 0.9.100.
Remove the backup BACKUP_DATESTRING, including snapshots, of virtual machine DOMAIN stored in BACKUP_PATH on the cluster primary coordinator. DOMAIN may be a UUID or name. Remove the backup BACKUP_DATESTRING, including snapshots, of virtual machine DOMAIN stored in BACKUP_PATH on the cluster primary coordinator. DOMAIN may be a UUID or name.
WARNING: Removing an incremental parent will invalidate any existing incremental backups based on that backup. WARNING: Removing an incremental parent will invalidate any existing incremental backups based on that backup.
@ -2141,15 +2176,6 @@ def cli_vm_backup_remove(domain, backup_datestring, backup_path):
name="autobackup", short_help="Perform automatic virtual machine backups." name="autobackup", short_help="Perform automatic virtual machine backups."
) )
@connection_req @connection_req
@click.option(
"-f",
"--configuration",
"autobackup_cfgfile",
envvar="PVC_AUTOBACKUP_CFGFILE",
default=DEFAULT_AUTOBACKUP_FILENAME,
show_default=True,
help="Override default config file location.",
)
@click.option(
"--email-report",
"email_report",
@ -2163,39 +2189,42 @@ def cli_vm_backup_remove(domain, backup_datestring, backup_path):
is_flag=True,
help="Force all backups to be full backups this run.",
)
@click.option(
"--wait/--no-wait",
"wait_flag",
is_flag=True,
default=True,
show_default=True,
help="Wait or don't wait for task to complete, showing progress if waiting.",
)
@click.option(
"--cron",
"cron_flag",
default=False,
is_flag=True,
help="Cron mode; don't error exit if this isn't the primary coordinator.",
default=False,
show_default=True,
help="Run in cron mode (returns immediately with no output once job is submitted).",
)
def cli_vm_autobackup(autobackup_cfgfile, email_report, force_full_flag, cron_flag):
def cli_vm_autobackup(email_report, force_full_flag, wait_flag, cron_flag):
""" """
Perform automated backups of VMs, with integrated cleanup and full/incremental scheduling. Perform automated backups of VMs, with integrated cleanup and full/incremental scheduling.
This command enables automatic backup of PVC VMs at the block level, leveraging the various "pvc vm backup" This command enables automatic backup of PVC VMs at the block level, leveraging the various "pvc vm snapshot"
functions with an internal rentention and cleanup system as well as determination of full vs. incremental functions with an internal rentention and cleanup system as well as determination of full vs. incremental
backups at different intervals. VMs are selected based on configured VM tags. The destination storage backups at different intervals. VMs are selected based on configured VM tags. The destination storage
may either be local, or provided by a remote filesystem which is automatically mounted and unmounted during may either be local, or provided by a remote filesystem which is automatically mounted and unmounted during
the backup run via a set of configured commands before and after the backup run. the backup run via a set of configured commands before and after the backup run.
NOTE: This command performs its tasks in a local context. It MUST be run from the cluster's active primary
coordinator using the "local" connection only; if either is not correct, the command will error.
NOTE: This command should be run as the same user as the API daemon, usually "root" with "sudo -E" or in
a cronjob as "root", to ensure permissions are correct on the backup files. Failure to do so will still take
the backup, but the state update write will likely fail and the backup will become untracked. The command
will prompt for confirmation if it is found not to be running as "root" and this cannot be bypassed.
This command should be run from cron or a timer at a regular interval (e.g. daily, hourly, etc.) which defines
how often backups are taken. Backup format (full/incremental) and retention is based only on the number of
recorded backups, not on the time interval between them. Backups taken manually outside of the "autobackup"
recorded backups, not on the time interval between them. Exports taken manually outside of the "autobackup"
command are not counted towards the format or retention of autobackups.
The PVC_AUTOBACKUP_CFGFILE envvar or "-f"/"--configuration" option can be used to override the default
configuration file path if required by a particular run. For full details of the possible options, please
see the example configuration file at "/usr/share/pvc/autobackup.sample.yaml".
WARNING: Running this command manually will interfere with the schedule! Do not run manually except for testing.
The actual details of the autobackup, including retention policies, full-vs-incremental, pre- and post- run
mounting/unmounting commands, etc. are defined in the main PVC configuration file `/etc/pvc/pvc.conf`. See
the sample configuration for more details.
An optional report on all current backups can be emailed to one or more email addresses using the
"--email-report" flag. This report will include information on all current known backups.
@ -2204,11 +2233,29 @@ def cli_vm_autobackup(autobackup_cfgfile, email_report, force_full_flag, cron_fl
which can help synchronize the backups of existing VMs with new ones.
"""
# All work here is done in the helper function for portability; we don't even use "finish"
vm_autobackup(
CLI_CONFIG, autobackup_cfgfile, email_report, force_full_flag, cron_flag
)
if cron_flag:
wait_flag = False
if email_report is not None:
email_recipients = email_report.split(",")
else:
email_recipients = None
retcode, retmsg = pvc.lib.vm.vm_autobackup(
CLI_CONFIG,
email_recipients=email_recipients,
force_full_flag=force_full_flag,
wait_flag=wait_flag,
)
if retcode and wait_flag:
retmsg = wait_for_celery_task(CLI_CONFIG, retmsg)
if cron_flag:
finish(retcode, None)
else:
finish(retcode, retmsg)
###############################################################################
# > pvc vm tag
@ -3714,7 +3761,7 @@ def cli_storage_benchmark():
is_flag=True,
default=True,
show_default=True,
help="Wait or don't wait for task to complete, showing progress",
help="Wait or don't wait for task to complete, showing progress if waiting",
)
@confirm_opt(
"Storage benchmarks take approximately 10 minutes to run and generate significant load on the cluster; they should be run sparingly. Continue"
@ -3803,7 +3850,7 @@ def cli_storage_osd():
is_flag=True,
default=True,
show_default=True,
help="Wait or don't wait for task to complete, showing progress",
help="Wait or don't wait for task to complete, showing progress if waiting",
)
@confirm_opt(
"Destroy all data on and create a new OSD database volume group on node {node} device {device}"
@ -3878,7 +3925,7 @@ def cli_storage_osd_create_db_vg(node, device, wait_flag):
is_flag=True,
default=True,
show_default=True,
help="Wait or don't wait for task to complete, showing progress",
help="Wait or don't wait for task to complete, showing progress if waiting",
)
@confirm_opt("Destroy all data on and create new OSD(s) on node {node} device {device}")
def cli_storage_osd_add(
@ -3961,7 +4008,7 @@ def cli_storage_osd_add(
is_flag=True,
default=True,
show_default=True,
help="Wait or don't wait for task to complete, showing progress",
help="Wait or don't wait for task to complete, showing progress if waiting",
)
@confirm_opt(
"Destroy all data on and replace OSD {osdid} (and peer split OSDs) with new device {new_device}"
@ -4016,7 +4063,7 @@ def cli_storage_osd_replace(
is_flag=True,
default=True,
show_default=True,
help="Wait or don't wait for task to complete, showing progress",
help="Wait or don't wait for task to complete, showing progress if waiting",
)
@confirm_opt("Refresh OSD {osdid} (and peer split OSDs) on device {device}")
def cli_storage_osd_refresh(osdid, device, wait_flag):
@ -4061,7 +4108,7 @@ def cli_storage_osd_refresh(osdid, device, wait_flag):
is_flag=True,
default=True,
show_default=True,
help="Wait or don't wait for task to complete, showing progress",
help="Wait or don't wait for task to complete, showing progress if waiting",
)
@confirm_opt("Remove and destroy data on OSD {osdid}")
def cli_storage_osd_remove(osdid, force_flag, wait_flag):
@ -6113,7 +6160,7 @@ def cli_provisioner_profile_list(limit, format_function):
is_flag=True,
default=True,
show_default=True,
help="Wait or don't wait for task to complete, showing progress",
help="Wait or don't wait for task to complete, showing progress if waiting",
)
def cli_provisioner_create(
name, profile, define_flag, start_flag, script_args, wait_flag
@ -6505,6 +6552,7 @@ def cli(
# Click command tree
###############################################################################
cli_node.add_command(cli_node_is_primary)
cli_node.add_command(cli_node_primary)
cli_node.add_command(cli_node_secondary)
cli_node.add_command(cli_node_flush)

View File

@ -20,26 +20,16 @@
###############################################################################
from click import echo as click_echo
from click import confirm
from datetime import datetime
from distutils.util import strtobool
from getpass import getuser
from json import load as jload
from json import dump as jdump
from os import chmod, environ, getpid, path, popen, makedirs, get_terminal_size
from os import chmod, environ, getpid, path, get_terminal_size
from re import findall
from socket import gethostname
from subprocess import run, PIPE
from sys import argv
from syslog import syslog, openlog, closelog, LOG_AUTH
from yaml import load as yload
from yaml import SafeLoader
import pvc.lib.provisioner
import pvc.lib.vm
import pvc.lib.node
import pvc.lib.storage
DEFAULT_STORE_DATA = {"cfgfile": "/etc/pvc/pvc.conf"}
DEFAULT_STORE_FILENAME = "pvc.json"
@ -196,452 +186,3 @@ def update_store(store_path, store_data):
with open(store_file, "w") as fh:
jdump(store_data, fh, sort_keys=True, indent=4)
def get_autobackup_config(CLI_CONFIG, cfgfile):
try:
config = dict()
with open(cfgfile) as fh:
full_config = yload(fh, Loader=SafeLoader)
backup_config = full_config["autobackup"]
config["backup_root_path"] = backup_config["backup_root_path"]
config["backup_root_suffix"] = backup_config["backup_root_suffix"]
config["backup_tags"] = backup_config["backup_tags"]
config["backup_schedule"] = backup_config["backup_schedule"]
config["auto_mount_enabled"] = backup_config["auto_mount"]["enabled"]
if config["auto_mount_enabled"]:
config["mount_cmds"] = list()
_mount_cmds = backup_config["auto_mount"]["mount_cmds"]
for _mount_cmd in _mount_cmds:
if "{backup_root_path}" in _mount_cmd:
_mount_cmd = _mount_cmd.format(
backup_root_path=backup_config["backup_root_path"]
)
config["mount_cmds"].append(_mount_cmd)
config["unmount_cmds"] = list()
_unmount_cmds = backup_config["auto_mount"]["unmount_cmds"]
for _unmount_cmd in _unmount_cmds:
if "{backup_root_path}" in _unmount_cmd:
_unmount_cmd = _unmount_cmd.format(
backup_root_path=backup_config["backup_root_path"]
)
config["unmount_cmds"].append(_unmount_cmd)
except FileNotFoundError:
return "Backup configuration does not exist!"
except KeyError as e:
return f"Backup configuration is invalid: {e}"
return config
def vm_autobackup(
CLI_CONFIG,
autobackup_cfgfile=DEFAULT_AUTOBACKUP_FILENAME,
email_report=None,
force_full_flag=False,
cron_flag=False,
):
"""
Perform automatic backups of VMs based on an external config file.
"""
backup_summary = dict()
if email_report is not None:
from email.utils import formatdate
from socket import gethostname
try:
with open(autobackup_cfgfile) as fh:
tmp_config = yload(fh, Loader=SafeLoader)
cluster = tmp_config["cluster"]["name"]
except Exception:
cluster = "unknown"
def send_execution_failure_report(error=None):
echo(CLI_CONFIG, f"Sending email failure report to {email_report}")
current_datetime = datetime.now()
email_datetime = formatdate(float(current_datetime.strftime("%s")))
email = list()
email.append(f"Date: {email_datetime}")
email.append(f"Subject: PVC Autobackup execution failure for cluster {cluster}")
recipients = list()
for recipient in email_report.split(","):
recipients.append(f"<{recipient}>")
email.append(f"To: {', '.join(recipients)}")
email.append(f"From: PVC Autobackup System <pvc@{gethostname()}>")
email.append("")
email.append(
f"A PVC autobackup has FAILED at {current_datetime} due to an execution error."
)
email.append("")
email.append("The reported error message is:")
email.append(f" {error}")
try:
p = popen("/usr/sbin/sendmail -t", "w")
p.write("\n".join(email))
p.close()
except Exception as e:
echo(CLI_CONFIG, f"Failed to send report email: {e}")
# Validate that we are running on the current primary coordinator of the 'local' cluster connection
real_connection = CLI_CONFIG["connection"]
CLI_CONFIG["connection"] = "local"
retcode, retdata = pvc.lib.node.node_info(CLI_CONFIG, DEFAULT_NODE_HOSTNAME)
if not retcode or retdata.get("coordinator_state") != "primary":
if cron_flag:
echo(
CLI_CONFIG,
"Current host is not the primary coordinator of the local cluster and running in cron mode. Exiting cleanly.",
)
exit(0)
else:
echo(
CLI_CONFIG,
f"ERROR: Current host is not the primary coordinator of the local cluster; got connection '{real_connection}', host '{DEFAULT_NODE_HOSTNAME}'.",
)
echo(
CLI_CONFIG,
"Autobackup MUST be run from the cluster active primary coordinator using the 'local' connection. See '-h'/'--help' for details.",
)
if email_report is not None:
send_execution_failure_report(
error=f"Autobackup run attempted from non-local connection or non-primary coordinator; got connection '{real_connection}', host '{DEFAULT_NODE_HOSTNAME}'."
)
exit(1)
# Ensure we're running as root, or show a warning & confirmation
if getuser() != "root":
confirm(
"WARNING: You are not running this command as 'root'. This command should be run under the same user as the API daemon, which is usually 'root'. Are you sure you want to continue?",
prompt_suffix=" ",
abort=True,
)
# Load our YAML config
autobackup_config = get_autobackup_config(CLI_CONFIG, autobackup_cfgfile)
if not isinstance(autobackup_config, dict):
echo(CLI_CONFIG, f"ERROR: {autobackup_config}")
if email_report is not None:
send_execution_failure_report(error=f"{autobackup_config}")
exit(1)
# Get the start time of this run
autobackup_start_time = datetime.now()
# Get a list of all VMs on the cluster
# We don't do tag filtering here, because we could match an arbitrary number of tags; instead, we
# parse the list after
retcode, retdata = pvc.lib.vm.vm_list(CLI_CONFIG, None, None, None, None, None)
if not retcode:
echo(CLI_CONFIG, f"ERROR: Failed to fetch VM list: {retdata}")
if email_report is not None:
send_execution_failure_report(error=f"Failed to fetch VM list: {retdata}")
exit(1)
cluster_vms = retdata
# Parse the list to match tags; too complex for list comprehension alas
backup_vms = list()
for vm in cluster_vms:
vm_tag_names = [t["name"] for t in vm["tags"]]
matching_tags = (
True
if len(
set(vm_tag_names).intersection(set(autobackup_config["backup_tags"]))
)
> 0
else False
)
if matching_tags:
backup_vms.append(vm["name"])
if len(backup_vms) < 1:
echo(CLI_CONFIG, "Found no suitable VMs for autobackup.")
exit(0)
# Pretty print the names of the VMs we'll back up (to stderr)
maxnamelen = max([len(n) for n in backup_vms]) + 2
cols = 1
while (cols * maxnamelen + maxnamelen + 2) <= MAX_CONTENT_WIDTH:
cols += 1
rows = len(backup_vms) // cols
vm_list_rows = list()
for row in range(0, rows + 1):
row_start = row * cols
row_end = (row * cols) + cols
row_str = ""
for x in range(row_start, row_end):
if x < len(backup_vms):
row_str += "{:<{}}".format(backup_vms[x], maxnamelen)
vm_list_rows.append(row_str)
echo(CLI_CONFIG, f"Found {len(backup_vms)} suitable VM(s) for autobackup.")
echo(CLI_CONFIG, "Full VM list:", stderr=True)
echo(CLI_CONFIG, " {}".format("\n ".join(vm_list_rows)), stderr=True)
echo(CLI_CONFIG, "", stderr=True)
if autobackup_config["auto_mount_enabled"]:
# Execute each mount_cmds command in sequence
for cmd in autobackup_config["mount_cmds"]:
echo(
CLI_CONFIG,
f"Executing mount command '{cmd.split()[0]}'... ",
newline=False,
)
tstart = datetime.now()
ret = run(
cmd.split(),
stdout=PIPE,
stderr=PIPE,
)
tend = datetime.now()
ttot = tend - tstart
if ret.returncode != 0:
echo(
CLI_CONFIG,
f"failed. [{ttot.seconds}s]",
)
echo(
CLI_CONFIG,
f"Exiting; command reports: {ret.stderr.decode().strip()}",
)
if email_report is not None:
send_execution_failure_report(error=ret.stderr.decode().strip())
exit(1)
else:
echo(CLI_CONFIG, f"done. [{ttot.seconds}s]")
# For each VM, perform the backup
for vm in backup_vms:
backup_suffixed_path = f"{autobackup_config['backup_root_path']}{autobackup_config['backup_root_suffix']}"
if not path.exists(backup_suffixed_path):
makedirs(backup_suffixed_path)
backup_path = f"{backup_suffixed_path}/{vm}"
autobackup_state_file = f"{backup_path}/.autobackup.json"
if not path.exists(backup_path) or not path.exists(autobackup_state_file):
# There are no new backups so the list is empty
state_data = dict()
tracked_backups = list()
else:
with open(autobackup_state_file) as fh:
state_data = jload(fh)
tracked_backups = state_data["tracked_backups"]
full_interval = autobackup_config["backup_schedule"]["full_interval"]
full_retention = autobackup_config["backup_schedule"]["full_retention"]
full_backups = [b for b in tracked_backups if b["type"] == "full"]
if len(full_backups) > 0:
last_full_backup = full_backups[0]
last_full_backup_idx = tracked_backups.index(last_full_backup)
if force_full_flag:
this_backup_type = "forced-full"
this_backup_incremental_parent = None
this_backup_retain_snapshot = True
elif last_full_backup_idx >= full_interval - 1:
this_backup_type = "full"
this_backup_incremental_parent = None
this_backup_retain_snapshot = True
else:
this_backup_type = "incremental"
this_backup_incremental_parent = last_full_backup["datestring"]
this_backup_retain_snapshot = False
else:
# The very first backup must be full to start the tree
this_backup_type = "full"
this_backup_incremental_parent = None
this_backup_retain_snapshot = True
# Perform the backup
echo(
CLI_CONFIG,
f"Backing up VM '{vm}' ({this_backup_type})... ",
newline=False,
)
tstart = datetime.now()
retcode, retdata = pvc.lib.vm.vm_backup(
CLI_CONFIG,
vm,
backup_suffixed_path,
incremental_parent=this_backup_incremental_parent,
retain_snapshot=this_backup_retain_snapshot,
)
tend = datetime.now()
ttot = tend - tstart
if not retcode:
backup_datestring = findall(r"[0-9]{14}", retdata)[0]
echo(CLI_CONFIG, f"failed. [{ttot.seconds}s]")
echo(
CLI_CONFIG,
retdata.strip().replace(f"ERROR in backup {backup_datestring}: ", ""),
)
skip_cleanup = True
else:
backup_datestring = findall(r"[0-9]{14}", retdata)[0]
echo(
CLI_CONFIG,
f"done. Backup '{backup_datestring}' created. [{ttot.seconds}s]",
)
skip_cleanup = False
# Read backup file to get details
backup_json_file = f"{backup_path}/{backup_datestring}/pvcbackup.json"
with open(backup_json_file) as fh:
backup_json = jload(fh)
tracked_backups.insert(0, backup_json)
# Delete any full backups that are expired
marked_for_deletion = list()
found_full_count = 0
for backup in tracked_backups:
if backup["type"] == "full":
found_full_count += 1
if found_full_count > full_retention:
marked_for_deletion.append(backup)
# Delete any incremental backups that depend on marked parents
for backup in tracked_backups:
if backup["type"] == "incremental" and backup["incremental_parent"] in [
b["datestring"] for b in marked_for_deletion
]:
marked_for_deletion.append(backup)
if len(marked_for_deletion) > 0:
if skip_cleanup:
echo(
CLI_CONFIG,
f"Skipping cleanups for {len(marked_for_deletion)} aged-out backups due to backup failure.",
)
else:
echo(
CLI_CONFIG,
f"Running cleanups for {len(marked_for_deletion)} aged-out backups...",
)
# Execute deletes
for backup_to_delete in marked_for_deletion:
echo(
CLI_CONFIG,
f"Removing old VM '{vm}' backup '{backup_to_delete['datestring']}' ({backup_to_delete['type']})... ",
newline=False,
)
tstart = datetime.now()
retcode, retdata = pvc.lib.vm.vm_remove_backup(
CLI_CONFIG,
vm,
backup_suffixed_path,
backup_to_delete["datestring"],
)
tend = datetime.now()
ttot = tend - tstart
if not retcode:
echo(CLI_CONFIG, f"failed. [{ttot.seconds}s]")
echo(
CLI_CONFIG,
f"Skipping removal from tracked backups; command reports: {retdata}",
)
else:
tracked_backups.remove(backup_to_delete)
echo(CLI_CONFIG, f"done. [{ttot.seconds}s]")
# Update tracked state information
state_data["tracked_backups"] = tracked_backups
with open(autobackup_state_file, "w") as fh:
jdump(state_data, fh)
backup_summary[vm] = tracked_backups
if autobackup_config["auto_mount_enabled"]:
# Execute each unmount_cmds command in sequence
for cmd in autobackup_config["unmount_cmds"]:
echo(
CLI_CONFIG,
f"Executing unmount command '{cmd.split()[0]}'... ",
newline=False,
)
tstart = datetime.now()
ret = run(
cmd.split(),
stdout=PIPE,
stderr=PIPE,
)
tend = datetime.now()
ttot = tend - tstart
if ret.returncode != 0:
echo(
CLI_CONFIG,
f"failed. [{ttot.seconds}s]",
)
echo(
CLI_CONFIG,
f"Continuing; command reports: {ret.stderr.decode().strip()}",
)
else:
echo(CLI_CONFIG, f"done. [{ttot.seconds}s]")
autobackup_end_time = datetime.now()
autobackup_total_time = autobackup_end_time - autobackup_start_time
# Handle report emailing
if email_report is not None:
echo(CLI_CONFIG, "")
echo(CLI_CONFIG, f"Sending email summary report to {email_report}")
current_datetime = datetime.now()
email_datetime = formatdate(float(current_datetime.strftime("%s")))
email = list()
email.append(f"Date: {email_datetime}")
email.append(f"Subject: PVC Autobackup report for cluster {cluster}")
recipients = list()
for recipient in email_report.split(","):
recipients.append(f"<{recipient}>")
email.append(f"To: {', '.join(recipients)}")
email.append(f"From: PVC Autobackup System <pvc@{gethostname()}>")
email.append("")
email.append(
f"A PVC autobackup has been completed at {current_datetime} in {autobackup_total_time}."
)
email.append("")
email.append(
"The following is a summary of all current VM backups after cleanups, most recent first:"
)
email.append("")
for vm in backup_vms:
email.append(f"VM {vm}:")
for backup in backup_summary[vm]:
datestring = backup.get("datestring")
backup_date = datetime.strptime(datestring, "%Y%m%d%H%M%S")
if backup.get("result", False):
email.append(
f" {backup_date}: Success in {backup.get('runtime_secs', 0)} seconds, ID {datestring}, type {backup.get('type', 'unknown')}"
)
email.append(
f" Backup contains {len(backup.get('backup_files'))} files totaling {pvc.lib.storage.format_bytes_tohuman(backup.get('backup_size_bytes', 0))} ({backup.get('backup_size_bytes', 0)} bytes)"
)
else:
email.append(
f" {backup_date}: Failure in {backup.get('runtime_secs', 0)} seconds, ID {datestring}, type {backup.get('type', 'unknown')}"
)
email.append(
f" {backup.get('result_message')}"
)
try:
p = popen("/usr/sbin/sendmail -t", "w")
p.write("\n".join(email))
p.close()
except Exception as e:
echo(CLI_CONFIG, f"Failed to send report email: {e}")
echo(CLI_CONFIG, "")
echo(CLI_CONFIG, f"Autobackup completed in {autobackup_total_time}.")

View File

@ -595,6 +595,24 @@ def vm_import_snapshot(
return get_wait_retdata(response, wait_flag)
def vm_autobackup(config, email_recipients=None, force_full_flag=False, wait_flag=True):
"""
Perform a cluster VM autobackup
API endpoint: POST /vm/autobackup
API arguments: email_recipients=email_recipients, force_full_flag=force_full_flag
API schema: {"message":"{data}"}
"""
params = {
"email_recipients": email_recipients,
"force_full": force_full_flag,
}
response = call_api(config, "post", "/vm/autobackup", params=params)
return get_wait_retdata(response, wait_flag)
def vm_vcpus_set(config, vm, vcpus, topology, restart):
"""
Set the vCPU count of the VM with topology

daemon-common/autobackup.py (new file, 695 lines)
View File

@ -0,0 +1,695 @@
#!/usr/bin/env python3
# autobackup.py - PVC API Autobackup functions
# Part of the Parallel Virtual Cluster (PVC) system
#
# Copyright (C) 2018-2024 Joshua M. Boniface <joshua@boniface.me>
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, version 3.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.
#
###############################################################################
from datetime import datetime
from json import load as jload
from json import dump as jdump
from os import popen, makedirs, path, scandir
from shutil import rmtree
from subprocess import run, PIPE
from daemon_lib.common import run_os_command
from daemon_lib.config import get_autobackup_configuration
from daemon_lib.celery import start, fail, log_info, log_err, update, finish
import daemon_lib.ceph as ceph
import daemon_lib.vm as vm
def send_execution_failure_report(
celery_conf, config, recipients=None, total_time=0, error=None
):
if recipients is None:
return
from email.utils import formatdate
from socket import gethostname
log_message = f"Sending email failure report to {', '.join(recipients)}"
log_info(celery_conf[0], log_message)
update(
celery_conf[0],
log_message,
current=celery_conf[1] + 1,
total=celery_conf[2],
)
current_datetime = datetime.now()
email_datetime = formatdate(float(current_datetime.strftime("%s")))
email = list()
email.append(f"Date: {email_datetime}")
email.append(
f"Subject: PVC Autobackup execution failure for cluster '{config['cluster']}'"
)
email_to = list()
for recipient in recipients:
email_to.append(f"<{recipient}>")
email.append(f"To: {', '.join(email_to)}")
email.append(f"From: PVC Autobackup System <pvc@{gethostname()}>")
email.append("")
email.append(
f"A PVC autobackup has FAILED at {current_datetime} in {total_time}s due to an execution error."
)
email.append("")
email.append("The reported error message is:")
email.append(f" {error}")
try:
with popen("/usr/sbin/sendmail -t", "w") as p:
p.write("\n".join(email))
except Exception as e:
log_err(f"Failed to send report email: {e}")
def send_execution_summary_report(
celery_conf, config, recipients=None, total_time=0, summary=dict()
):
if recipients is None:
return
from email.utils import formatdate
from socket import gethostname
log_message = f"Sending email summary report to {', '.join(recipients)}"
log_info(celery_conf[0], log_message)
update(
celery_conf[0],
log_message,
current=celery_conf[1] + 1,
total=celery_conf[2],
)
current_datetime = datetime.now()
email_datetime = formatdate(float(current_datetime.strftime("%s")))
email = list()
email.append(f"Date: {email_datetime}")
email.append(f"Subject: PVC Autobackup report for cluster '{config['cluster']}'")
email_to = list()
for recipient in recipients:
email_to.append(f"<{recipient}>")
email.append(f"To: {', '.join(email_to)}")
email.append(f"From: PVC Autobackup System <pvc@{gethostname()}>")
email.append("")
email.append(
f"A PVC autobackup has been completed at {current_datetime} in {total_time}."
)
email.append("")
email.append(
"The following is a summary of all current VM backups after cleanups, most recent first:"
)
email.append("")
for vm_name in summary.keys():
email.append(f"VM: {vm_name}:")
for backup in summary[vm_name]:
datestring = backup.get("datestring")
backup_date = datetime.strptime(datestring, "%Y%m%d%H%M%S")
if backup.get("result", False):
email.append(
f" {backup_date}: Success in {backup.get('runtime_secs', 0)} seconds, ID {backup.get('snapshot_name')}, type {backup.get('type', 'unknown')}"
)
email.append(
f" Backup contains {len(backup.get('export_files'))} files totaling {ceph.format_bytes_tohuman(backup.get('export_size_bytes', 0))} ({backup.get('export_size_bytes', 0)} bytes)"
)
else:
email.append(
f" {backup_date}: Failure in {backup.get('runtime_secs', 0)} seconds, ID {backup.get('snapshot_name')}, type {backup.get('type', 'unknown')}"
)
email.append(f" {backup.get('result_message')}")
try:
with popen("/usr/sbin/sendmail -t", "w") as p:
p.write("\n".join(email))
except Exception as e:
log_err(f"Failed to send report email: {e}")
def run_vm_backup(zkhandler, celery, config, vm_detail, force_full=False):
vm_name = vm_detail["name"]
dom_uuid = vm_detail["uuid"]
backup_suffixed_path = f"{config['backup_root_path']}{config['backup_root_suffix']}"
vm_backup_path = f"{backup_suffixed_path}/{vm_name}"
autobackup_state_file = f"{vm_backup_path}/.autobackup.json"
full_interval = config["backup_schedule"]["full_interval"]
full_retention = config["backup_schedule"]["full_retention"]
if not path.exists(vm_backup_path) or not path.exists(autobackup_state_file):
# There are no existing backups so the list is empty
state_data = dict()
tracked_backups = list()
else:
with open(autobackup_state_file) as fh:
state_data = jload(fh)
tracked_backups = state_data["tracked_backups"]
full_backups = [b for b in tracked_backups if b["type"] == "full"]
if len(full_backups) > 0:
last_full_backup = full_backups[0]
last_full_backup_idx = tracked_backups.index(last_full_backup)
if force_full:
this_backup_incremental_parent = None
this_backup_retain_snapshot = True
elif last_full_backup_idx >= full_interval - 1:
this_backup_incremental_parent = None
this_backup_retain_snapshot = True
else:
this_backup_incremental_parent = last_full_backup["snapshot_name"]
this_backup_retain_snapshot = False
else:
# The very first backup must be full to start the tree
this_backup_incremental_parent = None
this_backup_retain_snapshot = True
export_type = (
"incremental" if this_backup_incremental_parent is not None else "full"
)
now = datetime.now()
datestring = now.strftime("%Y%m%d%H%M%S")
snapshot_name = f"ab{datestring}"
# Take the VM snapshot (vm.vm_worker_create_snapshot)
snap_list = list()
failure = False
export_files = None
export_files_size = 0
def update_tracked_backups():
# Read export file to get details
backup_json_file = (
f"{backup_suffixed_path}/{vm_name}/{snapshot_name}/snapshot.json"
)
try:
with open(backup_json_file) as fh:
backup_json = jload(fh)
tracked_backups.insert(0, backup_json)
except Exception as e:
log_err(celery, f"Could not open export JSON: {e}")
return list()
state_data["tracked_backups"] = tracked_backups
with open(autobackup_state_file, "w") as fh:
jdump(state_data, fh)
return tracked_backups
def write_backup_summary(success=False, message=""):
ttotal = (datetime.now() - now).total_seconds()
export_details = {
"type": export_type,
"result": success,
"message": message,
"datestring": datestring,
"runtime_secs": ttotal,
"snapshot_name": snapshot_name,
"incremental_parent": this_backup_incremental_parent,
"vm_detail": vm_detail,
"export_files": export_files,
"export_size_bytes": export_files_size,
}
try:
with open(
f"{backup_suffixed_path}/{vm_name}/{snapshot_name}/snapshot.json",
"w",
) as fh:
jdump(export_details, fh)
except Exception as e:
log_err(celery, f"Error exporting snapshot details: {e}")
return False, e
return True, ""
def cleanup_failure():
for snapshot in snap_list:
rbd, snapshot_name = snapshot.split("@")
pool, volume = rbd.split("/")
# We capture no output here, because if this fails too we're in a deep
# error chain and will just ignore it
ceph.remove_snapshot(zkhandler, pool, volume, snapshot_name)
rbd_list = zkhandler.read(("domain.storage.volumes", dom_uuid)).split(",")
for rbd in rbd_list:
pool, volume = rbd.split("/")
ret, msg = ceph.add_snapshot(
zkhandler, pool, volume, snapshot_name, zk_only=False
)
if not ret:
cleanup_failure()
error_message = msg.replace("ERROR: ", "")
log_err(celery, error_message)
failure = True
break
else:
snap_list.append(f"{pool}/{volume}@{snapshot_name}")
if failure:
error_message = (f"[{vm_name}] Error in snapshot export, skipping",)
write_backup_summary(message=error_message)
tracked_backups = update_tracked_backups()
return tracked_backups
# Get the current domain XML
vm_config = zkhandler.read(("domain.xml", dom_uuid))
# Add the snapshot entry to Zookeeper
ret = zkhandler.write(
[
(
(
"domain.snapshots",
dom_uuid,
"domain_snapshot.name",
snapshot_name,
),
snapshot_name,
),
(
(
"domain.snapshots",
dom_uuid,
"domain_snapshot.timestamp",
snapshot_name,
),
now.strftime("%s"),
),
(
(
"domain.snapshots",
dom_uuid,
"domain_snapshot.xml",
snapshot_name,
),
vm_config,
),
(
(
"domain.snapshots",
dom_uuid,
"domain_snapshot.rbd_snapshots",
snapshot_name,
),
",".join(snap_list),
),
]
)
if not ret:
error_message = (f"[{vm_name}] Error in snapshot export, skipping",)
log_err(celery, error_message)
write_backup_summary(message=error_message)
tracked_backups = update_tracked_backups()
return tracked_backups
# Export the snapshot (vm.vm_worker_export_snapshot)
export_target_path = f"{backup_suffixed_path}/{vm_name}/{snapshot_name}/images"
try:
makedirs(export_target_path)
except Exception as e:
error_message = (
f"[{vm_name}] Failed to create target directory '{export_target_path}': {e}",
)
log_err(celery, error_message)
return tracked_backups
def export_cleanup():
from shutil import rmtree
rmtree(f"{backup_suffixed_path}/{vm_name}/{snapshot_name}")
# Set the export filetype
if this_backup_incremental_parent is not None:
export_fileext = "rbddiff"
else:
export_fileext = "rbdimg"
snapshot_volumes = list()
for rbdsnap in snap_list:
pool, _volume = rbdsnap.split("/")
volume, name = _volume.split("@")
ret, snapshots = ceph.get_list_snapshot(
zkhandler, pool, volume, limit=name, is_fuzzy=False
)
if ret:
snapshot_volumes += snapshots
export_files = list()
for snapshot_volume in snapshot_volumes:
snap_pool = snapshot_volume["pool"]
snap_volume = snapshot_volume["volume"]
snap_snapshot_name = snapshot_volume["snapshot"]
snap_size = snapshot_volume["stats"]["size"]
if this_backup_incremental_parent is not None:
retcode, stdout, stderr = run_os_command(
f"rbd export-diff --from-snap {this_backup_incremental_parent} {snap_pool}/{snap_volume}@{snap_snapshot_name} {export_target_path}/{snap_pool}.{snap_volume}.{export_fileext}"
)
if retcode:
error_message = (
f"[{vm_name}] Failed to export snapshot for volume(s) '{snap_pool}/{snap_volume}'",
)
failure = True
break
else:
export_files.append(
(
f"images/{snap_pool}.{snap_volume}.{export_fileext}",
snap_size,
)
)
else:
retcode, stdout, stderr = run_os_command(
f"rbd export --export-format 2 {snap_pool}/{snap_volume}@{snap_snapshot_name} {export_target_path}/{snap_pool}.{snap_volume}.{export_fileext}"
)
if retcode:
error_message = (
f"[{vm_name}] Failed to export snapshot for volume(s) '{snap_pool}/{snap_volume}'",
)
failure = True
break
else:
export_files.append(
(
f"images/{snap_pool}.{snap_volume}.{export_fileext}",
snap_size,
)
)
if failure:
log_err(celery, error_message)
write_backup_summary(message=error_message)
tracked_backups = update_tracked_backups()
return tracked_backups
def get_dir_size(pathname):
total = 0
with scandir(pathname) as it:
for entry in it:
if entry.is_file():
total += entry.stat().st_size
elif entry.is_dir():
total += get_dir_size(entry.path)
return total
export_files_size = get_dir_size(export_target_path)
ret, e = write_backup_summary(success=True)
if not ret:
error_message = (f"[{vm_name}] Failed to export configuration snapshot: {e}",)
log_err(celery, error_message)
write_backup_summary(message=error_message)
tracked_backups = update_tracked_backups()
return tracked_backups
# Clean up the snapshot (vm.vm_worker_remove_snapshot)
if not this_backup_retain_snapshot:
for snap in snap_list:
rbd, name = snap.split("@")
pool, volume = rbd.split("/")
ret, msg = ceph.remove_snapshot(zkhandler, pool, volume, name)
if not ret:
error_message = msg.replace("ERROR: ", f"[{vm_name}] ")
failure = True
break
if failure:
log_err(celery, error_message)
write_backup_summary(message=error_message)
tracked_backups = update_tracked_backups()
return tracked_backups
ret = zkhandler.delete(
("domain.snapshots", dom_uuid, "domain_snapshot.name", snapshot_name)
)
if not ret:
error_message = (f"[{vm_name}] Failed to remove VM snapshot; continuing",)
log_err(celery, error_message)
marked_for_deletion = list()
# Find any full backups that are expired
found_full_count = 0
for backup in tracked_backups:
if backup["type"] == "full":
found_full_count += 1
if found_full_count > full_retention:
marked_for_deletion.append(backup)
# Find any incremental backups that depend on marked parents
for backup in tracked_backups:
if backup["type"] == "incremental" and backup["incremental_parent"] in [
b["snapshot_name"] for b in marked_for_deletion
]:
marked_for_deletion.append(backup)
if len(marked_for_deletion) > 0:
for backup_to_delete in marked_for_deletion:
ret = vm.vm_worker_remove_snapshot(
zkhandler, None, vm_name, backup_to_delete["snapshot_name"]
)
if ret is False:
error_message = f"Failed to remove obsolete backup snapshot '{backup_to_delete['snapshot_name']}', leaving in tracked backups"
log_err(celery, error_message)
else:
rmtree(f"{vm_backup_path}/{backup_to_delete['snapshot_name']}")
tracked_backups.remove(backup_to_delete)
tracked_backups = update_tracked_backups()
return tracked_backups
def worker_cluster_autobackup(
zkhandler, celery, force_full=False, email_recipients=None
):
config = get_autobackup_configuration()
backup_summary = dict()
current_stage = 0
total_stages = 1
if email_recipients is not None:
total_stages += 1
start(
celery,
f"Starting cluster '{config['cluster']}' VM autobackup",
current=current_stage,
total=total_stages,
)
if not config["autobackup_enabled"]:
message = "Autobackups are not configured on this cluster."
log_info(celery, message)
return finish(
celery,
message,
current=total_stages,
total=total_stages,
)
    autobackup_start_time = datetime.now()

    retcode, vm_list = vm.get_list(zkhandler)
    if not retcode:
        error_message = f"Failed to fetch VM list: {vm_list}"
        log_err(celery, error_message)
        send_execution_failure_report(
            (celery, current_stage, total_stages),
            config,
            recipients=email_recipients,
            error=error_message,
        )
        fail(celery, error_message)
        return False

    backup_suffixed_path = f"{config['backup_root_path']}{config['backup_root_suffix']}"
    if not path.exists(backup_suffixed_path):
        makedirs(backup_suffixed_path)

    full_interval = config["backup_schedule"]["full_interval"]

    backup_vms = list()
    for vm_detail in vm_list:
        vm_tag_names = [t["name"] for t in vm_detail["tags"]]
        matching_tags = (
            True
            if len(set(vm_tag_names).intersection(set(config["backup_tags"]))) > 0
            else False
        )
        if matching_tags:
            backup_vms.append(vm_detail)

    if len(backup_vms) < 1:
        message = "Found no VMs tagged for autobackup."
        log_info(celery, message)
        return finish(
            celery,
            message,
            current=total_stages,
            total=total_stages,
        )

    if config["auto_mount_enabled"]:
        total_stages += len(config["mount_cmds"])
        total_stages += len(config["unmount_cmds"])
    total_stages += len(backup_vms)

    log_info(
        celery,
        f"Found {len(backup_vms)} suitable VM(s) for autobackup: {', '.join([b['name'] for b in backup_vms])}",
    )
    # Handle automount mount commands
    if config["auto_mount_enabled"]:
        for cmd in config["mount_cmds"]:
            current_stage += 1
            update(
                celery,
                f"Executing mount command '{cmd.split()[0]}'",
                current=current_stage,
                total=total_stages,
            )

            ret = run(
                cmd.split(),
                stdout=PIPE,
                stderr=PIPE,
            )

            if ret.returncode != 0:
                error_message = f"Failed to execute mount command '{cmd.split()[0]}': {ret.stderr.decode().strip()}"
                log_err(celery, error_message)
                send_execution_failure_report(
                    (celery, current_stage, total_stages),
                    config,
                    recipients=email_recipients,
                    total_time=datetime.now() - autobackup_start_time,
                    error=error_message,
                )
                fail(celery, error_message)
                return False
    # Execute the backup: take a snapshot, then export the snapshot
    for vm_detail in backup_vms:
        vm_backup_path = f"{backup_suffixed_path}/{vm_detail['name']}"
        autobackup_state_file = f"{vm_backup_path}/.autobackup.json"
        if not path.exists(vm_backup_path) or not path.exists(autobackup_state_file):
            # There are no existing backups so the list is empty
            state_data = dict()
            tracked_backups = list()
        else:
            with open(autobackup_state_file) as fh:
                state_data = jload(fh)
            tracked_backups = state_data["tracked_backups"]

        full_backups = [b for b in tracked_backups if b["type"] == "full"]
        if len(full_backups) > 0:
            last_full_backup = full_backups[0]
            last_full_backup_idx = tracked_backups.index(last_full_backup)
            if force_full:
                this_backup_incremental_parent = None
            elif last_full_backup_idx >= full_interval - 1:
                this_backup_incremental_parent = None
            else:
                this_backup_incremental_parent = last_full_backup["snapshot_name"]
        else:
            # The very first backup must be full to start the tree
            this_backup_incremental_parent = None

        export_type = (
            "incremental" if this_backup_incremental_parent is not None else "full"
        )

        current_stage += 1
        update(
            celery,
            f"Performing autobackup of VM {vm_detail['name']} ({export_type})",
            current=current_stage,
            total=total_stages,
        )

        summary = run_vm_backup(
            zkhandler,
            celery,
            config,
            vm_detail,
            force_full=force_full,
        )
        backup_summary[vm_detail["name"]] = summary
    # Handle automount unmount commands
    if config["auto_mount_enabled"]:
        for cmd in config["unmount_cmds"]:
            current_stage += 1
            update(
                celery,
                f"Executing unmount command '{cmd.split()[0]}'",
                current=current_stage,
                total=total_stages,
            )

            ret = run(
                cmd.split(),
                stdout=PIPE,
                stderr=PIPE,
            )

            if ret.returncode != 0:
                error_message = f"Failed to execute unmount command '{cmd.split()[0]}': {ret.stderr.decode().strip()}"
                log_err(celery, error_message)
                send_execution_failure_report(
                    (celery, current_stage, total_stages),
                    config,
                    recipients=email_recipients,
                    total_time=datetime.now() - autobackup_start_time,
                    error=error_message,
                )
                fail(celery, error_message)
                return False
    autobackup_end_time = datetime.now()
    autobackup_total_time = autobackup_end_time - autobackup_start_time

    if email_recipients is not None:
        send_execution_summary_report(
            (celery, current_stage, total_stages),
            config,
            recipients=email_recipients,
            total_time=autobackup_total_time,
            summary=backup_summary,
        )
        current_stage += 1

    current_stage += 1
    return finish(
        celery,
        f"Successfully completed cluster '{config['cluster']}' VM autobackup",
        current=current_stage,
        total=total_stages,
    )
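
A brief worked example of the stage accounting above (editor's sketch; the email recipients, mount commands, unmount commands, and VM names are hypothetical). The total is one base stage, plus one stage if an email report is requested, plus one stage per mount and unmount command when auto-mount is enabled, plus one stage per tagged VM:

# Hypothetical inputs for illustration only
email_recipients = ["ops@example.com"]
mount_cmds = ["/bin/mount /dev/backup-vg/backups /srv/backups"]
unmount_cmds = ["/bin/umount /srv/backups"]
backup_vms = ["vm1", "vm2", "vm3"]

total_stages = 1
if email_recipients is not None:
    total_stages += 1
total_stages += len(mount_cmds) + len(unmount_cmds)
total_stages += len(backup_vms)
print(total_stages)  # 1 + 1 + 1 + 1 + 3 = 7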


@@ -406,6 +406,78 @@ def get_configuration():
     return config


+def get_parsed_autobackup_configuration(config_file):
+    """
+    Load the configuration; this is the same main pvc.conf that the daemons read
+    """
+    print('Loading configuration from file "{}"'.format(config_file))
+
+    with open(config_file, "r") as cfgfh:
+        try:
+            o_config = yaml.load(cfgfh, Loader=yaml.SafeLoader)
+        except Exception as e:
+            print(f"ERROR: Failed to parse configuration file: {e}")
+            os._exit(1)
+
+    config = dict()
+
+    try:
+        o_cluster = o_config["cluster"]
+        config_cluster = {
+            "cluster": o_cluster["name"],
+            "autobackup_enabled": True,
+        }
+        config = {**config, **config_cluster}
+
+        o_autobackup = o_config["autobackup"]
+        if o_autobackup is None:
+            config["autobackup_enabled"] = False
+            return config
+
+        config_autobackup = {
+            "backup_root_path": o_autobackup["backup_root_path"],
+            "backup_root_suffix": o_autobackup["backup_root_suffix"],
+            "backup_tags": o_autobackup["backup_tags"],
+            "backup_schedule": o_autobackup["backup_schedule"],
+        }
+        config = {**config, **config_autobackup}
+
+        o_automount = o_autobackup["auto_mount"]
+        config_automount = {
+            "auto_mount_enabled": o_automount["enabled"],
+        }
+        config = {**config, **config_automount}
+        if config["auto_mount_enabled"]:
+            config["mount_cmds"] = list()
+            for _mount_cmd in o_automount["mount_cmds"]:
+                if "{backup_root_path}" in _mount_cmd:
+                    _mount_cmd = _mount_cmd.format(
+                        backup_root_path=config["backup_root_path"]
+                    )
+                config["mount_cmds"].append(_mount_cmd)
+
+            config["unmount_cmds"] = list()
+            for _unmount_cmd in o_automount["unmount_cmds"]:
+                if "{backup_root_path}" in _unmount_cmd:
+                    _unmount_cmd = _unmount_cmd.format(
+                        backup_root_path=config["backup_root_path"]
+                    )
+                config["unmount_cmds"].append(_unmount_cmd)
+    except Exception as e:
+        raise MalformedConfigurationError(e)
+
+    return config
+
+
+def get_autobackup_configuration():
+    """
+    Get the configuration.
+    """
+    pvc_config_file = get_configuration_path()
+    config = get_parsed_autobackup_configuration(pvc_config_file)
+    return config
+
+
 def validate_directories(config):
     if not os.path.exists(config["dynamic_directory"]):
         os.makedirs(config["dynamic_directory"])
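
For reference, a minimal sketch of the autobackup section this parser consumes, expressed as the Python structure it would see after loading pvc.conf. The key names follow the parser above and the full_interval/full_retention lookups used by the autobackup worker; all values shown (paths, tag names, schedule numbers, commands) are hypothetical examples, not project defaults:

# Editor's sketch: example values only
example_autobackup_section = {
    "backup_root_path": "/srv/backups",        # hypothetical path
    "backup_root_suffix": "/mycluster",        # hypothetical suffix
    "backup_tags": ["autobackup"],             # hypothetical VM tag
    "backup_schedule": {
        "full_interval": 7,                    # every 7th backup becomes a full
        "full_retention": 2,                   # keep the 2 newest full chains
    },
    "auto_mount": {
        "enabled": False,
        "mount_cmds": ["/bin/mount /dev/sdb1 {backup_root_path}"],
        "unmount_cmds": ["/bin/umount {backup_root_path}"],
    },
}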


@@ -2074,7 +2074,11 @@ def vm_worker_detach_device(zkhandler, celery, domain, xml_spec):
 def vm_worker_create_snapshot(
-    zkhandler, celery, domain, snapshot_name=None, zk_only=False
+    zkhandler,
+    celery,
+    domain,
+    snapshot_name=None,
+    zk_only=False,
 ):
     if snapshot_name is None:
         now = datetime.now()
@@ -2117,7 +2121,7 @@ def vm_worker_create_snapshot(
     # Get the list of all RBD volumes
     rbd_list = zkhandler.read(("domain.storage.volumes", dom_uuid)).split(",")

-    total_stages = 2 + len(rbd_list)
+    total_stages += 1 + len(rbd_list)

     snap_list = list()
@@ -2218,7 +2222,12 @@ def vm_worker_create_snapshot(
     )


-def vm_worker_remove_snapshot(zkhandler, celery, domain, snapshot_name):
+def vm_worker_remove_snapshot(
+    zkhandler,
+    celery,
+    domain,
+    snapshot_name,
+):
     current_stage = 0
     total_stages = 1
     start(
@@ -2275,7 +2284,7 @@ def vm_worker_remove_snapshot(zkhandler, celery, domain, snapshot_name):
     current_stage += 1
     update(
         celery,
-        "Deleting VM configuration snapshot",
+        "Removing VM configuration snapshot",
         current=current_stage,
         total=total_stages,
     )
@@ -2286,7 +2295,7 @@ def vm_worker_remove_snapshot(zkhandler, celery, domain, snapshot_name):
     if not ret:
         fail(
             celery,
-            f'Failed to delete snapshot "{snapshot_name}" of VM "{domain}" in Zookeeper.',
+            f'Failed to remove snapshot "{snapshot_name}" of VM "{domain}" from Zookeeper',
         )
         return False
@@ -2398,7 +2407,12 @@ def vm_worker_rollback_snapshot(zkhandler, celery, domain, snapshot_name):
 def vm_worker_export_snapshot(
-    zkhandler, celery, domain, snapshot_name, export_path, incremental_parent=None
+    zkhandler,
+    celery,
+    domain,
+    snapshot_name,
+    export_path,
+    incremental_parent=None,
 ):
     current_stage = 0
     total_stages = 1
@@ -2422,7 +2436,7 @@ def vm_worker_export_snapshot(
     if not os.path.isdir(export_path):
         fail(
             celery,
-            f"ERROR: Target path '{export_path}' does not exist on node '{myhostname}'",
+            f"Target path '{export_path}' does not exist on node '{myhostname}'",
         )
         return False
@@ -2674,9 +2688,7 @@ def vm_worker_import_snapshot(
     # Ensure that the archives are present
     export_source_snapshot_file = f"{vm_import_path}/{snapshot_name}/snapshot.json"
     if not os.path.isfile(export_source_snapshot_file):
-        fail(
-            celery, f"ERROR: The specified source export '{snapshot_name}' do not exist"
-        )
+        fail(celery, f"The specified source export '{snapshot_name}' do not exist")
         return False

     # Read the export file and get VM details
@@ -3060,13 +3072,13 @@ def vm_worker_import_snapshot(
         if not retcode:
             fail(
                 celery,
-                f"ERROR: Failed to define imported VM: {retmsg}",
+                f"Failed to define imported VM: {retmsg}",
             )
             return False
     except Exception as e:
         fail(
             celery,
-            f"ERROR: Failed to parse VM export details: {e}",
+            f"Failed to parse VM export details: {e}",
         )
         return False
@@ -3095,7 +3107,7 @@ def vm_worker_import_snapshot(
         if not retcode:
             fail(
                 celery,
-                f"ERROR: Failed to start imported VM {domain}: {retmsg}",
+                f"Failed to start imported VM {domain}: {retmsg}",
             )
             return False


@@ -47,6 +47,9 @@ from daemon_lib.benchmark import (
 from daemon_lib.vmbuilder import (
     worker_create_vm,
 )
+from daemon_lib.autobackup import (
+    worker_cluster_autobackup,
+)

 # Daemon version
 version = "0.9.98"
@@ -101,6 +104,21 @@ def storage_benchmark(self, pool=None, run_on="primary"):
     return run_storage_benchmark(self, pool)


+@celery.task(name="cluster.autobackup", bind=True, routing_key="run_on")
+def cluster_autobackup(self, force_full=False, email_recipients=None, run_on="primary"):
+    @ZKConnection(config)
+    def run_cluster_autobackup(
+        zkhandler, self, force_full=False, email_recipients=None
+    ):
+        return worker_cluster_autobackup(
+            zkhandler, self, force_full=force_full, email_recipients=email_recipients
+        )
+
+    return run_cluster_autobackup(
+        self, force_full=force_full, email_recipients=email_recipients
+    )
+
+
 @celery.task(name="vm.flush_locks", bind=True, routing_key="run_on")
 def vm_flush_locks(self, domain=None, force_unlock=False, run_on="primary"):
     @ZKConnection(config)