Add support for full VM backups

Adds support for exporting full VM backups, including configuration,
metainfo, and RBD disk images, with incremental support.
Joshua Boniface 2023-10-17 10:15:06 -04:00
parent 6e83300d78
commit b997c6f31e
5 changed files with 323 additions and 1 deletion
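Taken together, the changes below produce an on-disk layout like the following for each backup (a sketch derived from the code in this commit; the VM name, volume name, and datestring are illustrative):

/srv/backups/                          <- TARGET_PATH (illustrative)
└── web1/                              <- one subdirectory per VM
    ├── web1.20231017101506.pvcbackup  <- JSON metadata for this backup
    └── .20231017101506/               <- hidden per-backup directory
        └── web1_disk0.rbdimg          <- full RBD export (.rbddiff if incremental)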

View File

@@ -2140,7 +2140,7 @@ class API_VM_Locks(Resource):
api.add_resource(API_VM_Locks, "/vm/<vm>/locks")
-# /vm/<vm</console
+# /vm/<vm>/console
class API_VM_Console(Resource):
@RequestParser([{"name": "lines"}])
@Authenticator
@@ -2293,6 +2293,77 @@ class API_VM_Device(Resource):
api.add_resource(API_VM_Device, "/vm/<vm>/device")
# /vm/<vm>/backup
class API_VM_Backup(Resource):
@RequestParser(
[
{
"name": "target_path",
"required": True,
"helptext": "A local filesystem path on the primary coordinator must be specified",
},
{
"name": "incremental_parent",
"required": False,
},
{
"name": "retain_snapshots",
"required": False,
},
]
)
@Authenticator
def get(self, vm, reqargs):
"""
Create a backup of {vm} and its volumes to a local primary coordinator filesystem path
---
tags:
- vm
parameters:
- in: query
name: target_path
type: string
required: true
description: A local filesystem path on the primary coordinator to store the backup
- in: query
name: incremental_parent
type: string
required: false
description: A previous backup datestamp to use as an incremental parent; if unspecified a full backup is taken
- in: query
name: retain_snapshots
type: boolean
required: false
default: false
description: Whether or not to retain this backup's volume snapshots to use as a future incremental parent
responses:
200:
description: OK
schema:
type: object
id: Message
400:
description: Execution error
schema:
type: object
id: Message
404:
description: Not found
schema:
type: object
id: Message
"""
target_path = reqargs.get("target_path", None)
incremental_parent = reqargs.get("incremental_parent", None)
retain_snapshots = bool(strtobool(reqargs.get("retain_snapshots", "false")))
return api_helper.backup_vm(
vm, target_path, incremental_parent, retain_snapshots
)
api.add_resource(API_VM_Backup, "/vm/<vm>/backup")
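For orientation, a client can exercise this new endpoint directly over HTTP. A minimal sketch using Python's requests library follows; the cluster address, port, API prefix, and authentication header are hypothetical placeholders, not taken from this commit:

import requests

# Hypothetical cluster address, API prefix, and auth header; adjust for your deployment.
response = requests.get(
    "http://pvc.example.com:7370/api/v1/vm/web1/backup",
    headers={"X-Api-Key": "SECRET"},
    params={
        "target_path": "/srv/backups",  # absolute path on the primary coordinator
        "retain_snapshots": "true",     # keep snapshots for future incrementals
        # "incremental_parent": "20231017101506",  # set for an incremental backup
    },
)
print(response.status_code, response.json().get("message"))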
##########################################################
# Client API - Network
##########################################################

View File

@@ -470,6 +470,34 @@ def vm_define(
return output, retcode
@ZKConnection(config)
def vm_backup(
zkhandler,
domain,
target_path,
incremental_parent=None,
retain_snapshots=False,
):
"""
Back up a VM to a local (primary coordinator) filesystem path.
"""
retflag, retdata = pvc_vm.backup_vm(
zkhandler,
domain,
target_path,
incremental_parent,
retain_snapshots,
)
if retflag:
retcode = 200
else:
retcode = 400
output = {"message": retdata.replace('"', "'")}
return output, retcode
@ZKConnection(config)
def vm_attach_device(zkhandler, vm, device_spec_xml):
"""

View File

@@ -1590,6 +1590,48 @@ def cli_vm_flush_locks(domain):
finish(retcode, retmsg)
###############################################################################
# > pvc vm backup
###############################################################################
@click.command(name="backup", short_help="Create a backup of a virtual machine.")
@connection_req
@click.argument("domain")
@click.argument("target_path")
@click.option(
"-i",
"--incremental" "incremental_parent",
default=None,
help="Perform an incremental volume backup from this parent backup datestring.",
)
@click.option(
"-r",
"--retain-snapshots",
"retain_snapshots",
is_flag=True,
default=False,
help="Retain volume snapshots for future incremental use.",
)
def cli_vm_backup(domain, target_path, incremental_parent, retain_snapshots):
"""
Create a backup of virtual machine DOMAIN to TARGET_PATH on the cluster primary coordinator. DOMAIN may be a UUID or name.
TARGET_PATH must be a valid absolute directory path on the cluster "primary" coordinator (see "pvc node list") allowing writes from the API daemon (normally running as "root"). The TARGET_PATH should be a large storage volume, ideally a remotely mounted filesystem (e.g. NFS, SSHFS, etc.) or a non-Ceph-backed disk; PVC does not manage this path, so configuring and maintaining it is up to the administrator.
The backup will export the VM configuration, metainfo, and a point-in-time snapshot of all attached RBD volumes, using a datestring-formatted backup name (YYYYMMDDHHMMSS).
The virtual machine DOMAIN may be running during the backup; because volume snapshots are used, the backup should be crash-consistent, but its filesystems will be in an unclean state, and this must be considered when restoring from backups.
Incremental backups are possible by specifying the "-i"/"--incremental" option along with a source backup datestring. The snapshots from that source backup must have been retained using the "-r"/"--retain-snapshots" option. Arbitrary snapshots, assuming they are valid for all attached RBD volumes, may also be used, as long as they are prefixed with "backup_". Retaining snapshots of incremental backups is supported, though "chaining" incremental backups in this way is not recommended, as it can make managing restores more difficult.
Full backup volume images are sparse-allocated; however, for safety, consider their maximum allocated size when allocating space for the TARGET_PATH. Incremental volume images are generally small, but their size depends entirely on the rate of data change in each volume.
"""
retcode, retmsg = pvc.lib.vm.vm_backup(
CLI_CONFIG, domain, target_path, incremental_parent, retain_snapshots
)
finish(retcode, retmsg)
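As a concrete workflow sketch (the VM name, path, and datestring here are illustrative): take a full backup that retains its snapshots, then later take an incremental backup against it:

pvc vm backup web1 /srv/backups --retain-snapshots
pvc vm backup web1 /srv/backups --incremental 20231017101506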
###############################################################################
# > pvc vm tag
###############################################################################
@@ -5659,6 +5701,7 @@ cli_vm.add_command(cli_vm_move)
cli_vm.add_command(cli_vm_migrate)
cli_vm.add_command(cli_vm_unmigrate)
cli_vm.add_command(cli_vm_flush_locks)
cli_vm.add_command(cli_vm_backup)
cli_vm_tag.add_command(cli_vm_tag_get)
cli_vm_tag.add_command(cli_vm_tag_add)
cli_vm_tag.add_command(cli_vm_tag_remove)

View File

@@ -433,6 +433,27 @@ def vm_locks(config, vm):
return retstatus, response.json().get("message", "")
def vm_backup(config, vm, target_path, incremental_parent=None, retain_snapshots=False):
"""
Create a backup of {vm} and its volumes to a local primary coordinator filesystem path
API endpoint: GET /vm/{vm}/backup
API arguments: target_path={target_path}, incremental_parent={incremental_parent}, retain_snapshots={retain_snapshots}
API schema: {"message":"{data}"}
"""
params = {
"target_path": target_path,
"incremental_parent": incremental_parent,
"retain_snapshots": retain_snapshots,
}
response = call_api(config, "get", "/vm/{vm}/backup".format(vm=vm), params=params)
if response.status_code != 200:
return False, response.json().get("message", "")
else:
return True, response.json().get("message", "")
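Driving this client function directly from Python might look like the following minimal sketch; "config" stands in for the connection configuration the CLI normally builds (CLI_CONFIG), and the VM name, path, and datestring are illustrative placeholders:

# "config" is assumed to be a CLI-style connection configuration; the VM name,
# target path, and parent datestring are illustrative placeholders.
retcode, retmsg = vm_backup(
    config,
    "web1",
    "/srv/backups",
    incremental_parent="20231017101506",  # omit (None) for a full backup
    retain_snapshots=True,                # keep snapshots for future incrementals
)
print(retcode, retmsg)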
def vm_vcpus_set(config, vm, vcpus, topology, restart):
"""
Set the vCPU count of the VM with topology

View File

@@ -21,12 +21,15 @@
import time
import re
import os.path
import lxml.objectify
import lxml.etree
from distutils.util import strtobool
from uuid import UUID
from concurrent.futures import ThreadPoolExecutor
from datetime import datetime
from json import dump as jdump
import daemon_lib.common as common
@@ -1297,3 +1300,159 @@ def get_list(zkhandler, node, state, tag, limit, is_fuzzy=True, negate=False):
pass
return True, sorted(vm_data_list, key=lambda d: d["name"])
def backup_vm(
zkhandler, domain, target_path, incremental_parent=None, retain_snapshots=False
):
# 0. Validations
# Validate that VM exists in cluster
dom_uuid = getDomainUUID(zkhandler, domain)
if not dom_uuid:
return False, 'ERROR: Could not find VM "{}" in the cluster!'.format(domain)
# Validate that the target path is an absolute path
if not re.match(r"^/", target_path):
return (
False,
f"ERROR: Target path {target_path} is not a valid absolute path on the primary coordinator!",
)
# Ensure that target_path (on this node) exists
if not os.path.isdir(target_path):
return False, f"ERROR: Target path {target_path} does not exist!"
# 1. Get information about VM
# get_list() returns a (success, list) tuple; take the single matching entry
vm_detail = get_list(zkhandler, None, None, None, dom_uuid, is_fuzzy=False)[1][0]
vm_volumes = [
tuple(d["name"].split("/")) for d in vm_detail["disks"] if d["type"] == "rbd"
]
# 2a. Validate that all volumes exist (they should, but just in case)
for pool, volume in vm_volumes:
if not ceph.verifyVolume(zkhandler, pool, volume):
return (
False,
f"ERROR: VM defines a volume {pool}/{volume} which does not exist!",
)
# 2b. Validate that, if an incremental_parent is given, it is valid
# The incremental parent is just a datestring
if incremental_parent is not None:
for pool, volume in vm_volumes:
if not ceph.verifySnapshot(
zkhandler, pool, volume, f"backup_{incremental_parent}"
):
return (
False,
f"ERROR: Incremental parent {incremental_parent} given, but no snapshot {pool}/{volume}@backup_{incremental_parent} was found; cannot export an incremental backup.",
)
export_fileext = "rbddiff"
else:
export_fileext = "rbdimg"
# 3. Set datestring in YYYYMMDDHHMMSS format
# strftime zero-pads each field (e.g. "20231017101506"); concatenating the raw
# datetime attributes would drop leading zeroes and break the fixed-width format
now = datetime.now()
datestring = now.strftime("%Y%m%d%H%M%S")
snapshot_name = f"backup_{datestring}"
# 4. Create destination directory
vm_target_root = f"{target_path}/{domain}"
vm_target_backup = f"{target_path}/{domain}/.{datestring}"
if not os.path.isdir(vm_target_backup):
try:
os.makedirs(vm_target_backup)
except Exception as e:
return False, f"ERROR: Failed to create backup directory: {e}"
# 5. Take a snapshot of each disk with the name backup_{datestring}
is_snapshot_create_failed = False
which_snapshot_create_failed = list()
msg_snapshot_create_failed = list()
for pool, volume in vm_volumes:
retcode, retmsg = ceph.add_snapshot(zkhandler, pool, volume, snapshot_name)
if not retcode:
is_snapshot_create_failed = True
which_snapshot_create_failed.append(f"{pool}/{volume}")
msg_snapshot_create_failed.append(retmsg)
if is_snapshot_create_failed:
for pool, volume in vm_volumes:
if ceph.verifySnapshot(zkhandler, pool, volume, snapshot_name):
ceph.remove_snapshot(zkhandler, pool, volume, snapshot_name)
return (
False,
f'ERROR: Failed to create snapshot for volume(s) {", ".join(which_snapshot_create_failed)}: {", ".join(msg_snapshot_create_failed)}',
)
# 6. Dump snapshot to folder with `rbd export` (full) or `rbd export-diff` (incremental)
is_snapshot_export_failed = False
which_snapshot_export_failed = list()
msg_snapshot_export_failed = list()
for pool, volume in vm_volumes:
if incremental_parent is not None:
incremental_parent_snapshot_name = f"backup_{incremental_parent}"
retcode, stdout, stderr = common.run_os_command(
f"rbd export-diff --from-snap {incremental_parent_snapshot_name} {pool}/{volume}@{snapshot_name} {vm_target_backup}/{volume}.{export_fileext}"
)
if retcode:
is_snapshot_export_failed = True
which_snapshot_export_failed.append(f"{pool}/{volume}")
msg_snapshot_export_failed.append(stderr)
else:
retcode, stdout, stderr = common.run_os_command(
f"rbd export --export-format 2 {pool}/{volume}@{snapshot_name} {vm_target_backup}/{volume}.{export_fileext}"
)
if retcode:
is_snapshot_export_failed = True
which_snapshot_export_failed.append(f"{pool}/{volume}")
msg_snapshot_export_failed.append(stderr)
if is_snapshot_export_failed:
for pool, volume in vm_volumes:
if ceph.verifySnapshot(zkhandler, pool, volume, snapshot_name):
ceph.remove_snapshot(zkhandler, pool, volume, snapshot_name)
return (
False,
f'ERROR: Failed to export snapshot for volume(s) {", ".join(which_snapshot_export_failed)}: {", ".join(msg_snapshot_export_failed)}',
)
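For context on the two export formats: a full image written with "rbd export --export-format 2" can later be re-imported with "rbd import", and a diff written with "rbd export-diff" can be replayed with "rbd import-diff" on top of its parent image. A hypothetical restore-side sketch follows (restore handling is not part of this commit; the pool, volume, and file paths are illustrative):

import subprocess

# Re-import the full image export first...
subprocess.run(
    "rbd import --export-format 2 /srv/backups/web1/.20231017101506/web1_disk0.rbdimg vms/web1_disk0",
    shell=True, check=True,
)
# ...then replay any incremental diff on top of the imported image and its snapshots.
subprocess.run(
    "rbd import-diff /srv/backups/web1/.20231018101506/web1_disk0.rbddiff vms/web1_disk0",
    shell=True, check=True,
)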
# 7. Create and dump VM backup information
vm_backup = {
"type": "incremental" if incremental_parent is not None else "full",
"datestring": datestring,
"incremental_parent": incremental_parent,
"vm_detail": vm_detail,
"backup_files": [f".{datestring}/{v}.{export_fileext}" for p, v in vm_volumes],
}
with open(f"{vm_target_root}/{domain}.{datestring}.pvcbackup", "w") as fh:
jdump(vm_backup, fh)  # json.dump() takes the object first, then the file handle
# 8. Remove snapshots if retain_snapshots is False
if not retain_snapshots:
is_snapshot_remove_failed = False
which_snapshot_remove_failed = list()
msg_snapshot_remove_failed = list()
for pool, volume in vm_volumes:
if ceph.verifySnapshot(zkhandler, pool, volume, snapshot_name):
retcode, retmsg = ceph.remove_snapshot(
zkhandler, pool, volume, snapshot_name
)
if not retcode:
is_snapshot_remove_failed = True
which_snapshot_remove_failed.append(f"{pool}/{volume}")
msg_snapshot_remove_failed.append(retmsg)
if is_snapshot_remove_failed:
for pool, volume in vm_volumes:
if ceph.verifySnapshot(zkhandler, pool, volume, snapshot_name):
ceph.remove_snapshot(zkhandler, pool, volume, snapshot_name)
return (
True,
f'WARNING: Successfully backed up VM {domain} @ {datestring} to {target_path}, but failed to remove snapshot as requested for volume(s) {", ".join(which_snapshot_remove_failed)}: {", ".join(msg_snapshot_remove_failed)}',
)
return True, f"Successfully backed up VM {domain} @ {datestring} to {target_path}"
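Finally, the ".pvcbackup" file written in step 7 is plain JSON, so consumers can discover a backup's type, parent, and file list without parsing filenames. A minimal reading sketch (the path is illustrative):

from json import load as jload

with open("/srv/backups/web1/web1.20231017101506.pvcbackup") as fh:
    backup = jload(fh)

print(backup["type"])                # "full" or "incremental"
print(backup["incremental_parent"])  # parent datestring, or None for full backups
for f in backup["backup_files"]:     # e.g. ".20231017101506/web1_disk0.rbdimg"
    print(f)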