Complete VM restore functionality

Joshua Boniface 2023-10-23 22:23:17 -04:00
parent d3b3fdfc80
commit 8d256a1737
5 changed files with 329 additions and 74 deletions

View File

@@ -2313,7 +2313,7 @@ class API_VM_Backup(Resource):
]
)
@Authenticator
def get(self, vm, reqargs):
def post(self, vm, reqargs):
"""
Create a backup of {vm} and its volumes to a local primary coordinator filesystem path
---
@@ -2364,6 +2364,67 @@
api.add_resource(API_VM_Backup, "/vm/<vm>/backup")
# /vm/<vm>/restore
class API_VM_Restore(Resource):
@RequestParser(
[
{
"name": "target_path",
"required": True,
"helptext": "A local filesystem path on the primary coordinator must be specified",
},
{
"name": "backup_datestring",
"required": True,
"helptext": "A backup datestring must be specified",
}
]
)
@Authenticator
def post(self, vm, reqargs):
"""
Restore a backup of {vm} and its volumes from a local primary coordinator filesystem path
---
tags:
- vm
parameters:
- in: query
name: target_path
type: string
required: true
description: A local filesystem path on the primary coordinator where the backup is stored
- in: query
name: backup_datestring
type: string
required: true
description: The backup datestring identifier (e.g. 20230102030405)
responses:
200:
description: OK
schema:
type: object
id: Message
400:
description: Execution error
schema:
type: object
id: Message
404:
description: Not found
schema:
type: object
id: Message
"""
target_path = reqargs.get("target_path", None)
backup_datestring = reqargs.get("backup_datestring", None)
return api_helper.vm_restore(
vm, target_path, backup_datestring
)
api.add_resource(API_VM_Restore, "/vm/<vm>/restore")
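For reference, the new endpoint can be exercised directly over HTTP. A minimal sketch follows, assuming a local API with token authentication via an X-Api-Key header; the base URL, key, VM name, and paths are illustrative assumptions, not part of this diff:

import requests

API_BASE = "http://10.0.0.1:7370/api/v1"  # assumed API base URL for this cluster
HEADERS = {"X-Api-Key": "mysecretkey"}  # assumed authentication header

# POST /vm/<vm>/restore with the two required query parameters
resp = requests.post(
    f"{API_BASE}/vm/web1/restore",
    headers=HEADERS,
    params={
        "target_path": "/mnt/backups",  # path on the primary coordinator
        "backup_datestring": "20230102030405",  # backup identifier
    },
)
print(resp.status_code, resp.json().get("message"))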
##########################################################
# Client API - Network
##########################################################

View File

@@ -498,6 +498,32 @@ def vm_backup(
return output, retcode
@ZKConnection(config)
def vm_restore(
zkhandler,
domain,
target_path,
datestring,
):
"""
Restore a VM from a local (primary coordinator) filesystem path.
"""
retflag, retdata = pvc_vm.restore_vm(
zkhandler,
domain,
target_path,
datestring,
)
if retflag:
retcode = 200
else:
retcode = 400
output = {"message": retdata.replace('"', "'")}
return output, retcode
@ZKConnection(config)
def vm_attach_device(zkhandler, vm, device_spec_xml):
"""

View File

@@ -1642,6 +1642,40 @@ def cli_vm_backup(domain, target_path, incremental_parent, retain_snapshots):
finish(retcode, retmsg)
###############################################################################
# > pvc vm restore
###############################################################################
@click.command(name="restore", short_help="Restore a backup of a virtual machine.")
@connection_req
@click.argument("domain")
@click.argument("backup_datestring")
@click.argument("target_path")
def cli_vm_restore(domain, backup_datestring, target_path):
"""
Restore the backup BACKUP_DATESTRING of virtual machine DOMAIN stored in TARGET_PATH on the cluster primary coordinator. DOMAIN may be a UUID or name.
TARGET_PATH must be a valid absolute directory path on the cluster "primary" coordinator (see "pvc node list") allowing reads from the API daemon (normally running as "root"). The TARGET_PATH should be a large storage volume, ideally a remotely mounted filesystem (e.g. NFS, SSHFS, etc.) or non-Ceph-backed disk; PVC does not manage this path itself, so it is up to the administrator to configure and maintain it.
The restore will import the VM configuration, metainfo, and the point-in-time snapshot of all attached RBD volumes. Incremental backups will be automatically handled.
A VM named DOMAIN must not exist; if the VM already exists, it must be removed before restoring. Renaming is not sufficient as the UUID will remain the same.
"""
echo(
CLI_CONFIG,
f"Restoring backup {backup_datestring} of VM '{domain}'... ",
newline=False,
)
retcode, retmsg = pvc.lib.vm.vm_restore(
CLI_CONFIG, domain, target_path, backup_datestring
)
if retcode:
echo(CLI_CONFIG, "done.")
else:
echo(CLI_CONFIG, "failed.")
finish(retcode, retmsg)
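A hypothetical invocation (the VM name, datestring, and path are illustrative); the progress line comes from the echo calls above:

$ pvc vm restore web1 20230102030405 /mnt/backups
Restoring backup 20230102030405 of VM 'web1'... done.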
###############################################################################
# > pvc vm tag
###############################################################################
@@ -5712,6 +5746,7 @@ cli_vm.add_command(cli_vm_migrate)
cli_vm.add_command(cli_vm_unmigrate)
cli_vm.add_command(cli_vm_flush_locks)
cli_vm.add_command(cli_vm_backup)
cli_vm.add_command(cli_vm_restore)
cli_vm_tag.add_command(cli_vm_tag_get)
cli_vm_tag.add_command(cli_vm_tag_add)
cli_vm_tag.add_command(cli_vm_tag_remove)

View File

@@ -437,7 +437,7 @@ def vm_backup(config, vm, target_path, incremental_parent=None, retain_snapshots
"""
Create a backup of {vm} and its volumes to a local primary coordinator filesystem path
API endpoint: GET /vm/{vm}/backup
API endpoint: POST /vm/{vm}/backup
API arguments: target_path={target_path}, incremental_parent={incremental_parent}, retain_snapshots={retain_snapshots}
API schema: {"message":"{data}"}
"""
@@ -446,7 +446,27 @@ def vm_backup(config, vm, target_path, incremental_parent=None, retain_snapshots
"incremental_parent": incremental_parent,
"retain_snapshots": retain_snapshots,
}
response = call_api(config, "get", "/vm/{vm}/backup".format(vm=vm), params=params)
response = call_api(config, "post", "/vm/{vm}/backup".format(vm=vm), params=params)
if response.status_code != 200:
return False, response.json().get("message", "")
else:
return True, response.json().get("message", "")
def vm_restore(config, vm, target_path, backup_datestring):
"""
Restore a backup of {vm} and its volumes from a local primary coordinator filesystem path
API endpoint: POST /vm/{vm}/restore
API arguments: target_path={target_path}, backup_datestring={backup_datestring}
API schema: {"message":"{data}"}
"""
params = {
"target_path": target_path,
"backup_datestring": backup_datestring,
}
response = call_api(config, "post", "/vm/{vm}/restore".format(vm=vm), params=params)
if response.status_code != 200:
return False, response.json().get("message", "")
else:
return True, response.json().get("message", "")
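The same call is available to Python consumers of the client library. A minimal sketch, assuming config is an already-loaded PVC client configuration (such as the CLI_CONFIG object the CLI layer passes in above):

import pvc.lib.vm

# config: assumed to be a valid, already-loaded PVC client configuration
retcode, retmsg = pvc.lib.vm.vm_restore(config, "web1", "/mnt/backups", "20230102030405")
if retcode:
    print(retmsg)  # success message returned by the API
else:
    print(f"Restore failed: {retmsg}")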

View File

@@ -29,6 +29,7 @@ from distutils.util import strtobool
from uuid import UUID
from concurrent.futures import ThreadPoolExecutor
from datetime import datetime
from socket import gethostname
from json import dump as jdump
from json import load as jload
@@ -1333,12 +1334,30 @@ def backup_vm(
if not isinstance(vm_detail, dict):
return False, f"ERROR: VM listing returned invalid data: {vm_detail}"
vm_volumes = [
tuple(d["name"].split("/")) for d in vm_detail["disks"] if d["type"] == "rbd"
]
vm_volumes = list()
for disk in vm_detail["disks"]:
if disk["type"] != "rbd":
continue
pool, volume = disk["name"].split('/')
retcode, retdata = ceph.get_list_volume(zkhandler, pool, volume, is_fuzzy=False)
if not retcode or len(retdata) != 1:
if len(retdata) < 1:
retdata = "No volumes returned."
elif len(retdata) > 1:
retdata = "Multiple volumes returned."
return False, f"ERROR: Failed to get volume details for {pool}/{volume}: {retdata}"
try:
size = retdata[0]["stats"]["size"]
except Exception as e:
return False, f"ERROR: Failed to get volume size for {pool}/{volume}: {e}"
vm_volumes.append((pool, volume, size))
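# NOTE: vm_volumes entries are now (pool, volume, size) triples; the size is
# recorded in the backup manifest so that a restore can recreate each RBD
# volume at the correct size before importing data into it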
# 2a. Validate that all volumes exist (they should, but just in case)
for pool, volume in vm_volumes:
for pool, volume, _ in vm_volumes:
if not ceph.verifyVolume(zkhandler, pool, volume):
return (
False,
@@ -1348,7 +1367,7 @@ def backup_vm(
# 2b. Validate that, if an incremental_parent is given, it is valid
# The incremental parent is just a datestring
if incremental_parent is not None:
for pool, volume in vm_volumes:
for pool, volume, _ in vm_volumes:
if not ceph.verifySnapshot(
zkhandler, pool, volume, f"backup_{incremental_parent}"
):
@@ -1372,7 +1391,7 @@ def backup_vm(
# 4. Create destination directory
vm_target_root = f"{target_path}/{domain}"
vm_target_backup = f"{target_path}/{domain}/.{datestring}"
vm_target_backup = f"{target_path}/{domain}/{domain}.{datestring}.pvcdisks"
if not os.path.isdir(vm_target_backup):
try:
os.makedirs(vm_target_backup)
@@ -1383,7 +1402,7 @@ def backup_vm(
is_snapshot_create_failed = False
which_snapshot_create_failed = list()
msg_snapshot_create_failed = list()
for pool, volume in vm_volumes:
for pool, volume, _ in vm_volumes:
retcode, retmsg = ceph.add_snapshot(zkhandler, pool, volume, snapshot_name)
if not retcode:
is_snapshot_create_failed = True
@@ -1391,7 +1410,7 @@ def backup_vm(
msg_snapshot_create_failed.append(retmsg)
if is_snapshot_create_failed:
for pool, volume in vm_volumes:
for pool, volume, _ in vm_volumes:
if ceph.verifySnapshot(zkhandler, pool, volume, snapshot_name):
ceph.remove_snapshot(zkhandler, pool, volume, snapshot_name)
return (
@@ -1403,11 +1422,11 @@ def backup_vm(
is_snapshot_export_failed = False
which_snapshot_export_failed = list()
msg_snapshot_export_failed = list()
for pool, volume in vm_volumes:
for pool, volume, _ in vm_volumes:
if incremental_parent is not None:
incremental_parent_snapshot_name = f"backup_{incremental_parent}"
retcode, stdout, stderr = common.run_os_command(
f"rbd export-diff --from-snap {incremental_parent_snapshot_name} {pool}/{volume}@{snapshot_name} {vm_target_backup}/{volume}.{export_fileext}"
f"rbd export-diff --from-snap {incremental_parent_snapshot_name} {pool}/{volume}@{snapshot_name} {vm_target_backup}/{pool}.{volume}.{export_fileext}"
)
if retcode:
is_snapshot_export_failed = True
@@ -1415,7 +1434,7 @@ def backup_vm(
msg_snapshot_export_failed.append(stderr)
else:
retcode, stdout, stderr = common.run_os_command(
f"rbd export --export-format 2 {pool}/{volume}@{snapshot_name} {vm_target_backup}/{volume}.{export_fileext}"
f"rbd export --export-format 2 {pool}/{volume}@{snapshot_name} {vm_target_backup}/{pool}.{volume}.{export_fileext}"
)
if retcode:
is_snapshot_export_failed = True
@@ -1423,7 +1442,7 @@ def backup_vm(
msg_snapshot_export_failed.append(stderr)
if is_snapshot_export_failed:
for pool, volume in vm_volumes:
for pool, volume, _ in vm_volumes:
if ceph.verifySnapshot(zkhandler, pool, volume, snapshot_name):
ceph.remove_snapshot(zkhandler, pool, volume, snapshot_name)
return (
@@ -1438,7 +1457,7 @@ def backup_vm(
"datestring": datestring,
"incremental_parent": incremental_parent,
"vm_detail": vm_detail,
"backup_files": [f".{datestring}/{v}.{export_fileext}" for p, v in vm_volumes],
"backup_files": [(f"{domain}.{datestring}.pvcdisks/{p}.{v}.{export_fileext}", s) for p, v, s in vm_volumes],
}
with open(f"{vm_target_root}/{domain}.{datestring}.pvcbackup", "w") as fh:
jdump(vm_backup, fh)
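The written {domain}.{datestring}.pvcbackup manifest is plain JSON mirroring the vm_backup dict above; a trimmed sketch of its layout (the pool, volume, and size values are illustrative):

{
  "type": "full",
  "datestring": "20230102030405",
  "incremental_parent": null,
  "vm_detail": {"name": "web1", "disks": ["..."], "...": "..."},
  "backup_files": [
    ["web1.20230102030405.pvcdisks/vms.web1_disk0.rbdimg", 21474836480]
  ]
}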
@@ -1448,7 +1467,7 @@ def backup_vm(
which_snapshot_remove_failed = list()
msg_snapshot_remove_failed = list()
if not retain_snapshots:
for pool, volume in vm_volumes:
for pool, volume, _ in vm_volumes:
if ceph.verifySnapshot(zkhandler, pool, volume, snapshot_name):
retcode, retmsg = ceph.remove_snapshot(
zkhandler, pool, volume, snapshot_name
@@ -1460,18 +1479,21 @@ def backup_vm(
tend = time.time()
ttot = round(tend - tstart, 2)
retlines = list()
if is_snapshot_remove_failed:
retmsg = f"WARNING: Successfully backed up VM '{domain}' ({backup_type}@{datestring}) to '{target_path}' in {ttot} seconds, but failed to remove snapshot as requested for volume(s) {', '.join(which_snapshot_remove_failed)}: {', '.join(msg_snapshot_remove_failed)}"
elif retain_snapshots:
retmsg = f"Successfully backed up VM '{domain}' ({backup_type}@{datestring}, snapshots retained) to '{target_path}' in {ttot} seconds."
retlines.append(f"WARNING: Failed to remove snapshot as requested for volume(s) {', '.join(which_snapshot_remove_failed)}: {', '.join(msg_snapshot_remove_failed)}")
myhostname = gethostname().split(".")[0]
if retain_snapshots:
retlines.append(f"Successfully backed up VM '{domain}' ({backup_type}@{datestring}, snapshots retained) to '{myhostname}:{target_path}' in {ttot}s.")
else:
retmsg = f"Successfully backed up VM '{domain}' ({backup_type}@{datestring}) to '{target_path}' in {ttot} seconds."
retlines.append(f"Successfully backed up VM '{domain}' ({backup_type}@{datestring}) to '{myhostname}:{target_path}' in {ttot}s.")
return True, retmsg
return True, '\n'.join(retlines)
def restore_vm(zkhandler, domain, datestring, source_path, incremental_parent=None):
def restore_vm(zkhandler, domain, source_path, datestring):
tstart = time.time()
@@ -1496,67 +1518,158 @@ def restore_vm(zkhandler, domain, datestring, source_path, incremental_parent=No
return False, f"ERROR: Source path {source_path} does not exist!"
# Ensure that domain path (on this node) exists
vm_source_path = f"{source_path}/{domain}"
if not os.path.isdir(vm_source_path):
return False, f"ERROR: Source VM path {vm_source_path} does not exist!"
backup_source_path = f"{source_path}/{domain}"
if not os.path.isdir(backup_source_path):
return False, f"ERROR: Source VM path {backup_source_path} does not exist!"
# Ensure that the archives are present
vm_source_pvcbackup_file = f"{vm_source_path}/{domain}.{datestring}.pvcbackup"
vm_source_pvcdisks_file = f"{vm_source_path}/{domain}.{datestring}.pvcdisks"
if not os.path.isfile(vm_source_pvcbackup_file) or not os.path.isfile(
vm_source_pvcdisks_file
):
return False, f"ERROR: The specified source backup files do not exist!"
if incremental_parent is not None:
vm_source_parent_pvcbackup_file = (
f"{vm_source_path}/{domain}.{incremental_parent}.pvcbackup"
)
vm_source_parent_pvcdisks_file = (
f"{vm_source_path}/{domain}.{incremental_parent}.pvcdisks"
)
if not os.path.isfile(vm_source_parent_pvcbackup_file) or not os.path.isfile(
vm_source_parent_pvcdisks_file
):
return (
False,
f"ERROR: The specified incremental parent source backup files do not exist!",
)
backup_source_pvcbackup_file = f"{backup_source_path}/{domain}.{datestring}.pvcbackup"
if not os.path.isfile(backup_source_pvcbackup_file):
return False, "ERROR: The specified source backup files do not exist!"
# 1. Read the backup file and get VM details
try:
with open(vm_source_pvcbackup_file) as fh:
vm_source_details = jload(fh)
with open(backup_source_pvcbackup_file) as fh:
backup_source_details = jload(fh)
except Exception as e:
return False, f"ERROR: Failed to read source backup details: {e}"
# 2. Import VM config and metadata in provision state
vm_config_xml = vm_source_details.get("xml")
vm_config_meta = {
"node": vm_source_details.get("node"),
"node_limit": vm_source_details.get("node_limit"),
"node_selector": vm_source_details.get("node_selector"),
"node_autostart": vm_source_details.get("node_autostart"),
"migration_method": vm_source_details.get("migration_method"),
"tags": vm_source_details.get("tags"),
"description": vm_source_details.get("description"),
"profile": vm_source_details.get("profile"),
}
define_vm(
zkhandler,
vm_confing_xml,
vm_source_details.get("node"),
vm_source_details.get("node_limit"),
vm_source_details.get("node_selector"),
vm_source_details.get("node_autostart"),
vm_source_details.get("migration_method"),
vm_source_details.get("profile"),
vm_source_details.get("tags"),
"restore",
# Handle incrementals
incremental_parent = backup_source_details.get("incremental_parent", None)
if incremental_parent is not None:
backup_source_parent_pvcbackup_file = (
f"{backup_source_path}/{domain}.{incremental_parent}.pvcbackup"
)
if not os.path.isfile(backup_source_parent_pvcbackup_file):
return (
False,
"ERROR: The specified backup is incremental but the required incremental parent source backup files do not exist!",
)
# 4. Import parent snapshot disks (if applicable)
try:
with open(backup_source_parent_pvcbackup_file) as fh:
backup_source_parent_details = jload(fh)
except Exception as e:
return False, f"ERROR: Failed to read source incremental parent backup details: {e}"
# 2. Import VM config and metadata in provision state
try:
retcode, retmsg = define_vm(
zkhandler,
backup_source_details["vm_detail"]["xml"],
backup_source_details["vm_detail"]["node"],
backup_source_details["vm_detail"]["node_limit"],
backup_source_details["vm_detail"]["node_selector"],
backup_source_details["vm_detail"]["node_autostart"],
backup_source_details["vm_detail"]["migration_method"],
backup_source_details["vm_detail"]["profile"],
backup_source_details["vm_detail"]["tags"],
"restore",
)
if not retcode:
return False, f"ERROR: Failed to define restored VM: {retmsg}"
except Exception as e:
return False, f"ERROR: Failed to parse VM backup details: {e}"
# 4. Import volumes
is_snapshot_remove_failed = False
which_snapshot_remove_failed = list()
msg_snapshot_remove_failed = list()
if incremental_parent is not None:
for volume_file, volume_size in backup_source_details.get('backup_files'):
pool, volume, _ = volume_file.split('/')[-1].split('.')
try:
parent_volume_file = [f[0] for f in backup_source_parent_details.get('backup_files') if f[0].split('/')[-1].replace('.rbdimg', '') == volume_file.split('/')[-1].replace('.rbddiff', '')][0]
except Exception as e:
return False, f"ERROR: Failed to find parent volume for volume {pool}/{volume}; backup may be corrupt or invalid: {e}"
# First we create the expected volumes, then clean them up.
# This process is a bit of a hack: rbd import does not expect an existing volume,
# but we need the volume to be tracked in PVC.
# Thus, create the RBD volume using ceph.add_volume based on the backup size, then
# manually remove the RBD volume itself (leaving the PVC metainfo intact)
retcode, retmsg = ceph.add_volume(zkhandler, pool, volume, volume_size)
if not retcode:
return False, f"ERROR: Failed to create restored volume: {retmsg}"
retcode, stdout, stderr = common.run_os_command(
f"rbd remove {pool}/{volume}"
)
if retcode:
return False, f"ERROR: Failed to remove temporary RBD volume '{pool}/{volume}': {stderr}"
# Next we import the parent images
retcode, stdout, stderr = common.run_os_command(
f"rbd import --export-format 2 --dest-pool {pool} {source_path}/{domain}/{parent_volume_file} {volume}"
)
if retcode:
return False, f"ERROR: Failed to import parent backup image {parent_volume_file}: {stderr}"
# Then we import the incremental diffs
retcode, stdout, stderr = common.run_os_command(
f"rbd import-diff {source_path}/{domain}/{volume_file} {pool}/{volume}"
)
if retcode:
return False, f"ERROR: Failed to import incremental backup image {volume_file}: {stderr}"
# Finally we remove the parent and child snapshots (no longer required)
retcode, stdout, stderr = common.run_os_command(
f"rbd snap rm {pool}/{volume}@backup_{incremental_parent}"
)
if retcode:
return False, f"ERROR: Failed to remove imported image snapshot for {parent_volume_file}: {stderr}"
retcode, stdout, stderr = common.run_os_command(
f"rbd snap rm {pool}/{volume}@backup_{datestring}"
)
if retcode:
return False, f"ERROR: Failed to remove imported image snapshot for {volume_file}: {stderr}"
else:
for volume_file, volume_size in backup_source_details.get('backup_files'):
pool, volume, _ = volume_file.split('/')[-1].split('.')
# First we create the expected volumes, then clean them up.
# This process is a bit of a hack: rbd import does not expect an existing volume,
# but we need the volume to be tracked in PVC.
# Thus, create the RBD volume using ceph.add_volume based on the backup size, then
# manually remove the RBD volume itself (leaving the PVC metainfo intact)
retcode, retmsg = ceph.add_volume(zkhandler, pool, volume, volume_size)
if not retcode:
return False, f"ERROR: Failed to create restored volume: {retmsg}"
retcode, stdout, stderr = common.run_os_command(
f"rbd remove {pool}/{volume}"
)
if retcode:
return False, f"ERROR: Failed to remove temporary RBD volume '{pool}/{volume}': {stderr}"
# Then we perform the actual import
retcode, stdout, stderr = common.run_os_command(
f"rbd import --export-format 2 --dest-pool {pool} {source_path}/{domain}/{volume_file} {volume}"
)
if retcode:
return False, f"ERROR: Failed to import backup image {volume_file}: {stderr}"
# Finally we remove the source snapshot (no longer required)
retcode, stdout, stderr = common.run_os_command(
f"rbd snap rm {pool}/{volume}@backup_{datestring}"
)
if retcode:
return False, f"ERROR: Failed to remove imported image snapshot for {volume_file}: {stderr}"
# 5. Apply diffs (if applicable)
# 5. Start VM
retcode, retmsg = start_vm(zkhandler, domain)
if not retcode:
return False, f"ERROR: Failed to start restored VM {domain}: {retmsg}"
tend = time.time()
ttot = round(tend - tstart, 2)
retlines = list()
if is_snapshot_remove_failed:
retlines.append(f"WARNING: Failed to remove parent snapshot as requested for volume(s) {', '.join(which_snapshot_remove_failed)}: {', '.join(msg_snapshot_remove_failed)}")
myhostname = gethostname().split(".")[0]
retlines.append(f"Successfully restored VM backup {datestring} for '{domain}' from '{myhostname}:{source_path}' in {ttot}s.")
return True, '\n'.join(retlines)
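Taken together, an illustrative round trip from the CLI (names and paths are hypothetical; the datestring is reported in the backup success message, and the VM must be removed before restoring since renaming does not change the UUID):

$ pvc vm backup web1 /mnt/backups
$ pvc vm remove web1
$ pvc vm restore web1 20230102030405 /mnt/backups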