Compare commits
42 Commits
35e27f79ef ... v0.9.79

221af3f241
35f80e544c
83b937654c
714bde89e6
c87736eb0a
63d0a85e29
43e8cd3b07
55ca131c2c
0769f1ea52
c858ae8fed
8d256a1737
d3b3fdfc80
f1b29ea94e
38abd078af
fabb97cf48
50aabde320
68124db323
8921efd269
3e259bd926
3d12915989
67b0b19bca
5d0c674d1d
f3bc4dee04
f441b0d823
fd2331faa6
a5d0f219e4
0169510df0
a58c1d5a8c
a8e4b01b67
45c4c86911
6448b31d2c
4fc9b15652
75b839692b
751cfe0b29
b997c6f31e
6e83300d78
522da3fd95
3a1bf0724e
ee494fb1c0
c6c44bf775
bbb940da65
a0b45a2bcd
CHANGELOG.md (15 changes)
@ -1,5 +1,20 @@

## PVC Changelog

###### [v0.9.79](https://github.com/parallelvirtualcluster/pvc/releases/tag/v0.9.79)

**API Changes**: New endpoints /vm/{vm}/backup, /vm/{vm}/restore

* [CLI Client] Fixes some storage pool help text messages
* [Node Daemon] Increases the IPMI monitoring plugin timeout
* [All] Adds support for VM backups, including creation, removal, and restore
* [Repository] Fixes shebangs in scripts to be consistent
* [Daemon Library] Improves the handling of VM list arguments (default None)

###### [v0.9.78](https://github.com/parallelvirtualcluster/pvc/releases/tag/v0.9.78)

* [API, Client CLI] Fixes several bugs around image uploads; adds a new query parameter for non-raw images
* [API] Ensures RBD images are created with a raw bytes value to avoid rounding errors

###### [v0.9.77](https://github.com/parallelvirtualcluster/pvc/releases/tag/v0.9.77)

* [Client CLI] Fixes a bug from a bad library import

@ -27,7 +27,7 @@ from ssl import SSLContext, TLSVersion
|
||||
from distutils.util import strtobool as dustrtobool
|
||||
|
||||
# Daemon version
|
||||
version = "0.9.77"
|
||||
version = "0.9.79"
|
||||
|
||||
# API version
|
||||
API_VERSION = 1.0
|
||||
|
@ -2140,7 +2140,7 @@ class API_VM_Locks(Resource):
|
||||
api.add_resource(API_VM_Locks, "/vm/<vm>/locks")
|
||||
|
||||
|
||||
# /vm/<vm</console
|
||||
# /vm/<vm>/console
|
||||
class API_VM_Console(Resource):
|
||||
@RequestParser([{"name": "lines"}])
|
||||
@Authenticator
|
||||
@ -2293,6 +2293,202 @@ class API_VM_Device(Resource):
|
||||
api.add_resource(API_VM_Device, "/vm/<vm>/device")
|
||||
|
||||
|
||||
# /vm/<vm>/backup
|
||||
class API_VM_Backup(Resource):
|
||||
@RequestParser(
|
||||
[
|
||||
{
|
||||
"name": "backup_path",
|
||||
"required": True,
|
||||
"helptext": "A local filesystem path on the primary coordinator must be specified",
|
||||
},
|
||||
{
|
||||
"name": "incremental_parent",
|
||||
"required": False,
|
||||
},
|
||||
{
|
||||
"name": "retain_snapshot",
|
||||
"required": False,
|
||||
},
|
||||
]
|
||||
)
|
||||
@Authenticator
|
||||
def post(self, vm, reqargs):
|
||||
"""
|
||||
Create a backup of {vm} and its volumes to a local primary coordinator filesystem path
|
||||
---
|
||||
tags:
|
||||
- vm
|
||||
parameters:
|
||||
- in: query
|
||||
name: backup_path
|
||||
type: string
|
||||
required: true
|
||||
description: A local filesystem path on the primary coordinator to store the backup
|
||||
- in: query
|
||||
name: incremental_parent
|
||||
type: string
|
||||
required: false
|
||||
description: A previous backup datestamp to use as an incremental parent; if unspecified a full backup is taken
|
||||
- in: query
|
||||
name: retain_snapshot
|
||||
type: boolean
|
||||
required: false
|
||||
default: false
|
||||
description: Whether or not to retain this backup's volume snapshots to use as a future incremental parent; full backups only
|
||||
responses:
|
||||
200:
|
||||
description: OK
|
||||
schema:
|
||||
type: object
|
||||
id: Message
|
||||
400:
|
||||
description: Execution error
|
||||
schema:
|
||||
type: object
|
||||
id: Message
|
||||
404:
|
||||
description: Not found
|
||||
schema:
|
||||
type: object
|
||||
id: Message
|
||||
"""
|
||||
backup_path = reqargs.get("backup_path", None)
|
||||
incremental_parent = reqargs.get("incremental_parent", None)
|
||||
retain_snapshot = bool(strtobool(reqargs.get("retain_snapshot", "false")))
|
||||
return api_helper.vm_backup(
|
||||
vm, backup_path, incremental_parent, retain_snapshot
|
||||
)
|
||||
|
||||
@RequestParser(
|
||||
[
|
||||
{
|
||||
"name": "backup_path",
|
||||
"required": True,
|
||||
"helptext": "A local filesystem path on the primary coordinator must be specified",
|
||||
},
|
||||
{
|
||||
"name": "backup_datestring",
|
||||
"required": True,
|
||||
"helptext": "A backup datestring must be specified",
|
||||
},
|
||||
]
|
||||
)
|
||||
@Authenticator
|
||||
def delete(self, vm, reqargs):
|
||||
"""
|
||||
Remove a backup of {vm}, including snapshots, from a local primary coordinator filesystem path
|
||||
---
|
||||
tags:
|
||||
- vm
|
||||
parameters:
|
||||
- in: query
|
||||
name: backup_path
|
||||
type: string
|
||||
required: true
|
||||
description: A local filesystem path on the primary coordinator where the backup is stored
|
||||
- in: query
|
||||
name: backup_datestring
|
||||
type: string
|
||||
required: true
|
||||
description: The backup datestring identifier (e.g. 20230102030405)
|
||||
responses:
|
||||
200:
|
||||
description: OK
|
||||
schema:
|
||||
type: object
|
||||
id: Message
|
||||
400:
|
||||
description: Execution error
|
||||
schema:
|
||||
type: object
|
||||
id: Message
|
||||
404:
|
||||
description: Not found
|
||||
schema:
|
||||
type: object
|
||||
id: Message
|
||||
"""
|
||||
backup_path = reqargs.get("backup_path", None)
|
||||
backup_datestring = reqargs.get("backup_datestring", None)
|
||||
return api_helper.vm_remove_backup(vm, backup_path, backup_datestring)
|
||||
|
||||
|
||||
api.add_resource(API_VM_Backup, "/vm/<vm>/backup")
|
||||
|
||||
|
||||
# /vm/<vm>/restore
|
||||
class API_VM_Restore(Resource):
|
||||
@RequestParser(
|
||||
[
|
||||
{
|
||||
"name": "backup_path",
|
||||
"required": True,
|
||||
"helptext": "A local filesystem path on the primary coordinator must be specified",
|
||||
},
|
||||
{
|
||||
"name": "backup_datestring",
|
||||
"required": True,
|
||||
"helptext": "A backup datestring must be specified",
|
||||
},
|
||||
{
|
||||
"name": "retain_snapshot",
|
||||
"required": False,
|
||||
},
|
||||
]
|
||||
)
|
||||
@Authenticator
|
||||
def post(self, vm, reqargs):
|
||||
"""
|
||||
Restore a backup of {vm} and its volumes from a local primary coordinator filesystem path
|
||||
---
|
||||
tags:
|
||||
- vm
|
||||
parameters:
|
||||
- in: query
|
||||
name: backup_path
|
||||
type: string
|
||||
required: true
|
||||
description: A local filesystem path on the primary coordinator where the backup is stored
|
||||
- in: query
|
||||
name: backup_datestring
|
||||
type: string
|
||||
required: true
|
||||
description: The backup datestring identifier (e.g. 20230102030405)
|
||||
- in: query
|
||||
name: retain_snapshot
|
||||
type: boolean
|
||||
required: false
|
||||
default: true
|
||||
description: Whether or not to retain the (parent, if incremental) volume snapshot after restore
|
||||
responses:
|
||||
200:
|
||||
description: OK
|
||||
schema:
|
||||
type: object
|
||||
id: Message
|
||||
400:
|
||||
description: Execution error
|
||||
schema:
|
||||
type: object
|
||||
id: Message
|
||||
404:
|
||||
description: Not found
|
||||
schema:
|
||||
type: object
|
||||
id: Message
|
||||
"""
|
||||
backup_path = reqargs.get("backup_path", None)
|
||||
backup_datestring = reqargs.get("backup_datestring", None)
|
||||
retain_snapshot = bool(strtobool(reqargs.get("retain_snapshot", "true")))
|
||||
return api_helper.vm_restore(
|
||||
vm, backup_path, backup_datestring, retain_snapshot
|
||||
)
|
||||
|
||||
|
||||
api.add_resource(API_VM_Restore, "/vm/<vm>/restore")
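
The two resources above take only query parameters, so they can be exercised with any HTTP client once the API is reachable. A minimal sketch using Python requests follows; the base URL, port, authentication header, VM name, and backup path are assumptions for illustration, while the endpoint paths and parameters mirror the definitions above.

```python
# Minimal sketch of driving the new endpoints with the "requests" library.
# The base URL, port, and authentication header are assumptions for
# illustration; the paths and query parameters mirror the definitions above.
import requests

API = "http://pvc-primary.example.com:7370/api/v1"   # hypothetical address
HEADERS = {"X-Api-Key": "secret"}                     # if authentication is enabled

# Full backup of VM "web01", retaining snapshots for later incrementals
r = requests.post(
    f"{API}/vm/web01/backup",
    headers=HEADERS,
    params={"backup_path": "/srv/backups", "retain_snapshot": "true"},
)
print(r.status_code, r.json().get("message"))

# Remove a backup by its datestring
r = requests.delete(
    f"{API}/vm/web01/backup",
    headers=HEADERS,
    params={"backup_path": "/srv/backups", "backup_datestring": "20230102030405"},
)

# Restore that backup (the VM must not already exist in the cluster)
r = requests.post(
    f"{API}/vm/web01/restore",
    headers=HEADERS,
    params={"backup_path": "/srv/backups", "backup_datestring": "20230102030405"},
)
```
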
|
||||
|
||||
|
||||
##########################################################
|
||||
# Client API - Network
|
||||
##########################################################
|
||||
@ -4843,7 +5039,7 @@ class API_Storage_Ceph_Volume_Root(Resource):
|
||||
{
|
||||
"name": "size",
|
||||
"required": True,
|
||||
"helptext": "A volume size in bytes (or with k/M/G/T suffix) must be specified.",
|
||||
"helptext": "A volume size in bytes (B implied or with SI suffix k/M/G/T) must be specified.",
|
||||
},
|
||||
]
|
||||
)
|
||||
@ -4869,7 +5065,7 @@ class API_Storage_Ceph_Volume_Root(Resource):
|
||||
name: size
|
||||
type: string
|
||||
required: true
|
||||
description: The volume size in bytes (or with a metric suffix, i.e. k/M/G/T)
|
||||
description: The volume size, in bytes (B implied) or with a single-character SI suffix (k/M/G/T)
|
||||
responses:
|
||||
200:
|
||||
description: OK
|
||||
@ -5122,7 +5318,7 @@ class API_Storage_Ceph_Volume_Element_Upload(Resource):
|
||||
name: file_size
|
||||
type: integer
|
||||
required: false
|
||||
description: The size of the image file, if {image_format} is not "raw"
|
||||
description: The size of the image file, in bytes, if {image_format} is not "raw"
|
||||
responses:
|
||||
200:
|
||||
description: OK
|
||||
|
@ -470,6 +470,88 @@ def vm_define(
|
||||
return output, retcode
|
||||
|
||||
|
||||
@ZKConnection(config)
|
||||
def vm_backup(
|
||||
zkhandler,
|
||||
domain,
|
||||
backup_path,
|
||||
incremental_parent=None,
|
||||
retain_snapshot=False,
|
||||
):
|
||||
"""
|
||||
Back up a VM to a local (primary coordinator) filesystem path.
|
||||
"""
|
||||
retflag, retdata = pvc_vm.backup_vm(
|
||||
zkhandler,
|
||||
domain,
|
||||
backup_path,
|
||||
incremental_parent,
|
||||
retain_snapshot,
|
||||
)
|
||||
|
||||
if retflag:
|
||||
retcode = 200
|
||||
else:
|
||||
retcode = 400
|
||||
|
||||
output = {"message": retdata.replace('"', "'")}
|
||||
return output, retcode
|
||||
|
||||
|
||||
@ZKConnection(config)
|
||||
def vm_remove_backup(
|
||||
zkhandler,
|
||||
domain,
|
||||
source_path,
|
||||
datestring,
|
||||
):
|
||||
"""
|
||||
Remove a VM backup from snapshots and a local (primary coordinator) filesystem path.
|
||||
"""
|
||||
retflag, retdata = pvc_vm.remove_backup(
|
||||
zkhandler,
|
||||
domain,
|
||||
source_path,
|
||||
datestring,
|
||||
)
|
||||
|
||||
if retflag:
|
||||
retcode = 200
|
||||
else:
|
||||
retcode = 400
|
||||
|
||||
output = {"message": retdata.replace('"', "'")}
|
||||
return output, retcode
|
||||
|
||||
|
||||
@ZKConnection(config)
|
||||
def vm_restore(
|
||||
zkhandler,
|
||||
domain,
|
||||
backup_path,
|
||||
datestring,
|
||||
retain_snapshot=False,
|
||||
):
|
||||
"""
|
||||
Restore a VM from a local (primary coordinator) filesystem path.
|
||||
"""
|
||||
retflag, retdata = pvc_vm.restore_vm(
|
||||
zkhandler,
|
||||
domain,
|
||||
backup_path,
|
||||
datestring,
|
||||
retain_snapshot,
|
||||
)
|
||||
|
||||
if retflag:
|
||||
retcode = 200
|
||||
else:
|
||||
retcode = 400
|
||||
|
||||
output = {"message": retdata.replace('"', "'")}
|
||||
return output, retcode
|
||||
|
||||
|
||||
@ZKConnection(config)
|
||||
def vm_attach_device(zkhandler, vm, device_spec_xml):
|
||||
"""
|
||||
@ -1629,7 +1711,6 @@ def ceph_volume_upload(zkhandler, pool, volume, img_type, file_size=None):
|
||||
zkhandler, pool, "{}_tmp".format(volume)
|
||||
)
|
||||
|
||||
# Create a temporary block device to store non-raw images
|
||||
if img_type == "raw":
|
||||
if file_size != dev_size:
|
||||
output = {
|
||||
@ -1676,7 +1757,6 @@ def ceph_volume_upload(zkhandler, pool, volume, img_type, file_size=None):
|
||||
cleanup_maps_and_volumes()
|
||||
return output, retcode
|
||||
|
||||
# Write the image directly to the blockdev
|
||||
else:
|
||||
if file_size is None:
|
||||
output = {"message": "A file size must be specified"}
|
||||
|
@ -1,4 +1,4 @@
|
||||
#!/bin/bash
|
||||
#!/usr/bin/env bash
|
||||
|
||||
# A useful script for testing out changes to PVC by building the debs and deploying them out to a
|
||||
# set of hosts automatically, including restarting the daemon (with a pause between) on the remote
|
||||
@ -36,34 +36,37 @@ echo "Preparing code (format and lint)..."
|
||||
./lint || exit 1
|
||||
|
||||
# Build the packages
|
||||
echo -n "Building packages... "
|
||||
echo -n "Building packages..."
|
||||
version="$( ./build-unstable-deb.sh 2>/dev/null )"
|
||||
echo "done. Package version ${version}."
|
||||
echo " done. Package version ${version}."
|
||||
|
||||
# Install the client(s) locally
|
||||
echo -n "Installing client packages locally... "
|
||||
echo -n "Installing client packages locally..."
|
||||
$SUDO dpkg -i ../pvc-client*_${version}*.deb &>/dev/null
|
||||
echo "done".
|
||||
echo " done".
|
||||
|
||||
for HOST in ${HOSTS[@]}; do
|
||||
echo "> Deploying packages to host ${HOST}"
|
||||
echo -n "Copying packages... "
|
||||
echo -n "Copying packages..."
|
||||
ssh $HOST $SUDO rm -rf /tmp/pvc &>/dev/null
|
||||
ssh $HOST mkdir /tmp/pvc &>/dev/null
|
||||
scp ../pvc-*_${version}*.deb $HOST:/tmp/pvc/ &>/dev/null
|
||||
echo "done."
|
||||
echo -n "Installing packages... "
|
||||
echo " done."
|
||||
echo -n "Installing packages..."
|
||||
ssh $HOST $SUDO dpkg -i /tmp/pvc/{pvc-client-cli,pvc-daemon-common,pvc-daemon-api,pvc-daemon-node}*.deb &>/dev/null
|
||||
ssh $HOST rm -rf /tmp/pvc &>/dev/null
|
||||
echo "done."
|
||||
echo -n "Restarting PVC daemons... "
|
||||
echo " done."
|
||||
echo -n "Restarting PVC daemons..."
|
||||
ssh $HOST $SUDO systemctl restart pvcapid &>/dev/null
|
||||
ssh $HOST $SUDO systemctl restart pvcapid-worker &>/dev/null
|
||||
ssh $HOST $SUDO systemctl restart pvcnoded &>/dev/null
|
||||
echo "done."
|
||||
echo -n "Waiting 30s for host to stabilize... "
|
||||
sleep 30
|
||||
echo "done."
|
||||
echo " done."
|
||||
echo -n "Waiting for node daemon to be running..."
|
||||
while [[ $( ssh $HOST "pvc -q node list -f json ${HOST%%.*} | jq -r '.[].daemon_state'" ) != "run" ]]; do
|
||||
sleep 5
|
||||
echo -n "."
|
||||
done
|
||||
echo " done."
|
||||
done
|
||||
if [[ -z ${KEEP_ARTIFACTS} ]]; then
|
||||
rm ../pvc*_${version}*
|
||||
|
@ -1,4 +1,4 @@
|
||||
#!/bin/sh
|
||||
#!/usr/bin/env bash
|
||||
pushd $( git rev-parse --show-toplevel ) &>/dev/null
|
||||
ver="$( head -1 debian/changelog | awk -F'[()-]' '{ print $2 }' )"
|
||||
git pull
|
||||
|
@ -1,4 +1,4 @@
|
||||
#!/bin/sh
|
||||
#!/usr/bin/env bash
|
||||
set -o xtrace
|
||||
exec 3>&1
|
||||
exec 1>&2
|
||||
|
@ -1590,6 +1590,149 @@ def cli_vm_flush_locks(domain):
|
||||
finish(retcode, retmsg)
|
||||
|
||||
|
||||
###############################################################################
|
||||
# > pvc vm backup
|
||||
###############################################################################
|
||||
@click.group(
|
||||
name="backup",
|
||||
short_help="Manage backups for PVC VMs.",
|
||||
context_settings=CONTEXT_SETTINGS,
|
||||
)
|
||||
def cli_vm_backup():
|
||||
"""
|
||||
Manage backups of VMs in a PVC cluster.
|
||||
"""
|
||||
pass
|
||||
|
||||
|
||||
###############################################################################
|
||||
# > pvc vm backup create
|
||||
###############################################################################
|
||||
@click.command(name="create", short_help="Create a backup of a virtual machine.")
|
||||
@connection_req
|
||||
@click.argument("domain")
|
||||
@click.argument("backup_path")
|
||||
@click.option(
|
||||
"-i",
|
||||
"--incremental",
|
||||
"incremental_parent",
|
||||
default=None,
|
||||
help="Perform an incremental volume backup from this parent backup datestring.",
|
||||
)
|
||||
@click.option(
|
||||
"-r",
|
||||
"--retain-snapshot",
|
||||
"retain_snapshot",
|
||||
is_flag=True,
|
||||
default=False,
|
||||
help="Retain volume snapshot for future incremental use (full only).",
|
||||
)
|
||||
def cli_vm_backup_create(domain, backup_path, incremental_parent, retain_snapshot):
|
||||
"""
|
||||
Create a backup of virtual machine DOMAIN to BACKUP_PATH on the cluster primary coordinator. DOMAIN may be a UUID or name.
|
||||
|
||||
BACKUP_PATH must be a valid absolute directory path on the cluster "primary" coordinator (see "pvc node list") allowing writes from the API daemon (normally running as "root"). The BACKUP_PATH should be a large storage volume, ideally a remotely mounted filesystem (e.g. NFS, SSHFS, etc.) or non-Ceph-backed disk; PVC does not handle this path, that is up to the administrator to configure and manage.
|
||||
|
||||
The backup will export the VM configuration, metainfo, and a point-in-time snapshot of all attached RBD volumes, using a datestring formatted backup name (i.e. YYYYMMDDHHMMSS).
|
||||
|
||||
The virtual machine DOMAIN may be running, and due to snapshots the backup should be crash-consistent, but will be in an unclean state and this must be considered when restoring from backups.
|
||||
|
||||
Incremental snapshots are possible by specifying the "-i"/"--incremental" option along with a source backup datestring. The snapshots from that source backup must have been retained using the "-r"/"--retain-snapshot" option. Retaining snapshots of incremental backups is not supported as incremental backups cannot be chained.
|
||||
|
||||
Full backup volume images are sparse-allocated; however, it is recommended for safety to consider their maximum allocated size when allocating space for the BACKUP_PATH. Incremental volume images are generally small but are dependent entirely on the rate of data change in each volume.
|
||||
"""
|
||||
|
||||
echo(
|
||||
CLI_CONFIG,
|
||||
f"Backing up VM '{domain}'... ",
|
||||
newline=False,
|
||||
)
|
||||
retcode, retmsg = pvc.lib.vm.vm_backup(
|
||||
CLI_CONFIG, domain, backup_path, incremental_parent, retain_snapshot
|
||||
)
|
||||
if retcode:
|
||||
echo(CLI_CONFIG, "done.")
|
||||
else:
|
||||
echo(CLI_CONFIG, "failed.")
|
||||
finish(retcode, retmsg)
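
As an illustration of the workflow this command enables (the VM name, path, and datestring here are hypothetical, not taken from this changeset): a first full backup that keeps its snapshots could be run as `pvc vm backup create -r web01 /srv/backups`, and a later incremental against it as `pvc vm backup create -i 20231024021024 web01 /srv/backups`, where the datestring is the YYYYMMDDHHMMSS identifier reported by the earlier full backup.
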
|
||||
|
||||
|
||||
###############################################################################
|
||||
# > pvc vm backup restore
|
||||
###############################################################################
|
||||
@click.command(name="restore", short_help="Restore a backup of a virtual machine.")
|
||||
@connection_req
|
||||
@click.argument("domain")
|
||||
@click.argument("backup_datestring")
|
||||
@click.argument("backup_path")
|
||||
@click.option(
|
||||
"-r/-R",
|
||||
"--retain-snapshot/--remove-snapshot",
|
||||
"retain_snapshot",
|
||||
is_flag=True,
|
||||
default=True,
|
||||
help="Retain or remove restored (parent, if incremental) snapshot.",
|
||||
)
|
||||
def cli_vm_backup_restore(domain, backup_datestring, backup_path, retain_snapshot):
|
||||
"""
|
||||
Restore the backup BACKUP_DATESTRING of virtual machine DOMAIN stored in BACKUP_PATH on the cluster primary coordinator. DOMAIN may be a UUID or name.
|
||||
|
||||
BACKUP_PATH must be a valid absolute directory path on the cluster "primary" coordinator (see "pvc node list") allowing reads from the API daemon (normally running as "root"). The BACKUP_PATH should be a large storage volume, ideally a remotely mounted filesystem (e.g. NFS, SSHFS, etc.) or non-Ceph-backed disk; PVC does not handle this path, that is up to the administrator to configure and manage.
|
||||
|
||||
The restore will import the VM configuration, metainfo, and the point-in-time snapshot of all attached RBD volumes. Incremental backups will be automatically handled.
|
||||
|
||||
A VM named DOMAIN or with the same UUID must not exist; if a VM with the same name or UUID already exists, it must be removed, or renamed and then undefined (to preserve volumes), before restoring.
|
||||
|
||||
If the "-r"/"--retain-snapshot" option is specified (the default), for incremental restores, only the parent snapshot is kept; for full restores, the restored snapshot is kept. If the "-R"/"--remove-snapshot" option is specified, the imported snapshot is removed.
|
||||
|
||||
WARNING: The "-R"/"--remove-snapshot" option will invalidate any existing incremental backups based on the same incremental parent for the restored VM.
|
||||
"""
|
||||
|
||||
echo(
|
||||
CLI_CONFIG,
|
||||
f"Restoring backup {backup_datestring} of VM '{domain}'... ",
|
||||
newline=False,
|
||||
)
|
||||
retcode, retmsg = pvc.lib.vm.vm_restore(
|
||||
CLI_CONFIG, domain, backup_path, backup_datestring, retain_snapshot
|
||||
)
|
||||
if retcode:
|
||||
echo(CLI_CONFIG, "done.")
|
||||
else:
|
||||
echo(CLI_CONFIG, "failed.")
|
||||
finish(retcode, retmsg)
|
||||
|
||||
|
||||
###############################################################################
|
||||
# > pvc vm backup remove
|
||||
###############################################################################
|
||||
@click.command(name="remove", short_help="Remove a backup of a virtual machine.")
|
||||
@connection_req
|
||||
@click.argument("domain")
|
||||
@click.argument("backup_datestring")
|
||||
@click.argument("backup_path")
|
||||
def cli_vm_backup_remove(domain, backup_datestring, backup_path):
|
||||
"""
|
||||
Remove the backup BACKUP_DATESTRING, including snapshots, of virtual machine DOMAIN stored in BACKUP_PATH on the cluster primary coordinator. DOMAIN may be a UUID or name.
|
||||
|
||||
WARNING: Removing an incremental parent will invalidate any existing incremental backups based on that backup.
|
||||
"""
|
||||
|
||||
echo(
|
||||
CLI_CONFIG,
|
||||
f"Removing backup {backup_datestring} of VM '{domain}'... ",
|
||||
newline=False,
|
||||
)
|
||||
retcode, retmsg = pvc.lib.vm.vm_remove_backup(
|
||||
CLI_CONFIG, domain, backup_path, backup_datestring
|
||||
)
|
||||
if retcode:
|
||||
echo(CLI_CONFIG, "done.")
|
||||
else:
|
||||
echo(CLI_CONFIG, "failed.")
|
||||
finish(retcode, retmsg)
|
||||
|
||||
|
||||
###############################################################################
|
||||
# > pvc vm tag
|
||||
###############################################################################
|
||||
@ -3457,14 +3600,14 @@ def cli_storage_pool():
|
||||
show_default=True,
|
||||
required=False,
|
||||
help="""
|
||||
The replication configuration, specifying both a "copies" and "mincopies" value, separated by a comma, e.g. "copies=3,mincopies=2". The "copies" value specifies the total number of replicas and should not exceed the total number of nodes; the "mincopies" value specifies the minimum number of available copies to allow writes. For additional details please see the Cluster Architecture documentation.
|
||||
The replication configuration, specifying both a "copies" and "mincopies" value, separated by a comma, e.g. "copies=3,mincopies=2". The "copies" value specifies the total number of replicas and the "mincopies" value specifies the minimum number of active replicas to allow I/O. For additional details please see the documentation.
|
||||
""",
|
||||
)
|
||||
def cli_storage_pool_add(name, pgs, tier, replcfg):
|
||||
"""
|
||||
Add a new Ceph RBD pool with name NAME and PGS placement groups.
|
||||
|
||||
The placement group count must be a non-zero power of 2.
|
||||
The placement group count must be a non-zero power of 2. Generally you should choose a PGS number such that there will be 50-150 PGs on each OSD in a single node (before replicas); 64, 128, or 256 are good values for small clusters (1-5 OSDs per node); higher values are recommended for higher node or OSD counts. For additional details please see the documentation.
|
||||
"""
|
||||
|
||||
retcode, retmsg = pvc.lib.storage.ceph_pool_add(
|
||||
@ -3503,9 +3646,9 @@ def cli_storage_pool_set_pgs(name, pgs):
|
||||
"""
|
||||
Set the placement groups (PGs) count for the pool NAME to PGS.
|
||||
|
||||
The placement group count must be a non-zero power of 2.
|
||||
The placement group count must be a non-zero power of 2. Generally you should choose a PGS number such that there will be 50-150 PGs on each OSD in a single node (before replicas); 64, 128, or 256 are good values for small clusters (1-5 OSDs per node); higher values are recommended for higher node or OSD counts. For additional details please see the documentation.
|
||||
|
||||
Placement group counts may be increased or decreased as required though frequent alteration is not recommended.
|
||||
Placement group counts may be increased or decreased as required though frequent alteration is not recommended. Placement group alterations are intensive operations on the storage cluster.
|
||||
"""
|
||||
|
||||
retcode, retmsg = pvc.lib.storage.ceph_pool_set_pgs(CLI_CONFIG, name, pgs)
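
To make the sizing guidance in these help texts concrete (illustrative numbers only, not from this changeset): with 4 OSDs per node, a 256-PG pool yields roughly 256 / 4 = 64 PGs per OSD before replicas, comfortably inside the suggested 50-150 range, whereas with 8 OSDs per node the same pool yields only 32, so 512 PGs would be a better starting point.
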
|
||||
@ -5659,6 +5802,10 @@ cli_vm.add_command(cli_vm_move)
|
||||
cli_vm.add_command(cli_vm_migrate)
|
||||
cli_vm.add_command(cli_vm_unmigrate)
|
||||
cli_vm.add_command(cli_vm_flush_locks)
|
||||
cli_vm_backup.add_command(cli_vm_backup_create)
|
||||
cli_vm_backup.add_command(cli_vm_backup_restore)
|
||||
cli_vm_backup.add_command(cli_vm_backup_remove)
|
||||
cli_vm.add_command(cli_vm_backup)
|
||||
cli_vm_tag.add_command(cli_vm_tag_get)
|
||||
cli_vm_tag.add_command(cli_vm_tag_add)
|
||||
cli_vm_tag.add_command(cli_vm_tag_remove)
|
||||
|
@ -433,6 +433,70 @@ def vm_locks(config, vm):
|
||||
return retstatus, response.json().get("message", "")
|
||||
|
||||
|
||||
def vm_backup(config, vm, backup_path, incremental_parent=None, retain_snapshot=False):
|
||||
"""
|
||||
Create a backup of {vm} and its volumes to a local primary coordinator filesystem path
|
||||
|
||||
API endpoint: POST /vm/{vm}/backup
|
||||
API arguments: backup_path={backup_path}, incremental_parent={incremental_parent}, retain_snapshot={retain_snapshot}
|
||||
API schema: {"message":"{data}"}
|
||||
"""
|
||||
params = {
|
||||
"backup_path": backup_path,
|
||||
"incremental_parent": incremental_parent,
|
||||
"retain_snapshot": retain_snapshot,
|
||||
}
|
||||
response = call_api(config, "post", "/vm/{vm}/backup".format(vm=vm), params=params)
|
||||
|
||||
if response.status_code != 200:
|
||||
return False, response.json().get("message", "")
|
||||
else:
|
||||
return True, response.json().get("message", "")
|
||||
|
||||
|
||||
def vm_remove_backup(config, vm, backup_path, backup_datestring):
|
||||
"""
|
||||
Remove a backup of {vm}, including snapshots, from a local primary coordinator filesystem path
|
||||
|
||||
API endpoint: DELETE /vm/{vm}/backup
|
||||
API arguments: backup_path={backup_path}, backup_datestring={backup_datestring}
|
||||
API schema: {"message":"{data}"}
|
||||
"""
|
||||
params = {
|
||||
"backup_path": backup_path,
|
||||
"backup_datestring": backup_datestring,
|
||||
}
|
||||
response = call_api(
|
||||
config, "delete", "/vm/{vm}/backup".format(vm=vm), params=params
|
||||
)
|
||||
|
||||
if response.status_code != 200:
|
||||
return False, response.json().get("message", "")
|
||||
else:
|
||||
return True, response.json().get("message", "")
|
||||
|
||||
|
||||
def vm_restore(config, vm, backup_path, backup_datestring, retain_snapshot=False):
|
||||
"""
|
||||
Restore a backup of {vm} and its volumes from a local primary coordinator filesystem path
|
||||
|
||||
API endpoint: POST /vm/{vm}/restore
|
||||
API arguments: backup_path={backup_path}, backup_datestring={backup_datestring}, retain_snapshot={retain_snapshot}
|
||||
API schema: {"message":"{data}"}
|
||||
"""
|
||||
params = {
|
||||
"backup_path": backup_path,
|
||||
"backup_datestring": backup_datestring,
|
||||
"retain_snapshot": retain_snapshot,
|
||||
}
|
||||
response = call_api(config, "post", "/vm/{vm}/restore".format(vm=vm), params=params)
|
||||
|
||||
if response.status_code != 200:
|
||||
return False, response.json().get("message", "")
|
||||
else:
|
||||
return True, response.json().get("message", "")
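
Each of these client helpers returns a (success, message) pair in the same shape the CLI commands consume. A minimal sketch of scripting them directly follows; "config" stands in for the CLI's loaded connection dictionary, whose exact keys are not shown in this diff.

```python
# Minimal sketch of calling the client library helpers directly.
# "config" is a placeholder for the CLI's loaded connection dictionary;
# its exact contents are assumed here, not taken from this diff.
import pvc.lib.vm

def backup_then_verify(config, vm, path):
    # Full backup, retaining snapshots for a later incremental
    ok, msg = pvc.lib.vm.vm_backup(config, vm, path, retain_snapshot=True)
    print(msg)
    if not ok:
        raise RuntimeError(f"Backup of {vm} failed")

# Later, remove an old backup by its datestring (hypothetical value):
# ok, msg = pvc.lib.vm.vm_remove_backup(config, vm, path, "20230102030405")
```
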
|
||||
|
||||
|
||||
def vm_vcpus_set(config, vm, vcpus, topology, restart):
|
||||
"""
|
||||
Set the vCPU count of the VM with topology
|
||||
|
@ -2,7 +2,7 @@ from setuptools import setup
|
||||
|
||||
setup(
|
||||
name="pvc",
|
||||
version="0.9.77",
|
||||
version="0.9.79",
|
||||
packages=["pvc.cli", "pvc.lib"],
|
||||
install_requires=[
|
||||
"Click",
|
||||
|
@ -763,9 +763,7 @@ def add_volume(zkhandler, pool, name, size):
|
||||
|
||||
# 2. Create the volume
|
||||
retcode, stdout, stderr = common.run_os_command(
|
||||
"rbd create --size {} {}/{}".format(
|
||||
format_bytes_tohuman(size_bytes), pool, name
|
||||
)
|
||||
"rbd create --size {}B {}/{}".format(size_bytes, pool, name)
|
||||
)
|
||||
if retcode:
|
||||
return False, 'ERROR: Failed to create RBD volume "{}": {}'.format(name, stderr)
|
||||
@ -1115,23 +1113,24 @@ def getCephSnapshots(zkhandler, pool, volume):
|
||||
return snapshot_list
|
||||
|
||||
|
||||
def add_snapshot(zkhandler, pool, volume, name):
|
||||
def add_snapshot(zkhandler, pool, volume, name, zk_only=False):
|
||||
if not verifyVolume(zkhandler, pool, volume):
|
||||
return False, 'ERROR: No volume with name "{}" is present in pool "{}".'.format(
|
||||
volume, pool
|
||||
)
|
||||
|
||||
# 1. Create the snapshot
|
||||
retcode, stdout, stderr = common.run_os_command(
|
||||
"rbd snap create {}/{}@{}".format(pool, volume, name)
|
||||
)
|
||||
if retcode:
|
||||
return (
|
||||
False,
|
||||
'ERROR: Failed to create RBD snapshot "{}" of volume "{}" in pool "{}": {}'.format(
|
||||
name, volume, pool, stderr
|
||||
),
|
||||
if not zk_only:
|
||||
retcode, stdout, stderr = common.run_os_command(
|
||||
"rbd snap create {}/{}@{}".format(pool, volume, name)
|
||||
)
|
||||
if retcode:
|
||||
return (
|
||||
False,
|
||||
'ERROR: Failed to create RBD snapshot "{}" of volume "{}" in pool "{}": {}'.format(
|
||||
name, volume, pool, stderr
|
||||
),
|
||||
)
|
||||
|
||||
# 2. Add the snapshot to Zookeeper
|
||||
zkhandler.write(
|
||||
|
@ -146,7 +146,11 @@ def run_os_daemon(command_string, environment=None, logfile=None):
|
||||
# Run a local OS command via shell
|
||||
#
|
||||
def run_os_command(command_string, background=False, environment=None, timeout=None):
|
||||
command = shlex_split(command_string)
|
||||
if not isinstance(command_string, list):
|
||||
command = shlex_split(command_string)
|
||||
else:
|
||||
command = command_string
|
||||
|
||||
if background:
|
||||
|
||||
def runcmd():
|
||||
|
@ -21,12 +21,18 @@
|
||||
|
||||
import time
|
||||
import re
|
||||
import os.path
|
||||
import lxml.objectify
|
||||
import lxml.etree
|
||||
|
||||
from distutils.util import strtobool
|
||||
from uuid import UUID
|
||||
from concurrent.futures import ThreadPoolExecutor
|
||||
from datetime import datetime
|
||||
from distutils.util import strtobool
|
||||
from json import dump as jdump
|
||||
from json import load as jload
|
||||
from shutil import rmtree
|
||||
from socket import gethostname
|
||||
from uuid import UUID
|
||||
|
||||
import daemon_lib.common as common
|
||||
|
||||
@ -1175,13 +1181,15 @@ def get_info(zkhandler, domain):
|
||||
return True, domain_information
|
||||
|
||||
|
||||
def get_list(zkhandler, node, state, tag, limit, is_fuzzy=True, negate=False):
|
||||
if node:
|
||||
def get_list(
|
||||
zkhandler, node=None, state=None, tag=None, limit=None, is_fuzzy=True, negate=False
|
||||
):
|
||||
if node is not None:
|
||||
# Verify node is valid
|
||||
if not common.verifyNode(zkhandler, node):
|
||||
return False, 'Specified node "{}" is invalid.'.format(node)
|
||||
|
||||
if state:
|
||||
if state is not None:
|
||||
valid_states = [
|
||||
"start",
|
||||
"restart",
|
||||
@ -1200,7 +1208,7 @@ def get_list(zkhandler, node, state, tag, limit, is_fuzzy=True, negate=False):
|
||||
full_vm_list.sort()
|
||||
|
||||
# Set our limit to a sensible regex
|
||||
if limit:
|
||||
if limit is not None:
|
||||
# Check if the limit is a UUID
|
||||
is_limit_uuid = False
|
||||
try:
|
||||
@ -1229,7 +1237,7 @@ def get_list(zkhandler, node, state, tag, limit, is_fuzzy=True, negate=False):
|
||||
is_state_match = False
|
||||
|
||||
# Check on limit
|
||||
if limit:
|
||||
if limit is not None:
|
||||
# Try to match the limit against the UUID (if applicable) and name
|
||||
try:
|
||||
if is_limit_uuid and re.fullmatch(limit, vm):
|
||||
@ -1241,7 +1249,7 @@ def get_list(zkhandler, node, state, tag, limit, is_fuzzy=True, negate=False):
|
||||
else:
|
||||
is_limit_match = True
|
||||
|
||||
if tag:
|
||||
if tag is not None:
|
||||
vm_tags = zkhandler.children(("domain.meta.tags", vm))
|
||||
if negate and tag not in vm_tags:
|
||||
is_tag_match = True
|
||||
@ -1251,7 +1259,7 @@ def get_list(zkhandler, node, state, tag, limit, is_fuzzy=True, negate=False):
|
||||
is_tag_match = True
|
||||
|
||||
# Check on node
|
||||
if node:
|
||||
if node is not None:
|
||||
vm_node = zkhandler.read(("domain.node", vm))
|
||||
if negate and vm_node != node:
|
||||
is_node_match = True
|
||||
@ -1261,7 +1269,7 @@ def get_list(zkhandler, node, state, tag, limit, is_fuzzy=True, negate=False):
|
||||
is_node_match = True
|
||||
|
||||
# Check on state
|
||||
if state:
|
||||
if state is not None:
|
||||
vm_state = zkhandler.read(("domain.state", vm))
|
||||
if negate and vm_state != state:
|
||||
is_state_match = True
|
||||
@ -1297,3 +1305,541 @@ def get_list(zkhandler, node, state, tag, limit, is_fuzzy=True, negate=False):
|
||||
pass
|
||||
|
||||
return True, sorted(vm_data_list, key=lambda d: d["name"])
|
||||
|
||||
|
||||
def backup_vm(
|
||||
zkhandler, domain, backup_path, incremental_parent=None, retain_snapshot=False
|
||||
):
|
||||
|
||||
tstart = time.time()
|
||||
|
||||
# 0. Validations
|
||||
# Disallow retaining snapshots with an incremental parent
|
||||
if incremental_parent is not None and retain_snapshot:
|
||||
return (
|
||||
False,
|
||||
"ERROR: Retaining snapshots of incremental backups is not supported!",
|
||||
)
|
||||
|
||||
# Validate that VM exists in cluster
|
||||
dom_uuid = getDomainUUID(zkhandler, domain)
|
||||
if not dom_uuid:
|
||||
return False, 'ERROR: Could not find VM "{}" in the cluster!'.format(domain)
|
||||
|
||||
# Validate that the target path is valid
|
||||
if not re.match(r"^/", backup_path):
|
||||
return (
|
||||
False,
|
||||
f"ERROR: Target path {backup_path} is not a valid absolute path on the primary coordinator!",
|
||||
)
|
||||
|
||||
# Ensure that backup_path (on this node) exists
|
||||
if not os.path.isdir(backup_path):
|
||||
return False, f"ERROR: Target path {backup_path} does not exist!"
|
||||
|
||||
# 1. Get information about VM
|
||||
vm_detail = get_list(zkhandler, limit=dom_uuid, is_fuzzy=False)[1][0]
|
||||
if not isinstance(vm_detail, dict):
|
||||
return False, f"ERROR: VM listing returned invalid data: {vm_detail}"
|
||||
|
||||
vm_volumes = list()
|
||||
for disk in vm_detail["disks"]:
|
||||
if disk["type"] != "rbd":
|
||||
continue
|
||||
|
||||
pool, volume = disk["name"].split("/")
|
||||
|
||||
retcode, retdata = ceph.get_list_volume(zkhandler, pool, volume, is_fuzzy=False)
|
||||
if not retcode or len(retdata) != 1:
|
||||
if len(retdata) < 1:
|
||||
retdata = "No volumes returned."
|
||||
elif len(retdata) > 1:
|
||||
retdata = "Multiple volumes returned."
|
||||
return (
|
||||
False,
|
||||
f"ERROR: Failed to get volume details for {pool}/{volume}: {retdata}",
|
||||
)
|
||||
|
||||
try:
|
||||
size = retdata[0]["stats"]["size"]
|
||||
except Exception as e:
|
||||
return False, f"ERROR: Failed to get volume size for {pool}/{volume}: {e}"
|
||||
|
||||
vm_volumes.append((pool, volume, size))
|
||||
|
||||
# 2a. Validate that all volumes exist (they should, but just in case)
|
||||
for pool, volume, _ in vm_volumes:
|
||||
if not ceph.verifyVolume(zkhandler, pool, volume):
|
||||
return (
|
||||
False,
|
||||
f"ERROR: VM defines a volume {pool}/{volume} which does not exist!",
|
||||
)
|
||||
|
||||
# 2b. Validate that, if an incremental_parent is given, it is valid
|
||||
# The incremental parent is just a datestring
|
||||
if incremental_parent is not None:
|
||||
for pool, volume, _ in vm_volumes:
|
||||
if not ceph.verifySnapshot(
|
||||
zkhandler, pool, volume, f"backup_{incremental_parent}"
|
||||
):
|
||||
return (
|
||||
False,
|
||||
f"ERROR: Incremental parent {incremental_parent} given, but no snapshots were found; cannot export an incremental backup.",
|
||||
)
|
||||
|
||||
export_fileext = "rbddiff"
|
||||
else:
|
||||
export_fileext = "rbdimg"
|
||||
|
||||
# 2c. Validate that there's enough space on the target
|
||||
# TODO
|
||||
|
||||
# 3. Set datestring in YYYYMMDDHHMMSS format
|
||||
now = datetime.now()
|
||||
datestring = now.strftime("%Y%m%d%H%M%S")
|
||||
|
||||
snapshot_name = f"backup_{datestring}"
|
||||
|
||||
# 4. Create destination directory
|
||||
vm_target_root = f"{backup_path}/{domain}"
|
||||
vm_target_backup = f"{backup_path}/{domain}/{datestring}/pvcdisks"
|
||||
if not os.path.isdir(vm_target_backup):
|
||||
try:
|
||||
os.makedirs(vm_target_backup)
|
||||
except Exception as e:
|
||||
return False, f"ERROR: Failed to create backup directory: {e}"
|
||||
|
||||
# 5. Take a snapshot of each disk with the name @backup_{datestring}
|
||||
is_snapshot_create_failed = False
|
||||
which_snapshot_create_failed = list()
|
||||
msg_snapshot_create_failed = list()
|
||||
for pool, volume, _ in vm_volumes:
|
||||
retcode, retmsg = ceph.add_snapshot(zkhandler, pool, volume, snapshot_name)
|
||||
if not retcode:
|
||||
is_snapshot_create_failed = True
|
||||
which_snapshot_create_failed.append(f"{pool}/{volume}")
|
||||
msg_snapshot_create_failed.append(retmsg)
|
||||
|
||||
if is_snapshot_create_failed:
|
||||
for pool, volume, _ in vm_volumes:
|
||||
if ceph.verifySnapshot(zkhandler, pool, volume, snapshot_name):
|
||||
ceph.remove_snapshot(zkhandler, pool, volume, snapshot_name)
|
||||
return (
|
||||
False,
|
||||
f'ERROR: Failed to create snapshot for volume(s) {", ".join(which_snapshot_create_failed)}: {", ".join(msg_snapshot_create_failed)}',
|
||||
)
|
||||
|
||||
# 6. Dump snapshot to folder with `rbd export` (full) or `rbd export-diff` (incremental)
|
||||
is_snapshot_export_failed = False
|
||||
which_snapshot_export_failed = list()
|
||||
msg_snapshot_export_failed = list()
|
||||
for pool, volume, _ in vm_volumes:
|
||||
if incremental_parent is not None:
|
||||
incremental_parent_snapshot_name = f"backup_{incremental_parent}"
|
||||
retcode, stdout, stderr = common.run_os_command(
|
||||
f"rbd export-diff --from-snap {incremental_parent_snapshot_name} {pool}/{volume}@{snapshot_name} {vm_target_backup}/{pool}.{volume}.{export_fileext}"
|
||||
)
|
||||
if retcode:
|
||||
is_snapshot_export_failed = True
|
||||
which_snapshot_export_failed.append(f"{pool}/{volume}")
|
||||
msg_snapshot_export_failed.append(stderr)
|
||||
else:
|
||||
retcode, stdout, stderr = common.run_os_command(
|
||||
f"rbd export --export-format 2 {pool}/{volume}@{snapshot_name} {vm_target_backup}/{pool}.{volume}.{export_fileext}"
|
||||
)
|
||||
if retcode:
|
||||
is_snapshot_export_failed = True
|
||||
which_snapshot_export_failed.append(f"{pool}/{volume}")
|
||||
msg_snapshot_export_failed.append(stderr)
|
||||
|
||||
if is_snapshot_export_failed:
|
||||
for pool, volume, _ in vm_volumes:
|
||||
if ceph.verifySnapshot(zkhandler, pool, volume, snapshot_name):
|
||||
ceph.remove_snapshot(zkhandler, pool, volume, snapshot_name)
|
||||
return (
|
||||
False,
|
||||
f'ERROR: Failed to export snapshot for volume(s) {", ".join(which_snapshot_export_failed)}: {", ".join(msg_snapshot_export_failed)}',
|
||||
)
|
||||
|
||||
# 7. Create and dump VM backup information
|
||||
backup_type = "incremental" if incremental_parent is not None else "full"
|
||||
vm_backup = {
|
||||
"type": backup_type,
|
||||
"datestring": datestring,
|
||||
"incremental_parent": incremental_parent,
|
||||
"retained_snapshot": retain_snapshot,
|
||||
"vm_detail": vm_detail,
|
||||
"backup_files": [
|
||||
(f"pvcdisks/{p}.{v}.{export_fileext}", s) for p, v, s in vm_volumes
|
||||
],
|
||||
}
|
||||
with open(f"{vm_target_root}/{datestring}/pvcbackup.json", "w") as fh:
|
||||
jdump(vm_backup, fh)
|
||||
|
||||
# 8. Remove snapshots if retain_snapshot is False
|
||||
is_snapshot_remove_failed = False
|
||||
which_snapshot_remove_failed = list()
|
||||
msg_snapshot_remove_failed = list()
|
||||
if not retain_snapshot:
|
||||
for pool, volume, _ in vm_volumes:
|
||||
if ceph.verifySnapshot(zkhandler, pool, volume, snapshot_name):
|
||||
retcode, retmsg = ceph.remove_snapshot(
|
||||
zkhandler, pool, volume, snapshot_name
|
||||
)
|
||||
if not retcode:
|
||||
is_snapshot_remove_failed = True
|
||||
which_snapshot_remove_failed.append(f"{pool}/{volume}")
|
||||
msg_snapshot_remove_failed.append(retmsg)
|
||||
|
||||
tend = time.time()
|
||||
ttot = round(tend - tstart, 2)
|
||||
retlines = list()
|
||||
|
||||
if is_snapshot_remove_failed:
|
||||
retlines.append(
|
||||
f"WARNING: Failed to remove snapshot(s) as requested for volume(s) {', '.join(which_snapshot_remove_failed)}: {', '.join(msg_snapshot_remove_failed)}"
|
||||
)
|
||||
|
||||
myhostname = gethostname().split(".")[0]
|
||||
if retain_snapshot:
|
||||
retlines.append(
|
||||
f"Successfully backed up VM '{domain}' ({backup_type}@{datestring}, snapshots retained) to '{myhostname}:{backup_path}' in {ttot}s."
|
||||
)
|
||||
else:
|
||||
retlines.append(
|
||||
f"Successfully backed up VM '{domain}' ({backup_type}@{datestring}) to '{myhostname}:{backup_path}' in {ttot}s."
|
||||
)
|
||||
|
||||
return True, "\n".join(retlines)
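
For orientation, the pvcbackup.json written in step 7 above has roughly the shape sketched below; the keys and layout follow the vm_backup dictionary assembled in the function, while the concrete values (names, sizes, datestrings) are invented for illustration.

```python
# Illustrative shape of <backup_path>/<domain>/<datestring>/pvcbackup.json;
# keys mirror the vm_backup dict above, values here are made up.
example_pvcbackup = {
    "type": "incremental",                       # or "full"
    "datestring": "20231024021024",
    "incremental_parent": "20231020010101",      # None for a full backup
    "retained_snapshot": False,
    "vm_detail": {"name": "web01", "uuid": "..."},   # full get_list() entry
    "backup_files": [
        # (relative file path, volume size in bytes)
        ["pvcdisks/vms.web01_disk0.rbddiff", 21474836480],
    ],
}
```
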
|
||||
|
||||
|
||||
def remove_backup(zkhandler, domain, backup_path, datestring):
|
||||
tstart = time.time()
|
||||
|
||||
# 0. Validation
|
||||
# Validate that VM exists in cluster
|
||||
dom_uuid = getDomainUUID(zkhandler, domain)
|
||||
if not dom_uuid:
|
||||
return False, 'ERROR: Could not find VM "{}" in the cluster!'.format(domain)
|
||||
|
||||
# Validate that the source path is valid
|
||||
if not re.match(r"^/", backup_path):
|
||||
return (
|
||||
False,
|
||||
f"ERROR: Source path {backup_path} is not a valid absolute path on the primary coordinator!",
|
||||
)
|
||||
|
||||
# Ensure that backup_path (on this node) exists
|
||||
if not os.path.isdir(backup_path):
|
||||
return False, f"ERROR: Source path {backup_path} does not exist!"
|
||||
|
||||
# Ensure that domain path (on this node) exists
|
||||
vm_backup_path = f"{backup_path}/{domain}"
|
||||
if not os.path.isdir(vm_backup_path):
|
||||
return False, f"ERROR: Source VM path {vm_backup_path} does not exist!"
|
||||
|
||||
# Ensure that the archives are present
|
||||
backup_source_pvcbackup_file = f"{vm_backup_path}/{datestring}/pvcbackup.json"
|
||||
if not os.path.isfile(backup_source_pvcbackup_file):
|
||||
return False, "ERROR: The specified source backup files do not exist!"
|
||||
|
||||
backup_source_pvcdisks_path = f"{vm_backup_path}/{datestring}/pvcdisks"
|
||||
if not os.path.isdir(backup_source_pvcdisks_path):
|
||||
return False, "ERROR: The specified source backup files do not exist!"
|
||||
|
||||
# 1. Read the backup file and get VM details
|
||||
try:
|
||||
with open(backup_source_pvcbackup_file) as fh:
|
||||
backup_source_details = jload(fh)
|
||||
except Exception as e:
|
||||
return False, f"ERROR: Failed to read source backup details: {e}"
|
||||
|
||||
# 2. Remove snapshots
|
||||
is_snapshot_remove_failed = False
|
||||
which_snapshot_remove_failed = list()
|
||||
msg_snapshot_remove_failed = list()
|
||||
if backup_source_details["retained_snapshot"]:
|
||||
for volume_file, _ in backup_source_details.get("backup_files"):
|
||||
pool, volume, _ = volume_file.split("/")[-1].split(".")
|
||||
snapshot = f"backup_{datestring}"
|
||||
retcode, retmsg = ceph.remove_snapshot(zkhandler, pool, volume, snapshot)
|
||||
if not retcode:
|
||||
is_snapshot_remove_failed = True
|
||||
which_snapshot_remove_failed.append(f"{pool}/{volume}")
|
||||
msg_snapshot_remove_failed.append(retmsg)
|
||||
|
||||
# 3. Remove files
|
||||
is_files_remove_failed = False
|
||||
msg_files_remove_failed = None
|
||||
try:
|
||||
rmtree(f"{vm_backup_path}/{datestring}")
|
||||
except Exception as e:
|
||||
is_files_remove_failed = True
|
||||
msg_files_remove_failed = e
|
||||
|
||||
tend = time.time()
|
||||
ttot = round(tend - tstart, 2)
|
||||
retlines = list()
|
||||
|
||||
if is_snapshot_remove_failed:
|
||||
retlines.append(
|
||||
f"WARNING: Failed to remove snapshot(s) as requested for volume(s) {', '.join(which_snapshot_remove_failed)}: {', '.join(msg_snapshot_remove_failed)}"
|
||||
)
|
||||
|
||||
if is_files_remove_failed:
|
||||
retlines.append(
|
||||
f"WARNING: Failed to remove backup file(s) from {backup_path}: {msg_files_remove_failed}"
|
||||
)
|
||||
|
||||
myhostname = gethostname().split(".")[0]
|
||||
retlines.append(
|
||||
f"Removed VM backup {datestring} for '{domain}' from '{myhostname}:{backup_path}' in {ttot}s."
|
||||
)
|
||||
|
||||
return True, "\n".join(retlines)
|
||||
|
||||
|
||||
def restore_vm(zkhandler, domain, backup_path, datestring, retain_snapshot=False):
|
||||
tstart = time.time()
|
||||
|
||||
# 0. Validations
|
||||
# Validate that VM does not exist in cluster
|
||||
dom_uuid = getDomainUUID(zkhandler, domain)
|
||||
if dom_uuid:
|
||||
return (
|
||||
False,
|
||||
f'ERROR: VM "{domain}" already exists in the cluster! Remove or rename it before restoring a backup.',
|
||||
)
|
||||
|
||||
# Validate that the source path is valid
|
||||
if not re.match(r"^/", backup_path):
|
||||
return (
|
||||
False,
|
||||
f"ERROR: Source path {backup_path} is not a valid absolute path on the primary coordinator!",
|
||||
)
|
||||
|
||||
# Ensure that backup_path (on this node) exists
|
||||
if not os.path.isdir(backup_path):
|
||||
return False, f"ERROR: Source path {backup_path} does not exist!"
|
||||
|
||||
# Ensure that domain path (on this node) exists
|
||||
vm_backup_path = f"{backup_path}/{domain}"
|
||||
if not os.path.isdir(vm_backup_path):
|
||||
return False, f"ERROR: Source VM path {vm_backup_path} does not exist!"
|
||||
|
||||
# Ensure that the archives are present
|
||||
backup_source_pvcbackup_file = f"{vm_backup_path}/{datestring}/pvcbackup.json"
|
||||
if not os.path.isfile(backup_source_pvcbackup_file):
|
||||
return False, "ERROR: The specified source backup files do not exist!"
|
||||
|
||||
# 1. Read the backup file and get VM details
|
||||
try:
|
||||
with open(backup_source_pvcbackup_file) as fh:
|
||||
backup_source_details = jload(fh)
|
||||
except Exception as e:
|
||||
return False, f"ERROR: Failed to read source backup details: {e}"
|
||||
|
||||
# Handle incrementals
|
||||
incremental_parent = backup_source_details.get("incremental_parent", None)
|
||||
if incremental_parent is not None:
|
||||
backup_source_parent_pvcbackup_file = (
|
||||
f"{vm_backup_path}/{incremental_parent}/pvcbackup.json"
|
||||
)
|
||||
if not os.path.isfile(backup_source_parent_pvcbackup_file):
|
||||
return (
|
||||
False,
|
||||
"ERROR: The specified backup is incremental but the required incremental parent source backup files do not exist!",
|
||||
)
|
||||
|
||||
try:
|
||||
with open(backup_source_parent_pvcbackup_file) as fh:
|
||||
backup_source_parent_details = jload(fh)
|
||||
except Exception as e:
|
||||
return (
|
||||
False,
|
||||
f"ERROR: Failed to read source incremental parent backup details: {e}",
|
||||
)
|
||||
|
||||
# 2. Import VM config and metadata in provision state
|
||||
try:
|
||||
retcode, retmsg = define_vm(
|
||||
zkhandler,
|
||||
backup_source_details["vm_detail"]["xml"],
|
||||
backup_source_details["vm_detail"]["node"],
|
||||
backup_source_details["vm_detail"]["node_limit"],
|
||||
backup_source_details["vm_detail"]["node_selector"],
|
||||
backup_source_details["vm_detail"]["node_autostart"],
|
||||
backup_source_details["vm_detail"]["migration_method"],
|
||||
backup_source_details["vm_detail"]["profile"],
|
||||
backup_source_details["vm_detail"]["tags"],
|
||||
"restore",
|
||||
)
|
||||
if not retcode:
|
||||
return False, f"ERROR: Failed to define restored VM: {retmsg}"
|
||||
except Exception as e:
|
||||
return False, f"ERROR: Failed to parse VM backup details: {e}"
|
||||
|
||||
# 4. Import volumes
|
||||
is_snapshot_remove_failed = False
|
||||
which_snapshot_remove_failed = list()
|
||||
msg_snapshot_remove_failed = list()
|
||||
if incremental_parent is not None:
|
||||
for volume_file, volume_size in backup_source_details.get("backup_files"):
|
||||
pool, volume, _ = volume_file.split("/")[-1].split(".")
|
||||
try:
|
||||
parent_volume_file = [
|
||||
f[0]
|
||||
for f in backup_source_parent_details.get("backup_files")
|
||||
if f[0].split("/")[-1].replace(".rbdimg", "")
|
||||
== volume_file.split("/")[-1].replace(".rbddiff", "")
|
||||
][0]
|
||||
except Exception as e:
|
||||
return (
|
||||
False,
|
||||
f"ERROR: Failed to find parent volume for volume {pool}/{volume}; backup may be corrupt or invalid: {e}",
|
||||
)
|
||||
|
||||
# First we create the expected volumes then clean them up
|
||||
# This process is a bit of a hack because rbd import does not expect an existing volume,
|
||||
# but we need the information in PVC.
|
||||
# Thus create the RBD volume using ceph.add_volume based on the backup size, and then
|
||||
# manually remove the RBD volume (leaving the PVC metainfo)
|
||||
retcode, retmsg = ceph.add_volume(zkhandler, pool, volume, volume_size)
|
||||
if not retcode:
|
||||
return False, f"ERROR: Failed to create restored volume: {retmsg}"
|
||||
|
||||
retcode, stdout, stderr = common.run_os_command(
|
||||
f"rbd remove {pool}/{volume}"
|
||||
)
|
||||
if retcode:
|
||||
return (
|
||||
False,
|
||||
f"ERROR: Failed to remove temporary RBD volume '{pool}/{volume}': {stderr}",
|
||||
)
|
||||
|
||||
# Next we import the parent images
|
||||
retcode, stdout, stderr = common.run_os_command(
|
||||
f"rbd import --export-format 2 --dest-pool {pool} {backup_path}/{domain}/{incremental_parent}/{parent_volume_file} {volume}"
|
||||
)
|
||||
if retcode:
|
||||
return (
|
||||
False,
|
||||
f"ERROR: Failed to import parent backup image {parent_volume_file}: {stderr}",
|
||||
)
|
||||
|
||||
# Then we import the incremental diffs
|
||||
retcode, stdout, stderr = common.run_os_command(
|
||||
f"rbd import-diff {backup_path}/{domain}/{datestring}/{volume_file} {pool}/{volume}"
|
||||
)
|
||||
if retcode:
|
||||
return (
|
||||
False,
|
||||
f"ERROR: Failed to import incremental backup image {volume_file}: {stderr}",
|
||||
)
|
||||
|
||||
# Finally we remove the parent and child snapshots (no longer required)
|
||||
if retain_snapshot:
|
||||
retcode, retmsg = ceph.add_snapshot(
|
||||
zkhandler,
|
||||
pool,
|
||||
volume,
|
||||
f"backup_{incremental_parent}",
|
||||
zk_only=True,
|
||||
)
|
||||
if not retcode:
|
||||
return (
|
||||
False,
|
||||
f"ERROR: Failed to add imported image snapshot for {parent_volume_file}: {retmsg}",
|
||||
)
|
||||
else:
|
||||
retcode, stdout, stderr = common.run_os_command(
|
||||
f"rbd snap rm {pool}/{volume}@backup_{incremental_parent}"
|
||||
)
|
||||
if retcode:
|
||||
is_snapshot_remove_failed = True
|
||||
which_snapshot_remove_failed.append(f"{pool}/{volume}")
|
||||
msg_snapshot_remove_failed.append(retmsg)
|
||||
retcode, stdout, stderr = common.run_os_command(
|
||||
f"rbd snap rm {pool}/{volume}@backup_{datestring}"
|
||||
)
|
||||
if retcode:
|
||||
is_snapshot_remove_failed = True
|
||||
which_snapshot_remove_failed.append(f"{pool}/{volume}")
|
||||
msg_snapshot_remove_failed.append(retmsg)
|
||||
|
||||
else:
|
||||
for volume_file, volume_size in backup_source_details.get("backup_files"):
|
||||
pool, volume, _ = volume_file.split("/")[-1].split(".")
|
||||
|
||||
# First we create the expected volumes then clean them up
|
||||
# This process is a bit of a hack because rbd import does not expect an existing volume,
|
||||
# but we need the information in PVC.
|
||||
# Thus create the RBD volume using ceph.add_volume based on the backup size, and then
|
||||
# manually remove the RBD volume (leaving the PVC metainfo)
|
||||
retcode, retmsg = ceph.add_volume(zkhandler, pool, volume, volume_size)
|
||||
if not retcode:
|
||||
return False, f"ERROR: Failed to create restored volume: {retmsg}"
|
||||
|
||||
retcode, stdout, stderr = common.run_os_command(
|
||||
f"rbd remove {pool}/{volume}"
|
||||
)
|
||||
if retcode:
|
||||
return (
|
||||
False,
|
||||
f"ERROR: Failed to remove temporary RBD volume '{pool}/{volume}': {stderr}",
|
||||
)
|
||||
|
||||
# Then we perform the actual import
|
||||
retcode, stdout, stderr = common.run_os_command(
|
||||
f"rbd import --export-format 2 --dest-pool {pool} {backup_path}/{domain}/{datestring}/{volume_file} {volume}"
|
||||
)
|
||||
if retcode:
|
||||
return (
|
||||
False,
|
||||
f"ERROR: Failed to import backup image {volume_file}: {stderr}",
|
||||
)
|
||||
|
||||
# Finally we remove the source snapshot (not required)
|
||||
if retain_snapshot:
|
||||
retcode, retmsg = ceph.add_snapshot(
|
||||
zkhandler,
|
||||
pool,
|
||||
volume,
|
||||
f"backup_{datestring}",
|
||||
zk_only=True,
|
||||
)
|
||||
if not retcode:
|
||||
return (
|
||||
False,
|
||||
f"ERROR: Failed to add imported image snapshot for {volume_file}: {retmsg}",
|
||||
)
|
||||
else:
|
||||
retcode, stdout, stderr = common.run_os_command(
|
||||
f"rbd snap rm {pool}/{volume}@backup_{datestring}"
|
||||
)
|
||||
if retcode:
|
||||
return (
|
||||
False,
|
||||
f"ERROR: Failed to remove imported image snapshot for {volume_file}: {stderr}",
|
||||
)
|
||||
|
||||
# 5. Start VM
|
||||
retcode, retmsg = start_vm(zkhandler, domain)
|
||||
if not retcode:
|
||||
return False, f"ERROR: Failed to start restored VM {domain}: {retmsg}"
|
||||
|
||||
tend = time.time()
|
||||
ttot = round(tend - tstart, 2)
|
||||
retlines = list()
|
||||
|
||||
if is_snapshot_remove_failed:
|
||||
retlines.append(
|
||||
f"WARNING: Failed to remove hanging snapshot(s) as requested for volume(s) {', '.join(which_snapshot_remove_failed)}: {', '.join(msg_snapshot_remove_failed)}"
|
||||
)
|
||||
|
||||
myhostname = gethostname().split(".")[0]
|
||||
retlines.append(
|
||||
f"Successfully restored VM backup {datestring} for '{domain}' from '{myhostname}:{backup_path}' in {ttot}s."
|
||||
)
|
||||
|
||||
return True, "\n".join(retlines)
|
||||
|
debian/changelog (vendored, 19 changes)
@ -1,3 +1,22 @@
pvc (0.9.79-0) unstable; urgency=high

  **API Changes**: New endpoints /vm/{vm}/backup, /vm/{vm}/restore

  * [CLI Client] Fixes some storage pool help text messages
  * [Node Daemon] Increases the IPMI monitoring plugin timeout
  * [All] Adds support for VM backups, including creation, removal, and restore
  * [Repository] Fixes shebangs in scripts to be consistent
  * [Daemon Library] Improves the handling of VM list arguments (default None)

 -- Joshua M. Boniface <joshua@boniface.me>  Tue, 24 Oct 2023 02:10:24 -0400

pvc (0.9.78-0) unstable; urgency=high

  * [API, Client CLI] Fixes several bugs around image uploads; adds a new query parameter for non-raw images
  * [API] Ensures RBD images are created with a raw bytes value to avoid rounding errors

 -- Joshua M. Boniface <joshua@boniface.me>  Sat, 30 Sep 2023 12:57:55 -0400

pvc (0.9.77-0) unstable; urgency=high

  * [Client CLI] Fixes a bug from a bad library import

@ -1,4 +1,4 @@
|
||||
#!/bin/bash
|
||||
#!/usr/bin/env bash
|
||||
|
||||
# Generate the database migration files
|
||||
|
||||
|
@ -1,4 +1,4 @@
|
||||
#!/bin/bash
|
||||
#!/usr/bin/env bash
|
||||
|
||||
# Generate the Zookeeper migration files
|
||||
|
||||
|
@ -76,7 +76,7 @@ class MonitoringPluginScript(MonitoringPlugin):
|
||||
ipmi_password = self.config["ipmi_password"]
|
||||
retcode, _, _ = run_os_command(
|
||||
f"/usr/bin/ipmitool -I lanplus -H {ipmi_hostname} -U {ipmi_username} -P {ipmi_password} chassis power status",
|
||||
timeout=2
|
||||
timeout=5
|
||||
)
|
||||
|
||||
if retcode > 0:
|
||||
|
@ -132,7 +132,7 @@ class MonitoringPluginScript(MonitoringPlugin):
|
||||
for slave_interface in slave_interfaces:
|
||||
if slave_interface[1] == 'up':
|
||||
slave_interface_up_count += 1
|
||||
if slave_interface_up_count < 2:
|
||||
if slave_interface_up_count < len(slave_interfaces):
|
||||
messages.append(f"{dev} DEGRADED with {slave_interface_up_count} active slaves")
|
||||
health_delta += 10
|
||||
else:
|
||||
|
@ -49,7 +49,7 @@ import re
|
||||
import json
|
||||
|
||||
# Daemon version
|
||||
version = "0.9.77"
|
||||
version = "0.9.79"
|
||||
|
||||
|
||||
##########################################################
|
||||
|
@ -77,5 +77,5 @@ def start_system_services(logger, config):
|
||||
start_ceph_mon(logger, config)
|
||||
start_ceph_mgr(logger, config)
|
||||
|
||||
logger.out("Waiting 3 seconds for daemons to start", state="s")
|
||||
sleep(3)
|
||||
logger.out("Waiting 10 seconds for daemons to start", state="s")
|
||||
sleep(10)