Bump version to 0.9.98

Add --version flag to pvcnoded.py for info
Fix bugs listing snapshots by pool/volume
2024-06-05 12:01:31 -04:00 · 2024-06-05 11:57:47 -04:00 · 2024-05-16 16:32:22 -04:00 · 2024-05-13 15:29:43 -04:00 · 2024-05-13 15:24:51 -04:00 · 2024-05-08 10:55:15 -04:00
16 changed files with 218 additions and 32 deletions
--- a/.version
+++ b/.version
@ -1 +1 @@
-0.9.97
+0.9.98
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@ -1,5 +1,14 @@
 ## PVC Changelog

+###### [v0.9.98](https://github.com/parallelvirtualcluster/pvc/releases/tag/v0.9.98)
+
+  * [CLI Client] Fixes output when an API call times out
+  * [Node Daemon] Improves the handling of fence states
+  * [API Daemon/CLI Client] Adds support for storage snapshot rollback
+  * [CLI Client] Adds additional warning messages about snapshot consistency to help output
+  * [API Daemon] Fixes a bug listing snapshots by pool/volume
+  * [Node Daemon] Adds a --version flag for information gathering by update-motd.sh
+
 ###### [v0.9.97](https://github.com/parallelvirtualcluster/pvc/releases/tag/v0.9.97)

  * [Client CLI] Ensures --lines is always an integer value
--- a/api-daemon/pvcapid/Daemon.py
+++ b/api-daemon/pvcapid/Daemon.py
@ -27,7 +27,7 @@ from distutils.util import strtobool as dustrtobool
 import daemon_lib.config as cfg

 # Daemon version
-version = "0.9.97"
+version = "0.9.98"

 # API version
 API_VERSION = 1.0
--- a/api-daemon/pvcapid/flaskapi.py
+++ b/api-daemon/pvcapid/flaskapi.py
@ -6362,6 +6362,59 @@ api.add_resource(
 )


+# /storage/ceph/snapshot/<pool>/<volume>/<snapshot>/rollback
+class API_Storage_Ceph_Snapshot_Rollback_Element(Resource):
+    @Authenticator
+    def post(self, pool, volume, snapshot):
+        """
+        Roll back an RBD volume {volume} in pool {pool} to snapshot {snapshot}
+
+        WARNING: This action cannot be done on an active RBD volume. All IO MUST be stopped first.
+        ---
+        tags:
+          - storage / ceph
+        parameters:
+          - in: path
+            name: snapshot
+            type: string
+            required: true
+            description: The name of the snapshot
+          - in: path
+            name: volume
+            type: string
+            required: true
+            description: The name of the volume
+          - in: path
+            name: pool
+            type: string
+            required: true
+            description: The name of the pool
+        responses:
+          200:
+            description: OK
+            schema:
+              type: object
+              id: Message
+          404:
+            description: Not found
+            schema:
+              type: object
+              id: Message
+          400:
+            description: Bad request
+            schema:
+              type: object
+              id: Message
+        """
+        # pool, volume and snapshot all arrive as segments of the URL path
+        return api_helper.ceph_volume_snapshot_rollback(pool, volume, snapshot)
+
+
+api.add_resource(
+    API_Storage_Ceph_Snapshot_Rollback_Element,
+    "/storage/ceph/snapshot/<pool>/<volume>/<snapshot>/rollback",
+)
+
+
 ##########################################################
 # Provisioner API
 ##########################################################
--- a/api-daemon/pvcapid/helper.py
+++ b/api-daemon/pvcapid/helper.py
@ -2183,6 +2183,22 @@ def ceph_volume_snapshot_rename(zkhandler, pool, volume, name, new_name):
    return output, retcode


+@ZKConnection(config)
+def ceph_volume_snapshot_rollback(zkhandler, pool, volume, name):
+    """
+    Revert a Ceph RBD volume in the PVC Ceph storage cluster to one of its snapshots.
+    """
+    success, message = pvc_ceph.rollback_snapshot(zkhandler, pool, volume, name)
+
+    # Double quotes in the message are swapped for single quotes before it is returned
+    body = {"message": message.replace('"', "'")}
+    return body, (200 if success else 400)
+
+
@ZKConnection(config)
 def ceph_volume_snapshot_remove(zkhandler, pool, volume, name):
    """
--- a/client-cli/pvc/cli/cli.py
+++ b/client-cli/pvc/cli/cli.py
@ -4325,6 +4325,10 @@ def cli_storage_volume_snapshot():
 def cli_storage_volume_snapshot_add(pool, volume, name):
    """
    Add a snapshot with name NAME of Ceph RBD volume VOLUME in pool POOL.
+
+    WARNING: RBD snapshots are crash-consistent but not filesystem-aware. If a snapshot was taken
+    of a running VM, restoring that snapshot will be equivalent to having forcibly restarted the
+    VM at the moment of the snapshot.
    """

    retcode, retmsg = pvc.lib.storage.ceph_snapshot_add(CLI_CONFIG, pool, volume, name)
@ -4372,6 +4376,36 @@ def cli_storage_volume_snapshot_remove(pool, volume, name):
    finish(retcode, retmsg)


+###############################################################################
+# > pvc storage volume snapshot rollback
+###############################################################################
+@click.command(name="rollback", short_help="Roll back RBD volume to snapshot.")
+@connection_req
+@click.argument("pool")
+@click.argument("volume")
+@click.argument("name")
+@confirm_opt("Roll back to snapshot {name} for volume {pool}/{volume}")
+def cli_storage_volume_snapshot_rollback(pool, volume, name):
+    """
+    Roll back the Ceph RBD volume VOLUME in pool POOL to the snapshot NAME.
+
+    DANGER: All data written to the volume since the given snapshot will be permanently lost.
+
+    WARNING: A rollback cannot be performed on an RBD volume with active I/O. Doing so will cause
+    undefined behaviour and possible corruption. Ensure that any VM(s) using this RBD volume are
+    stopped or disabled before attempting a snapshot rollback.
+
+    WARNING: RBD snapshots are crash-consistent but not filesystem-aware. If a snapshot was taken
+    of a running VM, restoring that snapshot will be equivalent to having forcibly restarted the
+    VM at the moment of the snapshot.
+    """
+
+    # Delegate the request to the client library, then pass its (status, message) pair to finish()
+    retcode, retmsg = pvc.lib.storage.ceph_snapshot_rollback(
+        CLI_CONFIG, pool, volume, name
+    )
+    finish(retcode, retmsg)
+
+
 ###############################################################################
 # > pvc storage volume snapshot list
 ###############################################################################
@ -6349,6 +6383,7 @@ cli_storage_volume.add_command(cli_storage_volume_list)
 cli_storage_volume_snapshot.add_command(cli_storage_volume_snapshot_add)
 cli_storage_volume_snapshot.add_command(cli_storage_volume_snapshot_rename)
 cli_storage_volume_snapshot.add_command(cli_storage_volume_snapshot_remove)
+cli_storage_volume_snapshot.add_command(cli_storage_volume_snapshot_rollback)
 cli_storage_volume_snapshot.add_command(cli_storage_volume_snapshot_list)
 cli_storage_volume.add_command(cli_storage_volume_snapshot)
 cli_storage.add_command(cli_storage_volume)
--- a/client-cli/pvc/lib/common.py
+++ b/client-cli/pvc/lib/common.py
@ -108,9 +108,10 @@ class UploadProgressBar(object):


 class ErrorResponse(requests.Response):
-    def __init__(self, json_data, status_code):
+    def __init__(self, json_data, status_code, headers):
        self.json_data = json_data
        self.status_code = status_code
+        self.headers = headers

    def json(self):
        return self.json_data
@ -206,7 +207,7 @@ def call_api(
    except Exception as e:
        message = "Failed to connect to the API: {}".format(e)
        code = response.status_code if response else 504
-        response = ErrorResponse({"message": message}, code)
+        response = ErrorResponse({"message": message}, code, None)

    # Display debug output
    if config["debug"]:
--- a/client-cli/pvc/lib/storage.py
+++ b/client-cli/pvc/lib/storage.py
@ -1544,6 +1544,30 @@ def ceph_snapshot_add(config, pool, volume, snapshot):
    return retstatus, response.json().get("message", "")


+def ceph_snapshot_rollback(config, pool, volume, snapshot):
+    """
+    Roll back a Ceph volume to one of its snapshots
+
+    API endpoint: POST /api/v1/storage/ceph/snapshot/{pool}/{volume}/{snapshot}/rollback
+    API arguments:
+    API schema: {"message":"{data}"}
+    """
+    endpoint = "/storage/ceph/snapshot/{pool}/{volume}/{snapshot}/rollback".format(
+        pool=pool, volume=volume, snapshot=snapshot
+    )
+    response = call_api(config, "post", endpoint)
+
+    # Only an HTTP 200 counts as success; the message is returned either way
+    succeeded = response.status_code == 200
+    return succeeded, response.json().get("message", "")
+
+
 def ceph_snapshot_remove(config, pool, volume, snapshot):
    """
    Remove Ceph snapshot
--- a/client-cli/setup.py
+++ b/client-cli/setup.py
@ -2,7 +2,7 @@ from setuptools import setup

 setup(
    name="pvc",
-    version="0.9.97",
+    version="0.9.98",
    packages=["pvc.cli", "pvc.lib"],
    install_requires=[
        "Click",
--- a/daemon-common/ceph.py
+++ b/daemon-common/ceph.py
@ -540,7 +540,10 @@ def getCephVolumes(zkhandler, pool):
        pool_list = [pool]

    for pool_name in pool_list:
-        for volume_name in zkhandler.children(("volume", pool_name)):
+        children = zkhandler.children(("volume", pool_name))
+        if children is None:
+            continue
+        for volume_name in children:
            volume_list.append("{}/{}".format(pool_name, volume_name))

    return volume_list
@ -1082,6 +1085,36 @@ def rename_snapshot(zkhandler, pool, volume, name, new_name):
    )


+def rollback_snapshot(zkhandler, pool, volume, name):
+    """
+    Roll back the RBD volume {volume} in pool {pool} to its snapshot {name}.
+
+    The volume and snapshot are first checked with verifyVolume() and
+    verifySnapshot(); the rollback itself is then run via "rbd snap rollback".
+
+    Returns a (success, message) tuple: (False, error text) when a check or
+    the rbd command fails, otherwise (True, confirmation text).
+    """
+    if not verifyVolume(zkhandler, pool, volume):
+        return False, 'ERROR: No volume with name "{}" is present in pool "{}".'.format(
+            volume, pool
+        )
+    if not verifySnapshot(zkhandler, pool, volume, name):
+        return (
+            False,
+            'ERROR: No snapshot with name "{}" is present for volume "{}" in pool "{}".'.format(
+                name, volume, pool
+            ),
+        )
+
+    # 1. Roll back the snapshot
+    # NOTE: this must sit at function level; nested under the guard above it
+    # would follow a return, never run, and success would be reported falsely.
+    retcode, stdout, stderr = common.run_os_command(
+        "rbd snap rollback {}/{}@{}".format(pool, volume, name)
+    )
+    if retcode:
+        return (
+            False,
+            'ERROR: Failed to roll back RBD volume "{}" in pool "{}" to snapshot "{}": {}'.format(
+                volume, pool, name, stderr
+            ),
+        )
+
+    return True, 'Rolled back RBD volume "{}" in pool "{}" to snapshot "{}".'.format(
+        volume, pool, name
+    )
+
+
 def remove_snapshot(zkhandler, pool, volume, name):
    if not verifyVolume(zkhandler, pool, volume):
        return False, 'ERROR: No volume with name "{}" is present in pool "{}".'.format(
@ -1123,20 +1156,9 @@ def remove_snapshot(zkhandler, pool, volume, name):
    )


-def get_list_snapshot(zkhandler, pool, volume, limit=None, is_fuzzy=True):
+def get_list_snapshot(zkhandler, target_pool, target_volume, limit=None, is_fuzzy=True):
    snapshot_list = []
-    if pool and not verifyPool(zkhandler, pool):
-        return False, 'ERROR: No pool with name "{}" is present in the cluster.'.format(
-            pool
-        )
-
-    if volume and not verifyPool(zkhandler, volume):
-        return (
-            False,
-            'ERROR: No volume with name "{}" is present in the cluster.'.format(volume),
-        )
-
-    full_snapshot_list = getCephSnapshots(zkhandler, pool, volume)
+    full_snapshot_list = getCephSnapshots(zkhandler, target_pool, target_volume)

    if is_fuzzy and limit:
        # Implicitly assume fuzzy limits
@ -1148,6 +1170,10 @@ def get_list_snapshot(zkhandler, pool, volume, limit=None, is_fuzzy=True):
    for snapshot in full_snapshot_list:
        volume, snapshot_name = snapshot.split("@")
        pool_name, volume_name = volume.split("/")
+        if target_pool and pool_name != target_pool:
+            continue
+        if target_volume and volume_name != target_volume:
+            continue
        if limit:
            try:
                if re.fullmatch(limit, snapshot_name):
--- a/debian/changelog
+++ b/debian/changelog
@ -1,3 +1,14 @@
+pvc (0.9.98-0) unstable; urgency=high
+
+  * [CLI Client] Fixes output when an API call times out
+  * [Node Daemon] Improves the handling of fence states
+  * [API Daemon/CLI Client] Adds support for storage snapshot rollback
+  * [CLI Client] Adds additional warning messages about snapshot consistency to help output
+  * [API Daemon] Fixes a bug listing snapshots by pool/volume
+  * [Node Daemon] Adds a --version flag for information gathering by update-motd.sh
+
+ -- Joshua M. Boniface <joshua@boniface.me>  Wed, 05 Jun 2024 12:01:31 -0400
+
 pvc (0.9.97-0) unstable; urgency=high

  * [Client CLI] Ensures --lines is always an integer value
--- a/health-daemon/pvchealthd/Daemon.py
+++ b/health-daemon/pvchealthd/Daemon.py
@ -33,7 +33,7 @@ import os
 import signal

 # Daemon version
-version = "0.9.97"
+version = "0.9.98"


 ##########################################################
--- a/node-daemon/pvcnoded.py
+++ b/node-daemon/pvcnoded.py
@ -19,6 +19,11 @@
 #
 ###############################################################################

+import sys
 import pvcnoded.Daemon  # noqa: F401
 
+# Print the daemon version and stop before the daemon entrypoint is reached.
+# sys.exit() is used because the bare exit() builtin is injected by the site
+# module and is not guaranteed to exist in non-interactive runs.
+if "--version" in sys.argv:
+    print(pvcnoded.Daemon.version)
+    sys.exit(0)
+
 pvcnoded.Daemon.entrypoint()
--- a/node-daemon/pvcnoded/Daemon.py
+++ b/node-daemon/pvcnoded/Daemon.py
@ -49,7 +49,7 @@ import re
 import json

 # Daemon version
-version = "0.9.97"
+version = "0.9.98"


 ##########################################################
--- a/node-daemon/pvcnoded/util/fencing.py
+++ b/node-daemon/pvcnoded/util/fencing.py
@ -253,12 +253,16 @@ def reboot_via_ipmi(node_name, ipmi_hostname, ipmi_user, ipmi_password, logger):
        state="i",
        prefix=f"fencing {node_name}",
    )
-    ipmi_status_retcode, ipmi_status_stdout, ipmi_status_stderr = common.run_os_command(
+    (
+        ipmi_intermediate_status_retcode,
+        ipmi_intermediate_status_stdout,
+        ipmi_intermediate_status_stderr,
+    ) = common.run_os_command(
        f"/usr/bin/ipmitool -I lanplus -H {ipmi_hostname} -U {ipmi_user} -P {ipmi_password} chassis power status"
    )
-    if ipmi_status_retcode == 0:
+    if ipmi_intermediate_status_retcode == 0:
        logger.out(
-            f"Current chassis power state is: {ipmi_status_stdout.strip()}",
+            f"Current chassis power state is: {ipmi_intermediate_status_stdout.strip()}",
            state="i",
            prefix=f"fencing {node_name}",
        )
@ -299,12 +303,14 @@ def reboot_via_ipmi(node_name, ipmi_hostname, ipmi_user, ipmi_password, logger):
        state="i",
        prefix=f"fencing {node_name}",
    )
-    ipmi_status_retcode, ipmi_status_stdout, ipmi_status_stderr = common.run_os_command(
-        f"/usr/bin/ipmitool -I lanplus -H {ipmi_hostname} -U {ipmi_user} -P {ipmi_password} chassis power status"
+    ipmi_final_status_retcode, ipmi_final_status_stdout, ipmi_final_status_stderr = (
+        common.run_os_command(
+            f"/usr/bin/ipmitool -I lanplus -H {ipmi_hostname} -U {ipmi_user} -P {ipmi_password} chassis power status"
+        )
    )

-    if ipmi_stop_retcode == 0:
-        if ipmi_status_stdout.strip() == "Chassis Power is on":
+    # The status text is spelled with a capital "P" ("Chassis Power is off"), as in
+    # the other comparisons in this function; a lower-case "power" never matches.
+    if ipmi_intermediate_status_stdout.strip() == "Chassis Power is off":
+        if ipmi_final_status_stdout.strip() == "Chassis Power is on":
            # We successfully rebooted the node and it is powered on; this is a succeessful fence
            logger.out(
                "Successfully rebooted dead node; proceeding with fence recovery action",
@ -312,7 +318,7 @@ def reboot_via_ipmi(node_name, ipmi_hostname, ipmi_user, ipmi_password, logger):
                prefix=f"fencing {node_name}",
            )
            return True
-        elif ipmi_status_stdout.strip() == "Chassis Power is off":
+        elif ipmi_final_status_stdout.strip() == "Chassis Power is off":
            # We successfully rebooted the node but it is powered off; this might be expected or not, but the node is confirmed off so we can call it a successful fence
            logger.out(
                "Chassis power is in confirmed off state after successfuly IPMI reboot; proceeding with fence recovery action",
@ -323,13 +329,13 @@ def reboot_via_ipmi(node_name, ipmi_hostname, ipmi_user, ipmi_password, logger):
        else:
            # We successfully rebooted the node but it is in some unknown power state; since this might indicate a silent failure, we must call it a failed fence
            logger.out(
-                f"Chassis power is in an unknown state ({ipmi_status_stdout.strip()}) after successful IPMI reboot; NOT proceeding fence recovery action",
+                f"Chassis power is in an unknown state ({ipmi_final_status_stdout.strip()}) after successful IPMI reboot; NOT proceeding fence recovery action",
                state="e",
                prefix=f"fencing {node_name}",
            )
            return False
    else:
-        if ipmi_status_stdout.strip() == "Chassis Power is off":
+        if ipmi_final_status_stdout.strip() == "Chassis Power is off":
            # We failed to reboot the node but it is powered off; it has probably suffered a serious hardware failure, but the node is confirmed off so we can call it a successful fence
            logger.out(
                "Chassis power is in confirmed off state after failed IPMI reboot; proceeding with fence recovery action",
--- a/worker-daemon/pvcworkerd/Daemon.py
+++ b/worker-daemon/pvcworkerd/Daemon.py
@ -44,7 +44,7 @@ from daemon_lib.vmbuilder import (
 )

 # Daemon version
-version = "0.9.97"
+version = "0.9.98"


 config = cfg.get_configuration()
Author	SHA1	Message	Date
Joshua M. Boniface	1aa5999109	Bump version to 0.9.98	2024-06-05 12:01:31 -04:00
Joshua M. Boniface	570460e5ee	Add --version flag to pvcnoded.py for info	2024-06-05 11:57:47 -04:00
Joshua M. Boniface	7a99e0e524	Fix bugs listing snapshots by pool/volume The logic of this didn't work, so reconfigure to use these like limits. Also fixes a bug in the upper getCephVolumes for invalid pools.	2024-05-16 16:32:22 -04:00
Joshua M. Boniface	234d6ae83b	Add warnings about snapshot consistency	2024-05-13 15:29:43 -04:00
Joshua M. Boniface	5d0e7931d1	Add support for rolling back snapshots We supported creating snapshots, but not doing anything with them. This removes the manual task of restoring a snapshot and replace it with a PVC abstraction of rolling back to a snapshot. While Ceph recommends cloning a snapshot instead of rolling back, due to the time taken, in our usecase I don't think that is an optimal strategy, as it will leave dangling clones that we'd then have to manage. Closes #183	2024-05-13 15:24:51 -04:00
Joshua M. Boniface	dcb9c0d12c	Improve fence handling conditions Use the intermediate output text when judging the fence status, rather than the retcode of the stop as this should be more reliable.	2024-05-08 10:55:15 -04:00
Joshua M. Boniface	f6e856bf98	Fix debug output on timeout	2024-05-06 10:49:57 -04:00
 @ -1 +1 @@
 .9.97
 .9.98