Add warnings about snapshot consistency

Add support for rolling back snapshots
We supported creating snapshots, but not doing anything with them. This removes the manual task of restoring a snapshot and replaces it with a PVC abstraction of rolling back to a snapshot. While Ceph recommends cloning a snapshot instead of rolling back, due to the time taken, in our use case I don't think that is an optimal strategy, as it will leave dangling clones that we'd then have to manage. Closes #183
2024-05-13 15:29:43 -04:00 · 2024-05-13 15:24:51 -04:00 · 2024-05-08 10:55:15 -04:00 · 2024-05-06 10:49:57 -04:00
7 changed files with 177 additions and 12 deletions
--- a/api-daemon/pvcapid/flaskapi.py
+++ b/api-daemon/pvcapid/flaskapi.py
@ -6362,6 +6362,59 @@ api.add_resource(
 )
 # /storage/ceph/snapshot/<pool>/<volume>/<snapshot>/rollback
 class API_Storage_Ceph_Snapshot_Rollback_Element(Resource):
    # REST resource for rolling an RBD volume back to one of its snapshots.
    # pool, volume and snapshot all arrive as URL path segments of
    # /storage/ceph/snapshot/<pool>/<volume>/<snapshot>/rollback, so the
    # Swagger spec below documents them as path parameters of type string.
    @Authenticator
    def post(self, pool, volume, snapshot):
        """
        Roll back an RBD volume {volume} in pool {pool} to snapshot {snapshot}
        WARNING: This action cannot be done on an active RBD volume. All IO MUST be stopped first.
        ---
        tags:
          - storage / ceph
        parameters:
          - in: path
            name: snapshot
            type: string
            required: true
            description: The name of the snapshot
          - in: path
            name: volume
            type: string
            required: true
            description: The name of the volume
          - in: path
            name: pool
            type: string
            required: true
            description: The name of the pool
        responses:
          200:
            description: OK
            schema:
              type: object
              id: Message
          404:
            description: Not found
            schema:
              type: object
              id: Message
          400:
            description: Bad request
            schema:
              type: object
              id: Message
        """
        # The helper produces the (body, status code) pair returned to the client.
        return api_helper.ceph_volume_snapshot_rollback(pool, volume, snapshot)
 api.add_resource(
    API_Storage_Ceph_Snapshot_Rollback_Element,
    "/storage/ceph/snapshot/<pool>/<volume>/<snapshot>/rollback",
 )
 ##########################################################
 # Provisioner API
 ##########################################################
--- a/api-daemon/pvcapid/helper.py
+++ b/api-daemon/pvcapid/helper.py
@ -2183,6 +2183,22 @@ def ceph_volume_snapshot_rename(zkhandler, pool, volume, name, new_name):
    return output, retcode
@ZKConnection(config)
def ceph_volume_snapshot_rollback(zkhandler, pool, volume, name):
    """
    Roll back a Ceph RBD volume to a given snapshot and build the API reply.

    Delegates to pvc_ceph.rollback_snapshot() and converts its
    (success flag, message) result into a ({"message": ...}, status) pair:
    status is 200 when the flag is truthy and 400 otherwise. Double quotes in
    the message are replaced by single quotes before it is returned.
    """
    success, message = pvc_ceph.rollback_snapshot(zkhandler, pool, volume, name)
    status = 200 if success else 400
    return {"message": message.replace('"', "'")}, status
@ZKConnection(config)
 def ceph_volume_snapshot_remove(zkhandler, pool, volume, name):
    """
--- a/client-cli/pvc/cli/cli.py
+++ b/client-cli/pvc/cli/cli.py
@ -4325,6 +4325,10 @@ def cli_storage_volume_snapshot():
 def cli_storage_volume_snapshot_add(pool, volume, name):
    """
    Add a snapshot with name NAME of Ceph RBD volume VOLUME in pool POOL.
    WARNING: RBD snapshots are crash-consistent but not filesystem-aware. If a snapshot was taken
    of a running VM, restoring that snapshot will be equivalent to having forcibly restarted the
    VM at the moment of the snapshot.
    """
    retcode, retmsg = pvc.lib.storage.ceph_snapshot_add(CLI_CONFIG, pool, volume, name)
@ -4372,6 +4376,36 @@ def cli_storage_volume_snapshot_remove(pool, volume, name):
    finish(retcode, retmsg)
 ###############################################################################
 # > pvc storage volume snapshot rollback
 ###############################################################################
@click.command(name="rollback", short_help="Roll back RBD volume to snapshot.")
@connection_req
@click.argument("pool")
@click.argument("volume")
@click.argument("name")
@confirm_opt("Roll back to snapshot {name} for volume {pool}/{volume}")
def cli_storage_volume_snapshot_rollback(pool, volume, name):
    """
    Roll back the Ceph RBD volume VOLUME in pool POOL to the snapshot NAME.
    DANGER: All data written to the volume since the given snapshot will be permanently lost.
    WARNING: A rollback cannot be performed on an RBD volume with active I/O. Doing so will cause
    undefined behaviour and possible corruption. Ensure that any VM(s) using this RBD volume are
    stopped or disabled before attempting a snapshot rollback.
    WARNING: RBD snapshots are crash-consistent but not filesystem-aware. If a snapshot was taken
    of a running VM, restoring that snapshot will be equivalent to having forcibly restarted the
    VM at the moment of the snapshot.
    """
    # Ask the API client library to perform the rollback; it hands back a
    # (success flag, message) pair.
    success, message = pvc.lib.storage.ceph_snapshot_rollback(
        CLI_CONFIG,
        pool,
        volume,
        name,
    )
    # Pass the outcome and message to the CLI's common finish() handler.
    finish(success, message)
 ###############################################################################
 # > pvc storage volume snapshot list
 ###############################################################################
@ -6349,6 +6383,7 @@ cli_storage_volume.add_command(cli_storage_volume_list)
 cli_storage_volume_snapshot.add_command(cli_storage_volume_snapshot_add)
 cli_storage_volume_snapshot.add_command(cli_storage_volume_snapshot_rename)
 cli_storage_volume_snapshot.add_command(cli_storage_volume_snapshot_remove)
 cli_storage_volume_snapshot.add_command(cli_storage_volume_snapshot_rollback)
 cli_storage_volume_snapshot.add_command(cli_storage_volume_snapshot_list)
 cli_storage_volume.add_command(cli_storage_volume_snapshot)
 cli_storage.add_command(cli_storage_volume)
--- a/client-cli/pvc/lib/common.py
+++ b/client-cli/pvc/lib/common.py
@ -108,9 +108,10 @@ class UploadProgressBar(object):
 class ErrorResponse(requests.Response):
-    def __init__(self, json_data, status_code):
+    def __init__(self, json_data, status_code, headers):
        self.json_data = json_data
        self.status_code = status_code
        self.headers = headers
    def json(self):
        return self.json_data
@ -206,7 +207,7 @@ def call_api(
    except Exception as e:
        message = "Failed to connect to the API: {}".format(e)
        code = response.status_code if response else 504
-        response = ErrorResponse({"message": message}, code)
+        response = ErrorResponse({"message": message}, code, None)
    # Display debug output
    if config["debug"]:
--- a/client-cli/pvc/lib/storage.py
+++ b/client-cli/pvc/lib/storage.py
@ -1544,6 +1544,30 @@ def ceph_snapshot_add(config, pool, volume, snapshot):
    return retstatus, response.json().get("message", "")
 def ceph_snapshot_rollback(config, pool, volume, snapshot):
    """
    Roll back a Ceph volume to one of its snapshots via the API

    API endpoint: POST /api/v1/storage/ceph/snapshot/{pool}/{volume}/{snapshot}/rollback
    API arguments:
    API schema: {"message":"{data}"}

    Returns a (success, message) tuple: success is True only when the API
    answered with HTTP 200, and message is the "message" field of the JSON
    reply (empty string if the field is absent).
    """
    endpoint = f"/storage/ceph/snapshot/{pool}/{volume}/{snapshot}/rollback"
    response = call_api(config, "post", endpoint)
    succeeded = response.status_code == 200
    return succeeded, response.json().get("message", "")
 def ceph_snapshot_remove(config, pool, volume, snapshot):
    """
    Remove Ceph snapshot
--- a/daemon-common/ceph.py
+++ b/daemon-common/ceph.py
@ -1082,6 +1082,36 @@ def rename_snapshot(zkhandler, pool, volume, name, new_name):
    )
 def rollback_snapshot(zkhandler, pool, volume, name):
    """
    Roll back RBD volume `volume` in pool `pool` to its snapshot `name`.

    The volume and the snapshot are first checked with verifyVolume() and
    verifySnapshot(); only then is `rbd snap rollback` executed.

    Returns a (success, message) tuple. success is False when the volume or
    snapshot is not found, or when the rbd command exits with a non-zero
    return code (its stderr is included in the message).
    """
    if not verifyVolume(zkhandler, pool, volume):
        return False, 'ERROR: No volume with name "{}" is present in pool "{}".'.format(
            volume, pool
        )
    if not verifySnapshot(zkhandler, pool, volume, name):
        return (
            False,
            'ERROR: No snapshot with name "{}" is present for volume "{}" in pool "{}".'.format(
                name, volume, pool
            ),
        )
    # 1. Roll back the snapshot
    # This must run at function level: nested under the verifySnapshot guard
    # above it would sit after that branch's return and never execute, so the
    # function would report success without touching the volume.
    retcode, stdout, stderr = common.run_os_command(
        "rbd snap rollback {}/{}@{}".format(pool, volume, name)
    )
    if retcode:
        return (
            False,
            'ERROR: Failed to roll back RBD volume "{}" in pool "{}" to snapshot "{}": {}'.format(
                volume, pool, name, stderr
            ),
        )
    return True, 'Rolled back RBD volume "{}" in pool "{}" to snapshot "{}".'.format(
        volume, pool, name
    )
 def remove_snapshot(zkhandler, pool, volume, name):
    if not verifyVolume(zkhandler, pool, volume):
        return False, 'ERROR: No volume with name "{}" is present in pool "{}".'.format(
--- a/node-daemon/pvcnoded/util/fencing.py
+++ b/node-daemon/pvcnoded/util/fencing.py
@ -253,12 +253,16 @@ def reboot_via_ipmi(node_name, ipmi_hostname, ipmi_user, ipmi_password, logger):
        state="i",
        prefix=f"fencing {node_name}",
    )
-    ipmi_status_retcode, ipmi_status_stdout, ipmi_status_stderr = common.run_os_command(
+    (
        ipmi_intermediate_status_retcode,
        ipmi_intermediate_status_stdout,
        ipmi_intermediate_status_stderr,
    ) = common.run_os_command(
        f"/usr/bin/ipmitool -I lanplus -H {ipmi_hostname} -U {ipmi_user} -P {ipmi_password} chassis power status"
    )
-    if ipmi_status_retcode == 0:
+    if ipmi_intermediate_status_retcode == 0:
        logger.out(
-            f"Current chassis power state is: {ipmi_status_stdout.strip()}",
+            f"Current chassis power state is: {ipmi_intermediate_status_stdout.strip()}",
            state="i",
            prefix=f"fencing {node_name}",
        )
@ -299,12 +303,14 @@ def reboot_via_ipmi(node_name, ipmi_hostname, ipmi_user, ipmi_password, logger):
        state="i",
        prefix=f"fencing {node_name}",
    )
-    ipmi_status_retcode, ipmi_status_stdout, ipmi_status_stderr = common.run_os_command(
+    ipmi_final_status_retcode, ipmi_final_status_stdout, ipmi_final_status_stderr = (
        common.run_os_command(
            f"/usr/bin/ipmitool -I lanplus -H {ipmi_hostname} -U {ipmi_user} -P {ipmi_password} chassis power status"
        )
    )
-    if ipmi_stop_retcode == 0:
+    if ipmi_intermediate_status_stdout.strip() == "Chassis power is off":
-        if ipmi_status_stdout.strip() == "Chassis Power is on":
+        if ipmi_final_status_stdout.strip() == "Chassis Power is on":
            # We successfully rebooted the node and it is powered on; this is a succeessful fence
            logger.out(
                "Successfully rebooted dead node; proceeding with fence recovery action",
@ -312,7 +318,7 @@ def reboot_via_ipmi(node_name, ipmi_hostname, ipmi_user, ipmi_password, logger):
                prefix=f"fencing {node_name}",
            )
            return True
-        elif ipmi_status_stdout.strip() == "Chassis Power is off":
+        elif ipmi_final_status_stdout.strip() == "Chassis Power is off":
            # We successfully rebooted the node but it is powered off; this might be expected or not, but the node is confirmed off so we can call it a successful fence
            logger.out(
                "Chassis power is in confirmed off state after successfuly IPMI reboot; proceeding with fence recovery action",
@ -323,13 +329,13 @@ def reboot_via_ipmi(node_name, ipmi_hostname, ipmi_user, ipmi_password, logger):
        else:
            # We successfully rebooted the node but it is in some unknown power state; since this might indicate a silent failure, we must call it a failed fence
            logger.out(
-                f"Chassis power is in an unknown state ({ipmi_status_stdout.strip()}) after successful IPMI reboot; NOT proceeding fence recovery action",
+                f"Chassis power is in an unknown state ({ipmi_final_status_stdout.strip()}) after successful IPMI reboot; NOT proceeding fence recovery action",
                state="e",
                prefix=f"fencing {node_name}",
            )
            return False
    else:
-        if ipmi_status_stdout.strip() == "Chassis Power is off":
+        if ipmi_final_status_stdout.strip() == "Chassis Power is off":
            # We failed to reboot the node but it is powered off; it has probably suffered a serious hardware failure, but the node is confirmed off so we can call it a successful fence
            logger.out(
                "Chassis power is in confirmed off state after failed IPMI reboot; proceeding with fence recovery action",
Author	SHA1	Message	Date
Joshua M. Boniface	234d6ae83b	Add warnings about snapshot consistency	2024-05-13 15:29:43 -04:00
Joshua M. Boniface	5d0e7931d1	Add support for rolling back snapshots We supported creating snapshots, but not doing anything with them. This removes the manual task of restoring a snapshot and replaces it with a PVC abstraction of rolling back to a snapshot. While Ceph recommends cloning a snapshot instead of rolling back, due to the time taken, in our use case I don't think that is an optimal strategy, as it will leave dangling clones that we'd then have to manage. Closes #183	2024-05-13 15:24:51 -04:00
Joshua M. Boniface	dcb9c0d12c	Improve fence handling conditions Use the intermediate output text when judging the fence status, rather than the retcode of the stop as this should be more reliable.	2024-05-08 10:55:15 -04:00
Joshua M. Boniface	f6e856bf98	Fix debug output on timeout	2024-05-06 10:49:57 -04:00