Compare commits

..

4 Commits

Author SHA1 Message Date
234d6ae83b Add warnings about snapshot consistency 2024-05-13 15:29:43 -04:00
5d0e7931d1 Add support for rolling back snapshots
We supported creating snapshots, but not doing anything with them. This
removes the manual task of restoring a snapshot and replace it with a
PVC abstraction of rolling back to a snapshot.

While Ceph recommends cloning a snapshot instead of rolling back, due to
the time taken, in our usecase I don't think that is an optimal
strategy, as it will leave dangling clones that we'd then have to
manage.

Closes #183
2024-05-13 15:24:51 -04:00
dcb9c0d12c Improve fence handling conditions
Use the intermediate output text when judging the fence status, rather
than the retcode of the stop as this should be more reliable.
2024-05-08 10:55:15 -04:00
f6e856bf98 Fix debug output on timeout 2024-05-06 10:49:57 -04:00
7 changed files with 177 additions and 12 deletions

View File

@ -6362,6 +6362,59 @@ api.add_resource(
) )
# /storage/ceph/snapshot/<pool>/<volume>/<snapshot>/rollback
class API_Storage_Ceph_Snapshot_Rollback_Element(Resource):
@Authenticator
def post(self, pool, volume, snapshot):
"""
Roll back an RBD volume {volume} in pool {pool} to snapshot {snapshot}
WARNING: This action cannot be done on an active RBD volume. All IO MUST be stopped first.
---
tags:
- storage / ceph
parameters:
- in: query
name: snapshot
type: string
required: true
description: The name of the snapshot
- in: query
name: volume
type: string
required: true
description: The name of the volume
- in: query
name: pool
type: integer
required: true
description: The name of the pool
responses:
200:
description: OK
schema:
type: object
id: Message
404:
description: Not found
schema:
type: object
id: Message
400:
description: Bad request
schema:
type: object
id: Message
"""
return api_helper.ceph_volume_snapshot_rollback(pool, volume, snapshot)
api.add_resource(
API_Storage_Ceph_Snapshot_Rollback_Element,
"/storage/ceph/snapshot/<pool>/<volume>/<snapshot>/rollback",
)
########################################################## ##########################################################
# Provisioner API # Provisioner API
########################################################## ##########################################################

View File

@ -2183,6 +2183,22 @@ def ceph_volume_snapshot_rename(zkhandler, pool, volume, name, new_name):
return output, retcode return output, retcode
@ZKConnection(config)
def ceph_volume_snapshot_rollback(zkhandler, pool, volume, name):
"""
Roll back a Ceph RBD volume to a given snapshot in the PVC Ceph storage cluster.
"""
retflag, retdata = pvc_ceph.rollback_snapshot(zkhandler, pool, volume, name)
if retflag:
retcode = 200
else:
retcode = 400
output = {"message": retdata.replace('"', "'")}
return output, retcode
@ZKConnection(config) @ZKConnection(config)
def ceph_volume_snapshot_remove(zkhandler, pool, volume, name): def ceph_volume_snapshot_remove(zkhandler, pool, volume, name):
""" """

View File

@ -4325,6 +4325,10 @@ def cli_storage_volume_snapshot():
def cli_storage_volume_snapshot_add(pool, volume, name): def cli_storage_volume_snapshot_add(pool, volume, name):
""" """
Add a snapshot with name NAME of Ceph RBD volume VOLUME in pool POOL. Add a snapshot with name NAME of Ceph RBD volume VOLUME in pool POOL.
WARNING: RBD snapshots are crash-consistent but not filesystem-aware. If a snapshot was taken
of a running VM, restoring that snapshot will be equivalent to having forcibly restarted the
VM at the moment of the snapshot.
""" """
retcode, retmsg = pvc.lib.storage.ceph_snapshot_add(CLI_CONFIG, pool, volume, name) retcode, retmsg = pvc.lib.storage.ceph_snapshot_add(CLI_CONFIG, pool, volume, name)
@ -4372,6 +4376,36 @@ def cli_storage_volume_snapshot_remove(pool, volume, name):
finish(retcode, retmsg) finish(retcode, retmsg)
###############################################################################
# > pvc storage volume snapshot rollback
###############################################################################
@click.command(name="rollback", short_help="Roll back RBD volume to snapshot.")
@connection_req
@click.argument("pool")
@click.argument("volume")
@click.argument("name")
@confirm_opt("Roll back to snapshot {name} for volume {pool}/{volume}")
def cli_storage_volume_snapshot_rollback(pool, volume, name):
"""
Roll back the Ceph RBD volume VOLUME in pool POOL to the snapshot NAME.
DANGER: All data written to the volume since the given snapshot will be permanently lost.
WARNING: A rollback cannot be performed on an RBD volume with active I/O. Doing so will cause
undefined behaviour and possible corruption. Ensure that any VM(s) using this RBD volume are
stopped or disabled before attempting a snapshot rollback.
WARNING: RBD snapshots are crash-consistent but not filesystem-aware. If a snapshot was taken
of a running VM, restoring that snapshot will be equivalent to having forcibly restarted the
VM at the moment of the snapshot.
"""
retcode, retmsg = pvc.lib.storage.ceph_snapshot_rollback(
CLI_CONFIG, pool, volume, name
)
finish(retcode, retmsg)
############################################################################### ###############################################################################
# > pvc storage volume snapshot list # > pvc storage volume snapshot list
############################################################################### ###############################################################################
@ -6349,6 +6383,7 @@ cli_storage_volume.add_command(cli_storage_volume_list)
cli_storage_volume_snapshot.add_command(cli_storage_volume_snapshot_add) cli_storage_volume_snapshot.add_command(cli_storage_volume_snapshot_add)
cli_storage_volume_snapshot.add_command(cli_storage_volume_snapshot_rename) cli_storage_volume_snapshot.add_command(cli_storage_volume_snapshot_rename)
cli_storage_volume_snapshot.add_command(cli_storage_volume_snapshot_remove) cli_storage_volume_snapshot.add_command(cli_storage_volume_snapshot_remove)
cli_storage_volume_snapshot.add_command(cli_storage_volume_snapshot_rollback)
cli_storage_volume_snapshot.add_command(cli_storage_volume_snapshot_list) cli_storage_volume_snapshot.add_command(cli_storage_volume_snapshot_list)
cli_storage_volume.add_command(cli_storage_volume_snapshot) cli_storage_volume.add_command(cli_storage_volume_snapshot)
cli_storage.add_command(cli_storage_volume) cli_storage.add_command(cli_storage_volume)

View File

@ -108,9 +108,10 @@ class UploadProgressBar(object):
class ErrorResponse(requests.Response): class ErrorResponse(requests.Response):
def __init__(self, json_data, status_code): def __init__(self, json_data, status_code, headers):
self.json_data = json_data self.json_data = json_data
self.status_code = status_code self.status_code = status_code
self.headers = headers
def json(self): def json(self):
return self.json_data return self.json_data
@ -206,7 +207,7 @@ def call_api(
except Exception as e: except Exception as e:
message = "Failed to connect to the API: {}".format(e) message = "Failed to connect to the API: {}".format(e)
code = response.status_code if response else 504 code = response.status_code if response else 504
response = ErrorResponse({"message": message}, code) response = ErrorResponse({"message": message}, code, None)
# Display debug output # Display debug output
if config["debug"]: if config["debug"]:

View File

@ -1544,6 +1544,30 @@ def ceph_snapshot_add(config, pool, volume, snapshot):
return retstatus, response.json().get("message", "") return retstatus, response.json().get("message", "")
def ceph_snapshot_rollback(config, pool, volume, snapshot):
"""
Roll back Ceph volume to snapshot
API endpoint: POST /api/v1/storage/ceph/snapshot/{pool}/{volume}/{snapshot}/rollback
API arguments:
API schema: {"message":"{data}"}
"""
response = call_api(
config,
"post",
"/storage/ceph/snapshot/{pool}/{volume}/{snapshot}/rollback".format(
snapshot=snapshot, volume=volume, pool=pool
),
)
if response.status_code == 200:
retstatus = True
else:
retstatus = False
return retstatus, response.json().get("message", "")
def ceph_snapshot_remove(config, pool, volume, snapshot): def ceph_snapshot_remove(config, pool, volume, snapshot):
""" """
Remove Ceph snapshot Remove Ceph snapshot

View File

@ -1082,6 +1082,36 @@ def rename_snapshot(zkhandler, pool, volume, name, new_name):
) )
def rollback_snapshot(zkhandler, pool, volume, name):
if not verifyVolume(zkhandler, pool, volume):
return False, 'ERROR: No volume with name "{}" is present in pool "{}".'.format(
volume, pool
)
if not verifySnapshot(zkhandler, pool, volume, name):
return (
False,
'ERROR: No snapshot with name "{}" is present for volume "{}" in pool "{}".'.format(
name, volume, pool
),
)
# 1. Roll back the snapshot
retcode, stdout, stderr = common.run_os_command(
"rbd snap rollback {}/{}@{}".format(pool, volume, name)
)
if retcode:
return (
False,
'ERROR: Failed to roll back RBD volume "{}" in pool "{}" to snapshot "{}": {}'.format(
volume, pool, name, stderr
),
)
return True, 'Rolled back RBD volume "{}" in pool "{}" to snapshot "{}".'.format(
volume, pool, name
)
def remove_snapshot(zkhandler, pool, volume, name): def remove_snapshot(zkhandler, pool, volume, name):
if not verifyVolume(zkhandler, pool, volume): if not verifyVolume(zkhandler, pool, volume):
return False, 'ERROR: No volume with name "{}" is present in pool "{}".'.format( return False, 'ERROR: No volume with name "{}" is present in pool "{}".'.format(

View File

@ -253,12 +253,16 @@ def reboot_via_ipmi(node_name, ipmi_hostname, ipmi_user, ipmi_password, logger):
state="i", state="i",
prefix=f"fencing {node_name}", prefix=f"fencing {node_name}",
) )
ipmi_status_retcode, ipmi_status_stdout, ipmi_status_stderr = common.run_os_command( (
ipmi_intermediate_status_retcode,
ipmi_intermediate_status_stdout,
ipmi_intermediate_status_stderr,
) = common.run_os_command(
f"/usr/bin/ipmitool -I lanplus -H {ipmi_hostname} -U {ipmi_user} -P {ipmi_password} chassis power status" f"/usr/bin/ipmitool -I lanplus -H {ipmi_hostname} -U {ipmi_user} -P {ipmi_password} chassis power status"
) )
if ipmi_status_retcode == 0: if ipmi_intermediate_status_retcode == 0:
logger.out( logger.out(
f"Current chassis power state is: {ipmi_status_stdout.strip()}", f"Current chassis power state is: {ipmi_intermediate_status_stdout.strip()}",
state="i", state="i",
prefix=f"fencing {node_name}", prefix=f"fencing {node_name}",
) )
@ -299,12 +303,14 @@ def reboot_via_ipmi(node_name, ipmi_hostname, ipmi_user, ipmi_password, logger):
state="i", state="i",
prefix=f"fencing {node_name}", prefix=f"fencing {node_name}",
) )
ipmi_status_retcode, ipmi_status_stdout, ipmi_status_stderr = common.run_os_command( ipmi_final_status_retcode, ipmi_final_status_stdout, ipmi_final_status_stderr = (
common.run_os_command(
f"/usr/bin/ipmitool -I lanplus -H {ipmi_hostname} -U {ipmi_user} -P {ipmi_password} chassis power status" f"/usr/bin/ipmitool -I lanplus -H {ipmi_hostname} -U {ipmi_user} -P {ipmi_password} chassis power status"
) )
)
if ipmi_stop_retcode == 0: if ipmi_intermediate_status_stdout.strip() == "Chassis power is off":
if ipmi_status_stdout.strip() == "Chassis Power is on": if ipmi_final_status_stdout.strip() == "Chassis Power is on":
# We successfully rebooted the node and it is powered on; this is a succeessful fence # We successfully rebooted the node and it is powered on; this is a succeessful fence
logger.out( logger.out(
"Successfully rebooted dead node; proceeding with fence recovery action", "Successfully rebooted dead node; proceeding with fence recovery action",
@ -312,7 +318,7 @@ def reboot_via_ipmi(node_name, ipmi_hostname, ipmi_user, ipmi_password, logger):
prefix=f"fencing {node_name}", prefix=f"fencing {node_name}",
) )
return True return True
elif ipmi_status_stdout.strip() == "Chassis Power is off": elif ipmi_final_status_stdout.strip() == "Chassis Power is off":
# We successfully rebooted the node but it is powered off; this might be expected or not, but the node is confirmed off so we can call it a successful fence # We successfully rebooted the node but it is powered off; this might be expected or not, but the node is confirmed off so we can call it a successful fence
logger.out( logger.out(
"Chassis power is in confirmed off state after successfuly IPMI reboot; proceeding with fence recovery action", "Chassis power is in confirmed off state after successfuly IPMI reboot; proceeding with fence recovery action",
@ -323,13 +329,13 @@ def reboot_via_ipmi(node_name, ipmi_hostname, ipmi_user, ipmi_password, logger):
else: else:
# We successfully rebooted the node but it is in some unknown power state; since this might indicate a silent failure, we must call it a failed fence # We successfully rebooted the node but it is in some unknown power state; since this might indicate a silent failure, we must call it a failed fence
logger.out( logger.out(
f"Chassis power is in an unknown state ({ipmi_status_stdout.strip()}) after successful IPMI reboot; NOT proceeding fence recovery action", f"Chassis power is in an unknown state ({ipmi_final_status_stdout.strip()}) after successful IPMI reboot; NOT proceeding fence recovery action",
state="e", state="e",
prefix=f"fencing {node_name}", prefix=f"fencing {node_name}",
) )
return False return False
else: else:
if ipmi_status_stdout.strip() == "Chassis Power is off": if ipmi_final_status_stdout.strip() == "Chassis Power is off":
# We failed to reboot the node but it is powered off; it has probably suffered a serious hardware failure, but the node is confirmed off so we can call it a successful fence # We failed to reboot the node but it is powered off; it has probably suffered a serious hardware failure, but the node is confirmed off so we can call it a successful fence
logger.out( logger.out(
"Chassis power is in confirmed off state after failed IPMI reboot; proceeding with fence recovery action", "Chassis power is in confirmed off state after failed IPMI reboot; proceeding with fence recovery action",