Compare commits

...

7 Commits

Author SHA1 Message Date
1aa5999109 Bump version to 0.9.98 2024-06-05 12:01:31 -04:00
570460e5ee Add --version flag to pvcnoded.py for info 2024-06-05 11:57:47 -04:00
7a99e0e524 Fix bugs listing snapshots by pool/volume
The logic of this didn't work, so reconfigure to use these like limits.
Also fixes a bug in the upper getCephVolumes for invalid pools.
2024-05-16 16:32:22 -04:00
234d6ae83b Add warnings about snapshot consistency 2024-05-13 15:29:43 -04:00
5d0e7931d1 Add support for rolling back snapshots
We supported creating snapshots, but not doing anything with them. This
removes the manual task of restoring a snapshot and replaces it with a
PVC abstraction of rolling back to a snapshot.

While Ceph recommends cloning a snapshot instead of rolling back, due to
the time taken, in our use case I don't think that is an optimal
strategy, as it will leave dangling clones that we'd then have to
manage.

Closes #183
2024-05-13 15:24:51 -04:00
dcb9c0d12c Improve fence handling conditions
Use the intermediate output text when judging the fence status, rather
than the retcode of the stop as this should be more reliable.
2024-05-08 10:55:15 -04:00
f6e856bf98 Fix debug output on timeout 2024-05-06 10:49:57 -04:00
16 changed files with 218 additions and 32 deletions

View File

@ -1 +1 @@
0.9.97
0.9.98

View File

@ -1,5 +1,14 @@
## PVC Changelog
###### [v0.9.98](https://github.com/parallelvirtualcluster/pvc/releases/tag/v0.9.98)
* [CLI Client] Fixed output when API call times out
* [Node Daemon] Improves the handling of fence states
* [API Daemon/CLI Client] Adds support for storage snapshot rollback
* [CLI Client] Adds additional warning messages about snapshot consistency to help output
* [API Daemon] Fixes a bug listing snapshots by pool/volume
* [Node Daemon] Adds a --version flag for information gathering by update-motd.sh
###### [v0.9.97](https://github.com/parallelvirtualcluster/pvc/releases/tag/v0.9.97)
* [Client CLI] Ensures --lines is always an integer value

View File

@ -27,7 +27,7 @@ from distutils.util import strtobool as dustrtobool
import daemon_lib.config as cfg
# Daemon version
version = "0.9.97"
version = "0.9.98"
# API version
API_VERSION = 1.0

View File

@ -6362,6 +6362,59 @@ api.add_resource(
)
# /storage/ceph/snapshot/<pool>/<volume>/<snapshot>/rollback
class API_Storage_Ceph_Snapshot_Rollback_Element(Resource):
    @Authenticator
    def post(self, pool, volume, snapshot):
        """
        Roll back an RBD volume {volume} in pool {pool} to snapshot {snapshot}

        WARNING: This action cannot be done on an active RBD volume. All IO MUST be stopped first.
        ---
        tags:
          - storage / ceph
        parameters:
          - in: path
            name: pool
            type: string
            required: true
            description: The name of the pool
          - in: path
            name: volume
            type: string
            required: true
            description: The name of the volume
          - in: path
            name: snapshot
            type: string
            required: true
            description: The name of the snapshot
        responses:
          200:
            description: OK
            schema:
              type: object
              id: Message
          404:
            description: Not found
            schema:
              type: object
              id: Message
          400:
            description: Bad request
            schema:
              type: object
              id: Message
        """
        # pool, volume and snapshot are taken from the URL path segments of the
        # route this resource is registered on; no query arguments are parsed.
        # NOTE(review): the rollback helper shown in this changeset only maps its
        # result to 200 or 400 - confirm whether the documented 404 can occur.
        return api_helper.ceph_volume_snapshot_rollback(pool, volume, snapshot)
api.add_resource(
API_Storage_Ceph_Snapshot_Rollback_Element,
"/storage/ceph/snapshot/<pool>/<volume>/<snapshot>/rollback",
)
##########################################################
# Provisioner API
##########################################################

View File

@ -2183,6 +2183,22 @@ def ceph_volume_snapshot_rename(zkhandler, pool, volume, name, new_name):
return output, retcode
@ZKConnection(config)
def ceph_volume_snapshot_rollback(zkhandler, pool, volume, name):
    """
    Roll back a Ceph RBD volume to a given snapshot in the PVC Ceph storage cluster.

    Returns a tuple of (response body, HTTP status code). The body is a dict
    with a single "message" key; the status is 200 when the rollback reported
    success and 400 otherwise.
    """
    succeeded, message = pvc_ceph.rollback_snapshot(zkhandler, pool, volume, name)

    status = 200 if succeeded else 400

    # Double quotes in the message are swapped for single quotes before it is
    # placed in the response body.
    body = {"message": message.replace('"', "'")}
    return body, status
@ZKConnection(config)
def ceph_volume_snapshot_remove(zkhandler, pool, volume, name):
"""

View File

@ -4325,6 +4325,10 @@ def cli_storage_volume_snapshot():
def cli_storage_volume_snapshot_add(pool, volume, name):
"""
Add a snapshot with name NAME of Ceph RBD volume VOLUME in pool POOL.
WARNING: RBD snapshots are crash-consistent but not filesystem-aware. If a snapshot was taken
of a running VM, restoring that snapshot will be equivalent to having forcibly restarted the
VM at the moment of the snapshot.
"""
retcode, retmsg = pvc.lib.storage.ceph_snapshot_add(CLI_CONFIG, pool, volume, name)
@ -4372,6 +4376,36 @@ def cli_storage_volume_snapshot_remove(pool, volume, name):
finish(retcode, retmsg)
###############################################################################
# > pvc storage volume snapshot rollback
###############################################################################
@click.command(name="rollback", short_help="Roll back RBD volume to snapshot.")
@connection_req
@click.argument("pool")
@click.argument("volume")
@click.argument("name")
@confirm_opt("Roll back to snapshot {name} for volume {pool}/{volume}")
def cli_storage_volume_snapshot_rollback(pool, volume, name):
    """
    Roll back the Ceph RBD volume VOLUME in pool POOL to the snapshot NAME.

    DANGER: All data written to the volume since the given snapshot will be permanently lost.

    WARNING: A rollback cannot be performed on an RBD volume with active I/O. Doing so will cause
    undefined behaviour and possible corruption. Ensure that any VM(s) using this RBD volume are
    stopped or disabled before attempting a snapshot rollback.

    WARNING: RBD snapshots are crash-consistent but not filesystem-aware. If a snapshot was taken
    of a running VM, restoring that snapshot will be equivalent to having forcibly restarted the
    VM at the moment of the snapshot.
    """

    # The blank lines in the help text above are deliberate: Click rewraps
    # consecutive lines into one paragraph, so each notice needs its own
    # paragraph to stay visually distinct in --help output.

    # The library call yields a (status, message) pair which is handed
    # straight to finish().
    retcode, retmsg = pvc.lib.storage.ceph_snapshot_rollback(
        CLI_CONFIG, pool, volume, name
    )
    finish(retcode, retmsg)
###############################################################################
# > pvc storage volume snapshot list
###############################################################################
@ -6349,6 +6383,7 @@ cli_storage_volume.add_command(cli_storage_volume_list)
cli_storage_volume_snapshot.add_command(cli_storage_volume_snapshot_add)
cli_storage_volume_snapshot.add_command(cli_storage_volume_snapshot_rename)
cli_storage_volume_snapshot.add_command(cli_storage_volume_snapshot_remove)
cli_storage_volume_snapshot.add_command(cli_storage_volume_snapshot_rollback)
cli_storage_volume_snapshot.add_command(cli_storage_volume_snapshot_list)
cli_storage_volume.add_command(cli_storage_volume_snapshot)
cli_storage.add_command(cli_storage_volume)

View File

@ -108,9 +108,10 @@ class UploadProgressBar(object):
class ErrorResponse(requests.Response):
def __init__(self, json_data, status_code):
def __init__(self, json_data, status_code, headers):
self.json_data = json_data
self.status_code = status_code
self.headers = headers
def json(self):
return self.json_data
@ -206,7 +207,7 @@ def call_api(
except Exception as e:
message = "Failed to connect to the API: {}".format(e)
code = response.status_code if response else 504
response = ErrorResponse({"message": message}, code)
response = ErrorResponse({"message": message}, code, None)
# Display debug output
if config["debug"]:

View File

@ -1544,6 +1544,30 @@ def ceph_snapshot_add(config, pool, volume, snapshot):
return retstatus, response.json().get("message", "")
def ceph_snapshot_rollback(config, pool, volume, snapshot):
    """
    Roll back Ceph volume to snapshot

    API endpoint: POST /api/v1/storage/ceph/snapshot/{pool}/{volume}/{snapshot}/rollback
    API arguments:
    API schema: {"message":"{data}"}

    Returns a (success, message) tuple; success is True only for an HTTP 200
    response, and message is the "message" field of the JSON body (empty
    string when absent).
    """
    endpoint = "/storage/ceph/snapshot/{pool}/{volume}/{snapshot}/rollback".format(
        snapshot=snapshot, volume=volume, pool=pool
    )
    response = call_api(config, "post", endpoint)

    succeeded = response.status_code == 200
    return succeeded, response.json().get("message", "")
def ceph_snapshot_remove(config, pool, volume, snapshot):
"""
Remove Ceph snapshot

View File

@ -2,7 +2,7 @@ from setuptools import setup
setup(
name="pvc",
version="0.9.97",
version="0.9.98",
packages=["pvc.cli", "pvc.lib"],
install_requires=[
"Click",

View File

@ -540,7 +540,10 @@ def getCephVolumes(zkhandler, pool):
pool_list = [pool]
for pool_name in pool_list:
for volume_name in zkhandler.children(("volume", pool_name)):
children = zkhandler.children(("volume", pool_name))
if children is None:
continue
for volume_name in children:
volume_list.append("{}/{}".format(pool_name, volume_name))
return volume_list
@ -1082,6 +1085,36 @@ def rename_snapshot(zkhandler, pool, volume, name, new_name):
)
def rollback_snapshot(zkhandler, pool, volume, name):
    """
    Roll back RBD volume `volume` in pool `pool` to its snapshot `name`.

    The volume and the snapshot are each checked with verifyVolume() and
    verifySnapshot() before "rbd snap rollback" is run.

    Returns a (success, message) tuple: (False, "ERROR: ...") when either
    check fails or the rbd command exits non-zero, otherwise (True, a
    confirmation message).
    """
    volume_found = verifyVolume(zkhandler, pool, volume)
    if not volume_found:
        message = 'ERROR: No volume with name "{}" is present in pool "{}".'.format(
            volume, pool
        )
        return False, message

    snapshot_found = verifySnapshot(zkhandler, pool, volume, name)
    if not snapshot_found:
        message = 'ERROR: No snapshot with name "{}" is present for volume "{}" in pool "{}".'.format(
            name, volume, pool
        )
        return False, message

    # 1. Roll back the snapshot
    rollback_command = "rbd snap rollback {}/{}@{}".format(pool, volume, name)
    retcode, _stdout, stderr = common.run_os_command(rollback_command)

    # A non-zero exit status is reported together with the command's stderr
    if retcode:
        message = 'ERROR: Failed to roll back RBD volume "{}" in pool "{}" to snapshot "{}": {}'.format(
            volume, pool, name, stderr
        )
        return False, message

    message = 'Rolled back RBD volume "{}" in pool "{}" to snapshot "{}".'.format(
        volume, pool, name
    )
    return True, message
def remove_snapshot(zkhandler, pool, volume, name):
if not verifyVolume(zkhandler, pool, volume):
return False, 'ERROR: No volume with name "{}" is present in pool "{}".'.format(
@ -1123,20 +1156,9 @@ def remove_snapshot(zkhandler, pool, volume, name):
)
def get_list_snapshot(zkhandler, pool, volume, limit=None, is_fuzzy=True):
def get_list_snapshot(zkhandler, target_pool, target_volume, limit=None, is_fuzzy=True):
snapshot_list = []
if pool and not verifyPool(zkhandler, pool):
return False, 'ERROR: No pool with name "{}" is present in the cluster.'.format(
pool
)
if volume and not verifyPool(zkhandler, volume):
return (
False,
'ERROR: No volume with name "{}" is present in the cluster.'.format(volume),
)
full_snapshot_list = getCephSnapshots(zkhandler, pool, volume)
full_snapshot_list = getCephSnapshots(zkhandler, target_pool, target_volume)
if is_fuzzy and limit:
# Implicitly assume fuzzy limits
@ -1148,6 +1170,10 @@ def get_list_snapshot(zkhandler, pool, volume, limit=None, is_fuzzy=True):
for snapshot in full_snapshot_list:
volume, snapshot_name = snapshot.split("@")
pool_name, volume_name = volume.split("/")
if target_pool and pool_name != target_pool:
continue
if target_volume and volume_name != target_volume:
continue
if limit:
try:
if re.fullmatch(limit, snapshot_name):

11
debian/changelog vendored
View File

@ -1,3 +1,14 @@
pvc (0.9.98-0) unstable; urgency=high
* [CLI Client] Fixed output when API call times out
* [Node Daemon] Improves the handling of fence states
* [API Daemon/CLI Client] Adds support for storage snapshot rollback
* [CLI Client] Adds additional warning messages about snapshot consistency to help output
* [API Daemon] Fixes a bug listing snapshots by pool/volume
* [Node Daemon] Adds a --version flag for information gathering by update-motd.sh
-- Joshua M. Boniface <joshua@boniface.me> Wed, 05 Jun 2024 12:01:31 -0400
pvc (0.9.97-0) unstable; urgency=high
* [Client CLI] Ensures --lines is always an integer value

View File

@ -33,7 +33,7 @@ import os
import signal
# Daemon version
version = "0.9.97"
version = "0.9.98"
##########################################################

View File

@ -19,6 +19,11 @@
#
###############################################################################
from sys import argv

import pvcnoded.Daemon  # noqa: F401

# Report the daemon version and stop before starting the daemon itself when
# "--version" appears anywhere on the command line.
if "--version" in argv:
    print(pvcnoded.Daemon.version)
    # Raise SystemExit directly rather than calling the exit() helper, which
    # is injected by the site module and is not guaranteed to be defined.
    raise SystemExit(0)

pvcnoded.Daemon.entrypoint()

View File

@ -49,7 +49,7 @@ import re
import json
# Daemon version
version = "0.9.97"
version = "0.9.98"
##########################################################

View File

@ -253,12 +253,16 @@ def reboot_via_ipmi(node_name, ipmi_hostname, ipmi_user, ipmi_password, logger):
state="i",
prefix=f"fencing {node_name}",
)
ipmi_status_retcode, ipmi_status_stdout, ipmi_status_stderr = common.run_os_command(
(
ipmi_intermediate_status_retcode,
ipmi_intermediate_status_stdout,
ipmi_intermediate_status_stderr,
) = common.run_os_command(
f"/usr/bin/ipmitool -I lanplus -H {ipmi_hostname} -U {ipmi_user} -P {ipmi_password} chassis power status"
)
if ipmi_status_retcode == 0:
if ipmi_intermediate_status_retcode == 0:
logger.out(
f"Current chassis power state is: {ipmi_status_stdout.strip()}",
f"Current chassis power state is: {ipmi_intermediate_status_stdout.strip()}",
state="i",
prefix=f"fencing {node_name}",
)
@ -299,12 +303,14 @@ def reboot_via_ipmi(node_name, ipmi_hostname, ipmi_user, ipmi_password, logger):
state="i",
prefix=f"fencing {node_name}",
)
ipmi_status_retcode, ipmi_status_stdout, ipmi_status_stderr = common.run_os_command(
f"/usr/bin/ipmitool -I lanplus -H {ipmi_hostname} -U {ipmi_user} -P {ipmi_password} chassis power status"
ipmi_final_status_retcode, ipmi_final_status_stdout, ipmi_final_status_stderr = (
common.run_os_command(
f"/usr/bin/ipmitool -I lanplus -H {ipmi_hostname} -U {ipmi_user} -P {ipmi_password} chassis power status"
)
)
if ipmi_stop_retcode == 0:
if ipmi_status_stdout.strip() == "Chassis Power is on":
if ipmi_intermediate_status_stdout.strip() == "Chassis power is off":
if ipmi_final_status_stdout.strip() == "Chassis Power is on":
# We successfully rebooted the node and it is powered on; this is a successful fence
logger.out(
"Successfully rebooted dead node; proceeding with fence recovery action",
@ -312,7 +318,7 @@ def reboot_via_ipmi(node_name, ipmi_hostname, ipmi_user, ipmi_password, logger):
prefix=f"fencing {node_name}",
)
return True
elif ipmi_status_stdout.strip() == "Chassis Power is off":
elif ipmi_final_status_stdout.strip() == "Chassis Power is off":
# We successfully rebooted the node but it is powered off; this might be expected or not, but the node is confirmed off so we can call it a successful fence
logger.out(
"Chassis power is in confirmed off state after successfuly IPMI reboot; proceeding with fence recovery action",
@ -323,13 +329,13 @@ def reboot_via_ipmi(node_name, ipmi_hostname, ipmi_user, ipmi_password, logger):
else:
# We successfully rebooted the node but it is in some unknown power state; since this might indicate a silent failure, we must call it a failed fence
logger.out(
f"Chassis power is in an unknown state ({ipmi_status_stdout.strip()}) after successful IPMI reboot; NOT proceeding fence recovery action",
f"Chassis power is in an unknown state ({ipmi_final_status_stdout.strip()}) after successful IPMI reboot; NOT proceeding fence recovery action",
state="e",
prefix=f"fencing {node_name}",
)
return False
else:
if ipmi_status_stdout.strip() == "Chassis Power is off":
if ipmi_final_status_stdout.strip() == "Chassis Power is off":
# We failed to reboot the node but it is powered off; it has probably suffered a serious hardware failure, but the node is confirmed off so we can call it a successful fence
logger.out(
"Chassis power is in confirmed off state after failed IPMI reboot; proceeding with fence recovery action",

View File

@ -44,7 +44,7 @@ from daemon_lib.vmbuilder import (
)
# Daemon version
version = "0.9.97"
version = "0.9.98"
config = cfg.get_configuration()