Add KeyDB to node startup services

Also ensure API worker starts on all nodes, not just coordinators.
Remove redis as a dependency
2023-11-05 19:26:38 -05:00 · 2023-11-05 18:23:34 -05:00 · 2023-11-05 17:10:46 -05:00 · 2023-11-05 16:48:47 -05:00 · 2023-11-04 15:17:50 -04:00 · 2023-11-04 15:05:07 -04:00
13 changed files with 1146 additions and 701 deletions
--- a/api-daemon/pvcapid-worker.sh
+++ b/api-daemon/pvcapid-worker.sh
@@ -25,10 +25,10 @@ CELERY_BIN="$( which celery )"
 # app arguments work in a non-backwards-compatible way with Celery 5.
 case "$( cat /etc/debian_version )" in
    10.*)
-        CELERY_ARGS="worker --app pvcapid.flaskapi.celery --concurrency 1 --loglevel INFO"
+        CELERY_ARGS="worker --app pvcapid.flaskapi.celery --concurrency 1 --hostname $(hostname -s) --queues $(hostname -s) --loglevel INFO"
    ;;
    *)
-        CELERY_ARGS="--app pvcapid.flaskapi.celery worker --concurrency 1 --loglevel INFO"
+        CELERY_ARGS="--app pvcapid.flaskapi.celery worker --concurrency 1 --hostname $(hostname -s) --queues $(hostname -s) --loglevel INFO"
    ;;
 esac

--- a/api-daemon/pvcapid/flaskapi.py
+++ b/api-daemon/pvcapid/flaskapi.py
@@ -24,6 +24,11 @@ import flask
 from functools import wraps
 from flask_restful import Resource, Api, reqparse, abort
 from celery import Celery
+from kombu import Queue
+
+from daemon_lib.common import getPrimaryNode
+from daemon_lib.zkhandler import ZKConnection
+from daemon_lib.node import get_list as get_node_list

 from pvcapid.Daemon import config, strtobool, API_VERSION

@@ -44,6 +49,47 @@ app.config["CELERY_BROKER_URL"] = "redis://{}:{}{}".format(
 app.config["CELERY_RESULT_BACKEND"] = "redis://{}:{}{}".format(
    config["queue_host"], config["queue_port"], config["queue_path"]
 )
+
+# Set up Celery queues
+app.config["CELERY_DATABASE_ENGINE_OPTIONS"] = {"echo": True}
+
+
+@ZKConnection(config)
+def get_all_nodes(zkhandler):
+    _, all_nodes = get_node_list(zkhandler, None)
+    return [n["name"] for n in all_nodes]
+
+
+app.config["CELERY_QUEUES"] = tuple(
+    [Queue(h, routing_key=f"{h}.#") for h in get_all_nodes()]
+)
+
+
+# Set up Celery queue routing
+def route_task(name, args, kwargs, options, task=None, **kw):
+    @ZKConnection(config)
+    def get_primary_node(zkhandler):
+        return getPrimaryNode(zkhandler)
+
+    print("----")
+    print(f"Incoming Celery task: '{name}' with args {args}, kwargs {kwargs}")
+
+    # If an explicit routing_key is set and it's in the kwargs of the function, use it to set the queue
+    if options["routing_key"] != "default" and options["routing_key"] in kwargs.keys():
+        run_on = kwargs[options["routing_key"]]
+    # Otherwise, use the primary node
+    else:
+        run_on = get_primary_node()
+
+    print(f"Selected Celery worker: {run_on}")
+    print("----")
+
+    return run_on
+
+
+app.config["CELERY_ROUTES"] = (route_task,)
+
+# Set up SQLAlchemy backend
 app.config["SQLALCHEMY_TRACK_MODIFICATIONS"] = False
 app.config["SQLALCHEMY_DATABASE_URI"] = "postgresql://{}:{}@{}:{}/{}".format(
    config["database_user"],
@@ -75,7 +121,6 @@ app.register_blueprint(blueprint)
 celery = Celery(app.name, broker=app.config["CELERY_BROKER_URL"])
 celery.conf.update(app.config)

-
 #
 # Custom decorators
 #
@@ -142,7 +187,7 @@ def Authenticator(function):
 #
 # Job functions
 #
-@celery.task(bind=True)
+@celery.task(name="provisioner.create", bind=True)
 def create_vm(
    self, vm_name, profile_name, define_vm=True, start_vm=True, script_run_args=[]
 ):
@@ -156,7 +201,7 @@ def create_vm(
    )


-@celery.task(bind=True)
+@celery.task(name="storage.benchmark", bind=True)
 def run_benchmark(self, pool):
    return api_benchmark.run_benchmark(self, pool)

@@ -4281,15 +4326,17 @@ class API_Storage_Ceph_OSD_Root(Resource):
                "required": True,
                "helptext": "An OSD weight must be specified.",
            },
-            {
-                "name": "ext_db",
-                "required": False,
-                "helptext": "Whether to use an external OSD DB LV device.",
-            },
            {
                "name": "ext_db_ratio",
                "required": False,
-                "helptext": "Decimal size ratio of the external OSD DB LV device.",
+            },
+            {
+                "name": "ext_db_size",
+                "required": False,
+            },
+            {
+                "name": "osd_count",
+                "required": False,
            },
        ]
    )
@@ -4297,7 +4344,7 @@ class API_Storage_Ceph_OSD_Root(Resource):
    def post(self, reqargs):
        """
        Add a Ceph OSD to the cluster
-        Note: This task may take up to 30s to complete and return
+        Note: This task may take up to 60s to complete and return
        ---
        tags:
          - storage / ceph
@@ -4317,16 +4364,21 @@ class API_Storage_Ceph_OSD_Root(Resource):
            type: number
            required: true
            description: The Ceph CRUSH weight for the OSD
-          - in: query
-            name: ext_db
-            type: boolean
-            required: false
-            description: Whether to use an external OSD DB LV device
          - in: query
            name: ext_db_ratio
            type: float
            required: false
-            description: Decimal ratio of total OSD size for the external OSD DB LV device, default 0.05 (5%)
+            description: If set, creates an OSD DB LV with this decimal ratio of DB to total OSD size (usually 0.05 i.e. 5%); mutually exclusive with ext_db_size
+          - in: query
+            name: ext_db_size
+            type: string
+            required: false
+            description: If set, creates an OSD DB LV with this explicit size in human units (e.g. 1024M, 20G); mutually exclusive with ext_db_ratio
+          - in: query
+            name: osd_count
+            type: integer
+            required: false
+            description: If set, create this many OSDs on the block device instead of 1; usually 2 or 4 depending on size
        responses:
          200:
            description: OK
@@ -4343,8 +4395,9 @@ class API_Storage_Ceph_OSD_Root(Resource):
            reqargs.get("node", None),
            reqargs.get("device", None),
            reqargs.get("weight", None),
-            reqargs.get("ext_db", False),
-            float(reqargs.get("ext_db_ratio", 0.05)),
+            reqargs.get("ext_db_ratio", None),
+            reqargs.get("ext_db_size", None),
+            reqargs.get("osd_count", None),
        )


@@ -4371,14 +4424,25 @@ class API_Storage_Ceph_OSD_Element(Resource):
    @RequestParser(
        [
            {
-                "name": "device",
+                "name": "new_device",
                "required": True,
                "helptext": "A valid device or detect string must be specified.",
            },
+            {
+                "name": "old_device",
+                "required": False,
+            },
            {
                "name": "weight",
-                "required": True,
-                "helptext": "An OSD weight must be specified.",
+                "required": False,
+            },
+            {
+                "name": "ext_db_ratio",
+                "required": False,
+            },
+            {
+                "name": "ext_db_size",
+                "required": False,
            },
            {
                "name": "yes-i-really-mean-it",
@@ -4397,15 +4461,30 @@ class API_Storage_Ceph_OSD_Element(Resource):
          - storage / ceph
        parameters:
          - in: query
-            name: device
+            name: new_device
            type: string
            required: true
            description: The block device (e.g. "/dev/sdb", "/dev/disk/by-path/...", etc.) or detect string ("detect:NAME:SIZE:ID") to replace the OSD onto
+          - in: query
+            name: old_device
+            type: string
+            required: false
+            description: The block device (e.g. "/dev/sdb", "/dev/disk/by-path/...", etc.) or detect string ("detect:NAME:SIZE:ID") of the original OSD
          - in: query
            name: weight
            type: number
-            required: true
-            description: The Ceph CRUSH weight for the replaced OSD
+            required: false
+            description: The Ceph CRUSH weight for the replacement OSD
+          - in: query
+            name: ext_db_ratio
+            type: float
+            required: false
+            description: If set, creates an OSD DB LV for the replacement OSD with this decimal ratio of DB to total OSD size (usually 0.05 i.e. 5%); if unset, use existing ext_db_size
+          - in: query
+            name: ext_db_size
+            type: string
+            required: false
+            description: If set, creates an OSD DB LV for the replacement OSD with this explicit size in human units (e.g. 1024M, 20G); if unset, use existing ext_db_size
        responses:
          200:
            description: OK
@@ -4420,8 +4499,11 @@ class API_Storage_Ceph_OSD_Element(Resource):
        """
        return api_helper.ceph_osd_replace(
            osdid,
-            reqargs.get("device", None),
+            reqargs.get("new_device"),
+            reqargs.get("old_device", None),
            reqargs.get("weight", None),
+            reqargs.get("ext_db_ratio", None),
+            reqargs.get("ext_db_size", None),
        )

    @RequestParser(
--- a/api-daemon/pvcapid/helper.py
+++ b/api-daemon/pvcapid/helper.py
@@ -1366,12 +1366,26 @@ def ceph_osd_db_vg_add(zkhandler, node, device):


@ZKConnection(config)
-def ceph_osd_add(zkhandler, node, device, weight, ext_db_flag=False, ext_db_ratio=0.05):
+def ceph_osd_add(
+    zkhandler,
+    node,
+    device,
+    weight,
+    ext_db_ratio=None,
+    ext_db_size=None,
+    split_count=None,
+):
    """
    Add a Ceph OSD to the PVC Ceph storage cluster.
    """
    retflag, retdata = pvc_ceph.add_osd(
-        zkhandler, node, device, weight, ext_db_flag, ext_db_ratio
+        zkhandler,
+        node,
+        device,
+        weight,
+        ext_db_ratio,
+        ext_db_size,
+        split_count,
    )

    if retflag:
@@ -1384,11 +1398,21 @@ def ceph_osd_add(zkhandler, node, device, weight, ext_db_flag=False, ext_db_rati


@ZKConnection(config)
-def ceph_osd_replace(zkhandler, osd_id, device, weight):
+def ceph_osd_replace(
+    zkhandler,
+    osd_id,
+    new_device,
+    old_device=None,
+    weight=None,
+    ext_db_ratio=None,
+    ext_db_size=None,
+):
    """
    Replace a Ceph OSD in the PVC Ceph storage cluster.
    """
-    retflag, retdata = pvc_ceph.replace_osd(zkhandler, osd_id, device, weight)
+    retflag, retdata = pvc_ceph.replace_osd(
+        zkhandler, osd_id, new_device, old_device, weight, ext_db_ratio, ext_db_size
+    )

    if retflag:
        retcode = 200
--- a/client-cli/pvc/cli/cli.py
+++ b/client-cli/pvc/cli/cli.py
@@ -169,9 +169,10 @@ def restart_opt(function):
    @wraps(function)
    def confirm_action(*args, **kwargs):
        restart_state = kwargs.get("restart_flag", None)
+        live_state = kwargs.get("live_flag", False)

-        if restart_state is None:
-            # Neither "--restart" or "--no-restart" was passed: prompt for restart or restart if "--unsafe"
+        if restart_state is None and not live_state:
+            # Neither "--restart" nor "--no-restart" was passed, and "--no-live" was passed: prompt for restart or restart if "--unsafe"
            try:
                click.confirm(
                    f"Restart VM {kwargs.get('domain')} to apply changes",
@@ -179,6 +180,7 @@ def restart_opt(function):
                    abort=True,
                )
                kwargs["restart_flag"] = True
+                kwargs["confirm_flag"] = True
            except Exception:
                echo(CLI_CONFIG, "Changes will be applied on next VM start/restart.")
                kwargs["restart_flag"] = False
@@ -3362,9 +3364,11 @@ def cli_storage_osd():
 )
 def cli_storage_osd_create_db_vg(node, device):
    """
-    Create a new Ceph OSD database volume group on node NODE with block device DEVICE. DEVICE must be a valid block device path (e.g. '/dev/nvme0n1', '/dev/disk/by-path/...') or a "detect" string. Using partitions is not supported.
+    Create a new Ceph OSD database volume group on node NODE with block device DEVICE.

-    This volume group will be used for Ceph OSD database and WAL functionality if the '--ext-db' flag is passed to newly-created OSDs during 'pvc storage osd add'. DEVICE should be an extremely fast SSD device (NVMe, Intel Optane, etc.) which is significantly faster than the normal OSD disks and with very high write endurance.
+    DEVICE must be a valid block device path (e.g. '/dev/nvme0n1', '/dev/disk/by-path/...') or a "detect" string. Partitions are NOT supported. A "detect" string is a string in the form "detect:<NAME>:<HUMAN-SIZE>:<ID>". For details, see 'pvc storage osd add --help'. The path or detect string must be valid on the current node housing the OSD.
+
+    This volume group will be used for Ceph OSD database and WAL functionality if an '--ext-db-*' flag is passed to newly-created OSDs during 'pvc storage osd add'. DEVICE should be an extremely fast SSD device (NVMe, Intel Optane, etc.) which is significantly faster than the normal OSD disks and with very high write endurance. For more details, see the "pvc storage osd add" command help.

    Only one OSD database volume group on a single physical device, named "osd-db", is supported per node, so it must be fast and large enough to act as an effective OSD database device for all OSDs on the node. Attempting to add additional database volume groups after the first will result in an error.

@@ -3390,42 +3394,70 @@ def cli_storage_osd_create_db_vg(node, device):
    "weight",
    default=1.0,
    show_default=True,
-    help="Weight of the OSD within the CRUSH map.",
-)
-@click.option(
-    "-d",
-    "--ext-db",
-    "ext_db_flag",
-    is_flag=True,
-    default=False,
-    help="Use an external database logical volume for this OSD.",
+    help="Weight of the OSD(s) within the CRUSH map.",
 )
@click.option(
    "-r",
    "--ext-db-ratio",
    "ext_db_ratio",
-    default=0.05,
-    show_default=True,
+    default=None,
    type=float,
-    help="Decimal ratio of the external database logical volume to the OSD size.",
+    help="Create an external database logical volume for the OSD(s) with this decimal ratio of the DB LV to the OSD size.",
 )
-@confirm_opt("Destroy all data on and create new OSD on node {node} device {device}")
-def cli_storage_osd_add(node, device, weight, ext_db_flag, ext_db_ratio):
+@click.option(
+    "-s",
+    "--ext-db-size",
+    "ext_db_size",
+    default=None,
+    show_default=True,
+    help="Create an external database logical volume for the OSD(s) with this human-unit size.",
+)
+@click.option(
+    "-c",
+    "--osd-count",
+    "osd_count",
+    default=None,
+    show_default=False,
+    type=int,
+    help="Split (an NVMe) disk into this many OSDs.",
+)
+@confirm_opt("Destroy all data on and create new OSD(s) on node {node} device {device}")
+def cli_storage_osd_add(node, device, weight, ext_db_ratio, ext_db_size, osd_count):
    """
-    Add a new Ceph OSD on node NODE with block device DEVICE. DEVICE must be a valid block device path (e.g. '/dev/sda', '/dev/nvme0n1', '/dev/disk/by-path/...', '/dev/disk/by-id/...') or a "detect" string. Using partitions is not supported.
+    Add a new Ceph OSD on node NODE with block device DEVICE.
+
+    DEVICE must be a valid block device path (e.g. '/dev/nvme0n1', '/dev/disk/by-path/...') or a "detect" string. Partitions are NOT supported. A "detect" string is a string in the form "detect:<NAME>:<HUMAN-SIZE>:<ID>". The path or detect string must be valid on the current node housing the OSD.

    A "detect" string is a string in the form "detect:<NAME>:<HUMAN-SIZE>:<ID>". Detect strings allow for automatic determination of Linux block device paths from known basic information about disks by leveraging "lsscsi" on the target host. The "NAME" should be some descriptive identifier, for instance the manufacturer (e.g. "INTEL"), the "HUMAN-SIZE" should be the labeled human-readable size of the device (e.g. "480GB", "1.92TB"), and "ID" specifies the Nth 0-indexed device which matches the "NAME" and "HUMAN-SIZE" values (e.g. "2" would match the third device with the corresponding "NAME" and "HUMAN-SIZE"). When matching against sizes, there is +/- 3% flexibility to account for base-1000 vs. base-1024 differences and rounding errors. The "NAME" may contain whitespace but if so the entire detect string should be quoted, and is case-insensitive. More information about detect strings can be found in the pvcbootstrapd manual.

-    The weight of an OSD should reflect the ratio of the OSD to other OSDs in the storage cluster. For example, if all OSDs are the same size as recommended for PVC, 1 (the default) is a valid weight so that all are treated identically. If a new OSD is added later which is 4x the size of the existing OSDs, the new OSD's weight should then be 4 to tell the cluster that 4x the data can be stored on the OSD. Weights can also be tweaked for performance reasons, since OSDs with more data will incur more I/O load. For more information about CRUSH weights, please see the Ceph documentation.
+    The weight of an OSD should reflect the ratio of the size of the OSD to the other OSDs in the storage cluster. For example, with a 200GB disk and a 400GB disk in each node, the 400GB disk should have twice the weight as the 200GB disk. For more information about CRUSH weights, please see the Ceph documentation.

-    If '--ext-db' is specified, the OSD database and WAL will be placed on a new logical volume in NODE's OSD database volume group. An OSD database volume group must exist on the node or the OSD creation will fail. See the 'pvc storage osd create-db-vg' command for more details.
+    The "-r"/"--ext-db-ratio" or "-s"/"--ext-db-size" options, if specified, and if an OSD DB VG exists on the node (see "pvc storage osd create-db-vg"), will instruct the OSD to locate its RocksDB database and WAL on a new logical volume on that OSD DB VG. If "-r"/"--ext-db-ratio" is specified, the sizing of this DB LV will be the given ratio (specified as a decimal percentage e.g. 0.05 for 5%) of the size of the OSD (e.g. 0.05 on a 1TB SSD will create a 50GB LV). If "-s"/"--ext-db-size" is specified, the sizing of this DB LV will be the given human-unit size (e.g. 1024M, 20GB, etc.). A 0.05 ratio is recommended; at least 0.02 is required, and more than 0.05 can potentially increase performance in write-heavy workloads.

-    The default '--ext-db-ratio' of 0.05 (5%) is sufficient for most RBD workloads and OSD sizes, though this can be adjusted based on the sizes of the OSD(s) and the underlying database device. Ceph documentation recommends at least 0.02 (2%) for RBD use-cases, and higher values may improve WAL performance under write-heavy workloads with fewer OSDs per node.
+    WARNING: An external DB carries important caveats. An external DB is only suggested for relatively slow OSD devices (e.g. SATA SSDs) when there is also a much faster, more robust, but smaller storage device in the system (e.g. an NVMe or 3DXPoint SSD) which can accelerate the OSD. An external DB is NOT recommended for NVMe OSDs as this will hamper performance and reliability. Additionally, it is important to note that the OSD will depend entirely on this external DB device; they cannot be separated without destroying the OSD, and the OSD cannot function without the external DB device, thus introducing a single point of failure. Use this feature with extreme care.
+
+    The "-c"/"--osd-count" option allows the splitting of a single block device into multiple logical OSDs. This is recommended in the Ceph literature for extremely fast OSD block devices (i.e. NVMe or 3DXPoint) which can saturate a single OSD process. Usually, 2 or 4 OSDs is recommended, based on the size and performance of the OSD disk; more than 4 OSDs per volume is not recommended, and this option is not recommended for SATA SSDs.
+
+    Note that, if "-c"/"--osd-count" is specified, the provided "-w"/"--weight" will be the weight of EACH created OSD, not the block device as a whole. Ensure you take this into account if mixing and matching OSD block devices. Additionally, if "-r"/"--ext-db-ratio" or "-s"/"--ext-db-size" is specified, one DB LV will be created for EACH created OSD, of the given ratio/size per OSD; ratios are calculated from the OSD size, not the underlying device.
+
+    NOTE: This command may take a long time to complete. Observe the node logs of the hosting OSD node for detailed status.
    """

-    retcode, retmsg = pvc.lib.storage.ceph_osd_add(
-        CLI_CONFIG, node, device, weight, ext_db_flag, ext_db_ratio
+    echo(
+        CLI_CONFIG,
+        "Waiting for node task to complete, this may take some time... ",
+        newline=False,
    )
+    retcode, retmsg = pvc.lib.storage.ceph_osd_add(
+        CLI_CONFIG,
+        node,
+        device,
+        weight,
+        ext_db_ratio,
+        ext_db_size,
+        osd_count,
+    )
+    echo(CLI_CONFIG, "done.")
    finish(retcode, retmsg)


@@ -3435,30 +3467,68 @@ def cli_storage_osd_add(node, device, weight, ext_db_flag, ext_db_ratio):
@click.command(name="replace", short_help="Replace OSD block device.")
@connection_req
@click.argument("osdid")
-@click.argument("device")
+@click.argument("new_device")
+@click.option(
+    "-o",
+    "--old-device",
+    "old_device",
+    default=None,
+    help="The old OSD block device, if known and valid",
+)
@click.option(
    "-w",
    "--weight",
    "weight",
-    default=1.0,
-    show_default=True,
-    help="New weight of the OSD within the CRUSH map.",
+    default=None,
+    help="New weight of the OSD(s) within the CRUSH map; if unset, old weight is used",
 )
-@confirm_opt("Replace OSD {osdid} with block device {device} weight {weight}")
-def cli_storage_osd_replace(osdid, device, weight):
+@click.option(
+    "-r",
+    "--ext-db-ratio",
+    "ext_db_ratio",
+    default=None,
+    help="Create a new external database logical volume for the OSD(s) with this decimal ratio of the DB LV to the OSD size; if unset, old ext_db_size is used",
+)
+@click.option(
+    "-s",
+    "--ext-db-size",
+    "ext_db_size",
+    default=None,
+    help="Create a new external database logical volume for the OSD(s) with this human-unit size; if unset, old ext_db_size is used",
+)
+@confirm_opt(
+    "Destroy all data on and replace OSD {osdid} (and peer split OSDs) with new device {new_device}"
+)
+def cli_storage_osd_replace(
+    osdid, new_device, old_device, weight, ext_db_ratio, ext_db_size
+):
    """
-    Replace the block device of an existing OSD with ID OSDID with DEVICE. Use this command to replace a failed or smaller OSD block device with a new one.
+    Replace the block device of an existing OSD with ID OSDID, and any peer split OSDs with the same block device, with NEW_DEVICE. Use this command to replace a failed or smaller OSD block device with a new one in one command.

-    DEVICE must be a valid block device path (e.g. '/dev/sda', '/dev/nvme0n1', '/dev/disk/by-path/...', '/dev/disk/by-id/...') or a "detect" string. Using partitions is not supported. A "detect" string is a string in the form "detect:<NAME>:<HUMAN-SIZE>:<ID>". For details, see 'pvc storage osd add --help'.
+    NEW_DEVICE must be a valid block device path (e.g. '/dev/nvme0n1', '/dev/disk/by-path/...') or a "detect" string. Partitions are NOT supported. A "detect" string is a string in the form "detect:<NAME>:<HUMAN-SIZE>:<ID>". For details, see 'pvc storage osd add --help'. The path or detect string must be valid on the current node housing the OSD.

-    The weight of an OSD should reflect the ratio of the OSD to other OSDs in the storage cluster. For details, see 'pvc storage osd add --help'. Note that the current weight must be explicitly specified if it differs from the default.
+    If OSDID is part of a split OSD set, any peer split OSDs with the same configured block device will be replaced as well. The split count will be retained and cannot be changed with this command; to do so, all OSDs in the split OSD set must be removed and new OSD(s) created.

-    Existing IDs, external DB devices, etc. of the OSD will be preserved; data will be lost and rebuilt from the remaining healthy OSDs.
+    WARNING: This operation entails (and is functionally equivalent to) a removal and recreation of the specified OSD and, if applicable, all peer split OSDs. This is an intensive and potentially destructive action. Ensure that the cluster is otherwise healthy before proceeding, and ensure the subsequent rebuild completes successfully. Do not attempt this operation on a severely degraded cluster without first considering the possible data loss implications.
+
+    If the "-o"/"--old-device" option is specified, is a valid block device on the node, is readable/accessible, and contains the metadata for the specified OSD, it will be zapped. If this option is not specified, the system will try to find the old block device automatically to zap it. If it can't be found, the OSD will simply be removed from the CRUSH map and PVC database before recreating. This option can provide a cleaner deletion when replacing a working device that has a different block path, but is otherwise unnecessary.
+
+    The "-w"/"--weight", "-r"/"--ext-db-ratio", and "-s"/"--ext-db-size" options allow overriding the existing weight and external DB LV for the OSD(s), if desired. If unset, the existing weight and external DB LV size (if applicable) will be used for the replacement OSD(s) instead.
+
+    NOTE: If neither the "-r"/"--ext-db-ratio" nor the "-s"/"--ext-db-size" option is specified, and the OSD(s) had an external DB LV, it cannot be removed; a new DB LV will be created for the replacement OSD(s), and this cannot be avoided. However, if the OSD(s) did not have an external DB LV, and one of these options is specified, a new DB LV will be added to the new OSD.
+
+    NOTE: This command may take a long time to complete. Observe the node logs of the hosting OSD node for detailed status.
    """

+    echo(
+        CLI_CONFIG,
+        "Waiting for node task to complete, this may take some time... ",
+        newline=False,
+    )
    retcode, retmsg = pvc.lib.storage.ceph_osd_replace(
-        CLI_CONFIG, osdid, device, weight
+        CLI_CONFIG, osdid, new_device, old_device, weight, ext_db_ratio, ext_db_size
    )
+    echo(CLI_CONFIG, "done.")
    finish(retcode, retmsg)


@@ -3474,13 +3544,22 @@ def cli_storage_osd_refresh(osdid, device):
    """
    Refresh (reimport) the block DEVICE of an existing OSD with ID OSDID. Use this command to reimport a working OSD into a rebuilt/replaced node.

-    DEVICE must be a valid block device path (e.g. '/dev/sda', '/dev/nvme0n1', '/dev/disk/by-path/...', '/dev/disk/by-id/...') or a "detect" string. Using partitions is not supported. A "detect" string is a string in the form "detect:<NAME>:<HUMAN-SIZE>:<ID>". For details, see 'pvc storage osd add --help'.
+    DEVICE must be a valid block device path (e.g. '/dev/nvme0n1', '/dev/disk/by-path/...') or a "detect" string. Partitions are NOT supported. A "detect" string is a string in the form "detect:<NAME>:<HUMAN-SIZE>:<ID>". For details, see 'pvc storage osd add --help'. The path or detect string must be valid on the current node housing the OSD.

-    Existing data, IDs, weights, etc. of the OSD will be preserved.
+    Existing data, IDs, weights, DB LVs, etc. of the OSD will be preserved. Any split peer OSD(s) on the same block device will also be automatically refreshed.

-    NOTE: If a device had an external DB device, this is not automatically handled at this time. It is best to remove and re-add the OSD instead.
+    NOTE: If the OSD(s) had an external DB device, it must exist before refreshing the OSD. If it can't be found, the OSD cannot be reimported and must be recreated.
+
+    NOTE: This command may take a long time to complete. Observe the node logs of the hosting OSD node for detailed status.
    """
+
+    echo(
+        CLI_CONFIG,
+        "Waiting for node task to complete, this may take some time... ",
+        newline=False,
+    )
    retcode, retmsg = pvc.lib.storage.ceph_osd_refresh(CLI_CONFIG, osdid, device)
+    echo(CLI_CONFIG, "done.")
    finish(retcode, retmsg)


@@ -3506,9 +3585,17 @@ def cli_storage_osd_remove(osdid, force_flag):
    DANGER: This will completely remove the OSD from the cluster. OSDs will rebalance which will negatively affect performance and available space. It is STRONGLY RECOMMENDED to set an OSD out (using 'pvc storage osd out') and allow the cluster to fully rebalance, verified with 'pvc storage status', before removing an OSD.

    NOTE: The "-f"/"--force" option is useful after replacing a failed node, to ensure the OSD is removed even if the OSD in question does not properly exist on the node after a rebuild.
+
+    NOTE: This command may take a long time to complete. Observe the node logs of the hosting OSD node for detailed status.
    """

+    echo(
+        CLI_CONFIG,
+        "Waiting for node task to complete, this may take some time... ",
+        newline=False,
+    )
    retcode, retmsg = pvc.lib.storage.ceph_osd_remove(CLI_CONFIG, osdid, force_flag)
+    echo(CLI_CONFIG, "done.")
    finish(retcode, retmsg)


@@ -5577,7 +5664,7 @@ def cli_connection_add(
    scheme = "https" if ssl_flag else "http"

    # Get the store data
-    connections_config = get_store(store_path)
+    connections_config = get_store(CLI_CONFIG["store_path"])

    # Add (or update) the new connection details
    connections_config[name] = {
@@ -5589,7 +5676,7 @@ def cli_connection_add(
    }

    # Update the store data
-    update_store(store_path, connections_config)
+    update_store(CLI_CONFIG["store_path"], connections_config)

    finish(
        True,
@@ -5613,7 +5700,7 @@ def cli_connection_remove(
    """

    # Get the store data
-    connections_config = get_store(store_path)
+    connections_config = get_store(CLI_CONFIG["store_path"])

    # Remove the entry matching the name
    try:
@@ -5622,7 +5709,7 @@ def cli_connection_remove(
        finish(False, f"""No connection found with name "{name}" in local database""")

    # Update the store data
-    update_store(store_path, connections_config)
+    update_store(CLI_CONFIG["store_path"], connections_config)

    finish(True, f"""Removed connection "{name}" from client database""")

@@ -5665,7 +5752,7 @@ def cli_connection_list(
        "json-pretty": Output in formatted JSON.
    """

-    connections_config = get_store(store_path)
+    connections_config = get_store(CLI_CONFIG["store_path"])
    connections_data = cli_connection_list_parser(connections_config, show_keys_flag)
    finish(True, connections_data, format_function)

@@ -5703,7 +5790,7 @@ def cli_connection_detail(
        newline=False,
        stderr=True,
    )
-    connections_config = get_store(store_path)
+    connections_config = get_store(CLI_CONFIG["store_path"])
    connections_data = cli_connection_detail_parser(connections_config)
    echo(CLI_CONFIG, "done.", stderr=True)
    echo(CLI_CONFIG, "", stderr=True)
@@ -5843,6 +5930,7 @@ def cli(
        CLI_CONFIG["colour"] = _colour
        CLI_CONFIG["quiet"] = _quiet
        CLI_CONFIG["silent"] = _silent
+        CLI_CONFIG["store_path"] = store_path

    audit()

--- a/client-cli/pvc/cli/helpers.py
+++ b/client-cli/pvc/cli/helpers.py
@@ -208,7 +208,7 @@ def wait_for_provisioner(CLI_CONFIG, task_id):
        )
        if task_status.get("state") != "PENDING":
            break
-        echo(".", newline=False)
+        echo(CLI_CONFIG, ".", newline=False)
    echo(CLI_CONFIG, " done.")
    echo(CLI_CONFIG, "")

--- a/client-cli/pvc/lib/storage.py
+++ b/client-cli/pvc/lib/storage.py
@@ -231,21 +231,27 @@ def ceph_osd_list(config, limit):
        return False, response.json().get("message", "")


-def ceph_osd_add(config, node, device, weight, ext_db_flag, ext_db_ratio):
+def ceph_osd_add(config, node, device, weight, ext_db_ratio, ext_db_size, osd_count):
    """
    Add new Ceph OSD

    API endpoint: POST /api/v1/storage/ceph/osd
-    API arguments: node={node}, device={device}, weight={weight}, ext_db={ext_db_flag}, ext_db_ratio={ext_db_ratio}
+    API arguments: node={node}, device={device}, weight={weight}, [ext_db_ratio={ext_db_ratio}, ext_db_size={ext_db_size}, osd_count={osd_count}]
    API schema: {"message":"{data}"}
    """
    params = {
        "node": node,
        "device": device,
        "weight": weight,
-        "ext_db": ext_db_flag,
-        "ext_db_ratio": ext_db_ratio,
    }
+
+    if ext_db_ratio is not None:
+        params["ext_db_ratio"] = ext_db_ratio
+    if ext_db_size is not None:
+        params["ext_db_size"] = ext_db_size
+    if osd_count is not None:
+        params["osd_count"] = osd_count
+
    response = call_api(config, "post", "/storage/ceph/osd", params=params)

    if response.status_code == 200:
@@ -256,15 +262,30 @@ def ceph_osd_add(config, node, device, weight, ext_db_flag, ext_db_ratio):
    return retstatus, response.json().get("message", "")


-def ceph_osd_replace(config, osdid, device, weight):
+def ceph_osd_replace(
+    config, osdid, new_device, old_device, weight, ext_db_ratio, ext_db_size
+):
    """
    Replace an existing Ceph OSD with a new device

    API endpoint: POST /api/v1/storage/ceph/osd/{osdid}
-    API arguments: device={device}, weight={weight}
+    API arguments: new_device, [old_device={old_device}, weight={weight}, ext_db_ratio={ext_db_ratio}, ext_db_size={ext_db_size}]
    API schema: {"message":"{data}"}
    """
-    params = {"device": device, "weight": weight, "yes-i-really-mean-it": "yes"}
+    params = {
+        "new_device": new_device,
+        "yes-i-really-mean-it": "yes",
+    }
+
+    if old_device is not None:
+        params["old_device"] = old_device
+    if weight is not None:
+        params["weight"] = weight
+    if ext_db_ratio is not None:
+        params["ext_db_ratio"] = ext_db_ratio
+    if ext_db_size is not None:
+        params["ext_db_size"] = ext_db_size
+
    response = call_api(config, "post", f"/storage/ceph/osd/{osdid}", params=params)

    if response.status_code == 200:
@@ -400,7 +421,6 @@ def format_list_osd(config, osd_list):
    osd_used_length = 5
    osd_free_length = 6
    osd_util_length = 6
-    osd_var_length = 5
    osd_wrops_length = 4
    osd_wrdata_length = 5
    osd_rdops_length = 4
@@ -433,8 +453,14 @@ def format_list_osd(config, osd_list):
            )
            continue

+        if osd_information["is_split"]:
+            osd_information["device"] = f"{osd_information['device']} [s]"
+
        # Deal with the size to human readable
+        if isinstance(osd_information["stats"]["kb"], int):
            osd_information["stats"]["size"] = osd_information["stats"]["kb"] * 1024
+        else:
+            osd_information["stats"]["size"] = "N/A"
        for datatype in "size", "wr_data", "rd_data":
            databytes = osd_information["stats"][datatype]
            if isinstance(databytes, int):
@@ -503,10 +529,6 @@ def format_list_osd(config, osd_list):
        if _osd_util_length > osd_util_length:
            osd_util_length = _osd_util_length

-        _osd_var_length = len(str(osd_information["stats"]["var"])) + 1
-        if _osd_var_length > osd_var_length:
-            osd_var_length = _osd_var_length
-
        # Set the read/write IOPS/data and length
        _osd_wrops_length = len(osd_information["stats"]["wr_ops"]) + 1
        if _osd_wrops_length > osd_wrops_length:
@@ -542,8 +564,7 @@ def format_list_osd(config, osd_list):
            + osd_used_length
            + osd_free_length
            + osd_util_length
-            + osd_var_length
-            + 7,
+            + 6,
            read_header_length=osd_rdops_length + osd_rddata_length + 1,
            write_header_length=osd_wrops_length + osd_wrdata_length + 1,
            osd_header="OSDs "
@@ -575,8 +596,7 @@ def format_list_osd(config, osd_list):
                        + osd_used_length
                        + osd_free_length
                        + osd_util_length
-                        + osd_var_length
-                        + 6,
+                        + 5,
                    )
                ]
            ),
@@ -602,7 +622,6 @@ def format_list_osd(config, osd_list):
 {osd_used: <{osd_used_length}} \
 {osd_free: <{osd_free_length}} \
 {osd_util: <{osd_util_length}} \
-{osd_var: <{osd_var_length}} \
 {osd_rdops: <{osd_rdops_length}} \
 {osd_rddata: <{osd_rddata_length}} \
 {osd_wrops: <{osd_wrops_length}} \
@@ -623,7 +642,6 @@ def format_list_osd(config, osd_list):
            osd_used_length=osd_used_length,
            osd_free_length=osd_free_length,
            osd_util_length=osd_util_length,
-            osd_var_length=osd_var_length,
            osd_wrops_length=osd_wrops_length,
            osd_wrdata_length=osd_wrdata_length,
            osd_rdops_length=osd_rdops_length,
@@ -641,7 +659,6 @@ def format_list_osd(config, osd_list):
            osd_used="Used",
            osd_free="Free",
            osd_util="Util%",
-            osd_var="Var",
            osd_wrops="OPS",
            osd_wrdata="Data",
            osd_rdops="OPS",
@@ -674,7 +691,6 @@ def format_list_osd(config, osd_list):
 {osd_used: <{osd_used_length}} \
 {osd_free: <{osd_free_length}} \
 {osd_util: <{osd_util_length}} \
-{osd_var: <{osd_var_length}} \
 {osd_rdops: <{osd_rdops_length}} \
 {osd_rddata: <{osd_rddata_length}} \
 {osd_wrops: <{osd_wrops_length}} \
@@ -696,7 +712,6 @@ def format_list_osd(config, osd_list):
                osd_used_length=osd_used_length,
                osd_free_length=osd_free_length,
                osd_util_length=osd_util_length,
-                osd_var_length=osd_var_length,
                osd_wrops_length=osd_wrops_length,
                osd_wrdata_length=osd_wrdata_length,
                osd_rdops_length=osd_rdops_length,
@@ -716,7 +731,6 @@ def format_list_osd(config, osd_list):
                osd_used=osd_information["stats"]["used"],
                osd_free=osd_information["stats"]["avail"],
                osd_util=osd_information["stats"]["utilization"],
-                osd_var=osd_information["stats"]["var"],
                osd_wrops=osd_information["stats"]["wr_ops"],
                osd_wrdata=osd_information["stats"]["wr_data"],
                osd_rdops=osd_information["stats"]["rd_ops"],
--- a/daemon-common/ceph.py
+++ b/daemon-common/ceph.py
@@ -26,6 +26,7 @@ import time
 import math

 from concurrent.futures import ThreadPoolExecutor
+from distutils.util import strtobool

 import daemon_lib.vm as vm
 import daemon_lib.common as common
@@ -209,8 +210,10 @@ def getClusterOSDList(zkhandler):

 def getOSDInformation(zkhandler, osd_id):
    # Get the devices
+    osd_fsid = zkhandler.read(("osd.ofsid", osd_id))
    osd_node = zkhandler.read(("osd.node", osd_id))
    osd_device = zkhandler.read(("osd.device", osd_id))
+    osd_is_split = bool(strtobool(zkhandler.read(("osd.is_split", osd_id))))
    osd_db_device = zkhandler.read(("osd.db_device", osd_id))
    # Parse the stats data
    osd_stats_raw = zkhandler.read(("osd.stats", osd_id))
@@ -218,8 +221,10 @@ def getOSDInformation(zkhandler, osd_id):

    osd_information = {
        "id": osd_id,
+        "fsid": osd_fsid,
        "node": osd_node,
        "device": osd_device,
+        "is_split": osd_is_split,
        "db_device": osd_db_device,
        "stats": osd_stats,
    }
@@ -266,7 +271,22 @@ def add_osd_db_vg(zkhandler, node, device):

 # OSD actions use the /cmd/ceph pipe
 # These actions must occur on the specific node they reference
-def add_osd(zkhandler, node, device, weight, ext_db_flag=False, ext_db_ratio=0.05):
+def add_osd(
+    zkhandler,
+    node,
+    device,
+    weight,
+    ext_db_ratio=None,
+    ext_db_size=None,
+    split_count=None,
+):
+    # Verify that options are valid
+    if ext_db_ratio is not None and ext_db_size is not None:
+        return (
+            False,
+            "ERROR: Both an ext_db_ratio and ext_db_size were specified; choose only one.",
+        )
+
    # Verify the target node exists
    if not common.verifyNode(zkhandler, node):
        return False, 'ERROR: No node named "{}" is present in the cluster.'.format(
@@ -284,8 +304,8 @@ def add_osd(zkhandler, node, device, weight, ext_db_flag=False, ext_db_ratio=0.0
        )

    # Tell the cluster to create a new OSD for the host
-    add_osd_string = "osd_add {},{},{},{},{}".format(
-        node, device, weight, ext_db_flag, ext_db_ratio
+    add_osd_string = "osd_add {},{},{},{},{},{}".format(
+        node, device, weight, ext_db_ratio, ext_db_size, split_count
    )
    zkhandler.write([("base.cmd.ceph", add_osd_string)])
    # Wait 1/2 second for the cluster to get the message and start working
@@ -295,14 +315,10 @@ def add_osd(zkhandler, node, device, weight, ext_db_flag=False, ext_db_ratio=0.0
        try:
            result = zkhandler.read("base.cmd.ceph").split()[0]
            if result == "success-osd_add":
-                message = 'Created new OSD with block device "{}" on node "{}".'.format(
-                    device, node
-                )
+                message = f'Created {split_count} new OSD(s) on node "{node}" block device "{device}"'
                success = True
            else:
-                message = (
-                    "ERROR: Failed to create new OSD; check node logs for details."
-                )
+                message = "ERROR: Failed to create OSD(s); check node logs for details."
                success = False
        except Exception:
            message = "ERROR: Command ignored by node."
@@ -316,12 +332,18 @@ def add_osd(zkhandler, node, device, weight, ext_db_flag=False, ext_db_ratio=0.0
    return success, message


-def replace_osd(zkhandler, osd_id, new_device, weight):
+def replace_osd(
+    zkhandler,
+    osd_id,
+    new_device,
+    old_device=None,
+    weight=None,
+    ext_db_ratio=None,
+    ext_db_size=None,
+):
    # Get current OSD information
    osd_information = getOSDInformation(zkhandler, osd_id)
    node = osd_information["node"]
-    old_device = osd_information["device"]
-    ext_db_flag = True if osd_information["db_device"] else False

    # Verify target block device isn't in use
    block_osd = verifyOSDBlock(zkhandler, node, new_device)
@@ -334,8 +356,8 @@ def replace_osd(zkhandler, osd_id, new_device, weight):
        )

    # Tell the cluster to create a new OSD for the host
-    replace_osd_string = "osd_replace {},{},{},{},{},{}".format(
-        node, osd_id, old_device, new_device, weight, ext_db_flag
+    replace_osd_string = "osd_replace {},{},{},{},{},{},{}".format(
+        node, osd_id, new_device, old_device, weight, ext_db_ratio, ext_db_size
    )
    zkhandler.write([("base.cmd.ceph", replace_osd_string)])
    # Wait 1/2 second for the cluster to get the message and start working
@@ -370,16 +392,6 @@ def refresh_osd(zkhandler, osd_id, device):
    node = osd_information["node"]
    ext_db_flag = True if osd_information["db_device"] else False

-    # Verify target block device isn't in use
-    block_osd = verifyOSDBlock(zkhandler, node, device)
-    if not block_osd or block_osd != osd_id:
-        return (
-            False,
-            'ERROR: Block device "{}" on node "{}" is not used by OSD "{}"; use replace instead'.format(
-                device, node, osd_id
-            ),
-        )
-
    # Tell the cluster to create a new OSD for the host
    refresh_osd_string = "osd_refresh {},{},{},{}".format(
        node, osd_id, device, ext_db_flag
--- a/daemon-common/migrations/versions/10.json
+++ b/daemon-common/migrations/versions/10.json
@@ -0,0 +1 @@
+{"version": "10", "root": "", "base": {"root": "", "schema": "/schema", "schema.version": "/schema/version", "config": "/config", "config.maintenance": "/config/maintenance", "config.primary_node": "/config/primary_node", "config.primary_node.sync_lock": "/config/primary_node/sync_lock", "config.upstream_ip": "/config/upstream_ip", "config.migration_target_selector": "/config/migration_target_selector", "cmd": "/cmd", "cmd.node": "/cmd/nodes", "cmd.domain": "/cmd/domains", "cmd.ceph": "/cmd/ceph", "logs": "/logs", "node": "/nodes", "domain": "/domains", "network": "/networks", "storage": "/ceph", "storage.health": "/ceph/health", "storage.util": "/ceph/util", "osd": "/ceph/osds", "pool": "/ceph/pools", "volume": "/ceph/volumes", "snapshot": "/ceph/snapshots"}, "logs": {"node": "", "messages": "/messages"}, "node": {"name": "", "keepalive": "/keepalive", "mode": "/daemonmode", "data.active_schema": "/activeschema", "data.latest_schema": "/latestschema", "data.static": "/staticdata", "data.pvc_version": "/pvcversion", "running_domains": "/runningdomains", "count.provisioned_domains": "/domainscount", "count.networks": "/networkscount", "state.daemon": "/daemonstate", "state.router": "/routerstate", "state.domain": "/domainstate", "cpu.load": "/cpuload", "vcpu.allocated": "/vcpualloc", "memory.total": "/memtotal", "memory.used": "/memused", "memory.free": "/memfree", "memory.allocated": "/memalloc", "memory.provisioned": "/memprov", "ipmi.hostname": "/ipmihostname", "ipmi.username": "/ipmiusername", "ipmi.password": "/ipmipassword", "sriov": "/sriov", "sriov.pf": "/sriov/pf", "sriov.vf": "/sriov/vf", "monitoring.plugins": "/monitoring_plugins", "monitoring.data": "/monitoring_data", "monitoring.health": "/monitoring_health"}, "monitoring_plugin": {"name": "", "last_run": "/last_run", "health_delta": "/health_delta", "message": "/message", "data": "/data", "runtime": "/runtime"}, "sriov_pf": {"phy": "", "mtu": "/mtu", "vfcount": "/vfcount"}, "sriov_vf": {"phy": "", 
"pf": "/pf", "mtu": "/mtu", "mac": "/mac", "phy_mac": "/phy_mac", "config": "/config", "config.vlan_id": "/config/vlan_id", "config.vlan_qos": "/config/vlan_qos", "config.tx_rate_min": "/config/tx_rate_min", "config.tx_rate_max": "/config/tx_rate_max", "config.spoof_check": "/config/spoof_check", "config.link_state": "/config/link_state", "config.trust": "/config/trust", "config.query_rss": "/config/query_rss", "pci": "/pci", "pci.domain": "/pci/domain", "pci.bus": "/pci/bus", "pci.slot": "/pci/slot", "pci.function": "/pci/function", "used": "/used", "used_by": "/used_by"}, "domain": {"name": "", "xml": "/xml", "state": "/state", "profile": "/profile", "stats": "/stats", "node": "/node", "last_node": "/lastnode", "failed_reason": "/failedreason", "storage.volumes": "/rbdlist", "console.log": "/consolelog", "console.vnc": "/vnc", "meta.autostart": "/node_autostart", "meta.migrate_method": "/migration_method", "meta.node_selector": "/node_selector", "meta.node_limit": "/node_limit", "meta.tags": "/tags", "migrate.sync_lock": "/migrate_sync_lock"}, "tag": {"name": "", "type": "/type", "protected": "/protected"}, "network": {"vni": "", "type": "/nettype", "mtu": "/mtu", "rule": "/firewall_rules", "rule.in": "/firewall_rules/in", "rule.out": "/firewall_rules/out", "nameservers": "/name_servers", "domain": "/domain", "reservation": "/dhcp4_reservations", "lease": "/dhcp4_leases", "ip4.gateway": "/ip4_gateway", "ip4.network": "/ip4_network", "ip4.dhcp": "/dhcp4_flag", "ip4.dhcp_start": "/dhcp4_start", "ip4.dhcp_end": "/dhcp4_end", "ip6.gateway": "/ip6_gateway", "ip6.network": "/ip6_network", "ip6.dhcp": "/dhcp6_flag"}, "reservation": {"mac": "", "ip": "/ipaddr", "hostname": "/hostname"}, "lease": {"mac": "", "ip": "/ipaddr", "hostname": "/hostname", "expiry": "/expiry", "client_id": "/clientid"}, "rule": {"description": "", "rule": "/rule", "order": "/order"}, "osd": {"id": "", "node": "/node", "device": "/device", "db_device": "/db_device", "fsid": "/fsid", "ofsid": 
"/fsid/osd", "cfsid": "/fsid/cluster", "lvm": "/lvm", "vg": "/lvm/vg", "lv": "/lvm/lv", "is_split": "/is_split", "stats": "/stats"}, "pool": {"name": "", "pgs": "/pgs", "tier": "/tier", "stats": "/stats"}, "volume": {"name": "", "stats": "/stats"}, "snapshot": {"name": "", "stats": "/stats"}}
--- a/daemon-common/zkhandler.py
+++ b/daemon-common/zkhandler.py
@@ -540,7 +540,7 @@ class ZKHandler(object):
 #
 class ZKSchema(object):
    # Current version
-    _version = 9
+    _version = 10

    # Root for doing nested keys
    _schema_root = ""
@@ -719,6 +719,7 @@ class ZKSchema(object):
            "lvm": "/lvm",
            "vg": "/lvm/vg",
            "lv": "/lvm/lv",
+            "is_split": "/is_split",
            "stats": "/stats",
        },
        # The schema of an individual pool entry (/ceph/pools/{pool_name})
@@ -963,7 +964,9 @@ class ZKSchema(object):
                    kpath = f"{elem}.{ikey}"
                    # Validate that the key exists for that child
                    if not zkhandler.zk_conn.exists(self.path(kpath, child)):
-                        if elem == "pool" and ikey == "tier":
+                        if elem == "osd" and ikey == "is_split":
+                            default_data = "False"
+                        elif elem == "pool" and ikey == "tier":
                            default_data = "default"
                        else:
                            default_data = ""
--- a/debian/control
+++ b/debian/control
@@ -16,7 +16,7 @@ Description: Parallel Virtual Cluster node daemon (Python 3)

 Package: pvc-daemon-api
 Architecture: all
-Depends: systemd, pvc-daemon-common, python3-yaml, python3-flask, python3-flask-restful, python3-celery, python-celery-common, python3-distutils, redis, python3-redis, python3-lxml, python3-flask-migrate, fio
+Depends: systemd, pvc-daemon-common, python3-yaml, python3-flask, python3-flask-restful, python3-celery, python-celery-common, python3-distutils, python3-redis, python3-lxml, python3-flask-migrate, fio
 Description: Parallel Virtual Cluster API daemon (Python 3)
 A KVM/Zookeeper/Ceph-based VM and private cloud manager
 .
--- a/node-daemon/pvcnoded/objects/CephInstance.py
+++ b/node-daemon/pvcnoded/objects/CephInstance.py
--- a/node-daemon/pvcnoded/util/keepalive.py
+++ b/node-daemon/pvcnoded/util/keepalive.py
@@ -350,6 +350,7 @@ def collect_ceph_stats(logger, config, zkhandler, this_node, queue):
            elif line[0] == "+":
                continue

+            try:
                # If line begins with | and second entry is a digit (i.e. OSD ID)
                if line[0] == "|" and line[1].isdigit():
                    # Parse the line in Ceph 14 format
@@ -377,6 +378,8 @@ def collect_ceph_stats(logger, config, zkhandler, this_node, queue):
                # Otherwise, it's the header line and is ignored
                else:
                    continue
+            except IndexError:
+                continue

            # I don't know why 2018 me used this construct instead of a normal
            # dictionary update, but it works so not changing it.
--- a/node-daemon/pvcnoded/util/services.py
+++ b/node-daemon/pvcnoded/util/services.py
@@ -69,6 +69,20 @@ def start_ceph_mgr(logger, config):
        )


+def start_keydb(logger, config):
+    if config["enable_api"] and config["daemon_mode"] == "coordinator":
+        logger.out("Starting KeyDB daemon", state="i")
+        # TODO: Move our handling out of Systemd and integrate it directly as a subprocess?
+        common.run_os_command("systemctl start keydb-server.service")
+
+
+def start_api_worker(logger, config):
+    if config["enable_api"]:
+        logger.out("Starting API worker daemon", state="i")
+        # TODO: Move our handling out of Systemd and integrate it directly as a subprocess?
+        common.run_os_command("systemctl start pvcapid-worker.service")
+
+
 def start_system_services(logger, config):
    start_zookeeper(logger, config)
    start_libvirtd(logger, config)
@@ -76,6 +90,8 @@ def start_system_services(logger, config):
    start_frrouting(logger, config)
    start_ceph_mon(logger, config)
    start_ceph_mgr(logger, config)
+    start_keydb(logger, config)
+    start_api_worker(logger, config)

    logger.out("Waiting 10 seconds for daemons to start", state="s")
    sleep(10)
Author	SHA1	Message	Date
Joshua M. Boniface	2c15036f86	Add KeyDB to node startup services. Also ensure API worker starts on all nodes, not just coordinators.	2023-11-05 19:26:38 -05:00
Joshua M. Boniface	42ed6f6420	Remove redis as a dependency	2023-11-05 18:23:34 -05:00
Joshua M. Boniface	3dc1f57de2	Revert "Switch to ZK+PG over Redis for Celery queue" This reverts commit `54215bab6c`.	2023-11-05 17:10:46 -05:00
Joshua M. Boniface	b99b4e64b2	Ensure store path is passed properly	2023-11-05 16:48:47 -05:00
Joshua M. Boniface	91af1175ef	Fix missing CLI_CONFIG in echo()	2023-11-04 15:17:50 -04:00
Joshua M. Boniface	af8a8d969e	Ensure queues are set up for non-coordinator nodes. Allows a runner to operate on every possible node, not just coordinators, as OSDs or other things could be on any node. Also add more comments.	2023-11-04 15:05:07 -04:00
Joshua M. Boniface	a6caac1b78	Add Celery queue routing for tasks. By default, tasks will continue to run as they did, on the primary coordinator's task runner. However this opens the possibility for defining more tasks that will run on other nodes or coordinators.	2023-11-04 14:29:59 -04:00
Joshua M. Boniface	30d7e49401	Start API worker with node daemon on coordinators	2023-11-04 13:08:16 -04:00
Joshua M. Boniface	ab629f6b51	Use per-host hostname and queues in worker. Opens up the ability to direct tasks to specific workers.	2023-11-04 13:02:30 -04:00
Joshua M. Boniface	54215bab6c	Switch to ZK+PG over Redis for Celery queue. Redis did not provide a distributed solution for the worker, which precluded several important planned functions. So instead, move to using Zookeeper + PostgreSQL as the broker and result backend respectively. Should be a seamless drop-in change but for future uses requires the database host to be the primary coordinator IP rather than localhost, so that writes can occur to the database from non-primary hosts.	2023-11-04 12:46:34 -04:00
Joshua M. Boniface	7490f13b7c	Check for partition tables on new devices	2023-11-04 03:13:58 -04:00
Joshua M. Boniface	d1602f35de	Adjust split indicator	2023-11-04 02:56:21 -04:00
Joshua M. Boniface	7cdedde2fb	Adjust wording about extdb	2023-11-04 02:54:25 -04:00
Joshua M. Boniface	ab156b14b7	Update help messages for OSD refresh	2023-11-04 02:47:04 -04:00
Joshua M. Boniface	a016337f57	Remove block verify in API. This doesn't work right and is handled by the node anyways.	2023-11-04 02:45:10 -04:00
Joshua M. Boniface	e32054be81	Refactor refresh as well	2023-11-04 02:44:52 -04:00
Joshua M. Boniface	18d32fede3	Fix wording of detect strings	2023-11-04 01:37:07 -04:00
Joshua M. Boniface	b3d13fe9be	Add log message for zap	2023-11-04 01:02:51 -04:00
Joshua M. Boniface	48b2ccbd95	Add timeout for safe-to-destroy. Continuously take the OSD down and out while doing so.	2023-11-04 00:55:05 -04:00
Joshua M. Boniface	1535078842	Fix lvremove, lvcreate, and update ZK details	2023-11-04 00:30:14 -04:00
Joshua M. Boniface	0e45613634	Use right key with correct data	2023-11-04 00:02:00 -04:00
Joshua M. Boniface	75135f6d5f	Avoid broken output format for new OSDs	2023-11-03 23:54:10 -04:00
Joshua M. Boniface	7f5dd385b5	Use right key for FSID elsewhere	2023-11-03 23:51:01 -04:00
Joshua M. Boniface	befce62925	Add OSD destroy before purge	2023-11-03 23:44:27 -04:00
Joshua M. Boniface	b0909aed61	Get proper FSID value	2023-11-03 23:38:24 -04:00
Joshua M. Boniface	f418b40527	Use proper FSID instead of hack	2023-11-03 16:38:19 -04:00
Joshua M. Boniface	ec42b19d0e	Send FSID to clients too	2023-11-03 16:37:55 -04:00
Joshua M. Boniface	dd0177ce10	Rework replacement procedure again. Avoid calling other functions; replicate the actual process from Ceph docs (https://docs.ceph.com/en/pacific/rados/operations/add-or-rm-osds/) to ensure things work out well (e.g. preserving OSD IDs).	2023-11-03 16:31:56 -04:00
Joshua M. Boniface	ed5bc9fb43	Fix numerous formatting and function bugs	2023-11-03 14:00:05 -04:00
Joshua M. Boniface	94d8d2cf75	Fix skip_zap_flag anomaly and add crush rm	2023-11-03 02:35:12 -04:00
Joshua M. Boniface	20497cf89d	Fix bugs and skip safe_to_destroy on force	2023-11-03 02:29:50 -04:00
Joshua M. Boniface	64e37ae963	Update OSD replacement functionality: 1. Simplify this by leveraging the existing remove_osd/add_osd functions, since its task was functionally identical to those two in sequential order. 2. Add support for split OSDs within the command (replacing all OSDs on the block device(s) as required). 3. Add additional configurability and flexibility around the old device, weight, and external DB LVs.	2023-11-03 01:45:49 -04:00
Joshua M. Boniface	3cb8a70f04	Add forcing to OSD purge	2023-11-02 23:20:48 -04:00
Joshua M. Boniface	44d2f98e75	Remove Var field from OSDs. Not super duper useful and increases length.	2023-11-02 22:55:39 -04:00
Joshua M. Boniface	cb91bf18a7	Fix incorrect variables	2023-11-02 22:39:32 -04:00
Joshua M. Boniface	a3e3fe829a	Adjust helptext for osd add	2023-11-02 22:34:58 -04:00
Joshua M. Boniface	f53af510c1	Avoid startup failures if OSD removed	2023-11-02 22:24:39 -04:00
Joshua M. Boniface	d5d783fad3	Set proper split flag	2023-11-02 22:20:22 -04:00
Joshua M. Boniface	8b8957547a	Adjust helptext for create-db-vg command	2023-11-02 22:14:25 -04:00
Joshua M. Boniface	980ea6a9e9	Adjust handling of ext_db and _count options. Avoid the use of superfluous flag options, default them to none, and add support for fixed-size DB LVs.	2023-11-02 13:29:47 -04:00
Joshua M. Boniface	0f433bd5eb	Add wait messages for OSD commands	2023-11-02 09:31:41 -04:00
Joshua M. Boniface	8780044be6	Ensure db_device is an empty string	2023-11-02 00:52:18 -04:00
Joshua M. Boniface	f08c654f22	Fix missing fstring	2023-11-01 21:41:06 -04:00
Joshua M. Boniface	80a7fd6195	Improve help text messages	2023-11-01 21:38:55 -04:00
Joshua M. Boniface	8b93f9a80e	Handle OSD index errors during stats collection	2023-11-01 21:33:40 -04:00
Joshua M. Boniface	526a5f4a74	Add support for split OSD adds. Allows creating multiple OSDs on a single (NVMe) block device, leveraging the "ceph-volume lvm batch" command. Replaces the previous method of creating OSDs. Also adds a new ZK item for each OSD indicating if it is split or not.	2023-11-01 21:31:35 -04:00
Joshua M. Boniface	aa0b1f504f	Fix output bug	2023-11-01 15:46:38 -04:00
Joshua M. Boniface	bc425b9224	Avoid duplicate confirmations in a safer way. This version instead still requires --yes with --restart to avoid the confirmation option, but avoids duplicate prompts. This might be slightly more cumbersome, but ensures consistency: every situation that could cause a restart is confirmed even if --restart is given.	2023-11-01 12:05:52 -04:00
Joshua M. Boniface	79e5c098cd	Revert "Remove duplicate confirmation for VM restart" This reverts commit `3c61a3ac03`.	2023-11-01 12:04:34 -04:00
Joshua M. Boniface	3c61a3ac03	Remove duplicate confirmation for VM restart. Having both restart_opt and confirm_opt resulted in a duplicate confirmation message, at least if neither --restart/--no-restart is specified. This is not necessary as the confirmation is already given by the restart_opt or the relevant --restart/--no-restart flag.	2023-11-01 12:02:34 -04:00
Joshua M. Boniface	988c777912	Properly handle live state with restart confirm. If "--live" is passed (the default), we shouldn't confirm to restart the VM as this is not required. Instead only confirm if "--no-live" was passed or if the flag doesn't exist.	2023-11-01 11:46:59 -04:00
				`@@ -0,0 +1 @@`
				{"version": "10", "root": "", "base": {"root": "", "schema": "/schema", "schema.version": "/schema/version", "config": "/config", "config.maintenance": "/config/maintenance", "config.primary_node": "/config/primary_node", "config.primary_node.sync_lock": "/config/primary_node/sync_lock", "config.upstream_ip": "/config/upstream_ip", "config.migration_target_selector": "/config/migration_target_selector", "cmd": "/cmd", "cmd.node": "/cmd/nodes", "cmd.domain": "/cmd/domains", "cmd.ceph": "/cmd/ceph", "logs": "/logs", "node": "/nodes", "domain": "/domains", "network": "/networks", "storage": "/ceph", "storage.health": "/ceph/health", "storage.util": "/ceph/util", "osd": "/ceph/osds", "pool": "/ceph/pools", "volume": "/ceph/volumes", "snapshot": "/ceph/snapshots"}, "logs": {"node": "", "messages": "/messages"}, "node": {"name": "", "keepalive": "/keepalive", "mode": "/daemonmode", "data.active_schema": "/activeschema", "data.latest_schema": "/latestschema", "data.static": "/staticdata", "data.pvc_version": "/pvcversion", "running_domains": "/runningdomains", "count.provisioned_domains": "/domainscount", "count.networks": "/networkscount", "state.daemon": "/daemonstate", "state.router": "/routerstate", "state.domain": "/domainstate", "cpu.load": "/cpuload", "vcpu.allocated": "/vcpualloc", "memory.total": "/memtotal", "memory.used": "/memused", "memory.free": "/memfree", "memory.allocated": "/memalloc", "memory.provisioned": "/memprov", "ipmi.hostname": "/ipmihostname", "ipmi.username": "/ipmiusername", "ipmi.password": "/ipmipassword", "sriov": "/sriov", "sriov.pf": "/sriov/pf", "sriov.vf": "/sriov/vf", "monitoring.plugins": "/monitoring_plugins", "monitoring.data": "/monitoring_data", "monitoring.health": "/monitoring_health"}, "monitoring_plugin": {"name": "", "last_run": "/last_run", "health_delta": "/health_delta", "message": "/message", "data": "/data", "runtime": "/runtime"}, "sriov_pf": {"phy": "", "mtu": "/mtu", "vfcount": "/vfcount"}, "sriov_vf": {"phy": "", 
"pf": "/pf", "mtu": "/mtu", "mac": "/mac", "phy_mac": "/phy_mac", "config": "/config", "config.vlan_id": "/config/vlan_id", "config.vlan_qos": "/config/vlan_qos", "config.tx_rate_min": "/config/tx_rate_min", "config.tx_rate_max": "/config/tx_rate_max", "config.spoof_check": "/config/spoof_check", "config.link_state": "/config/link_state", "config.trust": "/config/trust", "config.query_rss": "/config/query_rss", "pci": "/pci", "pci.domain": "/pci/domain", "pci.bus": "/pci/bus", "pci.slot": "/pci/slot", "pci.function": "/pci/function", "used": "/used", "used_by": "/used_by"}, "domain": {"name": "", "xml": "/xml", "state": "/state", "profile": "/profile", "stats": "/stats", "node": "/node", "last_node": "/lastnode", "failed_reason": "/failedreason", "storage.volumes": "/rbdlist", "console.log": "/consolelog", "console.vnc": "/vnc", "meta.autostart": "/node_autostart", "meta.migrate_method": "/migration_method", "meta.node_selector": "/node_selector", "meta.node_limit": "/node_limit", "meta.tags": "/tags", "migrate.sync_lock": "/migrate_sync_lock"}, "tag": {"name": "", "type": "/type", "protected": "/protected"}, "network": {"vni": "", "type": "/nettype", "mtu": "/mtu", "rule": "/firewall_rules", "rule.in": "/firewall_rules/in", "rule.out": "/firewall_rules/out", "nameservers": "/name_servers", "domain": "/domain", "reservation": "/dhcp4_reservations", "lease": "/dhcp4_leases", "ip4.gateway": "/ip4_gateway", "ip4.network": "/ip4_network", "ip4.dhcp": "/dhcp4_flag", "ip4.dhcp_start": "/dhcp4_start", "ip4.dhcp_end": "/dhcp4_end", "ip6.gateway": "/ip6_gateway", "ip6.network": "/ip6_network", "ip6.dhcp": "/dhcp6_flag"}, "reservation": {"mac": "", "ip": "/ipaddr", "hostname": "/hostname"}, "lease": {"mac": "", "ip": "/ipaddr", "hostname": "/hostname", "expiry": "/expiry", "client_id": "/clientid"}, "rule": {"description": "", "rule": "/rule", "order": "/order"}, "osd": {"id": "", "node": "/node", "device": "/device", "db_device": "/db_device", "fsid": "/fsid", "ofsid": 
"/fsid/osd", "cfsid": "/fsid/cluster", "lvm": "/lvm", "vg": "/lvm/vg", "lv": "/lvm/lv", "is_split": "/is_split", "stats": "/stats"}, "pool": {"name": "", "pgs": "/pgs", "tier": "/tier", "stats": "/stats"}, "volume": {"name": "", "stats": "/stats"}, "snapshot": {"name": "", "stats": "/stats"}}