Adjust handling of ext_db and _count options

Remove the superfluous ext_db and split flag options, default the remaining
ext_db_ratio, ext_db_size, and OSD count options to None, and add support for
fixed-size DB LVs.
This commit is contained in:
Joshua Boniface 2023-11-02 13:29:47 -04:00
parent 0f433bd5eb
commit 980ea6a9e9
6 changed files with 101 additions and 90 deletions

View File

@ -4281,20 +4281,16 @@ class API_Storage_Ceph_OSD_Root(Resource):
"required": True,
"helptext": "An OSD weight must be specified.",
},
{
"name": "ext_db",
"required": False,
},
{
"name": "ext_db_ratio",
"required": False,
},
{
"name": "split",
"name": "ext_db_size",
"required": False,
},
{
"name": "count",
"name": "osd_count",
"required": False,
},
]
@ -4303,7 +4299,7 @@ class API_Storage_Ceph_OSD_Root(Resource):
def post(self, reqargs):
"""
Add a Ceph OSD to the cluster
Note: This task may take up to 30s to complete and return
Note: This task may take up to 60s to complete and return
---
tags:
- storage / ceph
@ -4323,26 +4319,21 @@ class API_Storage_Ceph_OSD_Root(Resource):
type: number
required: true
description: The Ceph CRUSH weight for the OSD
- in: query
name: ext_db
type: boolean
required: false
description: Whether to use an external OSD DB LV device
- in: query
name: ext_db_ratio
type: float
required: false
description: Decimal ratio of total OSD size for the external OSD DB LV device, default 0.05 (5%)
description: If set, creates an OSD DB LV with this decimal ratio of DB to total OSD size (usually 0.05 i.e. 5%); mutually exclusive with ext_db_size
- in: query
name: split
type: boolean
name: ext_db_size
type: float
required: false
description: Whether to split the block device into multiple OSDs (recommended for NVMe devices)
description: If set, creates an OSD DB LV with this explicit size in human units (e.g. 1024M, 20G); mutually exclusive with ext_db_ratio
- in: query
name: count
name: osd_count
type: integer
required: false
description: If {split}, how many OSDs to create on the block device; usually 2 or 4 depending on size
description: If set, create this many OSDs on the block device instead of 1; usually 2 or 4 depending on size
responses:
200:
description: OK
@ -4359,10 +4350,9 @@ class API_Storage_Ceph_OSD_Root(Resource):
reqargs.get("node", None),
reqargs.get("device", None),
reqargs.get("weight", None),
reqargs.get("ext_db", False),
float(reqargs.get("ext_db_ratio", 0.05)),
reqargs.get("split", False),
reqargs.get("count", 1),
reqargs.get("ext_db_ratio", None),
reqargs.get("ext_db_size", None),
reqargs.get("osd_count", None),
)

View File

@ -1371,10 +1371,9 @@ def ceph_osd_add(
node,
device,
weight,
ext_db_flag=False,
ext_db_ratio=0.05,
split_flag=False,
split_count=1,
ext_db_ratio=None,
ext_db_size=None,
split_count=None,
):
"""
Add a Ceph OSD to the PVC Ceph storage cluster.
@ -1384,9 +1383,8 @@ def ceph_osd_add(
node,
device,
weight,
ext_db_flag,
ext_db_ratio,
split_flag,
ext_db_size,
split_count,
)

View File

@ -3394,31 +3394,30 @@ def cli_storage_osd_create_db_vg(node, device):
show_default=True,
help="Weight of the OSD(s) within the CRUSH map.",
)
@click.option(
"-d",
"--ext-db",
"ext_db_flag",
is_flag=True,
default=False,
help="Use an external database logical volume for these OSD(s).",
)
@click.option(
"-r",
"--ext-db-ratio",
"ext_db_ratio",
default=0.05,
show_default=True,
default=None,
type=float,
help="Decimal ratio of the external database logical volume to the OSD size.",
help="Create an external database logical volume for the OSD(s) with this decimal ratio of the DB LV to the OSD size.",
)
@click.option(
"-s",
"--split",
"split_count",
"--ext-db-size",
"ext_db_size",
default=None,
show_default=True,
help="Create an external database logical volume for the OSD(s) with this fixed human-unit size.",
)
@click.option(
"-c",
"--osd-count",
"osd_count",
default=None,
show_default=False,
type=int,
help="Split an NVMe disk into this many OSDs",
help="Split (an NVMe) disk into this many OSDs",
)
@confirm_opt("Destroy all data on and create new OSD(s) on node {node} device {device}")
def cli_storage_osd_add(node, device, weight, ext_db_flag, ext_db_ratio, split_count):
@ -3427,18 +3426,17 @@ def cli_storage_osd_add(node, device, weight, ext_db_flag, ext_db_ratio, split_c
A "detect" string is a string in the form "detect:<NAME>:<HUMAN-SIZE>:<ID>". Detect strings allow for automatic determination of Linux block device paths from known basic information about disks by leveraging "lsscsi" on the target host. The "NAME" should be some descriptive identifier, for instance the manufacturer (e.g. "INTEL"), the "HUMAN-SIZE" should be the labeled human-readable size of the device (e.g. "480GB", "1.92TB"), and "ID" specifies the Nth 0-indexed device which matches the "NAME" and "HUMAN-SIZE" values (e.g. "2" would match the third device with the corresponding "NAME" and "HUMAN-SIZE"). When matching against sizes, there is +/- 3% flexibility to account for base-1000 vs. base-1024 differences and rounding errors. The "NAME" may contain whitespace but if so the entire detect string should be quoted, and is case-insensitive. More information about detect strings can be found in the pvcbootstrapd manual.
The weight of an OSD should reflect the ratio of the OSD to other OSDs in the storage cluster. For example, if all OSDs are the same size as recommended for PVC, 1 (the default) is a valid weight so that all are treated identically. If a new OSD is added later which is 4x the size of the existing OSDs, the new OSD's weight should then be 4 to tell the cluster that 4x the data can be stored on the OSD. Weights can also be tweaked for performance reasons, since OSDs with more data will incur more I/O load. For more information about CRUSH weights, please see the Ceph documentation.
The weight of an OSD should reflect the ratio of the size of the OSD to the other OSDs in the storage cluster. For example, with a 200GB disk and a 400GB disk in each node, the 400GB disk should have twice the weight as the 200GB disk. For more information about CRUSH weights, please see the Ceph documentation.
If '--ext-db' is specified, the OSD database and WAL will be placed on a new logical volume in NODE's OSD database volume group. An OSD database volume group must exist on the node or the OSD creation will fail. See the 'pvc storage osd create-db-vg' command for more details.
The "-r"/"--ext-db-ratio" or "-s"/"--ext-db-size" options, if specified, and if a OSD DB VG exists on the node (see "pvc storage osd create-db-vg"), will instruct the OSD to locate its RocksDB database and WAL on a new logical volume on that OSD DB VG. If "-r"/"--ext-db-ratio" is specified, the sizing of this DB LV will be the given ratio (specified as a decimal percentage e.g. 0.05 for 5%) of the size of the OSD (e.g. 0.05 on a 1TB SSD will create a 50GB LV). If "-s"/"--ext-db-size" is specified, the sizing of this DB LV will be the given human-unit size (e.g. 1024M, 20GB, etc.).
The default '--ext-db-ratio' of 0.05 (5%) is sufficient for most RBD workloads and OSD sizes, though this can be adjusted based on the sizes of the OSD(s) and the underlying database device. Ceph documentation recommends at least 0.02 (2%) for RBD use-cases, and higher values may improve WAL performance under write-heavy workloads with fewer OSDs per node.
An external DB is only recommended for relatively slow OSD devices (i.e. SATA SSDs) when there is also a smaller, faster (i.e. an NVMe or 3DXPoint SSD) device in the node. For pure-NVMe OSDs, an external DB is not required nor recommended for optimal performance. Usually, an "--ext-db-ratio" of 0.05 (5%) is best for most workloads and OSD sizes; the Ceph documentation recommends a minimum of 0.02 (2%), and higher values may improve performance under write-heavy workloads with fewer OSDs per node. The explicit size option is also permitted to allow more fine-grained sizing, allowing the administrator to pre-calculate the desired size rather than relying on a ratio.
For NVMe devices, it is recommended to split block device into multiple OSDs to provide better processing throughput. To do this, specify "-s"/"--split" and the number of OSDs to create on the block device. For most NVMe devices, the recommended value is 2 or 4, such that each OSD is at least 500GB. Numbers higher than 4 are not recommended. This is NOT RECOMMENDED for SATA SSDs. If a block device is split, EACH OSD will have the weight indicated by "-w"/"--weight" value and EACH OSD will have a unique DB block device with "-r"/"--ext-db-ratio", if applicable ("-d"/"--ext-db" set).
The "-c"/"--osd-count" option allows the splitting of a single block device into multiple logical OSDs. This is recommended in the Ceph literature for extremely fast OSD block devices (i.e. NVMe or 3DXPoint) which can saturate a single OSD process. Usually, 2 or 4 OSDs is recommended, based on the size of the OSD disk such that each OSD is roughly 1TB or higher; more than 4 OSDs per volume is not recommended, and this option is not recommended for SATA SSDs.
Note that, if "-c"/"--osd-count" is specified, the provided "-w"/"--weight" will be the weight of EACH created OSD, not the block device as a whole. Ensure you take this into account if mixing and matching OSD block devices. Additionally, if "-r"/"--ext-db-ratio" or "-s"/"--ext-db-size" is specified, one DB LV will be created for EACH created OSD, of the given ratio/size per OSD (ratios are calculated from the OSD size, not the underlying block device).
"""
if split_count is not None:
split_flag = True
echo(
CLI_CONFIG,
"Waiting for node task to complete, this may take some time... ",
@ -3449,9 +3447,8 @@ def cli_storage_osd_add(node, device, weight, ext_db_flag, ext_db_ratio, split_c
node,
device,
weight,
ext_db_flag,
ext_db_ratio,
split_flag,
ext_db_size,
split_count,
)
echo(CLI_CONFIG, "done.")

View File

@ -231,25 +231,27 @@ def ceph_osd_list(config, limit):
return False, response.json().get("message", "")
def ceph_osd_add(
config, node, device, weight, ext_db_flag, ext_db_ratio, split_flag, split_count
):
def ceph_osd_add(config, node, device, weight, ext_db_ratio, ext_db_size, osd_count):
"""
Add new Ceph OSD
API endpoint: POST /api/v1/storage/ceph/osd
API arguments: node={node}, device={device}, weight={weight}, ext_db={ext_db_flag}, ext_db_ratio={ext_db_ratio}, split={split_flag}, count={split_count}
API arguments: node={node}, device={device}, weight={weight}, [ext_db_ratio={ext_db_ratio}, ext_db_size={ext_db_size}, osd_count={osd_count}]
API schema: {"message":"{data}"}
"""
params = {
"node": node,
"device": device,
"weight": weight,
"ext_db": ext_db_flag,
"ext_db_ratio": ext_db_ratio,
"split": split_flag,
"count": split_count,
}
if ext_db_ratio is not None:
params["ext_db_ratio"] = ext_db_ratio
if ext_db_size is not None:
params["ext_db_size"] = ext_db_size
if osd_count is not None:
params["osd_count"] = osd_count
response = call_api(config, "post", "/storage/ceph/osd", params=params)
if response.status_code == 200:

View File

@ -273,11 +273,17 @@ def add_osd(
node,
device,
weight,
ext_db_flag=False,
ext_db_ratio=0.05,
split_flag=False,
split_count=1,
ext_db_ratio=None,
ext_db_size=None,
split_count=None,
):
# Verify that options are valid
if ext_db_ratio is not None and ext_db_size is not None:
return (
False,
"ERROR: Both an ext_db_ratio and ext_db_size were specified; choose only one.",
)
# Verify the target node exists
if not common.verifyNode(zkhandler, node):
return False, 'ERROR: No node named "{}" is present in the cluster.'.format(
@ -295,8 +301,8 @@ def add_osd(
)
# Tell the cluster to create a new OSD for the host
add_osd_string = "osd_add {},{},{},{},{},{},{}".format(
node, device, weight, ext_db_flag, ext_db_ratio, split_flag, split_count
add_osd_string = "osd_add {},{},{},{},{},{}".format(
node, device, weight, ext_db_ratio, ext_db_size, split_count
)
zkhandler.write([("base.cmd.ceph", add_osd_string)])
# Wait 1/2 second for the cluster to get the message and start working

View File

@ -23,6 +23,7 @@ import time
import json
import daemon_lib.common as common
from daemon_lib.ceph import format_bytes_fromhuman
from distutils.util import strtobool
from re import search, match, sub
@ -266,10 +267,9 @@ class CephOSDInstance(object):
node,
device,
weight,
ext_db_flag=False,
ext_db_ratio=0.05,
split_device=False,
split_count=1,
ext_db_ratio=None,
ext_db_size=None,
split_count=None,
):
# Handle a detect device if that is passed
if match(r"detect:", device):
@ -287,7 +287,19 @@ class CephOSDInstance(object):
)
device = ddevice
if split_device and split_count > 1:
if ext_db_size is not None and ext_db_ratio is not None:
logger.out(
"Invalid configuration: both an ext_db_size and ext_db_ratio were specified",
state="e",
)
return False
if ext_db_size is not None or ext_db_ratio is not None:
ext_db_flag = True
else:
ext_db_flag = False
if split_count is not None:
split_flag = f"--osds-per-device {split_count}"
logger.out(
f"Creating {split_count} new OSD disks on block device {device}",
@ -352,16 +364,21 @@ class CephOSDInstance(object):
)
# 4b. Prepare the logical volume if ext_db_flag
_, osd_size_bytes, _ = common.run_os_command(
f"blockdev --getsize64 {osd_lv}"
)
osd_size_bytes = int(osd_size_bytes)
if ext_db_ratio is not None:
_, osd_size_bytes, _ = common.run_os_command(
f"blockdev --getsize64 {osd_lv}"
)
osd_size_bytes = int(osd_size_bytes)
osd_db_size_bytes = int(osd_size_bytes * ext_db_ratio)
if ext_db_size is not None:
osd_db_size_bytes = format_bytes_fromhuman(ext_db_size)
result = CephOSDInstance.create_osd_db_lv(
zkhandler, logger, osd_id, ext_db_ratio, osd_size_bytes
zkhandler, logger, osd_id, osd_db_size_bytes
)
if not result:
raise Exception
db_device = "osd-db/osd-{}".format(osd_id)
db_device = f"osd-db/osd-{osd_id}"
# 4c. Attach the new DB device to the OSD
retcode, stdout, stderr = common.run_os_command(
@ -1078,7 +1095,7 @@ class CephOSDInstance(object):
return False
@staticmethod
def create_osd_db_lv(zkhandler, logger, osd_id, ext_db_ratio, osd_size_bytes):
def create_osd_db_lv(zkhandler, logger, osd_id, osd_db_size_bytes):
logger.out(
"Creating new OSD database logical volume for OSD ID {}".format(osd_id),
state="i",
@ -1096,18 +1113,16 @@ class CephOSDInstance(object):
return False
# 1. Determine LV sizing
osd_db_size = int(osd_size_bytes * ext_db_ratio / 1024 / 1024)
osd_db_size_m = int(osd_db_size_bytes / 1024 / 1024)
# 2. Create the LV
logger.out(
'Creating DB LV "osd-db/osd-{}" of {}M ({} * {})'.format(
osd_id, osd_db_size, osd_size_bytes, ext_db_ratio
),
f'Creating DB LV "osd-db/osd-{osd_id}" of size {osd_db_size_m}M',
state="i",
)
retcode, stdout, stderr = common.run_os_command(
"lvcreate --yes --name osd-{} --size {} osd-db".format(
osd_id, osd_db_size
osd_id, osd_db_size_m
)
)
if retcode:
@ -1245,15 +1260,19 @@ def ceph_command(zkhandler, logger, this_node, data, d_osd):
node,
device,
weight,
ext_db_flag,
ext_db_ratio,
split_flag,
ext_db_size,
split_count,
) = args.split(",")
ext_db_flag = bool(strtobool(ext_db_flag))
ext_db_ratio = float(ext_db_ratio)
split_flag = bool(strtobool(split_flag))
split_count = int(split_count)
try:
ext_db_ratio = float(ext_db_ratio)
except Exception:
ext_db_ratio = None
try:
split_count = int(split_count)
except Exception:
split_count = None
if node == this_node.name:
# Lock the command queue
zk_lock = zkhandler.writelock("base.cmd.ceph")
@ -1265,9 +1284,8 @@ def ceph_command(zkhandler, logger, this_node, data, d_osd):
node,
device,
weight,
ext_db_flag,
ext_db_ratio,
split_flag,
ext_db_size,
split_count,
)
# Command succeeded