Adjust handling of ext_db and _count options
Avoid the use of superfluous flag options by defaulting the value options to None, and add support for fixed-size DB LVs.
This commit is contained in:
parent
0f433bd5eb
commit
980ea6a9e9
|
@ -4281,20 +4281,16 @@ class API_Storage_Ceph_OSD_Root(Resource):
|
|||
"required": True,
|
||||
"helptext": "An OSD weight must be specified.",
|
||||
},
|
||||
{
|
||||
"name": "ext_db",
|
||||
"required": False,
|
||||
},
|
||||
{
|
||||
"name": "ext_db_ratio",
|
||||
"required": False,
|
||||
},
|
||||
{
|
||||
"name": "split",
|
||||
"name": "ext_db_size",
|
||||
"required": False,
|
||||
},
|
||||
{
|
||||
"name": "count",
|
||||
"name": "osd_count",
|
||||
"required": False,
|
||||
},
|
||||
]
|
||||
|
@ -4303,7 +4299,7 @@ class API_Storage_Ceph_OSD_Root(Resource):
|
|||
def post(self, reqargs):
|
||||
"""
|
||||
Add a Ceph OSD to the cluster
|
||||
Note: This task may take up to 30s to complete and return
|
||||
Note: This task may take up to 60s to complete and return
|
||||
---
|
||||
tags:
|
||||
- storage / ceph
|
||||
|
@ -4323,26 +4319,21 @@ class API_Storage_Ceph_OSD_Root(Resource):
|
|||
type: number
|
||||
required: true
|
||||
description: The Ceph CRUSH weight for the OSD
|
||||
- in: query
|
||||
name: ext_db
|
||||
type: boolean
|
||||
required: false
|
||||
description: Whether to use an external OSD DB LV device
|
||||
- in: query
|
||||
name: ext_db_ratio
|
||||
type: float
|
||||
required: false
|
||||
description: Decimal ratio of total OSD size for the external OSD DB LV device, default 0.05 (5%)
|
||||
description: If set, creates an OSD DB LV with this decimal ratio of DB to total OSD size (usually 0.05 i.e. 5%); mutually exclusive with ext_db_size
|
||||
- in: query
|
||||
name: split
|
||||
type: boolean
|
||||
name: ext_db_size
|
||||
type: string
|
||||
required: false
|
||||
description: Whether to split the block device into multiple OSDs (recommended for NVMe devices)
|
||||
description: If set, creates an OSD DB LV with this explicit size in human units (e.g. 1024M, 20G); mutually exclusive with ext_db_ratio
|
||||
- in: query
|
||||
name: count
|
||||
name: osd_count
|
||||
type: integer
|
||||
required: false
|
||||
description: If {split}, how many OSDs to create on the block device; usually 2 or 4 depending on size
|
||||
description: If set, create this many OSDs on the block device instead of 1; usually 2 or 4 depending on size
|
||||
responses:
|
||||
200:
|
||||
description: OK
|
||||
|
@ -4359,10 +4350,9 @@ class API_Storage_Ceph_OSD_Root(Resource):
|
|||
reqargs.get("node", None),
|
||||
reqargs.get("device", None),
|
||||
reqargs.get("weight", None),
|
||||
reqargs.get("ext_db", False),
|
||||
float(reqargs.get("ext_db_ratio", 0.05)),
|
||||
reqargs.get("split", False),
|
||||
reqargs.get("count", 1),
|
||||
reqargs.get("ext_db_ratio", None),
|
||||
reqargs.get("ext_db_size", None),
|
||||
reqargs.get("osd_count", None),
|
||||
)
|
||||
|
||||
|
||||
|
|
|
@ -1371,10 +1371,9 @@ def ceph_osd_add(
|
|||
node,
|
||||
device,
|
||||
weight,
|
||||
ext_db_flag=False,
|
||||
ext_db_ratio=0.05,
|
||||
split_flag=False,
|
||||
split_count=1,
|
||||
ext_db_ratio=None,
|
||||
ext_db_size=None,
|
||||
split_count=None,
|
||||
):
|
||||
"""
|
||||
Add a Ceph OSD to the PVC Ceph storage cluster.
|
||||
|
@ -1384,9 +1383,8 @@ def ceph_osd_add(
|
|||
node,
|
||||
device,
|
||||
weight,
|
||||
ext_db_flag,
|
||||
ext_db_ratio,
|
||||
split_flag,
|
||||
ext_db_size,
|
||||
split_count,
|
||||
)
|
||||
|
||||
|
|
|
@ -3394,31 +3394,30 @@ def cli_storage_osd_create_db_vg(node, device):
|
|||
show_default=True,
|
||||
help="Weight of the OSD(s) within the CRUSH map.",
|
||||
)
|
||||
@click.option(
|
||||
"-d",
|
||||
"--ext-db",
|
||||
"ext_db_flag",
|
||||
is_flag=True,
|
||||
default=False,
|
||||
help="Use an external database logical volume for these OSD(s).",
|
||||
)
|
||||
@click.option(
|
||||
"-r",
|
||||
"--ext-db-ratio",
|
||||
"ext_db_ratio",
|
||||
default=0.05,
|
||||
show_default=True,
|
||||
default=None,
|
||||
type=float,
|
||||
help="Decimal ratio of the external database logical volume to the OSD size.",
|
||||
help="Create an external database logical volume for the OSD(s) with this decimal ratio of the DB LV to the OSD size.",
|
||||
)
|
||||
@click.option(
|
||||
"-s",
|
||||
"--split",
|
||||
"split_count",
|
||||
"--ext-db-size",
|
||||
"ext_db_size",
|
||||
default=None,
|
||||
show_default=True,
|
||||
help="Create an external database logical volume for the OSD(s) with this fixed human-unit size.",
|
||||
)
|
||||
@click.option(
|
||||
"-c",
|
||||
"--osd-count",
|
||||
"osd_count",
|
||||
default=None,
|
||||
show_default=False,
|
||||
type=int,
|
||||
help="Split an NVMe disk into this many OSDs",
|
||||
help="Split (an NVMe) disk into this many OSDs",
|
||||
)
|
||||
@confirm_opt("Destroy all data on and create new OSD(s) on node {node} device {device}")
|
||||
def cli_storage_osd_add(node, device, weight, ext_db_flag, ext_db_ratio, split_count):
|
||||
|
@ -3427,18 +3426,17 @@ def cli_storage_osd_add(node, device, weight, ext_db_flag, ext_db_ratio, split_c
|
|||
|
||||
A "detect" string is a string in the form "detect:<NAME>:<HUMAN-SIZE>:<ID>". Detect strings allow for automatic determination of Linux block device paths from known basic information about disks by leveraging "lsscsi" on the target host. The "NAME" should be some descriptive identifier, for instance the manufacturer (e.g. "INTEL"), the "HUMAN-SIZE" should be the labeled human-readable size of the device (e.g. "480GB", "1.92TB"), and "ID" specifies the Nth 0-indexed device which matches the "NAME" and "HUMAN-SIZE" values (e.g. "2" would match the third device with the corresponding "NAME" and "HUMAN-SIZE"). When matching against sizes, there is +/- 3% flexibility to account for base-1000 vs. base-1024 differences and rounding errors. The "NAME" may contain whitespace but if so the entire detect string should be quoted, and is case-insensitive. More information about detect strings can be found in the pvcbootstrapd manual.
|
||||
|
||||
The weight of an OSD should reflect the ratio of the OSD to other OSDs in the storage cluster. For example, if all OSDs are the same size as recommended for PVC, 1 (the default) is a valid weight so that all are treated identically. If a new OSD is added later which is 4x the size of the existing OSDs, the new OSD's weight should then be 4 to tell the cluster that 4x the data can be stored on the OSD. Weights can also be tweaked for performance reasons, since OSDs with more data will incur more I/O load. For more information about CRUSH weights, please see the Ceph documentation.
|
||||
The weight of an OSD should reflect the ratio of the size of the OSD to the other OSDs in the storage cluster. For example, with a 200GB disk and a 400GB disk in each node, the 400GB disk should have twice the weight of the 200GB disk. For more information about CRUSH weights, please see the Ceph documentation.
|
||||
|
||||
If '--ext-db' is specified, the OSD database and WAL will be placed on a new logical volume in NODE's OSD database volume group. An OSD database volume group must exist on the node or the OSD creation will fail. See the 'pvc storage osd create-db-vg' command for more details.
|
||||
The "-r"/"--ext-db-ratio" or "-s"/"--ext-db-size" options, if specified, and if an OSD DB VG exists on the node (see "pvc storage osd create-db-vg"), will instruct the OSD to locate its RocksDB database and WAL on a new logical volume on that OSD DB VG. If "-r"/"--ext-db-ratio" is specified, the sizing of this DB LV will be the given ratio (specified as a decimal fraction, e.g. 0.05 for 5%) of the size of the OSD (e.g. 0.05 on a 1TB SSD will create a 50GB LV). If "-s"/"--ext-db-size" is specified, the sizing of this DB LV will be the given human-unit size (e.g. 1024M, 20GB, etc.).
|
||||
|
||||
The default '--ext-db-ratio' of 0.05 (5%) is sufficient for most RBD workloads and OSD sizes, though this can be adjusted based on the sizes of the OSD(s) and the underlying database device. Ceph documentation recommends at least 0.02 (2%) for RBD use-cases, and higher values may improve WAL performance under write-heavy workloads with fewer OSDs per node.
|
||||
An external DB is only recommended for relatively slow OSD devices (e.g. SATA SSDs) when there is also a smaller, faster device (e.g. an NVMe or 3D XPoint SSD) in the node. For pure-NVMe OSDs, an external DB is neither required nor recommended for optimal performance. Usually, an "--ext-db-ratio" of 0.05 (5%) is best for most workloads and OSD sizes; the Ceph documentation recommends a minimum of 0.02 (2%), and higher values may improve performance under write-heavy workloads with fewer OSDs per node. The explicit size option is also permitted to allow more fine-grained sizing, allowing the administrator to pre-calculate the desired size rather than relying on a ratio.
|
||||
|
||||
For NVMe devices, it is recommended to split block device into multiple OSDs to provide better processing throughput. To do this, specify "-s"/"--split" and the number of OSDs to create on the block device. For most NVMe devices, the recommended value is 2 or 4, such that each OSD is at least 500GB. Numbers higher than 4 are not recommended. This is NOT RECOMMENDED for SATA SSDs. If a block device is split, EACH OSD will have the weight indicated by "-w"/"--weight" value and EACH OSD will have a unique DB block device with "-r"/"--ext-db-ratio", if applicable ("-d"/"--ext-db" set).
|
||||
The "-c"/"--osd-count" option allows the splitting of a single block device into multiple logical OSDs. This is recommended in the Ceph literature for extremely fast OSD block devices (e.g. NVMe or 3D XPoint) which can saturate a single OSD process. Usually, 2 or 4 OSDs are recommended, based on the size of the OSD disk such that each OSD is roughly 1TB or higher; more than 4 OSDs per volume is not recommended, and this option is not recommended for SATA SSDs.
|
||||
|
||||
Note that, if "-c"/"--osd-count" is specified, the provided "-w"/"--weight" will be the weight of EACH created OSD, not the block device as a whole. Ensure you take this into account if mixing and matching OSD block devices. Additionally, if "-r"/"--ext-db-ratio" or "-s"/"--ext-db-size" is specified, one DB LV will be created for EACH created OSD, of the given ratio/size per OSD (ratios are calculated from the OSD size, not the underlying block device).
|
||||
"""
|
||||
|
||||
if split_count is not None:
|
||||
split_flag = True
|
||||
|
||||
echo(
|
||||
CLI_CONFIG,
|
||||
"Waiting for node task to complete, this may take some time... ",
|
||||
|
@ -3449,9 +3447,8 @@ def cli_storage_osd_add(node, device, weight, ext_db_flag, ext_db_ratio, split_c
|
|||
node,
|
||||
device,
|
||||
weight,
|
||||
ext_db_flag,
|
||||
ext_db_ratio,
|
||||
split_flag,
|
||||
ext_db_size,
|
||||
split_count,
|
||||
)
|
||||
echo(CLI_CONFIG, "done.")
|
||||
|
|
|
@ -231,25 +231,27 @@ def ceph_osd_list(config, limit):
|
|||
return False, response.json().get("message", "")
|
||||
|
||||
|
||||
def ceph_osd_add(
|
||||
config, node, device, weight, ext_db_flag, ext_db_ratio, split_flag, split_count
|
||||
):
|
||||
def ceph_osd_add(config, node, device, weight, ext_db_ratio, ext_db_size, osd_count):
|
||||
"""
|
||||
Add new Ceph OSD
|
||||
|
||||
API endpoint: POST /api/v1/storage/ceph/osd
|
||||
API arguments: node={node}, device={device}, weight={weight}, ext_db={ext_db_flag}, ext_db_ratio={ext_db_ratio}, split={split_flag}, count={split_count}
|
||||
API arguments: node={node}, device={device}, weight={weight}, [ext_db_ratio={ext_db_ratio}, ext_db_size={ext_db_size}, osd_count={osd_count}]
|
||||
API schema: {"message":"{data}"}
|
||||
"""
|
||||
params = {
|
||||
"node": node,
|
||||
"device": device,
|
||||
"weight": weight,
|
||||
"ext_db": ext_db_flag,
|
||||
"ext_db_ratio": ext_db_ratio,
|
||||
"split": split_flag,
|
||||
"count": split_count,
|
||||
}
|
||||
|
||||
if ext_db_ratio is not None:
|
||||
params["ext_db_ratio"] = ext_db_ratio
|
||||
if ext_db_size is not None:
|
||||
params["ext_db_size"] = ext_db_size
|
||||
if osd_count is not None:
|
||||
params["osd_count"] = osd_count
|
||||
|
||||
response = call_api(config, "post", "/storage/ceph/osd", params=params)
|
||||
|
||||
if response.status_code == 200:
|
||||
|
|
|
@ -273,11 +273,17 @@ def add_osd(
|
|||
node,
|
||||
device,
|
||||
weight,
|
||||
ext_db_flag=False,
|
||||
ext_db_ratio=0.05,
|
||||
split_flag=False,
|
||||
split_count=1,
|
||||
ext_db_ratio=None,
|
||||
ext_db_size=None,
|
||||
split_count=None,
|
||||
):
|
||||
# Verify that options are valid
|
||||
if ext_db_ratio is not None and ext_db_size is not None:
|
||||
return (
|
||||
False,
|
||||
"ERROR: Both an ext_db_ratio and ext_db_size were specified; choose only one.",
|
||||
)
|
||||
|
||||
# Verify the target node exists
|
||||
if not common.verifyNode(zkhandler, node):
|
||||
return False, 'ERROR: No node named "{}" is present in the cluster.'.format(
|
||||
|
@ -295,8 +301,8 @@ def add_osd(
|
|||
)
|
||||
|
||||
# Tell the cluster to create a new OSD for the host
|
||||
add_osd_string = "osd_add {},{},{},{},{},{},{}".format(
|
||||
node, device, weight, ext_db_flag, ext_db_ratio, split_flag, split_count
|
||||
add_osd_string = "osd_add {},{},{},{},{},{}".format(
|
||||
node, device, weight, ext_db_ratio, ext_db_size, split_count
|
||||
)
|
||||
zkhandler.write([("base.cmd.ceph", add_osd_string)])
|
||||
# Wait 1/2 second for the cluster to get the message and start working
|
||||
|
|
|
@ -23,6 +23,7 @@ import time
|
|||
import json
|
||||
|
||||
import daemon_lib.common as common
|
||||
from daemon_lib.ceph import format_bytes_fromhuman
|
||||
|
||||
from distutils.util import strtobool
|
||||
from re import search, match, sub
|
||||
|
@ -266,10 +267,9 @@ class CephOSDInstance(object):
|
|||
node,
|
||||
device,
|
||||
weight,
|
||||
ext_db_flag=False,
|
||||
ext_db_ratio=0.05,
|
||||
split_device=False,
|
||||
split_count=1,
|
||||
ext_db_ratio=None,
|
||||
ext_db_size=None,
|
||||
split_count=None,
|
||||
):
|
||||
# Handle a detect device if that is passed
|
||||
if match(r"detect:", device):
|
||||
|
@ -287,7 +287,19 @@ class CephOSDInstance(object):
|
|||
)
|
||||
device = ddevice
|
||||
|
||||
if split_device and split_count > 1:
|
||||
if ext_db_size is not None and ext_db_ratio is not None:
|
||||
logger.out(
|
||||
"Invalid configuration: both an ext_db_size and ext_db_ratio were specified",
|
||||
state="e",
|
||||
)
|
||||
return False
|
||||
|
||||
if ext_db_size is not None or ext_db_ratio is not None:
|
||||
ext_db_flag = True
|
||||
else:
|
||||
ext_db_flag = False
|
||||
|
||||
if split_count is not None:
|
||||
split_flag = f"--osds-per-device {split_count}"
|
||||
logger.out(
|
||||
f"Creating {split_count} new OSD disks on block device {device}",
|
||||
|
@ -352,16 +364,21 @@ class CephOSDInstance(object):
|
|||
)
|
||||
|
||||
# 4b. Prepare the logical volume if ext_db_flag
|
||||
if ext_db_ratio is not None:
|
||||
_, osd_size_bytes, _ = common.run_os_command(
|
||||
f"blockdev --getsize64 {osd_lv}"
|
||||
)
|
||||
osd_size_bytes = int(osd_size_bytes)
|
||||
osd_db_size_bytes = int(osd_size_bytes * ext_db_ratio)
|
||||
if ext_db_size is not None:
|
||||
osd_db_size_bytes = format_bytes_fromhuman(ext_db_size)
|
||||
|
||||
result = CephOSDInstance.create_osd_db_lv(
|
||||
zkhandler, logger, osd_id, ext_db_ratio, osd_size_bytes
|
||||
zkhandler, logger, osd_id, osd_db_size_bytes
|
||||
)
|
||||
if not result:
|
||||
raise Exception
|
||||
db_device = "osd-db/osd-{}".format(osd_id)
|
||||
db_device = f"osd-db/osd-{osd_id}"
|
||||
|
||||
# 4c. Attach the new DB device to the OSD
|
||||
retcode, stdout, stderr = common.run_os_command(
|
||||
|
@ -1078,7 +1095,7 @@ class CephOSDInstance(object):
|
|||
return False
|
||||
|
||||
@staticmethod
|
||||
def create_osd_db_lv(zkhandler, logger, osd_id, ext_db_ratio, osd_size_bytes):
|
||||
def create_osd_db_lv(zkhandler, logger, osd_id, osd_db_size_bytes):
|
||||
logger.out(
|
||||
"Creating new OSD database logical volume for OSD ID {}".format(osd_id),
|
||||
state="i",
|
||||
|
@ -1096,18 +1113,16 @@ class CephOSDInstance(object):
|
|||
return False
|
||||
|
||||
# 1. Determine LV sizing
|
||||
osd_db_size = int(osd_size_bytes * ext_db_ratio / 1024 / 1024)
|
||||
osd_db_size_m = int(osd_db_size_bytes / 1024 / 1024)
|
||||
|
||||
# 2. Create the LV
|
||||
logger.out(
|
||||
'Creating DB LV "osd-db/osd-{}" of {}M ({} * {})'.format(
|
||||
osd_id, osd_db_size, osd_size_bytes, ext_db_ratio
|
||||
),
|
||||
f'Creating DB LV "osd-db/osd-{osd_id}" of size {osd_db_size_m}M',
|
||||
state="i",
|
||||
)
|
||||
retcode, stdout, stderr = common.run_os_command(
|
||||
"lvcreate --yes --name osd-{} --size {} osd-db".format(
|
||||
osd_id, osd_db_size
|
||||
osd_id, osd_db_size_m
|
||||
)
|
||||
)
|
||||
if retcode:
|
||||
|
@ -1245,15 +1260,19 @@ def ceph_command(zkhandler, logger, this_node, data, d_osd):
|
|||
node,
|
||||
device,
|
||||
weight,
|
||||
ext_db_flag,
|
||||
ext_db_ratio,
|
||||
split_flag,
|
||||
ext_db_size,
|
||||
split_count,
|
||||
) = args.split(",")
|
||||
ext_db_flag = bool(strtobool(ext_db_flag))
|
||||
try:
|
||||
ext_db_ratio = float(ext_db_ratio)
|
||||
split_flag = bool(strtobool(split_flag))
|
||||
except Exception:
|
||||
ext_db_ratio = None
|
||||
try:
|
||||
split_count = int(split_count)
|
||||
except Exception:
|
||||
split_count = None
|
||||
|
||||
if node == this_node.name:
|
||||
# Lock the command queue
|
||||
zk_lock = zkhandler.writelock("base.cmd.ceph")
|
||||
|
@ -1265,9 +1284,8 @@ def ceph_command(zkhandler, logger, this_node, data, d_osd):
|
|||
node,
|
||||
device,
|
||||
weight,
|
||||
ext_db_flag,
|
||||
ext_db_ratio,
|
||||
split_flag,
|
||||
ext_db_size,
|
||||
split_count,
|
||||
)
|
||||
# Command succeeded
|
||||
|
|
Loading…
Reference in New Issue