Update OSD replacement functionality
1. Simplify the implementation by leveraging the existing remove_osd/add_osd functions, since its task was functionally identical to running those two in sequence.
2. Add support for split OSDs within the command (replacing all OSDs on the block device(s) as required).
3. Add additional configurability and flexibility around the old device, weight, and external DB LVs.
parent 3cb8a70f04
commit 64e37ae963
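At a high level, the reworked command is equivalent to running the existing removal and creation paths back to back. A minimal sketch of that flow, assuming the names used in the diff below (find_peer_osds is a hypothetical stand-in for the peer-OSD lookup, not a real function):

# Illustrative sketch only, not part of the diff: replace == remove_osd(s) + add_osd.
def replace_osd_sketch(zkhandler, logger, node, osd_id, new_device,
                       weight=None, ext_db_ratio=None, ext_db_size=None):
    peers = find_peer_osds(zkhandler, node, osd_id)  # hypothetical: all OSDs sharing the block device
    if weight is None:
        weight = peers[0]["stats"]["weight"]  # reuse the existing CRUSH weight
    for osd in peers:
        CephOSDInstance.remove_osd(zkhandler, logger, osd["id"], force_flag=True)
    return CephOSDInstance.add_osd(
        zkhandler, logger, node, new_device, weight,
        ext_db_ratio=ext_db_ratio, ext_db_size=ext_db_size,
        split_count=len(peers) if len(peers) > 1 else None,
    )
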
@@ -4379,14 +4379,25 @@ class API_Storage_Ceph_OSD_Element(Resource):
     @RequestParser(
         [
             {
-                "name": "device",
+                "name": "new_device",
                 "required": True,
                 "helptext": "A valid device or detect string must be specified.",
             },
+            {
+                "name": "old_device",
+                "required": False,
+            },
             {
                 "name": "weight",
-                "required": True,
-                "helptext": "An OSD weight must be specified.",
+                "required": False,
             },
+            {
+                "name": "ext_db_ratio",
+                "required": False,
+            },
+            {
+                "name": "ext_db_size",
+                "required": False,
+            },
             {
                 "name": "yes-i-really-mean-it",

@@ -4405,15 +4416,30 @@ class API_Storage_Ceph_OSD_Element(Resource):
           - storage / ceph
         parameters:
           - in: query
-            name: device
+            name: new_device
             type: string
             required: true
             description: The block device (e.g. "/dev/sdb", "/dev/disk/by-path/...", etc.) or detect string ("detect:NAME:SIZE:ID") to replace the OSD onto
+          - in: query
+            name: old_device
+            type: string
+            required: false
+            description: The block device (e.g. "/dev/sdb", "/dev/disk/by-path/...", etc.) or detect string ("detect:NAME:SIZE:ID") of the original OSD
           - in: query
             name: weight
             type: number
-            required: true
-            description: The Ceph CRUSH weight for the replaced OSD
+            required: false
+            description: The Ceph CRUSH weight for the replacement OSD
+          - in: query
+            name: ext_db_ratio
+            type: float
+            required: false
+            description: If set, creates an OSD DB LV for the replacement OSD with this decimal ratio of DB to total OSD size (usually 0.05 i.e. 5%); if unset, use existing ext_db_size
+          - in: query
+            name: ext_db_size
+            type: float
+            required: false
+            description: If set, creates an OSD DB LV for the replacement OSD with this explicit size in human units (e.g. 1024M, 20G); if unset, use existing ext_db_size
         responses:
           200:
             description: OK

@@ -4428,8 +4454,11 @@ class API_Storage_Ceph_OSD_Element(Resource):
         """
         return api_helper.ceph_osd_replace(
             osdid,
-            reqargs.get("device", None),
+            reqargs.get("new_device"),
+            reqargs.get("old_device", None),
             reqargs.get("weight", None),
+            reqargs.get("ext_db_ratio", None),
+            reqargs.get("ext_db_size", None),
         )
 
     @RequestParser(

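For reference, the reworked endpoint can be exercised directly; a hedged example with python-requests follows (the host, port, and X-Api-Key header are assumptions about the deployment, not part of this diff):

# Hedged example: invoking the replace endpoint directly.
import requests

params = {
    "new_device": "/dev/sdb",
    "weight": 1.0,                  # optional; omit to reuse the old weight
    "yes-i-really-mean-it": "yes",  # required confirmation parameter
}
resp = requests.post(
    "http://pvc.local:7370/api/v1/storage/ceph/osd/5",  # assumed address/port
    params=params,
    headers={"X-Api-Key": "..."},  # assumed key-based auth
)
print(resp.status_code, resp.json().get("message", ""))
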
@@ -1398,11 +1398,21 @@ def ceph_osd_add(
 
 
 @ZKConnection(config)
-def ceph_osd_replace(zkhandler, osd_id, device, weight):
+def ceph_osd_replace(
+    zkhandler,
+    osd_id,
+    new_device,
+    old_device=None,
+    weight=None,
+    ext_db_ratio=None,
+    ext_db_size=None,
+):
     """
     Replace a Ceph OSD in the PVC Ceph storage cluster.
     """
-    retflag, retdata = pvc_ceph.replace_osd(zkhandler, osd_id, device, weight)
+    retflag, retdata = pvc_ceph.replace_osd(
+        zkhandler, osd_id, new_device, old_device, weight, ext_db_ratio, ext_db_size
+    )
 
     if retflag:
         retcode = 200

@@ -3428,13 +3428,15 @@ def cli_storage_osd_add(node, device, weight, ext_db_ratio, ext_db_size, osd_cou
 
     The weight of an OSD should reflect the ratio of the size of the OSD to the other OSDs in the storage cluster. For example, with a 200GB disk and a 400GB disk in each node, the 400GB disk should have twice the weight as the 200GB disk. For more information about CRUSH weights, please see the Ceph documentation.
 
-    The "-r"/"--ext-db-ratio" or "-s"/"--ext-db-size" options, if specified, and if a OSD DB VG exists on the node (see "pvc storage osd create-db-vg"), will instruct the OSD to locate its RocksDB database and WAL on a new logical volume on that OSD DB VG. If "-r"/"--ext-db-ratio" is specified, the sizing of this DB LV will be the given ratio (specified as a decimal percentage e.g. 0.05 for 5%) of the size of the OSD (e.g. 0.05 on a 1TB SSD will create a 50GB LV). If "-s"/"--ext-db-size" is specified, the sizing of this DB LV will be the given human-unit size (e.g. 1024M, 20GB, etc.).
+    The "-r"/"--ext-db-ratio" or "-s"/"--ext-db-size" options, if specified, and if an OSD DB VG exists on the node (see "pvc storage osd create-db-vg"), will instruct the OSD to locate its DB database and WAL on a new logical volume on that OSD DB VG. If "-r"/"--ext-db-ratio" is specified, the sizing of this DB LV will be the given ratio (specified as a decimal percentage e.g. 0.05 for 5%) of the size of the OSD (e.g. 0.05 on a 1TB SSD will create a 50GB LV). If "-s"/"--ext-db-size" is specified, the sizing of this DB LV will be the given human-unit size (e.g. 1024M, 20GB, etc.).
 
     An external DB is only recommended for relatively slow OSD devices (i.e. SATA SSDs) when there is also a smaller, faster (i.e. NVMe or 3DXPoint SSD) device in the node. For NVMe OSDs, an external DB is not required nor recommended for optimal performance. An "--ext-db-ratio" of 0.05 (5%) is recommended for most workloads and OSD sizes; the Ceph documentation recommends a minimum of 0.02 (2%), and higher values may improve performance under write-heavy workloads with fewer OSDs per node. The explicit size option is also permitted to allow more fine-grained sizing, allowing the administrator to pre-calculate the desired size rather than relying on a ratio.
 
     The "-c"/"--osd-count" option allows the splitting of a single block device into multiple logical OSDs. This is recommended in the Ceph literature for extremely fast OSD block devices (i.e. NVMe or 3DXPoint) which can saturate a single OSD process. Usually, 2 or 4 OSDs is recommended, based on the size and performance of the OSD disk; more than 4 OSDs per volume is not recommended, and this option is not recommended for SATA SSDs.
 
     Note that, if "-c"/"--osd-count" is specified, the provided "-w"/"--weight" will be the weight of EACH created OSD, not the block device as a whole. Ensure you take this into account if mixing and matching OSD block devices. Additionally, if "-r"/"--ext-db-ratio" or "-s"/"--ext-db-size" is specified, one DB LV will be created for EACH created OSD, of the given ratio/size per OSD; ratios are calculated from the OSD size, not the underlying device.
+
+    NOTE: This command may take a long time to complete. Observe the node logs of the hosting OSD node for detailed status.
     """
 
     echo(

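The sizing rules in this help text are easy to sanity-check; a minimal sketch using the 1TB/0.05 example from the paragraph above:

# Sanity check of the "--ext-db-ratio" sizing example given above.
osd_size_bytes = 1 * 1000**4   # 1TB OSD, as in the example
ext_db_ratio = 0.05            # the recommended 5% ratio
db_lv_bytes = osd_size_bytes * ext_db_ratio
print(db_lv_bytes / 1000**3)   # -> 50.0, i.e. the 50GB DB LV from the example
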
@@ -3461,25 +3463,57 @@ def cli_storage_osd_add(node, device, weight, ext_db_ratio, ext_db_size, osd_cou
 @click.command(name="replace", short_help="Replace OSD block device.")
 @connection_req
 @click.argument("osdid")
-@click.argument("device")
+@click.argument("new_device")
+@click.option(
+    "-o",
+    "--old-device",
+    "old_device",
+    default=None,
+    help="The old OSD block device, if known and valid",
+)
 @click.option(
     "-w",
     "--weight",
     "weight",
-    default=1.0,
-    show_default=True,
-    help="New weight of the OSD within the CRUSH map.",
+    default=None,
+    help="New weight of the OSD(s) within the CRUSH map; if unset, old weight is used",
 )
-@confirm_opt("Replace OSD {osdid} with block device {device} weight {weight}")
-def cli_storage_osd_replace(osdid, device, weight):
+@click.option(
+    "-r",
+    "--ext-db-ratio",
+    "ext_db_ratio",
+    default=None,
+    help="Create a new external database logical volume for the OSD(s) with this decimal ratio of the DB LV to the OSD size; if unset, old ext_db_size is used",
+)
+@click.option(
+    "-s",
+    "--ext-db-size",
+    "ext_db_size",
+    default=None,
+    help="Create a new external database logical volume for the OSD(s) with this human-unit size; if unset, old ext_db_size is used",
+)
+@confirm_opt(
+    "Destroy all data on and replace OSD {osdid} (and peer split OSDs) with new device {new_device}"
+)
+def cli_storage_osd_replace(
+    osdid, new_device, old_device, weight, ext_db_ratio, ext_db_size
+):
     """
-    Replace the block device of an existing OSD with ID OSDID with DEVICE. Use this command to replace a failed or smaller OSD block device with a new one.
+    Replace the block device of an existing OSD with ID OSDID, and any peer split OSDs with the same block device, with NEW_DEVICE. Use this command to replace a failed or smaller OSD block device with a new one in one command.
 
-    DEVICE must be a valid block device path (e.g. '/dev/sda', '/dev/nvme0n1', '/dev/disk/by-path/...', '/dev/disk/by-id/...') or a "detect" string. Using partitions is not supported. A "detect" string is a string in the form "detect:<NAME>:<HUMAN-SIZE>:<ID>". For details, see 'pvc storage osd add --help'.
+    NEW_DEVICE must be a valid block device path (e.g. '/dev/nvme0n1', '/dev/disk/by-path/...') or a "detect" string. Partitions are NOT supported. A "detect" string is a string in the form "detect:<NAME>:<HUMAN-SIZE>:<ID>". For details, see 'pvc storage osd add --help'. The path or detect string must be valid on the current node housing the OSD.
 
-    The weight of an OSD should reflect the ratio of the OSD to other OSDs in the storage cluster. For details, see 'pvc storage osd add --help'. Note that the current weight must be explicitly specified if it differs from the default.
+    If OSDID is part of a split OSD set, any peer split OSDs with the same configured block device will be replaced as well. The split count will be retained and cannot be changed with this command; to do so, all OSDs in the split OSD set must be removed and new OSD(s) created.
 
-    Existing IDs, external DB devices, etc. of the OSD will be preserved; data will be lost and rebuilt from the remaining healthy OSDs.
+    WARNING: This operation entails (and is functionally equivalent to) a removal and recreation of the specified OSD and, if applicable, all peer split OSDs. This is an intensive and potentially destructive action. Ensure that the cluster is otherwise healthy before proceeding, and ensure the subsequent rebuild completes successfully. Do not attempt this operation on a severely degraded cluster without first considering the possible data loss implications.
+
+    If the "-o"/"--old-device" option is specified, is a valid block device on the node, is readable/accessible, and contains the metadata for the specified OSD, it will be zapped. If this option is not specified, the system will try to find the old block device automatically to zap it. If it can't be found, the OSD will simply be removed from the CRUSH map and PVC database before recreating. This option can provide a cleaner deletion when replacing a working device that has a different block path, but is otherwise unnecessary.
+
+    The "-w"/"--weight", "-r"/"--ext-db-ratio", and "-s"/"--ext-db-size" options allow overriding the existing weight and external DB LV for the OSD(s), if desired. If unset, the existing weight and external DB LV size (if applicable) will be used for the replacement OSD(s) instead.
+
+    NOTE: If neither the "-r"/"--ext-db-ratio" or "-s"/"--ext-db-size" option is specified, and the OSD(s) had an external DB LV, it cannot be removed, and a new DB LV will be created for the replacement OSD(s); this cannot be avoided. However, if the OSD(s) did not have an external DB LV, and one of these options is specified, a new DB LV will be added to the new OSD.
+
+    NOTE: This command may take a long time to complete. Observe the node logs of the hosting OSD node for detailed status.
     """
 
     echo(

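The fallback behaviour described in this docstring (unset overrides reuse the existing OSD's values) amounts to the following sketch; the db_size_bytes field name is illustrative, not the real schema:

# Illustrative resolution of the new optional overrides; names are examples only.
def resolve_replacement_options(existing_osd, weight=None, ext_db_ratio=None, ext_db_size=None):
    if weight is None:
        weight = existing_osd["stats"]["weight"]  # fall back to the old CRUSH weight
    if ext_db_ratio is None and ext_db_size is None and existing_osd["db_device"]:
        ext_db_size = existing_osd["db_size_bytes"]  # illustrative: reuse the old DB LV size
    return weight, ext_db_ratio, ext_db_size
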
@@ -3488,7 +3522,7 @@ def cli_storage_osd_replace(osdid, device, weight):
         newline=False,
     )
     retcode, retmsg = pvc.lib.storage.ceph_osd_replace(
-        CLI_CONFIG, osdid, device, weight
+        CLI_CONFIG, osdid, new_device, old_device, weight, ext_db_ratio, ext_db_size
     )
     echo(CLI_CONFIG, "done.")
     finish(retcode, retmsg)

@@ -3510,7 +3544,9 @@ def cli_storage_osd_refresh(osdid, device):
 
     Existing data, IDs, weights, etc. of the OSD will be preserved.
 
-    NOTE: If a device had an external DB device, this is not automatically handled at this time. It is best to remove and re-add the OSD instead.
+    WARNING: If a device had an external DB device, this is not automatically handled at this time. It is best to remove and re-add the OSD (e.g. with "pvc storage osd replace") instead.
+
+    NOTE: This command may take a long time to complete. Observe the node logs of the hosting OSD node for detailed status.
     """
 
     echo(

@@ -3545,6 +3581,8 @@ def cli_storage_osd_remove(osdid, force_flag):
     DANGER: This will completely remove the OSD from the cluster. OSDs will rebalance which will negatively affect performance and available space. It is STRONGLY RECOMMENDED to set an OSD out (using 'pvc storage osd out') and allow the cluster to fully rebalance, verified with 'pvc storage status', before removing an OSD.
 
     NOTE: The "-f"/"--force" option is useful after replacing a failed node, to ensure the OSD is removed even if the OSD in question does not properly exist on the node after a rebuild.
+
+    NOTE: This command may take a long time to complete. Observe the node logs of the hosting OSD node for detailed status.
     """
 
     echo(

@@ -262,15 +262,30 @@ def ceph_osd_add(config, node, device, weight, ext_db_ratio, ext_db_size, osd_co
     return retstatus, response.json().get("message", "")
 
 
-def ceph_osd_replace(config, osdid, device, weight):
+def ceph_osd_replace(
+    config, osdid, new_device, old_device, weight, ext_db_ratio, ext_db_size
+):
     """
     Replace an existing Ceph OSD with a new device
 
     API endpoint: POST /api/v1/storage/ceph/osd/{osdid}
-    API arguments: device={device}, weight={weight}
+    API arguments: new_device, [old_device={old_device}, weight={weight}, ext_db_ratio={ext_db_ratio}, ext_db_size={ext_db_size}]
     API schema: {"message":"{data}"}
     """
-    params = {"device": device, "weight": weight, "yes-i-really-mean-it": "yes"}
+    params = {
+        "new_device": new_device,
+        "yes-i-really-mean-it": "yes",
+    }
+
+    if old_device is not None:
+        params["old_device"] = old_device
+    if weight is not None:
+        params["weight"] = weight
+    if ext_db_ratio is not None:
+        params["ext_db_ratio"] = ext_db_ratio
+    if ext_db_size is not None:
+        params["ext_db_size"] = ext_db_size
 
     response = call_api(config, "post", f"/storage/ceph/osd/{osdid}", params=params)
 
     if response.status_code == 200:

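A usage sketch of the conditional parameter building above: with only the new device given, the request carries just the required fields, so the daemon falls back to the OSD's existing values:

# Hedged usage example for the function above (osdid "5" is illustrative).
retcode, retmsg = ceph_osd_replace(
    config, "5", "/dev/sdb", None, None, None, None
)
# params sent: {"new_device": "/dev/sdb", "yes-i-really-mean-it": "yes"}
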
@@ -438,6 +453,9 @@ def format_list_osd(config, osd_list):
             )
             continue
 
+        if osd_information["is_split"]:
+            osd_information["device"] = f"{osd_information['device']} *s"
+
         # Deal with the size to human readable
         osd_information["stats"]["size"] = osd_information["stats"]["kb"] * 1024
         for datatype in "size", "wr_data", "rd_data":

@@ -26,6 +26,7 @@ import time
 import math
 
 from concurrent.futures import ThreadPoolExecutor
+from distutils.util import strtobool
 
 import daemon_lib.vm as vm
 import daemon_lib.common as common

@@ -211,7 +212,7 @@ def getOSDInformation(zkhandler, osd_id):
     # Get the devices
     osd_node = zkhandler.read(("osd.node", osd_id))
     osd_device = zkhandler.read(("osd.device", osd_id))
-    osd_is_split = zkhandler.read(("osd.is_split", osd_id))
+    osd_is_split = bool(strtobool(zkhandler.read(("osd.is_split", osd_id))))
     osd_db_device = zkhandler.read(("osd.db_device", osd_id))
     # Parse the stats data
     osd_stats_raw = zkhandler.read(("osd.stats", osd_id))

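The bool(strtobool(...)) wrapper matters because Zookeeper returns the flag as a string, and bool() on any non-empty string is truthy; a quick check:

# Why bool(strtobool(...)) instead of bool(...) on the raw Zookeeper string:
from distutils.util import strtobool

print(bool("False"))             # True  -- any non-empty string is truthy
print(bool(strtobool("False")))  # False -- strtobool parses the text to 0/1
print(bool(strtobool("True")))   # True
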
@@ -329,12 +330,18 @@ def add_osd(
     return success, message
 
 
-def replace_osd(zkhandler, osd_id, new_device, weight):
+def replace_osd(
+    zkhandler,
+    osd_id,
+    new_device,
+    old_device=None,
+    weight=None,
+    ext_db_ratio=None,
+    ext_db_size=None,
+):
     # Get current OSD information
     osd_information = getOSDInformation(zkhandler, osd_id)
     node = osd_information["node"]
-    old_device = osd_information["device"]
-    ext_db_flag = True if osd_information["db_device"] else False
 
     # Verify target block device isn't in use
     block_osd = verifyOSDBlock(zkhandler, node, new_device)

@@ -347,8 +354,8 @@ def replace_osd(zkhandler, osd_id, new_device, weight):
     )
 
     # Tell the cluster to create a new OSD for the host
-    replace_osd_string = "osd_replace {},{},{},{},{},{}".format(
-        node, osd_id, old_device, new_device, weight, ext_db_flag
+    replace_osd_string = "osd_replace {},{},{},{},{},{},{}".format(
+        node, osd_id, new_device, old_device, weight, ext_db_ratio, ext_db_size
     )
     zkhandler.write([("base.cmd.ceph", replace_osd_string)])
     # Wait 1/2 second for the cluster to get the message and start working

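Because the command travels through Zookeeper as a comma-joined string, unset options are serialized as the literal "None" and restored on the node side (see the ceph_command hunk further down); a round-trip sketch:

# Round-trip of the osd_replace message: None values become the string "None".
msg = "osd_replace {},{},{},{},{},{},{}".format(
    "node1", "5", "/dev/sdb", None, None, None, None
)
# -> "osd_replace node1,5,/dev/sdb,None,None,None,None"
args = msg.split(" ", 1)[1].split(",")
old_device = None if args[3] == "None" else args[3]  # restored to a real None
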
@@ -23,7 +23,7 @@ import time
 import json
 
 import daemon_lib.common as common
-from daemon_lib.ceph import format_bytes_fromhuman
+from daemon_lib.ceph import format_bytes_fromhuman, get_list_osd
 
 from distutils.util import strtobool
 from re import search, match, sub

@@ -393,7 +393,7 @@ class CephOSDInstance(object):
             raise Exception
 
         # 4d. Get the list of created OSDs on the device (final pass)
-        logger.out(f"(Requerying OSD(s) on disk {device}", state="i")
+        logger.out(f"Requerying OSD(s) on disk {device}", state="i")
         retcode, stdout, stderr = common.run_os_command(
             f"ceph-volume lvm list --format json {device}"
         )

@@ -493,10 +493,11 @@ class CephOSDInstance(object):
         logger,
         node,
         osd_id,
-        old_device,
         new_device,
-        weight,
-        ext_db_flag=False,
+        old_device=None,
+        weight=None,
+        ext_db_ratio=None,
+        ext_db_size=None,
     ):
         # Handle a detect device if that is passed
         if match(r"detect:", new_device):

@@ -514,223 +515,105 @@ class CephOSDInstance(object):
             )
             new_device = ddevice
 
-        # We are ready to create a new OSD on this node
-        logger.out(
-            "Replacing OSD {} disk with block device {}".format(osd_id, new_device),
-            state="i",
-        )
-        try:
-            # Verify the OSD is present
-            retcode, stdout, stderr = common.run_os_command("ceph osd ls")
-            osd_list = stdout.split("\n")
-            if osd_id not in osd_list:
-                logger.out(
-                    "Could not find OSD {} in the cluster".format(osd_id), state="e"
-                )
-                return True
-
-            # 1. Set the OSD down and out so it will flush
-            logger.out("Setting down OSD disk with ID {}".format(osd_id), state="i")
-            retcode, stdout, stderr = common.run_os_command(
-                "ceph osd down {}".format(osd_id)
-            )
-            if retcode:
-                print("ceph osd down")
-                print(stdout)
-                print(stderr)
-                raise Exception
-
-            logger.out("Setting out OSD disk with ID {}".format(osd_id), state="i")
-            retcode, stdout, stderr = common.run_os_command(
-                "ceph osd out {}".format(osd_id)
-            )
-            if retcode:
-                print("ceph osd out")
-                print(stdout)
-                print(stderr)
-                raise Exception
-
-            # 2. Wait for the OSD to be safe to remove (but don't wait for rebalancing to complete)
-            logger.out(f"Waiting for OSD {osd_id} to be safe to remove", state="i")
-            while True:
-                retcode, stdout, stderr = common.run_os_command(
-                    f"ceph osd safe-to-destroy osd.{osd_id}"
-                )
-                if retcode in [0, 11]:
-                    # Code 0 = success
-                    # Code 11 = "Error EAGAIN: OSD(s) 5 have no reported stats, and not all PGs are active+clean; we cannot draw any conclusions." which means all PGs have been remappped but backfill is still occurring
-                    break
-                else:
-                    time.sleep(5)
-
-            # 3. Stop the OSD process
-            logger.out("Stopping OSD disk with ID {}".format(osd_id), state="i")
-            retcode, stdout, stderr = common.run_os_command(
-                "systemctl stop ceph-osd@{}".format(osd_id)
-            )
-            if retcode:
-                print("systemctl stop")
-                print(stdout)
-                print(stderr)
-                raise Exception
-            time.sleep(2)
-
-            # 4. Destroy the OSD
-            logger.out("Destroying OSD with ID {osd_id}", state="i")
-            retcode, stdout, stderr = common.run_os_command(
-                f"ceph osd destroy {osd_id} --yes-i-really-mean-it"
-            )
-            if retcode:
-                print("ceph osd destroy")
-                print(stdout)
-                print(stderr)
-                raise Exception
-
-            # 5. Adjust the weight
-            logger.out(
-                "Adjusting weight of OSD disk with ID {} in CRUSH map".format(osd_id),
-                state="i",
-            )
-            retcode, stdout, stderr = common.run_os_command(
-                "ceph osd crush reweight osd.{osdid} {weight}".format(
-                    osdid=osd_id, weight=weight
-                )
-            )
-            if retcode:
-                print("ceph osd crush reweight")
-                print(stdout)
-                print(stderr)
-                raise Exception
-
-            # 6a. Zap the new disk to ensure it is ready to go
-            logger.out("Zapping disk {}".format(new_device), state="i")
-            retcode, stdout, stderr = common.run_os_command(
-                "ceph-volume lvm zap --destroy {}".format(new_device)
-            )
-            if retcode:
-                print("ceph-volume lvm zap")
-                print(stdout)
-                print(stderr)
-                raise Exception
-
-            dev_flags = "--data {}".format(new_device)
-
-            # 6b. Prepare the logical volume if ext_db_flag
-            if ext_db_flag:
-                db_device = "osd-db/osd-{}".format(osd_id)
-                dev_flags += " --block.db {}".format(db_device)
-            else:
-                db_device = ""
-
-            # 6c. Replace the OSD
-            logger.out(
-                "Preparing LVM for replaced OSD {} disk on {}".format(
-                    osd_id, new_device
-                ),
-                state="i",
-            )
-            retcode, stdout, stderr = common.run_os_command(
-                "ceph-volume lvm prepare --osd-id {osdid} --bluestore {devices}".format(
-                    osdid=osd_id, devices=dev_flags
-                )
-            )
-            if retcode:
-                print("ceph-volume lvm prepare")
-                print(stdout)
-                print(stderr)
-                raise Exception
-
-            # 7a. Get OSD information
-            logger.out(
-                "Getting OSD information for ID {} on {}".format(osd_id, new_device),
-                state="i",
-            )
-            retcode, stdout, stderr = common.run_os_command(
-                "ceph-volume lvm list {device}".format(device=new_device)
-            )
-            for line in stdout.split("\n"):
-                if "block device" in line:
-                    osd_blockdev = line.split()[-1]
-                if "osd fsid" in line:
-                    osd_fsid = line.split()[-1]
-                if "cluster fsid" in line:
-                    osd_clusterfsid = line.split()[-1]
-                if "devices" in line:
-                    osd_device = line.split()[-1]
-
-            if not osd_fsid:
-                print("ceph-volume lvm list")
-                print("Could not find OSD information in data:")
-                print(stdout)
-                print(stderr)
-                raise Exception
-
-            # Split OSD blockdev into VG and LV components
-            # osd_blockdev = /dev/ceph-<uuid>/osd-block-<uuid>
-            _, _, osd_vg, osd_lv = osd_blockdev.split("/")
-
-            # Reset whatever we were given to Ceph's /dev/xdX naming
-            if new_device != osd_device:
-                new_device = osd_device
-
-            # 7b. Activate the OSD
-            logger.out("Activating new OSD disk with ID {}".format(osd_id), state="i")
-            retcode, stdout, stderr = common.run_os_command(
-                "ceph-volume lvm activate --bluestore {osdid} {osdfsid}".format(
-                    osdid=osd_id, osdfsid=osd_fsid
-                )
-            )
-            if retcode:
-                print("ceph-volume lvm activate")
-                print(stdout)
-                print(stderr)
-                raise Exception
-
-            time.sleep(0.5)
-
-            # 8. Verify it started
-            retcode, stdout, stderr = common.run_os_command(
-                "systemctl status ceph-osd@{osdid}".format(osdid=osd_id)
-            )
-            if retcode:
-                print("systemctl status")
-                print(stdout)
-                print(stderr)
-                raise Exception
-
-            # 9. Update Zookeeper information
-            logger.out(
-                "Adding new OSD disk with ID {} to Zookeeper".format(osd_id), state="i"
-            )
-            zkhandler.write(
-                [
-                    (("osd", osd_id), ""),
-                    (("osd.node", osd_id), node),
-                    (("osd.device", osd_id), new_device),
-                    (("osd.db_device", osd_id), db_device),
-                    (("osd.fsid", osd_id), ""),
-                    (("osd.ofsid", osd_id), osd_fsid),
-                    (("osd.cfsid", osd_id), osd_clusterfsid),
-                    (("osd.lvm", osd_id), ""),
-                    (("osd.vg", osd_id), osd_vg),
-                    (("osd.lv", osd_id), osd_lv),
-                    (
-                        ("osd.stats", osd_id),
-                        '{"uuid": "|", "up": 0, "in": 0, "primary_affinity": "|", "utilization": "|", "var": "|", "pgs": "|", "kb": "|", "weight": "|", "reweight": "|", "node": "|", "used": "|", "avail": "|", "wr_ops": "|", "wr_data": "|", "rd_ops": "|", "rd_data": "|", "state": "|"}',
-                    ),
-                ]
-            )
-
-            # Log it
-            logger.out(
-                "Replaced OSD {} disk with device {}".format(osd_id, new_device),
-                state="o",
-            )
-            return True
-        except Exception as e:
-            # Log it
-            logger.out("Failed to replace OSD {} disk: {}".format(osd_id, e), state="e")
-            return False
+        # Phase 1: Try to determine what we can about the old device
+        def find_osds_from_block(device):
+            # Try to query the passed block device directly
+            logger.out(f"Querying for OSD(s) on disk {device}", state="i")
+            retcode, stdout, stderr = common.run_os_command(
+                f"ceph-volume lvm list --format json {device}"
+            )
+            if retcode:
+                found_osds = []
+            else:
+                found_osds = jloads(stdout)
+
+            return found_osds
+
+        real_old_device = None
+        osd_block = zkhandler.read(("osd.device", osd_id))
+
+        # Determine information from a passed old_device
+        if old_device is not None:
+            found_osds = find_osds_from_block(old_device)
+            if found_osds and osd_id in found_osds.keys():
+                real_old_device = old_device
+            else:
+                logger.out(
+                    f"No OSD(s) found on disk {old_device}; falling back to PVC detection",
+                    state="w",
+                )
+
+        # Try to get an old_device from our PVC information
+        if real_old_device is None:
+            found_osds = find_osds_from_block(osd_block)
+
+            if osd_id in found_osds.keys():
+                real_old_device = osd_block
+
+        if real_old_device is None:
+            skip_zap = True
+            logger.out(
+                "No valid old block device found for OSD; skipping zap", state="w"
+            )
+        else:
+            skip_zap = False
+            logger.out(
+                f"Found source OSD(s) on block device {real_old_device}", state="i"
+            )
+
+        # Try to determine if any other OSDs shared a block device with this OSD
+        all_osds_on_block = [
+            o
+            for o in get_list_osd(zkhandler, None)
+            if o["node"] == node and o["device"] == osd_block
+        ]
+
+        # Remove each OSD on the block device
+        for osd in all_osds_on_block:
+            result = CephOSDInstance.remove_osd(
+                zkhandler, logger, osd["id"], force_flag=True, skip_zap_flag=skip_zap
+            )
+
+        # Determine the weight of the OSD(s)
+        if weight is None:
+            weight = all_osds_on_block[0]["stats"]["weight"]
+
+        # Determine how many split OSD(s) to recreate
+        if len(all_osds_on_block) > 1 and all_osds_on_block[0]["is_split"]:
+            split_count = len(all_osds_on_block)
+        else:
+            split_count = None
+
+        # Determine if an ext_db should be readded
+        if ext_db_ratio is not None:
+            osd_db_ratio = ext_db_ratio
+            osd_db_size = None
+        elif ext_db_size is not None:
+            osd_db_ratio = None
+            osd_db_size = ext_db_size
+        elif all_osds_on_block[0]["db_device"]:
+            _, osd_db_size_bytes, _ = common.run_os_command(
+                f"blockdev --getsize64 {all_osds_on_block[0]['db_device']}"
+            )
+            osd_db_ratio = None
+            osd_db_size = f"{osd_db_size_bytes}B"
+        else:
+            osd_db_ratio = None
+            osd_db_size = None
+
+        # Create [a] new OSD[s], on the new block device
+        result = CephOSDInstance.add_osd(
+            zkhandler,
+            logger,
+            node,
+            new_device,
+            weight,
+            ext_db_ratio=osd_db_ratio,
+            ext_db_size=osd_db_size,
+            split_count=split_count,
+        )
+
+        return result
 
     @staticmethod
     def refresh_osd(zkhandler, logger, node, osd_id, device, ext_db_flag):

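The find_osds_from_block() helper relies on ceph-volume's JSON output being a mapping keyed by OSD ID, which is what makes the osd_id membership tests above work; a sketch with an assumed payload shape:

# Sketch of the lookup the helper enables; the payload shape is an assumption
# inferred from the membership tests in the new code above.
from json import loads as jloads

stdout = '{"5": [{"type": "block", "devices": ["/dev/sdb"]}]}'  # example output
found_osds = jloads(stdout)
print("5" in found_osds.keys())  # True -> this device hosts OSD 5
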
@@ -863,7 +746,7 @@ class CephOSDInstance(object):
         return False
 
     @staticmethod
-    def remove_osd(zkhandler, logger, osd_id, osd_obj, force_flag):
+    def remove_osd(zkhandler, logger, osd_id, force_flag=False, skip_zap_flag=False):
         logger.out("Removing OSD {}".format(osd_id), state="i")
         try:
             # Verify the OSD is present

@@ -931,6 +814,7 @@ class CephOSDInstance(object):
                 raise Exception
             time.sleep(2)
 
+            if not skip_zap_flag:
                 # 4. Determine the block devices
                 osd_vg = zkhandler.read(("osd.vg", osd_id))
                 osd_lv = zkhandler.read(("osd.lv", osd_id))

@@ -976,11 +860,13 @@ class CephOSDInstance(object):
             else:
                 raise Exception
 
+            if not skip_zap_flag:
                 # 7. Remove the DB device
                 if zkhandler.exists(("osd.db_device", osd_id)):
                     db_device = zkhandler.read(("osd.db_device", osd_id))
                     logger.out(
-                        'Removing OSD DB logical volume "{}"'.format(db_device), state="i"
+                        'Removing OSD DB logical volume "{}"'.format(db_device),
+                        state="i",
                     )
                     retcode, stdout, stderr = common.run_os_command(
                         "lvremove --yes --force {}".format(db_device)

@@ -1307,8 +1193,19 @@ def ceph_command(zkhandler, logger, this_node, data, d_osd):
 
     # Replacing an OSD
     if command == "osd_replace":
-        node, osd_id, old_device, new_device, weight, ext_db_flag = args.split(",")
-        ext_db_flag = bool(strtobool(ext_db_flag))
+        (
+            node,
+            osd_id,
+            new_device,
+            old_device,
+            weight,
+            ext_db_ratio,
+            ext_db_size,
+        ) = args.split(",")
+        old_device = None if old_device == "None" else old_device
+        weight = None if weight == "None" else weight
+        ext_db_ratio = None if ext_db_ratio == "None" else ext_db_ratio
+        ext_db_size = None if ext_db_size == "None" else ext_db_size
         if node == this_node.name:
             # Lock the command queue
             zk_lock = zkhandler.writelock("base.cmd.ceph")

@@ -1319,10 +1216,11 @@ def ceph_command(zkhandler, logger, this_node, data, d_osd):
                     logger,
                     node,
                     osd_id,
-                    old_device,
                     new_device,
+                    old_device,
                     weight,
-                    ext_db_flag,
+                    ext_db_ratio,
+                    ext_db_size,
                 )
                 # Command succeeded
                 if result:

@@ -1373,7 +1271,7 @@ def ceph_command(zkhandler, logger, this_node, data, d_osd):
             with zk_lock:
                 # Remove the OSD
                 result = CephOSDInstance.remove_osd(
-                    zkhandler, logger, osd_id, d_osd[osd_id], force_flag
+                    zkhandler, logger, osd_id, force_flag
                 )
                 # Command succeeded
                 if result: