Update OSD replacement functionality

1. Simplify the replace operation by leveraging the existing
remove_osd/add_osd functions, since its task was functionally identical
to running those two in sequence.
2. Add support for split OSDs within the command (replacing all OSDs on
the block device(s) as required).
3. Add additional configurability and flexibility around the old device,
weight, and external DB LVs.
This commit is contained in:
Joshua Boniface 2023-11-03 01:45:49 -04:00
parent 3cb8a70f04
commit 64e37ae963
6 changed files with 279 additions and 279 deletions

View File

@ -4379,14 +4379,25 @@ class API_Storage_Ceph_OSD_Element(Resource):
@RequestParser( @RequestParser(
[ [
{ {
"name": "device", "name": "new_device",
"required": True, "required": True,
"helptext": "A valid device or detect string must be specified.", "helptext": "A valid device or detect string must be specified.",
}, },
{
"name": "old_device",
"required": False,
},
{ {
"name": "weight", "name": "weight",
"required": True, "required": False,
"helptext": "An OSD weight must be specified.", },
{
"name": "ext_db_ratio",
"required": False,
},
{
"name": "ext_db_size",
"required": False,
}, },
{ {
"name": "yes-i-really-mean-it", "name": "yes-i-really-mean-it",
@ -4405,15 +4416,30 @@ class API_Storage_Ceph_OSD_Element(Resource):
- storage / ceph - storage / ceph
parameters: parameters:
- in: query - in: query
name: device name: new_device
type: string type: string
required: true required: true
description: The block device (e.g. "/dev/sdb", "/dev/disk/by-path/...", etc.) or detect string ("detect:NAME:SIZE:ID") to replace the OSD onto description: The block device (e.g. "/dev/sdb", "/dev/disk/by-path/...", etc.) or detect string ("detect:NAME:SIZE:ID") to replace the OSD onto
- in: query
name: old_device
type: string
required: false
description: The block device (e.g. "/dev/sdb", "/dev/disk/by-path/...", etc.) or detect string ("detect:NAME:SIZE:ID") of the original OSD
- in: query - in: query
name: weight name: weight
type: number type: number
required: true required: false
description: The Ceph CRUSH weight for the replaced OSD description: The Ceph CRUSH weight for the replacement OSD
- in: query
name: ext_db_ratio
type: number
required: false
description: If set, creates an OSD DB LV for the replacement OSD with this decimal ratio of DB to total OSD size (usually 0.05 i.e. 5%); if unset, use existing ext_db_size
- in: query
name: ext_db_size
type: string
required: false
description: If set, creates an OSD DB LV for the replacement OSD with this explicit size in human units (e.g. 1024M, 20G); if unset, use existing ext_db_size
responses: responses:
200: 200:
description: OK description: OK
@ -4428,8 +4454,11 @@ class API_Storage_Ceph_OSD_Element(Resource):
""" """
return api_helper.ceph_osd_replace( return api_helper.ceph_osd_replace(
osdid, osdid,
reqargs.get("device", None), reqargs.get("new_device"),
reqargs.get("old_device", None),
reqargs.get("weight", None), reqargs.get("weight", None),
reqargs.get("ext_db_ratio", None),
reqargs.get("ext_db_size", None),
) )
@RequestParser( @RequestParser(

View File

@ -1398,11 +1398,21 @@ def ceph_osd_add(
@ZKConnection(config) @ZKConnection(config)
def ceph_osd_replace(zkhandler, osd_id, device, weight): def ceph_osd_replace(
zkhandler,
osd_id,
new_device,
old_device=None,
weight=None,
ext_db_ratio=None,
ext_db_size=None,
):
""" """
Replace a Ceph OSD in the PVC Ceph storage cluster. Replace a Ceph OSD in the PVC Ceph storage cluster.
""" """
retflag, retdata = pvc_ceph.replace_osd(zkhandler, osd_id, device, weight) retflag, retdata = pvc_ceph.replace_osd(
zkhandler, osd_id, new_device, old_device, weight, ext_db_ratio, ext_db_size
)
if retflag: if retflag:
retcode = 200 retcode = 200

View File

@ -3428,13 +3428,15 @@ def cli_storage_osd_add(node, device, weight, ext_db_ratio, ext_db_size, osd_cou
The weight of an OSD should reflect the ratio of the size of the OSD to the other OSDs in the storage cluster. For example, with a 200GB disk and a 400GB disk in each node, the 400GB disk should have twice the weight as the 200GB disk. For more information about CRUSH weights, please see the Ceph documentation. The weight of an OSD should reflect the ratio of the size of the OSD to the other OSDs in the storage cluster. For example, with a 200GB disk and a 400GB disk in each node, the 400GB disk should have twice the weight as the 200GB disk. For more information about CRUSH weights, please see the Ceph documentation.
The "-r"/"--ext-db-ratio" or "-s"/"--ext-db-size" options, if specified, and if a OSD DB VG exists on the node (see "pvc storage osd create-db-vg"), will instruct the OSD to locate its RocksDB database and WAL on a new logical volume on that OSD DB VG. If "-r"/"--ext-db-ratio" is specified, the sizing of this DB LV will be the given ratio (specified as a decimal percentage e.g. 0.05 for 5%) of the size of the OSD (e.g. 0.05 on a 1TB SSD will create a 50GB LV). If "-s"/"--ext-db-size" is specified, the sizing of this DB LV will be the given human-unit size (e.g. 1024M, 20GB, etc.). The "-r"/"--ext-db-ratio" or "-s"/"--ext-db-size" options, if specified, and if an OSD DB VG exists on the node (see "pvc storage osd create-db-vg"), will instruct the OSD to locate its RocksDB database and WAL on a new logical volume on that OSD DB VG. If "-r"/"--ext-db-ratio" is specified, the sizing of this DB LV will be the given ratio (specified as a decimal percentage e.g. 0.05 for 5%) of the size of the OSD (e.g. 0.05 on a 1TB SSD will create a 50GB LV). If "-s"/"--ext-db-size" is specified, the sizing of this DB LV will be the given human-unit size (e.g. 1024M, 20GB, etc.).
An external DB is only recommended for relatively slow OSD devices (i.e. SATA SSDs) when there is also a smaller, faster (i.e. NVMe or 3DXPoint SSD) device in the node. For NVMe OSDs, an external DB is not required nor recommended for optimal performance. An "--ext-db-ratio" of 0.05 (5%) is recommended for most workloads and OSD sizes; the Ceph documentation recommends a minimum of 0.02 (2%), and higher values may improve performance under write-heavy workloads with fewer OSDs per node. The explicit size option is also permitted to allow more fine-grained sizing, allowing the administrator to pre-calculate the desired size rather than relying on a ratio. An external DB is only recommended for relatively slow OSD devices (i.e. SATA SSDs) when there is also a smaller, faster (i.e. NVMe or 3DXPoint SSD) device in the node. For NVMe OSDs, an external DB is not required nor recommended for optimal performance. An "--ext-db-ratio" of 0.05 (5%) is recommended for most workloads and OSD sizes; the Ceph documentation recommends a minimum of 0.02 (2%), and higher values may improve performance under write-heavy workloads with fewer OSDs per node. The explicit size option is also permitted to allow more fine-grained sizing, allowing the administrator to pre-calculate the desired size rather than relying on a ratio.
The "-c"/"--osd-count" option allows the splitting of a single block device into multiple logical OSDs. This is recommended in the Ceph literature for extremely fast OSD block devices (i.e. NVMe or 3DXPoint) which can saturate a single OSD process. Usually, 2 or 4 OSDs is recommended, based on the size and performance of the OSD disk; more than 4 OSDs per volume is not recommended, and this option is not recommended for SATA SSDs. The "-c"/"--osd-count" option allows the splitting of a single block device into multiple logical OSDs. This is recommended in the Ceph literature for extremely fast OSD block devices (i.e. NVMe or 3DXPoint) which can saturate a single OSD process. Usually, 2 or 4 OSDs is recommended, based on the size and performance of the OSD disk; more than 4 OSDs per volume is not recommended, and this option is not recommended for SATA SSDs.
Note that, if "-c"/"--osd-count" is specified, the provided "-w"/"--weight" will be the weight of EACH created OSD, not the block device as a whole. Ensure you take this into account if mixing and matching OSD block devices. Additionally, if "-r"/"--ext-db-ratio" or "-s"/"--ext-db-size" is specified, one DB LV will be created for EACH created OSD, of the given ratio/size per OSD; ratios are calculated from the OSD size, not the underlying device. Note that, if "-c"/"--osd-count" is specified, the provided "-w"/"--weight" will be the weight of EACH created OSD, not the block device as a whole. Ensure you take this into account if mixing and matching OSD block devices. Additionally, if "-r"/"--ext-db-ratio" or "-s"/"--ext-db-size" is specified, one DB LV will be created for EACH created OSD, of the given ratio/size per OSD; ratios are calculated from the OSD size, not the underlying device.
NOTE: This command may take a long time to complete. Observe the node logs of the hosting OSD node for detailed status.
""" """
echo( echo(
@ -3461,25 +3463,57 @@ def cli_storage_osd_add(node, device, weight, ext_db_ratio, ext_db_size, osd_cou
@click.command(name="replace", short_help="Replace OSD block device.") @click.command(name="replace", short_help="Replace OSD block device.")
@connection_req @connection_req
@click.argument("osdid") @click.argument("osdid")
@click.argument("device") @click.argument("new_device")
@click.option(
"-o",
"--old-device",
"old_device",
default=None,
help="The old OSD block device, if known and valid",
)
@click.option( @click.option(
"-w", "-w",
"--weight", "--weight",
"weight", "weight",
default=1.0, default=None,
show_default=True, help="New weight of the OSD(s) within the CRUSH map; if unset, old weight is used",
help="New weight of the OSD within the CRUSH map.",
) )
@confirm_opt("Replace OSD {osdid} with block device {device} weight {weight}") @click.option(
def cli_storage_osd_replace(osdid, device, weight): "-r",
"--ext-db-ratio",
"ext_db_ratio",
default=None,
help="Create a new external database logical volume for the OSD(s) with this decimal ratio of the DB LV to the OSD size; if unset, old ext_db_size is used",
)
@click.option(
"-s",
"--ext-db-size",
"ext_db_size",
default=None,
help="Create a new external database logical volume for the OSD(s) with this human-unit size; if unset, old ext_db_size is used",
)
@confirm_opt(
"Destroy all data on and replace OSD {osdid} (and peer split OSDs) with new device {new_device}"
)
def cli_storage_osd_replace(
osdid, new_device, old_device, weight, ext_db_ratio, ext_db_size
):
""" """
Replace the block device of an existing OSD with ID OSDID with DEVICE. Use this command to replace a failed or smaller OSD block device with a new one. Replace the block device of an existing OSD with ID OSDID, and any peer split OSDs with the same block device, with NEW_DEVICE. Use this command to replace a failed or smaller OSD block device with a new one in one command.
DEVICE must be a valid block device path (e.g. '/dev/sda', '/dev/nvme0n1', '/dev/disk/by-path/...', '/dev/disk/by-id/...') or a "detect" string. Using partitions is not supported. A "detect" string is a string in the form "detect:<NAME>:<HUMAN-SIZE>:<ID>". For details, see 'pvc storage osd add --help'. NEW_DEVICE must be a valid block device path (e.g. '/dev/nvme0n1', '/dev/disk/by-path/...') or a "detect" string. Partitions are NOT supported. A "detect" string is a string in the form "detect:<NAME>:<HUMAN-SIZE>:<ID>". For details, see 'pvc storage osd add --help'. The path or detect string must be valid on the current node housing the OSD.
The weight of an OSD should reflect the ratio of the OSD to other OSDs in the storage cluster. For details, see 'pvc storage osd add --help'. Note that the current weight must be explicitly specified if it differs from the default. If OSDID is part of a split OSD set, any peer split OSDs with the same configured block device will be replaced as well. The split count will be retained and cannot be changed with this command; to do so, all OSDs in the split OSD set must be removed and new OSD(s) created.
Existing IDs, external DB devices, etc. of the OSD will be preserved; data will be lost and rebuilt from the remaining healthy OSDs. WARNING: This operation entails (and is functionally equivalent to) a removal and recreation of the specified OSD and, if applicable, all peer split OSDs. This is an intensive and potentially destructive action. Ensure that the cluster is otherwise healthy before proceeding, and ensure the subsequent rebuild completes successfully. Do not attempt this operation on a severely degraded cluster without first considering the possible data loss implications.
If the "-o"/"--old-device" option is specified, is a valid block device on the node, is readable/accessible, and contains the metadata for the specified OSD, it will be zapped. If this option is not specified, the system will try to find the old block device automatically to zap it. If it can't be found, the OSD will simply be removed from the CRUSH map and PVC database before recreating. This option can provide a cleaner deletion when replacing a working device that has a different block path, but is otherwise unnecessary.
The "-w"/"--weight", "-r"/"--ext-db-ratio", and "-s"/"--ext-db-size" options allow overriding the existing weight and external DB LV for the OSD(s), if desired. If unset, the existing weight and external DB LV size (if applicable) will be used for the replacement OSD(s) instead.
NOTE: If neither the "-r"/"--ext-db-ratio" nor the "-s"/"--ext-db-size" option is specified, and the OSD(s) had an external DB LV, it cannot be removed; a new DB LV will be created for the replacement OSD(s), and this cannot be avoided. However, if the OSD(s) did not have an external DB LV, and one of these options is specified, a new DB LV will be added to the new OSD.
NOTE: This command may take a long time to complete. Observe the node logs of the hosting OSD node for detailed status.
""" """
echo( echo(
@ -3488,7 +3522,7 @@ def cli_storage_osd_replace(osdid, device, weight):
newline=False, newline=False,
) )
retcode, retmsg = pvc.lib.storage.ceph_osd_replace( retcode, retmsg = pvc.lib.storage.ceph_osd_replace(
CLI_CONFIG, osdid, device, weight CLI_CONFIG, osdid, new_device, old_device, weight, ext_db_ratio, ext_db_size
) )
echo(CLI_CONFIG, "done.") echo(CLI_CONFIG, "done.")
finish(retcode, retmsg) finish(retcode, retmsg)
@ -3510,7 +3544,9 @@ def cli_storage_osd_refresh(osdid, device):
Existing data, IDs, weights, etc. of the OSD will be preserved. Existing data, IDs, weights, etc. of the OSD will be preserved.
NOTE: If a device had an external DB device, this is not automatically handled at this time. It is best to remove and re-add the OSD instead. WARNING: If a device had an external DB device, this is not automatically handled at this time. It is best to remove and re-add the OSD (e.g. with "pvc storage osd replace") instead.
NOTE: This command may take a long time to complete. Observe the node logs of the hosting OSD node for detailed status.
""" """
echo( echo(
@ -3545,6 +3581,8 @@ def cli_storage_osd_remove(osdid, force_flag):
DANGER: This will completely remove the OSD from the cluster. OSDs will rebalance which will negatively affect performance and available space. It is STRONGLY RECOMMENDED to set an OSD out (using 'pvc storage osd out') and allow the cluster to fully rebalance, verified with 'pvc storage status', before removing an OSD. DANGER: This will completely remove the OSD from the cluster. OSDs will rebalance which will negatively affect performance and available space. It is STRONGLY RECOMMENDED to set an OSD out (using 'pvc storage osd out') and allow the cluster to fully rebalance, verified with 'pvc storage status', before removing an OSD.
NOTE: The "-f"/"--force" option is useful after replacing a failed node, to ensure the OSD is removed even if the OSD in question does not properly exist on the node after a rebuild. NOTE: The "-f"/"--force" option is useful after replacing a failed node, to ensure the OSD is removed even if the OSD in question does not properly exist on the node after a rebuild.
NOTE: This command may take a long time to complete. Observe the node logs of the hosting OSD node for detailed status.
""" """
echo( echo(

View File

@ -262,15 +262,30 @@ def ceph_osd_add(config, node, device, weight, ext_db_ratio, ext_db_size, osd_co
return retstatus, response.json().get("message", "") return retstatus, response.json().get("message", "")
def ceph_osd_replace(config, osdid, device, weight): def ceph_osd_replace(
config, osdid, new_device, old_device, weight, ext_db_ratio, ext_db_size
):
""" """
Replace an existing Ceph OSD with a new device Replace an existing Ceph OSD with a new device
API endpoint: POST /api/v1/storage/ceph/osd/{osdid} API endpoint: POST /api/v1/storage/ceph/osd/{osdid}
API arguments: device={device}, weight={weight} API arguments: new_device={new_device}, [old_device={old_device}, weight={weight}, ext_db_ratio={ext_db_ratio}, ext_db_size={ext_db_size}]
API schema: {"message":"{data}"} API schema: {"message":"{data}"}
""" """
params = {"device": device, "weight": weight, "yes-i-really-mean-it": "yes"} params = {
"new_device": new_device,
"yes-i-really-mean-it": "yes",
}
if old_device is not None:
params["old_device"] = old_device
if weight is not None:
params["weight"] = weight
if ext_db_ratio is not None:
params["ext_db_ratio"] = ext_db_ratio
if ext_db_size is not None:
params["ext_db_size"] = ext_db_size
response = call_api(config, "post", f"/storage/ceph/osd/{osdid}", params=params) response = call_api(config, "post", f"/storage/ceph/osd/{osdid}", params=params)
if response.status_code == 200: if response.status_code == 200:
@ -438,6 +453,9 @@ def format_list_osd(config, osd_list):
) )
continue continue
if osd_information["is_split"]:
osd_information["device"] = f"{osd_information['device']} *s"
# Deal with the size to human readable # Deal with the size to human readable
osd_information["stats"]["size"] = osd_information["stats"]["kb"] * 1024 osd_information["stats"]["size"] = osd_information["stats"]["kb"] * 1024
for datatype in "size", "wr_data", "rd_data": for datatype in "size", "wr_data", "rd_data":

View File

@ -26,6 +26,7 @@ import time
import math import math
from concurrent.futures import ThreadPoolExecutor from concurrent.futures import ThreadPoolExecutor
from distutils.util import strtobool
import daemon_lib.vm as vm import daemon_lib.vm as vm
import daemon_lib.common as common import daemon_lib.common as common
@ -211,7 +212,7 @@ def getOSDInformation(zkhandler, osd_id):
# Get the devices # Get the devices
osd_node = zkhandler.read(("osd.node", osd_id)) osd_node = zkhandler.read(("osd.node", osd_id))
osd_device = zkhandler.read(("osd.device", osd_id)) osd_device = zkhandler.read(("osd.device", osd_id))
osd_is_split = zkhandler.read(("osd.is_split", osd_id)) osd_is_split = bool(strtobool(zkhandler.read(("osd.is_split", osd_id))))
osd_db_device = zkhandler.read(("osd.db_device", osd_id)) osd_db_device = zkhandler.read(("osd.db_device", osd_id))
# Parse the stats data # Parse the stats data
osd_stats_raw = zkhandler.read(("osd.stats", osd_id)) osd_stats_raw = zkhandler.read(("osd.stats", osd_id))
@ -329,12 +330,18 @@ def add_osd(
return success, message return success, message
def replace_osd(zkhandler, osd_id, new_device, weight): def replace_osd(
zkhandler,
osd_id,
new_device,
old_device=None,
weight=None,
ext_db_ratio=None,
ext_db_size=None,
):
# Get current OSD information # Get current OSD information
osd_information = getOSDInformation(zkhandler, osd_id) osd_information = getOSDInformation(zkhandler, osd_id)
node = osd_information["node"] node = osd_information["node"]
old_device = osd_information["device"]
ext_db_flag = True if osd_information["db_device"] else False
# Verify target block device isn't in use # Verify target block device isn't in use
block_osd = verifyOSDBlock(zkhandler, node, new_device) block_osd = verifyOSDBlock(zkhandler, node, new_device)
@ -347,8 +354,8 @@ def replace_osd(zkhandler, osd_id, new_device, weight):
) )
# Tell the cluster to create a new OSD for the host # Tell the cluster to create a new OSD for the host
replace_osd_string = "osd_replace {},{},{},{},{},{}".format( replace_osd_string = "osd_replace {},{},{},{},{},{},{}".format(
node, osd_id, old_device, new_device, weight, ext_db_flag node, osd_id, new_device, old_device, weight, ext_db_ratio, ext_db_size
) )
zkhandler.write([("base.cmd.ceph", replace_osd_string)]) zkhandler.write([("base.cmd.ceph", replace_osd_string)])
# Wait 1/2 second for the cluster to get the message and start working # Wait 1/2 second for the cluster to get the message and start working

View File

@ -23,7 +23,7 @@ import time
import json import json
import daemon_lib.common as common import daemon_lib.common as common
from daemon_lib.ceph import format_bytes_fromhuman from daemon_lib.ceph import format_bytes_fromhuman, get_list_osd
from distutils.util import strtobool from distutils.util import strtobool
from re import search, match, sub from re import search, match, sub
@ -393,7 +393,7 @@ class CephOSDInstance(object):
raise Exception raise Exception
# 4d. Get the list of created OSDs on the device (final pass) # 4d. Get the list of created OSDs on the device (final pass)
logger.out(f"(Requerying OSD(s) on disk {device}", state="i") logger.out(f"Requerying OSD(s) on disk {device}", state="i")
retcode, stdout, stderr = common.run_os_command( retcode, stdout, stderr = common.run_os_command(
f"ceph-volume lvm list --format json {device}" f"ceph-volume lvm list --format json {device}"
) )
@ -493,10 +493,11 @@ class CephOSDInstance(object):
logger, logger,
node, node,
osd_id, osd_id,
old_device,
new_device, new_device,
weight, old_device=None,
ext_db_flag=False, weight=None,
ext_db_ratio=None,
ext_db_size=None,
): ):
# Handle a detect device if that is passed # Handle a detect device if that is passed
if match(r"detect:", new_device): if match(r"detect:", new_device):
@ -514,223 +515,105 @@ class CephOSDInstance(object):
) )
new_device = ddevice new_device = ddevice
# We are ready to create a new OSD on this node # Phase 1: Try to determine what we can about the old device
logger.out( def find_osds_from_block(device):
"Replacing OSD {} disk with block device {}".format(osd_id, new_device), # Try to query the passed block device directly
state="i", logger.out(f"Querying for OSD(s) on disk {device}", state="i")
)
try:
# Verify the OSD is present
retcode, stdout, stderr = common.run_os_command("ceph osd ls")
osd_list = stdout.split("\n")
if osd_id not in osd_list:
logger.out(
"Could not find OSD {} in the cluster".format(osd_id), state="e"
)
return True
# 1. Set the OSD down and out so it will flush
logger.out("Setting down OSD disk with ID {}".format(osd_id), state="i")
retcode, stdout, stderr = common.run_os_command( retcode, stdout, stderr = common.run_os_command(
"ceph osd down {}".format(osd_id) f"ceph-volume lvm list --format json {device}"
) )
if retcode: if retcode:
print("ceph osd down") found_osds = []
print(stdout)
print(stderr)
raise Exception
logger.out("Setting out OSD disk with ID {}".format(osd_id), state="i")
retcode, stdout, stderr = common.run_os_command(
"ceph osd out {}".format(osd_id)
)
if retcode:
print("ceph osd out")
print(stdout)
print(stderr)
raise Exception
# 2. Wait for the OSD to be safe to remove (but don't wait for rebalancing to complete)
logger.out(f"Waiting for OSD {osd_id} to be safe to remove", state="i")
while True:
retcode, stdout, stderr = common.run_os_command(
f"ceph osd safe-to-destroy osd.{osd_id}"
)
if retcode in [0, 11]:
# Code 0 = success
# Code 11 = "Error EAGAIN: OSD(s) 5 have no reported stats, and not all PGs are active+clean; we cannot draw any conclusions." which means all PGs have been remappped but backfill is still occurring
break
else: else:
time.sleep(5) found_osds = jloads(stdout)
# 3. Stop the OSD process return found_osds
logger.out("Stopping OSD disk with ID {}".format(osd_id), state="i")
retcode, stdout, stderr = common.run_os_command(
"systemctl stop ceph-osd@{}".format(osd_id)
)
if retcode:
print("systemctl stop")
print(stdout)
print(stderr)
raise Exception
time.sleep(2)
# 4. Destroy the OSD real_old_device = None
logger.out("Destroying OSD with ID {osd_id}", state="i") osd_block = zkhandler.read(("osd.device", osd_id))
retcode, stdout, stderr = common.run_os_command(
f"ceph osd destroy {osd_id} --yes-i-really-mean-it"
)
if retcode:
print("ceph osd destroy")
print(stdout)
print(stderr)
raise Exception
# 5. Adjust the weight # Determine information from a passed old_device
logger.out( if old_device is not None:
"Adjusting weight of OSD disk with ID {} in CRUSH map".format(osd_id), found_osds = find_osds_from_block(old_device)
state="i", if found_osds and osd_id in found_osds.keys():
) real_old_device = old_device
retcode, stdout, stderr = common.run_os_command(
"ceph osd crush reweight osd.{osdid} {weight}".format(
osdid=osd_id, weight=weight
)
)
if retcode:
print("ceph osd crush reweight")
print(stdout)
print(stderr)
raise Exception
# 6a. Zap the new disk to ensure it is ready to go
logger.out("Zapping disk {}".format(new_device), state="i")
retcode, stdout, stderr = common.run_os_command(
"ceph-volume lvm zap --destroy {}".format(new_device)
)
if retcode:
print("ceph-volume lvm zap")
print(stdout)
print(stderr)
raise Exception
dev_flags = "--data {}".format(new_device)
# 6b. Prepare the logical volume if ext_db_flag
if ext_db_flag:
db_device = "osd-db/osd-{}".format(osd_id)
dev_flags += " --block.db {}".format(db_device)
else: else:
db_device = ""
# 6c. Replace the OSD
logger.out( logger.out(
"Preparing LVM for replaced OSD {} disk on {}".format( f"No OSD(s) found on disk {old_device}; falling back to PVC detection",
osd_id, new_device state="w",
),
state="i",
) )
retcode, stdout, stderr = common.run_os_command(
"ceph-volume lvm prepare --osd-id {osdid} --bluestore {devices}".format(
osdid=osd_id, devices=dev_flags
)
)
if retcode:
print("ceph-volume lvm prepare")
print(stdout)
print(stderr)
raise Exception
# 7a. Get OSD information # Try to get an old_device from our PVC information
if real_old_device is None:
found_osds = find_osds_from_block(osd_block)
if osd_id in found_osds.keys():
real_old_device = osd_block
if real_old_device is None:
skip_zap = True
logger.out( logger.out(
"Getting OSD information for ID {} on {}".format(osd_id, new_device), "No valid old block device found for OSD; skipping zap", state="w"
state="i",
) )
retcode, stdout, stderr = common.run_os_command( else:
"ceph-volume lvm list {device}".format(device=new_device) skip_zap = False
)
for line in stdout.split("\n"):
if "block device" in line:
osd_blockdev = line.split()[-1]
if "osd fsid" in line:
osd_fsid = line.split()[-1]
if "cluster fsid" in line:
osd_clusterfsid = line.split()[-1]
if "devices" in line:
osd_device = line.split()[-1]
if not osd_fsid:
print("ceph-volume lvm list")
print("Could not find OSD information in data:")
print(stdout)
print(stderr)
raise Exception
# Split OSD blockdev into VG and LV components
# osd_blockdev = /dev/ceph-<uuid>/osd-block-<uuid>
_, _, osd_vg, osd_lv = osd_blockdev.split("/")
# Reset whatever we were given to Ceph's /dev/xdX naming
if new_device != osd_device:
new_device = osd_device
# 7b. Activate the OSD
logger.out("Activating new OSD disk with ID {}".format(osd_id), state="i")
retcode, stdout, stderr = common.run_os_command(
"ceph-volume lvm activate --bluestore {osdid} {osdfsid}".format(
osdid=osd_id, osdfsid=osd_fsid
)
)
if retcode:
print("ceph-volume lvm activate")
print(stdout)
print(stderr)
raise Exception
time.sleep(0.5)
# 8. Verify it started
retcode, stdout, stderr = common.run_os_command(
"systemctl status ceph-osd@{osdid}".format(osdid=osd_id)
)
if retcode:
print("systemctl status")
print(stdout)
print(stderr)
raise Exception
# 9. Update Zookeeper information
logger.out( logger.out(
"Adding new OSD disk with ID {} to Zookeeper".format(osd_id), state="i" f"Found source OSD(s) on block device {real_old_device}", state="i"
) )
zkhandler.write(
[ # Try to determine if any other OSDs shared a block device with this OSD
(("osd", osd_id), ""), all_osds_on_block = [
(("osd.node", osd_id), node), o
(("osd.device", osd_id), new_device), for o in get_list_osd(zkhandler, None)
(("osd.db_device", osd_id), db_device), if o["node"] == node and o["device"] == osd_block
(("osd.fsid", osd_id), ""),
(("osd.ofsid", osd_id), osd_fsid),
(("osd.cfsid", osd_id), osd_clusterfsid),
(("osd.lvm", osd_id), ""),
(("osd.vg", osd_id), osd_vg),
(("osd.lv", osd_id), osd_lv),
(
("osd.stats", osd_id),
'{"uuid": "|", "up": 0, "in": 0, "primary_affinity": "|", "utilization": "|", "var": "|", "pgs": "|", "kb": "|", "weight": "|", "reweight": "|", "node": "|", "used": "|", "avail": "|", "wr_ops": "|", "wr_data": "|", "rd_ops": "|", "rd_data": "|", "state": "|"}',
),
] ]
# Remove each OSD on the block device
for osd in all_osds_on_block:
result = CephOSDInstance.remove_osd(
zkhandler, logger, osd["id"], force_flag=True, skip_zap_flag=skip_zap
) )
# Log it # Determine the weight of the OSD(s)
logger.out( if weight is None:
"Replaced OSD {} disk with device {}".format(osd_id, new_device), weight = all_osds_on_block[0]["stats"]["weight"]
state="o",
# Determine how many split OSD(s) to recreate
if len(all_osds_on_block) > 1 and all_osds_on_block[0]["is_split"]:
split_count = len(all_osds_on_block)
else:
split_count = None
# Determine if an ext_db should be readded
if ext_db_ratio is not None:
osd_db_ratio = ext_db_ratio
osd_db_size = None
elif ext_db_size is not None:
osd_db_ratio = None
osd_db_size = ext_db_size
elif all_osds_on_block[0]["db_device"]:
_, osd_db_size_bytes, _ = common.run_os_command(
f"blockdev --getsize64 {all_osds_on_block[0]['db_device']}"
) )
return True osd_db_ratio = None
except Exception as e: osd_db_size = f"{osd_db_size}B"
# Log it else:
logger.out("Failed to replace OSD {} disk: {}".format(osd_id, e), state="e") osd_db_ratio = None
return False osd_db_size = None
# Create [a] new OSD[s], on the new block device
result = CephOSDInstance.add_osd(
zkhandler,
logger,
node,
new_device,
weight,
ext_db_ratio=osd_db_ratio,
ext_db_size=osd_db_size,
split_count=split_count,
)
return result
@staticmethod @staticmethod
def refresh_osd(zkhandler, logger, node, osd_id, device, ext_db_flag): def refresh_osd(zkhandler, logger, node, osd_id, device, ext_db_flag):
@ -863,7 +746,7 @@ class CephOSDInstance(object):
return False return False
@staticmethod @staticmethod
def remove_osd(zkhandler, logger, osd_id, osd_obj, force_flag): def remove_osd(zkhandler, logger, osd_id, force_flag=False, skip_zap_flag=False):
logger.out("Removing OSD {}".format(osd_id), state="i") logger.out("Removing OSD {}".format(osd_id), state="i")
try: try:
# Verify the OSD is present # Verify the OSD is present
@ -931,6 +814,7 @@ class CephOSDInstance(object):
raise Exception raise Exception
time.sleep(2) time.sleep(2)
if not skip_zap_flag:
# 4. Determine the block devices # 4. Determine the block devices
osd_vg = zkhandler.read(("osd.vg", osd_id)) osd_vg = zkhandler.read(("osd.vg", osd_id))
osd_lv = zkhandler.read(("osd.lv", osd_id)) osd_lv = zkhandler.read(("osd.lv", osd_id))
@ -976,11 +860,13 @@ class CephOSDInstance(object):
else: else:
raise Exception raise Exception
if not skip_zap_flag:
# 7. Remove the DB device # 7. Remove the DB device
if zkhandler.exists(("osd.db_device", osd_id)): if zkhandler.exists(("osd.db_device", osd_id)):
db_device = zkhandler.read(("osd.db_device", osd_id)) db_device = zkhandler.read(("osd.db_device", osd_id))
logger.out( logger.out(
'Removing OSD DB logical volume "{}"'.format(db_device), state="i" 'Removing OSD DB logical volume "{}"'.format(db_device),
state="i",
) )
retcode, stdout, stderr = common.run_os_command( retcode, stdout, stderr = common.run_os_command(
"lvremove --yes --force {}".format(db_device) "lvremove --yes --force {}".format(db_device)
@ -1307,8 +1193,19 @@ def ceph_command(zkhandler, logger, this_node, data, d_osd):
# Replacing an OSD # Replacing an OSD
if command == "osd_replace": if command == "osd_replace":
node, osd_id, old_device, new_device, weight, ext_db_flag = args.split(",") (
ext_db_flag = bool(strtobool(ext_db_flag)) node,
osd_id,
new_device,
old_device,
weight,
ext_db_ratio,
ext_db_size,
) = args.split(",")
old_device = None if old_device == "None" else old_device
weight = None if weight == "None" else weight
ext_db_ratio = None if ext_db_ratio == "None" else ext_db_ratio
ext_db_size = None if ext_db_size == "None" else ext_db_size
if node == this_node.name: if node == this_node.name:
# Lock the command queue # Lock the command queue
zk_lock = zkhandler.writelock("base.cmd.ceph") zk_lock = zkhandler.writelock("base.cmd.ceph")
@ -1319,10 +1216,11 @@ def ceph_command(zkhandler, logger, this_node, data, d_osd):
logger, logger,
node, node,
osd_id, osd_id,
old_device,
new_device, new_device,
old_device,
weight, weight,
ext_db_flag, ext_db_ratio,
ext_db_size,
) )
# Command succeeded # Command succeeded
if result: if result:
@ -1373,7 +1271,7 @@ def ceph_command(zkhandler, logger, this_node, data, d_osd):
with zk_lock: with zk_lock:
# Remove the OSD # Remove the OSD
result = CephOSDInstance.remove_osd( result = CephOSDInstance.remove_osd(
zkhandler, logger, osd_id, d_osd[osd_id], force_flag zkhandler, logger, osd_id, force_flag
) )
# Command succeeded # Command succeeded
if result: if result: