Add support for split OSD adds

Allows creating multiple OSDs on a single (NVMe) block device by
leveraging the "ceph-volume lvm batch" command. This replaces the previous
method of creating OSDs for both split and unsplit devices.

Also adds a new Zookeeper (ZK) key, "is_split", for each OSD, indicating whether the OSD is split.
This commit is contained in:
Joshua Boniface 2023-11-01 21:17:38 -04:00
parent aa0b1f504f
commit 526a5f4a74
8 changed files with 281 additions and 197 deletions

View File

@ -4284,12 +4284,18 @@ class API_Storage_Ceph_OSD_Root(Resource):
{ {
"name": "ext_db", "name": "ext_db",
"required": False, "required": False,
"helptext": "Whether to use an external OSD DB LV device.",
}, },
{ {
"name": "ext_db_ratio", "name": "ext_db_ratio",
"required": False, "required": False,
"helptext": "Decimal size ratio of the external OSD DB LV device.", },
{
"name": "split",
"required": False,
},
{
"name": "count",
"required": False,
}, },
] ]
) )
@ -4327,6 +4333,16 @@ class API_Storage_Ceph_OSD_Root(Resource):
type: float type: float
required: false required: false
description: Decimal ratio of total OSD size for the external OSD DB LV device, default 0.05 (5%) description: Decimal ratio of total OSD size for the external OSD DB LV device, default 0.05 (5%)
- in: query
name: split
type: boolean
required: false
description: Whether to split the block device into multiple OSDs (recommended for NVMe devices)
- in: query
name: count
type: integer
required: false
description: If {split}, how many OSDs to create on the block device; usually 2 or 4 depending on size
responses: responses:
200: 200:
description: OK description: OK
@ -4345,6 +4361,8 @@ class API_Storage_Ceph_OSD_Root(Resource):
reqargs.get("weight", None), reqargs.get("weight", None),
reqargs.get("ext_db", False), reqargs.get("ext_db", False),
float(reqargs.get("ext_db_ratio", 0.05)), float(reqargs.get("ext_db_ratio", 0.05)),
reqargs.get("split", False),
reqargs.get("count", 1),
) )

View File

@ -1366,12 +1366,28 @@ def ceph_osd_db_vg_add(zkhandler, node, device):
@ZKConnection(config) @ZKConnection(config)
def ceph_osd_add(zkhandler, node, device, weight, ext_db_flag=False, ext_db_ratio=0.05): def ceph_osd_add(
zkhandler,
node,
device,
weight,
ext_db_flag=False,
ext_db_ratio=0.05,
split_flag=False,
split_count=1,
):
""" """
Add a Ceph OSD to the PVC Ceph storage cluster. Add a Ceph OSD to the PVC Ceph storage cluster.
""" """
retflag, retdata = pvc_ceph.add_osd( retflag, retdata = pvc_ceph.add_osd(
zkhandler, node, device, weight, ext_db_flag, ext_db_ratio zkhandler,
node,
device,
weight,
ext_db_flag,
ext_db_ratio,
split_flag,
split_count,
) )
if retflag: if retflag:

View File

@ -3411,8 +3411,17 @@ def cli_storage_osd_create_db_vg(node, device):
type=float, type=float,
help="Decimal ratio of the external database logical volume to the OSD size.", help="Decimal ratio of the external database logical volume to the OSD size.",
) )
@confirm_opt("Destroy all data on and create new OSD on node {node} device {device}") @click.option(
def cli_storage_osd_add(node, device, weight, ext_db_flag, ext_db_ratio): "-s",
"--split",
"split_count",
default=None,
show_default=False,
type=int,
help="Split an NVMe disk into this many OSDs",
)
@confirm_opt("Destroy all data on and create new OSD(s) on node {node} device {device}")
def cli_storage_osd_add(node, device, weight, ext_db_flag, ext_db_ratio, split_count):
""" """
Add a new Ceph OSD on node NODE with block device DEVICE. DEVICE must be a valid block device path (e.g. '/dev/sda', '/dev/nvme0n1', '/dev/disk/by-path/...', '/dev/disk/by-id/...') or a "detect" string. Using partitions is not supported. Add a new Ceph OSD on node NODE with block device DEVICE. DEVICE must be a valid block device path (e.g. '/dev/sda', '/dev/nvme0n1', '/dev/disk/by-path/...', '/dev/disk/by-id/...') or a "detect" string. Using partitions is not supported.
@ -3423,10 +3432,22 @@ def cli_storage_osd_add(node, device, weight, ext_db_flag, ext_db_ratio):
If '--ext-db' is specified, the OSD database and WAL will be placed on a new logical volume in NODE's OSD database volume group. An OSD database volume group must exist on the node or the OSD creation will fail. See the 'pvc storage osd create-db-vg' command for more details. If '--ext-db' is specified, the OSD database and WAL will be placed on a new logical volume in NODE's OSD database volume group. An OSD database volume group must exist on the node or the OSD creation will fail. See the 'pvc storage osd create-db-vg' command for more details.
The default '--ext-db-ratio' of 0.05 (5%) is sufficient for most RBD workloads and OSD sizes, though this can be adjusted based on the sizes of the OSD(s) and the underlying database device. Ceph documentation recommends at least 0.02 (2%) for RBD use-cases, and higher values may improve WAL performance under write-heavy workloads with fewer OSDs per node. The default '--ext-db-ratio' of 0.05 (5%) is sufficient for most RBD workloads and OSD sizes, though this can be adjusted based on the sizes of the OSD(s) and the underlying database device. Ceph documentation recommends at least 0.02 (2%) for RBD use-cases, and higher values may improve WAL performance under write-heavy workloads with fewer OSDs per node.
For NVMe devices, it is recommended to split block device into multiple OSDs to provide better processing throughput. To do this, specify "-s"/"--split" and the number of OSDs to create on the block device. For most NVMe devices, the recommended value is 2 or 4, such that each OSD is at least 500GB. Numbers higher than 4 are not recommended. This is NOT RECOMMENDED for SATA SSDs.
""" """
if split_count is not None:
split_flag = True
retcode, retmsg = pvc.lib.storage.ceph_osd_add( retcode, retmsg = pvc.lib.storage.ceph_osd_add(
CLI_CONFIG, node, device, weight, ext_db_flag, ext_db_ratio CLI_CONFIG,
node,
device,
weight,
ext_db_flag,
ext_db_ratio,
split_flag,
split_count,
) )
finish(retcode, retmsg) finish(retcode, retmsg)

View File

@ -231,12 +231,14 @@ def ceph_osd_list(config, limit):
return False, response.json().get("message", "") return False, response.json().get("message", "")
def ceph_osd_add(config, node, device, weight, ext_db_flag, ext_db_ratio): def ceph_osd_add(
config, node, device, weight, ext_db_flag, ext_db_ratio, split_flag, split_count
):
""" """
Add new Ceph OSD Add new Ceph OSD
API endpoint: POST /api/v1/storage/ceph/osd API endpoint: POST /api/v1/storage/ceph/osd
API arguments: node={node}, device={device}, weight={weight}, ext_db={ext_db_flag}, ext_db_ratio={ext_db_ratio} API arguments: node={node}, device={device}, weight={weight}, ext_db={ext_db_flag}, ext_db_ratio={ext_db_ratio}, split={split_flag}, count={split_count}
API schema: {"message":"{data}"} API schema: {"message":"{data}"}
""" """
params = { params = {
@ -245,6 +247,8 @@ def ceph_osd_add(config, node, device, weight, ext_db_flag, ext_db_ratio):
"weight": weight, "weight": weight,
"ext_db": ext_db_flag, "ext_db": ext_db_flag,
"ext_db_ratio": ext_db_ratio, "ext_db_ratio": ext_db_ratio,
"split": split_flag,
"count": split_count,
} }
response = call_api(config, "post", "/storage/ceph/osd", params=params) response = call_api(config, "post", "/storage/ceph/osd", params=params)

View File

@ -211,6 +211,7 @@ def getOSDInformation(zkhandler, osd_id):
# Get the devices # Get the devices
osd_node = zkhandler.read(("osd.node", osd_id)) osd_node = zkhandler.read(("osd.node", osd_id))
osd_device = zkhandler.read(("osd.device", osd_id)) osd_device = zkhandler.read(("osd.device", osd_id))
osd_is_split = zkhandler.read(("osd.is_split", osd_id))
osd_db_device = zkhandler.read(("osd.db_device", osd_id)) osd_db_device = zkhandler.read(("osd.db_device", osd_id))
# Parse the stats data # Parse the stats data
osd_stats_raw = zkhandler.read(("osd.stats", osd_id)) osd_stats_raw = zkhandler.read(("osd.stats", osd_id))
@ -220,6 +221,7 @@ def getOSDInformation(zkhandler, osd_id):
"id": osd_id, "id": osd_id,
"node": osd_node, "node": osd_node,
"device": osd_device, "device": osd_device,
"is_split": osd_is_split,
"db_device": osd_db_device, "db_device": osd_db_device,
"stats": osd_stats, "stats": osd_stats,
} }
@ -266,7 +268,16 @@ def add_osd_db_vg(zkhandler, node, device):
# OSD actions use the /cmd/ceph pipe # OSD actions use the /cmd/ceph pipe
# These actions must occur on the specific node they reference # These actions must occur on the specific node they reference
def add_osd(zkhandler, node, device, weight, ext_db_flag=False, ext_db_ratio=0.05): def add_osd(
zkhandler,
node,
device,
weight,
ext_db_flag=False,
ext_db_ratio=0.05,
split_flag=False,
split_count=1,
):
# Verify the target node exists # Verify the target node exists
if not common.verifyNode(zkhandler, node): if not common.verifyNode(zkhandler, node):
return False, 'ERROR: No node named "{}" is present in the cluster.'.format( return False, 'ERROR: No node named "{}" is present in the cluster.'.format(
@ -284,8 +295,8 @@ def add_osd(zkhandler, node, device, weight, ext_db_flag=False, ext_db_ratio=0.0
) )
# Tell the cluster to create a new OSD for the host # Tell the cluster to create a new OSD for the host
add_osd_string = "osd_add {},{},{},{},{}".format( add_osd_string = "osd_add {},{},{},{},{},{},{}".format(
node, device, weight, ext_db_flag, ext_db_ratio node, device, weight, ext_db_flag, ext_db_ratio, split_flag, split_count
) )
zkhandler.write([("base.cmd.ceph", add_osd_string)]) zkhandler.write([("base.cmd.ceph", add_osd_string)])
# Wait 1/2 second for the cluster to get the message and start working # Wait 1/2 second for the cluster to get the message and start working
@ -295,14 +306,10 @@ def add_osd(zkhandler, node, device, weight, ext_db_flag=False, ext_db_ratio=0.0
try: try:
result = zkhandler.read("base.cmd.ceph").split()[0] result = zkhandler.read("base.cmd.ceph").split()[0]
if result == "success-osd_add": if result == "success-osd_add":
message = 'Created new OSD with block device "{}" on node "{}".'.format( message = f'Created {split_count} new OSD(s) on node "{node}" block device "{device}"'
device, node
)
success = True success = True
else: else:
message = ( message = "ERROR: Failed to create OSD(s); check node logs for details."
"ERROR: Failed to create new OSD; check node logs for details."
)
success = False success = False
except Exception: except Exception:
message = "ERROR: Command ignored by node." message = "ERROR: Command ignored by node."

View File

@ -0,0 +1 @@
{"version": "10", "root": "", "base": {"root": "", "schema": "/schema", "schema.version": "/schema/version", "config": "/config", "config.maintenance": "/config/maintenance", "config.primary_node": "/config/primary_node", "config.primary_node.sync_lock": "/config/primary_node/sync_lock", "config.upstream_ip": "/config/upstream_ip", "config.migration_target_selector": "/config/migration_target_selector", "cmd": "/cmd", "cmd.node": "/cmd/nodes", "cmd.domain": "/cmd/domains", "cmd.ceph": "/cmd/ceph", "logs": "/logs", "node": "/nodes", "domain": "/domains", "network": "/networks", "storage": "/ceph", "storage.health": "/ceph/health", "storage.util": "/ceph/util", "osd": "/ceph/osds", "pool": "/ceph/pools", "volume": "/ceph/volumes", "snapshot": "/ceph/snapshots"}, "logs": {"node": "", "messages": "/messages"}, "node": {"name": "", "keepalive": "/keepalive", "mode": "/daemonmode", "data.active_schema": "/activeschema", "data.latest_schema": "/latestschema", "data.static": "/staticdata", "data.pvc_version": "/pvcversion", "running_domains": "/runningdomains", "count.provisioned_domains": "/domainscount", "count.networks": "/networkscount", "state.daemon": "/daemonstate", "state.router": "/routerstate", "state.domain": "/domainstate", "cpu.load": "/cpuload", "vcpu.allocated": "/vcpualloc", "memory.total": "/memtotal", "memory.used": "/memused", "memory.free": "/memfree", "memory.allocated": "/memalloc", "memory.provisioned": "/memprov", "ipmi.hostname": "/ipmihostname", "ipmi.username": "/ipmiusername", "ipmi.password": "/ipmipassword", "sriov": "/sriov", "sriov.pf": "/sriov/pf", "sriov.vf": "/sriov/vf", "monitoring.plugins": "/monitoring_plugins", "monitoring.data": "/monitoring_data", "monitoring.health": "/monitoring_health"}, "monitoring_plugin": {"name": "", "last_run": "/last_run", "health_delta": "/health_delta", "message": "/message", "data": "/data", "runtime": "/runtime"}, "sriov_pf": {"phy": "", "mtu": "/mtu", "vfcount": "/vfcount"}, "sriov_vf": {"phy": "", 
"pf": "/pf", "mtu": "/mtu", "mac": "/mac", "phy_mac": "/phy_mac", "config": "/config", "config.vlan_id": "/config/vlan_id", "config.vlan_qos": "/config/vlan_qos", "config.tx_rate_min": "/config/tx_rate_min", "config.tx_rate_max": "/config/tx_rate_max", "config.spoof_check": "/config/spoof_check", "config.link_state": "/config/link_state", "config.trust": "/config/trust", "config.query_rss": "/config/query_rss", "pci": "/pci", "pci.domain": "/pci/domain", "pci.bus": "/pci/bus", "pci.slot": "/pci/slot", "pci.function": "/pci/function", "used": "/used", "used_by": "/used_by"}, "domain": {"name": "", "xml": "/xml", "state": "/state", "profile": "/profile", "stats": "/stats", "node": "/node", "last_node": "/lastnode", "failed_reason": "/failedreason", "storage.volumes": "/rbdlist", "console.log": "/consolelog", "console.vnc": "/vnc", "meta.autostart": "/node_autostart", "meta.migrate_method": "/migration_method", "meta.node_selector": "/node_selector", "meta.node_limit": "/node_limit", "meta.tags": "/tags", "migrate.sync_lock": "/migrate_sync_lock"}, "tag": {"name": "", "type": "/type", "protected": "/protected"}, "network": {"vni": "", "type": "/nettype", "mtu": "/mtu", "rule": "/firewall_rules", "rule.in": "/firewall_rules/in", "rule.out": "/firewall_rules/out", "nameservers": "/name_servers", "domain": "/domain", "reservation": "/dhcp4_reservations", "lease": "/dhcp4_leases", "ip4.gateway": "/ip4_gateway", "ip4.network": "/ip4_network", "ip4.dhcp": "/dhcp4_flag", "ip4.dhcp_start": "/dhcp4_start", "ip4.dhcp_end": "/dhcp4_end", "ip6.gateway": "/ip6_gateway", "ip6.network": "/ip6_network", "ip6.dhcp": "/dhcp6_flag"}, "reservation": {"mac": "", "ip": "/ipaddr", "hostname": "/hostname"}, "lease": {"mac": "", "ip": "/ipaddr", "hostname": "/hostname", "expiry": "/expiry", "client_id": "/clientid"}, "rule": {"description": "", "rule": "/rule", "order": "/order"}, "osd": {"id": "", "node": "/node", "device": "/device", "db_device": "/db_device", "fsid": "/fsid", "ofsid": 
"/fsid/osd", "cfsid": "/fsid/cluster", "lvm": "/lvm", "vg": "/lvm/vg", "lv": "/lvm/lv", "is_split": "/is_split", "stats": "/stats"}, "pool": {"name": "", "pgs": "/pgs", "tier": "/tier", "stats": "/stats"}, "volume": {"name": "", "stats": "/stats"}, "snapshot": {"name": "", "stats": "/stats"}}

View File

@ -540,7 +540,7 @@ class ZKHandler(object):
# #
class ZKSchema(object): class ZKSchema(object):
# Current version # Current version
_version = 9 _version = 10
# Root for doing nested keys # Root for doing nested keys
_schema_root = "" _schema_root = ""
@ -719,6 +719,7 @@ class ZKSchema(object):
"lvm": "/lvm", "lvm": "/lvm",
"vg": "/lvm/vg", "vg": "/lvm/vg",
"lv": "/lvm/lv", "lv": "/lvm/lv",
"is_split": "/is_split",
"stats": "/stats", "stats": "/stats",
}, },
# The schema of an individual pool entry (/ceph/pools/{pool_name}) # The schema of an individual pool entry (/ceph/pools/{pool_name})
@ -963,7 +964,9 @@ class ZKSchema(object):
kpath = f"{elem}.{ikey}" kpath = f"{elem}.{ikey}"
# Validate that the key exists for that child # Validate that the key exists for that child
if not zkhandler.zk_conn.exists(self.path(kpath, child)): if not zkhandler.zk_conn.exists(self.path(kpath, child)):
if elem == "pool" and ikey == "tier": if elem == "osd" and ikey == "is_split":
default_data = "False"
elif elem == "pool" and ikey == "tier":
default_data = "default" default_data = "default"
else: else:
default_data = "" default_data = ""

View File

@ -26,6 +26,7 @@ import daemon_lib.common as common
from distutils.util import strtobool from distutils.util import strtobool
from re import search, match, sub from re import search, match, sub
from json import loads as jloads
def get_detect_device(detect_string): def get_detect_device(detect_string):
@ -260,7 +261,15 @@ class CephOSDInstance(object):
@staticmethod @staticmethod
def add_osd( def add_osd(
zkhandler, logger, node, device, weight, ext_db_flag=False, ext_db_ratio=0.05 zkhandler,
logger,
node,
device,
weight,
ext_db_flag=False,
ext_db_ratio=0.05,
split_device=False,
split_count=1,
): ):
# Handle a detect device if that is passed # Handle a detect device if that is passed
if match(r"detect:", device): if match(r"detect:", device):
@ -278,177 +287,185 @@ class CephOSDInstance(object):
) )
device = ddevice device = ddevice
# We are ready to create a new OSD on this node if split_device and split_count > 1:
logger.out("Creating new OSD disk on block device {}".format(device), state="i") split_flag = f"--osds-per-device {split_count}"
logger.out(
f"Creating {split_count} new OSD disks on block device {device}",
state="i",
)
else:
split_flag = ""
logger.out(f"Creating 1 new OSD disk on block device {device}", state="i")
if "nvme" in device:
class_flag = "--crush-device-class nvme"
else:
class_flag = "--crush-device-class ssd"
try: try:
# 1. Create an OSD; we do this so we know what ID will be gen'd # 1. Zap the block device
retcode, stdout, stderr = common.run_os_command("ceph osd create") logger.out(f"Zapping disk {device}", state="i")
if retcode:
print("ceph osd create")
print(stdout)
print(stderr)
raise Exception
osd_id = stdout.rstrip()
# 2. Remove that newly-created OSD
retcode, stdout, stderr = common.run_os_command( retcode, stdout, stderr = common.run_os_command(
"ceph osd rm {}".format(osd_id) f"ceph-volume lvm zap --destroy {device}"
) )
if retcode: if retcode:
print("ceph osd rm") logger.out("Failed: ceph-volume lvm zap", state="e")
print(stdout) logger.out(stdout, state="d")
print(stderr) logger.out(stderr, state="d")
raise Exception raise Exception
# 3a. Zap the disk to ensure it is ready to go # 2. Prepare the OSD(s)
logger.out("Zapping disk {}".format(device), state="i") logger.out(f"Preparing OSD(s) on disk {device}", state="i")
retcode, stdout, stderr = common.run_os_command( retcode, stdout, stderr = common.run_os_command(
"ceph-volume lvm zap --destroy {}".format(device) f"ceph-volume lvm batch --yes --prepare --bluestore {split_flag} {class_flag} {device}"
) )
if retcode: if retcode:
print("ceph-volume lvm zap") logger.out("Failed: ceph-volume lvm batch", state="e")
print(stdout) logger.out(stdout, state="d")
print(stderr) logger.out(stderr, state="d")
raise Exception raise Exception
dev_flags = "--data {}".format(device) # 3. Get the list of created OSDs on the device (initial pass)
logger.out(f"Querying OSD(s) on disk {device}", state="i")
retcode, stdout, stderr = common.run_os_command(
f"ceph-volume lvm list --format json {device}"
)
if retcode:
logger.out("Failed: ceph-volume lvm list", state="e")
logger.out(stdout, state="d")
logger.out(stderr, state="d")
raise Exception
# 3b. Prepare the logical volume if ext_db_flag created_osds = jloads(stdout)
# 4. Prepare the WAL and DB devices
if ext_db_flag: if ext_db_flag:
_, osd_size_bytes, _ = common.run_os_command( for created_osd in created_osds:
"blockdev --getsize64 {}".format(device) # 4a. Get the OSD FSID and ID from the details
osd_details = created_osds[created_osd][0]
osd_fsid = osd_details["tags"]["ceph.osd_fsid"]
osd_id = osd_details["tags"]["ceph.osd_id"]
osd_lv = osd_details["lv_path"]
logger.out(
f"Creating Bluestore DB volume for OSD {osd_id}", state="i"
)
# 4b. Prepare the logical volume if ext_db_flag
_, osd_size_bytes, _ = common.run_os_command(
f"blockdev --getsize64 {osd_lv}"
)
osd_size_bytes = int(osd_size_bytes)
result = CephOSDInstance.create_osd_db_lv(
zkhandler, logger, osd_id, ext_db_ratio, osd_size_bytes
)
if not result:
raise Exception
db_device = "osd-db/osd-{}".format(osd_id)
# 4c. Attach the new DB device to the OSD
retcode, stdout, stderr = common.run_os_command(
f"ceph-volume lvm new-db --osd-id {osd_id} --osd-fsid {osd_fsid} --target {db_device}"
)
if retcode:
logger.out("Failed: ceph-volume lvm new-db", state="e")
logger.out(stdout, state="d")
logger.out(stderr, state="d")
raise Exception
# 4d. Get the list of created OSDs on the device (final pass)
logger.out(f"(Requerying OSD(s) on disk {device}", state="i")
retcode, stdout, stderr = common.run_os_command(
f"ceph-volume lvm list --format json {device}"
) )
osd_size_bytes = int(osd_size_bytes) if retcode:
result = CephOSDInstance.create_osd_db_lv( logger.out("Failed: ceph-volume lvm list", state="e")
zkhandler, logger, osd_id, ext_db_ratio, osd_size_bytes logger.out(stdout, state="d")
) logger.out(stderr, state="d")
if not result:
raise Exception raise Exception
db_device = "osd-db/osd-{}".format(osd_id)
dev_flags += " --block.db {}".format(db_device)
else:
db_device = ""
# 3c. Create the OSD for real created_osds = jloads(stdout)
logger.out(
"Preparing LVM for new OSD disk with ID {} on {}".format( # 5. Activate the OSDs
osd_id, device logger.out(f"Activating OSD(s) on disk {device}", state="i")
), for created_osd in created_osds:
state="i", # 5a. Get the OSD FSID and ID from the details
) osd_details = created_osds[created_osd][0]
retcode, stdout, stderr = common.run_os_command( osd_clusterfsid = osd_details["tags"]["ceph.cluster_fsid"]
"ceph-volume lvm prepare --bluestore {devices}".format( osd_fsid = osd_details["tags"]["ceph.osd_fsid"]
devices=dev_flags osd_id = osd_details["tags"]["ceph.osd_id"]
db_device = osd_details["tags"].get("ceph.db_device", None)
osd_vg = osd_details["vg_name"]
osd_lv = osd_details["lv_name"]
# 5b. Activate the OSD
logger.out(f"Activating OSD {osd_id}", state="i")
retcode, stdout, stderr = common.run_os_command(
f"ceph-volume lvm activate --bluestore {osd_id} {osd_fsid}"
) )
) if retcode:
if retcode: logger.out("Failed: ceph-volume lvm activate", state="e")
print("ceph-volume lvm prepare") logger.out(stdout, state="d")
print(stdout) logger.out(stderr, state="d")
print(stderr) raise Exception
raise Exception
# 4a. Get OSD information # 5c. Add it to the crush map
logger.out( logger.out(f"Adding OSD {osd_id} to CRUSH map", state="i")
"Getting OSD information for ID {} on {}".format(osd_id, device), retcode, stdout, stderr = common.run_os_command(
state="i", f"ceph osd crush add osd.{osd_id} {weight} root=default host={node}"
)
retcode, stdout, stderr = common.run_os_command(
"ceph-volume lvm list {device}".format(device=device)
)
for line in stdout.split("\n"):
if "block device" in line:
osd_blockdev = line.split()[-1]
if "osd fsid" in line:
osd_fsid = line.split()[-1]
if "cluster fsid" in line:
osd_clusterfsid = line.split()[-1]
if "devices" in line:
osd_device = line.split()[-1]
if not osd_fsid:
print("ceph-volume lvm list")
print("Could not find OSD information in data:")
print(stdout)
print(stderr)
raise Exception
# Split OSD blockdev into VG and LV components
# osd_blockdev = /dev/ceph-<uuid>/osd-block-<uuid>
_, _, osd_vg, osd_lv = osd_blockdev.split("/")
# Reset whatever we were given to Ceph's /dev/xdX naming
if device != osd_device:
device = osd_device
# 4b. Activate the OSD
logger.out("Activating new OSD disk with ID {}".format(osd_id), state="i")
retcode, stdout, stderr = common.run_os_command(
"ceph-volume lvm activate --bluestore {osdid} {osdfsid}".format(
osdid=osd_id, osdfsid=osd_fsid
) )
) if retcode:
if retcode: logger.out("Failed: ceph osd crush add", state="e")
print("ceph-volume lvm activate") logger.out(stdout, state="d")
print(stdout) logger.out(stderr, state="d")
print(stderr) raise Exception
raise Exception
# 5. Add it to the crush map # 5d. Wait half a second for it to activate
logger.out( time.sleep(0.5)
"Adding new OSD disk with ID {} to CRUSH map".format(osd_id), state="i"
) # 5e. Verify it started
retcode, stdout, stderr = common.run_os_command( retcode, stdout, stderr = common.run_os_command(
"ceph osd crush add osd.{osdid} {weight} root=default host={node}".format( "systemctl status ceph-osd@{osdid}".format(osdid=osd_id)
osdid=osd_id, weight=weight, node=node
) )
) if retcode:
if retcode: logger.out(f"Failed: OSD {osd_id} unit is not active", state="e")
print("ceph osd crush add") logger.out(stdout, state="d")
print(stdout) logger.out(stderr, state="d")
print(stderr) raise Exception
raise Exception
time.sleep(0.5) # 5f. Add the new OSD to PVC
logger.out(f"Adding OSD {osd_id} to PVC", state="i")
zkhandler.write(
[
(("osd", osd_id), ""),
(("osd.node", osd_id), node),
(("osd.device", osd_id), device),
(("osd.db_device", osd_id), db_device),
(("osd.fsid", osd_id), osd_fsid),
(("osd.ofsid", osd_id), osd_fsid),
(("osd.cfsid", osd_id), osd_clusterfsid),
(("osd.lvm", osd_id), ""),
(("osd.vg", osd_id), osd_vg),
(("osd.lv", osd_id), osd_lv),
(("osd.is_split", osd_id), split_flag),
(
("osd.stats", osd_id),
'{"uuid": "|", "up": 0, "in": 0, "primary_affinity": "|", "utilization": "|", "var": "|", "pgs": "|", "kb": "|", "weight": "|", "reweight": "|", "node": "|", "used": "|", "avail": "|", "wr_ops": "|", "wr_data": "|", "rd_ops": "|", "rd_data": "|", "state": "|"}',
),
]
)
# 6. Verify it started # 6. Log it
retcode, stdout, stderr = common.run_os_command(
"systemctl status ceph-osd@{osdid}".format(osdid=osd_id)
)
if retcode:
print("systemctl status")
print(stdout)
print(stderr)
raise Exception
# 7. Add the new OSD to the list
logger.out( logger.out(
"Adding new OSD disk with ID {} to Zookeeper".format(osd_id), state="i" f"Successfully created {split_count} new OSD(s) {','.join(created_osds.keys())} on disk {device}",
state="o",
) )
zkhandler.write(
[
(("osd", osd_id), ""),
(("osd.node", osd_id), node),
(("osd.device", osd_id), device),
(("osd.db_device", osd_id), db_device),
(("osd.fsid", osd_id), ""),
(("osd.ofsid", osd_id), osd_fsid),
(("osd.cfsid", osd_id), osd_clusterfsid),
(("osd.lvm", osd_id), ""),
(("osd.vg", osd_id), osd_vg),
(("osd.lv", osd_id), osd_lv),
(
("osd.stats", osd_id),
'{"uuid": "|", "up": 0, "in": 0, "primary_affinity": "|", "utilization": "|", "var": "|", "pgs": "|", "kb": "|", "weight": "|", "reweight": "|", "node": "|", "used": "|", "avail": "|", "wr_ops": "|", "wr_data": "|", "rd_ops": "|", "rd_data": "|", "state": "|"}',
),
]
)
# Log it
logger.out("Created new OSD disk with ID {}".format(osd_id), state="o")
return True return True
except Exception as e: except Exception as e:
# Log it logger.out(
logger.out("Failed to create new OSD disk: {}".format(e), state="e") f"Failed to create {split_count} new OSD(s) on disk {device}: {e}",
state="e",
)
return False return False
@staticmethod @staticmethod
@ -828,7 +845,7 @@ class CephOSDInstance(object):
@staticmethod @staticmethod
def remove_osd(zkhandler, logger, osd_id, osd_obj, force_flag): def remove_osd(zkhandler, logger, osd_id, osd_obj, force_flag):
logger.out("Removing OSD disk {}".format(osd_id), state="i") logger.out("Removing OSD {}".format(osd_id), state="i")
try: try:
# Verify the OSD is present # Verify the OSD is present
retcode, stdout, stderr = common.run_os_command("ceph osd ls") retcode, stdout, stderr = common.run_os_command("ceph osd ls")
@ -843,7 +860,7 @@ class CephOSDInstance(object):
return True return True
# 1. Set the OSD down and out so it will flush # 1. Set the OSD down and out so it will flush
logger.out("Setting down OSD disk with ID {}".format(osd_id), state="i") logger.out("Setting down OSD {}".format(osd_id), state="i")
retcode, stdout, stderr = common.run_os_command( retcode, stdout, stderr = common.run_os_command(
"ceph osd down {}".format(osd_id) "ceph osd down {}".format(osd_id)
) )
@ -856,7 +873,7 @@ class CephOSDInstance(object):
else: else:
raise Exception raise Exception
logger.out("Setting out OSD disk with ID {}".format(osd_id), state="i") logger.out("Setting out OSD {}".format(osd_id), state="i")
retcode, stdout, stderr = common.run_os_command( retcode, stdout, stderr = common.run_os_command(
"ceph osd out {}".format(osd_id) "ceph osd out {}".format(osd_id)
) )
@ -881,7 +898,7 @@ class CephOSDInstance(object):
time.sleep(5) time.sleep(5)
# 3. Stop the OSD process and wait for it to be terminated # 3. Stop the OSD process and wait for it to be terminated
logger.out("Stopping OSD disk with ID {}".format(osd_id), state="i") logger.out("Stopping OSD {}".format(osd_id), state="i")
retcode, stdout, stderr = common.run_os_command( retcode, stdout, stderr = common.run_os_command(
"systemctl stop ceph-osd@{}".format(osd_id) "systemctl stop ceph-osd@{}".format(osd_id)
) )
@ -922,25 +939,8 @@ class CephOSDInstance(object):
else: else:
raise Exception raise Exception
# 5. Zap the volumes # 5. Purge the OSD from Ceph
logger.out( logger.out("Purging OSD {}".format(osd_id), state="i")
"Zapping OSD {} disk on {}".format(osd_id, osd_device),
state="i",
)
retcode, stdout, stderr = common.run_os_command(
"ceph-volume lvm zap --destroy {}".format(osd_device)
)
if retcode:
print("ceph-volume lvm zap")
print(stdout)
print(stderr)
if force_flag:
logger.out("Ignoring error due to force flag", state="i")
else:
raise Exception
# 6. Purge the OSD from Ceph
logger.out("Purging OSD disk with ID {}".format(osd_id), state="i")
retcode, stdout, stderr = common.run_os_command( retcode, stdout, stderr = common.run_os_command(
"ceph osd purge {} --yes-i-really-mean-it".format(osd_id) "ceph osd purge {} --yes-i-really-mean-it".format(osd_id)
) )
@ -964,19 +964,15 @@ class CephOSDInstance(object):
) )
# 8. Delete OSD from ZK # 8. Delete OSD from ZK
logger.out( logger.out("Deleting OSD {} from Zookeeper".format(osd_id), state="i")
"Deleting OSD disk with ID {} from Zookeeper".format(osd_id), state="i"
)
zkhandler.delete(("osd", osd_id), recursive=True) zkhandler.delete(("osd", osd_id), recursive=True)
# Log it # Log it
logger.out("Removed OSD disk with ID {}".format(osd_id), state="o") logger.out("Successfully removed OSD {}".format(osd_id), state="o")
return True return True
except Exception as e: except Exception as e:
# Log it # Log it
logger.out( logger.out("Failed to remove OSD {}: {}".format(osd_id, e), state="e")
"Failed to purge OSD disk with ID {}: {}".format(osd_id, e), state="e"
)
return False return False
@staticmethod @staticmethod
@ -1245,16 +1241,34 @@ def ceph_command(zkhandler, logger, this_node, data, d_osd):
# Adding a new OSD # Adding a new OSD
if command == "osd_add": if command == "osd_add":
node, device, weight, ext_db_flag, ext_db_ratio = args.split(",") (
node,
device,
weight,
ext_db_flag,
ext_db_ratio,
split_flag,
split_count,
) = args.split(",")
ext_db_flag = bool(strtobool(ext_db_flag)) ext_db_flag = bool(strtobool(ext_db_flag))
ext_db_ratio = float(ext_db_ratio) ext_db_ratio = float(ext_db_ratio)
split_flag = bool(strtobool(split_flag))
split_count = int(split_count)
if node == this_node.name: if node == this_node.name:
# Lock the command queue # Lock the command queue
zk_lock = zkhandler.writelock("base.cmd.ceph") zk_lock = zkhandler.writelock("base.cmd.ceph")
with zk_lock: with zk_lock:
# Add the OSD # Add the OSD
result = CephOSDInstance.add_osd( result = CephOSDInstance.add_osd(
zkhandler, logger, node, device, weight, ext_db_flag, ext_db_ratio zkhandler,
logger,
node,
device,
weight,
ext_db_flag,
ext_db_ratio,
split_flag,
split_count,
) )
# Command succeeded # Command succeeded
if result: if result: