Fix numerous formatting and function bugs

This commit is contained in:
Joshua Boniface 2023-11-03 14:00:05 -04:00
parent 94d8d2cf75
commit ed5bc9fb43
1 changed file with 296 additions and 279 deletions

View File

@ -260,6 +260,20 @@ class CephOSDInstance(object):
self.vg = osd_vg self.vg = osd_vg
self.lv = osd_lv self.lv = osd_lv
@staticmethod
def find_osds_from_block(logger, device):
    """Query ``ceph-volume`` for the OSD(s) present on a block device.

    Runs ``ceph-volume lvm list --format json <device>`` and returns the
    parsed JSON output.

    Args:
        logger: Logger object exposing ``out(message, state=...)``.
        device: Block device path to query.

    Returns:
        The parsed JSON document on success. Callers in this file test
        ``osd_id in found_osds.keys()``, so the result is presumably a
        mapping keyed by OSD ID (verify against ceph-volume's output).
        An empty dict is returned when the command exits non-zero or its
        output cannot be parsed.
    """
    # Try to query the passed block device directly
    logger.out(f"Querying for OSD(s) on disk {device}", state="i")
    retcode, stdout, stderr = common.run_os_command(
        f"ceph-volume lvm list --format json {device}"
    )
    if retcode:
        # Return an empty mapping rather than an empty list: callers invoke
        # `.keys()` on the result without checking it first, which would
        # raise AttributeError on a list. An empty dict is still falsy, so
        # `if found_osds and ...` style checks keep working.
        return {}

    try:
        return jloads(stdout)
    except ValueError as e:
        # A zero exit code with unparseable output is treated the same as
        # "nothing found" so that callers can continue with their fallbacks.
        logger.out(
            f"Could not parse ceph-volume output for disk {device}: {e}",
            state="w",
        )
        return {}
@staticmethod @staticmethod
def add_osd( def add_osd(
zkhandler, zkhandler,
@ -316,7 +330,6 @@ class CephOSDInstance(object):
else: else:
class_flag = "--crush-device-class ssd" class_flag = "--crush-device-class ssd"
try:
# 1. Zap the block device # 1. Zap the block device
logger.out(f"Zapping disk {device}", state="i") logger.out(f"Zapping disk {device}", state="i")
retcode, stdout, stderr = common.run_os_command( retcode, stdout, stderr = common.run_os_command(
@ -338,6 +351,9 @@ class CephOSDInstance(object):
logger.out(stdout, state="d") logger.out(stdout, state="d")
logger.out(stderr, state="d") logger.out(stderr, state="d")
raise Exception raise Exception
logger.out(
f"Successfully prepared {split_count} OSDs on disk {device}", state="o"
)
# 3. Get the list of created OSDs on the device (initial pass) # 3. Get the list of created OSDs on the device (initial pass)
logger.out(f"Querying OSD(s) on disk {device}", state="i") logger.out(f"Querying OSD(s) on disk {device}", state="i")
@ -360,10 +376,7 @@ class CephOSDInstance(object):
osd_fsid = osd_details["tags"]["ceph.osd_fsid"] osd_fsid = osd_details["tags"]["ceph.osd_fsid"]
osd_id = osd_details["tags"]["ceph.osd_id"] osd_id = osd_details["tags"]["ceph.osd_id"]
osd_lv = osd_details["lv_path"] osd_lv = osd_details["lv_path"]
logger.out(f"Creating Bluestore DB volume for OSD {osd_id}", state="i")
logger.out(
f"Creating Bluestore DB volume for OSD {osd_id}", state="i"
)
# 4b. Prepare the logical volume if ext_db_flag # 4b. Prepare the logical volume if ext_db_flag
if ext_db_ratio is not None: if ext_db_ratio is not None:
@ -383,6 +396,7 @@ class CephOSDInstance(object):
db_device = f"osd-db/osd-{osd_id}" db_device = f"osd-db/osd-{osd_id}"
# 4c. Attach the new DB device to the OSD # 4c. Attach the new DB device to the OSD
logger.out(f"Attaching Bluestore DB volume to OSD {osd_id}", state="i")
retcode, stdout, stderr = common.run_os_command( retcode, stdout, stderr = common.run_os_command(
f"ceph-volume lvm new-db --osd-id {osd_id} --osd-fsid {osd_fsid} --target {db_device}" f"ceph-volume lvm new-db --osd-id {osd_id} --osd-fsid {osd_fsid} --target {db_device}"
) )
@ -480,12 +494,6 @@ class CephOSDInstance(object):
state="o", state="o",
) )
return True return True
except Exception as e:
logger.out(
f"Failed to create {split_count} new OSD(s) on disk {device}: {e}",
state="e",
)
return False
@staticmethod @staticmethod
def replace_osd( def replace_osd(
@ -516,25 +524,12 @@ class CephOSDInstance(object):
new_device = ddevice new_device = ddevice
# Phase 1: Try to determine what we can about the old device # Phase 1: Try to determine what we can about the old device
def find_osds_from_block(device):
# Try to query the passed block device directly
logger.out(f"Querying for OSD(s) on disk {device}", state="i")
retcode, stdout, stderr = common.run_os_command(
f"ceph-volume lvm list --format json {device}"
)
if retcode:
found_osds = []
else:
found_osds = jloads(stdout)
return found_osds
real_old_device = None real_old_device = None
osd_block = zkhandler.read(("osd.device", osd_id)) osd_block = zkhandler.read(("osd.device", osd_id))
# Determine information from a passed old_device # Determine information from a passed old_device
if old_device is not None: if old_device is not None:
found_osds = find_osds_from_block(old_device) found_osds = CephOSDInstance.find_osds_from_block(logger, old_device)
if found_osds and osd_id in found_osds.keys(): if found_osds and osd_id in found_osds.keys():
real_old_device = old_device real_old_device = old_device
else: else:
@ -545,7 +540,7 @@ class CephOSDInstance(object):
# Try to get an old_device from our PVC information # Try to get an old_device from our PVC information
if real_old_device is None: if real_old_device is None:
found_osds = find_osds_from_block(osd_block) found_osds = CephOSDInstance.find_osds_from_block(logger, osd_block)
if osd_id in found_osds.keys(): if osd_id in found_osds.keys():
real_old_device = osd_block real_old_device = osd_block
@ -557,9 +552,6 @@ class CephOSDInstance(object):
) )
else: else:
skip_zap = False skip_zap = False
logger.out(
f"Found source OSD(s) on block device {real_old_device}", state="i"
)
# Try to determine if any other OSDs shared a block device with this OSD # Try to determine if any other OSDs shared a block device with this OSD
_, osd_list = get_list_osd(zkhandler, None) _, osd_list = get_list_osd(zkhandler, None)
@ -567,12 +559,6 @@ class CephOSDInstance(object):
o for o in osd_list if o["node"] == node and o["device"] == osd_block o for o in osd_list if o["node"] == node and o["device"] == osd_block
] ]
# Remove each OSD on the block device
for osd in all_osds_on_block:
result = CephOSDInstance.remove_osd(
zkhandler, logger, osd["id"], force_flag=True, skip_zap_flag=skip_zap
)
# Determine the weight of the OSD(s) # Determine the weight of the OSD(s)
if weight is None: if weight is None:
weight = all_osds_on_block[0]["stats"]["weight"] weight = all_osds_on_block[0]["stats"]["weight"]
@ -595,11 +581,28 @@ class CephOSDInstance(object):
f"blockdev --getsize64 {all_osds_on_block[0]['db_device']}" f"blockdev --getsize64 {all_osds_on_block[0]['db_device']}"
) )
osd_db_ratio = None osd_db_ratio = None
osd_db_size = f"{osd_db_size_bytes}B" osd_db_size = f"{osd_db_size_bytes}"
if not osd_db_size:
logger.out(
f"Could not get size of device {all_osds_on_block[0]['db_device']}; skipping external database creation",
state="w",
)
osd_db_size = None
else: else:
osd_db_ratio = None osd_db_ratio = None
osd_db_size = None osd_db_size = None
# Remove each OSD on the block device
for osd in all_osds_on_block:
result = CephOSDInstance.remove_osd(
zkhandler,
logger,
node,
osd["id"],
force_flag=True,
skip_zap_flag=skip_zap,
)
# Create [a] new OSD[s], on the new block device # Create [a] new OSD[s], on the new block device
result = CephOSDInstance.add_osd( result = CephOSDInstance.add_osd(
zkhandler, zkhandler,
@ -674,10 +677,9 @@ class CephOSDInstance(object):
osd_device = line.split()[-1] osd_device = line.split()[-1]
if not osd_fsid: if not osd_fsid:
print("ceph-volume lvm list") logger.out("Failed: ceph-volume lvm list", state="e")
print("Could not find OSD information in data:") logger.out(stdout, state="d")
print(stdout) logger.out(stderr, state="d")
print(stderr)
raise Exception raise Exception
# Split OSD blockdev into VG and LV components # Split OSD blockdev into VG and LV components
@ -696,9 +698,9 @@ class CephOSDInstance(object):
) )
) )
if retcode: if retcode:
print("ceph-volume lvm activate") logger.out("Failed: ceph-volume lvm activate", state="e")
print(stdout) logger.out(stdout, state="d")
print(stderr) logger.out(stderr, state="d")
raise Exception raise Exception
time.sleep(0.5) time.sleep(0.5)
@ -708,9 +710,9 @@ class CephOSDInstance(object):
"systemctl status ceph-osd@{osdid}".format(osdid=osd_id) "systemctl status ceph-osd@{osdid}".format(osdid=osd_id)
) )
if retcode: if retcode:
print("systemctl status") logger.out("Failed: systemctl status", state="e")
print(stdout) logger.out(stdout, state="d")
print(stderr) logger.out(stderr, state="d")
raise Exception raise Exception
# 5. Update Zookeeper information # 5. Update Zookeeper information
@ -745,43 +747,39 @@ class CephOSDInstance(object):
return False return False
@staticmethod @staticmethod
def remove_osd(zkhandler, logger, osd_id, force_flag=False, skip_zap_flag=False): def remove_osd(
logger.out("Removing OSD {}".format(osd_id), state="i") zkhandler, logger, node, osd_id, force_flag=False, skip_zap_flag=False
):
logger.out(f"Removing OSD {osd_id}", state="i")
try: try:
# Verify the OSD is present # Verify the OSD is present
retcode, stdout, stderr = common.run_os_command("ceph osd ls") retcode, stdout, stderr = common.run_os_command("ceph osd ls")
osd_list = stdout.split("\n") osd_list = stdout.split("\n")
if osd_id not in osd_list: if osd_id not in osd_list:
logger.out( logger.out(f"Could not find OSD {osd_id} in the cluster", state="e")
"Could not find OSD {} in the cluster".format(osd_id), state="e"
)
if force_flag: if force_flag:
logger.out("Ignoring error due to force flag", state="i") logger.out("Ignoring error due to force flag", state="i")
else: else:
return True return True
# 1. Set the OSD down and out so it will flush # 1. Set the OSD down and out so it will flush
logger.out("Setting down OSD {}".format(osd_id), state="i") logger.out(f"Setting down OSD {osd_id}", state="i")
retcode, stdout, stderr = common.run_os_command( retcode, stdout, stderr = common.run_os_command(f"ceph osd down {osd_id}")
"ceph osd down {}".format(osd_id)
)
if retcode: if retcode:
print("ceph osd down") logger.out("Failed: ceph osd down", state="e")
print(stdout) logger.out(stdout, state="d")
print(stderr) logger.out(stderr, state="d")
if force_flag: if force_flag:
logger.out("Ignoring error due to force flag", state="i") logger.out("Ignoring error due to force flag", state="i")
else: else:
raise Exception raise Exception
logger.out("Setting out OSD {}".format(osd_id), state="i") logger.out(f"Setting out OSD {osd_id}", state="i")
retcode, stdout, stderr = common.run_os_command( retcode, stdout, stderr = common.run_os_command(f"ceph osd out {osd_id}")
"ceph osd out {}".format(osd_id)
)
if retcode: if retcode:
print("ceph osd out") logger.out("Failed: ceph osd out", state="e")
print(stdout) logger.out(stdout, state="d")
print(stderr) logger.out(stderr, state="d")
if force_flag: if force_flag:
logger.out("Ignoring error due to force flag", state="i") logger.out("Ignoring error due to force flag", state="i")
else: else:
@ -800,47 +798,29 @@ class CephOSDInstance(object):
time.sleep(1) time.sleep(1)
# 3. Stop the OSD process and wait for it to be terminated # 3. Stop the OSD process and wait for it to be terminated
logger.out("Stopping OSD {}".format(osd_id), state="i") logger.out(f"Stopping OSD {osd_id}", state="i")
retcode, stdout, stderr = common.run_os_command( retcode, stdout, stderr = common.run_os_command(
"systemctl stop ceph-osd@{}".format(osd_id) f"systemctl stop ceph-osd@{osd_id}"
) )
if retcode: if retcode:
print("systemctl stop") logger.out("Failed: systemctl stop", state="e")
print(stdout) logger.out(stdout, state="d")
print(stderr) logger.out(stderr, state="d")
if force_flag: if force_flag:
logger.out("Ignoring error due to force flag", state="i") logger.out("Ignoring error due to force flag", state="i")
else: else:
raise Exception raise Exception
time.sleep(5) time.sleep(5)
if not skip_zap_flag: # 4. Delete OSD from ZK
# 4. Determine the block devices data_device = zkhandler.read(("osd.device", osd_id))
osd_vg = zkhandler.read(("osd.vg", osd_id)) if zkhandler.exists(("osd.db_device", osd_id)):
osd_lv = zkhandler.read(("osd.lv", osd_id)) db_device = zkhandler.read(("osd.db_device", osd_id))
osd_lvm = f"/dev/{osd_vg}/{osd_lv}"
osd_device = None
logger.out(
f"Getting disk info for OSD {osd_id} LV {osd_lvm}",
state="i",
)
retcode, stdout, stderr = common.run_os_command(
f"ceph-volume lvm list {osd_lvm}"
)
for line in stdout.split("\n"):
if "devices" in line:
osd_device = line.split()[-1]
if not osd_device:
print("ceph-volume lvm list")
print("Could not find OSD information in data:")
print(stdout)
print(stderr)
if force_flag:
logger.out("Ignoring error due to force flag", state="i")
else: else:
raise Exception db_device = None
logger.out(f"Deleting OSD {osd_id} from PVC", state="i")
zkhandler.delete(("osd", osd_id), recursive=True)
# 5. Purge the OSD from Ceph # 5. Purge the OSD from Ceph
logger.out(f"Purging OSD {osd_id}", state="i") logger.out(f"Purging OSD {osd_id}", state="i")
@ -848,51 +828,88 @@ class CephOSDInstance(object):
force_arg = "--force" force_arg = "--force"
else: else:
force_arg = "" force_arg = ""
retcode, stdout, stderr = common.run_os_command(
f"ceph osd purge {osd_id} {force_arg} --yes-i-really-mean-it"
)
if retcode:
print("ceph osd purge")
print(stdout)
print(stderr)
if force_flag:
logger.out("Ignoring error due to force flag", state="i")
else:
raise Exception
# Remove the OSD from the CRUSH map
retcode, stdout, stderr = common.run_os_command( retcode, stdout, stderr = common.run_os_command(
f"ceph osd crush rm osd.{osd_id}" f"ceph osd crush rm osd.{osd_id}"
) )
if retcode: if retcode:
print("ceph osd crush rm") logger.out("Failed: ceph osd crush rm", state="e")
print(stdout) logger.out(stdout, state="d")
print(stderr) logger.out(stderr, state="d")
if force_flag:
logger.out("Ignoring error due to force flag", state="i")
else:
raise Exception
# Purge the OSD
retcode, stdout, stderr = common.run_os_command(
f"ceph osd purge {osd_id} {force_arg} --yes-i-really-mean-it"
)
if retcode:
logger.out("Failed: ceph osd purge", state="e")
logger.out(stdout, state="d")
logger.out(stderr, state="d")
if force_flag: if force_flag:
logger.out("Ignoring error due to force flag", state="i") logger.out("Ignoring error due to force flag", state="i")
else: else:
raise Exception raise Exception
# 7. Remove the DB device # 6. Remove the DB device
if zkhandler.exists(("osd.db_device", osd_id)): if db_device is not None:
db_device = zkhandler.read(("osd.db_device", osd_id))
logger.out( logger.out(
'Removing OSD DB logical volume "{}"'.format(db_device), f'Removing OSD DB logical volume "{db_device}"',
state="i", state="i",
) )
retcode, stdout, stderr = common.run_os_command( retcode, stdout, stderr = common.run_os_command(
"lvremove --yes --force {}".format(db_device) f"lvremove --yes --force {db_device}"
) )
# 8. Delete OSD from ZK if not skip_zap_flag:
logger.out("Deleting OSD {} from Zookeeper".format(osd_id), state="i") # 7. Determine the block devices
zkhandler.delete(("osd", osd_id), recursive=True) logger.out(
f"Getting disk info for OSD {osd_id} device {data_device}",
state="i",
)
found_osds = CephOSDInstance.find_osds_from_block(logger, data_device)
if osd_id in found_osds.keys():
# Try to determine if any other OSDs shared a block device with this OSD
_, osd_list = get_list_osd(zkhandler, None)
all_osds_on_block = [
o
for o in osd_list
if o["node"] == node and o["device"] == data_device
]
if len(all_osds_on_block) < 1:
logger.out(
f"Found no peer split OSDs on {data_device}; zapping disk",
state="i",
)
retcode, stdout, stderr = common.run_os_command(
f"ceph-volume lvm zap --destroy {data_device}"
)
if retcode:
logger.out("Failed: ceph-volume lvm zap", state="e")
logger.out(stdout, state="d")
logger.out(stderr, state="d")
raise Exception
else:
logger.out(
f"Found {len(all_osds_on_block)} OSD(s) still remaining on {data_device}; skipping zap",
state="w",
)
else:
logger.out(
f"Could not find OSD {osd_id} on device {data_device}; skipping zap",
state="w",
)
# Log it # Log it
logger.out("Successfully removed OSD {}".format(osd_id), state="o") logger.out(f"Successfully removed OSD {osd_id}", state="o")
return True return True
except Exception as e: except Exception as e:
# Log it # Log it
logger.out("Failed to remove OSD {}: {}".format(osd_id, e), state="e") logger.out(f"Failed to remove OSD {osd_id}: {e}", state="e")
return False return False
@staticmethod @staticmethod
@ -932,18 +949,18 @@ class CephOSDInstance(object):
"sgdisk --clear {}".format(device) "sgdisk --clear {}".format(device)
) )
if retcode: if retcode:
print("sgdisk create partition table") logger.out("Failed: sgdisk create partition table", state="e")
print(stdout) logger.out(stdout, state="d")
print(stderr) logger.out(stderr, state="d")
raise Exception raise Exception
retcode, stdout, stderr = common.run_os_command( retcode, stdout, stderr = common.run_os_command(
"sgdisk --new 1:: --typecode 1:8e00 {}".format(device) "sgdisk --new 1:: --typecode 1:8e00 {}".format(device)
) )
if retcode: if retcode:
print("sgdisk create pv partition") logger.out("Failed: sgdisk create pv partition", state="e")
print(stdout) logger.out(stdout, state="d")
print(stderr) logger.out(stderr, state="d")
raise Exception raise Exception
# Handle the partition ID portion # Handle the partition ID portion
@ -964,9 +981,9 @@ class CephOSDInstance(object):
"pvcreate --force {}".format(partition) "pvcreate --force {}".format(partition)
) )
if retcode: if retcode:
print("pv creation") logger.out("Failed: pv creation", state="e")
print(stdout) logger.out(stdout, state="d")
print(stderr) logger.out(stderr, state="d")
raise Exception raise Exception
# 2. Create the VG (named 'osd-db') # 2. Create the VG (named 'osd-db')
@ -977,9 +994,9 @@ class CephOSDInstance(object):
"vgcreate --force osd-db {}".format(partition) "vgcreate --force osd-db {}".format(partition)
) )
if retcode: if retcode:
print("vg creation") logger.out("Failed: vg creation", state="e")
print(stdout) logger.out(stdout, state="d")
print(stderr) logger.out(stderr, state="d")
raise Exception raise Exception
# Log it # Log it
@ -1029,9 +1046,9 @@ class CephOSDInstance(object):
) )
) )
if retcode: if retcode:
print("db lv creation") logger.out("Failed: db lv creation", state="e")
print(stdout) logger.out(stdout, state="d")
print(stderr) logger.out(stderr, state="d")
raise Exception raise Exception
# Log it # Log it
@ -1282,7 +1299,7 @@ def ceph_command(zkhandler, logger, this_node, data, d_osd):
with zk_lock: with zk_lock:
# Remove the OSD # Remove the OSD
result = CephOSDInstance.remove_osd( result = CephOSDInstance.remove_osd(
zkhandler, logger, osd_id, force_flag zkhandler, logger, this_node.name, osd_id, force_flag
) )
# Command succeeded # Command succeeded
if result: if result: