Fix bugs with forced removal

Joshua Boniface 2022-04-29 13:26:36 -04:00
parent 413100a147
commit d6ca74376a
3 changed files with 43 additions and 31 deletions

File 1 of 3

@@ -406,7 +406,7 @@ def format_list_osd(osd_list):
             osd_id_length = _osd_id_length

         # Set the OSD node length
-        _osd_node_length = len(osd_information["stats"]["node"]) + 1
+        _osd_node_length = len(osd_information["node"]) + 1
         if _osd_node_length > osd_node_length:
             osd_node_length = _osd_node_length
@@ -602,13 +602,6 @@ def format_list_osd(osd_list):
     )

     for osd_information in sorted(osd_list, key=lambda x: int(x["id"])):
-        try:
-            # If this happens, the node hasn't checked in fully yet, so just ignore it
-            if osd_information["stats"]["node"] == "|":
-                continue
-        except KeyError:
-            continue
-
         osd_up_flag, osd_up_colour, osd_in_flag, osd_in_colour = getOutputColoursOSD(
             osd_information
         )
@@ -663,7 +656,7 @@ def format_list_osd(osd_list):
             osd_rdops_length=osd_rdops_length,
             osd_rddata_length=osd_rddata_length,
             osd_id=osd_information["id"],
-            osd_node=osd_information["stats"]["node"],
+            osd_node=osd_information["node"],
             osd_device=osd_information["device"],
             osd_db_device=osd_db_device,
             osd_up_colour=osd_up_colour,
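
For illustration, a minimal standalone sketch of the record shape this formatter now consumes, with hypothetical sample data: since "node" is a top-level key (populated directly from Zookeeper, see the next file), the column-width pass no longer needs a KeyError guard or the "|" placeholder check.

    # Hypothetical OSD records in the new shape returned by getOSDInformation();
    # "node" is top-level, so it exists even before the node reports stats.
    osd_list = [
        {"id": "2", "node": "hv1", "device": "/dev/sdb", "stats": {}},
        {"id": "10", "node": "hv2", "device": "/dev/sdc", "stats": {}},
    ]

    osd_node_length = len("Node") + 1  # assumed header minimum width
    for osd_information in sorted(osd_list, key=lambda x: int(x["id"])):
        _osd_node_length = len(osd_information["node"]) + 1
        if _osd_node_length > osd_node_length:
            osd_node_length = _osd_node_length

    print(osd_node_length)  # 5: the header minimum beats these short names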

File 2 of 3

@@ -181,6 +181,7 @@ def getClusterOSDList(zkhandler):

 def getOSDInformation(zkhandler, osd_id):
     # Get the devices
+    osd_node = zkhandler.read(("osd.node", osd_id))
     osd_device = zkhandler.read(("osd.device", osd_id))
     osd_db_device = zkhandler.read(("osd.db_device", osd_id))
     # Parse the stats data
@@ -189,6 +190,7 @@ def getOSDInformation(zkhandler, osd_id):
     osd_information = {
         "id": osd_id,
+        "node": osd_node,
         "device": osd_device,
         "db_device": osd_db_device,
         "stats": osd_stats,
     }
@@ -293,7 +295,7 @@ def remove_osd(zkhandler, osd_id, force_flag):
     )

     # Tell the cluster to remove an OSD
-    remove_osd_string = "osd_remove {} {}".format(osd_id, str(force_flag))
+    remove_osd_string = "osd_remove {},{}".format(osd_id, str(force_flag))
     zkhandler.write([("base.cmd.ceph", remove_osd_string)])
     # Wait 1/2 second for the cluster to get the message and start working
     time.sleep(0.5)
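
The separator change is one of the bug fixes: the payload written to base.cmd.ceph is split on whitespace into a command and a single argument token on the receiving side (an assumption about the dispatcher, which is not itself shown in this diff), so a space between osd_id and force_flag broke the framing. A minimal sketch of the round trip, with hypothetical values:

    # Sender side, as in remove_osd() above: pack the arguments into one
    # whitespace-free token. osd_id and force_flag are hypothetical values.
    osd_id, force_flag = "8", True
    remove_osd_string = "osd_remove {},{}".format(osd_id, str(force_flag))

    # Receiver side (assumption: the daemon splits the payload on whitespace
    # into a command and a single args token before calling ceph_command).
    command, args = remove_osd_string.split()
    print(command, args)  # osd_remove 8,True

The old space-separated format "osd_remove 8 True" could not survive this framing intact, which is why the handler below also changes from indexing args to splitting it on the comma.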

File 3 of 3

@@ -404,30 +404,47 @@ class CephOSDInstance(object):
                 break

             # 4. Determine the block devices
-            retcode, stdout, stderr = common.run_os_command(
-                "readlink /var/lib/ceph/osd/ceph-{}/block".format(osd_id)
-            )
-            vg_name = stdout.split("/")[-2]  # e.g. /dev/ceph-<uuid>/osd-block-<uuid>
-            retcode, stdout, stderr = common.run_os_command(
-                "vgs --separator , --noheadings -o pv_name {}".format(vg_name)
-            )
-            pv_block = stdout.strip()
+            device_zk = zkhandler.read(("osd.device", osd_id))
+            try:
+                retcode, stdout, stderr = common.run_os_command(
+                    "readlink /var/lib/ceph/osd/ceph-{}/block".format(osd_id)
+                )
+                vg_name = stdout.split("/")[
+                    -2
+                ]  # e.g. /dev/ceph-<uuid>/osd-block-<uuid>
+                retcode, stdout, stderr = common.run_os_command(
+                    "vgs --separator , --noheadings -o pv_name {}".format(vg_name)
+                )
+                pv_block = stdout.strip()
+            except Exception as e:
+                print(e)
+                pv_block = device_zk

-            # 5. Zap the volumes
+            # 5a. Verify that the blockdev actually has a ceph volume that matches the ID, otherwise don't zap it
             logger.out(
-                "Zapping OSD disk with ID {} on {}".format(osd_id, pv_block), state="i"
+                f"Check OSD disk {pv_block} for OSD signature with ID osd.{osd_id}",
+                state="i",
             )
             retcode, stdout, stderr = common.run_os_command(
-                "ceph-volume lvm zap --destroy {}".format(pv_block)
+                f"ceph-volume lvm list {pv_block}"
             )
-            if retcode:
-                print("ceph-volume lvm zap")
-                print(stdout)
-                print(stderr)
-                if force_flag:
-                    logger.out("Ignoring error due to force flag", state="i")
-                else:
-                    raise Exception
+            if f"====== osd.{osd_id} =======" in stdout:
+                # 5b. Zap the volumes
+                logger.out(
+                    "Zapping OSD disk with ID {} on {}".format(osd_id, pv_block),
+                    state="i",
+                )
+                retcode, stdout, stderr = common.run_os_command(
+                    "ceph-volume lvm zap --destroy {}".format(pv_block)
+                )
+                if retcode:
+                    print("ceph-volume lvm zap")
+                    print(stdout)
+                    print(stderr)
+                    if force_flag:
+                        logger.out("Ignoring error due to force flag", state="i")
+                    else:
+                        raise Exception

             # 6. Purge the OSD from Ceph
             logger.out("Purging OSD disk with ID {}".format(osd_id), state="i")
@@ -756,8 +773,8 @@ def ceph_command(zkhandler, logger, this_node, data, d_osd):

     # Removing an OSD
     elif command == "osd_remove":
-        osd_id = args[0]
-        force_flag = bool(strtobool(args[1]))
+        osd_id, force = args.split(",")
+        force_flag = bool(strtobool(force))

         # Verify osd_id is in the list
         if d_osd[osd_id] and d_osd[osd_id].node == this_node.name:
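
And the matching parse on the handler side, as a standalone sketch with the same hypothetical payload:

    from distutils.util import strtobool

    args = "8,True"  # hypothetical args token produced by remove_osd()
    osd_id, force = args.split(",")
    force_flag = bool(strtobool(force))  # strtobool returns 1/0, hence bool()
    print(osd_id, force_flag)  # 8 True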