Fix bugs with forced removal

This commit is contained in:
Joshua Boniface 2022-04-29 13:26:36 -04:00
parent 413100a147
commit d6ca74376a
3 changed files with 43 additions and 31 deletions

View File

@@ -406,7 +406,7 @@ def format_list_osd(osd_list):
osd_id_length = _osd_id_length
# Set the OSD node length
- _osd_node_length = len(osd_information["stats"]["node"]) + 1
+ _osd_node_length = len(osd_information["node"]) + 1
if _osd_node_length > osd_node_length:
osd_node_length = _osd_node_length
@@ -602,13 +602,6 @@ def format_list_osd(osd_list):
)
for osd_information in sorted(osd_list, key=lambda x: int(x["id"])):
- try:
- # If this happens, the node hasn't checked in fully yet, so just ignore it
- if osd_information["stats"]["node"] == "|":
- continue
- except KeyError:
- continue
osd_up_flag, osd_up_colour, osd_in_flag, osd_in_colour = getOutputColoursOSD(
osd_information
)
@@ -663,7 +656,7 @@ def format_list_osd(osd_list):
osd_rdops_length=osd_rdops_length,
osd_rddata_length=osd_rddata_length,
osd_id=osd_information["id"],
- osd_node=osd_information["stats"]["node"],
+ osd_node=osd_information["node"],
osd_device=osd_information["device"],
osd_db_device=osd_db_device,
osd_up_colour=osd_up_colour,

View File

@@ -181,6 +181,7 @@ def getClusterOSDList(zkhandler):
def getOSDInformation(zkhandler, osd_id):
# Get the devices
+ osd_node = zkhandler.read(("osd.node", osd_id))
osd_device = zkhandler.read(("osd.device", osd_id))
osd_db_device = zkhandler.read(("osd.db_device", osd_id))
# Parse the stats data
@@ -189,6 +190,7 @@ def getOSDInformation(zkhandler, osd_id):
osd_information = {
"id": osd_id,
+ "node": osd_node,
"device": osd_device,
"db_device": osd_db_device,
"stats": osd_stats,
@@ -293,7 +295,7 @@ def remove_osd(zkhandler, osd_id, force_flag):
)
# Tell the cluster to remove an OSD
- remove_osd_string = "osd_remove {} {}".format(osd_id, str(force_flag))
+ remove_osd_string = "osd_remove {},{}".format(osd_id, str(force_flag))
zkhandler.write([("base.cmd.ceph", remove_osd_string)])
# Wait 1/2 second for the cluster to get the message and start working
time.sleep(0.5)

View File

@@ -404,30 +404,47 @@ class CephOSDInstance(object):
break
# 4. Determine the block devices
- retcode, stdout, stderr = common.run_os_command(
- "readlink /var/lib/ceph/osd/ceph-{}/block".format(osd_id)
- )
- vg_name = stdout.split("/")[-2] # e.g. /dev/ceph-<uuid>/osd-block-<uuid>
- retcode, stdout, stderr = common.run_os_command(
- "vgs --separator , --noheadings -o pv_name {}".format(vg_name)
- )
- pv_block = stdout.strip()
+ device_zk = zkhandler.read(("osd.device", osd_id))
+ try:
+ retcode, stdout, stderr = common.run_os_command(
+ "readlink /var/lib/ceph/osd/ceph-{}/block".format(osd_id)
+ )
+ vg_name = stdout.split("/")[
+ -2
+ ] # e.g. /dev/ceph-<uuid>/osd-block-<uuid>
+ retcode, stdout, stderr = common.run_os_command(
+ "vgs --separator , --noheadings -o pv_name {}".format(vg_name)
+ )
+ pv_block = stdout.strip()
+ except Exception as e:
+ print(e)
+ pv_block = device_zk
- # 5. Zap the volumes
+ # 5a. Verify that the blockdev actually has a ceph volume that matches the ID, otherwise don't zap it
logger.out(
- "Zapping OSD disk with ID {} on {}".format(osd_id, pv_block), state="i"
+ f"Check OSD disk {pv_block} for OSD signature with ID osd.{osd_id}",
+ state="i",
)
retcode, stdout, stderr = common.run_os_command(
- "ceph-volume lvm zap --destroy {}".format(pv_block)
+ f"ceph-volume lvm list {pv_block}"
)
- if retcode:
- print("ceph-volume lvm zap")
- print(stdout)
- print(stderr)
- if force_flag:
- logger.out("Ignoring error due to force flag", state="i")
- else:
- raise Exception
+ if f"====== osd.{osd_id} =======" in stdout:
+ # 5b. Zap the volumes
+ logger.out(
+ "Zapping OSD disk with ID {} on {}".format(osd_id, pv_block),
+ state="i",
+ )
+ retcode, stdout, stderr = common.run_os_command(
+ "ceph-volume lvm zap --destroy {}".format(pv_block)
+ )
+ if retcode:
+ print("ceph-volume lvm zap")
+ print(stdout)
+ print(stderr)
+ if force_flag:
+ logger.out("Ignoring error due to force flag", state="i")
+ else:
+ raise Exception
# 6. Purge the OSD from Ceph
logger.out("Purging OSD disk with ID {}".format(osd_id), state="i")
@@ -756,8 +773,8 @@ def ceph_command(zkhandler, logger, this_node, data, d_osd):
# Removing an OSD
elif command == "osd_remove":
- osd_id = args[0]
- force_flag = bool(strtobool(args[1]))
+ osd_id, force = args.split(",")
+ force_flag = bool(strtobool(force))
# Verify osd_id is in the list
if d_osd[osd_id] and d_osd[osd_id].node == this_node.name: