From d6ca74376adfceb083bfedae4d4a09f334ea97f8 Mon Sep 17 00:00:00 2001
From: "Joshua M. Boniface"
Date: Fri, 29 Apr 2022 13:26:36 -0400
Subject: [PATCH] Fix bugs with forced removal

---
 client-cli/pvc/cli_lib/ceph.py               | 11 +---
 daemon-common/ceph.py                        |  4 +-
 node-daemon/pvcnoded/objects/CephInstance.py | 59 +++++++++++++-------
 3 files changed, 43 insertions(+), 31 deletions(-)

diff --git a/client-cli/pvc/cli_lib/ceph.py b/client-cli/pvc/cli_lib/ceph.py
index 65eb267b..84d884d7 100644
--- a/client-cli/pvc/cli_lib/ceph.py
+++ b/client-cli/pvc/cli_lib/ceph.py
@@ -406,7 +406,7 @@ def format_list_osd(osd_list):
             osd_id_length = _osd_id_length
 
         # Set the OSD node length
-        _osd_node_length = len(osd_information["stats"]["node"]) + 1
+        _osd_node_length = len(osd_information["node"]) + 1
         if _osd_node_length > osd_node_length:
             osd_node_length = _osd_node_length
 
@@ -602,13 +602,6 @@ def format_list_osd(osd_list):
     )
 
     for osd_information in sorted(osd_list, key=lambda x: int(x["id"])):
-        try:
-            # If this happens, the node hasn't checked in fully yet, so just ignore it
-            if osd_information["stats"]["node"] == "|":
-                continue
-        except KeyError:
-            continue
-
         osd_up_flag, osd_up_colour, osd_in_flag, osd_in_colour = getOutputColoursOSD(
             osd_information
         )
@@ -663,7 +656,7 @@ def format_list_osd(osd_list):
                 osd_rdops_length=osd_rdops_length,
                 osd_rddata_length=osd_rddata_length,
                 osd_id=osd_information["id"],
-                osd_node=osd_information["stats"]["node"],
+                osd_node=osd_information["node"],
                 osd_device=osd_information["device"],
                 osd_db_device=osd_db_device,
                 osd_up_colour=osd_up_colour,
diff --git a/daemon-common/ceph.py b/daemon-common/ceph.py
index f8b2163d..79752bbc 100644
--- a/daemon-common/ceph.py
+++ b/daemon-common/ceph.py
@@ -181,6 +181,7 @@ def getClusterOSDList(zkhandler):
 
 def getOSDInformation(zkhandler, osd_id):
     # Get the devices
+    osd_node = zkhandler.read(("osd.node", osd_id))
     osd_device = zkhandler.read(("osd.device", osd_id))
     osd_db_device = zkhandler.read(("osd.db_device", osd_id))
     # Parse the stats data
@@ -189,6 +190,7 @@ def getOSDInformation(zkhandler, osd_id):
 
     osd_information = {
         "id": osd_id,
+        "node": osd_node,
         "device": osd_device,
         "db_device": osd_db_device,
         "stats": osd_stats,
@@ -293,7 +295,7 @@ def remove_osd(zkhandler, osd_id, force_flag):
         )
 
     # Tell the cluster to remove an OSD
-    remove_osd_string = "osd_remove {} {}".format(osd_id, str(force_flag))
+    remove_osd_string = "osd_remove {},{}".format(osd_id, str(force_flag))
     zkhandler.write([("base.cmd.ceph", remove_osd_string)])
     # Wait 1/2 second for the cluster to get the message and start working
     time.sleep(0.5)
diff --git a/node-daemon/pvcnoded/objects/CephInstance.py b/node-daemon/pvcnoded/objects/CephInstance.py
index e8c7dae4..70061a0c 100644
--- a/node-daemon/pvcnoded/objects/CephInstance.py
+++ b/node-daemon/pvcnoded/objects/CephInstance.py
@@ -404,30 +404,47 @@ class CephOSDInstance(object):
                     break
 
             # 4. Determine the block devices
-            retcode, stdout, stderr = common.run_os_command(
-                "readlink /var/lib/ceph/osd/ceph-{}/block".format(osd_id)
-            )
-            vg_name = stdout.split("/")[-2]  # e.g. /dev/ceph-/osd-block-
-            retcode, stdout, stderr = common.run_os_command(
-                "vgs --separator , --noheadings -o pv_name {}".format(vg_name)
-            )
-            pv_block = stdout.strip()
+            device_zk = zkhandler.read(("osd.device", osd_id))
+            try:
+                retcode, stdout, stderr = common.run_os_command(
+                    "readlink /var/lib/ceph/osd/ceph-{}/block".format(osd_id)
+                )
+                vg_name = stdout.split("/")[
+                    -2
+                ]  # e.g. /dev/ceph-/osd-block-
+                retcode, stdout, stderr = common.run_os_command(
+                    "vgs --separator , --noheadings -o pv_name {}".format(vg_name)
+                )
+                pv_block = stdout.strip()
+            except Exception as e:
+                print(e)
+                pv_block = device_zk
 
-            # 5. Zap the volumes
+            # 5a. Verify that the blockdev actually has a ceph volume that matches the ID, otherwise don't zap it
             logger.out(
-                "Zapping OSD disk with ID {} on {}".format(osd_id, pv_block), state="i"
+                f"Check OSD disk {pv_block} for OSD signature with ID osd.{osd_id}",
+                state="i",
             )
             retcode, stdout, stderr = common.run_os_command(
-                "ceph-volume lvm zap --destroy {}".format(pv_block)
+                f"ceph-volume lvm list {pv_block}"
             )
-            if retcode:
-                print("ceph-volume lvm zap")
-                print(stdout)
-                print(stderr)
-                if force_flag:
-                    logger.out("Ignoring error due to force flag", state="i")
-                else:
-                    raise Exception
+            if f"====== osd.{osd_id} =======" in stdout:
+                # 5b. Zap the volumes
+                logger.out(
+                    "Zapping OSD disk with ID {} on {}".format(osd_id, pv_block),
+                    state="i",
+                )
+                retcode, stdout, stderr = common.run_os_command(
+                    "ceph-volume lvm zap --destroy {}".format(pv_block)
+                )
+                if retcode:
+                    print("ceph-volume lvm zap")
+                    print(stdout)
+                    print(stderr)
+                    if force_flag:
+                        logger.out("Ignoring error due to force flag", state="i")
+                    else:
+                        raise Exception
 
             # 6. Purge the OSD from Ceph
             logger.out("Purging OSD disk with ID {}".format(osd_id), state="i")
@@ -756,8 +773,8 @@ def ceph_command(zkhandler, logger, this_node, data, d_osd):
 
     # Removing an OSD
     elif command == "osd_remove":
-        osd_id = args[0]
-        force_flag = bool(strtobool(args[1]))
+        osd_id, force = args.split(",")
+        force_flag = bool(strtobool(force))
 
         # Verify osd_id is in the list
         if d_osd[osd_id] and d_osd[osd_id].node == this_node.name:
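
For context on the command-parsing part of this fix: the CLI and the node daemon exchange the osd_remove request through a single Zookeeper string, and the bug was in how that string was encoded. The CLI now joins the OSD ID and the force flag with a comma ("osd_remove {},{}") and the daemon recovers both values with args.split(","), instead of indexing characters out of the argument string. The following is a minimal standalone sketch of that round trip, not part of the patch; it assumes, hypothetically, that the daemon separates the command word from its argument string with a single space split, and parse_force is only a stand-in for the strtobool call the daemon actually uses.

    # Minimal sketch (not part of the patch) of the osd_remove argument round trip.
    # Hypothetical assumptions: the Zookeeper payload is split once into a command
    # word and an argument string; parse_force mimics strtobool for the
    # "True"/"False" values written by remove_osd().

    def parse_force(value):
        # Stand-in for the strtobool() call used in ceph_command()
        return value.strip().lower() in ("y", "yes", "t", "true", "on", "1")

    data = "osd_remove 12,True"          # what remove_osd() writes after this patch
    command, args = data.split(" ", 1)   # -> "osd_remove", "12,True"

    if command == "osd_remove":
        osd_id, force = args.split(",")  # comma split keeps multi-digit IDs intact
        force_flag = parse_force(force)
        print(osd_id, force_flag)        # -> 12 True

Under those same assumptions, the old space-separated payload ("osd_remove 12 True") made args[0] and args[1] pick out single characters ("1" and "2"), so multi-digit OSD IDs and the force flag could not parse correctly.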