Add OSD removal force option

Ensures a removal can continue even in situations where some step(s)
might fail, for instance removing an obsolete OSD from a replaced node.
This commit is contained in:
Joshua Boniface 2022-04-29 11:16:33 -04:00
parent 53aed0a735
commit 4d698be34b
6 changed files with 63 additions and 20 deletions

View File

@ -4099,11 +4099,16 @@ class API_Storage_Ceph_OSD_Element(Resource):
@RequestParser(
[
{
"name": "force",
"required": False,
"helptext": "Force removal even if steps fail.",
},
{
"name": "yes-i-really-mean-it",
"required": True,
"helptext": "Please confirm that 'yes-i-really-mean-it'.",
}
},
]
)
@Authenticator
@ -4116,6 +4121,11 @@ class API_Storage_Ceph_OSD_Element(Resource):
tags:
- storage / ceph
parameters:
- in: query
name: force
type: boolean
required: false
description: Force removal even if some step(s) fail
- in: query
name: yes-i-really-mean-it
type: string
@ -4138,7 +4148,7 @@ class API_Storage_Ceph_OSD_Element(Resource):
type: object
id: Message
"""
return api_helper.ceph_osd_remove(osdid)
return api_helper.ceph_osd_remove(osdid, reqargs.get("force", False))
api.add_resource(API_Storage_Ceph_OSD_Element, "/storage/ceph/osd/<osdid>")

View File

@ -1302,11 +1302,11 @@ def ceph_osd_add(zkhandler, node, device, weight, ext_db_flag=False, ext_db_rati
@ZKConnection(config)
def ceph_osd_remove(zkhandler, osd_id):
def ceph_osd_remove(zkhandler, osd_id, force_flag):
"""
Remove a Ceph OSD from the PVC Ceph storage cluster.
"""
retflag, retdata = pvc_ceph.remove_osd(zkhandler, osd_id)
retflag, retdata = pvc_ceph.remove_osd(zkhandler, osd_id, force_flag)
if retflag:
retcode = 200

View File

@ -255,7 +255,7 @@ def ceph_osd_add(config, node, device, weight, ext_db_flag, ext_db_ratio):
return retstatus, response.json().get("message", "")
def ceph_osd_remove(config, osdid):
def ceph_osd_remove(config, osdid, force_flag):
"""
Remove Ceph OSD
@ -263,7 +263,7 @@ def ceph_osd_remove(config, osdid):
API arguments:
API schema: {"message":"{data}"}
"""
params = {"yes-i-really-mean-it": "yes"}
params = {"force": force_flag, "yes-i-really-mean-it": "yes"}
response = call_api(
config, "delete", "/storage/ceph/osd/{osdid}".format(osdid=osdid), params=params
)

View File

@ -3376,6 +3376,14 @@ def ceph_osd_add(node, device, weight, ext_db_flag, ext_db_ratio, confirm_flag):
###############################################################################
@click.command(name="remove", short_help="Remove OSD.")
@click.argument("osdid")
@click.option(
"-f",
"--force",
"force_flag",
is_flag=True,
default=False,
help="Force removal even if steps fail",
)
@click.option(
"-y",
"--yes",
@ -3385,11 +3393,13 @@ def ceph_osd_add(node, device, weight, ext_db_flag, ext_db_ratio, confirm_flag):
help="Confirm the removal",
)
@cluster_req
def ceph_osd_remove(osdid, confirm_flag):
def ceph_osd_remove(osdid, force_flag, confirm_flag):
"""
Remove a Ceph OSD with ID OSDID.
DANGER: This will completely remove the OSD from the cluster. OSDs will rebalance which will negatively affect performance and available space. It is STRONGLY RECOMMENDED to set an OSD out (using 'pvc storage osd out') and allow the cluster to fully rebalance (verified with 'pvc storage status') before removing an OSD.
NOTE: The "-f"/"--force" option is useful after replacing a failed node, to ensure the OSD is removed even if the OSD in question does not properly exist on the node after a rebuild.
"""
if not confirm_flag and not config["unsafe"]:
try:
@ -3397,7 +3407,7 @@ def ceph_osd_remove(osdid, confirm_flag):
except Exception:
exit(0)
retcode, retmsg = pvc_ceph.ceph_osd_remove(config, osdid)
retcode, retmsg = pvc_ceph.ceph_osd_remove(config, osdid, force_flag)
cleanup(retcode, retmsg)

View File

@ -286,14 +286,14 @@ def add_osd(zkhandler, node, device, weight, ext_db_flag=False, ext_db_ratio=0.0
return success, message
def remove_osd(zkhandler, osd_id):
def remove_osd(zkhandler, osd_id, force_flag):
if not verifyOSD(zkhandler, osd_id):
return False, 'ERROR: No OSD with ID "{}" is present in the cluster.'.format(
osd_id
)
# Tell the cluster to remove an OSD
remove_osd_string = "osd_remove {}".format(osd_id)
remove_osd_string = "osd_remove {} {}".format(osd_id, str(force_flag))
zkhandler.write([("base.cmd.ceph", remove_osd_string)])
# Wait 1/2 second for the cluster to get the message and start working
time.sleep(0.5)

View File

@ -310,7 +310,7 @@ class CephOSDInstance(object):
return False
@staticmethod
def remove_osd(zkhandler, logger, osd_id, osd_obj):
def remove_osd(zkhandler, logger, osd_id, osd_obj, force_flag):
logger.out("Removing OSD disk {}".format(osd_id), state="i")
try:
# Verify the OSD is present
@ -320,6 +320,9 @@ class CephOSDInstance(object):
logger.out(
"Could not find OSD {} in the cluster".format(osd_id), state="e"
)
if force_flag:
logger.out("Ignoring error due to force flag", state="i")
else:
return True
# 1. Set the OSD down and out so it will flush
@ -331,6 +334,9 @@ class CephOSDInstance(object):
print("ceph osd down")
print(stdout)
print(stderr)
if force_flag:
logger.out("Ignoring error due to force flag", state="i")
else:
raise Exception
logger.out("Setting out OSD disk with ID {}".format(osd_id), state="i")
@ -341,6 +347,9 @@ class CephOSDInstance(object):
print("ceph osd out")
print(stdout)
print(stderr)
if force_flag:
logger.out("Ignoring error due to force flag", state="i")
else:
raise Exception
# 2. Wait for the OSD to flush
@ -358,8 +367,12 @@ class CephOSDInstance(object):
num_pgs = osd_string["num_pgs"]
if num_pgs > 0:
time.sleep(5)
else:
if force_flag:
logger.out("Ignoring error due to force flag", state="i")
else:
raise Exception
except Exception:
break
@ -372,6 +385,9 @@ class CephOSDInstance(object):
print("systemctl stop")
print(stdout)
print(stderr)
if force_flag:
logger.out("Ignoring error due to force flag", state="i")
else:
raise Exception
# FIXME: There has to be a better way to do this /shrug
@ -408,6 +424,9 @@ class CephOSDInstance(object):
print("ceph-volume lvm zap")
print(stdout)
print(stderr)
if force_flag:
logger.out("Ignoring error due to force flag", state="i")
else:
raise Exception
# 6. Purge the OSD from Ceph
@ -419,6 +438,9 @@ class CephOSDInstance(object):
print("ceph osd purge")
print(stdout)
print(stderr)
if force_flag:
logger.out("Ignoring error due to force flag", state="i")
else:
raise Exception
# 7. Remove the DB device
@ -734,7 +756,8 @@ def ceph_command(zkhandler, logger, this_node, data, d_osd):
# Removing an OSD
elif command == "osd_remove":
osd_id = args
osd_id = args[0]
force_flag = bool(strtobool(args[1]))
# Verify osd_id is in the list
if d_osd[osd_id] and d_osd[osd_id].node == this_node.name:
@ -743,7 +766,7 @@ def ceph_command(zkhandler, logger, this_node, data, d_osd):
with zk_lock:
# Remove the OSD
result = CephOSDInstance.remove_osd(
zkhandler, logger, osd_id, d_osd[osd_id]
zkhandler, logger, osd_id, d_osd[osd_id], force_flag
)
# Command succeeded
if result: