Port OSD on-node tasks to Celery worker system

Adds Celery versions of the osd_add, osd_replace, osd_refresh,
osd_remove, and osd_db_vg_add functions.
2023-11-09 14:05:15 -05:00
parent 89681d54b9
commit ce17c60a20
12 changed files with 2039 additions and 1723 deletions

View File

@@ -1582,10 +1582,10 @@ def cli_vm_flush_locks(domain, wait_flag):
     NOTE: This is a task-based command. The "--wait" flag (default) will block and show progress. Specifying the "--no-wait" flag will return immediately with a job ID instead, which can be queried externally later.
     """
-    retcode, retmsg = pvc.lib.vm.vm_locks(CLI_CONFIG, domain, wait_flag=wait_flag)
+    retcode, retmsg = pvc.lib.vm.vm_locks(CLI_CONFIG, domain, wait_flag)

     if retcode and wait_flag:
-        retmsg = wait_for_flush_locks(CLI_CONFIG, retmsg)
+        retmsg = wait_for_celery_task(CLI_CONFIG, retmsg)

     finish(retcode, retmsg)
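With "--no-wait", the caller receives the task detail instead of a progress display. A minimal sketch of polling such a job afterwards, reusing the pvc.lib.common.task_status call that appears later in this diff; the poll_task helper itself is hypothetical:

```python
# Illustrative external poller for a job ID returned by a --no-wait command.
# pvc.lib.common.task_status(...) is the same call used by the new
# wait_for_celery_task helper below; poll_task itself is not part of PVC.
from time import sleep

import pvc.lib.common


def poll_task(config, task_id, interval=1.0):
    """Poll until the Celery task leaves its PENDING/RUNNING states."""
    while True:
        task_status = pvc.lib.common.task_status(
            config, task_id=task_id, is_watching=True
        )
        if task_status.get("state") not in ("PENDING", "RUNNING"):
            return task_status
        sleep(interval)
```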
@@ -3372,10 +3372,18 @@ def cli_storage_osd():
 @connection_req
 @click.argument("node")
 @click.argument("device")
+@click.option(
+    "--wait/--no-wait",
+    "wait_flag",
+    is_flag=True,
+    default=True,
+    show_default=True,
+    help="Wait or don't wait for task to complete, showing progress",
+)
 @confirm_opt(
     "Destroy all data on and create a new OSD database volume group on node {node} device {device}"
 )
-def cli_storage_osd_create_db_vg(node, device):
+def cli_storage_osd_create_db_vg(node, device, wait_flag):
     """
     Create a new Ceph OSD database volume group on node NODE with block device DEVICE.
@@ -3390,7 +3398,12 @@ def cli_storage_osd_create_db_vg(node, device):
     A "detect" string is a string in the form "detect:<NAME>:<HUMAN-SIZE>:<ID>". Detect strings allow for automatic determination of Linux block device paths from known basic information about disks by leveraging "lsscsi" on the target host. The "NAME" should be some descriptive identifier, for instance the manufacturer (e.g. "INTEL"), the "HUMAN-SIZE" should be the labeled human-readable size of the device (e.g. "480GB", "1.92TB"), and "ID" specifies the Nth 0-indexed device which matches the "NAME" and "HUMAN-SIZE" values (e.g. "2" would match the third device with the corresponding "NAME" and "HUMAN-SIZE"). When matching against sizes, there is +/- 3% flexibility to account for base-1000 vs. base-1024 differences and rounding errors. The "NAME" is case-insensitive and may contain whitespace, but if it does, the entire detect string should be quoted. More information about detect strings can be found in the manual.
     """
-    retcode, retmsg = pvc.lib.storage.ceph_osd_db_vg_add(CLI_CONFIG, node, device)
+    retcode, retmsg = pvc.lib.storage.ceph_osd_db_vg_add(
+        CLI_CONFIG, node, device, wait_flag
+    )
+    if retcode and wait_flag:
+        retmsg = wait_for_celery_task(CLI_CONFIG, retmsg)

     finish(retcode, retmsg)
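The detect-string matching described in the docstring above lends itself to a small illustration. The sketch below is not PVC's implementation: it assumes the device inventory has already been parsed out of lsscsi into (name, size-in-bytes, path) tuples, and the helper names are invented for the example.

```python
# Illustrative detect-string matcher; not PVC's actual implementation.
# Matches "detect:<NAME>:<HUMAN-SIZE>:<ID>" against (name, size_bytes, path)
# tuples that would normally come from parsing `lsscsi` output.

def parse_human_size(text):
    """Convert a label like '1.92TB' to bytes (base-1000 units)."""
    units = {"KB": 1e3, "MB": 1e6, "GB": 1e9, "TB": 1e12}
    for suffix, factor in units.items():
        if text.upper().endswith(suffix):
            return float(text[: -len(suffix)]) * factor
    return float(text)


def match_detect_string(detect, devices):
    """Return the path of the Nth device matching NAME and HUMAN-SIZE (+/- 3%)."""
    _, name, human_size, index = detect.split(":")
    target = parse_human_size(human_size)
    matches = [
        path
        for dev_name, dev_size, path in devices
        if name.lower() in dev_name.lower()
        and abs(dev_size - target) / target <= 0.03
    ]
    return matches[int(index)] if int(index) < len(matches) else None


# Example: "2" selects the third matching device, since IDs are 0-indexed
devices = [("INTEL SSDSC2KB480G8", 480e9, f"/dev/sd{c}") for c in "abc"]
print(match_detect_string("detect:INTEL:480GB:2", devices))  # /dev/sdc
```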
@@ -3434,8 +3447,18 @@ def cli_storage_osd_create_db_vg(node, device):
     type=int,
     help="Split (an NVMe) disk into this many OSDs.",
 )
+@click.option(
+    "--wait/--no-wait",
+    "wait_flag",
+    is_flag=True,
+    default=True,
+    show_default=True,
+    help="Wait or don't wait for task to complete, showing progress",
+)
 @confirm_opt("Destroy all data on and create new OSD(s) on node {node} device {device}")
-def cli_storage_osd_add(node, device, weight, ext_db_ratio, ext_db_size, osd_count):
+def cli_storage_osd_add(
+    node, device, weight, ext_db_ratio, ext_db_size, osd_count, wait_flag
+):
     """
     Add a new Ceph OSD on node NODE with block device DEVICE.
@@ -3456,11 +3479,6 @@ def cli_storage_osd_add(node, device, weight, ext_db_ratio, ext_db_size, osd_count):
     NOTE: This command may take a long time to complete. Observe the node logs of the hosting OSD node for detailed status.
     """
-    echo(
-        CLI_CONFIG,
-        "Waiting for node task to complete, this may take some time... ",
-        newline=False,
-    )
     retcode, retmsg = pvc.lib.storage.ceph_osd_add(
         CLI_CONFIG,
         node,
@@ -3469,8 +3487,11 @@ def cli_storage_osd_add(node, device, weight, ext_db_ratio, ext_db_size, osd_count):
         ext_db_ratio,
         ext_db_size,
         osd_count,
+        wait_flag,
     )
-    echo(CLI_CONFIG, "done.")
+    if retcode and wait_flag:
+        retmsg = wait_for_celery_task(CLI_CONFIG, retmsg)

     finish(retcode, retmsg)
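The eight-line "--wait/--no-wait" option block above is repeated verbatim on all five OSD commands in this diff. As a design note, standard click decorator composition would let it be declared once; a hedged sketch of that pattern, not something this commit actually does:

```python
# Hypothetical shared decorator for the repeated --wait/--no-wait option.
import click


def wait_option(command):
    """Attach the shared --wait/--no-wait flag to a click command function."""
    return click.option(
        "--wait/--no-wait",
        "wait_flag",
        is_flag=True,
        default=True,
        show_default=True,
        help="Wait or don't wait for task to complete, showing progress",
    )(command)


@click.command()
@wait_option
def example(wait_flag):
    """Toy command demonstrating the shared flag."""
    click.echo(f"wait_flag={wait_flag}")
```

Each command would then simply take wait_flag as a parameter, exactly as the five commands in this diff already do.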
@@ -3509,11 +3530,19 @@ def cli_storage_osd_add(node, device, weight, ext_db_ratio, ext_db_size, osd_count):
     default=None,
     help="Create a new external database logical volume for the OSD(s) with this human-unit size; if unset, old ext_db_size is used",
 )
+@click.option(
+    "--wait/--no-wait",
+    "wait_flag",
+    is_flag=True,
+    default=True,
+    show_default=True,
+    help="Wait or don't wait for task to complete, showing progress",
+)
 @confirm_opt(
     "Destroy all data on and replace OSD {osdid} (and peer split OSDs) with new device {new_device}"
 )
 def cli_storage_osd_replace(
-    osdid, new_device, old_device, weight, ext_db_ratio, ext_db_size
+    osdid, new_device, old_device, weight, ext_db_ratio, ext_db_size, wait_flag
 ):
     """
     Replace the block device of an existing OSD with ID OSDID, and any peer split OSDs with the same block device, with NEW_DEVICE. Use this command to replace a failed or smaller OSD block device with a new one in one command.
@@ -3533,15 +3562,19 @@ def cli_storage_osd_replace(
     NOTE: This command may take a long time to complete. Observe the node logs of the hosting OSD node for detailed status.
     """
-    echo(
-        CLI_CONFIG,
-        "Waiting for node task to complete, this may take some time... ",
-        newline=False,
-    )
     retcode, retmsg = pvc.lib.storage.ceph_osd_replace(
-        CLI_CONFIG, osdid, new_device, old_device, weight, ext_db_ratio, ext_db_size
+        CLI_CONFIG,
+        osdid,
+        new_device,
+        old_device,
+        weight,
+        ext_db_ratio,
+        ext_db_size,
+        wait_flag,
     )
-    echo(CLI_CONFIG, "done.")
+    if retcode and wait_flag:
+        retmsg = wait_for_celery_task(CLI_CONFIG, retmsg)

     finish(retcode, retmsg)
@@ -3552,8 +3585,16 @@ def cli_storage_osd_replace(
 @connection_req
 @click.argument("osdid")
 @click.argument("device")
-@confirm_opt("Refresh OSD {osdid} on device {device}")
-def cli_storage_osd_refresh(osdid, device):
+@click.option(
+    "--wait/--no-wait",
+    "wait_flag",
+    is_flag=True,
+    default=True,
+    show_default=True,
+    help="Wait or don't wait for task to complete, showing progress",
+)
+@confirm_opt("Refresh OSD {osdid} (and peer split OSDs) on device {device}")
+def cli_storage_osd_refresh(osdid, device, wait_flag):
     """
     Refresh (reimport) the block DEVICE of an existing OSD with ID OSDID. Use this command to reimport a working OSD into a rebuilt/replaced node.
@@ -3566,13 +3607,12 @@ def cli_storage_osd_refresh(osdid, device):
     NOTE: This command may take a long time to complete. Observe the node logs of the hosting OSD node for detailed status.
     """
-    echo(
-        CLI_CONFIG,
-        "Waiting for node task to complete, this may take some time... ",
-        newline=False,
+    retcode, retmsg = pvc.lib.storage.ceph_osd_refresh(
+        CLI_CONFIG, osdid, device, wait_flag
     )
-    retcode, retmsg = pvc.lib.storage.ceph_osd_refresh(CLI_CONFIG, osdid, device)
-    echo(CLI_CONFIG, "done.")
+    if retcode and wait_flag:
+        retmsg = wait_for_celery_task(CLI_CONFIG, retmsg)

     finish(retcode, retmsg)
@@ -3590,8 +3630,16 @@ def cli_storage_osd_refresh(osdid, device):
     default=False,
     help="Force removal even if steps fail",
 )
+@click.option(
+    "--wait/--no-wait",
+    "wait_flag",
+    is_flag=True,
+    default=True,
+    show_default=True,
+    help="Wait or don't wait for task to complete, showing progress",
+)
 @confirm_opt("Remove and destroy data on OSD {osdid}")
-def cli_storage_osd_remove(osdid, force_flag):
+def cli_storage_osd_remove(osdid, force_flag, wait_flag):
     """
     Remove a Ceph OSD with ID OSDID.
@@ -3602,13 +3650,12 @@ def cli_storage_osd_remove(osdid, force_flag):
     NOTE: This command may take a long time to complete. Observe the node logs of the hosting OSD node for detailed status.
     """
-    echo(
-        CLI_CONFIG,
-        "Waiting for node task to complete, this may take some time... ",
-        newline=False,
+    retcode, retmsg = pvc.lib.storage.ceph_osd_remove(
+        CLI_CONFIG, osdid, force_flag, wait_flag
     )
-    retcode, retmsg = pvc.lib.storage.ceph_osd_remove(CLI_CONFIG, osdid, force_flag)
-    echo(CLI_CONFIG, "done.")
+    if retcode and wait_flag:
+        retmsg = wait_for_celery_task(CLI_CONFIG, retmsg)

     finish(retcode, retmsg)

View File

@@ -20,7 +20,7 @@
 ###############################################################################

 from click import echo as click_echo
-from click import progressbar, confirm
+from click import confirm
 from datetime import datetime
 from distutils.util import strtobool
 from getpass import getuser
@@ -32,7 +32,6 @@ from socket import gethostname
 from subprocess import run, PIPE
 from sys import argv
 from syslog import syslog, openlog, closelog, LOG_AUTH
-from time import sleep
 from yaml import load as yload
 from yaml import BaseLoader, SafeLoader
@@ -191,123 +190,6 @@ def update_store(store_path, store_data):
         jdump(store_data, fh, sort_keys=True, indent=4)


-def wait_for_flush_locks(CLI_CONFIG, task_detail):
-    """
-    Wait for a flush_locks task to complete
-    """
-
-    task_id = task_detail["task_id"]
-    run_on = task_detail["run_on"]
-
-    echo(CLI_CONFIG, f"Task ID: {task_id} assigned to node {run_on}")
-    echo(CLI_CONFIG, "")
-
-    # Wait for the task to start
-    echo(CLI_CONFIG, "Waiting for task to start...", newline=False)
-    while True:
-        sleep(0.25)
-        task_status = pvc.lib.common.task_status(
-            CLI_CONFIG, task_id=task_id, is_watching=True
-        )
-        if task_status.get("state") != "PENDING":
-            break
-        echo(CLI_CONFIG, ".", newline=False)
-    echo(CLI_CONFIG, " done.")
-    echo(CLI_CONFIG, "")
-
-    # Start following the task state, updating progress as we go
-    total_task = task_status.get("total")
-    with progressbar(length=total_task, show_eta=False) as bar:
-        last_task = 0
-        maxlen = 0
-        while True:
-            sleep(0.25)
-            if task_status.get("state") != "RUNNING":
-                break
-            if task_status.get("current") > last_task:
-                current_task = int(task_status.get("current"))
-                bar.update(current_task - last_task)
-                last_task = current_task
-                # The extensive spaces at the end cause this to overwrite longer previous messages
-                curlen = len(str(task_status.get("status")))
-                if curlen > maxlen:
-                    maxlen = curlen
-                lendiff = maxlen - curlen
-                overwrite_whitespace = " " * lendiff
-                echo(
-                    CLI_CONFIG,
-                    " " + task_status.get("status") + overwrite_whitespace,
-                    newline=False,
-                )
-            task_status = pvc.lib.common.task_status(
-                CLI_CONFIG, task_id=task_id, is_watching=True
-            )
-        if task_status.get("state") == "SUCCESS":
-            bar.update(total_task - last_task)
-
-    echo(CLI_CONFIG, "")
-    retdata = task_status.get("state") + ": " + task_status.get("status")
-
-    return retdata
-
-
-def wait_for_provisioner(CLI_CONFIG, task_id):
-    """
-    Wait for a provisioner task to complete
-    """
-
-    echo(CLI_CONFIG, f"Task ID: {task_id}")
-    echo(CLI_CONFIG, "")
-
-    # Wait for the task to start
-    echo(CLI_CONFIG, "Waiting for task to start...", newline=False)
-    while True:
-        sleep(1)
-        task_status = pvc.lib.provisioner.task_status(
-            CLI_CONFIG, task_id, is_watching=True
-        )
-        if task_status.get("state") != "PENDING":
-            break
-        echo(CLI_CONFIG, ".", newline=False)
-    echo(CLI_CONFIG, " done.")
-    echo(CLI_CONFIG, "")
-
-    # Start following the task state, updating progress as we go
-    total_task = task_status.get("total")
-    with progressbar(length=total_task, show_eta=False) as bar:
-        last_task = 0
-        maxlen = 0
-        while True:
-            sleep(1)
-            if task_status.get("state") != "RUNNING":
-                break
-            if task_status.get("current") > last_task:
-                current_task = int(task_status.get("current"))
-                bar.update(current_task - last_task)
-                last_task = current_task
-                # The extensive spaces at the end cause this to overwrite longer previous messages
-                curlen = len(str(task_status.get("status")))
-                if curlen > maxlen:
-                    maxlen = curlen
-                lendiff = maxlen - curlen
-                overwrite_whitespace = " " * lendiff
-                echo(
-                    CLI_CONFIG,
-                    " " + task_status.get("status") + overwrite_whitespace,
-                    newline=False,
-                )
-            task_status = pvc.lib.provisioner.task_status(
-                CLI_CONFIG, task_id, is_watching=True
-            )
-        if task_status.get("state") == "SUCCESS":
-            bar.update(total_task - last_task)
-
-    echo(CLI_CONFIG, "")
-    retdata = task_status.get("state") + ": " + task_status.get("status")
-
-    return retdata
-
-
 def get_autobackup_config(CLI_CONFIG, cfgfile):
     try:
         config = dict()

View File

@@ -19,6 +19,7 @@
 #
 ###############################################################################

+from click import progressbar
 from time import sleep, time

 from pvc.cli.helpers import echo
@@ -62,3 +63,120 @@ def cli_node_waiter(config, node, state_field, state_value):
     t_end = time()

     echo(config, f" done. [{int(t_end - t_start)}s]")
+
+
+def wait_for_celery_task(CLI_CONFIG, task_detail):
+    """
+    Wait for a Celery task to complete
+    """
+
+    task_id = task_detail["task_id"]
+    run_on = task_detail["run_on"]
+
+    echo(CLI_CONFIG, f"Task ID: {task_id} assigned to node {run_on}")
+    echo(CLI_CONFIG, "")
+
+    # Wait for the task to start
+    echo(CLI_CONFIG, "Waiting for task to start...", newline=False)
+    while True:
+        sleep(0.25)
+        task_status = pvc.lib.common.task_status(
+            CLI_CONFIG, task_id=task_id, is_watching=True
+        )
+        if task_status.get("state") != "PENDING":
+            break
+        echo(CLI_CONFIG, ".", newline=False)
+    echo(CLI_CONFIG, " done.")
+    echo(CLI_CONFIG, "")
+
+    # Start following the task state, updating progress as we go
+    total_task = task_status.get("total")
+    with progressbar(length=total_task, show_eta=False) as bar:
+        last_task = 0
+        maxlen = 0
+        while True:
+            sleep(0.25)
+            if task_status.get("state") != "RUNNING":
+                break
+            if task_status.get("current") > last_task:
+                current_task = int(task_status.get("current"))
+                bar.update(current_task - last_task)
+                last_task = current_task
+                # The extensive spaces at the end cause this to overwrite longer previous messages
+                curlen = len(str(task_status.get("status")))
+                if curlen > maxlen:
+                    maxlen = curlen
+                lendiff = maxlen - curlen
+                overwrite_whitespace = " " * lendiff
+                echo(
+                    CLI_CONFIG,
+                    " " + task_status.get("status") + overwrite_whitespace,
+                    newline=False,
+                )
+            task_status = pvc.lib.common.task_status(
+                CLI_CONFIG, task_id=task_id, is_watching=True
+            )
+        if task_status.get("state") == "SUCCESS":
+            bar.update(total_task - last_task)
+
+    echo(CLI_CONFIG, "")
+    retdata = task_status.get("state") + ": " + task_status.get("status")
+
+    return retdata
+
+
+def wait_for_provisioner(CLI_CONFIG, task_id):
+    """
+    Wait for a provisioner task to complete
+    """
+
+    echo(CLI_CONFIG, f"Task ID: {task_id}")
+    echo(CLI_CONFIG, "")
+
+    # Wait for the task to start
+    echo(CLI_CONFIG, "Waiting for task to start...", newline=False)
+    while True:
+        sleep(1)
+        task_status = pvc.lib.provisioner.task_status(
+            CLI_CONFIG, task_id, is_watching=True
+        )
+        if task_status.get("state") != "PENDING":
+            break
+        echo(CLI_CONFIG, ".", newline=False)
+    echo(CLI_CONFIG, " done.")
+    echo(CLI_CONFIG, "")
+
+    # Start following the task state, updating progress as we go
+    total_task = task_status.get("total")
+    with progressbar(length=total_task, show_eta=False) as bar:
+        last_task = 0
+        maxlen = 0
+        while True:
+            sleep(1)
+            if task_status.get("state") != "RUNNING":
+                break
+            if task_status.get("current") > last_task:
+                current_task = int(task_status.get("current"))
+                bar.update(current_task - last_task)
+                last_task = current_task
+                # The extensive spaces at the end cause this to overwrite longer previous messages
+                curlen = len(str(task_status.get("status")))
+                if curlen > maxlen:
+                    maxlen = curlen
+                lendiff = maxlen - curlen
+                overwrite_whitespace = " " * lendiff
+                echo(
+                    CLI_CONFIG,
+                    " " + task_status.get("status") + overwrite_whitespace,
+                    newline=False,
+                )
+            task_status = pvc.lib.provisioner.task_status(
+                CLI_CONFIG, task_id, is_watching=True
+            )
+        if task_status.get("state") == "SUCCESS":
+            bar.update(total_task - last_task)
+
+    echo(CLI_CONFIG, "")
+    retdata = task_status.get("state") + ": " + task_status.get("status")
+
+    return retdata
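Both waiters share the same three-phase protocol: poll until the task leaves PENDING, stream RUNNING progress into a click progressbar, then drain the bar on SUCCESS. Below is a self-contained toy version of that loop, with a generator standing in for the task-status API; every name in it is illustrative:

```python
# Toy demo of the PENDING -> RUNNING -> SUCCESS polling pattern used above.
# fake_status stands in for the real task-status API call.
from time import sleep

from click import progressbar


def fake_status():
    """Yield task states the way a Celery result backend might report them."""
    yield {"state": "PENDING", "current": 0, "total": 4, "status": "Queued"}
    for step in range(1, 5):
        yield {"state": "RUNNING", "current": step, "total": 4, "status": f"Step {step}"}
    yield {"state": "SUCCESS", "current": 4, "total": 4, "status": "Complete"}


states = fake_status()
status = next(states)
while status["state"] == "PENDING":
    sleep(0.1)
    status = next(states)

with progressbar(length=status["total"], show_eta=False) as bar:
    last = 0
    while status["state"] == "RUNNING":
        if status["current"] > last:
            bar.update(status["current"] - last)
            last = status["current"]
        status = next(states)
    if status["state"] == "SUCCESS":
        bar.update(status["total"] - last)

print(f'{status["state"]}: {status["status"]}')
```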

View File

@@ -164,7 +164,7 @@ def format_raw_output(config, status_data):
 #
 # OSD DB VG functions
 #
-def ceph_osd_db_vg_add(config, node, device):
+def ceph_osd_db_vg_add(config, node, device, wait_flag):
     """
     Add new Ceph OSD database volume group
@@ -175,12 +175,21 @@ def ceph_osd_db_vg_add(config, node, device):
     params = {"node": node, "device": device}
     response = call_api(config, "post", "/storage/ceph/osddb", params=params)

-    if response.status_code == 200:
-        retstatus = True
+    if response.status_code == 202:
+        retvalue = True
+        retjson = response.json()
+        if not wait_flag:
+            retdata = (
+                f"Task ID: {retjson['task_id']} assigned to node {retjson['run_on']}"
+            )
+        else:
+            # Just return the task JSON without formatting
+            retdata = response.json()
     else:
-        retstatus = False
+        retvalue = False
+        retdata = response.json().get("message", "")

-    return retstatus, response.json().get("message", "")
+    return retvalue, retdata


 #
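Every storage function in this file now follows the same contract: HTTP 202 Accepted means a Celery task was queued, and the response body carries its task_id and run_on fields. A hedged sketch of a caller consuming that contract; the import paths are assumed from this diff's module layout, and config stands for a loaded CLI configuration:

```python
# Hypothetical consumer of the (retvalue, retdata) contract shown above.
# Import paths are assumed from the modules visible in this diff.
from pvc.lib.storage import ceph_osd_db_vg_add
from pvc.cli.waiters import wait_for_celery_task


def create_db_vg(config, node, device, wait_flag=True):
    retvalue, retdata = ceph_osd_db_vg_add(config, node, device, wait_flag)
    if not retvalue:
        # Non-202 response: retdata is the API's error message string
        return retdata
    if wait_flag:
        # retdata is the raw task JSON: {"task_id": ..., "run_on": ...}
        return wait_for_celery_task(config, retdata)
    # --no-wait: retdata is already the formatted "Task ID: ..." string
    return retdata
```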
@@ -231,7 +240,9 @@ def ceph_osd_list(config, limit):
     return False, response.json().get("message", "")


-def ceph_osd_add(config, node, device, weight, ext_db_ratio, ext_db_size, osd_count):
+def ceph_osd_add(
+    config, node, device, weight, ext_db_ratio, ext_db_size, osd_count, wait_flag
+):
     """
     Add new Ceph OSD
@@ -254,16 +265,25 @@ def ceph_osd_add(config, node, device, weight, ext_db_ratio, ext_db_size, osd_count):
     response = call_api(config, "post", "/storage/ceph/osd", params=params)

-    if response.status_code == 200:
-        retstatus = True
+    if response.status_code == 202:
+        retvalue = True
+        retjson = response.json()
+        if not wait_flag:
+            retdata = (
+                f"Task ID: {retjson['task_id']} assigned to node {retjson['run_on']}"
+            )
+        else:
+            # Just return the task JSON without formatting
+            retdata = response.json()
     else:
-        retstatus = False
+        retvalue = False
+        retdata = response.json().get("message", "")

-    return retstatus, response.json().get("message", "")
+    return retvalue, retdata


 def ceph_osd_replace(
-    config, osdid, new_device, old_device, weight, ext_db_ratio, ext_db_size
+    config, osdid, new_device, old_device, weight, ext_db_ratio, ext_db_size, wait_flag
 ):
     """
     Replace an existing Ceph OSD with a new device
@@ -288,15 +308,24 @@ def ceph_osd_replace(
     response = call_api(config, "post", f"/storage/ceph/osd/{osdid}", params=params)

-    if response.status_code == 200:
-        retstatus = True
+    if response.status_code == 202:
+        retvalue = True
+        retjson = response.json()
+        if not wait_flag:
+            retdata = (
+                f"Task ID: {retjson['task_id']} assigned to node {retjson['run_on']}"
+            )
+        else:
+            # Just return the task JSON without formatting
+            retdata = response.json()
     else:
-        retstatus = False
+        retvalue = False
+        retdata = response.json().get("message", "")

-    return retstatus, response.json().get("message", "")
+    return retvalue, retdata


-def ceph_osd_refresh(config, osdid, device):
+def ceph_osd_refresh(config, osdid, device, wait_flag):
     """
     Refresh (reimport) an existing Ceph OSD with device {device}
@@ -309,15 +338,24 @@ def ceph_osd_refresh(config, osdid, device):
     }
     response = call_api(config, "put", f"/storage/ceph/osd/{osdid}", params=params)

-    if response.status_code == 200:
-        retstatus = True
+    if response.status_code == 202:
+        retvalue = True
+        retjson = response.json()
+        if not wait_flag:
+            retdata = (
+                f"Task ID: {retjson['task_id']} assigned to node {retjson['run_on']}"
+            )
+        else:
+            # Just return the task JSON without formatting
+            retdata = response.json()
     else:
-        retstatus = False
+        retvalue = False
+        retdata = response.json().get("message", "")

-    return retstatus, response.json().get("message", "")
+    return retvalue, retdata


-def ceph_osd_remove(config, osdid, force_flag):
+def ceph_osd_remove(config, osdid, force_flag, wait_flag):
     """
     Remove Ceph OSD
@@ -330,12 +368,21 @@ def ceph_osd_remove(config, osdid, force_flag):
         config, "delete", "/storage/ceph/osd/{osdid}".format(osdid=osdid), params=params
     )

-    if response.status_code == 200:
-        retstatus = True
+    if response.status_code == 202:
+        retvalue = True
+        retjson = response.json()
+        if not wait_flag:
+            retdata = (
+                f"Task ID: {retjson['task_id']} assigned to node {retjson['run_on']}"
+            )
+        else:
+            # Just return the task JSON without formatting
+            retdata = response.json()
     else:
-        retstatus = False
+        retvalue = False
+        retdata = response.json().get("message", "")

-    return retstatus, response.json().get("message", "")
+    return retvalue, retdata


 def ceph_osd_state(config, osdid, state):

View File

@@ -415,7 +415,7 @@ def vm_node(config, vm, target_node, action, force=False, wait=False, force_live
     return retstatus, response.json().get("message", "")


-def vm_locks(config, vm, wait_flag=False):
+def vm_locks(config, vm, wait_flag):
     """
     Flush RBD locks of (stopped) VM