Re-add RBD lock detection and clearing on startup

This is still needed due to the nature of the locks and how they must be
freed on startup, and to preserve the lock=fail behaviour on VM startup.

Also fixes the fencing lock flush to directly use the client library
outside of Celery. I don't like this hack but it seems prudent until we
move fencing to the workers as well.
Joshua Boniface 2023-11-10 01:28:41 -05:00
parent 2a9bc632fa
commit 83c4c6633d
3 changed files with 52 additions and 6 deletions


@@ -33,6 +33,8 @@ class TaskFailure(Exception):
 def start(celery, msg, current=0, total=1):
     logger = getLogger(__name__)
     logger.info(f"Starting {current}/{total}: {msg}")
+    if celery is None:
+        return
     celery.update_state(
         state="RUNNING", meta={"current": current, "total": total, "status": msg}
     )
@@ -64,6 +66,8 @@ def log_err(celery, msg):
 def update(celery, msg, current=1, total=2):
     logger = getLogger(__name__)
     logger.info(f"Task update {current}/{total}: {msg}")
+    if celery is None:
+        return
     celery.update_state(
         state="RUNNING", meta={"current": current, "total": total, "status": msg}
     )
@@ -73,6 +77,8 @@ def update(celery, msg, current=1, total=2):
 def finish(celery, msg, current=2, total=2):
     logger = getLogger(__name__)
     logger.info(f"Task update {current}/{total}: Finishing up")
+    if celery is None:
+        return
     celery.update_state(
         state="RUNNING",
         meta={"current": current, "total": total, "status": "Finishing up"},

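The None guards above make these progress helpers callable without any Celery task context at all. A minimal sketch of that calling pattern, assuming the helpers are importable from daemon_lib.celery; the wrapper function and its body are illustrative only:

# Illustrative only: drive the helpers with celery=None, as the fencing code in
# the last file of this commit does. Log lines are still emitted; the
# update_state() calls are skipped.
from daemon_lib.celery import start, update, finish

def flush_locks_standalone(dom_uuid):
    start(None, f"Flushing locks of VM {dom_uuid}", current=0, total=2)
    # ... the actual lock-flushing work would happen here ...
    update(None, "Freeing RBD locks", current=1, total=2)
    finish(None, f"Successfully flushed locks of VM {dom_uuid}", current=2, total=2)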

@@ -24,8 +24,8 @@ import time
 import libvirt
 from threading import Thread
 from xml.etree import ElementTree
+from json import loads as jloads
 import daemon_lib.common as common
@@ -283,9 +283,49 @@ class VMInstance(object):
             self.logger.out(
                 "Flushing RBD locks", state="i", prefix="Domain {}".format(self.domuuid)
             )
-            VMInstance.flush_locks(
-                self.zkhandler, self.logger, self.domuuid, self.this_node
-            )
+            rbd_list = self.zkhandler.read(
+                ("domain.storage.volumes", self.domuuid)
+            ).split(",")
+
+            locks = list()
+            for rbd in rbd_list:
+                retcode, stdout, stderr = common.run_os_command(
+                    f"rbd lock list --format json {rbd}"
+                )
+                if retcode == 0:
+                    _locks = jloads(stdout)
+                    for lock in _locks:
+                        lock["rbd"] = rbd
+                        locks.append(lock)
+
+            for lock in locks:
+                lockid = lock["id"]
+                locker = lock["locker"]
+                owner = lock["address"].split(":")[0]
+                rbd = lock["rbd"]
+                if owner == self.this_node.storage_ipaddr:
+                    retcode, stdout, stderr = common.run_os_command(
+                        f'rbd lock remove {rbd} "{lockid}" "{locker}"'
+                    )
+                else:
+                    self.logger.out(
"RBD lock does not belong to this host (owner {owner}) so freeing this long is dangerous; aborting VM start",
+                        state="e",
+                        prefix="Domain {}".format(self.domuuid),
+                    )
+                    self.zkhandler.write(
+                        [
+                            (("domain.state", self.domuuid), "fail"),
+                            (
+                                ("domain.failed_reason", self.domuuid),
+                                f"Could not safely free RBD lock {lockid} ({owner}) on volume {rbd}; stop VM and flush locks manually",
+                            ),
+                        ]
+                    )
+                    break
+
             if self.zkhandler.read(("domain.state", self.domuuid)) == "fail":
                 lv_conn.close()
                 self.dom = None

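For reference, the ownership check above keys off three fields of each entry returned by rbd lock list --format json: id, locker, and address, comparing the IP portion of address against this node's storage IP. A standalone sketch of that decision, using a purely illustrative JSON payload (not verbatim Ceph output) and a hypothetical helper name:

# Illustrative sketch of the lock-ownership check; only the three fields the
# code above reads are meaningful here, and the sample values are made up.
from json import loads as jloads

sample_stdout = '[{"id": "auto 140092", "locker": "client.4123", "address": "10.0.0.1:0/264792"}]'

def foreign_locks(stdout, storage_ipaddr):
    # A lock is only safe to free when its address matches this node's storage IP
    return [
        lock for lock in jloads(stdout)
        if lock["address"].split(":")[0] != storage_ipaddr
    ]

print(foreign_locks(sample_stdout, "10.0.0.1"))  # [] -> every lock is freeable, VM can start
print(foreign_locks(sample_stdout, "10.0.0.2"))  # one foreign lock -> domain.state is set to "fail"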

@@ -23,7 +23,7 @@ import time
 import daemon_lib.common as common
 
-from pvcnoded.objects.VMInstance import VMInstance
+from daemon_lib.vm import vm_worker_flush_locks
 
 #
@@ -121,7 +121,7 @@ def migrateFromFencedNode(zkhandler, node_name, config, logger):
     # Migrate a VM after a flush
     def fence_migrate_vm(dom_uuid):
-        VMInstance.flush_locks(zkhandler, logger, dom_uuid)
+        vm_worker_flush_locks(zkhandler, None, dom_uuid, force_unlock=True)
 
         target_node = common.findTargetNode(zkhandler, dom_uuid)
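
Because vm_worker_flush_locks takes the Celery task object as a plain argument, the fencing path can now call it synchronously with None, relying on the guards added in the first file of this commit. A rough sketch of that flow; the wrapper name and signature are hypothetical, and only the vm_worker_flush_locks call mirrors the diff:

# Illustrative: direct, non-Celery lock flush during fencing, followed by
# target selection as in the surrounding context line.
from daemon_lib.vm import vm_worker_flush_locks
import daemon_lib.common as common

def fence_flush_and_find_target(zkhandler, dom_uuid):
    # Passing None for the Celery task makes start/update/finish skip update_state()
    vm_worker_flush_locks(zkhandler, None, dom_uuid, force_unlock=True)
    return common.findTargetNode(zkhandler, dom_uuid)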