From 83c4c6633d3792b2cfacfd3beed6a13d1dad27ee Mon Sep 17 00:00:00 2001
From: "Joshua M. Boniface"
Date: Fri, 10 Nov 2023 01:28:41 -0500
Subject: [PATCH] Readd RBD lock detection and clearing on startup

This is still needed due to the nature of the locks and the need to free
them on startup, and to preserve lock=fail behaviour on VM startup.

Also fixes the fencing lock flush to directly use the client library
outside of Celery. I don't like this hack but it seems prudent until we
move fencing to the workers as well.
---
 daemon-common/celery.py                    |  6 +++
 node-daemon/pvcnoded/objects/VMInstance.py | 48 ++++++++++++++++++++--
 node-daemon/pvcnoded/util/fencing.py       |  4 +-
 3 files changed, 52 insertions(+), 6 deletions(-)

diff --git a/daemon-common/celery.py b/daemon-common/celery.py
index 08830a86..4888bff0 100644
--- a/daemon-common/celery.py
+++ b/daemon-common/celery.py
@@ -33,6 +33,8 @@ class TaskFailure(Exception):
 def start(celery, msg, current=0, total=1):
     logger = getLogger(__name__)
     logger.info(f"Starting {current}/{total}: {msg}")
+    if celery is None:
+        return
     celery.update_state(
         state="RUNNING", meta={"current": current, "total": total, "status": msg}
     )
@@ -64,6 +66,8 @@ def log_err(celery, msg):
 def update(celery, msg, current=1, total=2):
     logger = getLogger(__name__)
     logger.info(f"Task update {current}/{total}: {msg}")
+    if celery is None:
+        return
     celery.update_state(
         state="RUNNING", meta={"current": current, "total": total, "status": msg}
     )
@@ -73,6 +77,8 @@ def update(celery, msg, current=1, total=2):
 def finish(celery, msg, current=2, total=2):
     logger = getLogger(__name__)
     logger.info(f"Task update {current}/{total}: Finishing up")
+    if celery is None:
+        return
     celery.update_state(
         state="RUNNING",
         meta={"current": current, "total": total, "status": "Finishing up"},
diff --git a/node-daemon/pvcnoded/objects/VMInstance.py b/node-daemon/pvcnoded/objects/VMInstance.py
index 218ac2b6..e3df75bf 100644
--- a/node-daemon/pvcnoded/objects/VMInstance.py
+++ b/node-daemon/pvcnoded/objects/VMInstance.py
@@ -24,8 +24,8 @@ import time
 import libvirt
 
 from threading import Thread
-
 from xml.etree import ElementTree
+from json import loads as jloads
 
 import daemon_lib.common as common
 
@@ -283,9 +283,49 @@ class VMInstance(object):
         self.logger.out(
             "Flushing RBD locks", state="i", prefix="Domain {}".format(self.domuuid)
         )
-        VMInstance.flush_locks(
-            self.zkhandler, self.logger, self.domuuid, self.this_node
-        )
+
+        rbd_list = self.zkhandler.read(
+            ("domain.storage.volumes", self.domuuid)
+        ).split(",")
+
+        locks = list()
+        for rbd in rbd_list:
+            retcode, stdout, stderr = common.run_os_command(
+                f"rbd lock list --format json {rbd}"
+            )
+            if retcode == 0:
+                _locks = jloads(stdout)
+                for lock in _locks:
+                    lock["rbd"] = rbd
+                    locks.append(lock)
+
+        for lock in locks:
+            lockid = lock["id"]
+            locker = lock["locker"]
+            owner = lock["address"].split(":")[0]
+            rbd = lock["rbd"]
+
+            if owner == self.this_node.storage_ipaddr:
+                retcode, stdout, stderr = common.run_os_command(
+                    f'rbd lock remove {rbd} "{lockid}" "{locker}"'
+                )
+            else:
+                self.logger.out(
+                    f"RBD lock does not belong to this host (owner {owner}) so freeing this lock is dangerous; aborting VM start",
+                    state="e",
+                    prefix="Domain {}".format(self.domuuid),
+                )
+                self.zkhandler.write(
+                    [
+                        (("domain.state", self.domuuid), "fail"),
+                        (
+                            ("domain.failed_reason", self.domuuid),
+                            f"Could not safely free RBD lock {lockid} ({owner}) on volume {rbd}; stop VM and flush locks manually",
+                        ),
+                    ]
+                )
+                break
+
         if self.zkhandler.read(("domain.state", self.domuuid)) == "fail":
             lv_conn.close()
             self.dom = None
diff --git a/node-daemon/pvcnoded/util/fencing.py b/node-daemon/pvcnoded/util/fencing.py
index 33aeaddd..fc5d2b93 100644
--- a/node-daemon/pvcnoded/util/fencing.py
+++ b/node-daemon/pvcnoded/util/fencing.py
@@ -23,7 +23,7 @@ import time
 
 import daemon_lib.common as common
 
-from pvcnoded.objects.VMInstance import VMInstance
+from daemon_lib.vm import vm_worker_flush_locks
 
 
 #
@@ -121,7 +121,7 @@ def migrateFromFencedNode(zkhandler, node_name, config, logger):
 
     # Migrate a VM after a flush
     def fence_migrate_vm(dom_uuid):
-        VMInstance.flush_locks(zkhandler, logger, dom_uuid)
+        vm_worker_flush_locks(zkhandler, None, dom_uuid, force_unlock=True)
 
         target_node = common.findTargetNode(zkhandler, dom_uuid)
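
A minimal sketch of the calling pattern that motivates the new "if celery is None: return" guards: vm_worker_flush_locks() is now called directly from the fencing path with celery=None, outside of any Celery task, so the shared progress helpers must tolerate a missing task handle. The function names below mirror the patch, but the simplified bodies are placeholders for illustration only, not the real implementations in daemon-common/celery.py or daemon_lib/vm.py.

# Sketch only: simplified stand-ins for the start() progress helper and the
# daemon_lib.vm worker function; the real bodies do much more.
from logging import getLogger


def start(celery, msg, current=0, total=1):
    # Progress helper shared by worker functions. When called outside Celery
    # (celery is None), it only logs and skips the task-state update.
    logger = getLogger(__name__)
    logger.info(f"Starting {current}/{total}: {msg}")
    if celery is None:
        return
    celery.update_state(
        state="RUNNING", meta={"current": current, "total": total, "status": msg}
    )


def vm_worker_flush_locks(zkhandler, celery, dom_uuid, force_unlock=False):
    # Placeholder body: the real function lists the VM's RBD volumes and
    # removes any stale locks ("rbd lock list" / "rbd lock remove").
    start(celery, f"Flushing RBD locks for VM {dom_uuid}")


# From a Celery task, "celery" is the bound task instance; from the fencing
# path it is simply None, which the guard above turns into a safe no-op:
#     vm_worker_flush_locks(zkhandler, None, dom_uuid, force_unlock=True)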