Re-add RBD lock detection and clearing on startup

This is still needed due to the nature of the locks and how they must be
freed on startup, and to preserve the lock=fail behaviour on VM startup.

Also fixes the fencing lock flush to directly use the client library
outside of Celery. I don't like this hack but it seems prudent until we
move fencing to the workers as well.
Joshua Boniface 2023-11-10 01:28:41 -05:00
parent 2a9bc632fa
commit 83c4c6633d
3 changed files with 52 additions and 6 deletions


@@ -33,6 +33,8 @@ class TaskFailure(Exception):
 def start(celery, msg, current=0, total=1):
     logger = getLogger(__name__)
     logger.info(f"Starting {current}/{total}: {msg}")
+    if celery is None:
+        return
     celery.update_state(
         state="RUNNING", meta={"current": current, "total": total, "status": msg}
     )
@@ -64,6 +66,8 @@ def log_err(celery, msg):
 def update(celery, msg, current=1, total=2):
     logger = getLogger(__name__)
     logger.info(f"Task update {current}/{total}: {msg}")
+    if celery is None:
+        return
     celery.update_state(
         state="RUNNING", meta={"current": current, "total": total, "status": msg}
     )
@@ -73,6 +77,8 @@ def update(celery, msg, current=1, total=2):
 def finish(celery, msg, current=2, total=2):
     logger = getLogger(__name__)
     logger.info(f"Task update {current}/{total}: Finishing up")
+    if celery is None:
+        return
     celery.update_state(
         state="RUNNING",
         meta={"current": current, "total": total, "status": "Finishing up"},

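The None guards above make these progress helpers callable without any Celery task context at all. A minimal sketch of that calling pattern, assuming the helpers are importable from daemon_lib.celery; the wrapper function and its body are illustrative only:

# Illustrative only: drive the helpers with celery=None, as the fencing code in
# the last file of this commit does. Log lines are still emitted; the
# update_state() calls are skipped.
from daemon_lib.celery import start, update, finish

def flush_locks_standalone(dom_uuid):
    start(None, f"Flushing locks of VM {dom_uuid}", current=0, total=2)
    # ... the actual lock-flushing work would happen here ...
    update(None, "Freeing RBD locks", current=1, total=2)
    finish(None, f"Successfully flushed locks of VM {dom_uuid}", current=2, total=2)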

@@ -24,8 +24,8 @@ import time
 import libvirt
 from threading import Thread
 from xml.etree import ElementTree
+from json import loads as jloads
 import daemon_lib.common as common
@@ -283,9 +283,49 @@ class VMInstance(object):
             self.logger.out(
                 "Flushing RBD locks", state="i", prefix="Domain {}".format(self.domuuid)
             )
-            VMInstance.flush_locks(
-                self.zkhandler, self.logger, self.domuuid, self.this_node
-            )
+            rbd_list = self.zkhandler.read(
+                ("domain.storage.volumes", self.domuuid)
+            ).split(",")
+
+            locks = list()
+            for rbd in rbd_list:
+                retcode, stdout, stderr = common.run_os_command(
+                    f"rbd lock list --format json {rbd}"
+                )
+                if retcode == 0:
+                    _locks = jloads(stdout)
+                    for lock in _locks:
+                        lock["rbd"] = rbd
+                        locks.append(lock)
+
+            for lock in locks:
+                lockid = lock["id"]
+                locker = lock["locker"]
+                owner = lock["address"].split(":")[0]
+                rbd = lock["rbd"]
+                if owner == self.this_node.storage_ipaddr:
+                    retcode, stdout, stderr = common.run_os_command(
+                        f'rbd lock remove {rbd} "{lockid}" "{locker}"'
+                    )
+                else:
+                    self.logger.out(
"RBD lock does not belong to this host (owner {owner}) so freeing this long is dangerous; aborting VM start",
+                        state="e",
+                        prefix="Domain {}".format(self.domuuid),
+                    )
+                    self.zkhandler.write(
+                        [
+                            (("domain.state", self.domuuid), "fail"),
+                            (
+                                ("domain.failed_reason", self.domuuid),
+                                f"Could not safely free RBD lock {lockid} ({owner}) on volume {rbd}; stop VM and flush locks manually",
+                            ),
+                        ]
+                    )
+                    break
+
             if self.zkhandler.read(("domain.state", self.domuuid)) == "fail":
                 lv_conn.close()
                 self.dom = None

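For reference, the ownership check above keys off three fields of each entry returned by rbd lock list --format json: id, locker, and address, comparing the IP portion of address against this node's storage IP. A standalone sketch of that decision, using a purely illustrative JSON payload (not verbatim Ceph output) and a hypothetical helper name:

# Illustrative sketch of the lock-ownership check; only the three fields the
# code above reads are meaningful here, and the sample values are made up.
from json import loads as jloads

sample_stdout = '[{"id": "auto 140092", "locker": "client.4123", "address": "10.0.0.1:0/264792"}]'

def foreign_locks(stdout, storage_ipaddr):
    # A lock is only safe to free when its address matches this node's storage IP
    return [
        lock for lock in jloads(stdout)
        if lock["address"].split(":")[0] != storage_ipaddr
    ]

print(foreign_locks(sample_stdout, "10.0.0.1"))  # [] -> every lock is freeable, VM can start
print(foreign_locks(sample_stdout, "10.0.0.2"))  # one foreign lock -> domain.state is set to "fail"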

@@ -23,7 +23,7 @@ import time
 import daemon_lib.common as common
 
-from pvcnoded.objects.VMInstance import VMInstance
+from daemon_lib.vm import vm_worker_flush_locks
 
 #
@@ -121,7 +121,7 @@ def migrateFromFencedNode(zkhandler, node_name, config, logger):
     # Migrate a VM after a flush
     def fence_migrate_vm(dom_uuid):
-        VMInstance.flush_locks(zkhandler, logger, dom_uuid)
+        vm_worker_flush_locks(zkhandler, None, dom_uuid, force_unlock=True)
 
         target_node = common.findTargetNode(zkhandler, dom_uuid)
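
Because vm_worker_flush_locks takes the Celery task object as a plain argument, the fencing path can now call it synchronously with None, relying on the guards added in the first file of this commit. A rough sketch of that flow; the wrapper name and signature are hypothetical, and only the vm_worker_flush_locks call mirrors the diff:

# Illustrative: direct, non-Celery lock flush during fencing, followed by
# target selection as in the surrounding context line.
from daemon_lib.vm import vm_worker_flush_locks
import daemon_lib.common as common

def fence_flush_and_find_target(zkhandler, dom_uuid):
    # Passing None for the Celery task makes start/update/finish skip update_state()
    vm_worker_flush_locks(zkhandler, None, dom_uuid, force_unlock=True)
    return common.findTargetNode(zkhandler, dom_uuid)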