Re-add RBD lock detection and clearing on startup
This is still needed because of how these locks are freed on startup, and to preserve the lock=fail behaviour when a VM starts. Also fix the fencing lock flush to use the client library directly, outside of Celery. I don't like this hack, but it seems prudent until we move fencing to the workers as well.
parent 2a9bc632fa
commit 83c4c6633d
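Condensed, the re-added startup check implements the lock=fail policy sketched below. This is only an illustration of the decision made in the VMInstance diff further down; the helper function and its return value are made up for the sketch, not part of the actual code.

    # Illustrative-only sketch of the lock=fail policy on VM startup:
    # free locks held by this node's storage address, fail the domain on any other lock.
    def lock_actions(locks, storage_ipaddr):
        actions = []
        for lock in locks:
            owner = lock["address"].split(":")[0]   # assumes an "ip:port/nonce" address form
            if owner == storage_ipaddr:
                actions.append(("free", lock["id"]))
            else:
                actions.append(("fail", lock["id"]))
                break                               # the first foreign lock aborts the start
        return actions

    # e.g. lock_actions([{"id": "auto 1", "address": "10.0.0.1:0/1"}], "10.0.0.1")
    #      -> [("free", "auto 1")]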
@@ -33,6 +33,8 @@ class TaskFailure(Exception):
 def start(celery, msg, current=0, total=1):
     logger = getLogger(__name__)
     logger.info(f"Starting {current}/{total}: {msg}")
+    if celery is None:
+        return
     celery.update_state(
         state="RUNNING", meta={"current": current, "total": total, "status": msg}
     )

@@ -64,6 +66,8 @@ def log_err(celery, msg):
 def update(celery, msg, current=1, total=2):
     logger = getLogger(__name__)
     logger.info(f"Task update {current}/{total}: {msg}")
+    if celery is None:
+        return
     celery.update_state(
         state="RUNNING", meta={"current": current, "total": total, "status": msg}
     )

@@ -73,6 +77,8 @@ def update(celery, msg, current=1, total=2):
 def finish(celery, msg, current=2, total=2):
     logger = getLogger(__name__)
     logger.info(f"Task update {current}/{total}: Finishing up")
+    if celery is None:
+        return
     celery.update_state(
         state="RUNNING",
         meta={"current": current, "total": total, "status": "Finishing up"},

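The practical effect of the three guards above, as a usage sketch. The bound-task line is commented out and assumes a Celery task object providing update_state(), which is what these helpers expect; everything else follows directly from the code above.

    # The helpers can now be called outside a Celery worker by passing None:
    start(None, "Flushing RBD locks")     # logs only; returns before update_state()
    update(None, "Freeing lock 1/2")      # logs only
    finish(None, "Locks flushed")         # logs only

    # From a bound task, behaviour is unchanged and progress is reported as before:
    # start(self, "Flushing RBD locks", current=1, total=4)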
@@ -24,8 +24,8 @@ import time
 import libvirt
 
 from threading import Thread
 
 from xml.etree import ElementTree
+from json import loads as jloads
 
 import daemon_lib.common as common

@@ -283,9 +283,49 @@ class VMInstance(object):
             self.logger.out(
                 "Flushing RBD locks", state="i", prefix="Domain {}".format(self.domuuid)
             )
-            VMInstance.flush_locks(
-                self.zkhandler, self.logger, self.domuuid, self.this_node
-            )
+            rbd_list = self.zkhandler.read(
+                ("domain.storage.volumes", self.domuuid)
+            ).split(",")
+
+            locks = list()
+            for rbd in rbd_list:
+                retcode, stdout, stderr = common.run_os_command(
+                    f"rbd lock list --format json {rbd}"
+                )
+                if retcode == 0:
+                    _locks = jloads(stdout)
+                    for lock in _locks:
+                        lock["rbd"] = rbd
+                        locks.append(lock)
+
+            for lock in locks:
+                lockid = lock["id"]
+                locker = lock["locker"]
+                owner = lock["address"].split(":")[0]
+                rbd = lock["rbd"]
+
+                if owner == self.this_node.storage_ipaddr:
+                    retcode, stdout, stderr = common.run_os_command(
+                        f'rbd lock remove {rbd} "{lockid}" "{locker}"'
+                    )
+                else:
+                    self.logger.out(
+                        f"RBD lock does not belong to this host (owner {owner}), so freeing this lock is dangerous; aborting VM start",
+                        state="e",
+                        prefix="Domain {}".format(self.domuuid),
+                    )
+                    self.zkhandler.write(
+                        [
+                            (("domain.state", self.domuuid), "fail"),
+                            (
+                                ("domain.failed_reason", self.domuuid),
+                                f"Could not safely free RBD lock {lockid} ({owner}) on volume {rbd}; stop VM and flush locks manually",
+                            ),
+                        ]
+                    )
+                    break
+
             if self.zkhandler.read(("domain.state", self.domuuid)) == "fail":
                 lv_conn.close()
                 self.dom = None

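The lock-gathering loop above assumes that `rbd lock list --format json` returns a list of lock objects carrying `id`, `locker`, and `address` fields. A rough sketch of that shape and of the owner comparison follows; the field values are illustrative, and the exact output can differ between Ceph releases.

    # Illustrative shape of `rbd lock list --format json <pool>/<volume>` output,
    # as the parsing loop expects it (values made up):
    example_locks = [
        {
            "id": "auto 140234552404480",        # lock ID passed to "rbd lock remove"
            "locker": "client.12345",            # Ceph client instance holding the lock
            "address": "10.0.0.1:0/2789346167",  # address of the lock holder
        }
    ]

    # Owner check used above: compare the IP portion against this node's storage IP.
    owner = example_locks[0]["address"].split(":")[0]   # -> "10.0.0.1"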
@@ -23,7 +23,7 @@ import time
 
 import daemon_lib.common as common
 
-from pvcnoded.objects.VMInstance import VMInstance
+from daemon_lib.vm import vm_worker_flush_locks
 
 
 #

@@ -121,7 +121,7 @@ def migrateFromFencedNode(zkhandler, node_name, config, logger):
 
     # Migrate a VM after a flush
     def fence_migrate_vm(dom_uuid):
-        VMInstance.flush_locks(zkhandler, logger, dom_uuid)
+        vm_worker_flush_locks(zkhandler, None, dom_uuid, force_unlock=True)
 
         target_node = common.findTargetNode(zkhandler, dom_uuid)
 
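The keyword call above implies a signature roughly like the hypothetical stub below; this is inferred from the call site, not taken from daemon_lib.vm. Passing None for the task works because of the guards added to start()/update()/finish() above, and force_unlock presumably exists because a fenced VM is still recorded as running, so a normal "VM must be stopped" check would refuse to flush its locks.

    # Hypothetical stub inferred from the call site (not the real daemon_lib.vm code):
    def vm_worker_flush_locks(zkhandler, celery, domain, force_unlock=False):
        # celery may be None when called outside a worker (e.g. during fencing);
        # the progress helpers above then log but skip update_state().
        start(celery, f"Flushing RBD locks of VM {domain}")
        ...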