Improve fence handling to prevent anomalies

1. Move fence monitoring to its own thread, rather than doing the node listing and fence triggering within the main keepalive thread.
2. Add a global lock key at /config/fence_lock and use it to prevent multiple nodes from trying to run fences simultaneously.
3. Run the fencing monitor for each node sequentially within the main fence monitoring thread, so that fences of multiple nodes happen one after another rather than in parallel.

Together, these changes should help prevent anomalies where one node tries to fence multiple nodes at once without recourse.
2024-10-10 16:38:19 -04:00
parent ebec1332e9
commit a6f8500309
4 changed files with 71 additions and 41 deletions
--- a/daemon-common/migrations/versions/15.json
+++ b/daemon-common/migrations/versions/15.json
@@ -0,0 +1 @@
+{"version": "15", "root": "", "base": {"root": "", "schema": "/schema", "schema.version": "/schema/version", "config": "/config", "config.maintenance": "/config/maintenance", "config.fence_lock": "/config/fence_lock", "config.primary_node": "/config/primary_node", "config.primary_node.sync_lock": "/config/primary_node/sync_lock", "config.upstream_ip": "/config/upstream_ip", "config.migration_target_selector": "/config/migration_target_selector", "logs": "/logs", "faults": "/faults", "node": "/nodes", "domain": "/domains", "network": "/networks", "storage": "/ceph", "storage.health": "/ceph/health", "storage.util": "/ceph/util", "osd": "/ceph/osds", "pool": "/ceph/pools", "volume": "/ceph/volumes", "snapshot": "/ceph/snapshots"}, "logs": {"node": "", "messages": "/messages"}, "faults": {"id": "", "last_time": "/last_time", "first_time": "/first_time", "ack_time": "/ack_time", "status": "/status", "delta": "/delta", "message": "/message"}, "node": {"name": "", "keepalive": "/keepalive", "mode": "/daemonmode", "data.active_schema": "/activeschema", "data.latest_schema": "/latestschema", "data.static": "/staticdata", "data.pvc_version": "/pvcversion", "running_domains": "/runningdomains", "count.provisioned_domains": "/domainscount", "count.networks": "/networkscount", "state.daemon": "/daemonstate", "state.router": "/routerstate", "state.domain": "/domainstate", "cpu.load": "/cpuload", "vcpu.allocated": "/vcpualloc", "memory.total": "/memtotal", "memory.used": "/memused", "memory.free": "/memfree", "memory.allocated": "/memalloc", "memory.provisioned": "/memprov", "ipmi.hostname": "/ipmihostname", "ipmi.username": "/ipmiusername", "ipmi.password": "/ipmipassword", "sriov": "/sriov", "sriov.pf": "/sriov/pf", "sriov.vf": "/sriov/vf", "monitoring.plugins": "/monitoring_plugins", "monitoring.data": "/monitoring_data", "monitoring.health": "/monitoring_health", "network.stats": "/network_stats"}, "monitoring_plugin": {"name": "", "last_run": "/last_run", "health_delta": 
"/health_delta", "message": "/message", "data": "/data", "runtime": "/runtime"}, "sriov_pf": {"phy": "", "mtu": "/mtu", "vfcount": "/vfcount"}, "sriov_vf": {"phy": "", "pf": "/pf", "mtu": "/mtu", "mac": "/mac", "phy_mac": "/phy_mac", "config": "/config", "config.vlan_id": "/config/vlan_id", "config.vlan_qos": "/config/vlan_qos", "config.tx_rate_min": "/config/tx_rate_min", "config.tx_rate_max": "/config/tx_rate_max", "config.spoof_check": "/config/spoof_check", "config.link_state": "/config/link_state", "config.trust": "/config/trust", "config.query_rss": "/config/query_rss", "pci": "/pci", "pci.domain": "/pci/domain", "pci.bus": "/pci/bus", "pci.slot": "/pci/slot", "pci.function": "/pci/function", "used": "/used", "used_by": "/used_by"}, "domain": {"name": "", "xml": "/xml", "state": "/state", "profile": "/profile", "stats": "/stats", "node": "/node", "last_node": "/lastnode", "failed_reason": "/failedreason", "storage.volumes": "/rbdlist", "console.log": "/consolelog", "console.vnc": "/vnc", "meta.autostart": "/node_autostart", "meta.migrate_method": "/migration_method", "meta.migrate_max_downtime": "/migration_max_downtime", "meta.node_selector": "/node_selector", "meta.node_limit": "/node_limit", "meta.tags": "/tags", "migrate.sync_lock": "/migrate_sync_lock", "snapshots": "/snapshots"}, "tag": {"name": "", "type": "/type", "protected": "/protected"}, "domain_snapshot": {"name": "", "timestamp": "/timestamp", "xml": "/xml", "rbd_snapshots": "/rbdsnaplist"}, "network": {"vni": "", "type": "/nettype", "mtu": "/mtu", "rule": "/firewall_rules", "rule.in": "/firewall_rules/in", "rule.out": "/firewall_rules/out", "nameservers": "/name_servers", "domain": "/domain", "reservation": "/dhcp4_reservations", "lease": "/dhcp4_leases", "ip4.gateway": "/ip4_gateway", "ip4.network": "/ip4_network", "ip4.dhcp": "/dhcp4_flag", "ip4.dhcp_start": "/dhcp4_start", "ip4.dhcp_end": "/dhcp4_end", "ip6.gateway": "/ip6_gateway", "ip6.network": "/ip6_network", "ip6.dhcp": "/dhcp6_flag"}, 
"reservation": {"mac": "", "ip": "/ipaddr", "hostname": "/hostname"}, "lease": {"mac": "", "ip": "/ipaddr", "hostname": "/hostname", "expiry": "/expiry", "client_id": "/clientid"}, "rule": {"description": "", "rule": "/rule", "order": "/order"}, "osd": {"id": "", "node": "/node", "device": "/device", "db_device": "/db_device", "fsid": "/fsid", "ofsid": "/fsid/osd", "cfsid": "/fsid/cluster", "lvm": "/lvm", "vg": "/lvm/vg", "lv": "/lvm/lv", "is_split": "/is_split", "stats": "/stats"}, "pool": {"name": "", "pgs": "/pgs", "tier": "/tier", "stats": "/stats"}, "volume": {"name": "", "stats": "/stats"}, "snapshot": {"name": "", "stats": "/stats"}}
--- a/daemon-common/zkhandler.py
+++ b/daemon-common/zkhandler.py
@@ -576,7 +576,7 @@ class ZKHandler(object):
 #
 class ZKSchema(object):
    # Current version
-    _version = 14
+    _version = 15

    # Root for doing nested keys
    _schema_root = ""
@@ -592,6 +592,7 @@ class ZKSchema(object):
            "schema.version": f"{_schema_root}/schema/version",
            "config": f"{_schema_root}/config",
            "config.maintenance": f"{_schema_root}/config/maintenance",
+            "config.fence_lock": f"{_schema_root}/config/fence_lock",
            "config.primary_node": f"{_schema_root}/config/primary_node",
            "config.primary_node.sync_lock": f"{_schema_root}/config/primary_node/sync_lock",
            "config.upstream_ip": f"{_schema_root}/config/upstream_ip",
--- a/node-daemon/pvcnoded/util/fencing.py
+++ b/node-daemon/pvcnoded/util/fencing.py
@@ -21,15 +21,72 @@

 import time

+from kazoo.exceptions import LockTimeout
+
 import daemon_lib.common as common

 from daemon_lib.vm import vm_worker_flush_locks


 #
-# Fence thread entry function
+# Fence monitor thread entrypoint
 #
-def fence_node(node_name, zkhandler, config, logger):
+def fence_monitor(zkhandler, config, logger):
+    # Attempt to acquire an exclusive lock on the fence_lock key
+    # If it is already held, we'll abort since another node is processing fences
+    lock = zkhandler.exclusivelock("base.config.fence_lock")
+
+    try:
+        lock.acquire(timeout=config["keepalive_interval"] - 1)
+
+        for node_name in zkhandler.children("base.node"):
+            try:
+                node_daemon_state = zkhandler.read(("node.state.daemon", node_name))
+                node_keepalive = int(zkhandler.read(("node.keepalive", node_name)))
+            except Exception:
+                node_daemon_state = "unknown"
+                node_keepalive = 0
+
+            node_deadtime = int(time.time()) - (
+                int(config["keepalive_interval"]) * int(config["fence_intervals"])
+            )
+            if node_keepalive < node_deadtime and node_daemon_state == "run":
+                logger.out(
+                    f"Node {node_name} seems dead; starting monitor for fencing",
+                    state="w",
+                )
+                zk_lock = zkhandler.writelock(("node.state.daemon", node_name))
+                with zk_lock:
+                    # Ensures that, if we lost the lock race and come out of waiting,
+                    # we won't try to trigger our own fence thread.
+                    if zkhandler.read(("node.state.daemon", node_name)) != "dead":
+                        # Write the updated data after we start the fence thread
+                        zkhandler.write([(("node.state.daemon", node_name), "dead")])
+                        # Start the fence monitoring task for this node
+                        # NOTE: This is not a subthread and is designed to block this for loop
+                        # This ensures that only one node is ever being fenced at a time
+                        fence_node(zkhandler, config, logger, node_name)
+            else:
+                logger.out(
+                    f"Node {node_name} is OK; last checkin is {node_deadtime - node_keepalive}s from threshold, node state is '{node_daemon_state}'",
+                    state="d",
+                    prefix="fence-thread",
+                )
+    except LockTimeout:
+        logger.out(
+            "Fence monitor thread failed to acquire exclusive lock; skipping", state="i"
+        )
+    except Exception as e:
+        logger.out(f"Fence monitor thread failed: {e}", state="w")
+    finally:
+        # We're finished, so release the global lock
+        lock.release()
+
+
+#
+# Fence action function
+#
+def fence_node(zkhandler, config, logger, node_name):
    # We allow exactly 6 saving throws (30 seconds) for the host to come back online or we kill it
    failcount_limit = 6
    failcount = 0
@@ -202,6 +259,9 @@ def migrateFromFencedNode(zkhandler, node_name, config, logger):

    # Loop through the VMs
    for dom_uuid in dead_node_running_domains:
+        if dom_uuid in ["0", 0]:
+            # Skip the invalid "0" UUID we sometimes get
+            continue
        try:
            fence_migrate_vm(dom_uuid)
        except Exception as e:
--- a/node-daemon/pvcnoded/util/keepalive.py
+++ b/node-daemon/pvcnoded/util/keepalive.py
@@ -877,44 +877,12 @@ def node_keepalive(logger, config, zkhandler, this_node, netstats):
            )

    # Look for dead nodes and fence them
-    if not this_node.maintenance:
+    if not this_node.maintenance and config["daemon_mode"] == "coordinator":
        logger.out(
            "Look for dead nodes and fence them", state="d", prefix="main-thread"
        )
-        if config["daemon_mode"] == "coordinator":
-            for node_name in zkhandler.children("base.node"):
-                try:
-                    node_daemon_state = zkhandler.read(("node.state.daemon", node_name))
-                    node_keepalive = int(zkhandler.read(("node.keepalive", node_name)))
-                except Exception:
-                    node_daemon_state = "unknown"
-                    node_keepalive = 0
-
-                # Handle deadtime and fencng if needed
-                # (A node is considered dead when its keepalive timer is >6*keepalive_interval seconds
-                # out-of-date while in 'start' state)
-                node_deadtime = int(time.time()) - (
-                    int(config["keepalive_interval"]) * int(config["fence_intervals"])
-                )
-                if node_keepalive < node_deadtime and node_daemon_state == "run":
-                    logger.out(
-                        "Node {} seems dead - starting monitor for fencing".format(
-                            node_name
-                        ),
-                        state="w",
-                    )
-                    zk_lock = zkhandler.writelock(("node.state.daemon", node_name))
-                    with zk_lock:
-                        # Ensures that, if we lost the lock race and come out of waiting,
-                        # we won't try to trigger our own fence thread.
-                        if zkhandler.read(("node.state.daemon", node_name)) != "dead":
-                            fence_thread = Thread(
-                                target=pvcnoded.util.fencing.fence_node,
-                                args=(node_name, zkhandler, config, logger),
-                                kwargs={},
-                            )
-                            fence_thread.start()
-                            # Write the updated data after we start the fence thread
-                            zkhandler.write(
-                                [(("node.state.daemon", node_name), "dead")]
-                            )
+        fence_monitor_thread = Thread(
+            target=pvcnoded.util.fencing.fence_monitor,
+            args=(zkhandler, config, logger),
+        )
+        fence_monitor_thread.start()
				`@ -0,0 +1 @@`
				{"version": "15", "root": "", "base": {"root": "", "schema": "/schema", "schema.version": "/schema/version", "config": "/config", "config.maintenance": "/config/maintenance", "config.fence_lock": "/config/fence_lock", "config.primary_node": "/config/primary_node", "config.primary_node.sync_lock": "/config/primary_node/sync_lock", "config.upstream_ip": "/config/upstream_ip", "config.migration_target_selector": "/config/migration_target_selector", "logs": "/logs", "faults": "/faults", "node": "/nodes", "domain": "/domains", "network": "/networks", "storage": "/ceph", "storage.health": "/ceph/health", "storage.util": "/ceph/util", "osd": "/ceph/osds", "pool": "/ceph/pools", "volume": "/ceph/volumes", "snapshot": "/ceph/snapshots"}, "logs": {"node": "", "messages": "/messages"}, "faults": {"id": "", "last_time": "/last_time", "first_time": "/first_time", "ack_time": "/ack_time", "status": "/status", "delta": "/delta", "message": "/message"}, "node": {"name": "", "keepalive": "/keepalive", "mode": "/daemonmode", "data.active_schema": "/activeschema", "data.latest_schema": "/latestschema", "data.static": "/staticdata", "data.pvc_version": "/pvcversion", "running_domains": "/runningdomains", "count.provisioned_domains": "/domainscount", "count.networks": "/networkscount", "state.daemon": "/daemonstate", "state.router": "/routerstate", "state.domain": "/domainstate", "cpu.load": "/cpuload", "vcpu.allocated": "/vcpualloc", "memory.total": "/memtotal", "memory.used": "/memused", "memory.free": "/memfree", "memory.allocated": "/memalloc", "memory.provisioned": "/memprov", "ipmi.hostname": "/ipmihostname", "ipmi.username": "/ipmiusername", "ipmi.password": "/ipmipassword", "sriov": "/sriov", "sriov.pf": "/sriov/pf", "sriov.vf": "/sriov/vf", "monitoring.plugins": "/monitoring_plugins", "monitoring.data": "/monitoring_data", "monitoring.health": "/monitoring_health", "network.stats": "/network_stats"}, "monitoring_plugin": {"name": "", "last_run": "/last_run", 
"health_delta": "/health_delta", "message": "/message", "data": "/data", "runtime": "/runtime"}, "sriov_pf": {"phy": "", "mtu": "/mtu", "vfcount": "/vfcount"}, "sriov_vf": {"phy": "", "pf": "/pf", "mtu": "/mtu", "mac": "/mac", "phy_mac": "/phy_mac", "config": "/config", "config.vlan_id": "/config/vlan_id", "config.vlan_qos": "/config/vlan_qos", "config.tx_rate_min": "/config/tx_rate_min", "config.tx_rate_max": "/config/tx_rate_max", "config.spoof_check": "/config/spoof_check", "config.link_state": "/config/link_state", "config.trust": "/config/trust", "config.query_rss": "/config/query_rss", "pci": "/pci", "pci.domain": "/pci/domain", "pci.bus": "/pci/bus", "pci.slot": "/pci/slot", "pci.function": "/pci/function", "used": "/used", "used_by": "/used_by"}, "domain": {"name": "", "xml": "/xml", "state": "/state", "profile": "/profile", "stats": "/stats", "node": "/node", "last_node": "/lastnode", "failed_reason": "/failedreason", "storage.volumes": "/rbdlist", "console.log": "/consolelog", "console.vnc": "/vnc", "meta.autostart": "/node_autostart", "meta.migrate_method": "/migration_method", "meta.migrate_max_downtime": "/migration_max_downtime", "meta.node_selector": "/node_selector", "meta.node_limit": "/node_limit", "meta.tags": "/tags", "migrate.sync_lock": "/migrate_sync_lock", "snapshots": "/snapshots"}, "tag": {"name": "", "type": "/type", "protected": "/protected"}, "domain_snapshot": {"name": "", "timestamp": "/timestamp", "xml": "/xml", "rbd_snapshots": "/rbdsnaplist"}, "network": {"vni": "", "type": "/nettype", "mtu": "/mtu", "rule": "/firewall_rules", "rule.in": "/firewall_rules/in", "rule.out": "/firewall_rules/out", "nameservers": "/name_servers", "domain": "/domain", "reservation": "/dhcp4_reservations", "lease": "/dhcp4_leases", "ip4.gateway": "/ip4_gateway", "ip4.network": "/ip4_network", "ip4.dhcp": "/dhcp4_flag", "ip4.dhcp_start": "/dhcp4_start", "ip4.dhcp_end": "/dhcp4_end", "ip6.gateway": "/ip6_gateway", "ip6.network": "/ip6_network", "ip6.dhcp": 
"/dhcp6_flag"}, "reservation": {"mac": "", "ip": "/ipaddr", "hostname": "/hostname"}, "lease": {"mac": "", "ip": "/ipaddr", "hostname": "/hostname", "expiry": "/expiry", "client_id": "/clientid"}, "rule": {"description": "", "rule": "/rule", "order": "/order"}, "osd": {"id": "", "node": "/node", "device": "/device", "db_device": "/db_device", "fsid": "/fsid", "ofsid": "/fsid/osd", "cfsid": "/fsid/cluster", "lvm": "/lvm", "vg": "/lvm/vg", "lv": "/lvm/lv", "is_split": "/is_split", "stats": "/stats"}, "pool": {"name": "", "pgs": "/pgs", "tier": "/tier", "stats": "/stats"}, "volume": {"name": "", "stats": "/stats"}, "snapshot": {"name": "", "stats": "/stats"}}