From a6f8500309c5547ed98d57181ab138af5800133e Mon Sep 17 00:00:00 2001
From: "Joshua M. Boniface"
Date: Thu, 10 Oct 2024 16:38:19 -0400
Subject: [PATCH] Improve fence handling to prevent anomalies

1. Move fence monitoring to its own thread rather than doing the
   listing and triggering within the main keepalive thread.

2. Add a global lock key at /config/fence_lock and use it to prevent
   multiple nodes from trying to run fences simultaneously.

3. Run the fencing monitor for each node sequentially within the main
   fence monitoring thread, so that fences of multiple nodes happen one
   after another rather than in parallel.

Together, these changes should prevent anomalies in which one node
tries to fence multiple nodes at once without recourse.
---
 daemon-common/migrations/versions/15.json |  1 +
 daemon-common/zkhandler.py                |  3 +-
 node-daemon/pvcnoded/util/fencing.py      | 64 ++++++++++++++++++++++-
 node-daemon/pvcnoded/util/keepalive.py    | 44 +++---------
 4 files changed, 71 insertions(+), 41 deletions(-)
 create mode 100644 daemon-common/migrations/versions/15.json

diff --git a/daemon-common/migrations/versions/15.json b/daemon-common/migrations/versions/15.json
new file mode 100644
index 00000000..0da01938
--- /dev/null
+++ b/daemon-common/migrations/versions/15.json
@@ -0,0 +1 @@
+{"version": "15", "root": "", "base": {"root": "", "schema": "/schema", "schema.version": "/schema/version", "config": "/config", "config.maintenance": "/config/maintenance", "config.fence_lock": "/config/fence_lock", "config.primary_node": "/config/primary_node", "config.primary_node.sync_lock": "/config/primary_node/sync_lock", "config.upstream_ip": "/config/upstream_ip", "config.migration_target_selector": "/config/migration_target_selector", "logs": "/logs", "faults": "/faults", "node": "/nodes", "domain": "/domains", "network": "/networks", "storage": "/ceph", "storage.health": "/ceph/health", "storage.util": "/ceph/util", "osd": "/ceph/osds", "pool": "/ceph/pools", "volume": "/ceph/volumes", "snapshot": "/ceph/snapshots"}, "logs": {"node": "", "messages": "/messages"}, "faults": {"id": "", "last_time": "/last_time", "first_time": "/first_time", "ack_time": "/ack_time", "status": "/status", "delta": "/delta", "message": "/message"}, "node": {"name": "", "keepalive": "/keepalive", "mode": "/daemonmode", "data.active_schema": "/activeschema", "data.latest_schema": "/latestschema", "data.static": "/staticdata", "data.pvc_version": "/pvcversion", "running_domains": "/runningdomains", "count.provisioned_domains": "/domainscount", "count.networks": "/networkscount", "state.daemon": "/daemonstate", "state.router": "/routerstate", "state.domain": "/domainstate", "cpu.load": "/cpuload", "vcpu.allocated": "/vcpualloc", "memory.total": "/memtotal", "memory.used": "/memused", "memory.free": "/memfree", "memory.allocated": "/memalloc", "memory.provisioned": "/memprov", "ipmi.hostname": "/ipmihostname", "ipmi.username": "/ipmiusername", "ipmi.password": "/ipmipassword", "sriov": "/sriov", "sriov.pf": "/sriov/pf", "sriov.vf": "/sriov/vf", "monitoring.plugins": "/monitoring_plugins", "monitoring.data": "/monitoring_data", "monitoring.health": "/monitoring_health", "network.stats": "/network_stats"}, "monitoring_plugin": {"name": "", "last_run": "/last_run", "health_delta": "/health_delta", "message": "/message", "data": "/data", "runtime": "/runtime"}, "sriov_pf": {"phy": "", "mtu": "/mtu", "vfcount": "/vfcount"}, "sriov_vf": {"phy": "", "pf": "/pf", "mtu": "/mtu", "mac": "/mac", "phy_mac": "/phy_mac", "config": "/config", "config.vlan_id": "/config/vlan_id", "config.vlan_qos": "/config/vlan_qos", "config.tx_rate_min": "/config/tx_rate_min", "config.tx_rate_max": "/config/tx_rate_max", "config.spoof_check": "/config/spoof_check", "config.link_state": "/config/link_state", "config.trust": "/config/trust", "config.query_rss": "/config/query_rss", "pci": "/pci", "pci.domain": "/pci/domain", "pci.bus": "/pci/bus", "pci.slot": "/pci/slot", "pci.function": "/pci/function", "used": "/used", "used_by": "/used_by"}, "domain": {"name": "", "xml": "/xml", "state": "/state", "profile": "/profile", "stats": "/stats", "node": "/node", "last_node": "/lastnode", "failed_reason": "/failedreason", "storage.volumes": "/rbdlist", "console.log": "/consolelog", "console.vnc": "/vnc", "meta.autostart": "/node_autostart", "meta.migrate_method": "/migration_method", "meta.migrate_max_downtime": "/migration_max_downtime", "meta.node_selector": "/node_selector", "meta.node_limit": "/node_limit", "meta.tags": "/tags", "migrate.sync_lock": "/migrate_sync_lock", "snapshots": "/snapshots"}, "tag": {"name": "", "type": "/type", "protected": "/protected"}, "domain_snapshot": {"name": "", "timestamp": "/timestamp", "xml": "/xml", "rbd_snapshots": "/rbdsnaplist"}, "network": {"vni": "", "type": "/nettype", "mtu": "/mtu", "rule": "/firewall_rules", "rule.in": "/firewall_rules/in", "rule.out": "/firewall_rules/out", "nameservers": "/name_servers", "domain": "/domain", "reservation": "/dhcp4_reservations", "lease": "/dhcp4_leases", "ip4.gateway": "/ip4_gateway", "ip4.network": "/ip4_network", "ip4.dhcp": "/dhcp4_flag", "ip4.dhcp_start": "/dhcp4_start", "ip4.dhcp_end": "/dhcp4_end", "ip6.gateway": "/ip6_gateway", "ip6.network": "/ip6_network", "ip6.dhcp": "/dhcp6_flag"}, "reservation": {"mac": "", "ip": "/ipaddr", "hostname": "/hostname"}, "lease": {"mac": "", "ip": "/ipaddr", "hostname": "/hostname", "expiry": "/expiry", "client_id": "/clientid"}, "rule": {"description": "", "rule": "/rule", "order": "/order"}, "osd": {"id": "", "node": "/node", "device": "/device", "db_device": "/db_device", "fsid": "/fsid", "ofsid": "/fsid/osd", "cfsid": "/fsid/cluster", "lvm": "/lvm", "vg": "/lvm/vg", "lv": "/lvm/lv", "is_split": "/is_split", "stats": "/stats"}, "pool": {"name": "", "pgs": "/pgs", "tier": "/tier", "stats": "/stats"}, "volume": {"name": "", "stats": "/stats"}, "snapshot": {"name": "", "stats": "/stats"}}
\ No newline at end of file
diff --git a/daemon-common/zkhandler.py b/daemon-common/zkhandler.py
index 09caa5a1..54bd3bbb 100644
--- a/daemon-common/zkhandler.py
+++ b/daemon-common/zkhandler.py
@@ -576,7 +576,7 @@ class ZKHandler(object):
 #
 class ZKSchema(object):
     # Current version
-    _version = 14
+    _version = 15
 
     # Root for doing nested keys
     _schema_root = ""
@@ -592,6 +592,7 @@ class ZKSchema(object):
         "schema.version": f"{_schema_root}/schema/version",
         "config": f"{_schema_root}/config",
         "config.maintenance": f"{_schema_root}/config/maintenance",
+        "config.fence_lock": f"{_schema_root}/config/fence_lock",
         "config.primary_node": f"{_schema_root}/config/primary_node",
         "config.primary_node.sync_lock": f"{_schema_root}/config/primary_node/sync_lock",
         "config.upstream_ip": f"{_schema_root}/config/upstream_ip",
diff --git a/node-daemon/pvcnoded/util/fencing.py b/node-daemon/pvcnoded/util/fencing.py
index 0524bf43..a538bb28 100644
--- a/node-daemon/pvcnoded/util/fencing.py
+++ b/node-daemon/pvcnoded/util/fencing.py
@@ -21,15 +21,72 @@
 
 import time
 
+from kazoo.exceptions import LockTimeout
+
 import daemon_lib.common as common
 
 from daemon_lib.vm import vm_worker_flush_locks
 
 
 #
-# Fence thread entry function
+# Fence monitor thread entrypoint
 #
-def fence_node(node_name, zkhandler, config, logger):
+def fence_monitor(zkhandler, config, logger):
+    # Attempt to acquire an exclusive lock on the fence_lock key
+    # If it is already held, we'll abort since another node is processing fences
+    lock = zkhandler.exclusivelock("base.config.fence_lock")
+
+    try:
+        lock.acquire(timeout=config["keepalive_interval"] - 1)
+
+        for node_name in zkhandler.children("base.node"):
+            try:
+                node_daemon_state = zkhandler.read(("node.state.daemon", node_name))
+                node_keepalive = int(zkhandler.read(("node.keepalive", node_name)))
+            except Exception:
+                node_daemon_state = "unknown"
+                node_keepalive = 0
+
+            node_deadtime = int(time.time()) - (
+                int(config["keepalive_interval"]) * int(config["fence_intervals"])
+            )
+            if node_keepalive < node_deadtime and node_daemon_state == "run":
+                logger.out(
+                    f"Node {node_name} seems dead; starting monitor for fencing",
+                    state="w",
+                )
+                zk_lock = zkhandler.writelock(("node.state.daemon", node_name))
+                with zk_lock:
+                    # Ensures that, if we lost the lock race and come out of waiting,
+                    # we won't try to trigger our own fence thread.
+                    if zkhandler.read(("node.state.daemon", node_name)) != "dead":
+                        # Write the updated data after we start the fence thread
+                        zkhandler.write([(("node.state.daemon", node_name), "dead")])
+                        # Start the fence monitoring task for this node
+                        # NOTE: This is not a subthread and is designed to block this for loop
+                        # This ensures that only one node is ever being fenced at a time
+                        fence_node(zkhandler, config, logger, node_name)
+            else:
+                logger.out(
+                    f"Node {node_name} is OK; last checkin is {node_deadtime - node_keepalive}s from threshold, node state is '{node_daemon_state}'",
+                    state="d",
+                    prefix="fence-thread",
+                )
+    except LockTimeout:
+        logger.out(
+            "Fence monitor thread failed to acquire exclusive lock; skipping", state="i"
+        )
+    except Exception as e:
+        logger.out(f"Fence monitor thread failed: {e}", state="w")
+    finally:
+        # We're finished, so release the global lock
+        lock.release()
+
+
+#
+# Fence action function
+#
+def fence_node(zkhandler, config, logger, node_name):
     # We allow exactly 6 saving throws (30 seconds) for the host to come back online or we kill it
     failcount_limit = 6
     failcount = 0
@@ -202,6 +259,9 @@ def migrateFromFencedNode(zkhandler, node_name, config, logger):
 
     # Loop through the VMs
     for dom_uuid in dead_node_running_domains:
+        if dom_uuid in ["0", 0]:
+            # Skip the invalid "0" UUID we sometimes get
+            continue
         try:
             fence_migrate_vm(dom_uuid)
         except Exception as e:
diff --git a/node-daemon/pvcnoded/util/keepalive.py b/node-daemon/pvcnoded/util/keepalive.py
index 2068359a..f37bfc61 100644
--- a/node-daemon/pvcnoded/util/keepalive.py
+++ b/node-daemon/pvcnoded/util/keepalive.py
@@ -877,44 +877,12 @@ def node_keepalive(logger, config, zkhandler, this_node, netstats):
         )
 
     # Look for dead nodes and fence them
-    if not this_node.maintenance:
+    if not this_node.maintenance and config["daemon_mode"] == "coordinator":
         logger.out(
             "Look for dead nodes and fence them", state="d", prefix="main-thread"
         )
-        if config["daemon_mode"] == "coordinator":
-            for node_name in zkhandler.children("base.node"):
-                try:
-                    node_daemon_state = zkhandler.read(("node.state.daemon", node_name))
-                    node_keepalive = int(zkhandler.read(("node.keepalive", node_name)))
-                except Exception:
-                    node_daemon_state = "unknown"
-                    node_keepalive = 0
-
-                # Handle deadtime and fencng if needed
-                # (A node is considered dead when its keepalive timer is >6*keepalive_interval seconds
-                # out-of-date while in 'start' state)
-                node_deadtime = int(time.time()) - (
-                    int(config["keepalive_interval"]) * int(config["fence_intervals"])
-                )
-                if node_keepalive < node_deadtime and node_daemon_state == "run":
-                    logger.out(
-                        "Node {} seems dead - starting monitor for fencing".format(
-                            node_name
-                        ),
-                        state="w",
-                    )
-                    zk_lock = zkhandler.writelock(("node.state.daemon", node_name))
-                    with zk_lock:
-                        # Ensures that, if we lost the lock race and come out of waiting,
-                        # we won't try to trigger our own fence thread.
-                        if zkhandler.read(("node.state.daemon", node_name)) != "dead":
-                            fence_thread = Thread(
-                                target=pvcnoded.util.fencing.fence_node,
-                                args=(node_name, zkhandler, config, logger),
-                                kwargs={},
-                            )
-                            fence_thread.start()
-                            # Write the updated data after we start the fence thread
-                            zkhandler.write(
-                                [(("node.state.daemon", node_name), "dead")]
-                            )
+        fence_monitor_thread = Thread(
+            target=pvcnoded.util.fencing.fence_monitor,
+            args=(zkhandler, config, logger),
+        )
+        fence_monitor_thread.start()
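
A note on the locking pattern above: the new /config/fence_lock key is used as an
ordinary ZooKeeper exclusive lock, acquired with a timeout just below the keepalive
interval so that a contending coordinator skips the cycle instead of queueing behind
the one already running fences. The standalone sketch below shows the same
acquire-or-skip flow using kazoo directly (the library zkhandler is built on); the
ZooKeeper hosts, node identifier, and interval value are placeholder assumptions,
not values taken from the patch.

# Illustrative sketch only -- not part of the patch.
import time

from kazoo.client import KazooClient
from kazoo.exceptions import LockTimeout

KEEPALIVE_INTERVAL = 5  # placeholder stand-in for config["keepalive_interval"]

zk = KazooClient(hosts="10.0.0.1:2181,10.0.0.2:2181,10.0.0.3:2181")  # placeholder hosts
zk.start()

# Every coordinator creates a lock object on the same key; ZooKeeper's lock
# recipe guarantees that only one holder exists at a time.
fence_lock = zk.Lock("/config/fence_lock", identifier="node1")  # placeholder identifier

try:
    # Give up just before the next keepalive cycle rather than queueing
    # behind the coordinator that is already running fences.
    fence_lock.acquire(timeout=KEEPALIVE_INTERVAL - 1)
    try:
        # In the patch, this is where fence_monitor() walks the node list and
        # calls fence_node() for each dead node, one node at a time.
        time.sleep(1)
    finally:
        fence_lock.release()
except LockTimeout:
    print("Another coordinator holds the fence lock; skipping this cycle")
finally:
    zk.stop()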