From c08c3b2d7d832939d0d9ceb5e8c889223cb7758b Mon Sep 17 00:00:00 2001
From: "Joshua M. Boniface" <joshua@boniface.me>
Date: Thu, 10 Oct 2024 15:33:47 -0400
Subject: [PATCH] Improve thread timeouts in keepalive

Prevents parts of the keepalive from deadlocking while waiting on data
that will never arrive when internal processes fail. Based on testing,
this should ensure that the keepalive always finishes in <5 seconds.
---
 node-daemon/pvcnoded/util/keepalive.py | 24 +++++++-----------------
 1 file changed, 7 insertions(+), 17 deletions(-)

diff --git a/node-daemon/pvcnoded/util/keepalive.py b/node-daemon/pvcnoded/util/keepalive.py
index 29fbf8f8..2068359a 100644
--- a/node-daemon/pvcnoded/util/keepalive.py
+++ b/node-daemon/pvcnoded/util/keepalive.py
@@ -756,29 +756,21 @@ def node_keepalive(logger, config, zkhandler, this_node, netstats):
 
     # Join against running threads
     if config["enable_hypervisor"]:
-        vm_stats_thread.join(timeout=config["keepalive_interval"])
+        vm_stats_thread.join(timeout=config["keepalive_interval"] - 1)
         if vm_stats_thread.is_alive():
             logger.out("VM stats gathering exceeded timeout, continuing", state="w")
     if config["enable_storage"]:
-        ceph_stats_thread.join(timeout=config["keepalive_interval"])
+        ceph_stats_thread.join(timeout=config["keepalive_interval"] - 1)
         if ceph_stats_thread.is_alive():
             logger.out("Ceph stats gathering exceeded timeout, continuing", state="w")
 
     # Get information from thread queues
     if config["enable_hypervisor"]:
         try:
-            this_node.domains_count = vm_thread_queue.get(
-                timeout=config["keepalive_interval"]
-            )
-            this_node.memalloc = vm_thread_queue.get(
-                timeout=config["keepalive_interval"]
-            )
-            this_node.memprov = vm_thread_queue.get(
-                timeout=config["keepalive_interval"]
-            )
-            this_node.vcpualloc = vm_thread_queue.get(
-                timeout=config["keepalive_interval"]
-            )
+            this_node.domains_count = vm_thread_queue.get(timeout=0.1)
+            this_node.memalloc = vm_thread_queue.get(timeout=0.1)
+            this_node.memprov = vm_thread_queue.get(timeout=0.1)
+            this_node.vcpualloc = vm_thread_queue.get(timeout=0.1)
         except Exception:
             logger.out("VM stats queue get exceeded timeout, continuing", state="w")
     else:
@@ -789,9 +781,7 @@ def node_keepalive(logger, config, zkhandler, this_node, netstats):
 
     if config["enable_storage"]:
         try:
-            osds_this_node = ceph_thread_queue.get(
-                timeout=(config["keepalive_interval"] - 1)
-            )
+            osds_this_node = ceph_thread_queue.get(timeout=0.1)
         except Exception:
             logger.out("Ceph stats queue get exceeded timeout, continuing", state="w")
             osds_this_node = "?"