Improve thread timeouts in keepalive

Prevents various parts of the keepalive from deadlocking while waiting on
data that will never arrive when internal processes fail. Based on testing,
this should ensure that the keepalive always finishes in under 5 seconds.
This commit is contained in:
Joshua Boniface 2024-10-10 15:33:47 -04:00
parent 4c0d90b517
commit c08c3b2d7d
1 changed files with 7 additions and 17 deletions

View File

@ -756,29 +756,21 @@ def node_keepalive(logger, config, zkhandler, this_node, netstats):
# Join against running threads # Join against running threads
if config["enable_hypervisor"]: if config["enable_hypervisor"]:
vm_stats_thread.join(timeout=config["keepalive_interval"]) vm_stats_thread.join(timeout=config["keepalive_interval"] - 1)
if vm_stats_thread.is_alive(): if vm_stats_thread.is_alive():
logger.out("VM stats gathering exceeded timeout, continuing", state="w") logger.out("VM stats gathering exceeded timeout, continuing", state="w")
if config["enable_storage"]: if config["enable_storage"]:
ceph_stats_thread.join(timeout=config["keepalive_interval"]) ceph_stats_thread.join(timeout=config["keepalive_interval"] - 1)
if ceph_stats_thread.is_alive(): if ceph_stats_thread.is_alive():
logger.out("Ceph stats gathering exceeded timeout, continuing", state="w") logger.out("Ceph stats gathering exceeded timeout, continuing", state="w")
# Get information from thread queues # Get information from thread queues
if config["enable_hypervisor"]: if config["enable_hypervisor"]:
try: try:
this_node.domains_count = vm_thread_queue.get( this_node.domains_count = vm_thread_queue.get(timeout=0.1)
timeout=config["keepalive_interval"] this_node.memalloc = vm_thread_queue.get(timeout=0.1)
) this_node.memprov = vm_thread_queue.get(timeout=0.1)
this_node.memalloc = vm_thread_queue.get( this_node.vcpualloc = vm_thread_queue.get(timeout=0.1)
timeout=config["keepalive_interval"]
)
this_node.memprov = vm_thread_queue.get(
timeout=config["keepalive_interval"]
)
this_node.vcpualloc = vm_thread_queue.get(
timeout=config["keepalive_interval"]
)
except Exception: except Exception:
logger.out("VM stats queue get exceeded timeout, continuing", state="w") logger.out("VM stats queue get exceeded timeout, continuing", state="w")
else: else:
@ -789,9 +781,7 @@ def node_keepalive(logger, config, zkhandler, this_node, netstats):
if config["enable_storage"]: if config["enable_storage"]:
try: try:
osds_this_node = ceph_thread_queue.get( osds_this_node = ceph_thread_queue.get(timeout=0.1)
timeout=(config["keepalive_interval"] - 1)
)
except Exception: except Exception:
logger.out("Ceph stats queue get exceeded timeout, continuing", state="w") logger.out("Ceph stats queue get exceeded timeout, continuing", state="w")
osds_this_node = "?" osds_this_node = "?"