From 46ffe352e352a8cc442664b2c014ad245c30a0b1 Mon Sep 17 00:00:00 2001 From: "Joshua M. Boniface" Date: Tue, 11 Aug 2020 11:37:26 -0400 Subject: [PATCH] Better handle subthread timeouts in keepalive Prevent the main keepalive thread from getting stuck due to a subthread taking an enormous time. If this happens, the rest of the main keepalive will continue onward, thus ensuring that the main keepalive does not fail for a significant number of cycles, which would cause a fence. --- node-daemon/pvcnoded/Daemon.py | 28 ++++++++++++++++++++-------- 1 file changed, 20 insertions(+), 8 deletions(-) diff --git a/node-daemon/pvcnoded/Daemon.py b/node-daemon/pvcnoded/Daemon.py index ea675d58..2c10be31 100644 --- a/node-daemon/pvcnoded/Daemon.py +++ b/node-daemon/pvcnoded/Daemon.py @@ -1463,24 +1463,36 @@ def node_keepalive(): # Join against running threads if enable_hypervisor: - vm_stats_thread.join() + vm_stats_thread.join(timeout=4.0) + if vm_stats_thread.is_alive(): + logger.out('VM stats gathering exceeded 4s timeout, continuing', state='w') if enable_storage: - ceph_stats_thread.join() + ceph_stats_thread.join(timeout=4.0) + if ceph_stats_thread.is_alive(): + logger.out('Ceph stats gathering exceeded 4s timeout, continuing', state='w') # Get information from thread queues if enable_hypervisor: - this_node.domains_count = vm_thread_queue.get() - this_node.memalloc = vm_thread_queue.get() - this_node.vcpualloc = vm_thread_queue.get() + try: + this_node.domains_count = vm_thread_queue.get() + this_node.memalloc = vm_thread_queue.get() + this_node.vcpualloc = vm_thread_queue.get() + except: + pass else: this_node.domains_count = 0 this_node.memalloc = 0 this_node.vcpualloc = 0 if enable_storage: - ceph_health_colour = ceph_thread_queue.get() - ceph_health = ceph_thread_queue.get() - osds_this_node = ceph_thread_queue.get() + try: + ceph_health_colour = ceph_thread_queue.get() + ceph_health = ceph_thread_queue.get() + osds_this_node = ceph_thread_queue.get() + except: + ceph_health_colour = fmt_cyan + ceph_health = 'UNKNOWN' + osds_this_node = '?' # Set our information in zookeeper keepalive_time = int(time.time())