From ccee124c8b1b7a6a7c5d85ac88747ab8b1b8bcc4 Mon Sep 17 00:00:00 2001 From: "Joshua M. Boniface" Date: Wed, 5 Aug 2020 22:36:28 -0400 Subject: [PATCH] Adjust fence failcount limit to 6 (30s) The previous saving throw limit (3/15s) seems to have been too low. I was observing bizarre failures where a node would be fenced while it was still starting up. Some of this may have been related to Zookeeper connections taking too long, but this was inconsistent. Increase this to 6 saving throws (30s). This provides significantly more time for a node to properly check in on startup before another node fences it. In the real world, 15s vs 30s isn't that big of a downtime change, but the longer limit prevents false-positive fences. --- node-daemon/pvcnoded/fencing.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/node-daemon/pvcnoded/fencing.py b/node-daemon/pvcnoded/fencing.py index 8fb1ed45..0a26b613 100644 --- a/node-daemon/pvcnoded/fencing.py +++ b/node-daemon/pvcnoded/fencing.py @@ -31,9 +31,10 @@ import pvcnoded.VMInstance as VMInstance # Fence thread entry function # def fenceNode(node_name, zk_conn, config, logger): + # We allow exactly 6 saving throws (30 seconds) for the host to come back online or we kill it + failcount_limit = 6 failcount = 0 - # We allow exactly 3 saving throws for the host to come back online - while failcount < 3: + while failcount < failcount_limit: # Wait 5 seconds time.sleep(5) # Get the state @@ -41,7 +42,7 @@ def fenceNode(node_name, zk_conn, config, logger): # Is it still 'dead' if node_daemon_state == 'dead': failcount += 1 - logger.out('Node "{}" failed {} saving throws'.format(node_name, failcount), state='w') + logger.out('Node "{}" failed {}/{} saving throws'.format(node_name, failcount, failcount_limit), state='w') # It changed back to something else so it must be alive else: logger.out('Node "{}" passed a saving throw; canceling fence'.format(node_name), state='o')