Adjust fence failcount limit to 6 (30s)
The previous saving-throw limit (3 throws, 15s) seems to have been too low. I was observing bizarre failures where a node would be fenced while it was still starting up. Some of this may have been related to ZooKeeper connections taking too long, but this was inconsistent. Increase the limit to 6 saving throws (30s). This provides significantly more time for a node to properly check in on startup before another node fences it. In the real world, 15s vs. 30s isn't a big difference in downtime, but the longer window prevents false-positive fences.
This commit is contained in:
parent
02343079c0
commit
ccee124c8b
|
@ -31,9 +31,10 @@ import pvcnoded.VMInstance as VMInstance
|
||||||
# Fence thread entry function
|
# Fence thread entry function
|
||||||
#
|
#
|
||||||
def fenceNode(node_name, zk_conn, config, logger):
|
def fenceNode(node_name, zk_conn, config, logger):
|
||||||
|
# We allow exactly 6 saving throws (30 seconds) for the host to come back online or we kill it
|
||||||
|
failcount_limit = 6
|
||||||
failcount = 0
|
failcount = 0
|
||||||
# We allow exactly 3 saving throws for the host to come back online
|
while failcount < failcount_limit:
|
||||||
while failcount < 3:
|
|
||||||
# Wait 5 seconds
|
# Wait 5 seconds
|
||||||
time.sleep(5)
|
time.sleep(5)
|
||||||
# Get the state
|
# Get the state
|
||||||
|
@ -41,7 +42,7 @@ def fenceNode(node_name, zk_conn, config, logger):
|
||||||
# Is it still 'dead'
|
# Is it still 'dead'
|
||||||
if node_daemon_state == 'dead':
|
if node_daemon_state == 'dead':
|
||||||
failcount += 1
|
failcount += 1
|
||||||
logger.out('Node "{}" failed {} saving throws'.format(node_name, failcount), state='w')
|
logger.out('Node "{}" failed {}/{} saving throws'.format(node_name, failcount, failcount_limit), state='w')
|
||||||
# It changed back to something else so it must be alive
|
# It changed back to something else so it must be alive
|
||||||
else:
|
else:
|
||||||
logger.out('Node "{}" passed a saving throw; canceling fence'.format(node_name), state='o')
|
logger.out('Node "{}" passed a saving throw; canceling fence'.format(node_name), state='o')
|
||||||
|
|
Loading…
Reference in New Issue