Only perform fencing duties on primary
There was no real need for fencing duties to be shared among all the coordinators, and doing so seemed more fragile. With this change, only the primary coordinator will try to fence dead nodes.
This commit is contained in:
parent
249611b161
commit
464c69aac6
|
@ -1120,35 +1120,36 @@ def update_zookeeper():
|
||||||
# Close the Libvirt connection
|
# Close the Libvirt connection
|
||||||
lv_conn.close()
|
lv_conn.close()
|
||||||
|
|
||||||
# Look for dead nodes and fence them
|
# Look for dead nodes and fence them (primary only)
|
||||||
if debug:
|
if this_node.router_state == 'primary'
|
||||||
print("Look for dead nodes and fence them")
|
if debug:
|
||||||
if config['daemon_mode'] == 'coordinator':
|
print("Look for dead nodes and fence them")
|
||||||
for node_name in d_node:
|
if config['daemon_mode'] == 'coordinator':
|
||||||
try:
|
for node_name in d_node:
|
||||||
node_daemon_state = zkhandler.readdata(zk_conn, '/nodes/{}/daemonstate'.format(node_name))
|
try:
|
||||||
node_domain_state = zkhandler.readdata(zk_conn, '/nodes/{}/domainstate'.format(node_name))
|
node_daemon_state = zkhandler.readdata(zk_conn, '/nodes/{}/daemonstate'.format(node_name))
|
||||||
node_keepalive = int(zkhandler.readdata(zk_conn, '/nodes/{}/keepalive'.format(node_name)))
|
node_domain_state = zkhandler.readdata(zk_conn, '/nodes/{}/domainstate'.format(node_name))
|
||||||
except:
|
node_keepalive = int(zkhandler.readdata(zk_conn, '/nodes/{}/keepalive'.format(node_name)))
|
||||||
node_daemon_state = 'unknown'
|
except:
|
||||||
node_domain_state = 'unknown'
|
node_daemon_state = 'unknown'
|
||||||
node_keepalive = 0
|
node_domain_state = 'unknown'
|
||||||
|
node_keepalive = 0
|
||||||
|
|
||||||
# Handle deadtime and fencing if needed
|
# Handle deadtime and fencing if needed
|
||||||
# (A node is considered dead when its keepalive timer is >fence_intervals*keepalive_interval seconds
|
# (A node is considered dead when its keepalive timer is >fence_intervals*keepalive_interval seconds
|
||||||
# out-of-date while in 'run' state)
|
# out-of-date while in 'run' state)
|
||||||
node_deadtime = int(time.time()) - ( int(config['keepalive_interval']) * int(config['fence_intervals']) )
|
node_deadtime = int(time.time()) - ( int(config['keepalive_interval']) * int(config['fence_intervals']) )
|
||||||
if node_keepalive < node_deadtime and node_daemon_state == 'run':
|
if node_keepalive < node_deadtime and node_daemon_state == 'run':
|
||||||
logger.out('Node {} seems dead - starting monitor for fencing'.format(node_name), state='w')
|
logger.out('Node {} seems dead - starting monitor for fencing'.format(node_name), state='w')
|
||||||
zk_lock = zkhandler.writelock(zk_conn, '/nodes/{}/daemonstate'.format(node_name))
|
zk_lock = zkhandler.writelock(zk_conn, '/nodes/{}/daemonstate'.format(node_name))
|
||||||
with zk_lock:
|
with zk_lock:
|
||||||
# Ensures that, if we lost the lock race and come out of waiting,
|
# Ensures that, if we lost the lock race and come out of waiting,
|
||||||
# we won't try to trigger our own fence thread.
|
# we won't try to trigger our own fence thread.
|
||||||
if zkhandler.readdata(zk_conn, '/nodes/{}/daemonstate'.format(node_name)) != 'dead':
|
if zkhandler.readdata(zk_conn, '/nodes/{}/daemonstate'.format(node_name)) != 'dead':
|
||||||
fence_thread = threading.Thread(target=fencing.fenceNode, args=(node_name, zk_conn, config, logger), kwargs={})
|
fence_thread = threading.Thread(target=fencing.fenceNode, args=(node_name, zk_conn, config, logger), kwargs={})
|
||||||
fence_thread.start()
|
fence_thread.start()
|
||||||
# Write the updated data after we start the fence thread
|
# Write the updated data after we start the fence thread
|
||||||
zkhandler.writedata(zk_conn, { '/nodes/{}/daemonstate'.format(node_name): 'dead' })
|
zkhandler.writedata(zk_conn, { '/nodes/{}/daemonstate'.format(node_name): 'dead' })
|
||||||
|
|
||||||
# Display node information to the terminal
|
# Display node information to the terminal
|
||||||
if config['log_keepalives']:
|
if config['log_keepalives']:
|
||||||
|
|
Loading…
Reference in New Issue