Only perform fencing duties on primary

There was really no need for this to be shared among all the
coordinators, which seemed more fragile. This way only the primary will
try to fence dead nodes.
This commit is contained in:
Joshua Boniface 2019-06-24 20:17:51 -04:00
parent 249611b161
commit 464c69aac6
1 changed file with 29 additions and 28 deletions

View File

@ -1120,35 +1120,36 @@ def update_zookeeper():
# Close the Libvirt connection # Close the Libvirt connection
lv_conn.close() lv_conn.close()
# Look for dead nodes and fence them # Look for dead nodes and fence them (primary only)
if debug: if this_node.router_state == 'primary':
print("Look for dead nodes and fence them") if debug:
if config['daemon_mode'] == 'coordinator': print("Look for dead nodes and fence them")
for node_name in d_node: if config['daemon_mode'] == 'coordinator':
try: for node_name in d_node:
node_daemon_state = zkhandler.readdata(zk_conn, '/nodes/{}/daemonstate'.format(node_name)) try:
node_domain_state = zkhandler.readdata(zk_conn, '/nodes/{}/domainstate'.format(node_name)) node_daemon_state = zkhandler.readdata(zk_conn, '/nodes/{}/daemonstate'.format(node_name))
node_keepalive = int(zkhandler.readdata(zk_conn, '/nodes/{}/keepalive'.format(node_name))) node_domain_state = zkhandler.readdata(zk_conn, '/nodes/{}/domainstate'.format(node_name))
except: node_keepalive = int(zkhandler.readdata(zk_conn, '/nodes/{}/keepalive'.format(node_name)))
node_daemon_state = 'unknown' except:
node_domain_state = 'unknown' node_daemon_state = 'unknown'
node_keepalive = 0 node_domain_state = 'unknown'
node_keepalive = 0
# Handle deadtime and fencing if needed # Handle deadtime and fencing if needed
# (A node is considered dead when its keepalive timer is >6*keepalive_interval seconds # (A node is considered dead when its keepalive timer is >6*keepalive_interval seconds
# out-of-date while in 'start' state) # out-of-date while in 'start' state)
node_deadtime = int(time.time()) - ( int(config['keepalive_interval']) * int(config['fence_intervals']) ) node_deadtime = int(time.time()) - ( int(config['keepalive_interval']) * int(config['fence_intervals']) )
if node_keepalive < node_deadtime and node_daemon_state == 'run': if node_keepalive < node_deadtime and node_daemon_state == 'run':
logger.out('Node {} seems dead - starting monitor for fencing'.format(node_name), state='w') logger.out('Node {} seems dead - starting monitor for fencing'.format(node_name), state='w')
zk_lock = zkhandler.writelock(zk_conn, '/nodes/{}/daemonstate'.format(node_name)) zk_lock = zkhandler.writelock(zk_conn, '/nodes/{}/daemonstate'.format(node_name))
with zk_lock: with zk_lock:
# Ensures that, if we lost the lock race and come out of waiting, # Ensures that, if we lost the lock race and come out of waiting,
# we won't try to trigger our own fence thread. # we won't try to trigger our own fence thread.
if zkhandler.readdata(zk_conn, '/nodes/{}/daemonstate'.format(node_name)) != 'dead': if zkhandler.readdata(zk_conn, '/nodes/{}/daemonstate'.format(node_name)) != 'dead':
fence_thread = threading.Thread(target=fencing.fenceNode, args=(node_name, zk_conn, config, logger), kwargs={}) fence_thread = threading.Thread(target=fencing.fenceNode, args=(node_name, zk_conn, config, logger), kwargs={})
fence_thread.start() fence_thread.start()
# Write the updated data after we start the fence thread # Write the updated data after we start the fence thread
zkhandler.writedata(zk_conn, { '/nodes/{}/daemonstate'.format(node_name): 'dead' }) zkhandler.writedata(zk_conn, { '/nodes/{}/daemonstate'.format(node_name): 'dead' })
# Display node information to the terminal # Display node information to the terminal
if config['log_keepalives']: if config['log_keepalives']: