From 464c69aac67303a7eb16c0061ad8fa202b15a535 Mon Sep 17 00:00:00 2001 From: "Joshua M. Boniface" Date: Mon, 24 Jun 2019 20:17:51 -0400 Subject: [PATCH] Only perform fencing duties on primary There was really no need for this to be shared among all the coordinators, which seemed more fragile. This way only the primary will try to fence dead nodes. --- node-daemon/pvcd/Daemon.py | 57 +++++++++++++++++++------------------- 1 file changed, 29 insertions(+), 28 deletions(-) diff --git a/node-daemon/pvcd/Daemon.py b/node-daemon/pvcd/Daemon.py index 79a848bf..894b2d15 100644 --- a/node-daemon/pvcd/Daemon.py +++ b/node-daemon/pvcd/Daemon.py @@ -1120,35 +1120,36 @@ def update_zookeeper(): # Close the Libvirt connection lv_conn.close() - # Look for dead nodes and fence them - if debug: - print("Look for dead nodes and fence them") - if config['daemon_mode'] == 'coordinator': - for node_name in d_node: - try: - node_daemon_state = zkhandler.readdata(zk_conn, '/nodes/{}/daemonstate'.format(node_name)) - node_domain_state = zkhandler.readdata(zk_conn, '/nodes/{}/domainstate'.format(node_name)) - node_keepalive = int(zkhandler.readdata(zk_conn, '/nodes/{}/keepalive'.format(node_name))) - except: - node_daemon_state = 'unknown' - node_domain_state = 'unknown' - node_keepalive = 0 + # Look for dead nodes and fence them (primary only) + if this_node.router_state == 'primary' + if debug: + print("Look for dead nodes and fence them") + if config['daemon_mode'] == 'coordinator': + for node_name in d_node: + try: + node_daemon_state = zkhandler.readdata(zk_conn, '/nodes/{}/daemonstate'.format(node_name)) + node_domain_state = zkhandler.readdata(zk_conn, '/nodes/{}/domainstate'.format(node_name)) + node_keepalive = int(zkhandler.readdata(zk_conn, '/nodes/{}/keepalive'.format(node_name))) + except: + node_daemon_state = 'unknown' + node_domain_state = 'unknown' + node_keepalive = 0 - # Handle deadtime and fencng if needed - # (A node is considered dead when its keepalive timer is >6*keepalive_interval seconds - # out-of-date while in 'start' state) - node_deadtime = int(time.time()) - ( int(config['keepalive_interval']) * int(config['fence_intervals']) ) - if node_keepalive < node_deadtime and node_daemon_state == 'run': - logger.out('Node {} seems dead - starting monitor for fencing'.format(node_name), state='w') - zk_lock = zkhandler.writelock(zk_conn, '/nodes/{}/daemonstate'.format(node_name)) - with zk_lock: - # Ensures that, if we lost the lock race and come out of waiting, - # we won't try to trigger our own fence thread. - if zkhandler.readdata(zk_conn, '/nodes/{}/daemonstate'.format(node_name)) != 'dead': - fence_thread = threading.Thread(target=fencing.fenceNode, args=(node_name, zk_conn, config, logger), kwargs={}) - fence_thread.start() - # Write the updated data after we start the fence thread - zkhandler.writedata(zk_conn, { '/nodes/{}/daemonstate'.format(node_name): 'dead' }) + # Handle deadtime and fencng if needed + # (A node is considered dead when its keepalive timer is >6*keepalive_interval seconds + # out-of-date while in 'start' state) + node_deadtime = int(time.time()) - ( int(config['keepalive_interval']) * int(config['fence_intervals']) ) + if node_keepalive < node_deadtime and node_daemon_state == 'run': + logger.out('Node {} seems dead - starting monitor for fencing'.format(node_name), state='w') + zk_lock = zkhandler.writelock(zk_conn, '/nodes/{}/daemonstate'.format(node_name)) + with zk_lock: + # Ensures that, if we lost the lock race and come out of waiting, + # we won't try to trigger our own fence thread. + if zkhandler.readdata(zk_conn, '/nodes/{}/daemonstate'.format(node_name)) != 'dead': + fence_thread = threading.Thread(target=fencing.fenceNode, args=(node_name, zk_conn, config, logger), kwargs={}) + fence_thread.start() + # Write the updated data after we start the fence thread + zkhandler.writedata(zk_conn, { '/nodes/{}/daemonstate'.format(node_name): 'dead' }) # Display node information to the terminal if config['log_keepalives']: