From b413e042a64221dcd81460e8eee4447aa214eb77 Mon Sep 17 00:00:00 2001 From: "Joshua M. Boniface" Date: Sun, 12 Apr 2020 03:40:17 -0400 Subject: [PATCH] Improve handling of primary contention Previously, contention could occasionally cause a flap/dual primary contention state due to the lack of checking within this function. This could cause a state where a node transitions to primary than is almost immediately shifted away, which could cause undefined behaviour in the cluster. The solution includes several elements: * Implement an exclusive lock operation in zkhandler * Switch the become_primary function to use this exclusive lock * Implement exclusive locking during the contention process * As a failsafe, check stat versions before setting the node as the primary node, in case another node already has * Delay the start of takeover/relinquish operations by slightly longer than the lock timeout * Make the current router_state conditions more explicit (positive conditionals rather than negative conditionals) The new scenario ensures that during contention, only one secondary will ever succeed at acquiring the lock. Ideally, the other would then grab the lock and pass, but in testing this does not seem to be the case - the lock always times out, so the failsafe check is technically not needed but has been left as an added safety mechanism. With this setup, the node that fails the contention will never block the switchover nor will it try to force itself onto the cluster after another node has successfully won contention. Timeouts may need to be adjusted in the future, but the base timeout of 0.4 seconds (and transition delay of 0.5 seconds) seems to work reliably during preliminary tests. --- node-daemon/pvcnoded/Daemon.py | 24 +++++++++++++++++++++--- node-daemon/pvcnoded/NodeInstance.py | 2 +- node-daemon/pvcnoded/zkhandler.py | 6 ++++++ 3 files changed, 28 insertions(+), 4 deletions(-) diff --git a/node-daemon/pvcnoded/Daemon.py b/node-daemon/pvcnoded/Daemon.py index ac2c1f1c..57a2d83f 100644 --- a/node-daemon/pvcnoded/Daemon.py +++ b/node-daemon/pvcnoded/Daemon.py @@ -824,18 +824,36 @@ def update_primary(new_primary, stat, event=''): new_primary = new_primary.decode('ascii') except AttributeError: new_primary = 'none' + key_version = stat.version if new_primary != this_node.primary_node: if config['daemon_mode'] == 'coordinator': # We're a coordinator and there is no primary if new_primary == 'none': - if this_node.daemon_state == 'run' and this_node.router_state != 'primary': + if this_node.daemon_state == 'run' and this_node.router_state == 'secondary': logger.out('Contending for primary coordinator state', state='i') - zkhandler.writedata(zk_conn, {'/primary_node': myhostname}) + # Acquire an exclusive lock on the primary_node key + primary_lock = zkhandler.exclusivelock(zk_conn, '/primary_node') + try: + # This lock times out after 0.4s, which is 0.1s less than the pre-takeover + # timeout below, thus ensuring that a primary takeover will not deadlock + # against a node that failed the contention + primary_lock.acquire(timeout=0.4) + # Ensure when we get the lock that the versions are still consistent and that + # another node hasn't already acquired primary state + if key_version == zk_conn.get('/primary_node')[1].version: + zkhandler.writedata(zk_conn, {'/primary_node': myhostname}) + # Cleanly release the lock + primary_lock.release() + # We timed out acquiring a lock, which means we failed contention, so just pass + except kazoo.exceptions.LockTimeout: + pass elif new_primary == myhostname: + time.sleep(0.5) zkhandler.writedata(zk_conn, {'/nodes/{}/routerstate'.format(myhostname): 'takeover'}) else: - if this_node.router_state != 'secondary': + if this_node.router_state == 'primary': + time.sleep(0.5) zkhandler.writedata(zk_conn, {'/nodes/{}/routerstate'.format(myhostname): 'relinquish'}) else: zkhandler.writedata(zk_conn, {'/nodes/{}/routerstate'.format(myhostname): 'client'}) diff --git a/node-daemon/pvcnoded/NodeInstance.py b/node-daemon/pvcnoded/NodeInstance.py index 04be43e9..b71dc145 100644 --- a/node-daemon/pvcnoded/NodeInstance.py +++ b/node-daemon/pvcnoded/NodeInstance.py @@ -317,7 +317,7 @@ class NodeInstance(object): Acquire primary coordinator status from a peer node """ # Lock the primary node until transition is complete - primary_lock = zkhandler.writelock(self.zk_conn, '/primary_node') + primary_lock = zkhandler.exclusivelock(self.zk_conn, '/primary_node') primary_lock.acquire() # Ensure our lock key is populated diff --git a/node-daemon/pvcnoded/zkhandler.py b/node-daemon/pvcnoded/zkhandler.py index 94c6e293..138c9e15 100644 --- a/node-daemon/pvcnoded/zkhandler.py +++ b/node-daemon/pvcnoded/zkhandler.py @@ -130,3 +130,9 @@ def readlock(zk_conn, key): lock_id = str(uuid.uuid1()) lock = zk_conn.ReadLock('{}'.format(key), lock_id) return lock + +# Exclusive lock function +def exclusivelock(zk_conn, key): + lock_id = str(uuid.uuid1()) + lock = zk_conn.Lock('{}'.format(key), lock_id) + return lock