From 0d72798814682888cdc283b4fef47aae4de8fcde Mon Sep 17 00:00:00 2001 From: "Joshua M. Boniface" Date: Sun, 26 Sep 2021 23:24:23 -0400 Subject: [PATCH] Work around synchronization lock issues Make the block on stage C only wait for 900 seconds (15 minutes) to prevent indefinite blocking. The issue comes if a VM is being received, and the current unflush is cancelled for a flush. When this happens, this lock acquisition seems to block for no obvious reason, and no other changes seem to affect it. This is certainly some sort of locking bug within Kazoo but I can't diagnose it as-is. Leave a TODO to look into this again in the future. --- node-daemon/pvcnoded/objects/VMInstance.py | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/node-daemon/pvcnoded/objects/VMInstance.py b/node-daemon/pvcnoded/objects/VMInstance.py index ec41e9c3..33eaed8b 100644 --- a/node-daemon/pvcnoded/objects/VMInstance.py +++ b/node-daemon/pvcnoded/objects/VMInstance.py @@ -555,9 +555,16 @@ class VMInstance(object): time.sleep(0.5) self.logger.out('Acquiring lock for phase C', state='i', prefix='Domain {}'.format(self.domuuid)) - lock.acquire() - # This is strictly a synchronizng step - lock.release() + try: + # Wait for only 900 seconds on this step since we don't do anything and it can fail + # if a flush or unflush is cancelled. 900 seconds should be plenty for real long + # migrations while still avoiding an indefinite blocking here. + # TODO: Really dig into why + lock.acquire(timeout=900) + # This is strictly a synchronizing step + lock.release() + except Exception: + self.logger.out('Failed to acquire lock for phase C within 15 minutes, continuing', state='w', prefix='Domain {}'.format(self.domuuid)) time.sleep(0.5)