From c2a473ed8bf4e2220d882d104ff3ffbda0607eab Mon Sep 17 00:00:00 2001
From: "Joshua M. Boniface" <joshua@boniface.me>
Date: Mon, 27 Sep 2021 00:03:20 -0400
Subject: [PATCH] Simplify VM migration down to 3 steps

Remove two superfluous synchronization steps that are not needed here,
since the exclusive lock handles that situation anyway.

This still does not fix the weird flush->unflush lock timeout bug, but
the bug is better worked around now, since cancelling the other wait
frees this up and lets it continue.
---
 node-daemon/pvcnoded/objects/VMInstance.py | 25 +---------------------
 1 file changed, 1 insertion(+), 24 deletions(-)

diff --git a/node-daemon/pvcnoded/objects/VMInstance.py b/node-daemon/pvcnoded/objects/VMInstance.py
index 33eaed8b..11f80cb1 100644
--- a/node-daemon/pvcnoded/objects/VMInstance.py
+++ b/node-daemon/pvcnoded/objects/VMInstance.py
@@ -507,16 +507,9 @@ class VMInstance(object):
                 aborted = True
             else:
                 migrate_shutdown()
-        lock.release()
-
         if aborted:
             abort_migrate('Live migration failed and is required')
             return
-
-        time.sleep(0.5)
-
-        self.logger.out('Acquiring lock for phase D', state='i', prefix='Domain {}'.format(self.domuuid))
-        lock.acquire()
         self.last_currentnode = self.zkhandler.read(('domain.node', self.domuuid))
         self.last_lastnode = self.zkhandler.read(('domain.last_node', self.domuuid))
         migrate_lock_node.release()
@@ -541,7 +534,7 @@ class VMInstance(object):
             (('domain.migrate.sync_lock', self.domuuid), self.domuuid)
         ])
 
-        self.logger.out('Acquiring lock for phase A', state='i', prefix='Domain {}'.format(self.domuuid))
+        self.logger.out('Acquiring lock for migrate synchronization', state='i', prefix='Domain {}'.format(self.domuuid))
         lock = self.zkhandler.exclusivelock(('domain.migrate.sync_lock', self.domuuid))
         try:
             lock.acquire(timeout=30.0)
@@ -552,23 +545,7 @@ class VMInstance(object):
         time.sleep(1)
         lock.release()
 
-        time.sleep(0.5)
-
         self.logger.out('Acquiring lock for phase C', state='i', prefix='Domain {}'.format(self.domuuid))
-        try:
-            # Wait for only 900 seconds on this step since we don't do anything and it can fail
-            # if a flush or unflush is cancelled. 900 seconds should be plenty for real long
-            # migations while still avoiding an indefinite blocking here.
-            # TODO: Really dig into why
-            lock.acquire(timeout=900)
-            # This is strictly a synchronizng step
-            lock.release()
-        except Exception:
-            self.logger.out('Failed to acquire lock for phase C within 15 minutes, continuing', state='w', prefix='Domain {}'.format(self.domuuid))
-
-        time.sleep(0.5)
-
-        self.logger.out('Acquiring lock for phase E', state='i', prefix='Domain {}'.format(self.domuuid))
         lock.acquire()
         # Set the updated data
         self.last_currentnode = self.zkhandler.read(('domain.node', self.domuuid))