Simplify VM migration down to 3 steps

Remove two superfluous synchronization steps that are not needed here, since the exclusive lock already handles that situation. This still does not fix the weird flush->unflush lock timeout bug, but the bug is now better worked around: cancelling the other wait frees this one up and lets it continue.
2021-09-27 00:03:20 -04:00
parent 0d72798814
commit 55221b3d97
1 changed file with 1 addition and 24 deletions
--- a/node-daemon/pvcnoded/objects/VMInstance.py
+++ b/node-daemon/pvcnoded/objects/VMInstance.py
@@ -507,16 +507,9 @@ class VMInstance(object):
                aborted = True
            else:
                migrate_shutdown()
-        lock.release()
-
        if aborted:
            abort_migrate('Live migration failed and is required')
            return
-
-        time.sleep(0.5)
-
-        self.logger.out('Acquiring lock for phase D', state='i', prefix='Domain {}'.format(self.domuuid))
-        lock.acquire()
        self.last_currentnode = self.zkhandler.read(('domain.node', self.domuuid))
        self.last_lastnode = self.zkhandler.read(('domain.last_node', self.domuuid))
        migrate_lock_node.release()
@@ -541,7 +534,7 @@ class VMInstance(object):
            (('domain.migrate.sync_lock', self.domuuid), self.domuuid)
        ])

-        self.logger.out('Acquiring lock for phase A', state='i', prefix='Domain {}'.format(self.domuuid))
+        self.logger.out('Acquiring lock for migrate synchronization', state='i', prefix='Domain {}'.format(self.domuuid))
        lock = self.zkhandler.exclusivelock(('domain.migrate.sync_lock', self.domuuid))
        try:
            lock.acquire(timeout=30.0)
@@ -552,23 +545,7 @@ class VMInstance(object):
        time.sleep(1)
        lock.release()

-        time.sleep(0.5)
-
        self.logger.out('Acquiring lock for phase C', state='i', prefix='Domain {}'.format(self.domuuid))
-        try:
-            # Wait for only 900 seconds on this step since we don't do anything and it can fail
-            # if a flush or unflush is cancelled. 900 seconds should be plenty for real long
-            # migations while still avoiding an indefinite blocking here.
-            # TODO: Really dig into why
-            lock.acquire(timeout=900)
-            # This is strictly a synchronizng step
-            lock.release()
-        except Exception:
-            self.logger.out('Failed to acquire lock for phase C within 15 minutes, continuing', state='w', prefix='Domain {}'.format(self.domuuid))
-
-        time.sleep(0.5)
-
-        self.logger.out('Acquiring lock for phase E', state='i', prefix='Domain {}'.format(self.domuuid))
        lock.acquire()
        # Set the updated data
        self.last_currentnode = self.zkhandler.read(('domain.node', self.domuuid))