From c2a473ed8bf4e2220d882d104ff3ffbda0607eab Mon Sep 17 00:00:00 2001 From: "Joshua M. Boniface" Date: Mon, 27 Sep 2021 00:03:20 -0400 Subject: [PATCH] Simplify VM migration down to 3 steps Remove two superfluous synchronization steps which are not needed here, since the exclusive lock already handles that situation anyway. This still does not fix the weird flush->unflush lock timeout bug, but the bug is better worked around now, because cancelling the other wait frees this up and lets it continue. --- node-daemon/pvcnoded/objects/VMInstance.py | 25 +--------------------- 1 file changed, 1 insertion(+), 24 deletions(-) diff --git a/node-daemon/pvcnoded/objects/VMInstance.py b/node-daemon/pvcnoded/objects/VMInstance.py index 33eaed8b..11f80cb1 100644 --- a/node-daemon/pvcnoded/objects/VMInstance.py +++ b/node-daemon/pvcnoded/objects/VMInstance.py @@ -507,16 +507,9 @@ class VMInstance(object): aborted = True else: migrate_shutdown() - lock.release() - if aborted: abort_migrate('Live migration failed and is required') return - - time.sleep(0.5) - - self.logger.out('Acquiring lock for phase D', state='i', prefix='Domain {}'.format(self.domuuid)) - lock.acquire() self.last_currentnode = self.zkhandler.read(('domain.node', self.domuuid)) self.last_lastnode = self.zkhandler.read(('domain.last_node', self.domuuid)) migrate_lock_node.release() @@ -541,7 +534,7 @@ class VMInstance(object): (('domain.migrate.sync_lock', self.domuuid), self.domuuid) ]) - self.logger.out('Acquiring lock for phase A', state='i', prefix='Domain {}'.format(self.domuuid)) + self.logger.out('Acquiring lock for migrate synchronization', state='i', prefix='Domain {}'.format(self.domuuid)) lock = self.zkhandler.exclusivelock(('domain.migrate.sync_lock', self.domuuid)) try: lock.acquire(timeout=30.0) @@ -552,23 +545,7 @@ class VMInstance(object): time.sleep(1) lock.release() - time.sleep(0.5) - self.logger.out('Acquiring lock for phase C', state='i', prefix='Domain {}'.format(self.domuuid)) - try: - # Wait for only 900
seconds on this step since we don't do anything and it can fail - # if a flush or unflush is cancelled. 900 seconds should be plenty for real long - # migations while still avoiding an indefinite blocking here. - # TODO: Really dig into why - lock.acquire(timeout=900) - # This is strictly a synchronizng step - lock.release() - except Exception: - self.logger.out('Failed to acquire lock for phase C within 15 minutes, continuing', state='w', prefix='Domain {}'.format(self.domuuid)) - - time.sleep(0.5) - - self.logger.out('Acquiring lock for phase E', state='i', prefix='Domain {}'.format(self.domuuid)) lock.acquire() # Set the updated data self.last_currentnode = self.zkhandler.read(('domain.node', self.domuuid))