Simplify VM migration down to 3 steps

Remove two superfluous synchronization steps, which are not needed here
since the exclusive lock already handles that situation.

This still does not fix the weird flush->unflush lock timeout bug, but the
bug is better worked around now, because cancelling the other wait frees
this one up and lets it continue.
This commit is contained in:
Joshua Boniface 2021-09-27 00:03:20 -04:00
parent 0d72798814
commit 55221b3d97
1 changed file with 1 addition and 24 deletions

View File

@@ -507,16 +507,9 @@ class VMInstance(object):
aborted = True aborted = True
else: else:
migrate_shutdown() migrate_shutdown()
lock.release()
if aborted: if aborted:
abort_migrate('Live migration failed and is required') abort_migrate('Live migration failed and is required')
return return
time.sleep(0.5)
self.logger.out('Acquiring lock for phase D', state='i', prefix='Domain {}'.format(self.domuuid))
lock.acquire()
self.last_currentnode = self.zkhandler.read(('domain.node', self.domuuid)) self.last_currentnode = self.zkhandler.read(('domain.node', self.domuuid))
self.last_lastnode = self.zkhandler.read(('domain.last_node', self.domuuid)) self.last_lastnode = self.zkhandler.read(('domain.last_node', self.domuuid))
migrate_lock_node.release() migrate_lock_node.release()
@@ -541,7 +534,7 @@ class VMInstance(object):
(('domain.migrate.sync_lock', self.domuuid), self.domuuid) (('domain.migrate.sync_lock', self.domuuid), self.domuuid)
]) ])
self.logger.out('Acquiring lock for phase A', state='i', prefix='Domain {}'.format(self.domuuid)) self.logger.out('Acquiring lock for migrate synchronization', state='i', prefix='Domain {}'.format(self.domuuid))
lock = self.zkhandler.exclusivelock(('domain.migrate.sync_lock', self.domuuid)) lock = self.zkhandler.exclusivelock(('domain.migrate.sync_lock', self.domuuid))
try: try:
lock.acquire(timeout=30.0) lock.acquire(timeout=30.0)
@@ -552,23 +545,7 @@ class VMInstance(object):
time.sleep(1) time.sleep(1)
lock.release() lock.release()
time.sleep(0.5)
self.logger.out('Acquiring lock for phase C', state='i', prefix='Domain {}'.format(self.domuuid)) self.logger.out('Acquiring lock for phase C', state='i', prefix='Domain {}'.format(self.domuuid))
try:
# Wait for only 900 seconds on this step since we don't do anything and it can fail
# if a flush or unflush is cancelled. 900 seconds should be plenty for real long
# migations while still avoiding an indefinite blocking here.
# TODO: Really dig into why
lock.acquire(timeout=900)
# This is strictly a synchronizng step
lock.release()
except Exception:
self.logger.out('Failed to acquire lock for phase C within 15 minutes, continuing', state='w', prefix='Domain {}'.format(self.domuuid))
time.sleep(0.5)
self.logger.out('Acquiring lock for phase E', state='i', prefix='Domain {}'.format(self.domuuid))
lock.acquire() lock.acquire()
# Set the updated data # Set the updated data
self.last_currentnode = self.zkhandler.read(('domain.node', self.domuuid)) self.last_currentnode = self.zkhandler.read(('domain.node', self.domuuid))