Simplify VM migration down to 3 steps
Remove two superfluous synchronization steps which are not needed here, since the exclusive lock handles that situation anyway. This still does not fix the weird flush->unflush lock timeout bug, but the bug is now better worked around, since cancelling the other wait frees this up and lets the migration continue.
This commit is contained in:
		| @@ -507,16 +507,9 @@ class VMInstance(object): | ||||
|                 aborted = True | ||||
|             else: | ||||
|                 migrate_shutdown() | ||||
|         lock.release() | ||||
|  | ||||
|         if aborted: | ||||
|             abort_migrate('Live migration failed and is required') | ||||
|             return | ||||
|  | ||||
|         time.sleep(0.5) | ||||
|  | ||||
|         self.logger.out('Acquiring lock for phase D', state='i', prefix='Domain {}'.format(self.domuuid)) | ||||
|         lock.acquire() | ||||
|         self.last_currentnode = self.zkhandler.read(('domain.node', self.domuuid)) | ||||
|         self.last_lastnode = self.zkhandler.read(('domain.last_node', self.domuuid)) | ||||
|         migrate_lock_node.release() | ||||
| @@ -541,7 +534,7 @@ class VMInstance(object): | ||||
|             (('domain.migrate.sync_lock', self.domuuid), self.domuuid) | ||||
|         ]) | ||||
|  | ||||
|         self.logger.out('Acquiring lock for phase A', state='i', prefix='Domain {}'.format(self.domuuid)) | ||||
|         self.logger.out('Acquiring lock for migrate synchronization', state='i', prefix='Domain {}'.format(self.domuuid)) | ||||
|         lock = self.zkhandler.exclusivelock(('domain.migrate.sync_lock', self.domuuid)) | ||||
|         try: | ||||
|             lock.acquire(timeout=30.0) | ||||
| @@ -552,23 +545,7 @@ class VMInstance(object): | ||||
|         time.sleep(1) | ||||
|         lock.release() | ||||
|  | ||||
|         time.sleep(0.5) | ||||
|  | ||||
|         self.logger.out('Acquiring lock for phase C', state='i', prefix='Domain {}'.format(self.domuuid)) | ||||
|         try: | ||||
|             # Wait for only 900 seconds on this step since we don't do anything and it can fail | ||||
|             # if a flush or unflush is cancelled. 900 seconds should be plenty for real long | ||||
|             # migations while still avoiding an indefinite blocking here. | ||||
|             # TODO: Really dig into why | ||||
|             lock.acquire(timeout=900) | ||||
|             # This is strictly a synchronizng step | ||||
|             lock.release() | ||||
|         except Exception: | ||||
|             self.logger.out('Failed to acquire lock for phase C within 15 minutes, continuing', state='w', prefix='Domain {}'.format(self.domuuid)) | ||||
|  | ||||
|         time.sleep(0.5) | ||||
|  | ||||
|         self.logger.out('Acquiring lock for phase E', state='i', prefix='Domain {}'.format(self.domuuid)) | ||||
|         lock.acquire() | ||||
|         # Set the updated data | ||||
|         self.last_currentnode = self.zkhandler.read(('domain.node', self.domuuid)) | ||||
|   | ||||
		Reference in New Issue
	
	Block a user