Better handle aborting migrations

This commit is contained in:
Joshua Boniface 2020-10-20 15:22:16 -04:00
parent 567fe8f36b
commit abfe0108ab
1 changed files with 28 additions and 6 deletions

View File

@ -364,6 +364,15 @@ class VMInstance(object):
self.inmigrate = True
self.logger.out('Migrating VM to node "{}"'.format(self.node), state='i', prefix='Domain {}'.format(self.domuuid))
aborted = False
def abort_migrate():
zkhandler.writedata(self.zk_conn, {
'/domains/{}/state'.format(self.domuuid): 'start',
'/domains/{}/node'.format(self.domuuid): self.this_node.name,
'/domains/{}/lastnode'.format(self.domuuid): ''
})
# Acquire exclusive lock on the domain node key
migrate_lock = zkhandler.exclusivelock(self.zk_conn, '/domains/{}/node'.format(self.domuuid))
migrate_lock.acquire()
@ -375,10 +384,23 @@ class VMInstance(object):
self.logger.out('Acquiring read lock for synchronization phase A', state='i', prefix='Domain {}'.format(self.domuuid))
lock.acquire()
self.logger.out('Acquired read lock for synchronization phase A', state='o', prefix='Domain {}'.format(self.domuuid))
if zkhandler.readdata(self.zk_conn, '/locks/domain_migrate') == '':
self.logger.out('Waiting for peer', state='i', prefix='Domain {}'.format(self.domuuid))
ticks = 0
while zkhandler.readdata(self.zk_conn, '/locks/domain_migrate') == '':
time.sleep(0.1)
ticks += 1
if ticks > 300:
self.logger.out('Timed out waiting 30s for peer, aborting migration', state='e', prefix='Domain {}'.format(self.domuuid))
abort_migrate()
aborted = True
self.logger.out('Releasing read lock for synchronization phase A', state='i', prefix='Domain {}'.format(self.domuuid))
lock.release()
self.logger.out('Released read lock for synchronization phase A', state='o', prefix='Domain {}'.format(self.domuuid))
if aborted:
return
# Synchronize nodes B (I am writer)
lock = zkhandler.writelock(self.zk_conn, '/locks/domain_migrate')
self.logger.out('Acquiring write lock for synchronization phase B', state='i', prefix='Domain {}'.format(self.domuuid))
@ -431,11 +453,8 @@ class VMInstance(object):
if not migrate_live_result:
if force_live:
self.logger.out('Could not live migrate VM; live migration enforced, aborting', state='e', prefix='Domain {}'.format(self.domuuid))
zkhandler.writedata(self.zk_conn, {
'/domains/{}/state'.format(self.domuuid): 'start',
'/domains/{}/node'.format(self.domuuid): self.this_node.name,
'/domains/{}/lastnode'.format(self.domuuid): ''
})
abort_migrate()
aborted = True
else:
do_migrate_shutdown = True
@ -444,6 +463,9 @@ class VMInstance(object):
lock.release()
self.logger.out('Released write lock for synchronization phase B', state='o')
if aborted:
return
# Synchronize nodes C (I am writer)
lock = zkhandler.writelock(self.zk_conn, '/locks/domain_migrate')
self.logger.out('Acquiring write lock for synchronization phase C', state='i', prefix='Domain {}'.format(self.domuuid))
@ -554,7 +576,7 @@ class VMInstance(object):
zkhandler.writedata(self.zk_conn, { '/domains/{}/state'.format(self.domuuid): 'start' })
else:
# The send failed catastrophically
self.logger.out('VM in undefined state: {}'.format(self.state), state='e', prefix='Domain {}'.format(self.domuuid))
self.logger.out('Migrate aborted or failed; VM in state {}'.format(self.state), state='w', prefix='Domain {}'.format(self.domuuid))
self.logger.out('Releasing write lock for synchronization phase D', state='i', prefix='Domain {}'.format(self.domuuid))
zkhandler.writedata(self.zk_conn, { '/locks/domain_migrate': '' })