Revamp VM migration handling
This was very old code that was hard to follow and quite fragile, with failures and infinite loops occurring fairly frequently. These changes make it more robust: timeouts are added to the wait loops, the logical flow is straightened out, and the code is generally cleaned up. The libvirt migration is also forced onto the cluster network, which depends on the libvirtd listen configuration (managed via pvc-ansible) and on the Daemon.py changes in the previous commit.
This commit is contained in:
parent d336fce253
commit 08cb16bfbc
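
Note (not part of this commit): the mechanism behind "force the migration onto the cluster network" is the uri argument of libvirt's virDomain.migrate(). With uri=None, libvirt derives the data-channel address from the destination's hostname, which may not resolve onto the desired network; passing an explicit tcp:// URI pins the stream to the cluster-network address. This presumes each node's libvirtd accepts TCP connections on that network (the listen configuration referenced above, handled by pvc-ansible). A minimal standalone sketch, with a placeholder hostname and UUID:

    import libvirt

    vm_uuid = '00000000-0000-0000-0000-000000000000'   # placeholder
    dest_host = 'hv2.cluster.local'                    # placeholder cluster-network FQDN

    src_conn = libvirt.open('qemu:///system')
    dom = src_conn.lookupByUUIDString(vm_uuid)

    # Control connection to the destination libvirtd over TCP
    dest_conn = libvirt.open('qemu+tcp://{}/system'.format(dest_host))

    # migrate(dconn, flags, dname, uri, bandwidth): the uri argument forces
    # the migration data stream onto this address instead of letting libvirt
    # choose one itself
    target_dom = dom.migrate(dest_conn, libvirt.VIR_MIGRATE_LIVE, None,
                             'tcp://{}'.format(dest_host), 0)

    dest_conn.close()
    src_conn.close()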
@@ -236,51 +236,61 @@ class DomainInstance(object):
         self.logger.out('Gracefully stopping VM', state='i', prefix='Domain {}:'.format(self.domuuid))
         self.inshutdown = True
         self.dom.shutdown()
-        try:
-            tick = 0
-            while self.dom.state()[0] == libvirt.VIR_DOMAIN_RUNNING and tick < 60:
-                tick += 1
-                time.sleep(0.5)
-
-            if tick >= 60:
-                self.logger.out('Shutdown timeout expired', state='e', prefix='Domain {}:'.format(self.domuuid))
-                self.stop_vm()
-                self.inshutdown = False
-                return
-        except:
-            pass
-
-        self.removeDomainFromList()
-
-        if self.inrestart == False:
-            zkhandler.writedata(self.zk_conn, { '/domains/{}/state'.format(self.domuuid): 'stop' })
-
-        self.logger.out('Successfully shutdown VM', state='o', prefix='Domain {}:'.format(self.domuuid))
-        self.dom = None
-        self.inshutdown = False
-
-        # Stop the log watcher
-        self.console_log_instance.stop()
+        tick = 0
+        while True:
+            tick += 1
+            time.sleep(1)
+
+            try:
+                lvdomstate = self.dom.state()[0]
+            except:
+                lvdomstate = None
+
+            if lvdomstate != libvirt.VIR_DOMAIN_RUNNING:
+                self.removeDomainFromList()
+
+                zkhandler.writedata(self.zk_conn, { '/domains/{}/state'.format(self.domuuid): 'stop' })
+
+                self.logger.out('Successfully shutdown VM', state='o', prefix='Domain {}:'.format(self.domuuid))
+                self.dom = None
+
+                # Stop the log watcher
+                self.console_log_instance.stop()
+                break
+
+            # HARDCODE: 90s is a reasonable amount of time for any operating system to shut down cleanly
+            if tick >= 90:
+                self.logger.out('Shutdown timeout expired', state='e', prefix='Domain {}:'.format(self.domuuid))
+                zkhandler.writedata(self.zk_conn, { '/domains/{}/state'.format(self.domuuid): 'stop' })
+                break
+
+        self.inshutdown = False
+
+        if self.inrestart:
+            # Wait to prevent race conditions
+            time.sleep(1)
+            zkhandler.writedata(self.zk_conn, { '/domains/{}/state'.format(self.domuuid): 'start' })
 
-    def live_migrate_vm(self, dest_node):
+    def live_migrate_vm(self):
+        dest_lv = 'qemu+tcp://{}.{}/system'.format(self.node, self.config['cluster_domain'])
+        dest_tcp = 'tcp://{}.{}'.format(self.node, self.config['cluster_domain'])
         try:
-            dest_lv_conn = libvirt.open('qemu+tcp://{}/system'.format(self.node))
-            if dest_lv_conn == None:
+            # Open a connection to the destination
+            dest_lv_conn = libvirt.open(dest_lv)
+            if not dest_lv_conn:
                 raise
         except:
-            self.logger.out('Failed to open connection to qemu+tcp://{}/system; aborting migration.'.format(self.node), state='e', prefix='Domain {}:'.format(self.domuuid))
+            self.logger.out('Failed to open connection to {}; aborting live migration.'.format(dest_lv), state='e', prefix='Domain {}:'.format(self.domuuid))
            return False
 
         try:
-            target_dom = self.dom.migrate(dest_lv_conn, libvirt.VIR_MIGRATE_LIVE, None, None, 0)
-            if target_dom == None:
+            # Send the live migration; force the destination URI to ensure we transit over the cluster network
+            target_dom = self.dom.migrate(dest_lv_conn, libvirt.VIR_MIGRATE_LIVE, None, dest_tcp, 0)
+            if not target_dom:
                 raise
-            self.logger.out('Successfully migrated VM', state='o', prefix='Domain {}:'.format(self.domuuid))
-
         except:
+            self.logger.out('Failed to send VM to {}; aborting live migration.'.format(dest_lv), state='e', prefix='Domain {}:'.format(self.domuuid))
             dest_lv_conn.close()
             return False
 
+        self.logger.out('Successfully migrated VM', state='o', prefix='Domain {}:'.format(self.domuuid))
         dest_lv_conn.close()
         return True
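Note (not part of this commit): the tick loops introduced in this hunk all share the same poll-with-timeout shape; the old code either bounded the wait too tightly (60 half-second ticks) or, as the commit message notes, could loop forever. Isolated as an illustrative helper (the commit deliberately inlines the pattern rather than factoring it out):

    import time

    def wait_for(predicate, timeout, interval=1):
        # Poll once per interval; return True as soon as the predicate
        # holds, or False once timeout seconds have elapsed
        tick = 0
        while True:
            time.sleep(interval)
            tick += interval
            if predicate():
                return True
            if tick >= timeout:
                return False

In shutdown_vm() above, this is effectively wait_for("VM no longer running", 90); the receive side below reuses the same shape with 90s and 120s limits.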
@@ -289,53 +299,94 @@ class DomainInstance(object):
         self.inmigrate = True
         self.logger.out('Migrating VM to node "{}"'.format(self.node), state='i', prefix='Domain {}:'.format(self.domuuid))
 
-        migrate_ret = self.live_migrate_vm(self.node)
+        migrate_ret = self.live_migrate_vm()
         if not migrate_ret:
             self.logger.out('Could not live migrate VM; shutting down to migrate instead', state='e', prefix='Domain {}:'.format(self.domuuid))
-            self.shutdown_vm()
+            zkhandler.writedata(self.zk_conn, { '/domains/{}/state'.format(self.domuuid): 'shutdown' })
         else:
             self.removeDomainFromList()
 
-        zkhandler.writedata(self.zk_conn, { '/domains/{}/state'.format(self.domuuid): 'start' })
-        self.inmigrate = False
-
         # Stop the log watcher
         self.console_log_instance.stop()
 
+        self.inmigrate = False
+
     # Receive the migration from another host (wait until VM is running)
     def receive_migrate(self):
-        # Start the log watcher
-        self.console_log_instance.start()
-
         self.inreceive = True
+        live_receive = True
+        tick = 0
         self.logger.out('Receiving migration', state='i', prefix='Domain {}:'.format(self.domuuid))
         while True:
+            # Wait 1 second and increment the tick
             time.sleep(1)
+            tick += 1
+
+            # Get zookeeper state and look for the VM in the local libvirt database
             self.state = zkhandler.readdata(self.zk_conn, '/domains/{}/state'.format(self.domuuid))
             self.dom = self.lookupByUUID(self.domuuid)
 
-            if self.dom == None and self.state == 'migrate':
-                continue
-
-            if self.state != 'migrate':
-                break
-
-            try:
-                if self.dom.state()[0] == libvirt.VIR_DOMAIN_RUNNING:
-                    break
-            except:
-                continue
-
-        try:
-            dom_state = self.dom.state()[0]
-        except AttributeError:
-            dom_state = None
-
-        if dom_state == libvirt.VIR_DOMAIN_RUNNING:
-            self.addDomainToList()
-            self.logger.out('Successfully received migrated VM', state='o', prefix='Domain {}:'.format(self.domuuid))
-        else:
-            self.logger.out('Failed to receive migrated VM', state='e', prefix='Domain {}:'.format(self.domuuid))
+            # If the dom is found
+            if self.dom:
+                lvdomstate = self.dom.state()[0]
+                if lvdomstate == libvirt.VIR_DOMAIN_RUNNING:
+                    # VM has been received and started
+                    self.addDomainToList()
+                    zkhandler.writedata(self.zk_conn, { '/domains/{}/state'.format(self.domuuid): 'start' })
+                    self.logger.out('Successfully received migrated VM', state='o', prefix='Domain {}:'.format(self.domuuid))
+                    break
+                else:
+                    # If the state is no longer migrate
+                    if self.state != 'migrate':
+                        # The receive was aborted before it timed out or was completed
+                        self.logger.out('Receive aborted via state change', state='w', prefix='Domain {}:'.format(self.domuuid))
+                        break
+            # If the dom is not found
+            else:
+                # If the state is changed to shutdown or stop
+                if self.state == 'shutdown' or self.state == 'stop':
+                    # The receive failed on the remote end, and VM is being shut down instead
+                    live_receive = False
+                    self.logger.out('Send failed on remote end', state='w', prefix='Domain {}:'.format(self.domuuid))
+                    break
+
+            # If we've already been waiting 90s for a receive
+            # HARDCODE: 90s should be plenty of time for even extremely large VMs on reasonable networks
+            if tick > 90:
+                # The receive timed out
+                zkhandler.writedata(self.zk_conn, { '/domains/{}/state'.format(self.domuuid): 'failed' })
+                self.logger.out('Receive timed out without state change', state='e', prefix='Domain {}:'.format(self.domuuid))
+                break
+
+        # We are waiting on a shutdown
+        if not live_receive:
+            tick = 0
+            self.logger.out('Waiting for VM to shut down on remote end', state='i', prefix='Domain {}:'.format(self.domuuid))
+            while True:
+                # Wait 1 second and increment the tick
+                time.sleep(1)
+                tick += 1
+
+                # Get zookeeper state and look for the VM in the local libvirt database
+                self.state = zkhandler.readdata(self.zk_conn, '/domains/{}/state'.format(self.domuuid))
+
+                # If the VM has stopped
+                if self.state == 'stop':
+                    # Wait one more second to avoid race conditions
+                    time.sleep(1)
+                    # Start the VM up
+                    zkhandler.writedata(self.zk_conn, { '/domains/{}/state'.format(self.domuuid): 'start' })
+                    break
+
+                # If we've already been waiting 120s for a shutdown
+                # HARDCODE: The remote timeout is 90s, so an extra 30s of buffer
+                if tick > 120:
+                    # The shutdown timed out; something is very amiss, so switch state to failed and abort
+                    zkhandler.writedata(self.zk_conn, {
+                        '/domains/{}/state'.format(self.domuuid): 'failed',
+                        '/domains/{}/failedreason'.format(self.domuuid): 'Timeout waiting for migrate or shutdown'
+                    })
+                    self.logger.out('Shutdown timed out without state change', state='e', prefix='Domain {}:'.format(self.domuuid))
+                    break
 
         self.inreceive = False
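Note (not part of this commit): two details of the receive path are worth calling out. First, the timeouts are deliberately coupled: the sender's shutdown_vm() gives the guest 90s, the receiver gives a live migration 90s, and the fallback shutdown wait gets 120s (the remote 90s plus 30s of buffer), so the receiver outlives the sender before declaring 'failed'. Second, the failure write updates two keys at once ('state' and 'failedreason'). zkhandler.readdata/writedata are pvc's own wrappers and their implementation is not shown here; assuming they sit on a Kazoo client, a minimal equivalent that keeps such multi-key writes atomic would be:

    from kazoo.client import KazooClient

    def readdata(zk_conn, key):
        # Read a single key's value as a string
        data, stat = zk_conn.get(key)
        return data.decode('ascii')

    def writedata(zk_conn, kvpairs):
        # Apply all key/value updates in one ZooKeeper transaction so that
        # e.g. 'state' and 'failedreason' can never be observed half-written
        transaction = zk_conn.transaction()
        for key, value in kvpairs.items():
            transaction.set_data(key, value.encode('ascii'))
        results = transaction.commit()
        return all(not isinstance(result, Exception) for result in results)

    # Usage sketch (placeholder host)
    zk_conn = KazooClient(hosts='127.0.0.1:2181')
    zk_conn.start()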
@@ -430,7 +481,10 @@ class DomainInstance(object):
             # VM should be migrated away from this node
             if self.state == "migrate":
                 self.migrate_vm()
-            # VM should be terminated
+            # VM should be shutdown gracefully
+            elif self.state == 'shutdown':
+                self.shutdown_vm()
+            # VM should be forcibly terminated
             else:
                 self.terminate_vm()