Allow enforcement of live migration

Provides a CLI and API argument to force live migration, which sets a
new VM state, "migrate-live". During a migration, the node daemon's
VMInstance reads this flag from the state and, if live migration is
enforced, does not fall back to a shutdown-based migration.

Closes #95
This commit is contained in:
Joshua Boniface 2020-06-06 11:49:21 -04:00
parent b5434ba744
commit ce60836c34
6 changed files with 72 additions and 32 deletions

View File

@ -1365,7 +1365,8 @@ class API_VM_Node(Resource):
{ 'name': 'action', 'choices': ('migrate', 'unmigrate', 'move'), 'helptext': "A valid action must be specified", 'required': True }, { 'name': 'action', 'choices': ('migrate', 'unmigrate', 'move'), 'helptext': "A valid action must be specified", 'required': True },
{ 'name': 'node' }, { 'name': 'node' },
{ 'name': 'force' }, { 'name': 'force' },
{ 'name': 'wait' } { 'name': 'wait' },
{ 'name': 'force_live' }
]) ])
@Authenticator @Authenticator
def post(self, vm, reqargs): def post(self, vm, reqargs):
@ -1396,6 +1397,10 @@ class API_VM_Node(Resource):
name: wait name: wait
type: boolean type: boolean
description: Whether to block waiting for the migration to complete description: Whether to block waiting for the migration to complete
- in: query
name: force_live
type: boolean
description: Whether to enforce live migration and disable shutdown-based fallback migration
responses: responses:
200: 200:
description: OK description: OK
@ -1412,13 +1417,14 @@ class API_VM_Node(Resource):
node = reqargs.get('node', None) node = reqargs.get('node', None)
force = bool(strtobool(reqargs.get('force', 'false'))) force = bool(strtobool(reqargs.get('force', 'false')))
wait = bool(strtobool(reqargs.get('wait', 'false'))) wait = bool(strtobool(reqargs.get('wait', 'false')))
force_live = bool(strtobool(reqargs.get('force_live', 'false')))
if action == 'move': if action == 'move':
return api_helper.vm_move(vm, node, wait) return api_helper.vm_move(vm, node, wait, force_live)
if action == 'migrate': if action == 'migrate':
return api_helper.vm_migrate(vm, node, force, wait) return api_helper.vm_migrate(vm, node, force, wait, force_live)
if action == 'unmigrate': if action == 'unmigrate':
return api_helper.vm_unmigrate(vm, wait) return api_helper.vm_unmigrate(vm, wait, force_live)
abort(400) abort(400)
api.add_resource(API_VM_Node, '/vm/<vm>/node') api.add_resource(API_VM_Node, '/vm/<vm>/node')

View File

@ -661,12 +661,12 @@ def vm_disable(name):
} }
return output, retcode return output, retcode
def vm_move(name, node, wait): def vm_move(name, node, wait, force_live):
""" """
Move a VM to another node. Move a VM to another node.
""" """
zk_conn = pvc_common.startZKConnection(config['coordinators']) zk_conn = pvc_common.startZKConnection(config['coordinators'])
retflag, retdata = pvc_vm.move_vm(zk_conn, name, node, wait) retflag, retdata = pvc_vm.move_vm(zk_conn, name, node, wait, force_live)
pvc_common.stopZKConnection(zk_conn) pvc_common.stopZKConnection(zk_conn)
if retflag: if retflag:
@ -679,12 +679,12 @@ def vm_move(name, node, wait):
} }
return output, retcode return output, retcode
def vm_migrate(name, node, flag_force, wait): def vm_migrate(name, node, flag_force, wait, force_live):
""" """
Temporarily migrate a VM to another node. Temporarily migrate a VM to another node.
""" """
zk_conn = pvc_common.startZKConnection(config['coordinators']) zk_conn = pvc_common.startZKConnection(config['coordinators'])
retflag, retdata = pvc_vm.migrate_vm(zk_conn, name, node, flag_force, wait) retflag, retdata = pvc_vm.migrate_vm(zk_conn, name, node, flag_force, wait, force_live)
pvc_common.stopZKConnection(zk_conn) pvc_common.stopZKConnection(zk_conn)
if retflag: if retflag:
@ -697,12 +697,12 @@ def vm_migrate(name, node, flag_force, wait):
} }
return output, retcode return output, retcode
def vm_unmigrate(name, wait): def vm_unmigrate(name, wait, force_live):
""" """
Unmigrate a migrated VM. Unmigrate a migrated VM.
""" """
zk_conn = pvc_common.startZKConnection(config['coordinators']) zk_conn = pvc_common.startZKConnection(config['coordinators'])
retflag, retdata = pvc_vm.unmigrate_vm(zk_conn, name, wait) retflag, retdata = pvc_vm.unmigrate_vm(zk_conn, name, wait, force_live)
pvc_common.stopZKConnection(zk_conn) pvc_common.stopZKConnection(zk_conn)
if retflag: if retflag:

View File

@ -203,19 +203,20 @@ def vm_state(config, vm, target_state, wait=False):
return retstatus, response.json()['message'] return retstatus, response.json()['message']
def vm_node(config, vm, target_node, action, force=False, wait=False): def vm_node(config, vm, target_node, action, force=False, wait=False, force_live=False):
""" """
Modify the current node of VM via {action} Modify the current node of VM via {action}
API endpoint: POST /vm/{vm}/node API endpoint: POST /vm/{vm}/node
API arguments: node={target_node}, action={action}, force={force}, wait={wait} API arguments: node={target_node}, action={action}, force={force}, wait={wait}, force_live={force_live}
API schema: {"message":"{data}"} API schema: {"message":"{data}"}
""" """
params={ params={
'node': target_node, 'node': target_node,
'action': action, 'action': action,
'force': str(force).lower(), 'force': str(force).lower(),
'wait': str(wait).lower() 'wait': str(wait).lower(),
'force_live': str(force_live).lower()
} }
response = call_api(config, 'post', '/vm/{vm}/node'.format(vm=vm), params=params) response = call_api(config, 'post', '/vm/{vm}/node'.format(vm=vm), params=params)

View File

@ -868,13 +868,17 @@ def vm_disable(domain):
'-w', '--wait', 'wait', is_flag=True, default=False, '-w', '--wait', 'wait', is_flag=True, default=False,
help='Wait for migration to complete before returning.' help='Wait for migration to complete before returning.'
) )
@click.option(
'--force-live', 'force_live', is_flag=True, default=False,
help='Do not fall back to shutdown-based migration if live migration fails.'
)
@cluster_req @cluster_req
def vm_move(domain, target_node, wait): def vm_move(domain, target_node, wait, force_live):
""" """
Permanently move virtual machine DOMAIN, via live migration if running and possible, to another node. DOMAIN may be a UUID or name. Permanently move virtual machine DOMAIN, via live migration if running and possible, to another node. DOMAIN may be a UUID or name.
""" """
retcode, retmsg = pvc_vm.vm_node(config, domain, target_node, 'move', force=False, wait=wait) retcode, retmsg = pvc_vm.vm_node(config, domain, target_node, 'move', force=False, wait=wait, force_live=force_live)
cleanup(retcode, retmsg) cleanup(retcode, retmsg)
############################################################################### ###############################################################################
@ -896,13 +900,17 @@ def vm_move(domain, target_node, wait):
'-w', '--wait', 'wait', is_flag=True, default=False, '-w', '--wait', 'wait', is_flag=True, default=False,
help='Wait for migration to complete before returning.' help='Wait for migration to complete before returning.'
) )
@click.option(
'--force-live', 'force_live', is_flag=True, default=False,
help='Do not fall back to shutdown-based migration if live migration fails.'
)
@cluster_req @cluster_req
def vm_migrate(domain, target_node, force_migrate, wait): def vm_migrate(domain, target_node, force_migrate, wait, force_live):
""" """
Temporarily migrate running virtual machine DOMAIN, via live migration if possible, to another node. DOMAIN may be a UUID or name. If DOMAIN is not running, it will be started on the target node. Temporarily migrate running virtual machine DOMAIN, via live migration if possible, to another node. DOMAIN may be a UUID or name. If DOMAIN is not running, it will be started on the target node.
""" """
retcode, retmsg = pvc_vm.vm_node(config, domain, target_node, 'migrate', force=force_migrate, wait=wait) retcode, retmsg = pvc_vm.vm_node(config, domain, target_node, 'migrate', force=force_migrate, wait=wait, force_live=force_live)
cleanup(retcode, retmsg) cleanup(retcode, retmsg)
############################################################################### ###############################################################################
@ -916,13 +924,17 @@ def vm_migrate(domain, target_node, force_migrate, wait):
'-w', '--wait', 'wait', is_flag=True, default=False, '-w', '--wait', 'wait', is_flag=True, default=False,
help='Wait for migration to complete before returning.' help='Wait for migration to complete before returning.'
) )
@click.option(
'--force-live', 'force_live', is_flag=True, default=False,
help='Do not fall back to shutdown-based migration if live migration fails.'
)
@cluster_req @cluster_req
def vm_unmigrate(domain, wait): def vm_unmigrate(domain, wait, force_live):
""" """
Restore previously migrated virtual machine DOMAIN, via live migration if possible, to its original node. DOMAIN may be a UUID or name. If DOMAIN is not running, it will be started on the target node. Restore previously migrated virtual machine DOMAIN, via live migration if possible, to its original node. DOMAIN may be a UUID or name. If DOMAIN is not running, it will be started on the target node.
""" """
retcode, retmsg = pvc_vm.vm_node(config, domain, None, 'unmigrate', force=False, wait=wait) retcode, retmsg = pvc_vm.vm_node(config, domain, None, 'unmigrate', force=False, wait=wait, force_live=force_live)
cleanup(retcode, retmsg) cleanup(retcode, retmsg)
############################################################################### ###############################################################################

View File

@ -441,7 +441,7 @@ def disable_vm(zk_conn, domain):
return True, 'Marked VM "{}" as disable.'.format(domain) return True, 'Marked VM "{}" as disable.'.format(domain)
def move_vm(zk_conn, domain, target_node, wait=False): def move_vm(zk_conn, domain, target_node, wait=False, force_live=False):
# Validate that VM exists in cluster # Validate that VM exists in cluster
dom_uuid = getDomainUUID(zk_conn, domain) dom_uuid = getDomainUUID(zk_conn, domain)
if not dom_uuid: if not dom_uuid:
@ -453,7 +453,10 @@ def move_vm(zk_conn, domain, target_node, wait=False):
# If the current state isn't start, preserve it; we're not doing live migration # If the current state isn't start, preserve it; we're not doing live migration
target_state = current_state target_state = current_state
else: else:
target_state = 'migrate' if force_live:
target_state = 'migrate-live'
else:
target_state = 'migrate'
current_node = zkhandler.readdata(zk_conn, '/domains/{}/node'.format(dom_uuid)) current_node = zkhandler.readdata(zk_conn, '/domains/{}/node'.format(dom_uuid))
@ -497,7 +500,7 @@ def move_vm(zk_conn, domain, target_node, wait=False):
return True, retmsg return True, retmsg
def migrate_vm(zk_conn, domain, target_node, force_migrate, wait=False): def migrate_vm(zk_conn, domain, target_node, force_migrate, wait=False, force_live=False):
# Validate that VM exists in cluster # Validate that VM exists in cluster
dom_uuid = getDomainUUID(zk_conn, domain) dom_uuid = getDomainUUID(zk_conn, domain)
if not dom_uuid: if not dom_uuid:
@ -509,7 +512,10 @@ def migrate_vm(zk_conn, domain, target_node, force_migrate, wait=False):
# If the current state isn't start, preserve it; we're not doing live migration # If the current state isn't start, preserve it; we're not doing live migration
target_state = current_state target_state = current_state
else: else:
target_state = 'migrate' if force_live:
target_state = 'migrate-live'
else:
target_state = 'migrate'
current_node = zkhandler.readdata(zk_conn, '/domains/{}/node'.format(dom_uuid)) current_node = zkhandler.readdata(zk_conn, '/domains/{}/node'.format(dom_uuid))
last_node = zkhandler.readdata(zk_conn, '/domains/{}/lastnode'.format(dom_uuid)) last_node = zkhandler.readdata(zk_conn, '/domains/{}/lastnode'.format(dom_uuid))
@ -556,7 +562,7 @@ def migrate_vm(zk_conn, domain, target_node, force_migrate, wait=False):
return True, retmsg return True, retmsg
def unmigrate_vm(zk_conn, domain, wait=False): def unmigrate_vm(zk_conn, domain, wait=False, force_live=False):
# Validate that VM exists in cluster # Validate that VM exists in cluster
dom_uuid = getDomainUUID(zk_conn, domain) dom_uuid = getDomainUUID(zk_conn, domain)
if not dom_uuid: if not dom_uuid:
@ -568,7 +574,10 @@ def unmigrate_vm(zk_conn, domain, wait=False):
# If the current state isn't start, preserve it; we're not doing live migration # If the current state isn't start, preserve it; we're not doing live migration
target_state = current_state target_state = current_state
else: else:
target_state = 'migrate' if force_live:
target_state = 'migrate-live'
else:
target_state = 'migrate'
target_node = zkhandler.readdata(zk_conn, '/domains/{}/lastnode'.format(dom_uuid)) target_node = zkhandler.readdata(zk_conn, '/domains/{}/lastnode'.format(dom_uuid))

View File

@ -371,7 +371,7 @@ class VMInstance(object):
return True return True
# Migrate the VM to a target host # Migrate the VM to a target host
def migrate_vm(self): def migrate_vm(self, force_live=False):
# Don't try to migrate a node to itself, set back to start # Don't try to migrate a node to itself, set back to start
if self.node == self.lastnode: if self.node == self.lastnode:
zkhandler.writedata(self.zk_conn, { '/domains/{}/state'.format(self.domuuid): 'start' }) zkhandler.writedata(self.zk_conn, { '/domains/{}/state'.format(self.domuuid): 'start' })
@ -383,8 +383,16 @@ class VMInstance(object):
migrate_ret = self.live_migrate_vm() migrate_ret = self.live_migrate_vm()
if not migrate_ret: if not migrate_ret:
self.logger.out('Could not live migrate VM; shutting down to migrate instead', state='e', prefix='Domain {}:'.format(self.domuuid)) if force_live:
zkhandler.writedata(self.zk_conn, { '/domains/{}/state'.format(self.domuuid): 'shutdown' }) self.logger.out('Could not live migrate VM; live migration enforced, aborting', state='e', prefix='Domain {}:'.format(self.domuuid))
zkhandler.writedata(self.zk_conn, {
'/domains/{}/state'.format(self.domuuid): 'start',
'/domains/{}/node'.format(self.domuuid): self.this_node.name,
'/domains/{}/lastnode'.format(self.domuuid): ''
})
else:
self.logger.out('Could not live migrate VM; shutting down to migrate instead', state='e', prefix='Domain {}:'.format(self.domuuid))
zkhandler.writedata(self.zk_conn, { '/domains/{}/state'.format(self.domuuid): 'shutdown' })
else: else:
self.removeDomainFromList() self.removeDomainFromList()
# Stop the log watcher # Stop the log watcher
@ -418,7 +426,7 @@ class VMInstance(object):
break break
else: else:
# If the state is no longer migrate # If the state is no longer migrate
if self.state != 'migrate': if self.state not in ['migrate', 'migrate-live']:
# The receive was aborted before it timed out or was completed # The receive was aborted before it timed out or was completed
self.logger.out('Receive aborted via state change', state='w', prefix='Domain {}:'.format(self.domuuid)) self.logger.out('Receive aborted via state change', state='w', prefix='Domain {}:'.format(self.domuuid))
break break
@ -498,6 +506,7 @@ class VMInstance(object):
# Valid states are: # Valid states are:
# start # start
# migrate # migrate
# migrate-live
# restart # restart
# shutdown # shutdown
# stop # stop
@ -523,7 +532,7 @@ class VMInstance(object):
# Add domain to running list # Add domain to running list
self.addDomainToList() self.addDomainToList()
# VM is already running and should be but stuck in migrate state # VM is already running and should be but stuck in migrate state
elif self.state == "migrate": elif self.state == "migrate" or self.state == "migrate-live":
# Start the log watcher # Start the log watcher
self.console_log_instance.start() self.console_log_instance.start()
zkhandler.writedata(self.zk_conn, { '/domains/{}/state'.format(self.domuuid): 'start' }) zkhandler.writedata(self.zk_conn, { '/domains/{}/state'.format(self.domuuid): 'start' })
@ -544,7 +553,7 @@ class VMInstance(object):
# Start the domain # Start the domain
self.start_vm() self.start_vm()
# VM should be migrated to this node # VM should be migrated to this node
elif self.state == "migrate": elif self.state == "migrate" or self.state == "migrate-live":
# Receive the migration # Receive the migration
self.receive_migrate() self.receive_migrate()
# VM should be restarted (i.e. started since it isn't running) # VM should be restarted (i.e. started since it isn't running)
@ -566,7 +575,10 @@ class VMInstance(object):
if running == libvirt.VIR_DOMAIN_RUNNING: if running == libvirt.VIR_DOMAIN_RUNNING:
# VM should be migrated away from this node # VM should be migrated away from this node
if self.state == "migrate": if self.state == "migrate":
self.migrate_vm() self.migrate_vm(force_live=False)
# VM should be migrated away from this node, live only (no shutdown fallback)
elif self.state == "migrate-live":
self.migrate_vm(force_live=True)
# VM should be shutdown gracefully # VM should be shutdown gracefully
elif self.state == 'shutdown': elif self.state == 'shutdown':
self.shutdown_vm() self.shutdown_vm()