Don't try any shenanigans when updating NICs

Trying to do this on the VMInstance side had problems because we can't
differentiate the 3 types of migration there. So, just update this on
the API side and hope everything goes well.

This introduces an edge-case bug: if a VM is using a macvtap SR-IOV device,
and then tries to migrate, and the migration is aborted, the NIC lists
will be inconsistent.

When I revamp the VMInstance in the future, I should be able to correct
this, but for now we'll have to live with that edge case.
This commit is contained in:
Joshua Boniface 2021-06-22 00:00:50 -04:00
parent dc560c1dcb
commit 60e1da09dd
2 changed files with 11 additions and 19 deletions

View File

@ -635,10 +635,6 @@ def move_vm(zkhandler, domain, target_node, wait=False, force_live=False):
retmsg = 'Permanently migrating VM "{}" to node "{}".'.format(domain, target_node) retmsg = 'Permanently migrating VM "{}" to node "{}".'.format(domain, target_node)
if target_state not in ['migrate', 'migrate-live']:
# Update any SR-IOV NICs - with online migrations this is done by pvcnoded, but offline we must do it here
update_vm_sriov_nics(zkhandler, dom_uuid, zkhandler.read(('domain.node', dom_uuid)), target_node)
lock = zkhandler.exclusivelock(('domain.state', dom_uuid)) lock = zkhandler.exclusivelock(('domain.state', dom_uuid))
with lock: with lock:
zkhandler.write([ zkhandler.write([
@ -650,6 +646,9 @@ def move_vm(zkhandler, domain, target_node, wait=False, force_live=False):
# Wait for 1/2 second for migration to start # Wait for 1/2 second for migration to start
time.sleep(0.5) time.sleep(0.5)
# Update any SR-IOV NICs
update_vm_sriov_nics(zkhandler, dom_uuid, current_node, target_node)
if wait: if wait:
while zkhandler.read(('domain.state', dom_uuid)) == target_state: while zkhandler.read(('domain.state', dom_uuid)) == target_state:
time.sleep(0.5) time.sleep(0.5)
@ -702,15 +701,12 @@ def migrate_vm(zkhandler, domain, target_node, force_migrate, wait=False, force_
return False, 'ERROR: Could not find a valid migration target for VM "{}".'.format(domain) return False, 'ERROR: Could not find a valid migration target for VM "{}".'.format(domain)
# Don't overwrite an existing last_node when using force_migrate # Don't overwrite an existing last_node when using force_migrate
real_current_node = current_node # Used for the SR-IOV update
if last_node and force_migrate: if last_node and force_migrate:
current_node = last_node current_node = last_node
retmsg = 'Migrating VM "{}" to node "{}".'.format(domain, target_node) retmsg = 'Migrating VM "{}" to node "{}".'.format(domain, target_node)
if target_state not in ['migrate', 'migrate-live']:
# Update any SR-IOV NICs - with online migrations this is done by pvcnoded, but offline we must do it here
update_vm_sriov_nics(zkhandler, dom_uuid, zkhandler.read(('domain.node', dom_uuid)), target_node)
lock = zkhandler.exclusivelock(('domain.state', dom_uuid)) lock = zkhandler.exclusivelock(('domain.state', dom_uuid))
with lock: with lock:
zkhandler.write([ zkhandler.write([
@ -722,6 +718,9 @@ def migrate_vm(zkhandler, domain, target_node, force_migrate, wait=False, force_
# Wait for 1/2 second for migration to start # Wait for 1/2 second for migration to start
time.sleep(0.5) time.sleep(0.5)
# Update any SR-IOV NICs
update_vm_sriov_nics(zkhandler, dom_uuid, real_current_node, target_node)
if wait: if wait:
while zkhandler.read(('domain.state', dom_uuid)) == target_state: while zkhandler.read(('domain.state', dom_uuid)) == target_state:
time.sleep(0.5) time.sleep(0.5)
@ -747,6 +746,7 @@ def unmigrate_vm(zkhandler, domain, wait=False, force_live=False):
else: else:
target_state = 'migrate' target_state = 'migrate'
current_node = zkhandler.read(('domain.node', dom_uuid))
target_node = zkhandler.read(('domain.last_node', dom_uuid)) target_node = zkhandler.read(('domain.last_node', dom_uuid))
if target_node == '': if target_node == '':
@ -754,10 +754,6 @@ def unmigrate_vm(zkhandler, domain, wait=False, force_live=False):
retmsg = 'Unmigrating VM "{}" back to node "{}".'.format(domain, target_node) retmsg = 'Unmigrating VM "{}" back to node "{}".'.format(domain, target_node)
if target_state not in ['migrate', 'migrate-live']:
# Update any SR-IOV NICs - with online migrations this is done by pvcnoded, but offline we must do it here
update_vm_sriov_nics(zkhandler, dom_uuid, zkhandler.read(('domain.node', dom_uuid)), target_node)
lock = zkhandler.exclusivelock(('domain.state', dom_uuid)) lock = zkhandler.exclusivelock(('domain.state', dom_uuid))
with lock: with lock:
zkhandler.write([ zkhandler.write([
@ -769,6 +765,9 @@ def unmigrate_vm(zkhandler, domain, wait=False, force_live=False):
# Wait for 1/2 second for migration to start # Wait for 1/2 second for migration to start
time.sleep(0.5) time.sleep(0.5)
# Update any SR-IOV NICs
update_vm_sriov_nics(zkhandler, dom_uuid, current_node, target_node)
if wait: if wait:
while zkhandler.read(('domain.state', dom_uuid)) == target_state: while zkhandler.read(('domain.state', dom_uuid)) == target_state:
time.sleep(0.5) time.sleep(0.5)

View File

@ -34,8 +34,6 @@ import pvcnoded.VMConsoleWatcherInstance as VMConsoleWatcherInstance
import daemon_lib.common as daemon_common import daemon_lib.common as daemon_common
from daemon_lib.vm import update_vm_sriov_nics
def flush_locks(zkhandler, logger, dom_uuid, this_node=None): def flush_locks(zkhandler, logger, dom_uuid, this_node=None):
logger.out('Flushing RBD locks for VM "{}"'.format(dom_uuid), state='i') logger.out('Flushing RBD locks for VM "{}"'.format(dom_uuid), state='i')
@ -674,11 +672,6 @@ class VMInstance(object):
self.logger.out('Acquired write lock for synchronization phase D', state='o', prefix='Domain {}'.format(self.domuuid)) self.logger.out('Acquired write lock for synchronization phase D', state='o', prefix='Domain {}'.format(self.domuuid))
time.sleep(0.5) # Time for reader to acquire the lock time.sleep(0.5) # Time for reader to acquire the lock
# Update any SR-IOV NIC states now
sriov_update_result, sriov_update_error = update_vm_sriov_nics(self.zkhandler, self.domuuid, self.last_currentnode, self.node)
if not sriov_update_result:
self.logger.out('{}; VM will likely fail to start.'.format(sriov_update_error), state='w', prefix='Domain {}'.format(self.domuuid))
self.state = self.zkhandler.read(('domain.state', self.domuuid)) self.state = self.zkhandler.read(('domain.state', self.domuuid))
self.dom = self.lookupByUUID(self.domuuid) self.dom = self.lookupByUUID(self.domuuid)
if self.dom: if self.dom: