2018-05-31 20:26:44 -04:00
#!/usr/bin/env python3
2020-02-08 19:16:19 -05:00
# VMInstance.py - Class implementing a PVC virtual machine in pvcnoded
2018-06-06 01:47:53 -04:00
# Part of the Parallel Virtual Cluster (PVC) system
#
2021-03-25 17:01:55 -04:00
# Copyright (C) 2018-2021 Joshua M. Boniface <joshua@boniface.me>
2018-06-06 01:47:53 -04:00
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
2021-03-25 16:57:17 -04:00
# the Free Software Foundation, version 3.
2018-06-06 01:47:53 -04:00
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.
#
###############################################################################
2018-09-20 03:42:40 -04:00
import uuid
import time
import libvirt
2019-08-07 14:03:15 -04:00
import json
2018-09-20 03:42:40 -04:00
2020-08-11 11:46:41 -04:00
from threading import Thread
2020-12-20 16:00:55 -05:00
from xml . etree import ElementTree
2021-09-12 15:41:05 -04:00
from re import match
2021-06-01 12:17:25 -04:00
import daemon_lib . common as common
2018-06-06 22:56:03 -04:00
2021-08-21 02:46:11 -04:00
import pvcnoded . objects . VMConsoleWatcherInstance as VMConsoleWatcherInstance
2019-04-11 19:06:06 -04:00
2020-10-18 14:02:34 -04:00
import daemon_lib . common as daemon_common
2020-11-07 14:45:24 -05:00
2019-07-07 15:20:37 -04:00
class VMInstance ( object ) :
2018-06-06 22:59:31 -04:00
# Initialization function
2021-06-01 11:46:27 -04:00
def __init__(self, domuuid, zkhandler, config, logger, this_node):
    """Create the VM instance and install its Zookeeper state watcher.

    Stores the passed-in handles, reads the domain's records from Zookeeper
    through ``zkhandler.read``, resolves ``self.dom`` with ``lookupByUUID``,
    creates the console log watcher instance and registers a ``DataWatch``
    on the domain state key.
    """
    # Handles and identifiers supplied by the caller
    self.domuuid = domuuid
    self.zkhandler = zkhandler
    self.config = config
    self.logger = logger
    self.this_node = this_node

    # Snapshot of the domain's records in Zookeeper
    zk_read = self.zkhandler.read
    self.domname = zk_read((' domain ', domuuid))
    self.state = zk_read((' domain.state ', domuuid))
    self.node = zk_read((' domain.node ', domuuid))
    self.lastnode = zk_read((' domain.last_node ', domuuid))
    self.last_currentnode = zk_read((' domain.node ', domuuid))
    self.last_lastnode = zk_read((' domain.last_node ', domuuid))
    try:
        self.migration_method = zk_read((' domain.meta.migrate_method ', self.domuuid))
    except Exception:
        # Fall back to the default when the key cannot be read
        self.migration_method = ' none '

    # In-progress operation flags; the operation methods toggle these
    self.instart = self.inrestart = False
    self.inmigrate = self.inreceive = False
    self.inshutdown = self.instop = False

    # Libvirt domain handle
    self.dom = self.lookupByUUID(self.domuuid)

    # Console log watcher for this domain
    self.console_log_instance = VMConsoleWatcherInstance.VMConsoleWatcherInstance(
        self.domuuid,
        self.domname,
        self.zkhandler,
        self.config,
        self.logger,
        self.this_node
    )

    # React to changes of the state field in Zookeeper
    state_path = self.zkhandler.schema.path(' domain.state ', self.domuuid)

    @self.zkhandler.zk_conn.DataWatch(state_path)
    def watch_state(data, stat, event=" "):
        key_deleted = event and event.type == ' DELETED '
        if key_deleted:
            # The key existed and is now gone; returning False ends this
            # watcher because the instance is about to be reaped in Daemon.py
            return False

        # Run the management command in its own thread
        self.logger.out(' Updating state of VM {} '.format(self.domuuid), state=' i ')
        Thread(target=self.manage_vm_state, args=(), kwargs={}).start()
2019-04-11 19:06:06 -04:00
2018-06-02 15:03:44 -04:00
# Get data functions
2018-06-02 15:19:29 -04:00
def getstate(self):
    """Return the VM state stored on this instance (``self.state``)."""
    current_state = self.state
    return current_state
2018-10-14 02:01:35 -04:00
def getnode(self):
    """Return the node stored on this instance (``self.node``)."""
    current_node = self.node
    return current_node
2018-06-02 15:03:44 -04:00
2020-06-04 10:26:47 -04:00
def getlastnode(self):
    """Return the last node stored on this instance (``self.lastnode``)."""
    previous_node = self.lastnode
    return previous_node
2018-06-12 01:54:01 -04:00
def getdom(self):
    """Return the libvirt domain handle stored on this instance (may be None)."""
    domain = self.dom
    return domain
2018-07-17 21:51:49 -04:00
def getmemory(self):
    """Return the VM's memory as an int, or 0 if it cannot be determined.

    With a domain handle, the value is element 2 of ``dom.info()`` divided
    by 1024. Without one, it is the memory entry of the information that
    ``daemon_common.getInformationFromXML`` returns. Any exception on either
    path yields 0.
    """
    try:
        if self.dom is None:
            # No domain handle: fall back to the stored definition
            xml_info = daemon_common.getInformationFromXML(self.zkhandler, self.domuuid)
            return int(xml_info[' memory '])
        return int(self.dom.info()[2] / 1024)
    except Exception:
        return 0
2018-07-17 21:40:30 -04:00
2018-07-18 12:09:07 -04:00
def getvcpus(self):
    """Return element 3 of ``dom.info()`` as an int, or 0 on any failure.

    A missing domain handle (``self.dom`` is None) is one such failure and
    also yields 0.
    """
    try:
        return int(self.dom.info()[3])
    except Exception:
        return 0
2018-06-19 20:01:26 -04:00
# Manage local node domain_list
def addDomainToList(self):
    """Record this domain in the local node's domain_list and publish it.

    Does nothing when the UUID is already listed. Otherwise the UUID is
    appended and the joined list is written to the node's running-domains
    key; any exception is logged instead of being raised.
    """
    if self.domuuid in self.this_node.domain_list:
        return

    try:
        # Update the in-memory list first ...
        self.this_node.domain_list.append(self.domuuid)
        # ... then push the change up to Zookeeper
        running_key = (' node.running_domains ', self.this_node.name)
        running_value = ' '.join(self.this_node.domain_list)
        self.zkhandler.write([(running_key, running_value)])
    except Exception as e:
        self.logger.out(' Error adding domain to list: {} '.format(e), state=' e ')
2018-06-19 20:01:26 -04:00
def removeDomainFromList(self):
    """Drop this domain from the local node's domain_list and publish it.

    Does nothing when the UUID is not listed. Otherwise the UUID is removed
    and the joined list is written to the node's running-domains key; any
    exception is logged instead of being raised.
    """
    if self.domuuid not in self.this_node.domain_list:
        return

    try:
        # Update the in-memory list first ...
        self.this_node.domain_list.remove(self.domuuid)
        # ... then push the change up to Zookeeper
        running_key = (' node.running_domains ', self.this_node.name)
        running_value = ' '.join(self.this_node.domain_list)
        self.zkhandler.write([(running_key, running_value)])
    except Exception as e:
        self.logger.out(' Error removing domain from list: {} '.format(e), state=' e ')
2018-06-19 20:01:26 -04:00
2020-12-20 16:00:55 -05:00
# Update the VNC live data
def update_vnc(self):
    """Publish the domain's graphics endpoint to its VNC console key.

    When a domain handle exists and its XML description contains a graphics
    device, the formatted listen/port pair is written; in every other case
    the blank placeholder value is written instead.
    """
    vnc_data = ' '

    if self.dom is not None:
        domain_xml = ElementTree.fromstring(self.dom.XMLDesc(0))
        graphics_device = domain_xml.find(' ./devices/graphics ')
        if graphics_device is not None:
            self.logger.out(' Updating VNC data ', state=' i ', prefix=' Domain {} '.format(self.domuuid))
            vnc_port = graphics_device.get(' port ', ' ')
            vnc_listen = graphics_device.get(' listen ', ' ')
            vnc_data = ' {} : {} '.format(vnc_listen, vnc_port)

    # A single write covers all three outcomes
    self.zkhandler.write([
        ((' domain.console.vnc ', self.domuuid), vnc_data)
    ])
2020-12-20 16:00:55 -05:00
2021-09-12 15:41:05 -04:00
# Attach a device to the running domain
def attach_device(self, xml_spec):
    """Attach the device described by ``xml_spec`` to the domain.

    Returns True when ``dom.attachDevice`` completes, and False when there
    is no domain handle or when any exception is raised (which is logged).
    """
    log_prefix = ' Domain {} '.format(self.domuuid)

    if not self.dom:
        self.logger.out(' Cannot attach device to non-running domain ', state=' w ', prefix=log_prefix)
        return False

    try:
        self.logger.out(' Attaching new device to VM ', state=' i ', prefix=log_prefix)
        self.dom.attachDevice(xml_spec)
    except Exception as e:
        self.logger.out(' Failed to attach device: {} '.format(e), state=' e ', prefix=log_prefix)
        return False
    else:
        return True
# Detach a device from the running domain
def detach_device(self, xml_spec):
    """Detach the device described by ``xml_spec`` from the domain.

    Returns True when ``dom.detachDevice`` completes, and False when there
    is no domain handle or when any exception is raised (which is logged).
    """
    log_prefix = ' Domain {} '.format(self.domuuid)

    if not self.dom:
        self.logger.out(' Cannot detach device from non-running domain ', state=' w ', prefix=log_prefix)
        return False

    try:
        self.logger.out(' Detaching device from VM ', state=' i ', prefix=log_prefix)
        self.dom.detachDevice(xml_spec)
    except Exception as e:
        self.logger.out(' Failed to detach device: {} '.format(e), state=' e ', prefix=log_prefix)
        return False
    else:
        return True
2018-05-31 20:26:44 -04:00
# Start up the VM
2018-06-13 12:31:27 -04:00
def start_vm(self):
    """Start the VM on this node, or adopt it if it is already running.

    Starts the console log watcher, opens a local libvirt connection and
    then:

    * flushes RBD locks when the domain is missing or not running, and
      gives up if the Zookeeper state is then the failure state;
    * if the domain was found running, only adds it to the node's domain
      list and clears the failed reason;
    * otherwise creates the domain from the XML stored in Zookeeper. On a
      libvirt error the state is set to the failure state together with
      the error text, and ``self.dom`` is reset to None.

    The libvirt connection is closed and ``self.instart`` is reset on every
    exit path, including when an unexpected exception propagates.
    """
    log_prefix = ' Domain {} '.format(self.domuuid)

    # Start the log watcher
    self.console_log_instance.start()

    self.logger.out(' Starting VM ', state=' i ', prefix=log_prefix)
    self.instart = True

    lv_conn = None
    try:
        # Start up a new Libvirt connection
        libvirt_name = " qemu:///system "
        lv_conn = libvirt.open(libvirt_name)
        if lv_conn is None:
            self.logger.out(' Failed to open local libvirt connection ', state=' e ', prefix=log_prefix)
            return

        # Try to get the current state in case it's already running
        try:
            self.dom = self.lookupByUUID(self.domuuid)
            curstate = self.dom.state()[0]
        except Exception:
            curstate = ' notstart '

        # Handle situations where the VM crashed or the node unexpectedly rebooted
        if self.getdom() is None or self.getdom().state()[0] != libvirt.VIR_DOMAIN_RUNNING:
            # Flush locks
            self.logger.out(' Flushing RBD locks ', state=' i ', prefix=log_prefix)
            VMInstance.flush_locks(self.zkhandler, self.logger, self.domuuid, self.this_node)
            if self.zkhandler.read((' domain.state ', self.domuuid)) == ' fail ':
                # The state is the failure value after flushing; do not start
                self.dom = None
                return

        if curstate == libvirt.VIR_DOMAIN_RUNNING:
            # If it is running just update the model
            self.addDomainToList()
            self.zkhandler.write([
                ((' domain.failed_reason ', self.domuuid), ' ')
            ])
        else:
            # Or try to create it
            try:
                # Grab the domain information from Zookeeper
                xmlconfig = self.zkhandler.read((' domain.xml ', self.domuuid))
                dom = lv_conn.createXML(xmlconfig, 0)
                self.addDomainToList()
                self.logger.out(' Successfully started VM ', state=' o ', prefix=log_prefix)
                self.dom = dom
                self.zkhandler.write([
                    ((' domain.failed_reason ', self.domuuid), ' ')
                ])
            except libvirt.libvirtError as e:
                self.logger.out(' Failed to create VM ', state=' e ', prefix=log_prefix)
                self.zkhandler.write([
                    ((' domain.state ', self.domuuid), ' fail '),
                    ((' domain.failed_reason ', self.domuuid), str(e))
                ])
                self.dom = None
                return
    finally:
        # Always give the connection back and clear the in-progress flag,
        # even if something above raised unexpectedly
        if lv_conn is not None:
            lv_conn.close()
        self.instart = False
2019-06-25 22:31:04 -04:00
2018-06-13 12:47:30 -04:00
# Restart the VM
def restart_vm(self):
    """Restart the VM by shutting it down gracefully and starting it again.

    Opens a local libvirt connection, which this method itself uses only
    for the None check. It then calls ``shutdown_vm``, waits briefly, calls
    ``start_vm``, adds the domain to the node's domain list and writes the
    start state to Zookeeper.

    The connection is closed and ``self.inrestart`` is reset on every exit
    path, so an exception in the shutdown or start step cannot leave the
    instance permanently flagged as restarting.
    """
    log_prefix = ' Domain {} '.format(self.domuuid)

    self.logger.out(' Restarting VM ', state=' i ', prefix=log_prefix)
    self.inrestart = True

    lv_conn = None
    try:
        # Start up a new Libvirt connection
        libvirt_name = " qemu:///system "
        lv_conn = libvirt.open(libvirt_name)
        if lv_conn is None:
            self.logger.out(' Failed to open local libvirt connection ', state=' e ', prefix=log_prefix)
            return

        self.shutdown_vm()
        time.sleep(0.2)
        self.start_vm()
        self.addDomainToList()

        self.zkhandler.write([
            ((' domain.state ', self.domuuid), ' start ')
        ])
    finally:
        # Always give the connection back and clear the in-progress flag
        if lv_conn is not None:
            lv_conn.close()
        self.inrestart = False
2018-06-06 21:47:06 -04:00
# Stop the VM forcibly without updating state
def terminate_vm(self):
    """Forcibly destroy the domain without writing any state to Zookeeper.

    Calls ``dom.destroy()``, waits briefly and destroys once more if the
    domain still reports as running. An AttributeError (for example when
    there is no domain handle) is logged as a failure. In every case the
    domain is then removed from the node's domain list, ``self.dom`` is
    cleared and the console log watcher is stopped.
    """
    log_prefix = ' Domain {} '.format(self.domuuid)

    self.logger.out(' Terminating VM ', state=' i ', prefix=log_prefix)
    self.instop = True

    try:
        self.dom.destroy()
        time.sleep(0.2)
        try:
            still_running = self.getdom().state()[0] == libvirt.VIR_DOMAIN_RUNNING
            if still_running:
                # It didn't terminate, try again
                self.dom.destroy()
        except libvirt.libvirtError:
            # Errors from the re-check or the second destroy are ignored
            pass
    except AttributeError:
        self.logger.out(' Failed to terminate VM ', state=' e ', prefix=log_prefix)

    self.removeDomainFromList()
    self.logger.out(' Successfully terminated VM ', state=' o ', prefix=log_prefix)
    self.dom = None
    self.instop = False

    # Stop the log watcher
    self.console_log_instance.stop()
2018-05-31 20:26:44 -04:00
# Stop the VM forcibly
def stop_vm(self):
    """Forcibly destroy the domain and record the stop state in Zookeeper.

    Calls ``dom.destroy()``, waits briefly and destroys once more if the
    domain still reports as running. An AttributeError (for example when
    there is no domain handle) is logged as a failure. The domain is then
    removed from the node's domain list. The stop state is written only
    when ``self.inrestart`` is exactly False. Finally ``self.dom`` is
    cleared and the console log watcher is stopped.
    """
    log_prefix = ' Domain {} '.format(self.domuuid)

    self.logger.out(' Forcibly stopping VM ', state=' i ', prefix=log_prefix)
    self.instop = True

    try:
        self.dom.destroy()
        time.sleep(0.2)
        try:
            still_running = self.getdom().state()[0] == libvirt.VIR_DOMAIN_RUNNING
            if still_running:
                # It didn't terminate, try again
                self.dom.destroy()
        except libvirt.libvirtError:
            # Errors from the re-check or the second destroy are ignored
            pass
    except AttributeError:
        self.logger.out(' Failed to stop VM ', state=' e ', prefix=log_prefix)

    self.removeDomainFromList()

    if self.inrestart is False:
        stop_record = ((' domain.state ', self.domuuid), ' stop ')
        self.zkhandler.write([stop_record])

    self.logger.out(' Successfully stopped VM ', state=' o ', prefix=log_prefix)
    self.dom = None
    self.instop = False

    # Stop the log watcher
    self.console_log_instance.stop()
2018-05-31 20:26:44 -04:00
# Shutdown the VM gracefully
def shutdown_vm(self):
    """Request a graceful shutdown and poll once per second for the result.

    On each poll, the loop ends in one of three ways:

    * the Zookeeper state is no longer one of the shutdown/restart/migrate
      values: the shutdown is aborted and ``manage_vm_state`` is called
      once the loop has ended;
    * the domain is no longer running (or its state cannot be read): the
      domain is removed from the node's list, the stop state is written,
      ``self.dom`` is cleared and the console log watcher is stopped;
    * the configured shutdown timeout has elapsed: the stop state is
      written and the loop ends with the domain left as it is.

    If ``self.inrestart`` is set, the start state is written after a one
    second pause.
    """
    log_prefix = ' Domain {} '.format(self.domuuid)
    state_key = (' domain.state ', self.domuuid)

    self.logger.out(' Gracefully stopping VM ', state=' i ', prefix=log_prefix)
    is_aborted = False
    self.inshutdown = True
    self.dom.shutdown()

    elapsed = 0
    while True:
        elapsed += 1
        time.sleep(1)

        # Abort shutdown if the state changes to start
        zk_state = self.zkhandler.read(state_key)
        if zk_state not in [' shutdown ', ' restart ', ' migrate ']:
            self.logger.out(' Aborting VM shutdown due to state change ', state=' i ', prefix=log_prefix)
            is_aborted = True
            break

        # A failed state query is treated the same as "not running"
        try:
            domain_state = self.dom.state()[0]
        except Exception:
            domain_state = None

        if domain_state != libvirt.VIR_DOMAIN_RUNNING:
            self.removeDomainFromList()
            self.zkhandler.write([(state_key, ' stop ')])
            self.logger.out(' Successfully shutdown VM ', state=' o ', prefix=log_prefix)
            self.dom = None
            # Stop the log watcher
            self.console_log_instance.stop()
            break

        if elapsed >= self.config[' vm_shutdown_timeout ']:
            timeout_message = ' Shutdown timeout ( {} s) expired, forcing off '.format(self.config[' vm_shutdown_timeout '])
            self.logger.out(timeout_message, state=' e ', prefix=log_prefix)
            self.zkhandler.write([(state_key, ' stop ')])
            break

    self.inshutdown = False

    if is_aborted:
        self.manage_vm_state()

    if self.inrestart:
        # Wait to prevent race conditions
        time.sleep(1)
        self.zkhandler.write([(state_key, ' start ')])
2019-04-11 19:06:06 -04:00
2018-06-04 12:15:37 -04:00
# Migrate the VM to a target host
2020-10-29 11:31:32 -04:00
def migrate_vm(self, force_live=False, force_shutdown=False):
    """Send this VM to the node recorded in ``self.node``.

    The sender holds exclusive locks on the domain node and state keys for
    the whole operation and steps through four phases on the migrate sync
    lock key:

    * A (reader): wait up to 30s for the sync lock key to be populated.
    * B (writer): unless shutdown is forced, try a live migration up to
      three times.
    * C (writer): if live migration was not done and is not enforced, shut
      the VM down instead.
    * D (reader): refresh the cached node and last-node values.

    The method then waits up to 10s for the sync lock key to be cleared
    before releasing the exclusive locks.

    Args:
        force_live: Abort instead of falling back to a shutdown when live
            migration fails. Also enabled when the VM's migration method
            is the live value.
        force_shutdown: Skip live migration entirely. Also enabled when the
            VM's migration method is the shutdown value.

    Returns:
        None. An aborted migration writes the start state, this node and
        the cached last node back to Zookeeper, and releases the exclusive
        locks.
    """
    log_prefix = ' Domain {} '.format(self.domuuid)

    # Wait for any previous migration
    while self.inmigrate:
        time.sleep(0.1)

    # A per-VM migration method overrides the caller's preference
    if self.migration_method == ' live ':
        force_live = True
    elif self.migration_method == ' shutdown ':
        force_shutdown = True

    self.inmigrate = True
    self.logger.out(' Migrating VM to node " {} " '.format(self.node), state=' i ', prefix=log_prefix)

    # Used for sanity checking later
    target_node = self.zkhandler.read((' domain.node ', self.domuuid))

    aborted = False

    def abort_migrate(reason):
        # Point the domain back at this node and release the exclusive locks
        self.zkhandler.write([
            ((' domain.state ', self.domuuid), ' start '),
            ((' domain.node ', self.domuuid), self.this_node.name),
            ((' domain.last_node ', self.domuuid), self.last_lastnode)
        ])
        migrate_lock_node.release()
        migrate_lock_state.release()
        self.inmigrate = False
        self.logger.out(' Aborted migration: {} '.format(reason), state=' i ', prefix=log_prefix)

    # Acquire exclusive lock on the domain node key
    migrate_lock_node = self.zkhandler.exclusivelock((' domain.node ', self.domuuid))
    migrate_lock_state = self.zkhandler.exclusivelock((' domain.state ', self.domuuid))
    migrate_lock_node.acquire()
    migrate_lock_state.acquire()

    time.sleep(0.2)  # Initial delay for the first writer to grab the lock

    # Don't try to migrate a node to itself, set back to start
    if self.node == self.lastnode or self.node == self.this_node.name:
        abort_migrate(' Target node matches the current active node during initial check ')
        return

    # Synchronize nodes A (I am reader)
    lock = self.zkhandler.readlock((' domain.migrate.sync_lock ', self.domuuid))
    self.logger.out(' Acquiring read lock for synchronization phase A ', state=' i ', prefix=log_prefix)
    lock.acquire()
    self.logger.out(' Acquired read lock for synchronization phase A ', state=' o ', prefix=log_prefix)
    if self.zkhandler.read((' domain.migrate.sync_lock ', self.domuuid)) == ' ':
        self.logger.out(' Waiting for peer ', state=' i ', prefix=log_prefix)
        ticks = 0
        while self.zkhandler.read((' domain.migrate.sync_lock ', self.domuuid)) == ' ':
            time.sleep(0.1)
            ticks += 1
            if ticks > 300:
                self.logger.out(' Timed out waiting 30s for peer ', state=' e ', prefix=log_prefix)
                aborted = True
                break
    self.logger.out(' Releasing read lock for synchronization phase A ', state=' i ', prefix=log_prefix)
    lock.release()
    self.logger.out(' Released read lock for synchronization phase A ', state=' o ', prefix=log_prefix)

    if aborted:
        abort_migrate(' Timed out waiting for peer ')
        return

    # Synchronize nodes B (I am writer)
    lock = self.zkhandler.writelock((' domain.migrate.sync_lock ', self.domuuid))
    self.logger.out(' Acquiring write lock for synchronization phase B ', state=' i ', prefix=log_prefix)
    lock.acquire()
    self.logger.out(' Acquired write lock for synchronization phase B ', state=' o ', prefix=log_prefix)
    time.sleep(0.5)  # Time for reader to acquire the lock

    def migrate_live():
        self.logger.out(' Setting up live migration ', state=' i ', prefix=log_prefix)
        # Set up destination connection
        dest_lv = ' qemu+tcp:// {} . {} /system '.format(self.node, self.config[' cluster_domain '])
        dest_tcp = ' tcp:// {} . {} '.format(self.node, self.config[' cluster_domain '])
        try:
            self.logger.out(' Opening remote libvirt connection ', state=' i ', prefix=log_prefix)
            # Open a connection to the destination
            dest_lv_conn = libvirt.open(dest_lv)
            if not dest_lv_conn:
                # Raise explicitly; a bare `raise` has no active exception here
                raise RuntimeError('libvirt.open returned no connection')
        except Exception:
            self.logger.out(' Failed to open connection to {} ; aborting live migration. '.format(dest_lv), state=' e ', prefix=log_prefix)
            return False

        try:
            self.logger.out(' Live migrating VM ', state=' i ', prefix=log_prefix)
            # Send the live migration; force the destination URI to ensure we transit over the cluster network
            target_dom = self.dom.migrate(dest_lv_conn, libvirt.VIR_MIGRATE_LIVE, None, dest_tcp, 0)
            if not target_dom:
                # Raise explicitly so the logged error below is meaningful
                raise RuntimeError('migrate returned no target domain')
        except Exception as e:
            self.logger.out(' Failed to send VM to {} - aborting live migration; error: {} '.format(dest_lv, e), state=' e ', prefix=log_prefix)
            dest_lv_conn.close()
            return False

        self.logger.out(' Successfully migrated VM ', state=' o ', prefix=log_prefix)
        dest_lv_conn.close()
        self.console_log_instance.stop()
        self.removeDomainFromList()

        return True

    def migrate_shutdown():
        self.logger.out(' Shutting down VM for offline migration ', state=' i ', prefix=log_prefix)
        self.shutdown_vm()
        return True

    do_migrate_shutdown = False
    migrate_live_result = False

    # Do a final verification. The phase B write lock is held at this point,
    # so it must be released before aborting or it is never released.
    if self.node == self.lastnode or self.node == self.this_node.name:
        lock.release()
        abort_migrate(' Target node matches the current active node during final check ')
        return
    if self.node != target_node:
        lock.release()
        abort_migrate(' Target node changed during preparation ')
        return

    if not force_shutdown:
        # A live migrate is attempted 3 times in succession
        ticks = 0
        while True:
            ticks += 1
            self.logger.out(' Attempting live migration try {} '.format(ticks), state=' i ', prefix=log_prefix)
            migrate_live_result = migrate_live()
            if migrate_live_result:
                break
            time.sleep(0.5)
            if ticks > 2:
                break
    else:
        migrate_live_result = False

    if not migrate_live_result:
        if force_live:
            self.logger.out(' Could not live migrate VM while live migration enforced ', state=' e ', prefix=log_prefix)
            aborted = True
        else:
            do_migrate_shutdown = True

    self.logger.out(' Releasing write lock for synchronization phase B ', state=' i ', prefix=log_prefix)
    lock.release()
    self.logger.out(' Released write lock for synchronization phase B ', state=' o ', prefix=log_prefix)

    if aborted:
        abort_migrate(' Live migration failed and is required ')
        return

    # Synchronize nodes C (I am writer)
    lock = self.zkhandler.writelock((' domain.migrate.sync_lock ', self.domuuid))
    self.logger.out(' Acquiring write lock for synchronization phase C ', state=' i ', prefix=log_prefix)
    lock.acquire()
    self.logger.out(' Acquired write lock for synchronization phase C ', state=' o ', prefix=log_prefix)
    time.sleep(0.5)  # Time for reader to acquire the lock

    if do_migrate_shutdown:
        migrate_shutdown()

    self.logger.out(' Releasing write lock for synchronization phase C ', state=' i ', prefix=log_prefix)
    lock.release()
    self.logger.out(' Released write lock for synchronization phase C ', state=' o ', prefix=log_prefix)

    # Synchronize nodes D (I am reader)
    lock = self.zkhandler.readlock((' domain.migrate.sync_lock ', self.domuuid))
    self.logger.out(' Acquiring read lock for synchronization phase D ', state=' i ', prefix=log_prefix)
    lock.acquire()
    self.logger.out(' Acquired read lock for synchronization phase D ', state=' o ', prefix=log_prefix)

    self.last_currentnode = self.zkhandler.read((' domain.node ', self.domuuid))
    self.last_lastnode = self.zkhandler.read((' domain.last_node ', self.domuuid))

    self.logger.out(' Releasing read lock for synchronization phase D ', state=' i ', prefix=log_prefix)
    lock.release()
    self.logger.out(' Released read lock for synchronization phase D ', state=' o ', prefix=log_prefix)

    # Wait for the receive side to complete before we declare all-done and release locks
    ticks = 0
    while self.zkhandler.read((' domain.migrate.sync_lock ', self.domuuid)) != ' ':
        time.sleep(0.1)
        ticks += 1
        if ticks > 100:
            self.logger.out(' Sync lock clear exceeded 10s timeout, continuing ', state=' w ', prefix=log_prefix)
            break
    migrate_lock_node.release()
    migrate_lock_state.release()

    self.inmigrate = False
    return
2018-06-04 11:49:39 -04:00
2020-10-20 12:13:26 -04:00
# Receive the migration from another host
2018-06-04 01:22:18 -04:00
def receive_migrate(self):
    """
    Receive a VM migration from another node (receive-side peer).

    Steps through four synchronization phases (A-D) on the per-domain
    'domain.migrate.sync_lock' key, alternating write and read locks, then
    checks the local libvirt domain and records the outcome in 'domain.state'.

    Every acquired lock is released, the sync lock key is cleared and
    self.inreceive is reset even if a step raises, so a failed receive cannot
    leave this instance permanently marked as busy. The exception itself still
    propagates to the caller.
    """
    # Wait for any previous migration
    while self.inreceive:
        time.sleep(0.1)

    self.inreceive = True

    prefix = 'Domain {}'.format(self.domuuid)
    sync_lock_key = ('domain.migrate.sync_lock', self.domuuid)

    try:
        self.logger.out('Receiving VM migration from node "{}"'.format(self.last_currentnode), state='i', prefix=prefix)

        # Short delay to ensure sender is in sync
        time.sleep(0.5)

        # Ensure our lock key is populated
        self.zkhandler.write([
            (sync_lock_key, self.domuuid)
        ])

        # Synchronize nodes A (I am writer)
        lock = self.zkhandler.writelock(sync_lock_key)
        self.logger.out('Acquiring write lock for synchronization phase A', state='i', prefix=prefix)
        lock.acquire()
        try:
            self.logger.out('Acquired write lock for synchronization phase A', state='o', prefix=prefix)
            time.sleep(1)  # Time for reader to acquire the lock
            self.logger.out('Releasing write lock for synchronization phase A', state='i', prefix=prefix)
        finally:
            lock.release()
        self.logger.out('Released write lock for synchronization phase A', state='o', prefix=prefix)
        time.sleep(0.1)  # Time for new writer to acquire the lock

        # Synchronize nodes B (I am reader)
        lock = self.zkhandler.readlock(sync_lock_key)
        self.logger.out('Acquiring read lock for synchronization phase B', state='i', prefix=prefix)
        lock.acquire()
        try:
            self.logger.out('Acquired read lock for synchronization phase B', state='o', prefix=prefix)
            self.logger.out('Releasing read lock for synchronization phase B', state='i', prefix=prefix)
        finally:
            lock.release()
        self.logger.out('Released read lock for synchronization phase B', state='o', prefix=prefix)

        # Synchronize nodes C (I am reader)
        lock = self.zkhandler.readlock(sync_lock_key)
        self.logger.out('Acquiring read lock for synchronization phase C', state='i', prefix=prefix)
        lock.acquire()
        try:
            self.logger.out('Acquired read lock for synchronization phase C', state='o', prefix=prefix)

            # Set the updated data
            self.last_currentnode = self.zkhandler.read(('domain.node', self.domuuid))
            self.last_lastnode = self.zkhandler.read(('domain.last_node', self.domuuid))

            self.logger.out('Releasing read lock for synchronization phase C', state='i', prefix=prefix)
        finally:
            lock.release()
        self.logger.out('Released read lock for synchronization phase C', state='o', prefix=prefix)

        # Synchronize nodes D (I am writer)
        lock = self.zkhandler.writelock(sync_lock_key)
        self.logger.out('Acquiring write lock for synchronization phase D', state='i', prefix=prefix)
        lock.acquire()
        try:
            self.logger.out('Acquired write lock for synchronization phase D', state='o', prefix=prefix)
            time.sleep(0.5)  # Time for reader to acquire the lock

            self.state = self.zkhandler.read(('domain.state', self.domuuid))
            self.dom = self.lookupByUUID(self.domuuid)
            if self.dom:
                lvdomstate = self.dom.state()[0]
                if lvdomstate == libvirt.VIR_DOMAIN_RUNNING:
                    # VM has been received and started
                    self.addDomainToList()
                    self.zkhandler.write([
                        (('domain.state', self.domuuid), 'start')
                    ])
                    self.logger.out('Successfully received migrated VM', state='o', prefix=prefix)
                else:
                    # The receive somehow failed
                    self.zkhandler.write([
                        (('domain.state', self.domuuid), 'fail'),
                        (('domain.failed_reason', self.domuuid), 'Failed to receive migration')
                    ])
                    self.logger.out('Failed to receive migrated VM', state='e', prefix=prefix)
            else:
                if self.node == self.this_node.name:
                    if self.state in ['start']:
                        # The receive was aborted
                        self.logger.out('Receive aborted via state change', state='w', prefix=prefix)
                    elif self.state in ['stop']:
                        # The send was shutdown-based
                        self.zkhandler.write([
                            (('domain.state', self.domuuid), 'start')
                        ])
                    else:
                        # The send failed or was aborted
                        self.logger.out('Migrate aborted or failed; VM in state {}'.format(self.state), state='w', prefix=prefix)

            self.logger.out('Releasing write lock for synchronization phase D', state='i', prefix=prefix)
        finally:
            lock.release()
        self.logger.out('Released write lock for synchronization phase D', state='o', prefix=prefix)
    finally:
        # Always clear the lock key and the busy flag, even on failure; the
        # flag is reset even if the Zookeeper write itself raises
        try:
            self.zkhandler.write([
                (sync_lock_key, '')
            ])
        finally:
            self.inreceive = False

    return
2018-05-31 23:28:26 -04:00
2018-05-31 20:26:44 -04:00
#
# Main function to manage a VM (taking only self)
#
def manage_vm_state(self):
    """
    Reconcile the VM's requested state in Zookeeper with its local libvirt state.

    Refreshes state, node, last node and migration method from Zookeeper,
    determines whether the libvirt domain is currently running, and, unless
    another action is already in progress, dispatches the matching action.

    Handled states: start, migrate, migrate-live, restart, shutdown, stop.
    States that are not (and need not be) handled: disable, provision.
    """
    # Update the current values from zookeeper
    self.state = self.zkhandler.read(('domain.state', self.domuuid))
    self.node = self.zkhandler.read(('domain.node', self.domuuid))
    self.lastnode = self.zkhandler.read(('domain.last_node', self.domuuid))
    self.migration_method = self.zkhandler.read(('domain.meta.migrate_method', self.domuuid))

    # Check the current state of the VM; no domain object or a failed query
    # both count as "no state"
    running = libvirt.VIR_DOMAIN_NOSTATE
    if self.dom is not None:
        try:
            running, _reason = self.dom.state()
        except Exception:
            running = libvirt.VIR_DOMAIN_NOSTATE

    self.logger.out('VM state change for "{}": {} {}'.format(self.domuuid, self.state, self.node), state='i')

    # Conditional pass one - Are we already performing an action
    if self.instart is not False \
            or self.inrestart is not False \
            or self.inmigrate is not False \
            or self.inreceive is not False \
            or self.inshutdown is not False \
            or self.instop is not False:
        return

    # Conditional pass two - Is this VM configured to run on this node
    assigned_here = self.node == self.this_node.name
    # Conditional pass three - Is this VM currently running on this node
    running_here = running == libvirt.VIR_DOMAIN_RUNNING

    if assigned_here and running_here:
        if self.state == "start":
            # VM is already running and should be
            # Start the log watcher
            self.console_log_instance.start()
            # Add domain to running list
            self.addDomainToList()
        elif self.state in ("migrate", "migrate-live"):
            # VM is already running and should be but stuck in migrate state
            # Start the log watcher
            self.console_log_instance.start()
            self.zkhandler.write([
                (('domain.state', self.domuuid), 'start')
            ])
            # Add domain to running list
            self.addDomainToList()
        elif self.state == "restart":
            # VM should be restarted
            self.restart_vm()
        elif self.state == "shutdown":
            # VM should be shut down
            self.shutdown_vm()
        elif self.state == "stop":
            # VM should be stopped
            self.stop_vm()

    elif assigned_here:
        if self.state == "start":
            # VM should be started
            self.start_vm()
        elif self.state in ("migrate", "migrate-live"):
            # VM should be migrated to this node
            self.receive_migrate()

        # This is a second, independent chain: self.state is examined again
        # after the calls above have returned
        if self.state == "restart":
            # VM should be restarted (i.e. started since it isn't running)
            self.zkhandler.write([
                (('domain.state', self.domuuid), 'start')
            ])
        elif self.state == "shutdown":
            # VM should be shut down; ensure it's gone from this node's domain_list
            self.removeDomainFromList()
            # Stop the log watcher
            self.console_log_instance.stop()
        elif self.state == "stop":
            # VM should be stopped; ensure it's gone from this node's domain_list
            self.removeDomainFromList()
            # Stop the log watcher
            self.console_log_instance.stop()
            # Update the VNC information
            self.update_vnc()

    elif running_here:
        if self.state == "migrate":
            # VM should be migrated away from this node
            self.migrate_vm(force_live=False)
        elif self.state == "migrate-live":
            # VM should be migrated away from this node, live only (no shutdown fallback)
            self.migrate_vm(force_live=True)
        elif self.state == 'shutdown':
            # VM should be shutdown gracefully
            self.shutdown_vm()
        else:
            # VM should be forcibly terminated
            self.terminate_vm()
2018-06-06 21:45:03 -04:00
2018-06-06 22:59:31 -04:00
# This function is a wrapper for libvirt.lookupByUUID which fixes some problems
# 1. Takes a text UUID and handles converting it to bytes
# 2. Tries the lookup and returns None if it fails
def lookupByUUID(self, tuuid):
    """
    Look up a libvirt domain on the local hypervisor by its textual UUID.

    Returns None without contacting libvirt when this VM is not assigned to
    this node, and None when the connection or the lookup fails; otherwise
    returns the domain object. The libvirt connection is closed before
    returning. A malformed UUID string raises ValueError.
    """
    # Don't do anything if the VM shouldn't live on this node
    if self.node != self.this_node.name:
        return None

    # Convert the text UUID to bytes
    uuid_bytes = uuid.UUID(tuuid).bytes

    connection = None
    try:
        # Open a libvirt connection
        connection = libvirt.open("qemu:///system")
        if connection is None:
            self.logger.out('Failed to open local libvirt connection', state='e', prefix='Domain {}'.format(self.domuuid))
            return None

        # Lookup the UUID
        found = connection.lookupByUUID(uuid_bytes)
    except Exception:
        # Any failure means "no such domain here"
        found = None
    finally:
        # Close the libvirt connection
        if connection is not None:
            connection.close()

    # Return the dom object (or None)
    return found
2021-08-21 02:46:11 -04:00
# Flush the locks of a VM based on UUID
@staticmethod
def flush_locks(zkhandler, logger, dom_uuid, this_node=None):
    """
    Remove the RBD locks held on every storage volume of a VM.

    The volume list is read from 'domain.storage.volumes'. For each volume the
    locks are listed with `rbd lock list` and removed with `rbd lock remove`.

    If this_node is given, the VM is not in state 'stop' and a lock's address
    does not match this_node.storage_ipaddr, the lock is left alone, the VM is
    set to state 'fail' and processing of that volume stops. The same happens
    when removing a lock fails.

    Returns True when no lock had to be refused and no removal failed, False
    otherwise. Volumes whose lock list cannot be obtained or parsed are logged
    and skipped without affecting the return value.
    """
    logger.out('Flushing RBD locks for VM "{}"'.format(dom_uuid), state='i')

    all_freed = True

    # Get the list of RBD images; ''.split(',') yields [''], so drop empty names
    # rather than invoking rbd without a volume
    volumes = zkhandler.read(('domain.storage.volumes', dom_uuid)).split(',')
    rbd_list = [rbd for rbd in volumes if rbd]

    for rbd in rbd_list:
        # Check if a lock exists
        lock_list_retcode, lock_list_stdout, _ = common.run_os_command('rbd lock list --format json {}'.format(rbd))
        if lock_list_retcode != 0:
            logger.out('Failed to obtain lock list for volume "{}"'.format(rbd), state='e')
            continue

        try:
            lock_list = json.loads(lock_list_stdout)
        except Exception as e:
            logger.out('Failed to parse lock list for volume "{}": {}'.format(rbd, e), state='e')
            continue

        # Nothing to do unless there's at least one lock
        if not lock_list:
            continue

        # Loop through the locks
        for lock in lock_list:
            if this_node is not None and zkhandler.read(('domain.state', dom_uuid)) != 'stop' and lock['address'].split(':')[0] != this_node.storage_ipaddr:
                # state='e' belongs to logger.out(), not to str.format()
                logger.out('RBD lock does not belong to this host (lock owner: {}): freeing this lock would be unsafe, aborting'.format(lock['address'].split(':')[0]), state='e')
                zkhandler.write([
                    (('domain.state', dom_uuid), 'fail'),
                    (('domain.failed_reason', dom_uuid), 'Could not safely free RBD lock {} ({}) on volume {}; stop VM and flush locks manually'.format(lock['id'], lock['address'], rbd)),
                ])
                all_freed = False
                break

            # Free the lock
            lock_remove_retcode, _, lock_remove_stderr = common.run_os_command('rbd lock remove {} "{}" "{}"'.format(rbd, lock['id'], lock['locker']))
            if lock_remove_retcode != 0:
                logger.out('Failed to free RBD lock "{}" on volume "{}": {}'.format(lock['id'], rbd, lock_remove_stderr), state='e')
                zkhandler.write([
                    (('domain.state', dom_uuid), 'fail'),
                    (('domain.failed_reason', dom_uuid), 'Could not free RBD lock {} ({}) on volume {}: {}'.format(lock['id'], lock['address'], rbd, lock_remove_stderr)),
                ])
                all_freed = False
                break

            logger.out('Freed RBD lock "{}" on volume "{}"'.format(lock['id'], rbd), state='o')

    # Report failure to the caller instead of unconditionally claiming success
    return all_freed
# Primary command function
def vm_command(zkhandler, logger, this_node, data):
    """
    Handle one entry of the domain command queue ('base.cmd.domain').

    data is a whitespace-separated string: "<command> <domain-uuid> [args...]".
    Supported commands are 'flush_locks', 'attach_device' and 'detach_device';
    for the device commands the remaining arguments are re-joined with single
    spaces and passed on as the XML spec.

    Entries beginning with 'success-' or 'failure-' are responses and are
    ignored, as are empty or incomplete entries. Commands for a domain that is
    assigned to another node are ignored as well. Otherwise the command runs
    under a write lock on the queue and 'success-<data>' or 'failure-<data>'
    is written back.

    Returns False when the domain is unknown to this node, None otherwise.
    """
    # Get the command and args; an empty or single-token entry cannot be
    # unpacked into command + UUID, so ignore it instead of raising ValueError
    fields = data.split()
    if len(fields) < 2:
        if fields:
            logger.out('Ignoring malformed domain command "{}"'.format(data), state='w')
        return
    command, dom_uuid, *args = fields

    # Responses written back to the queue are not commands
    if command.startswith(('success-', 'failure-')):
        return

    logger.out('Getting command "{}" for domain "{}"'.format(command, dom_uuid), state='i')

    # Verify that the VM is set to run on this node
    domain = this_node.d_domain.get(dom_uuid, None)
    if domain is None:
        return False

    if domain.getnode() != this_node.name:
        return

    # Lock the command queue
    zk_lock = zkhandler.writelock('base.cmd.domain')
    with zk_lock:
        # Flushing VM RBD locks
        if command == 'flush_locks':
            result = VMInstance.flush_locks(zkhandler, logger, dom_uuid, this_node)
        # Attaching a device
        elif command == 'attach_device':
            xml_spec = ' '.join(args)
            result = domain.attach_device(xml_spec)
        # Detaching a device
        elif command == 'detach_device':
            xml_spec = ' '.join(args)
            result = domain.detach_device(xml_spec)
        # Command not defined
        else:
            result = False

        # Update the command queue with the outcome
        outcome = 'success-{}' if result else 'failure-{}'
        zkhandler.write([
            ('base.cmd.domain', outcome.format(data))
        ])

        # Wait 1 seconds before we free the lock, to ensure the client hits the lock
        time.sleep(1)