2018-09-20 03:25:58 -04:00
#!/usr/bin/env python3
# vm.py - PVC client function library, VM functions
# Part of the Parallel Virtual Cluster (PVC) system
#
2022-10-06 11:55:27 -04:00
# Copyright (C) 2018-2022 Joshua M. Boniface <joshua@boniface.me>
2018-09-20 03:25:58 -04:00
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
2021-03-25 16:57:17 -04:00
# the Free Software Foundation, version 3.
2018-09-20 03:25:58 -04:00
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.
#
###############################################################################
import time
import re
2023-10-17 10:15:06 -04:00
import os . path
2018-09-20 03:25:58 -04:00
import lxml . objectify
2021-05-23 16:41:42 -04:00
import lxml . etree
2019-04-11 19:06:06 -04:00
2021-07-01 17:24:47 -04:00
from concurrent . futures import ThreadPoolExecutor
2023-10-17 10:15:06 -04:00
from datetime import datetime
2023-10-24 01:08:36 -04:00
from distutils . util import strtobool
2023-10-17 10:15:06 -04:00
from json import dump as jdump
2023-10-23 11:00:54 -04:00
from json import load as jload
2023-11-05 22:32:41 -05:00
from json import loads as jloads
from libvirt import open as lvopen
2023-10-24 01:08:36 -04:00
from shutil import rmtree
from socket import gethostname
from uuid import UUID
2021-06-28 12:27:43 -04:00
2020-02-08 18:48:59 -05:00
import daemon_lib . common as common
import daemon_lib . ceph as ceph
2023-11-05 22:32:41 -05:00
2021-06-21 22:21:54 -04:00
from daemon_lib . network import set_sriov_vf_vm , unset_sriov_vf_vm
2023-11-05 22:32:41 -05:00
from daemon_lib . celery import start , update , fail , finish
2019-06-27 11:19:48 -04:00
2020-11-07 14:45:24 -05:00
2018-09-20 03:25:58 -04:00
#
# Cluster search functions
#
2021-05-29 21:17:19 -04:00
def getClusterDomainList(zkhandler):
    """Return the parallel lists of VM UUIDs and VM names known to the cluster."""
    # The children of the domain base key are the VM UUIDs
    uuid_list = zkhandler.children("base.domain")
    # Resolve each UUID to its stored name, preserving order
    name_list = [zkhandler.read(("domain", uuid)) for uuid in uuid_list]
    return uuid_list, name_list
2020-11-07 14:45:24 -05:00
2021-05-29 21:17:19 -04:00
def searchClusterByUUID(zkhandler, uuid):
    """Return the name of the VM with the given UUID, or None if not found."""
    try:
        uuid_list, name_list = getClusterDomainList(zkhandler)
        # The two lists are parallel: same index maps UUID to name
        return name_list[uuid_list.index(uuid)]
    except ValueError:
        # UUID is not present in the cluster
        return None
2020-11-07 14:45:24 -05:00
2021-05-29 21:17:19 -04:00
def searchClusterByName(zkhandler, name):
    """Return the UUID of the VM with the given name, or None if not found."""
    try:
        uuid_list, name_list = getClusterDomainList(zkhandler)
        # The two lists are parallel: same index maps name to UUID
        return uuid_list[name_list.index(name)]
    except ValueError:
        # Name is not present in the cluster
        return None
2020-11-07 14:45:24 -05:00
2021-05-29 21:17:19 -04:00
def getDomainUUID(zkhandler, domain):
    """
    Resolve a VM identifier (name or UUID) to its canonical UUID.

    A UUID argument is round-tripped through the name lookup so that a
    UUID which does not exist in the cluster resolves to None instead of
    being returned verbatim.
    """
    if common.validateUUID(domain):
        # Round-trip: UUID -> name -> UUID validates that the VM exists
        dom_name = searchClusterByUUID(zkhandler, domain)
        dom_uuid = searchClusterByName(zkhandler, dom_name)
    else:
        # Fix: the original also computed an unused reverse lookup here,
        # costing a redundant full cluster scan per call
        dom_uuid = searchClusterByName(zkhandler, domain)
    return dom_uuid
2020-11-07 14:45:24 -05:00
2021-05-29 21:17:19 -04:00
def getDomainName(zkhandler, domain):
    """
    Resolve a VM identifier (name or UUID) to its canonical name.

    A name argument is round-tripped through the UUID lookup so that a
    name which does not exist in the cluster resolves to None instead of
    being returned verbatim.
    """
    if common.validateUUID(domain):
        # Fix: the original also computed an unused reverse lookup here,
        # costing a redundant full cluster scan per call
        dom_name = searchClusterByUUID(zkhandler, domain)
    else:
        # Round-trip: name -> UUID -> name validates that the VM exists
        dom_uuid = searchClusterByName(zkhandler, domain)
        dom_name = searchClusterByUUID(zkhandler, dom_uuid)
    return dom_name
2020-11-07 14:45:24 -05:00
2021-05-29 21:17:19 -04:00
#
# Helper functions
#
def change_state(zkhandler, dom_uuid, new_state):
    """Set a VM's state key under an exclusive lock, then pause briefly."""
    with zkhandler.exclusivelock(("domain.state", dom_uuid)):
        zkhandler.write([(("domain.state", dom_uuid), new_state)])
        # Wait for 1/2 second to allow state to flow to all nodes
        time.sleep(0.5)
2021-05-29 21:17:19 -04:00
2018-09-20 03:25:58 -04:00
#
# Direct functions
#
2021-05-29 21:17:19 -04:00
def is_migrated(zkhandler, domain):
    """
    Return True if the VM has a recorded last_node (i.e. has been migrated
    away from its home node), False otherwise.
    """
    # Validate that VM exists in cluster
    dom_uuid = getDomainUUID(zkhandler, domain)
    if not dom_uuid:
        # Bug fix: this predicate previously returned a (False, "ERROR...")
        # tuple here, which is truthy — callers treating the result as a
        # boolean would see a nonexistent VM as "migrated"
        return False

    # A non-empty last_node value means the VM is migrated
    last_node = zkhandler.read(("domain.last_node", dom_uuid))
    return bool(last_node)
2020-11-07 14:45:24 -05:00
2021-11-06 03:02:43 -04:00
def define_vm(
    zkhandler,
    config_data,
    target_node,
    node_limit,
    node_selector,
    node_autostart,
    migration_method=None,
    profile=None,
    tags=None,
    initial_state="stop",
):
    """
    Define a new VM in the cluster from a libvirt XML configuration.

    Parses and validates config_data, ensures the VM name and UUID are
    unique, selects or verifies the target node, checks that the node has
    enough memory and vCPUs (2 vCPUs are reserved for the node itself),
    records SR-IOV VF usage for any direct/hostdev networks, and writes
    the full set of domain keys (XML, state, metadata, tags) to Zookeeper.

    Returns a (success, message) tuple.

    Fix: `tags` previously used a mutable default argument (`tags=[]`);
    it now defaults to None and is normalized to an empty list.
    """
    if tags is None:
        tags = []

    # Parse the XML data
    try:
        parsed_xml = lxml.objectify.fromstring(config_data)
    except Exception:
        return False, "ERROR: Failed to parse XML data."

    dom_uuid = parsed_xml.uuid.text
    dom_name = parsed_xml.name.text

    # Ensure that the UUID and name are unique
    if searchClusterByUUID(zkhandler, dom_uuid) or searchClusterByName(
        zkhandler, dom_name
    ):
        return (
            False,
            'ERROR: Specified VM "{}" or UUID "{}" matches an existing VM on the cluster'.format(
                dom_name, dom_uuid
            ),
        )

    if not target_node:
        # No explicit target: let the scheduler pick one
        target_node = common.findTargetNode(zkhandler, dom_uuid)
    else:
        # Verify node is valid
        valid_node = common.verifyNode(zkhandler, target_node)
        if not valid_node:
            return False, 'ERROR: Specified node "{}" is invalid.'.format(target_node)

    # Validate the new RAM against the current active node
    node_total_memory = int(zkhandler.read(("node.memory.total", target_node)))
    if int(parsed_xml.memory.text) >= node_total_memory:
        return (
            False,
            'ERROR: VM configuration specifies more memory ({} MiB) than node "{}" has available ({} MiB).'.format(
                parsed_xml.memory.text, target_node, node_total_memory
            ),
        )

    # Validate the number of vCPUs against the current active node
    node_total_cpus = int(zkhandler.read(("node.data.static", target_node)).split()[0])
    if (node_total_cpus - 2) <= int(parsed_xml.vcpu.text):
        return (
            False,
            'ERROR: VM configuration specifies more vCPUs ({}) than node "{}" has available ({} minus 2).'.format(
                parsed_xml.vcpu.text, target_node, node_total_cpus
            ),
        )

    # If a SR-IOV network device is being added, set its used state
    dnetworks = common.getDomainNetworks(parsed_xml, {})
    for network in dnetworks:
        if network["type"] in ["direct", "hostdev"]:
            dom_node = zkhandler.read(("domain.node", dom_uuid))

            # Check if the network is already in use
            is_used = zkhandler.read(
                ("node.sriov.vf", dom_node, "sriov_vf.used", network["source"])
            )
            if is_used == "True":
                used_by_name = searchClusterByUUID(
                    zkhandler,
                    zkhandler.read(
                        (
                            "node.sriov.vf",
                            dom_node,
                            "sriov_vf.used_by",
                            network["source"],
                        )
                    ),
                )
                return (
                    False,
                    'ERROR: Attempted to use SR-IOV network "{}" which is already used by VM "{}" on node "{}".'.format(
                        network["source"], used_by_name, dom_node
                    ),
                )

            # We must update the "used" section
            set_sriov_vf_vm(
                zkhandler,
                dom_uuid,
                dom_node,
                network["source"],
                network["mac"],
                network["type"],
            )

    # Obtain the RBD disk list using the common functions
    ddisks = common.getDomainDisks(parsed_xml, {})
    rbd_list = [disk["name"] for disk in ddisks if disk["type"] == "rbd"]

    # Join the limit into a comma-separated string for storage
    if isinstance(node_limit, list) and node_limit:
        formatted_node_limit = ",".join(node_limit)
    else:
        formatted_node_limit = ""

    # Join the RBD list into a comma-separated string for storage
    if isinstance(rbd_list, list) and rbd_list:
        formatted_rbd_list = ",".join(rbd_list)
    else:
        formatted_rbd_list = ""

    # Add the new domain to Zookeeper
    zkhandler.write(
        [
            (("domain", dom_uuid), dom_name),
            (("domain.xml", dom_uuid), config_data),
            (("domain.state", dom_uuid), initial_state),
            (("domain.profile", dom_uuid), profile),
            (("domain.stats", dom_uuid), ""),
            (("domain.node", dom_uuid), target_node),
            (("domain.last_node", dom_uuid), ""),
            (("domain.failed_reason", dom_uuid), ""),
            (("domain.storage.volumes", dom_uuid), formatted_rbd_list),
            (("domain.console.log", dom_uuid), ""),
            (("domain.console.vnc", dom_uuid), ""),
            (("domain.meta.autostart", dom_uuid), node_autostart),
            (("domain.meta.migrate_method", dom_uuid), str(migration_method).lower()),
            (("domain.meta.node_limit", dom_uuid), formatted_node_limit),
            (("domain.meta.node_selector", dom_uuid), str(node_selector).lower()),
            (("domain.meta.tags", dom_uuid), ""),
            (("domain.migrate.sync_lock", dom_uuid), ""),
        ]
    )

    # Write each tag as its own subtree of keys
    for tag in tags:
        tag_name = tag["name"]
        zkhandler.write(
            [
                (("domain.meta.tags", dom_uuid, "tag.name", tag_name), tag["name"]),
                (("domain.meta.tags", dom_uuid, "tag.type", tag_name), tag["type"]),
                (
                    ("domain.meta.tags", dom_uuid, "tag.protected", tag_name),
                    tag["protected"],
                ),
            ]
        )

    return True, 'Added new VM with Name "{}" and UUID "{}" to database.'.format(
        dom_name, dom_uuid
    )
2018-09-20 03:25:58 -04:00
2020-11-07 14:45:24 -05:00
2021-11-06 03:02:43 -04:00
def modify_vm_metadata(
    zkhandler,
    domain,
    node_limit,
    node_selector,
    node_autostart,
    provisioner_profile,
    migration_method,
):
    """
    Update the PVC metadata fields of a VM.

    Only arguments that are not None are applied; passing all-None
    arguments is an error.
    """
    dom_uuid = getDomainUUID(zkhandler, domain)
    if not dom_uuid:
        return False, 'ERROR: Could not find VM "{}" in the cluster!'.format(domain)

    # Collect one Zookeeper write per provided field
    updates = []
    if node_limit is not None:
        updates.append((("domain.meta.node_limit", dom_uuid), node_limit))
    if node_selector is not None:
        updates.append(
            (("domain.meta.node_selector", dom_uuid), str(node_selector).lower())
        )
    if node_autostart is not None:
        updates.append((("domain.meta.autostart", dom_uuid), node_autostart))
    if provisioner_profile is not None:
        updates.append((("domain.profile", dom_uuid), provisioner_profile))
    if migration_method is not None:
        updates.append(
            (("domain.meta.migrate_method", dom_uuid), str(migration_method).lower())
        )

    if not updates:
        return False, "ERROR: No updates to apply."

    zkhandler.write(updates)

    return True, 'Successfully modified PVC metadata of VM "{}".'.format(domain)
2020-11-07 14:45:24 -05:00
2021-07-13 19:04:56 -04:00
def modify_vm_tag(zkhandler, domain, action, tag, protected=False):
    """
    Add or remove a user tag on a VM.

    Only "user"-type, unprotected tags may be removed.
    """
    dom_uuid = getDomainUUID(zkhandler, domain)
    if not dom_uuid:
        return False, 'ERROR: Could not find VM "{}" in the cluster!'.format(domain)

    if action == "add":
        zkhandler.write(
            [
                (("domain.meta.tags", dom_uuid, "tag.name", tag), tag),
                (("domain.meta.tags", dom_uuid, "tag.type", tag), "user"),
                (("domain.meta.tags", dom_uuid, "tag.protected", tag), protected),
            ]
        )
        return True, 'Successfully added tag "{}" to VM "{}".'.format(tag, domain)

    if action == "remove":
        if not zkhandler.exists(("domain.meta.tags", dom_uuid, "tag", tag)):
            return False, 'The tag "{}" does not exist.'.format(tag)

        # Non-user (system) tags are managed elsewhere and may not be removed here
        if zkhandler.read(("domain.meta.tags", dom_uuid, "tag.type", tag)) != "user":
            return (
                False,
                'The tag "{}" is not a user tag and cannot be removed.'.format(tag),
            )

        # The protected flag is stored as a string; parse it before testing
        if bool(
            strtobool(
                zkhandler.read(("domain.meta.tags", dom_uuid, "tag.protected", tag))
            )
        ):
            return False, 'The tag "{}" is protected and cannot be removed.'.format(tag)

        zkhandler.delete([("domain.meta.tags", dom_uuid, "tag", tag)])
        return True, 'Successfully removed tag "{}" from VM "{}".'.format(tag, domain)

    return False, "Specified tag action is not available."
2021-07-13 01:46:50 -04:00
2021-05-29 21:17:19 -04:00
def modify_vm(zkhandler, domain, restart, new_vm_config):
    """
    Replace the XML configuration of an existing VM.

    Validates that the new XML parses, that the VM's current node has
    enough memory and vCPUs for the new configuration, updates SR-IOV VF
    "used" records for any added or removed direct/hostdev networks, and
    writes the new XML and RBD volume list to Zookeeper. If restart is
    truthy, the VM is placed in the "restart" state afterwards.

    Returns a (success, message) tuple.
    """
    dom_uuid = getDomainUUID(zkhandler, domain)
    if not dom_uuid:
        return False, 'ERROR: Could not find VM "{}" in the cluster!'.format(domain)
    dom_name = getDomainName(zkhandler, domain)

    # Parse and validate the XML
    try:
        parsed_xml = lxml.objectify.fromstring(new_vm_config)
    except Exception:
        return False, "ERROR: Failed to parse new XML data."

    # Get our old network list for comparison purposes
    old_vm_config = zkhandler.read(("domain.xml", dom_uuid))
    old_parsed_xml = lxml.objectify.fromstring(old_vm_config)
    old_dnetworks = common.getDomainNetworks(old_parsed_xml, {})

    # Validate the new RAM against the current active node
    node_name = zkhandler.read(("domain.node", dom_uuid))
    node_total_memory = int(zkhandler.read(("node.memory.total", node_name)))
    if int(parsed_xml.memory.text) >= node_total_memory:
        return (
            False,
            'ERROR: Updated VM configuration specifies more memory ({} MiB) than node "{}" has available ({} MiB).'.format(
                parsed_xml.memory.text, node_name, node_total_memory
            ),
        )

    # Validate the number of vCPUs against the current active node
    # (2 vCPUs are reserved for the node itself)
    node_total_cpus = int(zkhandler.read(("node.data.static", node_name)).split()[0])
    if (node_total_cpus - 2) <= int(parsed_xml.vcpu.text):
        return (
            False,
            'ERROR: Updated VM configuration specifies more vCPUs ({}) than node "{}" has available ({} minus 2).'.format(
                parsed_xml.vcpu.text, node_name, node_total_cpus
            ),
        )

    # If a SR-IOV network device is being added, set its used state
    dnetworks = common.getDomainNetworks(parsed_xml, {})
    for network in dnetworks:
        # Ignore networks that are already there
        if network["source"] in [net["source"] for net in old_dnetworks]:
            continue

        if network["type"] in ["direct", "hostdev"]:
            dom_node = zkhandler.read(("domain.node", dom_uuid))

            # Check if the network is already in use
            is_used = zkhandler.read(
                ("node.sriov.vf", dom_node, "sriov_vf.used", network["source"])
            )
            if is_used == "True":
                used_by_name = searchClusterByUUID(
                    zkhandler,
                    zkhandler.read(
                        (
                            "node.sriov.vf",
                            dom_node,
                            "sriov_vf.used_by",
                            network["source"],
                        )
                    ),
                )
                return (
                    False,
                    'ERROR: Attempted to use SR-IOV network "{}" which is already used by VM "{}" on node "{}".'.format(
                        network["source"], used_by_name, dom_node
                    ),
                )

            # We must update the "used" section
            set_sriov_vf_vm(
                zkhandler,
                dom_uuid,
                dom_node,
                network["source"],
                network["mac"],
                network["type"],
            )

    # If a SR-IOV network device is being removed, unset its used state
    # (comparison is by MAC address, not by source device)
    for network in old_dnetworks:
        if network["type"] in ["direct", "hostdev"]:
            if network["mac"] not in [n["mac"] for n in dnetworks]:
                dom_node = zkhandler.read(("domain.node", dom_uuid))
                # We must update the "used" section
                unset_sriov_vf_vm(zkhandler, dom_node, network["source"])

    # Obtain the RBD disk list using the common functions
    ddisks = common.getDomainDisks(parsed_xml, {})
    rbd_list = []
    for disk in ddisks:
        if disk["type"] == "rbd":
            rbd_list.append(disk["name"])

    # Join the RBD list into a comma-separated string for storage
    if isinstance(rbd_list, list) and rbd_list:
        formatted_rbd_list = ",".join(rbd_list)
    else:
        formatted_rbd_list = ""

    # Add the modified config to Zookeeper
    zkhandler.write(
        [
            (("domain", dom_uuid), dom_name),
            (("domain.storage.volumes", dom_uuid), formatted_rbd_list),
            (("domain.xml", dom_uuid), new_vm_config),
        ]
    )

    if restart:
        change_state(zkhandler, dom_uuid, "restart")

    return True, 'Successfully modified configuration of VM "{}".'.format(domain)
2018-09-20 03:25:58 -04:00
2020-11-07 14:45:24 -05:00
2021-05-29 21:17:19 -04:00
def dump_vm(zkhandler, domain):
    """Return the raw libvirt XML configuration of a VM as (success, xml)."""
    dom_uuid = getDomainUUID(zkhandler, domain)
    if not dom_uuid:
        return False, 'ERROR: Could not find VM "{}" in the cluster!'.format(domain)

    # Grab the stored domain XML and hand it back
    return True, zkhandler.read(("domain.xml", dom_uuid))
2019-03-12 21:09:54 -04:00
2020-11-07 14:45:24 -05:00
2021-05-29 21:17:19 -04:00
def rename_vm(zkhandler, domain, new_domain):
    """
    Rename a VM, including its RBD volumes.

    The VM must be in the "stop" or "disable" state. Each RBD volume whose
    name contains the old domain name is renamed, the old name is replaced
    throughout the XML configuration, and the VM is undefined and then
    redefined under the new name with its previous metadata. If the VM was
    migrated, its last_node record is restored afterwards.

    Returns a (success, message) tuple.
    """
    dom_uuid = getDomainUUID(zkhandler, domain)
    if not dom_uuid:
        return False, 'ERROR: Could not find VM "{}" in the cluster!'.format(domain)

    # Verify that the VM is in a stopped state; renaming is not supported otherwise
    state = zkhandler.read(("domain.state", dom_uuid))
    if state not in ["stop", "disable"]:
        return (
            False,
            'ERROR: VM "{}" is not in stopped state; VMs cannot be renamed while running.'.format(
                domain
            ),
        )

    # Parse and validate the XML
    vm_config = common.getDomainXML(zkhandler, dom_uuid)

    # Obtain the RBD disk list using the common functions;
    # disk names are stored as "pool/volume"
    ddisks = common.getDomainDisks(vm_config, {})
    pool_list = []
    rbd_list = []
    for disk in ddisks:
        if disk["type"] == "rbd":
            pool_list.append(disk["name"].split("/")[0])
            rbd_list.append(disk["name"].split("/")[1])

    # Rename each volume in turn
    for idx, rbd in enumerate(rbd_list):
        rbd_new = re.sub(r"{}".format(domain), new_domain, rbd)
        # Skip renaming if nothing changed
        if rbd_new == rbd:
            continue
        ceph.rename_volume(zkhandler, pool_list[idx], rbd, rbd_new)

    # Replace the name in the config (plain string substitution over the XML)
    vm_config_new = (
        lxml.etree.tostring(vm_config, encoding="ascii", method="xml")
        .decode()
        .replace(domain, new_domain)
    )

    # Get VM information so the redefined VM keeps its metadata
    _b, dom_info = get_info(zkhandler, dom_uuid)

    # Undefine the old VM
    undefine_vm(zkhandler, dom_uuid)

    # Define the new VM
    define_vm(
        zkhandler,
        vm_config_new,
        dom_info["node"],
        dom_info["node_limit"],
        dom_info["node_selector"],
        dom_info["node_autostart"],
        migration_method=dom_info["migration_method"],
        profile=dom_info["profile"],
        tags=dom_info["tags"],
        initial_state="stop",
    )

    # If the VM is migrated, store that
    if dom_info["migrated"] != "no":
        zkhandler.write([(("domain.last_node", dom_uuid), dom_info["last_node"])])

    return True, 'Successfully renamed VM "{}" to "{}".'.format(domain, new_domain)
2021-05-29 21:17:19 -04:00
def undefine_vm(zkhandler, domain):
    """Remove a VM's configuration from the cluster without touching its disks."""
    # Validate that VM exists in cluster
    dom_uuid = getDomainUUID(zkhandler, domain)
    if not dom_uuid:
        return False, 'ERROR: Could not find VM "{}" in the cluster!'.format(domain)

    # Power the VM off first if it is not already stopped
    if zkhandler.read(("domain.state", dom_uuid)) != "stop":
        change_state(zkhandler, dom_uuid, "stop")

    # Gracefully terminate the class instances on the nodes
    change_state(zkhandler, dom_uuid, "delete")

    # Delete the configuration keys from Zookeeper
    zkhandler.delete([("domain", dom_uuid)])

    return True, 'Undefined VM "{}" from the cluster.'.format(domain)
2020-11-07 14:45:24 -05:00
2021-05-29 21:17:19 -04:00
def remove_vm(zkhandler, domain):
    """Remove a VM and all of its RBD disks from the cluster."""
    # Validate that VM exists in cluster
    dom_uuid = getDomainUUID(zkhandler, domain)
    if not dom_uuid:
        return False, 'ERROR: Could not find VM "{}" in the cluster!'.format(domain)

    disk_list = common.getDomainDiskList(zkhandler, dom_uuid)

    # Power the VM off first if it is not already stopped
    if zkhandler.read(("domain.state", dom_uuid)) != "stop":
        change_state(zkhandler, dom_uuid, "stop")

    # Wait for 1 second to allow state to flow to all nodes
    time.sleep(1)

    # Remove each disk; entries are in "vmpool/vmname_volume" form
    for disk in disk_list:
        try:
            disk_pool, disk_name = disk.split("/")
        except ValueError:
            # Not a pool/volume path; nothing to remove
            continue

        retcode, message = ceph.remove_volume(zkhandler, disk_pool, disk_name)
        if not retcode:
            # An already-missing volume is fine; any other failure aborts
            if re.match("^ERROR: No volume with name", message):
                continue
            return False, message

    # Gracefully terminate the class instances
    change_state(zkhandler, dom_uuid, "delete")

    # Wait for 1/2 second to allow state to flow to all nodes
    time.sleep(0.5)

    # Delete the VM configuration from Zookeeper
    zkhandler.delete([("domain", dom_uuid)])

    return True, 'Removed VM "{}" and its disks from the cluster.'.format(domain)
2018-09-20 03:25:58 -04:00
2020-11-07 14:45:24 -05:00
2021-05-29 21:17:19 -04:00
def start_vm(zkhandler, domain):
    """Request that a defined VM be started on its assigned node."""
    # Validate that VM exists in cluster
    dom_uuid = getDomainUUID(zkhandler, domain)
    if not dom_uuid:
        return False, 'ERROR: Could not find VM "{}" in the cluster!'.format(domain)

    # Flag the VM to start; the node daemon picks this up
    change_state(zkhandler, dom_uuid, "start")

    return True, 'Starting VM "{}".'.format(domain)
2018-09-20 03:25:58 -04:00
2020-11-07 14:45:24 -05:00
2021-05-29 21:17:19 -04:00
def restart_vm(zkhandler, domain, wait=False):
    """Restart a running VM, optionally blocking until the restart completes."""
    # Validate that VM exists in cluster
    dom_uuid = getDomainUUID(zkhandler, domain)
    if not dom_uuid:
        return False, 'ERROR: Could not find VM "{}" in the cluster!'.format(domain)

    # Only a running VM may be restarted
    if zkhandler.read(("domain.state", dom_uuid)) != "start":
        return False, 'ERROR: VM "{}" is not in "start" state!'.format(domain)

    retmsg = 'Restarting VM "{}".'.format(domain)

    # Flag the VM to restart; the node daemon picks this up
    change_state(zkhandler, dom_uuid, "restart")

    if wait:
        # Block until the transient "restart" state clears
        while zkhandler.read(("domain.state", dom_uuid)) == "restart":
            time.sleep(0.5)
        retmsg = 'Restarted VM "{}"'.format(domain)

    return True, retmsg
2020-11-07 14:45:24 -05:00
2021-05-29 21:17:19 -04:00
def shutdown_vm(zkhandler, domain, wait=False):
    """Gracefully shut down a running VM, optionally blocking until complete."""
    # Validate that VM exists in cluster
    dom_uuid = getDomainUUID(zkhandler, domain)
    if not dom_uuid:
        return False, 'ERROR: Could not find VM "{}" in the cluster!'.format(domain)

    # Only a running VM may be shut down
    if zkhandler.read(("domain.state", dom_uuid)) != "start":
        return False, 'ERROR: VM "{}" is not in "start" state!'.format(domain)

    retmsg = 'Shutting down VM "{}"'.format(domain)

    # Flag the VM to shut down; the node daemon picks this up
    change_state(zkhandler, dom_uuid, "shutdown")

    if wait:
        # Block until the transient "shutdown" state clears
        while zkhandler.read(("domain.state", dom_uuid)) == "shutdown":
            time.sleep(0.5)
        retmsg = 'Shut down VM "{}"'.format(domain)

    return True, retmsg
2018-09-20 03:25:58 -04:00
2020-11-07 14:45:24 -05:00
2021-05-29 21:17:19 -04:00
def stop_vm(zkhandler, domain):
    """Forcibly stop (hard power-off) a VM."""
    # Validate that VM exists in cluster
    dom_uuid = getDomainUUID(zkhandler, domain)
    if not dom_uuid:
        return False, 'ERROR: Could not find VM "{}" in the cluster!'.format(domain)

    # Flag the VM to stop; the node daemon picks this up
    change_state(zkhandler, dom_uuid, "stop")

    return True, 'Forcibly stopping VM "{}".'.format(domain)
2018-09-20 03:25:58 -04:00
2020-11-07 14:45:24 -05:00
2021-11-06 03:53:44 -04:00
def disable_vm(zkhandler, domain, force=False):
    """
    Disable a VM, first shutting it down gracefully if running
    (or hard-stopping it when force is True).
    """
    # Validate that VM exists in cluster
    dom_uuid = getDomainUUID(zkhandler, domain)
    if not dom_uuid:
        return False, 'ERROR: Could not find VM "{}" in the cluster!'.format(domain)

    # Bring the VM down first if it is currently running
    if zkhandler.read(("domain.state", dom_uuid)) in ["start"]:
        if not force:
            change_state(zkhandler, dom_uuid, "shutdown")
            # Wait for the graceful shutdown to complete
            while zkhandler.read(("domain.state", dom_uuid)) != "stop":
                time.sleep(0.5)
        else:
            change_state(zkhandler, dom_uuid, "stop")
            # Wait for the command to be registered by the node
            time.sleep(0.5)

    # Set the VM to disable
    change_state(zkhandler, dom_uuid, "disable")

    return True, 'Disabled VM "{}".'.format(domain)
2019-10-23 23:37:42 -04:00
2020-11-07 14:45:24 -05:00
2021-06-21 23:18:34 -04:00
def update_vm_sriov_nics(zkhandler, dom_uuid, source_node, target_node):
    """
    Update SR-IOV VF usage records when a VM moves between nodes.

    Parses the VM's stored XML for direct/hostdev (SR-IOV) NICs, claims the
    matching VF on target_node via set_sriov_vf_vm(), and frees the VF on
    source_node via unset_sriov_vf_vm(). Used during migrations but called
    by the node-side daemon.

    Returns (retcode, retmsg): retcode is False if any required VF on
    target_node is already claimed by another VM; retmsg then names the
    first conflicting VM.
    """
    # Update all the SR-IOV device states on both nodes, used during migrations but called by the node-side
    vm_config = zkhandler.read(("domain.xml", dom_uuid))
    parsed_xml = lxml.objectify.fromstring(vm_config)
    dnetworks = common.getDomainNetworks(parsed_xml, {})
    retcode = True
    retmsg = ""
    for network in dnetworks:
        # Only direct/hostdev NICs are SR-IOV-backed; others need no VF bookkeeping
        if network["type"] in ["direct", "hostdev"]:
            # Check if the network is already in use
            is_used = zkhandler.read(
                ("node.sriov.vf", target_node, "sriov_vf.used", network["source"])
            )
            if is_used == "True":
                # Resolve the name of the VM currently holding this VF
                used_by_name = searchClusterByUUID(
                    zkhandler,
                    zkhandler.read(
                        (
                            "node.sriov.vf",
                            target_node,
                            "sriov_vf.used_by",
                            network["source"],
                        )
                    ),
                )

                # NOTE(review): only the first conflict sets retmsg; if retcode
                # is already False, retcode_this keeps its value from the
                # previous SR-IOV NIC iteration — confirm this carry-over is
                # intended rather than always marking the conflict.
                if retcode:
                    retcode_this = False
                    retmsg = 'Attempting to use SR-IOV network "{}" which is already used by VM "{}"'.format(
                        network["source"], used_by_name
                    )
            else:
                retcode_this = True

            # We must update the "used" section
            if retcode_this:
                # This conditional ensures that if we failed the is_used check,
                # we don't try to overwrite the information of a VF that
                # belongs to another VM
                set_sriov_vf_vm(
                    zkhandler,
                    dom_uuid,
                    target_node,
                    network["source"],
                    network["mac"],
                    network["type"],
                )
            # ... but we still want to free the old node's VF in any case
            unset_sriov_vf_vm(zkhandler, source_node, network["source"])

            # A single failed NIC marks the whole update as failed
            if not retcode_this:
                retcode = retcode_this

    return retcode, retmsg
2021-05-29 21:17:19 -04:00
def move_vm(zkhandler, domain, target_node, wait=False, force_live=False):
    """
    Permanently move a VM to another node.

    Unlike migrate_vm(), the previous node is not recorded, so the move
    cannot be undone with unmigrate_vm(). A running VM transitions through
    the transient "migrate" (or "migrate-live" when force_live is set)
    state; a non-running VM keeps its current state and only the node
    assignment changes.

    zkhandler:   active ZKHandler connection
    domain:      VM name or UUID
    target_node: destination node, or None/empty to auto-select one
    wait:        if True, block until the migration completes
    force_live:  if True, never fall back from live migration

    Returns (success, message).
    """
    # Validate that VM exists in cluster
    dom_uuid = getDomainUUID(zkhandler, domain)
    if not dom_uuid:
        return False, 'ERROR: Could not find VM "{}" in the cluster!'.format(domain)

    # Get state and verify we're OK to proceed
    current_state = zkhandler.read(("domain.state", dom_uuid))
    if current_state != "start":
        # If the current state isn't start, preserve it; we're not doing live migration
        target_state = current_state
    else:
        if force_live:
            target_state = "migrate-live"
        else:
            target_state = "migrate"

    current_node = zkhandler.read(("domain.node", dom_uuid))

    if not target_node:
        # No explicit target given: let the scheduler choose one
        target_node = common.findTargetNode(zkhandler, dom_uuid)
    else:
        # Verify node is valid
        valid_node = common.verifyNode(zkhandler, target_node)
        if not valid_node:
            return False, 'ERROR: Specified node "{}" is invalid.'.format(target_node)

        # Check if node is within the limit
        node_limit = zkhandler.read(("domain.meta.node_limit", dom_uuid))
        if node_limit and target_node not in node_limit.split(","):
            return (
                False,
                'ERROR: Specified node "{}" is not in the allowed list of nodes for VM "{}".'.format(
                    target_node, domain
                ),
            )

        # Verify if node is current node
        if target_node == current_node:
            last_node = zkhandler.read(("domain.last_node", dom_uuid))
            if last_node:
                # Already on the target after a temporary migration: just
                # clear last_node to make the placement permanent
                zkhandler.write([(("domain.last_node", dom_uuid), "")])
                return True, 'Making temporary migration permanent for VM "{}".'.format(
                    domain
                )

            return False, 'ERROR: VM "{}" is already running on node "{}".'.format(
                domain, current_node
            )

    if not target_node:
        return (
            False,
            'ERROR: Could not find a valid migration target for VM "{}".'.format(
                domain
            ),
        )

    retmsg = 'Permanently migrating VM "{}" to node "{}".'.format(domain, target_node)

    # Serialize the state change against other writers of this VM's state
    lock = zkhandler.exclusivelock(("domain.state", dom_uuid))
    with lock:
        zkhandler.write(
            [
                (("domain.state", dom_uuid), target_state),
                (("domain.node", dom_uuid), target_node),
                (("domain.last_node", dom_uuid), ""),
            ]
        )

        # Wait for 1/2 second for migration to start
        time.sleep(0.5)

    # Update any SR-IOV NICs
    update_vm_sriov_nics(zkhandler, dom_uuid, current_node, target_node)

    if wait:
        # The node daemon replaces the transient state once migration is done
        while zkhandler.read(("domain.state", dom_uuid)) == target_state:
            time.sleep(0.5)
        retmsg = 'Permanently migrated VM "{}" to node "{}"'.format(domain, target_node)

    return True, retmsg
2021-11-06 03:02:43 -04:00
def migrate_vm(
    zkhandler, domain, target_node, force_migrate, wait=False, force_live=False
):
    """
    Temporarily migrate a VM to another node, recording the origin.

    The VM's current node is stored as last_node so unmigrate_vm() can
    return it later. A VM that already has a last_node refuses a second
    migration unless force_migrate is set, in which case the original
    last_node is preserved rather than overwritten. A running VM moves via
    the transient "migrate" (or "migrate-live" when force_live is set)
    state; a non-running VM keeps its state and only changes nodes.

    zkhandler:     active ZKHandler connection
    domain:        VM name or UUID
    target_node:   destination node, or None/empty to auto-select one
    force_migrate: allow re-migrating an already-migrated VM
    wait:          if True, block until the migration completes
    force_live:    if True, never fall back from live migration

    Returns (success, message).
    """
    # Validate that VM exists in cluster
    dom_uuid = getDomainUUID(zkhandler, domain)
    if not dom_uuid:
        return False, 'ERROR: Could not find VM "{}" in the cluster!'.format(domain)

    # Get state and verify we're OK to proceed
    current_state = zkhandler.read(("domain.state", dom_uuid))
    if current_state != "start":
        # If the current state isn't start, preserve it; we're not doing live migration
        target_state = current_state
    else:
        if force_live:
            target_state = "migrate-live"
        else:
            target_state = "migrate"

    current_node = zkhandler.read(("domain.node", dom_uuid))
    last_node = zkhandler.read(("domain.last_node", dom_uuid))

    # Refuse a second temporary migration unless explicitly forced
    if last_node and not force_migrate:
        return False, 'ERROR: VM "{}" has been previously migrated.'.format(domain)

    if not target_node:
        # No explicit target given: let the scheduler choose one
        target_node = common.findTargetNode(zkhandler, dom_uuid)
    else:
        # Verify node is valid
        valid_node = common.verifyNode(zkhandler, target_node)
        if not valid_node:
            return False, 'ERROR: Specified node "{}" is invalid.'.format(target_node)

        # Check if node is within the limit
        node_limit = zkhandler.read(("domain.meta.node_limit", dom_uuid))
        if node_limit and target_node not in node_limit.split(","):
            return (
                False,
                'ERROR: Specified node "{}" is not in the allowed list of nodes for VM "{}".'.format(
                    target_node, domain
                ),
            )

        # Verify if node is current node
        if target_node == current_node:
            return False, 'ERROR: VM "{}" is already running on node "{}".'.format(
                domain, current_node
            )

    if not target_node:
        return (
            False,
            'ERROR: Could not find a valid migration target for VM "{}".'.format(
                domain
            ),
        )

    # Don't overwrite an existing last_node when using force_migrate
    real_current_node = current_node  # Used for the SR-IOV update
    if last_node and force_migrate:
        current_node = last_node

    retmsg = 'Migrating VM "{}" to node "{}".'.format(domain, target_node)

    # Serialize the state change against other writers of this VM's state
    lock = zkhandler.exclusivelock(("domain.state", dom_uuid))
    with lock:
        zkhandler.write(
            [
                (("domain.state", dom_uuid), target_state),
                (("domain.node", dom_uuid), target_node),
                (("domain.last_node", dom_uuid), current_node),
            ]
        )

        # Wait for 1/2 second for migration to start
        time.sleep(0.5)

    # Update any SR-IOV NICs
    update_vm_sriov_nics(zkhandler, dom_uuid, real_current_node, target_node)

    if wait:
        # The node daemon replaces the transient state once migration is done
        while zkhandler.read(("domain.state", dom_uuid)) == target_state:
            time.sleep(0.5)
        retmsg = 'Migrated VM "{}" to node "{}"'.format(domain, target_node)

    return True, retmsg
2020-11-07 14:45:24 -05:00
2021-05-29 21:17:19 -04:00
def unmigrate_vm(zkhandler, domain, wait=False, force_live=False):
    """
    Return a previously-migrated VM to the node it was migrated away from.

    The destination is taken from the VM's recorded last_node, which is
    cleared once the move is initiated. A running VM is live-migrated
    ("migrate-live" when force_live is set, otherwise "migrate"); a
    non-running VM keeps its current state and only changes nodes.
    Returns a (success, message) tuple.
    """
    # Resolve the name/UUID to a cluster UUID; bail out if unknown
    vm_uuid = getDomainUUID(zkhandler, domain)
    if not vm_uuid:
        return False, 'ERROR: Could not find VM "{}" in the cluster!'.format(domain)

    # Pick the transient state: preserve a non-start state as-is, otherwise
    # choose the requested migration flavour
    state_now = zkhandler.read(("domain.state", vm_uuid))
    if state_now != "start":
        target_state = state_now
    elif force_live:
        target_state = "migrate-live"
    else:
        target_state = "migrate"

    origin_node = zkhandler.read(("domain.node", vm_uuid))
    return_node = zkhandler.read(("domain.last_node", vm_uuid))

    # No recorded last_node means there is nothing to unmigrate
    if return_node == "":
        return False, 'ERROR: VM "{}" has not been previously migrated.'.format(domain)

    retmsg = 'Unmigrating VM "{}" back to node "{}".'.format(domain, return_node)

    # Hold the state lock while flipping the node assignment back
    with zkhandler.exclusivelock(("domain.state", vm_uuid)):
        zkhandler.write(
            [
                (("domain.state", vm_uuid), target_state),
                (("domain.node", vm_uuid), return_node),
                (("domain.last_node", vm_uuid), ""),
            ]
        )

        # Give the migration a moment to begin
        time.sleep(0.5)

    # Re-home any SR-IOV VFs onto the destination node
    update_vm_sriov_nics(zkhandler, vm_uuid, origin_node, return_node)

    if wait:
        # The node daemon replaces the transient state when the move is done
        while zkhandler.read(("domain.state", vm_uuid)) == target_state:
            time.sleep(0.5)
        retmsg = 'Unmigrated VM "{}" back to node "{}"'.format(domain, return_node)

    return True, retmsg
2018-09-20 03:25:58 -04:00
2020-11-07 14:45:24 -05:00
2021-05-29 21:17:19 -04:00
def get_console_log(zkhandler, domain, lines=1000):
    """
    Retrieve the trailing portion of a VM's stored console log.

    Returns (True, text) containing at most `lines` newline-separated log
    lines, or (False, error_message) if the VM is unknown. A missing log
    buffer yields an empty string.
    """
    # Resolve the name/UUID to a cluster UUID; bail out if unknown
    vm_uuid = getDomainUUID(zkhandler, domain)
    if not vm_uuid:
        return False, 'ERROR: Could not find VM "{}" in the cluster!'.format(domain)

    # Fetch the raw log buffer from Zookeeper
    raw_log = zkhandler.read(("domain.console.log", vm_uuid))
    if raw_log is None:
        return True, ""

    # Keep only the last `lines` entries of the buffer
    tail = raw_log.split("\n")[-lines:]
    return True, "\n".join(tail)
2020-11-07 14:45:24 -05:00
2021-05-29 21:17:19 -04:00
def get_info(zkhandler, domain):
    """
    Fetch the full information dictionary for a single VM.

    Returns (True, info_dict) on success, or (False, error_message) when
    the VM does not exist or its stored XML cannot be interpreted.
    """
    # Resolve the name/UUID to a cluster UUID; bail out if unknown
    vm_uuid = getDomainUUID(zkhandler, domain)
    if not vm_uuid:
        return False, 'ERROR: No VM named "{}" is present in the cluster.'.format(
            domain
        )

    # Build the detail dict from the stored XML and metadata
    vm_information = common.getInformationFromXML(zkhandler, vm_uuid)
    if not vm_information:
        return False, 'ERROR: Could not get information about VM "{}".'.format(domain)

    return True, vm_information
2023-10-17 11:01:38 -04:00
def get_list(
    zkhandler, node=None, state=None, tag=None, limit=None, is_fuzzy=True, negate=False
):
    """
    List VMs in the cluster, optionally filtered.

    node, state, tag: keep only VMs matching the given node, state, or tag
        (with negate=True, keep only VMs NOT matching each given filter).
    limit: a name regex or a full UUID; with is_fuzzy=True a non-UUID limit
        is implicitly wrapped in ".*" on both unanchored ends.

    Returns (True, [vm_info_dict, ...]) sorted by VM name, or
    (False, error_message) on an invalid filter.
    """
    if node is not None:
        # Verify node is valid
        if not common.verifyNode(zkhandler, node):
            return False, 'Specified node "{}" is invalid.'.format(node)

    if state is not None:
        valid_states = [
            "start",
            "restart",
            "shutdown",
            "stop",
            "disable",
            "fail",
            "migrate",
            "unmigrate",
            "provision",
        ]
        if state not in valid_states:
            return False, 'VM state "{}" is not valid.'.format(state)

    full_vm_list = zkhandler.children("base.domain")
    full_vm_list.sort()

    # Set our limit to a sensible regex
    if limit is not None:
        # Check if the limit is a UUID
        is_limit_uuid = False
        try:
            uuid_obj = UUID(limit, version=4)
            limit = str(uuid_obj)
            is_limit_uuid = True
        except ValueError:
            pass

        if is_fuzzy and not is_limit_uuid:
            try:
                # Implicitly assume fuzzy limits: wrap unanchored ends in ".*"
                if not re.match(r"\^.*", limit):
                    limit = ".*" + limit
                if not re.match(r".*\$", limit):
                    limit = limit + ".*"
            except Exception as e:
                return False, "Regex Error: {}".format(e)

    # Map of VM UUID -> whether it passes all active filters
    get_vm_info = dict()
    for vm in full_vm_list:
        name = zkhandler.read(("domain", vm))
        is_limit_match = False
        is_tag_match = False
        is_node_match = False
        is_state_match = False

        # Check on limit
        if limit is not None:
            # Try to match the limit against the UUID (if applicable) and name
            try:
                if is_limit_uuid and re.fullmatch(limit, vm):
                    is_limit_match = True
                if re.fullmatch(limit, name):
                    is_limit_match = True
            except Exception as e:
                return False, "Regex Error: {}".format(e)
        else:
            is_limit_match = True

        # Check on tag (negate inverts the membership test)
        if tag is not None:
            vm_tags = zkhandler.children(("domain.meta.tags", vm))
            if negate and tag not in vm_tags:
                is_tag_match = True
            if not negate and tag in vm_tags:
                is_tag_match = True
        else:
            is_tag_match = True

        # Check on node
        if node is not None:
            vm_node = zkhandler.read(("domain.node", vm))
            if negate and vm_node != node:
                is_node_match = True
            if not negate and vm_node == node:
                is_node_match = True
        else:
            is_node_match = True

        # Check on state
        if state is not None:
            vm_state = zkhandler.read(("domain.state", vm))
            if negate and vm_state != state:
                is_state_match = True
            if not negate and vm_state == state:
                is_state_match = True
        else:
            is_state_match = True

        get_vm_info[vm] = (
            True
            if is_limit_match and is_tag_match and is_node_match and is_state_match
            else False
        )

    # Obtain our VM data in a thread pool
    # This helps parallelize the numerous Zookeeper calls a bit, within the bounds of the GIL, and
    # should help prevent this task from becoming absurdly slow with very large numbers of VMs.
    # The max_workers is capped at 32 to avoid creating an absurd number of threads especially if
    # the list gets called multiple times simultaneously by the API, but still provides a noticeable
    # speedup.
    vm_execute_list = [vm for vm in full_vm_list if get_vm_info[vm]]
    vm_data_list = list()
    with ThreadPoolExecutor(max_workers=32, thread_name_prefix="vm_list") as executor:
        futures = []
        for vm_uuid in vm_execute_list:
            futures.append(
                executor.submit(common.getInformationFromXML, zkhandler, vm_uuid)
            )
        for future in futures:
            # Skip any VM whose detail lookup raised (e.g. removed mid-listing)
            try:
                vm_data_list.append(future.result())
            except Exception:
                pass

    return True, sorted(vm_data_list, key=lambda d: d["name"])
2023-10-17 10:15:06 -04:00
def backup_vm(
    zkhandler, domain, backup_path, incremental_parent=None, retain_snapshot=False
):
    """
    Back up a VM's configuration and RBD volumes to a local directory tree.

    Creates backup_path/<domain>/<datestring>/ containing a pvcbackup.json
    metadata file plus a pvcdisks/ directory with one RBD export per volume:
    a full "rbd export" image, or an "rbd export-diff" against the snapshot
    of incremental_parent (a prior backup's datestring) when one is given.

    retain_snapshot keeps the backup_<datestring> snapshots on the cluster
    (needed to serve as a future incremental parent); it cannot be combined
    with incremental_parent. backup_path must be an absolute path that
    exists on the node running this call.

    Returns (success, message).
    """
    tstart = time.time()

    # 0. Validations
    # Disallow retaining snapshots with an incremental parent
    if incremental_parent is not None and retain_snapshot:
        return (
            False,
            "ERROR: Retaining snapshots of incremental backups is not supported!",
        )

    # Validate that VM exists in cluster
    dom_uuid = getDomainUUID(zkhandler, domain)
    if not dom_uuid:
        return False, 'ERROR: Could not find VM "{}" in the cluster!'.format(domain)

    # Validate that the target path is valid
    if not re.match(r"^/", backup_path):
        return (
            False,
            f"ERROR: Target path {backup_path} is not a valid absolute path on the primary coordinator!",
        )

    # Ensure that backup_path (on this node) exists
    if not os.path.isdir(backup_path):
        return False, f"ERROR: Target path {backup_path} does not exist!"

    # 1. Get information about VM
    vm_detail = get_list(zkhandler, limit=dom_uuid, is_fuzzy=False)[1][0]
    if not isinstance(vm_detail, dict):
        return False, f"ERROR: VM listing returned invalid data: {vm_detail}"

    # Collect a (pool, volume, size) tuple for each RBD-backed disk
    vm_volumes = list()
    for disk in vm_detail["disks"]:
        if disk["type"] != "rbd":
            continue

        pool, volume = disk["name"].split("/")

        retcode, retdata = ceph.get_list_volume(zkhandler, pool, volume, is_fuzzy=False)
        if not retcode or len(retdata) != 1:
            if len(retdata) < 1:
                retdata = "No volumes returned."
            elif len(retdata) > 1:
                retdata = "Multiple volumes returned."
            return (
                False,
                f"ERROR: Failed to get volume details for {pool}/{volume}: {retdata}",
            )

        try:
            size = retdata[0]["stats"]["size"]
        except Exception as e:
            return False, f"ERROR: Failed to get volume size for {pool}/{volume}: {e}"

        vm_volumes.append((pool, volume, size))

    # 2a. Validate that all volumes exist (they should, but just in case)
    for pool, volume, _ in vm_volumes:
        if not ceph.verifyVolume(zkhandler, pool, volume):
            return (
                False,
                f"ERROR: VM defines a volume {pool}/{volume} which does not exist!",
            )

    # 2b. Validate that, if an incremental_parent is given, it is valid
    # The incremental parent is just a datestring
    if incremental_parent is not None:
        for pool, volume, _ in vm_volumes:
            if not ceph.verifySnapshot(
                zkhandler, pool, volume, f"backup_{incremental_parent}"
            ):
                return (
                    False,
                    f"ERROR: Incremental parent {incremental_parent} given, but no snapshots were found; cannot export an incremental backup.",
                )

        export_fileext = "rbddiff"
    else:
        export_fileext = "rbdimg"

    # 2c. Validate that there's enough space on the target
    # TODO

    # 3. Set datestring in YYYYMMDDHHMMSS format
    now = datetime.now()
    datestring = now.strftime("%Y%m%d%H%M%S")

    snapshot_name = f"backup_{datestring}"

    # 4. Create destination directory
    vm_target_root = f"{backup_path}/{domain}"
    vm_target_backup = f"{backup_path}/{domain}/{datestring}/pvcdisks"
    if not os.path.isdir(vm_target_backup):
        try:
            os.makedirs(vm_target_backup)
        except Exception as e:
            return False, f"ERROR: Failed to create backup directory: {e}"

    # 5. Take snapshot of each disks with the name @backup_{datestring}
    is_snapshot_create_failed = False
    which_snapshot_create_failed = list()
    msg_snapshot_create_failed = list()
    for pool, volume, _ in vm_volumes:
        retcode, retmsg = ceph.add_snapshot(zkhandler, pool, volume, snapshot_name)
        if not retcode:
            is_snapshot_create_failed = True
            which_snapshot_create_failed.append(f"{pool}/{volume}")
            msg_snapshot_create_failed.append(retmsg)

    if is_snapshot_create_failed:
        # Roll back any snapshots that did get created before failing
        for pool, volume, _ in vm_volumes:
            if ceph.verifySnapshot(zkhandler, pool, volume, snapshot_name):
                ceph.remove_snapshot(zkhandler, pool, volume, snapshot_name)
        return (
            False,
            f'ERROR: Failed to create snapshot for volume(s) {", ".join(which_snapshot_create_failed)}: {", ".join(msg_snapshot_create_failed)}',
        )

    # 6. Dump snapshot to folder with `rbd export` (full) or `rbd export-diff` (incremental)
    is_snapshot_export_failed = False
    which_snapshot_export_failed = list()
    msg_snapshot_export_failed = list()
    for pool, volume, _ in vm_volumes:
        if incremental_parent is not None:
            incremental_parent_snapshot_name = f"backup_{incremental_parent}"
            retcode, stdout, stderr = common.run_os_command(
                f"rbd export-diff --from-snap {incremental_parent_snapshot_name} {pool}/{volume}@{snapshot_name} {vm_target_backup}/{pool}.{volume}.{export_fileext}"
            )
            # run_os_command returns a nonzero retcode on failure
            if retcode:
                is_snapshot_export_failed = True
                which_snapshot_export_failed.append(f"{pool}/{volume}")
                msg_snapshot_export_failed.append(stderr)
        else:
            retcode, stdout, stderr = common.run_os_command(
                f"rbd export --export-format 2 {pool}/{volume}@{snapshot_name} {vm_target_backup}/{pool}.{volume}.{export_fileext}"
            )
            if retcode:
                is_snapshot_export_failed = True
                which_snapshot_export_failed.append(f"{pool}/{volume}")
                msg_snapshot_export_failed.append(stderr)

    if is_snapshot_export_failed:
        # Roll back all snapshots on export failure
        for pool, volume, _ in vm_volumes:
            if ceph.verifySnapshot(zkhandler, pool, volume, snapshot_name):
                ceph.remove_snapshot(zkhandler, pool, volume, snapshot_name)
        return (
            False,
            f'ERROR: Failed to export snapshot for volume(s) {", ".join(which_snapshot_export_failed)}: {", ".join(msg_snapshot_export_failed)}',
        )

    # 7. Create and dump VM backup information
    backup_type = "incremental" if incremental_parent is not None else "full"
    vm_backup = {
        "type": backup_type,
        "datestring": datestring,
        "incremental_parent": incremental_parent,
        "retained_snapshot": retain_snapshot,
        "vm_detail": vm_detail,
        "backup_files": [
            (f"pvcdisks/{p}.{v}.{export_fileext}", s) for p, v, s in vm_volumes
        ],
    }
    with open(f"{vm_target_root}/{datestring}/pvcbackup.json", "w") as fh:
        jdump(vm_backup, fh)

    # 8. Remove snapshots if retain_snapshot is False
    is_snapshot_remove_failed = False
    which_snapshot_remove_failed = list()
    msg_snapshot_remove_failed = list()
    if not retain_snapshot:
        for pool, volume, _ in vm_volumes:
            if ceph.verifySnapshot(zkhandler, pool, volume, snapshot_name):
                retcode, retmsg = ceph.remove_snapshot(
                    zkhandler, pool, volume, snapshot_name
                )
                if not retcode:
                    is_snapshot_remove_failed = True
                    which_snapshot_remove_failed.append(f"{pool}/{volume}")
                    msg_snapshot_remove_failed.append(retmsg)

    tend = time.time()
    ttot = round(tend - tstart, 2)

    retlines = list()

    # Snapshot-removal failures are reported as warnings, not hard errors
    if is_snapshot_remove_failed:
        retlines.append(
            f"WARNING: Failed to remove snapshot(s) as requested for volume(s) {', '.join(which_snapshot_remove_failed)}: {', '.join(msg_snapshot_remove_failed)}"
        )

    myhostname = gethostname().split(".")[0]
    if retain_snapshot:
        retlines.append(
            f"Successfully backed up VM '{domain}' ({backup_type} @ {datestring}, snapshots retained) to '{myhostname}:{backup_path}' in {ttot}s."
        )
    else:
        retlines.append(
            f"Successfully backed up VM '{domain}' ({backup_type} @ {datestring}) to '{myhostname}:{backup_path}' in {ttot}s."
        )

    return True, "\n".join(retlines)
2023-10-24 01:20:44 -04:00
def remove_backup(zkhandler, domain, backup_path, datestring):
    """
    Remove a single VM backup (identified by datestring) from a local path.

    Deletes the backup's on-disk directory tree and, if the backup metadata
    recorded retained snapshots, also removes the matching
    backup_<datestring> RBD snapshots from the cluster. Failures to remove
    snapshots or files are reported as warnings in the message but do not
    make the call fail.

    Returns (success, message).
    """
    tstart = time.time()

    # 0. Validation
    # Validate that VM exists in cluster
    dom_uuid = getDomainUUID(zkhandler, domain)
    if not dom_uuid:
        return False, 'ERROR: Could not find VM "{}" in the cluster!'.format(domain)

    # Validate that the source path is valid
    if not re.match(r"^/", backup_path):
        return (
            False,
            f"ERROR: Source path {backup_path} is not a valid absolute path on the primary coordinator!",
        )

    # Ensure that backup_path (on this node) exists
    if not os.path.isdir(backup_path):
        return False, f"ERROR: Source path {backup_path} does not exist!"

    # Ensure that domain path (on this node) exists
    vm_backup_path = f"{backup_path}/{domain}"
    if not os.path.isdir(vm_backup_path):
        return False, f"ERROR: Source VM path {vm_backup_path} does not exist!"

    # Ensure that the archives are present
    backup_source_pvcbackup_file = f"{vm_backup_path}/{datestring}/pvcbackup.json"
    if not os.path.isfile(backup_source_pvcbackup_file):
        return False, "ERROR: The specified source backup files do not exist!"

    backup_source_pvcdisks_path = f"{vm_backup_path}/{datestring}/pvcdisks"
    if not os.path.isdir(backup_source_pvcdisks_path):
        return False, "ERROR: The specified source backup files do not exist!"

    # 1. Read the backup file and get VM details
    try:
        with open(backup_source_pvcbackup_file) as fh:
            backup_source_details = jload(fh)
    except Exception as e:
        return False, f"ERROR: Failed to read source backup details: {e}"

    # 2. Remove snapshots
    is_snapshot_remove_failed = False
    which_snapshot_remove_failed = list()
    msg_snapshot_remove_failed = list()
    if backup_source_details["retained_snapshot"]:
        # Volume files are recorded as ("pvcdisks/<pool>.<volume>.<ext>", size)
        for volume_file, _ in backup_source_details.get("backup_files"):
            pool, volume, _ = volume_file.split("/")[-1].split(".")
            snapshot = f"backup_{datestring}"
            retcode, retmsg = ceph.remove_snapshot(zkhandler, pool, volume, snapshot)
            if not retcode:
                is_snapshot_remove_failed = True
                which_snapshot_remove_failed.append(f"{pool}/{volume}")
                msg_snapshot_remove_failed.append(retmsg)

    # 3. Remove files
    is_files_remove_failed = False
    msg_files_remove_failed = None
    try:
        rmtree(f"{vm_backup_path}/{datestring}")
    except Exception as e:
        is_files_remove_failed = True
        msg_files_remove_failed = e

    tend = time.time()
    ttot = round(tend - tstart, 2)
    retlines = list()

    if is_snapshot_remove_failed:
        retlines.append(
            f"WARNING: Failed to remove snapshot(s) as requested for volume(s) {', '.join(which_snapshot_remove_failed)}: {', '.join(msg_snapshot_remove_failed)}"
        )

    if is_files_remove_failed:
        retlines.append(
            f"WARNING: Failed to remove backup file(s) from {backup_path}: {msg_files_remove_failed}"
        )

    myhostname = gethostname().split(".")[0]
    retlines.append(
        f"Removed VM backup {datestring} for '{domain}' from '{myhostname}:{backup_path}' in {ttot}s."
    )

    return True, "\n".join(retlines)
2023-10-24 01:20:44 -04:00
def restore_vm(zkhandler, domain, backup_path, datestring, retain_snapshot=False):
    """
    Restore a VM backup created by backup_vm() into this cluster.

    Reads the pvcbackup.json manifest from {backup_path}/{domain}/{datestring},
    defines the VM from the stored configuration (in "restore" state), imports
    each backed-up RBD volume (applying the incremental diff on top of the
    parent image when the backup is incremental), optionally registers the
    backup snapshot(s) in PVC if retain_snapshot is set, then starts the VM.

    Returns a (success, message) tuple; on failure the message starts with
    "ERROR:". Snapshot-removal failures are non-fatal and are reported as a
    WARNING line in the success message instead.
    """
    tstart = time.time()

    # 0. Validations
    # Validate that the VM does not already exist in the cluster
    dom_uuid = getDomainUUID(zkhandler, domain)
    if dom_uuid:
        return (
            False,
            f'ERROR: VM "{domain}" already exists in the cluster! Remove or rename it before restoring a backup.',
        )

    # Validate that the source path is valid
    if not re.match(r"^/", backup_path):
        return (
            False,
            f"ERROR: Source path {backup_path} is not a valid absolute path on the primary coordinator!",
        )

    # Ensure that backup_path (on this node) exists
    if not os.path.isdir(backup_path):
        return False, f"ERROR: Source path {backup_path} does not exist!"

    # Ensure that domain path (on this node) exists
    vm_backup_path = f"{backup_path}/{domain}"
    if not os.path.isdir(vm_backup_path):
        return False, f"ERROR: Source VM path {vm_backup_path} does not exist!"

    # Ensure that the archives are present
    backup_source_pvcbackup_file = f"{vm_backup_path}/{datestring}/pvcbackup.json"
    if not os.path.isfile(backup_source_pvcbackup_file):
        return False, "ERROR: The specified source backup files do not exist!"

    # 1. Read the backup file and get VM details
    try:
        with open(backup_source_pvcbackup_file) as fh:
            backup_source_details = jload(fh)
    except Exception as e:
        return False, f"ERROR: Failed to read source backup details: {e}"

    # Handle incrementals: an incremental backup also requires its parent's
    # manifest (and images) to be present in the same VM backup directory
    incremental_parent = backup_source_details.get("incremental_parent", None)
    if incremental_parent is not None:
        backup_source_parent_pvcbackup_file = (
            f"{vm_backup_path}/{incremental_parent}/pvcbackup.json"
        )
        if not os.path.isfile(backup_source_parent_pvcbackup_file):
            return (
                False,
                "ERROR: The specified backup is incremental but the required incremental parent source backup files do not exist!",
            )

        try:
            with open(backup_source_parent_pvcbackup_file) as fh:
                backup_source_parent_details = jload(fh)
        except Exception as e:
            return (
                False,
                f"ERROR: Failed to read source incremental parent backup details: {e}",
            )

    # 2. Import VM config and metadata in provision ("restore") state
    try:
        retcode, retmsg = define_vm(
            zkhandler,
            backup_source_details["vm_detail"]["xml"],
            backup_source_details["vm_detail"]["node"],
            backup_source_details["vm_detail"]["node_limit"],
            backup_source_details["vm_detail"]["node_selector"],
            backup_source_details["vm_detail"]["node_autostart"],
            backup_source_details["vm_detail"]["migration_method"],
            backup_source_details["vm_detail"]["profile"],
            backup_source_details["vm_detail"]["tags"],
            "restore",
        )
        if not retcode:
            return False, f"ERROR: Failed to define restored VM: {retmsg}"
    except Exception as e:
        return False, f"ERROR: Failed to parse VM backup details: {e}"

    # 3. Import volumes
    is_snapshot_remove_failed = False
    which_snapshot_remove_failed = list()
    msg_snapshot_remove_failed = list()
    if incremental_parent is not None:
        for volume_file, volume_size in backup_source_details.get("backup_files"):
            pool, volume, _ = volume_file.split("/")[-1].split(".")

            # Match this diff file against its parent's full image by basename
            try:
                parent_volume_file = [
                    f[0]
                    for f in backup_source_parent_details.get("backup_files")
                    if f[0].split("/")[-1].replace(".rbdimg", "")
                    == volume_file.split("/")[-1].replace(".rbddiff", "")
                ][0]
            except Exception as e:
                return (
                    False,
                    f"ERROR: Failed to find parent volume for volume {pool}/{volume}; backup may be corrupt or invalid: {e}",
                )

            # First we create the expected volumes then clean them up
            #   This process is a bit of a hack because rbd import does not expect an existing volume,
            #   but we need the information in PVC.
            #   Thus create the RBD volume using ceph.add_volume based on the backup size, and then
            #   manually remove the RBD volume (leaving the PVC metainfo)
            retcode, retmsg = ceph.add_volume(zkhandler, pool, volume, volume_size)
            if not retcode:
                return False, f"ERROR: Failed to create restored volume: {retmsg}"

            retcode, stdout, stderr = common.run_os_command(
                f"rbd remove {pool}/{volume}"
            )
            if retcode:
                return (
                    False,
                    f"ERROR: Failed to remove temporary RBD volume '{pool}/{volume}': {stderr}",
                )

            # Next we import the parent image
            retcode, stdout, stderr = common.run_os_command(
                f"rbd import --export-format 2 --dest-pool {pool} {backup_path}/{domain}/{incremental_parent}/{parent_volume_file} {volume}"
            )
            if retcode:
                return (
                    False,
                    f"ERROR: Failed to import parent backup image {parent_volume_file}: {stderr}",
                )

            # Then we import the incremental diffs
            retcode, stdout, stderr = common.run_os_command(
                f"rbd import-diff {backup_path}/{domain}/{datestring}/{volume_file} {pool}/{volume}"
            )
            if retcode:
                return (
                    False,
                    f"ERROR: Failed to import incremental backup image {volume_file}: {stderr}",
                )

            # Finally we handle the parent snapshot (no longer required)
            if retain_snapshot:
                # The snapshot already exists on the RBD volume after the
                # import, so register it in PVC metadata only (zk_only)
                retcode, retmsg = ceph.add_snapshot(
                    zkhandler,
                    pool,
                    volume,
                    f"backup_{incremental_parent}",
                    zk_only=True,
                )
                if not retcode:
                    return (
                        False,
                        f"ERROR: Failed to add imported image snapshot for {parent_volume_file}: {retmsg}",
                    )
            else:
                retcode, stdout, stderr = common.run_os_command(
                    f"rbd snap rm {pool}/{volume}@backup_{incremental_parent}"
                )
                if retcode:
                    # Non-fatal: collect and report as a WARNING at the end
                    is_snapshot_remove_failed = True
                    which_snapshot_remove_failed.append(f"{pool}/{volume}")
                    # BUGFIX: report the actual rbd error (stderr); previously
                    # this appended the stale retmsg from ceph.add_volume()
                    msg_snapshot_remove_failed.append(stderr)

            # Remove the child (datestring) snapshot
            # NOTE(review): this also runs when retain_snapshot is set, so only
            # the parent snapshot is retained for incremental restores — confirm
            # this is intended
            retcode, stdout, stderr = common.run_os_command(
                f"rbd snap rm {pool}/{volume}@backup_{datestring}"
            )
            if retcode:
                is_snapshot_remove_failed = True
                which_snapshot_remove_failed.append(f"{pool}/{volume}")
                # BUGFIX: as above, use stderr rather than the stale retmsg
                msg_snapshot_remove_failed.append(stderr)
    else:
        for volume_file, volume_size in backup_source_details.get("backup_files"):
            pool, volume, _ = volume_file.split("/")[-1].split(".")

            # First we create the expected volumes then clean them up
            #   This process is a bit of a hack because rbd import does not expect an existing volume,
            #   but we need the information in PVC.
            #   Thus create the RBD volume using ceph.add_volume based on the backup size, and then
            #   manually remove the RBD volume (leaving the PVC metainfo)
            retcode, retmsg = ceph.add_volume(zkhandler, pool, volume, volume_size)
            if not retcode:
                return False, f"ERROR: Failed to create restored volume: {retmsg}"

            retcode, stdout, stderr = common.run_os_command(
                f"rbd remove {pool}/{volume}"
            )
            if retcode:
                return (
                    False,
                    f"ERROR: Failed to remove temporary RBD volume '{pool}/{volume}': {stderr}",
                )

            # Then we perform the actual import
            retcode, stdout, stderr = common.run_os_command(
                f"rbd import --export-format 2 --dest-pool {pool} {backup_path}/{domain}/{datestring}/{volume_file} {volume}"
            )
            if retcode:
                return (
                    False,
                    f"ERROR: Failed to import backup image {volume_file}: {stderr}",
                )

            # Finally we handle the source snapshot (not required)
            if retain_snapshot:
                # Register the imported snapshot in PVC metadata only
                retcode, retmsg = ceph.add_snapshot(
                    zkhandler,
                    pool,
                    volume,
                    f"backup_{datestring}",
                    zk_only=True,
                )
                if not retcode:
                    return (
                        False,
                        f"ERROR: Failed to add imported image snapshot for {volume_file}: {retmsg}",
                    )
            else:
                retcode, stdout, stderr = common.run_os_command(
                    f"rbd snap rm {pool}/{volume}@backup_{datestring}"
                )
                if retcode:
                    return (
                        False,
                        f"ERROR: Failed to remove imported image snapshot for {volume_file}: {stderr}",
                    )

    # 4. Start VM
    retcode, retmsg = start_vm(zkhandler, domain)
    if not retcode:
        return False, f"ERROR: Failed to start restored VM {domain}: {retmsg}"

    tend = time.time()
    ttot = round(tend - tstart, 2)
    retlines = list()

    if is_snapshot_remove_failed:
        retlines.append(
            f"WARNING: Failed to remove hanging snapshot(s) as requested for volume(s) {', '.join(which_snapshot_remove_failed)}: {', '.join(msg_snapshot_remove_failed)}"
        )

    myhostname = gethostname().split(".")[0]
    retlines.append(
        f"Successfully restored VM backup {datestring} for '{domain}' from '{myhostname}:{backup_path}' in {ttot}s."
    )

    return True, "\n".join(retlines)
2023-11-05 22:32:41 -05:00
#
# Celery worker tasks (must be run on node, outputs log messages to worker)
#
def vm_worker_helper_getdom(tuuid):
    """
    Look up and return the libvirt domain object for the given text UUID.

    Opens a transient connection to the local libvirt daemon; on any failure
    (connection or lookup), prints the error and returns None. The connection
    is always closed before returning.
    """
    # libvirt lookups want the binary form of the UUID
    uuid_bytes = UUID(tuuid).bytes

    conn = None
    domain_obj = None
    try:
        conn = lvopen("qemu:///system")
        if conn is None:
            raise Exception("Failed to open local libvirt connection")
        domain_obj = conn.lookupByUUID(uuid_bytes)
    except Exception as e:
        print(f"Error: {e}")
        domain_obj = None
    finally:
        if conn is not None:
            conn.close()

    return domain_obj
def vm_worker_flush_locks(zkhandler, celery, domain, force_unlock=False):
    """
    Celery worker: flush (remove) all RBD locks on a VM's storage volumes.

    Unless force_unlock is set, the VM must be in the stop, disable, or fail
    state. Progress is reported to the Celery task via start/update/finish;
    any error fails the task via fail() and returns early.
    """
    current_stage = 0
    total_stages = 3
    start(
        celery,
        f"Flushing RBD locks for VM {domain} [forced={force_unlock}]",
        current=current_stage,
        total=total_stages,
    )

    dom_uuid = getDomainUUID(zkhandler, domain)

    # Check that the domain is stopped (unless force_unlock is set)
    domain_state = zkhandler.read(("domain.state", dom_uuid))
    if not force_unlock and domain_state not in ["stop", "disable", "fail"]:
        fail(
            celery,
            f"VM state {domain_state} not in [stop, disable, fail] and not forcing",
        )
        return

    # Get the list of RBD images
    rbd_list = zkhandler.read(("domain.storage.volumes", dom_uuid)).split(",")

    current_stage += 1
    update(
        celery,
        f"Obtaining RBD locks for VM {domain}",
        current=current_stage,
        total=total_stages,
    )

    # Prepare a list of locks
    rbd_locks = list()
    for rbd in rbd_list:
        # Check if a lock exists
        (
            lock_list_retcode,
            lock_list_stdout,
            lock_list_stderr,
        ) = common.run_os_command(f"rbd lock list --format json {rbd}")
        if lock_list_retcode != 0:
            fail(
                celery,
                f"Failed to obtain lock list for volume {rbd}: {lock_list_stderr}",
            )
            return

        try:
            lock_list = jloads(lock_list_stdout)
        except Exception as e:
            fail(
                celery,
                f"Failed to parse JSON lock list for volume {rbd}: {e}",
            )
            return

        if lock_list:
            for lock in lock_list:
                rbd_locks.append({"rbd": rbd, "lock": lock})

    current_stage += 1
    update(
        celery,
        f"Freeing RBD locks for VM {domain}",
        current=current_stage,
        total=total_stages,
    )

    for _lock in rbd_locks:
        rbd = _lock["rbd"]
        lock = _lock["lock"]

        (
            lock_remove_retcode,
            lock_remove_stdout,
            lock_remove_stderr,
        ) = common.run_os_command(
            f"rbd lock remove {rbd} \"{lock['id']}\" \"{lock['locker']}\""
        )
        if lock_remove_retcode != 0:
            fail(
                celery,
                f"Failed to free RBD lock {lock['id']} on volume {rbd}: {lock_remove_stderr}",
            )
            return

    current_stage += 1
    return finish(
        celery,
        f"Successfully flushed RBD locks for VM {domain}",
        # BUGFIX: report the tracked stage counters (3/3) instead of the
        # previous hard-coded 4/4, which disagreed with total_stages above
        current=current_stage,
        total=total_stages,
    )
def vm_worker_attach_device(zkhandler, celery, domain, xml_spec):
    """
    Celery worker: hot-attach an XML device definition to a running VM.

    The VM must be in the "start" state and resolvable via libvirt; otherwise
    the task is failed via fail() and the function returns early.
    """
    stage = 0
    stage_count = 1
    start(
        celery,
        f"Hot-attaching XML device to VM {domain}",
        current=stage,
        total=stage_count,
    )

    dom_uuid = getDomainUUID(zkhandler, domain)

    # Hot-attach only makes sense against a running domain
    vm_state = zkhandler.read(("domain.state", dom_uuid))
    if vm_state != "start":
        fail(
            celery,
            f"VM {domain} not in start state; hot-attach unnecessary or impossible",
        )
        return

    lv_dom = vm_worker_helper_getdom(dom_uuid)
    if lv_dom is None:
        fail(
            celery,
            f"Failed to find Libvirt object for VM {domain}",
        )
        return

    try:
        lv_dom.attachDevice(xml_spec)
    except Exception as e:
        fail(celery, e)
        return

    stage += 1
    return finish(
        celery,
        f"Successfully hot-attached XML device to VM {domain}",
        current=stage,
        total=stage_count,
    )
def vm_worker_detach_device(zkhandler, celery, domain, xml_spec):
    """
    Celery worker: hot-detach an XML device definition from a running VM.

    The VM must be in the "start" state and resolvable via libvirt; otherwise
    the task is failed via fail() and the function returns early.
    """
    current_stage = 0
    total_stages = 1
    start(
        celery,
        f"Hot-detaching XML device from VM {domain}",
        current=current_stage,
        # BUGFIX: was passed as total_stages=..., inconsistent with every
        # other start()/update()/finish() call, which uses the total= keyword
        total=total_stages,
    )

    dom_uuid = getDomainUUID(zkhandler, domain)

    # Hot-detach only makes sense against a running domain
    state = zkhandler.read(("domain.state", dom_uuid))
    if state not in ["start"]:
        fail(
            celery,
            f"VM {domain} not in start state; hot-detach unnecessary or impossible",
        )
        return

    dom = vm_worker_helper_getdom(dom_uuid)
    if dom is None:
        fail(
            celery,
            f"Failed to find Libvirt object for VM {domain}",
        )
        return

    try:
        dom.detachDevice(xml_spec)
    except Exception as e:
        fail(celery, e)
        return

    current_stage += 1
    return finish(
        celery,
        f"Successfully hot-detached XML device from VM {domain}",
        current=current_stage,
        # BUGFIX: was total_stages=...; see note on the start() call above
        total=total_stages,
    )