Move all host provisioner steps to a try block

Make the provisioner a bit more robust. This way, even if a provisioning
step fails, cleanup is still performed, thus preventing the system from
being left in an undefined state requiring manual correction.

Addresses #91
This commit is contained in:
Joshua Boniface 2020-08-06 12:24:04 -04:00
parent ccee124c8b
commit 5526e13da9
1 changed files with 206 additions and 194 deletions

View File

@ -1388,232 +1388,244 @@ def create_vm(self, vm_name, vm_profile, define_vm=True, start_vm=True, script_r
vm_schema += libvirt_schema.libvirt_footer vm_schema += libvirt_schema.libvirt_footer
print("Final VM schema:\n{}\n".format(vm_schema)) print("Final VM schema:\n{}\n".format(vm_schema))
# Phase 5 - definition
# * Create the VM in the PVC cluster
self.update_state(state='RUNNING', meta={'current': 5, 'total': 10, 'status': 'Defining VM on the cluster'})
time.sleep(1)
if define_vm: # All the following steps may require cleanup later on, so catch them here and do cleanup in a Finally block
print("Defining VM on cluster") try:
node_limit = vm_data['system_details']['node_limit'] # Phase 5 - definition
if node_limit: # * Create the VM in the PVC cluster
node_limit = node_limit.split(',') self.update_state(state='RUNNING', meta={'current': 5, 'total': 10, 'status': 'Defining VM on the cluster'})
node_selector = vm_data['system_details']['node_selector'] time.sleep(1)
node_autostart = vm_data['system_details']['node_autostart']
retcode, retmsg = pvc_vm.define_vm(zk_conn, vm_schema.strip(), target_node, node_limit, node_selector, node_autostart, vm_profile, initial_state='provision')
print(retmsg)
else:
print("Skipping VM definition")
# Phase 6 - disk creation if define_vm:
# * Create each Ceph storage volume for the disks print("Defining VM on cluster")
self.update_state(state='RUNNING', meta={'current': 6, 'total': 10, 'status': 'Creating storage volumes'}) node_limit = vm_data['system_details']['node_limit']
time.sleep(1) if node_limit:
node_limit = node_limit.split(',')
for volume in vm_data['volumes']: node_selector = vm_data['system_details']['node_selector']
if volume.get('source_volume') is not None: node_autostart = vm_data['system_details']['node_autostart']
success, message = pvc_ceph.clone_volume(zk_conn, volume['pool'], "{}_{}".format(vm_name, volume['disk_id']), volume['source_volume']) retcode, retmsg = pvc_vm.define_vm(zk_conn, vm_schema.strip(), target_node, node_limit, node_selector, node_autostart, vm_profile, initial_state='provision')
print(message) print(retmsg)
if not success:
raise ClusterError('Failed to clone volume "{}" to "{}".'.format(volume['source_volume'], volume['disk_id']))
else: else:
success, message = pvc_ceph.add_volume(zk_conn, volume['pool'], "{}_{}".format(vm_name, volume['disk_id']), "{}G".format(volume['disk_size_gb'])) print("Skipping VM definition")
print(message)
if not success:
raise ClusterError('Failed to create volume "{}".'.format(volume['disk_id']))
# Phase 7 - disk mapping # Phase 6 - disk creation
# * Map each volume to the local host in order # * Create each Ceph storage volume for the disks
# * Format each volume with any specified filesystems self.update_state(state='RUNNING', meta={'current': 6, 'total': 10, 'status': 'Creating storage volumes'})
# * If any mountpoints are specified, create a temporary mount directory time.sleep(1)
# * Mount any volumes to their respective mountpoints
self.update_state(state='RUNNING', meta={'current': 7, 'total': 10, 'status': 'Mapping, formatting, and mounting storage volumes locally'}) for volume in vm_data['volumes']:
time.sleep(1)
for volume in vm_data['volumes']:
dst_volume_name = "{}_{}".format(vm_name, volume['disk_id'])
dst_volume = "{}/{}".format(volume['pool'], dst_volume_name)
if is_ova_install:
src_volume_name = volume['volume_name']
src_volume = "{}/{}".format(volume['pool'], src_volume_name)
print('Converting {} source volume {} to raw format on {}'.format(volume['volume_format'], src_volume, dst_volume))
# Map the target RBD device
retcode, retmsg = pvc_ceph.map_volume(zk_conn, volume['pool'], dst_volume_name)
if not retcode:
raise ProvisioningError('Failed to map destination volume "{}": {}'.format(dst_volume_name, retmsg))
# Map the source RBD device
retcode, retmsg = pvc_ceph.map_volume(zk_conn, volume['pool'], src_volume_name)
if not retcode:
raise ProvisioningError('Failed to map source volume "{}": {}'.format(src_volume_name, retmsg))
# Convert from source to target
retcode, stdout, stderr = pvc_common.run_os_command(
'qemu-img convert -C -f {} -O raw {} {}'.format(
volume['volume_format'],
"/dev/rbd/{}".format(src_volume),
"/dev/rbd/{}".format(dst_volume)
)
)
if retcode:
raise ProvisioningError('Failed to convert {} volume "{}" to raw volume "{}": {}'.format(volume['volume_format'], src_volume, dst_volume, stderr))
# Unmap the source RBD device (don't bother later)
retcode, retmsg = pvc_ceph.unmap_volume(zk_conn, volume['pool'], src_volume_name)
if not retcode:
raise ProvisioningError('Failed to unmap source volume "{}": {}'.format(src_volume_name, retmsg))
# Unmap the target RBD device (don't bother later)
retcode, retmsg = pvc_ceph.unmap_volume(zk_conn, volume['pool'], dst_volume_name)
if not retcode:
raise ProvisioningError('Failed to unmap destination volume "{}": {}'.format(dst_volume_name, retmsg))
else:
if volume.get('source_volume') is not None: if volume.get('source_volume') is not None:
continue success, message = pvc_ceph.clone_volume(zk_conn, volume['pool'], "{}_{}".format(vm_name, volume['disk_id']), volume['source_volume'])
print(message)
if volume.get('filesystem') is None: if not success:
continue raise ProvisioningError('Failed to clone volume "{}" to "{}".'.format(volume['source_volume'], volume['disk_id']))
print("Creating {} filesystem on {}:\n{}".format(volume['filesystem'], dst_volume, stdout))
filesystem_args_list = list()
for arg in volume['filesystem_args'].split():
arg_entry, arg_data = arg.split('=')
filesystem_args_list.append(arg_entry)
filesystem_args_list.append(arg_data)
filesystem_args = ' '.join(filesystem_args_list)
# Map the RBD device
retcode, retmsg = pvc_ceph.map_volume(zk_conn, volume['pool'], dst_volume_name)
if not retcode:
raise ProvisioningError('Failed to map volume "{}": {}'.format(dst_volume, retmsg))
# Create the filesystem
if volume['filesystem'] == 'swap':
retcode, stdout, stderr = pvc_common.run_os_command("mkswap -f /dev/rbd/{}".format(dst_volume))
if retcode:
raise ProvisioningError('Failed to create swap on "{}": {}'.format(dst_volume, stderr))
else: else:
retcode, stdout, stderr = pvc_common.run_os_command("mkfs.{} {} /dev/rbd/{}".format(volume['filesystem'], filesystem_args, dst_volume)) success, message = pvc_ceph.add_volume(zk_conn, volume['pool'], "{}_{}".format(vm_name, volume['disk_id']), "{}G".format(volume['disk_size_gb']))
if retcode: print(message)
raise ProvisioningError('Failed to create {} filesystem on "{}": {}'.format(volume['filesystem'], dst_volume, stderr)) if not success:
raise ProvisioningError('Failed to create volume "{}".'.format(volume['disk_id']))
if is_script_install: # Phase 7 - disk mapping
# Create temporary directory # * Map each volume to the local host in order
retcode, stdout, stderr = pvc_common.run_os_command("mktemp -d") # * Format each volume with any specified filesystems
if retcode: # * If any mountpoints are specified, create a temporary mount directory
raise ProvisioningError("Failed to create a temporary directory: {}".format(stderr)) # * Mount any volumes to their respective mountpoints
temp_dir = stdout.strip() self.update_state(state='RUNNING', meta={'current': 7, 'total': 10, 'status': 'Mapping, formatting, and mounting storage volumes locally'})
time.sleep(1)
for volume in vm_data['volumes']: for volume in vm_data['volumes']:
if volume['source_volume'] is not None: dst_volume_name = "{}_{}".format(vm_name, volume['disk_id'])
continue dst_volume = "{}/{}".format(volume['pool'], dst_volume_name)
if not volume['mountpoint'] or volume['mountpoint'] == 'swap': if is_ova_install:
continue src_volume_name = volume['volume_name']
src_volume = "{}/{}".format(volume['pool'], src_volume_name)
mapped_dst_volume = "/dev/rbd/{}/{}_{}".format(volume['pool'], vm_name, volume['disk_id']) print('Converting {} source volume {} to raw format on {}'.format(volume['volume_format'], src_volume, dst_volume))
mount_path = "{}{}".format(temp_dir, volume['mountpoint'])
# Ensure the mount path exists (within the filesystems) # Map the target RBD device
retcode, stdout, stderr = pvc_common.run_os_command("mkdir -p {}".format(mount_path)) retcode, retmsg = pvc_ceph.map_volume(zk_conn, volume['pool'], dst_volume_name)
if not retcode:
raise ProvisioningError('Failed to map destination volume "{}": {}'.format(dst_volume_name, retmsg))
# Map the source RBD device
retcode, retmsg = pvc_ceph.map_volume(zk_conn, volume['pool'], src_volume_name)
if not retcode:
raise ProvisioningError('Failed to map source volume "{}": {}'.format(src_volume_name, retmsg))
# Convert from source to target
retcode, stdout, stderr = pvc_common.run_os_command(
'qemu-img convert -C -f {} -O raw {} {}'.format(
volume['volume_format'],
"/dev/rbd/{}".format(src_volume),
"/dev/rbd/{}".format(dst_volume)
)
)
if retcode:
raise ProvisioningError('Failed to convert {} volume "{}" to raw volume "{}": {}'.format(volume['volume_format'], src_volume, dst_volume, stderr))
# Unmap the source RBD device (don't bother later)
retcode, retmsg = pvc_ceph.unmap_volume(zk_conn, volume['pool'], src_volume_name)
if not retcode:
raise ProvisioningError('Failed to unmap source volume "{}": {}'.format(src_volume_name, retmsg))
# Unmap the target RBD device (don't bother later)
retcode, retmsg = pvc_ceph.unmap_volume(zk_conn, volume['pool'], dst_volume_name)
if not retcode:
raise ProvisioningError('Failed to unmap destination volume "{}": {}'.format(dst_volume_name, retmsg))
else:
if volume.get('source_volume') is not None:
continue
if volume.get('filesystem') is None:
continue
print("Creating {} filesystem on {}:\n{}".format(volume['filesystem'], dst_volume, stdout))
filesystem_args_list = list()
for arg in volume['filesystem_args'].split():
arg_entry, arg_data = arg.split('=')
filesystem_args_list.append(arg_entry)
filesystem_args_list.append(arg_data)
filesystem_args = ' '.join(filesystem_args_list)
# Map the RBD device
retcode, retmsg = pvc_ceph.map_volume(zk_conn, volume['pool'], dst_volume_name)
if not retcode:
raise ProvisioningError('Failed to map volume "{}": {}'.format(dst_volume, retmsg))
# Create the filesystem
if volume['filesystem'] == 'swap':
retcode, stdout, stderr = pvc_common.run_os_command("mkswap -f /dev/rbd/{}".format(dst_volume))
if retcode:
raise ProvisioningError('Failed to create swap on "{}": {}'.format(dst_volume, stderr))
else:
retcode, stdout, stderr = pvc_common.run_os_command("mkfs.{} {} /dev/rbd/{}".format(volume['filesystem'], filesystem_args, dst_volume))
if retcode:
raise ProvisioningError('Failed to create {} filesystem on "{}": {}'.format(volume['filesystem'], dst_volume, stderr))
if is_script_install:
# Create temporary directory
retcode, stdout, stderr = pvc_common.run_os_command("mktemp -d")
if retcode: if retcode:
raise ProvisioningError('Failed to create mountpoint "{}": {}'.format(mount_path, stderr)) raise ProvisioningError("Failed to create a temporary directory: {}".format(stderr))
temp_dir = stdout.strip()
# Mount filesystems to temporary directory for volume in vm_data['volumes']:
retcode, stdout, stderr = pvc_common.run_os_command("mount {} {}".format(mapped_dst_volume, mount_path)) if volume['source_volume'] is not None:
if retcode: continue
raise ProvisioningError('Failed to mount "{}" on "{}": {}'.format(mapped_dst_volume, mount_path, stderr))
print("Successfully mounted {} on {}".format(mapped_dst_volume, mount_path)) if not volume['mountpoint'] or volume['mountpoint'] == 'swap':
continue
# Phase 8 - provisioning script execution mapped_dst_volume = "/dev/rbd/{}/{}_{}".format(volume['pool'], vm_name, volume['disk_id'])
# * Execute the provisioning script main function ("install") passing any custom arguments mount_path = "{}{}".format(temp_dir, volume['mountpoint'])
self.update_state(state='RUNNING', meta={'current': 8, 'total': 10, 'status': 'Executing provisioning script'})
time.sleep(1)
if is_script_install: # Ensure the mount path exists (within the filesystems)
print("Running installer script") retcode, stdout, stderr = pvc_common.run_os_command("mkdir -p {}".format(mount_path))
if retcode:
raise ProvisioningError('Failed to create mountpoint "{}": {}'.format(mount_path, stderr))
# Parse the script arguments # Mount filesystems to temporary directory
script_arguments = dict() retcode, stdout, stderr = pvc_common.run_os_command("mount {} {}".format(mapped_dst_volume, mount_path))
for argument in vm_data['script_arguments']: if retcode:
argument_name, argument_data = argument.split('=') raise ProvisioningError('Failed to mount "{}" on "{}": {}'.format(mapped_dst_volume, mount_path, stderr))
script_arguments[argument_name] = argument_data
# Parse the runtime arguments print("Successfully mounted {} on {}".format(mapped_dst_volume, mount_path))
if script_run_args is not None:
for argument in script_run_args: # Phase 8 - provisioning script execution
# * Execute the provisioning script main function ("install") passing any custom arguments
self.update_state(state='RUNNING', meta={'current': 8, 'total': 10, 'status': 'Executing provisioning script'})
time.sleep(1)
if is_script_install:
print("Running installer script")
# Parse the script arguments
script_arguments = dict()
for argument in vm_data['script_arguments']:
argument_name, argument_data = argument.split('=') argument_name, argument_data = argument.split('=')
script_arguments[argument_name] = argument_data script_arguments[argument_name] = argument_data
print("Script arguments: {}".format(script_arguments)) # Parse the runtime arguments
if script_run_args is not None:
for argument in script_run_args:
argument_name, argument_data = argument.split('=')
script_arguments[argument_name] = argument_data
# Run the script print("Script arguments: {}".format(script_arguments))
try:
installer_script.install(
vm_name=vm_name,
vm_id=vm_id,
temporary_directory=temp_dir,
disks=vm_data['volumes'],
networks=vm_data['networks'],
**script_arguments
)
except:
pass
# Phase 9 - install cleanup # Run the script
# * Unmount any mounted volumes try:
# * Remove any temporary directories installer_script.install(
self.update_state(state='RUNNING', meta={'current': 9, 'total': 10, 'status': 'Cleaning up local mounts and directories'}) vm_name=vm_name,
time.sleep(1) vm_id=vm_id,
temporary_directory=temp_dir,
disks=vm_data['volumes'],
networks=vm_data['networks'],
**script_arguments
)
except Exception as e:
raise ProvisioningError('Failed to run install script: {}'.format(e))
if not is_ova_install: except Exception as e:
for volume in list(reversed(vm_data['volumes'])): start_vm = False
if volume.get('source_volume') is not None: raise e
continue
if is_script_install: # Always perform the cleanup steps
# Unmount the volume finally:
if volume.get('mountpoint') is not None and volume.get('mountpoint') != 'swap': # Phase 9 - install cleanup
print("Cleaning up mount {}{}".format(temp_dir, volume['mountpoint'])) # * Unmount any mounted volumes
# * Remove any temporary directories
self.update_state(state='RUNNING', meta={'current': 9, 'total': 10, 'status': 'Cleaning up local mounts and directories'})
time.sleep(1)
mount_path = "{}{}".format(temp_dir, volume['mountpoint']) if not is_ova_install:
retcode, stdout, stderr = pvc_common.run_os_command("umount {}".format(mount_path)) for volume in list(reversed(vm_data['volumes'])):
if volume.get('source_volume') is not None:
continue
if is_script_install:
# Unmount the volume
if volume.get('mountpoint') is not None and volume.get('mountpoint') != 'swap':
print("Cleaning up mount {}{}".format(temp_dir, volume['mountpoint']))
mount_path = "{}{}".format(temp_dir, volume['mountpoint'])
# Make sure any bind mounts or submounts are unmounted first
if volume['mountpoint'] == '/':
retcode, stdout, stderr = pvc_common.run_os_command("umount {}/**{/**,}".format(mount_path))
retcode, stdout, stderr = pvc_common.run_os_command("umount {}".format(mount_path))
if retcode:
print('Failed to unmount "{}": {}'.format(mount_path, stderr))
# Unmap the RBD device
if volume['filesystem']:
print("Cleaning up RBD mapping /dev/rbd/{}/{}_{}".format(volume['pool'], vm_name, volume['disk_id']))
rbd_volume = "/dev/rbd/{}/{}_{}".format(volume['pool'], vm_name, volume['disk_id'])
retcode, stdout, stderr = pvc_common.run_os_command("rbd unmap {}".format(rbd_volume))
if retcode: if retcode:
raise ProvisioningError('Failed to unmount "{}": {}'.format(mount_path, stderr)) print('Failed to unmap volume "{}": {}'.format(rbd_volume, stderr))
# Unmap the RBD device print("Cleaning up temporary directories and files")
if volume['filesystem']:
print("Cleaning up RBD mapping /dev/rbd/{}/{}_{}".format(volume['pool'], vm_name, volume['disk_id']))
rbd_volume = "/dev/rbd/{}/{}_{}".format(volume['pool'], vm_name, volume['disk_id']) if is_script_install:
retcode, stdout, stderr = pvc_common.run_os_command("rbd unmap {}".format(rbd_volume)) # Remove temporary mount directory (don't fail if not removed)
if retcode: retcode, stdout, stderr = pvc_common.run_os_command("rmdir {}".format(temp_dir))
raise ProvisioningError('Failed to unmap volume "{}": {}'.format(rbd_volume, stderr)) if retcode:
print('Failed to delete temporary directory "{}": {}'.format(temp_dir, stderr))
print("Cleaning up temporary directories and files") # Remote temporary script (don't fail if not removed)
retcode, stdout, stderr = pvc_common.run_os_command("rm -f {}".format(script_file))
if retcode:
print('Failed to delete temporary script file "{}": {}'.format(script_file, stderr))
if is_script_install: # Phase 10 - startup
# Remove temporary mount directory (don't fail if not removed) # * Start the VM in the PVC cluster
retcode, stdout, stderr = pvc_common.run_os_command("rmdir {}".format(temp_dir)) if start_vm:
if retcode: self.update_state(state='RUNNING', meta={'current': 10, 'total': 10, 'status': 'Starting VM'})
print('Failed to delete temporary directory "{}": {}'.format(temp_dir, stderr)) time.sleep(1)
retcode, retmsg = pvc_vm.start_vm(zk_conn, vm_name)
# Remote temporary script (don't fail if not removed) print(retmsg)
retcode, stdout, stderr = pvc_common.run_os_command("rm -f {}".format(script_file))
if retcode:
print('Failed to delete temporary script file "{}": {}'.format(script_file, stderr))
# Phase 10 - startup
# * Start the VM in the PVC cluster
self.update_state(state='RUNNING', meta={'current': 10, 'total': 10, 'status': 'Starting VM'})
time.sleep(1)
if start_vm:
retcode, retmsg = pvc_vm.start_vm(zk_conn, vm_name)
print(retmsg)
pvc_common.stopZKConnection(zk_conn) pvc_common.stopZKConnection(zk_conn)
return {'status': 'VM "{}" with profile "{}" has been provisioned and started successfully'.format(vm_name, vm_profile), 'current': 10, 'total': 10} return {'status': 'VM "{}" with profile "{}" has been provisioned and started successfully'.format(vm_name, vm_profile), 'current': 10, 'total': 10}