Compare commits


4 Commits

SHA1  Message  Date

942de9f15b  Add better exception handling for XML configs  (2024-08-16 10:46:04 -04:00)

9aca8e215b  Run IPMI check 3 times with 2s timeout  (2024-07-28 12:36:01 -04:00)
    Avoids potential timeouts or deadlocks, and retries if a single try fails.

97329bb90d  Sort Ceph pool data by name  (2024-07-22 13:26:27 -04:00)
    There is no guarantee that both commands output the pools in the same order, so sort
    them by name first so the iteration over the pools by ID is successful.

c186015d6f  Add check for invalid profile  (2024-07-13 17:13:40 -04:00)
4 changed files with 103 additions and 25 deletions

View File

@@ -155,10 +155,37 @@ def define_vm(
     # Parse the XML data
     try:
         parsed_xml = lxml.objectify.fromstring(config_data)
-    except Exception:
-        return False, "ERROR: Failed to parse XML data."
-    dom_uuid = parsed_xml.uuid.text
-    dom_name = parsed_xml.name.text
+    except Exception as e:
+        return False, f"ERROR: Failed to parse XML data: {e}"
+
+    # Extract the required items from the XML document and error if not valid
+    next_field = 0
+    next_map = {
+        0: "uuid",
+        1: "name",
+        2: "memory",
+        3: "vcpu",
+        4: "networks",
+        5: "disks",
+    }
+    try:
+        dom_uuid = parsed_xml.uuid.text
+        next_field += 1
+        dom_name = parsed_xml.name.text
+        next_field += 1
+        parsed_memory = int(parsed_xml.memory.text)
+        next_field += 1
+        parsed_vcpu = int(parsed_xml.vcpu.text)
+        next_field += 1
+        dnetworks = common.getDomainNetworks(parsed_xml, {})
+        next_field += 1
+        ddisks = common.getDomainDisks(parsed_xml, {})
+        next_field += 1
+    except Exception as e:
+        return (
+            False,
+            f'ERROR: Failed to parse XML data: field data for "{next_map[next_field]}" is not valid: {e}',
+        )
 
     # Ensure that the UUID and name are unique
     if searchClusterByUUID(zkhandler, dom_uuid) or searchClusterByName(
@@ -181,26 +208,25 @@ def define_vm(
 
     # Validate the new RAM against the current active node
     node_total_memory = int(zkhandler.read(("node.memory.total", target_node)))
-    if int(parsed_xml.memory.text) >= node_total_memory:
+    if parsed_memory >= node_total_memory:
         return (
             False,
             'ERROR: VM configuration specifies more memory ({} MiB) than node "{}" has available ({} MiB).'.format(
-                parsed_xml.memory.text, target_node, node_total_memory
+                parsed_memory, target_node, node_total_memory
             ),
         )
 
     # Validate the number of vCPUs against the current active node
     node_total_cpus = int(zkhandler.read(("node.data.static", target_node)).split()[0])
-    if (node_total_cpus - 2) <= int(parsed_xml.vcpu.text):
+    if parsed_vcpu >= (node_total_cpus - 2):
         return (
             False,
             'ERROR: VM configuration specifies more vCPUs ({}) than node "{}" has available ({} minus 2).'.format(
-                parsed_xml.vcpu.text, target_node, node_total_cpus
+                parsed_vcpu, target_node, node_total_cpus
             ),
         )
 
     # If a SR-IOV network device is being added, set its used state
-    dnetworks = common.getDomainNetworks(parsed_xml, {})
     for network in dnetworks:
         if network["type"] in ["direct", "hostdev"]:
             dom_node = zkhandler.read(("domain.node", dom_uuid))
@@ -239,7 +265,6 @@ def define_vm(
             )
 
     # Obtain the RBD disk list using the common functions
-    ddisks = common.getDomainDisks(parsed_xml, {})
     rbd_list = []
     for disk in ddisks:
         if disk["type"] == "rbd":
@@ -404,6 +429,35 @@ def modify_vm(zkhandler, domain, restart, new_vm_config):
     except Exception:
         return False, "ERROR: Failed to parse new XML data."
 
+    # Extract the required items from the XML document and error if not valid
+    next_field = 0
+    next_map = {
+        0: "uuid",
+        1: "name",
+        2: "memory",
+        3: "vcpu",
+        4: "networks",
+        5: "disks",
+    }
+    try:
+        dom_uuid = parsed_xml.uuid.text
+        next_field += 1
+        dom_name = parsed_xml.name.text
+        next_field += 1
+        parsed_memory = int(parsed_xml.memory.text)
+        next_field += 1
+        parsed_vcpu = int(parsed_xml.vcpu.text)
+        next_field += 1
+        dnetworks = common.getDomainNetworks(parsed_xml, {})
+        next_field += 1
+        ddisks = common.getDomainDisks(parsed_xml, {})
+        next_field += 1
+    except Exception as e:
+        return (
+            False,
+            f'ERROR: Failed to parse XML data: field data for "{next_map[next_field]}" is not valid: {e}',
+        )
+
     # Get our old network list for comparison purposes
     old_vm_config = zkhandler.read(("domain.xml", dom_uuid))
     old_parsed_xml = lxml.objectify.fromstring(old_vm_config)
@@ -412,26 +466,25 @@ def modify_vm(zkhandler, domain, restart, new_vm_config):
 
     # Validate the new RAM against the current active node
     node_name = zkhandler.read(("domain.node", dom_uuid))
     node_total_memory = int(zkhandler.read(("node.memory.total", node_name)))
-    if int(parsed_xml.memory.text) >= node_total_memory:
+    if parsed_memory >= node_total_memory:
         return (
             False,
             'ERROR: Updated VM configuration specifies more memory ({} MiB) than node "{}" has available ({} MiB).'.format(
-                parsed_xml.memory.text, node_name, node_total_memory
+                parsed_memory, node_name, node_total_memory
             ),
         )
 
     # Validate the number of vCPUs against the current active node
     node_total_cpus = int(zkhandler.read(("node.data.static", node_name)).split()[0])
-    if (node_total_cpus - 2) <= int(parsed_xml.vcpu.text):
+    if parsed_vcpu >= (node_total_cpus - 2):
         return (
             False,
             'ERROR: Updated VM configuration specifies more vCPUs ({}) than node "{}" has available ({} minus 2).'.format(
-                parsed_xml.vcpu.text, node_name, node_total_cpus
+                parsed_vcpu, node_name, node_total_cpus
             ),
         )
 
     # If a SR-IOV network device is being added, set its used state
-    dnetworks = common.getDomainNetworks(parsed_xml, {})
     for network in dnetworks:
         # Ignore networks that are already there
         if network["source"] in [net["source"] for net in old_dnetworks]:
@@ -482,7 +535,6 @@ def modify_vm(zkhandler, domain, restart, new_vm_config):
             unset_sriov_vf_vm(zkhandler, dom_node, network["source"])
 
     # Obtain the RBD disk list using the common functions
-    ddisks = common.getDomainDisks(parsed_xml, {})
     rbd_list = []
     for disk in ddisks:
         if disk["type"] == "rbd":
@@ -754,7 +806,15 @@ def update_vm_sriov_nics(zkhandler, dom_uuid, source_node, target_node):
     # Update all the SR-IOV device states on both nodes, used during migrations but called by the node-side
     vm_config = zkhandler.read(("domain.xml", dom_uuid))
     parsed_xml = lxml.objectify.fromstring(vm_config)
-    dnetworks = common.getDomainNetworks(parsed_xml, {})
+    # Extract the required items from the XML document and error if not valid
+    try:
+        dnetworks = common.getDomainNetworks(parsed_xml, {})
+    except Exception as e:
+        return (
+            False,
+            f'ERROR: Failed to parse XML data: field data for "networks" is not valid: {e}',
+        )
+
     retcode = True
     retmsg = ""
     for network in dnetworks:
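
For reference, the field-by-field validation pattern introduced in this file can be shown in isolation. Below is a minimal, self-contained sketch; the sample XML document, the reduced field list, and the validate_fields() helper are invented for illustration and are not the actual PVC VM schema or API:

    import lxml.objectify

    SAMPLE_XML = b"""
    <domain>
      <uuid>6b1f4c2e-0000-0000-0000-000000000000</uuid>
      <name>testvm</name>
      <memory>2048</memory>
      <vcpu>not-a-number</vcpu>
    </domain>
    """

    def validate_fields(config_data):
        parsed_xml = lxml.objectify.fromstring(config_data)
        # next_field tracks how far parsing got, so the error message can
        # name the first field whose data failed to parse.
        next_field = 0
        next_map = {0: "uuid", 1: "name", 2: "memory", 3: "vcpu"}
        try:
            dom_uuid = parsed_xml.uuid.text
            next_field += 1
            dom_name = parsed_xml.name.text
            next_field += 1
            parsed_memory = int(parsed_xml.memory.text)
            next_field += 1
            parsed_vcpu = int(parsed_xml.vcpu.text)
            next_field += 1
        except Exception as e:
            return False, f'field data for "{next_map[next_field]}" is not valid: {e}'
        return True, (dom_uuid, dom_name, parsed_memory, parsed_vcpu)

    print(validate_fields(SAMPLE_XML))
    # Expected: (False, 'field data for "vcpu" is not valid: invalid literal for int() ...')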

View File

@@ -258,6 +258,13 @@ def worker_create_vm(
     args = (vm_profile,)
     db_cur.execute(query, args)
     profile_data = db_cur.fetchone()
+    if profile_data is None:
+        fail(
+            celery,
+            f'Provisioner profile "{vm_profile}" is not present on the cluster',
+            exception=ClusterError,
+        )
+
     if profile_data.get("arguments"):
         vm_data["script_arguments"] = profile_data.get("arguments").split("|")
     else:
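
The guard added here follows the usual fetchone() pattern: a missing row comes back as None, and failing early with a clear message beats the AttributeError the later .get() call would otherwise raise. A minimal sketch of the same idea, assuming a hypothetical SQLite database with a "profile" table; the real code uses the cluster database plus its fail()/ClusterError helpers:

    import sqlite3

    def lookup_profile(db_path, vm_profile):
        conn = sqlite3.connect(db_path)
        conn.row_factory = sqlite3.Row
        cur = conn.cursor()
        cur.execute("SELECT * FROM profile WHERE name = ?", (vm_profile,))
        profile_data = cur.fetchone()
        # fetchone() returns None when no row matched; bail out before any
        # attribute access on the (nonexistent) row.
        if profile_data is None:
            raise ValueError(
                f'Provisioner profile "{vm_profile}" is not present on the cluster'
            )
        return dict(profile_data)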

View File

@@ -69,26 +69,33 @@ class MonitoringPluginScript(MonitoringPlugin):
         # Run any imports first
         from daemon_lib.common import run_os_command
+        from time import sleep
 
         # Check the node's IPMI interface
         ipmi_hostname = self.config["ipmi_hostname"]
         ipmi_username = self.config["ipmi_username"]
         ipmi_password = self.config["ipmi_password"]
-        retcode, _, _ = run_os_command(
-            f"/usr/bin/ipmitool -I lanplus -H {ipmi_hostname} -U {ipmi_username} -P {ipmi_password} chassis power status",
-            timeout=5
-        )
+        retcode = 1
+        trycount = 0
+        while retcode > 0 and trycount < 3:
+            retcode, _, _ = run_os_command(
+                f"/usr/bin/ipmitool -I lanplus -H {ipmi_hostname} -U {ipmi_username} -P {ipmi_password} chassis power status",
+                timeout=2
+            )
+            trycount += 1
+            if retcode > 0 and trycount < 3:
+                sleep(trycount)
 
         if retcode > 0:
             # Set the health delta to 10 (subtract 10 from the total of 100)
             health_delta = 10
             # Craft a message that can be used by the clients
-            message = f"IPMI via {ipmi_username}@{ipmi_hostname} is NOT responding"
+            message = f"IPMI via {ipmi_username}@{ipmi_hostname} is NOT responding after 3 attempts"
         else:
             # Set the health delta to 0 (no change)
             health_delta = 0
             # Craft a message that can be used by the clients
-            message = f"IPMI via {ipmi_username}@{ipmi_hostname} is responding"
+            message = f"IPMI via {ipmi_username}@{ipmi_hostname} is responding after {trycount} attempts"
 
         # Set the health delta in our local PluginResult object
         self.plugin_result.set_health_delta(health_delta)
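
The retry logic amounts to: up to three attempts, each with a 2-second timeout, sleeping a little longer after each failure so a transiently busy BMC gets time to recover. A standalone sketch of that loop, with run_check() standing in for the real run_os_command()/ipmitool invocation:

    import subprocess
    from time import sleep

    def run_check(command, timeout):
        # Return the command's exit code, treating a timeout as a failure.
        try:
            result = subprocess.run(command, shell=True, capture_output=True, timeout=timeout)
            return result.returncode
        except subprocess.TimeoutExpired:
            return 1

    def check_with_retries(command, attempts=3, timeout=2):
        retcode = 1
        trycount = 0
        while retcode > 0 and trycount < attempts:
            retcode = run_check(command, timeout)
            trycount += 1
            # Back off a little longer after each failed attempt (1s, then 2s),
            # but never sleep after the final attempt.
            if retcode > 0 and trycount < attempts:
                sleep(trycount)
        return retcode, trycount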

View File

@@ -157,7 +157,9 @@ def collect_ceph_stats(logger, config, zkhandler, this_node, queue):
         1
     ].decode("ascii")
     try:
-        ceph_pool_df_raw = json.loads(ceph_df_output)["pools"]
+        ceph_pool_df_raw = sorted(
+            json.loads(ceph_df_output)["pools"], key=lambda x: x["name"]
+        )
     except Exception as e:
         logger.out("Failed to obtain Pool data (ceph df): {}".format(e), state="w")
         ceph_pool_df_raw = []
@@ -166,7 +168,9 @@ def collect_ceph_stats(logger, config, zkhandler, this_node, queue):
         "rados df --format json", timeout=1
     )
     try:
-        rados_pool_df_raw = json.loads(stdout)["pools"]
+        rados_pool_df_raw = sorted(
+            json.loads(stdout)["pools"], key=lambda x: x["name"]
+        )
     except Exception as e:
         logger.out("Failed to obtain Pool data (rados df): {}".format(e), state="w")
         rados_pool_df_raw = []
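
The reason both pool lists are sorted is that later code pairs the "ceph df" and "rados df" results positionally; if the two commands list the pools in different orders, stats get attached to the wrong pool. A small illustrative sketch with invented sample data:

    # Two listings of the same pools, returned in different orders.
    ceph_pool_df_raw = [{"name": "vms", "stored": 10}, {"name": "images", "stored": 99}]
    rados_pool_df_raw = [{"name": "images", "num_objects": 4}, {"name": "vms", "num_objects": 7}]

    # Sorting both by name guarantees index i refers to the same pool in each list.
    ceph_pool_df_raw = sorted(ceph_pool_df_raw, key=lambda x: x["name"])
    rados_pool_df_raw = sorted(rados_pool_df_raw, key=lambda x: x["name"])

    for ceph_pool, rados_pool in zip(ceph_pool_df_raw, rados_pool_df_raw):
        assert ceph_pool["name"] == rados_pool["name"]
        print({**ceph_pool, **rados_pool})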