Compare commits

...

2 Commits

Author SHA1 Message Date
0f24184b78 Explicitly clear resources of fenced node
This actually solves the bug originally "fixed" in
5f1432ccdd without breaking VM resource
allocations for working nodes.
2023-12-11 12:14:56 -05:00
1ba37fe33d Restore VM resource allocation location
Commit 5f1432ccdd changed where these allocations
happen due to a bug after fencing. However, this completely broke node
resource reporting, as only the final instance will be queried here.

Revert this change and look further into the original bug.
2023-12-11 11:52:59 -05:00
2 changed files with 25 additions and 5 deletions

View File

@@ -115,6 +115,27 @@ def fence_node(node_name, zkhandler, config, logger):
):
migrateFromFencedNode(zkhandler, node_name, config, logger)
# Reset all node resource values
logger.out(
f"Resetting all resource values for dead node {node_name} to zero",
state="i",
prefix=f"fencing {node_name}",
)
zkhandler.write(
[
(("node.running_domains", node_name), "0"),
(("node.count.provisioned_domains", node_name), "0"),
(("node.cpu.load", node_name), "0"),
(("node.vcpu.allocated", node_name), "0"),
(("node.memory.total", node_name), "0"),
(("node.memory.used", node_name), "0"),
(("node.memory.free", node_name), "0"),
(("node.memory.allocated", node_name), "0"),
(("node.memory.provisioned", node_name), "0"),
(("node.monitoring.health", node_name), None),
]
)
# Migrate hosts away from a fenced node
def migrateFromFencedNode(zkhandler, node_name, config, logger):

View File

@@ -477,6 +477,10 @@ def collect_vm_stats(logger, config, zkhandler, this_node, queue):
fixed_d_domain = this_node.d_domain.copy()
for domain, instance in fixed_d_domain.items():
if domain in this_node.domain_list:
# Add the allocated memory to our memalloc value
memalloc += instance.getmemory()
memprov += instance.getmemory()
vcpualloc += instance.getvcpus()
if instance.getstate() == "start" and instance.getnode() == this_node.name:
if instance.getdom() is not None:
try:
@@ -532,11 +536,6 @@ def collect_vm_stats(logger, config, zkhandler, this_node, queue):
continue
domain_memory_stats = domain.memoryStats()
domain_cpu_stats = domain.getCPUStats(True)[0]
# Add the allocated memory to our memalloc value
memalloc += instance.getmemory()
memprov += instance.getmemory()
vcpualloc += instance.getvcpus()
except Exception as e:
if debug:
try: