Compare commits

...

6 Commits

SHA1 Message Date
ab4ec7a5fa Remove WebUI from README 2023-12-25 02:48:44 -05:00
9604f655d0 Improve node utilization metrics and fix bugs 2023-12-25 02:47:41 -05:00
3e4cc53fdd Add node network statistics and utilization values
Adds a new physical network interface stats parser to the node
keepalives, and leverages this information to provide a network
utilization overview in the Prometheus metrics.
2023-12-21 15:45:01 -05:00
d2d2a9c617 Include our newline atomically
Sometimes clashing log entries from concurrent threads would print on
the same line, likely because Python's print() built-in writes the
message and its end character (the newline) separately, allowing
concurrent calls to interleave.

Instead, add a newline to our actual message and print without an end
character. This ensures atomic printing of our log messages.
2023-12-21 13:12:43 -05:00
6ed4efad33 Add new network.stats key to nodes 2023-12-21 12:48:48 -05:00
39f9f3640c Rename health metrics and add resource metrics 2023-12-21 09:40:49 -05:00
12 changed files with 1555 additions and 46 deletions

View File

@@ -19,7 +19,7 @@ As a consequence of its features, PVC makes administrating very high-uptime VMs
PVC also features an optional, fully customizable VM provisioning framework, designed to automate and simplify VM deployments using custom provisioning profiles, scripts, and CloudInit userdata API support.
Installation of PVC is accomplished by two main components: a [Node installer ISO](https://github.com/parallelvirtualcluster/pvc-installer) which creates on-demand installer ISOs, and an [Ansible role framework](https://github.com/parallelvirtualcluster/pvc-ansible) to configure, bootstrap, and administrate the nodes. Installation can also be fully automated with a companion [cluster bootstrapping system](https://github.com/parallelvirtualcluster/pvc-bootstrap). Once up, the cluster is managed via an HTTP REST API, accessible via a Python Click CLI client or WebUI.
Installation of PVC is accomplished by two main components: a [Node installer ISO](https://github.com/parallelvirtualcluster/pvc-installer) which creates on-demand installer ISOs, and an [Ansible role framework](https://github.com/parallelvirtualcluster/pvc-ansible) to configure, bootstrap, and administrate the nodes. Installation can also be fully automated with a companion [cluster bootstrapping system](https://github.com/parallelvirtualcluster/pvc-bootstrap). Once up, the cluster is managed via an HTTP REST API, accessible via a Python Click CLI client ~~or WebUI~~ (eventually).
Just give it physical servers, and it will run your VMs without you having to think about it, all in just an hour or two of setup time.

View File

@@ -640,14 +640,15 @@ class API_Metrics(Resource):
400:
description: Bad request
"""
cluster_output, cluster_retcode = api_helper.cluster_metrics()
health_output, health_retcode = api_helper.cluster_health_metrics()
resource_output, resource_retcode = api_helper.cluster_resource_metrics()
ceph_output, ceph_retcode = api_helper.ceph_metrics()
if cluster_retcode != 200 or ceph_retcode != 200:
if health_retcode != 200 or resource_retcode != 200 or ceph_retcode != 200:
output = "Error: Failed to obtain data"
retcode = 400
else:
output = cluster_output + ceph_output
output = health_output + resource_output + ceph_output
retcode = 200
response = flask.make_response(output, retcode)
@@ -658,11 +659,11 @@ class API_Metrics(Resource):
api.add_resource(API_Metrics, "/metrics")
# /metrics/pvc
class API_Metrics_PVC(Resource):
# /metrics/health
class API_Metrics_Health(Resource):
def get(self):
"""
Return the current PVC cluster status in Prometheus-compatible metrics format
Return the current PVC cluster health status in Prometheus-compatible metrics format
Endpoint is unauthenticated to allow metrics exfiltration without having to deal
with the Prometheus compatibility later.
@@ -675,13 +676,13 @@ class API_Metrics_PVC(Resource):
400:
description: Bad request
"""
cluster_output, cluster_retcode = api_helper.cluster_metrics()
health_output, health_retcode = api_helper.cluster_health_metrics()
if cluster_retcode != 200:
if health_retcode != 200:
output = "Error: Failed to obtain data"
retcode = 400
else:
output = cluster_output
output = health_output
retcode = 200
response = flask.make_response(output, retcode)
@@ -689,7 +690,41 @@ class API_Metrics_PVC(Resource):
return response
api.add_resource(API_Metrics_PVC, "/metrics/pvc")
api.add_resource(API_Metrics_Health, "/metrics/health")
# /metrics/resource
class API_Metrics_Resource(Resource):
def get(self):
"""
Return the current PVC cluster resource utilizations in Prometheus-compatible metrics format
Endpoint is unauthenticated to allow metrics exfiltration without having to deal
with the Prometheus compatibility later.
---
tags:
- root
responses:
200:
description: OK
400:
description: Bad request
"""
resource_output, resource_retcode = api_helper.cluster_resource_metrics()
if resource_retcode != 200:
output = "Error: Failed to obtain data"
retcode = 400
else:
output = resource_output
retcode = 200
response = flask.make_response(output, retcode)
response.mimetype = "text/plain"
return response
api.add_resource(API_Metrics_Resource, "/metrics/resource")
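As a quick sanity check of the split endpoints, a minimal Python sketch follows (the host and port, pvcapi.local:7370, are placeholder assumptions; the endpoints themselves are unauthenticated, per the docstrings above):

import requests

# Poll each Prometheus-compatible endpoint and report its status; the
# combined /metrics output concatenates health, resource, and Ceph metrics.
for endpoint in ("/metrics", "/metrics/health", "/metrics/resource"):
    resp = requests.get(f"http://pvcapi.local:7370{endpoint}")
    print(endpoint, resp.status_code, resp.headers.get("Content-Type"))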
# /metrics/ceph
@@ -1133,6 +1168,9 @@ class API_Node_Root(Resource):
provisioned:
type: integer
description: The total amount of RAM provisioned to all domains (regardless of state) on this node in MB
interfaces:
type: object
description: Details on speed, bits, and packets per second of each physical network interface on this node
parameters:
- in: query
name: limit
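For reference, the new interfaces object mirrors what NetstatsInstance.set_data() (added later in this diff) writes to Zookeeper, keyed by physical interface name. An illustrative Python literal (the interface name and figures are made up):

interfaces = {
    "ens4f0": {
        "rx_bps": 1048576,  # received bits per second (rolling average)
        "rx_pps": 980,  # received packets per second
        "tx_bps": 524288,  # transmitted bits per second
        "tx_pps": 410,  # transmitted packets per second
        "total_bps": 1572864,  # rx_bps + tx_bps
        "total_pps": 1390,  # rx_pps + tx_pps
        "link_speed": 10000000000,  # negotiated link speed in bps (10 Gbps)
        "state": "up",  # operstate from sysfs
    }
}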

View File

@@ -126,12 +126,27 @@ def cluster_maintenance(zkhandler, maint_state="false"):
#
@pvc_common.Profiler(config)
@ZKConnection(config)
def cluster_metrics(zkhandler):
def cluster_health_metrics(zkhandler):
"""
Format status data from cluster_status into Prometheus-compatible metrics
Get cluster-wide Prometheus metrics for health
"""
retflag, retdata = pvc_cluster.get_metrics(zkhandler)
retflag, retdata = pvc_cluster.get_health_metrics(zkhandler)
if retflag:
retcode = 200
else:
retcode = 400
return retdata, retcode
@pvc_common.Profiler(config)
@ZKConnection(config)
def cluster_resource_metrics(zkhandler):
"""
Get cluster-wide Prometheus metrics for resource utilization
"""
retflag, retdata = pvc_cluster.get_resource_metrics(zkhandler)
if retflag:
retcode = 200
else:
retcode = 400

return retdata, retcode

File diff suppressed because it is too large

View File

@@ -146,7 +146,7 @@ class Logger(object):
if self.config["stdout_logging"]:
# Assemble output string
output = colour + prompt + endc + date + prefix + message
print(output)
print(output + "\n", end="")
# Log to file
if self.config["file_logging"]:

View File

@@ -0,0 +1 @@
{"version": "12", "root": "", "base": {"root": "", "schema": "/schema", "schema.version": "/schema/version", "config": "/config", "config.maintenance": "/config/maintenance", "config.primary_node": "/config/primary_node", "config.primary_node.sync_lock": "/config/primary_node/sync_lock", "config.upstream_ip": "/config/upstream_ip", "config.migration_target_selector": "/config/migration_target_selector", "logs": "/logs", "faults": "/faults", "node": "/nodes", "domain": "/domains", "network": "/networks", "storage": "/ceph", "storage.health": "/ceph/health", "storage.util": "/ceph/util", "osd": "/ceph/osds", "pool": "/ceph/pools", "volume": "/ceph/volumes", "snapshot": "/ceph/snapshots"}, "logs": {"node": "", "messages": "/messages"}, "faults": {"id": "", "last_time": "/last_time", "first_time": "/first_time", "ack_time": "/ack_time", "status": "/status", "delta": "/delta", "message": "/message"}, "node": {"name": "", "keepalive": "/keepalive", "mode": "/daemonmode", "data.active_schema": "/activeschema", "data.latest_schema": "/latestschema", "data.static": "/staticdata", "data.pvc_version": "/pvcversion", "running_domains": "/runningdomains", "count.provisioned_domains": "/domainscount", "count.networks": "/networkscount", "state.daemon": "/daemonstate", "state.router": "/routerstate", "state.domain": "/domainstate", "cpu.load": "/cpuload", "vcpu.allocated": "/vcpualloc", "memory.total": "/memtotal", "memory.used": "/memused", "memory.free": "/memfree", "memory.allocated": "/memalloc", "memory.provisioned": "/memprov", "ipmi.hostname": "/ipmihostname", "ipmi.username": "/ipmiusername", "ipmi.password": "/ipmipassword", "sriov": "/sriov", "sriov.pf": "/sriov/pf", "sriov.vf": "/sriov/vf", "monitoring.plugins": "/monitoring_plugins", "monitoring.data": "/monitoring_data", "monitoring.health": "/monitoring_health", "network.stats": "/network_stats"}, "monitoring_plugin": {"name": "", "last_run": "/last_run", "health_delta": "/health_delta", "message": "/message", "data": "/data", "runtime": "/runtime"}, "sriov_pf": {"phy": "", "mtu": "/mtu", "vfcount": "/vfcount"}, "sriov_vf": {"phy": "", "pf": "/pf", "mtu": "/mtu", "mac": "/mac", "phy_mac": "/phy_mac", "config": "/config", "config.vlan_id": "/config/vlan_id", "config.vlan_qos": "/config/vlan_qos", "config.tx_rate_min": "/config/tx_rate_min", "config.tx_rate_max": "/config/tx_rate_max", "config.spoof_check": "/config/spoof_check", "config.link_state": "/config/link_state", "config.trust": "/config/trust", "config.query_rss": "/config/query_rss", "pci": "/pci", "pci.domain": "/pci/domain", "pci.bus": "/pci/bus", "pci.slot": "/pci/slot", "pci.function": "/pci/function", "used": "/used", "used_by": "/used_by"}, "domain": {"name": "", "xml": "/xml", "state": "/state", "profile": "/profile", "stats": "/stats", "node": "/node", "last_node": "/lastnode", "failed_reason": "/failedreason", "storage.volumes": "/rbdlist", "console.log": "/consolelog", "console.vnc": "/vnc", "meta.autostart": "/node_autostart", "meta.migrate_method": "/migration_method", "meta.node_selector": "/node_selector", "meta.node_limit": "/node_limit", "meta.tags": "/tags", "migrate.sync_lock": "/migrate_sync_lock"}, "tag": {"name": "", "type": "/type", "protected": "/protected"}, "network": {"vni": "", "type": "/nettype", "mtu": "/mtu", "rule": "/firewall_rules", "rule.in": "/firewall_rules/in", "rule.out": "/firewall_rules/out", "nameservers": "/name_servers", "domain": "/domain", "reservation": "/dhcp4_reservations", "lease": "/dhcp4_leases", "ip4.gateway": "/ip4_gateway", 
"ip4.network": "/ip4_network", "ip4.dhcp": "/dhcp4_flag", "ip4.dhcp_start": "/dhcp4_start", "ip4.dhcp_end": "/dhcp4_end", "ip6.gateway": "/ip6_gateway", "ip6.network": "/ip6_network", "ip6.dhcp": "/dhcp6_flag"}, "reservation": {"mac": "", "ip": "/ipaddr", "hostname": "/hostname"}, "lease": {"mac": "", "ip": "/ipaddr", "hostname": "/hostname", "expiry": "/expiry", "client_id": "/clientid"}, "rule": {"description": "", "rule": "/rule", "order": "/order"}, "osd": {"id": "", "node": "/node", "device": "/device", "db_device": "/db_device", "fsid": "/fsid", "ofsid": "/fsid/osd", "cfsid": "/fsid/cluster", "lvm": "/lvm", "vg": "/lvm/vg", "lv": "/lvm/lv", "is_split": "/is_split", "stats": "/stats"}, "pool": {"name": "", "pgs": "/pgs", "tier": "/tier", "stats": "/stats"}, "volume": {"name": "", "stats": "/stats"}, "snapshot": {"name": "", "stats": "/stats"}}

View File

@@ -103,6 +103,7 @@ def getNodeInformation(zkhandler, node_name):
_node_running_domains,
_node_health,
_node_health_plugins,
_node_network_stats,
) = zkhandler.read_many(
[
("node.state.daemon", node_name),
@@ -121,6 +122,7 @@ def getNodeInformation(zkhandler, node_name):
("node.running_domains", node_name),
("node.monitoring.health", node_name),
("node.monitoring.plugins", node_name),
("node.network.stats", node_name),
]
)
@@ -154,6 +156,8 @@ def getNodeInformation(zkhandler, node_name):
zkhandler, node_name, node_health_plugins
)
node_network_stats = json.loads(_node_network_stats)
# Construct a data structure to represent the data
node_information = {
"name": node_name,
@@ -182,6 +186,7 @@ def getNodeInformation(zkhandler, node_name):
"used": node_mem_used,
"free": node_mem_free,
},
"interfaces": node_network_stats,
}
return node_information

View File

@@ -572,7 +572,7 @@ class ZKHandler(object):
#
class ZKSchema(object):
# Current version
_version = 11
_version = 12
# Root for doing nested keys
_schema_root = ""
@@ -651,6 +651,7 @@ class ZKSchema(object):
"monitoring.plugins": "/monitoring_plugins",
"monitoring.data": "/monitoring_data",
"monitoring.health": "/monitoring_health",
"network.stats": "/network_stats",
},
# The schema of an individual monitoring plugin data entry (/nodes/{node_name}/monitoring_data/{plugin})
"monitoring_plugin": {

View File

@@ -1510,7 +1510,7 @@
{
"id": "color",
"value": {
"fixedColor": "light-red",
"fixedColor": "super-light-red",
"mode": "fixed"
}
}
@@ -1525,7 +1525,7 @@
{
"id": "color",
"value": {
"fixedColor": "semi-dark-red",
"fixedColor": "red",
"mode": "fixed"
}
}
@@ -1540,7 +1540,7 @@
{
"id": "color",
"value": {
"fixedColor": "semi-dark-blue",
"fixedColor": "dark-red",
"mode": "fixed"
}
}
@@ -2556,7 +2556,8 @@
"refresh": "5s",
"schemaVersion": 38,
"tags": [
"pvc"
"pvc",
"health"
],
"templating": {
"list": [
@@ -2592,8 +2593,8 @@
},
"timepicker": {},
"timezone": "",
"title": "PVC Cluster",
"title": "PVC Cluster Health",
"uid": "fbddd9f9-aadb-4c97-8aea-57c29e5de234",
"version": 57,
"version": 1,
"weekStart": ""
}

View File

@@ -31,6 +31,7 @@ import pvcnoded.objects.MetadataAPIInstance as MetadataAPIInstance
import pvcnoded.objects.VMInstance as VMInstance
import pvcnoded.objects.NodeInstance as NodeInstance
import pvcnoded.objects.VXNetworkInstance as VXNetworkInstance
import pvcnoded.objects.NetstatsInstance as NetstatsInstance
import pvcnoded.objects.SRIOVVFInstance as SRIOVVFInstance
import pvcnoded.objects.CephInstance as CephInstance
@@ -200,9 +201,9 @@ def entrypoint():
# Define a cleanup function
def cleanup(failure=False):
nonlocal logger, zkhandler, keepalive_timer, d_domain
nonlocal logger, zkhandler, keepalive_timer, d_domain, netstats
logger.out("Terminating pvcnoded and cleaning up", state="s")
logger.out("Terminating pvcnoded", state="s")
# Set shutdown state in Zookeeper
zkhandler.write([(("node.state.daemon", config["node_hostname"]), "shutdown")])
@@ -249,12 +250,20 @@ def entrypoint():
except Exception:
pass
# Set stop state in Zookeeper
zkhandler.write([(("node.state.daemon", config["node_hostname"]), "stop")])
logger.out("Cleaning up", state="s")
# Stop netstats instance
try:
netstats.shutdown()
except Exception:
pass
# Forcibly terminate dnsmasq because it gets stuck sometimes
common.run_os_command("killall dnsmasq")
# Set stop state in Zookeeper
zkhandler.write([(("node.state.daemon", config["node_hostname"]), "stop")])
# Close the Zookeeper connection
try:
zkhandler.disconnect(persistent=True)
@@ -1000,9 +1009,12 @@ def entrypoint():
state="s",
)
# Set up netstats
netstats = NetstatsInstance.NetstatsInstance(logger, config, zkhandler, this_node)
# Start keepalived thread
keepalive_timer = pvcnoded.util.keepalive.start_keepalive_timer(
logger, config, zkhandler, this_node
logger, config, zkhandler, this_node, netstats
)
# Tick loop; does nothing since everything is async

View File

@@ -0,0 +1,293 @@
#!/usr/bin/env python3
# NetstatsInstance.py - Class implementing a PVC network stats gatherer and run by pvcnoded
# Part of the Parallel Virtual Cluster (PVC) system
#
# Copyright (C) 2018-2023 Joshua M. Boniface <joshua@boniface.me>
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, version 3.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.
#
###############################################################################
from apscheduler.schedulers.background import BackgroundScheduler
from collections import deque
from json import dumps
from os import walk
from os.path import exists
class NetstatsIfaceInstance(object):
"""
NetstatsIfaceInstance
This class implements a rolling statistics poller for a network interface,
collecting stats on the bits and packets per second in both directions every
second.
Via the get_stats() function, it returns the rolling average of all 4 values,
as well as totals, over the last 5 seconds (self.avg_samples) as a tuple of:
(rx_bps, rx_pps, tx_bps, tx_pps, total_bps, total_pps, link_speed, state)
"""
def __init__(self, logger, iface, avg_samples):
"""
Initialize the class instance, creating our BackgroundScheduler, setting
the average sample rate, and creating the deques and average values.
"""
self.logger = logger
self.iface = iface
self.data_valid = False
self.data_polls = 0
self.timer = BackgroundScheduler()
self.timer.add_job(self.gather_stats, trigger="interval", seconds=1)
self.avg_samples = avg_samples
self.link_speed = 0
self.state = "down"
self.rx_bits_rolling = deque(list(), self.avg_samples + 1)
self.rx_bps = 0
self.rx_packets_rolling = deque(list(), self.avg_samples + 1)
self.rx_pps = 0
self.tx_bits_rolling = deque(list(), self.avg_samples + 1)
self.tx_bps = 0
self.tx_packets_rolling = deque(list(), self.avg_samples + 1)
self.tx_pps = 0
self.total_bps = 0
self.total_pps = 0
def get_iface_stats(self):
"""
Reads the interface statistics from the sysfs for the interface.
"""
iface_state_path = f"/sys/class/net/{self.iface}/operstate"
with open(iface_state_path) as stfh:
self.state = stfh.read().strip()
iface_speed_path = f"/sys/class/net/{self.iface}/speed"
try:
with open(iface_speed_path) as spfh:
# The speed key is always in Mbps so multiply by 1000*1000 to get bps
self.link_speed = int(spfh.read()) * 1000 * 1000
except OSError:
self.link_speed = 0
iface_stats_path = f"/sys/class/net/{self.iface}/statistics"
with open(f"{iface_stats_path}/rx_bytes") as rxbfh:
self.rx_bits_rolling.append(int(rxbfh.read()) * 8)
with open(f"{iface_stats_path}/tx_bytes") as txbfh:
self.tx_bits_rolling.append(int(txbfh.read()) * 8)
with open(f"{iface_stats_path}/rx_packets") as rxpfh:
self.rx_packets_rolling.append(int(rxpfh.read()) * 8)
with open(f"{iface_stats_path}/tx_packets") as txpfh:
self.tx_packets_rolling.append(int(txpfh.read()) * 8)
def calculate_averages(self):
"""
Calculates the bps/pps values from the rolling values.
"""
rx_bits_diffs = list()
for sample_idx in range(self.avg_samples, 0, -1):
rx_bits_diffs.append(
self.rx_bits_rolling[sample_idx] - self.rx_bits_rolling[sample_idx - 1]
)
self.rx_bps = int(sum(rx_bits_diffs) / self.avg_samples)
rx_packets_diffs = list()
for sample_idx in range(self.avg_samples, 0, -1):
rx_packets_diffs.append(
self.rx_packets_rolling[sample_idx]
- self.rx_packets_rolling[sample_idx - 1]
)
self.rx_pps = int(sum(rx_packets_diffs) / self.avg_samples)
tx_bits_diffs = list()
for sample_idx in range(self.avg_samples, 0, -1):
tx_bits_diffs.append(
self.tx_bits_rolling[sample_idx] - self.tx_bits_rolling[sample_idx - 1]
)
self.tx_bps = int(sum(tx_bits_diffs) / self.avg_samples)
tx_packets_diffs = list()
for sample_idx in range(self.avg_samples, 0, -1):
tx_packets_diffs.append(
self.tx_packets_rolling[sample_idx]
- self.tx_packets_rolling[sample_idx - 1]
)
self.tx_pps = int(sum(tx_packets_diffs) / self.avg_samples)
self.total_bps = self.rx_bps + self.tx_bps
self.total_pps = self.rx_pps + self.tx_pps
def gather_stats(self):
"""
Gathers the current stats and then calculates the averages.
Runs via the BackgroundScheduler timer every 1 second.
"""
self.get_iface_stats()
if self.data_valid:
self.calculate_averages()
# Handle data validity: our data is invalid until we hit enough polls
# to make a valid average (avg_samples plus 1).
if not self.data_valid:
self.data_polls += 1
if self.data_polls > self.avg_samples:
self.data_valid = True
def start(self):
"""
Starts the timer.
"""
self.timer.start()
def stop(self):
"""
Stops the timer.
"""
self.timer.shutdown()
def get_stats(self):
"""
Returns a tuple of the current statistics.
"""
if not self.data_valid:
return None
return (
self.rx_bps,
self.rx_pps,
self.tx_bps,
self.tx_pps,
self.total_bps,
self.total_pps,
self.link_speed,
self.state,
)
class NetstatsInstance(object):
"""
NetstatsInstance
This class implements a rolling statistics poller for all PHYSICAL network interfaces
on the system, initializing a NetstatsIfaceInstance for each, as well as handling
value updates into Zookeeper.
"""
def __init__(self, logger, config, zkhandler, this_node):
"""
Initialize the class instance.
"""
self.logger = logger
self.config = config
self.zkhandler = zkhandler
self.node_name = this_node.name
self.interfaces = dict()
self.logger.out(
f"Starting netstats collector ({self.config['keepalive_interval']} second interval)",
state="s",
)
self.set_interfaces()
def shutdown(self):
"""
Stop all pollers and delete the NetstatsIfaceInstance objects
"""
# Empty the network stats object
self.zkhandler.write([(("node.network.stats", self.node_name), dumps({}))])
for iface in self.interfaces.keys():
self.interfaces[iface].stop()
def set_interfaces(self):
"""
Sets the list of interfaces on the system, and then ensures that each
interface has a NetstatsIfaceInstance assigned to it and polling.
"""
# Get a list of all active interfaces
net_root_path = "/sys/class/net"
all_ifaces = list()
for (_, dirnames, _) in walk(net_root_path):
all_ifaces.extend(dirnames)
all_ifaces.sort()
self.logger.out(
f"Parsing network list: {all_ifaces}", state="d", prefix="netstats-thread"
)
# Add any missing interfaces
for iface in all_ifaces:
if not exists(f"{net_root_path}/{iface}/device"):
# This is not a physical interface; skip it
continue
if iface not in self.interfaces.keys():
# Set the number of samples to be equal to the keepalive interval, so that each
# keepalive has a fresh set of data from the last keepalive_interval seconds.
self.interfaces[iface] = NetstatsIfaceInstance(
self.logger, iface, self.config["keepalive_interval"]
)
self.interfaces[iface].start()
# Remove any superfluous interfaces; iterate over a copy of the keys,
# since we delete entries from the dict as we go
for iface in list(self.interfaces.keys()):
if iface not in all_ifaces:
self.interfaces[iface].stop()
del self.interfaces[iface]
def set_data(self):
data = dict()
for iface in self.interfaces.keys():
self.logger.out(
f"Getting data for interface {iface}",
state="d",
prefix="netstats-thread",
)
iface_stats = self.interfaces[iface].get_stats()
if iface_stats is None:
continue
(
iface_rx_bps,
iface_rx_pps,
iface_tx_bps,
iface_tx_pps,
iface_total_bps,
iface_total_pps,
iface_link_speed,
iface_state,
) = iface_stats
data[iface] = {
"rx_bps": iface_rx_bps,
"rx_pps": iface_rx_pps,
"tx_bps": iface_tx_bps,
"tx_pps": iface_tx_pps,
"total_bps": iface_total_bps,
"total_pps": iface_total_pps,
"link_speed": iface_link_speed,
"state": iface_state,
}
self.zkhandler.write([(("node.network.stats", self.node_name), dumps(data))])
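One property worth noting in calculate_averages() above: each deque holds cumulative counters sampled once per second, so the summed consecutive differences telescope to (newest - oldest), and every rolling average reduces to a single subtraction. A standalone sketch under that assumption (sample values are made up):

from collections import deque

# Equivalent of one counter's rolling average: with cumulative samples taken
# every second, the per-second deltas telescope, so the average rate over the
# window is simply (newest - oldest) / window.
def rolling_rate(samples: deque, window: int) -> int:
    if len(samples) < window + 1:
        return 0  # not enough polls yet; mirrors the data_valid gate
    return int((samples[-1] - samples[-(window + 1)]) / window)

rx_bits = deque([0, 800, 1600, 2400, 4000, 4800], maxlen=6)
print(rolling_rate(rx_bits, 5))  # -> 960 (bits per second over 5 seconds)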

View File

@@ -51,7 +51,7 @@ libvirt_vm_states = {
}
def start_keepalive_timer(logger, config, zkhandler, this_node):
def start_keepalive_timer(logger, config, zkhandler, this_node, netstats):
keepalive_interval = config["keepalive_interval"]
logger.out(
f"Starting keepalive timer ({keepalive_interval} second interval)", state="s"
@@ -59,7 +59,7 @@ def start_keepalive_timer(logger, config, zkhandler, this_node):
keepalive_timer = BackgroundScheduler()
keepalive_timer.add_job(
node_keepalive,
args=(logger, config, zkhandler, this_node),
args=(logger, config, zkhandler, this_node, netstats),
trigger="interval",
seconds=keepalive_interval,
)
@@ -684,7 +684,7 @@ def collect_vm_stats(logger, config, zkhandler, this_node, queue):
# Keepalive update function
def node_keepalive(logger, config, zkhandler, this_node):
def node_keepalive(logger, config, zkhandler, this_node, netstats):
debug = config["debug"]
# Display node information to the terminal
@@ -793,6 +793,10 @@ def node_keepalive(logger, config, zkhandler, this_node):
this_node.memfree = int(psutil.virtual_memory().free / 1024 / 1024)
this_node.cpuload = round(os.getloadavg()[0], 2)
# Get node network statistics via netstats instance
netstats.set_interfaces()
netstats.set_data()
# Join against running threads
if config["enable_hypervisor"]:
vm_stats_thread.join(timeout=config["keepalive_interval"])