Add node network statistics and utilization values

Adds a new physical network interface stats parser to the node
keepalives, and leverages this information to provide a network
utilization overview in the Prometheus metrics.
This commit is contained in:
2023-12-21 15:12:20 -05:00
parent d2d2a9c617
commit 3e4cc53fdd
6 changed files with 386 additions and 13 deletions

View File

@ -31,6 +31,7 @@ import pvcnoded.objects.MetadataAPIInstance as MetadataAPIInstance
import pvcnoded.objects.VMInstance as VMInstance
import pvcnoded.objects.NodeInstance as NodeInstance
import pvcnoded.objects.VXNetworkInstance as VXNetworkInstance
import pvcnoded.objects.NetstatsInstance as NetstatsInstance
import pvcnoded.objects.SRIOVVFInstance as SRIOVVFInstance
import pvcnoded.objects.CephInstance as CephInstance
@ -200,9 +201,9 @@ def entrypoint():
# Define a cleanup function
def cleanup(failure=False):
nonlocal logger, zkhandler, keepalive_timer, d_domain
nonlocal logger, zkhandler, keepalive_timer, d_domain, netstats
logger.out("Terminating pvcnoded and cleaning up", state="s")
logger.out("Terminating pvcnoded", state="s")
# Set shutdown state in Zookeeper
zkhandler.write([(("node.state.daemon", config["node_hostname"]), "shutdown")])
@ -249,12 +250,20 @@ def entrypoint():
except Exception:
pass
# Set stop state in Zookeeper
zkhandler.write([(("node.state.daemon", config["node_hostname"]), "stop")])
logger.out("Cleaning up", state="s")
# Stop netstats instance
try:
netstats.shutdown()
except Exception:
pass
# Forcibly terminate dnsmasq because it gets stuck sometimes
common.run_os_command("killall dnsmasq")
# Set stop state in Zookeeper
zkhandler.write([(("node.state.daemon", config["node_hostname"]), "stop")])
# Close the Zookeeper connection
try:
zkhandler.disconnect(persistent=True)
@ -1000,9 +1009,12 @@ def entrypoint():
state="s",
)
# Set up netstats
netstats = NetstatsInstance.NetstatsInstance(logger, config, zkhandler, this_node)
# Start keepalive thread
keepalive_timer = pvcnoded.util.keepalive.start_keepalive_timer(
logger, config, zkhandler, this_node
logger, config, zkhandler, this_node, netstats
)
# Tick loop; does nothing since everything is async