Compare commits
6 Commits
v0.9.86
...
ab4ec7a5fa
Author | SHA1 | Date | |
---|---|---|---|
ab4ec7a5fa | |||
9604f655d0 | |||
3e4cc53fdd | |||
d2d2a9c617 | |||
6ed4efad33 | |||
39f9f3640c |
@ -19,7 +19,7 @@ As a consequence of its features, PVC makes administrating very high-uptime VMs
|
|||||||
|
|
||||||
PVC also features an optional, fully customizable VM provisioning framework, designed to automate and simplify VM deployments using custom provisioning profiles, scripts, and CloudInit userdata API support.
|
PVC also features an optional, fully customizable VM provisioning framework, designed to automate and simplify VM deployments using custom provisioning profiles, scripts, and CloudInit userdata API support.
|
||||||
|
|
||||||
Installation of PVC is accomplished by two main components: a [Node installer ISO](https://github.com/parallelvirtualcluster/pvc-installer) which creates on-demand installer ISOs, and an [Ansible role framework](https://github.com/parallelvirtualcluster/pvc-ansible) to configure, bootstrap, and administrate the nodes. Installation can also be fully automated with a companion [cluster bootstrapping system](https://github.com/parallelvirtualcluster/pvc-bootstrap). Once up, the cluster is managed via an HTTP REST API, accessible via a Python Click CLI client or WebUI.
|
Installation of PVC is accomplished by two main components: a [Node installer ISO](https://github.com/parallelvirtualcluster/pvc-installer) which creates on-demand installer ISOs, and an [Ansible role framework](https://github.com/parallelvirtualcluster/pvc-ansible) to configure, bootstrap, and administrate the nodes. Installation can also be fully automated with a companion [cluster bootstrapping system](https://github.com/parallelvirtualcluster/pvc-bootstrap). Once up, the cluster is managed via an HTTP REST API, accessible via a Python Click CLI client ~~or WebUI~~ (eventually).
|
||||||
|
|
||||||
Just give it physical servers, and it will run your VMs without you having to think about it, all in just an hour or two of setup time.
|
Just give it physical servers, and it will run your VMs without you having to think about it, all in just an hour or two of setup time.
|
||||||
|
|
||||||
|
@ -640,14 +640,15 @@ class API_Metrics(Resource):
|
|||||||
400:
|
400:
|
||||||
description: Bad request
|
description: Bad request
|
||||||
"""
|
"""
|
||||||
cluster_output, cluster_retcode = api_helper.cluster_metrics()
|
health_output, health_retcode = api_helper.cluster_health_metrics()
|
||||||
|
resource_output, resource_retcode = api_helper.cluster_resource_metrics()
|
||||||
ceph_output, ceph_retcode = api_helper.ceph_metrics()
|
ceph_output, ceph_retcode = api_helper.ceph_metrics()
|
||||||
|
|
||||||
if cluster_retcode != 200 or ceph_retcode != 200:
|
if health_retcode != 200 or resource_retcode != 200 or ceph_retcode != 200:
|
||||||
output = "Error: Failed to obtain data"
|
output = "Error: Failed to obtain data"
|
||||||
retcode = 400
|
retcode = 400
|
||||||
else:
|
else:
|
||||||
output = cluster_output + ceph_output
|
output = health_output + resource_output + ceph_output
|
||||||
retcode = 200
|
retcode = 200
|
||||||
|
|
||||||
response = flask.make_response(output, retcode)
|
response = flask.make_response(output, retcode)
|
||||||
@ -658,11 +659,11 @@ class API_Metrics(Resource):
|
|||||||
api.add_resource(API_Metrics, "/metrics")
|
api.add_resource(API_Metrics, "/metrics")
|
||||||
|
|
||||||
|
|
||||||
# /metrics/pvc
|
# /metrics/health
|
||||||
class API_Metrics_PVC(Resource):
|
class API_Metrics_Health(Resource):
|
||||||
def get(self):
|
def get(self):
|
||||||
"""
|
"""
|
||||||
Return the current PVC cluster status in Prometheus-compatible metrics format
|
Return the current PVC cluster health status in Prometheus-compatible metrics format
|
||||||
|
|
||||||
Endpoint is unauthenticated to allow metrics exfiltration without having to deal
|
Endpoint is unauthenticated to allow metrics exfiltration without having to deal
|
||||||
with the Prometheus compatibility later.
|
with the Prometheus compatibility later.
|
||||||
@ -675,13 +676,13 @@ class API_Metrics_PVC(Resource):
|
|||||||
400:
|
400:
|
||||||
description: Bad request
|
description: Bad request
|
||||||
"""
|
"""
|
||||||
cluster_output, cluster_retcode = api_helper.cluster_metrics()
|
health_output, health_retcode = api_helper.cluster_health_metrics()
|
||||||
|
|
||||||
if cluster_retcode != 200:
|
if health_retcode != 200:
|
||||||
output = "Error: Failed to obtain data"
|
output = "Error: Failed to obtain data"
|
||||||
retcode = 400
|
retcode = 400
|
||||||
else:
|
else:
|
||||||
output = cluster_output
|
output = health_output
|
||||||
retcode = 200
|
retcode = 200
|
||||||
|
|
||||||
response = flask.make_response(output, retcode)
|
response = flask.make_response(output, retcode)
|
||||||
@ -689,7 +690,41 @@ class API_Metrics_PVC(Resource):
|
|||||||
return response
|
return response
|
||||||
|
|
||||||
|
|
||||||
api.add_resource(API_Metrics_PVC, "/metrics/pvc")
|
api.add_resource(API_Metrics_Health, "/metrics/health")
|
||||||
|
|
||||||
|
|
||||||
|
# /metrics/resource
|
||||||
|
class API_Metrics_Resource(Resource):
|
||||||
|
def get(self):
|
||||||
|
"""
|
||||||
|
Return the current PVC cluster resource utilizations in Prometheus-compatible metrics format
|
||||||
|
|
||||||
|
Endpoint is unauthenticated to allow metrics exfiltration without having to deal
|
||||||
|
with the Prometheus compatibility later.
|
||||||
|
---
|
||||||
|
tags:
|
||||||
|
- root
|
||||||
|
responses:
|
||||||
|
200:
|
||||||
|
description: OK
|
||||||
|
400:
|
||||||
|
description: Bad request
|
||||||
|
"""
|
||||||
|
resource_output, resource_retcode = api_helper.cluster_resource_metrics()
|
||||||
|
|
||||||
|
if resource_retcode != 200:
|
||||||
|
output = "Error: Failed to obtain data"
|
||||||
|
retcode = 400
|
||||||
|
else:
|
||||||
|
output = resource_output
|
||||||
|
retcode = 200
|
||||||
|
|
||||||
|
response = flask.make_response(output, retcode)
|
||||||
|
response.mimetype = "text/plain"
|
||||||
|
return response
|
||||||
|
|
||||||
|
|
||||||
|
api.add_resource(API_Metrics_Resource, "/metrics/resource")
|
||||||
|
|
||||||
|
|
||||||
# /metrics/ceph
|
# /metrics/ceph
|
||||||
@ -1133,6 +1168,9 @@ class API_Node_Root(Resource):
|
|||||||
provisioned:
|
provisioned:
|
||||||
type: integer
|
type: integer
|
||||||
description: The total amount of RAM provisioned to all domains (regardless of state) on this node in MB
|
description: The total amount of RAM provisioned to all domains (regardless of state) on this node in MB
|
||||||
|
interfaces:
|
||||||
|
type: object
|
||||||
|
description: Details on speed, bytes, and packets per second of each node physical network interface
|
||||||
parameters:
|
parameters:
|
||||||
- in: query
|
- in: query
|
||||||
name: limit
|
name: limit
|
||||||
|
@ -126,12 +126,27 @@ def cluster_maintenance(zkhandler, maint_state="false"):
|
|||||||
#
|
#
|
||||||
@pvc_common.Profiler(config)
|
@pvc_common.Profiler(config)
|
||||||
@ZKConnection(config)
|
@ZKConnection(config)
|
||||||
def cluster_metrics(zkhandler):
|
def cluster_health_metrics(zkhandler):
|
||||||
"""
|
"""
|
||||||
Format status data from cluster_status into Prometheus-compatible metrics
|
Get cluster-wide Prometheus metrics for health
|
||||||
"""
|
"""
|
||||||
|
|
||||||
retflag, retdata = pvc_cluster.get_metrics(zkhandler)
|
retflag, retdata = pvc_cluster.get_health_metrics(zkhandler)
|
||||||
|
if retflag:
|
||||||
|
retcode = 200
|
||||||
|
else:
|
||||||
|
retcode = 400
|
||||||
|
return retdata, retcode
|
||||||
|
|
||||||
|
|
||||||
|
@pvc_common.Profiler(config)
|
||||||
|
@ZKConnection(config)
|
||||||
|
def cluster_resource_metrics(zkhandler):
|
||||||
|
"""
|
||||||
|
Get cluster-wide Prometheus metrics for resource utilization
|
||||||
|
"""
|
||||||
|
|
||||||
|
retflag, retdata = pvc_cluster.get_resource_metrics(zkhandler)
|
||||||
if retflag:
|
if retflag:
|
||||||
retcode = 200
|
retcode = 200
|
||||||
else:
|
else:
|
||||||
|
File diff suppressed because it is too large
Load Diff
@ -146,7 +146,7 @@ class Logger(object):
|
|||||||
if self.config["stdout_logging"]:
|
if self.config["stdout_logging"]:
|
||||||
# Assemble output string
|
# Assemble output string
|
||||||
output = colour + prompt + endc + date + prefix + message
|
output = colour + prompt + endc + date + prefix + message
|
||||||
print(output)
|
print(output + "\n", end="")
|
||||||
|
|
||||||
# Log to file
|
# Log to file
|
||||||
if self.config["file_logging"]:
|
if self.config["file_logging"]:
|
||||||
|
1
daemon-common/migrations/versions/12.json
Normal file
1
daemon-common/migrations/versions/12.json
Normal file
@ -0,0 +1 @@
|
|||||||
|
{"version": "12", "root": "", "base": {"root": "", "schema": "/schema", "schema.version": "/schema/version", "config": "/config", "config.maintenance": "/config/maintenance", "config.primary_node": "/config/primary_node", "config.primary_node.sync_lock": "/config/primary_node/sync_lock", "config.upstream_ip": "/config/upstream_ip", "config.migration_target_selector": "/config/migration_target_selector", "logs": "/logs", "faults": "/faults", "node": "/nodes", "domain": "/domains", "network": "/networks", "storage": "/ceph", "storage.health": "/ceph/health", "storage.util": "/ceph/util", "osd": "/ceph/osds", "pool": "/ceph/pools", "volume": "/ceph/volumes", "snapshot": "/ceph/snapshots"}, "logs": {"node": "", "messages": "/messages"}, "faults": {"id": "", "last_time": "/last_time", "first_time": "/first_time", "ack_time": "/ack_time", "status": "/status", "delta": "/delta", "message": "/message"}, "node": {"name": "", "keepalive": "/keepalive", "mode": "/daemonmode", "data.active_schema": "/activeschema", "data.latest_schema": "/latestschema", "data.static": "/staticdata", "data.pvc_version": "/pvcversion", "running_domains": "/runningdomains", "count.provisioned_domains": "/domainscount", "count.networks": "/networkscount", "state.daemon": "/daemonstate", "state.router": "/routerstate", "state.domain": "/domainstate", "cpu.load": "/cpuload", "vcpu.allocated": "/vcpualloc", "memory.total": "/memtotal", "memory.used": "/memused", "memory.free": "/memfree", "memory.allocated": "/memalloc", "memory.provisioned": "/memprov", "ipmi.hostname": "/ipmihostname", "ipmi.username": "/ipmiusername", "ipmi.password": "/ipmipassword", "sriov": "/sriov", "sriov.pf": "/sriov/pf", "sriov.vf": "/sriov/vf", "monitoring.plugins": "/monitoring_plugins", "monitoring.data": "/monitoring_data", "monitoring.health": "/monitoring_health", "network.stats": "/network_stats"}, "monitoring_plugin": {"name": "", "last_run": "/last_run", "health_delta": "/health_delta", "message": "/message", "data": "/data", "runtime": "/runtime"}, "sriov_pf": {"phy": "", "mtu": "/mtu", "vfcount": "/vfcount"}, "sriov_vf": {"phy": "", "pf": "/pf", "mtu": "/mtu", "mac": "/mac", "phy_mac": "/phy_mac", "config": "/config", "config.vlan_id": "/config/vlan_id", "config.vlan_qos": "/config/vlan_qos", "config.tx_rate_min": "/config/tx_rate_min", "config.tx_rate_max": "/config/tx_rate_max", "config.spoof_check": "/config/spoof_check", "config.link_state": "/config/link_state", "config.trust": "/config/trust", "config.query_rss": "/config/query_rss", "pci": "/pci", "pci.domain": "/pci/domain", "pci.bus": "/pci/bus", "pci.slot": "/pci/slot", "pci.function": "/pci/function", "used": "/used", "used_by": "/used_by"}, "domain": {"name": "", "xml": "/xml", "state": "/state", "profile": "/profile", "stats": "/stats", "node": "/node", "last_node": "/lastnode", "failed_reason": "/failedreason", "storage.volumes": "/rbdlist", "console.log": "/consolelog", "console.vnc": "/vnc", "meta.autostart": "/node_autostart", "meta.migrate_method": "/migration_method", "meta.node_selector": "/node_selector", "meta.node_limit": "/node_limit", "meta.tags": "/tags", "migrate.sync_lock": "/migrate_sync_lock"}, "tag": {"name": "", "type": "/type", "protected": "/protected"}, "network": {"vni": "", "type": "/nettype", "mtu": "/mtu", "rule": "/firewall_rules", "rule.in": "/firewall_rules/in", "rule.out": "/firewall_rules/out", "nameservers": "/name_servers", "domain": "/domain", "reservation": "/dhcp4_reservations", "lease": "/dhcp4_leases", "ip4.gateway": "/ip4_gateway", "ip4.network": "/ip4_network", "ip4.dhcp": "/dhcp4_flag", "ip4.dhcp_start": "/dhcp4_start", "ip4.dhcp_end": "/dhcp4_end", "ip6.gateway": "/ip6_gateway", "ip6.network": "/ip6_network", "ip6.dhcp": "/dhcp6_flag"}, "reservation": {"mac": "", "ip": "/ipaddr", "hostname": "/hostname"}, "lease": {"mac": "", "ip": "/ipaddr", "hostname": "/hostname", "expiry": "/expiry", "client_id": "/clientid"}, "rule": {"description": "", "rule": "/rule", "order": "/order"}, "osd": {"id": "", "node": "/node", "device": "/device", "db_device": "/db_device", "fsid": "/fsid", "ofsid": "/fsid/osd", "cfsid": "/fsid/cluster", "lvm": "/lvm", "vg": "/lvm/vg", "lv": "/lvm/lv", "is_split": "/is_split", "stats": "/stats"}, "pool": {"name": "", "pgs": "/pgs", "tier": "/tier", "stats": "/stats"}, "volume": {"name": "", "stats": "/stats"}, "snapshot": {"name": "", "stats": "/stats"}}
|
@ -103,6 +103,7 @@ def getNodeInformation(zkhandler, node_name):
|
|||||||
_node_running_domains,
|
_node_running_domains,
|
||||||
_node_health,
|
_node_health,
|
||||||
_node_health_plugins,
|
_node_health_plugins,
|
||||||
|
_node_network_stats,
|
||||||
) = zkhandler.read_many(
|
) = zkhandler.read_many(
|
||||||
[
|
[
|
||||||
("node.state.daemon", node_name),
|
("node.state.daemon", node_name),
|
||||||
@ -121,6 +122,7 @@ def getNodeInformation(zkhandler, node_name):
|
|||||||
("node.running_domains", node_name),
|
("node.running_domains", node_name),
|
||||||
("node.monitoring.health", node_name),
|
("node.monitoring.health", node_name),
|
||||||
("node.monitoring.plugins", node_name),
|
("node.monitoring.plugins", node_name),
|
||||||
|
("node.network.stats", node_name),
|
||||||
]
|
]
|
||||||
)
|
)
|
||||||
|
|
||||||
@ -154,6 +156,8 @@ def getNodeInformation(zkhandler, node_name):
|
|||||||
zkhandler, node_name, node_health_plugins
|
zkhandler, node_name, node_health_plugins
|
||||||
)
|
)
|
||||||
|
|
||||||
|
node_network_stats = json.loads(_node_network_stats)
|
||||||
|
|
||||||
# Construct a data structure to represent the data
|
# Construct a data structure to represent the data
|
||||||
node_information = {
|
node_information = {
|
||||||
"name": node_name,
|
"name": node_name,
|
||||||
@ -182,6 +186,7 @@ def getNodeInformation(zkhandler, node_name):
|
|||||||
"used": node_mem_used,
|
"used": node_mem_used,
|
||||||
"free": node_mem_free,
|
"free": node_mem_free,
|
||||||
},
|
},
|
||||||
|
"interfaces": node_network_stats,
|
||||||
}
|
}
|
||||||
return node_information
|
return node_information
|
||||||
|
|
||||||
|
@ -572,7 +572,7 @@ class ZKHandler(object):
|
|||||||
#
|
#
|
||||||
class ZKSchema(object):
|
class ZKSchema(object):
|
||||||
# Current version
|
# Current version
|
||||||
_version = 11
|
_version = 12
|
||||||
|
|
||||||
# Root for doing nested keys
|
# Root for doing nested keys
|
||||||
_schema_root = ""
|
_schema_root = ""
|
||||||
@ -651,6 +651,7 @@ class ZKSchema(object):
|
|||||||
"monitoring.plugins": "/monitoring_plugins",
|
"monitoring.plugins": "/monitoring_plugins",
|
||||||
"monitoring.data": "/monitoring_data",
|
"monitoring.data": "/monitoring_data",
|
||||||
"monitoring.health": "/monitoring_health",
|
"monitoring.health": "/monitoring_health",
|
||||||
|
"network.stats": "/network_stats",
|
||||||
},
|
},
|
||||||
# The schema of an individual monitoring plugin data entry (/nodes/{node_name}/monitoring_data/{plugin})
|
# The schema of an individual monitoring plugin data entry (/nodes/{node_name}/monitoring_data/{plugin})
|
||||||
"monitoring_plugin": {
|
"monitoring_plugin": {
|
||||||
|
@ -1510,7 +1510,7 @@
|
|||||||
{
|
{
|
||||||
"id": "color",
|
"id": "color",
|
||||||
"value": {
|
"value": {
|
||||||
"fixedColor": "light-red",
|
"fixedColor": "super-light-red",
|
||||||
"mode": "fixed"
|
"mode": "fixed"
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -1525,7 +1525,7 @@
|
|||||||
{
|
{
|
||||||
"id": "color",
|
"id": "color",
|
||||||
"value": {
|
"value": {
|
||||||
"fixedColor": "semi-dark-red",
|
"fixedColor": "red",
|
||||||
"mode": "fixed"
|
"mode": "fixed"
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -1540,7 +1540,7 @@
|
|||||||
{
|
{
|
||||||
"id": "color",
|
"id": "color",
|
||||||
"value": {
|
"value": {
|
||||||
"fixedColor": "semi-dark-blue",
|
"fixedColor": "dark-red",
|
||||||
"mode": "fixed"
|
"mode": "fixed"
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -2556,7 +2556,8 @@
|
|||||||
"refresh": "5s",
|
"refresh": "5s",
|
||||||
"schemaVersion": 38,
|
"schemaVersion": 38,
|
||||||
"tags": [
|
"tags": [
|
||||||
"pvc"
|
"pvc",
|
||||||
|
"health"
|
||||||
],
|
],
|
||||||
"templating": {
|
"templating": {
|
||||||
"list": [
|
"list": [
|
||||||
@ -2592,8 +2593,8 @@
|
|||||||
},
|
},
|
||||||
"timepicker": {},
|
"timepicker": {},
|
||||||
"timezone": "",
|
"timezone": "",
|
||||||
"title": "PVC Cluster",
|
"title": "PVC Cluster Health",
|
||||||
"uid": "fbddd9f9-aadb-4c97-8aea-57c29e5de234",
|
"uid": "fbddd9f9-aadb-4c97-8aea-57c29e5de234",
|
||||||
"version": 57,
|
"version": 1,
|
||||||
"weekStart": ""
|
"weekStart": ""
|
||||||
}
|
}
|
@ -31,6 +31,7 @@ import pvcnoded.objects.MetadataAPIInstance as MetadataAPIInstance
|
|||||||
import pvcnoded.objects.VMInstance as VMInstance
|
import pvcnoded.objects.VMInstance as VMInstance
|
||||||
import pvcnoded.objects.NodeInstance as NodeInstance
|
import pvcnoded.objects.NodeInstance as NodeInstance
|
||||||
import pvcnoded.objects.VXNetworkInstance as VXNetworkInstance
|
import pvcnoded.objects.VXNetworkInstance as VXNetworkInstance
|
||||||
|
import pvcnoded.objects.NetstatsInstance as NetstatsInstance
|
||||||
import pvcnoded.objects.SRIOVVFInstance as SRIOVVFInstance
|
import pvcnoded.objects.SRIOVVFInstance as SRIOVVFInstance
|
||||||
import pvcnoded.objects.CephInstance as CephInstance
|
import pvcnoded.objects.CephInstance as CephInstance
|
||||||
|
|
||||||
@ -200,9 +201,9 @@ def entrypoint():
|
|||||||
|
|
||||||
# Define a cleanup function
|
# Define a cleanup function
|
||||||
def cleanup(failure=False):
|
def cleanup(failure=False):
|
||||||
nonlocal logger, zkhandler, keepalive_timer, d_domain
|
nonlocal logger, zkhandler, keepalive_timer, d_domain, netstats
|
||||||
|
|
||||||
logger.out("Terminating pvcnoded and cleaning up", state="s")
|
logger.out("Terminating pvcnoded", state="s")
|
||||||
|
|
||||||
# Set shutdown state in Zookeeper
|
# Set shutdown state in Zookeeper
|
||||||
zkhandler.write([(("node.state.daemon", config["node_hostname"]), "shutdown")])
|
zkhandler.write([(("node.state.daemon", config["node_hostname"]), "shutdown")])
|
||||||
@ -249,12 +250,20 @@ def entrypoint():
|
|||||||
except Exception:
|
except Exception:
|
||||||
pass
|
pass
|
||||||
|
|
||||||
# Set stop state in Zookeeper
|
logger.out("Cleaning up", state="s")
|
||||||
zkhandler.write([(("node.state.daemon", config["node_hostname"]), "stop")])
|
|
||||||
|
# Stop netstats instance
|
||||||
|
try:
|
||||||
|
netstats.shutdown()
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
|
|
||||||
# Forcibly terminate dnsmasq because it gets stuck sometimes
|
# Forcibly terminate dnsmasq because it gets stuck sometimes
|
||||||
common.run_os_command("killall dnsmasq")
|
common.run_os_command("killall dnsmasq")
|
||||||
|
|
||||||
|
# Set stop state in Zookeeper
|
||||||
|
zkhandler.write([(("node.state.daemon", config["node_hostname"]), "stop")])
|
||||||
|
|
||||||
# Close the Zookeeper connection
|
# Close the Zookeeper connection
|
||||||
try:
|
try:
|
||||||
zkhandler.disconnect(persistent=True)
|
zkhandler.disconnect(persistent=True)
|
||||||
@ -1000,9 +1009,12 @@ def entrypoint():
|
|||||||
state="s",
|
state="s",
|
||||||
)
|
)
|
||||||
|
|
||||||
|
# Set up netstats
|
||||||
|
netstats = NetstatsInstance.NetstatsInstance(logger, config, zkhandler, this_node)
|
||||||
|
|
||||||
# Start keepalived thread
|
# Start keepalived thread
|
||||||
keepalive_timer = pvcnoded.util.keepalive.start_keepalive_timer(
|
keepalive_timer = pvcnoded.util.keepalive.start_keepalive_timer(
|
||||||
logger, config, zkhandler, this_node
|
logger, config, zkhandler, this_node, netstats
|
||||||
)
|
)
|
||||||
|
|
||||||
# Tick loop; does nothing since everything is async
|
# Tick loop; does nothing since everything is async
|
||||||
|
293
node-daemon/pvcnoded/objects/NetstatsInstance.py
Normal file
293
node-daemon/pvcnoded/objects/NetstatsInstance.py
Normal file
@ -0,0 +1,293 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
|
||||||
|
# NetstatsInstance.py - Class implementing a PVC network stats gatherer and run by pvcnoded
|
||||||
|
# Part of the Parallel Virtual Cluster (PVC) system
|
||||||
|
#
|
||||||
|
# Copyright (C) 2018-2023 Joshua M. Boniface <joshua@boniface.me>
|
||||||
|
#
|
||||||
|
# This program is free software: you can redistribute it and/or modify
|
||||||
|
# it under the terms of the GNU General Public License as published by
|
||||||
|
# the Free Software Foundation, version 3.
|
||||||
|
#
|
||||||
|
# This program is distributed in the hope that it will be useful,
|
||||||
|
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
# GNU General Public License for more details.
|
||||||
|
#
|
||||||
|
# You should have received a copy of the GNU General Public License
|
||||||
|
# along with this program. If not, see <https://www.gnu.org/licenses/>.
|
||||||
|
#
|
||||||
|
###############################################################################
|
||||||
|
|
||||||
|
|
||||||
|
from apscheduler.schedulers.background import BackgroundScheduler
|
||||||
|
from collections import deque
|
||||||
|
from json import dumps
|
||||||
|
from os import walk
|
||||||
|
from os.path import exists
|
||||||
|
|
||||||
|
|
||||||
|
class NetstatsIfaceInstance(object):
|
||||||
|
"""
|
||||||
|
NetstatsIfaceInstance
|
||||||
|
|
||||||
|
This class implements a rolling statistics poller for a network interface,
|
||||||
|
collecting stats on the bits and packets per second in both directions every
|
||||||
|
second.
|
||||||
|
|
||||||
|
Via the get_stats() function, it returns the rolling average of all 4 values,
|
||||||
|
as well as totals, over the last 5 seconds (self.avg_samples) as a tuple of:
|
||||||
|
(rx_bps, rx_pps, tx_bps, tx_pps, total_bps, total_pps, link_speed, state)
|
||||||
|
"""
|
||||||
|
|
||||||
|
def __init__(self, logger, iface, avg_samples):
|
||||||
|
"""
|
||||||
|
Initialize the class instance, creating our BackgroundScheduler, setting
|
||||||
|
the average sample rate, and creating the deques and average values.
|
||||||
|
"""
|
||||||
|
self.logger = logger
|
||||||
|
self.iface = iface
|
||||||
|
|
||||||
|
self.data_valid = False
|
||||||
|
self.data_polls = 0
|
||||||
|
|
||||||
|
self.timer = BackgroundScheduler()
|
||||||
|
self.timer.add_job(self.gather_stats, trigger="interval", seconds=1)
|
||||||
|
|
||||||
|
self.avg_samples = avg_samples
|
||||||
|
|
||||||
|
self.link_speed = 0
|
||||||
|
self.state = "down"
|
||||||
|
|
||||||
|
self.rx_bits_rolling = deque(list(), self.avg_samples + 1)
|
||||||
|
self.rx_bps = 0
|
||||||
|
|
||||||
|
self.rx_packets_rolling = deque(list(), self.avg_samples + 1)
|
||||||
|
self.rx_pps = 0
|
||||||
|
|
||||||
|
self.tx_bits_rolling = deque(list(), self.avg_samples + 1)
|
||||||
|
self.tx_bps = 0
|
||||||
|
|
||||||
|
self.tx_packets_rolling = deque(list(), self.avg_samples + 1)
|
||||||
|
self.tx_pps = 0
|
||||||
|
|
||||||
|
self.total_bps = 0
|
||||||
|
self.total_pps = 0
|
||||||
|
|
||||||
|
def get_iface_stats(self):
|
||||||
|
"""
|
||||||
|
Reads the interface statistics from the sysfs for the interface.
|
||||||
|
"""
|
||||||
|
iface_state_path = f"/sys/class/net/{self.iface}/operstate"
|
||||||
|
with open(iface_state_path) as stfh:
|
||||||
|
self.state = stfh.read().strip()
|
||||||
|
|
||||||
|
iface_speed_path = f"/sys/class/net/{self.iface}/speed"
|
||||||
|
try:
|
||||||
|
with open(iface_speed_path) as spfh:
|
||||||
|
# The speed key is always in Mbps so multiply by 1000*1000 to get bps
|
||||||
|
self.link_speed = int(spfh.read()) * 1000 * 1000
|
||||||
|
except OSError:
|
||||||
|
self.link_speed = 0
|
||||||
|
|
||||||
|
iface_stats_path = f"/sys/class/net/{self.iface}/statistics"
|
||||||
|
with open(f"{iface_stats_path}/rx_bytes") as rxbfh:
|
||||||
|
self.rx_bits_rolling.append(int(rxbfh.read()) * 8)
|
||||||
|
with open(f"{iface_stats_path}/tx_bytes") as txbfh:
|
||||||
|
self.tx_bits_rolling.append(int(txbfh.read()) * 8)
|
||||||
|
with open(f"{iface_stats_path}/rx_packets") as rxpfh:
|
||||||
|
self.rx_packets_rolling.append(int(rxpfh.read()) * 8)
|
||||||
|
with open(f"{iface_stats_path}/tx_packets") as txpfh:
|
||||||
|
self.tx_packets_rolling.append(int(txpfh.read()) * 8)
|
||||||
|
|
||||||
|
def calculate_averages(self):
|
||||||
|
"""
|
||||||
|
Calculates the bps/pps values from the rolling values.
|
||||||
|
"""
|
||||||
|
|
||||||
|
rx_bits_diffs = list()
|
||||||
|
for sample_idx in range(self.avg_samples, 0, -1):
|
||||||
|
rx_bits_diffs.append(
|
||||||
|
self.rx_bits_rolling[sample_idx] - self.rx_bits_rolling[sample_idx - 1]
|
||||||
|
)
|
||||||
|
self.rx_bps = int(sum(rx_bits_diffs) / self.avg_samples)
|
||||||
|
|
||||||
|
rx_packets_diffs = list()
|
||||||
|
for sample_idx in range(self.avg_samples, 0, -1):
|
||||||
|
rx_packets_diffs.append(
|
||||||
|
self.rx_packets_rolling[sample_idx]
|
||||||
|
- self.rx_packets_rolling[sample_idx - 1]
|
||||||
|
)
|
||||||
|
self.rx_pps = int(sum(rx_packets_diffs) / self.avg_samples)
|
||||||
|
|
||||||
|
tx_bits_diffs = list()
|
||||||
|
for sample_idx in range(self.avg_samples, 0, -1):
|
||||||
|
tx_bits_diffs.append(
|
||||||
|
self.tx_bits_rolling[sample_idx] - self.tx_bits_rolling[sample_idx - 1]
|
||||||
|
)
|
||||||
|
self.tx_bps = int(sum(tx_bits_diffs) / self.avg_samples)
|
||||||
|
|
||||||
|
tx_packets_diffs = list()
|
||||||
|
for sample_idx in range(self.avg_samples, 0, -1):
|
||||||
|
tx_packets_diffs.append(
|
||||||
|
self.tx_packets_rolling[sample_idx]
|
||||||
|
- self.tx_packets_rolling[sample_idx - 1]
|
||||||
|
)
|
||||||
|
self.tx_pps = int(sum(tx_packets_diffs) / self.avg_samples)
|
||||||
|
|
||||||
|
self.total_bps = self.rx_bps + self.tx_bps
|
||||||
|
self.total_pps = self.rx_pps + self.tx_pps
|
||||||
|
|
||||||
|
def gather_stats(self):
|
||||||
|
"""
|
||||||
|
Gathers the current stats and then calculates the averages.
|
||||||
|
|
||||||
|
Runs via the BackgroundScheduler timer every 1 second.
|
||||||
|
"""
|
||||||
|
self.get_iface_stats()
|
||||||
|
if self.data_valid:
|
||||||
|
self.calculate_averages()
|
||||||
|
|
||||||
|
# Handle data validity: our data is invalid until we hit enough polls
|
||||||
|
# to make a valid average (avg_samples plus 1).
|
||||||
|
if not self.data_valid:
|
||||||
|
self.data_polls += 1
|
||||||
|
if self.data_polls > self.avg_samples:
|
||||||
|
self.data_valid = True
|
||||||
|
|
||||||
|
def start(self):
|
||||||
|
"""
|
||||||
|
Starts the timer.
|
||||||
|
"""
|
||||||
|
self.timer.start()
|
||||||
|
|
||||||
|
def stop(self):
|
||||||
|
"""
|
||||||
|
Stops the timer.
|
||||||
|
"""
|
||||||
|
self.timer.shutdown()
|
||||||
|
|
||||||
|
def get_stats(self):
|
||||||
|
"""
|
||||||
|
Returns a tuple of the current statistics.
|
||||||
|
"""
|
||||||
|
if not self.data_valid:
|
||||||
|
return None
|
||||||
|
|
||||||
|
return (
|
||||||
|
self.rx_bps,
|
||||||
|
self.rx_pps,
|
||||||
|
self.tx_bps,
|
||||||
|
self.tx_pps,
|
||||||
|
self.total_bps,
|
||||||
|
self.total_pps,
|
||||||
|
self.link_speed,
|
||||||
|
self.state,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
class NetstatsInstance(object):
|
||||||
|
"""
|
||||||
|
NetstatsInstance
|
||||||
|
|
||||||
|
This class implements a rolling statistics poller for all PHYSICAL network interfaces,
|
||||||
|
on the system, initializing a NetstatsIfaceInstance for each, as well as handling
|
||||||
|
value updates into Zookeeper.
|
||||||
|
"""
|
||||||
|
|
||||||
|
def __init__(self, logger, config, zkhandler, this_node):
|
||||||
|
"""
|
||||||
|
Initialize the class instance.
|
||||||
|
"""
|
||||||
|
self.logger = logger
|
||||||
|
self.config = config
|
||||||
|
self.zkhandler = zkhandler
|
||||||
|
self.node_name = this_node.name
|
||||||
|
|
||||||
|
self.interfaces = dict()
|
||||||
|
|
||||||
|
self.logger.out(
|
||||||
|
f"Starting netstats collector ({self.config['keepalive_interval']} second interval)",
|
||||||
|
state="s",
|
||||||
|
)
|
||||||
|
|
||||||
|
self.set_interfaces()
|
||||||
|
|
||||||
|
def shutdown(self):
|
||||||
|
"""
|
||||||
|
Stop all pollers and delete the NetstatsIfaceInstance objects
|
||||||
|
"""
|
||||||
|
# Empty the network stats object
|
||||||
|
self.zkhandler.write([(("node.network.stats", self.node_name), dumps({}))])
|
||||||
|
|
||||||
|
for iface in self.interfaces.keys():
|
||||||
|
self.interfaces[iface].stop()
|
||||||
|
|
||||||
|
def set_interfaces(self):
|
||||||
|
"""
|
||||||
|
Sets the list of interfaces on the system, and then ensures that each
|
||||||
|
interface has a NetstatsIfaceInstance assigned to it and polling.
|
||||||
|
"""
|
||||||
|
# Get a list of all active interfaces
|
||||||
|
net_root_path = "/sys/class/net"
|
||||||
|
all_ifaces = list()
|
||||||
|
for (_, dirnames, _) in walk(net_root_path):
|
||||||
|
all_ifaces.extend(dirnames)
|
||||||
|
all_ifaces.sort()
|
||||||
|
|
||||||
|
self.logger.out(
|
||||||
|
f"Parsing network list: {all_ifaces}", state="d", prefix="netstats-thread"
|
||||||
|
)
|
||||||
|
|
||||||
|
# Add any missing interfaces
|
||||||
|
for iface in all_ifaces:
|
||||||
|
if not exists(f"{net_root_path}/{iface}/device"):
|
||||||
|
# This is not a physical interface; skip it
|
||||||
|
continue
|
||||||
|
|
||||||
|
if iface not in self.interfaces.keys():
|
||||||
|
# Set the number of samples to be equal to the keepalive interval, so that each
|
||||||
|
# keepalive has a fresh set of data from the last keepalive_interval seconds.
|
||||||
|
self.interfaces[iface] = NetstatsIfaceInstance(
|
||||||
|
self.logger, iface, self.config["keepalive_interval"]
|
||||||
|
)
|
||||||
|
self.interfaces[iface].start()
|
||||||
|
# Remove any superfluous interfaces
|
||||||
|
for iface in self.interfaces.keys():
|
||||||
|
if iface not in all_ifaces:
|
||||||
|
self.interfaces[iface].stop()
|
||||||
|
del self.interfaces[iface]
|
||||||
|
|
||||||
|
def set_data(self):
|
||||||
|
data = dict()
|
||||||
|
for iface in self.interfaces.keys():
|
||||||
|
self.logger.out(
|
||||||
|
f"Getting data for interface {iface}",
|
||||||
|
state="d",
|
||||||
|
prefix="netstats-thread",
|
||||||
|
)
|
||||||
|
iface_stats = self.interfaces[iface].get_stats()
|
||||||
|
if iface_stats is None:
|
||||||
|
continue
|
||||||
|
(
|
||||||
|
iface_rx_bps,
|
||||||
|
iface_rx_pps,
|
||||||
|
iface_tx_bps,
|
||||||
|
iface_tx_pps,
|
||||||
|
iface_total_bps,
|
||||||
|
iface_total_pps,
|
||||||
|
iface_link_speed,
|
||||||
|
iface_state,
|
||||||
|
) = iface_stats
|
||||||
|
data[iface] = {
|
||||||
|
"rx_bps": iface_rx_bps,
|
||||||
|
"rx_pps": iface_rx_pps,
|
||||||
|
"tx_bps": iface_tx_bps,
|
||||||
|
"tx_pps": iface_tx_pps,
|
||||||
|
"total_bps": iface_total_bps,
|
||||||
|
"total_pps": iface_total_pps,
|
||||||
|
"link_speed": iface_link_speed,
|
||||||
|
"state": iface_state,
|
||||||
|
}
|
||||||
|
|
||||||
|
self.zkhandler.write([(("node.network.stats", self.node_name), dumps(data))])
|
@ -51,7 +51,7 @@ libvirt_vm_states = {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
def start_keepalive_timer(logger, config, zkhandler, this_node):
|
def start_keepalive_timer(logger, config, zkhandler, this_node, netstats):
|
||||||
keepalive_interval = config["keepalive_interval"]
|
keepalive_interval = config["keepalive_interval"]
|
||||||
logger.out(
|
logger.out(
|
||||||
f"Starting keepalive timer ({keepalive_interval} second interval)", state="s"
|
f"Starting keepalive timer ({keepalive_interval} second interval)", state="s"
|
||||||
@ -59,7 +59,7 @@ def start_keepalive_timer(logger, config, zkhandler, this_node):
|
|||||||
keepalive_timer = BackgroundScheduler()
|
keepalive_timer = BackgroundScheduler()
|
||||||
keepalive_timer.add_job(
|
keepalive_timer.add_job(
|
||||||
node_keepalive,
|
node_keepalive,
|
||||||
args=(logger, config, zkhandler, this_node),
|
args=(logger, config, zkhandler, this_node, netstats),
|
||||||
trigger="interval",
|
trigger="interval",
|
||||||
seconds=keepalive_interval,
|
seconds=keepalive_interval,
|
||||||
)
|
)
|
||||||
@ -684,7 +684,7 @@ def collect_vm_stats(logger, config, zkhandler, this_node, queue):
|
|||||||
|
|
||||||
|
|
||||||
# Keepalive update function
|
# Keepalive update function
|
||||||
def node_keepalive(logger, config, zkhandler, this_node):
|
def node_keepalive(logger, config, zkhandler, this_node, netstats):
|
||||||
debug = config["debug"]
|
debug = config["debug"]
|
||||||
|
|
||||||
# Display node information to the terminal
|
# Display node information to the terminal
|
||||||
@ -793,6 +793,10 @@ def node_keepalive(logger, config, zkhandler, this_node):
|
|||||||
this_node.memfree = int(psutil.virtual_memory().free / 1024 / 1024)
|
this_node.memfree = int(psutil.virtual_memory().free / 1024 / 1024)
|
||||||
this_node.cpuload = round(os.getloadavg()[0], 2)
|
this_node.cpuload = round(os.getloadavg()[0], 2)
|
||||||
|
|
||||||
|
# Get node network statistics via netstats instance
|
||||||
|
netstats.set_interfaces()
|
||||||
|
netstats.set_data()
|
||||||
|
|
||||||
# Join against running threads
|
# Join against running threads
|
||||||
if config["enable_hypervisor"]:
|
if config["enable_hypervisor"]:
|
||||||
vm_stats_thread.join(timeout=config["keepalive_interval"])
|
vm_stats_thread.join(timeout=config["keepalive_interval"])
|
||||||
|
Reference in New Issue
Block a user