diff --git a/node-daemon/plugins/nics b/node-daemon/plugins/nics new file mode 100644 index 00000000..3edc4dec --- /dev/null +++ b/node-daemon/plugins/nics @@ -0,0 +1,197 @@ +#!/usr/bin/env python3 + +# nics.py - PVC Monitoring example plugin for NIC interfaces +# Part of the Parallel Virtual Cluster (PVC) system +# +# Copyright (C) 2018-2022 Joshua M. Boniface +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, version 3. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . +# +############################################################################### + +# This script provides an example of a PVC monitoring plugin script. It will create +# a simple plugin to check the network interfaces of the host, specifically for speed +# and 802.3ad status (if applicable). + +# This script can thus be used as an example or reference implementation of a +# PVC monitoring pluginscript and expanded upon as required. + +# A monitoring plugin script must implement the class "MonitoringPluginScript" which +# extends "MonitoringPlugin", providing the 3 functions indicated. Detailed explanation +# of the role of each function is provided in context of the example; see the other +# examples for more potential uses. + +# WARNING: +# +# This script will run in the context of the node daemon keepalives as root. +# DO NOT install untrusted, unvetted plugins under any circumstances. + + +# This import is always required here, as MonitoringPlugin is used by the +# MonitoringPluginScript class +from pvcnoded.objects.MonitoringInstance import MonitoringPlugin + + +# A monitoring plugin script must always expose its nice name, which must be identical to +# the file name +PLUGIN_NAME = "nics" + +# Set a minimum link speed variable used below +# For PVC at least 10 Gbps is required for proper operation of a cluster +MINIMUM_LINKSPEED = 10000 + + +# The MonitoringPluginScript class must be named as such, and extend MonitoringPlugin. +class MonitoringPluginScript(MonitoringPlugin): + def setup(self): + """ + setup(): Perform special setup steps during node daemon startup + + This step is optional and should be used sparingly. + """ + + pass + + def run(self): + """ + run(): Perform the check actions and return a PluginResult object + """ + + # Run any imports first + import daemon_lib.common as common + from re import match, search + + messages = list() + health_delta = 0 + + # Get a list of the various underlying devices + _core_nics = set() + + for dev in [ + self.config['bridge_dev'], + self.config['upstream_dev'], + self.config['cluster_dev'], + self.config['storage_dev'], + ]: + with open(f'/sys/class/net/{dev}/uevent', 'r') as uevent: + _devtype = uevent.readlines()[0].split('=')[-1].strip() + + if _devtype == 'vlan': + with open(f"/proc/net/vlan/{dev}") as devfh: + vlan_info = devfh.read().split('\n') + for line in vlan_info: + if match(r'^Device:', line): + dev = line.split()[-1] + + _core_nics.add(dev) + + core_nics = sorted(list(_core_nics)) + + for dev in core_nics: + with open(f'/sys/class/net/{dev}/uevent', 'r') as uevent: + _devtype = uevent.readlines()[0].split('=')[-1].strip() + + if _devtype == "bond": + syspath = f"/proc/net/bonding/{dev}" + + with open(syspath) as devfh: + bonding_stats = devfh.read() + + _, _mode, _info, *_slaves = bonding_stats.split('\n\n') + + slave_interfaces = list() + for slavedev in _slaves: + lines = slavedev.split('\n') + for line in lines: + if match(r'^Slave Interface:', line): + interface_name = line.split()[-1] + if match(r'^MII Status:', line): + interface_status = line.split()[-1] + if match(r'^Speed:', line): + try: + interface_speed_mbps = int(line.split()[-2]) + except Exception: + interface_speed_mbps = 0 + if match(r'^Duplex:', line): + interface_duplex = line.split()[-1] + slave_interfaces.append((interface_name, interface_status, interface_speed_mbps, interface_duplex)) + + # Ensure at least 2 slave interfaces are up + slave_interface_up_count = 0 + for slave_interface in slave_interfaces: + if slave_interface[1] == 'up': + slave_interface_up_count += 1 + if slave_interface_up_count < 2: + messages.append(f"{dev} DEGRADED with {slave_interface_up_count} active slaves") + health_delta += 10 + else: + messages.append(f"{dev} OK with {slave_interface_up_count} active slaves") + + # Get ethtool supported speeds for slave interfaces + supported_link_speeds = set() + for slave_interface in slave_interfaces: + slave_dev = slave_interface[0] + _, ethtool_stdout, _ = common.run_os_command(f"ethtool {slave_dev}") + in_modes = False + for line in ethtool_stdout.split('\n'): + if search('Supported link modes:', line): + in_modes = True + if search('Supported pause frame use:', line): + in_modes = False + break + if in_modes: + speed = int(line.split()[-1].replace('baseT', '').split('/')[0]) + supported_link_speeds.add(speed) + else: + # Get ethtool supported speeds for interface + supported_link_speeds = set() + _, ethtool_stdout, _ = common.run_os_command(f"ethtool {dev}") + in_modes = False + for line in ethtool_stdout.split('\n'): + if search('Supported link modes:', line): + in_modes = True + if search('Supported pause frame use:', line): + in_modes = False + break + if in_modes: + speed = int(line.split()[-1].replace('baseT', '').split('/')[0]) + supported_link_speeds.add(speed) + + max_supported_link_speed = sorted(list(supported_link_speeds))[-1] + + # Ensure interface is running at MINIMUM_LINKSPEED + with open(f"/sys/class/net/{dev}/speed") as devfh: + dev_speed = int(devfh.read()) + if dev_speed < max_supported_link_speed: + messages.append(f"{dev} DEGRADED at {dev_speed} Mbps") + health_delta += 10 + else: + messages.append(f"{dev} OK at {dev_speed} Mbps") + + # Set the health delta in our local PluginResult object + self.plugin_result.set_health_delta(health_delta) + + # Set the message in our local PluginResult object + self.plugin_result.set_message(', '.join(messages)) + + # Return our local PluginResult object + return self.plugin_result + + def cleanup(self): + """ + cleanup(): Perform special cleanup steps during node daemon termination + + This step is optional and should be used sparingly. + """ + + pass