#!/usr/bin/env python3

# nics.py - PVC Monitoring example plugin for NIC interfaces
# Part of the Parallel Virtual Cluster (PVC) system
#
#    Copyright (C) 2018-2022 Joshua M. Boniface
#
#    This program is free software: you can redistribute it and/or modify
#    it under the terms of the GNU General Public License as published by
#    the Free Software Foundation, version 3.
#
#    This program is distributed in the hope that it will be useful,
#    but WITHOUT ANY WARRANTY; without even the implied warranty of
#    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#    GNU General Public License for more details.
#
#    You should have received a copy of the GNU General Public License
#    along with this program.  If not, see <https://www.gnu.org/licenses/>.
#
###############################################################################

# This script provides an example of a PVC monitoring plugin script. It will create
# a simple plugin to check the network interfaces of the host, specifically for speed
# and 802.3ad status (if applicable).

# This script can thus be used as an example or reference implementation of a
# PVC monitoring plugin script and expanded upon as required.

# A monitoring plugin script must implement the class "MonitoringPluginScript" which
# extends "MonitoringPlugin", providing the 3 functions indicated. Detailed explanation
# of the role of each function is provided in context of the example; see the other
# examples for more potential uses.

# WARNING:
#
# This script will run in the context of the node daemon keepalives as root.
# DO NOT install untrusted, unvetted plugins under any circumstances.

# This import is always required here, as MonitoringPlugin is used by the
# MonitoringPluginScript class
from pvcnoded.objects.MonitoringInstance import MonitoringPlugin


# A monitoring plugin script must always expose its nice name, which must be identical to
# the file name
PLUGIN_NAME = "nics"


# Set a minimum link speed variable
# For PVC at least 10 Gbps is required for proper operation of a cluster
# NOTE: run() compares each interface against the highest speed it reports as
# supported, not against this value; the constant is kept for compatibility.
MINIMUM_LINKSPEED = 10000


def _read_device_type(dev):
    """
    _read_device_type(): Return the value part of the first uevent line of a device

    Reads /sys/class/net/<dev>/uevent and returns whatever follows the last '='
    on its first line, stripped. run() compares the result against "vlan" and
    "bond"; any other value is treated as a plain interface.
    """

    with open(f'/sys/class/net/{dev}/uevent', 'r') as uevent:
        first_line = uevent.readline()

    return first_line.split('=')[-1].strip()


def _resolve_vlan_parent(dev):
    """
    _resolve_vlan_parent(): Return the underlying device of a VLAN interface

    Reads /proc/net/vlan/<dev> and returns the last token of its "Device:" line.
    If no such line is present, the VLAN device name itself is returned.
    """

    with open(f"/proc/net/vlan/{dev}") as devfh:
        vlan_info = devfh.read().split('\n')

    parent = dev
    for line in vlan_info:
        if line.startswith('Device:'):
            parent = line.split()[-1]

    return parent


def _parse_bond_slaves(bonding_stats):
    """
    _parse_bond_slaves(): Extract slave details from /proc/net/bonding/<dev> text

    Returns a list of (name, mii_status, speed_mbps, duplex) tuples, one for each
    blank-line-separated section that contains a "Slave Interface:" line.

    Sections are selected by content rather than by position so that bonds
    without an "802.3ad info" section do not lose their first slave, and every
    field is reset per section so a missing line can never inherit the value
    parsed for the previous slave.
    """

    slave_interfaces = list()

    for section in bonding_stats.split('\n\n'):
        interface_name = None
        interface_status = 'unknown'
        interface_speed_mbps = 0
        interface_duplex = 'unknown'

        for line in section.split('\n'):
            if line.startswith('Slave Interface:'):
                interface_name = line.split()[-1]
            elif line.startswith('MII Status:'):
                interface_status = line.split()[-1]
            elif line.startswith('Speed:'):
                # "Speed: 10000 Mbps" -> 10000; "Speed: Unknown" -> 0
                try:
                    interface_speed_mbps = int(line.split()[-2])
                except (ValueError, IndexError):
                    interface_speed_mbps = 0
            elif line.startswith('Duplex:'):
                interface_duplex = line.split()[-1]

        # Header, mode and 802.3ad sections carry no slave name; skip them
        if interface_name is not None:
            slave_interfaces.append(
                (interface_name, interface_status, interface_speed_mbps, interface_duplex)
            )

    return slave_interfaces


def _parse_supported_speeds(ethtool_stdout):
    """
    _parse_supported_speeds(): Extract supported link speeds from ethtool output

    Scans the lines from "Supported link modes:" up to (not including)
    "Supported pause frame use:" and returns the set of integer speeds found in
    mode names of the form "<speed>base<type>/<duplex>".

    Any media type is accepted (baseT, baseSR, baseKX4, ...), and lines without a
    mode name (e.g. "Not reported") simply contribute nothing instead of raising.
    """

    # Imported locally, following this plugin's convention of function-level imports
    from re import findall

    supported_link_speeds = set()
    in_modes = False

    for line in ethtool_stdout.split('\n'):
        if 'Supported pause frame use:' in line:
            break
        if 'Supported link modes:' in line:
            in_modes = True
        if in_modes:
            supported_link_speeds.update(
                int(speed) for speed in findall(r'(\d+)base', line)
            )

    return supported_link_speeds


def _read_link_speed(dev):
    """
    _read_link_speed(): Return the integer in /sys/class/net/<dev>/speed

    Returns 0 if the file cannot be read or does not contain an integer, so that
    an unreadable speed is reported as a degraded link rather than aborting the
    whole check.
    """

    try:
        with open(f"/sys/class/net/{dev}/speed") as devfh:
            return int(devfh.read())
    except (OSError, ValueError):
        return 0


# The MonitoringPluginScript class must be named as such, and extend MonitoringPlugin.
class MonitoringPluginScript(MonitoringPlugin):
    def setup(self):
        """
        setup(): Perform special setup steps during node daemon startup

        This step is optional and should be used sparingly.
        """

        pass

    def run(self):
        """
        run(): Perform the check actions and return a PluginResult object

        For each distinct device underlying the configured bridge, upstream,
        cluster and storage devices:
          * bonds: report how many slaves have MII status "up", adding 10 to
            the health delta when fewer than 2 are up;
          * all devices: compare the current link speed against the highest
            speed reported as supported by ethtool (for bonds, across all
            slaves), adding 10 to the health delta when it is lower.

        The accumulated health delta and a comma-separated message are stored in
        self.plugin_result, which is then returned.
        """

        # Run any imports first
        import daemon_lib.common as common

        messages = list()
        health_delta = 0

        # Get a list of the various underlying devices; VLAN interfaces are
        # replaced by their parent device, and the set removes duplicates
        core_nics = set()

        for dev in [
            self.config['bridge_dev'],
            self.config['upstream_dev'],
            self.config['cluster_dev'],
            self.config['storage_dev'],
        ]:
            if _read_device_type(dev) == 'vlan':
                dev = _resolve_vlan_parent(dev)

            core_nics.add(dev)

        for dev in sorted(core_nics):
            if _read_device_type(dev) == "bond":
                with open(f"/proc/net/bonding/{dev}") as devfh:
                    slave_interfaces = _parse_bond_slaves(devfh.read())

                # Ensure at least 2 slave interfaces are up
                slave_interface_up_count = sum(
                    1 for slave_interface in slave_interfaces if slave_interface[1] == 'up'
                )
                if slave_interface_up_count < 2:
                    messages.append(f"{dev} DEGRADED with {slave_interface_up_count} active slaves")
                    health_delta += 10
                else:
                    messages.append(f"{dev} OK with {slave_interface_up_count} active slaves")

                # Supported speeds are gathered from each slave interface
                ethtool_devs = [slave_interface[0] for slave_interface in slave_interfaces]
            else:
                # Supported speeds are gathered from the interface itself
                ethtool_devs = [dev]

            # Get ethtool supported speeds
            supported_link_speeds = set()
            for ethtool_dev in ethtool_devs:
                _, ethtool_stdout, _ = common.run_os_command(f"ethtool {ethtool_dev}")
                supported_link_speeds |= _parse_supported_speeds(ethtool_stdout)

            # 0 when nothing could be parsed, in which case no speed can be
            # judged as too low
            max_supported_link_speed = max(supported_link_speeds, default=0)

            # Ensure interface is running at its maximum supported speed
            dev_speed = _read_link_speed(dev)
            if dev_speed < max_supported_link_speed:
                messages.append(f"{dev} DEGRADED at {dev_speed} Mbps")
                health_delta += 10
            else:
                messages.append(f"{dev} OK at {dev_speed} Mbps")

        # Set the health delta in our local PluginResult object
        self.plugin_result.set_health_delta(health_delta)

        # Set the message in our local PluginResult object
        self.plugin_result.set_message(', '.join(messages))

        # Return our local PluginResult object
        return self.plugin_result

    def cleanup(self):
        """
        cleanup(): Perform special cleanup steps during node daemon termination

        This step is optional and should be used sparingly.
        """

        pass