199 lines
		
	
	
		
			8.1 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
			
		
		
	
	
			199 lines
		
	
	
		
			8.1 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
| #!/usr/bin/env python3
 | |
| 
 | |
| # nics.py - PVC Monitoring example plugin for NIC interfaces
 | |
| # Part of the Parallel Virtual Cluster (PVC) system
 | |
| #
 | |
| #    Copyright (C) 2018-2024 Joshua M. Boniface <joshua@boniface.me>
 | |
| #
 | |
| #    This program is free software: you can redistribute it and/or modify
 | |
| #    it under the terms of the GNU General Public License as published by
 | |
| #    the Free Software Foundation, version 3.
 | |
| #
 | |
| #    This program is distributed in the hope that it will be useful,
 | |
| #    but WITHOUT ANY WARRANTY; without even the implied warranty of
 | |
| #    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 | |
| #    GNU General Public License for more details.
 | |
| #
 | |
| #    You should have received a copy of the GNU General Public License
 | |
| #    along with this program.  If not, see <https://www.gnu.org/licenses/>.
 | |
| #
 | |
| ###############################################################################
 | |
| 
 | |
| # This script provides an example of a PVC monitoring plugin script. It will create
 | |
| # a simple plugin to check the network interfaces of the host, specifically for speed
 | |
| # and 802.3ad status (if applicable).
 | |
| 
 | |
| # This script can thus be used as an example or reference implementation of a
 | |
| # PVC monitoring plugin script and expanded upon as required.
 | |
| 
 | |
| # A monitoring plugin script must implement the class "MonitoringPluginScript" which
 | |
| # extends "MonitoringPlugin", providing the 3 functions indicated. Detailed explanation
 | |
| # of the role of each function is provided in context of the example; see the other
 | |
| # examples for more potential uses.
 | |
| 
 | |
| # WARNING:
 | |
| #
 | |
| # This script will run in the context of the node daemon keepalives as root.
 | |
| # DO NOT install untrusted, unvetted plugins under any circumstances.
 | |
| 
 | |
| 
 | |
| # This import is always required here, as MonitoringPlugin is used by the
 | |
| # MonitoringPluginScript class
 | |
| from pvchealthd.objects.MonitoringInstance import MonitoringPlugin
 | |
| 
 | |
| 
 | |
| # A monitoring plugin script must always expose its nice name, which must be identical to
 | |
| # the file name
 | |
| PLUGIN_NAME = "nics"
 | |
| 
 | |
| 
 | |
| # The MonitoringPluginScript class must be named as such, and extend MonitoringPlugin.
 | |
class MonitoringPluginScript(MonitoringPlugin):
    """
    Example monitoring plugin checking the node's core network interfaces.

    For each of the configured bridge, upstream, cluster, and storage devices
    (resolved down to the underlying device when they are VLANs), the plugin
    verifies that bonds have all of their slaves up and that the interface is
    running at the highest link speed ethtool reports as supported.
    """

    def setup(self):
        """
        setup(): Perform special setup steps during node daemon startup

        This step is optional and should be used sparingly.

        If you wish for the plugin to not load in certain conditions, do any checks here
        and return a non-None failure message to indicate the error.
        """

        pass

    @staticmethod
    def _read_devtype(dev):
        """
        Return the DEVTYPE value from /sys/class/net/{dev}/uevent, or None if the
        uevent file carries no DEVTYPE entry (e.g. a plain physical interface).
        """

        with open(f'/sys/class/net/{dev}/uevent', 'r') as uevent:
            for line in uevent:
                key, _, value = line.partition('=')
                if key == 'DEVTYPE':
                    return value.strip()
        return None

    @staticmethod
    def _resolve_vlan_parent(dev):
        """
        Return the underlying device of VLAN interface {dev}, as listed on the
        "Device:" line of /proc/net/vlan/{dev}; fall back to {dev} itself if no
        such line is present.
        """

        parent = dev
        with open(f"/proc/net/vlan/{dev}") as devfh:
            for line in devfh:
                if line.startswith('Device:'):
                    parent = line.split()[-1]
        return parent

    @staticmethod
    def _parse_bond_slaves(bonding_stats):
        """
        Parse the contents of a /proc/net/bonding/<bond> file and return a list of
        (interface_name, mii_status) tuples, one per slave.

        Only blank-line-separated sections that begin with "Slave Interface:" are
        treated as slaves, so the result does not depend on how many header
        sections (driver, mode, 802.3ad info) precede them. A slave whose section
        has no "MII Status:" line gets a status of None.
        """

        slaves = []
        for section in bonding_stats.split('\n\n'):
            lines = section.strip().splitlines()
            if not lines or not lines[0].startswith('Slave Interface:'):
                continue

            interface_name = lines[0].split()[-1]
            # Reset per section so a missing field can never inherit the value
            # parsed for the previous slave
            interface_status = None
            for line in lines[1:]:
                if line.startswith('MII Status:'):
                    interface_status = line.split()[-1]
            slaves.append((interface_name, interface_status))

        return slaves

    @staticmethod
    def _parse_supported_speeds(ethtool_stdout):
        """
        Return the set of speeds (integers, as printed by ethtool) found in the
        "Supported link modes:" section of `ethtool <dev>` output.

        Every token of the form "<digits>base<...>" is accepted, regardless of
        media type (baseT, baseSR, baseKX, ...). Tokens that do not match (such as
        "Not reported") are ignored, so the result may be an empty set.
        """

        speeds = set()
        in_modes = False
        for line in ethtool_stdout.split('\n'):
            if 'Supported link modes:' in line:
                in_modes = True
            if 'Supported pause frame use:' in line:
                # The link modes section ends where the pause frame line begins
                break
            if not in_modes:
                continue
            for token in line.split():
                prefix, sep, _ = token.partition('base')
                if sep and prefix.isdecimal():
                    speeds.add(int(prefix))
        return speeds

    def run(self, coordinator_state=None):
        """
        run(): Perform the check actions and return a PluginResult object

        The {coordinator_state} can be used to check if this is a "primary" coordinator,
        "secondary" coordinator, or "client" (non-coordinator); it is not used by this
        plugin.

        Each degraded bond and each interface running below its maximum supported
        speed adds 10 to the health delta; one message per check is recorded.
        """

        # Run any imports first
        import daemon_lib.common as common

        messages = []
        health_delta = 0

        # Get a list of the various underlying devices; a set is used because
        # several roles may share the same underlying device
        core_nics = set()
        for dev in (
            self.config['bridge_dev'],
            self.config['upstream_dev'],
            self.config['cluster_dev'],
            self.config['storage_dev'],
        ):
            if self._read_devtype(dev) == 'vlan':
                dev = self._resolve_vlan_parent(dev)
            core_nics.add(dev)

        for dev in sorted(core_nics):
            if self._read_devtype(dev) == 'bond':
                with open(f"/proc/net/bonding/{dev}") as devfh:
                    slave_interfaces = self._parse_bond_slaves(devfh.read())

                # Ensure every slave interface of the bond is up
                slave_interface_up_count = sum(
                    1 for _, status in slave_interfaces if status == 'up'
                )
                if slave_interface_up_count < len(slave_interfaces):
                    messages.append(f"{dev} DEGRADED with {slave_interface_up_count} active slaves")
                    health_delta += 10
                else:
                    messages.append(f"{dev} OK with {slave_interface_up_count} active slaves")

                # Supported speeds of a bond are gathered from its slaves
                ethtool_devs = [name for name, _ in slave_interfaces]
            else:
                ethtool_devs = [dev]

            # Get ethtool supported speeds for the interface (or its slaves)
            supported_link_speeds = set()
            for ethtool_dev in ethtool_devs:
                _, ethtool_stdout, _ = common.run_os_command(f"ethtool {ethtool_dev}")
                supported_link_speeds |= self._parse_supported_speeds(ethtool_stdout)

            # With no reported link modes there is nothing to compare against, so
            # fall back to 0 rather than failing on an empty collection
            max_supported_link_speed = max(supported_link_speeds, default=0)

            # Ensure interface is running at its maximum speed
            # NOTE(review): the kernel may report -1 or refuse this read while the
            # link is down; that case is not handled here - confirm desired policy
            with open(f"/sys/class/net/{dev}/speed") as devfh:
                dev_speed = int(devfh.read())
            if dev_speed < max_supported_link_speed:
                messages.append(f"{dev} DEGRADED at {dev_speed} Mbps")
                health_delta += 10
            else:
                messages.append(f"{dev} OK at {dev_speed} Mbps")

        # Set the health delta in our local PluginResult object
        self.plugin_result.set_health_delta(health_delta)

        # Set the message in our local PluginResult object
        self.plugin_result.set_message(', '.join(messages))

        # Return our local PluginResult object
        return self.plugin_result

    def cleanup(self):
        """
        cleanup(): Perform special cleanup steps during node daemon termination

        This step is optional and should be used sparingly.
        """

        pass
 |