Compare commits


11 Commits

SHA1        Message                                             Date
f46bfc962f  Bump version to 0.9.75                              2023-09-16 23:06:38 -04:00
714d4b6005  Revert float conversion of cpu_cores                2023-09-16 23:06:07 -04:00
            Results in much uglier output; there are no decimal core counts.
fa8329ac3d  Explicitly round load avg in load plugin            2023-09-16 22:58:49 -04:00
457b7bed3d  Handle exceptions in fence migrations               2023-09-16 22:56:09 -04:00
86115b2928  Add startup message for IPMI reachability           2023-09-16 22:41:58 -04:00
            It's good to know that this succeeded in addition to knowing if it failed.
1a906b589e  Bump version to 0.9.74                              2023-09-16 00:18:13 -04:00
7b230d8bd5  Add monitoring plugin for hardware RAID arrays      2023-09-16 00:02:53 -04:00
48662e90c1  Remove obsolete monitoring_instance passing         2023-09-15 22:47:45 -04:00
079381c03e  Move printing to end and add runtime                2023-09-15 22:40:09 -04:00
794cea4a02  Reverse ordering, run checks before starting timer  2023-09-15 22:25:37 -04:00
fa24f3ba75  Fix bad fstring in psur check                       2023-09-15 22:19:49 -04:00
12 changed files with 325 additions and 26 deletions

View File

@@ -1 +1 @@
-0.9.73
+0.9.75

View File

@@ -1,5 +1,19 @@
 ## PVC Changelog
 
+###### [v0.9.75](https://github.com/parallelvirtualcluster/pvc/releases/tag/v0.9.75)
+
+  * [Node Daemon] Adds a startup message about IPMI when succeeding
+  * [Node Daemon] Fixes a bug in fencing allowing non-failing VMs to migrate
+  * [Node Daemon] Adds rounding to load average in load plugin for consistency
+
+###### [v0.9.74](https://github.com/parallelvirtualcluster/pvc/releases/tag/v0.9.74)
+
+  * [Docs] Removes docs from the main repo
+  * [Client CLI] Ensures that "provision" VMs are shown in the right colour
+  * [Node Daemon] Separates the node monitoring subsystem into its own thread with a longer, customizable update interval
+  * [Node Daemon] Adds checks for PSU input power redundancy (psur) and hardware RAID (hwrd)
+  * [Node Daemon] Updates when Keepalive start messages are printed (end of run, with runtime) to align with new monitoring messages
+
 ###### [v0.9.73](https://github.com/parallelvirtualcluster/pvc/releases/tag/v0.9.73)
 
   * [Node Daemon] Fixes a bug creating monitoring instance

View File

@@ -27,7 +27,7 @@ from ssl import SSLContext, TLSVersion
 from distutils.util import strtobool as dustrtobool
 
 # Daemon version
-version = "0.9.73"
+version = "0.9.75"
 
 # API version
 API_VERSION = 1.0

View File

@@ -2,7 +2,7 @@ from setuptools import setup
 
 setup(
     name="pvc",
-    version="0.9.73",
+    version="0.9.75",
     packages=["pvc.cli", "pvc.lib"],
     install_requires=[
         "Click",

debian/changelog (vendored, 18 changes)
View File

@@ -1,3 +1,21 @@
+pvc (0.9.75-0) unstable; urgency=high
+
+  * [Node Daemon] Adds a startup message about IPMI when succeeding
+  * [Node Daemon] Fixes a bug in fencing allowing non-failing VMs to migrate
+  * [Node Daemon] Adds rounding to load average in load plugin for consistency
+
+ -- Joshua M. Boniface <joshua@boniface.me>  Sat, 16 Sep 2023 23:06:38 -0400
+
+pvc (0.9.74-0) unstable; urgency=high
+
+  * [Docs] Removes docs from the main repo
+  * [Client CLI] Ensures that "provision" VMs are shown in the right colour
+  * [Node Daemon] Separates the node monitoring subsystem into its own thread with a longer, customizable update interval
+  * [Node Daemon] Adds checks for PSU input power redundancy (psur) and hardware RAID (hwrd)
+  * [Node Daemon] Updates when Keepalive start messages are printed (end of run, with runtime) to align with new monitoring messages
+
+ -- Joshua M. Boniface <joshua@boniface.me>  Sat, 16 Sep 2023 00:18:13 -0400
+
 pvc (0.9.73-0) unstable; urgency=high
 
   * [Node Daemon] Fixes a bug creating monitoring instance

node-daemon/plugins/hwrd (new file, 247 lines)
View File

@@ -0,0 +1,247 @@
#!/usr/bin/env python3

# hwrd.py - PVC Monitoring example plugin for hardware RAID arrays
# Part of the Parallel Virtual Cluster (PVC) system
#
# Copyright (C) 2018-2023 Joshua M. Boniface <joshua@boniface.me>
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, version 3.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <https://www.gnu.org/licenses/>.
#
###############################################################################

# This script provides an example of a PVC monitoring plugin script. It will create
# a simple plugin to check any hardware RAID virtual disks for health and report errors.
# Supports Dell BOSS cards, LSI/Avago/Broadcom MegaRAID, and HP SmartArray RAID.

# This script can thus be used as an example or reference implementation of a
# PVC monitoring plugin script and expanded upon as required.

# A monitoring plugin script must implement the class "MonitoringPluginScript" which
# extends "MonitoringPlugin", providing the 3 functions indicated. Detailed explanation
# of the role of each function is provided in context of the example; see the other
# examples for more potential uses.

# WARNING:
#
# This script will run in the context of the node daemon keepalives as root.
# DO NOT install untrusted, unvetted plugins under any circumstances.


# This import is always required here, as MonitoringPlugin is used by the
# MonitoringPluginScript class
from pvcnoded.objects.MonitoringInstance import MonitoringPlugin


# A monitoring plugin script must always expose its nice name, which must be identical to
# the file name
PLUGIN_NAME = "hwrd"


# The MonitoringPluginScript class must be named as such, and extend MonitoringPlugin.
class MonitoringPluginScript(MonitoringPlugin):
    def check_dellboss(self):
        # Run any imports first
        from daemon_lib.common import run_os_command
        from re import match

        health_delta = 0
        messages = list()

        _dellboss_ret, _dellboss_list, _ = run_os_command("mvcli info -o vd")
        if _dellboss_ret != 0:
            health_delta = 50
            messages.append("Error running MVCLI command")
        else:
            arrays = list()
            idx = None

            # Parse the id/name/status triplets out of the MVCLI output
            for line in _dellboss_list.split('\n'):
                if match(r"^id:", line):
                    idx = int(line.split(":")[-1].strip())
                    arrays.append(dict())
                if match(r"^name:", line):
                    arrays[idx]["name"] = line.split(":")[-1].strip()
                if match(r"^status:", line):
                    arrays[idx]["status"] = line.split(":")[-1].strip()

            for idx, array in enumerate(arrays):
                if array["status"] != "functional":
                    health_delta += 50
                    messages.append(f"RAID Dell BOSS ID {idx} (Name: {array['name']}, State: {array['status']})")

        if len(messages) < 1:
            messages.append("No valid RAID arrays found")

        return health_delta, messages

    def check_megaraid(self):
        # Run any imports first
        from daemon_lib.common import run_os_command

        health_delta = 0
        messages = list()

        _megaraid_ret, _megaraid_list, _ = run_os_command("megacli -LDInfo -Lall -aALL")
        if _megaraid_ret != 0:
            health_delta = 50
            messages.append("Error running MegaCLI command")
        else:
            vd_list = _megaraid_list.split('\n\n\n')
            for idx, _vd in enumerate(vd_list):
                vd = _vd.split('\n')
                # Skip any output stanza that is not a virtual drive block
                if len(vd) < 3 or "Virtual Drive Information" not in vd[2]:
                    continue

                raid_name = None
                raid_count = 0
                raid_state = None

                for entry in vd:
                    if len(entry.split(':')) < 2:
                        continue

                    entry_key = entry.split(':')[0].strip()
                    entry_value = entry.split(':')[1].strip()

                    if entry_key == "State":
                        raid_state = entry_value
                    if entry_key == "Name":
                        raid_name = entry_value
                    if entry_key == "Number Of Drives":
                        raid_count = entry_value

                if raid_state is None or raid_name is None or raid_count == 0:
                    health_delta += 10
                    messages.append(f"RAID ID {idx} did not report useful values")
                    continue

                if raid_state != "Optimal":
                    health_delta += 50
                    messages.append(f"RAID MegaRAID ID {idx} (Name: {raid_name}, Disks: {raid_count}, State: {raid_state})")

        if len(messages) < 1:
            messages.append("No valid RAID arrays found")

        return health_delta, messages

    def check_hpsa(self):
        # Run any imports first
        from daemon_lib.common import run_os_command
        from re import match

        health_delta = 0
        messages = list()

        _hparray_ret, _hparray_list, _ = run_os_command(f"ssacli ctrl slot={self.controller_slot} ld all show")
        if _hparray_ret != 0:
            health_delta = 50
            messages.append("Error running SSACLI command")
        else:
            vd_lines = _hparray_list.split('\n\n')

            arrays = list()
            cur_array = None
            for idx, _line in enumerate(vd_lines):
                line = _line.strip()
                if match(r"^Array", line):
                    cur_array = line
                if match(r"^logicaldrive", line) and cur_array is not None:
                    arrays.append(f"{cur_array} {line}")

            for array in arrays:
                if "OK" not in array:
                    health_delta += 50
                    messages.append(f"RAID HPSA {array}")

        if len(messages) < 1:
            messages.append("No valid RAID arrays found")

        return health_delta, messages

    def setup(self):
        """
        setup(): Perform special setup steps during node daemon startup

        This step is optional and should be used sparingly.

        If you wish for the plugin to not load in certain conditions, do any checks here
        and return a non-None failure message to indicate the error.
        """
        from daemon_lib.common import run_os_command
        from re import match, findall

        self.raid_type = list()

        _dellboss_ret, _dellboss_list, _ = run_os_command("mvcli info -o vd")
        if _dellboss_ret == 0:
            # If this returns 0 at all, there's a valid BOSS card to manage
            self.raid_type.append("dellboss")

        _megaraid_ret, _megaraid_list, _ = run_os_command("megacli -LDInfo -Lall -aALL")
        if _megaraid_ret == 0:
            vd_list = _megaraid_list.split('\n\n\n')
            for idx, _vd in enumerate(vd_list):
                vd = _vd.split('\n')
                if len(vd) >= 3 and "Virtual Drive Information" in vd[2]:
                    self.raid_type.append("megaraid")

        _hpraid_ret, _hpraid_list, _ = run_os_command("ssacli ctrl all show status")
        if _hpraid_ret == 0:
            for line in _hpraid_list.split('\n'):
                if match(r"^Smart", line):
                    controller_slots = findall("Slot ([0-9])", line)
                    if len(controller_slots) > 0:
                        self.raid_type.append("hpsa")
                        self.controller_slot = controller_slots[0]

        if len(self.raid_type) < 1:
            return "No hardware RAID management commands found"

    def run(self, coordinator_state=None):
        """
        run(): Perform the check actions and return a PluginResult object
        """
        health_delta = 0
        messages = list()

        raid_function_map = {
            "megaraid": self.check_megaraid,
            "hpsa": self.check_hpsa,
            "dellboss": self.check_dellboss,
        }

        for raid_type in self.raid_type:
            _health_delta, _messages = raid_function_map.get(raid_type)()
            health_delta += _health_delta
            messages += _messages

        # Set the health delta in our local PluginResult object
        self.plugin_result.set_health_delta(health_delta)

        # Set the message in our local PluginResult object
        self.plugin_result.set_message(', '.join(messages))

        # Return our local PluginResult object
        return self.plugin_result

    def cleanup(self):
        """
        cleanup(): Perform special cleanup steps during node daemon termination

        This step is optional and should be used sparingly.
        """
        pass

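For reference, the plugin API exercised above reduces to a small skeleton: a module-level PLUGIN_NAME matching the file name, plus the three methods. A minimal sketch grounded in the structure of the hwrd file itself; the plugin name "skeleton" and the no-op check body are hypothetical placeholders:

#!/usr/bin/env python3
# skeleton - a minimal illustrative PVC monitoring plugin (not part of this changeset)

# This import is always required; MonitoringPlugin provides self.plugin_result
from pvcnoded.objects.MonitoringInstance import MonitoringPlugin

# Must be identical to the plugin's file name
PLUGIN_NAME = "skeleton"


class MonitoringPluginScript(MonitoringPlugin):
    def setup(self):
        # Optional; return a non-None string here to prevent the plugin from loading
        return None

    def run(self, coordinator_state=None):
        # Perform the check; a health delta of 0 means fully healthy
        self.plugin_result.set_health_delta(0)
        self.plugin_result.set_message("everything nominal")
        return self.plugin_result

    def cleanup(self):
        # Optional teardown during node daemon termination
        pass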
View File

@@ -72,7 +72,7 @@ class MonitoringPluginScript(MonitoringPlugin):
         from psutil import cpu_count
 
         # Get the current 1-minute system load average
-        load_average = getloadavg()[0]
+        load_average = float(round(getloadavg()[0], 2))
 
         # Get the number of CPU cores
         cpu_cores = cpu_count()

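Together with the "Revert float conversion of cpu_cores" commit, this keeps the load plugin's output tidy: the load average is rounded to two decimals while the core count stays an integer. A standalone sketch of the pattern; the comparison against cpu_cores is an assumption about the rest of the plugin, which is not shown in this hunk:

from os import getloadavg
from psutil import cpu_count

# Rounded as in the diff above; e.g. a raw 2.12890625 becomes 2.13
load_average = float(round(getloadavg()[0], 2))
cpu_cores = cpu_count()  # stays an integer, no decimal core counts

# Assumed shape of the check: flag load above the core count
if load_average > float(cpu_cores):
    print(f"Load is {load_average} out of {cpu_cores} CPU cores")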
View File

@@ -111,7 +111,7 @@ class MonitoringPluginScript(MonitoringPlugin):
                 messages.append(f"Input power sensor {reading_sensor} reports {reading_text}")
             elif reading_text == "No Reading":
                 health_delta += 5
-                messages.append("Input power sensor {reading_sensor} reports {reading_text}, redundant power not configured")
+                messages.append(f"Input power sensor {reading_sensor} reports {reading_text} (PSU redundancy not configured?)")
             else:
                 health_delta += 10
                 messages.append(f"Input power sensor {reading_sensor} reports {reading_text}")

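The one-character nature of the fix above ("Fix bad fstring in psur check") is easy to miss: without the f prefix, Python treats the braces as literal text rather than interpolating the variables. A quick illustration with stand-in values:

reading_sensor = "PS Redundancy"
reading_text = "No Reading"

# Missing f prefix: the placeholders are emitted literally
print("Input power sensor {reading_sensor} reports {reading_text}")
# -> Input power sensor {reading_sensor} reports {reading_text}

# With the f prefix they interpolate as intended
print(f"Input power sensor {reading_sensor} reports {reading_text}")
# -> Input power sensor PS Redundancy reports No Reading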
View File

@@ -49,7 +49,7 @@ import re
 import json
 
 # Daemon version
-version = "0.9.73"
+version = "0.9.75"
 
 
 ##########################################################
@@ -324,9 +324,14 @@ def entrypoint():
             config["ipmi_hostname"], config["ipmi_username"], config["ipmi_password"]
         ):
             logger.out(
-                "Our IPMI is not reachable; fencing of this node will likely fail",
+                "Our IPMI interface is not reachable; fencing of this node will fail until corrected",
                 state="w",
             )
+        else:
+            logger.out(
+                "Our IPMI interface is reachable; fencing of this node is possible",
+                state="o",
+            )
 
     # Validate libvirt
     if not pvcnoded.util.libvirt.validate_libvirtd(logger, config):
@@ -1024,14 +1029,14 @@ def entrypoint():
         state="i",
     )
 
-    # Set up the node monitoring instance
+    # Set up the node monitoring instance and thread
     monitoring_instance = MonitoringInstance.MonitoringInstance(
         zkhandler, config, logger, this_node
     )
 
     # Start keepalived thread
     keepalive_timer = pvcnoded.util.keepalive.start_keepalive_timer(
-        logger, config, zkhandler, this_node, monitoring_instance
+        logger, config, zkhandler, this_node
     )
 
     # Tick loop; does nothing since everything is async

View File

@@ -335,8 +335,8 @@ class MonitoringInstance(object):
             )
         )
 
-        self.start_check_timer()
         self.run_plugins()
+        self.start_check_timer()
 
     def __del__(self):
        self.shutdown()

View File

@@ -153,7 +153,13 @@ def migrateFromFencedNode(zkhandler, node_name, config, logger):
 
     # Loop through the VMs
     for dom_uuid in dead_node_running_domains:
-        fence_migrate_vm(dom_uuid)
+        try:
+            fence_migrate_vm(dom_uuid)
+        except Exception as e:
+            logger.out(
+                f"Failed to migrate VM {dom_uuid}, continuing: {e}",
+                state="w",
+            )
 
     # Set node in flushed state for easy remigrating when it comes back
     zkhandler.write([(("node.state.domain", node_name), "flushed")])

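The change above ("Handle exceptions in fence migrations") is a plain continue-on-error loop: one VM failing to migrate no longer aborts recovery of the remaining VMs on the fenced node. A generic, self-contained sketch of the same pattern; the names here are illustrative, not PVC's:

def migrate_all(dom_uuids, migrate_one, log_warning):
    """Attempt every migration; collect failures instead of raising."""
    failures = []
    for dom_uuid in dom_uuids:
        try:
            migrate_one(dom_uuid)
        except Exception as e:  # deliberate catch-all during fence recovery
            log_warning(f"Failed to migrate VM {dom_uuid}, continuing: {e}")
            failures.append(dom_uuid)
    return failures

# Example: the second VM fails, but the third still migrates
def fake_migrate(uuid):
    if uuid == "vm-b":
        raise RuntimeError("no candidate node")

print(migrate_all(["vm-a", "vm-b", "vm-c"], fake_migrate, print))
# -> prints the warning for vm-b, then ['vm-b']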
View File

@@ -51,7 +51,7 @@ libvirt_vm_states = {
 }
 
 
-def start_keepalive_timer(logger, config, zkhandler, this_node, monitoring_instance):
+def start_keepalive_timer(logger, config, zkhandler, this_node):
     keepalive_interval = config["keepalive_interval"]
     logger.out(
         f"Starting keepalive timer ({keepalive_interval} second interval)", state="s"
@@ -59,7 +59,7 @@ def start_keepalive_timer(logger, config, zkhandler, this_node, monitoring_instance):
     keepalive_timer = BackgroundScheduler()
     keepalive_timer.add_job(
         node_keepalive,
-        args=(logger, config, zkhandler, this_node, monitoring_instance),
+        args=(logger, config, zkhandler, this_node),
         trigger="interval",
         seconds=keepalive_interval,
     )
@@ -674,7 +674,7 @@ def collect_vm_stats(logger, config, zkhandler, this_node, queue):
 
 
 # Keepalive update function
-def node_keepalive(logger, config, zkhandler, this_node, monitoring_instance):
+def node_keepalive(logger, config, zkhandler, this_node):
     debug = config["debug"]
 
     # Display node information to the terminal
@@ -685,18 +685,10 @@ def node_keepalive(logger, config, zkhandler, this_node, monitoring_instance):
         cst_colour = logger.fmt_blue
     else:
         cst_colour = logger.fmt_cyan
-    logger.out(
-        "{}{} keepalive @ {}{} [{}{}{}]".format(
-            logger.fmt_purple,
-            config["node_hostname"],
-            datetime.now(),
-            logger.fmt_end,
-            logger.fmt_bold + cst_colour,
-            this_node.coordinator_state,
-            logger.fmt_end,
-        ),
-        state="t",
-    )
+
+    active_coordinator_state = this_node.coordinator_state
+
+    runtime_start = datetime.now()
 
     # Set the migration selector in Zookeeper for clients to read
     if config["enable_hypervisor"]:
@@ -860,6 +852,23 @@ def node_keepalive(logger, config, zkhandler, this_node, monitoring_instance):
         logger.out("Failed to set keepalive data", state="e")
 
     if config["log_keepalives"]:
+        runtime_end = datetime.now()
+        runtime_delta = runtime_end - runtime_start
+        runtime = "{:0.02f}".format(runtime_delta.total_seconds())
+
+        logger.out(
+            "{start_colour}{hostname} keepalive @ {starttime}{nofmt} [{cst_colour}{costate}{nofmt}] in {runtime} seconds".format(
+                start_colour=logger.fmt_purple,
+                cst_colour=logger.fmt_bold + cst_colour,
+                nofmt=logger.fmt_end,
+                hostname=config["node_hostname"],
+                starttime=runtime_start,
+                costate=active_coordinator_state,
+                runtime=runtime,
+            ),
+            state="t",
+        )
+
         if this_node.maintenance is True:
             maintenance_colour = logger.fmt_blue
         else:
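
The net effect of the keepalive changes above ("Move printing to end and add runtime") is that the log line moves from the start of the run to the end, now carrying the measured runtime. A standalone sketch of the timing pattern, with a sleep standing in for the real keepalive body:

from datetime import datetime
from time import sleep

# Capture the start time, do the work, then report elapsed seconds at the end
runtime_start = datetime.now()

sleep(0.25)  # stand-in for the actual keepalive work

runtime_delta = datetime.now() - runtime_start
runtime = "{:0.02f}".format(runtime_delta.total_seconds())
print(f"keepalive @ {runtime_start} completed in {runtime} seconds")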