From 3bd93563e6cd57b129423e45f15fa16f4f0cb3d2 Mon Sep 17 00:00:00 2001 From: "Joshua M. Boniface" Date: Thu, 16 Feb 2023 13:06:35 -0500 Subject: [PATCH] Add CheckMK monitoring example plugins --- node-daemon/monitoring/README.md | 30 ++++++++- node-daemon/monitoring/checkmk/pvc | 6 ++ node-daemon/monitoring/checkmk/pvc.py | 95 +++++++++++++++++++++++++++ 3 files changed, 128 insertions(+), 3 deletions(-) create mode 100755 node-daemon/monitoring/checkmk/pvc create mode 100644 node-daemon/monitoring/checkmk/pvc.py diff --git a/node-daemon/monitoring/README.md b/node-daemon/monitoring/README.md index 5d786a79..3845c239 100644 --- a/node-daemon/monitoring/README.md +++ b/node-daemon/monitoring/README.md @@ -2,9 +2,9 @@ This directory contains several monitoring resources that can be used with various monitoring systems to track and alert on a PVC cluster system. -### Munin +## Munin -The included munin plugin can be activated by linking to it from `/etc/munin/plugins/pvc`. By default, this plugin triggers a CRITICAL state when either the PVC or Storage cluster becomes Degraded, and is otherwise OK. The overall health is graphed numerically (Optimal is 0, Maintenance is 1, Degraded is 2) so that the cluster health can be tracked over time. +The included Munin plugin can be activated by linking to it from `/etc/munin/plugins/pvc`. By default, this plugin triggers a CRITICAL state when either the PVC or Storage cluster becomes Degraded, and is otherwise OK. The overall health is graphed numerically (Optimal is 0, Maintenance is 1, Degraded is 2) so that the cluster health can be tracked over time. When using this plugin, it might be useful to adjust the thresholds with a plugin configuration. For instance, one could adjust the Degraded value from CRITICAL to WARNING by adjusting the critical threshold to a value higher than 1.99 (e.g. 3, 10, etc.) so that only the WARNING threshold will be hit. 
Alternatively one could instead make Maintenance mode trigger a WARNING by lowering the threshold to 0.99. @@ -21,4 +21,28 @@ env.pvc_storage_warning 0.99 env.pvc_storage_critical 1.99 ``` -### Check_MK +## CheckMK + +The included CheckMK plugin is divided into two parts, the agent plugin and the monitoring server plugin, which can be activated as follows: + +### Agent plugin: `pvc` + +Place this file in the `/usr/lib/check_mk_agent/plugins/` directory on each node. + +### Server plugin: `pvc.py` + +This monitoring server plugin requires CheckMK version 2.0 or higher. + +Place this file in the `~/local/lib/python3/cmk/base/plugins/agent_based/` directory for each monitoring site. + +### Output + +With both the agent and server plugins installed, you can then run `cmk -II <node>` (or use WATO) to inventory each node, which should produce two new checks: + +* `PVC Cluster`: Provides the cluster-wide health. Note that this will be identical for all nodes in the cluster (i.e. if the cluster health drops, all nodes in the cluster will alert this check). + +* `PVC Node <shortname>`: Provides the per-node health. + +The "Summary" text, shown in the check lists, will be simplistic, only showing the current health percentage. + +The "Details" text, found in the specific check details, will show the full list of problem(s) the check finds, as shown by `pvc status` itself. 
diff --git a/node-daemon/monitoring/checkmk/pvc b/node-daemon/monitoring/checkmk/pvc new file mode 100755 index 00000000..cdddd8fd --- /dev/null +++ b/node-daemon/monitoring/checkmk/pvc @@ -0,0 +1,6 @@ +#!/bin/bash + +# PVC cluster status check for Check_MK (agent-side) + +echo "<<<pvc>>>" +pvc --quiet status --format json diff --git a/node-daemon/monitoring/checkmk/pvc.py b/node-daemon/monitoring/checkmk/pvc.py new file mode 100644 index 00000000..d93e6135 --- /dev/null +++ b/node-daemon/monitoring/checkmk/pvc.py @@ -0,0 +1,95 @@ +#!/usr/bin/env python3 +# +# Check_MK PVC plugin +# +# Copyright 2017-2021, Joshua Boniface +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see <https://www.gnu.org/licenses/>. 
# The wildcard import supplies Metric, Result, Service, State and register,
# which are the only names used below that are not imported explicitly.
from .agent_based_api.v1 import *
from cmk.base.check_api import host_name
from time import time
from json import loads


# Health percentages at or below these values raise the service state,
# unless the cluster is in maintenance mode.
_CRIT_HEALTH_THRESHOLD = 50
_WARN_HEALTH_THRESHOLD = 90

# Item name of the cluster-wide service; the per-node item is built from
# the short host name (see _node_item).
_CLUSTER_ITEM = "PVC Cluster"


def _local_node_name():
    """Return the short (first dot-separated label) name from host_name()."""
    return host_name().split(".")[0]


def _node_item(node):
    """Return the service item name used for the per-node check of *node*."""
    return f"PVC Node {node}"


def _maintenance_label(raw):
    """Map the raw "maintenance" value to "on" or "off".

    Accepts the strings "true"/"false" as well as JSON booleans. Anything
    that is not an explicit true is treated as "off" so that alerting is
    never suppressed by an unexpected value.
    """
    return "on" if str(raw).lower() == "true" else "off"


def _health_state(health, maintenance):
    """Translate a health percentage into a monitoring state.

    While maintenance is "on" the state is always OK; otherwise the CRIT
    and WARN thresholds apply.
    """
    if maintenance != "off":
        return State.OK
    if health <= _CRIT_HEALTH_THRESHOLD:
        return State.CRIT
    if health <= _WARN_HEALTH_THRESHOLD:
        return State.WARN
    return State.OK


def discover_pvc(section):
    """Yield the two services of this plugin: the local node and the cluster."""
    yield Service(item=_node_item(_local_node_name()))
    yield Service(item=_CLUSTER_ITEM)


def check_pvc(item, params, section):
    """Evaluate the health of the local PVC node or of the whole cluster.

    Parameters:
        item: Service item, either "PVC Node <short host name>" or
            "PVC Cluster".
        params: Check parameters; currently unused.
        section: Agent section data. Its first entry is joined with spaces
            and parsed as one JSON document (presumably the
            whitespace-split words of the first agent output line).

    Yields:
        A Metric ("node-health" or "cluster-health") followed by a Result
        whose summary shows the health percentage and whose details list
        the reported messages. If the data is missing or unparsable, or
        the item is not one of the two known services, a single UNKNOWN
        Result is yielded instead of a misleading OK.
    """
    # No agent output at all would otherwise surface as an IndexError.
    if not section or not section[0]:
        yield Result(
            state=State.UNKNOWN,
            summary="No PVC status data received from the agent",
        )
        return

    try:
        data = loads(" ".join(section[0]))
    except ValueError as err:  # JSONDecodeError is a ValueError subclass
        yield Result(
            state=State.UNKNOWN,
            summary=f"Unable to parse PVC status data: {err}",
        )
        return

    if not isinstance(data, dict):
        yield Result(
            state=State.UNKNOWN,
            summary="PVC status data is not a JSON object",
        )
        return

    my_node = _local_node_name()
    maintenance = _maintenance_label(data.get("maintenance"))

    if item == _node_item(my_node):
        subject = "Node"
        metric_name = "node-health"
        node_reports = data.get("node_health")
        report = node_reports.get(my_node) if isinstance(node_reports, dict) else None
    elif item == _CLUSTER_ITEM:
        subject = "Cluster"
        metric_name = "cluster-health"
        report = data.get("cluster_health")
    else:
        # A stale service (e.g. after a host rename) must not stay green.
        yield Result(
            state=State.UNKNOWN,
            summary=f"Unknown PVC check item: {item}",
        )
        return

    if not isinstance(report, dict) or "health" not in report:
        yield Result(
            state=State.UNKNOWN,
            summary=f"{subject} health is missing from the PVC status data",
        )
        return

    health = report["health"]
    messages = report.get("messages") or []

    summary = f"{subject} health is {health}% (maintenance {maintenance})"
    # Details stay None when there is nothing to report.
    details = ", ".join(str(message) for message in messages) if messages else None

    yield Metric(name=metric_name, value=health)
    yield Result(
        state=_health_state(health, maintenance),
        summary=summary,
        details=details,
    )


register.check_plugin(
    name="pvc",
    service_name="%s",
    check_ruleset_name="pvc",
    discovery_function=discover_pvc,
    check_function=check_pvc,
    check_default_parameters={},
)