Compare commits

...

6 Commits

8 changed files with 206 additions and 333 deletions

View File

@@ -125,81 +125,56 @@ def format_info(cluster_information, oformat):
     return json.dumps(cluster_information, indent=4)

     # Plain formatting, i.e. human-readable
-    if cluster_information["health"] == "Optimal":
-        health_colour = ansiprint.green()
-    elif cluster_information["health"] == "Maintenance":
+    if cluster_information["maintenance"] == "True":
         health_colour = ansiprint.blue()
-    else:
+    elif cluster_information["health"] > 90:
+        health_colour = ansiprint.green()
+    elif cluster_information["health"] > 50:
         health_colour = ansiprint.yellow()
-    if cluster_information["storage_health"] == "Optimal":
-        storage_health_colour = ansiprint.green()
-    elif cluster_information["storage_health"] == "Maintenance":
-        storage_health_colour = ansiprint.blue()
     else:
-        storage_health_colour = ansiprint.yellow()
+        health_colour = ansiprint.red()

     ainformation = []

-    if oformat == "short":
-        ainformation.append(
-            "{}PVC cluster status:{}".format(ansiprint.bold(), ansiprint.end())
-        )
-        ainformation.append(
-            "{}Cluster health:{} {}{}{}".format(
-                ansiprint.purple(),
-                ansiprint.end(),
-                health_colour,
-                cluster_information["health"],
-                ansiprint.end(),
-            )
-        )
-        if cluster_information["health_msg"]:
-            for line in cluster_information["health_msg"]:
-                ainformation.append(" > {}".format(line))
-        ainformation.append(
-            "{}Storage health:{} {}{}{}".format(
-                ansiprint.purple(),
-                ansiprint.end(),
-                storage_health_colour,
-                cluster_information["storage_health"],
-                ansiprint.end(),
-            )
-        )
-        if cluster_information["storage_health_msg"]:
-            for line in cluster_information["storage_health_msg"]:
-                ainformation.append(" > {}".format(line))
-
-        return "\n".join(ainformation)
-
     ainformation.append(
         "{}PVC cluster status:{}".format(ansiprint.bold(), ansiprint.end())
     )
     ainformation.append("")
+
+    health_text = f"{cluster_information['health']}%"
+    if cluster_information["maintenance"] == "True":
+        health_text += " (maintenance on)"
+
     ainformation.append(
         "{}Cluster health:{} {}{}{}".format(
             ansiprint.purple(),
             ansiprint.end(),
             health_colour,
-            cluster_information["health"],
+            health_text,
             ansiprint.end(),
         )
     )
-    if cluster_information["health_msg"]:
-        for line in cluster_information["health_msg"]:
-            ainformation.append(" > {}".format(line))
-    ainformation.append(
-        "{}Storage health:{} {}{}{}".format(
-            ansiprint.purple(),
-            ansiprint.end(),
-            storage_health_colour,
-            cluster_information["storage_health"],
-            ansiprint.end(),
-        )
-    )
-    if cluster_information["storage_health_msg"]:
-        for line in cluster_information["storage_health_msg"]:
-            ainformation.append(" > {}".format(line))
+
+    if cluster_information["health_messages"]:
+        health_messages = "\n > ".join(
+            sorted(cluster_information["health_messages"])
+        )
+        ainformation.append(
+            "{}Health messages:{} > {}".format(
+                ansiprint.purple(),
+                ansiprint.end(),
+                health_messages,
+            )
+        )
+    else:
+        ainformation.append(
+            "{}Health messages:{} N/A".format(
+                ansiprint.purple(),
+                ansiprint.end(),
+            )
+        )
+
+    if oformat == "short":
+        return "\n".join(ainformation)
+
     ainformation.append("")
     ainformation.append(
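Note: the colour logic is the crux of this file's change. The CLI previously coloured a status string ("Optimal"/"Maintenance"/degraded), and now colours a numeric health percentage, with maintenance mode taking precedence. A minimal sketch of the new thresholds, with plain colour-name strings standing in for the real ansiprint escape codes and invented sample inputs:

```python
# Sketch of the new colour selection; colour names stand in for the actual
# ansiprint escape sequences, and the sample inputs are invented.
def pick_health_colour(health: int, maintenance: str) -> str:
    if maintenance == "True":  # maintenance mode always renders blue
        return "blue"
    elif health > 90:
        return "green"
    elif health > 50:
        return "yellow"
    else:
        return "red"

assert pick_health_colour(100, "False") == "green"
assert pick_health_colour(75, "False") == "yellow"
assert pick_health_colour(40, "True") == "blue"  # maintenance wins over red
```

The "short" output also now shares the health-message rendering with the full output instead of duplicating it, which accounts for most of the deletions in this hunk.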

View File

@@ -158,6 +158,19 @@ def get_status(zkhandler):
return True, status_data
def get_health(zkhandler):
primary_node = zkhandler.read("base.config.primary_node")
ceph_health = zkhandler.read("base.storage.health").rstrip()
# Create a data structure for the information
status_data = {
"type": "health",
"primary_node": primary_node,
"ceph_data": ceph_health,
}
return True, status_data
def get_util(zkhandler):
primary_node = zkhandler.read("base.config.primary_node")
ceph_df = zkhandler.read("base.storage.util").rstrip()
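The new get_health endpoint mirrors get_status and get_util, but returns the raw Ceph health JSON rather than the pretty status text. A hedged illustration of the structure a client receives; the keys come from the hunk above, the values are invented:

```python
# Hypothetical get_health() payload. "ceph_data" is the raw JSON string the
# node keepalive stores at base.storage.health (see the last file below).
example_health_payload = {
    "type": "health",
    "primary_node": "hv1",
    "ceph_data": '{"status": "HEALTH_OK", "checks": {}}',
}
```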

View File

@@ -19,7 +19,7 @@
#
###############################################################################
import re
from json import loads
import daemon_lib.common as common
import daemon_lib.vm as pvc_vm
@@ -44,13 +44,123 @@ def set_maintenance(zkhandler, maint_state):
return True, "Successfully set cluster in normal mode"
def getClusterHealth(zkhandler, node_list, vm_list, ceph_osd_list):
health_delta_map = {
"node_stopped": 50,
"node_flushed": 10,
"vm_stopped": 10,
"osd_out": 50,
"osd_down": 10,
"memory_overprovisioned": 50,
"ceph_err": 50,
"ceph_warn": 10,
}
# Generate total cluster health numbers
cluster_health = 100
messages = list()
for index, node in enumerate(node_list):
# Apply node health values to total health number
try:
node_health_int = int(node["health"])
except Exception:
node_health_int = 100
cluster_health -= 100 - node_health_int
for entry in node["health_details"]:
if entry["health_delta"] > 0:
messages.append(
f"{node['name']}: plugin '{entry['name']}': {entry['message']}"
)
# Handle unhealthy node states
if node["daemon_state"] not in ["run"]:
cluster_health -= health_delta_map["node_stopped"]
messages.append(
f"cluster: Node {node['name']} in {node['daemon_state'].upper()} daemon state"
)
elif node["domain_state"] not in ["ready"]:
cluster_health -= health_delta_map["node_flushed"]
messages.append(
f"cluster: Node {node['name']} in {node['domain_state'].upper()} domain state"
)
for index, vm in enumerate(vm_list):
# Handle unhealthy VM states
if vm["state"] not in ["start", "disable", "migrate", "unmigrate", "provision"]:
cluster_health -= health_delta_map["vm_stopped"]
messages.append(f"cluster: VM {vm['name']} in {vm['state'].upper()} state")
for index, ceph_osd in enumerate(ceph_osd_list):
in_texts = {1: "in", 0: "out"}
up_texts = {1: "up", 0: "down"}
# Handle unhealthy OSD states
if in_texts[ceph_osd["stats"]["in"]] not in ["in"]:
cluster_health -= health_delta_map["osd_out"]
messages.append(
f"cluster: Ceph OSD {ceph_osd['id']} in {in_texts[ceph_osd['stats']['in']].upper()} state"
)
elif up_texts[ceph_osd["stats"]["up"]] not in ["up"]:
cluster_health -= health_delta_map["osd_down"]
messages.append(
f"cluster: Ceph OSD {ceph_osd['id']} in {up_texts[ceph_osd['stats']['up']].upper()} state"
)
# Check for (n-1) overprovisioning
    # Assume X nodes. If the total VM memory allocation (counting only running VMs) is greater than
# the total memory of the (n-1) smallest nodes, trigger this warning.
n_minus_1_total = 0
alloc_total = 0
node_largest_index = None
node_largest_count = 0
for index, node in enumerate(node_list):
node_mem_total = node["memory"]["total"]
node_mem_alloc = node["memory"]["allocated"]
alloc_total += node_mem_alloc
# Determine if this node is the largest seen so far
if node_mem_total > node_largest_count:
node_largest_index = index
node_largest_count = node_mem_total
n_minus_1_node_list = list()
for index, node in enumerate(node_list):
if index == node_largest_index:
continue
n_minus_1_node_list.append(node)
for index, node in enumerate(n_minus_1_node_list):
n_minus_1_total += node["memory"]["total"]
if alloc_total > n_minus_1_total:
cluster_health -= health_delta_map["memory_overprovisioned"]
messages.append(
f"cluster: Total memory is OVERPROVISIONED ({alloc_total} > {n_minus_1_total} @ N-1)"
)
# Check Ceph cluster health
ceph_health = loads(zkhandler.read("base.storage.health"))
ceph_health_status = ceph_health["status"]
ceph_health_entries = ceph_health["checks"].keys()
ceph_health_status_map = {
"HEALTH_ERR": "ERROR",
"HEALTH_WARN": "WARNING",
}
for entry in ceph_health_entries:
messages.append(
f"cluster: Ceph {ceph_health_status_map[ceph_health['checks'][entry]['severity']]} {entry}: {ceph_health['checks'][entry]['summary']['message']}"
)
if ceph_health_status == "HEALTH_ERR":
cluster_health -= health_delta_map["ceph_err"]
elif ceph_health_status == "HEALTH_WARN":
cluster_health -= health_delta_map["ceph_warn"]
return cluster_health, messages
def getClusterInformation(zkhandler):
# Get cluster maintenance state
maint_state = zkhandler.read("base.config.maintenance")
# List of messages to display to the clients
cluster_health_msg = []
storage_health_msg = []
maintenance_state = zkhandler.read("base.config.maintenance")
# Get node information object list
retcode, node_list = pvc_node.get_list(zkhandler, None)
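The scoring model in getClusterHealth is worth restating: the cluster starts at 100 and every detected problem subtracts a fixed delta from health_delta_map, plus any per-node plugin health deficit. A worked example with invented inputs, assuming one node at 75% plugin health, one flushed node, and Ceph in HEALTH_WARN:

```python
# Worked example of the getClusterHealth arithmetic; the cluster state is
# hypothetical, the deltas are copied from health_delta_map above.
health_delta_map = {
    "node_stopped": 50,
    "node_flushed": 10,
    "vm_stopped": 10,
    "osd_out": 50,
    "osd_down": 10,
    "memory_overprovisioned": 50,
    "ceph_err": 50,
    "ceph_warn": 10,
}

cluster_health = 100
cluster_health -= 100 - 75                          # node plugin health 75%: -25
cluster_health -= health_delta_map["node_flushed"]  # one flushed node:       -10
cluster_health -= health_delta_map["ceph_warn"]     # Ceph HEALTH_WARN:       -10

print(cluster_health)  # 55
```

As written, nothing clamps the result, so a cluster with several serious faults can report a negative health value.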
@@ -78,135 +188,6 @@ def getClusterInformation(zkhandler):
ceph_volume_count = len(ceph_volume_list)
ceph_snapshot_count = len(ceph_snapshot_list)
# Determinations for general cluster health
cluster_healthy_status = True
# Check for (n-1) overprovisioning
    # Assume X nodes. If the total VM memory allocation (counting only running VMs) is greater than
# the total memory of the (n-1) smallest nodes, trigger this warning.
n_minus_1_total = 0
alloc_total = 0
node_largest_index = None
node_largest_count = 0
for index, node in enumerate(node_list):
node_mem_total = node["memory"]["total"]
node_mem_alloc = node["memory"]["allocated"]
alloc_total += node_mem_alloc
# Determine if this node is the largest seen so far
if node_mem_total > node_largest_count:
node_largest_index = index
node_largest_count = node_mem_total
n_minus_1_node_list = list()
for index, node in enumerate(node_list):
if index == node_largest_index:
continue
n_minus_1_node_list.append(node)
for index, node in enumerate(n_minus_1_node_list):
n_minus_1_total += node["memory"]["total"]
if alloc_total > n_minus_1_total:
cluster_healthy_status = False
cluster_health_msg.append(
"Total VM memory ({}) is overprovisioned (max {}) for (n-1) failure scenarios".format(
alloc_total, n_minus_1_total
)
)
# Determinations for node health
node_healthy_status = list(range(0, node_count))
node_report_status = list(range(0, node_count))
for index, node in enumerate(node_list):
daemon_state = node["daemon_state"]
domain_state = node["domain_state"]
if daemon_state != "run" and domain_state != "ready":
node_healthy_status[index] = False
cluster_health_msg.append(
"Node '{}' in {},{} state".format(
node["name"], daemon_state, domain_state
)
)
else:
node_healthy_status[index] = True
node_report_status[index] = daemon_state + "," + domain_state
# Determinations for VM health
vm_healthy_status = list(range(0, vm_count))
vm_report_status = list(range(0, vm_count))
for index, vm in enumerate(vm_list):
vm_state = vm["state"]
if vm_state not in ["start", "disable", "migrate", "unmigrate", "provision"]:
vm_healthy_status[index] = False
cluster_health_msg.append(
"VM '{}' in {} state".format(vm["name"], vm_state)
)
else:
vm_healthy_status[index] = True
vm_report_status[index] = vm_state
# Determinations for OSD health
ceph_osd_healthy_status = list(range(0, ceph_osd_count))
ceph_osd_report_status = list(range(0, ceph_osd_count))
for index, ceph_osd in enumerate(ceph_osd_list):
try:
ceph_osd_up = ceph_osd["stats"]["up"]
except KeyError:
ceph_osd_up = 0
try:
ceph_osd_in = ceph_osd["stats"]["in"]
except KeyError:
ceph_osd_in = 0
up_texts = {1: "up", 0: "down"}
in_texts = {1: "in", 0: "out"}
if not ceph_osd_up or not ceph_osd_in:
ceph_osd_healthy_status[index] = False
cluster_health_msg.append(
"OSD {} in {},{} state".format(
ceph_osd["id"], up_texts[ceph_osd_up], in_texts[ceph_osd_in]
)
)
else:
ceph_osd_healthy_status[index] = True
ceph_osd_report_status[index] = (
up_texts[ceph_osd_up] + "," + in_texts[ceph_osd_in]
)
# Find out the overall cluster health; if any element of a healthy_status is false, it's unhealthy
if maint_state == "true":
cluster_health = "Maintenance"
elif (
cluster_healthy_status is False
or False in node_healthy_status
or False in vm_healthy_status
or False in ceph_osd_healthy_status
):
cluster_health = "Degraded"
else:
cluster_health = "Optimal"
# Find out our storage health from Ceph
ceph_status = zkhandler.read("base.storage").split("\n")
ceph_health = ceph_status[2].split()[-1]
# Parse the status output to get the health indicators
line_record = False
for index, line in enumerate(ceph_status):
if re.search("services:", line):
line_record = False
if line_record and len(line.strip()) > 0:
storage_health_msg.append(line.strip())
if re.search("health:", line):
line_record = True
if maint_state == "true":
storage_health = "Maintenance"
elif ceph_health != "HEALTH_OK":
storage_health = "Degraded"
else:
storage_health = "Optimal"
# State lists
node_state_combinations = [
"run,ready",
@@ -237,13 +218,19 @@ def getClusterInformation(zkhandler):
"unmigrate",
"provision",
]
ceph_osd_state_combinations = ["up,in", "up,out", "down,in", "down,out"]
ceph_osd_state_combinations = [
"up,in",
"up,out",
"down,in",
"down,out",
]
# Format the Node states
formatted_node_states = {"total": node_count}
for state in node_state_combinations:
state_count = 0
for node_state in node_report_status:
for node in node_list:
node_state = f"{node['daemon_state']},{node['domain_state']}"
if node_state == state:
state_count += 1
if state_count > 0:
@@ -253,28 +240,35 @@ def getClusterInformation(zkhandler):
formatted_vm_states = {"total": vm_count}
for state in vm_state_combinations:
state_count = 0
for vm_state in vm_report_status:
if vm_state == state:
for vm in vm_list:
if vm["state"] == state:
state_count += 1
if state_count > 0:
formatted_vm_states[state] = state_count
# Format the OSD states
up_texts = {1: "up", 0: "down"}
in_texts = {1: "in", 0: "out"}
formatted_osd_states = {"total": ceph_osd_count}
for state in ceph_osd_state_combinations:
state_count = 0
for ceph_osd_state in ceph_osd_report_status:
for ceph_osd in ceph_osd_list:
ceph_osd_state = f"{up_texts[ceph_osd['stats']['up']]},{in_texts[ceph_osd['stats']['in']]}"
if ceph_osd_state == state:
state_count += 1
if state_count > 0:
formatted_osd_states[state] = state_count
# Get cluster health data
cluster_health, cluster_health_messages = getClusterHealth(
zkhandler, node_list, vm_list, ceph_osd_list
)
# Format the status data
cluster_information = {
"health": cluster_health,
"health_msg": cluster_health_msg,
"storage_health": storage_health,
"storage_health_msg": storage_health_msg,
"health_messages": cluster_health_messages,
"maintenance": maintenance_state,
"primary_node": common.getPrimaryNode(zkhandler),
"upstream_ip": zkhandler.read("base.config.upstream_ip"),
"nodes": formatted_node_states,

View File

@@ -1 +1 @@
{"version": "9", "root": "", "base": {"root": "", "schema": "/schema", "schema.version": "/schema/version", "config": "/config", "config.maintenance": "/config/maintenance", "config.primary_node": "/config/primary_node", "config.primary_node.sync_lock": "/config/primary_node/sync_lock", "config.upstream_ip": "/config/upstream_ip", "config.migration_target_selector": "/config/migration_target_selector", "cmd": "/cmd", "cmd.node": "/cmd/nodes", "cmd.domain": "/cmd/domains", "cmd.ceph": "/cmd/ceph", "logs": "/logs", "node": "/nodes", "domain": "/domains", "network": "/networks", "storage": "/ceph", "storage.util": "/ceph/util", "osd": "/ceph/osds", "pool": "/ceph/pools", "volume": "/ceph/volumes", "snapshot": "/ceph/snapshots"}, "logs": {"node": "", "messages": "/messages"}, "node": {"name": "", "keepalive": "/keepalive", "mode": "/daemonmode", "data.active_schema": "/activeschema", "data.latest_schema": "/latestschema", "data.static": "/staticdata", "data.pvc_version": "/pvcversion", "running_domains": "/runningdomains", "count.provisioned_domains": "/domainscount", "count.networks": "/networkscount", "state.daemon": "/daemonstate", "state.router": "/routerstate", "state.domain": "/domainstate", "cpu.load": "/cpuload", "vcpu.allocated": "/vcpualloc", "memory.total": "/memtotal", "memory.used": "/memused", "memory.free": "/memfree", "memory.allocated": "/memalloc", "memory.provisioned": "/memprov", "ipmi.hostname": "/ipmihostname", "ipmi.username": "/ipmiusername", "ipmi.password": "/ipmipassword", "sriov": "/sriov", "sriov.pf": "/sriov/pf", "sriov.vf": "/sriov/vf", "monitoring.plugins": "/monitoring_plugins", "monitoring.data": "/monitoring_data", "monitoring.health": "/monitoring_health"}, "monitoring_plugin": {"name": "", "last_run": "/last_run", "health_delta": "/health_delta", "message": "/message", "data": "/data", "runtime": "/runtime"}, "sriov_pf": {"phy": "", "mtu": "/mtu", "vfcount": "/vfcount"}, "sriov_vf": {"phy": "", "pf": "/pf", "mtu": "/mtu", "mac": "/mac", "phy_mac": "/phy_mac", "config": "/config", "config.vlan_id": "/config/vlan_id", "config.vlan_qos": "/config/vlan_qos", "config.tx_rate_min": "/config/tx_rate_min", "config.tx_rate_max": "/config/tx_rate_max", "config.spoof_check": "/config/spoof_check", "config.link_state": "/config/link_state", "config.trust": "/config/trust", "config.query_rss": "/config/query_rss", "pci": "/pci", "pci.domain": "/pci/domain", "pci.bus": "/pci/bus", "pci.slot": "/pci/slot", "pci.function": "/pci/function", "used": "/used", "used_by": "/used_by"}, "domain": {"name": "", "xml": "/xml", "state": "/state", "profile": "/profile", "stats": "/stats", "node": "/node", "last_node": "/lastnode", "failed_reason": "/failedreason", "storage.volumes": "/rbdlist", "console.log": "/consolelog", "console.vnc": "/vnc", "meta.autostart": "/node_autostart", "meta.migrate_method": "/migration_method", "meta.node_selector": "/node_selector", "meta.node_limit": "/node_limit", "meta.tags": "/tags", "migrate.sync_lock": "/migrate_sync_lock"}, "tag": {"name": "", "type": "/type", "protected": "/protected"}, "network": {"vni": "", "type": "/nettype", "mtu": "/mtu", "rule": "/firewall_rules", "rule.in": "/firewall_rules/in", "rule.out": "/firewall_rules/out", "nameservers": "/name_servers", "domain": "/domain", "reservation": "/dhcp4_reservations", "lease": "/dhcp4_leases", "ip4.gateway": "/ip4_gateway", "ip4.network": "/ip4_network", "ip4.dhcp": "/dhcp4_flag", "ip4.dhcp_start": "/dhcp4_start", "ip4.dhcp_end": "/dhcp4_end", "ip6.gateway": "/ip6_gateway", 
"ip6.network": "/ip6_network", "ip6.dhcp": "/dhcp6_flag"}, "reservation": {"mac": "", "ip": "/ipaddr", "hostname": "/hostname"}, "lease": {"mac": "", "ip": "/ipaddr", "hostname": "/hostname", "expiry": "/expiry", "client_id": "/clientid"}, "rule": {"description": "", "rule": "/rule", "order": "/order"}, "osd": {"id": "", "node": "/node", "device": "/device", "db_device": "/db_device", "fsid": "/fsid", "ofsid": "/fsid/osd", "cfsid": "/fsid/cluster", "lvm": "/lvm", "vg": "/lvm/vg", "lv": "/lvm/lv", "stats": "/stats"}, "pool": {"name": "", "pgs": "/pgs", "tier": "/tier", "stats": "/stats"}, "volume": {"name": "", "stats": "/stats"}, "snapshot": {"name": "", "stats": "/stats"}}
{"version": "9", "root": "", "base": {"root": "", "schema": "/schema", "schema.version": "/schema/version", "config": "/config", "config.maintenance": "/config/maintenance", "config.primary_node": "/config/primary_node", "config.primary_node.sync_lock": "/config/primary_node/sync_lock", "config.upstream_ip": "/config/upstream_ip", "config.migration_target_selector": "/config/migration_target_selector", "cmd": "/cmd", "cmd.node": "/cmd/nodes", "cmd.domain": "/cmd/domains", "cmd.ceph": "/cmd/ceph", "logs": "/logs", "node": "/nodes", "domain": "/domains", "network": "/networks", "storage": "/ceph", "storage.health": "/ceph/health", "storage.util": "/ceph/util", "osd": "/ceph/osds", "pool": "/ceph/pools", "volume": "/ceph/volumes", "snapshot": "/ceph/snapshots"}, "logs": {"node": "", "messages": "/messages"}, "node": {"name": "", "keepalive": "/keepalive", "mode": "/daemonmode", "data.active_schema": "/activeschema", "data.latest_schema": "/latestschema", "data.static": "/staticdata", "data.pvc_version": "/pvcversion", "running_domains": "/runningdomains", "count.provisioned_domains": "/domainscount", "count.networks": "/networkscount", "state.daemon": "/daemonstate", "state.router": "/routerstate", "state.domain": "/domainstate", "cpu.load": "/cpuload", "vcpu.allocated": "/vcpualloc", "memory.total": "/memtotal", "memory.used": "/memused", "memory.free": "/memfree", "memory.allocated": "/memalloc", "memory.provisioned": "/memprov", "ipmi.hostname": "/ipmihostname", "ipmi.username": "/ipmiusername", "ipmi.password": "/ipmipassword", "sriov": "/sriov", "sriov.pf": "/sriov/pf", "sriov.vf": "/sriov/vf", "monitoring.plugins": "/monitoring_plugins", "monitoring.data": "/monitoring_data", "monitoring.health": "/monitoring_health"}, "monitoring_plugin": {"name": "", "last_run": "/last_run", "health_delta": "/health_delta", "message": "/message", "data": "/data", "runtime": "/runtime"}, "sriov_pf": {"phy": "", "mtu": "/mtu", "vfcount": "/vfcount"}, "sriov_vf": {"phy": "", "pf": "/pf", "mtu": "/mtu", "mac": "/mac", "phy_mac": "/phy_mac", "config": "/config", "config.vlan_id": "/config/vlan_id", "config.vlan_qos": "/config/vlan_qos", "config.tx_rate_min": "/config/tx_rate_min", "config.tx_rate_max": "/config/tx_rate_max", "config.spoof_check": "/config/spoof_check", "config.link_state": "/config/link_state", "config.trust": "/config/trust", "config.query_rss": "/config/query_rss", "pci": "/pci", "pci.domain": "/pci/domain", "pci.bus": "/pci/bus", "pci.slot": "/pci/slot", "pci.function": "/pci/function", "used": "/used", "used_by": "/used_by"}, "domain": {"name": "", "xml": "/xml", "state": "/state", "profile": "/profile", "stats": "/stats", "node": "/node", "last_node": "/lastnode", "failed_reason": "/failedreason", "storage.volumes": "/rbdlist", "console.log": "/consolelog", "console.vnc": "/vnc", "meta.autostart": "/node_autostart", "meta.migrate_method": "/migration_method", "meta.node_selector": "/node_selector", "meta.node_limit": "/node_limit", "meta.tags": "/tags", "migrate.sync_lock": "/migrate_sync_lock"}, "tag": {"name": "", "type": "/type", "protected": "/protected"}, "network": {"vni": "", "type": "/nettype", "mtu": "/mtu", "rule": "/firewall_rules", "rule.in": "/firewall_rules/in", "rule.out": "/firewall_rules/out", "nameservers": "/name_servers", "domain": "/domain", "reservation": "/dhcp4_reservations", "lease": "/dhcp4_leases", "ip4.gateway": "/ip4_gateway", "ip4.network": "/ip4_network", "ip4.dhcp": "/dhcp4_flag", "ip4.dhcp_start": "/dhcp4_start", "ip4.dhcp_end": "/dhcp4_end", 
"ip6.gateway": "/ip6_gateway", "ip6.network": "/ip6_network", "ip6.dhcp": "/dhcp6_flag"}, "reservation": {"mac": "", "ip": "/ipaddr", "hostname": "/hostname"}, "lease": {"mac": "", "ip": "/ipaddr", "hostname": "/hostname", "expiry": "/expiry", "client_id": "/clientid"}, "rule": {"description": "", "rule": "/rule", "order": "/order"}, "osd": {"id": "", "node": "/node", "device": "/device", "db_device": "/db_device", "fsid": "/fsid", "ofsid": "/fsid/osd", "cfsid": "/fsid/cluster", "lvm": "/lvm", "vg": "/lvm/vg", "lv": "/lvm/lv", "stats": "/stats"}, "pool": {"name": "", "pgs": "/pgs", "tier": "/tier", "stats": "/stats"}, "volume": {"name": "", "stats": "/stats"}, "snapshot": {"name": "", "stats": "/stats"}}

View File

@@ -569,6 +569,7 @@ class ZKSchema(object):
"domain": f"{_schema_root}/domains",
"network": f"{_schema_root}/networks",
"storage": f"{_schema_root}/ceph",
"storage.health": f"{_schema_root}/ceph/health",
"storage.util": f"{_schema_root}/ceph/util",
"osd": f"{_schema_root}/ceph/osds",
"pool": f"{_schema_root}/ceph/pools",

View File

@@ -1,129 +0,0 @@
#!/usr/bin/env python3
# ceph.py - PVC Monitoring example plugin for Ceph status
# Part of the Parallel Virtual Cluster (PVC) system
#
# Copyright (C) 2018-2022 Joshua M. Boniface <joshua@boniface.me>
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, version 3.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.
#
###############################################################################
# This script provides an example of a PVC monitoring plugin script. It will create
# a simple plugin to check the Ceph cluster health for anomalies, and return a health
# delta reflective of the overall Ceph status (HEALTH_WARN = 10, HEALTH_ERR = 50).
# This script can thus be used as an example or reference implementation of a
# PVC monitoring plugin script and expanded upon as required.
# A monitoring plugin script must implement the class "MonitoringPluginScript" which
# extends "MonitoringPlugin", providing the 3 functions indicated. Detailed explanation
# of the role of each function is provided in context of the example; see the other
# examples for more potential uses.
# WARNING:
#
# This script will run in the context of the node daemon keepalives as root.
# DO NOT install untrusted, unvetted plugins under any circumstances.
# This import is always required here, as MonitoringPlugin is used by the
# MonitoringPluginScript class
from pvcnoded.objects.MonitoringInstance import MonitoringPlugin
# A monitoring plugin script must always expose its nice name, which must be identical to
# the file name
PLUGIN_NAME = "ceph"
# The MonitoringPluginScript class must be named as such, and extend MonitoringPlugin.
class MonitoringPluginScript(MonitoringPlugin):
def setup(self):
"""
setup(): Perform special setup steps during node daemon startup
This step is optional and should be used sparingly.
If you wish for the plugin to not load in certain conditions, do any checks here
and return a non-None failure message to indicate the error.
"""
pass
def run(self):
"""
run(): Perform the check actions and return a PluginResult object
"""
# Run any imports first
from rados import Rados
from json import loads, dumps
# Connect to the Ceph cluster
try:
ceph_conn = Rados(
conffile=self.config["ceph_config_file"],
conf=dict(keyring=self.config["ceph_admin_keyring"]),
)
ceph_conn.connect(timeout=1)
except Exception as e:
self.log(f"Failed to connect to Ceph cluster: {e}", state="e")
return self.plugin_result
# Get the Ceph cluster health
try:
health_status = loads(
ceph_conn.mon_command(dumps({"prefix": "health", "format": "json"}), b"", timeout=1)[1]
)
ceph_health = health_status["status"]
except Exception as e:
self.log(f"Failed to get health data from Ceph cluster: {e}", state="e")
return self.plugin_result
finally:
ceph_conn.shutdown()
# Get a list of error entries in the health status output
error_entries = health_status["checks"].keys()
# Set the health delta based on the errors presented
if ceph_health == "HEALTH_ERR":
health_delta = 50
message = f"Ceph cluster in ERROR state: {', '.join(error_entries)}"
elif ceph_health == "HEALTH_WARN":
health_delta = 10
message = f"Ceph cluster in WARNING state: {', '.join(error_entries)}"
else:
health_delta = 0
message = "Ceph cluster in OK state"
# Set the health delta in our local PluginResult object
self.plugin_result.set_health_delta(health_delta)
# Set the message in our local PluginResult object
self.plugin_result.set_message(message)
# Set the detailed data in our local PluginResult object
self.plugin_result.set_data(health_status)
# Return our local PluginResult object
return self.plugin_result
def cleanup(self):
"""
cleanup(): Perform special cleanup steps during node daemon termination
This step is optional and should be used sparingly.
"""
pass
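Although this example plugin is removed here, presumably because the cluster health calculation now consumes the Ceph health JSON stored in Zookeeper by the keepalive, the contract its comments describe still applies to any monitoring plugin. A minimal skeleton distilled from the file above; the plugin name and message are placeholders:

```python
# Minimal monitoring plugin skeleton following the contract described in the
# removed file's comments. PLUGIN_NAME and the message are placeholders;
# PLUGIN_NAME must match the plugin's file name.
from pvcnoded.objects.MonitoringInstance import MonitoringPlugin

PLUGIN_NAME = "example"


class MonitoringPluginScript(MonitoringPlugin):
    def setup(self):
        # Optional: return a non-None message here to refuse to load
        pass

    def run(self):
        # Perform the check, then populate and return the PluginResult
        self.plugin_result.set_health_delta(0)
        self.plugin_result.set_message("example check OK")
        return self.plugin_result

    def cleanup(self):
        # Optional: special teardown during node daemon termination
        pass
```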

View File

@@ -133,7 +133,7 @@ class MonitoringPluginScript(MonitoringPlugin):
self.plugin_result.set_health_delta(health_delta)
# Craft the message
message = f"Debian {debian_version}; Obsolete conffiles: {count_obsolete_conffiles}; Packages valid: {count_ok}, inconsistent: {count_inconsistent}, upgradable: {count_upgradable}"
message = f"Debian {debian_version}; Obsolete conffiles: {count_obsolete_conffiles}; Packages inconsistent: {count_inconsistent}, upgradable: {count_upgradable}"
# Set the message in our local PluginResult object
self.plugin_result.set_message(message)

View File

@@ -99,9 +99,10 @@ def collect_ceph_stats(logger, config, zkhandler, this_node, queue):
# Primary-only functions
if this_node.router_state == "primary":
# Get Ceph status information (pretty)
if debug:
logger.out(
"Set ceph health information in zookeeper (primary only)",
"Set Ceph status information in zookeeper (primary only)",
state="d",
prefix="ceph-thread",
)
@@ -115,9 +116,27 @@ def collect_ceph_stats(logger, config, zkhandler, this_node, queue):
except Exception as e:
logger.out("Failed to set Ceph status data: {}".format(e), state="e")
# Get Ceph health information (JSON)
if debug:
logger.out(
"Set ceph rados df information in zookeeper (primary only)",
"Set Ceph health information in zookeeper (primary only)",
state="d",
prefix="ceph-thread",
)
command = {"prefix": "health", "format": "json"}
ceph_health = ceph_conn.mon_command(json.dumps(command), b"", timeout=1)[
1
].decode("ascii")
try:
zkhandler.write([("base.storage.health", str(ceph_health))])
except Exception as e:
logger.out("Failed to set Ceph health data: {}".format(e), state="e")
# Get Ceph df information (pretty)
if debug:
logger.out(
"Set Ceph rados df information in zookeeper (primary only)",
state="d",
prefix="ceph-thread",
)
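This is the producer side of the new health path: the primary node's keepalive stores the raw "ceph health --format json" output at base.storage.health, and getClusterHealth in daemon_lib/cluster.py parses it back with loads(). A structurally faithful but invented sample of that JSON, exercising the fields the consumer actually touches:

```python
from json import loads

# Invented sample of the JSON the keepalive writes to base.storage.health;
# the check name and message are made up, but the structure matches what
# getClusterHealth reads (status, checks[].severity, checks[].summary.message).
sample = """
{
  "status": "HEALTH_WARN",
  "checks": {
    "OSDMAP_FLAGS": {
      "severity": "HEALTH_WARN",
      "summary": {"message": "noout flag(s) set"}
    }
  }
}
"""

ceph_health = loads(sample)
for entry in ceph_health["checks"]:
    print(entry, ceph_health["checks"][entry]["summary"]["message"])
# OSDMAP_FLAGS noout flag(s) set
```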