2019-10-22 11:23:12 -04:00
|
|
|
#!/usr/bin/env python3
|
|
|
|
|
|
|
|
# cluster.py - PVC client function library, cluster management
|
|
|
|
# Part of the Parallel Virtual Cluster (PVC) system
|
|
|
|
#
|
2022-10-06 11:55:27 -04:00
|
|
|
# Copyright (C) 2018-2022 Joshua M. Boniface <joshua@boniface.me>
|
2019-10-22 11:23:12 -04:00
|
|
|
#
|
|
|
|
# This program is free software: you can redistribute it and/or modify
|
|
|
|
# it under the terms of the GNU General Public License as published by
|
2021-03-25 16:57:17 -04:00
|
|
|
# the Free Software Foundation, version 3.
|
2019-10-22 11:23:12 -04:00
|
|
|
#
|
|
|
|
# This program is distributed in the hope that it will be useful,
|
|
|
|
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
|
|
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
|
|
# GNU General Public License for more details.
|
|
|
|
#
|
|
|
|
# You should have received a copy of the GNU General Public License
|
|
|
|
# along with this program. If not, see <https://www.gnu.org/licenses/>.
|
|
|
|
#
|
|
|
|
###############################################################################
|
|
|
|
|
2023-12-11 19:20:30 -05:00
|
|
|
from distutils.util import strtobool
|
2023-02-15 15:45:43 -05:00
|
|
|
from json import loads
|
2019-10-22 11:23:12 -04:00
|
|
|
|
2020-02-08 18:48:59 -05:00
|
|
|
import daemon_lib.common as common
|
2023-12-04 15:48:49 -05:00
|
|
|
import daemon_lib.faults as faults
|
2020-02-08 18:48:59 -05:00
|
|
|
import daemon_lib.node as pvc_node
|
2019-10-22 11:23:12 -04:00
|
|
|
|
2020-11-07 14:45:24 -05:00
|
|
|
|
2021-05-29 20:32:20 -04:00
|
|
|
def set_maintenance(zkhandler, maint_state):
    """
    Switch the cluster into or out of maintenance mode.

    The requested state is compared against the value currently stored under
    "base.config.maintenance"; when they already match nothing is written.
    Otherwise "true" is written when maint_state is "true", and "false" for any
    other value.

    Returns a (success, message) tuple; success is always True.
    """
    wants_maintenance = maint_state == "true"

    # Nothing to do when the stored state already matches the request
    stored_state = zkhandler.read("base.config.maintenance")
    if maint_state == stored_state:
        return True, (
            "Cluster is already in maintenance mode"
            if wants_maintenance
            else "Cluster is already in normal mode"
        )

    if not wants_maintenance:
        zkhandler.write([("base.config.maintenance", "false")])
        return True, "Successfully set cluster in normal mode"

    zkhandler.write([("base.config.maintenance", "true")])
    return True, "Successfully set cluster in maintenance mode"
|
2020-01-09 10:53:27 -05:00
|
|
|
|
2020-11-07 14:45:24 -05:00
|
|
|
|
2023-12-10 17:24:21 -05:00
|
|
|
def getClusterHealthFromFaults(zkhandler, faults_list):
    """
    Derive the overall cluster health from a list of fault entries.

    Faults whose "status" is "ack" are ignored. Each remaining fault subtracts
    its "health_delta" from a starting health of 100 and contributes one
    message entry (id, health_delta, text). Messages are ordered by descending
    (health_delta, last_reported), and the final health never drops below 0.

    The zkhandler argument is not used by this function.

    Returns a dictionary with the keys "health" and "messages".
    """
    # Only unacknowledged faults count, most severe and most recent first
    active_faults = sorted(
        (entry for entry in faults_list if entry["status"] != "ack"),
        key=lambda entry: (entry["health_delta"], entry["last_reported"]),
        reverse=True,
    )

    health = 100
    messages = []
    for entry in active_faults:
        health -= entry["health_delta"]
        messages.append(
            {
                "id": entry["id"],
                "health_delta": entry["health_delta"],
                "text": entry["message"],
            }
        )

    # Health is a percentage, so clamp it at zero
    return {
        "health": max(health, 0),
        "messages": messages,
    }
|
|
|
|
|
|
|
|
|
2023-02-15 15:45:43 -05:00
|
|
|
def getClusterHealth(zkhandler, node_list, vm_list, ceph_osd_list):
    """
    Compute the overall cluster health from node, VM and Ceph OSD details.

    Starting from 100, the health value is reduced for:
      * each node's own health deficit (100 minus its "health"; a value that
        cannot be converted to an integer counts as 100)
      * nodes whose daemon state is not "run", or else whose domain state is
        not "ready"
      * VMs in the "stop" or "fail" state
      * OSDs that are out, or else down
      * OSDs at or above 90% (full) or 85% (nearfull) utilization
      * total allocated memory exceeding the total memory of all nodes except
        the largest one (N-1)
      * a Ceph status of HEALTH_ERR or HEALTH_WARN, read as JSON from the
        "base.storage.health" key

    Each problem found also adds a human-readable message, and the final
    health never drops below 0.

    Returns a dictionary with the keys "health" and "messages".
    """
    # Health penalty applied for each kind of problem
    penalties = {
        "node_stopped": 50,
        "node_flushed": 10,
        "vm_stopped": 10,
        "osd_out": 50,
        "osd_down": 10,
        "osd_full": 50,
        "osd_nearfull": 10,
        "memory_overprovisioned": 50,
        "ceph_err": 50,
        "ceph_warn": 10,
    }

    health = 100
    messages = []

    for node in node_list:
        # Apply the node's own health deficit to the total
        try:
            node_health = int(node["health"])
        except Exception:
            node_health = 100
        health -= 100 - node_health

        # Report every plugin that is lowering this node's health
        for entry in node["health_details"]:
            if entry["health_delta"] > 0:
                messages.append(
                    f"{node['name']}: plugin '{entry['name']}': {entry['message']}"
                )

        # Handle unhealthy node states; the domain state is only considered
        # when the daemon itself is running
        if node["daemon_state"] != "run":
            health -= penalties["node_stopped"]
            messages.append(
                f"cluster: Node {node['name']} in {node['daemon_state'].upper()} daemon state"
            )
        elif node["domain_state"] != "ready":
            health -= penalties["node_flushed"]
            messages.append(
                f"cluster: Node {node['name']} in {node['domain_state'].upper()} domain state"
            )

    # Handle unhealthy VM states
    for vm in vm_list:
        if vm["state"] in ("stop", "fail"):
            health -= penalties["vm_stopped"]
            messages.append(f"cluster: VM {vm['name']} in {vm['state'].upper()} state")

    in_texts = {1: "in", 0: "out"}
    up_texts = {1: "up", 0: "down"}
    for ceph_osd in ceph_osd_list:
        osd_stats = ceph_osd["stats"]

        # Handle unhealthy OSD states; an out OSD is not also reported as down
        if in_texts[osd_stats["in"]] != "in":
            health -= penalties["osd_out"]
            messages.append(
                f"cluster: Ceph OSD {ceph_osd['id']} in {in_texts[ceph_osd['stats']['in']].upper()} state"
            )
        elif up_texts[osd_stats["up"]] != "up":
            health -= penalties["osd_down"]
            messages.append(
                f"cluster: Ceph OSD {ceph_osd['id']} in {up_texts[ceph_osd['stats']['up']].upper()} state"
            )

        # Handle full (>=90%) or nearfull (>=85%) OSDs
        if osd_stats["utilization"] >= 90:
            health -= penalties["osd_full"]
            messages.append(
                f"cluster: Ceph OSD {ceph_osd['id']} is FULL ({ceph_osd['stats']['utilization']:.1f}% > 90%)"
            )
        elif osd_stats["utilization"] >= 85:
            health -= penalties["osd_nearfull"]
            messages.append(
                f"cluster: Ceph OSD {ceph_osd['id']} is NEARFULL ({ceph_osd['stats']['utilization']:.1f}% > 85%)"
            )

    # Check for (n-1) overprovisioning: sum the allocated memory of all nodes
    # and find the first node with the largest total memory
    alloc_total = 0
    largest_index = None
    largest_total = 0
    for position, node in enumerate(node_list):
        node_mem_total = node["memory"]["total"]
        alloc_total += node["memory"]["allocated"]
        if node_mem_total > largest_total:
            largest_index = position
            largest_total = node_mem_total

    # Total memory of every node except the largest one
    n_minus_1_total = sum(
        node["memory"]["total"]
        for position, node in enumerate(node_list)
        if position != largest_index
    )

    if alloc_total > n_minus_1_total:
        health -= penalties["memory_overprovisioned"]
        messages.append(
            f"cluster: Total memory is OVERPROVISIONED ({alloc_total} > {n_minus_1_total} @ N-1)"
        )

    # Check Ceph cluster health
    ceph_health = loads(zkhandler.read("base.storage.health"))
    ceph_status = ceph_health["status"]
    ceph_checks = ceph_health["checks"]

    severity_labels = {
        "HEALTH_ERR": "ERROR",
        "HEALTH_WARN": "WARNING",
    }
    for check_name, check in ceph_checks.items():
        messages.append(
            f"cluster: Ceph {severity_labels[check['severity']]} {check_name}: {check['summary']['message']}"
        )

    if ceph_status == "HEALTH_ERR":
        health -= penalties["ceph_err"]
    elif ceph_status == "HEALTH_WARN":
        health -= penalties["ceph_warn"]

    # Health is a percentage, so clamp it at zero
    return {
        "health": max(health, 0),
        "messages": messages,
    }
|
2019-10-22 13:59:28 -04:00
|
|
|
|
|
|
|
|
2023-02-15 16:42:42 -05:00
|
|
|
def getNodeHealth(zkhandler, node_list):
    """
    Collect the health value and active health messages of every node.

    For each node name in node_list, the "node.monitoring.health" and
    "node.monitoring.plugins" keys are fetched in one batched read. The plugin
    names (whitespace-separated) are passed to pvc_node.getNodeHealthDetails,
    and every returned entry with a positive "health_delta" contributes a
    message.

    The health value is converted to an integer whenever it is an integer or
    a string holding one. Any other value (e.g. a non-numeric placeholder) is
    passed through unchanged.

    Returns a dictionary mapping each node name to a dictionary with the keys
    "health" and "messages".
    """
    # Get the health state of all nodes in a single batched read
    node_health_reads = []
    for node in node_list:
        node_health_reads += [
            ("node.monitoring.health", node),
            ("node.monitoring.plugins", node),
        ]
    all_node_health_details = zkhandler.read_many(node_health_reads)

    # Parse out the Node health details
    node_health = {}
    for nidx, node in enumerate(node_list):
        # Split the large list of return values by the IDX of this node
        # Each node result is 2 fields long
        pos_start = nidx * 2
        pos_end = pos_start + 2
        node_health_value, node_health_plugins = tuple(
            all_node_health_details[pos_start:pos_end]
        )
        node_health_details = pvc_node.getNodeHealthDetails(
            zkhandler, node, node_health_plugins.split()
        )

        # Only plugins that are actually lowering the health are reported
        node_health_messages = [
            f"'{entry['name']}': {entry['message']}"
            for entry in node_health_details
            if entry["health_delta"] > 0
        ]

        # The original isinstance(..., int) check never converted string
        # values, leaving numeric health as text; convert it when possible
        # and keep anything non-numeric as-is.
        health = node_health_value
        if isinstance(node_health_value, (int, str)):
            try:
                health = int(node_health_value)
            except ValueError:
                health = node_health_value

        node_health[node] = {
            "health": health,
            "messages": node_health_messages,
        }

    return node_health
|
|
|
|
|
|
|
|
|
2023-02-15 15:45:43 -05:00
|
|
|
def getClusterInformation(zkhandler):
    """
    Collect a summary of the whole cluster state.

    Reads the maintenance state, primary node, PVC version and upstream IP,
    and counts nodes, VMs, OSDs, networks, pools, volumes and snapshots. For
    nodes, VMs and OSDs a per-state count is also built; only states listed
    in the matching common.*_state_combinations collection are counted. The
    list of faults is fetched and used to derive the cluster health.

    Returns a dictionary with the summary values plus a "detail" section
    holding the per-node, per-VM, per-OSD and fault data.
    """
    # Get the cluster maintenance state and the primary node in one batched read
    maintenance_state, primary_node = zkhandler.read_many(
        [
            "base.config.maintenance",
            "base.config.primary_node",
        ]
    )

    # Get PVC version of primary node
    pvc_version = zkhandler.read(("node.data.pvc_version", primary_node))

    # Get the list of Nodes
    node_list = zkhandler.children("base.node")
    node_count = len(node_list)

    # Get the daemon and domain states of all Nodes
    node_state_reads = []
    for node in node_list:
        node_state_reads += [
            ("node.state.daemon", node),
            ("node.state.domain", node),
        ]
    all_node_states = zkhandler.read_many(node_state_reads)

    # Parse out the Node states
    node_data = []
    formatted_node_states = {"total": node_count}
    for nidx, node in enumerate(node_list):
        # Split the large list of return values by the IDX of this node
        # Each node result is 2 fields long
        pos_start = nidx * 2
        pos_end = pos_start + 2
        node_daemon_state, node_domain_state = tuple(all_node_states[pos_start:pos_end])
        node_data.append(
            {
                "name": node,
                "daemon_state": node_daemon_state,
                "domain_state": node_domain_state,
            }
        )

        # Add to the count for this node's state
        node_state = f"{node_daemon_state},{node_domain_state}"
        if node_state in common.node_state_combinations:
            formatted_node_states[node_state] = (
                formatted_node_states.get(node_state, 0) + 1
            )

    # Get the list of VMs
    vm_list = zkhandler.children("base.domain")
    vm_count = len(vm_list)

    # Get the names and states of all VMs
    vm_state_reads = []
    for vm in vm_list:
        vm_state_reads += [
            ("domain", vm),
            ("domain.state", vm),
        ]
    all_vm_states = zkhandler.read_many(vm_state_reads)

    # Parse out the VM states
    vm_data = []
    formatted_vm_states = {"total": vm_count}
    for vidx, vm in enumerate(vm_list):
        # Split the large list of return values by the IDX of this VM
        # Each VM result is 2 fields long
        pos_start = vidx * 2
        pos_end = pos_start + 2
        vm_name, vm_state = tuple(all_vm_states[pos_start:pos_end])
        vm_data.append(
            {
                "uuid": vm,
                "name": vm_name,
                "state": vm_state,
            }
        )

        # Add to the count for this VM's state
        if vm_state in common.vm_state_combinations:
            formatted_vm_states[vm_state] = formatted_vm_states.get(vm_state, 0) + 1

    # Get the list of Ceph OSDs
    ceph_osd_list = zkhandler.children("base.osd")
    ceph_osd_count = len(ceph_osd_list)

    # Get the states of all OSDs ("stat" is not a typo since we're reading stats; states are in
    # the stats JSON object)
    osd_stat_reads = [("osd.stats", osd) for osd in ceph_osd_list]
    all_osd_stats = zkhandler.read_many(osd_stat_reads)

    # Parse out the OSD states
    osd_data = []
    formatted_osd_states = {"total": ceph_osd_count}
    up_texts = {1: "up", 0: "down"}
    in_texts = {1: "in", 0: "out"}
    for oidx, osd in enumerate(ceph_osd_list):
        # Each OSD result is 1 field long, so just use the IDX; the field is a
        # JSON object which holds the up/in states
        osd_stats = loads(all_osd_stats[oidx])
        osd_up = up_texts[osd_stats["up"]]
        osd_in = in_texts[osd_stats["in"]]
        osd_data.append(
            {
                "id": osd,
                "up": osd_up,
                "in": osd_in,
            }
        )

        # Add to the count for this OSD's state
        osd_state = f"{osd_up},{osd_in}"
        if osd_state in common.ceph_osd_state_combinations:
            formatted_osd_states[osd_state] = (
                formatted_osd_states.get(osd_state, 0) + 1
            )

    # Get the list of Networks
    network_list = zkhandler.children("base.network")
    network_count = len(network_list)

    # Get the list of Ceph pools
    ceph_pool_list = zkhandler.children("base.pool")
    ceph_pool_count = len(ceph_pool_list)

    # Get the list of Ceph volumes, named "<pool>/<volume>"
    ceph_volume_list = []
    for pool in ceph_pool_list:
        ceph_volume_list_pool = zkhandler.children(("volume", pool))
        if ceph_volume_list_pool is not None:
            ceph_volume_list += [f"{pool}/{volume}" for volume in ceph_volume_list_pool]
    ceph_volume_count = len(ceph_volume_list)

    # Get the list of Ceph snapshots, named "<pool>/<volume>@<snapshot>"
    ceph_snapshot_list = []
    for volume in ceph_volume_list:
        ceph_snapshot_list_volume = zkhandler.children(("snapshot", volume))
        if ceph_snapshot_list_volume is not None:
            ceph_snapshot_list += [
                f"{volume}@{snapshot}" for snapshot in ceph_snapshot_list_volume
            ]
    ceph_snapshot_count = len(ceph_snapshot_list)

    # Get the list of faults
    faults_data = faults.getAllFaults(zkhandler)

    # Format the status data
    cluster_information = {
        "cluster_health": getClusterHealthFromFaults(zkhandler, faults_data),
        "node_health": getNodeHealth(zkhandler, node_list),
        "maintenance": maintenance_state,
        "primary_node": primary_node,
        "pvc_version": pvc_version,
        "upstream_ip": zkhandler.read("base.config.upstream_ip"),
        "nodes": formatted_node_states,
        "vms": formatted_vm_states,
        "networks": network_count,
        "osds": formatted_osd_states,
        "pools": ceph_pool_count,
        "volumes": ceph_volume_count,
        "snapshots": ceph_snapshot_count,
        "detail": {
            "node": node_data,
            "vm": vm_data,
            "osd": osd_data,
            "faults": faults_data,
        },
    }

    return cluster_information
|
|
|
|
|
2020-11-07 14:45:24 -05:00
|
|
|
|
2021-05-29 20:32:20 -04:00
|
|
|
def get_info(zkhandler):
    """
    Thin wrapper around getClusterInformation, kept for naming purposes.

    Returns (True, cluster_information) when the information is non-empty,
    or (False, error message) otherwise.
    """
    cluster_information = getClusterInformation(zkhandler)
    if not cluster_information:
        return False, "ERROR: Failed to obtain cluster information!"

    return True, cluster_information
|
2021-06-13 14:22:26 -04:00
|
|
|
|
|
|
|
|
2023-12-11 19:20:30 -05:00
|
|
|
def get_metrics(zkhandler):
    """
    Render the cluster status as a block of metric lines.

    The status is obtained through get_info. For every metric a "# HELP" and
    a "# TYPE" line are written, followed by the sample line(s). State-count
    metrics start every known state (from the common.*_state_combinations
    collections) at zero and then count the actual nodes, VMs and OSDs.

    Returns (True, text) with the newline-joined output terminated by a
    trailing newline, or (False, error message) when get_info fails.
    """
    # Get general cluster information
    retflag, status_data = get_info(zkhandler)
    if not retflag:
        return False, "Error: Status data threw error"

    detail = status_data["detail"]
    faults_data = detail["faults"]
    node_data = detail["node"]
    vm_data = detail["vm"]
    osd_data = detail["osd"]

    lines = []

    lines += [
        "# HELP pvc_info PVC cluster information",
        "# TYPE pvc_info gauge",
        f"pvc_info{{primary_node=\"{status_data['primary_node']}\", version=\"{status_data['pvc_version']}\", upstream_ip=\"{status_data['upstream_ip']}\"}} 1",
    ]

    lines += [
        "# HELP pvc_cluster_maintenance PVC cluster maintenance state",
        "# TYPE pvc_cluster_maintenance gauge",
        f"pvc_cluster_maintenance {1 if bool(strtobool(status_data['maintenance'])) else 0}",
    ]

    lines += [
        "# HELP pvc_cluster_health PVC cluster health status",
        "# TYPE pvc_cluster_health gauge",
        f"pvc_cluster_health {status_data['cluster_health']['health']}",
    ]

    # Count faults per status, starting every known status at zero
    lines += [
        "# HELP pvc_cluster_faults PVC cluster new faults",
        "# TYPE pvc_cluster_faults gauge",
    ]
    fault_counts = dict.fromkeys(common.fault_state_combinations, 0)
    for fault in faults_data:
        fault_counts[fault["status"]] += 1
    for fault_type, count in fault_counts.items():
        lines.append(f'pvc_cluster_faults{{status="{fault_type}"}} {count}')

    # output_lines.append("# HELP pvc_cluster_faults PVC cluster health faults")
    # output_lines.append("# TYPE pvc_cluster_faults gauge")
    # for fault_msg in status_data["cluster_health"]["messages"]:
    #     output_lines.append(
    #         f"pvc_cluster_faults{{id=\"{fault_msg['id']}\", message=\"{fault_msg['text']}\"}} {fault_msg['health_delta']}"
    #     )

    # Only integer node health values are reported
    lines += [
        "# HELP pvc_node_health PVC cluster node health status",
        "# TYPE pvc_node_health gauge",
    ]
    for node, health_entry in status_data["node_health"].items():
        if isinstance(health_entry["health"], int):
            lines.append(f"pvc_node_health{{node=\"{node}\"}} {health_entry['health']}")

    # Node daemon states are the first half of each "daemon,domain" combination
    lines += [
        "# HELP pvc_node_daemon_states PVC Node daemon state counts",
        "# TYPE pvc_node_daemon_states gauge",
    ]
    daemon_counts = dict.fromkeys(
        {combo.split(",")[0] for combo in common.node_state_combinations}, 0
    )
    for node in node_data:
        daemon_counts[node["daemon_state"]] += 1
    for state, count in daemon_counts.items():
        lines.append(f'pvc_node_daemon_states{{state="{state}"}} {count}')

    # Node domain states are the second half of each combination
    lines += [
        "# HELP pvc_node_domain_states PVC Node domain state counts",
        "# TYPE pvc_node_domain_states gauge",
    ]
    domain_counts = dict.fromkeys(
        {combo.split(",")[1] for combo in common.node_state_combinations}, 0
    )
    for node in node_data:
        domain_counts[node["domain_state"]] += 1
    for state, count in domain_counts.items():
        lines.append(f'pvc_node_domain_states{{state="{state}"}} {count}')

    lines += [
        "# HELP pvc_vm_states PVC VM state counts",
        "# TYPE pvc_vm_states gauge",
    ]
    vm_counts = dict.fromkeys(set(common.vm_state_combinations), 0)
    for vm in vm_data:
        vm_counts[vm["state"]] += 1
    for state, count in vm_counts.items():
        lines.append(f'pvc_vm_states{{state="{state}"}} {count}')

    # Any OSD that is not "up" is counted as "down"
    lines += [
        "# HELP pvc_osd_up_states PVC OSD up state counts",
        "# TYPE pvc_osd_up_states gauge",
    ]
    osd_up_counts = dict.fromkeys(
        {combo.split(",")[0] for combo in common.ceph_osd_state_combinations}, 0
    )
    for osd in osd_data:
        osd_up_counts["up" if osd["up"] == "up" else "down"] += 1
    for state, count in osd_up_counts.items():
        lines.append(f'pvc_osd_up_states{{state="{state}"}} {count}')

    # Any OSD that is not "in" is counted as "out"
    lines += [
        "# HELP pvc_osd_in_states PVC OSD in state counts",
        "# TYPE pvc_osd_in_states gauge",
    ]
    osd_in_counts = dict.fromkeys(
        {combo.split(",")[1] for combo in common.ceph_osd_state_combinations}, 0
    )
    for osd in osd_data:
        osd_in_counts["in" if osd["in"] == "in" else "out"] += 1
    for state, count in osd_in_counts.items():
        lines.append(f'pvc_osd_in_states{{state="{state}"}} {count}')

    # Plain object counts
    lines += [
        "# HELP pvc_nodes PVC Node count",
        "# TYPE pvc_nodes gauge",
        f"pvc_nodes {status_data['nodes']['total']}",
    ]

    lines += [
        "# HELP pvc_vms PVC VM count",
        "# TYPE pvc_vms gauge",
        f"pvc_vms {status_data['vms']['total']}",
    ]

    lines += [
        "# HELP pvc_osds PVC OSD count",
        "# TYPE pvc_osds gauge",
        f"pvc_osds {status_data['osds']['total']}",
    ]

    lines += [
        "# HELP pvc_networks PVC Network count",
        "# TYPE pvc_networks gauge",
        f"pvc_networks {status_data['networks']}",
    ]

    lines += [
        "# HELP pvc_pools PVC Storage Pool count",
        "# TYPE pvc_pools gauge",
        f"pvc_pools {status_data['pools']}",
    ]

    lines += [
        "# HELP pvc_volumes PVC Storage Volume count",
        "# TYPE pvc_volumes gauge",
        f"pvc_volumes {status_data['volumes']}",
    ]

    lines += [
        "# HELP pvc_snapshots PVC Storage Snapshot count",
        "# TYPE pvc_snapshots gauge",
        f"pvc_snapshots {status_data['snapshots']}",
    ]

    return True, "\n".join(lines) + "\n"
|
|
|
|
|
|
|
|
|
2021-06-13 14:22:26 -04:00
|
|
|
def cluster_initialize(zkhandler, overwrite=False):
    """
    Initialize the cluster by applying the schema through zkhandler.

    When the "base.config.primary_node" key already exists the cluster is
    treated as initialized, and the call is refused unless overwrite is set.
    With overwrite set, every "base" schema key except "root" is deleted
    recursively before the schema is applied; a failed delete aborts the
    initialization.

    Returns a (success, message) tuple.
    """
    # Abort if we've initialized the cluster before
    already_initialized = zkhandler.exists("base.config.primary_node")
    if already_initialized and not overwrite:
        return False, "ERROR: Cluster contains data and overwrite not set."

    if overwrite:
        # Delete the existing keys, but never the root key
        removable_keys = (
            key for key in zkhandler.schema.keys("base") if key != "root"
        )
        for key in removable_keys:
            if not zkhandler.delete(f"base.{key}", recursive=True):
                return (
                    False,
                    "ERROR: Failed to delete data in cluster; running nodes perhaps?",
                )

    # Create the root keys
    zkhandler.schema.apply(zkhandler)

    return True, "Successfully initialized cluster"
|
2021-06-13 14:22:26 -04:00
|
|
|
|
|
|
|
|
|
|
|
def cluster_backup(zkhandler):
    """
    Dump the whole key tree, starting at "/", into a flat dictionary.

    Every visited path is stored with its data as the value. The "/zookeeper"
    and "/patroni" trees are skipped entirely.

    Returns (True, dictionary of path -> data) on success, or
    (False, error message) if any exception is raised while walking the tree.
    """
    backup = {}
    skipped_trees = ("/zookeeper", "/patroni")

    def walk(path):
        # Record this key, then descend into each of its children in order
        value = zkhandler.read(path)
        child_names = zkhandler.children(path)
        backup[path] = value

        if not child_names:
            return

        # The root path already ends with the separator
        prefix = path if path == "/" else path + "/"
        for name in child_names:
            child_path = prefix + name
            if child_path not in skipped_trees:
                walk(child_path)

    try:
        walk("/")
    except Exception as e:
        return False, "ERROR: Failed to obtain backup: {}".format(e)

    return True, backup
|
|
|
|
|
|
|
|
|
|
|
|
def cluster_restore(zkhandler, cluster_data):
    """
    Restore a backup (a dictionary of key path -> data) into the cluster.

    The backup must contain the schema version key (the path of
    "base.schema.version") and its integer value must equal the schema
    version of zkhandler. If the version is missing, is not an integer, or
    does not match, nothing is written and a failure is returned. Otherwise
    every key/value pair of the backup is written in a single write call.

    Returns a (success, message) tuple.
    """
    # Build a key+value list, preserving the order of the backup
    kv = [(key, cluster_data[key]) for key in cluster_data]

    # The schema version is stored in the backup under its own key path
    schema_version = cluster_data.get(zkhandler.schema.path("base.schema.version"))

    # A backup without a usable schema version cannot be validated; report
    # it as a failure instead of raising from int()
    try:
        backup_version = int(schema_version)
    except (TypeError, ValueError):
        return (
            False,
            "ERROR: Backup does not contain a valid schema version.",
        )

    if backup_version != int(zkhandler.schema.version):
        return (
            False,
            "ERROR: Schema version of backup ({}) does not match cluster schema version ({}).".format(
                schema_version, zkhandler.schema.version
            ),
        )

    # Write all of the backup data in one operation
    result = zkhandler.write(kv)

    if result:
        return True, "Restore completed successfully."
    else:
        return False, "Restore failed."
|