2023-12-04 01:37:54 -05:00
|
|
|
#!/usr/bin/env python3
|
|
|
|
|
|
|
|
# faults.py - PVC client function library, faults management
|
|
|
|
# Part of the Parallel Virtual Cluster (PVC) system
|
|
|
|
#
|
|
|
|
# Copyright (C) 2018-2022 Joshua M. Boniface <joshua@boniface.me>
|
|
|
|
#
|
|
|
|
# This program is free software: you can redistribute it and/or modify
|
|
|
|
# it under the terms of the GNU General Public License as published by
|
|
|
|
# the Free Software Foundation, version 3.
|
|
|
|
#
|
|
|
|
# This program is distributed in the hope that it will be useful,
|
|
|
|
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
|
|
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
|
|
# GNU General Public License for more details.
|
|
|
|
#
|
|
|
|
# You should have received a copy of the GNU General Public License
|
|
|
|
# along with this program. If not, see <https://www.gnu.org/licenses/>.
|
|
|
|
#
|
|
|
|
###############################################################################
|
|
|
|
|
|
|
|
from datetime import datetime
|
2023-12-06 13:17:10 -05:00
|
|
|
|
|
|
|
|
|
|
|
def generate_fault(
|
2023-12-09 16:13:36 -05:00
|
|
|
zkhandler,
|
|
|
|
logger,
|
|
|
|
fault_name,
|
|
|
|
fault_time,
|
|
|
|
fault_delta,
|
|
|
|
fault_message,
|
|
|
|
fault_details=None,
|
2023-12-06 13:17:10 -05:00
|
|
|
):
|
2023-12-06 17:12:44 -05:00
|
|
|
# Strip the microseconds off of the fault time; we don't care about that precision
|
|
|
|
fault_time = str(fault_time).split(".")[0]
|
|
|
|
|
2023-12-09 16:13:36 -05:00
|
|
|
if fault_details is not None:
|
|
|
|
fault_message = f"{fault_message}: {fault_details}"
|
|
|
|
|
2023-12-06 13:17:10 -05:00
|
|
|
# If a fault already exists with this ID, just update the time
|
|
|
|
if not zkhandler.exists("base.faults"):
|
|
|
|
logger.out(
|
2023-12-09 16:48:14 -05:00
|
|
|
f"Skipping fault reporting for {fault_name} due to missing Zookeeper schemas",
|
2023-12-06 13:17:10 -05:00
|
|
|
state="w",
|
|
|
|
)
|
|
|
|
return
|
|
|
|
|
|
|
|
existing_faults = zkhandler.children("base.faults")
|
2023-12-09 16:48:14 -05:00
|
|
|
if fault_name in existing_faults:
|
2023-12-06 13:17:10 -05:00
|
|
|
logger.out(
|
2023-12-09 16:48:14 -05:00
|
|
|
f"Updating fault {fault_name}: {fault_message} @ {fault_time}", state="i"
|
2023-12-06 13:17:10 -05:00
|
|
|
)
|
|
|
|
else:
|
|
|
|
logger.out(
|
2023-12-09 16:48:14 -05:00
|
|
|
f"Generating fault {fault_name}: {fault_message} @ {fault_time}",
|
2023-12-06 13:17:10 -05:00
|
|
|
state="i",
|
|
|
|
)
|
|
|
|
|
|
|
|
if zkhandler.read("base.config.maintenance") == "true":
|
|
|
|
logger.out(
|
2023-12-09 16:48:14 -05:00
|
|
|
f"Skipping fault reporting for {fault_name} due to maintenance mode",
|
2023-12-06 13:17:10 -05:00
|
|
|
state="w",
|
|
|
|
)
|
|
|
|
return
|
|
|
|
|
2023-12-09 16:48:14 -05:00
|
|
|
# Update an existing fault
|
|
|
|
if fault_name in existing_faults:
|
2023-12-06 13:17:10 -05:00
|
|
|
zkhandler.write(
|
|
|
|
[
|
2023-12-09 16:48:14 -05:00
|
|
|
(("faults.last_time", fault_name), fault_time),
|
|
|
|
(("faults.delta", fault_name), fault_delta),
|
|
|
|
(("faults.message", fault_name), fault_message),
|
2023-12-06 13:17:10 -05:00
|
|
|
]
|
|
|
|
)
|
2023-12-09 16:48:14 -05:00
|
|
|
# Generate a new fault
|
2023-12-06 13:17:10 -05:00
|
|
|
else:
|
|
|
|
zkhandler.write(
|
|
|
|
[
|
2023-12-09 16:48:14 -05:00
|
|
|
(("faults.id", fault_name), ""),
|
|
|
|
(("faults.first_time", fault_name), fault_time),
|
|
|
|
(("faults.last_time", fault_name), fault_time),
|
|
|
|
(("faults.ack_time", fault_name), ""),
|
|
|
|
(("faults.status", fault_name), "new"),
|
|
|
|
(("faults.delta", fault_name), fault_delta),
|
|
|
|
(("faults.message", fault_name), fault_message),
|
2023-12-06 13:17:10 -05:00
|
|
|
]
|
|
|
|
)
|
2023-12-04 01:37:54 -05:00
|
|
|
|
|
|
|
|
|
|
|
def getFault(zkhandler, fault_id):
|
|
|
|
"""
|
|
|
|
Get the details of a fault based on the fault ID
|
|
|
|
"""
|
|
|
|
if not zkhandler.exists(("faults.id", fault_id)):
|
|
|
|
return None
|
|
|
|
|
|
|
|
fault_id = fault_id
|
2023-12-10 16:05:16 -05:00
|
|
|
|
|
|
|
(
|
|
|
|
fault_last_time,
|
|
|
|
fault_first_time,
|
|
|
|
fault_ack_time,
|
|
|
|
fault_status,
|
|
|
|
fault_delta,
|
|
|
|
fault_message,
|
|
|
|
) = zkhandler.read_many(
|
|
|
|
[
|
|
|
|
("faults.last_time", fault_id),
|
|
|
|
("faults.first_time", fault_id),
|
|
|
|
("faults.ack_time", fault_id),
|
|
|
|
("faults.status", fault_id),
|
|
|
|
("faults.delta", fault_id),
|
|
|
|
("faults.message", fault_id),
|
|
|
|
]
|
|
|
|
)
|
2023-12-04 01:37:54 -05:00
|
|
|
|
2023-12-04 15:48:49 -05:00
|
|
|
# Acknowledged faults have a delta of 0
|
|
|
|
if fault_ack_time != "":
|
|
|
|
fault_delta = 0
|
|
|
|
|
2023-12-04 01:37:54 -05:00
|
|
|
fault = {
|
|
|
|
"id": fault_id,
|
|
|
|
"last_reported": fault_last_time,
|
|
|
|
"first_reported": fault_first_time,
|
|
|
|
"acknowledged_at": fault_ack_time,
|
|
|
|
"status": fault_status,
|
2023-12-10 16:05:16 -05:00
|
|
|
"health_delta": int(fault_delta),
|
2023-12-04 01:37:54 -05:00
|
|
|
"message": fault_message,
|
|
|
|
}
|
|
|
|
|
|
|
|
return fault
|
|
|
|
|
|
|
|
|
|
|
|
def getAllFaults(zkhandler, sort_key="last_reported"):
|
|
|
|
"""
|
|
|
|
Get the details of all registered faults
|
|
|
|
"""
|
|
|
|
|
|
|
|
all_faults = zkhandler.children(("base.faults"))
|
|
|
|
|
2023-12-10 16:05:16 -05:00
|
|
|
faults_reads = list()
|
2023-12-04 01:37:54 -05:00
|
|
|
for fault_id in all_faults:
|
2023-12-10 16:05:16 -05:00
|
|
|
faults_reads += [
|
|
|
|
("faults.last_time", fault_id),
|
|
|
|
("faults.first_time", fault_id),
|
|
|
|
("faults.ack_time", fault_id),
|
|
|
|
("faults.status", fault_id),
|
|
|
|
("faults.delta", fault_id),
|
|
|
|
("faults.message", fault_id),
|
|
|
|
]
|
|
|
|
all_faults_data = list(zkhandler.read_many(faults_reads))
|
|
|
|
|
|
|
|
faults_detail = list()
|
|
|
|
for fidx, fault_id in enumerate(all_faults):
|
|
|
|
# Split the large list of return values by the IDX of this fault
|
|
|
|
# Each fault result is 6 fields long
|
|
|
|
pos_start = fidx * 6
|
|
|
|
pos_end = fidx * 6 + 6
|
|
|
|
(
|
|
|
|
fault_last_time,
|
|
|
|
fault_first_time,
|
|
|
|
fault_ack_time,
|
|
|
|
fault_status,
|
|
|
|
fault_delta,
|
|
|
|
fault_message,
|
|
|
|
) = tuple(all_faults_data[pos_start:pos_end])
|
|
|
|
fault_output = {
|
|
|
|
"id": fault_id,
|
|
|
|
"last_reported": fault_last_time,
|
|
|
|
"first_reported": fault_first_time,
|
|
|
|
"acknowledged_at": fault_ack_time,
|
|
|
|
"status": fault_status,
|
|
|
|
"health_delta": int(fault_delta),
|
|
|
|
"message": fault_message,
|
|
|
|
}
|
|
|
|
faults_detail.append(fault_output)
|
2023-12-04 01:37:54 -05:00
|
|
|
|
|
|
|
sorted_faults = sorted(faults_detail, key=lambda x: x[sort_key])
|
|
|
|
# Sort newest-first for time-based sorts
|
|
|
|
if sort_key in ["first_reported", "last_reported", "acknowledge_at"]:
|
|
|
|
sorted_faults.reverse()
|
|
|
|
|
|
|
|
return sorted_faults
|
|
|
|
|
|
|
|
|
|
|
|
def get_list(zkhandler, limit=None, sort_key="last_reported"):
|
|
|
|
"""
|
|
|
|
Get a list of all known faults, sorted by {sort_key}
|
|
|
|
"""
|
|
|
|
if sort_key not in [
|
|
|
|
"first_reported",
|
|
|
|
"last_reported",
|
|
|
|
"acknowledged_at",
|
|
|
|
"status",
|
|
|
|
"health_delta",
|
|
|
|
"message",
|
|
|
|
]:
|
|
|
|
return False, f"Invalid sort key {sort_key} provided"
|
|
|
|
|
|
|
|
all_faults = getAllFaults(zkhandler, sort_key=sort_key)
|
|
|
|
|
|
|
|
if limit is not None:
|
|
|
|
all_faults = [fault for fault in all_faults if fault["id"] == limit]
|
|
|
|
|
|
|
|
return True, all_faults
|
|
|
|
|
|
|
|
|
2023-12-06 13:33:27 -05:00
|
|
|
def acknowledge(zkhandler, fault_id=None):
|
2023-12-04 01:37:54 -05:00
|
|
|
"""
|
2023-12-06 13:33:27 -05:00
|
|
|
Acknowledge a fault or all faults
|
2023-12-04 01:37:54 -05:00
|
|
|
"""
|
2023-12-06 13:33:27 -05:00
|
|
|
if fault_id is None:
|
|
|
|
faults = getAllFaults(zkhandler)
|
|
|
|
else:
|
|
|
|
fault = getFault(zkhandler, fault_id)
|
|
|
|
|
|
|
|
if fault is None:
|
|
|
|
return False, f"No fault with ID {fault_id} found"
|
|
|
|
|
|
|
|
faults = [fault]
|
|
|
|
|
|
|
|
for fault in faults:
|
|
|
|
# Don't reacknowledge already-acknowledged faults
|
|
|
|
if fault["status"] != "ack":
|
|
|
|
zkhandler.write(
|
|
|
|
[
|
2023-12-06 14:18:31 -05:00
|
|
|
(
|
|
|
|
("faults.ack_time", fault["id"]),
|
|
|
|
str(datetime.now()).split(".")[0],
|
|
|
|
),
|
|
|
|
(("faults.status", fault["id"]), "ack"),
|
2023-12-06 13:33:27 -05:00
|
|
|
]
|
|
|
|
)
|
|
|
|
|
|
|
|
return (
|
|
|
|
True,
|
|
|
|
f"Successfully acknowledged fault(s) {', '.join([fault['id'] for fault in faults])}",
|
2023-12-04 01:37:54 -05:00
|
|
|
)
|
|
|
|
|
|
|
|
|
2023-12-06 13:33:27 -05:00
|
|
|
def delete(zkhandler, fault_id=None):
|
2023-12-04 01:37:54 -05:00
|
|
|
"""
|
2023-12-06 13:33:27 -05:00
|
|
|
Delete a fault or all faults
|
2023-12-04 01:37:54 -05:00
|
|
|
"""
|
2023-12-06 13:33:27 -05:00
|
|
|
if fault_id is None:
|
|
|
|
faults = getAllFaults(zkhandler)
|
|
|
|
else:
|
|
|
|
fault = getFault(zkhandler, fault_id)
|
|
|
|
|
|
|
|
if fault is None:
|
|
|
|
return False, f"No fault with ID {fault_id} found"
|
2023-12-04 01:37:54 -05:00
|
|
|
|
2023-12-06 13:33:27 -05:00
|
|
|
faults = [fault]
|
2023-12-04 01:37:54 -05:00
|
|
|
|
2023-12-06 13:33:27 -05:00
|
|
|
for fault in faults:
|
2023-12-06 14:18:31 -05:00
|
|
|
zkhandler.delete(("faults.id", fault["id"]), recursive=True)
|
2023-12-04 01:37:54 -05:00
|
|
|
|
2023-12-06 13:33:27 -05:00
|
|
|
return (
|
|
|
|
True,
|
|
|
|
f"Successfully deleted fault(s) {', '.join([fault['id'] for fault in faults])}",
|
|
|
|
)
|