pvc/daemon-common/faults.py

209 lines
6.2 KiB
Python
Raw Normal View History

2023-12-04 01:37:54 -05:00
#!/usr/bin/env python3
# faults.py - PVC client function library, faults management
# Part of the Parallel Virtual Cluster (PVC) system
#
# Copyright (C) 2018-2022 Joshua M. Boniface <joshua@boniface.me>
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, version 3.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.
#
###############################################################################
from datetime import datetime
from hashlib import md5
def generate_fault(
    zkhandler, logger, fault_name, fault_time, fault_delta, fault_message
):
    """
    Record a fault in Zookeeper, or refresh the last-seen time of an existing one

    The fault ID is the first 8 hex characters of the MD5 digest of the string
    "{fault_name} {fault_delta} {fault_message}", so a repeated report with the
    same name, delta and message maps onto the same fault entry.

    Nothing is written (only a warning is logged) when the "base.faults" key
    does not exist, or when "base.config.maintenance" reads "true".

    Timestamps are stored as str(fault_time) with any fractional-seconds suffix
    (everything from the first "." onward) removed, for both new and updated
    faults, so "first_time" and "last_time" always share one format.

    Returns None in all cases.
    """
    # Generate a fault ID from the fault_name, fault_delta and fault_message;
    # MD5 is only used here to derive a short, stable identifier
    fault_str = f"{fault_name} {fault_delta} {fault_message}"
    fault_id = md5(fault_str.encode("utf-8")).hexdigest()[:8]

    # Without the faults schema there is nowhere to record the fault
    if not zkhandler.exists("base.faults"):
        logger.out(
            f"Skipping fault reporting for {fault_id} due to missing Zookeeper schemas",
            state="w",
        )
        return

    existing_faults = zkhandler.children("base.faults")
    fault_exists = fault_id in existing_faults

    # The fault is logged even when maintenance mode suppresses the write below
    if fault_exists:
        logger.out(
            f"Updating fault {fault_id}: {fault_message} @ {fault_time}", state="i"
        )
    else:
        logger.out(
            f"Generating fault {fault_id}: {fault_message} @ {fault_time}",
            state="i",
        )

    if zkhandler.read("base.config.maintenance") == "true":
        logger.out(
            f"Skipping fault reporting for {fault_id} due to maintenance mode",
            state="w",
        )
        return

    # Drop fractional seconds so every stored timestamp has the same format
    fault_timestamp = str(fault_time).split(".")[0]

    # If a fault already exists with this ID, just update the time
    if fault_exists:
        zkhandler.write(
            [
                (("faults.last_time", fault_id), fault_timestamp),
            ]
        )
    # Otherwise, generate a new fault event
    else:
        zkhandler.write(
            [
                (("faults.id", fault_id), ""),
                (("faults.first_time", fault_id), fault_timestamp),
                (("faults.last_time", fault_id), fault_timestamp),
                (("faults.ack_time", fault_id), ""),
                (("faults.status", fault_id), "new"),
                (("faults.delta", fault_id), fault_delta),
                (("faults.message", fault_id), fault_message),
            ]
        )
2023-12-04 01:37:54 -05:00
def getFault(zkhandler, fault_id):
    """
    Look up a single fault by its ID and return its details as a dictionary

    Returns None when the ("faults.id", fault_id) key does not exist. Otherwise
    returns a dictionary with the keys "id", "last_reported", "first_reported",
    "acknowledged_at", "status", "health_delta" and "message".
    """
    if not zkhandler.exists(("faults.id", fault_id)):
        return None

    last_seen = zkhandler.read(("faults.last_time", fault_id))
    first_seen = zkhandler.read(("faults.first_time", fault_id))
    acked_at = zkhandler.read(("faults.ack_time", fault_id))
    state = zkhandler.read(("faults.status", fault_id))
    stored_delta = int(zkhandler.read(("faults.delta", fault_id)))
    text = zkhandler.read(("faults.message", fault_id))

    return {
        "id": fault_id,
        "last_reported": last_seen,
        "first_reported": first_seen,
        "acknowledged_at": acked_at,
        "status": state,
        # A fault with a non-empty acknowledgement time reports a delta of 0
        "health_delta": 0 if acked_at != "" else stored_delta,
        "message": text,
    }
def getAllFaults(zkhandler, sort_key="last_reported"):
    """
    Get the details of all registered faults

    Reads the children of "base.faults", fetches each one with getFault() and
    returns the resulting dictionaries as a list sorted by {sort_key}. The
    time-based keys ("first_reported", "last_reported", "acknowledged_at") are
    returned newest-first; every other key is returned in ascending order.

    Raises KeyError if {sort_key} is not a key of the fault dictionaries.
    """
    all_faults = zkhandler.children("base.faults")

    faults_detail = []
    for fault_id in all_faults:
        fault_detail = getFault(zkhandler, fault_id)
        # getFault() returns None when the fault's key does not exist (for
        # example if it was removed after the child list was read); such
        # entries are skipped because they cannot be sorted or reported
        if fault_detail is not None:
            faults_detail.append(fault_detail)

    sorted_faults = sorted(faults_detail, key=lambda x: x[sort_key])

    # Sort newest-first for time-based sorts
    if sort_key in ("first_reported", "last_reported", "acknowledged_at"):
        sorted_faults.reverse()

    return sorted_faults
def get_list(zkhandler, limit=None, sort_key="last_reported"):
    """
    Return a (success, data) tuple listing the known faults, sorted by {sort_key}

    On an unsupported {sort_key}, returns (False, <error message>). Otherwise
    returns (True, <list of fault dictionaries>); when {limit} is not None, only
    faults whose "id" equals {limit} are included.
    """
    valid_sort_keys = (
        "first_reported",
        "last_reported",
        "acknowledged_at",
        "status",
        "health_delta",
        "message",
    )
    if sort_key not in valid_sort_keys:
        return False, f"Invalid sort key {sort_key} provided"

    faults = getAllFaults(zkhandler, sort_key=sort_key)

    if limit is None:
        return True, faults

    # A non-None limit acts as an exact-match filter on the fault ID
    matching = [entry for entry in faults if entry["id"] == limit]
    return True, matching
2023-12-06 13:33:27 -05:00
def acknowledge(zkhandler, fault_id=None):
    """
    Acknowledge one fault, or every known fault when no {fault_id} is given

    Returns (False, <error message>) if {fault_id} is given but getFault() finds
    no such fault; otherwise returns (True, <message listing the fault IDs>).
    """
    if fault_id is not None:
        target = getFault(zkhandler, fault_id)
        if target is None:
            return False, f"No fault with ID {fault_id} found"
        faults = [target]
    else:
        faults = getAllFaults(zkhandler)

    for entry in faults:
        # Don't reacknowledge already-acknowledged faults
        if entry["status"] == "ack":
            continue

        # Current local time with the fractional seconds dropped
        ack_stamp = str(datetime.now()).split(".")[0]
        zkhandler.write(
            [
                (("faults.ack_time", entry["id"]), ack_stamp),
                (("faults.status", entry["id"]), "ack"),
            ]
        )

    # The message lists every selected fault, including ones skipped above
    fault_ids = ", ".join([entry["id"] for entry in faults])
    return True, f"Successfully acknowledged fault(s) {fault_ids}"
2023-12-06 13:33:27 -05:00
def delete(zkhandler, fault_id=None):
    """
    Delete one fault, or every known fault when no {fault_id} is given

    Returns (False, <error message>) if {fault_id} is given but getFault() finds
    no such fault; otherwise returns (True, <message listing the fault IDs>).
    """
    if fault_id is not None:
        target = getFault(zkhandler, fault_id)
        if target is None:
            return False, f"No fault with ID {fault_id} found"
        faults = [target]
    else:
        faults = getAllFaults(zkhandler)

    # Remove each fault's key together with everything stored beneath it
    for entry in faults:
        zkhandler.delete(("faults.id", entry["id"]), recursive=True)

    fault_ids = ", ".join([entry["id"] for entry in faults])
    return True, f"Successfully deleted fault(s) {fault_ids}"