Implement interfaces to faults

This commit is contained in:
Joshua Boniface 2023-12-04 01:37:54 -05:00
parent b59f743690
commit 672e58133f
6 changed files with 651 additions and 35 deletions

View File

@ -622,6 +622,152 @@ class API_Status(Resource):
api.add_resource(API_Status, "/status")
# /faults
class API_Faults(Resource):
    # REST resource for the collection of cluster faults; it is registered at
    # "/faults" directly below the class.
    @RequestParser(
        [
            {
                "name": "sort_key",
                "choices": (
                    "first_reported",
                    "last_reported",
                    "acknowledged_at",
                    "status",
                    "health_delta",
                    "message",
                ),
                "helptext": "A valid sort key must be specified",
                "required": False,
            },
        ]
    )
    @Authenticator
    def get(self, reqargs):
        """
        Return a list of cluster faults
        ---
        tags:
          - faults
        parameters:
          - in: query
            name: sort_key
            type: string
            required: false
            description: The fault object key to sort results by
            enum:
              - first_reported
              - last_reported
              - acknowledged_at
              - status
              - health_delta
              - message
        responses:
          200:
            description: OK
            schema:
              type: array
              items:
                type: object
                id: fault
                properties:
                  id:
                    type: string
                    description: The ID of the fault
                    example: "10ae144b78b4cc5fdf09e2ebbac51235"
                  first_reported:
                    type: date
                    description: The first time the fault was reported
                    example: "2023-12-01 16:47:59.849742"
                  last_reported:
                    type: date
                    description: The last time the fault was reported
                    example: "2023-12-01 17:39:45.188398"
                  acknowledged_at:
                    type: date
                    description: The time the fault was acknowledged, or empty if not acknowledged
                    example: "2023-12-01 17:50:00.000000"
                  status:
                    type: string
                    description: The current state of the fault, either "new" or "ack" (acknowledged)
                    example: "new"
                  health_delta:
                    type: integer
                    description: The health delta (amount it reduces cluster health from 100%) of the fault
                    example: 25
                  message:
                    type: string
                    description: The textual description of the fault
                    example: "Node hv1 was at 40% (psur@-10%, psql@-50%) <= 50% health"
        """
        # Fall back to the default when sort_key is absent *or* present with a
        # None value; dict.get()'s default alone only covers a missing key.
        # NOTE(review): presumably RequestParser stores None for an omitted
        # argument (the Flask-RESTful reqparse default) - confirm.
        sort_key = reqargs.get("sort_key") or "last_reported"
        return api_helper.fault_list(sort_key=sort_key)


api.add_resource(API_Faults, "/faults")
# /faults/<fault_id>
class API_Faults_Element(Resource):
    # REST resource for a single cluster fault addressed by its ID; it is
    # registered at "/faults/<fault_id>" directly below the class. Every
    # handler delegates to an api_helper function and returns its result as-is.
    @Authenticator
    def get(self, fault_id):
        """
        Return a single cluster fault
        ---
        tags:
          - faults
        responses:
          200:
            description: OK
            schema:
              type: array
              items:
                type: object
                id: fault
                $ref: '#/definitions/fault'
        """
        # Reuses the list helper, passing the fault ID as its "limit" argument.
        return api_helper.fault_list(limit=fault_id)

    @Authenticator
    def put(self, fault_id):
        """
        Acknowledge a cluster fault
        ---
        tags:
          - faults
        responses:
          200:
            description: OK
            schema:
              type: object
              properties:
                message:
                  type: string
                  description: A text message
        """
        return api_helper.fault_acknowledge(fault_id)

    @Authenticator
    def delete(self, fault_id):
        """
        Delete a cluster fault
        ---
        tags:
          - faults
        responses:
          200:
            description: OK
            schema:
              type: object
              properties:
                message:
                  type: string
                  description: A text message
        """
        return api_helper.fault_delete(fault_id)


api.add_resource(API_Faults_Element, "/faults/<fault_id>")
# /tasks
class API_Tasks(Resource):
@Authenticator

View File

@ -31,6 +31,7 @@ from daemon_lib.zkhandler import ZKConnection
import daemon_lib.common as pvc_common
import daemon_lib.cluster as pvc_cluster
import daemon_lib.faults as pvc_faults
import daemon_lib.node as pvc_node
import daemon_lib.vm as pvc_vm
import daemon_lib.network as pvc_network
@ -118,6 +119,65 @@ def cluster_maintenance(zkhandler, maint_state="false"):
return retdata, retcode
#
# Fault functions
#
@pvc_common.Profiler(config)
@ZKConnection(config)
def fault_list(zkhandler, limit=None, sort_key="last_reported"):
    """
    Return a list of all faults sorted by SORT_KEY.

    If LIMIT is given it is passed through to pvc_faults.get_list as the fault
    ID to restrict the result to.

    Returns a (retdata, retcode) tuple:
      * the fault list and 200 on success
      * a message object and 404 if LIMIT was given but no fault matched
      * a message object and 400 if pvc_faults.get_list reported a failure
    """
    retflag, retdata = pvc_faults.get_list(zkhandler, limit=limit, sort_key=sort_key)

    if not retflag:
        return {"message": retdata}, 400

    # The "not found" case is also a successful lookup (retflag is true), so it
    # has to be tested before falling through to the generic 200 response;
    # testing plain success first would make this branch unreachable.
    if limit is not None and len(retdata) < 1:
        return {"message": f"No fault with ID {limit} found"}, 404

    return retdata, 200
@pvc_common.Profiler(config)
@ZKConnection(config)
def fault_acknowledge(zkhandler, fault_id):
    """
    Acknowledge a fault of FAULT_ID.

    Returns a (retdata, retcode) tuple in which retdata is a message object
    wrapping the text returned by pvc_faults.acknowledge, and retcode is 200
    when that call reported success or 404 otherwise.
    """
    succeeded, message = pvc_faults.acknowledge(zkhandler, fault_id)
    status_code = 200 if succeeded else 404
    return {"message": message}, status_code
@pvc_common.Profiler(config)
@ZKConnection(config)
def fault_delete(zkhandler, fault_id):
    """
    Delete a fault of FAULT_ID.

    Returns a (retdata, retcode) tuple in which retdata is a message object
    wrapping the text returned by pvc_faults.delete, and retcode is 200 when
    that call reported success or 404 otherwise.
    """
    succeeded, message = pvc_faults.delete(zkhandler, fault_id)
    status_code = 200 if succeeded else 404
    return {"message": message}, status_code
#
# Node functions
#

View File

@ -37,6 +37,7 @@ from pvc.cli.parsers import *
from pvc.cli.formatters import *
import pvc.lib.cluster
import pvc.lib.faults
import pvc.lib.node
import pvc.lib.vm
import pvc.lib.network
@ -347,40 +348,6 @@ def cli_cluster():
pass
###############################################################################
# > pvc cluster status
###############################################################################
@click.command(
    name="status",
    short_help="Show cluster status.",
)
@connection_req
@format_opt(
    {
        "pretty": cli_cluster_status_format_pretty,
        "short": cli_cluster_status_format_short,
        "json": lambda data: jdumps(data),
        "json-pretty": lambda data: jdumps(data, indent=2),
    }
)
def cli_cluster_status(
    format_function,
):
    """
    Show information and health about a PVC cluster.

    \b
    Format options:
        "pretty": Output all details in a nice colourful format.
        "short" Output only details about cluster health in a nice colourful format.
        "json": Output in unformatted JSON.
        "json-pretty": Output in formatted JSON.
    """

    # Fetch the cluster information and hand it, together with the selected
    # formatter, to the shared finish() helper.
    success, cluster_info = pvc.lib.cluster.get_info(CLI_CONFIG)
    finish(success, cluster_info, format_function)
###############################################################################
# > pvc cluster init
###############################################################################
@ -485,6 +452,120 @@ def cli_cluster_restore(
"""
###############################################################################
# > pvc cluster status
###############################################################################
@click.command(
    name="status",
    short_help="Show cluster status.",
)
@connection_req
@format_opt(
    {
        "pretty": cli_cluster_status_format_pretty,
        "short": cli_cluster_status_format_short,
        "json": lambda data: jdumps(data),
        "json-pretty": lambda data: jdumps(data, indent=2),
    }
)
def cli_cluster_status(
    format_function,
):
    """
    Show information and health about a PVC cluster.

    \b
    Format options:
        "pretty": Output all details in a nice colourful format.
        "short" Output only details about cluster health in a nice colourful format.
        "json": Output in unformatted JSON.
        "json-pretty": Output in formatted JSON.
    """

    # Fetch the cluster information and hand it, together with the selected
    # formatter, to the shared finish() helper.
    success, cluster_info = pvc.lib.cluster.get_info(CLI_CONFIG)
    finish(success, cluster_info, format_function)
###############################################################################
# > pvc cluster fault
###############################################################################
@click.group(
    name="fault",
    short_help="Manage PVC cluster faults.",
    context_settings=CONTEXT_SETTINGS,
)
def cli_cluster_fault():
    """
    Manage faults in the PVC cluster.
    """
    # Pure command group: it has no behaviour of its own and only hosts the
    # subcommands that are attached to it with add_command().
###############################################################################
# > pvc cluster fault list
###############################################################################
@click.command(
    name="list",
    short_help="List all cluster faults.",
)
@click.argument("limit", default=None, required=False)
@format_opt(
    {
        "pretty": cli_cluster_fault_list_format_pretty,
        # NOTE: no "short" format is registered for fault listings.
        "json": lambda data: jdumps(data),
        "json-pretty": lambda data: jdumps(data, indent=2),
    }
)
@connection_req
def cli_cluster_fault_list(limit, format_function):
    """
    List all faults in the PVC cluster, optionally limited to fault ID LIMIT.
    """

    # Fetch the faults and hand them, together with the selected formatter,
    # to the shared finish() helper.
    success, faults = pvc.lib.faults.get_list(CLI_CONFIG, limit=limit)
    finish(success, faults, format_function)
###############################################################################
# > pvc cluster fault ack
###############################################################################
@click.command(
    name="ack",
    short_help="Acknowledge a cluster fault.",
)
@click.argument("fault_id")
@connection_req
def cli_cluster_fault_acknowledge(fault_id):
    """
    Acknowledge the cluster fault FAULT_ID.
    """

    # Forward the request to the client library and report its outcome.
    success, message = pvc.lib.faults.acknowledge(CLI_CONFIG, fault_id)
    finish(success, message)
###############################################################################
# > pvc cluster fault delete
###############################################################################
@click.command(
    name="delete",
    short_help="Delete a cluster fault.",
)
@click.argument("fault_id")
@connection_req
def cli_cluster_fault_delete(fault_id):
    """
    Delete the cluster fault FAULT_ID.
    """

    # Forward the request to the client library and report its outcome.
    success, message = pvc.lib.faults.delete(CLI_CONFIG, fault_id)
    finish(success, message)
###############################################################################
# > pvc cluster maintenance
###############################################################################
@ -6170,10 +6251,14 @@ cli_provisioner_profile.add_command(cli_provisioner_profile_list)
# Attach the remaining provisioner subcommands, then the provisioner group
# itself, to the top-level CLI.
cli_provisioner.add_command(cli_provisioner_profile)
cli_provisioner.add_command(cli_provisioner_create)
cli.add_command(cli_provisioner)
# Attach the "cluster" subcommands to the cli_cluster group.
# NOTE(review): cli_cluster_status is added twice in this excerpt (here and
# again after cli_cluster_restore) - presumably the old and new positions of
# the same line in the diff; confirm only one registration remains.
cli_cluster.add_command(cli_cluster_status)
cli_cluster.add_command(cli_cluster_init)
cli_cluster.add_command(cli_cluster_backup)
cli_cluster.add_command(cli_cluster_restore)
cli_cluster.add_command(cli_cluster_status)
# The fault subcommands are attached to their own group first, and that group
# is then attached to cli_cluster.
cli_cluster_fault.add_command(cli_cluster_fault_list)
cli_cluster_fault.add_command(cli_cluster_fault_acknowledge)
cli_cluster_fault.add_command(cli_cluster_fault_delete)
cli_cluster.add_command(cli_cluster_fault)
# Same pattern for the maintenance subcommands.
cli_cluster_maintenance.add_command(cli_cluster_maintenance_on)
cli_cluster_maintenance.add_command(cli_cluster_maintenance_off)
cli_cluster.add_command(cli_cluster_maintenance)

View File

@ -261,6 +261,127 @@ def cli_cluster_status_format_short(CLI_CONFIG, data):
return "\n".join(output)
def cli_cluster_fault_list_format_pretty(CLI_CONFIG, fault_data):
    """
    Pretty format the output of cli_cluster_fault_list

    Builds a two-line bold header followed by, for every fault, a blank line,
    one line of padded columns and one "> message" line, and returns the whole
    thing joined with newlines. Faults are ordered by status, health delta and
    last-reported time, in descending order.
    """

    # One entry per column: (key in the fault dict, header label, minimum width)
    columns = (
        ("id", "ID", 3),
        ("status", "Status", 7),
        ("health_delta", "Health", 7),
        ("acknowledged_at", "Ack'd", 6),
        ("last_reported", "Last", 5),
        ("first_reported", "First", 6),
    )

    # A column is as wide as its longest raw value plus one, but never narrower
    # than its minimum. The message is not a column; it goes on its own line.
    widths = {
        key: max([minimum] + [len(str(fault[key])) + 1 for fault in fault_data])
        for key, _label, minimum in columns
    }

    # Header lines
    header = " ".join(f"{label: <{widths[key]}}" for key, label, _minimum in columns)
    lines = [
        f"{ansii['bold']}{header}{ansii['end']}",
        f"{ansii['bold']}> Message{ansii['end']}",
    ]

    ordered_faults = sorted(
        fault_data,
        key=lambda entry: (
            entry["status"],
            entry["health_delta"],
            entry["last_reported"],
        ),
        reverse=True,
    )

    for fault in ordered_faults:
        acknowledged_at = fault["acknowledged_at"]
        health_delta = fault["health_delta"]

        # Acknowledged faults are always blue; otherwise the colour follows the
        # size of the health delta.
        if acknowledged_at != "":
            health_colour = ansii["blue"]
        elif health_delta >= 50:
            health_colour = ansii["red"]
        elif health_delta >= 10:
            health_colour = ansii["yellow"]
        else:
            health_colour = ansii["green"]

        cells = {
            "id": fault["id"],
            "status": fault["status"].title(),
            "health_delta": f"-{health_delta}%",
            "acknowledged_at": acknowledged_at if acknowledged_at != "" else "N/A",
            "last_reported": fault["last_reported"],
            "first_reported": fault["first_reported"],
        }
        padded = {key: f"{value: <{widths[key]}}" for key, value in cells.items()}

        lines.append("")
        # Only the status and health columns are wrapped in the colour codes.
        lines.append(
            f"{padded['id']} "
            f"{health_colour}{padded['status']} {padded['health_delta']}{ansii['end']} "
            f"{padded['acknowledged_at']} {padded['last_reported']} {padded['first_reported']}"
        )
        lines.append(f"> {fault['message']}")

    return "\n".join(lines)
def cli_cluster_task_format_pretty(CLI_CONFIG, task_data):
"""
Pretty format the output of cli_cluster_task

View File

@ -0,0 +1,78 @@
#!/usr/bin/env python3
# faults.py - PVC CLI client function library, faults management
# Part of the Parallel Virtual Cluster (PVC) system
#
# Copyright (C) 2018-2022 Joshua M. Boniface <joshua@boniface.me>
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, version 3.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.
#
###############################################################################
from pvc.lib.common import call_api
def get_list(config, limit=None, sort_key="last_reported"):
    """
    Get list of PVC faults

    When LIMIT is given, the single-fault endpoint /faults/{limit} is queried
    and sort_key is not sent; otherwise /faults is queried with sort_key.

    Returns (True, decoded JSON body) on HTTP 200, otherwise (False, message).

    API endpoint: GET /api/v1/faults
    API arguments: sort_key={sort_key}
    API schema: {json_data_object}
    """
    if limit is None:
        endpoint, params = "/faults", {"sort_key": sort_key}
    else:
        endpoint, params = f"/faults/{limit}", {}

    response = call_api(config, "get", endpoint, params=params)
    payload = response.json()

    if response.status_code != 200:
        return False, payload.get("message", "")
    return True, payload
def acknowledge(config, fault_id):
    """
    Acknowledge a PVC fault

    Returns (True, message) on HTTP 200, otherwise (False, message), where
    message is the "message" field of the JSON response (or "" if absent).

    API endpoint: PUT /api/v1/faults/<fault_id>
    API arguments:
    API schema: {json_message}
    """
    response = call_api(config, "put", f"/faults/{fault_id}")

    # The message is extracted the same way for success and failure; only the
    # flag differs. (A leftover debug print of the raw response was removed so
    # that library calls no longer write to stdout.)
    return response.status_code == 200, response.json().get("message", "")
def delete(config, fault_id):
    """
    Delete a PVC fault

    Returns (True, message) on HTTP 200, otherwise (False, message), where
    message is the "message" field of the JSON response (or "" if absent).

    API endpoint: DELETE /api/v1/faults/<fault_id>
    API arguments:
    API schema: {json_message}
    """
    response = call_api(config, "delete", f"/faults/{fault_id}")

    succeeded = response.status_code == 200
    message = response.json().get("message", "")
    return succeeded, message

126
daemon-common/faults.py Normal file
View File

@ -0,0 +1,126 @@
#!/usr/bin/env python3
# faults.py - PVC client function library, faults management
# Part of the Parallel Virtual Cluster (PVC) system
#
# Copyright (C) 2018-2022 Joshua M. Boniface <joshua@boniface.me>
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, version 3.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.
#
###############################################################################
from datetime import datetime
def getFault(zkhandler, fault_id):
    """
    Get the details of a fault based on the fault ID

    Returns None when no "faults.id" entry exists for fault_id; otherwise a
    dict with the keys id, last_reported, first_reported, acknowledged_at,
    status, health_delta (converted to int) and message.
    """
    if not zkhandler.exists(("faults.id", fault_id)):
        return None

    # Values are read in the same order as the keys of the resulting dict.
    return {
        "id": fault_id,
        "last_reported": zkhandler.read(("faults.last_time", fault_id)),
        "first_reported": zkhandler.read(("faults.first_time", fault_id)),
        "acknowledged_at": zkhandler.read(("faults.ack_time", fault_id)),
        "status": zkhandler.read(("faults.status", fault_id)),
        "health_delta": int(zkhandler.read(("faults.delta", fault_id))),
        "message": zkhandler.read(("faults.message", fault_id)),
    }
def getAllFaults(zkhandler, sort_key="last_reported"):
    """
    Get the details of all registered faults

    Returns a list of fault dicts (as produced by getFault) sorted by
    sort_key. The time-based keys first_reported, last_reported and
    acknowledged_at are sorted newest-first; all other keys ascending.
    """
    # NOTE(review): "or []" tolerates a handler that returns None when the
    # faults key has no children/node - confirm zkhandler.children semantics.
    all_faults = zkhandler.children("base.faults") or []

    faults_detail = []
    for fault_id in all_faults:
        fault_detail = getFault(zkhandler, fault_id)
        # getFault returns None when the fault does not exist (e.g. it was
        # removed after the child list was read); a None entry would make the
        # sort key lookup below fail, so skip it.
        if fault_detail is not None:
            faults_detail.append(fault_detail)

    sorted_faults = sorted(faults_detail, key=lambda x: x[sort_key])

    # Sort newest-first for time-based sorts
    if sort_key in ("first_reported", "last_reported", "acknowledged_at"):
        sorted_faults.reverse()

    return sorted_faults
def get_list(zkhandler, limit=None, sort_key="last_reported"):
    """
    Get a list of all known faults, sorted by {sort_key}

    Returns (False, message) for an unsupported sort key. Otherwise returns
    (True, faults), where faults is the sorted list from getAllFaults,
    restricted to entries whose "id" equals limit when limit is not None.
    """
    valid_sort_keys = (
        "first_reported",
        "last_reported",
        "acknowledged_at",
        "status",
        "health_delta",
        "message",
    )
    if sort_key not in valid_sort_keys:
        return False, f"Invalid sort key {sort_key} provided"

    faults = getAllFaults(zkhandler, sort_key=sort_key)
    if limit is None:
        return True, faults

    return True, [entry for entry in faults if entry["id"] == limit]
def acknowledge(zkhandler, fault_id):
    """
    Acknowledge a fault

    Returns (False, message) when getFault finds no fault with this ID.
    Otherwise writes the current time to "faults.ack_time" and "ack" to
    "faults.status" for the fault and returns (True, message).
    """
    if getFault(zkhandler, fault_id) is None:
        return False, f"No fault with ID {fault_id} found"

    # Both fields are handed to the handler in a single write call.
    ack_updates = [
        (("faults.ack_time", fault_id), datetime.now()),
        (("faults.status", fault_id), "ack"),
    ]
    zkhandler.write(ack_updates)

    return True, f"Successfully acknowledged fault {fault_id}"
def delete(zkhandler, fault_id):
    """
    Delete a fault

    Returns (False, message) when getFault finds no fault with this ID.
    Otherwise recursively deletes the fault's "faults.id" entry and returns
    (True, message).
    """
    if getFault(zkhandler, fault_id) is None:
        return False, f"No fault with ID {fault_id} found"

    zkhandler.delete(("faults.id", fault_id), recursive=True)

    return True, f"Successfully deleted fault {fault_id}"