Compare commits: 7e6d922877 ... 47bd7bf2f5

3 commits: 47bd7bf2f5, b9fbfe2ed5, 764e3e3722
@@ -388,22 +388,22 @@ def cli_cluster_fault_list_format_short(CLI_CONFIG, fault_data):
         fault_id_length + fault_status_length + fault_health_delta_length + 2
     )
     detail_header_length = (
-        fault_health_delta_length
+        fault_id_length
         + fault_health_delta_length
         + fault_status_length
         + fault_last_reported_length
         + fault_message_length
         + 3
         - meta_header_length
         + 8
     )

     # Format the string (header)
     fault_list_output.append(
         "{bold}Meta {meta_dashes} Fault {detail_dashes}{end_bold}".format(
             bold=ansii["bold"],
             end_bold=ansii["end"],
             meta_dashes="-" * (meta_header_length - len("Meta ")),
             detail_dashes="-" * (detail_header_length - len("Fault ")),
         )
     )
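For context on the formatter change above, here is a minimal, self-contained sketch of how those header widths turn into the dashed "Meta ... Fault ..." banner. The concrete widths are invented for illustration; only the arithmetic mirrors the diff.

```python
# Illustrative column widths; the real formatter derives these from the
# longest fault ID, status, health delta, last-reported time, and message.
fault_id_length = 8
fault_status_length = 6
fault_health_delta_length = 6
fault_last_reported_length = 19
fault_message_length = 40

meta_header_length = (
    fault_id_length + fault_status_length + fault_health_delta_length + 2
)
detail_header_length = (
    fault_id_length
    + fault_health_delta_length
    + fault_status_length
    + fault_last_reported_length
    + fault_message_length
    + 3
    - meta_header_length
    + 8
)

# Each dash run is padded so the "Meta" and "Fault" labels span their columns.
header = "Meta {meta_dashes} Fault {detail_dashes}".format(
    meta_dashes="-" * (meta_header_length - len("Meta ")),
    detail_dashes="-" * (detail_header_length - len("Fault ")),
)
print(header)
```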
@@ -20,7 +20,6 @@
 ###############################################################################

 from datetime import datetime
-from hashlib import md5


 def generate_fault(
@@ -32,10 +31,6 @@ def generate_fault(
     fault_message,
     fault_details=None,
 ):
-    # Generate a fault ID from the fault_name, fault_delta, and fault_message
-    fault_str = f"{fault_name} {fault_delta} {fault_message}"
-    fault_id = str(md5(fault_str.encode("utf-8")).hexdigest())[:8]
-
     # Strip the microseconds off of the fault time; we don't care about that precision
     fault_time = str(fault_time).split(".")[0]
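The lines removed above derived a deterministic eight-character fault ID by hashing the fault's name, delta, and message. A standalone sketch of that scheme (the helper name is illustrative, not part of the codebase):

```python
from hashlib import md5


def derive_fault_id(fault_name, fault_delta, fault_message):
    # Hash the identifying fields and keep the first 8 hex characters, so the
    # same fault always produces the same short ID across repeated reports.
    fault_str = f"{fault_name} {fault_delta} {fault_message}"
    return md5(fault_str.encode("utf-8")).hexdigest()[:8]


# Example (invented values): identical inputs always yield the same ID.
print(derive_fault_id("VM_FAILED_test1", 10, "VM test1 was failed"))
```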
@@ -45,47 +40,49 @@ def generate_fault(
     # If a fault already exists with this ID, just update the time
     if not zkhandler.exists("base.faults"):
         logger.out(
-            f"Skipping fault reporting for {fault_id} due to missing Zookeeper schemas",
+            f"Skipping fault reporting for {fault_name} due to missing Zookeeper schemas",
             state="w",
         )
         return

     existing_faults = zkhandler.children("base.faults")
-    if fault_id in existing_faults:
+    if fault_name in existing_faults:
         logger.out(
-            f"Updating fault {fault_id}: {fault_message} @ {fault_time}", state="i"
+            f"Updating fault {fault_name}: {fault_message} @ {fault_time}", state="i"
         )
     else:
         logger.out(
-            f"Generating fault {fault_id}: {fault_message} @ {fault_time}",
+            f"Generating fault {fault_name}: {fault_message} @ {fault_time}",
             state="i",
         )

     if zkhandler.read("base.config.maintenance") == "true":
         logger.out(
-            f"Skipping fault reporting for {fault_id} due to maintenance mode",
+            f"Skipping fault reporting for {fault_name} due to maintenance mode",
             state="w",
         )
         return

-    if fault_id in existing_faults:
+    # Update an existing fault
+    if fault_name in existing_faults:
         zkhandler.write(
             [
-                (("faults.last_time", fault_id), fault_time),
-                (("faults.message", fault_id), fault_message),
+                (("faults.last_time", fault_name), fault_time),
+                (("faults.delta", fault_name), fault_delta),
+                (("faults.message", fault_name), fault_message),
             ]
         )
-    # Otherwise, generate a new fault event
+    # Generate a new fault
     else:
         zkhandler.write(
             [
-                (("faults.id", fault_id), ""),
-                (("faults.first_time", fault_id), fault_time),
-                (("faults.last_time", fault_id), fault_time),
-                (("faults.ack_time", fault_id), ""),
-                (("faults.status", fault_id), "new"),
-                (("faults.delta", fault_id), fault_delta),
-                (("faults.message", fault_id), fault_message),
+                (("faults.id", fault_name), ""),
+                (("faults.first_time", fault_name), fault_time),
+                (("faults.last_time", fault_name), fault_time),
+                (("faults.ack_time", fault_name), ""),
+                (("faults.status", fault_name), "new"),
+                (("faults.delta", fault_name), fault_delta),
+                (("faults.message", fault_name), fault_message),
             ]
         )
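To make the update-versus-create branching above easier to follow in isolation, here is a condensed sketch with the Zookeeper handler replaced by a plain dict. The key names mirror the faults.* schema paths, but the storage itself is only a stand-in.

```python
from datetime import datetime

fault_store = {}  # stand-in for the Zookeeper-backed faults tree


def record_fault(fault_name, fault_delta, fault_message):
    # Strip microseconds, as the real code does for fault timestamps.
    fault_time = str(datetime.now()).split(".")[0]
    if fault_name in fault_store:
        # Update an existing fault: refresh last_time, delta, and message.
        fault_store[fault_name].update(
            last_time=fault_time, delta=fault_delta, message=fault_message
        )
    else:
        # Generate a new fault with full metadata and a "new" status.
        fault_store[fault_name] = dict(
            first_time=fault_time,
            last_time=fault_time,
            ack_time="",
            status="new",
            delta=fault_delta,
            message=fault_message,
        )


record_fault("VM_FAILED_test1", 10, "VM test1 was failed")
record_fault("VM_FAILED_test1", 10, "VM test1 was failed")  # second report only updates
print(fault_store)
```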
@@ -228,7 +228,7 @@ class MonitoringInstance(object):
         def get_ceph_health_entries():
             ceph_health_entries = [
                 {
-                    "entry": f"{value['severity']} {key}",
+                    "entry": key,
                     "check": value["severity"],
                     "details": value["summary"]["message"],
                 }
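Ceph's health report exposes a checks mapping keyed by check name, each carrying a severity and a summary message; the unchanged lines above flatten that mapping, and the tweak keeps only the check name in the entry field. A sketch with invented sample data:

```python
# Invented sample of a Ceph health "checks" structure, for illustration only.
ceph_health = {
    "checks": {
        "OSDMAP_FLAGS": {
            "severity": "HEALTH_WARN",
            "summary": {"message": "noout flag(s) set"},
        },
    }
}

ceph_health_entries = [
    {
        "entry": key,  # check name only, e.g. OSDMAP_FLAGS
        "check": value["severity"],  # HEALTH_WARN or HEALTH_ERR
        "details": value["summary"]["message"],  # human-readable summary
    }
    for key, value in ceph_health["checks"].items()
]
print(ceph_health_entries)
```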
@@ -281,36 +281,42 @@ class MonitoringInstance(object):
         # This is a list of all possible faults (cluster error messages) and their corresponding details
         self.cluster_faults_map = {
             "dead_or_fenced_node": {
+                "name": "DEAD_NODE_{entry}",
                 "entries": get_node_daemon_states,
                 "conditions": ["dead", "fenced"],
                 "delta": 50,
                 "message": "Node {entry} was dead and/or fenced",
             },
             "ceph_osd_out": {
+                "name": "CEPH_OSD_OUT_{entry}",
                 "entries": get_osd_in_states,
                 "conditions": ["0"],
                 "delta": 50,
                 "message": "OSD {entry} was marked out",
             },
             "ceph_warn": {
+                "name": "CEPH_WARN_{entry}",
                 "entries": get_ceph_health_entries,
                 "conditions": ["HEALTH_WARN"],
                 "delta": 10,
                 "message": "{entry} reported by Ceph cluster",
             },
             "ceph_err": {
+                "name": "CEPH_ERR_{entry}",
                 "entries": get_ceph_health_entries,
                 "conditions": ["HEALTH_ERR"],
                 "delta": 50,
                 "message": "{entry} reported by Ceph cluster",
             },
             "vm_failed": {
+                "name": "VM_FAILED_{entry}",
                 "entries": get_vm_states,
                 "conditions": ["fail"],
                 "delta": 10,
                 "message": "VM {entry} was failed",
             },
             "memory_overprovisioned": {
+                "name": "MEMORY_OVERPROVISIONED",
                 "entries": get_overprovisioned_memory,
                 "conditions": ["overprovisioned"],
                 "delta": 50,
@@ -531,11 +537,12 @@ class MonitoringInstance(object):
                 if str(condition) == str(check):
                     fault_time = datetime.now()
                     fault_delta = fault_data["delta"]
+                    fault_name = fault_data["name"].format(entry=entry)
                     fault_message = fault_data["message"].format(entry=entry)
                     generate_fault(
                         self.zkhandler,
                         self.logger,
                         fault_type,
+                        fault_name,
                         fault_time,
                         fault_delta,
                         fault_message,
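Putting the map entries and the matching loop together: each entries callable yields items carrying an entry and a check value, each check is compared against the configured conditions, and a match formats the new name template and the message with the offending entry before the fault is reported. A reduced sketch, with the entry getter and the fault reporter stubbed out:

```python
def get_node_daemon_states():
    # Stand-in for the real getter; items carry the entry and its check value.
    return [
        {"entry": "hv1", "check": "run"},
        {"entry": "hv2", "check": "dead"},
    ]


fault_map_entry = {
    "name": "DEAD_NODE_{entry}",
    "entries": get_node_daemon_states,
    "conditions": ["dead", "fenced"],
    "delta": 50,
    "message": "Node {entry} was dead and/or fenced",
}

for item in fault_map_entry["entries"]():
    for condition in fault_map_entry["conditions"]:
        if str(condition) == str(item["check"]):
            fault_name = fault_map_entry["name"].format(entry=item["entry"])
            fault_message = fault_map_entry["message"].format(entry=item["entry"])
            # The real code calls generate_fault(...) here with these values.
            print(fault_name, fault_map_entry["delta"], fault_message)
```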
@@ -587,7 +594,7 @@ class MonitoringInstance(object):

         # Generate a cluster fault if the plugin is in a suboptimal state
         if result.health_delta > 0:
             fault_type = f"plugin.{self.this_node.name}.{result.plugin_name}"
+            fault_name = f"NODE_PLUGIN_{result.plugin_name.upper()}_{self.this_node.name.upper()}"
             fault_time = datetime.now()

             # Map our check results to fault results
@@ -602,11 +609,11 @@ class MonitoringInstance(object):
             generate_fault(
                 self.zkhandler,
                 self.logger,
                 fault_type,
+                fault_name,
                 fault_time,
                 fault_delta,
                 fault_message,
-                fault_detail=None,
+                fault_details=None,
             )
             self.faults += 1
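The fault_detail to fault_details keyword fix above matters because Python rejects unknown keyword arguments at call time. A sketch against a stub whose parameter list is inferred from these hunks (the leading parameters are an assumption, not a verbatim copy of the real signature):

```python
def generate_fault(
    zkhandler,
    logger,
    fault_type,
    fault_name,
    fault_time,
    fault_delta,
    fault_message,
    fault_details=None,
):
    # Stub body for illustration; the real function records the fault in Zookeeper.
    print(f"{fault_name} ({fault_delta}%): {fault_message}")


# Calling with fault_detail=None (the old spelling) would raise:
#   TypeError: generate_fault() got an unexpected keyword argument 'fault_detail'
generate_fault(
    None,  # zkhandler (unused by the stub)
    None,  # logger (unused by the stub)
    "plugin.hv1.disk",  # illustrative fault_type
    "NODE_PLUGIN_DISK_HV1",  # illustrative fault_name
    "2023-12-01 00:00:00",
    10,
    "Disk check plugin reported a warning",
    fault_details=None,
)
```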
@@ -661,7 +668,7 @@ class MonitoringInstance(object):

         self.run_plugins(coordinator_state=coordinator_state)

-        if coordinator_state in ["primary", "secondary", "takeover", "relinquish"]:
+        if coordinator_state in ["primary", "takeover"]:
             self.run_faults(coordinator_state=coordinator_state)

         runtime_end = datetime.now()