Compare commits: 7e6d922877 ... 47bd7bf2f5

3 commits: 47bd7bf2f5, b9fbfe2ed5, 764e3e3722
@@ -388,22 +388,22 @@ def cli_cluster_fault_list_format_short(CLI_CONFIG, fault_data):
         fault_id_length + fault_status_length + fault_health_delta_length + 2
     )
     detail_header_length = (
-        fault_health_delta_length
+        fault_id_length
         + fault_health_delta_length
         + fault_status_length
         + fault_last_reported_length
         + fault_message_length
         + 3
         - meta_header_length
         + 8
     )

     # Format the string (header)
     fault_list_output.append(
         "{bold}Meta {meta_dashes} Fault {detail_dashes}{end_bold}".format(
             bold=ansii["bold"],
             end_bold=ansii["end"],
             meta_dashes="-" * (meta_header_length - len("Meta ")),
             detail_dashes="-" * (detail_header_length - len("Fault ")),
         )
     )
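For context on the formatter change above, here is a minimal, self-contained sketch of how those header widths turn into the dashed "Meta ... Fault ..." banner. The concrete widths are invented for illustration; only the arithmetic mirrors the diff.

```python
# Illustrative column widths; the real formatter derives these from the
# longest fault ID, status, health delta, last-reported time, and message.
fault_id_length = 8
fault_status_length = 6
fault_health_delta_length = 6
fault_last_reported_length = 19
fault_message_length = 40

meta_header_length = (
    fault_id_length + fault_status_length + fault_health_delta_length + 2
)
detail_header_length = (
    fault_id_length
    + fault_health_delta_length
    + fault_status_length
    + fault_last_reported_length
    + fault_message_length
    + 3
    - meta_header_length
    + 8
)

# Each dash run is padded so the "Meta" and "Fault" labels span their columns.
header = "Meta {meta_dashes} Fault {detail_dashes}".format(
    meta_dashes="-" * (meta_header_length - len("Meta ")),
    detail_dashes="-" * (detail_header_length - len("Fault ")),
)
print(header)
```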
@@ -20,7 +20,6 @@
 ###############################################################################

 from datetime import datetime
-from hashlib import md5


 def generate_fault(
@@ -32,10 +31,6 @@ def generate_fault(
     fault_message,
     fault_details=None,
 ):
-    # Generate a fault ID from the fault_name, fault_delta, and fault_message
-    fault_str = f"{fault_name} {fault_delta} {fault_message}"
-    fault_id = str(md5(fault_str.encode("utf-8")).hexdigest())[:8]
-
     # Strip the microseconds off of the fault time; we don't care about that precision
     fault_time = str(fault_time).split(".")[0]
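The lines removed above derived a deterministic eight-character fault ID by hashing the fault's name, delta, and message. A standalone sketch of that scheme (the helper name is illustrative, not part of the codebase):

```python
from hashlib import md5


def derive_fault_id(fault_name, fault_delta, fault_message):
    # Hash the identifying fields and keep the first 8 hex characters, so the
    # same fault always produces the same short ID across repeated reports.
    fault_str = f"{fault_name} {fault_delta} {fault_message}"
    return md5(fault_str.encode("utf-8")).hexdigest()[:8]


# Example (invented values): identical inputs always yield the same ID.
print(derive_fault_id("VM_FAILED_test1", 10, "VM test1 was failed"))
```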
@@ -45,47 +40,49 @@ def generate_fault(
     # If a fault already exists with this ID, just update the time
     if not zkhandler.exists("base.faults"):
         logger.out(
-            f"Skipping fault reporting for {fault_id} due to missing Zookeeper schemas",
+            f"Skipping fault reporting for {fault_name} due to missing Zookeeper schemas",
             state="w",
         )
         return

     existing_faults = zkhandler.children("base.faults")
-    if fault_id in existing_faults:
+    if fault_name in existing_faults:
         logger.out(
-            f"Updating fault {fault_id}: {fault_message} @ {fault_time}", state="i"
+            f"Updating fault {fault_name}: {fault_message} @ {fault_time}", state="i"
         )
     else:
         logger.out(
-            f"Generating fault {fault_id}: {fault_message} @ {fault_time}",
+            f"Generating fault {fault_name}: {fault_message} @ {fault_time}",
             state="i",
         )

     if zkhandler.read("base.config.maintenance") == "true":
         logger.out(
-            f"Skipping fault reporting for {fault_id} due to maintenance mode",
+            f"Skipping fault reporting for {fault_name} due to maintenance mode",
             state="w",
         )
         return

-    if fault_id in existing_faults:
+    # Update an existing fault
+    if fault_name in existing_faults:
         zkhandler.write(
             [
-                (("faults.last_time", fault_id), fault_time),
-                (("faults.message", fault_id), fault_message),
+                (("faults.last_time", fault_name), fault_time),
+                (("faults.delta", fault_name), fault_delta),
+                (("faults.message", fault_name), fault_message),
             ]
         )
-    # Otherwise, generate a new fault event
+    # Generate a new fault
     else:
         zkhandler.write(
             [
-                (("faults.id", fault_id), ""),
-                (("faults.first_time", fault_id), fault_time),
-                (("faults.last_time", fault_id), fault_time),
-                (("faults.ack_time", fault_id), ""),
-                (("faults.status", fault_id), "new"),
-                (("faults.delta", fault_id), fault_delta),
-                (("faults.message", fault_id), fault_message),
+                (("faults.id", fault_name), ""),
+                (("faults.first_time", fault_name), fault_time),
+                (("faults.last_time", fault_name), fault_time),
+                (("faults.ack_time", fault_name), ""),
+                (("faults.status", fault_name), "new"),
+                (("faults.delta", fault_name), fault_delta),
+                (("faults.message", fault_name), fault_message),
             ]
         )
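To make the update-versus-create branching above easier to follow in isolation, here is a condensed sketch with the Zookeeper handler replaced by a plain dict. The key names mirror the faults.* schema paths, but the storage itself is only a stand-in.

```python
from datetime import datetime

fault_store = {}  # stand-in for the Zookeeper-backed faults tree


def record_fault(fault_name, fault_delta, fault_message):
    # Strip microseconds, as the real code does for fault timestamps.
    fault_time = str(datetime.now()).split(".")[0]
    if fault_name in fault_store:
        # Update an existing fault: refresh last_time, delta, and message.
        fault_store[fault_name].update(
            last_time=fault_time, delta=fault_delta, message=fault_message
        )
    else:
        # Generate a new fault with full metadata and a "new" status.
        fault_store[fault_name] = dict(
            first_time=fault_time,
            last_time=fault_time,
            ack_time="",
            status="new",
            delta=fault_delta,
            message=fault_message,
        )


record_fault("VM_FAILED_test1", 10, "VM test1 was failed")
record_fault("VM_FAILED_test1", 10, "VM test1 was failed")  # second report only updates
print(fault_store)
```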
@@ -228,7 +228,7 @@ class MonitoringInstance(object):
         def get_ceph_health_entries():
             ceph_health_entries = [
                 {
-                    "entry": f"{value['severity']} {key}",
+                    "entry": key,
                     "check": value["severity"],
                     "details": value["summary"]["message"],
                 }
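Ceph's health report exposes a checks mapping keyed by check name, each carrying a severity and a summary message; the unchanged lines above flatten that mapping, and the tweak keeps only the check name in the entry field. A sketch with invented sample data:

```python
# Invented sample of a Ceph health "checks" structure, for illustration only.
ceph_health = {
    "checks": {
        "OSDMAP_FLAGS": {
            "severity": "HEALTH_WARN",
            "summary": {"message": "noout flag(s) set"},
        },
    }
}

ceph_health_entries = [
    {
        "entry": key,  # check name only, e.g. OSDMAP_FLAGS
        "check": value["severity"],  # HEALTH_WARN or HEALTH_ERR
        "details": value["summary"]["message"],  # human-readable summary
    }
    for key, value in ceph_health["checks"].items()
]
print(ceph_health_entries)
```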
@@ -281,36 +281,42 @@ class MonitoringInstance(object):
         # This is a list of all possible faults (cluster error messages) and their corresponding details
         self.cluster_faults_map = {
             "dead_or_fenced_node": {
+                "name": "DEAD_NODE_{entry}",
                 "entries": get_node_daemon_states,
                 "conditions": ["dead", "fenced"],
                 "delta": 50,
                 "message": "Node {entry} was dead and/or fenced",
             },
             "ceph_osd_out": {
+                "name": "CEPH_OSD_OUT_{entry}",
                 "entries": get_osd_in_states,
                 "conditions": ["0"],
                 "delta": 50,
                 "message": "OSD {entry} was marked out",
             },
             "ceph_warn": {
+                "name": "CEPH_WARN_{entry}",
                 "entries": get_ceph_health_entries,
                 "conditions": ["HEALTH_WARN"],
                 "delta": 10,
                 "message": "{entry} reported by Ceph cluster",
             },
             "ceph_err": {
+                "name": "CEPH_ERR_{entry}",
                 "entries": get_ceph_health_entries,
                 "conditions": ["HEALTH_ERR"],
                 "delta": 50,
                 "message": "{entry} reported by Ceph cluster",
             },
             "vm_failed": {
+                "name": "VM_FAILED_{entry}",
                 "entries": get_vm_states,
                 "conditions": ["fail"],
                 "delta": 10,
                 "message": "VM {entry} was failed",
             },
             "memory_overprovisioned": {
+                "name": "MEMORY_OVERPROVISIONED",
                 "entries": get_overprovisioned_memory,
                 "conditions": ["overprovisioned"],
                 "delta": 50,
@@ -531,11 +537,12 @@ class MonitoringInstance(object):
                 if str(condition) == str(check):
                     fault_time = datetime.now()
                     fault_delta = fault_data["delta"]
+                    fault_name = fault_data["name"].format(entry=entry)
                     fault_message = fault_data["message"].format(entry=entry)
                     generate_fault(
                         self.zkhandler,
                         self.logger,
                         fault_type,
+                        fault_name,
                         fault_time,
                         fault_delta,
                         fault_message,
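Putting the map entries and the matching loop together: each entries callable yields items carrying an entry and a check value, each check is compared against the configured conditions, and a match formats the new name template and the message with the offending entry before the fault is reported. A reduced sketch, with the entry getter and the fault reporter stubbed out:

```python
def get_node_daemon_states():
    # Stand-in for the real getter; items carry the entry and its check value.
    return [
        {"entry": "hv1", "check": "run"},
        {"entry": "hv2", "check": "dead"},
    ]


fault_map_entry = {
    "name": "DEAD_NODE_{entry}",
    "entries": get_node_daemon_states,
    "conditions": ["dead", "fenced"],
    "delta": 50,
    "message": "Node {entry} was dead and/or fenced",
}

for item in fault_map_entry["entries"]():
    for condition in fault_map_entry["conditions"]:
        if str(condition) == str(item["check"]):
            fault_name = fault_map_entry["name"].format(entry=item["entry"])
            fault_message = fault_map_entry["message"].format(entry=item["entry"])
            # The real code calls generate_fault(...) here with these values.
            print(fault_name, fault_map_entry["delta"], fault_message)
```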
@@ -587,7 +594,7 @@ class MonitoringInstance(object):

         # Generate a cluster fault if the plugin is in a suboptimal state
         if result.health_delta > 0:
             fault_type = f"plugin.{self.this_node.name}.{result.plugin_name}"
+            fault_name = f"NODE_PLUGIN_{result.plugin_name.upper()}_{self.this_node.name.upper()}"
             fault_time = datetime.now()

             # Map our check results to fault results
@@ -602,11 +609,11 @@ class MonitoringInstance(object):
             generate_fault(
                 self.zkhandler,
                 self.logger,
                 fault_type,
+                fault_name,
                 fault_time,
                 fault_delta,
                 fault_message,
-                fault_detail=None,
+                fault_details=None,
             )
             self.faults += 1
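The fault_detail to fault_details keyword fix above matters because Python rejects unknown keyword arguments at call time. A sketch against a stub whose parameter list is inferred from these hunks (the leading parameters are an assumption, not a verbatim copy of the real signature):

```python
def generate_fault(
    zkhandler,
    logger,
    fault_type,
    fault_name,
    fault_time,
    fault_delta,
    fault_message,
    fault_details=None,
):
    # Stub body for illustration; the real function records the fault in Zookeeper.
    print(f"{fault_name} ({fault_delta}%): {fault_message}")


# Calling with fault_detail=None (the old spelling) would raise:
#   TypeError: generate_fault() got an unexpected keyword argument 'fault_detail'
generate_fault(
    None,  # zkhandler (unused by the stub)
    None,  # logger (unused by the stub)
    "plugin.hv1.disk",  # illustrative fault_type
    "NODE_PLUGIN_DISK_HV1",  # illustrative fault_name
    "2023-12-01 00:00:00",
    10,
    "Disk check plugin reported a warning",
    fault_details=None,
)
```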
@@ -661,7 +668,7 @@ class MonitoringInstance(object):

         self.run_plugins(coordinator_state=coordinator_state)

-        if coordinator_state in ["primary", "secondary", "takeover", "relinquish"]:
+        if coordinator_state in ["primary", "takeover"]:
             self.run_faults(coordinator_state=coordinator_state)

         runtime_end = datetime.now()