Improve fault detail handling further
Since we already had a "details" field, simply defer the point at which it is appended to the message: it is now added inside generate_fault, after the main message value has been used to generate the fault ID.
This commit is contained in:
		@@ -21,21 +21,27 @@
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
from datetime import datetime
 | 
					from datetime import datetime
 | 
				
			||||||
from hashlib import md5
 | 
					from hashlib import md5
 | 
				
			||||||
from re import sub
 | 
					 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
def generate_fault(
 | 
					def generate_fault(
 | 
				
			||||||
    zkhandler, logger, fault_name, fault_time, fault_delta, fault_message
 | 
					    zkhandler,
 | 
				
			||||||
 | 
					    logger,
 | 
				
			||||||
 | 
					    fault_name,
 | 
				
			||||||
 | 
					    fault_time,
 | 
				
			||||||
 | 
					    fault_delta,
 | 
				
			||||||
 | 
					    fault_message,
 | 
				
			||||||
 | 
					    fault_details=None,
 | 
				
			||||||
):
 | 
					):
 | 
				
			||||||
    # Strip off any "extra" data from the message (things in brackets)
 | 
					    # Generate a fault ID from the fault_name, fault_delta, and fault_message
 | 
				
			||||||
    fault_core_message = sub(r"[\(\[].*?[\)\]]", "", fault_message).strip()
 | 
					    fault_str = f"{fault_name} {fault_delta} {fault_message}"
 | 
				
			||||||
    # Generate a fault ID from the fault_name, fault_delta, and fault_core_message
 | 
					 | 
				
			||||||
    fault_str = f"{fault_name} {fault_delta} {fault_core_message}"
 | 
					 | 
				
			||||||
    fault_id = str(md5(fault_str.encode("utf-8")).hexdigest())[:8]
 | 
					    fault_id = str(md5(fault_str.encode("utf-8")).hexdigest())[:8]
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    # Strip the microseconds off of the fault time; we don't care about that precision
 | 
					    # Strip the microseconds off of the fault time; we don't care about that precision
 | 
				
			||||||
    fault_time = str(fault_time).split(".")[0]
 | 
					    fault_time = str(fault_time).split(".")[0]
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    if fault_details is not None:
 | 
				
			||||||
 | 
					        fault_message = f"{fault_message}: {fault_details}"
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    # If a fault already exists with this ID, just update the time
 | 
					    # If a fault already exists with this ID, just update the time
 | 
				
			||||||
    if not zkhandler.exists("base.faults"):
 | 
					    if not zkhandler.exists("base.faults"):
 | 
				
			||||||
        logger.out(
 | 
					        logger.out(
 | 
				
			||||||
 
 | 
				
			|||||||
@@ -206,7 +206,7 @@ class MonitoringInstance(object):
 | 
				
			|||||||
                {
 | 
					                {
 | 
				
			||||||
                    "entry": node,
 | 
					                    "entry": node,
 | 
				
			||||||
                    "check": self.zkhandler.read(("node.state.daemon", node)),
 | 
					                    "check": self.zkhandler.read(("node.state.daemon", node)),
 | 
				
			||||||
                    "details": "",
 | 
					                    "details": None,
 | 
				
			||||||
                }
 | 
					                }
 | 
				
			||||||
                for node in self.zkhandler.children("base.node")
 | 
					                for node in self.zkhandler.children("base.node")
 | 
				
			||||||
            ]
 | 
					            ]
 | 
				
			||||||
@@ -219,7 +219,7 @@ class MonitoringInstance(object):
 | 
				
			|||||||
                    "check": loads(self.zkhandler.read(("osd.stats", osd))).get(
 | 
					                    "check": loads(self.zkhandler.read(("osd.stats", osd))).get(
 | 
				
			||||||
                        "in", 0
 | 
					                        "in", 0
 | 
				
			||||||
                    ),
 | 
					                    ),
 | 
				
			||||||
                    "details": "",
 | 
					                    "details": None,
 | 
				
			||||||
                }
 | 
					                }
 | 
				
			||||||
                for osd in self.zkhandler.children("base.osd")
 | 
					                for osd in self.zkhandler.children("base.osd")
 | 
				
			||||||
            ]
 | 
					            ]
 | 
				
			||||||
@@ -271,9 +271,9 @@ class MonitoringInstance(object):
 | 
				
			|||||||
                op_str = "ok"
 | 
					                op_str = "ok"
 | 
				
			||||||
            overprovisioned_memory = [
 | 
					            overprovisioned_memory = [
 | 
				
			||||||
                {
 | 
					                {
 | 
				
			||||||
                    "entry": f"{current_memory_provisioned}MB > {available_node_memory}MB (N-1)",
 | 
					                    "entry": "Cluster memory was overprovisioned",
 | 
				
			||||||
                    "check": op_str,
 | 
					                    "check": op_str,
 | 
				
			||||||
                    "details": "",
 | 
					                    "details": f"{current_memory_provisioned}MB > {available_node_memory}MB (N-1)",
 | 
				
			||||||
                }
 | 
					                }
 | 
				
			||||||
            ]
 | 
					            ]
 | 
				
			||||||
            return overprovisioned_memory
 | 
					            return overprovisioned_memory
 | 
				
			||||||
@@ -296,25 +296,25 @@ class MonitoringInstance(object):
 | 
				
			|||||||
                "entries": get_ceph_health_entries,
 | 
					                "entries": get_ceph_health_entries,
 | 
				
			||||||
                "conditions": ["HEALTH_WARN"],
 | 
					                "conditions": ["HEALTH_WARN"],
 | 
				
			||||||
                "delta": 10,
 | 
					                "delta": 10,
 | 
				
			||||||
                "message": "{entry} reported by Ceph ({details})",
 | 
					                "message": "{entry} reported by Ceph cluster",
 | 
				
			||||||
            },
 | 
					            },
 | 
				
			||||||
            "ceph_err": {
 | 
					            "ceph_err": {
 | 
				
			||||||
                "entries": get_ceph_health_entries,
 | 
					                "entries": get_ceph_health_entries,
 | 
				
			||||||
                "conditions": ["HEALTH_ERR"],
 | 
					                "conditions": ["HEALTH_ERR"],
 | 
				
			||||||
                "delta": 50,
 | 
					                "delta": 50,
 | 
				
			||||||
                "message": "{entry} reported by Ceph ({details})",
 | 
					                "message": "{entry} reported by Ceph cluster",
 | 
				
			||||||
            },
 | 
					            },
 | 
				
			||||||
            "vm_failed": {
 | 
					            "vm_failed": {
 | 
				
			||||||
                "entries": get_vm_states,
 | 
					                "entries": get_vm_states,
 | 
				
			||||||
                "conditions": ["fail"],
 | 
					                "conditions": ["fail"],
 | 
				
			||||||
                "delta": 10,
 | 
					                "delta": 10,
 | 
				
			||||||
                "message": "VM {entry} was failed ({details})",
 | 
					                "message": "VM {entry} was failed",
 | 
				
			||||||
            },
 | 
					            },
 | 
				
			||||||
            "memory_overprovisioned": {
 | 
					            "memory_overprovisioned": {
 | 
				
			||||||
                "entries": get_overprovisioned_memory,
 | 
					                "entries": get_overprovisioned_memory,
 | 
				
			||||||
                "conditions": ["overprovisioned"],
 | 
					                "conditions": ["overprovisioned"],
 | 
				
			||||||
                "delta": 50,
 | 
					                "delta": 50,
 | 
				
			||||||
                "message": "Cluster memory was overprovisioned {entry}",
 | 
					                "message": "{entry}",
 | 
				
			||||||
            },
 | 
					            },
 | 
				
			||||||
        }
 | 
					        }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
@@ -507,7 +507,7 @@ class MonitoringInstance(object):
 | 
				
			|||||||
        )
 | 
					        )
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        for fault_type in self.cluster_faults_map.keys():
 | 
					        for fault_type in self.cluster_faults_map.keys():
 | 
				
			||||||
            fault_details = self.cluster_faults_map[fault_type]
 | 
					            fault_data = self.cluster_faults_map[fault_type]
 | 
				
			||||||
 | 
					
 | 
				
			||||||
            if self.config["log_monitoring_details"] or self.config["debug"]:
 | 
					            if self.config["log_monitoring_details"] or self.config["debug"]:
 | 
				
			||||||
                self.logger.out(
 | 
					                self.logger.out(
 | 
				
			||||||
@@ -515,7 +515,7 @@ class MonitoringInstance(object):
 | 
				
			|||||||
                    state="t",
 | 
					                    state="t",
 | 
				
			||||||
                )
 | 
					                )
 | 
				
			||||||
 | 
					
 | 
				
			||||||
            entries = fault_details["entries"]()
 | 
					            entries = fault_data["entries"]()
 | 
				
			||||||
 | 
					
 | 
				
			||||||
            if self.config["debug"]:
 | 
					            if self.config["debug"]:
 | 
				
			||||||
                self.logger.out(
 | 
					                self.logger.out(
 | 
				
			||||||
@@ -527,13 +527,11 @@ class MonitoringInstance(object):
 | 
				
			|||||||
                entry = _entry["entry"]
 | 
					                entry = _entry["entry"]
 | 
				
			||||||
                check = _entry["check"]
 | 
					                check = _entry["check"]
 | 
				
			||||||
                details = _entry["details"]
 | 
					                details = _entry["details"]
 | 
				
			||||||
                for condition in fault_details["conditions"]:
 | 
					                for condition in fault_data["conditions"]:
 | 
				
			||||||
                    if str(condition) == str(check):
 | 
					                    if str(condition) == str(check):
 | 
				
			||||||
                        fault_time = datetime.now()
 | 
					                        fault_time = datetime.now()
 | 
				
			||||||
                        fault_delta = fault_details["delta"]
 | 
					                        fault_delta = fault_data["delta"]
 | 
				
			||||||
                        fault_message = fault_details["message"].format(
 | 
					                        fault_message = fault_data["message"].format(entry=entry)
 | 
				
			||||||
                            entry=entry, details=details
 | 
					 | 
				
			||||||
                        )
 | 
					 | 
				
			||||||
                        generate_fault(
 | 
					                        generate_fault(
 | 
				
			||||||
                            self.zkhandler,
 | 
					                            self.zkhandler,
 | 
				
			||||||
                            self.logger,
 | 
					                            self.logger,
 | 
				
			||||||
@@ -541,6 +539,7 @@ class MonitoringInstance(object):
 | 
				
			|||||||
                            fault_time,
 | 
					                            fault_time,
 | 
				
			||||||
                            fault_delta,
 | 
					                            fault_delta,
 | 
				
			||||||
                            fault_message,
 | 
					                            fault_message,
 | 
				
			||||||
 | 
					                            fault_details=details,
 | 
				
			||||||
                        )
 | 
					                        )
 | 
				
			||||||
                        self.faults += 1
 | 
					                        self.faults += 1
 | 
				
			||||||
 | 
					
 | 
				
			||||||
@@ -607,6 +606,7 @@ class MonitoringInstance(object):
 | 
				
			|||||||
                    fault_time,
 | 
					                    fault_time,
 | 
				
			||||||
                    fault_delta,
 | 
					                    fault_delta,
 | 
				
			||||||
                    fault_message,
 | 
					                    fault_message,
 | 
				
			||||||
 | 
					                    fault_detail=None,
 | 
				
			||||||
                )
 | 
					                )
 | 
				
			||||||
                self.faults += 1
 | 
					                self.faults += 1
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 
 | 
				
			|||||||
		Reference in New Issue
	
	Block a user