Compare commits

6 Commits

f6bea50a0a ... fa900f6212
| Author | SHA1 | Date |
|---|---|---|
| | fa900f6212 | |
| | b236127dba | |
| | 0ae77d7e77 | |
| | 8b5011c266 | |
| | 6ac5b0d02f | |
| | 3a1b8f0e7a | |

@@ -125,81 +125,56 @@ def format_info(cluster_information, oformat):
        return json.dumps(cluster_information, indent=4)

    # Plain formatting, i.e. human-readable
    if cluster_information["health"] == "Optimal":
        health_colour = ansiprint.green()
    elif cluster_information["health"] == "Maintenance":
    if cluster_information["maintenance"] == "True":
        health_colour = ansiprint.blue()
    else:
    elif cluster_information["health"] > 90:
        health_colour = ansiprint.green()
    elif cluster_information["health"] > 50:
        health_colour = ansiprint.yellow()

    if cluster_information["storage_health"] == "Optimal":
        storage_health_colour = ansiprint.green()
    elif cluster_information["storage_health"] == "Maintenance":
        storage_health_colour = ansiprint.blue()
    else:
        storage_health_colour = ansiprint.yellow()
        health_colour = ansiprint.red()

    ainformation = []

    if oformat == "short":
        ainformation.append(
            "{}PVC cluster status:{}".format(ansiprint.bold(), ansiprint.end())
        )
        ainformation.append(
            "{}Cluster health:{}      {}{}{}".format(
                ansiprint.purple(),
                ansiprint.end(),
                health_colour,
                cluster_information["health"],
                ansiprint.end(),
            )
        )
        if cluster_information["health_msg"]:
            for line in cluster_information["health_msg"]:
                ainformation.append("                     > {}".format(line))
        ainformation.append(
            "{}Storage health:{}      {}{}{}".format(
                ansiprint.purple(),
                ansiprint.end(),
                storage_health_colour,
                cluster_information["storage_health"],
                ansiprint.end(),
            )
        )
        if cluster_information["storage_health_msg"]:
            for line in cluster_information["storage_health_msg"]:
                ainformation.append("                     > {}".format(line))

        return "\n".join(ainformation)

    ainformation.append(
        "{}PVC cluster status:{}".format(ansiprint.bold(), ansiprint.end())
    )
    ainformation.append("")

    health_text = f"{cluster_information['health']}%"
    if cluster_information["maintenance"] == "True":
        health_text += " (maintenance on)"

    ainformation.append(
        "{}Cluster health:{}      {}{}{}".format(
        "{}Cluster health:{}  {}{}{}".format(
            ansiprint.purple(),
            ansiprint.end(),
            health_colour,
            cluster_information["health"],
            health_text,
            ansiprint.end(),
        )
    )
    if cluster_information["health_msg"]:
        for line in cluster_information["health_msg"]:
            ainformation.append("                     > {}".format(line))
    ainformation.append(
        "{}Storage health:{}      {}{}{}".format(
            ansiprint.purple(),
            ansiprint.end(),
            storage_health_colour,
            cluster_information["storage_health"],
            ansiprint.end(),
    if cluster_information["health_messages"]:
        health_messages = "\n                 > ".join(
            sorted(cluster_information["health_messages"])
        )
    )
    if cluster_information["storage_health_msg"]:
        for line in cluster_information["storage_health_msg"]:
            ainformation.append("                     > {}".format(line))
        ainformation.append(
            "{}Health messages:{} > {}".format(
                ansiprint.purple(),
                ansiprint.end(),
                health_messages,
            )
        )
    else:
        ainformation.append(
            "{}Health messages:{} N/A".format(
                ansiprint.purple(),
                ansiprint.end(),
            )
        )

    if oformat == "short":
        return "\n".join(ainformation)

    ainformation.append("")
    ainformation.append(
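
The hunk above replaces the old Optimal/Maintenance strings with a numeric health percentage and threshold-based colouring. A minimal sketch of the resulting logic, using raw ANSI escapes in place of PVC's `ansiprint` helper (the escape codes and the `render_health` name are illustrative, not from this PR):

```python
# Illustrative sketch of the new CLI colouring rules: maintenance is blue,
# >90% green, >50% yellow, anything else red; maintenance is noted in the text.
GREEN, YELLOW, RED, BLUE, END = "\033[92m", "\033[93m", "\033[91m", "\033[94m", "\033[0m"

def render_health(health: int, maintenance: bool) -> str:
    if maintenance:
        colour = BLUE
    elif health > 90:
        colour = GREEN
    elif health > 50:
        colour = YELLOW
    else:
        colour = RED
    text = f"{health}%" + (" (maintenance on)" if maintenance else "")
    return f"{colour}{text}{END}"

print(render_health(100, False))  # green "100%"
print(render_health(40, True))    # blue "40% (maintenance on)"
```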

@@ -158,6 +158,19 @@ def get_status(zkhandler):
    return True, status_data


def get_health(zkhandler):
    primary_node = zkhandler.read("base.config.primary_node")
    ceph_health = zkhandler.read("base.storage.health").rstrip()

    # Create a data structure for the information
    status_data = {
        "type": "health",
        "primary_node": primary_node,
        "ceph_data": ceph_health,
    }
    return True, status_data


def get_util(zkhandler):
    primary_node = zkhandler.read("base.config.primary_node")
    ceph_df = zkhandler.read("base.storage.util").rstrip()
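
For context, a self-contained usage sketch of the new `get_health()` shape added above; the stub handler and its canned values are hypothetical stand-ins for the real `ZKHandler`:

```python
# Stub ZK handler with canned values (illustrative only; not part of the PR).
class StubZKHandler:
    def read(self, key):
        return {
            "base.config.primary_node": "hv1",
            "base.storage.health": '{"status": "HEALTH_OK", "checks": {}}\n',
        }[key]

def get_health(zkhandler):
    # Same shape as the function added in the hunk above
    return True, {
        "type": "health",
        "primary_node": zkhandler.read("base.config.primary_node"),
        "ceph_data": zkhandler.read("base.storage.health").rstrip(),
    }

retflag, status_data = get_health(StubZKHandler())
print(status_data)
```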

@@ -19,7 +19,7 @@
#
###############################################################################

import re
from json import loads

import daemon_lib.common as common
import daemon_lib.vm as pvc_vm

@@ -44,13 +44,123 @@ def set_maintenance(zkhandler, maint_state):
        return True, "Successfully set cluster in normal mode"


def getClusterHealth(zkhandler, node_list, vm_list, ceph_osd_list):
    health_delta_map = {
        "node_stopped": 50,
        "node_flushed": 10,
        "vm_stopped": 10,
        "osd_out": 50,
        "osd_down": 10,
        "memory_overprovisioned": 50,
        "ceph_err": 50,
        "ceph_warn": 10,
    }

    # Generate total cluster health numbers
    cluster_health = 100
    messages = list()

    for index, node in enumerate(node_list):
        # Apply node health values to total health number
        try:
            node_health_int = int(node["health"])
        except Exception:
            node_health_int = 100
        cluster_health -= 100 - node_health_int

        for entry in node["health_details"]:
            if entry["health_delta"] > 0:
                messages.append(
                    f"{node['name']}: plugin '{entry['name']}': {entry['message']}"
                )

        # Handle unhealthy node states
        if node["daemon_state"] not in ["run"]:
            cluster_health -= health_delta_map["node_stopped"]
            messages.append(
                f"cluster: Node {node['name']} in {node['daemon_state'].upper()} daemon state"
            )
        elif node["domain_state"] not in ["ready"]:
            cluster_health -= health_delta_map["node_flushed"]
            messages.append(
                f"cluster: Node {node['name']} in {node['domain_state'].upper()} domain state"
            )

    for index, vm in enumerate(vm_list):
        # Handle unhealthy VM states
        if vm["state"] not in ["start", "disable", "migrate", "unmigrate", "provision"]:
            cluster_health -= health_delta_map["vm_stopped"]
            messages.append(f"cluster: VM {vm['name']} in {vm['state'].upper()} state")

    for index, ceph_osd in enumerate(ceph_osd_list):
        in_texts = {1: "in", 0: "out"}
        up_texts = {1: "up", 0: "down"}

        # Handle unhealthy OSD states
        if in_texts[ceph_osd["stats"]["in"]] not in ["in"]:
            cluster_health -= health_delta_map["osd_out"]
            messages.append(
                f"cluster: Ceph OSD {ceph_osd['id']} in {in_texts[ceph_osd['stats']['in']].upper()} state"
            )
        elif up_texts[ceph_osd["stats"]["up"]] not in ["up"]:
            cluster_health -= health_delta_map["osd_down"]
            messages.append(
                f"cluster: Ceph OSD {ceph_osd['id']} in {up_texts[ceph_osd['stats']['up']].upper()} state"
            )

    # Check for (n-1) overprovisioning
    #   Assume X nodes. If the total VM memory allocation (counting only running VMs) is greater than
    #   the total memory of the (n-1) smallest nodes, trigger this warning.
    n_minus_1_total = 0
    alloc_total = 0
    node_largest_index = None
    node_largest_count = 0
    for index, node in enumerate(node_list):
        node_mem_total = node["memory"]["total"]
        node_mem_alloc = node["memory"]["allocated"]
        alloc_total += node_mem_alloc
        # Determine if this node is the largest seen so far
        if node_mem_total > node_largest_count:
            node_largest_index = index
            node_largest_count = node_mem_total
    n_minus_1_node_list = list()
    for index, node in enumerate(node_list):
        if index == node_largest_index:
            continue
        n_minus_1_node_list.append(node)
    for index, node in enumerate(n_minus_1_node_list):
        n_minus_1_total += node["memory"]["total"]
    if alloc_total > n_minus_1_total:
        cluster_health -= health_delta_map["memory_overprovisioned"]
        messages.append(
            f"cluster: Total memory is OVERPROVISIONED ({alloc_total} > {n_minus_1_total} @ N-1)"
        )

    # Check Ceph cluster health
    ceph_health = loads(zkhandler.read("base.storage.health"))
    ceph_health_status = ceph_health["status"]
    ceph_health_entries = ceph_health["checks"].keys()

    ceph_health_status_map = {
        "HEALTH_ERR": "ERROR",
        "HEALTH_WARN": "WARNING",
    }
    for entry in ceph_health_entries:
        messages.append(
            f"cluster: Ceph {ceph_health_status_map[ceph_health['checks'][entry]['severity']]} {entry}: {ceph_health['checks'][entry]['summary']['message']}"
        )

    if ceph_health_status == "HEALTH_ERR":
        cluster_health -= health_delta_map["ceph_err"]
    elif ceph_health_status == "HEALTH_WARN":
        cluster_health -= health_delta_map["ceph_warn"]

    return cluster_health, messages


def getClusterInformation(zkhandler):
    # Get cluster maintenance state
    maint_state = zkhandler.read("base.config.maintenance")

    # List of messages to display to the clients
    cluster_health_msg = []
    storage_health_msg = []
    maintenance_state = zkhandler.read("base.config.maintenance")

    # Get node information object list
    retcode, node_list = pvc_node.get_list(zkhandler, None)
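
To make the scoring model above concrete, a worked toy example with hypothetical numbers: the score starts at 100 and is decremented by per-node health shortfalls, fixed deltas from `health_delta_map`, and the N-1 memory check (here simplified to "sum minus the largest node", which is what excluding the single largest node amounts to):

```python
# Hypothetical cluster: one node reporting health 75, one flushed node,
# Ceph in HEALTH_WARN, and memory overprovisioned at N-1.
health_delta_map = {"node_flushed": 10, "ceph_warn": 10, "memory_overprovisioned": 50}

cluster_health = 100
cluster_health -= 100 - 75                           # node health shortfall: -25
cluster_health -= health_delta_map["node_flushed"]   # flushed node: -10
cluster_health -= health_delta_map["ceph_warn"]      # Ceph HEALTH_WARN: -10

# N-1 check: three nodes with 64, 64 and 128 GiB; drop the largest (128),
# so allocations above 64 + 64 = 128 GiB count as overprovisioned.
node_totals = [64, 64, 128]
alloc_total = 140
n_minus_1_total = sum(node_totals) - max(node_totals)
if alloc_total > n_minus_1_total:
    cluster_health -= health_delta_map["memory_overprovisioned"]  # -50

print(cluster_health)  # 5
```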

@@ -78,135 +188,6 @@ def getClusterInformation(zkhandler):
    ceph_volume_count = len(ceph_volume_list)
    ceph_snapshot_count = len(ceph_snapshot_list)

    # Determinations for general cluster health
    cluster_healthy_status = True
    # Check for (n-1) overprovisioning
    #   Assume X nodes. If the total VM memory allocation (counting only running VMs) is greater than
    #   the total memory of the (n-1) smallest nodes, trigger this warning.
    n_minus_1_total = 0
    alloc_total = 0

    node_largest_index = None
    node_largest_count = 0
    for index, node in enumerate(node_list):
        node_mem_total = node["memory"]["total"]
        node_mem_alloc = node["memory"]["allocated"]
        alloc_total += node_mem_alloc

        # Determine if this node is the largest seen so far
        if node_mem_total > node_largest_count:
            node_largest_index = index
            node_largest_count = node_mem_total
    n_minus_1_node_list = list()
    for index, node in enumerate(node_list):
        if index == node_largest_index:
            continue
        n_minus_1_node_list.append(node)
    for index, node in enumerate(n_minus_1_node_list):
        n_minus_1_total += node["memory"]["total"]
    if alloc_total > n_minus_1_total:
        cluster_healthy_status = False
        cluster_health_msg.append(
            "Total VM memory ({}) is overprovisioned (max {}) for (n-1) failure scenarios".format(
                alloc_total, n_minus_1_total
            )
        )

    # Determinations for node health
    node_healthy_status = list(range(0, node_count))
    node_report_status = list(range(0, node_count))
    for index, node in enumerate(node_list):
        daemon_state = node["daemon_state"]
        domain_state = node["domain_state"]
        if daemon_state != "run" and domain_state != "ready":
            node_healthy_status[index] = False
            cluster_health_msg.append(
                "Node '{}' in {},{} state".format(
                    node["name"], daemon_state, domain_state
                )
            )
        else:
            node_healthy_status[index] = True
        node_report_status[index] = daemon_state + "," + domain_state

    # Determinations for VM health
    vm_healthy_status = list(range(0, vm_count))
    vm_report_status = list(range(0, vm_count))
    for index, vm in enumerate(vm_list):
        vm_state = vm["state"]
        if vm_state not in ["start", "disable", "migrate", "unmigrate", "provision"]:
            vm_healthy_status[index] = False
            cluster_health_msg.append(
                "VM '{}' in {} state".format(vm["name"], vm_state)
            )
        else:
            vm_healthy_status[index] = True
        vm_report_status[index] = vm_state

    # Determinations for OSD health
    ceph_osd_healthy_status = list(range(0, ceph_osd_count))
    ceph_osd_report_status = list(range(0, ceph_osd_count))
    for index, ceph_osd in enumerate(ceph_osd_list):
        try:
            ceph_osd_up = ceph_osd["stats"]["up"]
        except KeyError:
            ceph_osd_up = 0

        try:
            ceph_osd_in = ceph_osd["stats"]["in"]
        except KeyError:
            ceph_osd_in = 0

        up_texts = {1: "up", 0: "down"}
        in_texts = {1: "in", 0: "out"}

        if not ceph_osd_up or not ceph_osd_in:
            ceph_osd_healthy_status[index] = False
            cluster_health_msg.append(
                "OSD {} in {},{} state".format(
                    ceph_osd["id"], up_texts[ceph_osd_up], in_texts[ceph_osd_in]
                )
            )
        else:
            ceph_osd_healthy_status[index] = True
        ceph_osd_report_status[index] = (
            up_texts[ceph_osd_up] + "," + in_texts[ceph_osd_in]
        )

    # Find out the overall cluster health; if any element of a healthy_status is false, it's unhealthy
    if maint_state == "true":
        cluster_health = "Maintenance"
    elif (
        cluster_healthy_status is False
        or False in node_healthy_status
        or False in vm_healthy_status
        or False in ceph_osd_healthy_status
    ):
        cluster_health = "Degraded"
    else:
        cluster_health = "Optimal"

    # Find out our storage health from Ceph
    ceph_status = zkhandler.read("base.storage").split("\n")
    ceph_health = ceph_status[2].split()[-1]

    # Parse the status output to get the health indicators
    line_record = False
    for index, line in enumerate(ceph_status):
        if re.search("services:", line):
            line_record = False
        if line_record and len(line.strip()) > 0:
            storage_health_msg.append(line.strip())
        if re.search("health:", line):
            line_record = True

    if maint_state == "true":
        storage_health = "Maintenance"
    elif ceph_health != "HEALTH_OK":
        storage_health = "Degraded"
    else:
        storage_health = "Optimal"

    # State lists
    node_state_combinations = [
        "run,ready",

@@ -237,13 +218,19 @@ def getClusterInformation(zkhandler):
        "unmigrate",
        "provision",
    ]
    ceph_osd_state_combinations = ["up,in", "up,out", "down,in", "down,out"]
    ceph_osd_state_combinations = [
        "up,in",
        "up,out",
        "down,in",
        "down,out",
    ]

    # Format the Node states
    formatted_node_states = {"total": node_count}
    for state in node_state_combinations:
        state_count = 0
        for node_state in node_report_status:
        for node in node_list:
            node_state = f"{node['daemon_state']},{node['domain_state']}"
            if node_state == state:
                state_count += 1
        if state_count > 0:
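
The reworked tally above derives each node's state string on the fly instead of consulting the removed report-status lists. An equivalent sketch of the same counting pattern using `collections.Counter` (sample data, not the PR's code; note the real code only keeps the known state combinations):

```python
from collections import Counter

# Sample node list with daemon,domain state pairs (hypothetical data)
node_list = [
    {"daemon_state": "run", "domain_state": "ready"},
    {"daemon_state": "run", "domain_state": "flushed"},
    {"daemon_state": "run", "domain_state": "ready"},
]
states = Counter(f"{n['daemon_state']},{n['domain_state']}" for n in node_list)
formatted_node_states = {"total": len(node_list), **states}
print(formatted_node_states)  # {'total': 3, 'run,ready': 2, 'run,flushed': 1}
```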

@@ -253,28 +240,35 @@ def getClusterInformation(zkhandler):
    formatted_vm_states = {"total": vm_count}
    for state in vm_state_combinations:
        state_count = 0
        for vm_state in vm_report_status:
            if vm_state == state:
        for vm in vm_list:
            if vm["state"] == state:
                state_count += 1
        if state_count > 0:
            formatted_vm_states[state] = state_count

    # Format the OSD states
    up_texts = {1: "up", 0: "down"}
    in_texts = {1: "in", 0: "out"}
    formatted_osd_states = {"total": ceph_osd_count}
    for state in ceph_osd_state_combinations:
        state_count = 0
        for ceph_osd_state in ceph_osd_report_status:
        for ceph_osd in ceph_osd_list:
            ceph_osd_state = f"{up_texts[ceph_osd['stats']['up']]},{in_texts[ceph_osd['stats']['in']]}"
            if ceph_osd_state == state:
                state_count += 1
        if state_count > 0:
            formatted_osd_states[state] = state_count

    # Get cluster health data
    cluster_health, cluster_health_messages = getClusterHealth(
        zkhandler, node_list, vm_list, ceph_osd_list
    )

    # Format the status data
    cluster_information = {
        "health": cluster_health,
        "health_msg": cluster_health_msg,
        "storage_health": storage_health,
        "storage_health_msg": storage_health_msg,
        "health_messages": cluster_health_messages,
        "maintenance": maintenance_state,
        "primary_node": common.getPrimaryNode(zkhandler),
        "upstream_ip": zkhandler.read("base.config.upstream_ip"),
        "nodes": formatted_node_states,

@@ -1 +1 @@
(The two one-line schema documents below differ only in the addition of "storage.health": "/ceph/health" under "base".)

{"version": "9", "root": "", "base": {"root": "", "schema": "/schema", "schema.version": "/schema/version", "config": "/config", "config.maintenance": "/config/maintenance", "config.primary_node": "/config/primary_node", "config.primary_node.sync_lock": "/config/primary_node/sync_lock", "config.upstream_ip": "/config/upstream_ip", "config.migration_target_selector": "/config/migration_target_selector", "cmd": "/cmd", "cmd.node": "/cmd/nodes", "cmd.domain": "/cmd/domains", "cmd.ceph": "/cmd/ceph", "logs": "/logs", "node": "/nodes", "domain": "/domains", "network": "/networks", "storage": "/ceph", "storage.util": "/ceph/util", "osd": "/ceph/osds", "pool": "/ceph/pools", "volume": "/ceph/volumes", "snapshot": "/ceph/snapshots"}, "logs": {"node": "", "messages": "/messages"}, "node": {"name": "", "keepalive": "/keepalive", "mode": "/daemonmode", "data.active_schema": "/activeschema", "data.latest_schema": "/latestschema", "data.static": "/staticdata", "data.pvc_version": "/pvcversion", "running_domains": "/runningdomains", "count.provisioned_domains": "/domainscount", "count.networks": "/networkscount", "state.daemon": "/daemonstate", "state.router": "/routerstate", "state.domain": "/domainstate", "cpu.load": "/cpuload", "vcpu.allocated": "/vcpualloc", "memory.total": "/memtotal", "memory.used": "/memused", "memory.free": "/memfree", "memory.allocated": "/memalloc", "memory.provisioned": "/memprov", "ipmi.hostname": "/ipmihostname", "ipmi.username": "/ipmiusername", "ipmi.password": "/ipmipassword", "sriov": "/sriov", "sriov.pf": "/sriov/pf", "sriov.vf": "/sriov/vf", "monitoring.plugins": "/monitoring_plugins", "monitoring.data": "/monitoring_data", "monitoring.health": "/monitoring_health"}, "monitoring_plugin": {"name": "", "last_run": "/last_run", "health_delta": "/health_delta", "message": "/message", "data": "/data", "runtime": "/runtime"}, "sriov_pf": {"phy": "", "mtu": "/mtu", "vfcount": "/vfcount"}, "sriov_vf": {"phy": "", "pf": "/pf", "mtu": "/mtu", "mac": "/mac", "phy_mac": "/phy_mac", "config": "/config", "config.vlan_id": "/config/vlan_id", "config.vlan_qos": "/config/vlan_qos", "config.tx_rate_min": "/config/tx_rate_min", "config.tx_rate_max": "/config/tx_rate_max", "config.spoof_check": "/config/spoof_check", "config.link_state": "/config/link_state", "config.trust": "/config/trust", "config.query_rss": "/config/query_rss", "pci": "/pci", "pci.domain": "/pci/domain", "pci.bus": "/pci/bus", "pci.slot": "/pci/slot", "pci.function": "/pci/function", "used": "/used", "used_by": "/used_by"}, "domain": {"name": "", "xml": "/xml", "state": "/state", "profile": "/profile", "stats": "/stats", "node": "/node", "last_node": "/lastnode", "failed_reason": "/failedreason", "storage.volumes": "/rbdlist", "console.log": "/consolelog", "console.vnc": "/vnc", "meta.autostart": "/node_autostart", "meta.migrate_method": "/migration_method", "meta.node_selector": "/node_selector", "meta.node_limit": "/node_limit", "meta.tags": "/tags", "migrate.sync_lock": "/migrate_sync_lock"}, "tag": {"name": "", "type": "/type", "protected": "/protected"}, "network": {"vni": "", "type": "/nettype", "mtu": "/mtu", "rule": "/firewall_rules", "rule.in": "/firewall_rules/in", "rule.out": "/firewall_rules/out", "nameservers": "/name_servers", "domain": "/domain", "reservation": "/dhcp4_reservations", "lease": "/dhcp4_leases", "ip4.gateway": "/ip4_gateway", "ip4.network": "/ip4_network", "ip4.dhcp": "/dhcp4_flag", "ip4.dhcp_start": "/dhcp4_start", "ip4.dhcp_end": "/dhcp4_end", "ip6.gateway": "/ip6_gateway", "ip6.network": "/ip6_network", "ip6.dhcp": "/dhcp6_flag"}, "reservation": {"mac": "", "ip": "/ipaddr", "hostname": "/hostname"}, "lease": {"mac": "", "ip": "/ipaddr", "hostname": "/hostname", "expiry": "/expiry", "client_id": "/clientid"}, "rule": {"description": "", "rule": "/rule", "order": "/order"}, "osd": {"id": "", "node": "/node", "device": "/device", "db_device": "/db_device", "fsid": "/fsid", "ofsid": "/fsid/osd", "cfsid": "/fsid/cluster", "lvm": "/lvm", "vg": "/lvm/vg", "lv": "/lvm/lv", "stats": "/stats"}, "pool": {"name": "", "pgs": "/pgs", "tier": "/tier", "stats": "/stats"}, "volume": {"name": "", "stats": "/stats"}, "snapshot": {"name": "", "stats": "/stats"}}

{"version": "9", "root": "", "base": {"root": "", "schema": "/schema", "schema.version": "/schema/version", "config": "/config", "config.maintenance": "/config/maintenance", "config.primary_node": "/config/primary_node", "config.primary_node.sync_lock": "/config/primary_node/sync_lock", "config.upstream_ip": "/config/upstream_ip", "config.migration_target_selector": "/config/migration_target_selector", "cmd": "/cmd", "cmd.node": "/cmd/nodes", "cmd.domain": "/cmd/domains", "cmd.ceph": "/cmd/ceph", "logs": "/logs", "node": "/nodes", "domain": "/domains", "network": "/networks", "storage": "/ceph", "storage.health": "/ceph/health", "storage.util": "/ceph/util", "osd": "/ceph/osds", "pool": "/ceph/pools", "volume": "/ceph/volumes", "snapshot": "/ceph/snapshots"}, "logs": {"node": "", "messages": "/messages"}, "node": {"name": "", "keepalive": "/keepalive", "mode": "/daemonmode", "data.active_schema": "/activeschema", "data.latest_schema": "/latestschema", "data.static": "/staticdata", "data.pvc_version": "/pvcversion", "running_domains": "/runningdomains", "count.provisioned_domains": "/domainscount", "count.networks": "/networkscount", "state.daemon": "/daemonstate", "state.router": "/routerstate", "state.domain": "/domainstate", "cpu.load": "/cpuload", "vcpu.allocated": "/vcpualloc", "memory.total": "/memtotal", "memory.used": "/memused", "memory.free": "/memfree", "memory.allocated": "/memalloc", "memory.provisioned": "/memprov", "ipmi.hostname": "/ipmihostname", "ipmi.username": "/ipmiusername", "ipmi.password": "/ipmipassword", "sriov": "/sriov", "sriov.pf": "/sriov/pf", "sriov.vf": "/sriov/vf", "monitoring.plugins": "/monitoring_plugins", "monitoring.data": "/monitoring_data", "monitoring.health": "/monitoring_health"}, "monitoring_plugin": {"name": "", "last_run": "/last_run", "health_delta": "/health_delta", "message": "/message", "data": "/data", "runtime": "/runtime"}, "sriov_pf": {"phy": "", "mtu": "/mtu", "vfcount": "/vfcount"}, "sriov_vf": {"phy": "", "pf": "/pf", "mtu": "/mtu", "mac": "/mac", "phy_mac": "/phy_mac", "config": "/config", "config.vlan_id": "/config/vlan_id", "config.vlan_qos": "/config/vlan_qos", "config.tx_rate_min": "/config/tx_rate_min", "config.tx_rate_max": "/config/tx_rate_max", "config.spoof_check": "/config/spoof_check", "config.link_state": "/config/link_state", "config.trust": "/config/trust", "config.query_rss": "/config/query_rss", "pci": "/pci", "pci.domain": "/pci/domain", "pci.bus": "/pci/bus", "pci.slot": "/pci/slot", "pci.function": "/pci/function", "used": "/used", "used_by": "/used_by"}, "domain": {"name": "", "xml": "/xml", "state": "/state", "profile": "/profile", "stats": "/stats", "node": "/node", "last_node": "/lastnode", "failed_reason": "/failedreason", "storage.volumes": "/rbdlist", "console.log": "/consolelog", "console.vnc": "/vnc", "meta.autostart": "/node_autostart", "meta.migrate_method": "/migration_method", "meta.node_selector": "/node_selector", "meta.node_limit": "/node_limit", "meta.tags": "/tags", "migrate.sync_lock": "/migrate_sync_lock"}, "tag": {"name": "", "type": "/type", "protected": "/protected"}, "network": {"vni": "", "type": "/nettype", "mtu": "/mtu", "rule": "/firewall_rules", "rule.in": "/firewall_rules/in", "rule.out": "/firewall_rules/out", "nameservers": "/name_servers", "domain": "/domain", "reservation": "/dhcp4_reservations", "lease": "/dhcp4_leases", "ip4.gateway": "/ip4_gateway", "ip4.network": "/ip4_network", "ip4.dhcp": "/dhcp4_flag", "ip4.dhcp_start": "/dhcp4_start", "ip4.dhcp_end": "/dhcp4_end", "ip6.gateway": "/ip6_gateway", "ip6.network": "/ip6_network", "ip6.dhcp": "/dhcp6_flag"}, "reservation": {"mac": "", "ip": "/ipaddr", "hostname": "/hostname"}, "lease": {"mac": "", "ip": "/ipaddr", "hostname": "/hostname", "expiry": "/expiry", "client_id": "/clientid"}, "rule": {"description": "", "rule": "/rule", "order": "/order"}, "osd": {"id": "", "node": "/node", "device": "/device", "db_device": "/db_device", "fsid": "/fsid", "ofsid": "/fsid/osd", "cfsid": "/fsid/cluster", "lvm": "/lvm", "vg": "/lvm/vg", "lv": "/lvm/lv", "stats": "/stats"}, "pool": {"name": "", "pgs": "/pgs", "tier": "/tier", "stats": "/stats"}, "volume": {"name": "", "stats": "/stats"}, "snapshot": {"name": "", "stats": "/stats"}}

@@ -569,6 +569,7 @@ class ZKSchema(object):
            "domain": f"{_schema_root}/domains",
            "network": f"{_schema_root}/networks",
            "storage": f"{_schema_root}/ceph",
            "storage.health": f"{_schema_root}/ceph/health",
            "storage.util": f"{_schema_root}/ceph/util",
            "osd": f"{_schema_root}/ceph/osds",
            "pool": f"{_schema_root}/ceph/pools",

@@ -1,129 +0,0 @@
#!/usr/bin/env python3

# ceph.py - PVC Monitoring example plugin for Ceph status
# Part of the Parallel Virtual Cluster (PVC) system
#
#    Copyright (C) 2018-2022 Joshua M. Boniface <joshua@boniface.me>
#
#    This program is free software: you can redistribute it and/or modify
#    it under the terms of the GNU General Public License as published by
#    the Free Software Foundation, version 3.
#
#    This program is distributed in the hope that it will be useful,
#    but WITHOUT ANY WARRANTY; without even the implied warranty of
#    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#    GNU General Public License for more details.
#
#    You should have received a copy of the GNU General Public License
#    along with this program.  If not, see <https://www.gnu.org/licenses/>.
#
###############################################################################

# This script provides an example of a PVC monitoring plugin script. It will create
# a simple plugin to check the Ceph cluster health for anomalies, and return a health
# delta reflective of the overall Ceph status (HEALTH_WARN = 10, HEALTH_ERR = 50).

# This script can thus be used as an example or reference implementation of a
# PVC monitoring plugin script and expanded upon as required.

# A monitoring plugin script must implement the class "MonitoringPluginScript" which
# extends "MonitoringPlugin", providing the 3 functions indicated. Detailed explanation
# of the role of each function is provided in context of the example; see the other
# examples for more potential uses.

# WARNING:
#
# This script will run in the context of the node daemon keepalives as root.
# DO NOT install untrusted, unvetted plugins under any circumstances.


# This import is always required here, as MonitoringPlugin is used by the
# MonitoringPluginScript class
from pvcnoded.objects.MonitoringInstance import MonitoringPlugin


# A monitoring plugin script must always expose its nice name, which must be identical to
# the file name
PLUGIN_NAME = "ceph"


# The MonitoringPluginScript class must be named as such, and extend MonitoringPlugin.
class MonitoringPluginScript(MonitoringPlugin):
    def setup(self):
        """
        setup(): Perform special setup steps during node daemon startup

        This step is optional and should be used sparingly.

        If you wish for the plugin to not load in certain conditions, do any checks here
        and return a non-None failure message to indicate the error.
        """

        pass

    def run(self):
        """
        run(): Perform the check actions and return a PluginResult object
        """

        # Run any imports first
        from rados import Rados
        from json import loads, dumps

        # Connect to the Ceph cluster
        try:
            ceph_conn = Rados(
                conffile=self.config["ceph_config_file"],
                conf=dict(keyring=self.config["ceph_admin_keyring"]),
            )
            ceph_conn.connect(timeout=1)
        except Exception as e:
            self.log(f"Failed to connect to Ceph cluster: {e}", state="e")
            return self.plugin_result

        # Get the Ceph cluster health
        try:
            health_status = loads(
                ceph_conn.mon_command(dumps({"prefix": "health", "format": "json"}), b"", timeout=1)[1]
            )
            ceph_health = health_status["status"]
        except Exception as e:
            self.log(f"Failed to get health data from Ceph cluster: {e}", state="e")
            return self.plugin_result
        finally:
            ceph_conn.shutdown()

        # Get a list of error entries in the health status output
        error_entries = health_status["checks"].keys()

        # Set the health delta based on the errors presented
        if ceph_health == "HEALTH_ERR":
            health_delta = 50
            message = f"Ceph cluster in ERROR state: {', '.join(error_entries)}"
        elif ceph_health == "HEALTH_WARN":
            health_delta = 10
            message = f"Ceph cluster in WARNING state: {', '.join(error_entries)}"
        else:
            health_delta = 0
            message = "Ceph cluster in OK state"

        # Set the health delta in our local PluginResult object
        self.plugin_result.set_health_delta(health_delta)

        # Set the message in our local PluginResult object
        self.plugin_result.set_message(message)

        # Set the detailed data in our local PluginResult object
        self.plugin_result.set_data(health_status)

        # Return our local PluginResult object
        return self.plugin_result

    def cleanup(self):
        """
        cleanup(): Perform special cleanup steps during node daemon termination

        This step is optional and should be used sparingly.
        """

        pass
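
The deleted example above documents the plugin contract: a module-level PLUGIN_NAME matching the file name, and a MonitoringPluginScript class providing setup(), run() and cleanup(), with run() returning self.plugin_result after setting a health delta and message. A minimal hypothetical plugin against that same contract (the load check, its threshold, and the "load" name are invented for illustration; this only runs inside a PVC node daemon):

```python
# A minimal sketch of a PVC monitoring plugin using the documented API.
from pvcnoded.objects.MonitoringInstance import MonitoringPlugin

PLUGIN_NAME = "load"

class MonitoringPluginScript(MonitoringPlugin):
    def setup(self):
        pass

    def run(self):
        from os import getloadavg
        one, five, fifteen = getloadavg()
        # Deduct health if the 1-minute load is high (threshold is illustrative)
        self.plugin_result.set_health_delta(10 if one > 4.0 else 0)
        self.plugin_result.set_message(f"Load averages: {one}, {five}, {fifteen}")
        return self.plugin_result

    def cleanup(self):
        pass
```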

@@ -133,7 +133,7 @@ class MonitoringPluginScript(MonitoringPlugin):
        self.plugin_result.set_health_delta(health_delta)

        # Craft the message
        message = f"Debian {debian_version}; Obsolete conffiles: {count_obsolete_conffiles}; Packages valid: {count_ok}, inconsistent: {count_inconsistent}, upgradable: {count_upgradable}"
        message = f"Debian {debian_version}; Obsolete conffiles: {count_obsolete_conffiles}; Packages inconsistent: {count_inconsistent}, upgradable: {count_upgradable}"

        # Set the message in our local PluginResult object
        self.plugin_result.set_message(message)

@@ -99,9 +99,10 @@ def collect_ceph_stats(logger, config, zkhandler, this_node, queue):

    # Primary-only functions
    if this_node.router_state == "primary":
        # Get Ceph status information (pretty)
        if debug:
            logger.out(
                "Set ceph health information in zookeeper (primary only)",
                "Set Ceph status information in zookeeper (primary only)",
                state="d",
                prefix="ceph-thread",
            )

@@ -115,9 +116,27 @@ def collect_ceph_stats(logger, config, zkhandler, this_node, queue):
        except Exception as e:
            logger.out("Failed to set Ceph status data: {}".format(e), state="e")

        # Get Ceph health information (JSON)
        if debug:
            logger.out(
                "Set ceph rados df information in zookeeper (primary only)",
                "Set Ceph health information in zookeeper (primary only)",
                state="d",
                prefix="ceph-thread",
            )

        command = {"prefix": "health", "format": "json"}
        ceph_health = ceph_conn.mon_command(json.dumps(command), b"", timeout=1)[
            1
        ].decode("ascii")
        try:
            zkhandler.write([("base.storage.health", str(ceph_health))])
        except Exception as e:
            logger.out("Failed to set Ceph health data: {}".format(e), state="e")

        # Get Ceph df information (pretty)
        if debug:
            logger.out(
                "Set Ceph rados df information in zookeeper (primary only)",
                state="d",
                prefix="ceph-thread",
            )
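
The keepalive above now stores the raw `ceph health --format json` document at base.storage.health, which getClusterHealth() in daemon_lib parses with `loads`. A small illustration with sample data (the health check shown is invented) of how that JSON maps to the client-facing messages:

```python
from json import loads

# Sample Ceph health document of the shape written to base.storage.health
sample = (
    '{"status": "HEALTH_WARN", "checks": {"OSDMAP_FLAGS": '
    '{"severity": "HEALTH_WARN", "summary": {"message": "noout flag(s) set"}}}}'
)
ceph_health = loads(sample)

# Same severity mapping and message shape as getClusterHealth()
severity_map = {"HEALTH_ERR": "ERROR", "HEALTH_WARN": "WARNING"}
for name, check in ceph_health["checks"].items():
    print(
        f"cluster: Ceph {severity_map[check['severity']]} {name}: "
        f"{check['summary']['message']}"
    )
# -> cluster: Ceph WARNING OSDMAP_FLAGS: noout flag(s) set
```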