Compare commits: f6bea50a0a ... fa900f6212

6 commits, newest first:
fa900f6212
b236127dba
0ae77d7e77
8b5011c266
6ac5b0d02f
3a1b8f0e7a
@@ -125,81 +125,56 @@ def format_info(cluster_information, oformat):
         return json.dumps(cluster_information, indent=4)
 
     # Plain formatting, i.e. human-readable
-    if cluster_information["health"] == "Optimal":
-        health_colour = ansiprint.green()
-    elif cluster_information["health"] == "Maintenance":
+    if cluster_information["maintenance"] == "True":
         health_colour = ansiprint.blue()
-    else:
+    elif cluster_information["health"] > 90:
+        health_colour = ansiprint.green()
+    elif cluster_information["health"] > 50:
         health_colour = ansiprint.yellow()
-
-    if cluster_information["storage_health"] == "Optimal":
-        storage_health_colour = ansiprint.green()
-    elif cluster_information["storage_health"] == "Maintenance":
-        storage_health_colour = ansiprint.blue()
     else:
-        storage_health_colour = ansiprint.yellow()
+        health_colour = ansiprint.red()
 
     ainformation = []
 
-    if oformat == "short":
-        ainformation.append(
-            "{}PVC cluster status:{}".format(ansiprint.bold(), ansiprint.end())
-        )
-        ainformation.append(
-            "{}Cluster health:{} {}{}{}".format(
-                ansiprint.purple(),
-                ansiprint.end(),
-                health_colour,
-                cluster_information["health"],
-                ansiprint.end(),
-            )
-        )
-        if cluster_information["health_msg"]:
-            for line in cluster_information["health_msg"]:
-                ainformation.append(" > {}".format(line))
-        ainformation.append(
-            "{}Storage health:{} {}{}{}".format(
-                ansiprint.purple(),
-                ansiprint.end(),
-                storage_health_colour,
-                cluster_information["storage_health"],
-                ansiprint.end(),
-            )
-        )
-        if cluster_information["storage_health_msg"]:
-            for line in cluster_information["storage_health_msg"]:
-                ainformation.append(" > {}".format(line))
-
-        return "\n".join(ainformation)
-
     ainformation.append(
         "{}PVC cluster status:{}".format(ansiprint.bold(), ansiprint.end())
     )
     ainformation.append("")
 
+    health_text = f"{cluster_information['health']}%"
+    if cluster_information["maintenance"] == "True":
+        health_text += " (maintenance on)"
+
     ainformation.append(
         "{}Cluster health:{} {}{}{}".format(
             ansiprint.purple(),
             ansiprint.end(),
             health_colour,
-            cluster_information["health"],
+            health_text,
             ansiprint.end(),
         )
     )
-    if cluster_information["health_msg"]:
-        for line in cluster_information["health_msg"]:
-            ainformation.append(" > {}".format(line))
-    ainformation.append(
-        "{}Storage health:{} {}{}{}".format(
-            ansiprint.purple(),
-            ansiprint.end(),
-            storage_health_colour,
-            cluster_information["storage_health"],
-            ansiprint.end(),
-        )
-    )
-    if cluster_information["storage_health_msg"]:
-        for line in cluster_information["storage_health_msg"]:
-            ainformation.append(" > {}".format(line))
+    if cluster_information["health_messages"]:
+        health_messages = "\n > ".join(
+            sorted(cluster_information["health_messages"])
+        )
+        ainformation.append(
+            "{}Health messages:{} > {}".format(
+                ansiprint.purple(),
+                ansiprint.end(),
+                health_messages,
+            )
+        )
+    else:
+        ainformation.append(
+            "{}Health messages:{} N/A".format(
+                ansiprint.purple(),
+                ansiprint.end(),
+            )
+        )
+
+    if oformat == "short":
+        return "\n".join(ainformation)
 
     ainformation.append("")
     ainformation.append(
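A note on the hunk above: cluster health changes from a string ("Optimal"/"Maintenance"/"Degraded") to a 0-100 number, with maintenance tracked as a separate flag that only recolours the output. A minimal sketch of the new threshold behaviour, using hypothetical values (pick_health_colour is an illustrative helper, not part of this diff):

def pick_health_colour(health, maintenance):
    # Mirrors the chain above: maintenance wins, then 91-100 is green,
    # 51-90 is yellow, and 50 or below is red
    if maintenance == "True":
        return "blue"
    elif health > 90:
        return "green"
    elif health > 50:
        return "yellow"
    else:
        return "red"

assert pick_health_colour(100, "False") == "green"
assert pick_health_colour(70, "False") == "yellow"
assert pick_health_colour(30, "False") == "red"
assert pick_health_colour(30, "True") == "blue"  # colour masked; the number is still printed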
@@ -158,6 +158,19 @@ def get_status(zkhandler):
     return True, status_data
 
 
+def get_health(zkhandler):
+    primary_node = zkhandler.read("base.config.primary_node")
+    ceph_health = zkhandler.read("base.storage.health").rstrip()
+
+    # Create a data structure for the information
+    status_data = {
+        "type": "health",
+        "primary_node": primary_node,
+        "ceph_data": ceph_health,
+    }
+
+    return True, status_data
+
+
 def get_util(zkhandler):
     primary_node = zkhandler.read("base.config.primary_node")
     ceph_df = zkhandler.read("base.storage.util").rstrip()
@@ -19,7 +19,7 @@
 #
 ###############################################################################
 
-import re
+from json import loads
 
 import daemon_lib.common as common
 import daemon_lib.vm as pvc_vm
@@ -44,13 +44,123 @@ def set_maintenance(zkhandler, maint_state):
     return True, "Successfully set cluster in normal mode"
 
 
+def getClusterHealth(zkhandler, node_list, vm_list, ceph_osd_list):
+    health_delta_map = {
+        "node_stopped": 50,
+        "node_flushed": 10,
+        "vm_stopped": 10,
+        "osd_out": 50,
+        "osd_down": 10,
+        "memory_overprovisioned": 50,
+        "ceph_err": 50,
+        "ceph_warn": 10,
+    }
+
+    # Generate total cluster health numbers
+    cluster_health = 100
+    messages = list()
+
+    for index, node in enumerate(node_list):
+        # Apply node health values to total health number
+        try:
+            node_health_int = int(node["health"])
+        except Exception:
+            node_health_int = 100
+        cluster_health -= 100 - node_health_int
+
+        for entry in node["health_details"]:
+            if entry["health_delta"] > 0:
+                messages.append(
+                    f"{node['name']}: plugin '{entry['name']}': {entry['message']}"
+                )
+
+        # Handle unhealthy node states
+        if node["daemon_state"] not in ["run"]:
+            cluster_health -= health_delta_map["node_stopped"]
+            messages.append(
+                f"cluster: Node {node['name']} in {node['daemon_state'].upper()} daemon state"
+            )
+        elif node["domain_state"] not in ["ready"]:
+            cluster_health -= health_delta_map["node_flushed"]
+            messages.append(
+                f"cluster: Node {node['name']} in {node['domain_state'].upper()} domain state"
+            )
+
+    for index, vm in enumerate(vm_list):
+        # Handle unhealthy VM states
+        if vm["state"] not in ["start", "disable", "migrate", "unmigrate", "provision"]:
+            cluster_health -= health_delta_map["vm_stopped"]
+            messages.append(f"cluster: VM {vm['name']} in {vm['state'].upper()} state")
+
+    for index, ceph_osd in enumerate(ceph_osd_list):
+        in_texts = {1: "in", 0: "out"}
+        up_texts = {1: "up", 0: "down"}
+
+        # Handle unhealthy OSD states
+        if in_texts[ceph_osd["stats"]["in"]] not in ["in"]:
+            cluster_health -= health_delta_map["osd_out"]
+            messages.append(
+                f"cluster: Ceph OSD {ceph_osd['id']} in {in_texts[ceph_osd['stats']['in']].upper()} state"
+            )
+        elif up_texts[ceph_osd["stats"]["up"]] not in ["up"]:
+            cluster_health -= health_delta_map["osd_down"]
+            messages.append(
+                f"cluster: Ceph OSD {ceph_osd['id']} in {up_texts[ceph_osd['stats']['up']].upper()} state"
+            )
+
+    # Check for (n-1) overprovisioning
+    # Assume X nodes. If the total VM memory allocation (counting only running VMs) is greater than
+    # the total memory of the (n-1) smallest nodes, trigger this warning.
+    n_minus_1_total = 0
+    alloc_total = 0
+    node_largest_index = None
+    node_largest_count = 0
+    for index, node in enumerate(node_list):
+        node_mem_total = node["memory"]["total"]
+        node_mem_alloc = node["memory"]["allocated"]
+        alloc_total += node_mem_alloc
+        # Determine if this node is the largest seen so far
+        if node_mem_total > node_largest_count:
+            node_largest_index = index
+            node_largest_count = node_mem_total
+    n_minus_1_node_list = list()
+    for index, node in enumerate(node_list):
+        if index == node_largest_index:
+            continue
+        n_minus_1_node_list.append(node)
+    for index, node in enumerate(n_minus_1_node_list):
+        n_minus_1_total += node["memory"]["total"]
+    if alloc_total > n_minus_1_total:
+        cluster_health -= health_delta_map["memory_overprovisioned"]
+        messages.append(
+            f"cluster: Total memory is OVERPROVISIONED ({alloc_total} > {n_minus_1_total} @ N-1)"
+        )
+
+    # Check Ceph cluster health
+    ceph_health = loads(zkhandler.read("base.storage.health"))
+    ceph_health_status = ceph_health["status"]
+    ceph_health_entries = ceph_health["checks"].keys()
+
+    ceph_health_status_map = {
+        "HEALTH_ERR": "ERROR",
+        "HEALTH_WARN": "WARNING",
+    }
+    for entry in ceph_health_entries:
+        messages.append(
+            f"cluster: Ceph {ceph_health_status_map[ceph_health['checks'][entry]['severity']]} {entry}: {ceph_health['checks'][entry]['summary']['message']}"
+        )
+
+    if ceph_health_status == "HEALTH_ERR":
+        cluster_health -= health_delta_map["ceph_err"]
+    elif ceph_health_status == "HEALTH_WARN":
+        cluster_health -= health_delta_map["ceph_warn"]
+
+    return cluster_health, messages
+
+
 def getClusterInformation(zkhandler):
     # Get cluster maintenance state
-    maint_state = zkhandler.read("base.config.maintenance")
+    maintenance_state = zkhandler.read("base.config.maintenance")
 
-    # List of messages to display to the clients
-    cluster_health_msg = []
-    storage_health_msg = []
-
     # Get node information object list
     retcode, node_list = pvc_node.get_list(zkhandler, None)
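The scoring in getClusterHealth() above is purely subtractive: start at 100, subtract each node's own plugin-health shortfall, then subtract a fixed delta per abnormal state; nothing in this hunk clamps the result, so a badly degraded cluster can score below zero. A worked example with hypothetical inputs:

# Hypothetical cluster: one node at plugin health 90, flushed, plus a Ceph warning
cluster_health = 100
cluster_health -= 100 - 90  # node health shortfall              -> 90
cluster_health -= 10        # health_delta_map["node_flushed"]   -> 80
cluster_health -= 10        # health_delta_map["ceph_warn"]      -> 70
print(cluster_health)       # 70, rendered yellow by the CLI thresholds above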
@@ -78,135 +188,6 @@ def getClusterInformation(zkhandler):
     ceph_volume_count = len(ceph_volume_list)
     ceph_snapshot_count = len(ceph_snapshot_list)
 
-    # Determinations for general cluster health
-    cluster_healthy_status = True
-    # Check for (n-1) overprovisioning
-    # Assume X nodes. If the total VM memory allocation (counting only running VMss) is greater than
-    # the total memory of the (n-1) smallest nodes, trigger this warning.
-    n_minus_1_total = 0
-    alloc_total = 0
-
-    node_largest_index = None
-    node_largest_count = 0
-    for index, node in enumerate(node_list):
-        node_mem_total = node["memory"]["total"]
-        node_mem_alloc = node["memory"]["allocated"]
-        alloc_total += node_mem_alloc
-
-        # Determine if this node is the largest seen so far
-        if node_mem_total > node_largest_count:
-            node_largest_index = index
-            node_largest_count = node_mem_total
-    n_minus_1_node_list = list()
-    for index, node in enumerate(node_list):
-        if index == node_largest_index:
-            continue
-        n_minus_1_node_list.append(node)
-    for index, node in enumerate(n_minus_1_node_list):
-        n_minus_1_total += node["memory"]["total"]
-    if alloc_total > n_minus_1_total:
-        cluster_healthy_status = False
-        cluster_health_msg.append(
-            "Total VM memory ({}) is overprovisioned (max {}) for (n-1) failure scenarios".format(
-                alloc_total, n_minus_1_total
-            )
-        )
-
-    # Determinations for node health
-    node_healthy_status = list(range(0, node_count))
-    node_report_status = list(range(0, node_count))
-    for index, node in enumerate(node_list):
-        daemon_state = node["daemon_state"]
-        domain_state = node["domain_state"]
-        if daemon_state != "run" and domain_state != "ready":
-            node_healthy_status[index] = False
-            cluster_health_msg.append(
-                "Node '{}' in {},{} state".format(
-                    node["name"], daemon_state, domain_state
-                )
-            )
-        else:
-            node_healthy_status[index] = True
-        node_report_status[index] = daemon_state + "," + domain_state
-
-    # Determinations for VM health
-    vm_healthy_status = list(range(0, vm_count))
-    vm_report_status = list(range(0, vm_count))
-    for index, vm in enumerate(vm_list):
-        vm_state = vm["state"]
-        if vm_state not in ["start", "disable", "migrate", "unmigrate", "provision"]:
-            vm_healthy_status[index] = False
-            cluster_health_msg.append(
-                "VM '{}' in {} state".format(vm["name"], vm_state)
-            )
-        else:
-            vm_healthy_status[index] = True
-        vm_report_status[index] = vm_state
-
-    # Determinations for OSD health
-    ceph_osd_healthy_status = list(range(0, ceph_osd_count))
-    ceph_osd_report_status = list(range(0, ceph_osd_count))
-    for index, ceph_osd in enumerate(ceph_osd_list):
-        try:
-            ceph_osd_up = ceph_osd["stats"]["up"]
-        except KeyError:
-            ceph_osd_up = 0
-
-        try:
-            ceph_osd_in = ceph_osd["stats"]["in"]
-        except KeyError:
-            ceph_osd_in = 0
-
-        up_texts = {1: "up", 0: "down"}
-        in_texts = {1: "in", 0: "out"}
-
-        if not ceph_osd_up or not ceph_osd_in:
-            ceph_osd_healthy_status[index] = False
-            cluster_health_msg.append(
-                "OSD {} in {},{} state".format(
-                    ceph_osd["id"], up_texts[ceph_osd_up], in_texts[ceph_osd_in]
-                )
-            )
-        else:
-            ceph_osd_healthy_status[index] = True
-        ceph_osd_report_status[index] = (
-            up_texts[ceph_osd_up] + "," + in_texts[ceph_osd_in]
-        )
-
-    # Find out the overall cluster health; if any element of a healthy_status is false, it's unhealthy
-    if maint_state == "true":
-        cluster_health = "Maintenance"
-    elif (
-        cluster_healthy_status is False
-        or False in node_healthy_status
-        or False in vm_healthy_status
-        or False in ceph_osd_healthy_status
-    ):
-        cluster_health = "Degraded"
-    else:
-        cluster_health = "Optimal"
-
-    # Find out our storage health from Ceph
-    ceph_status = zkhandler.read("base.storage").split("\n")
-    ceph_health = ceph_status[2].split()[-1]
-
-    # Parse the status output to get the health indicators
-    line_record = False
-    for index, line in enumerate(ceph_status):
-        if re.search("services:", line):
-            line_record = False
-        if line_record and len(line.strip()) > 0:
-            storage_health_msg.append(line.strip())
-        if re.search("health:", line):
-            line_record = True
-
-    if maint_state == "true":
-        storage_health = "Maintenance"
-    elif ceph_health != "HEALTH_OK":
-        storage_health = "Degraded"
-    else:
-        storage_health = "Optimal"
-
     # State lists
     node_state_combinations = [
         "run,ready",
@@ -237,13 +218,19 @@ def getClusterInformation(zkhandler):
         "unmigrate",
         "provision",
     ]
-    ceph_osd_state_combinations = ["up,in", "up,out", "down,in", "down,out"]
+    ceph_osd_state_combinations = [
+        "up,in",
+        "up,out",
+        "down,in",
+        "down,out",
+    ]
 
     # Format the Node states
     formatted_node_states = {"total": node_count}
     for state in node_state_combinations:
         state_count = 0
-        for node_state in node_report_status:
+        for node in node_list:
+            node_state = f"{node['daemon_state']},{node['domain_state']}"
             if node_state == state:
                 state_count += 1
         if state_count > 0:
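For reference, the rewritten loop builds the same sparse count map as before, keyed by "daemon_state,domain_state", but derives the key directly from node_list instead of the removed node_report_status list. A hypothetical result for a three-node cluster with one flushed node:

formatted_node_states = {
    "total": 3,
    "run,ready": 2,
    "run,flushed": 1,  # only states with a non-zero count are included
}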
@@ -253,28 +240,35 @@ def getClusterInformation(zkhandler):
     formatted_vm_states = {"total": vm_count}
     for state in vm_state_combinations:
         state_count = 0
-        for vm_state in vm_report_status:
-            if vm_state == state:
+        for vm in vm_list:
+            if vm["state"] == state:
                 state_count += 1
         if state_count > 0:
             formatted_vm_states[state] = state_count
 
     # Format the OSD states
+    up_texts = {1: "up", 0: "down"}
+    in_texts = {1: "in", 0: "out"}
     formatted_osd_states = {"total": ceph_osd_count}
     for state in ceph_osd_state_combinations:
         state_count = 0
-        for ceph_osd_state in ceph_osd_report_status:
+        for ceph_osd in ceph_osd_list:
+            ceph_osd_state = f"{up_texts[ceph_osd['stats']['up']]},{in_texts[ceph_osd['stats']['in']]}"
             if ceph_osd_state == state:
                 state_count += 1
         if state_count > 0:
             formatted_osd_states[state] = state_count
 
+    # Get cluster health data
+    cluster_health, cluster_health_messages = getClusterHealth(
+        zkhandler, node_list, vm_list, ceph_osd_list
+    )
+
     # Format the status data
     cluster_information = {
         "health": cluster_health,
-        "health_msg": cluster_health_msg,
-        "storage_health": storage_health,
-        "storage_health_msg": storage_health_msg,
+        "health_messages": cluster_health_messages,
+        "maintenance": maintenance_state,
         "primary_node": common.getPrimaryNode(zkhandler),
         "upstream_ip": zkhandler.read("base.config.upstream_ip"),
         "nodes": formatted_node_states,
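The client-facing payload thus trades the four old fields (string health, health_msg, storage_health, storage_health_msg) for a numeric health, a single health_messages list, and an explicit maintenance flag. A hypothetical fragment of the new structure (other keys unchanged and omitted):

cluster_information = {
    "health": 90,
    "health_messages": [
        "cluster: Node hv1 in FLUSHED domain state",
    ],
    "maintenance": "False",
}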
@@ -1 +1 @@
-{"version": "9", "root": "", "base": {"root": "", "schema": "/schema", "schema.version": "/schema/version", "config": "/config", "config.maintenance": "/config/maintenance", "config.primary_node": "/config/primary_node", "config.primary_node.sync_lock": "/config/primary_node/sync_lock", "config.upstream_ip": "/config/upstream_ip", "config.migration_target_selector": "/config/migration_target_selector", "cmd": "/cmd", "cmd.node": "/cmd/nodes", "cmd.domain": "/cmd/domains", "cmd.ceph": "/cmd/ceph", "logs": "/logs", "node": "/nodes", "domain": "/domains", "network": "/networks", "storage": "/ceph", "storage.util": "/ceph/util", "osd": "/ceph/osds", "pool": "/ceph/pools", "volume": "/ceph/volumes", "snapshot": "/ceph/snapshots"}, "logs": {"node": "", "messages": "/messages"}, "node": {"name": "", "keepalive": "/keepalive", "mode": "/daemonmode", "data.active_schema": "/activeschema", "data.latest_schema": "/latestschema", "data.static": "/staticdata", "data.pvc_version": "/pvcversion", "running_domains": "/runningdomains", "count.provisioned_domains": "/domainscount", "count.networks": "/networkscount", "state.daemon": "/daemonstate", "state.router": "/routerstate", "state.domain": "/domainstate", "cpu.load": "/cpuload", "vcpu.allocated": "/vcpualloc", "memory.total": "/memtotal", "memory.used": "/memused", "memory.free": "/memfree", "memory.allocated": "/memalloc", "memory.provisioned": "/memprov", "ipmi.hostname": "/ipmihostname", "ipmi.username": "/ipmiusername", "ipmi.password": "/ipmipassword", "sriov": "/sriov", "sriov.pf": "/sriov/pf", "sriov.vf": "/sriov/vf", "monitoring.plugins": "/monitoring_plugins", "monitoring.data": "/monitoring_data", "monitoring.health": "/monitoring_health"}, "monitoring_plugin": {"name": "", "last_run": "/last_run", "health_delta": "/health_delta", "message": "/message", "data": "/data", "runtime": "/runtime"}, "sriov_pf": {"phy": "", "mtu": "/mtu", "vfcount": "/vfcount"}, "sriov_vf": {"phy": "", "pf": "/pf", "mtu": "/mtu", "mac": "/mac", "phy_mac": "/phy_mac", "config": "/config", "config.vlan_id": "/config/vlan_id", "config.vlan_qos": "/config/vlan_qos", "config.tx_rate_min": "/config/tx_rate_min", "config.tx_rate_max": "/config/tx_rate_max", "config.spoof_check": "/config/spoof_check", "config.link_state": "/config/link_state", "config.trust": "/config/trust", "config.query_rss": "/config/query_rss", "pci": "/pci", "pci.domain": "/pci/domain", "pci.bus": "/pci/bus", "pci.slot": "/pci/slot", "pci.function": "/pci/function", "used": "/used", "used_by": "/used_by"}, "domain": {"name": "", "xml": "/xml", "state": "/state", "profile": "/profile", "stats": "/stats", "node": "/node", "last_node": "/lastnode", "failed_reason": "/failedreason", "storage.volumes": "/rbdlist", "console.log": "/consolelog", "console.vnc": "/vnc", "meta.autostart": "/node_autostart", "meta.migrate_method": "/migration_method", "meta.node_selector": "/node_selector", "meta.node_limit": "/node_limit", "meta.tags": "/tags", "migrate.sync_lock": "/migrate_sync_lock"}, "tag": {"name": "", "type": "/type", "protected": "/protected"}, "network": {"vni": "", "type": "/nettype", "mtu": "/mtu", "rule": "/firewall_rules", "rule.in": "/firewall_rules/in", "rule.out": "/firewall_rules/out", "nameservers": "/name_servers", "domain": "/domain", "reservation": "/dhcp4_reservations", "lease": "/dhcp4_leases", "ip4.gateway": "/ip4_gateway", "ip4.network": "/ip4_network", "ip4.dhcp": "/dhcp4_flag", "ip4.dhcp_start": "/dhcp4_start", "ip4.dhcp_end": "/dhcp4_end", "ip6.gateway": "/ip6_gateway", "ip6.network": "/ip6_network", "ip6.dhcp": "/dhcp6_flag"}, "reservation": {"mac": "", "ip": "/ipaddr", "hostname": "/hostname"}, "lease": {"mac": "", "ip": "/ipaddr", "hostname": "/hostname", "expiry": "/expiry", "client_id": "/clientid"}, "rule": {"description": "", "rule": "/rule", "order": "/order"}, "osd": {"id": "", "node": "/node", "device": "/device", "db_device": "/db_device", "fsid": "/fsid", "ofsid": "/fsid/osd", "cfsid": "/fsid/cluster", "lvm": "/lvm", "vg": "/lvm/vg", "lv": "/lvm/lv", "stats": "/stats"}, "pool": {"name": "", "pgs": "/pgs", "tier": "/tier", "stats": "/stats"}, "volume": {"name": "", "stats": "/stats"}, "snapshot": {"name": "", "stats": "/stats"}}
+{"version": "9", "root": "", "base": {"root": "", "schema": "/schema", "schema.version": "/schema/version", "config": "/config", "config.maintenance": "/config/maintenance", "config.primary_node": "/config/primary_node", "config.primary_node.sync_lock": "/config/primary_node/sync_lock", "config.upstream_ip": "/config/upstream_ip", "config.migration_target_selector": "/config/migration_target_selector", "cmd": "/cmd", "cmd.node": "/cmd/nodes", "cmd.domain": "/cmd/domains", "cmd.ceph": "/cmd/ceph", "logs": "/logs", "node": "/nodes", "domain": "/domains", "network": "/networks", "storage": "/ceph", "storage.health": "/ceph/health", "storage.util": "/ceph/util", "osd": "/ceph/osds", "pool": "/ceph/pools", "volume": "/ceph/volumes", "snapshot": "/ceph/snapshots"}, "logs": {"node": "", "messages": "/messages"}, "node": {"name": "", "keepalive": "/keepalive", "mode": "/daemonmode", "data.active_schema": "/activeschema", "data.latest_schema": "/latestschema", "data.static": "/staticdata", "data.pvc_version": "/pvcversion", "running_domains": "/runningdomains", "count.provisioned_domains": "/domainscount", "count.networks": "/networkscount", "state.daemon": "/daemonstate", "state.router": "/routerstate", "state.domain": "/domainstate", "cpu.load": "/cpuload", "vcpu.allocated": "/vcpualloc", "memory.total": "/memtotal", "memory.used": "/memused", "memory.free": "/memfree", "memory.allocated": "/memalloc", "memory.provisioned": "/memprov", "ipmi.hostname": "/ipmihostname", "ipmi.username": "/ipmiusername", "ipmi.password": "/ipmipassword", "sriov": "/sriov", "sriov.pf": "/sriov/pf", "sriov.vf": "/sriov/vf", "monitoring.plugins": "/monitoring_plugins", "monitoring.data": "/monitoring_data", "monitoring.health": "/monitoring_health"}, "monitoring_plugin": {"name": "", "last_run": "/last_run", "health_delta": "/health_delta", "message": "/message", "data": "/data", "runtime": "/runtime"}, "sriov_pf": {"phy": "", "mtu": "/mtu", "vfcount": "/vfcount"}, "sriov_vf": {"phy": "", "pf": "/pf", "mtu": "/mtu", "mac": "/mac", "phy_mac": "/phy_mac", "config": "/config", "config.vlan_id": "/config/vlan_id", "config.vlan_qos": "/config/vlan_qos", "config.tx_rate_min": "/config/tx_rate_min", "config.tx_rate_max": "/config/tx_rate_max", "config.spoof_check": "/config/spoof_check", "config.link_state": "/config/link_state", "config.trust": "/config/trust", "config.query_rss": "/config/query_rss", "pci": "/pci", "pci.domain": "/pci/domain", "pci.bus": "/pci/bus", "pci.slot": "/pci/slot", "pci.function": "/pci/function", "used": "/used", "used_by": "/used_by"}, "domain": {"name": "", "xml": "/xml", "state": "/state", "profile": "/profile", "stats": "/stats", "node": "/node", "last_node": "/lastnode", "failed_reason": "/failedreason", "storage.volumes": "/rbdlist", "console.log": "/consolelog", "console.vnc": "/vnc", "meta.autostart": "/node_autostart", "meta.migrate_method": "/migration_method", "meta.node_selector": "/node_selector", "meta.node_limit": "/node_limit", "meta.tags": "/tags", "migrate.sync_lock": "/migrate_sync_lock"}, "tag": {"name": "", "type": "/type", "protected": "/protected"}, "network": {"vni": "", "type": "/nettype", "mtu": "/mtu", "rule": "/firewall_rules", "rule.in": "/firewall_rules/in", "rule.out": "/firewall_rules/out", "nameservers": "/name_servers", "domain": "/domain", "reservation": "/dhcp4_reservations", "lease": "/dhcp4_leases", "ip4.gateway": "/ip4_gateway", "ip4.network": "/ip4_network", "ip4.dhcp": "/dhcp4_flag", "ip4.dhcp_start": "/dhcp4_start", "ip4.dhcp_end": "/dhcp4_end", "ip6.gateway": "/ip6_gateway", "ip6.network": "/ip6_network", "ip6.dhcp": "/dhcp6_flag"}, "reservation": {"mac": "", "ip": "/ipaddr", "hostname": "/hostname"}, "lease": {"mac": "", "ip": "/ipaddr", "hostname": "/hostname", "expiry": "/expiry", "client_id": "/clientid"}, "rule": {"description": "", "rule": "/rule", "order": "/order"}, "osd": {"id": "", "node": "/node", "device": "/device", "db_device": "/db_device", "fsid": "/fsid", "ofsid": "/fsid/osd", "cfsid": "/fsid/cluster", "lvm": "/lvm", "vg": "/lvm/vg", "lv": "/lvm/lv", "stats": "/stats"}, "pool": {"name": "", "pgs": "/pgs", "tier": "/tier", "stats": "/stats"}, "volume": {"name": "", "stats": "/stats"}, "snapshot": {"name": "", "stats": "/stats"}}
@@ -569,6 +569,7 @@ class ZKSchema(object):
         "domain": f"{_schema_root}/domains",
         "network": f"{_schema_root}/networks",
         "storage": f"{_schema_root}/ceph",
+        "storage.health": f"{_schema_root}/ceph/health",
         "storage.util": f"{_schema_root}/ceph/util",
         "osd": f"{_schema_root}/ceph/osds",
         "pool": f"{_schema_root}/ceph/pools",
@@ -1,129 +0,0 @@
-#!/usr/bin/env python3
-
-# ceph.py - PVC Monitoring example plugin for Ceph status
-# Part of the Parallel Virtual Cluster (PVC) system
-#
-# Copyright (C) 2018-2022 Joshua M. Boniface <joshua@boniface.me>
-#
-# This program is free software: you can redistribute it and/or modify
-# it under the terms of the GNU General Public License as published by
-# the Free Software Foundation, version 3.
-#
-# This program is distributed in the hope that it will be useful,
-# but WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-# GNU General Public License for more details.
-#
-# You should have received a copy of the GNU General Public License
-# along with this program. If not, see <https://www.gnu.org/licenses/>.
-#
-###############################################################################
-
-# This script provides an example of a PVC monitoring plugin script. It will create
-# a simple plugin to check the Ceph cluster health for anomalies, and return a health
-# delta reflective of the overall Ceph status (HEALTH_WARN = 10, HEALTH_ERR = 50).
-
-# This script can thus be used as an example or reference implementation of a
-# PVC monitoring pluginscript and expanded upon as required.
-
-# A monitoring plugin script must implement the class "MonitoringPluginScript" which
-# extends "MonitoringPlugin", providing the 3 functions indicated. Detailed explanation
-# of the role of each function is provided in context of the example; see the other
-# examples for more potential uses.
-
-# WARNING:
-#
-# This script will run in the context of the node daemon keepalives as root.
-# DO NOT install untrusted, unvetted plugins under any circumstances.
-
-
-# This import is always required here, as MonitoringPlugin is used by the
-# MonitoringPluginScript class
-from pvcnoded.objects.MonitoringInstance import MonitoringPlugin
-
-
-# A monitoring plugin script must always expose its nice name, which must be identical to
-# the file name
-PLUGIN_NAME = "ceph"
-
-
-# The MonitoringPluginScript class must be named as such, and extend MonitoringPlugin.
-class MonitoringPluginScript(MonitoringPlugin):
-    def setup(self):
-        """
-        setup(): Perform special setup steps during node daemon startup
-
-        This step is optional and should be used sparingly.
-
-        If you wish for the plugin to not load in certain conditions, do any checks here
-        and return a non-None failure message to indicate the error.
-        """
-
-        pass
-
-    def run(self):
-        """
-        run(): Perform the check actions and return a PluginResult object
-        """
-
-        # Run any imports first
-        from rados import Rados
-        from json import loads, dumps
-
-        # Connect to the Ceph cluster
-        try:
-            ceph_conn = Rados(
-                conffile=self.config["ceph_config_file"],
-                conf=dict(keyring=self.config["ceph_admin_keyring"]),
-            )
-            ceph_conn.connect(timeout=1)
-        except Exception as e:
-            self.log(f"Failed to connect to Ceph cluster: {e}", state="e")
-            return self.plugin_result
-
-        # Get the Ceph cluster health
-        try:
-            health_status = loads(
-                ceph_conn.mon_command(dumps({"prefix": "health", "format": "json"}), b"", timeout=1)[1]
-            )
-            ceph_health = health_status["status"]
-        except Exception as e:
-            self.log(f"Failed to get health data from Ceph cluster: {e}", state="e")
-            return self.plugin_result
-        finally:
-            ceph_conn.shutdown()
-
-        # Get a list of error entries in the health status output
-        error_entries = health_status["checks"].keys()
-
-        # Set the health delta based on the errors presented
-        if ceph_health == "HEALTH_ERR":
-            health_delta = 50
-            message = f"Ceph cluster in ERROR state: {', '.join(error_entries)}"
-        elif ceph_health == "HEALTH_WARN":
-            health_delta = 10
-            message = f"Ceph cluster in WARNING state: {', '.join(error_entries)}"
-        else:
-            health_delta = 0
-            message = "Ceph cluster in OK state"
-
-        # Set the health delta in our local PluginResult object
-        self.plugin_result.set_health_delta(health_delta)
-
-        # Set the message in our local PluginResult object
-        self.plugin_result.set_message(message)
-
-        # Set the detailed data in our local PluginResult object
-        self.plugin_result.set_data(health_status)
-
-        # Return our local PluginResult object
-        return self.plugin_result
-
-    def cleanup(self):
-        """
-        cleanup(): Perform special cleanup steps during node daemon termination
-
-        This step is optional and should be used sparingly.
-        """
-
-        pass
@@ -133,7 +133,7 @@ class MonitoringPluginScript(MonitoringPlugin):
         self.plugin_result.set_health_delta(health_delta)
 
         # Craft the message
-        message = f"Debian {debian_version}; Obsolete conffiles: {count_obsolete_conffiles}; Packages valid: {count_ok}, inconsistent: {count_inconsistent}, upgradable: {count_upgradable}"
+        message = f"Debian {debian_version}; Obsolete conffiles: {count_obsolete_conffiles}; Packages inconsistent: {count_inconsistent}, upgradable: {count_upgradable}"
 
         # Set the message in our local PluginResult object
         self.plugin_result.set_message(message)
@@ -99,9 +99,10 @@ def collect_ceph_stats(logger, config, zkhandler, this_node, queue):
 
     # Primary-only functions
     if this_node.router_state == "primary":
+        # Get Ceph status information (pretty)
        if debug:
             logger.out(
-                "Set ceph health information in zookeeper (primary only)",
+                "Set Ceph status information in zookeeper (primary only)",
                 state="d",
                 prefix="ceph-thread",
             )
|
|||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.out("Failed to set Ceph status data: {}".format(e), state="e")
|
logger.out("Failed to set Ceph status data: {}".format(e), state="e")
|
||||||
|
|
||||||
|
# Get Ceph health information (JSON)
|
||||||
if debug:
|
if debug:
|
||||||
logger.out(
|
logger.out(
|
||||||
"Set ceph rados df information in zookeeper (primary only)",
|
"Set Ceph health information in zookeeper (primary only)",
|
||||||
|
state="d",
|
||||||
|
prefix="ceph-thread",
|
||||||
|
)
|
||||||
|
|
||||||
|
command = {"prefix": "health", "format": "json"}
|
||||||
|
ceph_health = ceph_conn.mon_command(json.dumps(command), b"", timeout=1)[
|
||||||
|
1
|
||||||
|
].decode("ascii")
|
||||||
|
try:
|
||||||
|
zkhandler.write([("base.storage.health", str(ceph_health))])
|
||||||
|
except Exception as e:
|
||||||
|
logger.out("Failed to set Ceph health data: {}".format(e), state="e")
|
||||||
|
|
||||||
|
# Get Ceph df information (pretty)
|
||||||
|
if debug:
|
||||||
|
logger.out(
|
||||||
|
"Set Ceph rados df information in zookeeper (primary only)",
|
||||||
state="d",
|
state="d",
|
||||||
prefix="ceph-thread",
|
prefix="ceph-thread",
|
||||||
)
|
)
|
||||||
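These keepalive hunks close the loop on the new base.storage.health key: the primary coordinator stores the raw JSON from Ceph's "health" mon command, and getClusterHealth() above parses it back with loads(). A small consumer-side sketch, with a hypothetical stored payload:

from json import loads

# Hypothetical value as written to base.storage.health by the keepalive thread
raw = '{"status": "HEALTH_WARN", "checks": {"OSDMAP_FLAGS": {"severity": "HEALTH_WARN", "summary": {"message": "noout flag(s) set"}}}}'

ceph_health = loads(raw)
print(ceph_health["status"])  # HEALTH_WARN
for entry in ceph_health["checks"]:
    print(f"{entry}: {ceph_health['checks'][entry]['summary']['message']}")
# -> OSDMAP_FLAGS: noout flag(s) set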