Add full/nearfull OSD health detection

This commit is contained in:
Joshua Boniface 2023-04-28 10:48:28 -04:00
parent 9114255af5
commit c1782c5004
1 changed file with 14 additions and 0 deletions

View File

@ -51,6 +51,8 @@ def getClusterHealth(zkhandler, node_list, vm_list, ceph_osd_list):
"vm_stopped": 10,
"osd_out": 50,
"osd_down": 10,
"osd_full": 50,
"osd_nearfull": 10,
"memory_overprovisioned": 50,
"ceph_err": 50,
"ceph_warn": 10,
@ -110,6 +112,18 @@ def getClusterHealth(zkhandler, node_list, vm_list, ceph_osd_list):
f"cluster: Ceph OSD {ceph_osd['id']} in {up_texts[ceph_osd['stats']['up']].upper()} state"
)
# Handle full (>=90% utilization) or nearfull (>=85% utilization) OSDs
if ceph_osd["stats"]["utilization"] >= 90:
cluster_health_value -= health_delta_map["osd_full"]
cluster_health_messages.append(
f"cluster: Ceph OSD {ceph_osd['id']} is FULL ({ceph_osd['stats']['utilization']:.1f}% > 90%)"
)
elif ceph_osd["stats"]["utilization"] >= 85:
cluster_health_value -= health_delta_map["osd_nearfull"]
cluster_health_messages.append(
f"cluster: Ceph OSD {ceph_osd['id']} is NEARFULL ({ceph_osd['stats']['utilization']:.1f}% > 85%)"
)
# Check for (n-1) overprovisioning
# Assume X nodes. If the total VM memory allocation (counting only running VMs) is greater than
# the total memory of the (n-1) smallest nodes, trigger this warning.