Add Ceph metrics proxy and health fault counts

This commit is contained in:
Joshua Boniface 2023-12-09 02:52:08 -05:00
parent 7373bfed3f
commit 9b3c9f1be5
2 changed files with 99 additions and 8 deletions

View File

@ -639,13 +639,35 @@ class API_Metrics(Resource):
400: 400:
description: Bad request description: Bad request
""" """
status_data, status_retcode = api_helper.cluster_status() return api_helper.cluster_metrics()
return api_helper.cluster_format_metrics(status_data, status_retcode)
api.add_resource(API_Metrics, "/metrics") api.add_resource(API_Metrics, "/metrics")
# /metrics/ceph
class API_Metrics_Ceph(Resource):
def get(self):
"""
Return the current PVC Ceph Prometheus metrics
Proxies a metrics request to the current active MGR, since this is dynamic
and can't be controlled by PVC easily.
---
tags:
- root
responses:
200:
description: OK
400:
description: Bad request
"""
return api_helper.cluster_ceph_metrics_proxy()
api.add_resource(API_Metrics_Ceph, "/metrics/ceph")
# /faults # /faults
class API_Faults(Resource): class API_Faults(Resource):
@RequestParser( @RequestParser(

View File

@ -23,6 +23,8 @@ import flask
import json import json
import lxml.etree as etree import lxml.etree as etree
from re import match
from requests import get
from werkzeug.formparser import parse_form_data from werkzeug.formparser import parse_form_data
from pvcapid.Daemon import config, strtobool from pvcapid.Daemon import config, strtobool
@ -124,18 +126,24 @@ def cluster_maintenance(zkhandler, maint_state="false"):
# #
@pvc_common.Profiler(config) @pvc_common.Profiler(config)
@ZKConnection(config) @ZKConnection(config)
def cluster_format_metrics(zkhandler, status_data, status_retcode): def cluster_metrics(zkhandler):
""" """
Format status data from cluster_status into Prometheus-compatible metrics Format status data from cluster_status into Prometheus-compatible metrics
""" """
from flask import make_response
if status_retcode != 200: # Get general cluster information
return "Error: Status data threw error", status_retcode status_retflag, status_data = pvc_cluster.get_info(zkhandler)
if not status_retflag:
return "Error: Status data threw error", 400
print(status_data)
faults_retflag, faults_data = pvc_faults.get_list(zkhandler)
if not faults_retflag:
return "Error: Faults data threw error", 400
print(faults_data)
retcode = 200 retcode = 200
output_lines = list() output_lines = list()
print(status_data)
output_lines.append("# HELP pvc_info PVC cluster information") output_lines.append("# HELP pvc_info PVC cluster information")
output_lines.append("# TYPE pvc_info gauge") output_lines.append("# TYPE pvc_info gauge")
@ -153,6 +161,19 @@ def cluster_format_metrics(zkhandler, status_data, status_retcode):
output_lines.append("# TYPE pvc_cluster_health gauge") output_lines.append("# TYPE pvc_cluster_health gauge")
output_lines.append(f"pvc_cluster_health {status_data['cluster_health']['health']}") output_lines.append(f"pvc_cluster_health {status_data['cluster_health']['health']}")
output_lines.append("# HELP pvc_cluster_faults PVC cluster new faults")
output_lines.append("# TYPE pvc_cluster_faults gauge")
fault_map = dict()
for fault in faults_data:
if not fault_map.get(fault["status"]):
fault_map[fault["status"]] = 1
else:
fault_map[fault["status"]] += 1
for fault_type in fault_map:
output_lines.append(
f'pvc_cluster_faults{{status="{fault_type}"}} {fault_map[fault_type]}'
)
# output_lines.append("# HELP pvc_cluster_faults PVC cluster health faults") # output_lines.append("# HELP pvc_cluster_faults PVC cluster health faults")
# output_lines.append("# TYPE pvc_cluster_faults gauge") # output_lines.append("# TYPE pvc_cluster_faults gauge")
# for fault_msg in status_data["cluster_health"]["messages"]: # for fault_msg in status_data["cluster_health"]["messages"]:
@ -204,7 +225,55 @@ def cluster_format_metrics(zkhandler, status_data, status_retcode):
output_lines.append(f"pvc_snapshots {status_data['snapshots']}") output_lines.append(f"pvc_snapshots {status_data['snapshots']}")
# We manually make the Flask response here so the output format is correct. # We manually make the Flask response here so the output format is correct.
response = make_response("\n".join(output_lines) + "\n", retcode) response = flask.make_response("\n".join(output_lines) + "\n", retcode)
response.mimetype = "text/plain"
return response
@pvc_common.Profiler(config)
@ZKConnection(config)
def cluster_ceph_metrics_proxy(zkhandler):
"""
Obtain current Ceph Prometheus metrics from the active MGR
"""
# We have to parse out the *name* of the currently active MGR
# While the JSON version of the "ceph status" output provides a
# URL, this URL is in the backend (i.e. storage) network, which
# the API might not have access to. This way, we can connect to
# the node name which can be handled however.
retcode, retdata = pvc_ceph.get_status(zkhandler)
if not retcode:
ceph_mgr_node = None
else:
ceph_data = retdata["ceph_data"]
try:
ceph_mgr_line = [
n for n in ceph_data.split("\n") if match(r"^mgr:", n.strip())
][0]
ceph_mgr_node = ceph_mgr_line.split()[1].split("(")[0]
except Exception:
ceph_mgr_node = None
if ceph_mgr_node is not None:
# Get the data from the endpoint
# We use the default port of 9283
ceph_prometheus_uri = f"http://{ceph_mgr_node}:9283/metrics"
response = get(ceph_prometheus_uri)
if response.status_code == 200:
status_code = 200
output = response.text
else:
status_code = 400
output = (
f"Error: Failed to obtain metric data from {ceph_mgr_node} MGR daemon\n"
)
else:
status_code = 400
output = "Error: Failed to find an active MGR node\n"
# We manually make the Flask response here so the output format is correct.
response = flask.make_response(output, status_code)
response.mimetype = "text/plain" response.mimetype = "text/plain"
return response return response