# /metrics/ceph
# Flask-RESTful resource exposing the Ceph Prometheus metrics through the PVC API.
class API_Metrics_Ceph(Resource):
    # Handles HTTP GET; all of the work is delegated to the API helper module.
    # NOTE(review): the part of the docstring after "---" looks like an API
    # (Swagger-style) spec that may be parsed at runtime - keep its text and
    # indentation intact; confirm against the API documentation tooling.
    def get(self):
        """
        Return the current PVC Ceph Prometheus metrics

        Proxies a metrics request to the current active MGR, since this is dynamic
        and can't be controlled by PVC easily.
        ---
        tags:
          - root
        responses:
          200:
            description: OK
          400:
            description: Bad request
        """
        # The helper's return value is handed back unchanged as the response.
        return api_helper.cluster_ceph_metrics_proxy()


# Register the resource on the API under the /metrics/ceph path.
api.add_resource(API_Metrics_Ceph, "/metrics/ceph")
def _find_active_ceph_mgr(ceph_status_text):
    """
    Extract the active MGR node name from plain-text "ceph status" output

    Looks at the first line that starts with "mgr:" and returns the name that
    immediately precedes the opening parenthesis, e.g. "hv1" for
    "mgr: hv1(active, since 2d), standbys: hv2".

    Returns None when there is no "mgr:" line or when that line does not name
    a daemon in that form (e.g. "mgr: no daemons active"), so that callers
    never try to contact a bogus hostname.
    """
    for line in ceph_status_text.split("\n"):
        line = line.strip()
        if not line.startswith("mgr:"):
            continue
        mgr_match = match(r"mgr:\s+([^\s(]+)\(", line)
        return mgr_match.group(1) if mgr_match else None
    return None


@pvc_common.Profiler(config)
@ZKConnection(config)
def cluster_ceph_metrics_proxy(zkhandler):
    """
    Obtain current Ceph Prometheus metrics from the active MGR

    Returns a text/plain Flask response: the MGR's metrics body with status
    200 on success, or an "Error: ..." line with status 400 when no active MGR
    can be determined or the MGR cannot be queried successfully.
    """
    # Imported locally because the module only imports "get" from requests.
    from requests.exceptions import RequestException

    # We have to parse out the *name* of the currently active MGR
    # While the JSON version of the "ceph status" output provides a
    # URL, this URL is in the backend (i.e. storage) network, which
    # the API might not have access to. This way, we can connect to
    # the node name which can be handled however.
    retflag, retdata = pvc_ceph.get_status(zkhandler)

    ceph_mgr_node = None
    if retflag:
        try:
            ceph_data = retdata["ceph_data"]
        except (KeyError, TypeError):
            ceph_data = None
        if isinstance(ceph_data, str):
            ceph_mgr_node = _find_active_ceph_mgr(ceph_data)

    if ceph_mgr_node is None:
        status_code = 400
        output = "Error: Failed to find an active MGR node\n"
    else:
        # Get the data from the endpoint
        # We use the default port of 9283
        ceph_prometheus_uri = f"http://{ceph_mgr_node}:9283/metrics"
        try:
            # Always bound the request: without a timeout an unresponsive MGR
            # would block this API worker indefinitely.
            response = get(ceph_prometheus_uri, timeout=10)
        except RequestException:
            # Unreachable/unresolvable MGR or timeout: report it like any
            # other upstream failure instead of raising out of the handler.
            response = None

        if response is not None and response.status_code == 200:
            status_code = 200
            output = response.text
        else:
            status_code = 400
            output = (
                f"Error: Failed to obtain metric data from {ceph_mgr_node} MGR daemon\n"
            )

    # We manually make the Flask response here so the output format is correct.
    response = flask.make_response(output, status_code)
    response.mimetype = "text/plain"
    return response