Add Ceph metrics proxy and health fault counts

2023-12-09 02:52:08 -05:00
parent 7373bfed3f
commit 9b3c9f1be5
2 changed files with 99 additions and 8 deletions
--- a/api-daemon/pvcapid/flaskapi.py
+++ b/api-daemon/pvcapid/flaskapi.py
@@ -639,13 +639,35 @@ class API_Metrics(Resource):
          400:
            description: Bad request
        """
-        status_data, status_retcode = api_helper.cluster_status()
+        return api_helper.cluster_metrics()
        return api_helper.cluster_format_metrics(status_data, status_retcode)
 api.add_resource(API_Metrics, "/metrics")
 # /metrics/ceph
 class API_Metrics_Ceph(Resource):
    # REST resource exposing Ceph's Prometheus metrics; it is registered at
    # "/metrics/ceph" by the api.add_resource() call that follows this class.
    # NOTE(review): the docstring section after "---" looks like an
    # OpenAPI-style spec, presumably read by a Swagger generator at runtime --
    # confirm before rewording it.
    def get(self):
        """
        Return the current PVC Ceph Prometheus metrics
        Proxies a metrics request to the current active MGR, since this is dynamic
        and can't be controlled by PVC easily.
        ---
        tags:
          - root
        responses:
          200:
            description: OK
          400:
            description: Bad request
        """
        # All work is delegated to the API helper; whatever it returns is
        # passed back to the caller unchanged.
        return api_helper.cluster_ceph_metrics_proxy()
 api.add_resource(API_Metrics_Ceph, "/metrics/ceph")
 # /faults
 class API_Faults(Resource):
    @RequestParser(
--- a/api-daemon/pvcapid/helper.py
+++ b/api-daemon/pvcapid/helper.py
@@ -23,6 +23,8 @@ import flask
 import json
 import lxml.etree as etree
 from re import match
 from requests import get
 from werkzeug.formparser import parse_form_data
 from pvcapid.Daemon import config, strtobool
@@ -124,18 +126,24 @@ def cluster_maintenance(zkhandler, maint_state="false"):
 #
@pvc_common.Profiler(config)
@ZKConnection(config)
-def cluster_format_metrics(zkhandler, status_data, status_retcode):
+def cluster_metrics(zkhandler):
    """
    Format status data from cluster_status into Prometheus-compatible metrics
    """
    from flask import make_response
-    if status_retcode != 200:
+    # Get general cluster information
-        return "Error: Status data threw error", status_retcode
+    status_retflag, status_data = pvc_cluster.get_info(zkhandler)
    if not status_retflag:
        return "Error: Status data threw error", 400
    print(status_data)
    faults_retflag, faults_data = pvc_faults.get_list(zkhandler)
    if not faults_retflag:
        return "Error: Faults data threw error", 400
    print(faults_data)
    retcode = 200
    output_lines = list()
    print(status_data)
    output_lines.append("# HELP pvc_info PVC cluster information")
    output_lines.append("# TYPE pvc_info gauge")
@@ -153,6 +161,19 @@ def cluster_format_metrics(zkhandler, status_data, status_retcode):
    output_lines.append("# TYPE pvc_cluster_health gauge")
    output_lines.append(f"pvc_cluster_health {status_data['cluster_health']['health']}")
    output_lines.append("# HELP pvc_cluster_faults PVC cluster new faults")
    output_lines.append("# TYPE pvc_cluster_faults gauge")
    fault_map = dict()
    for fault in faults_data:
        if not fault_map.get(fault["status"]):
            fault_map[fault["status"]] = 1
        else:
            fault_map[fault["status"]] += 1
    for fault_type in fault_map:
        output_lines.append(
            f'pvc_cluster_faults{{status="{fault_type}"}} {fault_map[fault_type]}'
        )
    # output_lines.append("# HELP pvc_cluster_faults PVC cluster health faults")
    # output_lines.append("# TYPE pvc_cluster_faults gauge")
    # for fault_msg in status_data["cluster_health"]["messages"]:
@@ -204,7 +225,55 @@ def cluster_format_metrics(zkhandler, status_data, status_retcode):
    output_lines.append(f"pvc_snapshots {status_data['snapshots']}")
    # We manually make the Flask response here so the output format is correct.
-    response = make_response("\n".join(output_lines) + "\n", retcode)
+    response = flask.make_response("\n".join(output_lines) + "\n", retcode)
    response.mimetype = "text/plain"
    return response
def _parse_active_mgr_node(ceph_data):
    """
    Extract the active MGR's node name from plain-text Ceph status output

    Finds the first line whose stripped form starts with "mgr:" and returns the
    part of its second whitespace-separated token that precedes the first "(".
    Returns None when the input is not a string, when no such line or token
    exists, or when the resulting name is empty.
    """
    if not isinstance(ceph_data, str):
        return None

    for line in ceph_data.split("\n"):
        if not line.strip().startswith("mgr:"):
            continue
        fields = line.split()
        if len(fields) < 2:
            return None
        # An empty name would only produce an unusable URI, so treat it as
        # "not found" as well.
        return fields[1].split("(")[0] or None

    return None


@pvc_common.Profiler(config)
@ZKConnection(config)
def cluster_ceph_metrics_proxy(zkhandler):
    """
    Obtain current Ceph Prometheus metrics from the active MGR

    Returns a text/plain Flask response. On success (the MGR endpoint answered
    with HTTP 200) the body is the MGR's metrics text and the status is 200.
    If no active MGR node can be determined, or the endpoint cannot be reached
    in time or answers with any other status, the body is an error message and
    the status is 400.
    """
    # Imported locally so this block does not depend on a file-level import
    # beyond the "get" function the file already pulls from requests.
    from requests.exceptions import RequestException

    # Upper bound, in seconds, for connecting to and reading from the MGR.
    # Without it a stalled MGR would block this API request indefinitely.
    request_timeout = 10

    # We have to parse out the *name* of the currently active MGR
    # While the JSON version of the "ceph status" output provides a
    # URL, this URL is in the backend (i.e. storage) network, which
    # the API might not have access to. This way, we can connect to
    # the node name which can be handled however.
    retflag, retdata = pvc_ceph.get_status(zkhandler)
    if retflag:
        ceph_mgr_node = _parse_active_mgr_node(retdata["ceph_data"])
    else:
        ceph_mgr_node = None

    if ceph_mgr_node is None:
        status_code = 400
        output = "Error: Failed to find an active MGR node\n"
    else:
        # Get the data from the endpoint
        # We use the default port of 9283
        ceph_prometheus_uri = f"http://{ceph_mgr_node}:9283/metrics"
        try:
            mgr_response = get(ceph_prometheus_uri, timeout=request_timeout)
        except RequestException:
            # Connection refused, DNS failure, timeout, etc.: report it the
            # same way as a non-200 answer instead of raising out of the API.
            mgr_response = None

        if mgr_response is not None and mgr_response.status_code == 200:
            status_code = 200
            output = mgr_response.text
        else:
            status_code = 400
            output = (
                f"Error: Failed to obtain metric data from {ceph_mgr_node} MGR daemon\n"
            )

    # We manually make the Flask response here so the output format is correct.
    response = flask.make_response(output, status_code)
    response.mimetype = "text/plain"
    return response