From 7373bfed3f8a5ee4b992c280a3d1ebdf773b26ed Mon Sep 17 00:00:00 2001
From: "Joshua M. Boniface"
Date: Fri, 8 Dec 2023 11:45:52 -0500
Subject: [PATCH] Add Prometheus metric exporter

Adds a "fake" Prometheus metrics endpoint which returns cluster status
information in Prometheus format.
---
 api-daemon/pvcapid/flaskapi.py | 24 +++++++++
 api-daemon/pvcapid/helper.py   | 99 ++++++++++++++++++++++++++++++++++
 2 files changed, 123 insertions(+)

diff --git a/api-daemon/pvcapid/flaskapi.py b/api-daemon/pvcapid/flaskapi.py
index 04a5fee1..7f72b9d2 100755
--- a/api-daemon/pvcapid/flaskapi.py
+++ b/api-daemon/pvcapid/flaskapi.py
@@ -622,6 +622,30 @@ class API_Status(Resource):
 api.add_resource(API_Status, "/status")
 
 
+# /metrics
+class API_Metrics(Resource):
+    def get(self):
+        """
+        Return the current PVC cluster status in Prometheus-compatible metrics format
+
+        Endpoint is unauthenticated to allow metrics exfiltration without having to deal
+        with the Prometheus compatibility later.
+        ---
+        tags:
+          - root
+        responses:
+          200:
+            description: OK
+          400:
+            description: Bad request
+        """
+        status_data, status_retcode = api_helper.cluster_status()
+        return api_helper.cluster_format_metrics(status_data, status_retcode)
+
+
+api.add_resource(API_Metrics, "/metrics")
+
+
 # /faults
 class API_Faults(Resource):
     @RequestParser(
diff --git a/api-daemon/pvcapid/helper.py b/api-daemon/pvcapid/helper.py
index 7440597b..899dc752 100755
--- a/api-daemon/pvcapid/helper.py
+++ b/api-daemon/pvcapid/helper.py
@@ -119,6 +119,105 @@ def cluster_maintenance(zkhandler, maint_state="false"):
     return retdata, retcode
 
 
+#
+# Metrics functions
+#
+def _prometheus_label_value(value):
+    """
+    Escape a value for use inside a double-quoted Prometheus label value
+
+    Backslash, double-quote and line feed are the characters that the
+    Prometheus text exposition format requires to be escaped there.
+    """
+    return str(value).replace("\\", "\\\\").replace('"', '\\"').replace("\n", "\\n")
+
+
+@pvc_common.Profiler(config)
+@ZKConnection(config)
+def cluster_format_metrics(zkhandler, status_data, status_retcode):
+    """
+    Format status data from cluster_status into Prometheus-compatible metrics
+
+    The zkhandler argument is not used here; the API caller passes only
+    status_data and status_retcode, so it is presumably injected by ZKConnection.
+
+    Returns an (error message, status_retcode) tuple if status_retcode is not 200,
+    otherwise a Flask response whose body is the metrics text.
+    """
+    from flask import make_response
+
+    if status_retcode != 200:
+        return "Error: Status data threw error", status_retcode
+
+    output_lines = []
+
+    primary_node = _prometheus_label_value(status_data["primary_node"])
+    pvc_version = _prometheus_label_value(status_data["pvc_version"])
+    upstream_ip = _prometheus_label_value(status_data["upstream_ip"])
+    output_lines.append("# HELP pvc_info PVC cluster information")
+    output_lines.append("# TYPE pvc_info gauge")
+    output_lines.append(
+        f'pvc_info{{primary_node="{primary_node}", version="{pvc_version}", upstream_ip="{upstream_ip}"}} 1'
+    )
+
+    output_lines.append("# HELP pvc_cluster_maintenance PVC cluster maintenance state")
+    output_lines.append("# TYPE pvc_cluster_maintenance gauge")
+    output_lines.append(
+        f"pvc_cluster_maintenance {1 if strtobool(status_data['maintenance']) else 0}"
+    )
+
+    output_lines.append("# HELP pvc_cluster_health PVC cluster health status")
+    output_lines.append("# TYPE pvc_cluster_health gauge")
+    output_lines.append(f"pvc_cluster_health {status_data['cluster_health']['health']}")
+
+    # NOTE: Per-fault metrics are currently disabled.
+    # output_lines.append("# HELP pvc_cluster_faults PVC cluster health faults")
+    # output_lines.append("# TYPE pvc_cluster_faults gauge")
+    # for fault_msg in status_data["cluster_health"]["messages"]:
+    #     output_lines.append(
+    #         f"pvc_cluster_faults{{id=\"{fault_msg['id']}\", message=\"{fault_msg['text']}\"}} {fault_msg['health_delta']}"
+    #     )
+
+    output_lines.append("# HELP pvc_node_health PVC cluster node health status")
+    output_lines.append("# TYPE pvc_node_health gauge")
+    for node, node_data in status_data["node_health"].items():
+        health = node_data["health"]
+        # Only integer health values are exported; other values are skipped
+        if isinstance(health, int):
+            output_lines.append(
+                f'pvc_node_health{{node="{_prometheus_label_value(node)}"}} {health}'
+            )
+
+    # Per-state counts: one labelled sample per state
+    for metric, key, help_text in (
+        ("pvc_nodes", "nodes", "PVC node state counts"),
+        ("pvc_vms", "vms", "PVC VM state counts"),
+        ("pvc_osds", "osds", "PVC OSD state counts"),
+    ):
+        output_lines.append(f"# HELP {metric} {help_text}")
+        output_lines.append(f"# TYPE {metric} gauge")
+        for state, count in status_data[key].items():
+            output_lines.append(
+                f'{metric}{{state="{_prometheus_label_value(state)}"}} {count}'
+            )
+
+    # Plain object counts: a single unlabelled sample each
+    for metric, key, help_text in (
+        ("pvc_networks", "networks", "PVC network count"),
+        ("pvc_pools", "pools", "PVC storage pool count"),
+        ("pvc_volumes", "volumes", "PVC storage volume count"),
+        ("pvc_snapshots", "snapshots", "PVC storage snapshot count"),
+    ):
+        output_lines.append(f"# HELP {metric} {help_text}")
+        output_lines.append(f"# TYPE {metric} gauge")
+        output_lines.append(f"{metric} {status_data[key]}")
+
+    # We manually make the Flask response here so the output format is correct.
+    response = make_response("\n".join(output_lines) + "\n", 200)
+    response.mimetype = "text/plain"
+    return response
+
+
 #
 # Fault functions
 #