Add Ceph metrics proxy and health fault counts
This commit is contained in:
parent
7373bfed3f
commit
9b3c9f1be5
|
@ -639,13 +639,35 @@ class API_Metrics(Resource):
|
||||||
400:
|
400:
|
||||||
description: Bad request
|
description: Bad request
|
||||||
"""
|
"""
|
||||||
status_data, status_retcode = api_helper.cluster_status()
|
return api_helper.cluster_metrics()
|
||||||
return api_helper.cluster_format_metrics(status_data, status_retcode)
|
|
||||||
|
|
||||||
|
|
||||||
api.add_resource(API_Metrics, "/metrics")
|
api.add_resource(API_Metrics, "/metrics")
|
||||||
|
|
||||||
|
|
||||||
|
# /metrics/ceph
|
||||||
|
class API_Metrics_Ceph(Resource):
    def get(self):
        """
        Return the current PVC Ceph Prometheus metrics

        Proxies a metrics request to the current active MGR, since this is dynamic
        and can't be controlled by PVC easily.
        ---
        tags:
          - root
        responses:
          200:
            description: OK
          400:
            description: Bad request
        """
        # All of the work (locating the active MGR and relaying its metrics
        # output) is delegated to the helper; this resource only exposes it
        # on the API and hands the helper's response back unchanged.
        proxied_response = api_helper.cluster_ceph_metrics_proxy()
        return proxied_response
|
||||||
|
|
||||||
|
|
||||||
|
api.add_resource(API_Metrics_Ceph, "/metrics/ceph")
|
||||||
|
|
||||||
|
|
||||||
# /faults
|
# /faults
|
||||||
class API_Faults(Resource):
|
class API_Faults(Resource):
|
||||||
@RequestParser(
|
@RequestParser(
|
||||||
|
|
|
@ -23,6 +23,8 @@ import flask
|
||||||
import json
|
import json
|
||||||
import lxml.etree as etree
|
import lxml.etree as etree
|
||||||
|
|
||||||
|
from re import match
|
||||||
|
from requests import get
|
||||||
from werkzeug.formparser import parse_form_data
|
from werkzeug.formparser import parse_form_data
|
||||||
|
|
||||||
from pvcapid.Daemon import config, strtobool
|
from pvcapid.Daemon import config, strtobool
|
||||||
|
@ -124,18 +126,24 @@ def cluster_maintenance(zkhandler, maint_state="false"):
|
||||||
#
|
#
|
||||||
@pvc_common.Profiler(config)
|
@pvc_common.Profiler(config)
|
||||||
@ZKConnection(config)
|
@ZKConnection(config)
|
||||||
def cluster_format_metrics(zkhandler, status_data, status_retcode):
|
def cluster_metrics(zkhandler):
|
||||||
"""
|
"""
|
||||||
Format status data from cluster_status into Prometheus-compatible metrics
|
Format status data from cluster_status into Prometheus-compatible metrics
|
||||||
"""
|
"""
|
||||||
from flask import make_response
|
|
||||||
|
|
||||||
if status_retcode != 200:
|
# Get general cluster information
|
||||||
return "Error: Status data threw error", status_retcode
|
status_retflag, status_data = pvc_cluster.get_info(zkhandler)
|
||||||
|
if not status_retflag:
|
||||||
|
return "Error: Status data threw error", 400
|
||||||
|
print(status_data)
|
||||||
|
|
||||||
|
faults_retflag, faults_data = pvc_faults.get_list(zkhandler)
|
||||||
|
if not faults_retflag:
|
||||||
|
return "Error: Faults data threw error", 400
|
||||||
|
print(faults_data)
|
||||||
|
|
||||||
retcode = 200
|
retcode = 200
|
||||||
output_lines = list()
|
output_lines = list()
|
||||||
print(status_data)
|
|
||||||
|
|
||||||
output_lines.append("# HELP pvc_info PVC cluster information")
|
output_lines.append("# HELP pvc_info PVC cluster information")
|
||||||
output_lines.append("# TYPE pvc_info gauge")
|
output_lines.append("# TYPE pvc_info gauge")
|
||||||
|
@ -153,6 +161,19 @@ def cluster_format_metrics(zkhandler, status_data, status_retcode):
|
||||||
output_lines.append("# TYPE pvc_cluster_health gauge")
|
output_lines.append("# TYPE pvc_cluster_health gauge")
|
||||||
output_lines.append(f"pvc_cluster_health {status_data['cluster_health']['health']}")
|
output_lines.append(f"pvc_cluster_health {status_data['cluster_health']['health']}")
|
||||||
|
|
||||||
|
output_lines.append("# HELP pvc_cluster_faults PVC cluster new faults")
|
||||||
|
output_lines.append("# TYPE pvc_cluster_faults gauge")
|
||||||
|
fault_map = dict()
|
||||||
|
for fault in faults_data:
|
||||||
|
if not fault_map.get(fault["status"]):
|
||||||
|
fault_map[fault["status"]] = 1
|
||||||
|
else:
|
||||||
|
fault_map[fault["status"]] += 1
|
||||||
|
for fault_type in fault_map:
|
||||||
|
output_lines.append(
|
||||||
|
f'pvc_cluster_faults{{status="{fault_type}"}} {fault_map[fault_type]}'
|
||||||
|
)
|
||||||
|
|
||||||
# output_lines.append("# HELP pvc_cluster_faults PVC cluster health faults")
|
# output_lines.append("# HELP pvc_cluster_faults PVC cluster health faults")
|
||||||
# output_lines.append("# TYPE pvc_cluster_faults gauge")
|
# output_lines.append("# TYPE pvc_cluster_faults gauge")
|
||||||
# for fault_msg in status_data["cluster_health"]["messages"]:
|
# for fault_msg in status_data["cluster_health"]["messages"]:
|
||||||
|
@ -204,7 +225,55 @@ def cluster_format_metrics(zkhandler, status_data, status_retcode):
|
||||||
output_lines.append(f"pvc_snapshots {status_data['snapshots']}")
|
output_lines.append(f"pvc_snapshots {status_data['snapshots']}")
|
||||||
|
|
||||||
# We manually make the Flask response here so the output format is correct.
|
# We manually make the Flask response here so the output format is correct.
|
||||||
response = make_response("\n".join(output_lines) + "\n", retcode)
|
response = flask.make_response("\n".join(output_lines) + "\n", retcode)
|
||||||
|
response.mimetype = "text/plain"
|
||||||
|
return response
|
||||||
|
|
||||||
|
|
||||||
|
@pvc_common.Profiler(config)
@ZKConnection(config)
def cluster_ceph_metrics_proxy(zkhandler):
    """
    Obtain current Ceph Prometheus metrics from the active MGR

    Looks up the name of the active MGR in the Ceph status text, fetches
    "/metrics" from that node on port 9283, and relays the result.

    Returns a Flask response with mimetype "text/plain":
      * 200 with the MGR's metrics text when the fetch succeeds
      * 400 with an "Error: ..." line when no active MGR can be determined,
        when the MGR cannot be reached in time, or when it answers with a
        non-200 status
    """
    # Imported locally because the module only pulls in `get` from requests.
    from requests.exceptions import RequestException

    # (connect, read) timeouts in seconds. Without a timeout, requests waits
    # forever on an unresponsive MGR and ties up the API worker.
    request_timeout = (5, 30)

    # We have to parse out the *name* of the currently active MGR.
    # While the JSON version of the "ceph status" output provides a
    # URL, this URL is in the backend (i.e. storage) network, which
    # the API might not have access to. Connecting by node name instead
    # leaves it to normal name resolution to pick a reachable address.
    ceph_mgr_node = None
    retflag, retdata = pvc_ceph.get_status(zkhandler)
    if retflag:
        try:
            ceph_mgr_line = [
                line
                for line in retdata["ceph_data"].split("\n")
                if line.strip().startswith("mgr:")
            ][0]
            # Second whitespace-separated token is "<node>(<state>,"; keep
            # only the part before the parenthesis. An empty name is treated
            # the same as "not found".
            ceph_mgr_node = ceph_mgr_line.split()[1].split("(")[0] or None
        except (AttributeError, IndexError, KeyError, TypeError):
            # Missing/oddly-shaped status data or no parseable "mgr:" line
            ceph_mgr_node = None

    if ceph_mgr_node is None:
        status_code = 400
        output = "Error: Failed to find an active MGR node\n"
    else:
        # Get the data from the endpoint
        # We use the default port of 9283
        ceph_prometheus_uri = f"http://{ceph_mgr_node}:9283/metrics"
        try:
            mgr_response = get(ceph_prometheus_uri, timeout=request_timeout)
        except RequestException:
            # Connection refused, DNS failure, timeout, etc.: report it as a
            # normal error response instead of letting the exception escape.
            mgr_response = None

        if mgr_response is not None and mgr_response.status_code == 200:
            status_code = 200
            output = mgr_response.text
        else:
            status_code = 400
            output = (
                f"Error: Failed to obtain metric data from {ceph_mgr_node} MGR daemon\n"
            )

    # We manually make the Flask response here so the output format is correct.
    response = flask.make_response(output, status_code)
    response.mimetype = "text/plain"
    return response
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue