Compare commits

..

5 Commits

Author SHA1 Message Date
e4ca74c201 Add Zookeeper performance to Grafana dashboard 2023-12-29 09:44:40 -05:00
4969e90f8a Allow enable/disable of Prometheus endpoints
Since these are unauthenticated, it might be the case that an
administrator wishes to completely disable these metrics endpoints.
Provide that option via pvc.conf through pvc-ansible's existing
enable_prometheus_exporters option and the new enable_prometheus
configuration flag.

Defaults to "yes" to provide all functionality unless explicitly
disabled, as the author assumes that the PVC API is secured in other
ways as well and that metric information is not completely sensitive.
2023-12-29 09:25:10 -05:00
52f68909f6 Update Grafana dashboard 2023-12-28 14:55:43 -05:00
0bcf8cfe19 Add Zookeeper metrics proxy 2023-12-28 13:53:15 -05:00
2bb24d3b57 Update Prometheus dashboard and add README 2023-12-27 15:57:12 -05:00
7 changed files with 7092 additions and 5733 deletions

View File

@ -629,8 +629,9 @@ class API_Metrics(Resource):
Return the current PVC cluster status in Prometheus-compatible metrics format and
the Ceph cluster metrics as one document.
Endpoint is unauthenticated to allow metrics exfiltration without having to deal
with the Prometheus compatibility later.
Endpoint is UNAUTHENTICATED to allow metrics exfiltration without having to deal
with Prometheus compatibility (only basic auth support). Ensure this API endpoint
is only opened to trusted networks that cannot abuse the data provided!
---
tags:
- root
@ -643,12 +644,13 @@ class API_Metrics(Resource):
health_output, health_retcode = api_helper.cluster_health_metrics()
resource_output, resource_retcode = api_helper.cluster_resource_metrics()
ceph_output, ceph_retcode = api_helper.ceph_metrics()
zookeeper_output, zookeeper_retcode = api_helper.zookeeper_metrics()
if health_retcode != 200 or resource_retcode != 200 or ceph_retcode != 200:
output = "Error: Failed to obtain data"
retcode = 400
else:
output = health_output + resource_output + ceph_output
output = health_output + resource_output + ceph_output + zookeeper_output
retcode = 200
response = flask.make_response(output, retcode)
@ -656,7 +658,8 @@ class API_Metrics(Resource):
return response
api.add_resource(API_Metrics, "/metrics")
if config["enable_prometheus"]:
api.add_resource(API_Metrics, "/metrics")
# /metrics/health
@ -665,8 +668,9 @@ class API_Metrics_Health(Resource):
"""
Return the current PVC cluster health status in Prometheus-compatible metrics format
Endpoint is unauthenticated to allow metrics exfiltration without having to deal
with the Prometheus compatibility later.
Endpoint is UNAUTHENTICATED to allow metrics exfiltration without having to deal
with Prometheus compatibility (only basic auth support). Ensure this API endpoint
is only opened to trusted networks that cannot abuse the data provided!
---
tags:
- root
@ -690,7 +694,8 @@ class API_Metrics_Health(Resource):
return response
api.add_resource(API_Metrics_Health, "/metrics/health")
if config["enable_prometheus"]:
api.add_resource(API_Metrics_Health, "/metrics/health")
# /metrics/resource
@ -699,8 +704,9 @@ class API_Metrics_Resource(Resource):
"""
Return the current PVC cluster resource utilizations in Prometheus-compatible metrics format
Endpoint is unauthenticated to allow metrics exfiltration without having to deal
with the Prometheus compatibility later.
Endpoint is UNAUTHENTICATED to allow metrics exfiltration without having to deal
with Prometheus compatibility (only basic auth support). Ensure this API endpoint
is only opened to trusted networks that cannot abuse the data provided!
---
tags:
- root
@ -724,7 +730,8 @@ class API_Metrics_Resource(Resource):
return response
api.add_resource(API_Metrics_Resource, "/metrics/resource")
if config["enable_prometheus"]:
api.add_resource(API_Metrics_Resource, "/metrics/resource")
# /metrics/ceph
@ -735,6 +742,10 @@ class API_Metrics_Ceph(Resource):
Proxies a metrics request to the current active MGR, since this is dynamic
and can't be controlled by PVC easily.
Endpoint is UNAUTHENTICATED to allow metrics exfiltration without having to deal
with Prometheus compatibility (only basic auth support). Ensure this API endpoint
is only opened to trusted networks that cannot abuse the data provided!
---
tags:
- root
@ -758,7 +769,47 @@ class API_Metrics_Ceph(Resource):
return response
api.add_resource(API_Metrics_Ceph, "/metrics/ceph")
if config["enable_prometheus"]:
api.add_resource(API_Metrics_Ceph, "/metrics/ceph")
# /metrics/zookeeper
class API_Metrics_Zookeeper(Resource):
    def get(self):
        """
        Return the current PVC Zookeeper Prometheus metrics

        Proxies a metrics request to the current primary node, since all coordinators
        run an active Zookeeper instance and we want one central location.

        Endpoint is UNAUTHENTICATED to allow metrics exfiltration without having to deal
        with Prometheus compatibility (only basic auth support). Ensure this API endpoint
        is only opened to trusted networks that cannot abuse the data provided!
        ---
        tags:
          - root
        responses:
          200:
            description: OK
          400:
            description: Bad request
        """
        # Ask the helper for the metrics document and its status code
        metrics_text, metrics_status = api_helper.zookeeper_metrics()

        # Anything other than a clean 200 from the helper is reported to the
        # client as a generic 400 with a fixed error body
        if metrics_status == 200:
            body, status = metrics_text, 200
        else:
            body, status = "Error: Failed to obtain data", 400

        # The payload is always served as plain text
        reply = flask.make_response(body, status)
        reply.mimetype = "text/plain"
        return reply


# Only expose the route when Prometheus endpoints are enabled in the configuration
if config["enable_prometheus"]:
    api.add_resource(API_Metrics_Zookeeper, "/metrics/zookeeper")
# /faults

View File

@ -199,6 +199,38 @@ def ceph_metrics(zkhandler):
return output, status_code
@pvc_common.Profiler(config)
@ZKConnection(config)
def zookeeper_metrics(zkhandler):
    """
    Obtain current Zookeeper Prometheus metrics from the active coordinator node

    Reads the primary node name from Zookeeper ("base.config.primary_node") and
    fetches the metrics document from that node on port 9141.

    Returns a tuple of (output, status_code):
      * on success, the metrics text with ":2181" stripped and its lines sorted,
        and a status code of 200
      * on any failure (no primary node, request error or timeout, non-200 reply),
        an "Error: ..." message and a status code of 400
    """
    # Imported locally so this function stands on its own regardless of the
    # module-level import list.
    # NOTE(review): assumes the module-level `get` is `requests.get` - confirm.
    from requests.exceptions import RequestException

    primary_node = zkhandler.read("base.config.primary_node")
    if primary_node is None:
        return "Error: Failed to find an active primary node\n", 400

    # Get the data from the endpoint
    # We use the default port of 9141
    zookeeper_prometheus_uri = f"http://{primary_node}:9141/metrics"
    try:
        # Bound the request so a hung or unreachable exporter cannot block the
        # API worker indefinitely; connection errors and timeouts are reported
        # through the normal error tuple rather than raised to the caller.
        response = get(zookeeper_prometheus_uri, timeout=5)
    except RequestException:
        response = None

    if response is None or response.status_code != 200:
        return (
            f"Error: Failed to obtain metric data from {primary_node} primary node daemon\n",
            400,
        )

    # Parse the text to remove annoying ports (":2181")
    output = response.text.replace(":2181", "")
    # Sort the output text
    output = "\n".join(sorted(output.split("\n"))) + "\n"

    return output, 200
#
# Fault functions
#

View File

@ -176,6 +176,7 @@ def get_parsed_configuration(config_file):
"enable_storage": o_subsystem.get("enable_storage", True),
"enable_worker": o_subsystem.get("enable_worker", True),
"enable_api": o_subsystem.get("enable_api", True),
"enable_prometheus": o_subsystem.get("enable_prometheus", True),
}
config = {**config, **config_subsystem}

View File

@ -0,0 +1,36 @@
# Prometheus Monitoring for PVC
This example contains a Prometheus config snippet, an example `file_sd_configs` file, and a Grafana dashboard for monitoring a PVC cluster using the inbuilt metrics (`/api/v1/metrics`).
## `prometheus.yml`
This snippet shows how to set up a scrape config leveraging the `file_sd_configs` file.
This example uses `http` transport; if you use HTTPS for PVC API traffic (e.g. if it traverses the Internet), use `https` here. You can optionally disable certificate checking like so:
```
[...]
scheme: "https"
tls_config:
insecure_skip_verify: true
file_sd_configs:
[...]
```
## `targets-pvc_cluster.json`
This JSON-based config shows two example clusters as two discrete entries. This is required for proper labeling.
Each entry must contain:
* A single `targets` entry, pointing at the API address and port of the PVC cluster.
* Two `labels` which are leveraged by the Grafana dashboard:
* `pvc_cluster_id`: An identifier for the cluster. Likely, the `Name` in your `pvc connection list` entry for the cluster.
* `pvc_cluster_name`: A nicer, more human-readable description of the cluster. Likely, the `Description` in your `pvc connection list` entry for the cluster.
## `grafana-pvc-cluster-dashboard.json`
This JSON-based Grafana dashboard allows for a nice presentation of the metrics collected by the above Prometheus pollers. The cluster can be selected (based on the `pvc_cluster_name` value) and useful information about the cluster is then displayed.

File diff suppressed because it is too large Load Diff

View File

@ -1,10 +1,20 @@
[
{
"targets": [
"pvc.upstream.floating.address.tld:7370"
"pvc.upstream.floating.address.1.tld:7370"
],
"labels": {
"cluster": "cluster1"
"pvc_cluster_id": "cluster1",
"pvc_cluster_name": "cluster1: My First Cluster"
}
},
{
"targets": [
"pvc.upstream.floating.address.2.tld:7370"
],
"labels": {
"pvc_cluster_id": "cluster2",
"pvc_cluster_name": "cluster2: My Second Cluster"
}
}
]

View File

@ -44,6 +44,9 @@ subsystem:
# Enable or disable the API client, if installed, when node is Primary
enable_api: yes
# Enable or disable the Prometheus metrics endpoints in the API; if disabled, these return 404
enable_prometheus: yes
# Cluster configuration
cluster: