Compare commits
7 Commits
v0.9.87
...
2309b9dcf0
Author | SHA1 | Date | |
---|---|---|---|
2309b9dcf0 | |||
51b9f062b7 | |||
e4ca74c201 | |||
4969e90f8a | |||
52f68909f6 | |||
0bcf8cfe19 | |||
2bb24d3b57 |
@ -629,8 +629,9 @@ class API_Metrics(Resource):
|
||||
Return the current PVC cluster status in Prometheus-compatible metrics format and
|
||||
the Ceph cluster metrics as one document.
|
||||
|
||||
Endpoint is unauthenticated to allow metrics exfiltration without having to deal
|
||||
with the Prometheus compatibility later.
|
||||
Endpoint is UNAUTHENTICATED to allow metrics exfiltration without having to deal
|
||||
with Prometheus compatibility (only basic auth support). Ensure this API endpoint
|
||||
is only opened to trusted networks that cannot abuse the data provided!
|
||||
---
|
||||
tags:
|
||||
- root
|
||||
@ -643,12 +644,13 @@ class API_Metrics(Resource):
|
||||
health_output, health_retcode = api_helper.cluster_health_metrics()
|
||||
resource_output, resource_retcode = api_helper.cluster_resource_metrics()
|
||||
ceph_output, ceph_retcode = api_helper.ceph_metrics()
|
||||
zookeeper_output, zookeeper_retcode = api_helper.zookeeper_metrics()
|
||||
|
||||
if health_retcode != 200 or resource_retcode != 200 or ceph_retcode != 200:
|
||||
output = "Error: Failed to obtain data"
|
||||
retcode = 400
|
||||
else:
|
||||
output = health_output + resource_output + ceph_output
|
||||
output = health_output + resource_output + ceph_output + zookeeper_output
|
||||
retcode = 200
|
||||
|
||||
response = flask.make_response(output, retcode)
|
||||
@ -656,7 +658,8 @@ class API_Metrics(Resource):
|
||||
return response
|
||||
|
||||
|
||||
api.add_resource(API_Metrics, "/metrics")
|
||||
if config["enable_prometheus"]:
|
||||
api.add_resource(API_Metrics, "/metrics")
|
||||
|
||||
|
||||
# /metrics/health
|
||||
@ -665,8 +668,9 @@ class API_Metrics_Health(Resource):
|
||||
"""
|
||||
Return the current PVC cluster health status in Prometheus-compatible metrics format
|
||||
|
||||
Endpoint is unauthenticated to allow metrics exfiltration without having to deal
|
||||
with the Prometheus compatibility later.
|
||||
Endpoint is UNAUTHENTICATED to allow metrics exfiltration without having to deal
|
||||
with Prometheus compatibility (only basic auth support). Ensure this API endpoint
|
||||
is only opened to trusted networks that cannot abuse the data provided!
|
||||
---
|
||||
tags:
|
||||
- root
|
||||
@ -690,7 +694,8 @@ class API_Metrics_Health(Resource):
|
||||
return response
|
||||
|
||||
|
||||
api.add_resource(API_Metrics_Health, "/metrics/health")
|
||||
if config["enable_prometheus"]:
|
||||
api.add_resource(API_Metrics_Health, "/metrics/health")
|
||||
|
||||
|
||||
# /metrics/resource
|
||||
@ -699,8 +704,9 @@ class API_Metrics_Resource(Resource):
|
||||
"""
|
||||
Return the current PVC cluster resource utilizations in Prometheus-compatible metrics format
|
||||
|
||||
Endpoint is unauthenticated to allow metrics exfiltration without having to deal
|
||||
with the Prometheus compatibility later.
|
||||
Endpoint is UNAUTHENTICATED to allow metrics exfiltration without having to deal
|
||||
with Prometheus compatibility (only basic auth support). Ensure this API endpoint
|
||||
is only opened to trusted networks that cannot abuse the data provided!
|
||||
---
|
||||
tags:
|
||||
- root
|
||||
@ -724,7 +730,8 @@ class API_Metrics_Resource(Resource):
|
||||
return response
|
||||
|
||||
|
||||
api.add_resource(API_Metrics_Resource, "/metrics/resource")
|
||||
if config["enable_prometheus"]:
|
||||
api.add_resource(API_Metrics_Resource, "/metrics/resource")
|
||||
|
||||
|
||||
# /metrics/ceph
|
||||
@ -735,6 +742,10 @@ class API_Metrics_Ceph(Resource):
|
||||
|
||||
Proxies a metrics request to the current active MGR, since this is dynamic
|
||||
and can't be controlled by PVC easily.
|
||||
|
||||
Endpoint is UNAUTHENTICATED to allow metrics exfiltration without having to deal
|
||||
with Prometheus compatibility (only basic auth support). Ensure this API endpoint
|
||||
is only opened to trusted networks that cannot abuse the data provided!
|
||||
---
|
||||
tags:
|
||||
- root
|
||||
@ -758,7 +769,47 @@ class API_Metrics_Ceph(Resource):
|
||||
return response
|
||||
|
||||
|
||||
api.add_resource(API_Metrics_Ceph, "/metrics/ceph")
|
||||
if config["enable_prometheus"]:
|
||||
api.add_resource(API_Metrics_Ceph, "/metrics/ceph")
|
||||
|
||||
|
||||
# /metrics/zookeeper
|
||||
class API_Metrics_Zookeeper(Resource):
    def get(self):
        """
        Return the current PVC Zookeeper Prometheus metrics

        Proxies a metrics request to the current primary node, since all coordinators
        run an active Zookeeper instance and we want one central location.

        Endpoint is UNAUTHENTICATED to allow metrics exfiltration without having to deal
        with Prometheus compatibility (only basic auth support). Ensure this API endpoint
        is only opened to trusted networks that cannot abuse the data provided!
        ---
        tags:
          - root
        responses:
          200:
            description: OK
          400:
            description: Bad request
        """
        # NOTE(review): the section after "---" in the docstring above looks like an
        # API spec that is parsed from the docstring - keep its layout intact.
        metrics_text, helper_status = api_helper.zookeeper_metrics()

        # Any non-200 result from the helper is collapsed into a generic 400 reply;
        # the helper's own error text is not passed through to the client.
        if helper_status == 200:
            body, code = metrics_text, 200
        else:
            body, code = "Error: Failed to obtain data", 400

        # Metrics are served as plain text
        reply = flask.make_response(body, code)
        reply.mimetype = "text/plain"
        return reply
|
||||
|
||||
|
||||
# Only register the Zookeeper metrics endpoint when the Prometheus subsystem is
# enabled in the configuration; when disabled the route is never added.
if config["enable_prometheus"]:
    api.add_resource(API_Metrics_Zookeeper, "/metrics/zookeeper")
|
||||
|
||||
|
||||
# /faults
|
||||
|
@ -199,6 +199,38 @@ def ceph_metrics(zkhandler):
|
||||
return output, status_code
|
||||
|
||||
|
||||
@pvc_common.Profiler(config)
@ZKConnection(config)
def zookeeper_metrics(zkhandler):
    """
    Obtain current Zookeeper Prometheus metrics from the active coordinator node

    Reads the primary node name from Zookeeper, fetches the metrics document from
    that node over HTTP, strips the ":2181" port suffixes and sorts the lines.

    Returns a tuple of (output, status_code): the metrics text and 200 on success,
    or an error message and 400 when no primary node is known, the endpoint cannot
    be reached in time, or it answers with a non-200 status.
    """
    # Imported locally so this fix does not depend on a module-level import change.
    # NOTE(review): assumes the module-level get() is requests.get - confirm.
    from requests.exceptions import RequestException

    # Upper bound (seconds) for the upstream fetch, so a hung exporter cannot
    # stall the API request indefinitely
    request_timeout = 5

    primary_node = zkhandler.read("base.config.primary_node")
    if primary_node is None:
        return "Error: Failed to find an active primary node\n", 400

    # Get the data from the endpoint
    # We use the default port of 9141
    zookeeper_prometheus_uri = f"http://{primary_node}:9141/metrics"
    try:
        response = get(zookeeper_prometheus_uri, timeout=request_timeout)
    except RequestException:
        # Connection failures and timeouts take the same error path as a bad
        # status code instead of propagating as an unhandled exception
        response = None

    if response is None or response.status_code != 200:
        output = f"Error: Failed to obtain metric data from {primary_node} primary node daemon\n"
        return output, 400

    # Parse the text to remove annoying ports (":2181")
    output = response.text.replace(":2181", "")
    # Sort the output text
    output_lines = output.split("\n")
    output_lines.sort()
    output = "\n".join(output_lines) + "\n"

    return output, 200
|
||||
|
||||
|
||||
#
|
||||
# Fault functions
|
||||
#
|
||||
|
@ -176,6 +176,7 @@ def get_parsed_configuration(config_file):
|
||||
"enable_storage": o_subsystem.get("enable_storage", True),
|
||||
"enable_worker": o_subsystem.get("enable_worker", True),
|
||||
"enable_api": o_subsystem.get("enable_api", True),
|
||||
"enable_prometheus": o_subsystem.get("enable_prometheus", True),
|
||||
}
|
||||
config = {**config, **config_subsystem}
|
||||
|
||||
|
36
monitoring/prometheus/README.md
Normal file
36
monitoring/prometheus/README.md
Normal file
@ -0,0 +1,36 @@
|
||||
# Prometheus Monitoring for PVC
|
||||
|
||||
This example contains a Prometheus config snippet, an example `file_sd_configs` file, and a Grafana dashboard for monitoring a PVC cluster using the built-in metrics (`/api/v1/metrics`).
|
||||
|
||||
## `prometheus.yml`
|
||||
|
||||
This snippet shows how to set up a scrape config leveraging the `file_sd_configs` file.
|
||||
|
||||
This example uses `http` transport; if you use HTTPS for PVC API traffic (e.g. if it traverses the Internet), use `https` here. You can optionally disable certificate checking like so:
|
||||
|
||||
```
|
||||
[...]
|
||||
scheme: "https"
|
||||
tls_config:
|
||||
insecure_skip_verify: true
|
||||
file_sd_configs:
|
||||
[...]
|
||||
```
|
||||
|
||||
## `targets-pvc_cluster.json`
|
||||
|
||||
This JSON-based config shows two example clusters as two discrete entries. This is required for proper labeling.
|
||||
|
||||
Each entry must contain:
|
||||
|
||||
* A single `targets` entry, pointing at the API address and port of the PVC cluster.
|
||||
|
||||
* Two `labels` which are leveraged by the Grafana dashboard:
|
||||
|
||||
* `pvc_cluster_id`: An identifier for the cluster. Likely, the `Name` in your `pvc connection list` entry for the cluster.
|
||||
|
||||
* `pvc_cluster_name`: A nicer, more human-readable description of the cluster. Likely, the `Description` in your `pvc connection list` entry for the cluster.
|
||||
|
||||
## `grafana-pvc-cluster-dashboard.json`
|
||||
|
||||
This JSON-based Grafana dashboard allows for a nice presentation of the metrics collected by the above Prometheus pollers. The cluster can be selected (based on the `pvc_cluster_name` value) and useful information about the cluster is then displayed.
|
File diff suppressed because it is too large
Load Diff
@ -1,10 +1,20 @@
|
||||
[
|
||||
{
|
||||
"targets": [
|
||||
"pvc.upstream.floating.address.tld:7370"
|
||||
"pvc.upstream.floating.address.1.tld:7370"
|
||||
],
|
||||
"labels": {
|
||||
"cluster": "cluster1"
|
||||
"pvc_cluster_id": "cluster1",
|
||||
"pvc_cluster_name": "cluster1: My First Cluster"
|
||||
}
|
||||
},
|
||||
{
|
||||
"targets": [
|
||||
"pvc.upstream.floating.address.2.tld:7370"
|
||||
],
|
||||
"labels": {
|
||||
"pvc_cluster_id": "cluster2",
|
||||
"pvc_cluster_name": "cluster2: My Second Cluster"
|
||||
}
|
||||
}
|
||||
]
|
||||
|
@ -44,6 +44,9 @@ subsystem:
|
||||
# Enable or disable the API client, if installed, when node is Primary
|
||||
enable_api: yes
|
||||
|
||||
# Enable or disable the Prometheus metrics endpoints in the API; if disabled, these return 404
|
||||
enable_prometheus: yes
|
||||
|
||||
# Cluster configuration
|
||||
cluster:
|
||||
|
||||
|
Reference in New Issue
Block a user