Compare commits
5 Commits
v0.9.87
...
e4ca74c201
Author | SHA1 | Date | |
---|---|---|---|
e4ca74c201 | |||
4969e90f8a | |||
52f68909f6 | |||
0bcf8cfe19 | |||
2bb24d3b57 |
@ -629,8 +629,9 @@ class API_Metrics(Resource):
|
|||||||
Return the current PVC cluster status in Prometheus-compatible metrics format and
|
Return the current PVC cluster status in Prometheus-compatible metrics format and
|
||||||
the Ceph cluster metrics as one document.
|
the Ceph cluster metrics as one document.
|
||||||
|
|
||||||
Endpoint is unauthenticated to allow metrics exfiltration without having to deal
|
Endpoint is UNAUTHENTICATED to allow metrics exfiltration without having to deal
|
||||||
with the Prometheus compatibility later.
|
with Prometheus compatibility (only basic auth support). Ensure this API endpoint
|
||||||
|
is only opened to trusted networks that cannot abuse the data provided!
|
||||||
---
|
---
|
||||||
tags:
|
tags:
|
||||||
- root
|
- root
|
||||||
@ -643,12 +644,13 @@ class API_Metrics(Resource):
|
|||||||
health_output, health_retcode = api_helper.cluster_health_metrics()
|
health_output, health_retcode = api_helper.cluster_health_metrics()
|
||||||
resource_output, resource_retcode = api_helper.cluster_resource_metrics()
|
resource_output, resource_retcode = api_helper.cluster_resource_metrics()
|
||||||
ceph_output, ceph_retcode = api_helper.ceph_metrics()
|
ceph_output, ceph_retcode = api_helper.ceph_metrics()
|
||||||
|
zookeeper_output, zookeeper_retcode = api_helper.zookeeper_metrics()
|
||||||
|
|
||||||
if health_retcode != 200 or resource_retcode != 200 or ceph_retcode != 200:
|
if health_retcode != 200 or resource_retcode != 200 or ceph_retcode != 200:
|
||||||
output = "Error: Failed to obtain data"
|
output = "Error: Failed to obtain data"
|
||||||
retcode = 400
|
retcode = 400
|
||||||
else:
|
else:
|
||||||
output = health_output + resource_output + ceph_output
|
output = health_output + resource_output + ceph_output + zookeeper_output
|
||||||
retcode = 200
|
retcode = 200
|
||||||
|
|
||||||
response = flask.make_response(output, retcode)
|
response = flask.make_response(output, retcode)
|
||||||
@ -656,6 +658,7 @@ class API_Metrics(Resource):
|
|||||||
return response
|
return response
|
||||||
|
|
||||||
|
|
||||||
|
if config["enable_prometheus"]:
|
||||||
api.add_resource(API_Metrics, "/metrics")
|
api.add_resource(API_Metrics, "/metrics")
|
||||||
|
|
||||||
|
|
||||||
@ -665,8 +668,9 @@ class API_Metrics_Health(Resource):
|
|||||||
"""
|
"""
|
||||||
Return the current PVC cluster health status in Prometheus-compatible metrics format
|
Return the current PVC cluster health status in Prometheus-compatible metrics format
|
||||||
|
|
||||||
Endpoint is unauthenticated to allow metrics exfiltration without having to deal
|
Endpoint is UNAUTHENTICATED to allow metrics exfiltration without having to deal
|
||||||
with the Prometheus compatibility later.
|
with Prometheus compatibility (only basic auth support). Ensure this API endpoint
|
||||||
|
is only opened to trusted networks that cannot abuse the data provided!
|
||||||
---
|
---
|
||||||
tags:
|
tags:
|
||||||
- root
|
- root
|
||||||
@ -690,6 +694,7 @@ class API_Metrics_Health(Resource):
|
|||||||
return response
|
return response
|
||||||
|
|
||||||
|
|
||||||
|
if config["enable_prometheus"]:
|
||||||
api.add_resource(API_Metrics_Health, "/metrics/health")
|
api.add_resource(API_Metrics_Health, "/metrics/health")
|
||||||
|
|
||||||
|
|
||||||
@ -699,8 +704,9 @@ class API_Metrics_Resource(Resource):
|
|||||||
"""
|
"""
|
||||||
Return the current PVC cluster resource utilizations in Prometheus-compatible metrics format
|
Return the current PVC cluster resource utilizations in Prometheus-compatible metrics format
|
||||||
|
|
||||||
Endpoint is unauthenticated to allow metrics exfiltration without having to deal
|
Endpoint is UNAUTHENTICATED to allow metrics exfiltration without having to deal
|
||||||
with the Prometheus compatibility later.
|
with Prometheus compatibility (only basic auth support). Ensure this API endpoint
|
||||||
|
is only opened to trusted networks that cannot abuse the data provided!
|
||||||
---
|
---
|
||||||
tags:
|
tags:
|
||||||
- root
|
- root
|
||||||
@ -724,6 +730,7 @@ class API_Metrics_Resource(Resource):
|
|||||||
return response
|
return response
|
||||||
|
|
||||||
|
|
||||||
|
if config["enable_prometheus"]:
|
||||||
api.add_resource(API_Metrics_Resource, "/metrics/resource")
|
api.add_resource(API_Metrics_Resource, "/metrics/resource")
|
||||||
|
|
||||||
|
|
||||||
@ -735,6 +742,10 @@ class API_Metrics_Ceph(Resource):
|
|||||||
|
|
||||||
Proxies a metrics request to the current active MGR, since this is dynamic
|
Proxies a metrics request to the current active MGR, since this is dynamic
|
||||||
and can't be controlled by PVC easily.
|
and can't be controlled by PVC easily.
|
||||||
|
|
||||||
|
Endpoint is UNAUTHENTICATED to allow metrics exfiltration without having to deal
|
||||||
|
with Prometheus compatibility (only basic auth support). Ensure this API endpoint
|
||||||
|
is only opened to trusted networks that cannot abuse the data provided!
|
||||||
---
|
---
|
||||||
tags:
|
tags:
|
||||||
- root
|
- root
|
||||||
@ -758,9 +769,49 @@ class API_Metrics_Ceph(Resource):
|
|||||||
return response
|
return response
|
||||||
|
|
||||||
|
|
||||||
|
if config["enable_prometheus"]:
|
||||||
api.add_resource(API_Metrics_Ceph, "/metrics/ceph")
|
api.add_resource(API_Metrics_Ceph, "/metrics/ceph")
|
||||||
|
|
||||||
|
|
||||||
|
# /metrics/zookeeper
|
||||||
|
class API_Metrics_Zookeeper(Resource):
    # Flask resource serving the Zookeeper metrics document as plain text.
    def get(self):
        # NOTE(review): the docstring is kept verbatim because the section after
        # "---" looks like a machine-read API specification; its nesting was
        # reconstructed by convention - confirm against the real file.
        """
        Return the current PVC Zookeeper Prometheus metrics

        Proxies a metrics request to the current primary node, since all coordinators
        run an active Zookeeper instance and we want one central location.

        Endpoint is UNAUTHENTICATED to allow metrics exfiltration without having to deal
        with Prometheus compatibility (only basic auth support). Ensure this API endpoint
        is only opened to trusted networks that cannot abuse the data provided!
        ---
        tags:
          - root
        responses:
          200:
            description: OK
          400:
            description: Bad request
        """
        metrics_text, metrics_status = api_helper.zookeeper_metrics()

        # Any non-200 status from the helper is reported to the client as a
        # generic 400; the helper's own error text is not passed through.
        succeeded = metrics_status == 200
        body = metrics_text if succeeded else "Error: Failed to obtain data"
        status = 200 if succeeded else 400

        response = flask.make_response(body, status)
        response.mimetype = "text/plain"
        return response
|
||||||
|
|
||||||
|
|
||||||
|
if config["enable_prometheus"]:
|
||||||
|
api.add_resource(API_Metrics_Zookeeper, "/metrics/zookeeper")
|
||||||
|
|
||||||
|
|
||||||
# /faults
|
# /faults
|
||||||
class API_Faults(Resource):
|
class API_Faults(Resource):
|
||||||
@RequestParser(
|
@RequestParser(
|
||||||
|
@ -199,6 +199,38 @@ def ceph_metrics(zkhandler):
|
|||||||
return output, status_code
|
return output, status_code
|
||||||
|
|
||||||
|
|
||||||
|
@pvc_common.Profiler(config)
@ZKConnection(config)
def zookeeper_metrics(zkhandler):
    """
    Obtain current Zookeeper Prometheus metrics from the active coordinator node

    Reads the primary node name from Zookeeper, fetches that node's metrics
    document over HTTP on port 9141, strips every ":2181" port suffix from the
    text and returns the non-empty lines in sorted order.

    Returns a tuple of (output, status_code): the metrics text and 200 on
    success, or an "Error: ..." message and 400 when no primary node is known,
    the request fails, or the endpoint answers with a non-200 status.
    """
    primary_node = zkhandler.read("base.config.primary_node")
    if primary_node is None:
        return "Error: Failed to find an active primary node\n", 400

    fetch_failed = (
        f"Error: Failed to obtain metric data from {primary_node} primary node daemon\n",
        400,
    )

    # Get the data from the endpoint
    # We use the default port of 9141
    zookeeper_prometheus_uri = f"http://{primary_node}:9141/metrics"
    try:
        # Bound the request so an unresponsive node cannot stall the caller.
        # NOTE(review): assumes `get` accepts a requests-style `timeout`
        # keyword (seconds) - confirm against the module's import of `get`.
        response = get(zookeeper_prometheus_uri, timeout=10)
    except Exception:
        # The concrete exception types raised by `get` are not visible from
        # this block, so any transport failure is mapped onto the same error
        # tuple as a non-200 answer instead of propagating to the caller.
        return fetch_failed

    if response.status_code != 200:
        return fetch_failed

    # Parse the text to remove annoying ports (":2181")
    output = response.text.replace(":2181", "")

    # Sort the output text, dropping empty lines: a document ending in a
    # newline would otherwise yield an empty element that sorts to the front
    # and produces a leading blank line.
    output_lines = sorted(line for line in output.split("\n") if line)

    return "\n".join(output_lines) + "\n", 200
|
||||||
|
|
||||||
|
|
||||||
#
|
#
|
||||||
# Fault functions
|
# Fault functions
|
||||||
#
|
#
|
||||||
|
@ -176,6 +176,7 @@ def get_parsed_configuration(config_file):
|
|||||||
"enable_storage": o_subsystem.get("enable_storage", True),
|
"enable_storage": o_subsystem.get("enable_storage", True),
|
||||||
"enable_worker": o_subsystem.get("enable_worker", True),
|
"enable_worker": o_subsystem.get("enable_worker", True),
|
||||||
"enable_api": o_subsystem.get("enable_api", True),
|
"enable_api": o_subsystem.get("enable_api", True),
|
||||||
|
"enable_prometheus": o_subsystem.get("enable_prometheus", True),
|
||||||
}
|
}
|
||||||
config = {**config, **config_subsystem}
|
config = {**config, **config_subsystem}
|
||||||
|
|
||||||
|
36
monitoring/prometheus/README.md
Normal file
36
monitoring/prometheus/README.md
Normal file
@ -0,0 +1,36 @@
|
|||||||
|
# Prometheus Monitoring for PVC
|
||||||
|
|
||||||
|
This example contains a Prometheus config snippet, an example `file_sd_configs` file, and a Grafana dashboard for monitoring a PVC cluster using the built-in metrics (`/api/v1/metrics`).
|
||||||
|
|
||||||
|
## `prometheus.yml`
|
||||||
|
|
||||||
|
This snippet shows how to set up a scrape config leveraging the `file_sd_configs` file.
|
||||||
|
|
||||||
|
This example uses `http` transport; if you use HTTPS for PVC API traffic (e.g. if it traverses the Internet), use `https` here. You can optionally disable certificate checking like so:
|
||||||
|
|
||||||
|
```
|
||||||
|
[...]
|
||||||
|
scheme: "https"
|
||||||
|
tls_config:
|
||||||
|
insecure_skip_verify: true
|
||||||
|
file_sd_configs:
|
||||||
|
[...]
|
||||||
|
```
|
||||||
|
|
||||||
|
## `targets-pvc_cluster.json`
|
||||||
|
|
||||||
|
This JSON-based config shows two example clusters as two discrete entries. This is required for proper labeling.
|
||||||
|
|
||||||
|
Each entry must contain:
|
||||||
|
|
||||||
|
* A single `targets` entry, pointing at the API address and port of the PVC cluster.
|
||||||
|
|
||||||
|
* Two `labels` which are leveraged by the Grafana dashboard:
|
||||||
|
|
||||||
|
  * `pvc_cluster_id`: An identifier for the cluster. Likely, the `Name` in your `pvc connection list` entry for the cluster.
|
||||||
|
|
||||||
|
  * `pvc_cluster_name`: A nicer, more human-readable description of the cluster. Likely, the `Description` in your `pvc connection list` entry for the cluster.
|
||||||
|
|
||||||
|
## `grafana-pvc-cluster-dashboard.json`
|
||||||
|
|
||||||
|
This JSON-based Grafana dashboard allows for a nice presentation of the metrics collected by the above Prometheus pollers. The cluster can be selected (based on the `pvc_cluster_name` value) and useful information about the cluster is then displayed.
|
File diff suppressed because it is too large
Load Diff
@ -1,10 +1,20 @@
|
|||||||
[
|
[
|
||||||
{
|
{
|
||||||
"targets": [
|
"targets": [
|
||||||
"pvc.upstream.floating.address.tld:7370"
|
"pvc.upstream.floating.address.1.tld:7370"
|
||||||
],
|
],
|
||||||
"labels": {
|
"labels": {
|
||||||
"cluster": "cluster1"
|
"pvc_cluster_id": "cluster1",
|
||||||
|
"pvc_cluster_name": "cluster1: My First Cluster"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"targets": [
|
||||||
|
"pvc.upstream.floating.address.2.tld:7370"
|
||||||
|
],
|
||||||
|
"labels": {
|
||||||
|
"pvc_cluster_id": "cluster2",
|
||||||
|
"pvc_cluster_name": "cluster2: My Second Cluster"
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
]
|
]
|
||||||
|
@ -44,6 +44,9 @@ subsystem:
|
|||||||
# Enable or disable the API client, if installed, when node is Primary
|
# Enable or disable the API client, if installed, when node is Primary
|
||||||
enable_api: yes
|
enable_api: yes
|
||||||
|
|
||||||
|
# Enable or disable the Prometheus metrics endpoints in the API; if disabled, these return 404
|
||||||
|
enable_prometheus: yes
|
||||||
|
|
||||||
# Cluster configuration
|
# Cluster configuration
|
||||||
cluster:
|
cluster:
|
||||||
|
|
||||||
|
Reference in New Issue
Block a user