Add Zookeeper performance to Grafana dashboard

Allow enable/disable of Prometheus endpoints
Since these are unauthenticated, it might be the case that an administrator wishes to completely disable these metrics endpoints. Provide that option via pvc.conf through pvc-ansible's existing enable_prometheus_exporters option and the new enable_prometheus configuration flag. Defaults to "yes" to provide all functionality unless explicitly disabled, as the author assumes that the PVC API is secured in other ways as well and that metric information is not completely sensitive.
2023-12-29 09:44:40 -05:00 · 2023-12-29 09:25:10 -05:00 · 2023-12-28 14:55:43 -05:00 · 2023-12-28 13:53:15 -05:00 · 2023-12-27 15:57:12 -05:00
7 changed files with 7092 additions and 5733 deletions
--- a/api-daemon/pvcapid/flaskapi.py
+++ b/api-daemon/pvcapid/flaskapi.py
@ -629,8 +629,9 @@ class API_Metrics(Resource):
        Return the current PVC cluster status in Prometheus-compatible metrics format and
        the Ceph cluster metrics as one document.

-        Endpoint is unauthenticated to allow metrics exfiltration without having to deal
-        with the Prometheus compatibility later.
+        Endpoint is UNAUTHENTICATED to allow metrics exfiltration without having to deal
+        with Prometheus compatibility (only basic auth support). Ensure this API endpoint
+        is only opened to trusted networks that cannot abuse the data provided!
        ---
        tags:
          - root
@ -643,12 +644,13 @@ class API_Metrics(Resource):
        health_output, health_retcode = api_helper.cluster_health_metrics()
        resource_output, resource_retcode = api_helper.cluster_resource_metrics()
        ceph_output, ceph_retcode = api_helper.ceph_metrics()
+        zookeeper_output, zookeeper_retcode = api_helper.zookeeper_metrics()

        if health_retcode != 200 or resource_retcode != 200 or ceph_retcode != 200:
            output = "Error: Failed to obtain data"
            retcode = 400
        else:
-            output = health_output + resource_output + ceph_output
+            output = health_output + resource_output + ceph_output + zookeeper_output
            retcode = 200

        response = flask.make_response(output, retcode)
@ -656,7 +658,8 @@ class API_Metrics(Resource):
        return response


-api.add_resource(API_Metrics, "/metrics")
+if config["enable_prometheus"]:
+    api.add_resource(API_Metrics, "/metrics")


 # /metrics/health
@ -665,8 +668,9 @@ class API_Metrics_Health(Resource):
        """
        Return the current PVC cluster health status in Prometheus-compatible metrics format

-        Endpoint is unauthenticated to allow metrics exfiltration without having to deal
-        with the Prometheus compatibility later.
+        Endpoint is UNAUTHENTICATED to allow metrics exfiltration without having to deal
+        with Prometheus compatibility (only basic auth support). Ensure this API endpoint
+        is only opened to trusted networks that cannot abuse the data provided!
        ---
        tags:
          - root
@ -690,7 +694,8 @@ class API_Metrics_Health(Resource):
        return response


-api.add_resource(API_Metrics_Health, "/metrics/health")
+if config["enable_prometheus"]:
+    api.add_resource(API_Metrics_Health, "/metrics/health")


 # /metrics/resource
@ -699,8 +704,9 @@ class API_Metrics_Resource(Resource):
        """
        Return the current PVC cluster resource utilizations in Prometheus-compatible metrics format

-        Endpoint is unauthenticated to allow metrics exfiltration without having to deal
-        with the Prometheus compatibility later.
+        Endpoint is UNAUTHENTICATED to allow metrics exfiltration without having to deal
+        with Prometheus compatibility (only basic auth support). Ensure this API endpoint
+        is only opened to trusted networks that cannot abuse the data provided!
        ---
        tags:
          - root
@ -724,7 +730,8 @@ class API_Metrics_Resource(Resource):
        return response


-api.add_resource(API_Metrics_Resource, "/metrics/resource")
+if config["enable_prometheus"]:
+    api.add_resource(API_Metrics_Resource, "/metrics/resource")


 # /metrics/ceph
@ -735,6 +742,10 @@ class API_Metrics_Ceph(Resource):

        Proxies a metrics request to the current active MGR, since this is dynamic
        and can't be controlled by PVC easily.
+
+        Endpoint is UNAUTHENTICATED to allow metrics exfiltration without having to deal
+        with Prometheus compatibility (only basic auth support). Ensure this API endpoint
+        is only opened to trusted networks that cannot abuse the data provided!
        ---
        tags:
          - root
@ -758,7 +769,47 @@ class API_Metrics_Ceph(Resource):
        return response


-api.add_resource(API_Metrics_Ceph, "/metrics/ceph")
+if config["enable_prometheus"]:
+    api.add_resource(API_Metrics_Ceph, "/metrics/ceph")
+
+
+# /metrics/zookeeper
+class API_Metrics_Zookeeper(Resource):
+    def get(self):
+        """
+        Return the current PVC Zookeeper Prometheus metrics
+
+        Proxies a metrics request to the current primary node, since all coordinators
+        run an active Zookeeper instance and we want one central location.
+
+        Endpoint is UNAUTHENTICATED to allow metrics exfiltration without having to deal
+        with Prometheus compatibility (only basic auth support). Ensure this API endpoint
+        is only opened to trusted networks that cannot abuse the data provided!
+        ---
+        tags:
+          - root
+        responses:
+          200:
+            description: OK
+          400:
+            description: Bad request
+        """
+        zookeeper_output, zookeeper_retcode = api_helper.zookeeper_metrics()
+
+        if zookeeper_retcode != 200:
+            output = "Error: Failed to obtain data"
+            retcode = 400
+        else:
+            output = zookeeper_output
+            retcode = 200
+
+        response = flask.make_response(output, retcode)
+        response.mimetype = "text/plain"
+        return response
+
+
+if config["enable_prometheus"]:
+    api.add_resource(API_Metrics_Zookeeper, "/metrics/zookeeper")


 # /faults
--- a/api-daemon/pvcapid/helper.py
+++ b/api-daemon/pvcapid/helper.py
@ -199,6 +199,38 @@ def ceph_metrics(zkhandler):
    return output, status_code


+@pvc_common.Profiler(config)
+@ZKConnection(config)
+def zookeeper_metrics(zkhandler):
+    """
+    Obtain current Zookeeper Prometheus metrics from the active coordinator node
+    """
+    primary_node = zkhandler.read("base.config.primary_node")
+    if primary_node is not None:
+        # Get the data from the endpoint
+        # We use the default port of 9141
+        zookeeper_prometheus_uri = f"http://{primary_node}:9141/metrics"
+        response = get(zookeeper_prometheus_uri)
+
+        if response.status_code == 200:
+            output = response.text
+            # Parse the text to remove annoying ports (":2181")
+            output = output.replace(":2181", "")
+            # Sort the output text
+            output_lines = output.split("\n")
+            output_lines.sort()
+            output = "\n".join(output_lines) + "\n"
+            status_code = 200
+        else:
+            output = f"Error: Failed to obtain metric data from {primary_node} primary node daemon\n"
+            status_code = 400
+    else:
+        output = "Error: Failed to find an active primary node\n"
+        status_code = 400
+
+    return output, status_code
+
+
 #
 # Fault functions
 #
--- a/daemon-common/config.py
+++ b/daemon-common/config.py
@ -176,6 +176,7 @@ def get_parsed_configuration(config_file):
            "enable_storage": o_subsystem.get("enable_storage", True),
            "enable_worker": o_subsystem.get("enable_worker", True),
            "enable_api": o_subsystem.get("enable_api", True),
+            "enable_prometheus": o_subsystem.get("enable_prometheus", True),
        }
        config = {**config, **config_subsystem}

--- a/monitoring/prometheus/README.md
+++ b/monitoring/prometheus/README.md
@ -0,0 +1,36 @@
+# Prometheus Monitoring for PVC
+
+This example contains a Prometheus config snippet, an example `file_sd_configs` file, and a Grafana dashboard for monitoring a PVC cluster using the inbuilt metrics (`/api/v1/metrics`).
+
+## `prometheus.yml`
+
+This snippet shows how to set up a scrape config leveraging the `file_sd_configs` file.
+
+This example uses `http` transport; if you use HTTPS for PVC API traffic (e.g. if it traverses the Internet), use `https` here. You can optionally disable certificate checking like so:
+
+```
+[...]
+scheme: "https"
+tls_config:
+  insecure_skip_verify: true
+file_sd_configs:
+[...]
+```
+
+## `targets-pvc_cluster.json`
+
+This JSON-based config shows two example clusters as two discrete entries. This is required for proper labeling.
+
+Each entry must contain:
+
+* A single `targets` entry, pointing at the API address and port of the PVC cluster.
+
+* Two `labels` which are leveraged by the Grafana dashboard:
+
+   * `pvc_cluster_id`: An identifier for the cluster. Likely, the `Name` in your `pvc connection list` entry for the cluster.
+
+   * `pvc_cluster_name`: A nicer, more human-readable description of the cluster. Likely, the `Description` in your `pvc connection list` entry for the cluster.
+
+## `grafana-pvc-cluster-dashboard.json`
+
+This JSON-based Grafana dashboard allows for a nice presentation of the metrics collected by the above Prometheus pollers. The cluster can be selected (based on the `pvc_cluster_name` value) and useful information about the cluster is then displayed.
--- a/monitoring/prometheus/grafana-pvc-cluster-dashboard.json
+++ b/monitoring/prometheus/grafana-pvc-cluster-dashboard.json
--- a/monitoring/prometheus/targets-pvc_cluster.json
+++ b/monitoring/prometheus/targets-pvc_cluster.json
@ -1,10 +1,20 @@
 [
  {
    "targets": [
-      "pvc.upstream.floating.address.tld:7370"
+      "pvc.upstream.floating.address.1.tld:7370"
    ],
    "labels": {
-      "cluster": "cluster1"
+      "pvc_cluster_id": "cluster1",
+      "pvc_cluster_name": "cluster1: My First Cluster"
+    }
+  },
+  {
+    "targets": [
+      "pvc.upstream.floating.address.2.tld:7370"
+    ],
+    "labels": {
+      "pvc_cluster_id": "cluster2",
+      "pvc_cluster_name": "cluster2: My Second Cluster"
    }
  }
 ]
--- a/pvc.sample.conf
+++ b/pvc.sample.conf
@ -44,6 +44,9 @@ subsystem:
    # Enable or disable the API client, if installed, when node is Primary
    enable_api: yes

+    # Enable or disable the Prometheus metrics endpoints in the API; if disabled, these return 404
+    enable_prometheus: yes
+
 # Cluster configuration
 cluster:
Author	SHA1	Message	Date
Joshua M. Boniface	e4ca74c201	Add Zookeeper performance to Grafana dashboard	2023-12-29 09:44:40 -05:00
Joshua M. Boniface	4969e90f8a	Allow enable/disable of Prometheus endpoints Since these are unauthenticated, it might be the case that an administrator wishes to completely disable these metrics endpoints. Provide that option via pvc.conf through pvc-ansible's existing enable_prometheus_exporters option and the new enable_prometheus configuration flag. Defaults to "yes" to provide all functionality unless explicitly disabled, as the author assumes that the PVC API is secured in other ways as well and that metric information is not completely sensitive.	2023-12-29 09:25:10 -05:00
Joshua M. Boniface	52f68909f6	Update Grafana dashboard	2023-12-28 14:55:43 -05:00
Joshua M. Boniface	0bcf8cfe19	Add Zookeeper metrics proxy	2023-12-28 13:53:15 -05:00
Joshua M. Boniface	2bb24d3b57	Update Prometheus dashboard and add README	2023-12-27 15:57:12 -05:00