ops/metrics: basic cluster setup with prometheus

We tentatively plan to implement monitoring as a two-tier system:

 - a 'global' component that is responsible for global aggregation,
   long-term storage and alerting.
 - multiple 'per-cluster' components that collect metrics from
   Kubernetes clusters and export them to the global component.

In addition, several lower tiers (collected by per-cluster components)
might also be implemented in the future - for instance, tiers specific
to some subprojects. A rough sketch of the intended layering follows
below.
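
As a purely illustrative sketch (only the per-cluster part exists in
this change; the Global constructor, its name and its fields are made
up here), the jsonnet layering might eventually look something like:

    {
        local top = self,

        // Hypothetical global component: aggregation of per-cluster
        // data, long-term storage, alerting. Not implemented yet.
        Global(name):: { /* ... */ },

        // Per-cluster component: scrapes one Kubernetes cluster and
        // exports to the global component (this change starts here).
        Cluster(name):: { /* ... */ },

        global: top.Global("global"),
        k0: top.Cluster("k0"),
    }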

Here we start sketching out some basic jsonnet structure (currently all
in a single file, with little parametrization) and a cluster-level
prometheus server that scrapes Kubernetes Node and cAdvisor metrics.
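
For instance - and this is hypothetical, assuming the file keeps its
current path and that another cluster ever exists - a second
per-cluster instance could be stamped out by mixing overrides into the
hidden cfg object:

    // Hypothetical consumer of ops/monitoring/kube.jsonnet;
    // 'k1' is a made-up cluster name.
    local monitoring = import "ops/monitoring/kube.jsonnet";
    {
        k1: monitoring.Cluster("k1") {
            cfg+:: {
                storageClasses+: {
                    // Illustrative storage class name.
                    prometheus: "some-other-storage-class",
                },
            },
        },
    }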

This review is mostly to get this committed as early as possible, and
to make sure that the little existing Prometheus scrape configuration
is sane.

Change-Id: If37ac3b1243b8b6f464d65fee6d53080c36f992c
diff --git a/cluster/kube/lib/metrics.libsonnet b/cluster/kube/lib/metrics.libsonnet
index e11f5ef..fda3a59 100644
--- a/cluster/kube/lib/metrics.libsonnet
+++ b/cluster/kube/lib/metrics.libsonnet
@@ -1,4 +1,5 @@
 # Deploy a per-cluster Metrics Server setup.
+# These are Kubernetes (metrics-server) metrics, not Prometheus metrics.
 
 local kube = import "../../../kube/kube.libsonnet";
 
diff --git a/ops/monitoring/kube.jsonnet b/ops/monitoring/kube.jsonnet
new file mode 100644
index 0000000..919ca7d
--- /dev/null
+++ b/ops/monitoring/kube.jsonnet
@@ -0,0 +1,228 @@
+local kube = import "../../kube/kube.libsonnet";
+
+{
+    local top = self,
+    // Per-cluster components
+    Cluster(name):: {
+        local cluster = self,
+        local cfg = cluster.cfg,
+        cfg:: {
+            name: name,
+            namespace: "monitoring-cluster",
+
+            images: {
+                prometheus: "prom/prometheus:v2.18.1",
+            },
+
+            storageClasses: {
+                prometheus: "waw-hdd-redundant-3",
+            },
+        },
+
+        namespace: kube.Namespace(cfg.namespace),
+
+        prometheus: {
+            local prometheus = self,
+
+            // Configuration that's going to be emitted as prometheus.yml and passed to the
+            // prometheus server for this cluster.
+            configuration:: {
+                global: {
+                    external_labels: {
+                        cluster: cluster.cfg.name,
+                    },
+                },
+
+                // Constructor for a Kubernetes scrape job that uses the pod's service account and
+                // TLS configuration, selecting the given k8s scrape 'role'.
+                local kubeScrapeConfig = function(name, role) {
+                    job_name: name,
+                    scheme: "https",
+                    scrape_interval: "30s",
+                    kubernetes_sd_configs: [ { role: role }, ],
+                    tls_config: {
+                        ca_file: "/var/run/secrets/kubernetes.io/serviceaccount/ca.crt",
+                    },
+                    bearer_token_file: "/var/run/secrets/kubernetes.io/serviceaccount/token",
+                },
+
+                scrape_configs: [
+                    // When scraping node-based metrics (i.e. node and cAdvisor metrics) we contact
+                    // the metrics endpoints on the kubelet via the API server. This is done by
+                    // relabeling __address__ and __metrics_path__ to point at the k8s API server,
+                    // and at the API server proxy path to reach a node's metrics endpoint.
+                    //
+                    // This approach was lifted from the prometheus examples for Kubernetes, and
+                    // while the benefits outlined there do not matter that much to us (our
+                    // kubelets listen on public addresses, anyway), we still enjoy this approach
+                    // for the fact that we don't have to hardcode the kubelet TLS port.
+                    //
+                    // https://github.com/prometheus/prometheus/blob/master/documentation/examples/prometheus-kubernetes.yml
+                    //
+                    // When contacting the API server, we hardcode the 'hswaw.net' DNS suffix as
+                    // our API server's TLS certificate only has a CN/SAN for its full FQDN, not
+                    // the .svc.cluster.local shorthand (see //cluster/clustercfg:clustercfg.py).
+
+                    // Scrape Kubernetes node metrics via apiserver. This emits kube_node_* metrics.
+                    kubeScrapeConfig("cluster_node_metrics", "node") {
+                        relabel_configs: [
+                            {
+                                action: "labelmap",
+                                regex: "__meta_kubernetes_node_label_(.+)",
+                            },
+                            {
+                                action: "replace",
+                                target_label: "__address__",
+                                replacement: "kubernetes.default.svc.%s.hswaw.net:443" % [cluster.cfg.name],
+                            },
+                            {
+                                target_label: "__metrics_path__",
+                                source_labels: ["__meta_kubernetes_node_name"],
+                                regex: "(.+)",
+                                replacement: "/api/v1/nodes/${1}/proxy/metrics",
+                            },
+                        ],
+                    },
+                    // Scrape Kubernetes node cadvisor metrics via apiserver. This emits container_* metrics.
+                    kubeScrapeConfig("cluster_cadvisor_metrics", "node") {
+                        relabel_configs: [
+                            {
+                                action: "labelmap",
+                                regex: "__meta_kubernetes_node_label_(.+)",
+                            },
+                            {
+                                action: "replace",
+                                target_label: "__address__",
+                                replacement: "kubernetes.default.svc.%s.hswaw.net:443" % [cluster.cfg.name],
+                            },
+                            {
+                                target_label: "__metrics_path__",
+                                source_labels: ["__meta_kubernetes_node_name"],
+                                regex: "(.+)",
+                                replacement: "/api/v1/nodes/${1}/proxy/metrics/cadvisor",
+                            },
+                        ],
+                    },
+                ],
+            },
+
+            configmap: kube.ConfigMap("prometheus-cluster") {
+                metadata+: {
+                    namespace: cfg.namespace,
+                },
+                data: {
+                    "prometheus.yml": std.manifestYamlDoc(prometheus.configuration),
+                },
+            },
+
+            sa: kube.ServiceAccount("prometheus-cluster") {
+                metadata+: {
+                    namespace: cfg.namespace,
+                },
+            },
+
+            cr: kube.ClusterRole("monitoring-cluster-prometheus-server-%s" % [cfg.name]) {
+                rules: [
+                    // Allow access to all metrics.
+                    { nonResourceURLs: ["/metrics"], verbs: ["get"], },
+                    // Allow access to node details for discovery.
+                    { apiGroups: [""], resources: ["nodes"], verbs: ["list", "watch", "get"], },
+                    // Allow proxying to nodes (nodes/proxy) to access per-node metrics endpoints.
+                    { apiGroups: [""], resources: ["nodes/proxy"], verbs: ["get"], },
+                ],
+            },
+
+            crb: kube.ClusterRoleBinding("monitoring-cluster-prometheus-server-%s" % [cfg.name]) {
+                subjects_: [prometheus.sa],
+                roleRef_: prometheus.cr,
+            },
+
+            deploy: kube.Deployment("prometheus-cluster") {
+                metadata+: {
+                    namespace: cfg.namespace,
+                },
+                spec+: {
+                    template+: {
+                        spec+: {
+                            containers_: {
+                                default: kube.Container("default") {
+                                    image: cfg.images.prometheus,
+                                    command: [
+                                        "/bin/prometheus",
+                                        "--config.file=/etc/prometheus/prometheus.yml",
+                                        "--storage.tsdb.path=/prometheus",
+                                        # TODO(q3k): reduce this once we have a long-term storage
+                                        # solution.
+                                        "--storage.tsdb.retention.time=120d",
+                                        "--web.console.libraries=/usr/share/prometheus/console_libraries",
+                                        "--web.console.templates=/usr/share/prometheus/consoles",
+                                        "--web.enable-lifecycle",
+                                    ],
+                                    resources: {
+                                        requests: {
+                                            memory: "256Mi",
+                                            cpu: "100m",
+                                        },
+                                        limits: {
+                                            memory: "1Gi",
+                                            cpu: "1",
+                                        },
+                                    },
+                                    volumeMounts_: {
+                                        data: { mountPath: "/prometheus", },
+                                        configmap: { mountPath: "/etc/prometheus", },
+                                    },
+                                },
+                            },
+                            serviceAccountName: prometheus.sa.metadata.name,
+                            tolerations: [
+                                { key: "CriticalAddonsOnly", operator: "Exists" },
+                            ],
+                            volumes_: {
+                                data: kube.PersistentVolumeClaimVolume(prometheus.pvc),
+                                configmap: kube.ConfigMapVolume(prometheus.configmap),
+                            },
+                        },
+                    },
+                },
+            },
+
+            // Kubernetes metric storage volume.
+            pvc: kube.PersistentVolumeClaim("prometheus-cluster") {
+                metadata+: {
+                    namespace: cfg.namespace,
+                },
+                spec+: {
+                    storageClassName: cfg.storageClasses.prometheus,
+                    accessModes: ["ReadWriteOnce"],
+                    resources: {
+                        requests: {
+                            storage: "32Gi",
+                        },
+                    },
+                },
+            },
+
+            // Network Policy governing access to the prometheus server.
+            np: kube.NetworkPolicy("prometheus-cluster") {
+                metadata+: {
+                    namespace: cfg.namespace,
+                },
+                spec+: kube.podLabelsSelector(prometheus.deploy) {
+                    ingress_: {
+                        // Deny all inbound traffic to pod.
+                        // This will be augmented to allow access from some other pod/namespace
+                        // in the future.
+                    },
+                    egress_: {
+                        // Allow all outbound traffic from pod.
+                        outboundAll: {},
+                    },
+                    policyTypes: ["Ingress", "Egress"],
+                },
+            },
+        },
+    },
+
+    k0: top.Cluster("k0"),
+}