ops/metrics: basic cluster setup with prometheus

We tentatively plan to implement monitoring as a two-tier system:

 - a 'global' component that is responsible for global aggregation,
   long-term storage and alerting.
 - multiple 'per-cluster' components that collect metrics from
   Kubernetes clusters and export them to the global component.

In addition, several lower tiers (collected by per-cluster components)
might also be implemented in the future - for instance, tiers specific
to some subprojects. A rough sketch of the intended layering follows
below.
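
As a purely illustrative sketch (only the per-cluster part exists in
this change; the Global constructor, its name and its fields are made
up here), the jsonnet layering might eventually look something like:

    {
        local top = self,

        // Hypothetical global component: aggregation of per-cluster
        // data, long-term storage, alerting. Not implemented yet.
        Global(name):: { /* ... */ },

        // Per-cluster component: scrapes one Kubernetes cluster and
        // exports to the global component (this change starts here).
        Cluster(name):: { /* ... */ },

        global: top.Global("global"),
        k0: top.Cluster("k0"),
    }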

Here we start sketching out some basic jsonnet structure (currently all
in a single file, with little parametrization) and a cluster-level
prometheus server that scrapes Kubernetes Node and cAdvisor metrics.
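
For instance - and this is hypothetical, assuming the file keeps its
current path and that another cluster ever exists - a second
per-cluster instance could be stamped out by mixing overrides into the
hidden cfg object:

    // Hypothetical consumer of ops/monitoring/kube.jsonnet;
    // 'k1' is a made-up cluster name.
    local monitoring = import "ops/monitoring/kube.jsonnet";
    {
        k1: monitoring.Cluster("k1") {
            cfg+:: {
                storageClasses+: {
                    // Illustrative storage class name.
                    prometheus: "some-other-storage-class",
                },
            },
        },
    }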

This review is mostly to get this committed as early as possible, and
to make sure that the little existing Prometheus scrape configuration
is sane.

Change-Id: If37ac3b1243b8b6f464d65fee6d53080c36f992c
diff --git a/cluster/kube/lib/metrics.libsonnet b/cluster/kube/lib/metrics.libsonnet
index e11f5ef..fda3a59 100644
--- a/cluster/kube/lib/metrics.libsonnet
+++ b/cluster/kube/lib/metrics.libsonnet
@@ -1,4 +1,5 @@
 # Deploy a per-cluster Metrics Server setup.
+# These are Kubernetes (metrics-server) metrics, not Prometheus metrics.
 
 local kube = import "../../../kube/kube.libsonnet";
 
diff --git a/ops/monitoring/kube.jsonnet b/ops/monitoring/kube.jsonnet
new file mode 100644
index 0000000..919ca7d
--- /dev/null
+++ b/ops/monitoring/kube.jsonnet
@@ -0,0 +1,228 @@
+local kube = import "../../kube/kube.libsonnet";
+
+{
+    local top = self,
+    // Per-cluster components
+    Cluster(name):: {
+        local cluster = self,
+        local cfg = cluster.cfg,
+        cfg:: {
+            name: name,
+            namespace: "monitoring-cluster",
+
+            images: {
+                prometheus: "prom/prometheus:v2.18.1",
+            },
+
+            storageClasses: {
+                prometheus: "waw-hdd-redundant-3",
+            },
+        },
+
+        namespace: kube.Namespace(cfg.namespace),
+
+        prometheus: {
+            local prometheus = self,
+
+            // Configuration that's going to be emitted as prometheus.yml and passed to the
+            // prometheus server for this cluster.
+            configuration:: {
+                global: {
+                    external_labels: {
+                        cluster: cluster.cfg.name,
+                    },
+                },
+
+                // Constructor for a Kubernetes scrape job that uses the pod's service account and
+                // TLS configuration, selecting the given k8s scrape 'role'.
+                local kubeScrapeConfig = function(name, role) {
+                    job_name: name,
+                    scheme: "https",
+                    scrape_interval: "30s",
+                    kubernetes_sd_configs: [ { role: role }, ],
+                    tls_config: {
+                        ca_file: "/var/run/secrets/kubernetes.io/serviceaccount/ca.crt",
+                    },
+                    bearer_token_file: "/var/run/secrets/kubernetes.io/serviceaccount/token",
+                },
+
+                scrape_configs: [
+                    // When scraping node-based metrics (i.e. node and cAdvisor metrics) we contact
+                    // the metrics endpoints on the kubelet via the API server. This is done by
+                    // relabeling __address__ and __metrics_path__ to point at the k8s API server,
+                    // and at the API server proxy path to reach a node's metrics endpoint.
+                    //
+                    // This approach was lifted from the prometheus examples for Kubernetes, and
+                    // while the benefits outlined there do not matter that much to us (our
+                    // kubelets listen on public addresses, anyway), we still enjoy this approach
+                    // for the fact that we don't have to hardcode the kubelet TLS port.
+                    //
+                    // https://github.com/prometheus/prometheus/blob/master/documentation/examples/prometheus-kubernetes.yml
+                    //
+                    // When contacting the API server, we hardcode the 'hswaw.net' DNS suffix as
+                    // our API server's TLS certificate only has a CN/SAN for its full FQDN, not
+                    // the .svc.cluster.local shorthand (see //cluster/clustercfg:clustercfg.py).
+
+                    // Scrape Kubernetes node metrics via apiserver. This emits kube_node_* metrics.
+                    kubeScrapeConfig("cluster_node_metrics", "node") {
+                        relabel_configs: [
+                            {
+                                action: "labelmap",
+                                regex: "__meta_kubernetes_node_label_(.+)",
+                            },
+                            {
+                                action: "replace",
+                                target_label: "__address__",
+                                replacement: "kubernetes.default.svc.%s.hswaw.net:443" % [cluster.cfg.name],
+                            },
+                            {
+                                target_label: "__metrics_path__",
+                                source_labels: ["__meta_kubernetes_node_name"],
+                                regex: "(.+)",
+                                replacement: "/api/v1/nodes/${1}/proxy/metrics",
+                            },
+                        ],
+                    },
+                    // Scrape Kubernetes node cadvisor metrics via apiserver. This emits container_* metrics.
+                    kubeScrapeConfig("cluster_cadvisor_metrics", "node") {
+                        relabel_configs: [
+                            {
+                                action: "labelmap",
+                                regex: "__meta_kubernetes_node_label_(.+)",
+                            },
+                            {
+                                action: "replace",
+                                target_label: "__address__",
+                                replacement: "kubernetes.default.svc.%s.hswaw.net:443" % [cluster.cfg.name],
+                            },
+                            {
+                                target_label: "__metrics_path__",
+                                source_labels: ["__meta_kubernetes_node_name"],
+                                regex: "(.+)",
+                                replacement: "/api/v1/nodes/${1}/proxy/metrics/cadvisor",
+                            },
+                        ],
+                    },
+                ],
+            },
+
+            configmap: kube.ConfigMap("prometheus-cluster") {
+                metadata+: {
+                    namespace: cfg.namespace,
+                },
+                data: {
+                    "prometheus.yml": std.manifestYamlDoc(prometheus.configuration),
+                },
+            },
+
+            sa: kube.ServiceAccount("prometheus-cluster") {
+                metadata+: {
+                    namespace: cfg.namespace,
+                },
+            },
+
+            cr: kube.ClusterRole("monitoring-cluster-prometheus-server-%s" % [cfg.name]) {
+                rules: [
+                    // Allow access to all metrics.
+                    { nonResourceURLs: ["/metrics"], verbs: ["get"], },
+                    // Allow access to node details for discovery.
+                    { apiGroups: [""], resources: ["nodes"], verbs: ["list", "watch", "get"], },
+                    // Allow proxying to nodes (nodes/proxy) to access per-node metrics endpoints.
+                    { apiGroups: [""], resources: ["nodes/proxy"], verbs: ["get"], },
+                ],
+            },
+
+            crb: kube.ClusterRoleBinding("monitoring-cluster-prometheus-server-%s" % [cfg.name]) {
+                subjects_: [prometheus.sa],
+                roleRef_: prometheus.cr,
+            },
+
+            deploy: kube.Deployment("prometheus-cluster") {
+                metadata+: {
+                    namespace: cfg.namespace,
+                },
+                spec+: {
+                    template+: {
+                        spec+: {
+                            containers_: {
+                                default: kube.Container("default") {
+                                    image: cfg.images.prometheus,
+                                    command: [
+                                        "/bin/prometheus",
+                                        "--config.file=/etc/prometheus/prometheus.yml",
+                                        "--storage.tsdb.path=/prometheus",
+                                        # TODO(q3k): reduce this once we have a long-term storage
+                                        # solution.
+                                        "--storage.tsdb.retention.time=120d",
+                                        "--web.console.libraries=/usr/share/prometheus/console_libraries",
+                                        "--web.console.templates=/usr/share/prometheus/consoles",
+                                        "--web.enable-lifecycle",
+                                    ],
+                                    resources: {
+                                        requests: {
+                                            memory: "256Mi",
+                                            cpu: "100m",
+                                        },
+                                        limits: {
+                                            memory: "1Gi",
+                                            cpu: "1",
+                                        },
+                                    },
+                                    volumeMounts_: {
+                                        data: { mountPath: "/prometheus", },
+                                        configmap: { mountPath: "/etc/prometheus", },
+                                    },
+                                },
+                            },
+                            serviceAccountName: prometheus.sa.metadata.name,
+                            tolerations: [
+                                { key: "CriticalAddonsOnly", operator: "Exists" },
+                            ],
+                            volumes_: {
+                                data: kube.PersistentVolumeClaimVolume(prometheus.pvc),
+                                configmap: kube.ConfigMapVolume(prometheus.configmap),
+                            },
+                        },
+                    },
+                },
+            },
+
+            // Kubernetes metric storage volume.
+            pvc: kube.PersistentVolumeClaim("prometheus-cluster") {
+                metadata+: {
+                    namespace: cfg.namespace,
+                },
+                spec+: {
+                    storageClassName: cfg.storageClasses.prometheus,
+                    accessModes: ["ReadWriteOnce"],
+                    resources: {
+                        requests: {
+                            storage: "32Gi",
+                        },
+                    },
+                },
+            },
+
+            // Network Policy governing access to the prometheus server.
+            np: kube.NetworkPolicy("prometheus-cluster") {
+                metadata+: {
+                    namespace: cfg.namespace,
+                },
+                spec+: kube.podLabelsSelector(prometheus.deploy) {
+                    ingress_: {
+                        // Deny all inbound traffic to pod.
+                        // This will be augmented to allow access from some other pod/namespace
+                        // in the future.
+                    },
+                    egress_: {
+                        // Allow all outbound traffic from pod.
+                        outboundAll: {},
+                    },
+                    policyTypes: ["Ingress", "Egress"],
+                },
+            },
+        },
+    },
+
+    k0: top.Cluster("k0"),
+}