ops/monitoring: scrape apiserver, scheduler, and controller-manager

These are scraped over the nodes' public IP addresses, which are retrieved
via service discovery in Prometheus (using the endpoints role on the
default/kubernetes service).
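
As an illustration, the relabeling that the new kubeScrapeAPIServerColocated
helper (added below) generates for, say, the scheduler amounts to roughly the
following, on top of whatever kubeScrapeConfig already sets up for the
"endpoints" discovery role (a sketch only; the address in the comment is
hypothetical):

    relabel_configs: [
        // Keep only endpoints that back default/kubernetes, i.e. the public
        // IP addresses of the apiserver instances.
        {
            action: "keep",
            regex: "default;kubernetes;https",
            source_labels: [
                "__meta_kubernetes_namespace",
                "__meta_kubernetes_service_name",
                "__meta_kubernetes_endpoint_port_name",
            ],
        },
        // Rewrite the discovered apiserver port (4001) to the scheduler's,
        // e.g. 203.0.113.1:4001 -> 203.0.113.1:4005.
        {
            action: "replace",
            regex: "([^:]+):.+",
            replacement: "$1:4005",
            source_labels: ["__address__"],
            target_label: "__address__",
        },
    ],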

Also a drive-by fix to the cluster Prometheus resources - the default
configuration wants at least 3GiB of physical memory.

Change-Id: I1eedb19051f62b40613f69e5f0f736d5958acf42
diff --git a/ops/monitoring/lib/cluster.libsonnet b/ops/monitoring/lib/cluster.libsonnet
index 511d426..00aa792 100644
--- a/ops/monitoring/lib/cluster.libsonnet
+++ b/ops/monitoring/lib/cluster.libsonnet
@@ -60,63 +60,106 @@
                     bearer_token_file: "/var/run/secrets/kubernetes.io/serviceaccount/token",
                 },
 
-                scrape_configs: [
-                    // When scraping node-based metrics (ie. node and cadvisor metrics) we contact
-                    // the metrics endpoints on the kubelet via the API server. This is done by
-                    // relabeling _address__ and __metrics_path__ to point at the k8s API server,
-                    // and at the API server proxy path to reach a node's metrics endpoint.
-                    //
-                    // This approach was lifted from the prometheus examples for Kubernetes, and
-                    // while the benefits outlined there do not matter that much to us (our
-                    // kubelets listen on public addresses, anyway), we still enjoy this approach
-                    // for the fact that we don't have to hardcode the kubelet TLS port.
-                    //
-                    // https://github.com/prometheus/prometheus/blob/master/documentation/examples/prometheus-kubernetes.yml
-                    //
-                    // When contacting the API server, we hardcode the 'hswaw.net' DNS suffix as
-                    // our API server's TLS certificate only has a CN/SAN for its full FQDN, not
-                    // the .svc.cluster.local shorthand (see //cluster/clustercfg:clustercfg.py).
 
-                    // Scrape Kubernetes node metrics via apiserver. This emites kube_node_* metrics.
-                    kubeScrapeConfig("cluster_node_metrics", "node") {
-                        relabel_configs: [
-                            {
-                                action: "labelmap",
-                                regex: "__meta_kubernetes_node_label_(.+)",
-                            },
-                            {
-                                action: "replace",
-                                target_label: "__address__",
-                                replacement: "kubernetes.default.svc.%s.hswaw.net:443" % [cluster.cfg.name],
-                            },
-                            {
-                                target_label: "__metrics_path__",
-                                source_labels: ["__meta_kubernetes_node_name"],
-                                regex: "(.+)",
-                                replacement: "/api/v1/nodes/${1}/proxy/metrics",
-                            },
-                        ],
+                // When scraping node-based metrics (i.e. node and cadvisor metrics) we contact
+                // the metrics endpoints on the kubelet via the API server. This is done by
+                // relabeling __address__ and __metrics_path__ to point at the k8s API server,
+                // and at the API server proxy path to reach a node's metrics endpoint.
+                //
+                // This approach was lifted from the Prometheus examples for Kubernetes, and
+                // while the benefits outlined there do not matter that much to us (our
+                // kubelets listen on public addresses, anyway), we still like this approach
+                // because it means we don't have to hardcode the kubelet TLS port.
+                //
+                // https://github.com/prometheus/prometheus/blob/master/documentation/examples/prometheus-kubernetes.yml
+                //
+                // When contacting the API server, we hardcode the 'hswaw.net' DNS suffix as
+                // our API server's TLS certificate only has a CN/SAN for its full FQDN, not
+                // the .svc.cluster.local shorthand (see //cluster/clustercfg:clustercfg.py).
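+                // As a hypothetical example (a node named "node1" in a cluster named "c1"),
+                // a node's metrics then end up being scraped via:
+                //   https://kubernetes.default.svc.c1.hswaw.net:443/api/v1/nodes/node1/proxy/metrics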
+                local kubeScrapeNodeMetrics = function(name, path) kubeScrapeConfig(name, "node") {
+                    relabel_configs: [
+                        {
+                            action: "labelmap",
+                            regex: "__meta_kubernetes_node_label_(.+)",
+                        },
+                        {
+                            action: "replace",
+                            target_label: "__address__",
+                            replacement: "kubernetes.default.svc.%s.hswaw.net:443" % [cluster.cfg.name],
+                        },
+                        {
+                            target_label: "__metrics_path__",
+                            source_labels: ["__meta_kubernetes_node_name"],
+                            regex: "(.+)",
+                            replacement: "/api/v1/nodes/${1}/proxy" + path,
+                        },
+                    ],
+                },
+
+                // When scraping API server-colocated metrics (ie. metrics from nixos services running alongside
+                // APIserver instances), we contact the metrics endpoints directly over the node's IP addresses
+                // and an external port. The node IP addresses are discovered via Prometheus kubernetes endpoint
+                // discovery which selects all endpoints for the default/kubernetes service. This service is
+                // backed by apiserver instances on public IP addresses. We can then replace the received
+                // port with the port of the service we're interested in to reach that service.
+                local kubeScrapeAPIServerColocated = function(name, port) kubeScrapeConfig(name, "endpoints") {
+                    relabel_configs: [
+                        // Select only endpoints that back the default/kubernetes service. These are all
+                        // public IP addresses of nodes that run the API server.
+                        {
+                            action: "keep",
+                            regex: "default;kubernetes;https",
+                            source_labels: [
+                                "__meta_kubernetes_namespace",
+                                "__meta_kubernetes_service_name",
+                                "__meta_kubernetes_endpoint_port_name",
+                            ],
+                        },
+                    ] + (if port == 4001 then [] else [
+                        // Replace the endpoint port with the requested port, if the requested port is not
+                        // the apiserver's port 4001, which is the one returned by default for these endpoints.
+                        {
+                            action: "replace",
+                            regex: "([^:]+):.+",
+                            replacement: "$1:%d" % [port],
+                            source_labels: [
+                                "__address__",
+                            ],
+                            target_label: "__address__",
+                        },
+                    ]),
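+                    // (As a hypothetical example, a discovered endpoint address of
+                    // 203.0.113.1:4001 becomes 203.0.113.1:4003 when scraping the
+                    // controller-manager.)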
+                    // We disable server-side TLS certificate verification.
+                    // Unfortunately, all apiserver-colocated services run with TLS certificates that do not have
+                    // the right IP address SAN. In addition, we can't override the TLS ServerName for a scrape
+                    // target [1], so the only two choices we are left with are:
+                    //   1) re-issue the relevant certificates with IP address SANs that allow for access by IP.
+                    //   2) disable TLS verification.
+                    // We choose 2), knowing that if someone manages to hijack a target IP address they can end up
+                    // stealing our bearer token and impersonating the service account with which Prometheus is
+                    // running. In the long term, we hope for [1] to be resolved.
+                    //
+                    // TODO(q3k): revisit this once [1] gets fixed.
+                    // [1] - https://github.com/prometheus/prometheus/issues/4827
+                    tls_config: {
+                        insecure_skip_verify: true,
                     },
+                },
+
+                scrape_configs: [
+                    /// Scrape per-node metrics, proxied via the apiserver.
+                    // Scrape Kubernetes node metrics via apiserver. This emits kube_node_* metrics.
+                    kubeScrapeNodeMetrics("cluster_node_metrics", "/metrics"),
                     // Scrape Kubernetes node cadvisor metrics via apiserver. This emits container_* metrics.
-                    kubeScrapeConfig("cluster_cadvisor_metrics", "node") {
-                        relabel_configs: [
-                            {
-                                action: "labelmap",
-                                regex: "__meta_kubernetes_node_label_(.+)",
-                            },
-                            {
-                                action: "replace",
-                                target_label: "__address__",
-                                replacement: "kubernetes.default.svc.%s.hswaw.net:443" % [cluster.cfg.name],
-                            },
-                            {
-                                target_label: "__metrics_path__",
-                                source_labels: ["__meta_kubernetes_node_name"],
-                                regex: "(.+)",
-                                replacement: "/api/v1/nodes/${1}/proxy/metrics/cadvisor",
-                            },
-                        ],
-                    },
+                    kubeScrapeNodeMetrics("cluster_cadvisor_metrics", "/metrics/cadvisor"),
+
+                    /// Scrape apiserver-colocated ('master node') metrics, over nodes' public IP addresses.
+                    /// (currently all nodes are 'master' nodes)
+                    // Scrape Kubernetes apiserver metrics.
+                    kubeScrapeAPIServerColocated("cluster_apiserver_metrics", 4001),
+                    // Scrape Kubernetes controller-manager metrics.
+                    kubeScrapeAPIServerColocated("cluster_controllermanager_metrics", 4003),
+                    // Scrape Kubernetes scheduler metrics.
+                    kubeScrapeAPIServerColocated("cluster_scheduler_metrics", 4005),
                 ],
 
                 remote_write: [
@@ -152,6 +195,7 @@
                     { nonResourceURLs: ["/metrics"], verbs: ["get"], },
                     // Allow to access node details for discovery.
                     { apiGroups: [""], resources: ["nodes"], verbs: ["list", "watch", "get"], },
+                    { apiGroups: [""], resources: ["endpoints", "services", "pods"], verbs: ["list", "watch", "get"], },
                     // Allow to proxy to bare node HTTP to access per-node metrics endpoints. 
                     { apiGroups: [""], resources: ["nodes/proxy"], verbs: ["get"], },
                 ],
@@ -183,11 +227,11 @@
                                     ],
                                     resources: {
                                         requests: {
-                                            memory: "256Mi",
+                                            memory: "3Gi",
                                             cpu: "100m",
                                         },
                                         limits: {
-                                            memory: "1Gi",
+                                            memory: "3Gi",
                                             cpu: "1",
                                         },
                                     },