Merge "cluster/kube: split up cluster.jsonnet"
diff --git a/WORKSPACE b/WORKSPACE
index 138b93a..268259b 100644
--- a/WORKSPACE
+++ b/WORKSPACE
@@ -105,29 +105,6 @@
 
 container_repositories()
 
-# Nix rules
-http_archive(
-    name = "io_tweag_rules_nixpkgs",
-    strip_prefix = "rules_nixpkgs-33c50ba64c11dddb95823d12f6b1324083cc5c43",
-    urls = ["https://github.com/tweag/rules_nixpkgs/archive/33c50ba64c11dddb95823d12f6b1324083cc5c43.tar.gz"],
-    sha256 = "91fedd5151bbd9ef89efc39e2172921bd7036c68cff54712a5df8ddf62bd6922",
-)
-
-# Nix packages
-
-load("@io_tweag_rules_nixpkgs//nixpkgs:nixpkgs.bzl", "nixpkgs_git_repository", "nixpkgs_package")
-
-nixpkgs_git_repository(
-    name = "nixpkgs",
-    revision = "2f1f9a9fe8a3c22f0677733523eaf6bd33995d50",
-)
-
-nixpkgs_package(
-    name = "nixops",
-    attribute_path = "nixops",
-    repositories = {"nixpkgs": "@nixpkgs"},
-)
-
 # Python rules
 
 git_repository(
diff --git a/cluster/kube/lib/metrics.libsonnet b/cluster/kube/lib/metrics.libsonnet
index e11f5ef..fda3a59 100644
--- a/cluster/kube/lib/metrics.libsonnet
+++ b/cluster/kube/lib/metrics.libsonnet
@@ -1,4 +1,5 @@
 # Deploy a per-cluster Metrics Server setup.
+# These are Kubernetes metrics (the Metrics API behind e.g. kubectl top), not Prometheus metrics.
 
 local kube = import "../../../kube/kube.libsonnet";
 
diff --git a/cluster/tools/install.sh b/cluster/tools/install.sh
index 08e3476..6f32fbb 100755
--- a/cluster/tools/install.sh
+++ b/cluster/tools/install.sh
@@ -15,9 +15,3 @@
         //cluster/tools:calicoctl \
         //cluster/tools:cfssl
 
-if [ ! -e /nix ] ; then
-    echo "WARNING: No Nix installation detected. nix-dependent tools (nixops) will not be built or available." 
-else
-    bazel build \
-            //cluster/tools:nixops
-fi
diff --git a/ops/monitoring/OWNERS b/ops/monitoring/OWNERS
new file mode 100644
index 0000000..318c819
--- /dev/null
+++ b/ops/monitoring/OWNERS
@@ -0,0 +1,3 @@
+owners:
+- q3k
+- implr
diff --git a/ops/monitoring/doc/index.md b/ops/monitoring/doc/index.md
new file mode 100644
index 0000000..b17287c
--- /dev/null
+++ b/ops/monitoring/doc/index.md
@@ -0,0 +1,38 @@
+Monitoring
+==========
+
+Setting up monitoring in hscloud is a work in progress.
+
+Components
+----------
+
+Currently we have a per-cluster setup with Prometheus scraping Kubernetes nodes
+(kubelets) for kubelet metrics and cAdvisor metrics.
+
+    .--------------------------------------------------------------.
+    |                         k0.hswaw.net                         |
+    |--------------------------------------------------------------|
+    |  .------------------------.                                  |
+    |  | ns: monitoring-cluster |    .--------------------------.  |
+    |  |------------------------|    |  kubernetes.svc.cluster  |  |
+    |  | prometheus             |--> | apiserver proxy to nodes |  |
+    |  '------------------------'    '--------------------------'  |
+    |                                            |                 |
+    '------------------------------------------- v ----------------'
+                                  .---------------------.
+                                  |   bc0n01.hswaw.net  |-.
+                      Kubernetes  |---------------------| |-.
+                           Nodes  |  /metrics           |-| |
+                                  |  /metrics/cadvisor  | |-|
+                                  '---------------------' | |
+                                    '---------------------' |
+                                      '---------------------'
+
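+To check this path by hand, you can ask the apiserver to proxy to a node's
+kubelet yourself (the node name below is just an example; any node from
+`kubectl get nodes` will do):
+
+    kubectl get --raw /api/v1/nodes/bc0n01.hswaw.net/proxy/metrics/cadvisor
+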
+Everything else (dashboard, aggregation, user metrics) is a work in progress.
+
+Legacy
+------
+
+There is a legacy Prometheus/Grafana VM at https://metrics.hackerspace.pl/. Its
+certificate is expired, but it Generally Works, and will be kept going until
+its functionality is migrated to hscloud.
diff --git a/ops/monitoring/k0.jsonnet b/ops/monitoring/k0.jsonnet
new file mode 100644
index 0000000..028a463
--- /dev/null
+++ b/ops/monitoring/k0.jsonnet
@@ -0,0 +1,11 @@
+local lib = import "lib.libsonnet";
+
+{
+    cluster: lib.Cluster("k0") {
+        cfg+: {
+            storageClasses+: {
+                prometheus: "waw-hdd-redundant-3",
+            },
+        },
+    },
+}
diff --git a/ops/monitoring/lib.libsonnet b/ops/monitoring/lib.libsonnet
new file mode 100644
index 0000000..61f49b4
--- /dev/null
+++ b/ops/monitoring/lib.libsonnet
@@ -0,0 +1,5 @@
+local cluster = import "lib/cluster.libsonnet";
+
+{
+    Cluster: cluster.Cluster,
+}
diff --git a/ops/monitoring/lib/cluster.libsonnet b/ops/monitoring/lib/cluster.libsonnet
new file mode 100644
index 0000000..9b64f05
--- /dev/null
+++ b/ops/monitoring/lib/cluster.libsonnet
@@ -0,0 +1,227 @@
+local kube = import "../../../kube/kube.libsonnet";
+
+{
+    // Cluster sets up all cluster-specific monitoring resources in their own namespace.
+    // Currently this consists of a prometheus server that scrapes k8s nodes for kubelet
+    // and cAdvisor metrics.
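+    //
+    // Example usage (cluster name and storage class here are hypothetical; see
+    // //ops/monitoring/k0.jsonnet for a real instantiation):
+    //
+    //   local lib = import "lib.libsonnet";
+    //   lib.Cluster("k1") {
+    //       cfg+: {
+    //           storageClasses+: { prometheus: "some-hdd-class" },
+    //       },
+    //   }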
+    Cluster(name):: {
+        local cluster = self,
+        local cfg = cluster.cfg,
+        cfg:: {
+            name: name,
+            namespace: "monitoring-cluster",
+
+            images: {
+                prometheus: "prom/prometheus:v2.18.1",
+            },
+
+            storageClasses: {
+                prometheus: error "storageClasses.prometheus must be set",
+            },
+        },
+
+        namespace: kube.Namespace(cfg.namespace),
+
+        prometheus: {
+            local prometheus = self,
+
+            // Configuration that's going to be emitted as prometheus.yml and passed to the
+            // prometheus server for this cluster.
+            configuration:: {
+                global: {
+                    external_labels: {
+                        cluster: cluster.cfg.name,
+                    },
+                },
+
+                // Constructor for a Kubernetes scrape job that uses the pod's service account and
+                // TLS configuration, selecting the given k8s scrape 'role'.
+                local kubeScrapeConfig = function(name, role) {
+                    job_name: name,
+                    scheme: "https",
+                    scrape_interval: "30s",
+                    kubernetes_sd_configs: [ { role: role }, ],
+                    tls_config: {
+                        ca_file: "/var/run/secrets/kubernetes.io/serviceaccount/ca.crt",
+                    },
+                    bearer_token_file: "/var/run/secrets/kubernetes.io/serviceaccount/token",
+                },
+
+                scrape_configs: [
+                    // When scraping node-based metrics (i.e. node and cadvisor metrics) we contact
+                    // the metrics endpoints on the kubelet via the API server. This is done by
+                    // relabeling __address__ and __metrics_path__ to point at the k8s API server,
+                    // and at the API server proxy path to reach a node's metrics endpoint.
+                    //
+                    // This approach was lifted from the Prometheus examples for Kubernetes, and
+                    // while the benefits outlined there do not matter that much to us (our
+                    // kubelets listen on public addresses anyway), we still like it because it
+                    // means we don't have to hardcode the kubelet TLS port.
+                    //
+                    // https://github.com/prometheus/prometheus/blob/master/documentation/examples/prometheus-kubernetes.yml
+                    //
+                    // When contacting the API server, we hardcode the 'hswaw.net' DNS suffix as
+                    // our API server's TLS certificate only has a CN/SAN for its full FQDN, not
+                    // the .svc.cluster.local shorthand (see //cluster/clustercfg:clustercfg.py).
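+                    //
+                    // Concretely, for cluster k0 and a node bc0n01.hswaw.net (an example name),
+                    // the relabeled node job target becomes:
+                    //
+                    //   https://kubernetes.default.svc.k0.hswaw.net:443/api/v1/nodes/bc0n01.hswaw.net/proxy/metrics
+                    //
+                    // and the cadvisor job hits the same path with a /cadvisor suffix.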
+
+                    // Scrape Kubernetes node metrics via apiserver. This emits kubelet_* metrics.
+                    kubeScrapeConfig("cluster_node_metrics", "node") {
+                        relabel_configs: [
+                            {
+                                action: "labelmap",
+                                regex: "__meta_kubernetes_node_label_(.+)",
+                            },
+                            {
+                                action: "replace",
+                                target_label: "__address__",
+                                replacement: "kubernetes.default.svc.%s.hswaw.net:443" % [cluster.cfg.name],
+                            },
+                            {
+                                target_label: "__metrics_path__",
+                                source_labels: ["__meta_kubernetes_node_name"],
+                                regex: "(.+)",
+                                replacement: "/api/v1/nodes/${1}/proxy/metrics",
+                            },
+                        ],
+                    },
+                    // Scrape Kubernetes node cadvisor metrics via apiserver. This emits container_* metrics.
+                    kubeScrapeConfig("cluster_cadvisor_metrics", "node") {
+                        relabel_configs: [
+                            {
+                                action: "labelmap",
+                                regex: "__meta_kubernetes_node_label_(.+)",
+                            },
+                            {
+                                action: "replace",
+                                target_label: "__address__",
+                                replacement: "kubernetes.default.svc.%s.hswaw.net:443" % [cluster.cfg.name],
+                            },
+                            {
+                                target_label: "__metrics_path__",
+                                source_labels: ["__meta_kubernetes_node_name"],
+                                regex: "(.+)",
+                                replacement: "/api/v1/nodes/${1}/proxy/metrics/cadvisor",
+                            },
+                        ],
+                    },
+                ],
+            },
+
+            configmap: kube.ConfigMap("prometheus-cluster") {
+                metadata+: {
+                    namespace: cfg.namespace,
+                },
+                data: {
+                    "prometheus.yml": std.manifestYamlDoc(prometheus.configuration),
+                },
+            },
+
+            sa: kube.ServiceAccount("prometheus-cluster") {
+                metadata+: {
+                    namespace: cfg.namespace,
+                },
+            },
+
+            cr: kube.ClusterRole("monitoring-cluster-prometheus-server-%s" % [cfg.name]) {
+                rules: [
+                    // Allow access to all metrics.
+                    { nonResourceURLs: ["/metrics"], verbs: ["get"], },
+                    // Allow access to node details for discovery.
+                    { apiGroups: [""], resources: ["nodes"], verbs: ["list", "watch", "get"], },
+                    // Allow proxying to bare node HTTP to access per-node metrics endpoints.
+                    { apiGroups: [""], resources: ["nodes/proxy"], verbs: ["get"], },
+                ],
+            },
+
+            crb: kube.ClusterRoleBinding("monitoring-cluster-prometheus-server-%s" % [cfg.name]) {
+                subjects_: [prometheus.sa],
+                roleRef_: prometheus.cr,
+            },
+
+            deploy: kube.Deployment("prometheus-cluster") {
+                metadata+: {
+                    namespace: cfg.namespace,
+                },
+                spec+: {
+                    template+: {
+                        spec+: {
+                            containers_: {
+                                default: kube.Container("default") {
+                                    image: cfg.images.prometheus,
+                                    command: [
+                                        "/bin/prometheus",
+                                        "--config.file=/etc/prometheus/prometheus.yml",
+                                        "--storage.tsdb.path=/prometheus",
+                                        // TODO(q3k): reduce this once we have a long-term storage
+                                        // solution.
+                                        "--storage.tsdb.retention.time=120d",
+                                        "--web.console.libraries=/usr/share/prometheus/console_libraries",
+                                        "--web.console.templates=/usr/share/prometheus/consoles",
+                                        "--web.enable-lifecycle",
+                                    ],
+                                    resources: {
+                                        requests: {
+                                            memory: "256Mi",
+                                            cpu: "100m",
+                                        },
+                                        limits: {
+                                            memory: "1Gi",
+                                            cpu: "1",
+                                        },
+                                    },
+                                    volumeMounts_: {
+                                        data: { mountPath: "/prometheus", },
+                                        configmap: { mountPath: "/etc/prometheus", },
+                                    },
+                                },
+                            },
+                            serviceAccountName: prometheus.sa.metadata.name,
+                            tolerations: [
+                                { key: "CriticalAddonsOnly", operator: "Exists" },
+                            ],
+                            volumes_: {
+                                data: kube.PersistentVolumeClaimVolume(prometheus.pvc),
+                                configmap: kube.ConfigMapVolume(prometheus.configmap),
+                            },
+                        },
+                    },
+                },
+            },
+
+            // Storage volume for the scraped Kubernetes metrics (Prometheus TSDB).
+            pvc: kube.PersistentVolumeClaim("prometheus-cluster") {
+                metadata+: {
+                    namespace: cfg.namespace,
+                },
+                spec+: {
+                    storageClassName: cfg.storageClasses.prometheus,
+                    accessModes: ["ReadWriteOnce"],
+                    resources: {
+                        requests: {
+                            storage: "32Gi",
+                        },
+                    },
+                },
+            },
+
+            // Network Policy governing access to the prometheus server.
+            np: kube.NetworkPolicy("prometheus-cluster") {
+                metadata+: {
+                    namespace: cfg.namespace,
+                },
+                spec+: kube.podLabelsSelector(prometheus.deploy) {
+                    ingress_: {
+                        // Deny all inbound traffic to pod.
+                        // This will be augmented to allow access from some other pod/namespace
+                        // in the future.
+                    },
+                    egress_: {
+                        // Allow all outbound traffic from pod.
+                        outboundAll: {},
+                    },
+                    policyTypes: ["Ingress", "Egress"],
+                },
+            },
+        },
+    },
+}