monitoring: global: implement

This creates a basic Global instance, running Victoria Metrics on k0.

Change-Id: Ib03003213d79b41cc54efe40cd2c4837f652c0f4
diff --git a/ops/monitoring/README.md b/ops/monitoring/README.md
new file mode 100644
index 0000000..0594678
--- /dev/null
+++ b/ops/monitoring/README.md
@@ -0,0 +1,41 @@
+hscloud monitoring
+==================
+
+Quick links
+-----------
+
+ - *Old Global Dashboard*: [monitoring.hackerspace.pl](https://monitoring.hackerspace.pl) - old monitoring system, unrelated to this one, configured using Chef at management.hackerspace.pl (long since dead). This setup is supposed to replace it.
+
+Architecture
+------------
+
+The hscloud monitoring solution is two-tiered:
+
+ - at the *global* tier we run metrics aggregation, long-term storage, dashboard and alerting.
+ - at the *agent* tier we collect metrics from various sources (possibly even lower tiered agents).
+
+All agent-tier agents send metrics to all global instances.
+
+
+          .--------.     .--------.              '.
+          | global |     | global |               > - global tier
+          '--------'     '--------'              .'   (contains 'global instances')
+            |    '---. .---'    |
+            |         X         |
+            |    .---' '---.    |
+            |    |         |    |
+    .--------------.     .--------------------. '.
+    |   cluster    |     |    hswaw-proxy     |  |
+    | k0.hswaw.net |     | waw.hackerspace.pl |   > - agent tier
+    '--------------'     '--------------------' .'    (contains 'agents')
+
+
+Agent - cluster
+---------------
+
+Cluster agents are responsible for collecting Kubernetes cluster metrics. Each one runs a Prometheus server that scrapes kubelet/cadvisor/... metrics and sends them off to global instances.
+
+Global Instances
+----------------
+
+Global instances run Victoria Metrics, ingest metrics from all agents, and perform long-term storage. In the future they will also run Grafana and AlertManager.
diff --git a/ops/monitoring/k0.jsonnet b/ops/monitoring/k0.jsonnet
index 028a463..62810c5 100644
--- a/ops/monitoring/k0.jsonnet
+++ b/ops/monitoring/k0.jsonnet
@@ -1,11 +1,39 @@
-local lib = import "lib.libsonnet";
+local cluster = import "lib/cluster.libsonnet";
+local global = import "lib/global.libsonnet";
+
+// Monitoring tiers set up on k0. See README for architectural background.
 
 {
-    cluster: lib.Cluster("k0") {
-        cfg+: {
-            storageClasses+: {
-                prometheus: "waw-hdd-redundant-3",
-            },
+    local k0 = self,
+    local cfg = {
+        storageClasses+: {
+            prometheus: "waw-hdd-redundant-3",
+            victoria: "waw-hdd-redundant-3",
         },
     },
+
+    // Cluster tier - prometheus.
+    cluster: cluster.Cluster("k0") {
+        cfg+: cfg {
+            username: "cluster-k0",
+            upstreams: [
+                { password: std.split(importstr "secrets/plain/global-agent-cluster-k0", "\n")[0], remote: k0.global.internalIngestURL  },
+            ],
+        },
+    },
+
+    // Global tier - victoria metrics.
+    global: global.Global("k0") {
+        cfg+: cfg {
+            hosts: {
+                globalAPI: "monitoring-global-api.k0.hswaw.net",
+            },
+            agents: [
+                // Ingestion from k0 cluster tier.
+                { username: k0.cluster.cfg.username, password: std.split(importstr "secrets/plain/global-agent-cluster-k0", "\n")[0], },
+                // Access from q3k's test Grafana.
+                { username: "grafana", password: std.split(importstr "secrets/plain/global-agent-grafana", "\n")[0], },
+            ],
+        }, 
+    },
 }
diff --git a/ops/monitoring/lib.libsonnet b/ops/monitoring/lib.libsonnet
deleted file mode 100644
index 61f49b4..0000000
--- a/ops/monitoring/lib.libsonnet
+++ /dev/null
@@ -1,5 +0,0 @@
-local cluster = import "lib/cluster.libsonnet";
-
-{
-    Cluster: cluster.Cluster,
-}
diff --git a/ops/monitoring/lib/cluster.libsonnet b/ops/monitoring/lib/cluster.libsonnet
index 9b64f05..511d426 100644
--- a/ops/monitoring/lib/cluster.libsonnet
+++ b/ops/monitoring/lib/cluster.libsonnet
@@ -2,8 +2,10 @@
 
 {
     // Cluster sets up all cluster-specific monitoring resources in their own namespace.
+    //
     // Currently this consists of a prometheus server that scrapes k8s nodes for kubelet
-    // and cAdvisor metrics.
+    // and cAdvisor metrics, and optionally ships metrics over to the global tier via the
+    // configured upstreams.
     Cluster(name):: {
         local cluster = self,
         local cfg = cluster.cfg,
@@ -18,6 +20,17 @@
             storageClasses: {
                 prometheus: error "storageClasses.prometheus must be set",
             },
+
+            // Username used to authenticate to upstreams.
+            username: error "username must be set",
+
+            // Global tier upstreams that this cluster should ship metrics off to.
+            // List of
+            //  {
+            //     remote: URL of upstream
+            //     password: password used to authenticate, in conjunction with cfg.username.
+            //  }
+            upstreams: [],
         },
 
         namespace: kube.Namespace(cfg.namespace),
@@ -105,6 +118,17 @@
                         ],
                     },
                 ],
+
+                remote_write: [
+                    {
+                        url: u.remote,
+                        basic_auth: {
+                            username: cluster.cfg.username,
+                            password: u.password,
+                        },
+                    }
+                    for u in cluster.cfg.upstreams
+                ],
             },
 
             configmap: kube.ConfigMap("prometheus-cluster") {
@@ -152,9 +176,7 @@
                                         "/bin/prometheus",
                                         "--config.file=/etc/prometheus/prometheus.yml",
                                         "--storage.tsdb.path=/prometheus",
-                                        # TODO(q3k): reduce this once we have a long-term storage
-                                        # solution.
-                                        "--storage.tsdb.retention.time=120d",
+                                        "--storage.tsdb.retention.size=10GB",
                                         "--web.console.libraries=/usr/share/prometheus/console_libraries",
                                         "--web.console.templates=/usr/share/prometheus/consoles",
                                         "--web.enable-lifecycle",
@@ -198,7 +220,7 @@
                     accessModes: ["ReadWriteOnce"],
                     resources: {
                         requests: {
-                            storage: "32Gi",
+                            storage: "16Gi",
                         },
                     },
                 },
diff --git a/ops/monitoring/lib/global.libsonnet b/ops/monitoring/lib/global.libsonnet
new file mode 100644
index 0000000..dbdbebb
--- /dev/null
+++ b/ops/monitoring/lib/global.libsonnet
@@ -0,0 +1,149 @@
+local kube = import "../../../kube/kube.libsonnet";
+
+{
+    // Global sets up a global tier instance of the hscloud monitoring infrastructure.
+    //
+    // This currently consists of Victoria Metrics, to which the agent tier sends metrics data via
+    // the prometheus remote_write protocol.
+    // Victoria Metrics is here used as a long-term storage solution. However, right now, it
+    // just keeps data locally on disk. In the future, S3 snapshots/backups should be introduced.
+    Global(name):: {
+        local global = self,
+        local cfg = global.cfg,
+
+        cfg:: {
+            name: name,
+            namespace: "monitoring-global-%s" % [cfg.name],
+
+            images: {
+                victoria: "victoriametrics/victoria-metrics:v1.40.0",
+                vmauth: "victoriametrics/vmauth:v1.40.0",
+            },
+
+            hosts: {
+                // DNS hostname that this global tier will use. Ingress will run under it.
+                globalAPI: error "hosts.globalAPI must be set",
+            },
+
+            storageClasses: {
+                // Storage class used for main data retention.
+                victoria: error "storageClasses.victoria must be set",
+            },
+
+            // A list of agents that will push metrics to this instance.
+            // List of:
+            // {
+            //   username: the username that the agent will authenticate with
+            //   password: the password that the agent will authenticate with
+            // }
+            agents: [],
+        },
+
+        // Generated URLs that agents should use to ship metrics over. Both require HTTP basic
+        // auth, configured via cfg.agents.
+        // The internal URL should be used for agents colocated in the same Kubernetes cluster.
+        internalIngestURL:: "http://%s/api/v1/write" % [global.victoria.serviceAPI.host_colon_port],
+        // The global URL should be used for agents sending data over the internet.
+        globalIngestURL:: "https://%s/api/v1/write" % [cfg.hosts.globalAPI],
+
+        namespace: kube.Namespace(cfg.namespace),
+        local ns = global.namespace,
+
+        victoria: {
+            local victoria = self,
+
+            pvc: ns.Contain(kube.PersistentVolumeClaim("victoria-data")) {
+                spec+: {
+                    storageClassName: cfg.storageClasses.victoria,
+                    accessModes: ["ReadWriteOnce"],
+                    resources: {
+                        requests: {
+                            storage: "64Gi",
+                        },
+                    },
+                },
+            },
+
+            authSecret: ns.Contain(kube.Secret("vmauth")) {
+                data+: {
+                    "config.yaml": std.base64(std.manifestJson({
+                        users: [
+                            {
+                                username: a.username,
+                                password: a.password,
+                                url_prefix: "http://localhost:8428",
+                            }
+                            for a in cfg.agents
+                        ],
+                    }) + "\n")
+                },
+            },
+
+            deploy: ns.Contain(kube.Deployment("victoria")) {
+                spec+: {
+                    template+: {
+                        spec+: {
+                            containers_: {
+                                default: kube.Container("default") {
+                                    image: cfg.images.victoria,
+                                    volumeMounts_: {
+                                        data: { mountPath: "/victoria-metrics-data", },
+                                    },
+                                },
+                                vmauth: kube.Container("vmauth") {
+                                    image: cfg.images.vmauth,
+                                    command: [
+                                        "/vmauth-prod",
+                                        "-auth.config", "/mnt/secret/config.yaml",
+                                    ],
+                                    volumeMounts_: {
+                                        secret: { mountPath: "/mnt/secret", },
+                                    },
+                                    ports_: {
+                                        api: { containerPort: 8427 }
+                                    },
+                                }
+                            },
+                            volumes_: {
+                                data: kube.PersistentVolumeClaimVolume(victoria.pvc),
+                                secret: kube.SecretVolume(victoria.authSecret),
+                            },
+                        },
+                    },
+                },
+            },
+
+            serviceAPI: ns.Contain(kube.Service("victoria-api")) {
+                target_pod: victoria.deploy.spec.template,
+                spec+: {
+                    ports: [
+                        { name: "api", port: 8427, targetPort: 8427, protocol: "TCP" },
+                    ],
+                    type: "ClusterIP",
+                },
+            },
+
+            ingressAPI: ns.Contain(kube.Ingress("victoria-api")) {
+                metadata+: {
+                    annotations+: {
+                        "kubernetes.io/tls-acme": "true",
+                        "certmanager.k8s.io/cluster-issuer": "letsencrypt-prod",
+                    },
+                },
+                spec+: {
+                    tls: [
+                        { hosts: [cfg.hosts.globalAPI], secretName: "ingress-tls" },
+                    ],
+                    rules: [
+                        {
+                            host: cfg.hosts.globalAPI,
+                            http: {
+                                paths: [ { path: "/", backend: { serviceName: victoria.serviceAPI.metadata.name, servicePort: 8427 } }, ],
+                            },
+                        }
+                    ],
+                },
+            },
+        },
+    }
+}
diff --git a/ops/monitoring/secrets/cipher/global-agent-cluster-k0 b/ops/monitoring/secrets/cipher/global-agent-cluster-k0
new file mode 100644
index 0000000..29e715b
--- /dev/null
+++ b/ops/monitoring/secrets/cipher/global-agent-cluster-k0
@@ -0,0 +1,40 @@
+-----BEGIN PGP MESSAGE-----
+
+hQEMAzhuiT4RC8VbAQgAiZLuysTzxY8VM1wOAC7Hb0/3dHh0/5cFG1nOC6svnVt4
+NLZG0K+9uSuku76N/TZak1lk0pieeW9PE+FBDAAjUhGKS1/0qvZmG2Y5T3qs7pYf
+0Zv68hKix88bEfK7yfF/t68cYB1F2ms/4Y5tCBuW3av8MI7XQifWdnwgokxbE6xY
+yhGpII6zZemfA+kuMo4BRsyy2Z1xsKo7Ah64hQQUQFXwzr+i4hzpp2AeVlWAcFNj
+IlHPxA02ZcBCtjz2DLShFN2s8WBenboM88eUfeKpRAbMMfGcycmpIt7uf6pZ1UJa
+viTnfV1juqyXaMLECOBNYBhlMagjRIZ0CbM/5mn3Q4UBDANcG2tp6fXqvgEIAK2M
+pbeD3JpNE6pRvQsAHuKObQ+Bm82CxZg2uS9OPwNm6l7ESROpCnTRU8ahHIJO2f1d
+IMXzLO4M6QMb5FpAl2ixsT/SeZ9Z8NSxcl1ndByTRPQ3wSNfCV8wW7tXIWHzv1El
+pjBRowEbitwuwFgfgk86lYdYLKRPefAPr4fFNQV6aGLSWdVMo6vdR/C78xDivduy
+A79Fu64+nsKgOrKHkcxn4YyhFDTOt7avpCX3xAFDWoN7w3W5iQ/EQk+6SVnfsqjo
+IqTcxcS1o1TxpEjyoBPgpAERFEJEjIE2Dpo8E5UjJDLHMtSJMMqrAW7nLZJimI+c
+DSY83h5VtCzAnvjIXYeFAgwDodoT8VqRl4UBD/4ujIoPDkIZ/dLGbiwtlwZz4giY
+2LbfxHq2nkwy3+V6fbAxp+GyK4lB4XiZia2lMeWk5UECK8z9fpAhGvrFaSwjXkvn
+ig8LY4WFW8uxjtilJXxPBIqkl9EMyEZJaBFQ3d9icE2ZgPV7peXtZBYgZVD4fY7E
+Pxi2CTRrILr628Vqpbo0GdB88NdDd24wzkec6rVVV8WktWbyNXvzzJE6BbkcdBj6
+DyfG55SKSQAjYfC3b8LYtcCZDXiidFMflDXraVuaoOWHKuAb8Mwhrz0TwdTVSH0G
+xcnANQjBXf+TNfm8nrfLLSnmyili4qOgEuRQKeSJR+aiR3kDQhV0exd/jAJlcTcF
++QuLWEgpj95W+TD9s/EaVjRBIWs2TFvsUYlU+HZap0GzFiBQQubpK/EgJsh6Xz1/
+3mjDxG5vv/Wdti0ko3oul9koS24eNK07lwM6g/GwtLhT48h/Db/M/9/Vadx+T6KW
+fEKTdYyn438hOVKqrKwIRLp98e++VbLIg7pQsH0YOFK0QMFk6N7IbYRSEBnuDD+F
+W3VJ4wiP0dwM2LttHwFTapReaDkORYv70rvb3mplp4kCLF7AvUJJQaU1cncuLdwd
+Sj3iDu8s0lf3lM0Y+SlB1xDkzHrOGmcC0I4JVH+padBDKCpAn/8IbFmlaiV1Xjtd
+KTwB6+NcMtmqX+ObNYUCDAPiA8lOXOuz7wEP/A9JFmiFZ36mPnfjbA7OHz6U3zOT
+71iHwJDJXXG4g83WKtOLTYaNAizjXVz7wInWbwigTK2uD38lzOfArbU7UaAP38yS
+89xNff5YOQASO71AJutoulUbA43TFF0gVqtcqsYJO7Cx+DSrBwhHXtFsppOVeMsH
+9CmxVskPnwyZGG1yAJJ+EnD+y+SGmUh97EyWH+UKNZ4fiXVnwNt1ffY6vFYz61d8
+AferfbvKy/9UN2gxn+QDVjDdyf5Qk14t6ljdBnO/RVbZNpU7pIRepttUFlCr87a+
+SZR+WGPOaedzq/8nu3rABEqOxyLd0W0c9eOFwAtLxszjQ3yakhDseH2xlpeqa5g1
+YzPFL79ywFDLjdfnPju8OBROEtyZD2mNrCqR846xIjPERjMhDYSS7DUI73iC7hrW
+5hzfNj7ky1l0mYg3lfIdbtDrQO/HjnsYL3JA8WTcNHVEx3EpgEpQCz7g2TZAfpNs
+E08pkn8hLuNg+PH1RvTFLTVflclfZsnTPu7np7TTE8O1OaA8qUG/P1nAXX+wX93a
+a2uHQ39I3VZqPA7eRK+Gp6lDSBP1pbZbKz5tV/9glVPXKXY+bDamlWE7kgXbGOsY
+zILK+jN9GFad4/gg9b7Upw5EdphnyAoob2SbrKbFN5ALyfFbgG+wFhFdb2oS7tCk
+eLx5zEc+aQReFEQ30nMBLEWI+DfbN4nby1Ccp3bcOQvnSr9a0XJYSgFtcVve9aWw
+/tYpNn0+fo1M1J+93UPjfjL9/ApDH3dDaS6L6WR8jn2EHALHUbwNuShBnDcCAlQe
+/ZVaR2LGJprPHURChG2pognfRZhp+YK06diTwUHtyHir
+=e/F1
+-----END PGP MESSAGE-----
diff --git a/ops/monitoring/secrets/cipher/global-agent-grafana b/ops/monitoring/secrets/cipher/global-agent-grafana
new file mode 100644
index 0000000..fd501e6
--- /dev/null
+++ b/ops/monitoring/secrets/cipher/global-agent-grafana
@@ -0,0 +1,40 @@
+-----BEGIN PGP MESSAGE-----
+
+hQEMAzhuiT4RC8VbAQf+NMwFZv9tVcUOo13hi7r2Z5V294dseTFk+q3nE//ZmWx6
+6LL70Ggdc3etozf9w6uriQG0wbrfy7XwOYkpFJYaJb2gut0xxG/Fw221ZGR8elpe
+79FzveD0FUZK+UdixXMiqYQOiwUK5+RbjhKN+R3WjG5mrClHDeCG5WrXFPvT9wOX
+dA3ED/ZczrNxvSbKeE1imFoCeudrC9/zo/CRmb0BHrIoOEe+vCe/MzN0s/fkiq1k
+RyZZxJ0M6/oudlcexyaYJcTdBTW1ZMNmmZ9lWsBjmf5kTKGu1tDUuMU9RBJmtyYn
+8euaTwwCOfZjz6vdKoGer07ftEyfbjDGuU6zOtN9Z4UBDANcG2tp6fXqvgEH/i++
+MbBnFbCOajtIN2xN8P6WiP6RjPmUKaKLJCltZHqPPYuULFWTa8uBIbfqVjtgFfoE
+43eBgP+D1EQooq6O7NqWcoCY7LphwKx///oWsmeuiRy+wwQOGMV45tF31n944P1U
+qZGhik8n7pxLkNaC5ohaucQJeaDSi7GuMATzBWdGY6lZkaNLUfrPmKXu9tyIaA4u
+b80gPvFWc/9PgnS9rAcVPA7/8Il53EVJJsYK2/S7nDFKRTJfThId4cvvAFUXkuwW
+hM6FEFcJdSW90qbhBGCwr3yfvSb3Je0k7gYrg9iQTLxRpIbG3Mbz9irEKno3alGe
+8FvuAdmVn1AVyxNomT+FAgwDodoT8VqRl4UBD/0bGb+8AF3mTmEMTHAPPLUIxhu+
+ihb4Po6OxQ7u/UCMSHXhFQCqb4ytK2JsG2UhcIiYrQrZMVGQh3rGcfZESHpX1/Ol
+rTwkjUZSSnek8M1hbEkS7PU2rePsXt/O07+zenqMO3pMeVsX+VLEGXRS6KZ9WXsc
+X09iLyqBErgntaM7otMSZVSPV7VFEaIoVdPe4NZxudDMedeA0hr1BneaVUNVjMtQ
+UE6ZVxzFSoqMnfsJY9/dn/uhHdv7qOhzw0ABINmDybI6IWNEaGzzSJC2HcHZjocY
+h2Se4mzxjOz9X4CG28h8b9jFHRtSe4OiSAQYxwtZNxGR6kCt1PfP20oemPBg+LXF
+T6+ledT5nkkaoztl5EOxKoh20BfbNOR2AWbPYuRLJ7OKF/dFDJ3PndPwSjEvPZDY
+xpvHszqVlcMpleqM/iQILD33Nzz9RhtSZHQiGYuZsak4aeUWsz/3a6JhtcliK4Do
+CyG3J1wVf0a+jsXlDr0M50qf+aY7k76zTqtfXcPSKypMeP0yZaYMPCnzHvFpj39u
+u3NGOEiwv1WXMrUn59SuL9X6aP5s4D455E5JDuOFKrndN78CKqsCTPkNGOCU5F+f
+B25lXY0yF22RLiHbWiAGcM+u4roi7qA8HWYly6lqOl3imk3D3NJP+ENGyoCxgfcQ
+GIrl8z5fyE5GtJUgAoUCDAPiA8lOXOuz7wEP/3ZmA1njB/F1nu/vafx90O0A0Mmr
+J7EvveK89W2P5JsZjEX/sVSurx1kY4U1Lofe00jdbsQNtfQ33/K2zr0Vb5G3VQxL
+QnsksGO9MjywwVzpspuS+gQE2P6VU5YjpHXA15SJXkJV/SDvxoPSyPt5x3m0nU+9
+aj43mTgKdvTwSeDoEHzt54KRY01HOugpmmY/TZ5Wkeam2vNsCaSEYAdDTaSEG6j2
+OfxQf8X/A+7RDwyoVpjDg4LVAyHmcomWBeudEH4GkR+oGC7YQ39QEb4TA9h0ufUO
+2N1XWIf+FraCFX5x9IeKoe9ZyInz8I24lRMRHHu1RGrluGHCjflVzuIfOnasy2yU
+CX9EhvoL2IxfCLdY2XoBcrpCcTr+FKu4/5n4P3WN6TSWnQlvCypgfJ7zkjGWytQW
+cChiouYvYLcW2q1opZgIu3J1i7QnS6qHzzhK3eejBF0DZinxs+q7cylVVAzrq5FF
+p6+v6OXlZRbsIoBJg0kKKfUwqzoTwEFPvPMvoIxCmwSfQmlnemeXrXdahqbZXgIO
+a4jMrlene6Asr7xVZ+9siv5plPogmvWco/950KmKlXOUEg439nADHzhTYPmai5YA
+i9inb9B1sAcpbYQejUwnIx+W11qsyE2PAXHdj/mvVm1fzO/VJ+yGHtnKwfGtHS8x
+v7vq6yCM2p4HPQLC0m4BFIvHf29iKhYDxjj3F0d//VEez/43+79bDDakmS5StD6P
+DykfoaYRBijgUfhG4a4UbMBbpIuukwBI0EQoy+3Sca6Rx1Da5lIPXuMbb/N/c4rp
+xL1OhySubTvzg2yTCfUoEuMWZFL1rwgT9lrKVg==
+=Do8R
+-----END PGP MESSAGE-----
diff --git a/ops/monitoring/secrets/plain/.gitignore b/ops/monitoring/secrets/plain/.gitignore
new file mode 100644
index 0000000..72e8ffc
--- /dev/null
+++ b/ops/monitoring/secrets/plain/.gitignore
@@ -0,0 +1 @@
+*