monitoring: global: implement
This creates a basic Global instance, running Victoria Metrics on k0.
Change-Id: Ib03003213d79b41cc54efe40cd2c4837f652c0f4
diff --git a/ops/monitoring/README.md b/ops/monitoring/README.md
new file mode 100644
index 0000000..0594678
--- /dev/null
+++ b/ops/monitoring/README.md
@@ -0,0 +1,41 @@
+hscloud monitoring
+==================
+
+Quick links
+-----------
+
+ - *Old Global Dashboard*: [monitoring.hackerspace.pl](https://monitoring.hackerspace.pl) - old monitoring system, unrelated to this one, configured using Chef at management.hackerspace.pl (long since dead). This setup is supposed to replace it.
+
+Architecture
+------------
+
+The hscloud monitoring solution is two-tiered:
+
+ - at the *global* tier we run metrics aggregation, long-term storage, dashboard and alerting.
+ - at the *agent* tier we collect metrics from various sources (possibly even lower-tiered agents).
+
+All agent-tier agents send metrics to all global instances.
+
+
+ .--------. .--------. '.
+ | global | | global | > - global tier
+ '--------' '--------' .' (contains 'global instances')
+ | '---. .---' |
+ | X |
+ | .---' '---. |
+ | | | |
+ .--------------. .--------------------. '.
+ | cluster | | hswaw-proxy | |
+ | k0.hswaw.net | | waw.hackerspace.pl | > - agent tier
+ '--------------' '--------------------' .' (contains 'agents')
+
+
+Agent - cluster
+---------------
+
+Cluster agents are responsible for collecting Kubernetes cluster metrics. They run a prometheus server that scrapes kubelet/cadvisor/... metrics and sends them off to global instances.
+
+Global Instances
+----------------
+
+Global instances run Victoria Metrics, ingest metrics from all agents, and perform long-term storage. In the future they will also run Grafana and AlertManager.
diff --git a/ops/monitoring/k0.jsonnet b/ops/monitoring/k0.jsonnet
index 028a463..62810c5 100644
--- a/ops/monitoring/k0.jsonnet
+++ b/ops/monitoring/k0.jsonnet
@@ -1,11 +1,39 @@
-local lib = import "lib.libsonnet";
+local cluster = import "lib/cluster.libsonnet";
+local global = import "lib/global.libsonnet";
+
+// Monitoring tiers set up on k0. See README for architectural background.
{
- cluster: lib.Cluster("k0") {
- cfg+: {
- storageClasses+: {
- prometheus: "waw-hdd-redundant-3",
- },
+ local k0 = self,
+ local cfg = {
+ storageClasses+: {
+ prometheus: "waw-hdd-redundant-3",
+ victoria: "waw-hdd-redundant-3",
},
},
+
+ // Cluster tier - prometheus.
+ cluster: cluster.Cluster("k0") {
+ cfg+: cfg {
+ username: "cluster-k0",
+ upstreams: [
+ { password: std.split(importstr "secrets/plain/global-agent-cluster-k0", "\n")[0], remote: k0.global.internalIngestURL },
+ ],
+ },
+ },
+
+ // Global tier - victoria metrics.
+ global: global.Global("k0") {
+ cfg+: cfg {
+ hosts: {
+ globalAPI: "monitoring-global-api.k0.hswaw.net",
+ },
+ agents: [
+ // Ingestion from k0 cluster tier.
+ { username: k0.cluster.cfg.username, password: std.split(importstr "secrets/plain/global-agent-cluster-k0", "\n")[0], },
+ // Access from q3k's test Grafana.
+ { username: "grafana", password: std.split(importstr "secrets/plain/global-agent-grafana", "\n")[0], },
+ ],
+ },
+ },
}
diff --git a/ops/monitoring/lib.libsonnet b/ops/monitoring/lib.libsonnet
deleted file mode 100644
index 61f49b4..0000000
--- a/ops/monitoring/lib.libsonnet
+++ /dev/null
@@ -1,5 +0,0 @@
-local cluster = import "lib/cluster.libsonnet";
-
-{
- Cluster: cluster.Cluster,
-}
diff --git a/ops/monitoring/lib/cluster.libsonnet b/ops/monitoring/lib/cluster.libsonnet
index 9b64f05..511d426 100644
--- a/ops/monitoring/lib/cluster.libsonnet
+++ b/ops/monitoring/lib/cluster.libsonnet
@@ -2,8 +2,10 @@
{
// Cluster sets up all cluster-specific monitoring resources in their own namespace.
+ //
// Currently this consists of a prometheus server that scrapes k8s nodes for kubelet
- // and cAdvisor metrics.
+ // and cAdvisor metrics, and optionally ships these metrics to the global tier via the
+ // configured upstreams.
Cluster(name):: {
local cluster = self,
local cfg = cluster.cfg,
@@ -18,6 +20,17 @@
storageClasses: {
prometheus: error "storageClasses.prometheus must be set",
},
+
+ // Username used to authenticate to upstreams.
+ username: error "username must be set",
+
+ // Global tier upstreams that this cluster should ship metrics off to.
+ // List of
+ // {
+ // remote: URL of upstream
+ // password: password used to authenticate, in conjunction with cfg.username.
+ // }
+ upstreams: [],
},
namespace: kube.Namespace(cfg.namespace),
@@ -105,6 +118,17 @@
],
},
],
+
+ remote_write: [
+ {
+ url: u.remote,
+ basic_auth: {
+ username: cluster.cfg.username,
+ password: u.password,
+ },
+ }
+ for u in cluster.cfg.upstreams
+ ],
},
configmap: kube.ConfigMap("prometheus-cluster") {
@@ -152,9 +176,7 @@
"/bin/prometheus",
"--config.file=/etc/prometheus/prometheus.yml",
"--storage.tsdb.path=/prometheus",
- # TODO(q3k): reduce this once we have a long-term storage
- # solution.
- "--storage.tsdb.retention.time=120d",
+ "--storage.tsdb.retention.size=10GB",
"--web.console.libraries=/usr/share/prometheus/console_libraries",
"--web.console.templates=/usr/share/prometheus/consoles",
"--web.enable-lifecycle",
@@ -198,7 +220,7 @@
accessModes: ["ReadWriteOnce"],
resources: {
requests: {
- storage: "32Gi",
+ storage: "16Gi",
},
},
},
diff --git a/ops/monitoring/lib/global.libsonnet b/ops/monitoring/lib/global.libsonnet
new file mode 100644
index 0000000..dbdbebb
--- /dev/null
+++ b/ops/monitoring/lib/global.libsonnet
@@ -0,0 +1,149 @@
+local kube = import "../../../kube/kube.libsonnet";
+
+{
+ // Global sets up a global tier instance of the hscloud monitoring infrastructure.
+ //
+ // This currently consists of Victoria Metrics, to which the agent tier sends metrics data via
+ // the prometheus remote_write protocol.
+ // Victoria Metrics is here used as a long-term storage solution. However, right now, it
+ // just keeps data locally on disk. In the future, S3 snapshots/backups should be introduced.
+ Global(name):: {
+ local global = self,
+ local cfg = global.cfg,
+
+ cfg:: {
+ name: name,
+ namespace: "monitoring-global-%s" % [cfg.name],
+
+ images: {
+ victoria: "victoriametrics/victoria-metrics:v1.40.0",
+ vmauth: "victoriametrics/vmauth:v1.40.0",
+ },
+
+ hosts: {
+ // DNS hostname that this global tier will use. Ingress will run under it.
+ globalAPI: error "hosts.globalAPI must be set",
+ },
+
+ storageClasses: {
+ // Storage class used for main data retention.
+ victoria: error "storageClasses.victoria must be set",
+ },
+
+ // A list of agents that will push metrics to this instance.
+ // List of:
+ // {
+ // username: the username that the agent will authenticate with
+ // password: the password that the agent will authenticate with
+ // }
+ agents: [],
+ },
+
+ // Generated URLs that agents should use to ship metrics over. Both require HTTP basic
+ // auth, configured via cfg.agents.
+ // The internal URL should be used for agents colocated in the same Kubernetes cluster.
+ internalIngestURL:: "http://%s/api/v1/write" % [global.victoria.serviceAPI.host_colon_port],
+ // The global URL should be used for agents sending data over the internet.
+ globalIngestURL:: "https://%s/api/v1/write" % [cfg.hosts.globalAPI],
+
+ namespace: kube.Namespace(cfg.namespace),
+ local ns = global.namespace,
+
+ victoria: {
+ local victoria = self,
+
+ pvc: ns.Contain(kube.PersistentVolumeClaim("victoria-data")) {
+ spec+: {
+ storageClassName: cfg.storageClasses.victoria,
+ accessModes: ["ReadWriteOnce"],
+ resources: {
+ requests: {
+ storage: "64Gi",
+ },
+ },
+ },
+ },
+
+ authSecret: ns.Contain(kube.Secret("vmauth")) {
+ data+: {
+ "config.yaml": std.base64(std.manifestJson({
+ users: [
+ {
+ username: a.username,
+ password: a.password,
+ url_prefix: "http://localhost:8428",
+ }
+ for a in cfg.agents
+ ],
+ }) + "\n")
+ },
+ },
+
+ deploy: ns.Contain(kube.Deployment("victoria")) {
+ spec+: {
+ template+: {
+ spec+: {
+ containers_: {
+ default: kube.Container("default") {
+ image: cfg.images.victoria,
+ volumeMounts_: {
+ data: { mountPath: "/victoria-metrics-data", },
+ },
+ },
+ vmauth: kube.Container("vmauth") {
+ image: cfg.images.vmauth,
+ command: [
+ "/vmauth-prod",
+ "-auth.config", "/mnt/secret/config.yaml",
+ ],
+ volumeMounts_: {
+ secret: { mountPath: "/mnt/secret", },
+ },
+ ports_: {
+ api: { containerPort: 8427 }
+ },
+ }
+ },
+ volumes_: {
+ data: kube.PersistentVolumeClaimVolume(victoria.pvc),
+ secret: kube.SecretVolume(victoria.authSecret),
+ },
+ },
+ },
+ },
+ },
+
+ serviceAPI: ns.Contain(kube.Service("victoria-api")) {
+ target_pod: victoria.deploy.spec.template,
+ spec+: {
+ ports: [
+ { name: "api", port: 8427, targetPort: 8427, protocol: "TCP" },
+ ],
+ type: "ClusterIP",
+ },
+ },
+
+ ingressAPI: ns.Contain(kube.Ingress("victoria-api")) {
+ metadata+: {
+ annotations+: {
+ "kubernetes.io/tls-acme": "true",
+ "certmanager.k8s.io/cluster-issuer": "letsencrypt-prod",
+ },
+ },
+ spec+: {
+ tls: [
+ { hosts: [cfg.hosts.globalAPI], secretName: "ingress-tls" },
+ ],
+ rules: [
+ {
+ host: cfg.hosts.globalAPI,
+ http: {
+ paths: [ { path: "/", backend: { serviceName: victoria.serviceAPI.metadata.name, servicePort: 8427 } }, ],
+ },
+ }
+ ],
+ },
+ },
+ },
+ }
+}
diff --git a/ops/monitoring/secrets/cipher/global-agent-cluster-k0 b/ops/monitoring/secrets/cipher/global-agent-cluster-k0
new file mode 100644
index 0000000..29e715b
--- /dev/null
+++ b/ops/monitoring/secrets/cipher/global-agent-cluster-k0
@@ -0,0 +1,40 @@
+-----BEGIN PGP MESSAGE-----
+
+hQEMAzhuiT4RC8VbAQgAiZLuysTzxY8VM1wOAC7Hb0/3dHh0/5cFG1nOC6svnVt4
+NLZG0K+9uSuku76N/TZak1lk0pieeW9PE+FBDAAjUhGKS1/0qvZmG2Y5T3qs7pYf
+0Zv68hKix88bEfK7yfF/t68cYB1F2ms/4Y5tCBuW3av8MI7XQifWdnwgokxbE6xY
+yhGpII6zZemfA+kuMo4BRsyy2Z1xsKo7Ah64hQQUQFXwzr+i4hzpp2AeVlWAcFNj
+IlHPxA02ZcBCtjz2DLShFN2s8WBenboM88eUfeKpRAbMMfGcycmpIt7uf6pZ1UJa
+viTnfV1juqyXaMLECOBNYBhlMagjRIZ0CbM/5mn3Q4UBDANcG2tp6fXqvgEIAK2M
+pbeD3JpNE6pRvQsAHuKObQ+Bm82CxZg2uS9OPwNm6l7ESROpCnTRU8ahHIJO2f1d
+IMXzLO4M6QMb5FpAl2ixsT/SeZ9Z8NSxcl1ndByTRPQ3wSNfCV8wW7tXIWHzv1El
+pjBRowEbitwuwFgfgk86lYdYLKRPefAPr4fFNQV6aGLSWdVMo6vdR/C78xDivduy
+A79Fu64+nsKgOrKHkcxn4YyhFDTOt7avpCX3xAFDWoN7w3W5iQ/EQk+6SVnfsqjo
+IqTcxcS1o1TxpEjyoBPgpAERFEJEjIE2Dpo8E5UjJDLHMtSJMMqrAW7nLZJimI+c
+DSY83h5VtCzAnvjIXYeFAgwDodoT8VqRl4UBD/4ujIoPDkIZ/dLGbiwtlwZz4giY
+2LbfxHq2nkwy3+V6fbAxp+GyK4lB4XiZia2lMeWk5UECK8z9fpAhGvrFaSwjXkvn
+ig8LY4WFW8uxjtilJXxPBIqkl9EMyEZJaBFQ3d9icE2ZgPV7peXtZBYgZVD4fY7E
+Pxi2CTRrILr628Vqpbo0GdB88NdDd24wzkec6rVVV8WktWbyNXvzzJE6BbkcdBj6
+DyfG55SKSQAjYfC3b8LYtcCZDXiidFMflDXraVuaoOWHKuAb8Mwhrz0TwdTVSH0G
+xcnANQjBXf+TNfm8nrfLLSnmyili4qOgEuRQKeSJR+aiR3kDQhV0exd/jAJlcTcF
++QuLWEgpj95W+TD9s/EaVjRBIWs2TFvsUYlU+HZap0GzFiBQQubpK/EgJsh6Xz1/
+3mjDxG5vv/Wdti0ko3oul9koS24eNK07lwM6g/GwtLhT48h/Db/M/9/Vadx+T6KW
+fEKTdYyn438hOVKqrKwIRLp98e++VbLIg7pQsH0YOFK0QMFk6N7IbYRSEBnuDD+F
+W3VJ4wiP0dwM2LttHwFTapReaDkORYv70rvb3mplp4kCLF7AvUJJQaU1cncuLdwd
+Sj3iDu8s0lf3lM0Y+SlB1xDkzHrOGmcC0I4JVH+padBDKCpAn/8IbFmlaiV1Xjtd
+KTwB6+NcMtmqX+ObNYUCDAPiA8lOXOuz7wEP/A9JFmiFZ36mPnfjbA7OHz6U3zOT
+71iHwJDJXXG4g83WKtOLTYaNAizjXVz7wInWbwigTK2uD38lzOfArbU7UaAP38yS
+89xNff5YOQASO71AJutoulUbA43TFF0gVqtcqsYJO7Cx+DSrBwhHXtFsppOVeMsH
+9CmxVskPnwyZGG1yAJJ+EnD+y+SGmUh97EyWH+UKNZ4fiXVnwNt1ffY6vFYz61d8
+AferfbvKy/9UN2gxn+QDVjDdyf5Qk14t6ljdBnO/RVbZNpU7pIRepttUFlCr87a+
+SZR+WGPOaedzq/8nu3rABEqOxyLd0W0c9eOFwAtLxszjQ3yakhDseH2xlpeqa5g1
+YzPFL79ywFDLjdfnPju8OBROEtyZD2mNrCqR846xIjPERjMhDYSS7DUI73iC7hrW
+5hzfNj7ky1l0mYg3lfIdbtDrQO/HjnsYL3JA8WTcNHVEx3EpgEpQCz7g2TZAfpNs
+E08pkn8hLuNg+PH1RvTFLTVflclfZsnTPu7np7TTE8O1OaA8qUG/P1nAXX+wX93a
+a2uHQ39I3VZqPA7eRK+Gp6lDSBP1pbZbKz5tV/9glVPXKXY+bDamlWE7kgXbGOsY
+zILK+jN9GFad4/gg9b7Upw5EdphnyAoob2SbrKbFN5ALyfFbgG+wFhFdb2oS7tCk
+eLx5zEc+aQReFEQ30nMBLEWI+DfbN4nby1Ccp3bcOQvnSr9a0XJYSgFtcVve9aWw
+/tYpNn0+fo1M1J+93UPjfjL9/ApDH3dDaS6L6WR8jn2EHALHUbwNuShBnDcCAlQe
+/ZVaR2LGJprPHURChG2pognfRZhp+YK06diTwUHtyHir
+=e/F1
+-----END PGP MESSAGE-----
diff --git a/ops/monitoring/secrets/cipher/global-agent-grafana b/ops/monitoring/secrets/cipher/global-agent-grafana
new file mode 100644
index 0000000..fd501e6
--- /dev/null
+++ b/ops/monitoring/secrets/cipher/global-agent-grafana
@@ -0,0 +1,40 @@
+-----BEGIN PGP MESSAGE-----
+
+hQEMAzhuiT4RC8VbAQf+NMwFZv9tVcUOo13hi7r2Z5V294dseTFk+q3nE//ZmWx6
+6LL70Ggdc3etozf9w6uriQG0wbrfy7XwOYkpFJYaJb2gut0xxG/Fw221ZGR8elpe
+79FzveD0FUZK+UdixXMiqYQOiwUK5+RbjhKN+R3WjG5mrClHDeCG5WrXFPvT9wOX
+dA3ED/ZczrNxvSbKeE1imFoCeudrC9/zo/CRmb0BHrIoOEe+vCe/MzN0s/fkiq1k
+RyZZxJ0M6/oudlcexyaYJcTdBTW1ZMNmmZ9lWsBjmf5kTKGu1tDUuMU9RBJmtyYn
+8euaTwwCOfZjz6vdKoGer07ftEyfbjDGuU6zOtN9Z4UBDANcG2tp6fXqvgEH/i++
+MbBnFbCOajtIN2xN8P6WiP6RjPmUKaKLJCltZHqPPYuULFWTa8uBIbfqVjtgFfoE
+43eBgP+D1EQooq6O7NqWcoCY7LphwKx///oWsmeuiRy+wwQOGMV45tF31n944P1U
+qZGhik8n7pxLkNaC5ohaucQJeaDSi7GuMATzBWdGY6lZkaNLUfrPmKXu9tyIaA4u
+b80gPvFWc/9PgnS9rAcVPA7/8Il53EVJJsYK2/S7nDFKRTJfThId4cvvAFUXkuwW
+hM6FEFcJdSW90qbhBGCwr3yfvSb3Je0k7gYrg9iQTLxRpIbG3Mbz9irEKno3alGe
+8FvuAdmVn1AVyxNomT+FAgwDodoT8VqRl4UBD/0bGb+8AF3mTmEMTHAPPLUIxhu+
+ihb4Po6OxQ7u/UCMSHXhFQCqb4ytK2JsG2UhcIiYrQrZMVGQh3rGcfZESHpX1/Ol
+rTwkjUZSSnek8M1hbEkS7PU2rePsXt/O07+zenqMO3pMeVsX+VLEGXRS6KZ9WXsc
+X09iLyqBErgntaM7otMSZVSPV7VFEaIoVdPe4NZxudDMedeA0hr1BneaVUNVjMtQ
+UE6ZVxzFSoqMnfsJY9/dn/uhHdv7qOhzw0ABINmDybI6IWNEaGzzSJC2HcHZjocY
+h2Se4mzxjOz9X4CG28h8b9jFHRtSe4OiSAQYxwtZNxGR6kCt1PfP20oemPBg+LXF
+T6+ledT5nkkaoztl5EOxKoh20BfbNOR2AWbPYuRLJ7OKF/dFDJ3PndPwSjEvPZDY
+xpvHszqVlcMpleqM/iQILD33Nzz9RhtSZHQiGYuZsak4aeUWsz/3a6JhtcliK4Do
+CyG3J1wVf0a+jsXlDr0M50qf+aY7k76zTqtfXcPSKypMeP0yZaYMPCnzHvFpj39u
+u3NGOEiwv1WXMrUn59SuL9X6aP5s4D455E5JDuOFKrndN78CKqsCTPkNGOCU5F+f
+B25lXY0yF22RLiHbWiAGcM+u4roi7qA8HWYly6lqOl3imk3D3NJP+ENGyoCxgfcQ
+GIrl8z5fyE5GtJUgAoUCDAPiA8lOXOuz7wEP/3ZmA1njB/F1nu/vafx90O0A0Mmr
+J7EvveK89W2P5JsZjEX/sVSurx1kY4U1Lofe00jdbsQNtfQ33/K2zr0Vb5G3VQxL
+QnsksGO9MjywwVzpspuS+gQE2P6VU5YjpHXA15SJXkJV/SDvxoPSyPt5x3m0nU+9
+aj43mTgKdvTwSeDoEHzt54KRY01HOugpmmY/TZ5Wkeam2vNsCaSEYAdDTaSEG6j2
+OfxQf8X/A+7RDwyoVpjDg4LVAyHmcomWBeudEH4GkR+oGC7YQ39QEb4TA9h0ufUO
+2N1XWIf+FraCFX5x9IeKoe9ZyInz8I24lRMRHHu1RGrluGHCjflVzuIfOnasy2yU
+CX9EhvoL2IxfCLdY2XoBcrpCcTr+FKu4/5n4P3WN6TSWnQlvCypgfJ7zkjGWytQW
+cChiouYvYLcW2q1opZgIu3J1i7QnS6qHzzhK3eejBF0DZinxs+q7cylVVAzrq5FF
+p6+v6OXlZRbsIoBJg0kKKfUwqzoTwEFPvPMvoIxCmwSfQmlnemeXrXdahqbZXgIO
+a4jMrlene6Asr7xVZ+9siv5plPogmvWco/950KmKlXOUEg439nADHzhTYPmai5YA
+i9inb9B1sAcpbYQejUwnIx+W11qsyE2PAXHdj/mvVm1fzO/VJ+yGHtnKwfGtHS8x
+v7vq6yCM2p4HPQLC0m4BFIvHf29iKhYDxjj3F0d//VEez/43+79bDDakmS5StD6P
+DykfoaYRBijgUfhG4a4UbMBbpIuukwBI0EQoy+3Sca6Rx1Da5lIPXuMbb/N/c4rp
+xL1OhySubTvzg2yTCfUoEuMWZFL1rwgT9lrKVg==
+=Do8R
+-----END PGP MESSAGE-----
diff --git a/ops/monitoring/secrets/plain/.gitignore b/ops/monitoring/secrets/plain/.gitignore
new file mode 100644
index 0000000..72e8ffc
--- /dev/null
+++ b/ops/monitoring/secrets/plain/.gitignore
@@ -0,0 +1 @@
+*