local kube = import "../../../kube/kube.libsonnet";

{
    // Cluster sets up all cluster-specific monitoring resources in their own namespace.
    //
    // Currently this consists of a Prometheus server that scrapes k8s nodes for kubelet
    // and cAdvisor metrics, and possibly ships metrics off to the global tier via
    // configured upstreams (see the example below).
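    //
    // Example instantiation (all names, storage classes and URLs hypothetical):
    //
    //   cluster: Cluster("k0") {
    //       cfg+: {
    //           storageClasses+: { prometheus: "fast-local-ssd" },
    //           username: "k0",
    //           upstreams: [
    //               { remote: "https://global.metrics.example.com/api/v1/write", password: "secret" },
    //           ],
    //       },
    //   },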
    Cluster(name):: {
        local cluster = self,
        local cfg = cluster.cfg,
        cfg:: {
            name: name,
            namespace: "monitoring-cluster",

            images: {
                prometheus: "prom/prometheus:v2.18.1",
            },

            storageClasses: {
                prometheus: error "storageClasses.prometheus must be set",
            },

            // Username used to authenticate to upstreams.
            username: error "username must be set",

            // Global tier upstreams that this cluster should ship metrics off to.
            // List of:
            //   {
            //     remote: URL of upstream,
            //     password: password used to authenticate, in conjunction with cfg.username,
            //   }
            upstreams: [],
        },

        namespace: kube.Namespace(cfg.namespace),

        prometheus: {
            local prometheus = self,

            // Configuration that's going to be emitted as prometheus.yml and passed to the
            // prometheus server for this cluster.
            configuration:: {
                global: {
                    external_labels: {
                        cluster: cluster.cfg.name,
                    },
                },

                // Constructor for a Kubernetes scrape job that uses the pod's service account and
                // TLS configuration, selecting the given k8s scrape 'role'.
                local kubeScrapeConfig = function(name, role) {
                    job_name: name,
                    scheme: "https",
                    scrape_interval: "30s",
                    kubernetes_sd_configs: [ { role: role }, ],
                    tls_config: {
                        ca_file: "/var/run/secrets/kubernetes.io/serviceaccount/ca.crt",
                    },
                    bearer_token_file: "/var/run/secrets/kubernetes.io/serviceaccount/token",
                },
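
                // For illustration, kubeScrapeConfig("example", "node") renders in
                // prometheus.yml as roughly the following (job name hypothetical):
                //
                //   job_name: example
                //   scheme: https
                //   scrape_interval: 30s
                //   kubernetes_sd_configs:
                //   - role: node
                //   tls_config:
                //     ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
                //   bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token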
                // When scraping node-based metrics (i.e. node and cadvisor metrics) we contact
                // the metrics endpoints on the kubelet via the API server. This is done by
                // relabeling __address__ and __metrics_path__ to point at the k8s API server,
                // and at the API server proxy path to reach a node's metrics endpoint.
                //
                // This approach was lifted from the prometheus examples for Kubernetes, and
                // while the benefits outlined there do not matter that much to us (our
                // kubelets listen on public addresses, anyway), we still like this approach
                // because we don't have to hardcode the kubelet TLS port.
                //
                // https://github.com/prometheus/prometheus/blob/master/documentation/examples/prometheus-kubernetes.yml
                //
                // When contacting the API server, we hardcode the 'hswaw.net' DNS suffix as
                // our API server's TLS certificate only has a CN/SAN for its full FQDN, not
                // the .svc.cluster.local shorthand (see //cluster/clustercfg:clustercfg.py).
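                //
                // For example (node name hypothetical): node 'node1' in cluster 'k0' would be
                // scraped at:
                //
                //   https://kubernetes.default.svc.k0.hswaw.net:443/api/v1/nodes/node1/proxy/metrics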
                local kubeScrapeNodeMetrics = function(name, path) kubeScrapeConfig(name, "node") {
                    relabel_configs: [
                        {
                            action: "labelmap",
                            regex: "__meta_kubernetes_node_label_(.+)",
                        },
                        {
                            action: "replace",
                            target_label: "__address__",
                            replacement: "kubernetes.default.svc.%s.hswaw.net:443" % [cluster.cfg.name],
                        },
                        {
                            target_label: "__metrics_path__",
                            source_labels: ["__meta_kubernetes_node_name"],
                            regex: "(.+)",
                            replacement: "/api/v1/nodes/${1}/proxy" + path,
                        },
                    ],
                },

                // When scraping API server-colocated metrics (i.e. metrics from nixos services
                // running alongside apiserver instances), we contact the metrics endpoints
                // directly over the nodes' IP addresses and an external port. The node IP
                // addresses are discovered via Prometheus' kubernetes endpoint discovery, which
                // selects all endpoints of the default/kubernetes service. This service is
                // backed by apiserver instances on public IP addresses. We can then replace the
                // discovered port with the port of the service we're interested in.
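                //
                // For example (address hypothetical): a discovered endpoint 203.0.113.1:4001
                // would be rewritten to 203.0.113.1:4003 when scraping the controller-manager.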
                local kubeScrapeAPIServerColocated = function(name, port) kubeScrapeConfig(name, "endpoints") {
                    relabel_configs: [
                        // Select only endpoints that back the default/kubernetes service. These
                        // are all public IP addresses of nodes that run the API server.
                        {
                            action: "keep",
                            regex: "default;kubernetes;https",
                            source_labels: [
                                "__meta_kubernetes_namespace",
                                "__meta_kubernetes_service_name",
                                "__meta_kubernetes_endpoint_port_name",
                            ],
                        },
                    ] + (if port == 4001 then [] else [
                        // Replace the endpoint port with the requested port, unless the requested
                        // port is the apiserver's port 4001, which is the one returned by default
                        // for these endpoints.
                        {
                            action: "replace",
                            regex: "([^:]+):.+",
                            replacement: "$1:%d" % [port],
                            source_labels: [
                                "__address__",
                            ],
                            target_label: "__address__",
                        },
                    ]),
                    // We disable server-side TLS certificate verification.
                    // Unfortunately, all apiserver-colocated services run with TLS certificates
                    // that do not have the right IP address SAN, and we can't override the TLS
                    // ServerName for a scrape target [1], so the only two choices we are left
                    // with are:
                    //   1) re-emit relevant certificates with IP address SANs that allow for
                    //      access by IP.
                    //   2) disable TLS verification.
                    // We choose 2), knowing that if someone manages to hijack a target IP
                    // address they can end up stealing our bearer token and impersonating the
                    // service account with which Prometheus is running. In the long term, we
                    // hope for [1] to be resolved.
                    //
                    // TODO(q3k): revisit this once [1] gets fixed.
                    // [1] - https://github.com/prometheus/prometheus/issues/4827
                    tls_config: {
                        insecure_skip_verify: true,
                    },
                },

                scrape_configs: [
                    /// Scrape per-node metrics, proxied via the apiserver.
                    // Scrape Kubernetes node metrics via apiserver. This emits kube_node_* metrics.
                    kubeScrapeNodeMetrics("cluster_node_metrics", "/metrics"),
                    // Scrape Kubernetes node cadvisor metrics via apiserver. This emits container_* metrics.
                    kubeScrapeNodeMetrics("cluster_cadvisor_metrics", "/metrics/cadvisor"),

                    /// Scrape apiserver-colocated ('master node') metrics over nodes' public
                    /// IP addresses (currently all nodes are 'master' nodes).
                    // Scrape Kubernetes apiserver metrics.
                    kubeScrapeAPIServerColocated("cluster_apiserver_metrics", 4001),
                    // Scrape Kubernetes controller-manager metrics.
                    kubeScrapeAPIServerColocated("cluster_controllermanager_metrics", 4003),
                    // Scrape Kubernetes scheduler metrics.
                    kubeScrapeAPIServerColocated("cluster_scheduler_metrics", 4005),
                ],
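                // Ship collected metrics off to any configured global tier upstreams.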
                remote_write: [
                    {
                        url: u.remote,
                        basic_auth: {
                            username: cluster.cfg.username,
                            password: u.password,
                        },
                    }
                    for u in cluster.cfg.upstreams
                ],
            },

            configmap: kube.ConfigMap("prometheus-cluster") {
                metadata+: {
                    namespace: cfg.namespace,
                },
                data: {
                    "prometheus.yml": std.manifestYamlDoc(prometheus.configuration),
                },
            },

            sa: kube.ServiceAccount("prometheus-cluster") {
                metadata+: {
                    namespace: cfg.namespace,
                },
            },

            cr: kube.ClusterRole("monitoring-cluster-prometheus-server-%s" % [cfg.name]) {
                rules: [
                    // Allow access to all metrics.
                    { nonResourceURLs: ["/metrics"], verbs: ["get"], },
                    // Allow access to node details for discovery.
                    { apiGroups: [""], resources: ["nodes"], verbs: ["list", "watch", "get"], },
                    { apiGroups: [""], resources: ["endpoints", "services", "pods"], verbs: ["list", "watch", "get"], },
                    // Allow proxying to bare node HTTP to access per-node metrics endpoints.
                    { apiGroups: [""], resources: ["nodes/proxy"], verbs: ["get"], },
                ],
            },

            crb: kube.ClusterRoleBinding("monitoring-cluster-prometheus-server-%s" % [cfg.name]) {
                subjects_: [prometheus.sa],
                roleRef_: prometheus.cr,
            },

            deploy: kube.Deployment("prometheus-cluster") {
                metadata+: {
                    namespace: cfg.namespace,
                },
                spec+: {
                    template+: {
                        spec+: {
                            containers_: {
                                default: kube.Container("default") {
                                    image: cfg.images.prometheus,
                                    command: [
                                        "/bin/prometheus",
                                        "--config.file=/etc/prometheus/prometheus.yml",
                                        "--storage.tsdb.path=/prometheus",
                                        "--storage.tsdb.retention.size=10GB",
                                        "--web.console.libraries=/usr/share/prometheus/console_libraries",
                                        "--web.console.templates=/usr/share/prometheus/consoles",
                                        "--web.enable-lifecycle",
                                    ],
                                    resources: {
                                        requests: {
                                            memory: "3Gi",
                                            cpu: "100m",
                                        },
                                        limits: {
                                            memory: "3Gi",
                                            cpu: "1",
                                        },
                                    },
                                    volumeMounts_: {
                                        data: { mountPath: "/prometheus", },
                                        configmap: { mountPath: "/etc/prometheus", },
                                    },
                                },
                            },
                            serviceAccountName: prometheus.sa.metadata.name,
                            tolerations: [
                                { key: "CriticalAddonsOnly", operator: "Exists" },
                            ],
                            volumes_: {
                                data: kube.PersistentVolumeClaimVolume(prometheus.pvc),
                                configmap: kube.ConfigMapVolume(prometheus.configmap),
                            },
                        },
                    },
                },
            },

            // Kubernetes metric storage volume.
            pvc: kube.PersistentVolumeClaim("prometheus-cluster") {
                metadata+: {
                    namespace: cfg.namespace,
                },
                spec+: {
                    storageClassName: cfg.storageClasses.prometheus,
                    accessModes: ["ReadWriteOnce"],
                    resources: {
                        requests: {
                            storage: "16Gi",
                        },
                    },
                },
            },

            // Network Policy governing access to the prometheus server.
            np: kube.NetworkPolicy("prometheus-cluster") {
                metadata+: {
                    namespace: cfg.namespace,
                },
                spec+: kube.podLabelsSelector(prometheus.deploy) {
                    ingress_: {
                        // Deny all inbound traffic to the pod.
                        // This will be augmented to allow access from some other pod/namespace
                        // in the future.
                    },
                    egress_: {
                        // Allow all outbound traffic from the pod.
                        outboundAll: {},
                    },
                    policyTypes: ["Ingress", "Egress"],
                },
            },
        },
    },
}