local kube = import "../../../kube/kube.libsonnet";

{
    // Cluster sets up all cluster-specific monitoring resources in their own namespace.
    //
    // Currently this consists of a Prometheus server that scrapes k8s nodes for kubelet
    // and cAdvisor metrics, and optionally ships metrics off to the global tier via
    // configured upstreams.
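    //
    // Example instantiation (a sketch; the cluster name, storage class and upstream
    // URL are hypothetical, and the import path assumes this file is available as
    // cluster.libsonnet):
    //
    //   local cluster = import "cluster.libsonnet";
    //   {
    //     monitoring: cluster.Cluster("k0") {
    //       cfg+: {
    //         storageClasses+: { prometheus: "example-storage-class" },
    //         username: "k0",
    //         upstreams: [
    //           { remote: "https://metrics.example.com/api/v1/write", password: "..." },
    //         ],
    //       },
    //     },
    //   }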
    Cluster(name):: {
        local cluster = self,
        local cfg = cluster.cfg,
        cfg:: {
            name: name,
            namespace: "monitoring-cluster",

            images: {
                prometheus: "prom/prometheus:v2.18.1",
            },

            storageClasses: {
                prometheus: error "storageClasses.prometheus must be set",
            },

            // Username used to authenticate to upstreams.
            username: error "username must be set",

            // Global tier upstreams that this cluster should ship metrics off to.
            // List of:
            // {
            //     remote: URL of upstream,
            //     password: password used to authenticate, in conjunction with cfg.username,
            // }
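            //
            // Example entry (hypothetical URL; password elided):
            //   { remote: "https://metrics.example.com/api/v1/write", password: "..." },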
            upstreams: [],
        },

        namespace: kube.Namespace(cfg.namespace),

        prometheus: {
            local prometheus = self,

            // Configuration that's going to be emitted as prometheus.yml and passed to the
            // prometheus server for this cluster.
            configuration:: {
                global: {
                    external_labels: {
                        cluster: cluster.cfg.name,
                    },
                },

                // Constructor for a Kubernetes scrape job that uses the pod's service account and
                // TLS configuration, selecting the given k8s scrape 'role'.
                local kubeScrapeConfig = function(name, role) {
                    job_name: name,
                    scheme: "https",
                    scrape_interval: "30s",
                    kubernetes_sd_configs: [ { role: role }, ],
                    tls_config: {
                        ca_file: "/var/run/secrets/kubernetes.io/serviceaccount/ca.crt",
                    },
                    bearer_token_file: "/var/run/secrets/kubernetes.io/serviceaccount/token",
                },
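
                // For example, kubeScrapeConfig("cluster_node_metrics", "node") renders to
                // roughly the following YAML (illustrative; exact quoting and field order
                // depend on std.manifestYamlDoc):
                //
                //   job_name: cluster_node_metrics
                //   scheme: https
                //   scrape_interval: 30s
                //   kubernetes_sd_configs:
                //   - role: node
                //   tls_config:
                //     ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
                //   bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token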

                // When scraping node-based metrics (i.e. node and cadvisor metrics) we contact
                // the metrics endpoints on the kubelet via the API server. This is done by
                // relabeling __address__ and __metrics_path__ to point at the k8s API server,
                // and at the API server proxy path to reach a node's metrics endpoint.
                //
                // This approach was lifted from the Prometheus examples for Kubernetes, and
                // while the benefits outlined there do not matter that much to us (our
                // kubelets listen on public addresses, anyway), we still prefer it because
                // we don't have to hardcode the kubelet TLS port.
                //
                // https://github.com/prometheus/prometheus/blob/master/documentation/examples/prometheus-kubernetes.yml
                //
                // When contacting the API server, we hardcode the 'hswaw.net' DNS suffix as
                // our API server's TLS certificate only has a CN/SAN for its full FQDN, not
                // the .svc.cluster.local shorthand (see //cluster/clustercfg:clustercfg.py).
                local kubeScrapeNodeMetrics = function(name, path) kubeScrapeConfig(name, "node") {
                    relabel_configs: [
                        {
                            action: "labelmap",
                            regex: "__meta_kubernetes_node_label_(.+)",
                        },
                        {
                            action: "replace",
                            target_label: "__address__",
                            replacement: "kubernetes.default.svc.%s.hswaw.net:443" % [cluster.cfg.name],
                        },
                        {
                            target_label: "__metrics_path__",
                            source_labels: ["__meta_kubernetes_node_name"],
                            regex: "(.+)",
                            replacement: "/api/v1/nodes/${1}/proxy" + path,
                        },
                    ],
                },
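
                // For example, in a cluster named 'k0', a node named 'example-node' (hypothetical)
                // scraped with path '/metrics' resolves to the target:
                //   https://kubernetes.default.svc.k0.hswaw.net:443/api/v1/nodes/example-node/proxy/metrics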

                // When scraping API server-colocated metrics (i.e. metrics from NixOS services
                // running alongside apiserver instances), we contact the metrics endpoints directly
                // over the nodes' IP addresses and an external port. The node IP addresses are
                // discovered via Prometheus' Kubernetes endpoint discovery, which selects all
                // endpoints of the default/kubernetes service. This service is backed by apiserver
                // instances on public IP addresses. We then rewrite the discovered port to the port
                // of the service we're interested in, in order to reach that service.
                local kubeScrapeAPIServerColocated = function(name, port) kubeScrapeConfig(name, "endpoints") {
                    relabel_configs: [
                        // Select only endpoints that back the default/kubernetes service. These are all
                        // public IP addresses of nodes that run the API server.
                        {
                            action: "keep",
                            regex: "default;kubernetes;https",
                            source_labels: [
                                "__meta_kubernetes_namespace",
                                "__meta_kubernetes_service_name",
                                "__meta_kubernetes_endpoint_port_name",
                            ],
                        },
                    ] + (if port == 4001 then [] else [
                        // Replace the endpoint port with the requested port, if the requested port
                        // is not the apiserver's port 4001, which is the one returned by default
                        // for these endpoints.
                        {
                            action: "replace",
                            regex: "([^:]+):.+",
                            replacement: "$1:%d" % [port],
                            source_labels: [
                                "__address__",
                            ],
                            target_label: "__address__",
                        },
                    ]),
                    // We disable server-side TLS certificate verification.
                    // Unfortunately, all apiserver-colocated services run with TLS certificates
                    // that do not have the right IP address SAN, and we can't override the TLS
                    // ServerName for a scrape target [1], so the only two choices we are left
                    // with are:
                    //  1) re-emit relevant certificates with IP address SANs that allow for
                    //     access by IP.
                    //  2) disable TLS verification.
                    // We choose 2), knowing that if someone manages to hijack a target IP address
                    // they can end up stealing our bearer token and impersonating the service
                    // account with which Prometheus is running. In the long term, we hope for [1]
                    // to be resolved.
                    //
                    // TODO(q3k): revisit this once [1] gets fixed.
                    // [1] - https://github.com/prometheus/prometheus/issues/4827
                    tls_config: {
                        insecure_skip_verify: true,
                    },
                },
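
                // For example, kubeScrapeAPIServerColocated("cluster_scheduler_metrics", 4005)
                // rewrites a discovered endpoint address like 203.0.113.10:4001 (illustrative,
                // documentation-range IP) to 203.0.113.10:4005.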

                scrape_configs: [
                    /// Scrape per-node metrics, proxied via the apiserver.
                    // Scrape Kubernetes node metrics via apiserver. This emits kube_node_* metrics.
                    kubeScrapeNodeMetrics("cluster_node_metrics", "/metrics"),
                    // Scrape Kubernetes node cadvisor metrics via apiserver. This emits container_* metrics.
                    kubeScrapeNodeMetrics("cluster_cadvisor_metrics", "/metrics/cadvisor"),

                    /// Scrape apiserver-colocated ('master node') metrics, over nodes' public IP addresses.
                    /// (currently all nodes are 'master' nodes)
                    // Scrape Kubernetes apiserver metrics.
                    kubeScrapeAPIServerColocated("cluster_apiserver_metrics", 4001),
                    // Scrape Kubernetes controller-manager metrics.
                    kubeScrapeAPIServerColocated("cluster_controllermanager_metrics", 4003),
                    // Scrape Kubernetes scheduler metrics.
                    kubeScrapeAPIServerColocated("cluster_scheduler_metrics", 4005),
                ],

                remote_write: [
                    {
                        url: u.remote,
                        basic_auth: {
                            username: cluster.cfg.username,
                            password: u.password,
                        },
                    }
                    for u in cluster.cfg.upstreams
                ],
            },

            configmap: kube.ConfigMap("prometheus-cluster") {
                metadata+: {
                    namespace: cfg.namespace,
                },
                data: {
                    "prometheus.yml": std.manifestYamlDoc(prometheus.configuration),
                },
            },

            sa: kube.ServiceAccount("prometheus-cluster") {
                metadata+: {
                    namespace: cfg.namespace,
                },
            },

            cr: kube.ClusterRole("monitoring-cluster-prometheus-server-%s" % [cfg.name]) {
                rules: [
                    // Allow access to all metrics.
                    { nonResourceURLs: ["/metrics"], verbs: ["get"], },
                    // Allow access to node details for discovery.
                    { apiGroups: [""], resources: ["nodes"], verbs: ["list", "watch", "get"], },
                    { apiGroups: [""], resources: ["endpoints", "services", "pods"], verbs: ["list", "watch", "get"], },
                    // Allow proxying to bare node HTTP endpoints to access per-node metrics.
                    { apiGroups: [""], resources: ["nodes/proxy"], verbs: ["get"], },
                ],
            },

            crb: kube.ClusterRoleBinding("monitoring-cluster-prometheus-server-%s" % [cfg.name]) {
                subjects_: [prometheus.sa],
                roleRef_: prometheus.cr,
            },

            deploy: kube.Deployment("prometheus-cluster") {
                metadata+: {
                    namespace: cfg.namespace,
                },
                spec+: {
                    template+: {
                        spec+: {
                            containers_: {
                                default: kube.Container("default") {
                                    image: cfg.images.prometheus,
                                    command: [
                                        "/bin/prometheus",
                                        "--config.file=/etc/prometheus/prometheus.yml",
                                        "--storage.tsdb.path=/prometheus",
                                        "--storage.tsdb.retention.size=10GB",
                                        "--web.console.libraries=/usr/share/prometheus/console_libraries",
                                        "--web.console.templates=/usr/share/prometheus/consoles",
                                        "--web.enable-lifecycle",
                                    ],
                                    resources: {
                                        requests: {
                                            memory: "3Gi",
                                            cpu: "100m",
                                        },
                                        limits: {
                                            memory: "3Gi",
                                            cpu: "1",
                                        },
                                    },
                                    volumeMounts_: {
                                        data: { mountPath: "/prometheus", },
                                        configmap: { mountPath: "/etc/prometheus", },
                                    },
                                },
                            },
                            serviceAccountName: prometheus.sa.metadata.name,
                            tolerations: [
                                { key: "CriticalAddonsOnly", operator: "Exists" },
                            ],
                            volumes_: {
                                data: kube.PersistentVolumeClaimVolume(prometheus.pvc),
                                configmap: kube.ConfigMapVolume(prometheus.configmap),
                            },
                        },
                    },
                },
            },

            // Kubernetes metric storage volume.
            pvc: kube.PersistentVolumeClaim("prometheus-cluster") {
                metadata+: {
                    namespace: cfg.namespace,
                },
                spec+: {
                    storageClassName: cfg.storageClasses.prometheus,
                    accessModes: ["ReadWriteOnce"],
                    resources: {
                        requests: {
                            storage: "16Gi",
                        },
                    },
                },
            },

            // Network Policy governing access to the prometheus server.
            np: kube.NetworkPolicy("prometheus-cluster") {
                metadata+: {
                    namespace: cfg.namespace,
                },
                spec+: kube.podLabelsSelector(prometheus.deploy) {
                    ingress_: {
                        // Deny all inbound traffic to pod.
                        // This will be augmented to allow access from some other pod/namespace
                        // in the future.
                    },
                    egress_: {
                        // Allow all outbound traffic from pod.
                        outboundAll: {},
                    },
                    policyTypes: ["Ingress", "Egress"],
                },
            },
        },
    },
}