local kube = import "../../../kube/kube.libsonnet";

{
    // Cluster sets up all cluster-specific monitoring resources in their own namespace.
    //
    // Currently this consists of a Prometheus server that scrapes k8s nodes for kubelet
    // and cAdvisor metrics, and optionally ships metrics off to the global tier via
    // configured upstreams.
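    //
    // Example instantiation (a sketch; the cluster name, storage class and upstream
    // URL are hypothetical, and the import path assumes this file is available as
    // cluster.libsonnet):
    //
    //   local cluster = import "cluster.libsonnet";
    //   {
    //     monitoring: cluster.Cluster("k0") {
    //       cfg+: {
    //         storageClasses+: { prometheus: "example-storage-class" },
    //         username: "k0",
    //         upstreams: [
    //           { remote: "https://metrics.example.com/api/v1/write", password: "..." },
    //         ],
    //       },
    //     },
    //   }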
    Cluster(name):: {
        local cluster = self,
        local cfg = cluster.cfg,
        cfg:: {
            name: name,
            namespace: "monitoring-cluster",

            images: {
                prometheus: "prom/prometheus:v2.18.1",
            },

            storageClasses: {
                prometheus: error "storageClasses.prometheus must be set",
            },

            // Username used to authenticate to upstreams.
            username: error "username must be set",

            // Global tier upstreams that this cluster should ship metrics off to.
            // List of:
            // {
            //     remote: URL of upstream,
            //     password: password used to authenticate, in conjunction with cfg.username,
            // }
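            //
            // Example entry (hypothetical URL; password elided):
            //   { remote: "https://metrics.example.com/api/v1/write", password: "..." },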
            upstreams: [],
        },

        namespace: kube.Namespace(cfg.namespace),

        prometheus: {
            local prometheus = self,

            // Configuration that's going to be emitted as prometheus.yml and passed to the
            // prometheus server for this cluster.
            configuration:: {
                global: {
                    external_labels: {
                        cluster: cluster.cfg.name,
                    },
                },

                // Constructor for a Kubernetes scrape job that uses the pod's service account and
                // TLS configuration, selecting the given k8s scrape 'role'.
                local kubeScrapeConfig = function(name, role) {
                    job_name: name,
                    scheme: "https",
                    scrape_interval: "30s",
                    kubernetes_sd_configs: [ { role: role }, ],
                    tls_config: {
                        ca_file: "/var/run/secrets/kubernetes.io/serviceaccount/ca.crt",
                    },
                    bearer_token_file: "/var/run/secrets/kubernetes.io/serviceaccount/token",
                },
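
                // For example, kubeScrapeConfig("cluster_node_metrics", "node") renders to
                // roughly the following YAML (illustrative; exact quoting and field order
                // depend on std.manifestYamlDoc):
                //
                //   job_name: cluster_node_metrics
                //   scheme: https
                //   scrape_interval: 30s
                //   kubernetes_sd_configs:
                //   - role: node
                //   tls_config:
                //     ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
                //   bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token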

                // When scraping node-based metrics (i.e. node and cadvisor metrics) we contact
                // the metrics endpoints on the kubelet via the API server. This is done by
                // relabeling __address__ and __metrics_path__ to point at the k8s API server,
                // and at the API server proxy path to reach a node's metrics endpoint.
                //
                // This approach was lifted from the Prometheus examples for Kubernetes, and
                // while the benefits outlined there do not matter that much to us (our
                // kubelets listen on public addresses, anyway), we still prefer it because
                // we don't have to hardcode the kubelet TLS port.
                //
                // https://github.com/prometheus/prometheus/blob/master/documentation/examples/prometheus-kubernetes.yml
                //
                // When contacting the API server, we hardcode the 'hswaw.net' DNS suffix as
                // our API server's TLS certificate only has a CN/SAN for its full FQDN, not
                // the .svc.cluster.local shorthand (see //cluster/clustercfg:clustercfg.py).
                local kubeScrapeNodeMetrics = function(name, path) kubeScrapeConfig(name, "node") {
                    relabel_configs: [
                        {
                            action: "labelmap",
                            regex: "__meta_kubernetes_node_label_(.+)",
                        },
                        {
                            action: "replace",
                            target_label: "__address__",
                            replacement: "kubernetes.default.svc.%s.hswaw.net:443" % [cluster.cfg.name],
                        },
                        {
                            target_label: "__metrics_path__",
                            source_labels: ["__meta_kubernetes_node_name"],
                            regex: "(.+)",
                            replacement: "/api/v1/nodes/${1}/proxy" + path,
                        },
                    ],
                },
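
                // For example, in a cluster named 'k0', a node named 'example-node' (hypothetical)
                // scraped with path '/metrics' resolves to the target:
                //   https://kubernetes.default.svc.k0.hswaw.net:443/api/v1/nodes/example-node/proxy/metrics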

                // When scraping API server-colocated metrics (i.e. metrics from NixOS services
                // running alongside apiserver instances), we contact the metrics endpoints directly
                // over the nodes' IP addresses and an external port. The node IP addresses are
                // discovered via Prometheus' Kubernetes endpoint discovery, which selects all
                // endpoints of the default/kubernetes service. This service is backed by apiserver
                // instances on public IP addresses. We then rewrite the discovered port to the port
                // of the service we're interested in, in order to reach that service.
                local kubeScrapeAPIServerColocated = function(name, port) kubeScrapeConfig(name, "endpoints") {
                    relabel_configs: [
                        // Select only endpoints that back the default/kubernetes service. These are all
                        // public IP addresses of nodes that run the API server.
                        {
                            action: "keep",
                            regex: "default;kubernetes;https",
                            source_labels: [
                                "__meta_kubernetes_namespace",
                                "__meta_kubernetes_service_name",
                                "__meta_kubernetes_endpoint_port_name",
                            ],
                        },
                    ] + (if port == 4001 then [] else [
                        // Replace the endpoint port with the requested port, if the requested port
                        // is not the apiserver's port 4001, which is the one returned by default
                        // for these endpoints.
                        {
                            action: "replace",
                            regex: "([^:]+):.+",
                            replacement: "$1:%d" % [port],
                            source_labels: [
                                "__address__",
                            ],
                            target_label: "__address__",
                        },
                    ]),
                    // We disable server-side TLS certificate verification.
                    // Unfortunately, all apiserver-colocated services run with TLS certificates
                    // that do not have the right IP address SAN, and we can't override the TLS
                    // ServerName for a scrape target [1], so the only two choices we are left
                    // with are:
                    //  1) re-emit relevant certificates with IP address SANs that allow for
                    //     access by IP.
                    //  2) disable TLS verification.
                    // We choose 2), knowing that if someone manages to hijack a target IP address
                    // they can end up stealing our bearer token and impersonating the service
                    // account with which Prometheus is running. In the long term, we hope for [1]
                    // to be resolved.
                    //
                    // TODO(q3k): revisit this once [1] gets fixed.
                    // [1] - https://github.com/prometheus/prometheus/issues/4827
                    tls_config: {
                        insecure_skip_verify: true,
                    },
                },
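
                // For example, kubeScrapeAPIServerColocated("cluster_scheduler_metrics", 4005)
                // rewrites a discovered endpoint address like 203.0.113.10:4001 (illustrative,
                // documentation-range IP) to 203.0.113.10:4005.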

                scrape_configs: [
                    /// Scrape per-node metrics, proxied via the apiserver.
                    // Scrape Kubernetes node metrics via apiserver. This emits kube_node_* metrics.
                    kubeScrapeNodeMetrics("cluster_node_metrics", "/metrics"),
                    // Scrape Kubernetes node cadvisor metrics via apiserver. This emits container_* metrics.
                    kubeScrapeNodeMetrics("cluster_cadvisor_metrics", "/metrics/cadvisor"),

                    /// Scrape apiserver-colocated ('master node') metrics, over nodes' public IP addresses.
                    /// (currently all nodes are 'master' nodes)
                    // Scrape Kubernetes apiserver metrics.
                    kubeScrapeAPIServerColocated("cluster_apiserver_metrics", 4001),
                    // Scrape Kubernetes controller-manager metrics.
                    kubeScrapeAPIServerColocated("cluster_controllermanager_metrics", 4003),
                    // Scrape Kubernetes scheduler metrics.
                    kubeScrapeAPIServerColocated("cluster_scheduler_metrics", 4005),
                ],

                remote_write: [
                    {
                        url: u.remote,
                        basic_auth: {
                            username: cluster.cfg.username,
                            password: u.password,
                        },
                    }
                    for u in cluster.cfg.upstreams
                ],
            },

            configmap: kube.ConfigMap("prometheus-cluster") {
                metadata+: {
                    namespace: cfg.namespace,
                },
                data: {
                    "prometheus.yml": std.manifestYamlDoc(prometheus.configuration),
                },
            },

            sa: kube.ServiceAccount("prometheus-cluster") {
                metadata+: {
                    namespace: cfg.namespace,
                },
            },

            cr: kube.ClusterRole("monitoring-cluster-prometheus-server-%s" % [cfg.name]) {
                rules: [
                    // Allow access to all metrics.
                    { nonResourceURLs: ["/metrics"], verbs: ["get"], },
                    // Allow access to node details for discovery.
                    { apiGroups: [""], resources: ["nodes"], verbs: ["list", "watch", "get"], },
                    { apiGroups: [""], resources: ["endpoints", "services", "pods"], verbs: ["list", "watch", "get"], },
                    // Allow proxying to bare node HTTP endpoints to access per-node metrics.
                    { apiGroups: [""], resources: ["nodes/proxy"], verbs: ["get"], },
                ],
            },

            crb: kube.ClusterRoleBinding("monitoring-cluster-prometheus-server-%s" % [cfg.name]) {
                subjects_: [prometheus.sa],
                roleRef_: prometheus.cr,
            },

            deploy: kube.Deployment("prometheus-cluster") {
                metadata+: {
                    namespace: cfg.namespace,
                },
                spec+: {
                    template+: {
                        spec+: {
                            containers_: {
                                default: kube.Container("default") {
                                    image: cfg.images.prometheus,
                                    command: [
                                        "/bin/prometheus",
                                        "--config.file=/etc/prometheus/prometheus.yml",
                                        "--storage.tsdb.path=/prometheus",
                                        "--storage.tsdb.retention.size=10GB",
                                        "--web.console.libraries=/usr/share/prometheus/console_libraries",
                                        "--web.console.templates=/usr/share/prometheus/consoles",
                                        "--web.enable-lifecycle",
                                    ],
                                    resources: {
                                        requests: {
                                            memory: "3Gi",
                                            cpu: "100m",
                                        },
                                        limits: {
                                            memory: "3Gi",
                                            cpu: "1",
                                        },
                                    },
                                    volumeMounts_: {
                                        data: { mountPath: "/prometheus", },
                                        configmap: { mountPath: "/etc/prometheus", },
                                    },
                                },
                            },
                            serviceAccountName: prometheus.sa.metadata.name,
                            tolerations: [
                                { key: "CriticalAddonsOnly", operator: "Exists" },
                            ],
                            volumes_: {
                                data: kube.PersistentVolumeClaimVolume(prometheus.pvc),
                                configmap: kube.ConfigMapVolume(prometheus.configmap),
                            },
                        },
                    },
                },
            },

            // Kubernetes metric storage volume.
            pvc: kube.PersistentVolumeClaim("prometheus-cluster") {
                metadata+: {
                    namespace: cfg.namespace,
                },
                spec+: {
                    storageClassName: cfg.storageClasses.prometheus,
                    accessModes: ["ReadWriteOnce"],
                    resources: {
                        requests: {
                            storage: "16Gi",
                        },
                    },
                },
            },

            // Network Policy governing access to the prometheus server.
            np: kube.NetworkPolicy("prometheus-cluster") {
                metadata+: {
                    namespace: cfg.namespace,
                },
                spec+: kube.podLabelsSelector(prometheus.deploy) {
                    ingress_: {
                        // Deny all inbound traffic to pod.
                        // This will be augmented to allow access from some other pod/namespace
                        // in the future.
                    },
                    egress_: {
                        // Allow all outbound traffic from pod.
                        outboundAll: {},
                    },
                    policyTypes: ["Ingress", "Egress"],
                },
            },
        },
    },
}