Blame - ops/monitoring/lib/cluster.libsonnet - hscloud

2020-06-06 17:04:07 +0200

[diff] [blame]

1

local kube = import "../../../kube/kube.libsonnet";

Sergiusz Bazanski

2020-06-06 12:35:06 +0200

[diff] [blame]

2

3

{

Sergiusz Bazanski

2020-06-06 17:04:07 +0200

[diff] [blame]

4

// Cluster sets up all cluster-specific monitoring resources in their own namespace.

Serge Bazanski

2020-08-24 21:00:56 +0200

[diff] [blame]

5

//

Sergiusz Bazanski

2020-06-06 17:04:07 +0200

[diff] [blame]

6

// Currently this consists of a prometheus server that scrapes k8s nodes for kubelet

Serge Bazanski

2020-08-24 21:00:56 +0200

[diff] [blame]

7

// and cAdvisor metrics, and possibly ships over metrics to the global tier via set

8

// upstreams.

Sergiusz Bazanski

2020-06-06 12:35:06 +0200

[diff] [blame]

9

Cluster(name):: {

10

local cluster = self,

11

local cfg = cluster.cfg,

12

cfg:: {

13

name: name,

14

namespace: "monitoring-cluster",

15

16

images: {

17

prometheus: "prom/prometheus:v2.18.1",

18

},

19

20

storageClasses: {

Sergiusz Bazanski

2020-06-06 17:04:07 +0200

[diff] [blame]

21

prometheus: error "storageClasses.prometheus must be set",

Sergiusz Bazanski

2020-06-06 12:35:06 +0200

[diff] [blame]

22

},

Serge Bazanski

2020-08-24 21:00:56 +0200

[diff] [blame]

23

24

// Username used to authenticate to upstreams.

25

username: error "username must be set",

26

27

// Global tier upstreams that this cluster should ship metrics off to.

28

// List of

29

// {

30

// remote: URL of upstream

31

// password: password used to authenticate, in conjunction with cfg.username.

32

// }

33

upstreams: [],

Sergiusz Bazanski

2020-06-06 12:35:06 +0200

[diff] [blame]

34

},

35

36

namespace: kube.Namespace(cfg.namespace),

37

38

prometheus: {

39

local prometheus = self,

40

41

// Configuration that's going to be emitted as prometheus.yml and passed to the

42

// prometheus server for this cluster.

configuration:: {

global: {

external_labels: {

cluster: cluster.cfg.name,

},

},

// Constructor for a Kubernetes scrape job that uses the pod's service account and

51

// TLS configuration, selecting the given k8s scrape 'role'.

52

// Builds the fields shared by every Kubernetes scrape job in this file; callers extend the
// returned object with their own relabel_configs.
//   name: value used for the job's job_name.
//   role: value passed as the 'role' of the single kubernetes_sd_configs entry.
local kubeScrapeConfig = function(name, role) {

53

job_name: name,

54

// Targets are always contacted over HTTPS.
scheme: "https",

55

scrape_interval: "30s",

56

// Discover targets via Kubernetes service discovery, using the role given by the caller.
kubernetes_sd_configs: [ { role: role }, ],

57

tls_config: {

58

// CA certificate from the service account path (presumably the pod's standard
// service account mount - confirm against the deployment).
ca_file: "/var/run/secrets/kubernetes.io/serviceaccount/ca.crt",

59

},

60

// Token file from the same service account path, used as the bearer token for scrapes.
bearer_token_file: "/var/run/secrets/kubernetes.io/serviceaccount/token",

},

scrape_configs: [

// When scraping node-based metrics (ie. node and cadvisor metrics) we contact

65

// the metrics endpoints on the kubelet via the API server. This is done by

66

// relabeling __address__ and __metrics_path__ to point at the k8s API server,

67

// and at the API server proxy path to reach a node's metrics endpoint.

68

//

69

// This approach was lifted from the prometheus examples for Kubernetes, and

70

// while the benefits outlined there do not matter that much to us (our

71

// kubelets listen on public addresses, anyway), we still enjoy this approach

72

// for the fact that we don't have to hardcode the kubelet TLS port.

73

//

74

// https://github.com/prometheus/prometheus/blob/master/documentation/examples/prometheus-kubernetes.yml

75

//

76

// When contacting the API server, we hardcode the 'hswaw.net' DNS suffix as

77

// our API server's TLS certificate only has a CN/SAN for its full FQDN, not

78

// the .svc.cluster.local shorthand (see //cluster/clustercfg:clustercfg.py).

79

80

// Scrape Kubernetes node metrics via apiserver. This emits kube_node_* metrics.

81

kubeScrapeConfig("cluster_node_metrics", "node") {

relabel_configs: [

{

action: "labelmap",

regex: "__meta_kubernetes_node_label_(.+)",

},

{

action: "replace",

target_label: "__address__",

90

replacement: "kubernetes.default.svc.%s.hswaw.net:443" % [cluster.cfg.name],

91

},

92

{

93

target_label: "__metrics_path__",

94

source_labels: ["__meta_kubernetes_node_name"],

95

regex: "(.+)",

96

replacement: "/api/v1/nodes/${1}/proxy/metrics",

},

],

},

// Scrape Kubernetes node cadvisor metrics via apiserver. This emits container_* metrics.

101

kubeScrapeConfig("cluster_cadvisor_metrics", "node") {

relabel_configs: [

{

action: "labelmap",

regex: "__meta_kubernetes_node_label_(.+)",

},

{

action: "replace",

target_label: "__address__",

110

replacement: "kubernetes.default.svc.%s.hswaw.net:443" % [cluster.cfg.name],

111

},

112

{

113

target_label: "__metrics_path__",

114

source_labels: ["__meta_kubernetes_node_name"],

115

regex: "(.+)",

116

replacement: "/api/v1/nodes/${1}/proxy/metrics/cadvisor",

},

],

},

],

Serge Bazanski

2020-08-24 21:00:56 +0200

[diff] [blame]

remote_write: [

{

url: u.remote,

basic_auth: {

username: cluster.cfg.username,

127

password: u.password,

128

},

129

}

130

for u in cluster.cfg.upstreams

131

],

Sergiusz Bazanski

2020-06-06 12:35:06 +0200

[diff] [blame]

132

},

133

134

// ConfigMap holding the rendered prometheus.configuration object. The deployment mounts it
// at /etc/prometheus, which is where --config.file expects to find prometheus.yml.
configmap: kube.ConfigMap("prometheus-cluster") {

135

metadata+: {

136

namespace: cfg.namespace,

137

},

138

data: {

139

// Serialize the hidden 'configuration' field to a YAML document string.
"prometheus.yml": std.manifestYamlDoc(prometheus.configuration),

},

},

// ServiceAccount for the prometheus server: the deployment references it through
// serviceAccountName, and crb lists it as the binding's subject.
sa: kube.ServiceAccount("prometheus-cluster") {

144

metadata+: {

145

namespace: cfg.namespace,

},

},

// ClusterRole listing the API permissions for the prometheus server; it is attached to the
// service account by crb. The name is suffixed with cfg.name.
// NOTE(review): the suffix presumably keeps the name unique per Cluster() instance, since
// ClusterRoles are not namespaced - confirm.
cr: kube.ClusterRole("monitoring-cluster-prometheus-server-%s" % [cfg.name]) {

150

rules: [

151

// Allow access to all metrics.

152

{ nonResourceURLs: ["/metrics"], verbs: ["get"], },

153

// Allow to access node details for discovery.

154

{ apiGroups: [""], resources: ["nodes"], verbs: ["list", "watch", "get"], },

155

// Allow to proxy to bare node HTTP to access per-node metrics endpoints.

156

{ apiGroups: [""], resources: ["nodes/proxy"], verbs: ["get"], },

],

},

// ClusterRoleBinding that ties the prometheus service account (sa) to the ClusterRole (cr).
// It uses the same cfg.name-suffixed name as the ClusterRole.
crb: kube.ClusterRoleBinding("monitoring-cluster-prometheus-server-%s" % [cfg.name]) {

161

subjects_: [prometheus.sa],

162

roleRef_: prometheus.cr,

163

},

164

165

deploy: kube.Deployment("prometheus-cluster") {

166

metadata+: {

167

namespace: cfg.namespace,

},

spec+: {

template+: {

spec+: {

containers_: {

default: kube.Container("default") {

174

image: cfg.images.prometheus,

175

command: [

176

"/bin/prometheus",

177

"--config.file=/etc/prometheus/prometheus.yml",

178

"--storage.tsdb.path=/prometheus",

Serge Bazanski

2020-08-24 21:00:56 +0200

[diff] [blame]

179

"--storage.tsdb.retention.size=10GB",

Sergiusz Bazanski

2020-06-06 12:35:06 +0200

[diff] [blame]

180

"--web.console.libraries=/usr/share/prometheus/console_libraries",

181

"--web.console.templates=/usr/share/prometheus/consoles",

182

"--web.enable-lifecycle",

],

resources: {

requests: {

memory: "256Mi",

cpu: "100m",

},

limits: {

memory: "1Gi",

cpu: "1",

},

},

volumeMounts_: {

data: { mountPath: "/prometheus", },

196

configmap: { mountPath: "/etc/prometheus", },

},

},

},

serviceAccountName: prometheus.sa.metadata.name,

201

tolerations: [

202

{ key: "CriticalAddonsOnly", operator: "Exists" },

203

],

204

volumes_: {

205

data: kube.PersistentVolumeClaimVolume(prometheus.pvc),

206

configmap: kube.ConfigMapVolume(prometheus.configmap),

},

},

},

},

},

// Kubernetes metric storage volume.

214

pvc: kube.PersistentVolumeClaim("prometheus-cluster") {

215

metadata+: {

216

namespace: cfg.namespace,

217

},

218

spec+: {

219

storageClassName: cfg.storageClasses.prometheus,

220

accessModes: ["ReadWriteOnce"],

221

resources: {

222

requests: {

Serge Bazanski

2020-08-24 21:00:56 +0200

[diff] [blame]

223

storage: "16Gi",

Sergiusz Bazanski

2020-06-06 12:35:06 +0200

[diff] [blame]

},

},

},

},

// Network Policy governing access to the prometheus server.

230

// NetworkPolicy for the prometheus server: no ingress rules are defined, and a single
// egress rule with no restrictions is defined.
np: kube.NetworkPolicy("prometheus-cluster") {

231

metadata+: {

232

namespace: cfg.namespace,

233

},

234

// NOTE(review): podLabelsSelector is defined in kube.libsonnet; presumably it yields a
// pod selector matching the deployment's pod labels - confirm.
spec+: kube.podLabelsSelector(prometheus.deploy) {

235

ingress_: {

236

// Deny all inbound traffic to pod.

237

// This will be augmented to allow access from some other pod/namespace

// in the future.

},

egress_: {

// Allow all outbound traffic from pod.

242

outboundAll: {},

243

},

244

// Both directions are listed; presumably this is what makes the empty ingress_ set act
// as deny-all rather than leaving ingress unrestricted - confirm against kube.libsonnet.
policyTypes: ["Ingress", "Egress"],

},

},

},

},

Sergiusz Bazanski