cluster: replace docker with containerd
This removes Docker and docker-shim from our production kubernetes, and
moves over to containerd/CRI. Docker support within Kubernetes was
always slightly shitty, and with 1.20 the integration was dropped
entirely. CRI/Containerd/runc is pretty much the new standard.
Change-Id: I98c89d5433f221b5fe766fcbef261fd72db530fe
diff --git a/cluster/nix/modules/containerd.toml b/cluster/nix/modules/containerd.toml
new file mode 100644
index 0000000..b079637
--- /dev/null
+++ b/cluster/nix/modules/containerd.toml
@@ -0,0 +1,134 @@
+version = 2
+root = "/var/lib/containerd"
+state = "/run/containerd"
+plugin_dir = ""
+disabled_plugins = []
+required_plugins = []
+oom_score = 0
+
+[grpc]
+ address = "/run/containerd/containerd.sock"
+ tcp_address = ""
+ tcp_tls_cert = ""
+ tcp_tls_key = ""
+ uid = 0
+ gid = 0
+ max_recv_message_size = 16777216
+ max_send_message_size = 16777216
+
+[ttrpc]
+ address = ""
+ uid = 0
+ gid = 0
+
+[debug]
+ address = ""
+ uid = 0
+ gid = 0
+ level = ""
+
+[metrics]
+ address = ""
+ grpc_histogram = false
+
+[cgroup]
+ path = ""
+
+[timeouts]
+ "io.containerd.timeout.shim.cleanup" = "5s"
+ "io.containerd.timeout.shim.load" = "5s"
+ "io.containerd.timeout.shim.shutdown" = "3s"
+ "io.containerd.timeout.task.state" = "2s"
+
+[plugins]
+ [plugins."io.containerd.gc.v1.scheduler"]
+ pause_threshold = 0.02
+ deletion_threshold = 0
+ mutation_threshold = 100
+ schedule_delay = "0s"
+ startup_delay = "100ms"
+ [plugins."io.containerd.grpc.v1.cri"]
+ disable_tcp_service = true
+ stream_server_address = "127.0.0.1"
+ stream_server_port = "0"
+ stream_idle_timeout = "4h0m0s"
+ enable_selinux = false
+ selinux_category_range = 1024
+ sandbox_image = "k8s.gcr.io/pause:3.2"
+ stats_collect_period = 10
+ systemd_cgroup = false
+ enable_tls_streaming = false
+ max_container_log_line_size = 16384
+ disable_cgroup = false
+ disable_apparmor = false
+ restrict_oom_score_adj = false
+ max_concurrent_downloads = 3
+ disable_proc_mount = false
+ unset_seccomp_profile = ""
+ tolerate_missing_hugetlb_controller = true
+ disable_hugetlb_controller = true
+ ignore_image_defined_volumes = false
+ [plugins."io.containerd.grpc.v1.cri".containerd]
+ snapshotter = "overlayfs"
+ default_runtime_name = "runc"
+ no_pivot = false
+ disable_snapshot_annotations = true
+ discard_unpacked_layers = false
+ [plugins."io.containerd.grpc.v1.cri".containerd.default_runtime]
+ runtime_type = ""
+ runtime_engine = ""
+ runtime_root = ""
+ privileged_without_host_devices = false
+ base_runtime_spec = ""
+ [plugins."io.containerd.grpc.v1.cri".containerd.untrusted_workload_runtime]
+ runtime_type = ""
+ runtime_engine = ""
+ runtime_root = ""
+ privileged_without_host_devices = false
+ base_runtime_spec = ""
+ [plugins."io.containerd.grpc.v1.cri".containerd.runtimes]
+ [plugins."io.containerd.grpc.v1.cri".containerd.runtimes.runc]
+ runtime_type = "io.containerd.runc.v2"
+ runtime_engine = ""
+ runtime_root = ""
+ privileged_without_host_devices = false
+ base_runtime_spec = ""
+ [plugins."io.containerd.grpc.v1.cri".containerd.runtimes.runc.options]
+ SystemdCgroup = true
+ [plugins."io.containerd.grpc.v1.cri".cni]
+ bin_dir = "/opt/cni/bin"
+ conf_dir = "/opt/cni/conf"
+ max_conf_num = 1
+ conf_template = ""
+ [plugins."io.containerd.grpc.v1.cri".registry]
+ [plugins."io.containerd.grpc.v1.cri".registry.mirrors]
+ [plugins."io.containerd.grpc.v1.cri".registry.mirrors."docker.io"]
+ endpoint = ["https://registry-1.docker.io"]
+ [plugins."io.containerd.grpc.v1.cri".image_decryption]
+ key_model = ""
+ [plugins."io.containerd.grpc.v1.cri".x509_key_pair_streaming]
+ tls_cert_file = ""
+ tls_key_file = ""
+ [plugins."io.containerd.internal.v1.opt"]
+ path = "/opt/containerd"
+ [plugins."io.containerd.internal.v1.restart"]
+ interval = "10s"
+ [plugins."io.containerd.metadata.v1.bolt"]
+ content_sharing_policy = "shared"
+ [plugins."io.containerd.monitor.v1.cgroups"]
+ no_prometheus = false
+ [plugins."io.containerd.runtime.v1.linux"]
+ shim = "containerd-shim"
+ runtime = "runc"
+ runtime_root = ""
+ no_shim = false
+ shim_debug = false
+ [plugins."io.containerd.runtime.v2.task"]
+ platforms = ["linux/amd64"]
+ [plugins."io.containerd.service.v1.diff-service"]
+ default = ["walking"]
+ [plugins."io.containerd.snapshotter.v1.devmapper"]
+ root_path = ""
+ pool_name = ""
+ base_image_size = ""
+ async_remove = false
diff --git a/cluster/nix/modules/kubelet.nix b/cluster/nix/modules/kubelet.nix
index f475b5b..1a71b48 100644
--- a/cluster/nix/modules/kubelet.nix
+++ b/cluster/nix/modules/kubelet.nix
@@ -16,7 +16,7 @@
name = "pause";
tag = "latest";
contents = top.package.pause;
- config.Cmd = "/bin/pause";
+ config.Cmd = ["/bin/pause"];
};
kubeconfig = top.lib.mkKubeConfig "kubelet" cfg.kubeconfig;
@@ -45,12 +45,6 @@
taints = concatMapStringsSep "," (v: "${v.key}=${v.value}:${v.effect}") (mapAttrsToList (n: v: v) cfg.taints);
in
{
- imports = [
- #(mkRemovedOptionModule [ "services" "kubernetes" "kubelet" "applyManifests" ] "")
- #(mkRemovedOptionModule [ "services" "kubernetes" "kubelet" "cadvisorPort" ] "")
- #(mkRemovedOptionModule [ "services" "kubernetes" "kubelet" "allowPrivileged" ] "")
- ];
-
# services/cluster/kubernetes/default.nix still wants to poke flannel,
# but since we nuke that module we have to add a fake tunable for it.
options.services.kubernetes.flannel = {
@@ -203,15 +197,57 @@
(mkIf cfg.enable {
services.kubernetes.kubelet.seedDockerImages = [infraContainer];
+ # Drop crictl into administrative command line.
+ environment.systemPackages = with pkgs; [ cri-tools ];
+
+ # Force disable Docker.
+ virtualisation.docker.enable = false;
+
+ # TODO(q3k): move to unified cgroups (cgroup v2) once we upgrade to
+ # Kubelet 1.19.
+ systemd.enableUnifiedCgroupHierarchy = false;
+
+  # Run containerd service. This exposes the CRI API that is consumed by
+ # crictl and Kubelet.
+ systemd.services.containerd = {
+ description = "containerd container runtime";
+ wantedBy = [ "kubernetes.target" ];
+ after = [ "network.target" ];
+ path = with pkgs; [ runc iptables ];
+ serviceConfig = {
+ Delegate = "yes";
+ KillMode = "process";
+ Restart = "always";
+ RestartSec = "5";
+ LimitNPROC = "infinity";
+ LimitCORE = "infinity";
+ # https://github.com/coreos/fedora-coreos-tracker/issues/329
+ LimitNOFILE = "1048576";
+ TasksMax = "infinity";
+ OOMScoreAdjust = "-999";
+
+ ExecStart = "${pkgs.containerd}/bin/containerd -c ${./containerd.toml}";
+ };
+ };
+
systemd.services.kubelet = {
description = "Kubernetes Kubelet Service";
wantedBy = [ "kubernetes.target" ];
- after = [ "network.target" "docker.service" "kube-apiserver.service" ];
- path = with pkgs; [ gitMinimal openssh docker utillinux iproute ethtool thin-provisioning-tools iptables socat ] ++ top.path;
+ after = [ "network.target" "containerd.service" "kube-apiserver.service" ];
+ path = with pkgs; [ gitMinimal openssh utillinux iproute ethtool thin-provisioning-tools iptables socat cri-tools containerd gzip ] ++ top.path;
+
+ # Mildly hacky - by moving over to OCI image build infrastructure in
+ # NixOS we should be able to get rid of the gunzip.
+ # TODO(q3k): figure this out, check if this is even being used by
+ # kubelet.
preStart = ''
${concatMapStrings (img: ''
- echo "Seeding docker image: ${img}"
- docker load <${img}
+ echo "Seeding OCI image: ${img}"
+ cp ${img} /tmp/image.tar.gz
+ rm -f /tmp/image.tar
+ gunzip /tmp/image.tar.gz
+ ctr -n=k8s.io images import /tmp/image.tar || true
+ rm /tmp/image.tar
'') cfg.seedDockerImages}
'';
serviceConfig = {
@@ -221,6 +257,9 @@
Restart = "on-failure";
RestartSec = "1000ms";
ExecStart = ''${cfg.package}/bin/kubelet \
+ --cgroup-driver=systemd \
+ --container-runtime=remote \
+ --container-runtime-endpoint=unix:///var/run/containerd/containerd.sock \
--address=${cfg.address} \
--authentication-token-webhook \
--authentication-token-webhook-cache-ttl="10s" \
@@ -263,7 +302,8 @@
};
};
- boot.kernelModules = ["br_netfilter"];
+ boot.kernelModules = [ "br_netfilter" "overlay" ];
+ boot.kernel.sysctl."net.ipv4.ip_forward" = "1";
services.kubernetes.kubelet.hostname = with config.networking;
mkDefault (hostName + optionalString (domain != null) ".${domain}");
diff --git a/cluster/nix/modules/kubernetes.nix b/cluster/nix/modules/kubernetes.nix
index 92e28de..879c50f 100644
--- a/cluster/nix/modules/kubernetes.nix
+++ b/cluster/nix/modules/kubernetes.nix
@@ -30,24 +30,6 @@
./kubelet.nix
];
- # List services that you want to enable:
- virtualisation.docker.enable = true;
- virtualisation.docker.extraOptions = "--iptables=false --ip-masq=false --ip-forward=true";
-
- # Docker 1.13 sets iptables FORWARD to DROP. Unfuck this.
- systemd.services."docker-iptables-unfuck" = {
- enable = true;
- wantedBy = [ "kubernetes.target" ];
- description = "Docker iptable Unfuck";
- after = [ "docker.service" ];
- requires = [ "docker.service" ];
- path = [ pkgs.iptables ];
- script = ''
- iptables -P FORWARD ACCEPT
- '';
- serviceConfig.Type = "oneshot";
- };
-
networking.firewall.enable = false;
# Point k8s apiserver address at ourselves, as every machine runs an apiserver with this cert name.