cluster: replace docker with containerd
This removes Docker and docker-shim from our production kubernetes, and
moves over to containerd/CRI. Docker support within Kubernetes was
always somewhat unreliable, and as of 1.20 the dockershim integration is
deprecated and slated for removal. CRI/containerd/runc is pretty much the
new standard.
Change-Id: I98c89d5433f221b5fe766fcbef261fd72db530fe
diff --git a/cluster/nix/modules/kubelet.nix b/cluster/nix/modules/kubelet.nix
index f475b5b..1a71b48 100644
--- a/cluster/nix/modules/kubelet.nix
+++ b/cluster/nix/modules/kubelet.nix
@@ -16,7 +16,7 @@
name = "pause";
tag = "latest";
contents = top.package.pause;
- config.Cmd = "/bin/pause";
+ config.Cmd = ["/bin/pause"];
};
kubeconfig = top.lib.mkKubeConfig "kubelet" cfg.kubeconfig;
@@ -45,12 +45,6 @@
taints = concatMapStringsSep "," (v: "${v.key}=${v.value}:${v.effect}") (mapAttrsToList (n: v: v) cfg.taints);
in
{
- imports = [
- #(mkRemovedOptionModule [ "services" "kubernetes" "kubelet" "applyManifests" ] "")
- #(mkRemovedOptionModule [ "services" "kubernetes" "kubelet" "cadvisorPort" ] "")
- #(mkRemovedOptionModule [ "services" "kubernetes" "kubelet" "allowPrivileged" ] "")
- ];
-
# services/cluster/kubernetes/default.nix still wants to poke flannel,
# but since we nuke that module we have to add a fake tunable for it.
options.services.kubernetes.flannel = {
@@ -203,15 +197,57 @@
(mkIf cfg.enable {
services.kubernetes.kubelet.seedDockerImages = [infraContainer];
+ # Drop crictl into administrative command line.
+ environment.systemPackages = with pkgs; [ cri-tools ];
+
+ # Force disable Docker.
+ virtualisation.docker.enable = false;
+
+ # TODO(q3k): move to unified cgroups (cgroup v2) once we upgrade to
+ # Kubelet 1.19.
+ systemd.enableUnifiedCgroupHierarchy = false;
+
+ # Run containerd service. This exposes the CRI API that is consumed by
+ # crictl and Kubelet.
+ systemd.services.containerd = {
+ description = "containerd container runtime";
+ wantedBy = [ "kubernetes.target" ];
+ after = [ "network.target" ];
+ path = with pkgs; [ runc iptables ];
+ serviceConfig = {
+ Delegate = "yes";
+ KillMode = "process";
+ Restart = "always";
+ RestartSec = "5";
+ LimitNPROC = "infinity";
+ LimitCORE = "infinity";
+ # https://github.com/coreos/fedora-coreos-tracker/issues/329
+ LimitNOFILE = "1048576";
+ TasksMax = "infinity";
+ OOMScoreAdjust = "-999";
+
+ ExecStart = "${pkgs.containerd}/bin/containerd -c ${./containerd.toml}";
+ };
+ };
+
systemd.services.kubelet = {
description = "Kubernetes Kubelet Service";
wantedBy = [ "kubernetes.target" ];
- after = [ "network.target" "docker.service" "kube-apiserver.service" ];
- path = with pkgs; [ gitMinimal openssh docker utillinux iproute ethtool thin-provisioning-tools iptables socat ] ++ top.path;
+ after = [ "network.target" "containerd.service" "kube-apiserver.service" ];
+ path = with pkgs; [ gitMinimal openssh utillinux iproute ethtool thin-provisioning-tools iptables socat cri-tools containerd gzip ] ++ top.path;
+
+ # Mildly hacky - by moving over to OCI image build infrastructure in
+ # NixOS we should be able to get rid of the gunzip.
+ # TODO(q3k): figure this out, check if this is even being used by
+ # kubelet.
preStart = ''
${concatMapStrings (img: ''
- echo "Seeding docker image: ${img}"
- docker load <${img}
+ echo "Seeding OCI image: ${img}"
+ cp ${img} /tmp/image.tar.gz
+ rm -f /tmp/image.tar
+ gunzip /tmp/image.tar.gz
+ ctr -n=k8s.io images import /tmp/image.tar || true
+ rm /tmp/image.tar
'') cfg.seedDockerImages}
'';
serviceConfig = {
@@ -221,6 +257,9 @@
Restart = "on-failure";
RestartSec = "1000ms";
ExecStart = ''${cfg.package}/bin/kubelet \
+ --cgroup-driver=systemd \
+ --container-runtime=remote \
+ --container-runtime-endpoint=unix:///var/run/containerd/containerd.sock \
--address=${cfg.address} \
--authentication-token-webhook \
--authentication-token-webhook-cache-ttl="10s" \
@@ -263,7 +302,8 @@
};
};
- boot.kernelModules = ["br_netfilter"];
+ boot.kernelModules = [ "br_netfilter" "overlay" ];
+ boot.kernel.sysctl."net.ipv4.ip_forward" = "1";
services.kubernetes.kubelet.hostname = with config.networking;
mkDefault (hostName + optionalString (domain != null) ".${domain}");