cluster: replace docker with containerd

This removes Docker and dockershim from our production Kubernetes cluster
and moves over to containerd/CRI. Docker support within Kubernetes was
always somewhat fragile, and as of 1.20 the dockershim integration is
deprecated and slated for removal. CRI/containerd/runc is pretty much the
new standard.

Change-Id: I98c89d5433f221b5fe766fcbef261fd72db530fe
diff --git a/cluster/nix/modules/kubelet.nix b/cluster/nix/modules/kubelet.nix
index f475b5b..1a71b48 100644
--- a/cluster/nix/modules/kubelet.nix
+++ b/cluster/nix/modules/kubelet.nix
@@ -16,7 +16,7 @@
     name = "pause";
     tag = "latest";
     contents = top.package.pause;
-    config.Cmd = "/bin/pause";
+    config.Cmd = ["/bin/pause"];
   };
 
   kubeconfig = top.lib.mkKubeConfig "kubelet" cfg.kubeconfig;
@@ -45,12 +45,6 @@
   taints = concatMapStringsSep "," (v: "${v.key}=${v.value}:${v.effect}") (mapAttrsToList (n: v: v) cfg.taints);
 in
 {
-  imports = [
-    #(mkRemovedOptionModule [ "services" "kubernetes" "kubelet" "applyManifests" ] "")
-    #(mkRemovedOptionModule [ "services" "kubernetes" "kubelet" "cadvisorPort" ] "")
-    #(mkRemovedOptionModule [ "services" "kubernetes" "kubelet" "allowPrivileged" ] "")
-  ];
-
   # services/cluster/kubernetes/default.nix still wants to poke flannel,
   # but since we nuke that module we have to add a fake tunable for it.
   options.services.kubernetes.flannel = {
@@ -203,15 +197,57 @@
     (mkIf cfg.enable {
       services.kubernetes.kubelet.seedDockerImages = [infraContainer];
 
+      # Make crictl available on the administrative command line.
+      environment.systemPackages = with pkgs; [ cri-tools ];
+
+      # Force disable Docker.
+      virtualisation.docker.enable = false;
+
+      # TODO(q3k): move to unified cgroups (cgroup v2) once we upgrade to
+      # Kubelet 1.19.
+      systemd.enableUnifiedCgroupHierarchy = false;
+
+      # Run the containerd service. This exposes the CRI API that is consumed
+      # by crictl and Kubelet.
+      systemd.services.containerd = {
+        description = "containerd container runtime";
+        wantedBy = [ "kubernetes.target" ];
+        after = [ "network.target" ];
+        path = with pkgs; [ runc iptables ];
+        serviceConfig = {
+          Delegate = "yes";
+          KillMode = "process";
+          Restart = "always";
+          RestartSec = "5";
+          LimitNPROC = "infinity";
+          LimitCORE = "infinity";
+          # https://github.com/coreos/fedora-coreos-tracker/issues/329
+          LimitNOFILE = "1048576";
+          TasksMax = "infinity";
+          OOMScoreAdjust = "-999";
+
+          ExecStart = "${pkgs.containerd}/bin/containerd -c ${./containerd.toml}";
+        };
+      };
+
       systemd.services.kubelet = {
         description = "Kubernetes Kubelet Service";
         wantedBy = [ "kubernetes.target" ];
-        after = [ "network.target" "docker.service" "kube-apiserver.service" ];
-        path = with pkgs; [ gitMinimal openssh docker utillinux iproute ethtool thin-provisioning-tools iptables socat ] ++ top.path;
+        after = [ "network.target" "containerd.service" "kube-apiserver.service" ];
+        path = with pkgs; [ gitMinimal openssh utillinux iproute ethtool thin-provisioning-tools iptables socat cri-tools containerd gzip ] ++ top.path;
+
+        # Mildly hacky - by moving over to OCI image build infrastructure in
+        # NixOS we should be able to get rid of the gunzip.
+        # TODO(q3k): figure this out, check if this is even being used by
+        # kubelet.
         preStart = ''
           ${concatMapStrings (img: ''
-            echo "Seeding docker image: ${img}"
-            docker load <${img}
+            echo "Seeding OCI image: ${img}"
+            cp ${img} /tmp/image.tar.gz
+            rm -f /tmp/image.tar
+            gunzip /tmp/image.tar.gz
+            ctr -n=k8s.io images import /tmp/image.tar || true
+            rm /tmp/image.tar
           '') cfg.seedDockerImages}
         '';
         serviceConfig = {
@@ -221,6 +257,9 @@
           Restart = "on-failure";
           RestartSec = "1000ms";
           ExecStart = ''${cfg.package}/bin/kubelet \
+            --cgroup-driver=systemd \
+            --container-runtime=remote \
+            --container-runtime-endpoint=unix:///var/run/containerd/containerd.sock \
             --address=${cfg.address} \
             --authentication-token-webhook \
             --authentication-token-webhook-cache-ttl="10s" \
@@ -263,7 +302,8 @@
         };
       };
 
-      boot.kernelModules = ["br_netfilter"];
+      boot.kernelModules = [ "br_netfilter" "overlay" ];
+      boot.kernel.sysctl."net.ipv4.ip_forward" = "1";
 
       services.kubernetes.kubelet.hostname = with config.networking;
         mkDefault (hostName + optionalString (domain != null) ".${domain}");