cluster: replace docker with containerd

This removes Docker and docker-shim from our production kubernetes, and
moves over to containerd/CRI. Docker support within Kubernetes was
always slightly shitty, and with 1.20 the integration was dropped
entirely. CRI/Containerd/runc is pretty much the new standard.

Change-Id: I98c89d5433f221b5fe766fcbef261fd72db530fe
diff --git a/cluster/nix/modules/containerd.toml b/cluster/nix/modules/containerd.toml
new file mode 100644
index 0000000..b079637
--- /dev/null
+++ b/cluster/nix/modules/containerd.toml
@@ -0,0 +1,134 @@
+version = 2
+root = "/var/lib/containerd"
+state = "/run/containerd"
+plugin_dir = ""
+disabled_plugins = []
+required_plugins = []
+oom_score = 0
+
+[grpc]
+  address = "/run/containerd/containerd.sock"
+  tcp_address = ""
+  tcp_tls_cert = ""
+  tcp_tls_key = ""
+  uid = 0
+  gid = 0
+  max_recv_message_size = 16777216
+  max_send_message_size = 16777216
+
+[ttrpc]
+  address = ""
+  uid = 0
+  gid = 0
+
+[debug]
+  address = ""
+  uid = 0
+  gid = 0
+  level = ""
+
+[metrics]
+  address = ""
+  grpc_histogram = false
+
+[cgroup]
+  path = ""
+
+[timeouts]
+  "io.containerd.timeout.shim.cleanup" = "5s"
+  "io.containerd.timeout.shim.load" = "5s"
+  "io.containerd.timeout.shim.shutdown" = "3s"
+  "io.containerd.timeout.task.state" = "2s"
+
+[plugins]
+  [plugins."io.containerd.gc.v1.scheduler"]
+    pause_threshold = 0.02
+    deletion_threshold = 0
+    mutation_threshold = 100
+    schedule_delay = "0s"
+    startup_delay = "100ms"
+  [plugins."io.containerd.grpc.v1.cri"]
+    disable_tcp_service = true
+    stream_server_address = "127.0.0.1"
+    stream_server_port = "0"
+    stream_idle_timeout = "4h0m0s"
+    enable_selinux = false
+    selinux_category_range = 1024
+    sandbox_image = "k8s.gcr.io/pause:3.2"
+    stats_collect_period = 10
+    systemd_cgroup = false
+    enable_tls_streaming = false
+    max_container_log_line_size = 16384
+    disable_cgroup = false
+    disable_apparmor = false
+    restrict_oom_score_adj = false
+    max_concurrent_downloads = 3
+    disable_proc_mount = false
+    unset_seccomp_profile = ""
+    tolerate_missing_hugetlb_controller = true
+    disable_hugetlb_controller = true
+    ignore_image_defined_volumes = false
+    [plugins."io.containerd.grpc.v1.cri".containerd]
+      snapshotter = "overlayfs"
+      default_runtime_name = "runc"
+      no_pivot = false
+      disable_snapshot_annotations = true
+      discard_unpacked_layers = false
+      [plugins."io.containerd.grpc.v1.cri".containerd.default_runtime]
+        runtime_type = ""
+        runtime_engine = ""
+        runtime_root = ""
+        privileged_without_host_devices = false
+        base_runtime_spec = ""
+      [plugins."io.containerd.grpc.v1.cri".containerd.untrusted_workload_runtime]
+        runtime_type = ""
+        runtime_engine = ""
+        runtime_root = ""
+        privileged_without_host_devices = false
+        base_runtime_spec = ""
+      [plugins."io.containerd.grpc.v1.cri".containerd.runtimes]
+        [plugins."io.containerd.grpc.v1.cri".containerd.runtimes.runc]
+          runtime_type = "io.containerd.runc.v2"
+          runtime_engine = ""
+          runtime_root = ""
+          privileged_without_host_devices = false
+          base_runtime_spec = ""
+          [plugins."io.containerd.grpc.v1.cri".containerd.runtimes.runc.options]
+            SystemdCgroup = true
+    [plugins."io.containerd.grpc.v1.cri".cni]
+      bin_dir = "/opt/cni/bin"
+      conf_dir = "/opt/cni/conf"
+      max_conf_num = 1
+      conf_template = ""
+    [plugins."io.containerd.grpc.v1.cri".registry]
+      [plugins."io.containerd.grpc.v1.cri".registry.mirrors]
+        [plugins."io.containerd.grpc.v1.cri".registry.mirrors."docker.io"]
+          endpoint = ["https://registry-1.docker.io"]
+    [plugins."io.containerd.grpc.v1.cri".image_decryption]
+      key_model = ""
+    [plugins."io.containerd.grpc.v1.cri".x509_key_pair_streaming]
+      tls_cert_file = ""
+      tls_key_file = ""
+  [plugins."io.containerd.internal.v1.opt"]
+    path = "/opt/containerd"
+  [plugins."io.containerd.internal.v1.restart"]
+    interval = "10s"
+  [plugins."io.containerd.metadata.v1.bolt"]
+    content_sharing_policy = "shared"
+  [plugins."io.containerd.monitor.v1.cgroups"]
+    no_prometheus = false
+  [plugins."io.containerd.runtime.v1.linux"]
+    shim = "containerd-shim"
+    runtime = "runc"
+    runtime_root = ""
+    no_shim = false
+    shim_debug = false
+  [plugins."io.containerd.runtime.v2.task"]
+    platforms = ["linux/amd64"]
+  [plugins."io.containerd.service.v1.diff-service"]
+    default = ["walking"]
+  [plugins."io.containerd.snapshotter.v1.devmapper"]
+    root_path = ""
+    pool_name = ""
+    base_image_size = ""
+    async_remove = false
diff --git a/cluster/nix/modules/kubelet.nix b/cluster/nix/modules/kubelet.nix
index f475b5b..1a71b48 100644
--- a/cluster/nix/modules/kubelet.nix
+++ b/cluster/nix/modules/kubelet.nix
@@ -16,7 +16,7 @@
     name = "pause";
     tag = "latest";
     contents = top.package.pause;
-    config.Cmd = "/bin/pause";
+    config.Cmd = ["/bin/pause"];
   };
 
   kubeconfig = top.lib.mkKubeConfig "kubelet" cfg.kubeconfig;
@@ -45,12 +45,6 @@
   taints = concatMapStringsSep "," (v: "${v.key}=${v.value}:${v.effect}") (mapAttrsToList (n: v: v) cfg.taints);
 in
 {
-  imports = [
-    #(mkRemovedOptionModule [ "services" "kubernetes" "kubelet" "applyManifests" ] "")
-    #(mkRemovedOptionModule [ "services" "kubernetes" "kubelet" "cadvisorPort" ] "")
-    #(mkRemovedOptionModule [ "services" "kubernetes" "kubelet" "allowPrivileged" ] "")
-  ];
-
   # services/cluster/kubernetes/default.nix still wants to poke flannel,
   # but since we nuke that module we have to add a fake tunable for it.
   options.services.kubernetes.flannel = {
@@ -203,15 +197,57 @@
     (mkIf cfg.enable {
       services.kubernetes.kubelet.seedDockerImages = [infraContainer];
 
+      # Make crictl available on the administrative command line.
+      environment.systemPackages = with pkgs; [ cri-tools ];
+
+      # Force disable Docker.
+      virtualisation.docker.enable = false;
+
+      # TODO(q3k): move to unified cgroups (cgroup v2) once we upgrade to
+      # Kubelet 1.19.
+      systemd.enableUnifiedCgroupHierarchy = false;
+
+      # Run containerd service. This exposes the CRI API that is consumed by
+      # crictl and Kubelet.
+      systemd.services.containerd = {
+        description = "containerd container runtime";
+        wantedBy = [ "kubernetes.target" ];
+        after = [ "network.target" ];
+        path = with pkgs; [ runc iptables ];
+        serviceConfig = {
+          Delegate = "yes";
+          KillMode = "process";
+          Restart = "always";
+          RestartSec = "5";
+          LimitNPROC = "infinity";
+          LimitCORE = "infinity";
+          # https://github.com/coreos/fedora-coreos-tracker/issues/329
+          LimitNOFILE = "1048576";
+          TasksMax = "infinity";
+          OOMScoreAdjust = "-999";
+
+          ExecStart = "${pkgs.containerd}/bin/containerd -c ${./containerd.toml}";
+        };
+      };
+
       systemd.services.kubelet = {
         description = "Kubernetes Kubelet Service";
         wantedBy = [ "kubernetes.target" ];
-        after = [ "network.target" "docker.service" "kube-apiserver.service" ];
-        path = with pkgs; [ gitMinimal openssh docker utillinux iproute ethtool thin-provisioning-tools iptables socat ] ++ top.path;
+        after = [ "network.target" "containerd.service" "kube-apiserver.service" ];
+        path = with pkgs; [ gitMinimal openssh utillinux iproute ethtool thin-provisioning-tools iptables socat cri-tools containerd gzip ] ++ top.path;
+
+        # Mildly hacky - by moving over to OCI image build infrastructure in
+        # NixOS we should be able to get rid of the gunzip.
+        # TODO(q3k): figure this out, check if this is even being used by
+        # kubelet.
         preStart = ''
           ${concatMapStrings (img: ''
-            echo "Seeding docker image: ${img}"
-            docker load <${img}
+            echo "Seeding OCI image: ${img}"
+            cp ${img} /tmp/image.tar.gz
+            rm -f /tmp/image.tar
+            gunzip /tmp/image.tar.gz
+            ctr -n=k8s.io images import /tmp/image.tar || true
+            rm /tmp/image.tar
           '') cfg.seedDockerImages}
         '';
         serviceConfig = {
@@ -221,6 +257,9 @@
           Restart = "on-failure";
           RestartSec = "1000ms";
           ExecStart = ''${cfg.package}/bin/kubelet \
+            --cgroup-driver=systemd \
+            --container-runtime=remote \
+            --container-runtime-endpoint=unix:///var/run/containerd/containerd.sock \
             --address=${cfg.address} \
             --authentication-token-webhook \
             --authentication-token-webhook-cache-ttl="10s" \
@@ -263,7 +302,8 @@
         };
       };
 
-      boot.kernelModules = ["br_netfilter"];
+      boot.kernelModules = [ "br_netfilter" "overlay" ];
+      boot.kernel.sysctl."net.ipv4.ip_forward" = "1";
 
       services.kubernetes.kubelet.hostname = with config.networking;
         mkDefault (hostName + optionalString (domain != null) ".${domain}");
diff --git a/cluster/nix/modules/kubernetes.nix b/cluster/nix/modules/kubernetes.nix
index 92e28de..879c50f 100644
--- a/cluster/nix/modules/kubernetes.nix
+++ b/cluster/nix/modules/kubernetes.nix
@@ -30,24 +30,6 @@
       ./kubelet.nix
     ];
 
-  # List services that you want to enable:
-  virtualisation.docker.enable = true;
-  virtualisation.docker.extraOptions = "--iptables=false --ip-masq=false --ip-forward=true";
-
-  # Docker 1.13 sets iptables FORWARD to DROP. Unfuck this.
-  systemd.services."docker-iptables-unfuck" = {
-    enable = true;
-    wantedBy = [ "kubernetes.target" ];
-    description = "Docker iptable Unfuck";
-    after = [ "docker.service" ];
-    requires = [ "docker.service" ];
-    path = [ pkgs.iptables ];
-    script = ''
-      iptables -P FORWARD ACCEPT
-    '';
-    serviceConfig.Type = "oneshot";
-  };
-
   networking.firewall.enable = false;
 
   # Point k8s apiserver address at ourselves, as every machine runs an apiserver with this cert name.