cluster: refactor nix machinery to fit //ops

This is a chonky refactor that gets rid of the previous cluster-centric
defs-* plain nix file setup.

Now, nodes are configured individually in plain nixos modules, and are
provided a view of all other nodes in the 'machines' attribute. Cluster
logic is moved into modules which inspect this attribute to find other nodes
within the same cluster.

Kubernetes options are not fully clusterified yet (ie., they are still
hardcoded to only provide the 'k0' cluster) but that can be fixed later.
The Ceph machinery is a good example of how that can be done.

The new NixOS configs are zero-diff against prod. While this is done
mostly by keeping the logic, we had to keep a few newly discovered
'bugs' around by adding some temporary options which keep things as they
are. These will be removed in a future CL, thereby introducing a diff (but
no functional changes, hopefully).

We also remove the nix eval from clustercfg as it was not used anymore
(basically since we refactored certs at some point).

Change-Id: Id79772a96249b0e6344046f96f9c2cb481c4e1f4
Reviewed-on: https://gerrit.hackerspace.pl/c/hscloud/+/1322
Reviewed-by: informatic <informatic@hackerspace.pl>
diff --git a/cluster/machines/bc01n01.hswaw.net.nix b/cluster/machines/bc01n01.hswaw.net.nix
new file mode 100644
index 0000000..defcbca
--- /dev/null
+++ b/cluster/machines/bc01n01.hswaw.net.nix
@@ -0,0 +1,39 @@
+{ config, pkgs, ... }:
+
+with builtins;
+
+rec {
+  networking.hostName = "bc01n01";
+  # TODO: undefine fqdn and define domain after big nix change
+  hscloud.base.fqdn = "${networking.hostName}.hswaw.net";
+  #networking.domain = "hswaw.net";
+  system.stateVersion = "18.09";
+  nix.maxJobs = 16;
+
+  boot.loader.grub.device = "/dev/disk/by-id/scsi-360024e8078a9060023b1043107388af5";
+  fileSystems."/".device = "/dev/disk/by-uuid/518ecac1-00ea-4ef0-9418-9eca6ce6d918";
+
+  hscloud.base = {
+    mgmtIf = "eno1";
+    ipAddr = "185.236.240.35";
+    ipAddrBits = 28;
+    gw = "185.236.240.33";
+  };
+
+  hscloud.kube.control.enable = true;
+  hscloud.kube.data = {
+    enable = true;
+    podNet = "10.10.16.0/24";
+  };
+
+  hscloud.ceph = {
+    name = "k0";
+    fsid = "74592dc2-31b7-4dbe-88cf-40459dfeb354";
+    enable = true;
+  };
+
+  environment.systemPackages = [
+    pkgs.shadow
+  ];
+}
+
diff --git a/cluster/machines/bc01n02.hswaw.net.nix b/cluster/machines/bc01n02.hswaw.net.nix
new file mode 100644
index 0000000..43b61d0
--- /dev/null
+++ b/cluster/machines/bc01n02.hswaw.net.nix
@@ -0,0 +1,36 @@
+{ config, pkgs, ... }:
+
+with builtins;
+
+rec {
+  networking.hostName = "bc01n02";
+  # TODO: undefine fqdn and define domain after big nix change
+  hscloud.base.fqdn = "${networking.hostName}.hswaw.net";
+  #networking.domain = "hswaw.net";
+  system.stateVersion = "18.09";
+  nix.maxJobs = 16;
+
+  boot.loader.grub.device = "/dev/disk/by-id/scsi-360024e8078b0250023b10f8706d3c99e";
+  fileSystems."/".device = "/dev/disk/by-uuid/2d45c87b-029b-463e-a7cb-afd5a3089327";
+
+  hscloud.base = {
+    mgmtIf = "eno1";
+    ipAddr = "185.236.240.36";
+    ipAddrBits = 28;
+    gw = "185.236.240.33";
+  };
+
+  hscloud.kube = {
+    control.enable = true;
+    data.enable = true;
+    data.podNet = "10.10.17.0/24";
+  };
+
+  hscloud.ceph = {
+    name = "k0";
+    fsid = "74592dc2-31b7-4dbe-88cf-40459dfeb354";
+
+    control.enable = true;
+  };
+}
+
diff --git a/cluster/machines/dcr01s22.hswaw.net.nix b/cluster/machines/dcr01s22.hswaw.net.nix
new file mode 100644
index 0000000..742a541
--- /dev/null
+++ b/cluster/machines/dcr01s22.hswaw.net.nix
@@ -0,0 +1,41 @@
+{ config, pkgs, ... }:
+
+with builtins;
+
+rec {
+  networking.hostName = "dcr01s22";
+  # TODO: undefine fqdn and define domain after big nix change
+  hscloud.base.fqdn = "${networking.hostName}.hswaw.net";
+  #networking.domain = "hswaw.net";
+  system.stateVersion = "19.09";
+  nix.maxJobs = 48;
+
+  boot.loader.grub.device = "/dev/disk/by-id/ata-Samsung_SSD_860_EVO_250GB_S3YJNX1M604518E";
+  fileSystems."/".device = "/dev/disk/by-uuid/b4149083-49fe-4951-a143-aff4cedaf33a";
+
+  hscloud.base = {
+    mgmtIf = "enp130s0f0";
+    ipAddr = "185.236.240.39";
+    ipAddrBits = 28;
+    gw = "185.236.240.33";
+  };
+
+  hscloud.kube = {
+    control.enable = true;
+    data.enable = true;
+    data.podNet = "10.10.19.0/24";
+  };
+
+  hscloud.ceph = {
+    name = "k0";
+    fsid = "74592dc2-31b7-4dbe-88cf-40459dfeb354";
+
+    osd.devices = [
+      { id = 0; path = "/dev/disk/by-id/scsi-35000c500850293e3"; uuid = "314034c5-474c-4d0d-ba41-36a881c52560";}
+      { id = 1; path = "/dev/disk/by-id/scsi-35000c500850312cb"; uuid = "a7f1baa0-0fc3-4ab1-9895-67abdc29de03";}
+      { id = 2; path = "/dev/disk/by-id/scsi-35000c5008508e3ef"; uuid = "11ac8316-6a87-48a7-a0c7-74c3cef6c2fa";}
+      { id = 3; path = "/dev/disk/by-id/scsi-35000c5008508e23f"; uuid = "c6b838d1-b08c-4788-936c-293041ed2d4d";}
+    ];
+  };
+}
+
diff --git a/cluster/machines/dcr01s24.hswaw.net.nix b/cluster/machines/dcr01s24.hswaw.net.nix
new file mode 100644
index 0000000..c3ad18e
--- /dev/null
+++ b/cluster/machines/dcr01s24.hswaw.net.nix
@@ -0,0 +1,41 @@
+{ config, pkgs, ... }:
+
+with builtins;
+
+rec {
+  networking.hostName = "dcr01s24";
+  # TODO: undefine fqdn and define domain after big nix change
+  hscloud.base.fqdn = "${networking.hostName}.hswaw.net";
+  #networking.domain = "hswaw.net";
+  system.stateVersion = "19.09";
+  nix.maxJobs = 48;
+
+  boot.loader.grub.device = "/dev/disk/by-id/ata-Samsung_SSD_860_EVO_250GB_S3YJNF0M717009H";
+  fileSystems."/".device = "/dev/disk/by-uuid/fc5c6456-5bbd-4b9e-a93e-7f9073ffe09a";
+
+  hscloud.base = {
+    mgmtIf = "enp130s0f0";
+    ipAddr = "185.236.240.40";
+    ipAddrBits = 28;
+    gw = "185.236.240.33";
+  };
+
+  hscloud.kube = {
+    control.enable = true;
+    data.enable = true;
+    data.podNet = "10.10.20.0/24";
+  };
+
+  hscloud.ceph = {
+    name = "k0";
+    fsid = "74592dc2-31b7-4dbe-88cf-40459dfeb354";
+
+    osd.devices = [
+      { id = 4; path = "/dev/disk/by-id/scsi-35000c5008509199b"; uuid = "a2b4663d-bd8f-49b3-b0b0-195c56ba252f";}
+      { id = 5; path = "/dev/disk/by-id/scsi-35000c50085046abf"; uuid = "a2242989-ccce-4367-8813-519b64b5afdb";}
+      { id = 6; path = "/dev/disk/by-id/scsi-35000c5008502929b"; uuid = "7deac89c-22dd-4c2b-b3cc-43ff7f990fd6";}
+      { id = 7; path = "/dev/disk/by-id/scsi-35000c5008502a323"; uuid = "e305ebb3-9cac-44d2-9f1d-bbb72c8ab51f";}
+    ];
+  };
+}
+
diff --git a/cluster/machines/modules/base.nix b/cluster/machines/modules/base.nix
new file mode 100644
index 0000000..66335ef
--- /dev/null
+++ b/cluster/machines/modules/base.nix
@@ -0,0 +1,100 @@
+{ config, pkgs, lib, ... }:
+
+with lib;
+
+let
+  cfg = config.hscloud.base;
+
+in {
+  options.hscloud.base = {
+    fqdn = mkOption {
+      type = types.str;
+      description = "Node's FQDN.";
+      default = "${config.networking.hostName}.${config.networking.domain}";
+    };
+    mgmtIf = mkOption {
+      type = types.str;
+      description = "Main network interface. Called mgmtIf for legacy reasons.";
+    };
+    ipAddr = mkOption {
+      type = types.str;
+      description = "IPv4 address on main network interface.";
+    };
+    ipAddrBits = mkOption {
+      type = types.int;
+      description = "IPv4 CIDR mask bits.";
+    };
+    gw = mkOption {
+      type = types.str;
+      description = "IPv4 address of gateway.";
+    };
+  };
+  config = rec {
+    boot.loader.grub.enable = true;
+    boot.loader.grub.version = 2;
+  
+    fileSystems."/" =
+      { # device = ""; needs to be defined
+        fsType = "ext4";
+      };
+    swapDevices = [ ];
+  
+    boot.kernelPackages = pkgs.linuxPackages_latest;
+    boot.kernelParams = [ "boot.shell_on_fail" ];
+    boot.kernel.sysctl."net.ipv4.conf.all.rp_filter" = "0";
+    boot.kernel.sysctl."net.ipv4.conf.default.rp_filter" = "0";
+    boot.initrd.availableKernelModules = [ "uhci_hcd" "ehci_pci" "megaraid_sas" "usb_storage" "usbhid" "sd_mod" "sr_mod"  ];
+    boot.kernelModules = [ "kvm-intel" ];
+    boot.extraModulePackages = [];
+    hardware.enableRedistributableFirmware = true;
+  
+    time.timeZone = "Europe/Warsaw";
+  
+    environment.systemPackages = with pkgs; [
+      wget vim htop tcpdump
+      rxvt_unicode.terminfo
+    ];
+    programs.mtr.enable = true;
+  
+    networking.useDHCP = false;
+    networking.interfaces."${cfg.mgmtIf}" = {
+      ipv4.addresses = [
+        {
+          address = cfg.ipAddr;
+          prefixLength = cfg.ipAddrBits;
+        }
+      ];
+    };
+    networking.defaultGateway = cfg.gw;
+    networking.nameservers = ["185.236.240.1"];
+  
+    # Instead of using nixpkgs from the root/nixos channel, use pkgs pin from this file.
+    nix.nixPath = [ "nixpkgs=${pkgs.path}" "nixos-config=/etc/nixos/configuration.nix" ];
+  
+    # Otherwise fetchGit nixpkgs pin fails.
+    systemd.services.nixos-upgrade.path = [ pkgs.git ];
+  
+    # Use Chrony instead of systemd-timesyncd
+    services.chrony.enable = true;
+  
+    # Symlink lvm into /sbin/lvm on activation. This is needed by Rook OSD
+    # instances running on Kubernetes.
+    # See: https://github.com/rook/rook/commit/f3c4975e353e3ce3599c958ec6d2cae8ee8f6f61
+    system.activationScripts.sbinlvm =
+      ''
+        mkdir -m 0755 -p /sbin
+        ln -sfn ${pkgs.lvm2.bin}/bin/lvm /sbin/lvm
+      '';
+  
+    # Enable the OpenSSH daemon.
+    services.openssh.enable = true;
+    users.users.root.openssh.authorizedKeys.keys = [
+      "ssh-rsa AAAAB3NzaC1yc2EAAAADAQABAAABAQDD4VJXAXEHEXZk2dxNwehneuJcEGkfXG/U7z4fO79vDVIENdedtXQUyLyhZJc5RTEfHhQj66FwIqzl7mzBHd9x9PuDp6QAYXrkVNMj48s6JXqZqBvF6H/weRqFMf4a2TZv+hG8D0kpvmLheCwWAVRls7Jofnp/My+yDd57GMdsbG/yFEf6WPMiOnA7hxdSJSVihCsCSw2p8PD4GhBe8CVt7xIuinhutjm9zYBjV78NT8acjDUfJh0B1ODTjs7nuW1CC4jybSe2j/OU3Yczj4AxRxBNWuFxUq+jBo9BfpbKLh+Tt7re+zBkaicM77KM/oV6943JJxgHNBBOsv9scZE7 q3k@amnesia"
+      "ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIG599UildOrAq+LIOQjKqtGMwjgjIxozI1jtQQRKHtCP q3k@mimeomia"
+      "ssh-rsa AAAAB3NzaC1yc2EAAAADAQABAAABAQDQb3YQoiYFZLKwvHYKbu1bMqzNeDCAszQhAe1+QI5SLDOotclyY/vFmOReZOsmyMFl71G2d7d+FbYNusUnNNjTxRYQ021tVc+RkMdLJaORRURmQfEFEKbai6QSFTwErXzuoIzyEPK0lbsQuGgqT9WaVnRzHJ2Q/4+qQbxAS34PuR5NqEkmn4G6LMo3OyJ5mwPkCj9lsqz4BcxRaMWFO3mNcwGDfSW+sqgc3E8N6LKrTpZq3ke7xacpQmcG5DU9VO+2QVPdltl9jWbs3gXjmF92YRNOuKPVfAOZBBsp8JOznfx8s9wDgs7RwPmDpjIAJEyoABqW5hlXfqRbTnfnMvuR informatic@InformaticPC"
+      "ssh-rsa AAAAB3NzaC1yc2EAAAADAQABAAACAQDGkMgEVwQM8yeuFUYL2TwlJIq9yUNBmHnwce46zeL2PK2CkMz7sxT/om7sp/K5XDiqeD05Nioe+Dr3drP6B8uI33S5NgxPIfaqQsRS+CBEgk6cqFlcdlKETU/DT+/WsdoO173n7mgGeafPInEuQuGDUID0Fl099kIxtqfAhdeZFMM6/szAZEZsElLJ8K6dp1Ni/jmnXCZhjivZH3AZUlnqrmtDG7FY1bgcOfDXAal45LItughGPtrdiigXe9DK2fW3+9DBZZduh5DMJTNlphAZ+nfSrbyHVKUg6WsgMSprur4KdU47q1QwzqqvEj75JcdP1jOWoZi4F6VJDte9Wb9lhD1jGgjxY9O6Gs4CH35bx15W7CN9hgNa0C8NbPJe/fZYIeMZmJ1m7O2xmnYwP8j+t7RNJWu7Pa3Em4mOEXvhBF07Zfq+Ye/4SluoRgADy5eII2x5fFo5EBhInxK0/X8wF6XZvysalVifoCh7T4Edejoi91oAxFgYAxbboXGlod0eEHIi2hla8SM9+IBHOChmgawKBYp2kzAJyAmHNBF+Pah9G4arVCj/axp/SJZDZbJQoI7UT/fJzEtvlb5RWrHXRq+y6IvjpUq4pzpDWW04+9UMqEEXRmhWOakHfEVM9rN8h3aJBflLUBBnh0Z/hVsKNh8bCRHaKtah8TrD9i+wMw== patryk.jakuszew@gmail.com"
+      "ssh-rsa AAAAB3NzaC1yc2EAAAADAQABAAACAQC33naG1ptCvUcRWX9cj9wXM1nW1lyQC4SvMJzWlr9aMD96O8hQ2JMkuIUgUJvorAY02QRplQ2BuoVoVkdkzwjMyi1bL3OdgcKo7Z1yByClGTTocqNJYY0lcUb6EJH8+6e6F9ydrQlSxNzL1uCaA7phZr+yPcmAmWbSfioXn98yXNkE0emHxzJv/nypJY56sDCMC2IXDRd8L2goDtPwgPEW7bWfAQdIFMJ75xOidZOTxJ8eqyXLw/kxY5UlyX66jdoYz1sE5XUHuoQl1AOG9UdlMo0aMhUvP4pX5l7r7EnA9OttKMFB3oWqkVK/R6ynZ52YNOU5BZ9V+Ppaj34W0xNu+p0mbHcCtXYCTrf/OU0hcZDbDaNTjs6Vtcm2wYw9iAKX7Tex+eOMwUwlrlcyPNRV5BTot7lGNYfauHCSIuWJKN4NhCLR/NtVNh4/94eKkPTwJsY6XqDcS7q49wPAs4DAH7BJgsbHPOqygVHrY0YYEfz3Pj0HTxJHQMCP/hQX4fXEGt0BjgoVJbXPAQtPyeg0JuxiUg+b4CgVVfQ6R060MlM1BZzhmh+FY5MJH6nJppS0aHYCvSg8Z68NUlCPKy0jpcyfuAIWQWwSGG1O010WShQG2ELsvNdg5/4HVdCGNl5mmoom6JOd72FOZyQlHDFfeQUQRn9HOeCq/c51rK99SQ== bartek@IHM"
+      "ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAICTR292kx/2CNuWYIsZ6gykQ036aBGrmheIuZa6S1D2x implr@thonk"
+    ];
+  };
+}
diff --git a/cluster/machines/modules/ceph.nix b/cluster/machines/modules/ceph.nix
new file mode 100644
index 0000000..4f15bdd
--- /dev/null
+++ b/cluster/machines/modules/ceph.nix
@@ -0,0 +1,193 @@
+# This runs Ceph on hscloud cluster(s).
+#
+# This lightly wraps the upstream NixOS ceph module, which is already fairly light.
+#
+# Most importantly, it does _not_ attempt to do any cluster
+# bootstrapping/maintenance. This means, that any configuration action that
+# does the following:
+#  0. Bringing up a cluster
+#  1. Adding/removing Mons
+#  2. Changing a Mon IP address
+#  3. Adding/removing OSDs
+# ... must be done in tandem with manual operations on the affected nodes. For
+# example, bootstrapping a cluster will involve keychain and monmap management,
+# changing anything with mons will involve monmap management, adding new OSDs
+# will require provisioning them with ceph-volume, etc.
+#
+# This is in stark contrast to a fully-managed solution like rook. Since we
+# don't have hundreds of clusters, none of the above is automated, especially
+# as that kind of automation is quite tricky to do reliably.
+
+{ config, lib, pkgs, machines, ... }:
+
+with lib;
+
+let
+  cfg = config.hscloud.ceph;
+
+  allNodes = let
+    list = mapAttrsToList (_: v: v) machines;
+    filtered = filter (m: (m.config ? hscloud.ceph) && (m.config.hscloud.ceph.enable)) list;
+    sorted = sort (a: b: a.config.hscloud.base.fqdn < b.config.hscloud.base.fqdn) filtered;
+  in sorted;
+
+  monNodes = filter (m: m.config.hscloud.ceph.control.enable) allNodes;
+
+  machineName = config.networking.hostName;
+
+  # This NixOS Ceph option fragment is present on every machine that runs a
+  # mon, and basically tells the NixOS machinery to run mons/mgrs if needed on
+  # this machine.
+  cephMonConfig = if cfg.control.enable then {
+    mon = {
+      enable = true;
+      daemons = [ machineName ];
+    };
+    mgr = {
+      enable = true;
+      daemons = [ machineName ];
+    };
+  } else {};
+
+  # Same as for cephMonConfig, but this time for OSDs.
+  cephOsdConfig = if (length cfg.osd.devices) > 0 then {
+    osd = {
+      enable = true;
+      daemons = map (el: "${toString el.id}") cfg.osd.devices;
+    };
+    rgw = {
+      enable = true;
+      daemons = [ "rook-k0.rgw.${machineName}" ];
+    };
+  } else {};
+
+
+  # Merge ceph-volume lvm activate into ceph-osd-ID services.
+  #
+  # This is because the upstream module seems to have been written with
+  # filestore in mind, not bluestore. Filestore is relatively simple: an xfs
+  # filesystem is mounted into /var/lib/ceph/osd/$cluster-$id, that in turn
+  # contains everything for that OSD to work. 
+  #
+  # Bluestore is a bit different. Instead of a normal filesystem being mounted,
+  # Ceph manages a block device fully using LVM (and in our case, dmcrypt).
+  # Every bluestore volume needs to be 'activated' before it can be used by an
+  # OSD. Activation takes care of doing LVM and dmcrypt mounts, and prepares
+  # the /var/lib/ceph/osd/$cluster-$id directory as if a filestore was present
+  # there. However, instead of this being a diskmount, it's instead a tmpfs
+  # into which a bunch of files are dropped, loaded from the LVM raw device.
+  #
+  # To make the upstream NixOS module OSD work with bluestore, we do the following:
+  #  1. Change ConditionPathExists from the OSD mount into a /dev/disk/by-id
+  #     path. This gates the service on that device being present.
+  #  2. Inject an ExecStartPre which runs ceph-volume lvm activate, if needed.
+  #  3. Add lvm/cryptsetup to the PATH of the service (as used by ceph-volume,
+  #     which seems to look for them on PATH instead of being properly
+  #     nixified).
+  #
+  # We also inject smartmontools into PATH for smartctl, which allows the OSD
+  # to monitor device health.
+  osdActivateServices = listToAttrs (map (el: let
+      osdId = toString el.id;
+      osdUuid = el.uuid;
+      diskPath = el.path;
+    in {
+    name = "ceph-osd-${osdId}";
+    value = {
+      path = with pkgs; [
+        lvm2
+        cryptsetup
+        smartmontools
+      ];
+      serviceConfig = {
+        ExecStartPre = lib.mkForce [
+          ("+" + (toString (pkgs.writeScript "ceph-osd-${osdId}-activate.sh" ''
+            #!/bin/sh
+            set -e
+            dir="/var/lib/ceph/osd/${cfg.name}-${osdId}/"
+            disk="${el.path}"
+            uuid="${osdUuid}"
+            if [ -d "$dir" ] && [ -f "$dir"/keyring ]; then
+              echo "Volume $dir already activated, skipping..."
+            else
+              echo "Activating $dir with $disk, uuid $uuid..."
+              ${pkgs.ceph}/bin/ceph-volume lvm activate --bluestore --no-systemd ${osdId} $uuid
+            fi
+
+          '')))
+
+          "${pkgs.ceph.lib}/libexec/ceph/ceph-osd-prestart.sh --id ${osdId} --cluster ${cfg.name}"
+        ];
+      };
+      unitConfig = {
+        ConditionPathExists = lib.mkForce el.path;
+      };
+    };
+  }) cfg.osd.devices);
+
+in rec {
+  options = {
+    hscloud.ceph = {
+      enable = mkOption {
+        type = types.bool;
+        description = "Enable Ceph storage cluster (native NixOS), not rook.";
+        default = ((length cfg.osd.devices) > 0) || cfg.control.enable;
+      };
+      name = mkOption {
+        type = types.str;
+        description = "Short identifier of cluster.";
+      };
+      fsid = mkOption {
+        type = types.str;
+        description = "UUID of cluster, as generated by first mon.";
+      };
+      control = {
+        enable = mkEnableOption "mon and mgr on this host";
+      };
+      osd = {
+        devices = mkOption {
+          type = types.listOf (types.submodule {
+            options = {
+              id = mkOption {
+                description = "Numeric ID of OSD.";
+                type = types.int;
+              };
+              path = mkOption {
+                description = "Path to underlying block device for OSD storage.";
+                type = types.str;
+              };
+              uuid = mkOption {
+                description = "UUID of generated OSD storage.";
+                type = types.str;
+              };
+            };
+          });
+          default = [];
+        };
+      };
+    };
+  };
+  config = mkIf cfg.enable {
+    services.ceph = {
+      enable = cfg.control.enable || (length cfg.osd.devices) > 0;
+      global = {
+        fsid = cfg.fsid;
+        clusterName = cfg.name;
+
+        # Every Ceph node always attempts to connect to all mons.
+        monHost = concatStringsSep "," (map (n: n.config.hscloud.base.ipAddr) monNodes);
+        monInitialMembers = concatStringsSep "," (map (n: n.config.networking.hostName) monNodes);
+      };
+    } // cephMonConfig // cephOsdConfig;
+  
+    environment.systemPackages = with pkgs; [
+      ceph cryptsetup smartmontools
+    ];
+  
+    systemd.services = osdActivateServices;
+  
+    # Hack - the upstream ceph module should generate ${clusterName}.conf instead
+    # of ceph.conf, let's just symlink it.
+    environment.etc."ceph/${cfg.name}.conf".source = "/etc/ceph/ceph.conf";
+  };
+}
diff --git a/cluster/machines/modules/containerd.toml b/cluster/machines/modules/containerd.toml
new file mode 100644
index 0000000..b079637
--- /dev/null
+++ b/cluster/machines/modules/containerd.toml
@@ -0,0 +1,134 @@
+version = 2
+root = "/var/lib/containerd"
+state = "/run/containerd"
+plugin_dir = ""
+disabled_plugins = []
+required_plugins = []
+oom_score = 0
+
+[grpc]
+  address = "/run/containerd/containerd.sock"
+  tcp_address = ""
+  tcp_tls_cert = ""
+  tcp_tls_key = ""
+  uid = 0
+  gid = 0
+  max_recv_message_size = 16777216
+  max_send_message_size = 16777216
+
+[ttrpc]
+  address = ""
+  uid = 0
+  gid = 0
+
+[debug]
+  address = ""
+  uid = 0
+  gid = 0
+  level = ""
+
+[metrics]
+  address = ""
+  grpc_histogram = false
+
+[cgroup]
+  path = ""
+
+[timeouts]
+  "io.containerd.timeout.shim.cleanup" = "5s"
+  "io.containerd.timeout.shim.load" = "5s"
+  "io.containerd.timeout.shim.shutdown" = "3s"
+  "io.containerd.timeout.task.state" = "2s"
+
+[plugins]
+  [plugins."io.containerd.gc.v1.scheduler"]
+    pause_threshold = 0.02
+    deletion_threshold = 0
+    mutation_threshold = 100
+    schedule_delay = "0s"
+    startup_delay = "100ms"
+  [plugins."io.containerd.grpc.v1.cri"]
+    disable_tcp_service = true
+    stream_server_address = "127.0.0.1"
+    stream_server_port = "0"
+    stream_idle_timeout = "4h0m0s"
+    enable_selinux = false
+    selinux_category_range = 1024
+    sandbox_image = "k8s.gcr.io/pause:3.2"
+    stats_collect_period = 10
+    systemd_cgroup = false
+    enable_tls_streaming = false
+    max_container_log_line_size = 16384
+    disable_cgroup = false
+    disable_apparmor = false
+    restrict_oom_score_adj = false
+    max_concurrent_downloads = 3
+    disable_proc_mount = false
+    unset_seccomp_profile = ""
+    tolerate_missing_hugetlb_controller = true
+    disable_hugetlb_controller = true
+    ignore_image_defined_volumes = false
+    [plugins."io.containerd.grpc.v1.cri".containerd]
+      snapshotter = "overlayfs"
+      default_runtime_name = "runc"
+      no_pivot = false
+      disable_snapshot_annotations = true
+      discard_unpacked_layers = false
+      [plugins."io.containerd.grpc.v1.cri".containerd.default_runtime]
+        runtime_type = ""
+        runtime_engine = ""
+        runtime_root = ""
+        privileged_without_host_devices = false
+        base_runtime_spec = ""
+      [plugins."io.containerd.grpc.v1.cri".containerd.untrusted_workload_runtime]
+        runtime_type = ""
+        runtime_engine = ""
+        runtime_root = ""
+        privileged_without_host_devices = false
+        base_runtime_spec = ""
+      [plugins."io.containerd.grpc.v1.cri".containerd.runtimes]
+        [plugins."io.containerd.grpc.v1.cri".containerd.runtimes.runc]
+          runtime_type = "io.containerd.runc.v2"
+          runtime_engine = ""
+          runtime_root = ""
+          privileged_without_host_devices = false
+          base_runtime_spec = ""
+          [plugins."io.containerd.grpc.v1.cri".containerd.runtimes.runc.options]
+            SystemdCgroup = true
+    [plugins."io.containerd.grpc.v1.cri".cni]
+      bin_dir = "/opt/cni/bin"
+      conf_dir = "/opt/cni/conf"
+      max_conf_num = 1
+      conf_template = ""
+    [plugins."io.containerd.grpc.v1.cri".registry]
+      [plugins."io.containerd.grpc.v1.cri".registry.mirrors]
+        [plugins."io.containerd.grpc.v1.cri".registry.mirrors."docker.io"]
+          endpoint = ["https://registry-1.docker.io"]
+    [plugins."io.containerd.grpc.v1.cri".image_decryption]
+      key_model = ""
+    [plugins."io.containerd.grpc.v1.cri".x509_key_pair_streaming]
+      tls_cert_file = ""
+      tls_key_file = ""
+  [plugins."io.containerd.internal.v1.opt"]
+    path = "/opt/containerd"
+  [plugins."io.containerd.internal.v1.restart"]
+    interval = "10s"
+  [plugins."io.containerd.metadata.v1.bolt"]
+    content_sharing_policy = "shared"
+  [plugins."io.containerd.monitor.v1.cgroups"]
+    no_prometheus = false
+  [plugins."io.containerd.runtime.v1.linux"]
+    shim = "containerd-shim"
+    runtime = "runc"
+    runtime_root = ""
+    no_shim = false
+    shim_debug = false
+  [plugins."io.containerd.runtime.v2.task"]
+    platforms = ["linux/amd64"]
+  [plugins."io.containerd.service.v1.diff-service"]
+    default = ["walking"]
+  [plugins."io.containerd.snapshotter.v1.devmapper"]
+    root_path = ""
+    pool_name = ""
+    base_image_size = ""
+    async_remove = false
diff --git a/cluster/machines/modules/kube-common.nix b/cluster/machines/modules/kube-common.nix
new file mode 100644
index 0000000..6707efa
--- /dev/null
+++ b/cluster/machines/modules/kube-common.nix
@@ -0,0 +1,94 @@
+{ config, pkgs, lib, machines, ... }:
+
+with lib;
+
+let
+  cfg = config.hscloud.kube;
+  fqdn = config.hscloud.base.fqdn;
+
+in {
+  options.hscloud.kube = {
+    package = mkOption {
+      description = "Kubernetes package to use for everything but kubelet.";
+      type = types.package;
+      default = (import (fetchGit {
+        # Now at 1.16.5
+        name = "nixos-unstable-2020-01-22";
+        url = https://github.com/nixos/nixpkgs-channels/;
+        rev = "a96ed5d70427bdc2fbb9e805784e1b9621157a98";
+      }) {}).kubernetes;
+      defaultText = "pkgs.kubernetes";
+    };
+    packageKubelet = mkOption {
+      description = "Kubernetes package to use for kubelet.";
+      type = types.package;
+      default = cfg.package;
+      defaultText = "pkgs.kubernetes";
+    };
+    portAPIServerSecure = mkOption {
+      type = types.int;
+      description = "Port at which k8s apiserver will listen.";
+      default = 4001;
+    };
+    pki = let
+      mk = (radix: name: rec {
+        ca = ./../../certs + "/ca-${radix}.crt";
+        cert = ./../../certs + "/${radix}-${name}.cert";
+        key = ./../../secrets/plain + "/${radix}-${name}.key";
+      });
+      mkKube = (name: (mk "kube" name) // {
+        config = {
+          server = "https://k0.hswaw.net:${toString cfg.portAPIServerSecure}";
+          certFile = (mk "kube" name).cert;
+          keyFile = (mk "kube" name).key;
+        };
+      });
+    in mkOption {
+      type = types.attrs;
+      default = {
+        kube = rec {
+          ca = apiserver.ca;
+          
+          # Used to identify apiserver.
+          apiserver = mkKube "apiserver";
+
+          # Used to identify controller-manager.
+          controllermanager = mkKube "controllermanager";
+
+          # Used to identify scheduler.
+          scheduler = mkKube "scheduler";
+
+          # Used to encrypt service accounts.
+          serviceaccounts = mkKube "serviceaccounts";
+
+          # Used to identify kube-proxy.
+          proxy = mkKube "proxy";
+
+          # Used to identify kubelet.
+          kubelet = mkKube "kubelet-${fqdn}";
+        };
+
+        kubeFront = {
+          apiserver = mk "kubefront" "apiserver";
+        };
+
+        etcd = {
+          peer = mk "etcdpeer" fqdn;
+          server = mk "etcd" fqdn;
+          kube = mk "etcd" "kube";
+        };
+      };
+    };
+  };
+
+  config = {
+    services.kubernetes = {
+      # We do not use any nixpkgs predefined roles for k8s. Instead, we enable
+      # k8s components manually.
+      roles = [];
+      caFile = cfg.pki.kube.apiserver.ca;
+      clusterCidr = "10.10.16.0/20";
+      addons.dns.enable = false;
+    };
+  };
+}
diff --git a/cluster/machines/modules/kube-controlplane.nix b/cluster/machines/modules/kube-controlplane.nix
new file mode 100644
index 0000000..8efda58
--- /dev/null
+++ b/cluster/machines/modules/kube-controlplane.nix
@@ -0,0 +1,178 @@
+{ config, pkgs, lib, machines, ... }:
+
+with lib;
+
+let
+  cfg = config.hscloud.kube.control;
+
+  # All control plane nodes.
+  allNodes = let
+    list = mapAttrsToList (_: v: v) machines;
+    filtered = filter (m: (m.config ? hscloud.kube.control) && (m.config.hscloud.kube.control.enable)) list;
+    sorted = sort (a: b: a.config.hscloud.base.fqdn < b.config.hscloud.base.fqdn) filtered;
+  in sorted;
+
+  # All control plane nodes that aren't the node being evaluated.
+  otherNodes = (filter (m: m.config.networking.hostName != config.networking.hostName) allNodes);
+
+  fqdn = config.hscloud.base.fqdn;
+
+  pki = config.hscloud.kube.pki;
+
+in {
+  imports = [
+    ./kube-common.nix
+  ];
+
+  options.hscloud.kube.control = {
+    enable = mkEnableOption "kubernetes control plane";
+    portControllerManagerSecure = mkOption {
+      type = types.int;
+      description = "Port at which k8s controller-manager will listen.";
+      default = 4003;
+    };
+    portSchedulerSecure = mkOption {
+      type = types.int;
+      description = "Port at which k8s scheduler will listen.";
+      default = 4005;
+    };
+  };
+
+  config = mkIf cfg.enable {
+    networking.firewall.enable = false;
+
+    # Point k8s apiserver address at ourselves, as we _are_ the apiserver.
+    networking.extraHosts = ''
+      127.0.0.1 k0.hswaw.net
+    '';
+
+    services.etcd = rec {
+      enable = true;
+      name = fqdn;
+      listenClientUrls = ["https://0.0.0.0:2379"];
+      advertiseClientUrls = ["https://${fqdn}:2379"];
+      listenPeerUrls = ["https://0.0.0.0:2380"];
+      initialAdvertisePeerUrls = ["https://${fqdn}:2380"];
+      initialCluster = (map (n: "${n.config.hscloud.base.fqdn}=https://${n.config.hscloud.base.fqdn}:2380") allNodes);
+      initialClusterState = "existing";
+
+      clientCertAuth = true;
+      trustedCaFile = pki.etcd.server.ca;
+      certFile = pki.etcd.server.cert;
+      keyFile = pki.etcd.server.key;
+
+      peerClientCertAuth = true;
+      peerTrustedCaFile = pki.etcd.peer.ca;
+      peerCertFile = pki.etcd.peer.cert;
+      peerKeyFile = pki.etcd.peer.key;
+
+      extraConf = {
+        PEER_CLIENT_CERT_AUTH = "true";
+      };
+    };
+
+    # https://github.com/NixOS/nixpkgs/issues/60687
+    systemd.services.kube-control-plane-online = {
+      preStart = pkgs.lib.mkForce "";
+    };
+
+    services.kubernetes = {
+      package = config.hscloud.kube.package;
+      # We do not use any nixpkgs predefined roles for k8s. Instead, we enable
+      # k8s components manually.
+      roles = [];
+      addons.dns.enable = false;
+      caFile = pki.kube.apiserver.ca;
+      clusterCidr = "10.10.16.0/20";
+
+      apiserver = rec {
+        enable = true;
+        # BUG: should be 0.
+        insecurePort = 4000;
+        securePort = config.hscloud.kube.portAPIServerSecure;
+        advertiseAddress = config.hscloud.base.ipAddr;
+
+        etcd = {
+          # Only point at our own etcd.
+          servers = [ "https://${fqdn}:2379" ];
+          caFile = pki.etcd.kube.ca;
+          keyFile = pki.etcd.kube.key;
+          certFile = pki.etcd.kube.cert;
+        };
+
+        tlsCertFile = pki.kube.apiserver.cert;
+        tlsKeyFile = pki.kube.apiserver.key;
+        clientCaFile = pki.kube.apiserver.ca;
+
+        kubeletHttps = true;
+        # Same CA as main APIServer CA.
+        kubeletClientCaFile = pki.kube.apiserver.ca;
+        kubeletClientCertFile = pki.kube.apiserver.cert;
+        kubeletClientKeyFile = pki.kube.apiserver.key;
+
+        serviceAccountKeyFile = pki.kube.serviceaccounts.key;
+
+        allowPrivileged = true;
+        serviceClusterIpRange = "10.10.12.0/24";
+        runtimeConfig = "api/all,authentication.k8s.io/v1beta1";
+        authorizationMode = [
+          "Node" "RBAC"
+        ];
+        enableAdmissionPlugins = [
+          "NamespaceLifecycle" "NodeRestriction" "LimitRanger" "ServiceAccount"
+          "DefaultStorageClass" "ResourceQuota" "PodSecurityPolicy"
+        ];
+        extraOpts = ''
+          --apiserver-count=5 \
+          --proxy-client-cert-file=${pki.kubeFront.apiserver.cert} \
+          --proxy-client-key-file=${pki.kubeFront.apiserver.key} \
+          --requestheader-allowed-names= \
+          --requestheader-client-ca-file=${pki.kubeFront.apiserver.ca} \
+          --requestheader-extra-headers-prefix=X-Remote-Extra- \
+          --requestheader-group-headers=X-Remote-Group  \
+          --requestheader-username-headers=X-Remote-User \
+          -v=5
+        '';
+      };
+
+      controllerManager = let
+        top = config.services.kubernetes;
+        kubeconfig = top.lib.mkKubeConfig "controller-manager" pki.kube.controllermanager.config;
+      in {
+        enable = true;
+        bindAddress = "0.0.0.0";
+        insecurePort = 0;
+        leaderElect = true;
+        serviceAccountKeyFile = pki.kube.serviceaccounts.key;
+        rootCaFile = pki.kube.ca;
+        extraOpts = ''
+          --service-cluster-ip-range=10.10.12.0/24 \
+          --use-service-account-credentials=true \
+          --secure-port=${toString cfg.portControllerManagerSecure}\
+          --authentication-kubeconfig=${kubeconfig}\
+          --authorization-kubeconfig=${kubeconfig}\
+        '';
+        kubeconfig = pki.kube.controllermanager.config;
+      };
+
+      scheduler = let
+        top = config.services.kubernetes;
+        # BUG: this should be scheduler
+        # TODO(q3k): change after big nix change
+        kubeconfig = top.lib.mkKubeConfig "scheduler" pki.kube.controllermanager.config;
+      in {
+        enable = true;
+        address = "0.0.0.0";
+        port = 0;
+        leaderElect = true;
+        kubeconfig = pki.kube.scheduler.config;
+        extraOpts = ''
+          --secure-port=${toString cfg.portSchedulerSecure}\
+          --authentication-kubeconfig=${kubeconfig}\
+          --authorization-kubeconfig=${kubeconfig}\
+        '';
+      };
+    };
+  };
+}
+
diff --git a/cluster/machines/modules/kube-dataplane.nix b/cluster/machines/modules/kube-dataplane.nix
new file mode 100644
index 0000000..f38ad84
--- /dev/null
+++ b/cluster/machines/modules/kube-dataplane.nix
@@ -0,0 +1,96 @@
+{ config, pkgs, lib, machines, ... }:
+
+with lib;
+
+let
+  # Pin for kubelet and proxy.
+  # NOTE(review): this binding appears unused in this module (the kubelet
+  # package comes from config.hscloud.kube.packageKubelet below). It is lazy,
+  # so the fetchGit never actually runs — consider removing it or wiring it
+  # into packageKubelet's default.
+  k8spkgs = import (fetchGit {
+    # Now at 1.16.5
+    name = "nixos-unstable-2020-01-22";
+    url = https://github.com/nixos/nixpkgs-channels/;
+    rev = "a96ed5d70427bdc2fbb9e805784e1b9621157a98";
+  }) {};
+
+  cfg = config.hscloud.kube.data;
+
+  # All control plane nodes, sorted by FQDN so every node renders the same,
+  # stable list regardless of the machines attrset iteration order.
+  controlNodes = let
+    list = mapAttrsToList (_: v: v) machines;
+    filtered = filter (m: (m.config ? hscloud.kube.control) && (m.config.hscloud.kube.control.enable)) list;
+    sorted = sort (a: b: a.config.hscloud.base.fqdn < b.config.hscloud.base.fqdn) filtered;
+  in sorted;
+
+  fqdn = config.hscloud.base.fqdn;
+
+  pki = config.hscloud.kube.pki;
+
+in {
+  options.hscloud.kube.data = {
+    enable = mkEnableOption "kubernetes data plane";
+    podNet = mkOption {
+      type = types.str;
+      description = "Subnet in which this node will run pods. Must be exclusive with podNets of other nodes.";
+    };
+  };
+
+  # Disable kubelet service and bring in our own override.
+  # Also nuke flannel from the orbit.
+  disabledModules = [
+    "services/cluster/kubernetes/kubelet.nix"
+    "services/cluster/kubernetes/flannel.nix"
+  ];
+
+  imports = [
+    ./kubelet.nix
+    ./kube-common.nix
+  ];
+
+
+  config = mkIf cfg.enable {
+    # If we're not running the control plane, render a hostsfile that points at
+    # all other control plane nodes. Otherwise, the control plane module will
+    # make this hostsfile contain the node itself.
+    # NOTE(review): this previously rendered "${mgmtIf} ${fqdn}", but mgmtIf
+    # is an interface name (eg. "eno1"), and a hosts(5) entry must start with
+    # an address — use the node's IP address instead.
+    networking.extraHosts = mkIf (!config.hscloud.kube.control.enable) (concatStringsSep "\n" (map
+      (n: ''
+        ${n.config.hscloud.base.ipAddr} ${n.config.hscloud.base.fqdn}
+      '')
+    controlNodes));
+
+    # this seems to depend on flannel
+    # TODO(q3k): file issue
+    systemd.services.kubelet-online = {
+      script = pkgs.lib.mkForce "sleep 1";
+    };
+
+    services.kubernetes = {
+      # The kubelet wants to mkfs.ext4 when mounting pvcs.
+      path = [ pkgs.e2fsprogs ];
+
+      proxy = {
+        enable = true;
+        kubeconfig = pki.kube.proxy.config;
+        extraOpts = ''
+          --hostname-override=${fqdn}\
+          --proxy-mode=iptables
+        '';
+      };
+
+      kubelet = {
+        enable = true;
+        unschedulable = false;
+        hostname = fqdn;
+        tlsCertFile = pki.kube.kubelet.cert;
+        tlsKeyFile = pki.kube.kubelet.key;
+        clientCaFile = pki.kube.kubelet.ca;
+        nodeIp = config.hscloud.base.ipAddr;
+        networkPlugin = "cni";
+        clusterDns = "10.10.12.254";
+        kubeconfig = pki.kube.kubelet.config;
+        extraOpts = ''
+          --read-only-port=0
+        '';
+        package = config.hscloud.kube.packageKubelet;
+      };
+    };
+  };
+}
diff --git a/cluster/machines/modules/kubelet.nix b/cluster/machines/modules/kubelet.nix
new file mode 100644
index 0000000..1a71b48
--- /dev/null
+++ b/cluster/machines/modules/kubelet.nix
@@ -0,0 +1,348 @@
+# Same as upstream kubelet.nix module from nixpkgs, but with the following
+# changes:
+#   - cni tunables nuked and replaced with static host dirs, so that calico
+#     running on k8s can drop CNI plugins there itself
+#   - package configurable separately from rest of kubernetes
+
+{ config, lib, pkgs, ... }:
+
+with lib;
+
+let
+  top = config.services.kubernetes;
+  cfg = top.kubelet;
+
+  # The 'pause' sandbox/infra image, built locally and seeded into containerd
+  # by the kubelet service's preStart (see seedDockerImages below).
+  infraContainer = pkgs.dockerTools.buildImage {
+    name = "pause";
+    tag = "latest";
+    contents = top.package.pause;
+    config.Cmd = ["/bin/pause"];
+  };
+
+  # Kubeconfig file passed to the kubelet via --kubeconfig.
+  kubeconfig = top.lib.mkKubeConfig "kubelet" cfg.kubeconfig;
+
+  # Relative path under /etc where static pod manifests are rendered.
+  manifestPath = "kubernetes/manifests";
+
+  # Submodule type for a single node taint; the attr name becomes the taint
+  # key by default.
+  taintOptions = with lib.types; { name, ... }: {
+    options = {
+      key = mkOption {
+        description = "Key of taint.";
+        default = name;
+        type = str;
+      };
+      value = mkOption {
+        description = "Value of taint.";
+        type = str;
+      };
+      effect = mkOption {
+        description = "Effect of taint.";
+        example = "NoSchedule";
+        type = enum ["NoSchedule" "PreferNoSchedule" "NoExecute"];
+      };
+    };
+  };
+
+  # cfg.taints rendered into the "key=value:effect,..." format expected by
+  # kubelet's --register-with-taints flag.
+  taints = concatMapStringsSep "," (v: "${v.key}=${v.value}:${v.effect}") (mapAttrsToList (n: v: v) cfg.taints);
+in
+{
+  # services/cluster/kubernetes/default.nix still wants to poke flannel,
+  # but since we nuke that module we have to add a fake tunable for it.
+  # This stub option is never read by anything in this file; it only exists
+  # so that upstream's reference to it still evaluates.
+  options.services.kubernetes.flannel = {
+    enable = mkEnableOption "enable flannel networking";
+  };
+
+  ###### interface
+  options.services.kubernetes.kubelet = with lib.types; {
+
+    address = mkOption {
+      description = "Kubernetes kubelet info server listening address.";
+      default = "0.0.0.0";
+      type = str;
+    };
+
+    clusterDns = mkOption {
+      description = "Use alternative DNS.";
+      default = "10.1.0.1";
+      type = str;
+    };
+
+    clusterDomain = mkOption {
+      description = "Use alternative domain.";
+      default = config.services.kubernetes.addons.dns.clusterDomain;
+      type = str;
+    };
+
+    clientCaFile = mkOption {
+      description = "Kubernetes apiserver CA file for client authentication.";
+      default = top.caFile;
+      type = nullOr path;
+    };
+
+    enable = mkEnableOption "Kubernetes kubelet.";
+
+    extraOpts = mkOption {
+      description = "Kubernetes kubelet extra command line options.";
+      default = "";
+      type = str;
+    };
+
+    featureGates = mkOption {
+      description = "List set of feature gates";
+      default = top.featureGates;
+      type = listOf str;
+    };
+
+    healthz = {
+      bind = mkOption {
+        description = "Kubernetes kubelet healthz listening address.";
+        default = "127.0.0.1";
+        type = str;
+      };
+
+      port = mkOption {
+        description = "Kubernetes kubelet healthz port.";
+        default = 10248;
+        type = int;
+      };
+    };
+
+    hostname = mkOption {
+      description = "Kubernetes kubelet hostname override.";
+      default = config.networking.hostName;
+      type = str;
+    };
+
+    kubeconfig = top.lib.mkKubeConfigOptions "Kubelet";
+
+    manifests = mkOption {
+      description = "List of manifests to bootstrap with kubelet (only pods can be created as manifest entry)";
+      type = attrsOf attrs;
+      default = {};
+    };
+
+    networkPlugin = mkOption {
+      description = "Network plugin to use by Kubernetes.";
+      type = nullOr (enum ["cni" "kubenet"]);
+      default = "kubenet";
+    };
+
+    nodeIp = mkOption {
+      description = "IP address of the node. If set, kubelet will use this IP address for the node.";
+      default = null;
+      type = nullOr str;
+    };
+
+    registerNode = mkOption {
+      description = "Whether to auto register kubelet with API server.";
+      default = true;
+      type = bool;
+    };
+
+    package = mkOption {
+      description = "Kubernetes package to use.";
+      type = types.package;
+      default = pkgs.kubernetes;
+      defaultText = "pkgs.kubernetes";
+    };
+
+    port = mkOption {
+      description = "Kubernetes kubelet info server listening port.";
+      default = 10250;
+      type = int;
+    };
+
+    seedDockerImages = mkOption {
+      description = "List of docker images to preload on system";
+      default = [];
+      type = listOf package;
+    };
+
+    taints = mkOption {
+      description = "Node taints (https://kubernetes.io/docs/concepts/configuration/assign-pod-node/).";
+      default = {};
+      type = attrsOf (submodule [ taintOptions ]);
+    };
+
+    tlsCertFile = mkOption {
+      description = "File containing x509 Certificate for HTTPS.";
+      default = null;
+      type = nullOr path;
+    };
+
+    tlsKeyFile = mkOption {
+      description = "File containing x509 private key matching tlsCertFile.";
+      default = null;
+      type = nullOr path;
+    };
+
+    unschedulable = mkOption {
+      description = "Whether to set node taint to unschedulable=true as it is the case of node that has only master role.";
+      default = false;
+      type = bool;
+    };
+
+    verbosity = mkOption {
+      description = ''
+        Optional glog verbosity level for logging statements. See
+        <link xlink:href="https://github.com/kubernetes/community/blob/master/contributors/devel/logging.md"/>
+      '';
+      default = null;
+      type = nullOr int;
+    };
+
+  };
+
+  ###### implementation
+  config = mkMerge [
+    (mkIf cfg.enable {
+      services.kubernetes.kubelet.seedDockerImages = [infraContainer];
+
+      # Drop crictl into administrative command line.
+      environment.systemPackages = with pkgs; [ cri-tools ];
+
+      # Force disable Docker.
+      virtualisation.docker.enable = false;
+
+      # TODO(q3k): move to unified cgroups (cgroup v2) once we upgrade to
+      # Kubelet 1.19.
+      systemd.enableUnifiedCgroupHierarchy = false;
+
+      # Run containerd service. This is exposes the CRI API that is consumed by
+      # crictl and Kubelet.
+      systemd.services.containerd = {
+        description = "containerd container runtime";
+        wantedBy = [ "kubernetes.target" ];
+        after = [ "network.target" ];
+        path = with pkgs; [ runc iptables ];
+        serviceConfig = {
+          Delegate = "yes";
+          KillMode = "process";
+          Restart = "always";
+          RestartSec = "5";
+          LimitNPROC = "infinity";
+          LimitCORE = "infinity";
+          # https://github.com/coreos/fedora-coreos-tracker/issues/329
+          LimitNOFILE = "1048576";
+          TasksMax = "infinity";
+          OOMScoreAdjust = "-999";
+
+          ExecStart = "${pkgs.containerd}/bin/containerd -c ${./containerd.toml}";
+        };
+      };
+
+      systemd.services.kubelet = {
+        description = "Kubernetes Kubelet Service";
+        wantedBy = [ "kubernetes.target" ];
+        after = [ "network.target" "containerd.service" "kube-apiserver.service" ];
+        path = with pkgs; [ gitMinimal openssh utillinux iproute ethtool thin-provisioning-tools iptables socat cri-tools containerd gzip ] ++ top.path;
+
+        # Mildly hacky - by moving over to OCI image build infrastructure in
+        # NixOS we should be able to get rid of the gunzip.
+        # TODO(q3k): figure this out, check if this is even being used by
+        # kubelet.
+        preStart = ''
+          ${concatMapStrings (img: ''
+            echo "Seeding OCI image: ${img}"
+            cp ${img} /tmp/image.tar.gz
+            rm -f /tmp/image.tar
+            gunzip /tmp/image.tar.gz
+            ctr -n=k8s.io images import /tmp/image.tar || true
+            rm /tmp/image.tar
+          '') cfg.seedDockerImages}
+        '';
+        serviceConfig = {
+          Slice = "kubernetes.slice";
+          CPUAccounting = true;
+          MemoryAccounting = true;
+          Restart = "on-failure";
+          RestartSec = "1000ms";
+          ExecStart = ''${cfg.package}/bin/kubelet \
+            --cgroup-driver=systemd \
+            --container-runtime=remote \
+            --container-runtime-endpoint=unix:///var/run/containerd/containerd.sock \
+            --address=${cfg.address} \
+            --authentication-token-webhook \
+            --authentication-token-webhook-cache-ttl="10s" \
+            --authorization-mode=Webhook \
+            ${optionalString (cfg.clientCaFile != null)
+              "--client-ca-file=${cfg.clientCaFile}"} \
+            ${optionalString (cfg.clusterDns != "")
+              "--cluster-dns=${cfg.clusterDns}"} \
+            ${optionalString (cfg.clusterDomain != "")
+              "--cluster-domain=${cfg.clusterDomain}"} \
+            --cni-conf-dir=/opt/cni/conf \
+            --cni-bin-dir=/opt/cni/bin \
+            ${optionalString (cfg.featureGates != [])
+              "--feature-gates=${concatMapStringsSep "," (feature: "${feature}=true") cfg.featureGates}"} \
+            --hairpin-mode=hairpin-veth \
+            --healthz-bind-address=${cfg.healthz.bind} \
+            --healthz-port=${toString cfg.healthz.port} \
+            --hostname-override=${cfg.hostname} \
+            --kubeconfig=${kubeconfig} \
+            ${optionalString (cfg.networkPlugin != null)
+              "--network-plugin=${cfg.networkPlugin}"} \
+            ${optionalString (cfg.nodeIp != null)
+              "--node-ip=${cfg.nodeIp}"} \
+            --pod-infra-container-image=pause \
+            ${optionalString (cfg.manifests != {})
+              "--pod-manifest-path=/etc/${manifestPath}"} \
+            --port=${toString cfg.port} \
+            --register-node=${boolToString cfg.registerNode} \
+            ${optionalString (taints != "")
+              "--register-with-taints=${taints}"} \
+            --root-dir=${top.dataDir} \
+            ${optionalString (cfg.tlsCertFile != null)
+              "--tls-cert-file=${cfg.tlsCertFile}"} \
+            ${optionalString (cfg.tlsKeyFile != null)
+              "--tls-private-key-file=${cfg.tlsKeyFile}"} \
+            ${optionalString (cfg.verbosity != null) "--v=${toString cfg.verbosity}"} \
+            ${cfg.extraOpts}
+          '';
+          WorkingDirectory = top.dataDir;
+        };
+      };
+
+      boot.kernelModules = [ "br_netfilter" "overlay" ];
+      boot.kernel.sysctl."net.ipv4.ip_forward" = "1";
+
+      services.kubernetes.kubelet.hostname = with config.networking;
+        mkDefault (hostName + optionalString (domain != null) ".${domain}");
+
+      services.kubernetes.pki.certs = with top.lib; {
+        kubelet = mkCert {
+          name = "kubelet";
+          CN = top.kubelet.hostname;
+          action = "systemctl restart kubelet.service";
+
+        };
+        kubeletClient = mkCert {
+          name = "kubelet-client";
+          CN = "system:node:${top.kubelet.hostname}";
+          fields = {
+            O = "system:nodes";
+          };
+          action = "systemctl restart kubelet.service";
+        };
+      };
+
+      services.kubernetes.kubelet.kubeconfig.server = mkDefault top.apiserverAddress;
+    })
+
+    (mkIf (cfg.enable && cfg.manifests != {}) {
+      environment.etc = mapAttrs' (name: manifest:
+        nameValuePair "${manifestPath}/${name}.json" {
+          text = builtins.toJSON manifest;
+          mode = "0755";
+        }
+      ) cfg.manifests;
+    })
+
+    (mkIf (cfg.unschedulable && cfg.enable) {
+      services.kubernetes.kubelet.taints.unschedulable = {
+        value = "true";
+        effect = "NoSchedule";
+      };
+    })
+
+  ];
+}