diff --git a/cluster/machines/modules/base.nix b/cluster/machines/modules/base.nix
new file mode 100644
index 0000000..66335ef
--- /dev/null
+++ b/cluster/machines/modules/base.nix
@@ -0,0 +1,100 @@
+{ config, pkgs, lib, ... }:
+
+with lib;
+
+let
+  cfg = config.hscloud.base;
+
+in {
+  options.hscloud.base = {  # Per-node identity and main-network settings; only fqdn has a default.
+    fqdn = mkOption {
+      type = types.str;
+      description = "Node's FQDN.";
+      default = "${config.networking.hostName}.${config.networking.domain}";  # hostName and domain joined with a dot
+    };
+    mgmtIf = mkOption {  # used below as the networking.interfaces key that receives ipAddr
+      type = types.str;
+      description = "Main network interface. Called mgmtIf for legacy reasons.";
+    };
+    ipAddr = mkOption {  # assigned to mgmtIf in the config section below
+      type = types.str;
+      description = "IPv4 address on main network interface.";
+    };
+    ipAddrBits = mkOption {  # becomes the prefixLength of ipAddr
+      type = types.int;
+      description = "IPv4 CIDR mask bits.";
+    };
+    gw = mkOption {  # becomes networking.defaultGateway
+      type = types.str;
+      description = "IPv4 address of gateway.";
+    };
+  };
+  config = rec {
+    boot.loader.grub.enable = true;
+    boot.loader.grub.version = 2;
+  
+    fileSystems."/" =
+      { # device = ""; needs to be defined
+        fsType = "ext4";
+      };
+    swapDevices = [ ];
+  
+    boot.kernelPackages = pkgs.linuxPackages_latest;
+    boot.kernelParams = [ "boot.shell_on_fail" ];
+    boot.kernel.sysctl."net.ipv4.conf.all.rp_filter" = "0";
+    boot.kernel.sysctl."net.ipv4.conf.default.rp_filter" = "0";
+    boot.initrd.availableKernelModules = [ "uhci_hcd" "ehci_pci" "megaraid_sas" "usb_storage" "usbhid" "sd_mod" "sr_mod"  ];
+    boot.kernelModules = [ "kvm-intel" ];
+    boot.extraModulePackages = [];
+    hardware.enableRedistributableFirmware = true;
+  
+    time.timeZone = "Europe/Warsaw";
+  
+    environment.systemPackages = with pkgs; [
+      wget vim htop tcpdump
+      rxvt_unicode.terminfo
+    ];
+    programs.mtr.enable = true;
+  
+    # Static addressing only: DHCP is off, and the main interface gets the
+    # single IPv4 address and prefix length declared in hscloud.base.
+    # (This span is kept at its original line count so the hunk stays valid.)
+    networking.useDHCP = false;
+    networking.interfaces.${cfg.mgmtIf}.ipv4.addresses = [
+      { address = cfg.ipAddr; prefixLength = cfg.ipAddrBits; }
+    ];
+    # Default route goes through the configured gateway.
+    networking.defaultGateway = cfg.gw;
+    # One hard-coded DNS resolver.
+    networking.nameservers = [ "185.236.240.1" ];
+  
+    # Instead of using nixpkgs from the root/nixos channel, use pkgs pin from this file.
+    nix.nixPath = [ "nixpkgs=${pkgs.path}" "nixos-config=/etc/nixos/configuration.nix" ];
+  
+    # Otherwise fetchGit nixpkgs pin fails.
+    systemd.services.nixos-upgrade.path = [ pkgs.git ];
+  
+    # Use Chrony instead of systemd-timesyncd
+    services.chrony.enable = true;
+  
+    # Symlink lvm into /sbin/lvm on activation. This is needed by Rook OSD
+    # instances running on Kubernetes.
+    # See: https://github.com/rook/rook/commit/f3c4975e353e3ce3599c958ec6d2cae8ee8f6f61
+    system.activationScripts.sbinlvm =
+      ''
+        mkdir -m 0755 -p /sbin
+        ln -sfn ${pkgs.lvm2.bin}/bin/lvm /sbin/lvm
+      '';
+  
+    # Enable the OpenSSH daemon.
+    services.openssh.enable = true;
+    users.users.root.openssh.authorizedKeys.keys = [
+      "ssh-rsa AAAAB3NzaC1yc2EAAAADAQABAAABAQDD4VJXAXEHEXZk2dxNwehneuJcEGkfXG/U7z4fO79vDVIENdedtXQUyLyhZJc5RTEfHhQj66FwIqzl7mzBHd9x9PuDp6QAYXrkVNMj48s6JXqZqBvF6H/weRqFMf4a2TZv+hG8D0kpvmLheCwWAVRls7Jofnp/My+yDd57GMdsbG/yFEf6WPMiOnA7hxdSJSVihCsCSw2p8PD4GhBe8CVt7xIuinhutjm9zYBjV78NT8acjDUfJh0B1ODTjs7nuW1CC4jybSe2j/OU3Yczj4AxRxBNWuFxUq+jBo9BfpbKLh+Tt7re+zBkaicM77KM/oV6943JJxgHNBBOsv9scZE7 q3k@amnesia"
+      "ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIG599UildOrAq+LIOQjKqtGMwjgjIxozI1jtQQRKHtCP q3k@mimeomia"
+      "ssh-rsa AAAAB3NzaC1yc2EAAAADAQABAAABAQDQb3YQoiYFZLKwvHYKbu1bMqzNeDCAszQhAe1+QI5SLDOotclyY/vFmOReZOsmyMFl71G2d7d+FbYNusUnNNjTxRYQ021tVc+RkMdLJaORRURmQfEFEKbai6QSFTwErXzuoIzyEPK0lbsQuGgqT9WaVnRzHJ2Q/4+qQbxAS34PuR5NqEkmn4G6LMo3OyJ5mwPkCj9lsqz4BcxRaMWFO3mNcwGDfSW+sqgc3E8N6LKrTpZq3ke7xacpQmcG5DU9VO+2QVPdltl9jWbs3gXjmF92YRNOuKPVfAOZBBsp8JOznfx8s9wDgs7RwPmDpjIAJEyoABqW5hlXfqRbTnfnMvuR informatic@InformaticPC"
+      "ssh-rsa AAAAB3NzaC1yc2EAAAADAQABAAACAQDGkMgEVwQM8yeuFUYL2TwlJIq9yUNBmHnwce46zeL2PK2CkMz7sxT/om7sp/K5XDiqeD05Nioe+Dr3drP6B8uI33S5NgxPIfaqQsRS+CBEgk6cqFlcdlKETU/DT+/WsdoO173n7mgGeafPInEuQuGDUID0Fl099kIxtqfAhdeZFMM6/szAZEZsElLJ8K6dp1Ni/jmnXCZhjivZH3AZUlnqrmtDG7FY1bgcOfDXAal45LItughGPtrdiigXe9DK2fW3+9DBZZduh5DMJTNlphAZ+nfSrbyHVKUg6WsgMSprur4KdU47q1QwzqqvEj75JcdP1jOWoZi4F6VJDte9Wb9lhD1jGgjxY9O6Gs4CH35bx15W7CN9hgNa0C8NbPJe/fZYIeMZmJ1m7O2xmnYwP8j+t7RNJWu7Pa3Em4mOEXvhBF07Zfq+Ye/4SluoRgADy5eII2x5fFo5EBhInxK0/X8wF6XZvysalVifoCh7T4Edejoi91oAxFgYAxbboXGlod0eEHIi2hla8SM9+IBHOChmgawKBYp2kzAJyAmHNBF+Pah9G4arVCj/axp/SJZDZbJQoI7UT/fJzEtvlb5RWrHXRq+y6IvjpUq4pzpDWW04+9UMqEEXRmhWOakHfEVM9rN8h3aJBflLUBBnh0Z/hVsKNh8bCRHaKtah8TrD9i+wMw== patryk.jakuszew@gmail.com"
+      "ssh-rsa AAAAB3NzaC1yc2EAAAADAQABAAACAQC33naG1ptCvUcRWX9cj9wXM1nW1lyQC4SvMJzWlr9aMD96O8hQ2JMkuIUgUJvorAY02QRplQ2BuoVoVkdkzwjMyi1bL3OdgcKo7Z1yByClGTTocqNJYY0lcUb6EJH8+6e6F9ydrQlSxNzL1uCaA7phZr+yPcmAmWbSfioXn98yXNkE0emHxzJv/nypJY56sDCMC2IXDRd8L2goDtPwgPEW7bWfAQdIFMJ75xOidZOTxJ8eqyXLw/kxY5UlyX66jdoYz1sE5XUHuoQl1AOG9UdlMo0aMhUvP4pX5l7r7EnA9OttKMFB3oWqkVK/R6ynZ52YNOU5BZ9V+Ppaj34W0xNu+p0mbHcCtXYCTrf/OU0hcZDbDaNTjs6Vtcm2wYw9iAKX7Tex+eOMwUwlrlcyPNRV5BTot7lGNYfauHCSIuWJKN4NhCLR/NtVNh4/94eKkPTwJsY6XqDcS7q49wPAs4DAH7BJgsbHPOqygVHrY0YYEfz3Pj0HTxJHQMCP/hQX4fXEGt0BjgoVJbXPAQtPyeg0JuxiUg+b4CgVVfQ6R060MlM1BZzhmh+FY5MJH6nJppS0aHYCvSg8Z68NUlCPKy0jpcyfuAIWQWwSGG1O010WShQG2ELsvNdg5/4HVdCGNl5mmoom6JOd72FOZyQlHDFfeQUQRn9HOeCq/c51rK99SQ== bartek@IHM"
+      "ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAICTR292kx/2CNuWYIsZ6gykQ036aBGrmheIuZa6S1D2x implr@thonk"
+    ];
+  };
+}
diff --git a/cluster/machines/modules/ceph.nix b/cluster/machines/modules/ceph.nix
new file mode 100644
index 0000000..4f15bdd
--- /dev/null
+++ b/cluster/machines/modules/ceph.nix
@@ -0,0 +1,193 @@
+# This runs Ceph on hscloud cluster(s).
+#
+# This lightly wraps the upstream NixOS ceph module, which is already fairly light.
+#
+# Most importantly, it does _not_ attempt to do any cluster
+# bootstrapping/maintenance. This means, that any configuration action that
+# does the following:
+#  0. Bringing up a cluster
+#  1. Adding/removing Mons
+#  2. Changing a Mon IP address
+#  3. Adding/removing OSDs
+# ... must be done in tandem with manual operations on the affected nodes. For
+# example, bootstrapping a cluster will involve keychain and monmap management,
+# changing anything with mons will involve monmap management, adding new OSDs
+# will require provisioning them with ceph-volume, etc.
+#
+# This is in stark contrast to a fully-managed solution like rook. Since we
+# don't have hundreds of clusters, none of the above is automated, especially
+# as that kind of automation is quite tricky to do reliably.
+
+{ config, lib, pkgs, machines, ... }:
+
+with lib;
+
+let
+  cfg = config.hscloud.ceph;
+
+  allNodes = let
+    cephEnabled = m: (m.config ? hscloud.ceph) && m.config.hscloud.ceph.enable;
+    byFqdn = a: b: a.config.hscloud.base.fqdn < b.config.hscloud.base.fqdn;
+    candidates = filter cephEnabled (attrValues machines);
+  in sort byFqdn candidates;
+
+  monNodes = filter (m: m.config.hscloud.ceph.control.enable) allNodes;
+
+  machineName = config.networking.hostName;
+
+  # This NixOS Ceph option fragment is present on every machine that runs a
+  # mon, and basically tells the NixOS machinery to run mons/mgrs if needed on
+  # this machine.
+  cephMonConfig = optionalAttrs cfg.control.enable (
+    let
+      # Control nodes get exactly one mon and one mgr daemon, both named
+      # after this machine's hostName.
+      perHost = { enable = true; daemons = [ machineName ]; };
+    in {
+      mon = perHost;
+      mgr = perHost;
+    }
+  );
+
+  # Same as for cephMonConfig, but this time for OSDs.
+  cephOsdConfig = optionalAttrs ((length cfg.osd.devices) > 0) {
+    # One OSD daemon per configured device, named by its numeric id.
+    osd = {
+      enable = true;
+      daemons = map (dev: toString dev.id) cfg.osd.devices;
+    };
+    # The same condition also enables a single rgw daemon on this host.
+    rgw.enable = true;
+    rgw.daemons = [ "rook-k0.rgw.${machineName}" ];
+  };
+
+
+  # Merge ceph-volume lvm activate into ceph-osd-ID services.
+  #
+  # This is because the upstream module seems to have been written with
+  # filestore in mind, not bluestore. Filestore is relatively simple: an xfs
+  # filesystem is mounted into /var/lib/ceph/osd/$cluster-$id, which in turn
+  # contains everything for that OSD to work.
+  #
+  # Bluestore is a bit different. Instead of a normal filesystem being mounted,
+  # Ceph manages a block device fully using LVM (and in our case, dmcrypt).
+  # Every bluestore volume needs to be 'activated' before it can be used by an
+  # OSD. Activation takes care of doing LVM and dmcrypt mounts, and prepares
+  # the /var/lib/ceph/osd/$cluster-$id directory as if a filestore was present
+  # there. However, instead of this being a diskmount, it's instead a tmpfs
+  # into which a bunch of files are dropped, loaded from the LVM raw device.
+  #
+  # To make the upstream NixOS module OSD work with bluestore, we do the following:
+  #  1. Change ConditionPathExists from the OSD mount into a /dev/disk/by-id
+  #     path. This gates the service on that device being present.
+  #  2. Inject an ExecStartPre which runs ceph-volume lvm activate, if needed.
+  #  3. Add lvm/cryptsetup to the PATH of the service (as used by ceph-volume,
+  #     which seems to look for them on PATH instead of being properly
+  #     nixified).
+  #
+  # We also inject smartmontools into PATH for smartctl, which allows the OSD
+  # to monitor device health.
+  osdActivateServices = listToAttrs (map (el: let  # one systemd.services override per entry of cfg.osd.devices
+      osdId = toString el.id;
+      osdUuid = el.uuid;
+      diskPath = el.path;
+    in {
+    name = "ceph-osd-${osdId}";  # unit name this override is keyed by
+    value = {
+      path = with pkgs; [  # tools added to the unit's PATH (see the comment block above)
+        lvm2
+        cryptsetup
+        smartmontools
+      ];
+      serviceConfig = {
+        ExecStartPre = lib.mkForce [  # mkForce: this list replaces any other ExecStartPre definition
+          ("+" + (toString (pkgs.writeScript "ceph-osd-${osdId}-activate.sh" ''
+            #!/bin/sh
+            set -e
+            dir="/var/lib/ceph/osd/${cfg.name}-${osdId}/"
+            disk="${diskPath}"
+            uuid="${osdUuid}"
+            if [ -d "$dir" ] && [ -f "$dir"/keyring ]; then
+              echo "Volume $dir already activated, skipping..."
+            else
+              echo "Activating $dir with $disk, uuid $uuid..."
+              ${pkgs.ceph}/bin/ceph-volume lvm activate --bluestore --no-systemd ${osdId} $uuid
+            fi
+
+          '')))  # the leading "+" is systemd's prefix for running a command with full privileges
+
+          "${pkgs.ceph.lib}/libexec/ceph/ceph-osd-prestart.sh --id ${osdId} --cluster ${cfg.name}"
+        ];
+      };
+      unitConfig = {
+        ConditionPathExists = lib.mkForce diskPath;  # gate the unit on the backing device being present
+      };
+    };
+  }) cfg.osd.devices);
+
+in {
+  options = {
+    hscloud.ceph = {
+      enable = mkOption {
+        type = types.bool;
+        description = "Enable Ceph storage cluster (native NixOS), not rook.";
+        default = ((length cfg.osd.devices) > 0) || cfg.control.enable;  # on by default once the host has OSD devices or is a control node
+      };
+      name = mkOption {  # used as clusterName, in OSD directory names and in the conf symlink name
+        type = types.str;
+        description = "Short identifier of cluster.";
+      };
+      fsid = mkOption {  # passed through to services.ceph.global.fsid
+        type = types.str;
+        description = "UUID of cluster, as generated by first mon.";
+      };
+      control = {
+        enable = mkEnableOption "mon and mgr on this host";
+      };
+      osd = {
+        devices = mkOption {  # one entry per OSD hosted on this machine
+          type = types.listOf (types.submodule {
+            options = {
+              id = mkOption {
+                description = "Numeric ID of OSD.";
+                type = types.int;
+              };
+              path = mkOption {
+                description = "Path to underlying block device for OSD storage.";
+                type = types.str;
+              };
+              uuid = mkOption {
+                description = "UUID of generated OSD storage.";
+                type = types.str;
+              };
+            };
+          });
+          default = [];
+        };
+      };
+    };
+  };
+  config = mkIf cfg.enable {
+    services.ceph = {
+      enable = cfg.control.enable || (length cfg.osd.devices) > 0;
+      global = {
+        fsid = cfg.fsid;
+        clusterName = cfg.name;
+
+        # Every Ceph node always attempts to connect to all mons.
+        monHost = concatStringsSep "," (map (n: n.config.hscloud.base.ipAddr) monNodes);
+        monInitialMembers = concatStringsSep "," (map (n: n.config.networking.hostName) monNodes);
+      };
+    } // cephMonConfig // cephOsdConfig;  # adds mon/mgr and osd/rgw attributes when applicable
+
+    environment.systemPackages = with pkgs; [
+      ceph cryptsetup smartmontools
+    ];
+
+    systemd.services = osdActivateServices;  # per-OSD unit overrides built above
+
+    # Hack - the upstream ceph module should generate ${clusterName}.conf instead
+    # of ceph.conf, let's just symlink it.
+    environment.etc."ceph/${cfg.name}.conf".source = "/etc/ceph/ceph.conf";
+  };
+}
diff --git a/cluster/machines/modules/containerd.toml b/cluster/machines/modules/containerd.toml
new file mode 100644
index 0000000..b079637
--- /dev/null
+++ b/cluster/machines/modules/containerd.toml
@@ -0,0 +1,134 @@
+version = 2
+root = "/var/lib/containerd"
+state = "/run/containerd"
+plugin_dir = ""
+disabled_plugins = []
+required_plugins = []
+oom_score = 0
+
+[grpc]
+  address = "/run/containerd/containerd.sock"
+  tcp_address = ""
+  tcp_tls_cert = ""
+  tcp_tls_key = ""
+  uid = 0
+  gid = 0
+  max_recv_message_size = 16777216
+  max_send_message_size = 16777216
+
+[ttrpc]
+  address = ""
+  uid = 0
+  gid = 0
+
+[debug]
+  address = ""
+  uid = 0
+  gid = 0
+  level = ""
+
+[metrics]
+  address = ""
+  grpc_histogram = false
+
+[cgroup]
+  path = ""
+
+[timeouts]
+  "io.containerd.timeout.shim.cleanup" = "5s"
+  "io.containerd.timeout.shim.load" = "5s"
+  "io.containerd.timeout.shim.shutdown" = "3s"
+  "io.containerd.timeout.task.state" = "2s"
+
+[plugins]
+  [plugins."io.containerd.gc.v1.scheduler"]
+    pause_threshold = 0.02
+    deletion_threshold = 0
+    mutation_threshold = 100
+    schedule_delay = "0s"
+    startup_delay = "100ms"
+  [plugins."io.containerd.grpc.v1.cri"]
+    disable_tcp_service = true
+    stream_server_address = "127.0.0.1"
+    stream_server_port = "0"
+    stream_idle_timeout = "4h0m0s"
+    enable_selinux = false
+    selinux_category_range = 1024
+    sandbox_image = "k8s.gcr.io/pause:3.2"
+    stats_collect_period = 10
+    systemd_cgroup = false
+    enable_tls_streaming = false
+    max_container_log_line_size = 16384
+    disable_cgroup = false
+    disable_apparmor = false
+    restrict_oom_score_adj = false
+    max_concurrent_downloads = 3
+    disable_proc_mount = false
+    unset_seccomp_profile = ""
+    tolerate_missing_hugetlb_controller = true
+    disable_hugetlb_controller = true
+    ignore_image_defined_volumes = false
+    [plugins."io.containerd.grpc.v1.cri".containerd]
+      snapshotter = "overlayfs"
+      default_runtime_name = "runc"
+      no_pivot = false
+      disable_snapshot_annotations = true
+      discard_unpacked_layers = false
+      [plugins."io.containerd.grpc.v1.cri".containerd.default_runtime]
+        runtime_type = ""
+        runtime_engine = ""
+        runtime_root = ""
+        privileged_without_host_devices = false
+        base_runtime_spec = ""
+      [plugins."io.containerd.grpc.v1.cri".containerd.untrusted_workload_runtime]
+        runtime_type = ""
+        runtime_engine = ""
+        runtime_root = ""
+        privileged_without_host_devices = false
+        base_runtime_spec = ""
+      [plugins."io.containerd.grpc.v1.cri".containerd.runtimes]
+        [plugins."io.containerd.grpc.v1.cri".containerd.runtimes.runc]  # the runtime named by default_runtime_name = "runc"
+          runtime_type = "io.containerd.runc.v2"
+          runtime_engine = ""
+          runtime_root = ""
+          privileged_without_host_devices = false
+          base_runtime_spec = ""
+          [plugins."io.containerd.grpc.v1.cri".containerd.runtimes.runc.options]
+            SystemdCgroup = true  # NOTE(review): presumably has to match the kubelet's cgroup driver - confirm
+    [plugins."io.containerd.grpc.v1.cri".cni]
+      bin_dir = "/opt/cni/bin"  # fixed host path for CNI plugin binaries; presumably the static dir mentioned in kubelet.nix - confirm
+      conf_dir = "/opt/cni/conf"  # fixed host path for CNI network configuration files
+      max_conf_num = 1
+      conf_template = ""
+    [plugins."io.containerd.grpc.v1.cri".registry]
+      [plugins."io.containerd.grpc.v1.cri".registry.mirrors]
+        [plugins."io.containerd.grpc.v1.cri".registry.mirrors."docker.io"]
+          endpoint = ["https://registry-1.docker.io"]
+    [plugins."io.containerd.grpc.v1.cri".image_decryption]
+      key_model = ""
+    [plugins."io.containerd.grpc.v1.cri".x509_key_pair_streaming]
+      tls_cert_file = ""
+      tls_key_file = ""
+  [plugins."io.containerd.internal.v1.opt"]
+    path = "/opt/containerd"
+  [plugins."io.containerd.internal.v1.restart"]
+    interval = "10s"
+  [plugins."io.containerd.metadata.v1.bolt"]
+    content_sharing_policy = "shared"
+  [plugins."io.containerd.monitor.v1.cgroups"]
+    no_prometheus = false
+  [plugins."io.containerd.runtime.v1.linux"]
+    shim = "containerd-shim"
+    runtime = "runc"
+    runtime_root = ""
+    no_shim = false
+    shim_debug = false
+  [plugins."io.containerd.runtime.v2.task"]
+    platforms = ["linux/amd64"]
+  [plugins."io.containerd.service.v1.diff-service"]
+    default = ["walking"]
+  [plugins."io.containerd.snapshotter.v1.devmapper"]
+    root_path = ""
+    pool_name = ""
+    base_image_size = ""
+    async_remove = false
diff --git a/cluster/machines/modules/kube-common.nix b/cluster/machines/modules/kube-common.nix
new file mode 100644
index 0000000..6707efa
--- /dev/null
+++ b/cluster/machines/modules/kube-common.nix
@@ -0,0 +1,94 @@
+{ config, pkgs, lib, machines, ... }:
+
+with lib;
+
+let
+  cfg = config.hscloud.kube;
+  fqdn = config.hscloud.base.fqdn;
+
+in {
+  options.hscloud.kube = {
+    package = mkOption {
+      description = "Kubernetes package to use for everything but kubelet.";
+      type = types.package;
+      default = (import (fetchGit {  # kubernetes taken from a pinned nixpkgs checkout, not from the system pkgs
+        # Now at 1.16.5
+        name = "nixos-unstable-2020-01-22";
+        url = "https://github.com/nixos/nixpkgs-channels/";  # quoted: bare URL literals are deprecated syntax
+        rev = "a96ed5d70427bdc2fbb9e805784e1b9621157a98";
+      }) {}).kubernetes;
+      defaultText = "pkgs.kubernetes";
+    };
+    packageKubelet = mkOption {
+      description = "Kubernetes package to use for kubelet.";
+      type = types.package;
+      default = cfg.package;
+      defaultText = "pkgs.kubernetes";
+    };
+    portAPIServerSecure = mkOption {
+      type = types.int;
+      description = "Port at which k8s apiserver will listen.";
+      default = 4001;
+    };
+    pki = let
+      # File locations for one identity: CA cert, leaf cert and private key.
+      mk = radix: name: {
+        ca = ./../../certs + "/ca-${radix}.crt";
+        cert = ./../../certs + "/${radix}-${name}.cert";
+        key = ./../../secrets/plain + "/${radix}-${name}.key";
+      };
+      # A "kube" identity plus the `config` attrset (server URL, cert, key)
+      # that components pass on as their kubeconfig.
+      mkKube = name: let files = mk "kube" name; in files // {
+        config = {
+          server = "https://k0.hswaw.net:${toString cfg.portAPIServerSecure}";
+          certFile = files.cert;
+          keyFile = files.key;
+        };
+      };
+    in mkOption {
+      type = types.attrs;
+      default = {
+        kube = let
+          apiserver = mkKube "apiserver";
+        in {
+          # The cluster CA is the one the apiserver identity points at.
+          ca = apiserver.ca;
+
+          # One identity per Kubernetes component.
+          inherit apiserver;
+          controllermanager = mkKube "controllermanager";
+          scheduler = mkKube "scheduler";
+          proxy = mkKube "proxy";
+          # The kubelet identity is per-node, keyed by FQDN.
+          kubelet = mkKube "kubelet-${fqdn}";
+
+          # Key material for service accounts.
+          serviceaccounts = mkKube "serviceaccounts";
+        };
+
+        kubeFront = {
+          apiserver = mk "kubefront" "apiserver";
+        };
+
+        # etcd peer/server identities are per-node; `kube` is the identity the
+        # apiserver presents to etcd.
+        etcd = {
+          peer = mk "etcdpeer" fqdn;
+          server = mk "etcd" fqdn;
+          kube = mk "etcd" "kube";
+        };
+      };
+    };
+  };
+
+  config = {
+    services.kubernetes = {
+      # We do not use any nixpkgs predefined roles for k8s. Instead, we enable
+      # k8s components manually.
+      roles = [];
+      caFile = cfg.pki.kube.apiserver.ca;
+      clusterCidr = "10.10.16.0/20";
+      addons.dns.enable = false;
+    };
+  };
+}
diff --git a/cluster/machines/modules/kube-controlplane.nix b/cluster/machines/modules/kube-controlplane.nix
new file mode 100644
index 0000000..8efda58
--- /dev/null
+++ b/cluster/machines/modules/kube-controlplane.nix
@@ -0,0 +1,178 @@
+{ config, pkgs, lib, machines, ... }:
+
+with lib;
+
+let
+  cfg = config.hscloud.kube.control;
+
+  # All control plane nodes.
+  allNodes = let
+    isControl = m: (m.config ? hscloud.kube.control) && m.config.hscloud.kube.control.enable;
+    byFqdn = a: b: a.config.hscloud.base.fqdn < b.config.hscloud.base.fqdn;
+    candidates = filter isControl (attrValues machines);
+  in sort byFqdn candidates;
+
+  # All control plane nodes other than the one being evaluated (matched by hostName). NOTE(review): not referenced elsewhere in this module.
+  otherNodes = (filter (m: m.config.networking.hostName != config.networking.hostName) allNodes);
+
+  fqdn = config.hscloud.base.fqdn;
+
+  pki = config.hscloud.kube.pki;
+
+in {
+  imports = [
+    ./kube-common.nix
+  ];
+
+  options.hscloud.kube.control = {
+    enable = mkEnableOption "kubernetes control plane";
+    portControllerManagerSecure = mkOption {
+      type = types.int;
+      description = "Port at which k8s controller-manager will listen.";
+      default = 4003;
+    };
+    portSchedulerSecure = mkOption {
+      type = types.int;
+      description = "Port at which k8s scheduler will listen.";
+      default = 4005;
+    };
+  };
+
+  config = mkIf cfg.enable {
+    networking.firewall.enable = false;
+
+    # Point k8s apiserver address at ourselves, as we _are_ the apiserver.
+    networking.extraHosts = ''
+      127.0.0.1 k0.hswaw.net
+    '';
+
+    # etcd member for this node; client (2379) and peer (2380) both use TLS client auth.
+    services.etcd = let
+      peerEntry = n: "${n.config.hscloud.base.fqdn}=https://${n.config.hscloud.base.fqdn}:2380";
+    in {
+      enable = true;
+      name = fqdn;
+      initialClusterState = "existing";
+      initialCluster = map peerEntry allNodes;
+
+      listenClientUrls = [ "https://0.0.0.0:2379" ];
+      advertiseClientUrls = [ "https://${fqdn}:2379" ];
+      clientCertAuth = true;
+      trustedCaFile = pki.etcd.server.ca;
+      certFile = pki.etcd.server.cert;
+      keyFile = pki.etcd.server.key;
+
+      listenPeerUrls = [ "https://0.0.0.0:2380" ];
+      initialAdvertisePeerUrls = [ "https://${fqdn}:2380" ];
+      peerClientCertAuth = true;
+      peerTrustedCaFile = pki.etcd.peer.ca;
+      peerCertFile = pki.etcd.peer.cert;
+      peerKeyFile = pki.etcd.peer.key;
+      extraConf.PEER_CLIENT_CERT_AUTH = "true";
+    };
+
+    # https://github.com/NixOS/nixpkgs/issues/60687
+    systemd.services.kube-control-plane-online = {
+      preStart = pkgs.lib.mkForce "";
+    };
+
+    services.kubernetes = {
+      package = config.hscloud.kube.package;
+      # We do not use any nixpkgs predefined roles for k8s. Instead, we enable
+      # k8s components manually.
+      roles = [];
+      addons.dns.enable = false;
+      caFile = pki.kube.apiserver.ca;
+      clusterCidr = "10.10.16.0/20";
+
+      apiserver = rec {
+        enable = true;
+        # BUG: should be 0.
+        insecurePort = 4000;
+        securePort = config.hscloud.kube.portAPIServerSecure;
+        advertiseAddress = config.hscloud.base.ipAddr;
+
+        etcd = {
+          # Only point at our own etcd.
+          servers = [ "https://${fqdn}:2379" ];
+          caFile = pki.etcd.kube.ca;
+          keyFile = pki.etcd.kube.key;
+          certFile = pki.etcd.kube.cert;
+        };
+
+        tlsCertFile = pki.kube.apiserver.cert;
+        tlsKeyFile = pki.kube.apiserver.key;
+        clientCaFile = pki.kube.apiserver.ca;
+
+        kubeletHttps = true;
+        # Same CA as main APIServer CA.
+        kubeletClientCaFile = pki.kube.apiserver.ca;
+        kubeletClientCertFile = pki.kube.apiserver.cert;
+        kubeletClientKeyFile = pki.kube.apiserver.key;
+
+        serviceAccountKeyFile = pki.kube.serviceaccounts.key;
+
+        allowPrivileged = true;
+        serviceClusterIpRange = "10.10.12.0/24";
+        runtimeConfig = "api/all,authentication.k8s.io/v1beta1";
+        authorizationMode = [
+          "Node" "RBAC"
+        ];
+        enableAdmissionPlugins = [
+          "NamespaceLifecycle" "NodeRestriction" "LimitRanger" "ServiceAccount"
+          "DefaultStorageClass" "ResourceQuota" "PodSecurityPolicy"
+        ];
+        extraOpts = ''
+          --apiserver-count=5 \
+          --proxy-client-cert-file=${pki.kubeFront.apiserver.cert} \
+          --proxy-client-key-file=${pki.kubeFront.apiserver.key} \
+          --requestheader-allowed-names= \
+          --requestheader-client-ca-file=${pki.kubeFront.apiserver.ca} \
+          --requestheader-extra-headers-prefix=X-Remote-Extra- \
+          --requestheader-group-headers=X-Remote-Group  \
+          --requestheader-username-headers=X-Remote-User \
+          -v=5
+        '';
+      };
+
+      controllerManager = let
+        top = config.services.kubernetes;
+        kubeconfig = top.lib.mkKubeConfig "controller-manager" pki.kube.controllermanager.config;  # mkKubeConfig result; interpolated into the authn/authz flags below
+      in {
+        enable = true;
+        bindAddress = "0.0.0.0";
+        insecurePort = 0;
+        leaderElect = true;
+        serviceAccountKeyFile = pki.kube.serviceaccounts.key;
+        rootCaFile = pki.kube.ca;
+        extraOpts = ''
+          --service-cluster-ip-range=10.10.12.0/24 \
+          --use-service-account-credentials=true \
+          --secure-port=${toString cfg.portControllerManagerSecure}\
+          --authentication-kubeconfig=${kubeconfig}\
+          --authorization-kubeconfig=${kubeconfig}\
+        '';
+        kubeconfig = pki.kube.controllermanager.config;  # option value: the raw pki attrset, not the let-bound `kubeconfig` above
+      };
+
+      scheduler = let
+        top = config.services.kubernetes;
+        # BUG: should be built from pki.kube.scheduler.config, not the controller-manager one.
+        # TODO(q3k): change after big nix change
+        kubeconfig = top.lib.mkKubeConfig "scheduler" pki.kube.controllermanager.config;  # only feeds the two flags in extraOpts
+      in {
+        enable = true;
+        address = "0.0.0.0";
+        port = 0;
+        leaderElect = true;
+        kubeconfig = pki.kube.scheduler.config;  # the option itself does use the scheduler identity
+        extraOpts = ''
+          --secure-port=${toString cfg.portSchedulerSecure}\
+          --authentication-kubeconfig=${kubeconfig}\
+          --authorization-kubeconfig=${kubeconfig}\
+        '';
+      };
+    };
+  };
+}
+
diff --git a/cluster/machines/modules/kube-dataplane.nix b/cluster/machines/modules/kube-dataplane.nix
new file mode 100644
index 0000000..f38ad84
--- /dev/null
+++ b/cluster/machines/modules/kube-dataplane.nix
@@ -0,0 +1,96 @@
+{ config, pkgs, lib, machines, ... }:
+
+with lib;
+
+let
+  # Pin for kubelet and proxy. NOTE(review): k8spkgs is not referenced anywhere in this module.
+  k8spkgs = import (fetchGit {
+    # Now at 1.16.5
+    name = "nixos-unstable-2020-01-22";
+    url = "https://github.com/nixos/nixpkgs-channels/";  # quoted: bare URL literals are deprecated syntax
+    rev = "a96ed5d70427bdc2fbb9e805784e1b9621157a98";
+  }) {};
+
+  cfg = config.hscloud.kube.data;
+
+  # All control plane nodes.
+  controlNodes = let
+    isControl = m: (m.config ? hscloud.kube.control) && m.config.hscloud.kube.control.enable;
+    byFqdn = a: b: a.config.hscloud.base.fqdn < b.config.hscloud.base.fqdn;
+    candidates = filter isControl (attrValues machines);
+  in sort byFqdn candidates;
+
+  fqdn = config.hscloud.base.fqdn;
+
+  pki = config.hscloud.kube.pki;
+
+in {
+  options.hscloud.kube.data = {
+    enable = mkEnableOption "kubernetes data plane";
+    podNet = mkOption {
+      type = types.str;
+      description = "Subnet in which this node will run pods. Must be exclusive with podNets of other nodes.";
+    };
+  };
+
+  # Disable kubelet service and bring in our own override.
+  # Also nuke flannel from the orbit.
+  disabledModules = [
+    "services/cluster/kubernetes/kubelet.nix"
+    "services/cluster/kubernetes/flannel.nix"
+  ];
+
+  imports = [
+    ./kubelet.nix
+    ./kube-common.nix
+  ];
+
+
+  config = mkIf cfg.enable {
+    # If we're not running the control plane, render hosts entries
+    # ("<IPv4 address> <fqdn>") for every control plane node. Otherwise, the
+    # control plane module will make this hostsfile contain the node itself.
+    networking.extraHosts = mkIf (!config.hscloud.kube.control.enable) (concatStringsSep "\n" (map
+      (n: ''
+        ${n.config.hscloud.base.ipAddr} ${n.config.hscloud.base.fqdn}
+      '')
+    controlNodes));
+
+    # this seems to depend on flannel
+    # TODO(q3k): file issue
+    systemd.services.kubelet-online = {
+      script = pkgs.lib.mkForce "sleep 1";
+    };
+
+    services.kubernetes = {
+      # The kubelet wants to mkfs.ext4 when mounting pvcs.
+      path = [ pkgs.e2fsprogs ];
+
+      proxy = {
+        enable = true;
+        kubeconfig = pki.kube.proxy.config;
+        extraOpts = ''
+          --hostname-override=${fqdn}\
+          --proxy-mode=iptables
+        '';
+      };
+
+      kubelet = {
+        enable = true;
+        unschedulable = false;
+        hostname = fqdn;
+        tlsCertFile = pki.kube.kubelet.cert;
+        tlsKeyFile = pki.kube.kubelet.key;
+        clientCaFile = pki.kube.kubelet.ca;
+        nodeIp = config.hscloud.base.ipAddr;
+        networkPlugin = "cni";
+        clusterDns = "10.10.12.254";
+        kubeconfig = pki.kube.kubelet.config;
+        extraOpts = ''
+          --read-only-port=0
+        '';
+        package = config.hscloud.kube.packageKubelet;
+      };
+    };
+  };
+}
diff --git a/cluster/machines/modules/kubelet.nix b/cluster/machines/modules/kubelet.nix
new file mode 100644
index 0000000..1a71b48
--- /dev/null
+++ b/cluster/machines/modules/kubelet.nix
@@ -0,0 +1,348 @@
+# Same as upstream kubelet.nix module from nixpkgs, but with the following
+# changes:
+#   - cni tunables nuked and replaced with static host dirs, so that calico
+#     running on k8s can drop CNI plugins there itself
+#   - package configurable separately from rest of kubernetes
+
+{ config, lib, pkgs, ... }:
+
+with lib;
+
+let
+  top = config.services.kubernetes;
+  cfg = top.kubelet;
+  # Image built from top.package.pause; added to seedDockerImages further down.
+  infraContainer = pkgs.dockerTools.buildImage {
+    name = "pause";
+    tag = "latest";
+    contents = top.package.pause;
+    config = { Cmd = [ "/bin/pause" ]; };
+  };
+  # Kubeconfig produced by top.lib.mkKubeConfig from the kubeconfig options.
+  kubeconfig = top.lib.mkKubeConfig "kubelet" cfg.kubeconfig;
+  # Static pod manifest location, relative to /etc.
+  manifestPath = "kubernetes/manifests";
+
+  # Submodule describing one node taint. The key defaults to the attribute
+  # name under which the taint is declared in cfg.taints.
+  taintOptions = { name, ... }: {
+    options.key = mkOption {
+      type = types.str;
+      default = name;
+      description = "Key of taint.";
+    };
+    options.value = mkOption {
+      type = types.str;
+      description = "Value of taint.";
+    };
+    options.effect = mkOption {
+      type = types.enum [ "NoSchedule" "PreferNoSchedule" "NoExecute" ];
+      example = "NoSchedule";
+      description = "Effect of taint.";
+    };
+  };
+  # Comma-separated key=value:effect list passed to --register-with-taints.
+  taints = concatMapStringsSep "," (t: "${t.key}=${t.value}:${t.effect}") (attrValues cfg.taints);
+in
+{
+  # Upstream services/cluster/kubernetes/default.nix still references the
+  # flannel options. This module is meant to be used with the upstream flannel
+  # module disabled, so declare a stand-in enable tunable to keep evaluation
+  # working.
+  options.services.kubernetes.flannel.enable = mkEnableOption "enable flannel networking";
+
+  ###### interface
+  options.services.kubernetes.kubelet = {
+
+    address = mkOption {
+      type = types.str;
+      default = "0.0.0.0";
+      description = "Kubernetes kubelet info server listening address.";
+    };
+
+    clusterDns = mkOption {
+      type = types.str;
+      default = "10.1.0.1";
+      description = "Use alternative DNS.";
+    };
+
+    clusterDomain = mkOption {
+      type = types.str;
+      default = config.services.kubernetes.addons.dns.clusterDomain;
+      description = "Use alternative domain.";
+    };
+
+    clientCaFile = mkOption {
+      type = types.nullOr types.path;
+      default = top.caFile;
+      description = "Kubernetes apiserver CA file for client authentication.";
+    };
+
+    enable = mkEnableOption "Kubernetes kubelet.";
+
+    extraOpts = mkOption {
+      type = types.str;
+      default = "";
+      description = "Kubernetes kubelet extra command line options.";
+    };
+
+    featureGates = mkOption {
+      type = types.listOf types.str;
+      default = top.featureGates;
+      description = "List set of feature gates";
+    };
+
+    healthz = {
+      bind = mkOption {
+        type = types.str;
+        default = "127.0.0.1";
+        description = "Kubernetes kubelet healthz listening address.";
+      };
+
+      port = mkOption {
+        type = types.int;
+        default = 10248;
+        description = "Kubernetes kubelet healthz port.";
+      };
+    };
+
+    hostname = mkOption {
+      type = types.str;
+      default = config.networking.hostName;
+      description = "Kubernetes kubelet hostname override.";
+    };
+
+    kubeconfig = top.lib.mkKubeConfigOptions "Kubelet";
+
+    manifests = mkOption {
+      type = types.attrsOf types.attrs;
+      default = {};
+      description = "List of manifests to bootstrap with kubelet (only pods can be created as manifest entry)";
+    };
+
+    networkPlugin = mkOption {
+      type = types.nullOr (types.enum [ "cni" "kubenet" ]);
+      default = "kubenet";
+      description = "Network plugin to use by Kubernetes.";
+    };
+
+    nodeIp = mkOption {
+      type = types.nullOr types.str;
+      default = null;
+      description = "IP address of the node. If set, kubelet will use this IP address for the node.";
+    };
+
+    registerNode = mkOption {
+      type = types.bool;
+      default = true;
+      description = "Whether to auto register kubelet with API server.";
+    };
+
+    package = mkOption {
+      type = types.package;
+      default = pkgs.kubernetes;
+      defaultText = "pkgs.kubernetes";
+      description = "Kubernetes package to use.";
+    };
+
+    port = mkOption {
+      type = types.int;
+      default = 10250;
+      description = "Kubernetes kubelet info server listening port.";
+    };
+
+    seedDockerImages = mkOption {
+      type = types.listOf types.package;
+      default = [];
+      description = "List of docker images to preload on system";
+    };
+
+    taints = mkOption {
+      type = types.attrsOf (types.submodule [ taintOptions ]);
+      default = {};
+      description = "Node taints (https://kubernetes.io/docs/concepts/configuration/assign-pod-node/).";
+    };
+
+    tlsCertFile = mkOption {
+      type = types.nullOr types.path;
+      default = null;
+      description = "File containing x509 Certificate for HTTPS.";
+    };
+
+    tlsKeyFile = mkOption {
+      type = types.nullOr types.path;
+      default = null;
+      description = "File containing x509 private key matching tlsCertFile.";
+    };
+
+    unschedulable = mkOption {
+      type = types.bool;
+      default = false;
+      description = "Whether to set node taint to unschedulable=true as it is the case of node that has only master role.";
+    };
+
+    verbosity = mkOption {
+      type = types.nullOr types.int;
+      default = null;
+      description = ''
+        Optional glog verbosity level for logging statements. See
+        <link xlink:href="https://github.com/kubernetes/community/blob/master/contributors/devel/logging.md"/>
+      '';
+    };
+
+  };
+
+  ###### implementation
+  config = mkMerge [
+    (mkIf cfg.enable {
+      services.kubernetes.kubelet.seedDockerImages = [infraContainer];
+
+      # Drop crictl into administrative command line.
+      environment.systemPackages = with pkgs; [ cri-tools ];
+
+      # Force disable Docker.
+      virtualisation.docker.enable = false;
+
+      # TODO(q3k): move to unified cgroups (cgroup v2) once we upgrade to
+      # Kubelet 1.19.
+      systemd.enableUnifiedCgroupHierarchy = false;
+
+      # Run containerd service. This exposes the CRI API that is consumed by
+      # crictl and Kubelet.
+      systemd.services.containerd = {
+        description = "containerd container runtime";
+        wantedBy = [ "kubernetes.target" ];
+        after = [ "network.target" ];
+        path = with pkgs; [ runc iptables ];
+        serviceConfig = {
+          Delegate = "yes";
+          KillMode = "process";
+          Restart = "always";
+          RestartSec = "5";
+          LimitNPROC = "infinity";
+          LimitCORE = "infinity";
+          # https://github.com/coreos/fedora-coreos-tracker/issues/329
+          LimitNOFILE = "1048576";
+          TasksMax = "infinity";
+          OOMScoreAdjust = "-999";
+
+          ExecStart = "${pkgs.containerd}/bin/containerd -c ${./containerd.toml}";
+        };
+      };
+
+      systemd.services.kubelet = {
+        description = "Kubernetes Kubelet Service";
+        wantedBy = [ "kubernetes.target" ];
+        after = [ "network.target" "containerd.service" "kube-apiserver.service" ];
+        path = with pkgs; [ gitMinimal openssh utillinux iproute ethtool thin-provisioning-tools iptables socat cri-tools containerd gzip ] ++ top.path;
+
+        # Seed images into containerd's k8s.io namespace. Tarballs are gunzipped
+        # into a private mktemp directory (rather than fixed, predictable /tmp
+        # paths), which is removed on exit. Mildly hacky - moving over to NixOS
+        # OCI image build infrastructure should let us get rid of the gunzip.
+        # TODO(q3k): figure this out, check if this is even used by kubelet.
+        preStart = ''
+          seedDir="$(mktemp -d)"
+          trap 'rm -rf "$seedDir"' EXIT
+          ${concatMapStrings (img: ''
+            echo "Seeding OCI image: ${img}"
+            gunzip -c ${img} > "$seedDir/image.tar"
+            ctr -n=k8s.io images import "$seedDir/image.tar" || true
+          '') cfg.seedDockerImages}
+        '';
+        serviceConfig = {
+          Slice = "kubernetes.slice";
+          CPUAccounting = true;
+          MemoryAccounting = true;
+          Restart = "on-failure";
+          RestartSec = "1000ms";
+          ExecStart = ''${cfg.package}/bin/kubelet \
+            --cgroup-driver=systemd \
+            --container-runtime=remote \
+            --container-runtime-endpoint=unix:///var/run/containerd/containerd.sock \
+            --address=${cfg.address} \
+            --authentication-token-webhook \
+            --authentication-token-webhook-cache-ttl="10s" \
+            --authorization-mode=Webhook \
+            ${optionalString (cfg.clientCaFile != null)
+              "--client-ca-file=${cfg.clientCaFile}"} \
+            ${optionalString (cfg.clusterDns != "")
+              "--cluster-dns=${cfg.clusterDns}"} \
+            ${optionalString (cfg.clusterDomain != "")
+              "--cluster-domain=${cfg.clusterDomain}"} \
+            --cni-conf-dir=/opt/cni/conf \
+            --cni-bin-dir=/opt/cni/bin \
+            ${optionalString (cfg.featureGates != [])
+              "--feature-gates=${concatMapStringsSep "," (feature: "${feature}=true") cfg.featureGates}"} \
+            --hairpin-mode=hairpin-veth \
+            --healthz-bind-address=${cfg.healthz.bind} \
+            --healthz-port=${toString cfg.healthz.port} \
+            --hostname-override=${cfg.hostname} \
+            --kubeconfig=${kubeconfig} \
+            ${optionalString (cfg.networkPlugin != null)
+              "--network-plugin=${cfg.networkPlugin}"} \
+            ${optionalString (cfg.nodeIp != null)
+              "--node-ip=${cfg.nodeIp}"} \
+            --pod-infra-container-image=pause \
+            ${optionalString (cfg.manifests != {})
+              "--pod-manifest-path=/etc/${manifestPath}"} \
+            --port=${toString cfg.port} \
+            --register-node=${boolToString cfg.registerNode} \
+            ${optionalString (taints != "")
+              "--register-with-taints=${taints}"} \
+            --root-dir=${top.dataDir} \
+            ${optionalString (cfg.tlsCertFile != null)
+              "--tls-cert-file=${cfg.tlsCertFile}"} \
+            ${optionalString (cfg.tlsKeyFile != null)
+              "--tls-private-key-file=${cfg.tlsKeyFile}"} \
+            ${optionalString (cfg.verbosity != null) "--v=${toString cfg.verbosity}"} \
+            ${cfg.extraOpts}
+          '';
+          WorkingDirectory = top.dataDir;
+        };
+      };
+
+      boot.kernelModules = [ "br_netfilter" "overlay" ];
+      boot.kernel.sysctl."net.ipv4.ip_forward" = "1";
+
+      services.kubernetes.kubelet.hostname = with config.networking;
+        mkDefault (hostName + optionalString (domain != null) ".${domain}");
+
+      services.kubernetes.pki.certs = with top.lib; {
+        kubelet = mkCert {
+          name = "kubelet";
+          CN = top.kubelet.hostname;
+          action = "systemctl restart kubelet.service";
+
+        };
+        kubeletClient = mkCert {
+          name = "kubelet-client";
+          CN = "system:node:${top.kubelet.hostname}";
+          fields = {
+            O = "system:nodes";
+          };
+          action = "systemctl restart kubelet.service";
+        };
+      };
+
+      services.kubernetes.kubelet.kubeconfig.server = mkDefault top.apiserverAddress;
+    })
+
+    (mkIf (cfg.enable && cfg.manifests != {}) {
+      environment.etc = mapAttrs' (name: manifest:
+        nameValuePair "${manifestPath}/${name}.json" {
+          text = builtins.toJSON manifest;
+          mode = "0755";
+        }
+      ) cfg.manifests;
+    })
+
+    (mkIf (cfg.unschedulable && cfg.enable) {
+      services.kubernetes.kubelet.taints.unschedulable = {
+        value = "true";
+        effect = "NoSchedule";
+      };
+    })
+
+  ];
+}
