Merge "ops, cluster: consolidate NixOS provisioning"
diff --git a/cluster/clustercfg/clustercfg.py b/cluster/clustercfg/clustercfg.py
index 0adef40..d852d6a 100644
--- a/cluster/clustercfg/clustercfg.py
+++ b/cluster/clustercfg/clustercfg.py
@@ -206,10 +206,12 @@
         ca_admitomatic = ca.CA(ss, certs_root, 'admitomatic', 'admitomatic webhook CA')
         ca_admitomatic.make_cert('admitomatic-webhook', ou='Admitomatic Webhook', hosts=['admitomatic.admitomatic.svc'])
 
-    subprocess.check_call(["nix", "run",
-                           "-f", local_root,
-                           "cluster.nix.provision",
-                           "-c", "provision-{}".format(fqdn.split('.')[0])])
+    toplevel = subprocess.check_output([
+        "nix-build",
+        local_root,
+        "-A", "ops.machines.\"" + fqdn + "\".config.passthru.hscloud.provision",
+    ]).decode().strip()
+    subprocess.check_call([toplevel])
 
 
 def usage():
diff --git a/cluster/nix/provision.nix b/cluster/nix/provision.nix
deleted file mode 100644
index 7ab7e71..0000000
--- a/cluster/nix/provision.nix
+++ /dev/null
@@ -1,49 +0,0 @@
-{ hscloud, pkgs, ... }:
-
-with builtins;
-
-let 
-  machines = (import ./defs-machines.nix);
-  configurations = builtins.listToAttrs (map (machine: {
-    name = machine.fqdn;
-    value = pkgs.nixos ({ config, pkgs, ... }: {
-      networking.hostName = machine.name;
-      imports = [
-        ./modules/base.nix
-        ./modules/kubernetes.nix
-      ];
-    });
-  }) machines);
-
-  scriptForMachine = machine: let
-    configuration = configurations."${machine.fqdn}";
-  in ''
-   set -e
-   remote=root@${machine.fqdn}
-   echo "Configuration for ${machine.fqdn} is ${configuration.toplevel}"
-   nix copy --no-check-sigs -s --to ssh://$remote ${configuration.toplevel}
-   echo "/etc/systemd/system diff:"
-   ssh $remote diff -ur /var/run/current-system/etc/systemd/system ${configuration.toplevel}/etc/systemd/system || true
-   echo ""
-   echo ""
-   ssh $remote ${configuration.toplevel}/bin/switch-to-configuration dry-activate
-   read -p "Do you want to switch to this configuration? " -n 1 -r
-   echo
-   if [[ $REPLY =~ ^[Yy]$ ]]; then
-       ssh $remote ${configuration.toplevel}/bin/switch-to-configuration switch
-   fi
-  '';
-
-  provisioners = (map (machine:
-    pkgs.writeScriptBin "provision-${machine.name}" (scriptForMachine machine)
-  ) machines);
-
-  provision = pkgs.writeScriptBin "provision" (
-    ''
-      echo "Available provisioniers:"
-    '' + (concatStringsSep "\n" (map (machine: "echo '  provision-${machine.name}'") machines)));
-in
-pkgs.symlinkJoin {
-  name = "provision";
-  paths = [ provision ] ++ provisioners;
-}
diff --git a/ops/README.md b/ops/README.md
new file mode 100644
index 0000000..d31f767
--- /dev/null
+++ b/ops/README.md
@@ -0,0 +1,23 @@
+Operations
+===
+
+Deploying NixOS machines
+---
+
+Machine configurations are in `ops/machines.nix`.
+
+Wrapper script to show all available machines and provision a single machine:
+
+     $ $(nix-build -A ops.provision)
+     Available machines:
+      - bc01n01.hswaw.net
+      - bc01n02.hswaw.net
+      - dcr01s22.hswaw.net
+      - dcr01s24.hswaw.net
+      - edge01.waw.bgp.wtf
+
+     $ $(nix-build -A ops.provision) edge01.waw.bgp.wtf
+
+This can be slow, as it evaluates/builds all machines' configs. If you just want to deploy one machine and possibly iterate faster:
+
+    $ $(nix-build -A 'ops.machines."edge01.waw.bgp.wtf".config.passthru.hscloud.provision')
diff --git a/ops/machines.nix b/ops/machines.nix
index 0e63228..5401e30 100644
--- a/ops/machines.nix
+++ b/ops/machines.nix
@@ -3,30 +3,41 @@
 # This allows to have a common attrset of machines that can be deployed
 # in the same way.
 #
-# Currently building/deployment is still done in a half-assed way:
-#
-#    machine=edge01.waw.bgp.wtf
-#    d=$(nix-build -A 'ops.machines."'$machine'"'.toplevel)
-#
-# To then deploy derivation $d on $machine:
-#
-#    nix-copy-closure --to root@$machine $d
-#    ssh root@$machine $d/bin/switch-to-configuration dry-activate
-#    ssh root@$machine $d/bin/switch-to-configuration test
-#    ssh root@$machine nix-env -p /nix/var/nix/profiles/system --set $d
-#    ssh root@$machine $d/bin/switch-to-configuration boot
-#
-# TODO(q3k): merge this with //cluster/clustercfg - this should be unified!
+# For information about building/deploying machines see //ops/README.md.
 
 { hscloud, pkgs, ... }:
 
 let
+  # nixpkgs for cluster machines (.hswaw.net). Currently pinned to an old
+  # nixpkgs because NixOS modules for kubernetes changed enough that it's not
+  # super easy to use them as is.
+  #
+  # TODO(q3k): fix this: use an old nixpkgs for Kube modules while using
+  # hscloud nixpkgs for everything else.
+  nixpkgsCluster = import (pkgs.fetchFromGitHub {
+    owner = "nixos";
+    repo = "nixpkgs-channels";
+    rev = "44ad80ab1036c5cc83ada4bfa451dac9939f2a10";
+    sha256 = "1b61nzvy0d46cspy07szkc0rggacxiqg9v1py27pkqpj7rvawfsk";
+  }) {};
+
+  # edge01 still lives on an old nixpkgs checkout.
+  #
+  # TODO(b/3): unpin and deploy.
+  nixpkgsBgpwtf = import (pkgs.fetchFromGitHub {
+    owner = "nixos";
+    repo = "nixpkgs-channels";
+    rev = "c59ea8b8a0e7f927e7291c14ea6cd1bd3a16ff38";
+    sha256 = "1ak7jqx94fjhc68xh1lh35kh3w3ndbadprrb762qgvcfb8351x8v";
+  }) {};
+
   # Stopgap measure to import //cluster/nix machine definitions into new
-  # //ops/machines infrastructure.
+  # //ops/ infrastructure.
+  #
   # TODO(q3k): inject defs-cluster-k0.nix / defs-machines.nix content via
   # nixos options instead of having module definitions loading it themselves,
   # deduplicate list of machines below with defs-machines.nix somehow.
-  mkClusterMachine = name: pkgs.nixos ({ config, pkgs, ... }: {
+  clusterMachineConfig = name: [({ config, pkgs, ...}: {
     # The hostname is used by //cluster/nix machinery to load the appropriate
     # config from defs-machines into defs-cluster-k0.
     networking.hostName = name;
@@ -34,29 +45,71 @@
       ../cluster/nix/modules/base.nix
       ../cluster/nix/modules/kubernetes.nix
     ];
-  });
+  })];
 
+  # mkMachine builds NixOS modules into a NixOS derivation, and injects
+  # passthru.hscloud.provision: a bash script (it relies on [[ ]] and
+  # read -p/-n) that deploys that configuration over SSH to a production machine.
   mkMachine = pkgs: paths: pkgs.nixos ({ config, pkgs, ... }: {
     imports = paths;
+
+    config = let
+      name = config.networking.hostName;
+      domain = if (config.networking ? domain) && config.networking.domain != null then config.networking.domain else "hswaw.net";
+      fqdn = name + "." + domain;
+      toplevel = config.system.build.toplevel;
+
+      runProvision = ''
+        #!/usr/bin/env bash
+        set -eu
+        remote=root@${fqdn}
+        echo "Configuration for ${fqdn} is ${toplevel}"
+        nix copy -s --to ssh://$remote ${toplevel}
+
+        running="$(ssh $remote readlink -f /nix/var/nix/profiles/system)"
+        if [ "$running" == "${toplevel}" ]; then
+          echo "${fqdn} already running ${toplevel}."
+        else
+          echo "/etc/systemd/system diff:"
+          ssh $remote diff -ur /var/run/current-system/etc/systemd/system ${toplevel}/etc/systemd/system || true
+          echo ""
+          echo ""
+          echo "dry-activate diff:"
+          ssh $remote ${toplevel}/bin/switch-to-configuration dry-activate
+          read -p "Do you want to switch to this configuration? " -n 1 -r
+          echo
+          if ! [[ $REPLY =~ ^[Yy]$ ]]; then
+            exit 1
+          fi
+
+          echo -ne "\n\nswitch-to-configuration test...\n"
+          ssh $remote ${toplevel}/bin/switch-to-configuration test
+        fi
+
+        echo -ne "\n\n"
+        read -p "Do you want to set this configuration as boot? " -n 1 -r
+        echo
+        if ! [[ $REPLY =~ ^[Yy]$ ]]; then
+            exit 1
+        fi
+
+        echo -ne "\n\nsetting system profile...\n"
+        ssh $remote nix-env -p /nix/var/nix/profiles/system --set ${toplevel}
+
+        echo -ne "\n\nswitch-to-configuration boot...\n"
+        ssh $remote ${toplevel}/bin/switch-to-configuration boot
+      '';
+    in {
+      passthru.hscloud.provision = pkgs.writeScript "provision-${fqdn}" runProvision;
+    };
   });
-
 in {
-  "bc01n01.hswaw.net" = mkClusterMachine "bc01n01";
-  "bc01n02.hswaw.net" = mkClusterMachine "bc01n02";
-  "bc01n03.hswaw.net" = mkClusterMachine "bc01n03";
-  "dcr01s22.hswaw.net" = mkClusterMachine "dcr01s22";
-  "dcr01s24.hswaw.net" = mkClusterMachine "dcr01s24";
+  "bc01n01.hswaw.net"  = mkMachine nixpkgsCluster (clusterMachineConfig "bc01n01");
+  "bc01n02.hswaw.net"  = mkMachine nixpkgsCluster (clusterMachineConfig "bc01n02");
+  "dcr01s22.hswaw.net" = mkMachine nixpkgsCluster (clusterMachineConfig "dcr01s22");
+  "dcr01s24.hswaw.net" = mkMachine nixpkgsCluster (clusterMachineConfig "dcr01s24");
 
-  # edge01 still lives on an old nixpkgs checkout.
-  # TODO(b/3): unpin and deploy.
-  "edge01.waw.bgp.wtf" = mkMachine (
-    import (pkgs.fetchFromGitHub {
-      owner = "nixos";
-      repo = "nixpkgs-channels";
-      rev = "c59ea8b8a0e7f927e7291c14ea6cd1bd3a16ff38";
-      sha256 = "1ak7jqx94fjhc68xh1lh35kh3w3ndbadprrb762qgvcfb8351x8v";
-    }) {}
-  ) [
+  "edge01.waw.bgp.wtf" = mkMachine nixpkgsBgpwtf [
     ../bgpwtf/machines/edge01.waw.bgp.wtf.nix
     ../bgpwtf/machines/edge01.waw.bgp.wtf-hardware.nix
   ];
diff --git a/ops/provision.nix b/ops/provision.nix
new file mode 100644
index 0000000..76054c4
--- /dev/null
+++ b/ops/provision.nix
@@ -0,0 +1,74 @@
+# Top-level wrapper script for calling per-machine provisioners.
+#
+# Given ops.machines."edge01.waw.bgp.wtf".config.passthru.hscloud.provision,
+# this script lets you run it with:
+#   $ $(nix-build -A ops.provision) edge01.waw.bgp.wtf
+# Or, to first list all available machines:
+#   $ $(nix-build -A ops.provision)
+#
+# The main logic of the provisioner script is in machines.nix.
+
+{ hscloud, pkgs, lib, ... }:
+
+with lib; with builtins;
+
+let
+
+  # All machines from ops.machines, keyed by FQDN.
+  machines = filterAttrs (n: _: n != "__readTree") hscloud.ops.machines;
+  # Machines' provisioner scripts, keyed by machine FQDN.
+  machineProvisioners = mapAttrs (_: v: v.config.passthru.hscloud.provision) machines;
+  # List of machine FQDNs.
+  machineNames = attrNames machines;
+
+  # User-friendly list of machines by FQDN.
+  machineList = concatStringsSep "\n"
+    (map
+      (name: "  - ${name}")
+      machineNames);
+
+  # Derivation containing bin/provision-FQDN symlinks to machines' provisioners.
+  forest = pkgs.linkFarm "provision-forest"
+    (mapAttrsToList
+      (fqdn: p: { name = "bin/provision-${fqdn}"; path = p; })
+      machineProvisioners);
+in
+
+pkgs.writeScript "provision" ''
+  #!/usr/bin/env bash
+  name="$1"
+
+  usage() {
+    echo >&2 "Usage: $0 machine|machine.hswaw.net"
+    echo >&2 "Available machines:"
+    echo >&2 "${machineList}"
+  }
+
+  if [ -z "$name" ]; then
+    usage
+    exit 1
+  fi
+
+  provisioner="${forest}/bin/provision-$name"
+  if [ ! -e "$provisioner" ]; then
+    name="$name.hswaw.net"
+    provisioner="${forest}/bin/provision-$name"
+  fi
+  if [ ! -e "$provisioner" ]; then
+    usage
+    exit 1
+  fi
+  # :^)
+  echo -ne "\e[34mh \e[31ms \e[33mc l \e[34mo \e[32mu \e[31md \e[0m"
+  echo ""
+  echo "Starting provisioner for $name..."
+  echo ""
+  echo "Too slow to evaluate? Equivalent faster command line that rebuilds just one node:"
+  echo "  \$(nix-build -A 'ops.machines.\"$name\".config.passthru.hscloud.provision')"
+  echo ""
+  echo "Or, if you want to deploy the same configuration on different machines, just run"
+  echo "this script again without re-evaluating nix:"
+  echo "  $0 $name"
+  echo ""
+  exec "$provisioner"
+''