Merge "cluster: deploy NixOS-based ceph"
diff --git a/cluster/nix/defs-cluster-k0.nix b/cluster/nix/defs-cluster-k0.nix
index c3519cc..cd0fcac 100644
--- a/cluster/nix/defs-cluster-k0.nix
+++ b/cluster/nix/defs-cluster-k0.nix
@@ -10,8 +10,60 @@
   fqdn = machineName + domain;
   machine = (builtins.head (builtins.filter (n: n.fqdn == fqdn) machines));
   otherMachines = (builtins.filter (n: n.fqdn != fqdn) machines);
+  # Index of all machines keyed by node name, so that other modules can look
+  # a machine up directly (eg. machinesByName.bc01n02).
+  # NOTE(review): assumes every entry in `machines` carries a `name`
+  # attribute holding the short node name - confirm against the machine defs.
+  machinesByName = builtins.listToAttrs (map (m: { name = m.name; value = m; }) machines);
   inherit machines;
+  # Ceph cluster to run systemd modules for.
+  cephCluster = {
+    fsid = "74592dc2-31b7-4dbe-88cf-40459dfeb354";
+    name = "k0";
+    # Map from node name to mon configuration (currently always empty).
+    #
+    # Each mon also runs a mgr daemon (which is a leader-elected kitchen
+    # sink^W^Whousekeeping service hanging off of a mon cluster).
+    #
+    # Consult the Ceph documentation (RADOS operations, "Adding/Removing
+    # Monitors") on how to actually carry out mon-related maintenance
+    # operations.
+    mons = {
+      bc01n02 = {};
+    };
+    # Map from node name to list of disks on node.
+    # Each disk is:
+    #  id:   OSD numerical ID, eg. 0 for osd.0. You get this after running
+    #        ceph-volume lvm create.
+    #  path: Filesystem path for disk backing drive. This should be something
+    #        in /dev/disk/by-id for safety. This is only used to gate OSD
+    #        daemon startup by disk presence.
+    #  uuid: OSD uuid/fsid. You get this after running ceph-volume lvm create.
+    #
+    # Quick guide how to set up a new OSD (but please refer to the Ceph manual):
+    # 0. Copy /var/lib/ceph/bootstrap-osd/k0.keyring from another OSD node to
+    #    the new OSD node, if this is a new node. Remember to chown it to
+    #    ceph:ceph and chmod it to 0600!
+    # 1. nix-shell -p ceph lvm2 cryptsetup (if on a node that's not yet an OSD)
+    # 2. ceph-volume --cluster k0 lvm create --bluestore --data /dev/sdX --no-systemd --dmcrypt
+    # 3. The above will mount a tmpfs on /var/lib/ceph/osd/k0-X. X is the new
+    #    osd id. A file named fsid inside this directory is the new OSD fsid/uuid.
+    # 4. Configure osds below with the above information, redeploy node from nix.
+    osds = {
+      dcr01s22 = [
+        { id = 0; path = "/dev/disk/by-id/scsi-35000c500850293e3"; uuid = "314034c5-474c-4d0d-ba41-36a881c52560";}
+        { id = 1; path = "/dev/disk/by-id/scsi-35000c500850312cb"; uuid = "a7f1baa0-0fc3-4ab1-9895-67abdc29de03";}
+        { id = 2; path = "/dev/disk/by-id/scsi-35000c5008508e3ef"; uuid = "11ac8316-6a87-48a7-a0c7-74c3cef6c2fa";}
+        { id = 3; path = "/dev/disk/by-id/scsi-35000c5008508e23f"; uuid = "c6b838d1-b08c-4788-936c-293041ed2d4d";}
+      ];
+      dcr01s24 = [
+        { id = 4; path = "/dev/disk/by-id/scsi-35000c5008509199b"; uuid = "a2b4663d-bd8f-49b3-b0b0-195c56ba252f";}
+        { id = 5; path = "/dev/disk/by-id/scsi-35000c50085046abf"; uuid = "a2242989-ccce-4367-8813-519b64b5afdb";}
+        { id = 6; path = "/dev/disk/by-id/scsi-35000c5008502929b"; uuid = "7deac89c-22dd-4c2b-b3cc-43ff7f990fd6";}
+        { id = 7; path = "/dev/disk/by-id/scsi-35000c5008502a323"; uuid = "e305ebb3-9cac-44d2-9f1d-bbb72c8ab51f";}
+      ];
+    };
+  };
   pki = rec {
     make = (radix: name: rec {
       ca = ./../certs + "/ca-${radix}.crt";
diff --git a/cluster/nix/modules/ceph.nix b/cluster/nix/modules/ceph.nix
new file mode 100644
index 0000000..bc3180f
--- /dev/null
+++ b/cluster/nix/modules/ceph.nix
@@ -0,0 +1,145 @@
+# This runs Ceph on hscloud cluster(s).
+# This lightly wraps the upstream NixOS ceph module, which is already fairly light.
+# Most importantly, it does _not_ attempt to do any cluster
+# bootstrapping/maintenance. This means that any configuration action that
+# does one of the following:
+#  0. Bringing up a cluster
+#  1. Adding/removing Mons
+#  2. Changing a Mon IP address
+#  3. Adding/removing OSDs
+# ... must be done in tandem with manual operations on the affected nodes. For
+# example, bootstrapping a cluster will involve keyring and monmap management,
+# changing anything with mons will involve monmap management, adding new OSDs
+# will require provisioning them with ceph-volume, etc.
+# This is in stark contrast to a fully-managed solution like rook. Since we
+# don't have hundreds of clusters, none of the above is automated, especially
+# as that kind of automation is quite tricky to do reliably.
+{ config, lib, pkgs, ... }:
+with builtins;
+with lib;
+with (( import ../defs-cluster-k0.nix ) config.networking.hostName);
+let
+  # Host name of the machine this module is being evaluated for; it is the
+  # key looked up in cephCluster.mons and cephCluster.osds below.
+  machineName = config.networking.hostName;
+  # True when this machine is listed in cephCluster.mons.
+  isMon = hasAttr machineName cephCluster.mons;
+  # True when this machine is listed in cephCluster.osds.
+  isOsd = hasAttr machineName cephCluster.osds;
+  # True when this machine takes any part in the Ceph cluster at all.
+  hasCeph = isMon || isOsd;
+  # services.ceph option fragment that enables the mon and mgr daemons, each
+  # with a single daemon named after this machine. On machines that are not
+  # listed as mons the fragment is empty and contributes nothing when merged.
+  cephMonConfig =
+    let
+      # Both daemon kinds get the exact same settings on a mon node.
+      thisNodeDaemon = {
+        enable = true;
+        daemons = [ machineName ];
+      };
+    in if !isMon then {} else {
+      mon = thisNodeDaemon;
+      mgr = thisNodeDaemon;
+    };
+  # Same as for cephMonConfig, but this time for OSDs: on machines listed in
+  # cephCluster.osds, enable one OSD daemon per configured disk. Each daemon is
+  # named by the disk's numerical OSD id rendered as a string.
+  cephOsdConfig = if isOsd then {
+    osd = {
+      enable = true;
+      # toString has to be applied to the disk's id; interpolating the bare
+      # function is an evaluation error.
+      daemons = map (el: toString el.id) cephCluster.osds.${machineName};
+    };
+  } else {};
+  # The full option fragment for services.ceph. It contains ceph.conf fragments
+  # (in .global.*) and merges ceph{Mon,Osd}Config. The // merge is shallow,
+  # which is sufficient here because the two fragments only ever define the
+  # top-level keys mon/mgr and osd respectively.
+  cephConfig = {
+    enable = true;
+    global = {
+      fsid = cephCluster.fsid;
+      clusterName = cephCluster.name;
+      # Every Ceph node always attempts to connect to all mons.
+      monHost = concatStringsSep "," (mapAttrsToList (k: _: machinesByName.${k}.ipAddr) cephCluster.mons);
+      monInitialMembers = concatStringsSep "," (builtins.attrNames cephCluster.mons);
+    };
+  } // cephMonConfig // cephOsdConfig;
+  # Merge ceph-volume lvm activate into ceph-osd-ID services.
+  #
+  # This is because the upstream module seems to have been written with
+  # filestore in mind, not bluestore. Filestore is relatively simple: an xfs
+  # filesystem is mounted into /var/lib/ceph/osd/$cluster-$id, that in turn
+  # contains everything for that OSD to work.
+  #
+  # Bluestore is a bit different. Instead of a normal filesystem being mounted,
+  # Ceph manages a block device fully using LVM (and in our case, dmcrypt).
+  # Every bluestore volume needs to be 'activated' before it can be used by an
+  # OSD. Activation takes care of doing LVM and dmcrypt mounts, and prepares
+  # the /var/lib/ceph/osd/$cluster-$id directory as if a filestore was present
+  # there. However, instead of this being a diskmount, it's instead a tmpfs
+  # into which a bunch of files are dropped, loaded from the LVM raw device.
+  #
+  # To make the upstream NixOS module OSD work with bluestore, we do the following:
+  #  1. Change ConditionPathExists from the OSD mount into a /dev/disk/by-id
+  #     path. This gates the service on that device being present.
+  #  2. Inject an ExecStartPre which runs ceph-volume lvm activate, if needed.
+  #  3. Add lvm/cryptsetup to the PATH of the service (as used by ceph-volume,
+  #     which seems to look for them on PATH instead of being properly
+  #     nixified).
+  #
+  # We also inject smartmontools into PATH for smartctl, which allows the OSD
+  # to monitor device health.
+  #
+  # The result is an attribute set with one "ceph-osd-ID" entry per disk
+  # configured for this machine (and an empty set on non-OSD machines).
+  osdActivateServices = listToAttrs (map (el: let
+      # Per-disk values, taken from the disk's entry in cephCluster.osds.
+      osdId = toString el.id;
+      osdUuid = el.uuid;
+      diskPath = el.path;
+    in {
+    name = "ceph-osd-${osdId}";
+    value = {
+      path = with pkgs; [
+        lvm2
+        cryptsetup
+        smartmontools
+      ];
+      serviceConfig = {
+        # mkForce replaces the upstream ExecStartPre list, so the upstream
+        # prestart script is re-added explicitly after our activation step.
+        ExecStartPre = lib.mkForce [
+          # The "+" prefix is prepended to the activation script's path.
+          ("+" + (toString (pkgs.writeScript "ceph-osd-${osdId}" ''
+            #!/bin/sh
+            set -e
+            dir="/var/lib/ceph/osd/${cephCluster.name}-${osdId}/"
+            disk="${diskPath}"
+            uuid="${osdUuid}"
+            if [ -d "$dir" ] && [ -f "$dir"/keyring ]; then
+              echo "Volume $dir already activated, skipping..."
+            else
+              echo "Activating $dir with $disk, uuid $uuid..."
+              ${pkgs.ceph}/bin/ceph-volume lvm activate --bluestore --no-systemd ${osdId} $uuid
+            fi
+          '')))
+          "${pkgs.ceph.lib}/libexec/ceph/ceph-osd-prestart.sh --id ${osdId} --cluster ${cephCluster.name}"
+        ];
+      };
+      unitConfig = {
+        # Gate the unit on the backing disk being present.
+        ConditionPathExists = lib.mkForce diskPath;
+      };
+    };
+  }) (if isOsd then cephCluster.osds.${machineName} else []));
+in rec {
+  # Only configure services.ceph on machines that are a mon and/or an OSD.
+  services.ceph = if hasCeph then cephConfig else {};
+  environment.systemPackages = with pkgs; [
+    ceph cryptsetup smartmontools
+  ];
+  # Per-OSD additions to the ceph-osd-ID units (empty on non-OSD machines).
+  systemd.services = osdActivateServices;
+  # Hack - the upstream ceph module should generate ${clusterName}.conf instead
+  # of ceph.conf, let's just symlink it.
+  environment.etc."ceph/${cephCluster.name}.conf".source = "/etc/ceph/ceph.conf";
+}
diff --git a/ops/ceph/0000-fix-SPDK-build-env.patch b/ops/ceph/0000-fix-SPDK-build-env.patch
new file mode 100644
index 0000000..a117408
--- /dev/null
+++ b/ops/ceph/0000-fix-SPDK-build-env.patch
@@ -0,0 +1,11 @@
+--- a/cmake/modules/BuildSPDK.cmake
++++ b/cmake/modules/BuildSPDK.cmake
+@@ -35,7 +35,7 @@ macro(build_spdk)
+     # unset $CFLAGS, otherwise it will interfere with how SPDK sets
+     # its include directory.
+     # unset $LDFLAGS, otherwise SPDK will fail to mock some functions.
++    BUILD_COMMAND env -i PATH=$ENV{PATH} CC=${CMAKE_C_COMPILER} ${make_cmd} EXTRA_CFLAGS="${spdk_CFLAGS}" C_OPT="-mssse3"
+     INSTALL_COMMAND "true")
+   unset(make_cmd)
diff --git a/ops/ceph/COPYING b/ops/ceph/COPYING
new file mode 100644
index 0000000..fe46c6a
--- /dev/null
+++ b/ops/ceph/COPYING
@@ -0,0 +1,20 @@
+Copyright (c) 2003-2021 Eelco Dolstra and the Nixpkgs/NixOS contributors
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+The above copyright notice and this permission notice shall be
+included in all copies or substantial portions of the Software.
diff --git a/ops/ceph/ b/ops/ceph/
new file mode 100644
index 0000000..1a25652
--- /dev/null
+++ b/ops/ceph/
@@ -0,0 +1,3 @@
+Ceph 16.2.4 backport from nixpkgs @ 2021-09-10.
+To be removed once nixpkgs on hscloud nodes is bumped past this version being available upstream.
diff --git a/ops/ceph/default.nix b/ops/ceph/default.nix
new file mode 100644
index 0000000..0ccc96c
--- /dev/null
+++ b/ops/ceph/default.nix
@@ -0,0 +1,254 @@
+# Ceph package set, written as a function of its dependencies. Arguments that
+# default to null are optional and may be left out by the caller.
+{ lib, stdenv, runCommand, fetchurl
+, ensureNewerSourcesHook
+, cmake, pkg-config
+, which, git
+, boost
+, libxml2, zlib, lz4
+, openldap, lttng-ust
+, babeltrace, gperf
+, gtest
+, cunit, snappy
+, makeWrapper
+, leveldb, oathToolkit
+, libnl, libcap_ng
+, rdkafka
+, nixosTests
+, cryptsetup
+, sqlite
+, lua
+, icu
+, bzip2
+, doxygen
+, graphviz
+, fmt
+, python3
+# Optional Dependencies
+, yasm ? null, fcgi ? null, expat ? null
+, curl ? null, fuse ? null
+, libedit ? null, libatomic_ops ? null
+, libs3 ? null
+# Mallocs
+, jemalloc ? null, gperftools ? null
+# Crypto Dependencies
+, cryptopp ? null
+, nss ? null, nspr ? null
+# Linux Only Dependencies
+, linuxHeaders, util-linux, libuuid, udev, keyutils, rdma-core, rabbitmq-c
+, libaio ? null, libxfs ? null, zfs ? null, liburing ? null
+, ...
+}:
+# We must have one crypto library
+assert cryptopp != null || (nss != null && nspr != null);
+let
+  # Resolve an optional dependency: yields the package itself when one was
+  # supplied and its meta marks it as available, and null otherwise.
+  shouldUsePkg = pkg:
+    if pkg == null || !pkg.meta.available
+    then null
+    else pkg;
+  # Optional dependencies after availability filtering.
+  optYasm = shouldUsePkg yasm;
+  optFcgi = shouldUsePkg fcgi;
+  optExpat = shouldUsePkg expat;
+  optCurl = shouldUsePkg curl;
+  optFuse = shouldUsePkg fuse;
+  optLibedit = shouldUsePkg libedit;
+  optLibatomic_ops = shouldUsePkg libatomic_ops;
+  optLibs3 = shouldUsePkg libs3;
+  optJemalloc = shouldUsePkg jemalloc;
+  optGperftools = shouldUsePkg gperftools;
+  optCryptopp = shouldUsePkg cryptopp;
+  optNss = shouldUsePkg nss;
+  optNspr = shouldUsePkg nspr;
+  optLibaio = shouldUsePkg libaio;
+  optLibxfs = shouldUsePkg libxfs;
+  optZfs = shouldUsePkg zfs;
+  # True only when fcgi, expat, curl and libedit all resolved to a package.
+  hasRadosgw = !(optFcgi == null || optExpat == null || optCurl == null || optLibedit == null);
+  # Malloc implementation (can be jemalloc, tcmalloc or null)
+  malloc = if optJemalloc == null then optGperftools else optJemalloc;
+  # We prefer nss over cryptopp
+  cryptoStr =
+    if optNss != null && optNspr != null then "nss"
+    else if optCryptopp != null then "cryptopp"
+    else "none";
+  # Build inputs contributed by each possible value of cryptoStr.
+  cryptoLibsMap = {
+    none = [ ];
+    cryptopp = [ optCryptopp ];
+    nss = [ optNss optNspr ];
+  };
+  # Build the meta attribute set shared by the packages defined in this file;
+  # only the description differs between them.
+  getMeta = description: with lib; {
+    homepage = "https://ceph.io/";
+    inherit description;
+    license = with licenses; [ lgpl21 gpl2 bsd3 mit publicDomain ];
+    maintainers = with maintainers; [ adev ak johanot krav ];
+    platforms = [ "x86_64-linux" "aarch64-linux" ];
+  };
+  # Python package built from the src/python-common subdirectory of the Ceph
+  # source tree, using the same src and version as the main package.
+  ceph-common =
+    let
+      pyPkgs = python.pkgs;
+    in pyPkgs.buildPythonPackage {
+      pname = "ceph-common";
+      inherit src version;
+      sourceRoot = "ceph-${version}/src/python-common";
+      checkInputs = [ pyPkgs.pytest ];
+      propagatedBuildInputs = [ pyPkgs.pyyaml pyPkgs.six ];
+      meta = getMeta "Ceph common module for code shared by manager modules";
+    };
+  # Python interpreter used by the rest of this file: python3 with its scipy
+  # package overridden to version 1.3.3.
+  python = python3.override {
+    packageOverrides = self: super: {
+      # scipy > 1.3 breaks diskprediction_local, leading to mgr hang on startup
+      # Bump once these issues are resolved.
+      # NOTE(review): the issue references that belong with this comment are
+      # missing here - restore them.
+      scipy = super.scipy.overridePythonAttrs (oldAttrs: rec {
+        version = "1.3.3";
+        src = oldAttrs.src.override {
+          inherit version;
+          sha256 = "02iqb7ws7fw5fd1a83hx705pzrw1imj7z0bphjsl4bfvw254xgv4";
+        };
+        # The pinned scipy is built without running its checks.
+        doCheck = false;
+      });
+    };
+  };
+  # Python environment built from the overridden interpreter above. It is a
+  # build input of the ceph derivation and is also put on ceph-mgr's
+  # PYTHONPATH there.
+  ceph-python-env = python.withPackages (ps: [
+    ps.sphinx
+    ps.flask
+    ps.cython
+    ps.setuptools
+    ps.virtualenv
+    # Libraries needed by the python tools
+    ps.Mako
+    # Defined in this file rather than taken from the Python package set.
+    ceph-common
+    ps.cherrypy
+    ps.cmd2
+    ps.colorama
+    ps.python-dateutil
+    ps.jsonpatch
+    ps.pecan
+    ps.prettytable
+    ps.pyopenssl
+    ps.pyjwt
+    ps.webob
+    ps.bcrypt
+    ps.scipy
+    ps.six
+    ps.pyyaml
+  ]);
+  # Relative site-packages path of the interpreter inside that environment;
+  # used below to build PYTHONPATH entries and install locations.
+  sitePackages = ceph-python-env.python.sitePackages;
+  # Ceph release being packaged; shared by every derivation in this file.
+  version = "16.2.4";
+  # Release tarball. fetchurl needs an absolute URL; the tarball is expected
+  # to unpack to ceph-${version}/, which sourceRoot of ceph-common relies on.
+  src = fetchurl {
+    url = "https://download.ceph.com/tarballs/ceph-${version}.tar.gz";
+    sha256 = "sha256-J6FVK7feNN8cGO5BSDlfRGACAzchmRUSWR+a4ZgeWy0=";
+  };
+in rec {
+  # The main Ceph derivation, built with cmake from the release tarball with
+  # the local SPDK build patch applied.
+  ceph = stdenv.mkDerivation {
+    pname = "ceph";
+    inherit src version;
+    patches = [
+      ./0000-fix-SPDK-build-env.patch
+    ];
+    nativeBuildInputs = [
+      cmake
+      pkg-config which git python.pkgs.wrapPython makeWrapper
+      python.pkgs.python # for the toPythonPath function
+      (ensureNewerSourcesHook { year = "1980"; })
+      python
+      fmt
+      # for building docs/man-pages presumably
+      doxygen
+      graphviz
+    ];
+    # The crypto libraries come first, selected by cryptoStr; Linux-only and
+    # radosgw-only inputs are appended conditionally.
+    buildInputs = cryptoLibsMap.${cryptoStr} ++ [
+      boost ceph-python-env libxml2 optYasm optLibatomic_ops optLibs3
+      malloc zlib openldap lttng-ust babeltrace gperf gtest cunit
+      snappy lz4 oathToolkit leveldb libnl libcap_ng rdkafka
+      cryptsetup sqlite lua icu bzip2
+    ] ++ lib.optionals stdenv.isLinux [
+      linuxHeaders util-linux libuuid udev keyutils liburing optLibaio optLibxfs optZfs
+      # ceph 14
+      rdma-core rabbitmq-c
+    ] ++ lib.optionals hasRadosgw [
+      optFcgi optExpat optCurl optFuse optLibedit
+    ];
+    pythonPath = [ ceph-python-env "${placeholder "out"}/${ceph-python-env.sitePackages}" ];
+    # Replaces absolute tool paths in src/common/module.c with bare command
+    # names and exports the library/Python paths the build relies on.
+    preConfigure =''
+      substituteInPlace src/common/module.c --replace "/sbin/modinfo"  "modinfo"
+      substituteInPlace src/common/module.c --replace "/sbin/modprobe" "modprobe"
+      substituteInPlace src/common/module.c --replace "/bin/grep" "grep"
+      # for pybind/rgw to find internal dep
+      export LD_LIBRARY_PATH="$PWD/build/lib''${LD_LIBRARY_PATH:+:}$LD_LIBRARY_PATH"
+      # install target needs to be in PYTHONPATH for "*.pth support" check to succeed
+      # set PYTHONPATH, so the build system doesn't silently skip installing ceph-volume and others
+      export PYTHONPATH=${ceph-python-env}/${sitePackages}:$lib/${sitePackages}:$out/${sitePackages}
+      patchShebangs src/script src/spdk src/test src/tools
+    '';
+    cmakeFlags = [
+      "-DWITH_SYSTEM_ROCKSDB=OFF"  # breaks Bluestore
+      "-DCMAKE_INSTALL_DATADIR=${placeholder "lib"}/lib"
+      "-DMGR_PYTHON_VERSION=${ceph-python-env.python.pythonVersion}"
+      # TODO breaks with sandbox, tries to download stuff with npm
+      # NOTE(review): no flag follows the TODO above in this list - confirm
+      # whether a flag disabling the npm-based part of the build is missing.
+      # WITH_XFS has been set default ON from Ceph 16, keeping it optional in nixpkgs for now
+      ''-DWITH_XFS=${if optLibxfs != null then "ON" else "OFF"}''
+    ] ++ lib.optional stdenv.isLinux "-DWITH_SYSTEM_LIBURING=ON";
+    postFixup = ''
+      wrapPythonPrograms
+      wrapProgram $out/bin/ceph-mgr --prefix PYTHONPATH ":" "$(toPythonPath ${placeholder "out"}):$(toPythonPath ${ceph-python-env})"
+      # Test that ceph-volume exists since the build system has a tendency to
+      # silently drop it with misconfigurations.
+      test -f $out/bin/ceph-volume
+    '';
+    outputs = [ "out" "lib" "dev" "doc" "man" ];
+    doCheck = false; # uses pip to install things from the internet
+    # Takes 7+h to build with 2 cores.
+    requiredSystemFeatures = [ "big-parallel" ];
+    meta = getMeta "Distributed storage system";
+    passthru.version = version;
+    passthru.tests = { inherit (nixosTests) ceph-single-node ceph-multi-node ceph-single-node-bluestore; };
+  };
+  # Client-only subset of the ceph derivation: copies the client binaries,
+  # the Python site-packages and the bash completions out of ${ceph}, then
+  # rewrites the store paths embedded in the ceph wrapper scripts.
+  ceph-client = runCommand "ceph-client-${version}" {
+      meta = getMeta "Tools needed to mount Ceph's RADOS Block Devices";
+    } ''
+      mkdir -p $out/{bin,etc,${sitePackages},share/bash-completion/completions}
+      cp -r ${ceph}/bin/{ceph,.ceph-wrapped,rados,rbd,rbdmap} $out/bin
+      cp -r ${ceph}/bin/ceph-{authtool,conf,dencoder,rbdnamer,syn} $out/bin
+      cp -r ${ceph}/bin/rbd-replay* $out/bin
+      cp -r ${ceph}/${sitePackages} $out/${sitePackages}
+      cp -r ${ceph}/etc/bash_completion.d $out/share/bash-completion/completions
+      # wrapPythonPrograms modifies .ceph-wrapped, so lets just update its paths
+      substituteInPlace $out/bin/ceph          --replace ${ceph} $out
+      substituteInPlace $out/bin/.ceph-wrapped --replace ${ceph} $out
+   '';
+}
diff --git a/ops/machines.nix b/ops/machines.nix
index 5401e30..9a54c56 100644
--- a/ops/machines.nix
+++ b/ops/machines.nix
@@ -19,7 +19,28 @@
     repo = "nixpkgs-channels";
     rev = "44ad80ab1036c5cc83ada4bfa451dac9939f2a10";
     sha256 = "1b61nzvy0d46cspy07szkc0rggacxiqg9v1py27pkqpj7rvawfsk";
-  }) {};
+  }) {
+    overlays = [
+      (self: super: rec {
+        # Use a newer version of Ceph (16, Pacific, EOL 2023-06-01) than in
+        # this nixpkgs (15, Octopus, EOL 2022-06-01).
+        #
+        # This is to:
+        #  1. Fix a bug in which ceph-volume lvm create fails due to a rocksdb
+        #     mismatch.
+        #     NOTE(review): the upstream bug link that belonged here is missing.
+        #  2. At the time of deployment not start out with an ancient version
+        #     of Ceph.
+        #
+        # Once we unpin nixpkgsCluster past a version that contains this Ceph,
+        # this can be unoverlayed.
+        inherit (super.callPackages ./ceph {
+          boost = super.boost17x.override { enablePython = true; python = super.python3; };
+          lua = super.lua5_4;
+        }) ceph ceph-client;
+        # `ceph` here is the attribute inherited just above (the set is rec).
+        ceph-lib = ceph.lib;
+      })
+    ];
+  };
   # edge01 still lives on an old nixpkgs checkout.
@@ -44,6 +65,7 @@
     imports = [
+      ../cluster/nix/modules/ceph.nix