| # This runs Ceph on hscloud cluster(s). |
| # |
| # This lightly wraps the upstream NixOS ceph module, which is already fairly light. |
| # |
| # Most importantly, it does _not_ attempt to do any cluster |
# bootstrapping/maintenance. This means that any configuration action that
# does any of the following:
| # 0. Bringing up a cluster |
| # 1. Adding/removing Mons |
| # 2. Changing a Mon IP address |
| # 3. Adding/removing OSDs |
| # ... must be done in tandem with manual operations on the affected nodes. For |
# example, bootstrapping a cluster will involve keyring and monmap management,
# changing anything about the mons will involve monmap management, adding new
# OSDs will require provisioning them with ceph-volume, etc.
| # |
# This is in stark contrast to a fully-managed solution like Rook. Since we
# don't run hundreds of clusters, none of the above is automated, especially
# as that kind of automation is quite tricky to implement reliably.
| |
| { config, lib, pkgs, ... }: |
| |
| with builtins; |
| with lib; |
| |
| with (( import ../defs-cluster-k0.nix ) config.networking.hostName); |
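
# defs-cluster-k0.nix is not shown here; judging purely by how it is used
# below, it evaluates to something shaped roughly like this (any field not
# referenced in this module is illustrative):
#
#   {
#     cephCluster = {
#       fsid = "<uuid>";
#       name = "<cluster name>";
#       # mon machines, keyed by machine name.
#       mons = { "<hostname>" = { ... }; };
#       # per-machine lists of OSDs, with their block devices and LVM UUIDs.
#       osds = { "<hostname>" = [ { id = 0; path = "/dev/disk/by-id/..."; uuid = "<uuid>"; } ]; };
#     };
#     machinesByName = { "<hostname>" = { ipAddr = "<ip>"; }; };
#   }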
| |
| let |
| machineName = config.networking.hostName; |
| isMon = hasAttr machineName cephCluster.mons; |
| isOsd = hasAttr machineName cephCluster.osds; |
| hasCeph = isMon || isOsd; |
| |
  # This NixOS Ceph option fragment is present on every machine that runs a
  # mon, and tells the NixOS machinery to run a mon and an mgr on this
  # machine.
| cephMonConfig = if isMon then { |
| mon = { |
| enable = true; |
| daemons = [ machineName ]; |
| }; |
| mgr = { |
| enable = true; |
| daemons = [ machineName ]; |
| }; |
| } else {}; |
| |
  # Same as cephMonConfig, but for OSD machines: it enables this machine's
  # OSD daemons and an RGW instance alongside them.
| cephOsdConfig = if isOsd then { |
| osd = { |
| enable = true; |
      daemons = map (el: toString el.id) cephCluster.osds.${machineName};
| }; |
| rgw = { |
| enable = true; |
| daemons = [ "rook-k0.rgw.${machineName}" ]; |
| }; |
| } else {}; |
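
  # Naming note (inferred from the osdActivateServices overlay below): the
  # upstream module materializes each .daemons entry as a systemd unit, e.g.
  # ceph-mon-<name>.service for mons and ceph-osd-<id>.service for OSDs; the
  # activation glue below depends on the ceph-osd-<id> names matching exactly.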
| |
| # The full option fragment for services.ceph. It contains ceph.conf fragments |
| # (in .global.*) and merges ceph{Mon,Osd}Config. |
| cephConfig = { |
| enable = true; |
| global = { |
| fsid = cephCluster.fsid; |
| clusterName = cephCluster.name; |
| |
| # Every Ceph node always attempts to connect to all mons. |
| monHost = concatStringsSep "," (mapAttrsToList (k: _: machinesByName.${k}.ipAddr) cephCluster.mons); |
| monInitialMembers = concatStringsSep "," (builtins.attrNames cephCluster.mons); |
| }; |
| } // cephMonConfig // cephOsdConfig; |
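
  # For orientation: the upstream module renders .global.* into
  # /etc/ceph/ceph.conf, expanding camelCase option names into the
  # space-separated keys Ceph expects, so the fragment above should end up
  # roughly as (values illustrative):
  #
  #   [global]
  #   fsid = <fsid>
  #   mon host = <ip>,<ip>
  #   mon initial members = <hostname>,<hostname>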
| |
| # Merge ceph-volume lvm activate into ceph-osd-ID services. |
| # |
| # This is because the upstream module seems to have been written with |
  # filestore in mind, not bluestore. Filestore is relatively simple: an XFS
  # filesystem is mounted at /var/lib/ceph/osd/$cluster-$id, which in turn
| # contains everything for that OSD to work. |
| # |
| # Bluestore is a bit different. Instead of a normal filesystem being mounted, |
| # Ceph manages a block device fully using LVM (and in our case, dmcrypt). |
| # Every bluestore volume needs to be 'activated' before it can be used by an |
| # OSD. Activation takes care of doing LVM and dmcrypt mounts, and prepares |
| # the /var/lib/ceph/osd/$cluster-$id directory as if a filestore was present |
  # there. However, instead of being a disk mount, it's a tmpfs into which a
  # bunch of files are dropped, loaded from the raw LVM device.
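  #
  # Once activated, that tmpfs typically contains a `block` symlink to the
  # LVM logical volume plus a few small metadata files (keyring, whoami,
  # type, ...); the activation script below uses the presence of the keyring
  # file to detect whether activation has already happened.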
| # |
| # To make the upstream NixOS module OSD work with bluestore, we do the following: |
  # 1. Change ConditionPathExists from the OSD mount to a /dev/disk/by-id
  #    path. This gates the service on that device being present.
| # 2. Inject an ExecStartPre which runs ceph-volume lvm activate, if needed. |
| # 3. Add lvm/cryptsetup to the PATH of the service (as used by ceph-volume, |
| # which seems to look for them on PATH instead of being properly |
| # nixified). |
| # |
| # We also inject smartmontools into PATH for smartctl, which allows the OSD |
| # to monitor device health. |
| osdActivateServices = listToAttrs (map (el: let |
| osdId = toString el.id; |
| osdUuid = el.uuid; |
| diskPath = el.path; |
| in { |
| name = "ceph-osd-${osdId}"; |
| value = { |
| path = with pkgs; [ |
| lvm2 |
| cryptsetup |
| smartmontools |
| ]; |
| serviceConfig = { |
| ExecStartPre = lib.mkForce [ |
| ("+" + (toString (pkgs.writeScript "ceph-osd-${osdId}-activate.sh" '' |
| #!/bin/sh |
| set -e |
| dir="/var/lib/ceph/osd/${cephCluster.name}-${osdId}/" |
| disk="${el.path}" |
| uuid="${osdUuid}" |
| if [ -d "$dir" ] && [ -f "$dir"/keyring ]; then |
| echo "Volume $dir already activated, skipping..." |
| else |
| echo "Activating $dir with $disk, uuid $uuid..." |
| ${pkgs.ceph}/bin/ceph-volume lvm activate --bluestore --no-systemd ${osdId} $uuid |
| fi |
| |
| ''))) |
| |
| "${pkgs.ceph.lib}/libexec/ceph/ceph-osd-prestart.sh --id ${osdId} --cluster ${cephCluster.name}" |
| ]; |
| }; |
| unitConfig = { |
        ConditionPathExists = lib.mkForce diskPath;
| }; |
| }; |
| }) (if isOsd then cephCluster.osds.${machineName} else [])); |
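
  # The net effect for, e.g., OSD 0 amounts to a drop-in on the
  # upstream-generated ceph-osd-0.service along the lines of (paths
  # illustrative):
  #
  #   [Unit]
  #   ConditionPathExists=/dev/disk/by-id/...
  #   [Service]
  #   ExecStartPre=+/nix/store/...-ceph-osd-0-activate.sh
  #   ExecStartPre=/nix/store/...-ceph-osd-prestart.sh --id 0 --cluster <name>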
| |
in {
| services.ceph = if hasCeph then cephConfig else {}; |
| |
| environment.systemPackages = with pkgs; [ |
| ceph cryptsetup smartmontools |
| ]; |
| |
| systemd.services = osdActivateServices; |
| |
  # Hack - the upstream ceph module always generates ceph.conf, but daemons
  # and tools invoked with --cluster expect ${clusterName}.conf, so just
  # symlink it.
| environment.etc."ceph/${cephCluster.name}.conf".source = "/etc/ceph/ceph.conf"; |
| } |