Serge Bazanski | 9848e7e | 2021-09-10 22:30:56 +0000 | [diff] [blame] | 1 | # This runs Ceph on hscloud cluster(s). |
| 2 | # |
| 3 | # This lightly wraps the upstream NixOS ceph module, which is already fairly light. |
| 4 | # |
| 5 | # Most importantly, it does _not_ attempt to do any cluster |
| 6 | # bootstrapping/maintenance. This means, that any configuration action that |
| 7 | # does the following: |
| 8 | # 0. Bringing up a cluster |
| 9 | # 1. Adding/removing Mons |
| 10 | # 2. Changing a Mon IP address |
| 11 | # 3. Adding/removing OSDs |
| 12 | # ... must be done in tandem with manual operations on the affected nodes. For |
| 13 | # example, bootstrapping a cluster will involve keychain and monmap management, |
| 14 | # changing anything with mons will involve monmap management, adding new OSDs |
| 15 | # will require provisioning them with ceph-volume, etc. |
| 16 | # |
| 17 | # This is in stark contrast to a fully-managed solution like rook. Since we |
| 18 | # don't have hundreds of clusters, none of the above is automated, especially |
| 19 | # as that kind of automation is quite tricky to do reliably. |
| 20 | |
| 21 | { config, lib, pkgs, ... }: |
| 22 | |
| 23 | with builtins; |
| 24 | with lib; |
| 25 | |
| 26 | with (( import ../defs-cluster-k0.nix ) config.networking.hostName); |
| 27 | |
let
  # Hostname of the machine currently being evaluated; used to look up this
  # machine's roles in the cluster definition.
  machineName = config.networking.hostName;
  # A machine is a mon and/or an OSD iff it appears in the respective
  # per-role attrsets of the cluster definition.
  isMon = elem machineName (attrNames cephCluster.mons);
  isOsd = elem machineName (attrNames cephCluster.osds);
  hasCeph = isMon || isOsd;
| 33 | |
| 34 | # This NixOS Ceph option fragment is present on every machine that runs a |
| 35 | # mon, and basically tells the NixOS machinery to run mons/mgrs if needed on |
| 36 | # this machine. |
| 37 | cephMonConfig = if isMon then { |
| 38 | mon = { |
| 39 | enable = true; |
| 40 | daemons = [ machineName ]; |
| 41 | }; |
| 42 | mgr = { |
| 43 | enable = true; |
| 44 | daemons = [ machineName ]; |
| 45 | }; |
| 46 | } else {}; |
| 47 | |
| 48 | # Same as for cephMonConfig, but this time for OSDs. |
| 49 | cephOsdConfig = if isOsd then { |
| 50 | osd = { |
| 51 | enable = true; |
| 52 | daemons = map (el: "${toString el.id}") cephCluster.osds.${machineName}; |
| 53 | }; |
| 54 | } else {}; |
| 55 | |
| 56 | # The full option fragment for services.ceph. It contains ceph.conf fragments |
| 57 | # (in .global.*) and merges ceph{Mon,Osd}Config. |
| 58 | cephConfig = { |
| 59 | enable = true; |
| 60 | global = { |
| 61 | fsid = cephCluster.fsid; |
| 62 | clusterName = cephCluster.name; |
| 63 | |
| 64 | # Every Ceph node always attempts to connect to all mons. |
| 65 | monHost = concatStringsSep "," (mapAttrsToList (k: _: machinesByName.${k}.ipAddr) cephCluster.mons); |
| 66 | monInitialMembers = concatStringsSep "," (builtins.attrNames cephCluster.mons); |
| 67 | }; |
| 68 | } // cephMonConfig // cephOsdConfig; |
| 69 | |
| 70 | # Merge ceph-volume lvm activate into ceph-osd-ID services. |
| 71 | # |
| 72 | # This is because the upstream module seems to have been written with |
| 73 | # filestore in mind, not bluestore. Filestore is relatively simple: an xfs |
| 74 | # filesystem is mounted into /var/lib/caph/osd/$cluster-$id, that in turn |
| 75 | # contains everything for that OSD to work. |
| 76 | # |
| 77 | # Bluestore is a bit different. Instead of a normal filesystem being mounted, |
| 78 | # Ceph manages a block device fully using LVM (and in our case, dmcrypt). |
| 79 | # Every bluestore volume needs to be 'activated' before it can be used by an |
| 80 | # OSD. Activation takes care of doing LVM and dmcrypt mounts, and prepares |
| 81 | # the /var/lib/ceph/osd/$cluster-$id directory as if a filestore was present |
| 82 | # there. However, instead of this being a diskmount, it's instead a tmpfs |
| 83 | # into which a bunch of files are dropped, loaded from the LVM raw device. |
| 84 | # |
| 85 | # To make the upstream NixOS module OSD work with bluestore, we do the following: |
| 86 | # 1. Change ConditionPathExists from the OSD mount into a /dev/disk/by-id |
| 87 | # path. This gates the service on that device being present. |
| 88 | # 2. Inject an ExecStartPre which runs ceph-volume lvm activate, if needed. |
| 89 | # 3. Add lvm/cryptsetup to the PATH of the service (as used by ceph-volume, |
| 90 | # which seems to look for them on PATH instead of being properly |
| 91 | # nixified). |
| 92 | # |
| 93 | # We also inject smartmontools into PATH for smartctl, which allows the OSD |
| 94 | # to monitor device health. |
| 95 | osdActivateServices = listToAttrs (map (el: let |
| 96 | osdId = toString el.id; |
| 97 | osdUuid = el.uuid; |
| 98 | diskPath = el.path; |
| 99 | in { |
| 100 | name = "ceph-osd-${osdId}"; |
| 101 | value = { |
| 102 | path = with pkgs; [ |
| 103 | lvm2 |
| 104 | cryptsetup |
| 105 | smartmontools |
| 106 | ]; |
| 107 | serviceConfig = { |
| 108 | ExecStartPre = lib.mkForce [ |
| 109 | ("+" + (toString (pkgs.writeScript "ceph-osd-${osdId}-activate.sh" '' |
| 110 | #!/bin/sh |
| 111 | set -e |
| 112 | dir="/var/lib/ceph/osd/${cephCluster.name}-${osdId}/" |
| 113 | disk="${el.path}" |
| 114 | uuid="${osdUuid}" |
| 115 | if [ -d "$dir" ] && [ -f "$dir"/keyring ]; then |
| 116 | echo "Volume $dir already activated, skipping..." |
| 117 | else |
| 118 | echo "Activating $dir with $disk, uuid $uuid..." |
| 119 | ${pkgs.ceph}/bin/ceph-volume lvm activate --bluestore --no-systemd ${osdId} $uuid |
| 120 | fi |
| 121 | |
| 122 | ''))) |
| 123 | |
| 124 | "${pkgs.ceph.lib}/libexec/ceph/ceph-osd-prestart.sh --id ${osdId} --cluster ${cephCluster.name}" |
| 125 | ]; |
| 126 | }; |
| 127 | unitConfig = { |
| 128 | ConditionPathExists = lib.mkForce el.path; |
| 129 | }; |
| 130 | }; |
| 131 | }) (if isOsd then cephCluster.osds.${machineName} else [])); |
| 132 | |
in {
  # Only machines that are part of the cluster get the Ceph daemon
  # configuration; everyone else gets an empty services.ceph (disabled).
  services.ceph = if hasCeph then cephConfig else {};

  # Admin/debug tooling, installed on every machine covered by this module.
  environment.systemPackages = with pkgs; [
    ceph cryptsetup smartmontools
  ];

  systemd.services = osdActivateServices;

  # Hack - the upstream ceph module should generate ${clusterName}.conf instead
  # of ceph.conf, let's just symlink it.
  environment.etc."ceph/${cephCluster.name}.conf".source = "/etc/ceph/ceph.conf";
}