Serge Bazanski | 55a486a | 2022-06-11 18:27:01 +0000 | [diff] [blame] | 1 | # This runs Ceph on hscloud cluster(s). |
| 2 | # |
| 3 | # This lightly wraps the upstream NixOS ceph module, which is already fairly light. |
| 4 | # |
| 5 | # Most importantly, it does _not_ attempt to do any cluster |
# bootstrapping/maintenance. This means that any configuration action that
| 7 | # does the following: |
| 8 | # 0. Bringing up a cluster |
| 9 | # 1. Adding/removing Mons |
| 10 | # 2. Changing a Mon IP address |
| 11 | # 3. Adding/removing OSDs |
| 12 | # ... must be done in tandem with manual operations on the affected nodes. For |
| 13 | # example, bootstrapping a cluster will involve keychain and monmap management, |
| 14 | # changing anything with mons will involve monmap management, adding new OSDs |
| 15 | # will require provisioning them with ceph-volume, etc. |
| 16 | # |
| 17 | # This is in stark contrast to a fully-managed solution like rook. Since we |
| 18 | # don't have hundreds of clusters, none of the above is automated, especially |
| 19 | # as that kind of automation is quite tricky to do reliably. |
| 20 | |
| 21 | { config, lib, pkgs, machines, ... }: |
| 22 | |
| 23 | with lib; |
| 24 | |
let
  cfg = config.hscloud.ceph;

  # Every machine in the deployment that opts into this Ceph module,
  # ordered deterministically by FQDN so downstream lists are stable
  # across evaluations.
  allNodes =
    sort (a: b: a.config.hscloud.base.fqdn < b.config.hscloud.base.fqdn)
      (filter (m: (m.config ? hscloud.ceph) && m.config.hscloud.ceph.enable)
        (attrValues machines));

  # The subset of allNodes that run control daemons (mon/mgr).
  monNodes = filter (m: m.config.hscloud.ceph.control.enable) allNodes;

  # Short hostname of the machine currently being evaluated.
  machineName = config.networking.hostName;
| 37 | |
| 38 | # This NixOS Ceph option fragment is present on every machine that runs a |
| 39 | # mon, and basically tells the NixOS machinery to run mons/mgrs if needed on |
| 40 | # this machine. |
| 41 | cephMonConfig = if cfg.control.enable then { |
| 42 | mon = { |
| 43 | enable = true; |
| 44 | daemons = [ machineName ]; |
| 45 | }; |
| 46 | mgr = { |
| 47 | enable = true; |
| 48 | daemons = [ machineName ]; |
| 49 | }; |
| 50 | } else {}; |
| 51 | |
| 52 | # Same as for cephMonConfig, but this time for OSDs. |
| 53 | cephOsdConfig = if (length cfg.osd.devices) > 0 then { |
| 54 | osd = { |
| 55 | enable = true; |
| 56 | daemons = map (el: "${toString el.id}") cfg.osd.devices; |
| 57 | }; |
| 58 | rgw = { |
| 59 | enable = true; |
| 60 | daemons = [ "rook-k0.rgw.${machineName}" ]; |
| 61 | }; |
| 62 | } else {}; |
| 63 | |
| 64 | |
| 65 | # Merge ceph-volume lvm activate into ceph-osd-ID services. |
| 66 | # |
| 67 | # This is because the upstream module seems to have been written with |
| 68 | # filestore in mind, not bluestore. Filestore is relatively simple: an xfs |
| 69 | # filesystem is mounted into /var/lib/caph/osd/$cluster-$id, that in turn |
| 70 | # contains everything for that OSD to work. |
| 71 | # |
| 72 | # Bluestore is a bit different. Instead of a normal filesystem being mounted, |
| 73 | # Ceph manages a block device fully using LVM (and in our case, dmcrypt). |
| 74 | # Every bluestore volume needs to be 'activated' before it can be used by an |
| 75 | # OSD. Activation takes care of doing LVM and dmcrypt mounts, and prepares |
| 76 | # the /var/lib/ceph/osd/$cluster-$id directory as if a filestore was present |
| 77 | # there. However, instead of this being a diskmount, it's instead a tmpfs |
| 78 | # into which a bunch of files are dropped, loaded from the LVM raw device. |
| 79 | # |
| 80 | # To make the upstream NixOS module OSD work with bluestore, we do the following: |
| 81 | # 1. Change ConditionPathExists from the OSD mount into a /dev/disk/by-id |
| 82 | # path. This gates the service on that device being present. |
| 83 | # 2. Inject an ExecStartPre which runs ceph-volume lvm activate, if needed. |
| 84 | # 3. Add lvm/cryptsetup to the PATH of the service (as used by ceph-volume, |
| 85 | # which seems to look for them on PATH instead of being properly |
| 86 | # nixified). |
| 87 | # |
| 88 | # We also inject smartmontools into PATH for smartctl, which allows the OSD |
| 89 | # to monitor device health. |
| 90 | osdActivateServices = listToAttrs (map (el: let |
| 91 | osdId = toString el.id; |
| 92 | osdUuid = el.uuid; |
| 93 | diskPath = el.path; |
| 94 | in { |
| 95 | name = "ceph-osd-${osdId}"; |
| 96 | value = { |
| 97 | path = with pkgs; [ |
| 98 | lvm2 |
| 99 | cryptsetup |
| 100 | smartmontools |
| 101 | ]; |
| 102 | serviceConfig = { |
| 103 | ExecStartPre = lib.mkForce [ |
| 104 | ("+" + (toString (pkgs.writeScript "ceph-osd-${osdId}-activate.sh" '' |
| 105 | #!/bin/sh |
| 106 | set -e |
| 107 | dir="/var/lib/ceph/osd/${cfg.name}-${osdId}/" |
| 108 | disk="${el.path}" |
| 109 | uuid="${osdUuid}" |
| 110 | if [ -d "$dir" ] && [ -f "$dir"/keyring ]; then |
| 111 | echo "Volume $dir already activated, skipping..." |
| 112 | else |
| 113 | echo "Activating $dir with $disk, uuid $uuid..." |
| 114 | ${pkgs.ceph}/bin/ceph-volume lvm activate --bluestore --no-systemd ${osdId} $uuid |
| 115 | fi |
| 116 | |
| 117 | ''))) |
| 118 | |
| 119 | "${pkgs.ceph.lib}/libexec/ceph/ceph-osd-prestart.sh --id ${osdId} --cluster ${cfg.name}" |
| 120 | ]; |
| 121 | }; |
| 122 | unitConfig = { |
| 123 | ConditionPathExists = lib.mkForce el.path; |
| 124 | }; |
| 125 | }; |
| 126 | }) cfg.osd.devices); |
| 127 | |
# NOTE: deliberately not `rec` - this set defines attributes named `options`
# and `config`, which under `rec` would shadow the module arguments of the
# same names for any reference made inside the set. Nothing here needs
# self-reference, so plain attrset it is.
in {
  options = {
    hscloud.ceph = {
      enable = mkOption {
        type = types.bool;
        description = "Enable Ceph storage cluster (native NixOS), not rook.";
        # Any machine with a Ceph role (control plane or OSDs) is enabled
        # automatically; explicit setting is only needed to override that.
        default = ((length cfg.osd.devices) > 0) || cfg.control.enable;
      };
      name = mkOption {
        type = types.str;
        description = "Short identifier of cluster.";
      };
      fsid = mkOption {
        type = types.str;
        description = "UUID of cluster, as generated by first mon.";
      };
      control = {
        enable = mkEnableOption "mon and mgr on this host";
      };
      osd = {
        devices = mkOption {
          type = types.listOf (types.submodule {
            options = {
              id = mkOption {
                description = "Numeric ID of OSD.";
                type = types.int;
              };
              path = mkOption {
                description = "Path to underlying block device for OSD storage.";
                type = types.str;
              };
              uuid = mkOption {
                description = "UUID of generated OSD storage.";
                type = types.str;
              };
            };
          });
          default = [];
        };
      };
    };
  };
  # Only takes effect when this machine has at least one Ceph role.
  config = mkIf cfg.enable {
    services.ceph = {
      enable = cfg.control.enable || (length cfg.osd.devices) > 0;
      global = {
        fsid = cfg.fsid;
        clusterName = cfg.name;

        # Every Ceph node always attempts to connect to all mons.
        monHost = concatStringsSep "," (map (n: n.config.hscloud.base.ipAddr) monNodes);
        monInitialMembers = concatStringsSep "," (map (n: n.config.networking.hostName) monNodes);
      };
    # Plain attrset merge (//) is safe here: the key sets are disjoint
    # (enable/global vs mon/mgr vs osd/rgw), so nothing gets clobbered.
    } // cephMonConfig // cephOsdConfig;

    # Operator tooling for anyone logged into the machine.
    environment.systemPackages = with pkgs; [
      ceph cryptsetup smartmontools
    ];

    # Per-OSD service overrides (bluestore activation); defined above.
    systemd.services = osdActivateServices;

    # Hack - the upstream ceph module should generate ${clusterName}.conf instead
    # of ceph.conf, let's just symlink it.
    environment.etc."ceph/${cfg.name}.conf".source = "/etc/ceph/ceph.conf";
  };
}