| # This runs Ceph on hscloud cluster(s). |
| # |
| # This lightly wraps the upstream NixOS ceph module, which is already fairly light. |
| # |
| # Most importantly, it does _not_ attempt to do any cluster |
# bootstrapping/maintenance. This means that any configuration action that
| # does the following: |
| # 0. Bringing up a cluster |
| # 1. Adding/removing Mons |
| # 2. Changing a Mon IP address |
| # 3. Adding/removing OSDs |
| # ... must be done in tandem with manual operations on the affected nodes. For |
# example, bootstrapping a cluster will involve keyring and monmap management,
| # changing anything with mons will involve monmap management, adding new OSDs |
| # will require provisioning them with ceph-volume, etc. |
| # |
| # This is in stark contrast to a fully-managed solution like rook. Since we |
| # don't have hundreds of clusters, none of the above is automated, especially |
| # as that kind of automation is quite tricky to do reliably. |
| |
| { config, lib, pkgs, machines, ... }: |
| |
| with lib; |
| |
| let |
| cfg = config.hscloud.ceph; |
| |
| allNodes = let |
| list = mapAttrsToList (_: v: v) machines; |
| filtered = filter (m: (m.config ? hscloud.ceph) && (m.config.hscloud.ceph.enable)) list; |
| sorted = sort (a: b: a.config.hscloud.base.fqdn < b.config.hscloud.base.fqdn) filtered; |
| in sorted; |
| |
  # Subset of allNodes that run a mon (and mgr) — used to build monHost /
  # monInitialMembers below.
  monNodes = filter (m: m.config.hscloud.ceph.control.enable) allNodes;
| |
  # Short hostname of the machine currently being evaluated; used as the
  # mon/mgr daemon name and in the rgw daemon name.
  machineName = config.networking.hostName;
| |
| # This NixOS Ceph option fragment is present on every machine that runs a |
| # mon, and basically tells the NixOS machinery to run mons/mgrs if needed on |
| # this machine. |
| cephMonConfig = if cfg.control.enable then { |
| mon = { |
| enable = true; |
| daemons = [ machineName ]; |
| }; |
| mgr = { |
| enable = true; |
| daemons = [ machineName ]; |
| }; |
| } else {}; |
| |
| # Same as for cephMonConfig, but this time for OSDs. |
| cephOsdConfig = if (length cfg.osd.devices) > 0 then { |
| osd = { |
| enable = true; |
| daemons = map (el: "${toString el.id}") cfg.osd.devices; |
| }; |
| rgw = { |
| enable = true; |
| daemons = [ "rook-k0.rgw.${machineName}" ]; |
| }; |
| } else {}; |
| |
| |
| # Merge ceph-volume lvm activate into ceph-osd-ID services. |
| # |
| # This is because the upstream module seems to have been written with |
| # filestore in mind, not bluestore. Filestore is relatively simple: an xfs |
# filesystem is mounted into /var/lib/ceph/osd/$cluster-$id, which in turn
| # contains everything for that OSD to work. |
| # |
| # Bluestore is a bit different. Instead of a normal filesystem being mounted, |
| # Ceph manages a block device fully using LVM (and in our case, dmcrypt). |
| # Every bluestore volume needs to be 'activated' before it can be used by an |
| # OSD. Activation takes care of doing LVM and dmcrypt mounts, and prepares |
| # the /var/lib/ceph/osd/$cluster-$id directory as if a filestore was present |
| # there. However, instead of this being a diskmount, it's instead a tmpfs |
| # into which a bunch of files are dropped, loaded from the LVM raw device. |
| # |
| # To make the upstream NixOS module OSD work with bluestore, we do the following: |
| # 1. Change ConditionPathExists from the OSD mount into a /dev/disk/by-id |
| # path. This gates the service on that device being present. |
| # 2. Inject an ExecStartPre which runs ceph-volume lvm activate, if needed. |
| # 3. Add lvm/cryptsetup to the PATH of the service (as used by ceph-volume, |
| # which seems to look for them on PATH instead of being properly |
| # nixified). |
| # |
| # We also inject smartmontools into PATH for smartctl, which allows the OSD |
| # to monitor device health. |
| osdActivateServices = listToAttrs (map (el: let |
| osdId = toString el.id; |
| osdUuid = el.uuid; |
| diskPath = el.path; |
| in { |
| name = "ceph-osd-${osdId}"; |
| value = { |
| path = with pkgs; [ |
| lvm2 |
| cryptsetup |
| smartmontools |
| ]; |
| serviceConfig = { |
| ExecStartPre = lib.mkForce [ |
| ("+" + (toString (pkgs.writeScript "ceph-osd-${osdId}-activate.sh" '' |
| #!/bin/sh |
| set -e |
| dir="/var/lib/ceph/osd/${cfg.name}-${osdId}/" |
| disk="${el.path}" |
| uuid="${osdUuid}" |
| if [ -d "$dir" ] && [ -f "$dir"/keyring ]; then |
| echo "Volume $dir already activated, skipping..." |
| else |
| echo "Activating $dir with $disk, uuid $uuid..." |
| ${pkgs.ceph}/bin/ceph-volume lvm activate --bluestore --no-systemd ${osdId} $uuid |
| fi |
| |
| ''))) |
| |
| "${pkgs.ceph.lib}/libexec/ceph/ceph-osd-prestart.sh --id ${osdId} --cluster ${cfg.name}" |
| ]; |
| }; |
| unitConfig = { |
| ConditionPathExists = lib.mkForce el.path; |
| }; |
| }; |
| }) cfg.osd.devices); |
| |
| in rec { |
  options = {
    hscloud.ceph = {
      enable = mkOption {
        type = types.bool;
        description = "Enable Ceph storage cluster (native NixOS), not rook.";
        # Defaults to on whenever this machine is configured to run any Ceph
        # daemon: mon/mgr (control.enable) or at least one OSD device.
        default = ((length cfg.osd.devices) > 0) || cfg.control.enable;
      };
      name = mkOption {
        type = types.str;
        description = "Short identifier of cluster.";
      };
      fsid = mkOption {
        type = types.str;
        description = "UUID of cluster, as generated by first mon.";
      };
      # Set on machines that should run mon and mgr daemons.
      control = {
        enable = mkEnableOption "mon and mgr on this host";
      };
      # Block devices backing OSDs on this machine. These must already be
      # provisioned with ceph-volume (see the header comment of this file).
      osd = {
        devices = mkOption {
          type = types.listOf (types.submodule {
            options = {
              id = mkOption {
                description = "Numeric ID of OSD.";
                type = types.int;
              };
              path = mkOption {
                description = "Path to underlying block device for OSD storage.";
                type = types.str;
              };
              uuid = mkOption {
                description = "UUID of generated OSD storage.";
                type = types.str;
              };
            };
          });
          default = [];
        };
      };
    };
  };
  config = mkIf cfg.enable {
    services.ceph = {
      enable = cfg.control.enable || (length cfg.osd.devices) > 0;
      global = {
        fsid = cfg.fsid;
        clusterName = cfg.name;

        # Every Ceph node always attempts to connect to all mons.
        monHost = concatStringsSep "," (map (n: n.config.hscloud.base.ipAddr) monNodes);
        monInitialMembers = concatStringsSep "," (map (n: n.config.networking.hostName) monNodes);
      };
    # NOTE: shallow '//' merge is deliberate — cephMonConfig/cephOsdConfig
    # contribute disjoint top-level keys (mon/mgr vs osd/rgw).
    } // cephMonConfig // cephOsdConfig;

    # Tools for operators to manage the cluster by hand on the node.
    environment.systemPackages = with pkgs; [
      ceph cryptsetup smartmontools
    ];

    systemd.services = osdActivateServices;

    # Hack - the upstream ceph module should generate ${clusterName}.conf instead
    # of ceph.conf, let's just symlink it.
    environment.etc."ceph/${cfg.name}.conf".source = "/etc/ceph/ceph.conf";
  };
| } |