# This runs Ceph on hscloud cluster(s).
#
# This lightly wraps the upstream NixOS ceph module, which is already fairly light.
#
# Most importantly, it does _not_ attempt to do any cluster
# bootstrapping/maintenance. This means that any configuration action that
# does the following:
#  0. Bringing up a cluster
#  1. Adding/removing Mons
#  2. Changing a Mon IP address
#  3. Adding/removing OSDs
# ... must be done in tandem with manual operations on the affected nodes (see
# the example commands below). For example, bootstrapping a cluster will
# involve keyring and monmap management, changing anything about mons will
# involve monmap management, adding new OSDs will require provisioning them
# with ceph-volume, etc.
#
# This is in stark contrast to a fully-managed solution like rook. Since we
# don't have hundreds of clusters, none of the above is automated, especially
# as that kind of automation is quite tricky to do reliably.
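#
# As an illustration (not a runbook), the manual side of such operations is
# plain upstream Ceph tooling run on the affected node. For instance, a new
# bluestore OSD on an encrypted disk would be prepared with something along
# the lines of the following (the device path is a placeholder):
#
#   ceph-volume lvm prepare --bluestore --dmcrypt --data /dev/disk/by-id/SOME-DISK
#
# and its id/uuid (as reported by `ceph-volume lvm list`) would then
# presumably be recorded in the cluster definition consumed below.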

{ config, lib, pkgs, ... }:

with builtins;
with lib;

with (( import ../defs-cluster-k0.nix ) config.networking.hostName);
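
# A rough sketch of the shape this module assumes for the values pulled in
# from defs-cluster-k0.nix, inferred from how they are used below (not a
# definitive schema; machine names and values are placeholders):
#
#   cephCluster = {
#     name = "...";  # cluster name, also used for /etc/ceph/<name>.conf below
#     fsid = "...";  # cluster fsid (UUID)
#     mons = { "<machine>" = { ... }; };  # attrset keyed by machine name
#     osds = {
#       "<machine>" = [
#         { id = 0; uuid = "..."; path = "/dev/disk/by-id/..."; }
#       ];
#     };
#   };
#   machinesByName = { "<machine>" = { ipAddr = "..."; }; };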

let
  machineName = config.networking.hostName;
  isMon = hasAttr machineName cephCluster.mons;
  isOsd = hasAttr machineName cephCluster.osds;
  hasCeph = isMon || isOsd;

  # This NixOS Ceph option fragment is present on every machine that runs a
  # mon, and basically tells the NixOS machinery to run mons/mgrs if needed on
  # this machine.
  cephMonConfig = if isMon then {
    mon = {
      enable = true;
      daemons = [ machineName ];
    };
    mgr = {
      enable = true;
      daemons = [ machineName ];
    };
  } else {};

  # Same as for cephMonConfig, but this time for OSDs.
  cephOsdConfig = if isOsd then {
    osd = {
      enable = true;
      daemons = map (el: "${toString el.id}") cephCluster.osds.${machineName};
    };
    rgw = {
      enable = true;
      daemons = [ "rook-k0.rgw.${machineName}" ];
    };
  } else {};

  # The full option fragment for services.ceph. It contains ceph.conf fragments
  # (in .global.*) and merges ceph{Mon,Osd}Config.
  cephConfig = {
    enable = true;
    global = {
      fsid = cephCluster.fsid;
      clusterName = cephCluster.name;

      # Every Ceph node always attempts to connect to all mons.
      monHost = concatStringsSep "," (mapAttrsToList (k: _: machinesByName.${k}.ipAddr) cephCluster.mons);
      monInitialMembers = concatStringsSep "," (builtins.attrNames cephCluster.mons);
    };
  } // cephMonConfig // cephOsdConfig;
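
  # With the definitions above, the [global] section of the rendered
  # /etc/ceph/ceph.conf ends up looking roughly like this sketch (values are
  # placeholders; the exact key spelling is the upstream module's concern):
  #
  #   [global]
  #   fsid = <cephCluster.fsid>
  #   mon host = <mon1 ip>,<mon2 ip>,...
  #   mon initial members = <mon1>,<mon2>,...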

  # Merge ceph-volume lvm activate into ceph-osd-ID services.
  #
  # This is because the upstream module seems to have been written with
  # filestore in mind, not bluestore. Filestore is relatively simple: an xfs
  # filesystem is mounted into /var/lib/ceph/osd/$cluster-$id, which in turn
  # contains everything for that OSD to work.
  #
  # Bluestore is a bit different. Instead of a normal filesystem being mounted,
  # Ceph manages a block device fully using LVM (and in our case, dmcrypt).
  # Every bluestore volume needs to be 'activated' before it can be used by an
  # OSD. Activation takes care of doing LVM and dmcrypt mounts, and prepares
  # the /var/lib/ceph/osd/$cluster-$id directory as if a filestore was present
  # there. However, instead of this being a disk mount, it's a tmpfs into
  # which a bunch of files are dropped, loaded from the LVM raw device.
  #
  # To make the upstream NixOS module's OSD support work with bluestore, we do
  # the following:
  #  1. Change ConditionPathExists from the OSD mount into a /dev/disk/by-id
  #     path. This gates the service on that device being present.
  #  2. Inject an ExecStartPre which runs ceph-volume lvm activate, if needed.
  #  3. Add lvm/cryptsetup to the PATH of the service (as used by ceph-volume,
  #     which seems to look for them on PATH instead of being properly
  #     nixified).
  #
  # We also inject smartmontools into PATH for smartctl, which allows the OSD
  # to monitor device health.
  osdActivateServices = listToAttrs (map (el: let
    osdId = toString el.id;
    osdUuid = el.uuid;
    diskPath = el.path;
  in {
    name = "ceph-osd-${osdId}";
    value = {
      path = with pkgs; [
        lvm2
        cryptsetup
        smartmontools
      ];
      serviceConfig = {
        ExecStartPre = lib.mkForce [
          ("+" + (toString (pkgs.writeScript "ceph-osd-${osdId}-activate.sh" ''
            #!/bin/sh
            set -e
            dir="/var/lib/ceph/osd/${cephCluster.name}-${osdId}/"
            disk="${el.path}"
            uuid="${osdUuid}"
            if [ -d "$dir" ] && [ -f "$dir"/keyring ]; then
              echo "Volume $dir already activated, skipping..."
            else
              echo "Activating $dir with $disk, uuid $uuid..."
              ${pkgs.ceph}/bin/ceph-volume lvm activate --bluestore --no-systemd ${osdId} $uuid
            fi

          '')))

          "${pkgs.ceph.lib}/libexec/ceph/ceph-osd-prestart.sh --id ${osdId} --cluster ${cephCluster.name}"
        ];
      };
      unitConfig = {
        ConditionPathExists = lib.mkForce el.path;
      };
    };
  }) (if isOsd then cephCluster.osds.${machineName} else []));
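
  # For a hypothetical OSD entry { id = 0; uuid = "..."; path = "/dev/disk/by-id/..."; },
  # the net effect on the generated ceph-osd-0.service is roughly the
  # following (store paths and names are illustrative, not actual output):
  #
  #   [Unit]
  #   ConditionPathExists=/dev/disk/by-id/...
  #
  #   [Service]
  #   ExecStartPre=+/nix/store/<hash>-ceph-osd-0-activate.sh
  #   ExecStartPre=/nix/store/<hash>-ceph-lib/libexec/ceph/ceph-osd-prestart.sh --id 0 --cluster <name>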

in rec {
  services.ceph = if hasCeph then cephConfig else {};

  environment.systemPackages = with pkgs; [
    ceph cryptsetup smartmontools
  ];

  systemd.services = osdActivateServices;

  # Hack - the upstream ceph module should generate ${clusterName}.conf instead
  # of ceph.conf, so let's just symlink it.
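  # (For example, tools invoked with an explicit cluster name, along the lines
  # of `ceph --cluster <name> -s`, expect to find /etc/ceph/<name>.conf; the
  # symlink below keeps that working. The name here is a placeholder.)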
  environment.etc."ceph/${cephCluster.name}.conf".source = "/etc/ceph/ceph.conf";
149}