# This runs Ceph on hscloud cluster(s).
#
# This lightly wraps the upstream NixOS ceph module, which is already fairly light.
#
# Most importantly, it does _not_ attempt to do any cluster
# bootstrapping/maintenance. This means that any configuration action that
# does the following:
#  0. Bringing up a cluster
#  1. Adding/removing Mons
#  2. Changing a Mon IP address
#  3. Adding/removing OSDs
# ... must be done in tandem with manual operations on the affected nodes (see
# the sketch below). For example, bootstrapping a cluster involves keyring and
# monmap management, changing anything about mons involves monmap management,
# and adding new OSDs requires provisioning them with ceph-volume.
#
# This is in stark contrast to a fully-managed solution like Rook. Since we
# don't have hundreds of clusters, none of the above is automated, especially
# as that kind of automation is quite tricky to do reliably.

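# As a sketch only (not a tested runbook; hostnames, addresses, the fsid and
# device paths below are hypothetical), the manual side of bootstrapping
# roughly follows the upstream Ceph documentation:
#
#   ceph-authtool --create-keyring /tmp/mon.keyring --gen-key -n mon. --cap mon 'allow *'
#   monmaptool --create --add $host $ip --fsid $fsid /tmp/monmap
#   ceph-mon --cluster $name --mkfs -i $host --monmap /tmp/monmap --keyring /tmp/mon.keyring
#
# ... and a new OSD is provisioned on its node with something like:
#
#   ceph-volume lvm prepare --bluestore --dmcrypt --data /dev/disk/by-id/ata-EXAMPLE
#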
{ config, lib, pkgs, machines, ... }:

with lib;

let
  cfg = config.hscloud.ceph;

  allNodes = let
    list = mapAttrsToList (_: v: v) machines;
    filtered = filter (m: (m.config ? hscloud.ceph) && (m.config.hscloud.ceph.enable)) list;
    sorted = sort (a: b: a.config.hscloud.base.fqdn < b.config.hscloud.base.fqdn) filtered;
  in sorted;

  monNodes = filter (m: m.config.hscloud.ceph.control.enable) allNodes;

  machineName = config.networking.hostName;
  # This NixOS Ceph option fragment is present on every machine that runs a
  # mon, and basically tells the NixOS machinery to run mons/mgrs if needed on
  # this machine.
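  # (With the upstream module, these daemon entries should result in systemd
  # units named ceph-mon-<hostname>.service and ceph-mgr-<hostname>.service.)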
  cephMonConfig = if cfg.control.enable then {
    mon = {
      enable = true;
      daemons = [ machineName ];
    };
    mgr = {
      enable = true;
      daemons = [ machineName ];
    };
  } else {};

  # Same as for cephMonConfig, but this time for OSDs (plus an RGW instance
  # that runs alongside them).
  cephOsdConfig = if (length cfg.osd.devices) > 0 then {
    osd = {
      enable = true;
      daemons = map (el: toString el.id) cfg.osd.devices;
    };
    rgw = {
      enable = true;
      daemons = [ "rook-k0.rgw.${machineName}" ];
    };
  } else {};

  # Merge ceph-volume lvm activate into ceph-osd-ID services.
  #
  # This is because the upstream module seems to have been written with
  # filestore in mind, not bluestore. Filestore is relatively simple: an xfs
  # filesystem is mounted into /var/lib/ceph/osd/$cluster-$id, which in turn
  # contains everything for that OSD to work.
  #
  # Bluestore is a bit different. Instead of a normal filesystem being mounted,
  # Ceph manages a block device fully using LVM (and, in our case, dmcrypt).
  # Every bluestore volume needs to be 'activated' before it can be used by an
  # OSD. Activation takes care of the LVM and dmcrypt mounts, and prepares the
  # /var/lib/ceph/osd/$cluster-$id directory as if a filestore was present
  # there. However, instead of being a disk mount, it's a tmpfs into which a
  # bunch of files are dropped, loaded from the LVM raw device.
  #
  # To make the upstream NixOS module's OSDs work with bluestore, we do the
  # following:
  #  1. Change ConditionPathExists from the OSD mount into a /dev/disk/by-id
  #     path. This gates the service on that device being present.
  #  2. Inject an ExecStartPre which runs ceph-volume lvm activate, if needed.
  #  3. Add lvm/cryptsetup to the PATH of the service (as used by ceph-volume,
  #     which seems to look for them on PATH instead of being properly
  #     nixified).
  #
  # We also inject smartmontools into PATH for smartctl, which allows the OSD
  # to monitor device health.
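  #
  # As a hint only (output format may differ between Ceph releases): for a
  # volume already prepared with ceph-volume, the id and uuid expected in
  # cfg.osd.devices can be read back on the node with `ceph-volume lvm list`,
  # which reports them as 'osd id' and 'osd fsid'.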
  osdActivateServices = listToAttrs (map (el: let
    osdId = toString el.id;
    osdUuid = el.uuid;
    diskPath = el.path;
  in {
    name = "ceph-osd-${osdId}";
    value = {
      path = with pkgs; [
        lvm2
        cryptsetup
        smartmontools
      ];
      serviceConfig = {
        ExecStartPre = lib.mkForce [
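          # The "+" prefix makes systemd run this ExecStartPre with full
          # privileges (as root, ignoring the unit's User= and sandboxing
          # settings), which ceph-volume needs for LVM/dmcrypt setup.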
          ("+" + (toString (pkgs.writeScript "ceph-osd-${osdId}-activate.sh" ''
            #!/bin/sh
            set -e
            dir="/var/lib/ceph/osd/${cfg.name}-${osdId}/"
            disk="${diskPath}"
            uuid="${osdUuid}"
            if [ -d "$dir" ] && [ -f "$dir"/keyring ]; then
              echo "Volume $dir already activated, skipping..."
            else
              echo "Activating $dir with $disk, uuid $uuid..."
              ${pkgs.ceph}/bin/ceph-volume lvm activate --bluestore --no-systemd ${osdId} $uuid
            fi
          '')))

          "${pkgs.ceph.lib}/libexec/ceph/ceph-osd-prestart.sh --id ${osdId} --cluster ${cfg.name}"
        ];
      };
      unitConfig = {
        ConditionPathExists = lib.mkForce diskPath;
      };
    };
  }) cfg.osd.devices);

in rec {
  options = {
    hscloud.ceph = {
      enable = mkOption {
        type = types.bool;
        description = "Enable Ceph storage cluster (native NixOS, not Rook).";
        default = ((length cfg.osd.devices) > 0) || cfg.control.enable;
      };
      name = mkOption {
        type = types.str;
        description = "Short identifier of the cluster.";
      };
      fsid = mkOption {
        type = types.str;
        description = "UUID of the cluster, as generated by the first mon.";
      };
      control = {
        enable = mkEnableOption "mon and mgr on this host";
      };
      osd = {
        devices = mkOption {
          type = types.listOf (types.submodule {
            options = {
              id = mkOption {
                description = "Numeric ID of the OSD.";
                type = types.int;
              };
              path = mkOption {
                description = "Path to the underlying block device for OSD storage.";
                type = types.str;
              };
              uuid = mkOption {
                description = "UUID of the generated OSD storage.";
                type = types.str;
              };
            };
          });
          default = [];
        };
      };
    };
  };
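
  # For illustration only (all values hypothetical), a machine would consume
  # this module with something like:
  #
  #   hscloud.ceph = {
  #     name = "k0";
  #     fsid = "00000000-0000-0000-0000-000000000000";
  #     control.enable = true;
  #     osd.devices = [
  #       { id = 0; path = "/dev/disk/by-id/ata-EXAMPLE"; uuid = "00000000-0000-0000-0000-000000000000"; }
  #     ];
  #   };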
  config = mkIf cfg.enable {
    services.ceph = {
      enable = cfg.control.enable || (length cfg.osd.devices) > 0;
      global = {
        fsid = cfg.fsid;
        clusterName = cfg.name;

        # Every Ceph node always attempts to connect to all mons.
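        # (With hypothetical addresses, this should render in the generated
        # config as something like:
        #   mon host = 10.0.0.1,10.0.0.2,10.0.0.3
        #   mon initial members = node1,node2,node3)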
        monHost = concatStringsSep "," (map (n: n.config.hscloud.base.ipAddr) monNodes);
        monInitialMembers = concatStringsSep "," (map (n: n.config.networking.hostName) monNodes);
      };
    } // cephMonConfig // cephOsdConfig;

    environment.systemPackages = with pkgs; [
      ceph cryptsetup smartmontools
    ];

    systemd.services = osdActivateServices;

    # Hack: the upstream ceph module should generate ${clusterName}.conf
    # instead of ceph.conf; until it does, just symlink it.
    environment.etc."ceph/${cfg.name}.conf".source = "/etc/ceph/ceph.conf";
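    # (Ceph tools invoked with `--cluster <name>` look for
    # /etc/ceph/<name>.conf, which the symlink above satisfies.)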
  };
}