# This runs Ceph on hscloud cluster(s).
#
# This lightly wraps the upstream NixOS ceph module, which is already fairly light.
#
# Most importantly, it does _not_ attempt to do any cluster
# bootstrapping/maintenance. This means that any configuration action that
# does any of the following:
# 1. Bringing up a cluster
# 2. Adding/removing Mons
# 3. Changing a Mon IP address
# 4. Adding/removing OSDs
# ... must be done in tandem with manual operations on the affected nodes. For
# example, bootstrapping a cluster will involve keyring and monmap management,
# changing anything about Mons will involve monmap management, and adding new
# OSDs will require provisioning them with ceph-volume first (see the example
# below).
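#
# For illustration, provisioning a new OSD before listing it in osd.devices
# might look roughly like the following. The device path is a placeholder and
# the exact flags depend on the deployment; treat this as a sketch, not a
# prescribed procedure:
#
#   ceph-volume lvm prepare --bluestore --dmcrypt --data /dev/disk/by-id/...
#   ceph-volume lvm list   # note the resulting OSD id and osd fsid (uuid)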
#
# This is in stark contrast to a fully-managed solution like rook. Since we
# don't have hundreds of clusters, none of the above is automated, especially
# as that kind of automation is quite tricky to do reliably.
{ config, lib, pkgs, machines, ... }:
with lib;
let
cfg = config.hscloud.ceph;
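  # All machines in hscloud that have this Ceph module enabled, sorted by FQDN
  # for stable ordering.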
allNodes = let
    list = attrValues machines;
filtered = filter (m: (m.config ? hscloud.ceph) && (m.config.hscloud.ceph.enable)) list;
sorted = sort (a: b: a.config.hscloud.base.fqdn < b.config.hscloud.base.fqdn) filtered;
in sorted;
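  # The subset of allNodes that also runs control (mon/mgr) daemons.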
monNodes = filter (m: m.config.hscloud.ceph.control.enable) allNodes;
machineName = config.networking.hostName;
  # This NixOS services.ceph option fragment is present on every machine that
  # runs a mon; it tells the upstream NixOS module to run mon and mgr daemons
  # on this machine.
cephMonConfig = if cfg.control.enable then {
mon = {
enable = true;
daemons = [ machineName ];
};
mgr = {
enable = true;
daemons = [ machineName ];
};
} else {};
  # Same as cephMonConfig, but for the OSD (and accompanying RGW) daemons.
cephOsdConfig = if (length cfg.osd.devices) > 0 then {
osd = {
enable = true;
      daemons = map (el: toString el.id) cfg.osd.devices;
};
rgw = {
enable = true;
daemons = [ "rook-k0.rgw.${machineName}" ];
};
} else {};
# Merge ceph-volume lvm activate into ceph-osd-ID services.
#
# This is because the upstream module seems to have been written with
  # filestore in mind, not bluestore. Filestore is relatively simple: an XFS
  # filesystem is mounted at /var/lib/ceph/osd/$cluster-$id, which in turn
  # contains everything that OSD needs to work.
#
# Bluestore is a bit different. Instead of a normal filesystem being mounted,
# Ceph manages a block device fully using LVM (and in our case, dmcrypt).
# Every bluestore volume needs to be 'activated' before it can be used by an
  # OSD. Activation takes care of the LVM and dmcrypt setup and prepares the
  # /var/lib/ceph/osd/$cluster-$id directory as if a filestore were present
  # there. However, instead of a disk mount, that directory is a tmpfs into
  # which a handful of small files are dropped, loaded from the raw LVM device.
#
# To make the upstream NixOS module OSD work with bluestore, we do the following:
# 1. Change ConditionPathExists from the OSD mount into a /dev/disk/by-id
# path. This gates the service on that device being present.
# 2. Inject an ExecStartPre which runs ceph-volume lvm activate, if needed.
# 3. Add lvm/cryptsetup to the PATH of the service (as used by ceph-volume,
# which seems to look for them on PATH instead of being properly
# nixified).
#
# We also inject smartmontools into PATH for smartctl, which allows the OSD
# to monitor device health.
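  #
  # For reference, an activated bluestore OSD directory is a tmpfs that roughly
  # contains a 'block' symlink pointing at the LVM logical volume plus a few
  # small metadata files such as 'keyring', 'type' and 'whoami'; the activation
  # script below uses the presence of 'keyring' as its "already activated"
  # marker.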
osdActivateServices = listToAttrs (map (el: let
osdId = toString el.id;
osdUuid = el.uuid;
in {
name = "ceph-osd-${osdId}";
value = {
path = with pkgs; [
lvm2
cryptsetup
smartmontools
];
serviceConfig = {
ExecStartPre = lib.mkForce [
("+" + (toString (pkgs.writeScript "ceph-osd-${osdId}-activate.sh" ''
#!/bin/sh
set -e
dir="/var/lib/ceph/osd/${cfg.name}-${osdId}/"
disk="${el.path}"
uuid="${osdUuid}"
if [ -d "$dir" ] && [ -f "$dir"/keyring ]; then
echo "Volume $dir already activated, skipping..."
else
echo "Activating $dir with $disk, uuid $uuid..."
${pkgs.ceph}/bin/ceph-volume lvm activate --bluestore --no-systemd ${osdId} $uuid
fi
'')))
"${pkgs.ceph.lib}/libexec/ceph/ceph-osd-prestart.sh --id ${osdId} --cluster ${cfg.name}"
];
};
unitConfig = {
ConditionPathExists = lib.mkForce el.path;
};
};
}) cfg.osd.devices);
in rec {
options = {
hscloud.ceph = {
enable = mkOption {
type = types.bool;
description = "Enable Ceph storage cluster (native NixOS), not rook.";
default = ((length cfg.osd.devices) > 0) || cfg.control.enable;
};
name = mkOption {
type = types.str;
description = "Short identifier of cluster.";
};
fsid = mkOption {
type = types.str;
description = "UUID of cluster, as generated by first mon.";
};
control = {
enable = mkEnableOption "mon and mgr on this host";
};
osd = {
devices = mkOption {
type = types.listOf (types.submodule {
options = {
id = mkOption {
description = "Numeric ID of OSD.";
type = types.int;
};
path = mkOption {
description = "Path to underlying block device for OSD storage.";
type = types.str;
};
uuid = mkOption {
description = "UUID of generated OSD storage.";
type = types.str;
};
};
});
default = [];
};
};
};
};
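  # Example consumer configuration; every value below is a placeholder, not a
  # value taken from any real machine:
  #
  #   hscloud.ceph = {
  #     name = "example";
  #     fsid = "00000000-0000-0000-0000-000000000000";
  #     control.enable = true;
  #     osd.devices = [
  #       { id = 0; path = "/dev/disk/by-id/..."; uuid = "..."; }
  #     ];
  #   };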
config = mkIf cfg.enable {
services.ceph = {
enable = cfg.control.enable || (length cfg.osd.devices) > 0;
global = {
fsid = cfg.fsid;
clusterName = cfg.name;
# Every Ceph node always attempts to connect to all mons.
monHost = concatStringsSep "," (map (n: n.config.hscloud.base.ipAddr) monNodes);
monInitialMembers = concatStringsSep "," (map (n: n.config.networking.hostName) monNodes);
};
} // cephMonConfig // cephOsdConfig;
environment.systemPackages = with pkgs; [
ceph cryptsetup smartmontools
];
systemd.services = osdActivateServices;
    # Hack: the upstream ceph module should generate ${clusterName}.conf instead
    # of ceph.conf; until it does, just symlink the expected name to ceph.conf.
environment.etc."ceph/${cfg.name}.conf".source = "/etc/ceph/ceph.conf";
};
}