| # This runs Ceph on hscloud cluster(s). |
| # |
| # This lightly wraps the upstream NixOS ceph module, which is already fairly light. |
| # |
| # Most importantly, it does _not_ attempt to do any cluster |
# bootstrapping/maintenance. This means that any configuration action that
# does any of the following:
| # 0. Bringing up a cluster |
| # 1. Adding/removing Mons |
| # 2. Changing a Mon IP address |
| # 3. Adding/removing OSDs |
| # ... must be done in tandem with manual operations on the affected nodes. For |
# example, bootstrapping a cluster will involve keyring and monmap management,
# changing anything about the mons will involve monmap management, adding new
# OSDs will require provisioning them with ceph-volume, etc.
| # |
# This is in stark contrast to a fully-managed solution like Rook. Since we
# don't run hundreds of clusters, none of the above is automated, especially
# as that kind of automation is quite tricky to implement reliably.
| |
| { config, lib, pkgs, ... }: |
| |
| with builtins; |
| with lib; |
| |
| with (( import ../defs-cluster-k0.nix ) config.networking.hostName); |
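
# defs-cluster-k0.nix is not shown here; judging purely by how it is used
# below, it evaluates to something shaped roughly like this (any field not
# referenced in this module is illustrative):
#
#   {
#     cephCluster = {
#       fsid = "<uuid>";
#       name = "<cluster name>";
#       # mon machines, keyed by machine name.
#       mons = { "<hostname>" = { ... }; };
#       # per-machine lists of OSDs, with their block devices and LVM UUIDs.
#       osds = { "<hostname>" = [ { id = 0; path = "/dev/disk/by-id/..."; uuid = "<uuid>"; } ]; };
#     };
#     machinesByName = { "<hostname>" = { ipAddr = "<ip>"; }; };
#   }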
| |
| let |
| machineName = config.networking.hostName; |
| isMon = hasAttr machineName cephCluster.mons; |
| isOsd = hasAttr machineName cephCluster.osds; |
| hasCeph = isMon || isOsd; |
| |
  # This NixOS Ceph option fragment is present on every machine that runs a
  # mon, and tells the NixOS machinery to run a mon and an mgr on this
  # machine.
| cephMonConfig = if isMon then { |
| mon = { |
| enable = true; |
| daemons = [ machineName ]; |
| }; |
| mgr = { |
| enable = true; |
| daemons = [ machineName ]; |
| }; |
| } else {}; |
| |
  # Same as cephMonConfig, but for OSD machines: it enables this machine's
  # OSD daemons and an RGW instance alongside them.
| cephOsdConfig = if isOsd then { |
| osd = { |
| enable = true; |
      daemons = map (el: toString el.id) cephCluster.osds.${machineName};
| }; |
| rgw = { |
| enable = true; |
| daemons = [ "rook-k0.rgw.${machineName}" ]; |
| }; |
| } else {}; |
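
  # Naming note (inferred from the osdActivateServices overlay below): the
  # upstream module materializes each .daemons entry as a systemd unit, e.g.
  # ceph-mon-<name>.service for mons and ceph-osd-<id>.service for OSDs; the
  # activation glue below depends on the ceph-osd-<id> names matching exactly.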
| |
| # The full option fragment for services.ceph. It contains ceph.conf fragments |
| # (in .global.*) and merges ceph{Mon,Osd}Config. |
| cephConfig = { |
| enable = true; |
| global = { |
| fsid = cephCluster.fsid; |
| clusterName = cephCluster.name; |
| |
| # Every Ceph node always attempts to connect to all mons. |
| monHost = concatStringsSep "," (mapAttrsToList (k: _: machinesByName.${k}.ipAddr) cephCluster.mons); |
| monInitialMembers = concatStringsSep "," (builtins.attrNames cephCluster.mons); |
| }; |
| } // cephMonConfig // cephOsdConfig; |
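
  # For orientation: the upstream module renders .global.* into
  # /etc/ceph/ceph.conf, expanding camelCase option names into the
  # space-separated keys Ceph expects, so the fragment above should end up
  # roughly as (values illustrative):
  #
  #   [global]
  #   fsid = <fsid>
  #   mon host = <ip>,<ip>
  #   mon initial members = <hostname>,<hostname>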
| |
| # Merge ceph-volume lvm activate into ceph-osd-ID services. |
| # |
| # This is because the upstream module seems to have been written with |
  # filestore in mind, not bluestore. Filestore is relatively simple: an XFS
  # filesystem is mounted at /var/lib/ceph/osd/$cluster-$id, which in turn
| # contains everything for that OSD to work. |
| # |
| # Bluestore is a bit different. Instead of a normal filesystem being mounted, |
| # Ceph manages a block device fully using LVM (and in our case, dmcrypt). |
| # Every bluestore volume needs to be 'activated' before it can be used by an |
| # OSD. Activation takes care of doing LVM and dmcrypt mounts, and prepares |
| # the /var/lib/ceph/osd/$cluster-$id directory as if a filestore was present |
  # there. However, instead of being a disk mount, it's a tmpfs into which a
  # bunch of files are dropped, loaded from the raw LVM device.
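  #
  # Once activated, that tmpfs typically contains a `block` symlink to the
  # LVM logical volume plus a few small metadata files (keyring, whoami,
  # type, ...); the activation script below uses the presence of the keyring
  # file to detect whether activation has already happened.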
| # |
| # To make the upstream NixOS module OSD work with bluestore, we do the following: |
  # 1. Change ConditionPathExists from the OSD mount to a /dev/disk/by-id
  #    path. This gates the service on that device being present.
| # 2. Inject an ExecStartPre which runs ceph-volume lvm activate, if needed. |
| # 3. Add lvm/cryptsetup to the PATH of the service (as used by ceph-volume, |
| # which seems to look for them on PATH instead of being properly |
| # nixified). |
| # |
| # We also inject smartmontools into PATH for smartctl, which allows the OSD |
| # to monitor device health. |
| osdActivateServices = listToAttrs (map (el: let |
| osdId = toString el.id; |
| osdUuid = el.uuid; |
| diskPath = el.path; |
| in { |
| name = "ceph-osd-${osdId}"; |
| value = { |
| path = with pkgs; [ |
| lvm2 |
| cryptsetup |
| smartmontools |
| ]; |
| serviceConfig = { |
| ExecStartPre = lib.mkForce [ |
| ("+" + (toString (pkgs.writeScript "ceph-osd-${osdId}-activate.sh" '' |
| #!/bin/sh |
| set -e |
| dir="/var/lib/ceph/osd/${cephCluster.name}-${osdId}/" |
| disk="${el.path}" |
| uuid="${osdUuid}" |
| if [ -d "$dir" ] && [ -f "$dir"/keyring ]; then |
| echo "Volume $dir already activated, skipping..." |
| else |
| echo "Activating $dir with $disk, uuid $uuid..." |
| ${pkgs.ceph}/bin/ceph-volume lvm activate --bluestore --no-systemd ${osdId} $uuid |
| fi |
| |
| ''))) |
| |
| "${pkgs.ceph.lib}/libexec/ceph/ceph-osd-prestart.sh --id ${osdId} --cluster ${cephCluster.name}" |
| ]; |
| }; |
| unitConfig = { |
        ConditionPathExists = lib.mkForce diskPath;
| }; |
| }; |
| }) (if isOsd then cephCluster.osds.${machineName} else [])); |
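
  # The net effect for, e.g., OSD 0 amounts to a drop-in on the
  # upstream-generated ceph-osd-0.service along the lines of (paths
  # illustrative):
  #
  #   [Unit]
  #   ConditionPathExists=/dev/disk/by-id/...
  #   [Service]
  #   ExecStartPre=+/nix/store/...-ceph-osd-0-activate.sh
  #   ExecStartPre=/nix/store/...-ceph-osd-prestart.sh --id 0 --cluster <name>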
| |
in {
| services.ceph = if hasCeph then cephConfig else {}; |
| |
| environment.systemPackages = with pkgs; [ |
| ceph cryptsetup smartmontools |
| ]; |
| |
| systemd.services = osdActivateServices; |
| |
  # Hack - the upstream ceph module always generates ceph.conf, but daemons
  # and tools invoked with --cluster expect ${clusterName}.conf, so just
  # symlink it.
| environment.etc."ceph/${cephCluster.name}.conf".source = "/etc/ceph/ceph.conf"; |
| } |