# This runs Ceph on hscloud cluster(s).
#
# This lightly wraps the upstream NixOS ceph module, which is already fairly light.
#
# Most importantly, it does _not_ attempt to do any cluster
# bootstrapping/maintenance. This means that any configuration action that
# does any of the following:
# 1. Bringing up a cluster
# 2. Adding/removing Mons
# 3. Changing a Mon IP address
# 4. Adding/removing OSDs
# ... must be done in tandem with manual operations on the affected nodes. For
# example, bootstrapping a cluster will involve keyring and monmap management,
# changing anything about Mons will involve monmap management, and adding new
# OSDs will require provisioning them with ceph-volume first (see the example
# below).
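#
# For illustration, provisioning a new OSD before listing it in osd.devices
# might look roughly like the following. The device path is a placeholder and
# the exact flags depend on the deployment; treat this as a sketch, not a
# prescribed procedure:
#
#   ceph-volume lvm prepare --bluestore --dmcrypt --data /dev/disk/by-id/...
#   ceph-volume lvm list   # note the resulting OSD id and osd fsid (uuid)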
#
# This is in stark contrast to a fully-managed solution like rook. Since we
# don't have hundreds of clusters, none of the above is automated, especially
# as that kind of automation is quite tricky to do reliably.
{ config, lib, pkgs, machines, ... }:
with lib;
let
cfg = config.hscloud.ceph;
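  # All machines in hscloud that have this Ceph module enabled, sorted by FQDN
  # for stable ordering.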
allNodes = let
    list = attrValues machines;
filtered = filter (m: (m.config ? hscloud.ceph) && (m.config.hscloud.ceph.enable)) list;
sorted = sort (a: b: a.config.hscloud.base.fqdn < b.config.hscloud.base.fqdn) filtered;
in sorted;
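  # The subset of allNodes that also runs control (mon/mgr) daemons.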
monNodes = filter (m: m.config.hscloud.ceph.control.enable) allNodes;
machineName = config.networking.hostName;
  # This NixOS services.ceph option fragment is present on every machine that
  # runs a mon; it tells the upstream NixOS module to run mon and mgr daemons
  # on this machine.
cephMonConfig = if cfg.control.enable then {
mon = {
enable = true;
daemons = [ machineName ];
};
mgr = {
enable = true;
daemons = [ machineName ];
};
} else {};
  # Same as cephMonConfig, but for the OSD (and accompanying RGW) daemons.
cephOsdConfig = if (length cfg.osd.devices) > 0 then {
osd = {
enable = true;
      daemons = map (el: toString el.id) cfg.osd.devices;
};
rgw = {
enable = true;
daemons = [ "rook-k0.rgw.${machineName}" ];
};
} else {};
# Merge ceph-volume lvm activate into ceph-osd-ID services.
#
# This is because the upstream module seems to have been written with
  # filestore in mind, not bluestore. Filestore is relatively simple: an XFS
  # filesystem is mounted at /var/lib/ceph/osd/$cluster-$id, which in turn
  # contains everything that OSD needs to work.
#
# Bluestore is a bit different. Instead of a normal filesystem being mounted,
# Ceph manages a block device fully using LVM (and in our case, dmcrypt).
# Every bluestore volume needs to be 'activated' before it can be used by an
  # OSD. Activation takes care of the LVM and dmcrypt setup and prepares the
  # /var/lib/ceph/osd/$cluster-$id directory as if a filestore were present
  # there. However, instead of a disk mount, that directory is a tmpfs into
  # which a handful of small files are dropped, loaded from the raw LVM device.
#
# To make the upstream NixOS module OSD work with bluestore, we do the following:
# 1. Change ConditionPathExists from the OSD mount into a /dev/disk/by-id
# path. This gates the service on that device being present.
# 2. Inject an ExecStartPre which runs ceph-volume lvm activate, if needed.
# 3. Add lvm/cryptsetup to the PATH of the service (as used by ceph-volume,
# which seems to look for them on PATH instead of being properly
# nixified).
#
# We also inject smartmontools into PATH for smartctl, which allows the OSD
# to monitor device health.
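  #
  # For reference, an activated bluestore OSD directory is a tmpfs that roughly
  # contains a 'block' symlink pointing at the LVM logical volume plus a few
  # small metadata files such as 'keyring', 'type' and 'whoami'; the activation
  # script below uses the presence of 'keyring' as its "already activated"
  # marker.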
osdActivateServices = listToAttrs (map (el: let
osdId = toString el.id;
osdUuid = el.uuid;
in {
name = "ceph-osd-${osdId}";
value = {
path = with pkgs; [
lvm2
cryptsetup
smartmontools
];
serviceConfig = {
ExecStartPre = lib.mkForce [
("+" + (toString (pkgs.writeScript "ceph-osd-${osdId}-activate.sh" ''
#!/bin/sh
set -e
dir="/var/lib/ceph/osd/${cfg.name}-${osdId}/"
disk="${el.path}"
uuid="${osdUuid}"
if [ -d "$dir" ] && [ -f "$dir"/keyring ]; then
echo "Volume $dir already activated, skipping..."
else
echo "Activating $dir with $disk, uuid $uuid..."
${pkgs.ceph}/bin/ceph-volume lvm activate --bluestore --no-systemd ${osdId} $uuid
fi
'')))
"${pkgs.ceph.lib}/libexec/ceph/ceph-osd-prestart.sh --id ${osdId} --cluster ${cfg.name}"
];
};
unitConfig = {
ConditionPathExists = lib.mkForce el.path;
};
};
}) cfg.osd.devices);
in rec {
options = {
hscloud.ceph = {
enable = mkOption {
type = types.bool;
description = "Enable Ceph storage cluster (native NixOS), not rook.";
default = ((length cfg.osd.devices) > 0) || cfg.control.enable;
};
name = mkOption {
type = types.str;
description = "Short identifier of cluster.";
};
fsid = mkOption {
type = types.str;
description = "UUID of cluster, as generated by first mon.";
};
control = {
enable = mkEnableOption "mon and mgr on this host";
};
osd = {
devices = mkOption {
type = types.listOf (types.submodule {
options = {
id = mkOption {
description = "Numeric ID of OSD.";
type = types.int;
};
path = mkOption {
description = "Path to underlying block device for OSD storage.";
type = types.str;
};
uuid = mkOption {
description = "UUID of generated OSD storage.";
type = types.str;
};
};
});
default = [];
};
};
};
};
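  # Example consumer configuration; every value below is a placeholder, not a
  # value taken from any real machine:
  #
  #   hscloud.ceph = {
  #     name = "example";
  #     fsid = "00000000-0000-0000-0000-000000000000";
  #     control.enable = true;
  #     osd.devices = [
  #       { id = 0; path = "/dev/disk/by-id/..."; uuid = "..."; }
  #     ];
  #   };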
config = mkIf cfg.enable {
services.ceph = {
enable = cfg.control.enable || (length cfg.osd.devices) > 0;
global = {
fsid = cfg.fsid;
clusterName = cfg.name;
# Every Ceph node always attempts to connect to all mons.
monHost = concatStringsSep "," (map (n: n.config.hscloud.base.ipAddr) monNodes);
monInitialMembers = concatStringsSep "," (map (n: n.config.networking.hostName) monNodes);
};
} // cephMonConfig // cephOsdConfig;
environment.systemPackages = with pkgs; [
ceph cryptsetup smartmontools
];
systemd.services = osdActivateServices;
    # Hack: the upstream ceph module should generate ${clusterName}.conf instead
    # of ceph.conf; until it does, just symlink the expected name to ceph.conf.
environment.etc."ceph/${cfg.name}.conf".source = "/etc/ceph/ceph.conf";
};
}