cluster: refactor nix machinery to fit //ops

This is a chonky refactor that gets rid of the previous cluster-centric
defs-* plain nix file setup.

Now, nodes are configured individually in plain NixOS modules, and are
provided a view of all other nodes in the 'machines' attribute. Cluster
logic is moved into modules which inspect this attribute set to find
other nodes within the same cluster.
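
For illustration (the module and 'hscloud.foo' option names are made
up), cluster logic can consume 'machines' roughly like this:

    { config, lib, machines, ... }: let
      # All machines that opted into this hypothetical service.
      peers = lib.filter
        (m: (m.config ? hscloud.foo) && m.config.hscloud.foo.enable)
        (lib.mapAttrsToList (_: v: v) machines);
    in {
      # ... derive this node's configuration from peers ...
    }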

Kubernetes options are not fully clusterified yet (i.e., they are still
hardcoded to only provide the 'k0' cluster), but that can be fixed later.
The Ceph machinery is a good example of how that can be done.
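
For example, a node carrying a mon and one OSD now declares something
along these lines (values are illustrative):

    hscloud.ceph = {
      name = "k0";
      fsid = "...";
      control.enable = true;
      osd.devices = [
        { id = 0; path = "/dev/disk/by-id/..."; uuid = "..."; }
      ];
    };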

The new NixOS configs are zero-diff against prod. While this is achieved
mostly by keeping the logic as-is, we had to keep a few newly discovered
'bugs' around by adding some temporary options which keep things as they
are. These will be removed in a future CL, which will then introduce a
diff (but, hopefully, no functional changes).

We also remove the nix eval from clustercfg, as it was no longer used
(basically ever since we refactored certs).

Change-Id: Id79772a96249b0e6344046f96f9c2cb481c4e1f4
Reviewed-on: https://gerrit.hackerspace.pl/c/hscloud/+/1322
Reviewed-by: informatic <informatic@hackerspace.pl>
diff --git a/cluster/machines/modules/ceph.nix b/cluster/machines/modules/ceph.nix
new file mode 100644
index 0000000..4f15bdd
--- /dev/null
+++ b/cluster/machines/modules/ceph.nix
@@ -0,0 +1,193 @@
+# This runs Ceph on hscloud cluster(s).
+#
+# This lightly wraps the upstream NixOS ceph module, which is already fairly light.
+#
+# Most importantly, it does _not_ attempt to do any cluster
+# bootstrapping/maintenance. This means that any configuration action that
+# does any of the following:
+#  0. Bringing up a cluster
+#  1. Adding/removing Mons
+#  2. Changing a Mon IP address
+#  3. Adding/removing OSDs
+# ... must be done in tandem with manual operations on the affected nodes. For
+# example, bootstrapping a cluster will involve keyring and monmap management,
+# changing anything with mons will involve monmap management, adding new OSDs
+# will require provisioning them with ceph-volume, etc.
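+# (For a new OSD, that provisioning step typically looks something like
+# `ceph-volume lvm prepare --bluestore --dmcrypt --data <device>`, run on the
+# target node.)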
+#
+# This is in stark contrast to a fully-managed solution like rook. Since we
+# don't have hundreds of clusters, none of the above is automated, especially
+# as that kind of automation is quite tricky to do reliably.
+
+{ config, lib, pkgs, machines, ... }:
+
+with lib;
+
+let
+  cfg = config.hscloud.ceph;
+
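+  # All Ceph-enabled machines in the deployment, sorted by FQDN so that the
+  # resulting configuration is stable across evaluations.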
+  allNodes = let
+    list = mapAttrsToList (_: v: v) machines;
+    filtered = filter (m: (m.config ? hscloud.ceph) && (m.config.hscloud.ceph.enable)) list;
+    sorted = sort (a: b: a.config.hscloud.base.fqdn < b.config.hscloud.base.fqdn) filtered;
+  in sorted;
+
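+  # The subset of Ceph machines that run a mon (and mgr); these end up in
+  # mon_host / mon_initial_members below.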
+  monNodes = filter (m: m.config.hscloud.ceph.control.enable) allNodes;
+
+  machineName = config.networking.hostName;
+
+  # This NixOS Ceph option fragment is present on every machine that runs a
+  # mon, and basically tells the NixOS machinery to run mons/mgrs if needed on
+  # this machine.
+  cephMonConfig = if cfg.control.enable then {
+    mon = {
+      enable = true;
+      daemons = [ machineName ];
+    };
+    mgr = {
+      enable = true;
+      daemons = [ machineName ];
+    };
+  } else {};
+
+  # Same as for cephMonConfig, but this time for OSDs.
+  cephOsdConfig = if (length cfg.osd.devices) > 0 then {
+    osd = {
+      enable = true;
+      daemons = map (el: toString el.id) cfg.osd.devices;
+    };
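+    # Note: the RGW daemon name hardcodes the 'rook-k0' prefix instead of
+    # deriving it from cfg.name.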
+    rgw = {
+      enable = true;
+      daemons = [ "rook-k0.rgw.${machineName}" ];
+    };
+  } else {};
+
+  # Merge ceph-volume lvm activate into ceph-osd-ID services.
+  #
+  # This is because the upstream module seems to have been written with
+  # filestore in mind, not bluestore. Filestore is relatively simple: an xfs
+# filesystem is mounted into /var/lib/ceph/osd/$cluster-$id, which in turn
+# contains everything for that OSD to work.
+  #
+  # Bluestore is a bit different. Instead of a normal filesystem being mounted,
+  # Ceph manages a block device fully using LVM (and in our case, dmcrypt).
+  # Every bluestore volume needs to be 'activated' before it can be used by an
+  # OSD. Activation takes care of doing LVM and dmcrypt mounts, and prepares
+  # the /var/lib/ceph/osd/$cluster-$id directory as if a filestore was present
+# there. However, instead of this being a disk mount, it's a tmpfs into
+# which a bunch of files are dropped, loaded from the raw LVM device.
+  #
+# To make the upstream NixOS OSD module work with bluestore, we do the following:
+  #  1. Change ConditionPathExists from the OSD mount into a /dev/disk/by-id
+  #     path. This gates the service on that device being present.
+  #  2. Inject an ExecStartPre which runs ceph-volume lvm activate, if needed.
+  #  3. Add lvm/cryptsetup to the PATH of the service (as used by ceph-volume,
+  #     which seems to look for them on PATH instead of being properly
+  #     nixified).
+  #
+  # We also inject smartmontools into PATH for smartctl, which allows the OSD
+  # to monitor device health.
+  osdActivateServices = listToAttrs (map (el: let
+      osdId = toString el.id;
+      osdUuid = el.uuid;
+      diskPath = el.path;
+    in {
+    name = "ceph-osd-${osdId}";
+    value = {
+      path = with pkgs; [
+        lvm2
+        cryptsetup
+        smartmontools
+      ];
+      serviceConfig = {
+        ExecStartPre = lib.mkForce [
+          ("+" + (toString (pkgs.writeScript "ceph-osd-${osdId}-activate.sh" ''
+            #!/bin/sh
+            set -e
+            dir="/var/lib/ceph/osd/${cfg.name}-${osdId}/"
+            disk="${diskPath}"
+            uuid="${osdUuid}"
+            if [ -d "$dir" ] && [ -f "$dir"/keyring ]; then
+              echo "Volume $dir already activated, skipping..."
+            else
+              echo "Activating $dir with $disk, uuid $uuid..."
+              ${pkgs.ceph}/bin/ceph-volume lvm activate --bluestore --no-systemd ${osdId} $uuid
+            fi
+
+          '')))
+
+          "${pkgs.ceph.lib}/libexec/ceph/ceph-osd-prestart.sh --id ${osdId} --cluster ${cfg.name}"
+        ];
+      };
+      unitConfig = {
+        ConditionPathExists = lib.mkForce diskPath;
+      };
+    };
+  }) cfg.osd.devices);
+
+in {
+  options = {
+    hscloud.ceph = {
+      enable = mkOption {
+        type = types.bool;
+        description = "Enable Ceph storage cluster (run natively via NixOS, not rook).";
+        default = ((length cfg.osd.devices) > 0) || cfg.control.enable;
+      };
+      name = mkOption {
+        type = types.str;
+        description = "Short identifier of cluster.";
+      };
+      fsid = mkOption {
+        type = types.str;
+        description = "UUID of cluster, as generated by first mon.";
+      };
+      control = {
+        enable = mkEnableOption "mon and mgr on this host";
+      };
+      osd = {
+        devices = mkOption {
+          type = types.listOf (types.submodule {
+            options = {
+              id = mkOption {
+                description = "Numeric ID of OSD.";
+                type = types.int;
+              };
+              path = mkOption {
+                description = "Path to underlying block device for OSD storage.";
+                type = types.str;
+              };
+              uuid = mkOption {
+                description = "UUID of generated OSD storage.";
+                type = types.str;
+              };
+            };
+          });
+          default = [];
+        };
+      };
+    };
+  };
+  config = mkIf cfg.enable {
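+    # // is a shallow attrset merge; the mon and OSD fragments contribute
+    # disjoint top-level attributes (mon/mgr vs. osd/rgw).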
+    services.ceph = {
+      enable = cfg.control.enable || (length cfg.osd.devices) > 0;
+      global = {
+        fsid = cfg.fsid;
+        clusterName = cfg.name;
+
+        # Every Ceph node always attempts to connect to all mons.
+        monHost = concatStringsSep "," (map (n: n.config.hscloud.base.ipAddr) monNodes);
+        monInitialMembers = concatStringsSep "," (map (n: n.config.networking.hostName) monNodes);
+      };
+    } // cephMonConfig // cephOsdConfig;
+
+    environment.systemPackages = with pkgs; [
+      ceph cryptsetup smartmontools
+    ];
+
+    systemd.services = osdActivateServices;
+
+    # Hack: the upstream ceph module should generate ${clusterName}.conf instead
+    # of ceph.conf; until then, we just symlink it.
+    environment.etc."ceph/${cfg.name}.conf".source = "/etc/ceph/ceph.conf";
+  };
+}