cluster: k0: move ceph-waw3 to proper realm/zonegroup

With this we can use Ceph's multi-site support to easily migrate to our
new k0 Ceph cluster.

This migration was done using radosgw-admin to rename the existing
realm/zonegroup to the new names (hscloud and eu), and then reworking
the jsonnet so that the Rook operator would effectively do nothing.
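
Concretely, the rename boils down to something along these lines (old
names are placeholders here, and exact flags may differ slightly
between Ceph releases):

    radosgw-admin realm rename --rgw-realm=<old-realm> \
        --realm-new-name=hscloud
    radosgw-admin zonegroup rename --rgw-zonegroup=<old-zonegroup> \
        --zonegroup-new-name=eu
    radosgw-admin period update --commit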

It sounds weird that creating a bunch of CRs like
CephObject{Realm,ZoneGroup,Zone} would be a no-op for the operator,
but that's how Rook works - a CephObjectStore generally creates
everything that the above CRs would create too, but implicitly. Adding
the extra CRs just allows specifying extra settings, like names.
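
One way to confirm that the operator really has nothing left to do is
to compare the new CRs against radosgw's own view of the world, e.g.:

    radosgw-admin realm list
    radosgw-admin zonegroup list
    radosgw-admin zone list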

(It wasn't fully a no-op, though: the rgw daemon is parametrized by
realm/zonegroup/zone names, so it had to be restarted.)
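
In practice that restart is just bouncing the rgw deployment managed
by Rook, roughly (assuming Rook's usual rook-ceph-rgw-<store>-a
deployment naming):

    kubectl -n ceph-waw3 rollout restart \
        deployment/rook-ceph-rgw-waw-hdd-redundant-3-object-a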

We also make the radosgw serve under object.ceph-eu.hswaw.net, which
lets us start using the zonegroup URL right away instead of the
zone-only URL.
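
For example, an S3 client can already be pointed at the zonegroup
endpoint (bucket name is a placeholder):

    aws --endpoint-url https://object.ceph-eu.hswaw.net \
        s3 ls s3://<bucket>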

Change-Id: I4dca55a705edb3bd28e54f50982c85720a17b877
diff --git a/cluster/kube/k0.libsonnet b/cluster/kube/k0.libsonnet
index bd3a9f5..6f24500 100644
--- a/cluster/kube/k0.libsonnet
+++ b/cluster/kube/k0.libsonnet
@@ -205,15 +205,30 @@
                         },
                     },
                 },
-                objectRedundant: rook.S3ObjectStore(k0.ceph.waw3, "waw-hdd-redundant-3-object") {
-                    spec: {
-                        metadataPool: {
-                            failureDomain: "host",
-                            replicated: { size: 2 },
+
+                object: {
+                    local poolSpec = {
+                        failureDomain: "host",
+                        replicated: { size: 2 },
+                    },
+
+                    realm: rook.S3ObjectRealm(k0.ceph.waw3, "hscloud"),
+                    zonegroup: rook.S3ObjectZoneGroup(self.realm, "eu"),
+                    // This is serving at object.ceph-waw3.hswaw.net, but
+                    // internally to Ceph it is known as
+                    // waw-hdd-redundant-3-object (name of radosgw zone).
+                    store: rook.S3ObjectStore(self.zonegroup, "waw-hdd-redundant-3-object") {
+                        cfg+: {
+                            // Override so that this radosgw serves on
+                            // object.ceph-{waw3,eu}.hswaw.net instead of
+                            // ceph-{waw-hdd-redundant-3-object,eu}.
+                            domainParts: [
+                                "waw3", "eu",
+                            ],
                         },
-                        dataPool: {
-                            failureDomain: "host",
-                            replicated: { size: 2 },
+                        spec: {
+                            metadataPool: poolSpec,
+                            dataPool: poolSpec,
                         },
                     },
                 },
@@ -370,6 +385,7 @@
 
                         { namespace: "ceph-waw3", dns: "ceph-waw3.hswaw.net" },
                         { namespace: "ceph-waw3", dns: "object.ceph-waw3.hswaw.net" },
+                        { namespace: "ceph-waw3", dns: "object.ceph-eu.hswaw.net" },
                         { namespace: "monitoring-global-k0", dns: "*.hswaw.net" },
                         { namespace: "registry", dns: "*.hswaw.net" },
 
diff --git a/cluster/kube/lib/rook.libsonnet b/cluster/kube/lib/rook.libsonnet
index 6646df5..4acf80d 100644
--- a/cluster/kube/lib/rook.libsonnet
+++ b/cluster/kube/lib/rook.libsonnet
@@ -1087,38 +1087,133 @@
         },
     },
 
-    S3ObjectStore(cluster, name):: {
-        local store = self,
-        spec:: error "spec must be specified",
-        objectStore: kube._Object("ceph.rook.io/v1", "CephObjectStore", name) {
+    // This is a rook CephObjectRealm which corresponds to a radosgw realm.
+    //
+    // A realm is a 'world' of radosgw user-facing metadata, like credentials,
+    // buckets, and underlying structures like zones and zonegroups. A realm
+    // contains zonegroups and zones, but a single Ceph cluster can actually
+    // serve multiple realms, by running multiple radosgw instances.
+    S3ObjectRealm(cluster, name):: {
+        cluster:: cluster,
+        realm: kube._Object("ceph.rook.io/v1", "CephObjectRealm", name) {
             metadata+: cluster.metadata,
+        },
+    },
+
+    // This is a rook CephObjectZoneGroup which corresponds to a radosgw
+    // zonegroup.
+    //
+    // A zonegroup contains zones, and zones within a zonegroup will serve a
+    // consistent view of objects in buckets, and will sync with each other to
+    // eventually contain the same data.
+    //
+    // A single zonegroup within a realm must be a 'master' zonegroup, and will
+    // then hold and replicate the metadata of this realm. All realm operations
+    // via radosgw-admin must be performed within the master zonegroup.
+    S3ObjectZoneGroup(realm, name):: {
+        realm:: realm,
+        zonegroup: kube._Object("ceph.rook.io/v1", "CephObjectZoneGroup", name) {
+            metadata+: realm.cluster.metadata,
+            spec+: {
+                realm: realm.realm.metadata.name,
+            },
+        },
+    },
+
+    // This is a CephObjectZone but also a CephObjectStore.
+    //
+    // Rook attempts to hide away Ceph's radosgw multisite structures
+    // (realm/zonegroup/zone) by presenting a single CRD named
+    // 'CephObjectStore'. When such a resource is created, Rook will create a
+    // realm, zonegroup and zone under the hood, as a radosgw zone is required
+    // to serve data, and a radosgw zone cannot exist without a zonegroup, and
+    // a radosgw zonegroup cannot exist without a realm.
+    //
+    // However, rook also exposes the lower-level API by letting the user
+    // specify 'zone' in the ObjectStore's spec, which should point to a
+    // CephObjectZone. Then, an entirely different reconciliation codepath is
+    // taken and instead users are expected to manage
+    // CephObject{Realm,ZoneGroup,Zone} manually at Ceph's native abstraction
+    // level.
+    //
+    // CephObjectStore not only represents a Ceph zone (and possibly
+    // zonegroup/realm), but also pods and services that are required to serve
+    // radosgw data publicly. That's why S3ObjectStore takes parameters like
+    // 'public port' and 'instance number'.
+    //
+    // To add to the confusion, our S3ObjectStore wrapper also sprinkles in an
+    // Ingress with TLS to terminate the above service, and automatically
+    // creates a CephObjectZone.
+    //
+    // This whole jsonnet abstraction basically forces users to manually create
+    // realms and zonegroups, but makes it very easy to do so. By forcing these
+    // to be explicitly created by rook objects, only the 'multi-site'
+    // reconciliation codepath is taken in rook, making things a bit more
+    // predictable.
+    S3ObjectStore(zonegroup, name):: {
+        local store = self,
+        spec:: {
+            dataPool: error "spec.dataPool must be specified",
+            metadataPool: error "spec.metadataPool must be specified",
+        },
+
+        cfg:: {
+            // We want to have each rgw run under a domain corresponding to the
+            // zone it's running in, but also to the zonegroup it's running in.
+            // This will allow us to DNS-loadbalance a zonegroup across
+            // multiple zone ingresses.
+            domainParts: [
+                store.zone.metadata.name,
+                zonegroup.zonegroup.metadata.name,
+            ],
+            domains: [
+                "object.ceph-%s.hswaw.net" % [part]
+                for part in cfg.domainParts
+            ],
+        },
+        local cfg = self.cfg,
+
+        zone: kube._Object("ceph.rook.io/v1", "CephObjectZone", name) {
+            metadata+: zonegroup.realm.cluster.metadata,
             spec: store.spec {
+                zoneGroup: zonegroup.zonegroup.metadata.name,
+            },
+        },
+
+        objectStore: kube._Object("ceph.rook.io/v1", "CephObjectStore", name) {
+            metadata+: zonegroup.realm.cluster.metadata,
+            spec: {
                 gateway: {
                     port: 80,
                     instances: 1,
                     allNodes: false,
                 },
+                zone: {
+                    name: name,
+                },
+                preservePoolsOnDelete: true,
             },
         },
 
         objectIngress: kube.Ingress(name) {
-            metadata+: cluster.metadata {
+            metadata+: zonegroup.realm.cluster.metadata {
                 annotations+: {
                     "kubernetes.io/tls-acme": "true",
                     "certmanager.k8s.io/cluster-issuer": "letsencrypt-prod",
                     "nginx.ingress.kubernetes.io/proxy-body-size": "0",
                 },
             },
+
             spec+: {
                 tls: [
                     {
-                        hosts: ["object.%s.hswaw.net" % [cluster.metadata.namespace]],
+                        hosts: cfg.domains,
                         secretName: "%s-tls" % [name],
                     },
                 ],
                 rules: [
                     {
-                        host: "object.%s.hswaw.net" % [cluster.metadata.namespace],
+                        host: domain,
                         http: {
                             paths: [
                                 {
@@ -1131,6 +1226,7 @@
                             ]
                         },
                     }
+                    for domain in cfg.domains
                 ],
             },
         },