kube/policies: implement mostlysecure

This now allows running apt and should allow running most upstream Docker
images. In return, we prohibit some mildly sketchy stuff. But this is
safe enough for project namespaces with limited administrative access.

We should still get gVisor sooner rather than later...

Change-Id: Ida5ccfae440bacb6f3fd55dcc34ca0addfddd5ae
diff --git a/kube/policies.libsonnet b/kube/policies.libsonnet
index 242c00c..0f8d3d8 100644
--- a/kube/policies.libsonnet
+++ b/kube/policies.libsonnet
@@ -7,6 +7,12 @@
     policyNameAllowSecure: "policy:allow-secure",
     policyNameAllowMostlySecure: "policy:allow-mostlysecure",
 
+    # egrep 'define CAP_[A-Z_]+.+[0-9]+$' include/linux/capability.h | cut -d' ' -f 2 | tr '\n' ','
+    local allCapsStr = 'CAP_CHOWN,CAP_DAC_OVERRIDE,CAP_DAC_READ_SEARCH,CAP_FOWNER,CAP_FSETID,CAP_KILL,CAP_SETGID,CAP_SETUID,CAP_SETPCAP,CAP_LINUX_IMMUTABLE,CAP_NET_BIND_SERVICE,CAP_NET_BROADCAST,CAP_NET_ADMIN,CAP_NET_RAW,CAP_IPC_LOCK,CAP_IPC_OWNER,CAP_SYS_MODULE,CAP_SYS_RAWIO,CAP_SYS_CHROOT,CAP_SYS_PTRACE,CAP_SYS_PACCT,CAP_SYS_ADMIN,CAP_SYS_BOOT,CAP_SYS_NICE,CAP_SYS_RESOURCE,CAP_SYS_TIME,CAP_SYS_TTY_CONFIG,CAP_MKNOD,CAP_LEASE,CAP_AUDIT_WRITE,CAP_AUDIT_CONTROL,CAP_SETFCAP,CAP_MAC_OVERRIDE,CAP_MAC_ADMIN,CAP_SYSLOG,CAP_WAKE_ALARM,CAP_BLOCK_SUSPEND,CAP_AUDIT_READ',
+    // Split on ',', strip the 4-character 'CAP_' prefix from each name, then std.set() the result into a sorted, duplicate-free array (the set form std.setDiff operates on).
+    local allCaps = std.set(std.map(function(el) std.substr(el, 4, std.length(el)-4), std.split(allCapsStr, ','))),
+
+
     Cluster: {
         local cluster = self,
 
@@ -98,6 +104,7 @@
                     ],
                 },
                 readOnlyRootFilesystem: false,
+
             },
         },
         secureRole: kube.ClusterRole(policies.policyNameAllowSecure) {
@@ -111,13 +118,38 @@
             ],
         },
 
-        // MostlySecure: like secure, but allows for setuid inside containers.
+        // MostlySecure: like secure, but allows for setuid inside containers
+        // and enough filesystem access to run apt.
         mostlySecure: cluster.secure {
             metadata+: {
                 name: "mostlysecure",
             },
             spec+: {
-                allowPrivilegeEscalation: true,
+                requiredDropCapabilities: std.setDiff(allCaps, [
+                    // Drop everything apart from these (keep this list sorted and duplicate-free: std.setDiff operates on sets):
+                    "CHOWN",
+                    "DAC_OVERRIDE",
+                    "FOWNER",
+                    "LEASE",
+                    "SETGID",
+                    "SETUID",
+                ]),
+                supplementalGroups: {
+                    // Allow running as root gid - we allow running as root
+                    // uid anyway, as we trust our container runtime.
+                    rule: 'MustRunAs',
+                    ranges: [
+                        { min: 0, max: 65535, },
+                    ],
+                },
+                fsGroup: {
+                    // Allow setting the fsGroup to 0, as all filesystem mounts
+                    // are trusted anyway.
+                    rule: 'MustRunAs',
+                    ranges: [
+                        { min: 0, max: 65535, },
+                    ],
+                },
             },
         },
         mostlySecureRole: kube.ClusterRole(policies.policyNameAllowMostlySecure) {