Merge "k0.hswaw.net: pass metallb through Calico"
diff --git a/cluster/kube/k0.calico.yaml b/cluster/kube/k0.calico.yaml
new file mode 100644
index 0000000..eef2661
--- /dev/null
+++ b/cluster/kube/k0.calico.yaml
@@ -0,0 +1,78 @@
+# This is the current Calico configuration in k0.hswaw.net.
+# Unfortunately, we do not have Calico configured to use CRDs, and instead to
+# keep its resources separately from Kubernetes. Thus, this configuration
+# cannot be managed by Kubernetes/jsonnet. Instead, it must be applied manually:
+#
+#     calicoctl apply -f k0.calico.yaml
+
+apiVersion: projectcalico.org/v3
+kind: BGPConfiguration
+metadata:
+  name: default
+spec:
+  logSeverityScreen: Info
+  nodeToNodeMeshEnabled: true
+  asNumber: 65003
+
+---
+
+# metallb peer, must be compatible with the metallbc definition in k0.libsonnet.
+apiVersion: projectcalico.org/v3
+kind: BGPPeer
+metadata:
+  name: metallb
+spec:
+  peerIP: 127.0.0.1
+  asNumber: 65002
+
+---
+
+# ToR switch peering, must be compatible with the configuration on dcsw01.hswaw.net.
+apiVersion: projectcalico.org/v3
+kind: BGPPeer
+metadata:
+  name: dcsw01
+spec:
+  peerIP: 185.236.240.33
+  asNumber: 65001
+
+---
+
+# IP pool that's used by metallb. We mark it as disabled so that Calico doesn't
+# allocate Service IPs from it, just allow metallb routes from that pool to
+# pass through eBGP (otherwise Calico BIRD filter will filter them out).
+# Keep in sync with k0.libsonnet.
+apiVersion: projectcalico.org/v3
+kind: IPPool
+metadata:
+  name: public-v4-1
+spec:
+  cidr: 185.236.240.48/28
+  disabled: true
+---
+
+# IP pool that's used by metallb. We mark it as disabled so that Calico doesn't
+# allocate Service IPs from it, just allow metallb routes from that pool to
+# pass through eBGP (otherwise Calico BIRD filter will filter them out).
+# Keep in sync with k0.libsonnet.
+apiVersion: projectcalico.org/v3
+kind: IPPool
+metadata:
+  name: public-v4-2
+spec:
+  cidr: 185.236.240.112/28
+  disabled: true
+
+---
+
+# IP pool for the service network.
+apiVersion: projectcalico.org/v3
+kind: IPPool
+metadata:
+  name: default-ipv4-ippool
+spec:
+  blockSize: 26
+  cidr: 10.10.24.0/21
+  ipipMode: CrossSubnet
+  natOutgoing: true
+
diff --git a/cluster/kube/k0.libsonnet b/cluster/kube/k0.libsonnet
index 6146085..45ae4c1 100644
--- a/cluster/kube/k0.libsonnet
+++ b/cluster/kube/k0.libsonnet
@@ -20,13 +20,15 @@
             },
             metallb+: {
                 cfg+: {
+                    // Peer with calico running on same node.
                     peers: [
                         {
-                            "peer-address": "185.236.240.33",
-                            "peer-asn": 65001,
+                            "peer-address": "127.0.0.1",
+                            "peer-asn": 65003,
                             "my-asn": 65002,
                         },
                     ],
+                    // Public IP address pools. Keep in sync with k0.calico.yaml.
                     addressPools: [
                         {
                             name: "public-v4-1",
diff --git a/cluster/kube/lib/calico-bird-ipam.cfg.template b/cluster/kube/lib/calico-bird-ipam.cfg.template
new file mode 100644
index 0000000..869a480
--- /dev/null
+++ b/cluster/kube/lib/calico-bird-ipam.cfg.template
@@ -0,0 +1,66 @@
+# This is forked from bird.cfg.template from calico running on k0.hswaw.net on 2020/09/21.
+# Changed vs. upstream (C-f HSCLOUD):
+#  - do not program RTD_UNREACHABLE routes into the kernel (these come from metallb, and
+#    programming them seems to break things)
+# Generated by confd
+filter calico_export_to_bgp_peers {
+  calico_aggr();
+{{- $static_key := "/staticroutes"}}
+{{- if ls $static_key}}
+
+  # Export static routes.
+  {{- range ls $static_key}}
+    {{- $parts := split . "-"}}
+    {{- $cidr := join $parts "/"}}
+  if ( net ~ {{$cidr}} ) then { accept; }
+  {{- end}}
+{{- end}}
+{{range ls "/v1/ipam/v4/pool"}}{{$data := json (getv (printf "/v1/ipam/v4/pool/%s" .))}}
+  if ( net ~ {{$data.cidr}} ) then {
+    accept;
+  }
+{{- end}}
+  reject;
+}
+
+{{$network_key := printf "/bgp/v1/host/%s/network_v4" (getenv "NODENAME")}}
+filter calico_kernel_programming {
+{{- $reject_key := "/rejectcidrs"}}
+{{- if ls $reject_key}}
+
+  if ( dest = RTD_UNREACHABLE ) then { # HSCLOUD
+    reject;
+  }
+
+  # Don't program static routes into kernel.
+  {{- range ls $reject_key}}
+    {{- $parts := split . "-"}}
+    {{- $cidr := join $parts "/"}}
+  if ( net ~ {{$cidr}} ) then { reject; }
+  {{- end}}
+
+{{- end}}
+{{- if exists $network_key}}{{$network := getv $network_key}}
+{{range ls "/v1/ipam/v4/pool"}}{{$data := json (getv (printf "/v1/ipam/v4/pool/%s" .))}}
+  if ( net ~ {{$data.cidr}} ) then {
+{{- if $data.vxlan_mode}}
+    # Don't program VXLAN routes into the kernel - these are handled by Felix.
+    reject;
+  }
+{{- else if $data.ipip_mode}}{{if eq $data.ipip_mode "cross-subnet"}}
+    if defined(bgp_next_hop) && ( bgp_next_hop ~ {{$network}} ) then
+      krt_tunnel = "";                     {{- /* Destination in ipPool, mode is cross sub-net, route from-host on subnet, do not use IPIP */}}
+    else
+      krt_tunnel = "{{$data.ipip}}";       {{- /* Destination in ipPool, mode is cross sub-net, route from-host off subnet, set the tunnel (if IPIP not enabled, value will be "") */}}
+    accept;
+  } {{- else}}
+    krt_tunnel = "{{$data.ipip}}";         {{- /* Destination in ipPool, mode not cross sub-net, set the tunnel (if IPIP not enabled, value will be "") */}}
+    accept;
+  } {{- end}} {{- else}}
+    krt_tunnel = "{{$data.ipip}}";         {{- /* Destination in ipPool, mode field is not present, set the tunnel (if IPIP not enabled, value will be "") */}}
+    accept;
+  } {{- end}}
+{{end}}
+{{- end}}{{/* End of 'exists $network_key' */}}
+  accept;                                  {{- /* Destination is not in any ipPool, accept  */}}
+}
diff --git a/cluster/kube/lib/calico-bird.cfg.template b/cluster/kube/lib/calico-bird.cfg.template
new file mode 100644
index 0000000..8a79deb
--- /dev/null
+++ b/cluster/kube/lib/calico-bird.cfg.template
@@ -0,0 +1,164 @@
+# This is forked from bird.cfg.template from calico running on k0.hswaw.net on 2020/09/21.
+# Changed vs. upstream (C-f HSCLOUD):
+#  - set 'passive on' on 127.0.0.1 neighbors, used for establishing connectivity
+#    with metallb.
+# Generated by confd
+include "bird_aggr.cfg";
+include "bird_ipam.cfg";
+
+{{- $node_ip_key := printf "/host/%s/ip_addr_v4" (getenv "NODENAME")}}{{$node_ip := getv $node_ip_key}}
+{{- $router_id := getenv "CALICO_ROUTER_ID" ""}}
+
+{{- $node_name := getenv "NODENAME"}}
+
+router id {{if eq "hash" ($router_id) -}}
+	{{hashToIPv4 $node_name}};
+{{- else -}}
+	{{if ne "" ($router_id)}}{{$router_id}}{{else}}{{$node_ip}}{{end}};
+{{- end}}
+
+{{- define "LOGGING"}}
+{{- $node_logging_key := printf "/host/%s/loglevel" (getenv "NODENAME")}}
+{{- if exists $node_logging_key}}
+{{- $logging := getv $node_logging_key}}
+{{- if eq $logging "debug"}}
+  debug all;
+{{- else if ne $logging "none"}}
+  debug { states };
+{{- end}}
+{{- else if exists "/global/loglevel"}}
+{{- $logging := getv "/global/loglevel"}}
+{{- if eq $logging "debug"}}
+  debug all;
+{{- else if ne $logging "none"}}
+  debug { states };
+{{- end}}
+{{- else}}
+  debug { states };
+{{- end}}
+{{- end}}
+
+# Configure synchronization between routing tables and kernel.
+protocol kernel {
+  learn;             # Learn all alien routes from the kernel
+  persist;           # Don't remove routes on bird shutdown
+  scan time 2;       # Scan kernel routing table every 2 seconds
+  import all;
+  export filter calico_kernel_programming; # Default is export none
+  graceful restart;  # Turn on graceful restart to reduce potential flaps in
+                     # routes when reloading BIRD configuration.  With a full
+                     # automatic mesh, there is no way to prevent BGP from
+                     # flapping since multiple nodes update their BGP
+                     # configuration at the same time, GR is not guaranteed to
+                     # work correctly in this scenario.
+}
+
+# Watch interface up/down events.
+protocol device {
+{{- template "LOGGING"}}
+  scan time 2;    # Scan interfaces every 2 seconds
+}
+
+protocol direct {
+{{- template "LOGGING"}}
+  interface -"cali*", -"kube-ipvs*", "*"; # Exclude cali* and kube-ipvs* but
+                                          # include everything else.  In
+                                          # IPVS-mode, kube-proxy creates a
+                                          # kube-ipvs0 interface. We exclude
+                                          # kube-ipvs0 because this interface
+                                          # gets an address for every in use
+                                          # cluster IP. We use static routes
+                                          # for when we legitimately want to
+                                          # export cluster IPs.
+}
+
+{{if eq "" ($node_ip)}}# IPv4 disabled on this node.
+{{else}}{{$node_as_key := printf "/host/%s/as_num" (getenv "NODENAME")}}
+# Template for all BGP clients
+template bgp bgp_template {
+{{- $as_key := or (and (exists $node_as_key) $node_as_key) "/global/as_num"}}
+{{- $node_as_num := getv $as_key}}
+{{- template "LOGGING"}}
+  description "Connection to BGP peer";
+  local as {{$node_as_num}};
+  multihop;
+  gateway recursive; # This should be the default, but just in case.
+  import all;        # Import all routes, since we don't know what the upstream
+                     # topology is and therefore have to trust the ToR/RR.
+  export filter calico_export_to_bgp_peers;  # Only want to export routes for workloads.
+  source address {{$node_ip}};  # The local address we use for the TCP connection
+  add paths on;
+  graceful restart;  # See comment in kernel section about graceful restart.
+  connect delay time 2;
+  connect retry time 5;
+  error wait time 5,30;
+}
+
+# ------------- Node-to-node mesh -------------
+{{- $node_cid_key := printf "/host/%s/rr_cluster_id" (getenv "NODENAME")}}
+{{- $node_cluster_id := getv $node_cid_key}}
+{{if (json (getv "/global/node_mesh")).enabled}}
+{{range $host := lsdir "/host"}}
+{{$onode_as_key := printf "/host/%s/as_num" .}}
+{{$onode_ip_key := printf "/host/%s/ip_addr_v4" .}}{{if exists $onode_ip_key}}{{$onode_ip := getv $onode_ip_key}}
+{{$nums := split $onode_ip "."}}{{$id := join $nums "_"}}
+# For peer {{$onode_ip_key}}
+{{if eq $onode_ip ($node_ip) }}# Skipping ourselves ({{$node_ip}})
+{{else if ne "" $onode_ip}}protocol bgp Mesh_{{$id}} from bgp_template {
+  neighbor {{$onode_ip}} as {{if exists $onode_as_key}}{{getv $onode_as_key}}{{else}}{{getv "/global/as_num"}}{{end}};
+  {{- /*
+       Make the peering unidirectional. This avoids a race where
+       - peer A opens a connection and begins a graceful restart
+       - before the restart completes, peer B opens its connection
+       - peer A sees the new connection and aborts the graceful restart, causing a route flap.
+  */ -}}
+  {{if gt $onode_ip $node_ip}}
+  passive on; # Mesh is unidirectional, peer will connect to us.
+  {{- end}}
+}{{end}}{{end}}{{end}}
+{{else}}
+# Node-to-node mesh disabled
+{{end}}
+
+
+# ------------- Global peers -------------
+{{if ls "/global/peer_v4"}}
+{{range gets "/global/peer_v4/*"}}{{$data := json .Value}}
+{{$nums := split $data.ip "."}}{{$id := join $nums "_"}}
+# For peer {{.Key}}
+{{- if eq $data.ip ($node_ip) }}
+# Skipping ourselves ({{$node_ip}})
+{{- else}}
+protocol bgp Global_{{$id}} from bgp_template {
+  {{if eq $data.ip ("127.0.0.1")}}passive on; # HSCLOUD {{end}}
+  neighbor {{$data.ip}} as {{$data.as_num}};
+{{- if and (eq $data.as_num $node_as_num) (ne "" ($node_cluster_id)) (ne $data.rr_cluster_id ($node_cluster_id))}}
+  rr client;
+  rr cluster id {{$node_cluster_id}};
+{{- end}}
+}
+{{- end}}
+{{end}}
+{{else}}# No global peers configured.{{end}}
+
+
+# ------------- Node-specific peers -------------
+{{$node_peers_key := printf "/host/%s/peer_v4" (getenv "NODENAME")}}
+{{if ls $node_peers_key}}
+{{range gets (printf "%s/*" $node_peers_key)}}{{$data := json .Value}}
+{{$nums := split $data.ip "."}}{{$id := join $nums "_"}}
+# For peer {{.Key}}
+{{- if eq $data.ip ($node_ip) }}
+# Skipping ourselves ({{$node_ip}})
+{{- else}}
+protocol bgp Node_{{$id}} from bgp_template {
+  neighbor {{$data.ip}} as {{$data.as_num}};
+{{- if and (eq $data.as_num $node_as_num) (ne "" ($node_cluster_id)) (ne $data.rr_cluster_id ($node_cluster_id))}}
+  rr client;
+  rr cluster id {{$node_cluster_id}};
+{{- end}}
+}
+{{- end}}
+{{end}}
+{{else}}# No node-specific peers configured.{{end}}
+{{end}}{{/* End of IPv4 enable check */}}
diff --git a/cluster/kube/lib/calico.libsonnet b/cluster/kube/lib/calico.libsonnet
index b5c83a7..1e2d503 100644
--- a/cluster/kube/lib/calico.libsonnet
+++ b/cluster/kube/lib/calico.libsonnet
@@ -230,6 +230,17 @@
             },
         },
 
+        # ConfigMap that holds overridden bird.cfg.template and bird_ipam.cfg.template.
+        calicoMetallbBird: kube.ConfigMap("calico-metallb-bird") {
+            metadata+: {
+                namespace: cfg.namespace,
+            },
+            data: {
+                "bird.cfg.template": (importstr "calico-bird.cfg.template"),
+                "bird_ipam.cfg.template": (importstr "calico-bird-ipam.cfg.template"),
+            },
+        },
+
         nodeDaemon: kube.DaemonSet("calico-node") {
             metadata+: {
                 namespace: cfg.namespace,
@@ -258,6 +269,7 @@
                             xtables_lock: kube.HostPathVolume("/run/xtables.lock"),
                             var_run_calico: kube.HostPathVolume("/var/run/calico"),
                             var_lib_calico: kube.HostPathVolume("/var/lib/calico"),
+                            bird_cfg_template: kube.ConfigMapVolume(env.calicoMetallbBird),
                         },
                         initContainers_: {
                             installCNI: kube.Container("install-cni") {
@@ -335,6 +347,16 @@
                                     var_lib_calico: { mountPath: "/var/lib/calico" },
                                     secrets: { mountPath: env.cm.secretPrefix },
                                 },
+                                volumeMounts+: [
+                                    { name: "bird-cfg-template",
+                                      mountPath: "/etc/calico/confd/templates/bird.cfg.template",
+                                      subPath: "bird.cfg.template"
+                                    },
+                                    { name: "bird-cfg-template",
+                                      mountPath: "/etc/calico/confd/templates/bird_ipam.cfg.template",
+                                      subPath: "bird_ipam.cfg.template"
+                                    },
+                                ],
                             },
                         },
                     },