summary refs log tree commit diff
diff options
context:
space:
mode:
-rw-r--r-- nixos/modules/services/networking/dhcpcd.nix | 5
-rw-r--r-- nixos/modules/virtualisation/containers.nix | 113
-rw-r--r-- nixos/modules/virtualisation/run-in-netns.c | 50
3 files changed, 155 insertions, 13 deletions
diff --git a/nixos/modules/services/networking/dhcpcd.nix b/nixos/modules/services/networking/dhcpcd.nix
index d4ec96a18f6f..38a21a723d97 100644
--- a/nixos/modules/services/networking/dhcpcd.nix
+++ b/nixos/modules/services/networking/dhcpcd.nix
@@ -34,8 +34,9 @@ let
 
       # Ignore peth* devices; on Xen, they're renamed physical
       # Ethernet cards used for bridging.  Likewise for vif* and tap*
-      # (Xen) and virbr* and vnet* (libvirt).
-      denyinterfaces ${toString ignoredInterfaces} peth* vif* tap* tun* virbr* vnet* vboxnet*
+      # (Xen) and virbr* and vnet* (libvirt) and c-* and ctmp-* (NixOS
+      # containers).
+      denyinterfaces ${toString ignoredInterfaces} peth* vif* tap* tun* virbr* vnet* vboxnet* c-* ctmp-*
 
       ${config.networking.dhcpcd.extraConfig}
     '';
diff --git a/nixos/modules/virtualisation/containers.nix b/nixos/modules/virtualisation/containers.nix
index 6a4833e1e215..49046975d83e 100644
--- a/nixos/modules/virtualisation/containers.nix
+++ b/nixos/modules/virtualisation/containers.nix
@@ -2,6 +2,20 @@
 
 with pkgs.lib;
 
+let
+
+  runInNetns = pkgs.stdenv.mkDerivation {
+    name = "run-in-netns";
+    unpackPhase = "true";
+    buildPhase = ''
+      mkdir -p $out/bin
+      gcc ${./run-in-netns.c} -o $out/bin/run-in-netns
+    '';
+    installPhase = "true";
+  };
+
+in
+
 {
   options = {
 
@@ -45,6 +59,39 @@ with pkgs.lib;
               '';
             };
 
+            privateNetwork = mkOption {
+              type = types.bool;
+              default = false;
+              description = ''
+                Whether to give the container its own private virtual
+                Ethernet interface.  The interface is called
+                <literal>eth0</literal>, and is hooked up to the interface
+                <literal>c-<replaceable>container-name</replaceable></literal>
+                on the host.  If this option is not set, then the
+                container shares the network interfaces of the host,
+                and can bind to any port on any interface.
+              '';
+            };
+
+            hostAddress = mkOption {
+              type = types.nullOr types.str; # str: conflicting definitions are an error, never concatenated
+              default = null;
+              example = "10.231.136.1";
+              description = ''
+                The IPv4 address assigned to the host interface.
+              '';
+            };
+
+            localAddress = mkOption {
+              type = types.nullOr types.str; # str: conflicting definitions are an error, never concatenated
+              default = null;
+              example = "10.231.136.2";
+              description = ''
+                The IPv4 address assigned to <literal>eth0</literal>
+                in the container.
+              '';
+            };
+
           };
 
           config = mkMerge
@@ -97,32 +144,70 @@ with pkgs.lib;
 
   config = {
 
-    systemd.services = mapAttrs' (name: container: nameValuePair "container-${name}"
-      { description = "Container '${name}'";
+    systemd.services = mapAttrs' (name: cfg:
+      let
+        # FIXME: interface names have a maximum length.
+        ifaceHost = "c-${name}";
+        ifaceCont = "ctmp-${name}";
+        ns = "net-${name}";
+      in
+      nameValuePair "container-${name}" {
+        description = "Container '${name}'";
 
         wantedBy = [ "multi-user.target" ];
 
-        unitConfig.RequiresMountsFor = [ container.root ];
+        unitConfig.RequiresMountsFor = [ cfg.root ];
+
+        path = [ pkgs.iproute ];
 
         preStart =
           ''
-            mkdir -p -m 0755 ${container.root}/etc
-            if ! [ -e ${container.root}/etc/os-release ]; then
-              touch ${container.root}/etc/os-release
+            mkdir -p -m 0755 ${cfg.root}/etc
+            if ! [ -e ${cfg.root}/etc/os-release ]; then
+              touch ${cfg.root}/etc/os-release
             fi
 
             mkdir -p -m 0755 \
               /nix/var/nix/profiles/per-container/${name} \
               /nix/var/nix/gcroots/per-container/${name}
+          ''
+
+          + optionalString cfg.privateNetwork ''
+            # Cleanup from last time.
+            ip netns del ${ns} 2> /dev/null || true
+            ip link del ${ifaceHost} 2> /dev/null || true
+            ip link del ${ifaceCont} 2> /dev/null || true
+
+            # Create a pair of virtual ethernet devices.  On the host,
+            # we get ‘c-<container-name>’, and on the guest, we get
+            # ‘eth0’.
+            set -x
+            ip link add ${ifaceHost} type veth peer name ${ifaceCont}
+            ip netns add ${ns}
+            ip link set ${ifaceCont} netns ${ns}
+            ip netns exec ${ns} ip link set ${ifaceCont} name eth0
+            ip netns exec ${ns} ip link set dev eth0 up
+            ip link set dev ${ifaceHost} up
+            ${optionalString (cfg.hostAddress != null) ''
+              ip addr add ${cfg.hostAddress} dev ${ifaceHost}
+              ip netns exec ${ns} ip route add ${cfg.hostAddress} dev eth0
+              ip netns exec ${ns} ip route add default via ${cfg.hostAddress}
+            ''}
+            ${optionalString (cfg.localAddress != null) ''
+              ip netns exec ${ns} ip addr add ${cfg.localAddress} dev eth0
+              ip route add ${cfg.localAddress} dev ${ifaceHost}
+            ''}
           '';
 
         serviceConfig.ExecStart =
-          "${config.systemd.package}/bin/systemd-nspawn"
-          + " -M ${name} -D ${container.root}"
+          (optionalString cfg.privateNetwork "${runInNetns}/bin/run-in-netns ${ns} ")
+          + "${config.systemd.package}/bin/systemd-nspawn"
+          + (optionalString cfg.privateNetwork " --capability=CAP_NET_ADMIN")
+          + " -M ${name} -D ${cfg.root}"
           + " --bind-ro=/nix/store --bind-ro=/nix/var/nix/db --bind-ro=/nix/var/nix/daemon-socket"
           + " --bind=/nix/var/nix/profiles/per-container/${name}:/nix/var/nix/profiles"
           + " --bind=/nix/var/nix/gcroots/per-container/${name}:/nix/var/nix/gcroots"
-          + " ${container.path}/init";
+          + " ${cfg.path}/init";
 
         preStop =
           ''
@@ -146,10 +231,16 @@ with pkgs.lib;
 
         serviceConfig.ExecReload =
           "${pkgs.bash}/bin/bash -c '"
-          + "echo ${container.path}/bin/switch-to-configuration test "
-          + "| ${pkgs.socat}/bin/socat unix:${container.root}/var/lib/root-shell.socket -'";
+          + "echo ${cfg.path}/bin/switch-to-configuration test "
+          + "| ${pkgs.socat}/bin/socat unix:${cfg.root}/var/lib/root-shell.socket -'";
 
       }) config.systemd.containers;
 
+    # Generate /etc/hosts entries for the containers.
+    networking.extraHosts = concatStrings (mapAttrsToList (name: cfg: optionalString (cfg.localAddress != null)
+      ''
+        ${cfg.localAddress} ${name}.containers
+      '') config.systemd.containers);
+
   };
 }
diff --git a/nixos/modules/virtualisation/run-in-netns.c b/nixos/modules/virtualisation/run-in-netns.c
new file mode 100644
index 000000000000..d375bddf2e6b
--- /dev/null
+++ b/nixos/modules/virtualisation/run-in-netns.c
@@ -0,0 +1,50 @@
+#define _GNU_SOURCE
+
+#include <stdio.h>
+#include <string.h>
+#include <errno.h>
+
+#include <unistd.h>
+#include <sched.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <sys/mount.h>
+#include <fcntl.h>
+#include <linux/limits.h>
+
+/* Usage: run-in-netns NAME PROGRAM [ARG...].  Joins /run/netns/NAME, removes that name, then execs PROGRAM. */
+int main(int argc, char * * argv)
+{
+    if (argc < 3) {
+        fprintf(stderr, "%s: missing arguments\n", argv[0]);
+        return 1;
+    }
+    /* Bounded formatting; the size_t cast makes a negative (error) return fail the check as well. */
+    char nsPath[PATH_MAX];
+    if ((size_t) snprintf(nsPath, sizeof(nsPath), "/run/netns/%s", argv[1]) >= sizeof(nsPath)) {
+        fprintf(stderr, "%s: network namespace name too long\n", argv[0]);
+        return 1;
+    }
+    /* O_CLOEXEC: the descriptor is only needed for setns(), so it is not passed on to PROGRAM. */
+    int fd = open(nsPath, O_RDONLY | O_CLOEXEC);
+    if (fd == -1) {
+        fprintf(stderr, "%s: opening network namespace: %s\n", argv[0], strerror(errno));
+        return 1;
+    }
+    if (setns(fd, CLONE_NEWNET) == -1) {
+        fprintf(stderr, "%s: setting network namespace: %s\n", argv[0], strerror(errno));
+        return 1;
+    }
+
+    umount2(nsPath, MNT_DETACH); /* return value is not checked; the unlink() below is */
+    if (unlink(nsPath) == -1) {
+        fprintf(stderr, "%s: unlinking network namespace: %s\n", argv[0], strerror(errno));
+        return 1;
+    }
+    /* FIXME: Remount /sys so that /sys/class/net reflects the
+       interfaces visible in the network namespace. This requires
+       bind-mounting /sys/fs/cgroups etc. */
+    execv(argv[2], argv + 2);
+    fprintf(stderr, "%s: running command: %s\n", argv[0], strerror(errno));
+    return 1;
+}