 nixpkgs/pkgs/build-support/docker/stream_layered_image.py | 398 ++++++++++
 1 file changed, 398 insertions(+), 0 deletions(-)
diff --git a/nixpkgs/pkgs/build-support/docker/stream_layered_image.py b/nixpkgs/pkgs/build-support/docker/stream_layered_image.py
new file mode 100644
index 000000000000..98ec2358cef9
--- /dev/null
+++ b/nixpkgs/pkgs/build-support/docker/stream_layered_image.py
@@ -0,0 +1,398 @@
+"""
+This script generates a Docker image from a set of store paths. Uses
+Docker Image Specification v1.2 as reference [1].
+
+It expects a JSON file with the following properties and writes the
+image as an uncompressed tarball to stdout:
+
+* "architecture", "config", "os", "created", "repo_tag" correspond to
+  the fields with the same name on the image spec [2].
+* "created" can be "now".
+* "created" is also used as mtime for files added to the image.
+* "uid", "gid", "uname", "gname" is the file ownership, for example,
+  0, 0, "root", "root".
+* "store_layers" is a list of layers in ascending order, where each
+  layer is the list of store paths to include in that layer.
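+
+For example (an abridged, illustrative input; the store path is made
+up, and main() additionally reads "config", "store_dir", "from_image"
+and "customisation_layer" keys):
+
+    {
+      "architecture": "amd64",
+      "created": "now",
+      "repo_tag": "example/hello:latest",
+      "uid": 0, "gid": 0, "uname": "root", "gname": "root",
+      "store_layers": [
+        ["/nix/store/aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa-hello"]
+      ]
+    }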
+
+The main challenge for this script is to create the final image in a
+streaming fashion, without dumping any intermediate data to disk,
+for performance reasons.
+
+A Docker image archives each layer's contents as a separate tarball,
+and all of them later get enveloped into a single big tarball in a
+content-addressed fashion. However, because of how the "tar" format
+works, we have to know the name (which includes the checksum in our
+case) and the size of a layer tarball before we can start adding it
+to the outer tarball. We achieve that by creating each layer tarball
+twice: on the first iteration we calculate the file size and the
+checksum, and on the second one we actually stream the contents. The
+'add_layer_dir' function does all this.
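+
+In sketch form (using the helpers defined below; arguments elided):
+
+    checksum_stream = ExtractChecksum()
+    archive_paths_to(checksum_stream, paths, ...)       # pass 1: size+digest
+    (checksum, size) = checksum_stream.extract()
+    archive_paths_to(outer_tarball_stream, paths, ...)  # pass 2: real bytes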
+
+[1]: https://github.com/moby/moby/blob/master/image/spec/v1.2.md
+[2]: https://github.com/moby/moby/blob/4fb59c20a4fb54f944fe170d0ff1d00eb4a24d6f/image/spec/v1.2.md#image-json-field-descriptions
+"""  # noqa: E501
+
+
+import io
+import os
+import re
+import sys
+import json
+import hashlib
+import pathlib
+import tarfile
+import itertools
+import threading
+from datetime import datetime, timezone
+from collections import namedtuple
+
+
+def archive_paths_to(obj, paths, mtime, uid, gid, uname, gname):
+    """
+    Writes the given store paths as a tar file to the given stream.
+
+    obj: Stream to write to. Should have a 'write' method.
+    paths: List of store paths.
+    mtime, uid, gid, uname, gname: Metadata (timestamp and ownership)
+        applied to every file in the archive.
+    """
+
+    # gettarinfo makes the paths relative; this makes them
+    # absolute again
+    def append_root(ti):
+        ti.name = "/" + ti.name
+        return ti
+
+    def apply_filters(ti):
+        ti.mtime = mtime
+        ti.uid = uid
+        ti.gid = gid
+        ti.uname = uname
+        ti.gname = gname
+        return ti
+
+    def nix_root(ti):
+        ti.mode = 0o0755  # rwxr-xr-x
+        return ti
+
+    def dir(path):
+        ti = tarfile.TarInfo(path)
+        ti.type = tarfile.DIRTYPE
+        return ti
+
+    with tarfile.open(fileobj=obj, mode="w|") as tar:
+        # To be consistent with the docker utilities, we need to have
+        # these directories first when building layer tarballs.
+        tar.addfile(apply_filters(nix_root(dir("/nix"))))
+        tar.addfile(apply_filters(nix_root(dir("/nix/store"))))
+
+        for path in paths:
+            path = pathlib.Path(path)
+            if path.is_symlink():
+                files = [path]
+            else:
+                files = itertools.chain([path], path.rglob("*"))
+
+            for filename in sorted(files):
+                ti = append_root(tar.gettarinfo(filename))
+
+                # copy hardlinks as regular files
+                if ti.islnk():
+                    ti.type = tarfile.REGTYPE
+                    ti.linkname = ""
+                    ti.size = filename.stat().st_size
+
+                ti = apply_filters(ti)
+                if ti.isfile():
+                    with open(filename, "rb") as f:
+                        tar.addfile(ti, f)
+                else:
+                    tar.addfile(ti)
+
+
+class ExtractChecksum:
+    """
+    A writable stream which only calculates the final file size and
+    sha256sum, while discarding the actual contents.
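+
+    A minimal usage sketch:
+
+        ec = ExtractChecksum()
+        ec.write(b"some bytes")
+        digest, size = ec.extract()  # hex sha256 of b"some bytes", and 10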
+    """
+
+    def __init__(self):
+        self._digest = hashlib.sha256()
+        self._size = 0
+
+    def write(self, data):
+        self._digest.update(data)
+        self._size += len(data)
+
+    def extract(self):
+        """
+        Returns: Hex-encoded sha256sum and size as a tuple.
+        """
+        return (self._digest.hexdigest(), self._size)
+
+
+FromImage = namedtuple("FromImage", ["tar", "manifest_json", "image_json"])
+# Some metadata for a layer
+LayerInfo = namedtuple("LayerInfo", ["size", "checksum", "path", "paths"])
+
+
+def load_from_image(from_image_str):
+    """
+    Loads the given base image, if any.
+
+    from_image_str: Path to the base image archive.
+
+    Returns: A 'FromImage' object with references to the loaded base image,
+             or 'None' if no base image was provided.
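+
+    Only a few fields of the archive are consumed, e.g. (abridged):
+
+        manifest.json:  [{"Config": "<hash>.json", "Layers": [...]}]
+        <hash>.json:    {"config": {...}, "rootfs": {"diff_ids": [...]}}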
+    """
+    if from_image_str is None:
+        return None
+
+    base_tar = tarfile.open(from_image_str)
+
+    manifest_json_tarinfo = base_tar.getmember("manifest.json")
+    with base_tar.extractfile(manifest_json_tarinfo) as f:
+        manifest_json = json.load(f)
+
+    image_json_tarinfo = base_tar.getmember(manifest_json[0]["Config"])
+    with base_tar.extractfile(image_json_tarinfo) as f:
+        image_json = json.load(f)
+
+    return FromImage(base_tar, manifest_json, image_json)
+
+
+def add_base_layers(tar, from_image):
+    """
+    Adds the layers from the given base image to the final image.
+
+    tar: 'tarfile.TarFile' object for new layers to be added to.
+    from_image: 'FromImage' object with references to the loaded base image.
+
+    Yields: A 'LayerInfo' object for each base layer added.
+    """
+    if from_image is None:
+        print("No 'fromImage' provided", file=sys.stderr)
+        return []
+
+    layers = from_image.manifest_json[0]["Layers"]
+    checksums = from_image.image_json["rootfs"]["diff_ids"]
+    layers_checksums = zip(layers, checksums)
+
+    for num, (layer, checksum) in enumerate(layers_checksums, start=1):
+        layer_tarinfo = from_image.tar.getmember(layer)
+        checksum = re.sub(r"^sha256:", "", checksum)
+
+        tar.addfile(layer_tarinfo, from_image.tar.extractfile(layer_tarinfo))
+        path = layer_tarinfo.path
+        size = layer_tarinfo.size
+
+        print("Adding base layer", num, "from", path, file=sys.stderr)
+        yield LayerInfo(size=size, checksum=checksum, path=path, paths=[path])
+
+    from_image.tar.close()
+
+
+def overlay_base_config(from_image, final_config):
+    """
+    Overlays the final image 'config' JSON on top of selected defaults from the
+    base image 'config' JSON.
+
+    from_image: 'FromImage' object with references to the loaded base image.
+    final_config: 'dict' object of the final image 'config' JSON.
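+
+    For example (illustrative values): a base Env of
+    ["PATH=/usr/bin", "FOO=base"] merged with a final Env of
+    ["FOO=final"] resolves to ["PATH=/usr/bin", "FOO=final"];
+    on duplicate variable names, the final image wins.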
+    """
+    if from_image is None:
+        return final_config
+
+    base_config = from_image.image_json["config"]
+
+    # Preserve environment from base image
+    final_env = base_config.get("Env", []) + final_config.get("Env", [])
+    if final_env:
+        # Resolve duplicates (last one wins) and format back as list
+        resolved_env = {entry.split("=", 1)[0]: entry for entry in final_env}
+        final_config["Env"] = list(resolved_env.values())
+    return final_config
+
+
+def add_layer_dir(tar, paths, store_dir, mtime, uid, gid, uname, gname):
+    """
+    Appends given store paths to a TarFile object as a new layer.
+
+    tar: 'tarfile.TarFile' object for the new layer to be added to.
+    paths: List of store paths.
+    store_dir: The root directory of the Nix store.
+    mtime: 'mtime' of the added files and the layer tarball.
+           Should be an integer representing a POSIX time.
+    uid, gid, uname, gname: Ownership metadata applied to every
+           file in the layer.
+
+    Returns: A 'LayerInfo' object containing some metadata of
+             the layer added.
+    """
+
+    invalid_paths = [i for i in paths if not i.startswith(store_dir)]
+    assert len(invalid_paths) == 0, \
+        f"Expecting absolute paths from {store_dir}, but got: {invalid_paths}"
+
+    # First, calculate the tarball checksum and the size.
+    extract_checksum = ExtractChecksum()
+    archive_paths_to(
+        extract_checksum,
+        paths,
+        mtime, uid, gid, uname, gname
+    )
+    (checksum, size) = extract_checksum.extract()
+
+    path = f"{checksum}/layer.tar"
+    layer_tarinfo = tarfile.TarInfo(path)
+    layer_tarinfo.size = size
+    layer_tarinfo.mtime = mtime
+
+    # Then actually stream the contents to the outer tarball.
+    read_fd, write_fd = os.pipe()
+    with open(read_fd, "rb") as read, open(write_fd, "wb") as write:
+        def producer():
+            archive_paths_to(
+                write,
+                paths,
+                mtime, uid, gid, uname, gname
+            )
+            write.close()
+
+        # Closing the write end of the pipe signals EOF to the read
+        # end, so we don't need to wait until this thread is finished.
+        #
+        # Any exception from the thread will get printed by the default
+        # exception handler, and the 'addfile' call will fail since it
+        # won't be able to read the required amount of bytes.
+        threading.Thread(target=producer).start()
+        tar.addfile(layer_tarinfo, read)
+
+    return LayerInfo(size=size, checksum=checksum, path=path, paths=paths)
+
+
+def add_customisation_layer(target_tar, customisation_layer, mtime):
+    """
+    Adds the customisation layer as a new layer. This layer is structured
+    differently; the given store path already contains the 'layer.tar' and
+    the corresponding sha256sum, ready to go.
+
+    target_tar: 'tarfile.TarFile' object for the new layer to be added to.
+    customisation_layer: Path containing the layer archive.
+    mtime: 'mtime' of the added layer tarball.
+
+    Returns: A 'LayerInfo' object containing some metadata of
+             the layer added.
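+
+    The store path is expected to already contain, as produced by the
+    Nix build rather than by this script (illustrative layout):
+
+        <customisation_layer>/checksum   # hex-encoded sha256 of layer.tar
+        <customisation_layer>/layer.tar  # the layer contents as a tarball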
+    """
+
+    checksum_path = os.path.join(customisation_layer, "checksum")
+    with open(checksum_path) as f:
+        checksum = f.read().strip()
+    assert len(checksum) == 64, f"Invalid sha256 at {checksum_path}."
+
+    layer_path = os.path.join(customisation_layer, "layer.tar")
+
+    path = f"{checksum}/layer.tar"
+    tarinfo = target_tar.gettarinfo(layer_path)
+    tarinfo.name = path
+    tarinfo.mtime = mtime
+
+    with open(layer_path, "rb") as f:
+        target_tar.addfile(tarinfo, f)
+
+    return LayerInfo(
+      size=None,
+      checksum=checksum,
+      path=path,
+      paths=[customisation_layer]
+    )
+
+
+def add_bytes(tar, path, content, mtime):
+    """
+    Adds a file to the tarball with given path and contents.
+
+    tar: 'tarfile.TarFile' object.
+    path: Path of the file as a string.
+    content: Contents of the file.
+    mtime: 'mtime' of the file. Should be an integer representing a POSIX time.
+    """
+    assert type(content) is bytes
+
+    ti = tarfile.TarInfo(path)
+    ti.size = len(content)
+    ti.mtime = mtime
+    tar.addfile(ti, io.BytesIO(content))
+
+
+def main():
+    with open(sys.argv[1], "r") as f:
+        conf = json.load(f)
+
+    created = (
+      datetime.now(tz=timezone.utc)
+      if conf["created"] == "now"
+      else datetime.fromisoformat(conf["created"])
+    )
+    mtime = int(created.timestamp())
+    uid = int(conf["uid"])
+    gid = int(conf["gid"])
+    uname = conf["uname"]
+    gname = conf["gname"]
+    store_dir = conf["store_dir"]
+
+    from_image = load_from_image(conf["from_image"])
+
+    with tarfile.open(mode="w|", fileobj=sys.stdout.buffer) as tar:
+        layers = []
+        layers.extend(add_base_layers(tar, from_image))
+
+        start = len(layers) + 1
+        for num, store_layer in enumerate(conf["store_layers"], start=start):
+            print("Creating layer", num, "from paths:", store_layer,
+                  file=sys.stderr)
+            info = add_layer_dir(tar, store_layer, store_dir,
+                                 mtime, uid, gid, uname, gname)
+            layers.append(info)
+
+        print("Creating layer", len(layers) + 1, "with customisation...",
+              file=sys.stderr)
+        layers.append(
+          add_customisation_layer(
+            tar,
+            conf["customisation_layer"],
+            mtime=mtime
+          )
+        )
+
+        print("Adding manifests...", file=sys.stderr)
+
+        image_json = {
+            "created": datetime.isoformat(created),
+            "architecture": conf["architecture"],
+            "os": "linux",
+            "config": overlay_base_config(from_image, conf["config"]),
+            "rootfs": {
+                "diff_ids": [f"sha256:{layer.checksum}" for layer in layers],
+                "type": "layers",
+            },
+            "history": [
+                {
+                  "created": datetime.isoformat(created),
+                  "comment": f"store paths: {layer.paths}"
+                }
+                for layer in layers
+            ],
+        }
+
+        image_json = json.dumps(image_json, indent=4).encode("utf-8")
+        image_json_checksum = hashlib.sha256(image_json).hexdigest()
+        image_json_path = f"{image_json_checksum}.json"
+        add_bytes(tar, image_json_path, image_json, mtime=mtime)
+
+        manifest_json = [
+            {
+                "Config": image_json_path,
+                "RepoTags": [conf["repo_tag"]],
+                "Layers": [layer.path for layer in layers],
+            }
+        ]
+        manifest_json = json.dumps(manifest_json, indent=4).encode("utf-8")
+        add_bytes(tar, "manifest.json", manifest_json, mtime=mtime)
+
+        print("Done.", file=sys.stderr)
+
+
+if __name__ == "__main__":
+    main()