summary refs log tree commit diff
path: root/pkgs/build-support/references-by-popularity/closure-graph.py
diff options
context:
space:
mode:
Diffstat (limited to 'pkgs/build-support/references-by-popularity/closure-graph.py')
-rw-r--r--pkgs/build-support/references-by-popularity/closure-graph.py520
1 files changed, 520 insertions, 0 deletions
diff --git a/pkgs/build-support/references-by-popularity/closure-graph.py b/pkgs/build-support/references-by-popularity/closure-graph.py
new file mode 100644
index 000000000000..d67a5dfcf140
--- /dev/null
+++ b/pkgs/build-support/references-by-popularity/closure-graph.py
@@ -0,0 +1,520 @@
+# IMPORTANT: Making changes?
+#
+# Validate your changes with python3 ./closure-graph.py --test
+
+
+# Using a simple algorithm, convert the references to a path in to a
+# sorted list of dependent paths based on how often they're referenced
+# and how deep in the tree they live. Equally-"popular" paths are then
+# sorted by name.
+#
+# The existing writeReferencesToFile prints the paths in a simple
+# ascii-based sorting of the paths.
+#
+# Sorting the paths by graph improves the chances that the difference
+# between two builds appear near the end of the list, instead of near
+# the beginning. This makes a difference for Nix builds which export a
+# closure for another program to consume, if that program implements its
+# own level of binary diffing.
+#
+# For an example, Docker Images. If each store path is a separate layer
+# then Docker Images can be very efficiently transfered between systems,
+# and we get very good cache reuse between images built with the same
+# version of Nixpkgs. However, since Docker only reliably supports a
+# small number of layers (42) it is important to pick the individual
+# layers carefully. By storing very popular store paths in the first 40
+# layers, we improve the chances that the next Docker image will share
+# many of those layers.*
+#
+# Given the dependency tree:
+#
+#     A - B - C - D -\
+#      \   \   \      \
+#       \   \   \      \
+#        \   \ - E ---- F
+#         \- G
+#
+# Nodes which have multiple references are duplicated:
+#
+#     A - B - C - D - F
+#      \   \   \
+#       \   \   \- E - F
+#        \   \
+#         \   \- E - F
+#          \
+#           \- G
+#
+# Each leaf node is now replaced by a counter defaulted to 1:
+#
+#     A - B - C - D - (F:1)
+#      \   \   \
+#       \   \   \- E - (F:1)
+#        \   \
+#         \   \- E - (F:1)
+#          \
+#           \- (G:1)
+#
+# Then each leaf counter is merged with its parent node, replacing the
+# parent node with a counter of 1, and each existing counter being
+# incremented by 1. That is to say `- D - (F:1)` becomes `- (D:1, F:2)`:
+#
+#     A - B - C - (D:1, F:2)
+#      \   \   \
+#       \   \   \- (E:1, F:2)
+#        \   \
+#         \   \- (E:1, F:2)
+#          \
+#           \- (G:1)
+#
+# Then each leaf counter is merged with its parent node again, merging
+# any counters, then incrementing each:
+#
+#     A - B - (C:1, D:2, E:2, F:5)
+#      \   \
+#       \   \- (E:1, F:2)
+#        \
+#         \- (G:1)
+#
+# And again:
+#
+#     A - (B:1, C:2, D:3, E:4, F:8)
+#      \
+#       \- (G:1)
+#
+# And again:
+#
+#     (A:1, B:2, C:3, D:4, E:5, F:9, G:2)
+#
+# and then paths have the following "popularity":
+#
+#     A     1
+#     B     2
+#     C     3
+#     D     4
+#     E     5
+#     F     9
+#     G     2
+#
+# and the popularity contest would result in the paths being printed as:
+#
+#     F
+#     E
+#     D
+#     C
+#     B
+#     G
+#     A
+#
+# * Note: People who have used a Dockerfile before assume Docker's
+# Layers are inherently ordered. However, this is not true -- Docker
+# layers are content-addressable and are not explicitly layered until
+# they are composed in to an Image.
+
+import sys
+import json
+import unittest
+
+from pprint import pprint
+from collections import defaultdict
+
+# Find paths in the original dataset which are never referenced by
+# any other paths
+def find_roots(closures):
+    roots = [];
+
+    for closure in closures:
+        path = closure['path']
+        if not any_refer_to(path, closures):
+            roots.append(path)
+
+    return roots
+
+class TestFindRoots(unittest.TestCase):
+    def test_find_roots(self):
+        self.assertCountEqual(
+            find_roots([
+                {
+                    "path": "/nix/store/foo",
+                    "references": [
+                        "/nix/store/foo",
+                        "/nix/store/bar"
+                    ]
+                },
+                {
+                    "path": "/nix/store/bar",
+                    "references": [
+                        "/nix/store/bar",
+                        "/nix/store/tux"
+                    ]
+                },
+                {
+                    "path": "/nix/store/hello",
+                    "references": [
+                    ]
+                }
+            ]),
+            ["/nix/store/foo", "/nix/store/hello"]
+        )
+
+
+def any_refer_to(path, closures):
+    for closure in closures:
+        if path != closure['path']:
+            if path in closure['references']:
+                return True
+    return False
+
+class TestAnyReferTo(unittest.TestCase):
+    def test_has_references(self):
+        self.assertTrue(
+            any_refer_to(
+                "/nix/store/bar",
+                [
+                    {
+                        "path": "/nix/store/foo",
+                        "references": [
+                            "/nix/store/bar"
+                        ]
+                    },
+                ]
+            ),
+        )
+    def test_no_references(self):
+        self.assertFalse(
+            any_refer_to(
+                "/nix/store/foo",
+                [
+                    {
+                        "path": "/nix/store/foo",
+                        "references": [
+                            "/nix/store/foo",
+                            "/nix/store/bar"
+                        ]
+                    },
+                ]
+            ),
+        )
+
+def all_paths(closures):
+    paths = []
+    for closure in closures:
+        paths.append(closure['path'])
+        paths.extend(closure['references'])
+    paths.sort()
+    return list(set(paths))
+
+
+class TestAllPaths(unittest.TestCase):
+    def test_returns_all_paths(self):
+        self.assertCountEqual(
+            all_paths([
+                {
+                    "path": "/nix/store/foo",
+                    "references": [
+                        "/nix/store/foo",
+                        "/nix/store/bar"
+                    ]
+                },
+                {
+                    "path": "/nix/store/bar",
+                    "references": [
+                        "/nix/store/bar",
+                        "/nix/store/tux"
+                    ]
+                },
+                {
+                    "path": "/nix/store/hello",
+                    "references": [
+                    ]
+                }
+            ]),
+            ["/nix/store/foo", "/nix/store/bar", "/nix/store/hello", "/nix/store/tux",]
+        )
+    def test_no_references(self):
+        self.assertFalse(
+            any_refer_to(
+                "/nix/store/foo",
+                [
+                    {
+                        "path": "/nix/store/foo",
+                        "references": [
+                            "/nix/store/foo",
+                            "/nix/store/bar"
+                        ]
+                    },
+                ]
+            ),
+        )
+
+# Convert:
+#
+# [
+#    { path: /nix/store/foo, references: [ /nix/store/foo, /nix/store/bar, /nix/store/baz ] },
+#    { path: /nix/store/bar, references: [ /nix/store/bar, /nix/store/baz ] },
+#    { path: /nix/store/baz, references: [ /nix/store/baz, /nix/store/tux ] },
+#    { path: /nix/store/tux, references: [ /nix/store/tux ] }
+#  ]
+#
+# To:
+#    {
+#      /nix/store/foo: [ /nix/store/bar, /nix/store/baz ],
+#      /nix/store/bar: [ /nix/store/baz ],
+#      /nix/store/baz: [ /nix/store/tux ] },
+#      /nix/store/tux: [ ]
+#    }
+#
+# Note that it drops self-references to avoid loops.
+def make_lookup(closures):
+    lookup = {}
+
+    for closure in closures:
+        # paths often self-refer
+        nonreferential_paths = [ref for ref in closure['references'] if ref != closure['path']]
+        lookup[closure['path']] = nonreferential_paths
+
+    return lookup
+
+class TestMakeLookup(unittest.TestCase):
+    def test_returns_lookp(self):
+        self.assertDictEqual(
+            make_lookup([
+                {
+                    "path": "/nix/store/foo",
+                    "references": [
+                        "/nix/store/foo",
+                        "/nix/store/bar"
+                    ]
+                },
+                {
+                    "path": "/nix/store/bar",
+                    "references": [
+                        "/nix/store/bar",
+                        "/nix/store/tux"
+                    ]
+                },
+                {
+                    "path": "/nix/store/hello",
+                    "references": [
+                    ]
+                }
+            ]),
+            {
+                "/nix/store/foo": [ "/nix/store/bar" ],
+                "/nix/store/bar": [ "/nix/store/tux" ],
+                "/nix/store/hello": [ ],
+            }
+        )
+
+# Convert:
+#
+# /nix/store/foo with
+#  {
+#    /nix/store/foo: [ /nix/store/bar, /nix/store/baz ],
+#    /nix/store/bar: [ /nix/store/baz ],
+#    /nix/store/baz: [ /nix/store/tux ] },
+#    /nix/store/tux: [ ]
+#  }
+#
+# To:
+#
+# {
+#   /nix/store/bar: {
+#                    /nix/store/baz: {
+#                                     /nix/store/tux: {}
+#                    }
+#   },
+#   /nix/store/baz: {
+#                   /nix/store/tux: {}
+#   }
+# }
+def make_graph_segment_from_root(root, lookup):
+    children = {}
+    for ref in lookup[root]:
+        children[ref] = make_graph_segment_from_root(ref, lookup)
+    return children
+
+class TestMakeGraphSegmentFromRoot(unittest.TestCase):
+    def test_returns_graph(self):
+        self.assertDictEqual(
+            make_graph_segment_from_root("/nix/store/foo", {
+                "/nix/store/foo": [ "/nix/store/bar" ],
+                "/nix/store/bar": [ "/nix/store/tux" ],
+                "/nix/store/tux": [ ],
+                "/nix/store/hello": [ ],
+            }),
+            {
+                "/nix/store/bar": {
+                    "/nix/store/tux": {}
+                }
+            }
+        )
+    def test_returns_graph_tiny(self):
+        self.assertDictEqual(
+            make_graph_segment_from_root("/nix/store/tux", {
+                "/nix/store/foo": [ "/nix/store/bar" ],
+                "/nix/store/bar": [ "/nix/store/tux" ],
+                "/nix/store/tux": [ ],
+            }),
+            {}
+        )
+
+# Convert a graph segment in to a popularity-counted dictionary:
+#
+# From:
+# {
+#    /nix/store/foo: {
+#                      /nix/store/bar: {
+#                                        /nix/store/baz: {
+#                                                           /nix/store/tux: {}
+#                                        }
+#                      }
+#                      /nix/store/baz: {
+#                                         /nix/store/tux: {}
+#                      }
+#    }
+# }
+#
+# to:
+# [
+#   /nix/store/foo: 1
+#   /nix/store/bar: 2
+#   /nix/store/baz: 4
+#   /nix/store/tux: 6
+# ]
+def graph_popularity_contest(full_graph):
+    popularity = defaultdict(int)
+    for path, subgraph in full_graph.items():
+        popularity[path] += 1
+        subcontest = graph_popularity_contest(subgraph)
+        for subpath, subpopularity in subcontest.items():
+            popularity[subpath] += subpopularity + 1
+
+    return popularity
+
+class TestGraphPopularityContest(unittest.TestCase):
+    def test_counts_popularity(self):
+        self.assertDictEqual(
+            graph_popularity_contest({
+                "/nix/store/foo": {
+                    "/nix/store/bar": {
+                        "/nix/store/baz": {
+                            "/nix/store/tux": {}
+                        }
+                    },
+                    "/nix/store/baz": {
+                        "/nix/store/tux": {}
+                    }
+                }
+            }),
+            {
+                   "/nix/store/foo": 1,
+                   "/nix/store/bar": 2,
+                   "/nix/store/baz": 4,
+                   "/nix/store/tux": 6,
+            }
+        )
+
+# Emit a list of packages by popularity, most first:
+#
+# From:
+# [
+#   /nix/store/foo: 1
+#   /nix/store/bar: 1
+#   /nix/store/baz: 2
+#   /nix/store/tux: 2
+# ]
+#
+# To:
+# [ /nix/store/baz /nix/store/tux /nix/store/bar /nix/store/foo ]
+def order_by_popularity(paths):
+    paths_by_popularity = defaultdict(list)
+    popularities = []
+    for path, popularity in paths.items():
+        popularities.append(popularity)
+        paths_by_popularity[popularity].append(path)
+
+    popularities = list(set(popularities))
+    popularities.sort()
+
+    flat_ordered = []
+    for popularity in popularities:
+        paths = paths_by_popularity[popularity]
+        paths.sort(key=package_name)
+
+        flat_ordered.extend(reversed(paths))
+    return list(reversed(flat_ordered))
+
+
+class TestOrderByPopularity(unittest.TestCase):
+    def test_returns_in_order(self):
+        self.assertEqual(
+            order_by_popularity({
+                   "/nix/store/foo": 1,
+                   "/nix/store/bar": 1,
+                   "/nix/store/baz": 2,
+                   "/nix/store/tux": 2,
+            }),
+            [
+                "/nix/store/baz",
+                "/nix/store/tux",
+                "/nix/store/bar",
+                "/nix/store/foo"
+            ]
+        )
+
+def package_name(path):
+    parts = path.split('-')
+    start = parts.pop(0)
+    # don't throw away any data, so the order is always the same.
+    # even in cases where only the hash at the start has changed.
+    parts.append(start)
+    return '-'.join(parts)
+
+def main():
+    filename = sys.argv[1]
+    key = sys.argv[2]
+
+    with open(filename) as f:
+        data = json.load(f)
+
+    # Data comes in as:
+    # [
+    #    { path: /nix/store/foo, references: [ /nix/store/foo, /nix/store/bar, /nix/store/baz ] },
+    #    { path: /nix/store/bar, references: [ /nix/store/bar, /nix/store/baz ] },
+    #    { path: /nix/store/baz, references: [ /nix/store/baz, /nix/store/tux ] },
+    #    { path: /nix/store/tux, references: [ /nix/store/tux ] }
+    #  ]
+    #
+    # and we want to get out a list of paths ordered by how universally,
+    # important they are, ie: tux is referenced by every path, transitively
+    # so it should be #1
+    #
+    # [
+    #   /nix/store/tux,
+    #   /nix/store/baz,
+    #   /nix/store/bar,
+    #   /nix/store/foo,
+    # ]
+    graph = data[key]
+
+    roots = find_roots(graph);
+    lookup = make_lookup(graph)
+
+    full_graph = {}
+    for root in roots:
+        full_graph[root] = make_graph_segment_from_root(root, lookup)
+
+    ordered = order_by_popularity(graph_popularity_contest(full_graph))
+    missing = []
+    for path in all_paths(graph):
+        if path not in ordered:
+            missing.append(path)
+
+    ordered.extend(missing)
+    print("\n".join(ordered))
+
+if "--test" in sys.argv:
+    # Don't pass --test otherwise unittest gets mad
+    unittest.main(argv = [f for f in sys.argv if f != "--test" ])
+else:
+    main()