diff options
Diffstat (limited to 'nixpkgs/pkgs/build-support/references-by-popularity/closure-graph.py')
-rw-r--r-- | nixpkgs/pkgs/build-support/references-by-popularity/closure-graph.py | 567 |
1 files changed, 567 insertions, 0 deletions
diff --git a/nixpkgs/pkgs/build-support/references-by-popularity/closure-graph.py b/nixpkgs/pkgs/build-support/references-by-popularity/closure-graph.py new file mode 100644 index 000000000000..4f8efd42ed81 --- /dev/null +++ b/nixpkgs/pkgs/build-support/references-by-popularity/closure-graph.py @@ -0,0 +1,567 @@ +# IMPORTANT: Making changes? +# +# Validate your changes with python3 ./closure-graph.py --test + + +# Using a simple algorithm, convert the references to a path in to a +# sorted list of dependent paths based on how often they're referenced +# and how deep in the tree they live. Equally-"popular" paths are then +# sorted by name. +# +# The existing writeClosure prints the paths in a simple ascii-based +# sorting of the paths. +# +# Sorting the paths by graph improves the chances that the difference +# between two builds appear near the end of the list, instead of near +# the beginning. This makes a difference for Nix builds which export a +# closure for another program to consume, if that program implements its +# own level of binary diffing. +# +# For an example, Docker Images. If each store path is a separate layer +# then Docker Images can be very efficiently transfered between systems, +# and we get very good cache reuse between images built with the same +# version of Nixpkgs. However, since Docker only reliably supports a +# small number of layers (42) it is important to pick the individual +# layers carefully. By storing very popular store paths in the first 40 +# layers, we improve the chances that the next Docker image will share +# many of those layers.* +# +# Given the dependency tree: +# +# A - B - C - D -\ +# \ \ \ \ +# \ \ \ \ +# \ \ - E ---- F +# \- G +# +# Nodes which have multiple references are duplicated: +# +# A - B - C - D - F +# \ \ \ +# \ \ \- E - F +# \ \ +# \ \- E - F +# \ +# \- G +# +# Each leaf node is now replaced by a counter defaulted to 1: +# +# A - B - C - D - (F:1) +# \ \ \ +# \ \ \- E - (F:1) +# \ \ +# \ \- E - (F:1) +# \ +# \- (G:1) +# +# Then each leaf counter is merged with its parent node, replacing the +# parent node with a counter of 1, and each existing counter being +# incremented by 1. That is to say `- D - (F:1)` becomes `- (D:1, F:2)`: +# +# A - B - C - (D:1, F:2) +# \ \ \ +# \ \ \- (E:1, F:2) +# \ \ +# \ \- (E:1, F:2) +# \ +# \- (G:1) +# +# Then each leaf counter is merged with its parent node again, merging +# any counters, then incrementing each: +# +# A - B - (C:1, D:2, E:2, F:5) +# \ \ +# \ \- (E:1, F:2) +# \ +# \- (G:1) +# +# And again: +# +# A - (B:1, C:2, D:3, E:4, F:8) +# \ +# \- (G:1) +# +# And again: +# +# (A:1, B:2, C:3, D:4, E:5, F:9, G:2) +# +# and then paths have the following "popularity": +# +# A 1 +# B 2 +# C 3 +# D 4 +# E 5 +# F 9 +# G 2 +# +# and the popularity contest would result in the paths being printed as: +# +# F +# E +# D +# C +# B +# G +# A +# +# * Note: People who have used a Dockerfile before assume Docker's +# Layers are inherently ordered. However, this is not true -- Docker +# layers are content-addressable and are not explicitly layered until +# they are composed in to an Image. + +import sys +import json +import unittest + +from pprint import pprint +from collections import defaultdict + + +def debug(msg, *args, **kwargs): + if False: + print( + "DEBUG: {}".format( + msg.format(*args, **kwargs) + ), + file=sys.stderr + ) + + +# Find paths in the original dataset which are never referenced by +# any other paths +def find_roots(closures): + roots = []; + + for closure in closures: + path = closure['path'] + if not any_refer_to(path, closures): + roots.append(path) + + return roots + +class TestFindRoots(unittest.TestCase): + def test_find_roots(self): + self.assertCountEqual( + find_roots([ + { + "path": "/nix/store/foo", + "references": [ + "/nix/store/foo", + "/nix/store/bar" + ] + }, + { + "path": "/nix/store/bar", + "references": [ + "/nix/store/bar", + "/nix/store/tux" + ] + }, + { + "path": "/nix/store/hello", + "references": [ + ] + } + ]), + ["/nix/store/foo", "/nix/store/hello"] + ) + + +def any_refer_to(path, closures): + for closure in closures: + if path != closure['path']: + if path in closure['references']: + return True + return False + +class TestAnyReferTo(unittest.TestCase): + def test_has_references(self): + self.assertTrue( + any_refer_to( + "/nix/store/bar", + [ + { + "path": "/nix/store/foo", + "references": [ + "/nix/store/bar" + ] + }, + ] + ), + ) + def test_no_references(self): + self.assertFalse( + any_refer_to( + "/nix/store/foo", + [ + { + "path": "/nix/store/foo", + "references": [ + "/nix/store/foo", + "/nix/store/bar" + ] + }, + ] + ), + ) + +def all_paths(closures): + paths = [] + for closure in closures: + paths.append(closure['path']) + paths.extend(closure['references']) + paths.sort() + return list(set(paths)) + + +class TestAllPaths(unittest.TestCase): + def test_returns_all_paths(self): + self.assertCountEqual( + all_paths([ + { + "path": "/nix/store/foo", + "references": [ + "/nix/store/foo", + "/nix/store/bar" + ] + }, + { + "path": "/nix/store/bar", + "references": [ + "/nix/store/bar", + "/nix/store/tux" + ] + }, + { + "path": "/nix/store/hello", + "references": [ + ] + } + ]), + ["/nix/store/foo", "/nix/store/bar", "/nix/store/hello", "/nix/store/tux",] + ) + def test_no_references(self): + self.assertFalse( + any_refer_to( + "/nix/store/foo", + [ + { + "path": "/nix/store/foo", + "references": [ + "/nix/store/foo", + "/nix/store/bar" + ] + }, + ] + ), + ) + +# Convert: +# +# [ +# { path: /nix/store/foo, references: [ /nix/store/foo, /nix/store/bar, /nix/store/baz ] }, +# { path: /nix/store/bar, references: [ /nix/store/bar, /nix/store/baz ] }, +# { path: /nix/store/baz, references: [ /nix/store/baz, /nix/store/tux ] }, +# { path: /nix/store/tux, references: [ /nix/store/tux ] } +# ] +# +# To: +# { +# /nix/store/foo: [ /nix/store/bar, /nix/store/baz ], +# /nix/store/bar: [ /nix/store/baz ], +# /nix/store/baz: [ /nix/store/tux ] }, +# /nix/store/tux: [ ] +# } +# +# Note that it drops self-references to avoid loops. +def make_lookup(closures): + lookup = {} + + for closure in closures: + # paths often self-refer + nonreferential_paths = [ref for ref in closure['references'] if ref != closure['path']] + lookup[closure['path']] = nonreferential_paths + + return lookup + +class TestMakeLookup(unittest.TestCase): + def test_returns_lookp(self): + self.assertDictEqual( + make_lookup([ + { + "path": "/nix/store/foo", + "references": [ + "/nix/store/foo", + "/nix/store/bar" + ] + }, + { + "path": "/nix/store/bar", + "references": [ + "/nix/store/bar", + "/nix/store/tux" + ] + }, + { + "path": "/nix/store/hello", + "references": [ + ] + } + ]), + { + "/nix/store/foo": [ "/nix/store/bar" ], + "/nix/store/bar": [ "/nix/store/tux" ], + "/nix/store/hello": [ ], + } + ) + +# Convert: +# +# /nix/store/foo with +# { +# /nix/store/foo: [ /nix/store/bar, /nix/store/baz ], +# /nix/store/bar: [ /nix/store/baz ], +# /nix/store/baz: [ /nix/store/tux ] }, +# /nix/store/tux: [ ] +# } +# +# To: +# +# { +# /nix/store/bar: { +# /nix/store/baz: { +# /nix/store/tux: {} +# } +# }, +# /nix/store/baz: { +# /nix/store/tux: {} +# } +# } +subgraphs_cache = {} +def make_graph_segment_from_root(root, lookup): + global subgraphs_cache + children = {} + for ref in lookup[root]: + # make_graph_segment_from_root is a pure function, and will + # always return the same result based on a given input. Thus, + # cache computation. + # + # Python's assignment will use a pointer, preventing memory + # bloat for large graphs. + if ref not in subgraphs_cache: + debug("Subgraph Cache miss on {}".format(ref)) + subgraphs_cache[ref] = make_graph_segment_from_root(ref, lookup) + else: + debug("Subgraph Cache hit on {}".format(ref)) + children[ref] = subgraphs_cache[ref] + return children + +class TestMakeGraphSegmentFromRoot(unittest.TestCase): + def test_returns_graph(self): + self.assertDictEqual( + make_graph_segment_from_root("/nix/store/foo", { + "/nix/store/foo": [ "/nix/store/bar" ], + "/nix/store/bar": [ "/nix/store/tux" ], + "/nix/store/tux": [ ], + "/nix/store/hello": [ ], + }), + { + "/nix/store/bar": { + "/nix/store/tux": {} + } + } + ) + def test_returns_graph_tiny(self): + self.assertDictEqual( + make_graph_segment_from_root("/nix/store/tux", { + "/nix/store/foo": [ "/nix/store/bar" ], + "/nix/store/bar": [ "/nix/store/tux" ], + "/nix/store/tux": [ ], + }), + {} + ) + +# Convert a graph segment in to a popularity-counted dictionary: +# +# From: +# { +# /nix/store/foo: { +# /nix/store/bar: { +# /nix/store/baz: { +# /nix/store/tux: {} +# } +# } +# /nix/store/baz: { +# /nix/store/tux: {} +# } +# } +# } +# +# to: +# [ +# /nix/store/foo: 1 +# /nix/store/bar: 2 +# /nix/store/baz: 4 +# /nix/store/tux: 6 +# ] +popularity_cache = {} +def graph_popularity_contest(full_graph): + global popularity_cache + popularity = defaultdict(int) + for path, subgraph in full_graph.items(): + popularity[path] += 1 + # graph_popularity_contest is a pure function, and will + # always return the same result based on a given input. Thus, + # cache computation. + # + # Python's assignment will use a pointer, preventing memory + # bloat for large graphs. + if path not in popularity_cache: + debug("Popularity Cache miss on {}", path) + popularity_cache[path] = graph_popularity_contest(subgraph) + else: + debug("Popularity Cache hit on {}", path) + + subcontest = popularity_cache[path] + for subpath, subpopularity in subcontest.items(): + debug("Calculating popularity for {}", subpath) + popularity[subpath] += subpopularity + 1 + + return popularity + +class TestGraphPopularityContest(unittest.TestCase): + def test_counts_popularity(self): + self.assertDictEqual( + graph_popularity_contest({ + "/nix/store/foo": { + "/nix/store/bar": { + "/nix/store/baz": { + "/nix/store/tux": {} + } + }, + "/nix/store/baz": { + "/nix/store/tux": {} + } + } + }), + { + "/nix/store/foo": 1, + "/nix/store/bar": 2, + "/nix/store/baz": 4, + "/nix/store/tux": 6, + } + ) + +# Emit a list of packages by popularity, most first: +# +# From: +# [ +# /nix/store/foo: 1 +# /nix/store/bar: 1 +# /nix/store/baz: 2 +# /nix/store/tux: 2 +# ] +# +# To: +# [ /nix/store/baz /nix/store/tux /nix/store/bar /nix/store/foo ] +def order_by_popularity(paths): + paths_by_popularity = defaultdict(list) + popularities = [] + for path, popularity in paths.items(): + popularities.append(popularity) + paths_by_popularity[popularity].append(path) + + popularities = list(set(popularities)) + popularities.sort() + + flat_ordered = [] + for popularity in popularities: + paths = paths_by_popularity[popularity] + paths.sort(key=package_name) + + flat_ordered.extend(reversed(paths)) + return list(reversed(flat_ordered)) + + +class TestOrderByPopularity(unittest.TestCase): + def test_returns_in_order(self): + self.assertEqual( + order_by_popularity({ + "/nix/store/foo": 1, + "/nix/store/bar": 1, + "/nix/store/baz": 2, + "/nix/store/tux": 2, + }), + [ + "/nix/store/baz", + "/nix/store/tux", + "/nix/store/bar", + "/nix/store/foo" + ] + ) + +def package_name(path): + parts = path.split('-') + start = parts.pop(0) + # don't throw away any data, so the order is always the same. + # even in cases where only the hash at the start has changed. + parts.append(start) + return '-'.join(parts) + +def main(): + filename = sys.argv[1] + key = sys.argv[2] + + debug("Loading from {}", filename) + with open(filename) as f: + data = json.load(f) + + # Data comes in as: + # [ + # { path: /nix/store/foo, references: [ /nix/store/foo, /nix/store/bar, /nix/store/baz ] }, + # { path: /nix/store/bar, references: [ /nix/store/bar, /nix/store/baz ] }, + # { path: /nix/store/baz, references: [ /nix/store/baz, /nix/store/tux ] }, + # { path: /nix/store/tux, references: [ /nix/store/tux ] } + # ] + # + # and we want to get out a list of paths ordered by how universally, + # important they are, ie: tux is referenced by every path, transitively + # so it should be #1 + # + # [ + # /nix/store/tux, + # /nix/store/baz, + # /nix/store/bar, + # /nix/store/foo, + # ] + graph = data[key] + + debug("Finding roots from {}", key) + roots = find_roots(graph); + debug("Making lookup for {}", key) + lookup = make_lookup(graph) + + full_graph = {} + for root in roots: + debug("Making full graph for {}", root) + full_graph[root] = make_graph_segment_from_root(root, lookup) + + debug("Running contest") + contest = graph_popularity_contest(full_graph) + debug("Ordering by popularity") + ordered = order_by_popularity(contest) + debug("Checking for missing paths") + missing = [] + for path in all_paths(graph): + if path not in ordered: + missing.append(path) + + ordered.extend(missing) + print("\n".join(ordered)) + +if "--test" in sys.argv: + # Don't pass --test otherwise unittest gets mad + unittest.main(argv = [f for f in sys.argv if f != "--test" ]) +else: + main() |