From 843e0992ca44be86cd7ca97f337bdd55a91c1655 Mon Sep 17 00:00:00 2001 From: Shea Levy Date: Tue, 10 Oct 2017 12:24:30 -0400 Subject: Partition all-cabal-hashes into case-insensitive-safe components. OS X by default has a case-insensitive filesystem, and fetching all-cabal-hashes there fails due to a hash mismatch caused by package pairs like compactable and Compactable. This partitions the package set such that each partition contains no equivalent-up-to-case pairs. --- pkgs/data/misc/hackage/default.nix | 38 ++++- .../data/misc/hackage/partition-all-cabal-hashes.c | 190 +++++++++++++++++++++ 2 files changed, 220 insertions(+), 8 deletions(-) create mode 100644 pkgs/data/misc/hackage/partition-all-cabal-hashes.c (limited to 'pkgs/data/misc') diff --git a/pkgs/data/misc/hackage/default.nix b/pkgs/data/misc/hackage/default.nix index eddc833da9d1..eae7a814674e 100644 --- a/pkgs/data/misc/hackage/default.nix +++ b/pkgs/data/misc/hackage/default.nix @@ -1,11 +1,33 @@ -{ fetchFromGitHub }: +{ fetchFromGitHub +, stdenv +, pkgconfig +, libarchive +, glib +, # Override this to use a different revision + src-spec ? + { owner = "commercialhaskell"; + repo = "all-cabal-hashes"; + rev = "901c2522e6797270f5ded4495b1a529e6c16ef45"; + sha256 = "05jmwsgrk77nz9vvgfbpsps0l320qgjpkr2c9zhkn9sc3d275lfb"; + } +, lib +}: # Use builtins.fetchTarball "https://github.com/commercialhaskell/all-cabal-hashes/archive/hackage.tar.gz" # instead if you want the latest Hackage automatically at the price of frequent re-downloads. - -fetchFromGitHub { - owner = "commercialhaskell"; - repo = "all-cabal-hashes"; - rev = "901c2522e6797270f5ded4495b1a529e6c16ef45"; - sha256 = "0wng314y3yn6bbwa5ar254l7p8y99gsvm8ll4z7f3wg77v5fzish"; -} +let partition-all-cabal-hashes = stdenv.mkDerivation + { name = "partition-all-cabal-hashes"; + src = ./partition-all-cabal-hashes.c; + unpackPhase = "true"; + buildInputs = [ pkgconfig libarchive glib ]; + buildPhase = + "cc -O3 $(pkg-config --cflags --libs libarchive glib-2.0) $src -o partition-all-cabal-hashes"; + installPhase = + '' + mkdir -p $out/bin + install -m755 partition-all-cabal-hashes $out/bin + ''; + }; +in fetchFromGitHub (src-spec // + { postFetch = "${partition-all-cabal-hashes}/bin/partition-all-cabal-hashes $downloadedFile $out"; + }) diff --git a/pkgs/data/misc/hackage/partition-all-cabal-hashes.c b/pkgs/data/misc/hackage/partition-all-cabal-hashes.c new file mode 100644 index 000000000000..d8c3eca37019 --- /dev/null +++ b/pkgs/data/misc/hackage/partition-all-cabal-hashes.c @@ -0,0 +1,190 @@ +#include +#include +#include +#include +#include +#include +#include + +static char * case_normalize(char * str) { + for (char * iter = str; *iter; ++iter) { + *iter = tolower(*iter); + } + return str; +} + +static gint compare_str(const void * a, const void * b, void * _) { + return strcmp(a, b); +} + +int main(int argc, char ** argv) { + if (argc != 3) { + fprintf(stderr, "Usage: %s TARBALL OUTPUT\n", argv[0]); + return 1; + } + + size_t output_len = strlen(argv[2]); + + /* Switch to standard locale to ensure consistency in case-folding. + */ + setlocale(LC_CTYPE, "C"); + + /* Map from case-normalized package name to a sorted sequence of + * package names in the equivalence class defined by + * case-normalization. + */ + GHashTable * equivalence_classes = + g_hash_table_new(g_str_hash, g_str_equal); + + /* Open up the tarball. + */ + struct archive * ar = archive_read_new(); + if (!ar) { + perror("Allocating archive structure"); + return 1; + } + archive_read_support_filter_gzip(ar); + archive_read_support_format_tar(ar); + if (archive_read_open_filename( ar + , argv[1] + , 10240 + ) == ARCHIVE_FATAL) { + fprintf( stderr + , "Error opening %s: %s\n" + , argv[0] + , archive_error_string(ar) + ); + return 1; + } + + /* Extract the length of the output directory that prefixes all + * tarball entries from the first entry in the tarball. + */ + struct archive_entry * ent; + int err = archive_read_next_header(ar, &ent); + if (err != ARCHIVE_OK) { + if (err == ARCHIVE_EOF) { + fprintf( stderr + , "No entries in %s, surely this is an error!\n" + , argv[1] + ); + } else { + fprintf( stderr + , "Error reading entry from %s: %s\n" + , argv[1] + , archive_error_string(ar) + ); + } + return 1; + } + const char * path = archive_entry_pathname(ent); + /* Number of characters from the start of the path name until after + * the slash after the leading directory. + */ + size_t prefix_len = strchr(path, '/') - path + 1; + + /* Extract each entry to the right partition. + */ + do { + path = archive_entry_pathname(ent) + prefix_len; + const char * pkg_end = strchr(path, '/'); + if (!pkg_end) + /* If there is no second slash, then this is either just the entry + * corresponding to the root or some non-package file (e.g. + * travis.yml). In either case, we don't care. + */ + continue; + + /* Find our package in the equivalence class map. + */ + char * pkg_name = g_strndup(path, pkg_end - path); + char * pkg_normalized = + case_normalize(g_strndup(path, pkg_end - path)); + GSequence * pkg_class = + g_hash_table_lookup(equivalence_classes, pkg_normalized); + gint partition_num; + if (!pkg_class) { + /* We haven't seen any packages with this normalized name yet, + * so we need to initialize the sequence and add it to the map. + */ + pkg_class = g_sequence_new(NULL); + g_sequence_append(pkg_class, pkg_name); + g_hash_table_insert( equivalence_classes + , pkg_normalized + , pkg_class + ); + partition_num = 1; + } else { + g_free(pkg_normalized); + /* Find the package name in the equivalence class */ + GSequenceIter * pkg_iter = + g_sequence_search( pkg_class + , pkg_name + , compare_str + , NULL + ); + if (!g_sequence_iter_is_end(pkg_iter)) { + /* If there are any packages after this one in the list, bail + * out. In principle we could solve this by moving them up to + * the next partition, but so far I've never seen any github + * tarballs out of order so let's save ourselves the work + * until we know we need it. + */ + fprintf( stderr + , "Out of order github tarball: %s is after %s\n" + , pkg_name + , (char *) g_sequence_get(pkg_iter) + ); + return 1; + } + pkg_iter = g_sequence_iter_prev(pkg_iter); + if (strcmp( g_sequence_get(pkg_iter) + , pkg_name + ) != 0) { + /* This package doesn't have the same name as the one right + * before where it should be in the sequence, which means it's + * new and needs to be added to the sequence. + * + * !!! We need to change this to use g_sequence_insert_before + * if we ever get an out-of-order github tarball, see comment + * after the check for !g_sequence_iter_is_end(pkg_iter). + */ + pkg_iter = g_sequence_append(pkg_class, pkg_name); + } else { + g_free(pkg_name); + } + /* Get the partition number, starting with 1. + */ + partition_num = g_sequence_iter_get_position(pkg_iter) + 1; + } + + /* Set the destination path. + * The 3 below is for the length of /#/, the partition number part + * of the path. If we have more than 9 partitions, we deserve to + * segfault. The 1 at the end is for the trailing null. + */ + char * dest_path = g_malloc(output_len + 3 + strlen(path) + 1); + sprintf(dest_path, "%s/%d/%s", argv[2], partition_num, path); + archive_entry_set_pathname(ent, dest_path); + + if (archive_read_extract(ar, ent, 0) != ARCHIVE_OK) { + fprintf( stderr + , "Error extracting entry %s from %s: %s\n" + , dest_path + , argv[1] + , archive_error_string(ar) + ); + return 1; + } + } while ((err = archive_read_next_header(ar, &ent)) == ARCHIVE_OK); + if (err != ARCHIVE_EOF) { + fprintf( stderr + , "Error reading entry from %s: %s\n" + , argv[1] + , archive_error_string(ar) + ); + return 1; + } + + return 0; +} -- cgit 1.4.1