summary refs log tree commit diff
path: root/pkgs/data
diff options
context:
space:
mode:
author Shea Levy <shea@shealevy.com> 2017-10-15 18:13:29 -0400
committer Shea Levy <shea@shealevy.com> 2017-10-15 18:13:29 -0400
commit ad6b8f438db055a73e64218b5f2ed3cbca963ae5 (patch)
tree d6d8163848c619e4a284e32c00ab8a0faab86f06 /pkgs/data
parent 2d44e7280166256d8322546b11dcbe9b95f5bc5d (diff)
parent 3ef3fc43a22f53cf2c00b9994569c28a83898829 (diff)
download nixlib-ad6b8f438db055a73e64218b5f2ed3cbca963ae5.tar
nixlib-ad6b8f438db055a73e64218b5f2ed3cbca963ae5.tar.gz
nixlib-ad6b8f438db055a73e64218b5f2ed3cbca963ae5.tar.bz2
nixlib-ad6b8f438db055a73e64218b5f2ed3cbca963ae5.tar.lz
nixlib-ad6b8f438db055a73e64218b5f2ed3cbca963ae5.tar.xz
nixlib-ad6b8f438db055a73e64218b5f2ed3cbca963ae5.tar.zst
nixlib-ad6b8f438db055a73e64218b5f2ed3cbca963ae5.zip
Merge branch 'all-cabal-hashes-components'
Diffstat (limited to 'pkgs/data')
-rw-r--r-- pkgs/data/misc/hackage/default.nix 39
-rw-r--r-- pkgs/data/misc/hackage/partition-all-cabal-hashes.c 190
2 files changed, 221 insertions, 8 deletions
diff --git a/pkgs/data/misc/hackage/default.nix b/pkgs/data/misc/hackage/default.nix
index eddc833da9d1..14ae22798c26 100644
--- a/pkgs/data/misc/hackage/default.nix
+++ b/pkgs/data/misc/hackage/default.nix
@@ -1,11 +1,34 @@
-{ fetchFromGitHub }:
+{ fetchFromGitHub
+, stdenv
+, pkgconfig
+, libarchive
+, glib
+, # Override this to use a different revision
+  src-spec ?
+    { owner = "commercialhaskell";
+      repo = "all-cabal-hashes";
+      rev = "901c2522e6797270f5ded4495b1a529e6c16ef45";
+      sha256 = "05jmwsgrk77nz9vvgfbpsps0l320qgjpkr2c9zhkn9sc3d275lfb";
+    }
+, lib
+}:
 
 # Use builtins.fetchTarball "https://github.com/commercialhaskell/all-cabal-hashes/archive/hackage.tar.gz"
 # instead if you want the latest Hackage automatically at the price of frequent re-downloads.
-
-fetchFromGitHub {
-  owner = "commercialhaskell";
-  repo = "all-cabal-hashes";
-  rev = "901c2522e6797270f5ded4495b1a529e6c16ef45";
-  sha256 = "0wng314y3yn6bbwa5ar254l7p8y99gsvm8ll4z7f3wg77v5fzish";
-}
+# Helper program, compiled from the single C file next to this
+# expression, that is run as the postFetch step of the fetch below.
+let partition-all-cabal-hashes = stdenv.mkDerivation
+      { name = "partition-all-cabal-hashes";
+        src = ./partition-all-cabal-hashes.c;
+        # src is one plain .c file, so there is nothing to unpack.
+        unpackPhase = "true";
+        buildInputs = [ libarchive glib ];
+        nativeBuildInputs = [ pkgconfig ];
+        # Compile $src directly; pkg-config supplies the compiler and
+        # linker flags for libarchive and glib-2.0.
+        # NOTE(review): the libraries are listed before the source file on
+        # the command line; confirm this link order works with the
+        # toolchain in use.
+        buildPhase =
+          "cc -O3 $(pkg-config --cflags --libs libarchive glib-2.0) $src -o partition-all-cabal-hashes";
+        installPhase =
+          ''
+            mkdir -p $out/bin
+            install -m755 partition-all-cabal-hashes $out/bin
+          '';
+      };
+# Fetch the revision described by src-spec, then have the helper turn
+# $downloadedFile into the final output under $out.
+in fetchFromGitHub (src-spec //
+  { postFetch = "${partition-all-cabal-hashes}/bin/partition-all-cabal-hashes $downloadedFile $out";
+  })
diff --git a/pkgs/data/misc/hackage/partition-all-cabal-hashes.c b/pkgs/data/misc/hackage/partition-all-cabal-hashes.c
new file mode 100644
index 000000000000..d8c3eca37019
--- /dev/null
+++ b/pkgs/data/misc/hackage/partition-all-cabal-hashes.c
@@ -0,0 +1,190 @@
+#include <stdio.h>
+#include <ctype.h>
+#include <glib.h>
+#include <string.h>
+#include <locale.h>
+#include <archive.h>
+#include <archive_entry.h>
+
+/* Lower-case STR in place, one byte at a time, using tolower() under
+ * the current LC_CTYPE locale, and return STR itself so the call can
+ * be nested inside another expression.
+ *
+ * Each byte is handed to tolower() as an unsigned char: passing a
+ * negative plain char (any byte >= 0x80 on platforms where char is
+ * signed) is undefined behaviour.
+ */
+static char * case_normalize(char * str) {
+  for (char * iter = str; *iter; ++iter) {
+    *iter = (char) tolower((unsigned char) *iter);
+  }
+  return str;
+}
+
+/* Ordering callback passed to g_sequence_search() in main(): orders
+ * two NUL-terminated strings exactly as strcmp() does. The third
+ * argument is accepted only to fit the callback signature and is
+ * never read.
+ */
+static gint compare_str(const void * a, const void * b, void * _) {
+  const char * lhs = a;
+  const char * rhs = b;
+  (void) _;
+  return strcmp(lhs, rhs);
+}
+
+/* Usage: partition-all-cabal-hashes TARBALL OUTPUT
+ *
+ * Reads TARBALL (tar format, gzip filter enabled) and extracts every
+ * entry of the form <root>/<package>/<rest> to
+ * OUTPUT/<N>/<package>/<rest>. N starts at 1 and is the position of
+ * <package> among the package names seen so far that are equal after
+ * case_normalize(), so no single OUTPUT/<N> directory ever holds two
+ * package names that differ only in case. Entries without a
+ * <package>/ component are skipped.
+ *
+ * Returns 0 on success. On any failure a diagnostic is written to
+ * stderr and 1 is returned.
+ */
+int main(int argc, char ** argv) {
+  if (argc != 3) {
+    fprintf(stderr, "Usage: %s TARBALL OUTPUT\n", argv[0]);
+    return 1;
+  }
+
+  /* Switch to standard locale to ensure consistency in case-folding.
+   */
+  setlocale(LC_CTYPE, "C");
+
+  /* Map from case-normalized package name to a sorted sequence of
+   * package names in the equivalence class defined by
+   * case-normalization.
+   */
+  GHashTable * equivalence_classes =
+    g_hash_table_new(g_str_hash, g_str_equal);
+
+  /* Open up the tarball.
+   */
+  struct archive * ar = archive_read_new();
+  if (!ar) {
+    perror("Allocating archive structure");
+    return 1;
+  }
+  archive_read_support_filter_gzip(ar);
+  archive_read_support_format_tar(ar);
+  if (archive_read_open_filename( ar
+                                , argv[1]
+                                , 10240
+                                ) == ARCHIVE_FATAL) {
+    /* Report the tarball we failed to open, not our own name. */
+    fprintf( stderr
+           , "Error opening %s: %s\n"
+           , argv[1]
+           , archive_error_string(ar)
+           );
+    return 1;
+  }
+
+  /* Extract the length of the output directory that prefixes all
+   * tarball entries from the first entry in the tarball.
+   */
+  struct archive_entry * ent;
+  int err = archive_read_next_header(ar, &ent);
+  if (err != ARCHIVE_OK) {
+    if (err == ARCHIVE_EOF) {
+      fprintf( stderr
+             , "No entries in %s, surely this is an error!\n"
+             , argv[1]
+             );
+    } else {
+      fprintf( stderr
+             , "Error reading entry from %s: %s\n"
+             , argv[1]
+             , archive_error_string(ar)
+             );
+    }
+    return 1;
+  }
+  const char * path = archive_entry_pathname(ent);
+  const char * prefix_end = path ? strchr(path, '/') : NULL;
+  if (!prefix_end) {
+    /* Without a slash there is no leading directory to measure, and
+     * subtracting pointers from NULL would be undefined behaviour.
+     */
+    fprintf( stderr
+           , "First entry of %s has no leading directory\n"
+           , argv[1]
+           );
+    return 1;
+  }
+  /* Number of characters from the start of the path name until after
+   * the slash after the leading directory.
+   */
+  size_t prefix_len = (size_t) (prefix_end - path) + 1;
+
+  /* Extract each entry to the right partition.
+   */
+  do {
+    const char * full_path = archive_entry_pathname(ent);
+    if (!full_path || strlen(full_path) < prefix_len)
+      /* Too short to contain the leading directory, so it cannot name
+       * a package file; skipping also keeps the pointer arithmetic
+       * below inside the string.
+       */
+      continue;
+    path = full_path + prefix_len;
+    const char * pkg_end = strchr(path, '/');
+    if (!pkg_end)
+      /* If there is no second slash, then this is either just the entry
+       * corresponding to the root or some non-package file (e.g.
+       * travis.yml). In either case, we don't care.
+       */
+      continue;
+
+    /* Find our package in the equivalence class map.
+     */
+    char * pkg_name = g_strndup(path, pkg_end - path);
+    char * pkg_normalized =
+      case_normalize(g_strndup(path, pkg_end - path));
+    GSequence * pkg_class =
+      g_hash_table_lookup(equivalence_classes, pkg_normalized);
+    gint partition_num;
+    if (!pkg_class) {
+      /* We haven't seen any packages with this normalized name yet,
+       * so we need to initialize the sequence and add it to the map.
+       * The map keeps pkg_normalized as its key and the sequence keeps
+       * pkg_name, so neither is freed here.
+       */
+      pkg_class = g_sequence_new(NULL);
+      g_sequence_append(pkg_class, pkg_name);
+      g_hash_table_insert( equivalence_classes
+                         , pkg_normalized
+                         , pkg_class
+                         );
+      partition_num = 1;
+    } else {
+      g_free(pkg_normalized);
+      /* Find the package name in the equivalence class */
+      GSequenceIter * pkg_iter =
+        g_sequence_search( pkg_class
+                         , pkg_name
+                         , compare_str
+                         , NULL
+                         );
+      if (!g_sequence_iter_is_end(pkg_iter)) {
+        /* If there are any packages after this one in the list, bail
+         * out. In principle we could solve this by moving them up to
+         * the next partition, but so far I've never seen any github
+         * tarballs out of order so let's save ourselves the work
+         * until we know we need it.
+         */
+        fprintf( stderr
+               , "Out of order github tarball: %s is after %s\n"
+               , pkg_name
+               , (char *) g_sequence_get(pkg_iter)
+               );
+        return 1;
+      }
+      pkg_iter = g_sequence_iter_prev(pkg_iter);
+      if (strcmp( g_sequence_get(pkg_iter)
+                , pkg_name
+                ) != 0) {
+        /* This package doesn't have the same name as the one right
+         * before where it should be in the sequence, which means it's
+         * new and needs to be added to the sequence.
+         *
+         * !!! We need to change this to use g_sequence_insert_before
+         * if we ever get an out-of-order github tarball, see comment
+         * after the check for !g_sequence_iter_is_end(pkg_iter).
+         */
+        pkg_iter = g_sequence_append(pkg_class, pkg_name);
+      } else {
+        g_free(pkg_name);
+      }
+      /* Get the partition number, starting with 1.
+       */
+      partition_num = g_sequence_iter_get_position(pkg_iter) + 1;
+    }
+
+    /* Set the destination path: OUTPUT/<partition>/<package>/<rest>.
+     * g_strdup_printf sizes the buffer itself, so partition numbers
+     * with more than one digit are safe.
+     */
+    char * dest_path =
+      g_strdup_printf("%s/%d/%s", argv[2], partition_num, path);
+    /* The entry stores its own copy of the string, so dest_path can
+     * be released once we are done reporting with it.
+     */
+    archive_entry_set_pathname(ent, dest_path);
+
+    if (archive_read_extract(ar, ent, 0) != ARCHIVE_OK) {
+      fprintf( stderr
+             , "Error extracting entry %s from %s: %s\n"
+             , dest_path
+             , argv[1]
+             , archive_error_string(ar)
+             );
+      return 1;
+    }
+    g_free(dest_path);
+  } while ((err = archive_read_next_header(ar, &ent)) == ARCHIVE_OK);
+  if (err != ARCHIVE_EOF) {
+    fprintf( stderr
+           , "Error reading entry from %s: %s\n"
+           , argv[1]
+           , archive_error_string(ar)
+           );
+    return 1;
+  }
+
+  return 0;
+}