about summary refs log tree commit diff
path: root/pkgs/applications/graphics
diff options
context:
space:
mode:
author: Erik Arvstedt <erik.arvstedt@gmail.com> 2018-12-18 19:02:13 +0100
committer: Erik Arvstedt <erik.arvstedt@gmail.com> 2018-12-19 18:08:16 +0100
commit: aaaed1307765910c66a46361023e357dd3ad6021 (patch)
tree: 30cf794e1f822efde608c15e3e2c969951619d8d /pkgs/applications/graphics
parent: 45d2a2dd910b9595929e740525f4fa28c5cdc69e (diff)
download: nixlib-aaaed1307765910c66a46361023e357dd3ad6021.tar
nixlib-aaaed1307765910c66a46361023e357dd3ad6021.tar.gz
nixlib-aaaed1307765910c66a46361023e357dd3ad6021.tar.bz2
nixlib-aaaed1307765910c66a46361023e357dd3ad6021.tar.lz
nixlib-aaaed1307765910c66a46361023e357dd3ad6021.tar.xz
nixlib-aaaed1307765910c66a46361023e357dd3ad6021.tar.zst
nixlib-aaaed1307765910c66a46361023e357dd3ad6021.zip
tesseract: add a wrapper to setup languages
Tesseract is now decoupled from the tessdata language corpus.

This avoids recompilation when building Tesseract with a custom set
of languages.

Update k2pdfopt to use the new wrapper interface.
Diffstat (limited to 'pkgs/applications/graphics')
-rw-r--r-- pkgs/applications/graphics/tesseract/default.nix 15
-rw-r--r-- pkgs/applications/graphics/tesseract/languages.nix 43
-rw-r--r-- pkgs/applications/graphics/tesseract/tesseract3.nix 44
-rw-r--r-- pkgs/applications/graphics/tesseract/tesseract4.nix 42
-rw-r--r-- pkgs/applications/graphics/tesseract/wrapper.nix 69
5 files changed, 130 insertions, 83 deletions
diff --git a/pkgs/applications/graphics/tesseract/default.nix b/pkgs/applications/graphics/tesseract/default.nix
index 6da6931eea38..969402bf7c59 100644
--- a/pkgs/applications/graphics/tesseract/default.nix
+++ b/pkgs/applications/graphics/tesseract/default.nix
@@ -1,11 +1,18 @@
 { callPackage, lowPrio }:
 
 let
-  tesseract3 = callPackage ./tesseract3.nix {};
-  tesseract4 = callPackage ./tesseract4.nix {};
+  base3 = callPackage ./tesseract3.nix {};
+  base4 = callPackage ./tesseract4.nix {};
+  languages = callPackage ./languages.nix {};
 in
 {
-  tesseract = tesseract3;
+  tesseract = callPackage ./wrapper.nix {
+    tesseractBase = base3;
+    languages = languages.v3;
+  };
 
-  tesseract_4 = lowPrio tesseract4;
+  tesseract_4 = lowPrio (callPackage ./wrapper.nix {
+    tesseractBase = base4;
+    languages = languages.v4;
+  });
 }
diff --git a/pkgs/applications/graphics/tesseract/languages.nix b/pkgs/applications/graphics/tesseract/languages.nix
new file mode 100644
index 000000000000..d06793ac4171
--- /dev/null
+++ b/pkgs/applications/graphics/tesseract/languages.nix
@@ -0,0 +1,43 @@
+{ stdenv, lib, fetchurl, fetchFromGitHub }:
+
+rec {
+  makeLanguages = { tessdataRev, tessdata ? null, all ? null, languages ? {} }:
+    let
+      tessdataSrc = fetchFromGitHub {
+        owner = "tesseract-ocr";
+        repo = "tessdata";
+        rev = tessdataRev;
+        sha256 = tessdata;
+      };
+
+      languageFile = lang: sha256: fetchurl {
+        url = "https://github.com/tesseract-ocr/tessdata/raw/${tessdataRev}/${lang}.traineddata";
+        inherit sha256;
+      };
+    in
+      {
+        all = stdenv.mkDerivation {
+          name = "all";
+          buildCommand = ''
+            mkdir $out
+            cd ${tessdataSrc}
+            cp *.traineddata $out
+          '';
+          outputHashMode = "recursive";
+          outputHashAlgo = "sha256";
+          outputHash = all;
+        };
+      };
+
+  v3 = makeLanguages {
+    tessdataRev = "3cf1e2df1fe1d1da29295c9ef0983796c7958b7d";
+    tessdata = "1v4b63v5nzcxr2y3635r19l7lj5smjmc9vfk0wmxlryxncb4vpg7";
+    all = "0yj6h9n6h0kzzcqsn3z87vsi8pa60szp0yiayb0znd0v9my0dqhn";
+  };
+
+  v4 = makeLanguages {
+    tessdataRev = "4.0.0";
+    tessdata = "1chw1ya5zf8aaj2ixr9x013x7vwwwjjmx6f2ag0d6i14lypygy28";
+    all = "0dqgkp369rcvq72yhgnzj1pj8yrv7kqzc7y6sqs7nzcq7l5qazlg";
+  };
+}
diff --git a/pkgs/applications/graphics/tesseract/tesseract3.nix b/pkgs/applications/graphics/tesseract/tesseract3.nix
index 7940079d0994..db0e06434aa9 100644
--- a/pkgs/applications/graphics/tesseract/tesseract3.nix
+++ b/pkgs/applications/graphics/tesseract/tesseract3.nix
@@ -1,37 +1,5 @@
-{ stdenv, fetchFromGitHub, autoreconfHook, pkgconfig
-, leptonica, libpng, libtiff, icu, pango, opencl-headers
-# Supported list of languages or `null' for all available languages
-, enableLanguages ? null
-# if you want just a specific list of languages, optionally specify a hash
-# to make tessdata a fixed output derivation.
-, enableLanguagesHash ? (if enableLanguages == null # all languages
-                         then "1h48xfzabhn0ldbx5ib67cp9607pr0zpblsy8z6fs4knn0zznfnw"
-                         else null)
-}:
-
-let tessdata = stdenv.mkDerivation ({
-  name = "tessdata";
-  src = fetchFromGitHub {
-    owner = "tesseract-ocr";
-    repo = "tessdata";
-    rev = "3cf1e2df1fe1d1da29295c9ef0983796c7958b7d";
-    # when updating don't forget to update the default value fo enableLanguagesHash
-    sha256 = "1v4b63v5nzcxr2y3635r19l7lj5smjmc9vfk0wmxlryxncb4vpg7";
-  };
-  buildCommand = ''
-    cd $src;
-    for lang in ${if enableLanguages==null then "*.traineddata" else stdenv.lib.concatMapStringsSep " " (x: x+".traineddata") enableLanguages} ; do
-      install -Dt $out/share/tessdata $src/$lang ;
-    done;
-  '';
-  preferLocalBuild = true;
-  } // (stdenv.lib.optionalAttrs (enableLanguagesHash != null) {
-  # when a hash is given, we make this a fixed output derivation.
-  outputHashMode = "recursive";
-  outputHashAlgo = "sha256";
-  outputHash = enableLanguagesHash;
-  }));
-in
+{ stdenv, fetchurl, fetchFromGitHub, autoreconfHook, pkgconfig
+, leptonica, libpng, libtiff, icu, pango, opencl-headers }:
 
 stdenv.mkDerivation rec {
   name = "tesseract-${version}";
@@ -51,17 +19,11 @@ stdenv.mkDerivation rec {
 
   LIBLEPT_HEADERSDIR = "${leptonica}/include";
 
-  postInstall = ''
-    for i in ${tessdata}/share/tessdata/*; do
-      ln -s $i $out/share/tessdata;
-    done
-  '';
-
   meta = {
     description = "OCR engine";
     homepage = https://github.com/tesseract-ocr/tesseract;
     license = stdenv.lib.licenses.asl20;
-    maintainers = with stdenv.lib.maintainers; [viric];
+    maintainers = with stdenv.lib.maintainers; [ viric earvstedt ];
     platforms = with stdenv.lib.platforms; linux ++ darwin;
   };
 }
diff --git a/pkgs/applications/graphics/tesseract/tesseract4.nix b/pkgs/applications/graphics/tesseract/tesseract4.nix
index 2ebca09b831f..df321023c741 100644
--- a/pkgs/applications/graphics/tesseract/tesseract4.nix
+++ b/pkgs/applications/graphics/tesseract/tesseract4.nix
@@ -1,9 +1,5 @@
 { stdenv, fetchFromGitHub, autoreconfHook, autoconf-archive, pkgconfig
-, leptonica, libpng, libtiff, icu, pango, opencl-headers
-
-# Supported list of languages or `null' for all available languages
-, enableLanguages ? null
-}:
+, leptonica, libpng, libtiff, icu, pango, opencl-headers }:
 
 stdenv.mkDerivation rec {
   name = "tesseract-${version}";
@@ -16,46 +12,16 @@ stdenv.mkDerivation rec {
     sha256 = "1b5fi2vibc4kk9b30kkk4ais4bw8fbbv24bzr5709194hb81cav8";
   };
 
-  tessdata = fetchFromGitHub {
-    owner = "tesseract-ocr";
-    repo = "tessdata";
-    rev = version;
-    sha256 = "1chw1ya5zf8aaj2ixr9x013x7vwwwjjmx6f2ag0d6i14lypygy28";
-  };
+  enableParallelBuilding = true;
 
   nativeBuildInputs = [ pkgconfig autoreconfHook autoconf-archive ];
   buildInputs = [ leptonica libpng libtiff icu pango opencl-headers ];
 
-  # Copy the .traineddata files of the languages specified in enableLanguages
-  # into `$out/share/tessdata' and check afterwards if copying was successful.
-  postInstall = let
-    mkArg = lang: "-iname ${stdenv.lib.escapeShellArg "${lang}.traineddata"}";
-    mkFindArgs = stdenv.lib.concatMapStringsSep " -o " mkArg;
-    findLangArgs = if enableLanguages != null
-                   then "\\( ${mkFindArgs enableLanguages} \\)"
-                   else "-iname '*.traineddata'";
-  in ''
-    numLangs="$(find "$tessdata" -mindepth 1 -maxdepth 1 -type f \
-      ${findLangArgs} -exec cp -t "$out/share/tessdata" {} + -print | wc -l)"
-
-    ${if enableLanguages != null then ''
-      expected=${toString (builtins.length enableLanguages)}
-    '' else ''
-      expected="$(ls -1 "$tessdata/"*.traineddata | wc -l)"
-    ''}
-
-    if [ "$numLangs" -ne "$expected" ]; then
-      echo "Expected $expected languages, but $numLangs" \
-           "were copied to \`$out/share/tessdata'" >&2
-      exit 1
-    fi
-  '';
-
   meta = {
     description = "OCR engine";
     homepage = https://github.com/tesseract-ocr/tesseract;
     license = stdenv.lib.licenses.asl20;
-    maintainers = with stdenv.lib.maintainers; [viric];
-    platforms = with stdenv.lib.platforms; linux;
+    maintainers = with stdenv.lib.maintainers; [ viric earvstedt ];
+    platforms = with stdenv.lib.platforms; linux ++ darwin;
   };
 }
diff --git a/pkgs/applications/graphics/tesseract/wrapper.nix b/pkgs/applications/graphics/tesseract/wrapper.nix
new file mode 100644
index 000000000000..6645cd9bfe94
--- /dev/null
+++ b/pkgs/applications/graphics/tesseract/wrapper.nix
@@ -0,0 +1,69 @@
+{ stdenv, makeWrapper, tesseractBase, languages
+
+# A list of languages like [ "eng" "spa" … ] or `null` for all available languages
+, enableLanguages ? null
+
+# A list of files or a directory containing files
+, tessdata ? (
+    if enableLanguages == null then
+       languages.all
+    else stdenv.mkDerivation ({
+      name = "tessdata";
+      buildCommand = ''
+        for lang in ${stdenv.lib.concatMapStringsSep " " (x: x + ".traineddata") enableLanguages}; do
+          install -Dt $out ${languages.all}/$lang
+        done
+      '';
+      preferLocalBuild = true;
+      } // (stdenv.lib.optionalAttrs (enableLanguagesHash != null) {
+        # when a hash is given, we make this a fixed output derivation.
+        outputHashMode = "recursive";
+        outputHashAlgo = "sha256";
+        outputHash = enableLanguagesHash;
+      }))
+  )
+
+, enableLanguagesHash ? null
+}:
+
+let
+  passthru = { inherit tesseractBase languages tessdata; };
+
+  tesseractWithData = tesseractBase.overrideAttrs (_: {
+    inherit tesseractBase tessdata;
+
+    buildInputs = [ makeWrapper ];
+
+    buildCommand = ''
+      makeWrapper {$tesseractBase,$out}/bin/tesseract --set-default TESSDATA_PREFIX $out/share/tessdata
+
+      # Recursively link include, share
+      cp -rs --no-preserve=mode $tesseractBase/{include,share} $out
+
+      cp -r --no-preserve=mode $tesseractBase/lib $out
+      # Fixup the store paths in lib so that the tessdata from this derivation is used.
+      if (( ''${#tesseractBase} != ''${#out} )); then
+        echo "Can't replace store paths due to differing lengths"
+        exit 1
+      fi
+      find $out/lib -type f -exec sed -i "s|$tesseractBase|$out|g" {} \;
+
+      if [[ -d "$tessdata" ]]; then
+        ln -s $tessdata/* $out/share/tessdata
+      else
+        for lang in $tessdata; do
+          ln -s $lang $out/share/tessdata/''${lang#/nix/store*-}
+        done
+      fi
+
+      if [[ ! -e $out/share/tessdata/eng.traineddata ]]; then
+         # This is a bug in Tesseract's internal tessdata discovery mechanism
+         echo "eng.traineddata must be present in tessdata for Tesseract to work"
+         exit 1
+      fi
+    '';
+  });
+
+  tesseract = (if enableLanguages == [] then tesseractBase else tesseractWithData) // passthru;
+in
+  tesseract