tesseract: Reintroduce enableLanguages

I've removed that attribute in 68bc260ca2d71a676dd6afdb3524d4fff483016b, because the language files were no longer distributed as separate files, but if we for example only want to use the English training data, the closure size of Tesseract gets quite large (around 1.2 GB), which is a bit much just to be able to run NixOS VM tests.

For this reason I've also switched the VM tests back to using only the English language.

Tested using the following VM tests (the ones that have OCR enabled) on x86_64-linux:

  * nixos/tests/chromium.nix -A stable
  * nixos/tests/emacs-daemon.nix
  * nixos/tests/installer.nix -A luksroot
  * nixos/tests/lightdm.nix
  * nixos/tests/plasma5.nix
  * nixos/tests/sddm.nix

Signed-off-by: aszlig <aszlig@redmoonstudios.org>
author: aszlig <aszlig@redmoonstudios.org> 2017-04-07 23:05:04 +0200
committer: aszlig <aszlig@redmoonstudios.org> 2017-04-11 03:21:26 +0200
commit: 288a79187cdb6e92c8354b3e06cebbdc8ad65d4a (patch)
tree: 3a1f4ef6f06c7bf887e157fa1a041df678f3b4e6
parent: c8c340b05ac2ebef9e963c89b168d5053ae57dec (diff)
download: nixlib-288a79187cdb6e92c8354b3e06cebbdc8ad65d4a.tar
nixlib-288a79187cdb6e92c8354b3e06cebbdc8ad65d4a.tar.gz
nixlib-288a79187cdb6e92c8354b3e06cebbdc8ad65d4a.tar.bz2
nixlib-288a79187cdb6e92c8354b3e06cebbdc8ad65d4a.tar.lz
nixlib-288a79187cdb6e92c8354b3e06cebbdc8ad65d4a.tar.xz
nixlib-288a79187cdb6e92c8354b3e06cebbdc8ad65d4a.tar.zst
nixlib-288a79187cdb6e92c8354b3e06cebbdc8ad65d4a.zip
2 files changed, 28 insertions, 2 deletions
diff --git a/nixos/lib/testing.nix b/nixos/lib/testing.nix
index 8539fef0a19d..d8b5df004df8 100644
--- a/nixos/lib/testing.nix
+++ b/nixos/lib/testing.nix
@@ -93,7 +93,7 @@ rec {
 
       vms = map (m: m.config.system.build.vm) (lib.attrValues nodes);
 
-      ocrProg = tesseract;
+      ocrProg = tesseract.override { enableLanguages = [ "eng" ]; };
 
       # Generate onvenience wrappers for running the test driver
       # interactively with the specified network, and for starting the
diff --git a/pkgs/applications/graphics/tesseract/default.nix b/pkgs/applications/graphics/tesseract/default.nix
index 1f1da9a389f2..b5e1707c4fe4 100644
--- a/pkgs/applications/graphics/tesseract/default.nix
+++ b/pkgs/applications/graphics/tesseract/default.nix
@@ -1,5 +1,8 @@
 { stdenv, fetchFromGitHub, pkgconfig, leptonica, libpng, libtiff
 , icu, pango, opencl-headers
+
+# Supported list of languages or `null' for all available languages
+, enableLanguages ? null
 }:
 
 stdenv.mkDerivation rec {
@@ -25,7 +28,30 @@ stdenv.mkDerivation rec {
 
   LIBLEPT_HEADERSDIR = "${leptonica}/include";
 
-  postInstall = "cp -Rt \"$out/share/tessdata\" \"$tessdata/\"*";
+  # Copy the .traineddata files of the languages specified in enableLanguages
+  # into `$out/share/tessdata' and check afterwards if copying was successful.
+  postInstall = let
+    mkArg = lang: "-iname ${stdenv.lib.escapeShellArg "${lang}.traineddata"}";
+    mkFindArgs = stdenv.lib.concatMapStringsSep " -o " mkArg;
+    findLangArgs = if enableLanguages != null
+                   then "\\( ${mkFindArgs enableLanguages} \\)"
+                   else "-iname '*.traineddata'";
+  in ''
+    numLangs="$(find "$tessdata" -mindepth 1 -maxdepth 1 -type f \
+      ${findLangArgs} -exec cp -t "$out/share/tessdata" {} + -print | wc -l)"
+
+    ${if enableLanguages != null then ''
+      expected=${toString (builtins.length enableLanguages)}
+    '' else ''
+      expected="$(ls -1 "$tessdata/"*.traineddata | wc -l)"
+    ''}
+
+    if [ "$numLangs" -ne "$expected" ]; then
+      echo "Expected $expected languages, but $numLangs" \
+           "were copied to \`$out/share/tessdata'" >&2
+      exit 1
+    fi
+  '';
 
   meta = {
     description = "OCR engine";
author	aszlig <aszlig@redmoonstudios.org>	2017-04-07 23:05:04 +0200
committer	aszlig <aszlig@redmoonstudios.org>	2017-04-11 03:21:26 +0200
commit	288a79187cdb6e92c8354b3e06cebbdc8ad65d4a (patch)
tree	3a1f4ef6f06c7bf887e157fa1a041df678f3b4e6
parent	c8c340b05ac2ebef9e963c89b168d5053ae57dec (diff)
download	nixlib-288a79187cdb6e92c8354b3e06cebbdc8ad65d4a.tar nixlib-288a79187cdb6e92c8354b3e06cebbdc8ad65d4a.tar.gz nixlib-288a79187cdb6e92c8354b3e06cebbdc8ad65d4a.tar.bz2 nixlib-288a79187cdb6e92c8354b3e06cebbdc8ad65d4a.tar.lz nixlib-288a79187cdb6e92c8354b3e06cebbdc8ad65d4a.tar.xz nixlib-288a79187cdb6e92c8354b3e06cebbdc8ad65d4a.tar.zst nixlib-288a79187cdb6e92c8354b3e06cebbdc8ad65d4a.zip