summary refs log tree commit diff
diff options
context:
space:
mode:
-rw-r--r--pkgs/applications/graphics/tesseract/4.x.nix61
-rw-r--r--pkgs/top-level/all-packages.nix1
2 files changed, 62 insertions, 0 deletions
diff --git a/pkgs/applications/graphics/tesseract/4.x.nix b/pkgs/applications/graphics/tesseract/4.x.nix
new file mode 100644
index 000000000000..577bff91e353
--- /dev/null
+++ b/pkgs/applications/graphics/tesseract/4.x.nix
@@ -0,0 +1,61 @@
+{ stdenv, fetchFromGitHub, autoreconfHook, autoconf-archive, pkgconfig
+, leptonica, libpng, libtiff, icu, pango, opencl-headers
+
+# Supported list of languages or `null' for all available languages
+, enableLanguages ? null
+}:
+
+stdenv.mkDerivation rec {
+  name = "tesseract-${version}";
+  version = "4.00.00alpha-git-20170410";
+
+  src = fetchFromGitHub {
+    owner = "tesseract-ocr";
+    repo = "tesseract";
+    rev = "36a995bdc92eb2dd8bc5a63205708944a3f990a1";
+    sha256 = "0xz3krvap8sdm27v1dyb34lcdmx11wzvxyszpppfsfmjgkvg19bq";
+  };
+
+  tessdata = fetchFromGitHub {
+    owner = "tesseract-ocr";
+    repo = "tessdata";
+    rev = "8bf2e7ad08db9ca174ae2b0b3a7498c9f1f71d40";
+    sha256 = "0idwkv4qsmmqhrxcgyhy32yldl3vk054m7dkv4fjswfnalgsx794";
+  };
+
+  nativeBuildInputs = [ pkgconfig autoreconfHook autoconf-archive ];
+  buildInputs = [ leptonica libpng libtiff icu pango opencl-headers ];
+
+  # Copy the .traineddata files of the languages specified in enableLanguages
+  # into `$out/share/tessdata' and check afterwards if copying was successful.
+  postInstall = let
+    mkArg = lang: "-iname ${stdenv.lib.escapeShellArg "${lang}.traineddata"}";
+    mkFindArgs = stdenv.lib.concatMapStringsSep " -o " mkArg;
+    findLangArgs = if enableLanguages != null
+                   then "\\( ${mkFindArgs enableLanguages} \\)"
+                   else "-iname '*.traineddata'";
+  in ''
+    numLangs="$(find "$tessdata" -mindepth 1 -maxdepth 1 -type f \
+      ${findLangArgs} -exec cp -t "$out/share/tessdata" {} + -print | wc -l)"
+
+    ${if enableLanguages != null then ''
+      expected=${toString (builtins.length enableLanguages)}
+    '' else ''
+      expected="$(ls -1 "$tessdata/"*.traineddata | wc -l)"
+    ''}
+
+    if [ "$numLangs" -ne "$expected" ]; then
+      echo "Expected $expected languages, but $numLangs" \
+           "were copied to \`$out/share/tessdata'" >&2
+      exit 1
+    fi
+  '';
+
+  meta = {
+    description = "OCR engine";
+    homepage = http://code.google.com/p/tesseract-ocr/;
+    license = stdenv.lib.licenses.asl20;
+    maintainers = with stdenv.lib.maintainers; [viric];
+    platforms = with stdenv.lib.platforms; linux;
+  };
+}
diff --git a/pkgs/top-level/all-packages.nix b/pkgs/top-level/all-packages.nix
index 7062bedd9b14..29caabbda57f 100644
--- a/pkgs/top-level/all-packages.nix
+++ b/pkgs/top-level/all-packages.nix
@@ -15687,6 +15687,7 @@ with pkgs;
   };
 
   tesseract = callPackage ../applications/graphics/tesseract { };
+  tesseract_4 = lowPrio (callPackage ../applications/graphics/tesseract/4.x.nix { });
 
   tetraproc = callPackage ../applications/audio/tetraproc { };