summary refs log tree commit diff
diff options
context:
space:
mode:
authoraszlig <aszlig@redmoonstudios.org>2017-04-11 02:30:45 +0200
committeraszlig <aszlig@redmoonstudios.org>2017-04-11 03:21:46 +0200
commit7b5263e1a63a69b7dd5f05dd54d9355bc6648fe1 (patch)
tree7dd9b3c3c4ef52934073c87e08123a493605b81d
parent49cf9346422aecb6fc174af548d562803b4dd791 (diff)
downloadnixlib-7b5263e1a63a69b7dd5f05dd54d9355bc6648fe1.tar
nixlib-7b5263e1a63a69b7dd5f05dd54d9355bc6648fe1.tar.gz
nixlib-7b5263e1a63a69b7dd5f05dd54d9355bc6648fe1.tar.bz2
nixlib-7b5263e1a63a69b7dd5f05dd54d9355bc6648fe1.tar.lz
nixlib-7b5263e1a63a69b7dd5f05dd54d9355bc6648fe1.tar.xz
nixlib-7b5263e1a63a69b7dd5f05dd54d9355bc6648fe1.tar.zst
nixlib-7b5263e1a63a69b7dd5f05dd54d9355bc6648fe1.zip
tesseract: Package version 4.x from Git master
Tesseract 4 has got a new long short-term memory (LSTM) neural network
based OCR engine which really helps a lot in terms of accuracy and our VM
tests.

I ran the new version across a bunch of different screenshots and
compared the results to the 3.x branch, and it really makes a big
difference, especially with various font rendering settings.

The only downside of this is that version 4 hasn't been released yet and
is in alpha state right now, but it will eventually get there, and the
only solutions that came to my mind for sticking to version 3 were really
sub-par:

 * Use several passes with different color negation on the screenshots.
 * Train Tesseract 3 specifically for screenshots. This is sub-par
   because we'd need to do it for Tesseract 4 from scratch again.
 * Change the test systems so that they specifically use *only* an OCR
   font when displaying. I've actually tried this but this also isn't
   accurate enough with our default font rendering setup.
 * Turn off special font rendering settings for our tests. In
   conjunction with changing to an OCR font this might work but it won't
   catch all the cases, because applications might use their own font
   rendering.

Given that version 4 is faster[1] when it comes to OCR detection, and also
given the points just mentioned, I think even using the alpha version just
for tests isn't going to hurt anybody.

[1]: https://github.com/tesseract-ocr/tesseract/wiki/4.0-Accuracy-and-Performance

Signed-off-by: aszlig <aszlig@redmoonstudios.org>
-rw-r--r--pkgs/applications/graphics/tesseract/4.x.nix61
-rw-r--r--pkgs/top-level/all-packages.nix1
2 files changed, 62 insertions, 0 deletions
diff --git a/pkgs/applications/graphics/tesseract/4.x.nix b/pkgs/applications/graphics/tesseract/4.x.nix
new file mode 100644
index 000000000000..577bff91e353
--- /dev/null
+++ b/pkgs/applications/graphics/tesseract/4.x.nix
@@ -0,0 +1,61 @@
+{ stdenv, fetchFromGitHub, autoreconfHook, autoconf-archive, pkgconfig
+, leptonica, libpng, libtiff, icu, pango, opencl-headers
+
+# List of language names whose `<lang>.traineddata' gets installed, or `null' for all available languages
+, enableLanguages ? null
+}:
+
+stdenv.mkDerivation rec {
+  name = "tesseract-${version}";
+  version = "4.00.00alpha-git-20170410";
+
+  src = fetchFromGitHub {
+    owner = "tesseract-ocr";
+    repo = "tesseract";
+    rev = "36a995bdc92eb2dd8bc5a63205708944a3f990a1";
+    sha256 = "0xz3krvap8sdm27v1dyb34lcdmx11wzvxyszpppfsfmjgkvg19bq";
+  };
+
+  tessdata = fetchFromGitHub { # language data; read in postInstall as "$tessdata"
+    owner = "tesseract-ocr";
+    repo = "tessdata";
+    rev = "8bf2e7ad08db9ca174ae2b0b3a7498c9f1f71d40";
+    sha256 = "0idwkv4qsmmqhrxcgyhy32yldl3vk054m7dkv4fjswfnalgsx794";
+  };
+
+  nativeBuildInputs = [ pkgconfig autoreconfHook autoconf-archive ];
+  buildInputs = [ leptonica libpng libtiff icu pango opencl-headers ];
+
+  # Copy the .traineddata files of the languages listed in enableLanguages (or all
+  # of them if it is null) into `$out/share/tessdata'; fail on a count mismatch.
+  postInstall = let
+    mkArg = lang: "-iname ${stdenv.lib.escapeShellArg "${lang}.traineddata"}"; # one case-insensitive find test per language
+    mkFindArgs = stdenv.lib.concatMapStringsSep " -o " mkArg; # join the per-language tests with find's `-o' (OR)
+    findLangArgs = if enableLanguages != null
+                   then "\\( ${mkFindArgs enableLanguages} \\)"
+                   else "-iname '*.traineddata'";
+  in ''
+    numLangs="$(find "$tessdata" -mindepth 1 -maxdepth 1 -type f \
+      ${findLangArgs} -exec cp -t "$out/share/tessdata" {} + -print | wc -l)"
+
+    ${if enableLanguages != null then ''
+      expected=${toString (builtins.length enableLanguages)}
+    '' else ''
+      expected="$(ls -1 "$tessdata/"*.traineddata | wc -l)"
+    ''}
+
+    if [ "$numLangs" -ne "$expected" ]; then
+      echo "Expected $expected languages, but $numLangs" \
+           "were copied to \`$out/share/tessdata'" >&2
+      exit 1
+    fi
+  '';
+
+  meta = {
+    description = "OCR engine";
+    homepage = http://code.google.com/p/tesseract-ocr/;
+    license = stdenv.lib.licenses.asl20;
+    maintainers = with stdenv.lib.maintainers; [viric];
+    platforms = with stdenv.lib.platforms; linux;
+  };
+}
diff --git a/pkgs/top-level/all-packages.nix b/pkgs/top-level/all-packages.nix
index 7062bedd9b14..29caabbda57f 100644
--- a/pkgs/top-level/all-packages.nix
+++ b/pkgs/top-level/all-packages.nix
@@ -15687,6 +15687,7 @@ with pkgs;
   };
 
   tesseract = callPackage ../applications/graphics/tesseract { };
+  tesseract_4 = lowPrio (callPackage ../applications/graphics/tesseract/4.x.nix { });
 
   tetraproc = callPackage ../applications/audio/tetraproc { };