commit cfc05af26b571e9ca09e9c709c0fb8934e9e46dd Author: Guillaume Girol Date: Sat Aug 20 17:48:01 2022 +0200 Fix finding tesseract and cuneiform diff --git a/src/pyocr/cuneiform.py b/src/pyocr/cuneiform.py index 2e5b717..35647e2 100644 --- a/src/pyocr/cuneiform.py +++ b/src/pyocr/cuneiform.py @@ -25,13 +25,9 @@ from . import builders from .error import CuneiformError -# CHANGE THIS IF CUNEIFORM IS NOT IN YOUR PATH, OR IS NAMED DIFFERENTLY -CUNEIFORM_CMD = 'cuneiform' +CUNEIFORM_CMD = '@cuneiform@/bin/cuneiform' -CUNEIFORM_DATA_POSSIBLE_PATHS = [ - "/usr/local/share/cuneiform", - "/usr/share/cuneiform", -] +CUNEIFORM_DATA_POSSIBLE_PATHS = ['@cuneiform@/share/cuneiform'] LANGUAGES_LINE_PREFIX = "Supported languages: " LANGUAGES_SPLIT_RE = re.compile("[^a-z]") diff --git a/src/pyocr/libtesseract/tesseract_raw.py b/src/pyocr/libtesseract/tesseract_raw.py index 1edec8c..434a336 100644 --- a/src/pyocr/libtesseract/tesseract_raw.py +++ b/src/pyocr/libtesseract/tesseract_raw.py @@ -2,7 +2,6 @@ import ctypes import locale import logging import os -import sys from ..error import TesseractError @@ -10,51 +9,16 @@ from ..error import TesseractError logger = logging.getLogger(__name__) TESSDATA_PREFIX = os.getenv('TESSDATA_PREFIX', None) -libnames = [] +if TESSDATA_PREFIX is None: + TESSDATA_PREFIX = '@tesseract@/share/tessdata' + os.environ['TESSDATA_PREFIX'] = TESSDATA_PREFIX + + # 70 is the minimum credible dpi for tesseract and force it to compute an # estimate of the image dpi DPI_DEFAULT = 70 - -if getattr(sys, 'frozen', False) and hasattr(sys, '_MEIPASS'): - # Pyinstaller integration - libnames += [os.path.join(sys._MEIPASS, "libtesseract-4.dll")] - libnames += [os.path.join(sys._MEIPASS, "libtesseract-3.dll")] - tessdata = os.path.join(sys._MEIPASS, "data") - if not os.path.exists(os.path.join(tessdata, "tessdata")): - logger.warning( - "Running from container, but no tessdata ({}) found !".format( - tessdata - ) - ) - else: - TESSDATA_PREFIX = os.path.join(tessdata, "tessdata") - - -if sys.platform[:3] == "win": # pragma: no cover - libnames += [ - # Jflesch> Don't they have the equivalent of LD_LIBRARY_PATH on - # Windows ? - "../vs2010/DLL_Release/libtesseract302.dll", - # prefer the most recent first - "libtesseract305.dll", - "libtesseract304.dll", - "libtesseract303.dll", - "libtesseract302.dll", - "libtesseract400.dll", # Tesseract 4 is still in alpha stage - "libtesseract.dll", - "C:\\Program Files (x86)\\Tesseract-OCR\\libtesseract-4.dll", - "C:\\Program Files (x86)\\Tesseract-OCR\\libtesseract-3.dll", - ] -else: - libnames += [ - "libtesseract.so.5", - "libtesseract.so.4", - "libtesseract.so.3", - "libtesseract.5.dylib", - "libtesseract.4.dylib", - ] - +libnames = [ "@tesseract@/lib/libtesseract.so" ] g_libtesseract = None @@ -367,12 +331,12 @@ def init(lang=None): try: if lang: lang = lang.encode("utf-8") - prefix = None - if TESSDATA_PREFIX: # pragma: no cover - prefix = TESSDATA_PREFIX.encode("utf-8") + + prefix = TESSDATA_PREFIX + g_libtesseract.TessBaseAPIInit3( ctypes.c_void_p(handle), - ctypes.c_char_p(prefix), + ctypes.c_char_p(prefix.encode('utf-8')), ctypes.c_char_p(lang) ) g_libtesseract.TessBaseAPISetVariable( diff --git a/src/pyocr/tesseract.py b/src/pyocr/tesseract.py index 0fe0d20..c1fdd27 100644 --- a/src/pyocr/tesseract.py +++ b/src/pyocr/tesseract.py @@ -28,8 +28,7 @@ from .builders import DigitBuilder # backward compatibility from .error import TesseractError # backward compatibility from .util import digits_only -# CHANGE THIS IF TESSERACT IS NOT IN YOUR PATH, OR IS NAMED DIFFERENTLY -TESSERACT_CMD = 'tesseract.exe' if os.name == 'nt' else 'tesseract' +TESSERACT_CMD = '@tesseract@/bin/tesseract' TESSDATA_EXTENSION = ".traineddata" diff --git a/tests/test_cuneiform.py b/tests/test_cuneiform.py index b76e93c..266f6b2 100644 --- a/tests/test_cuneiform.py +++ b/tests/test_cuneiform.py @@ -21,7 +21,7 @@ class TestCuneiform(BaseTest): # XXX is it useful? which.return_value = True self.assertTrue(cuneiform.is_available()) - which.assert_called_once_with("cuneiform") + which.assert_called_once_with("@cuneiform@/bin/cuneiform") @patch("subprocess.Popen") def test_version(self, popen): @@ -54,7 +54,7 @@ class TestCuneiform(BaseTest): self.assertIn("eng", langs) self.assertIn("fra", langs) popen.assert_called_once_with( - ["cuneiform", "-l"], + ["@cuneiform@/bin/cuneiform", "-l"], stdout=subprocess.PIPE, stderr=subprocess.STDOUT ) @@ -110,7 +110,7 @@ class TestCuneiformTxt(BaseTest): output = cuneiform.image_to_string(self.image) self.assertEqual(output, self._get_file_content("text").strip()) popen.assert_called_once_with( - ["cuneiform", "-f", "text", "-o", self.tmp_filename, "-"], + ["@cuneiform@/bin/cuneiform", "-f", "text", "-o", self.tmp_filename, "-"], stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.STDOUT ) @@ -126,7 +126,7 @@ class TestCuneiformTxt(BaseTest): builder=self.builder) self.assertEqual(output, self._get_file_content("text").strip()) popen.assert_called_once_with( - ["cuneiform", "-l", "fra", "-f", "text", "-o", self.tmp_filename, + ["@cuneiform@/bin/cuneiform", "-l", "fra", "-f", "text", "-o", self.tmp_filename, "-"], stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.STDOUT @@ -143,7 +143,7 @@ class TestCuneiformTxt(BaseTest): builder=self.builder) self.assertEqual(output, self._get_file_content("text").strip()) popen.assert_called_once_with( - ["cuneiform", "-f", "text", "-o", self.tmp_filename, "-"], + ["@cuneiform@/bin/cuneiform", "-f", "text", "-o", self.tmp_filename, "-"], stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.STDOUT ) @@ -174,7 +174,7 @@ class TestCuneiformTxt(BaseTest): output = cuneiform.image_to_string(image, builder=self.builder) self.assertEqual(output, self._get_file_content("text").strip()) popen.assert_called_once_with( - ["cuneiform", "-f", "text", "-o", self.tmp_filename, "-"], + ["@cuneiform@/bin/cuneiform", "-f", "text", "-o", self.tmp_filename, "-"], stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.STDOUT ) @@ -230,7 +230,7 @@ class TestCuneiformWordBox(BaseTest): output = cuneiform.image_to_string(self.image, builder=self.builder) popen.assert_called_once_with( - ["cuneiform", "-f", "hocr", "-o", self.tmp_filename, "-"], + ["@cuneiform@/bin/cuneiform", "-f", "hocr", "-o", self.tmp_filename, "-"], stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.STDOUT ) @@ -284,7 +284,7 @@ class TestCuneiformLineBox(BaseTest): output = cuneiform.image_to_string(self.image, builder=self.builder) popen.assert_called_once_with( - ["cuneiform", "-f", "hocr", "-o", self.tmp_filename, "-"], + ["@cuneiform@/bin/cuneiform", "-f", "hocr", "-o", self.tmp_filename, "-"], stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.STDOUT ) diff --git a/tests/test_libtesseract.py b/tests/test_libtesseract.py index cc31a50..890c02c 100644 --- a/tests/test_libtesseract.py +++ b/tests/test_libtesseract.py @@ -167,7 +167,8 @@ class TestLibTesseractRaw(BaseTest): args = libtess.TessBaseAPIInit3.call_args[0] self.assertEqual(len(args), 3) self.assertEqual(args[0].value, self.handle) - self.assertEqual(args[1].value, None) + # we hardcode tesseract data, so we don't get None + #self.assertEqual(args[1].value, None) self.assertEqual(args[2].value, lang.encode() if lang else None) self.assertEqual( @@ -203,7 +204,8 @@ class TestLibTesseractRaw(BaseTest): args = libtess.TessBaseAPIInit3.call_args[0] self.assertEqual(len(args), 3) self.assertEqual(args[0].value, self.handle) - self.assertEqual(args[1].value, None) + # we hardcode tesseract data, so we don't get None + #self.assertEqual(args[1].value, None) self.assertEqual(args[2].value, lang.encode() if lang else None) self.assertEqual( diff --git a/tests/test_tesseract.py b/tests/test_tesseract.py index 823818f..2ee5fb4 100644 --- a/tests/test_tesseract.py +++ b/tests/test_tesseract.py @@ -37,7 +37,7 @@ class TestTesseract(BaseTest): def test_available(self, which): which.return_value = True self.assertTrue(tesseract.is_available()) - which.assert_called_once_with("tesseract") + which.assert_called_once_with("@tesseract@/bin/tesseract") @patch("subprocess.Popen") def test_version_error(self, popen): @@ -163,7 +163,7 @@ class TestTesseract(BaseTest): for lang in ("eng", "fra", "jpn", "osd"): self.assertIn(lang, langs) popen.assert_called_once_with( - ["tesseract", "--list-langs"], + ["@tesseract@/bin/tesseract", "--list-langs"], startupinfo=None, creationflags=0, stdout=subprocess.PIPE, stderr=subprocess.STDOUT ) @@ -178,7 +178,7 @@ class TestTesseract(BaseTest): self.assertEqual(te.exception.status, 1) self.assertEqual("unable to get languages", te.exception.message) popen.assert_called_once_with( - ["tesseract", "--list-langs"], + ["@tesseract@/bin/tesseract", "--list-langs"], startupinfo=None, creationflags=0, stdout=subprocess.PIPE, stderr=subprocess.STDOUT ) @@ -255,7 +255,7 @@ class TestTesseract(BaseTest): self.assertEqual(status, 0) self.assertEqual(error, message) popen.assert_called_once_with( - ["tesseract", "input.bmp", "output"], + ["@tesseract@/bin/tesseract", "input.bmp", "output"], cwd=tmpdir, startupinfo=None, creationflags=0, @@ -278,7 +278,7 @@ class TestTesseract(BaseTest): self.assertEqual(status, 0) self.assertEqual(error, message) popen.assert_called_with( - ["tesseract", "input2.bmp", "output2", "-l", "fra", "--psm", "3"], + ["@tesseract@/bin/tesseract", "input2.bmp", "output2", "-l", "fra", "--psm", "3"], cwd=tmpdir, startupinfo=None, creationflags=0, @@ -309,7 +309,7 @@ class TestTesseract(BaseTest): self.assertEqual(result["angle"], 90) self.assertEqual(result["confidence"], 9.30) popen.assert_called_once_with( - ["tesseract", "input.bmp", "stdout", "--psm", "0"], + ["@tesseract@/bin/tesseract", "input.bmp", "stdout", "--psm", "0"], stdin=subprocess.PIPE, shell=False, startupinfo=None, @@ -345,7 +345,7 @@ class TestTesseract(BaseTest): self.assertEqual(result["angle"], 90) self.assertEqual(result["confidence"], 9.30) popen.assert_called_once_with( - ["tesseract", "input.bmp", "stdout", "--psm", "0"], + ["@tesseract@/bin/tesseract", "input.bmp", "stdout", "--psm", "0"], stdin=subprocess.PIPE, shell=False, startupinfo=None, @@ -378,7 +378,7 @@ class TestTesseract(BaseTest): self.assertEqual(result["angle"], 90) self.assertEqual(result["confidence"], 9.30) popen.assert_called_once_with( - ["tesseract", "input.bmp", "stdout", + ["@tesseract@/bin/tesseract", "input.bmp", "stdout", "--psm", "0", "-l", "osd"], stdin=subprocess.PIPE, shell=False, @@ -406,7 +406,7 @@ class TestTesseract(BaseTest): with self.assertRaises(tesseract.TesseractError) as te: tesseract.detect_orientation(self.image) popen.assert_called_once_with( - ["tesseract", "input.bmp", "stdout", "--psm", "0"], + ["@tesseract@/bin/tesseract", "input.bmp", "stdout", "--psm", "0"], stdin=subprocess.PIPE, shell=False, startupinfo=None, @@ -440,7 +440,7 @@ class TestTesseract(BaseTest): with self.assertRaises(tesseract.TesseractError) as te: tesseract.detect_orientation(self.image) popen.assert_called_once_with( - ["tesseract", "input.bmp", "stdout", "--psm", "0"], + ["@tesseract@/bin/tesseract", "input.bmp", "stdout", "--psm", "0"], stdin=subprocess.PIPE, shell=False, startupinfo=None, @@ -474,7 +474,7 @@ class TestTesseract(BaseTest): self.assertEqual(result["angle"], 90) self.assertEqual(result["confidence"], 9.30) popen.assert_called_once_with( - ["tesseract", "input.bmp", "stdout", "-psm", "0"], + ["@tesseract@/bin/tesseract", "input.bmp", "stdout", "-psm", "0"], stdin=subprocess.PIPE, shell=False, startupinfo=None, @@ -507,7 +507,7 @@ class TestTesseract(BaseTest): self.assertEqual(result["angle"], 90) self.assertEqual(result["confidence"], 9.30) popen.assert_called_once_with( - ["tesseract", "input.bmp", "stdout", "-psm", "0", "-l", "fra"], + ["@tesseract@/bin/tesseract", "input.bmp", "stdout", "-psm", "0", "-l", "fra"], stdin=subprocess.PIPE, shell=False, startupinfo=None, @@ -534,7 +534,7 @@ class TestTesseract(BaseTest): with self.assertRaises(tesseract.TesseractError) as te: tesseract.detect_orientation(self.image) popen.assert_called_once_with( - ["tesseract", "input.bmp", "stdout", "-psm", "0"], + ["@tesseract@/bin/tesseract", "input.bmp", "stdout", "-psm", "0"], stdin=subprocess.PIPE, shell=False, startupinfo=None, @@ -568,7 +568,7 @@ class TestTesseract(BaseTest): with self.assertRaises(tesseract.TesseractError) as te: tesseract.detect_orientation(self.image) popen.assert_called_once_with( - ["tesseract", "input.bmp", "stdout", "-psm", "0"], + ["@tesseract@/bin/tesseract", "input.bmp", "stdout", "-psm", "0"], stdin=subprocess.PIPE, shell=False, startupinfo=None,