This patch is required for pyocr to work with Tesseract version 3.05.00 and has been submitted upstream at the following URL: https://github.com/jflesch/pyocr/pull/62 diff --git a/src/pyocr/builders.py b/src/pyocr/builders.py index 73c964d..20f390c 100644 --- a/src/pyocr/builders.py +++ b/src/pyocr/builders.py @@ -240,8 +240,10 @@ class BaseBuilder(object): cuneiform_args : Arguments passed to the Cuneiform command line. """ - def __init__(self, file_extensions, tesseract_configs, cuneiform_args): + def __init__(self, file_extensions, tesseract_flags, tesseract_configs, + cuneiform_args): self.file_extensions = file_extensions + self.tesseract_flags = tesseract_flags self.tesseract_configs = tesseract_configs self.cuneiform_args = cuneiform_args @@ -298,7 +300,7 @@ class TextBuilder(BaseBuilder): def __init__(self, tesseract_layout=3, cuneiform_dotmatrix=False, cuneiform_fax=False, cuneiform_singlecolumn=False): file_ext = ["txt"] - tess_conf = ["-psm", str(tesseract_layout)] + tess_flags = ["-psm", str(tesseract_layout)] cun_args = ["-f", "text"] # Add custom cuneiform parameters if needed for par, arg in [(cuneiform_dotmatrix, "--dotmatrix"), @@ -306,7 +308,7 @@ class TextBuilder(BaseBuilder): (cuneiform_singlecolumn, "--singlecolumn")]: if par: cun_args.append(arg) - super(TextBuilder, self).__init__(file_ext, tess_conf, cun_args) + super(TextBuilder, self).__init__(file_ext, tess_flags, [], cun_args) self.tesseract_layout = tesseract_layout self.built_text = [] @@ -540,9 +542,11 @@ class WordBoxBuilder(BaseBuilder): def __init__(self, tesseract_layout=1): file_ext = ["html", "hocr"] - tess_conf = ["hocr", "-psm", str(tesseract_layout)] + tess_flags = ["-psm", str(tesseract_layout)] + tess_conf = ["hocr"] cun_args = ["-f", "hocr"] - super(WordBoxBuilder, self).__init__(file_ext, tess_conf, cun_args) + super(WordBoxBuilder, self).__init__(file_ext, tess_flags, tess_conf, + cun_args) self.word_boxes = [] self.tesseract_layout = tesseract_layout @@ -614,9 +618,11 @@ class LineBoxBuilder(BaseBuilder): def __init__(self, tesseract_layout=1): file_ext = ["html", "hocr"] - tess_conf = ["hocr", "-psm", str(tesseract_layout)] + tess_flags = ["-psm", str(tesseract_layout)] + tess_conf = ["hocr"] cun_args = ["-f", "hocr"] - super(LineBoxBuilder, self).__init__(file_ext, tess_conf, cun_args) + super(LineBoxBuilder, self).__init__(file_ext, tess_flags, tess_conf, + cun_args) self.lines = [] self.tesseract_layout = tesseract_layout diff --git a/src/pyocr/libtesseract/tesseract_raw.py b/src/pyocr/libtesseract/tesseract_raw.py index 0c2259a..f7ab309 100644 --- a/src/pyocr/libtesseract/tesseract_raw.py +++ b/src/pyocr/libtesseract/tesseract_raw.py @@ -263,11 +263,22 @@ if g_libtesseract: ] g_libtesseract.TessDeleteText.restype = None - g_libtesseract.TessBaseAPIDetectOS.argtypes = [ - ctypes.c_void_p, # TessBaseAPI* - ctypes.POINTER(OSResults), - ] - g_libtesseract.TessBaseAPIDetectOS.restype = ctypes.c_bool + if hasattr(g_libtesseract, 'TessBaseAPIDetectOrientationScript'): + g_libtesseract.TessBaseAPIDetectOrientationScript.argtypes = [ + ctypes.c_void_p, # TessBaseAPI* + ctypes.POINTER(ctypes.c_int), # orient_deg + ctypes.POINTER(ctypes.c_float), # orient_conf + ctypes.POINTER(ctypes.c_char_p), # script_name + ctypes.POINTER(ctypes.c_float), # script_conf + ] + g_libtesseract.TessBaseAPIDetectOrientationScript.restype = \ + ctypes.c_bool + else: + g_libtesseract.TessBaseAPIDetectOS.argtypes = [ + ctypes.c_void_p, # TessBaseAPI* + ctypes.POINTER(OSResults), + ] + g_libtesseract.TessBaseAPIDetectOS.restype = ctypes.c_bool def init(lang=None): @@ -526,15 +537,37 @@ def detect_os(handle): global g_libtesseract assert(g_libtesseract) - results = OSResults() - r = g_libtesseract.TessBaseAPIDetectOS( - ctypes.c_void_p(handle), - ctypes.pointer(results) - ) - if not r: - raise TesseractError("detect_orientation failed", - "TessBaseAPIDetectOS() failed") - return { - "orientation": results.best_orientation_id, - "confidence": results.best_oconfidence, - } + # Use the new API function if it is available, because since Tesseract + # 3.05.00 the old API function _always_ returns False. + if hasattr(g_libtesseract, 'TessBaseAPIDetectOrientationScript'): + orientation_deg = ctypes.c_int(0) + orientation_confidence = ctypes.c_float(0.0) + + r = g_libtesseract.TessBaseAPIDetectOrientationScript( + ctypes.c_void_p(handle), + ctypes.byref(orientation_deg), + ctypes.byref(orientation_confidence), + None, # script_name + None # script_confidence + ) + + if not r: + raise TesseractError("detect_orientation failed", + "TessBaseAPIDetectOrientationScript() failed") + return { + "orientation": round(orientation_deg.value / 90), + "confidence": orientation_confidence.value, + } + else: # old API (before Tesseract 3.05.00) + results = OSResults() + r = g_libtesseract.TessBaseAPIDetectOS( + ctypes.c_void_p(handle), + ctypes.pointer(results) + ) + if not r: + raise TesseractError("detect_orientation failed", + "TessBaseAPIDetectOS() failed") + return { + "orientation": results.best_orientation_id, + "confidence": results.best_oconfidence, + } diff --git a/src/pyocr/tesseract.py b/src/pyocr/tesseract.py index 99b0121..658c96b 100755 --- a/src/pyocr/tesseract.py +++ b/src/pyocr/tesseract.py @@ -22,6 +22,8 @@ import os import subprocess import sys import tempfile +import contextlib +import shutil from . import builders from . import error @@ -62,9 +64,11 @@ class CharBoxBuilder(builders.BaseBuilder): def __init__(self): file_ext = ["box"] + tess_flags = [] tess_conf = ["batch.nochop", "makebox"] cun_args = [] - super(CharBoxBuilder, self).__init__(file_ext, tess_conf, cun_args) + super(CharBoxBuilder, self).__init__(file_ext, tess_flags, tess_conf, + cun_args) self.tesseract_layout = 1 @staticmethod @@ -173,18 +177,19 @@ def detect_orientation(image, lang=None): TesseractError --- if no script detected on the image """ _set_environment() - with temp_file(".bmp") as input_file: - command = [TESSERACT_CMD, input_file.name, 'stdout', "-psm", "0"] + with temp_dir() as tmpdir: + command = [TESSERACT_CMD, "input.bmp", 'stdout', "-psm", "0"] if lang is not None: command += ['-l', lang] if image.mode != "RGB": image = image.convert("RGB") - image.save(input_file.name) + image.save(os.path.join(tmpdir, "input.bmp")) proc = subprocess.Popen(command, stdin=subprocess.PIPE, shell=False, startupinfo=g_subprocess_startup_info, creationflags=g_creation_flags, + cwd=tmpdir, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) proc.stdin.close() @@ -224,8 +229,8 @@ def get_available_builders(): ] -def run_tesseract(input_filename, output_filename_base, lang=None, - configs=None): +def run_tesseract(input_filename, output_filename_base, cwd=None, lang=None, + flags=None, configs=None): ''' Runs Tesseract: `TESSERACT_CMD` \ @@ -238,6 +243,8 @@ def run_tesseract(input_filename, output_filename_base, lang=None, input_filename --- image to read output_filename_base --- file name in which must be stored the result (without the extension) + cwd --- Run Tesseract in the specified working directory or use current + one if None lang --- Tesseract language to use (if None, none will be specified) config --- List of Tesseract configs to use (if None, none will be specified) @@ -252,10 +259,13 @@ def run_tesseract(input_filename, output_filename_base, lang=None, if lang is not None: command += ['-l', lang] + if flags is not None: + command += flags + if configs is not None: command += configs - proc = subprocess.Popen(command, + proc = subprocess.Popen(command, cwd=cwd, startupinfo=g_subprocess_startup_info, creationflags=g_creation_flags, stdout=subprocess.PIPE, @@ -301,11 +311,18 @@ class ReOpenableTempfile(object): self.name = None -def temp_file(suffix): - ''' Returns a temporary file ''' - if os.name == 'nt': # Windows - return ReOpenableTempfile(suffix) - return tempfile.NamedTemporaryFile(prefix='tess_', suffix=suffix) +@contextlib.contextmanager +def temp_dir(): + """ + A context manager for maintaining a temporary directory + """ + # NOTE: Drop this as soon as we don't support Python 2.7 anymore, because + # since Python 3.2 there is a context manager called TemporaryDirectory(). + path = tempfile.mkdtemp(prefix='tess_') + try: + yield path + finally: + shutil.rmtree(path) def image_to_string(image, lang=None, builder=None): @@ -329,23 +346,20 @@ def image_to_string(image, lang=None, builder=None): if builder is None: builder = builders.TextBuilder() - with temp_file(".bmp") as input_file: - with temp_file('') as output_file: - output_file_name_base = output_file.name - + with temp_dir() as tmpdir: if image.mode != "RGB": image = image.convert("RGB") - image.save(input_file.name) - (status, errors) = run_tesseract(input_file.name, - output_file_name_base, + image.save(os.path.join(tmpdir, "input.bmp")) + (status, errors) = run_tesseract("input.bmp", "output", cwd=tmpdir, lang=lang, + flags=builder.tesseract_flags, configs=builder.tesseract_configs) if status: raise TesseractError(status, errors) output_file_name = "ERROR" for file_extension in builder.file_extensions: - output_file_name = ('%s.%s' % (output_file_name_base, + output_file_name = ('%s.%s' % (os.path.join(tmpdir, "output"), file_extension)) if not os.access(output_file_name, os.F_OK): continue diff --git a/tests/tests_libtesseract.py b/tests/tests_libtesseract.py index ccddd07..1ac2a4b 100644 --- a/tests/tests_libtesseract.py +++ b/tests/tests_libtesseract.py @@ -33,8 +33,9 @@ class TestContext(unittest.TestCase): (3, 3, 0), (3, 4, 0), (3, 4, 1), + (3, 5, 0), ), ("Tesseract does not have the expected version" - " (3.4.0) ! Some tests will be skipped !")) + " (3.5.0) ! Some tests will be skipped !")) def test_langs(self): langs = libtesseract.get_available_languages() diff --git a/tests/tests_tesseract.py b/tests/tests_tesseract.py index e29c512..fa4d483 100644 --- a/tests/tests_tesseract.py +++ b/tests/tests_tesseract.py @@ -27,8 +27,9 @@ class TestContext(unittest.TestCase): (3, 3, 0), (3, 4, 0), (3, 4, 1), + (3, 5, 0), ), ("Tesseract does not have the expected version" - " (3.4.0) ! Some tests will be skipped !")) + " (3.5.0) ! Some tests will be skipped !")) def test_langs(self): langs = tesseract.get_available_languages()