Segment/Recognize with tesserocr, sample processor app

solth · Apr 8, 2018 · d5f26cd · d5f26cd
commit d5f26cd
Show file tree

Hide file tree

Showing 25 changed files with 487 additions and 0 deletions.
diff --git a/.dockerignore b/.dockerignore
@@ -0,0 +1,8 @@
+*
+!Makefile
+!setup.py
+!requirements.txt
+!LICENSE
+!README.rst
+
+!ocrd_tesserocr
diff --git a/.gitignore b/.gitignore
@@ -0,0 +1,5 @@
+venv3
+.pytest_cache
+__pycache__
+*.pyc
+*.egg-info
diff --git a/.pylintrc b/.pylintrc
@@ -0,0 +1,13 @@
+[MASTER]
+extension-pkg-whitelist=lxml
+ignored-modules=cv2,tesserocr
+
+[MESSAGES CONTROL]
+disable =
+    missing-docstring,
+    no-self-use,
+    too-many-arguments,
+    superfluous-parens,
+    invalid-name,
+    line-too-long,
+    too-few-public-methods,
diff --git a/.travis.yml b/.travis.yml
@@ -0,0 +1,12 @@
+language: python
+python:
+  # - 2.7
+  - 3.6
+before_install:
+  - sudo add-apt-repository -y ppa:alex-p/tesseract-ocr
+  - sudo apt-get -qq update
+  - sudo make deps-ubuntu
+install:
+  - make deps-pip test-deps-pip
+script:
+  - export TESSDATA_PREFIX="/usr/share/tesseract-ocr/4.00/tessdata"; make test TESSDATA_PREFIX="/usr/share/tesseract-ocr/4.00/tessdata"
diff --git a/Dockerfile b/Dockerfile
@@ -0,0 +1,24 @@
+# Image for the ocrd_tesserocr processors, built on top of the pyocrd base image.
+FROM ocrd/pyocrd
+MAINTAINER OCR-D
+# Keep apt from prompting during the build and default Python I/O and the locale to UTF-8.
+ENV DEBIAN_FRONTEND noninteractive
+ENV PYTHONIOENCODING utf8
+ENV LC_ALL C.UTF-8
+ENV LANG C.UTF-8
+
+WORKDIR /build-ocrd
+# Packaging metadata read by setup.py; these files are whitelisted in .dockerignore.
+COPY setup.py .
+COPY requirements.txt .
+COPY README.rst .
+COPY LICENSE .
+# Tools needed to drive the Makefile and to pip-install the git+https requirements.
+RUN apt-get update && \
+    apt-get -y install --no-install-recommends \
+    ca-certificates \
+    make \
+    git
+# System packages listed in the Makefile's deps-ubuntu target (tesseract/leptonica headers, English model).
+COPY Makefile .
+RUN make deps-ubuntu
+# The package sources are copied only after the system dependencies are installed
+# (presumably to keep the layers above cacheable when only the sources change).
+COPY ocrd_tesserocr ./ocrd_tesserocr
+RUN pip3 install --upgrade pip
+RUN make deps-pip install
+
+# Arguments given to `docker run` are passed to `/bin/sh -c` as the command string.
+ENTRYPOINT ["/bin/sh", "-c"]
diff --git a/LICENSE b/LICENSE
@@ -0,0 +1,20 @@
+Copyright © 2018 Konstantin Baierer
+
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the "Software"),
+to deal in the Software without restriction, including without limitation
+the rights to use, copy, modify, merge, publish, distribute, sublicense,
+and/or sell copies of the Software, and to permit persons to whom the
+Software is furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included
+in all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
+DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE
+OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
diff --git a/Makefile b/Makefile
@@ -0,0 +1,46 @@
+# Export every variable defined in this file to the environment of the recipes.
+export
+
+SHELL = /bin/bash
+PYTHON = python
+PYTHONPATH := .:$(PYTHONPATH)
+PIP = pip
+LOG_LEVEL = INFO
+PYTHONIOENCODING=utf8
+
+# Docker container tag
+DOCKER_TAG = 'ocrd/ocrd_tesserocr'
+
+# All targets are commands, none of them produces a file of the same name
+# (and a directory called "test" exists, so "test" must never be considered up to date).
+.PHONY: help deps-ubuntu deps-pip test-deps-pip install docker test
+
+# BEGIN-EVAL makefile-parser --make-help Makefile
+
+help:
+	@echo ""
+	@echo "  Targets"
+	@echo ""
+	@echo "    deps-ubuntu    Dependencies for deployment in an ubuntu/debian linux"
+	@echo "    deps-pip       Install python deps via pip"
+	@echo "    test-deps-pip  Install python deps for testing via pip"
+	@echo "    install        Install"
+	@echo "    docker         Build docker image"
+	@echo "    test           Run test"
+
+# END-EVAL
+
+# Dependencies for deployment in an ubuntu/debian linux
+deps-ubuntu:
+	apt install -y \
+		libtesseract-dev \
+		libleptonica-dev \
+		tesseract-ocr-eng
+
+# Install python deps via pip
+deps-pip:
+	$(PIP) install -r requirements.txt
+
+# Install python deps for testing via pip
+# (invoked by the CI configuration as `make deps-pip test-deps-pip`)
+test-deps-pip:
+	$(PIP) install -r requirements_test.txt
+
+# Install
+install:
+	$(PIP) install .
+
+# Build docker image
+docker:
+	docker build -t $(DOCKER_TAG) .
+
+# Run test
+test:
+	$(PYTHON) -m pytest test
diff --git a/README.rst b/README.rst
@@ -0,0 +1,42 @@
+ocrd_processor_tesserocr
+========================
+
+    Segment region, line, recognize with tesserocr
+
+.. image:: https://travis-ci.org/OCR-D/ocrd_tesserocr.svg?branch=master
+    :target: https://travis-ci.org/OCR-D/ocrd_tesserocr
+
+.. image:: https://img.shields.io/docker/automated/ocrd/ocrd_tesserocr.svg
+    :target: https://hub.docker.com/r/ocrd/ocrd_tesserocr/tags/
+    :alt: Docker Automated build
+
+
+Installation
+------------
+
+Required ubuntu packages:
+
+* Tesseract headers (``libtesseract-dev``)
+* Some tesseract language models (``tesseract-ocr-{eng,deu,deu-frak,...}``)
+* Leptonica headers (``libleptonica-dev``)
+
+::
+
+    pip install -r requirements.txt
+    pip install .
+
+If tesserocr fails to compile with an error::
+
+    $PREFIX/include/tesseract/unicharset.h:241:10: error: ‘string’ does not name a type; did you mean ‘stdin’? 
+           static string CleanupString(const char* utf8_str) {
+                  ^~~~~~
+                  stdin
+
+This is due to some inconsistencies in the installed tesseract C headers. Replace ``string`` with ``std::string`` in ``$PREFIX/include/tesseract/unicharset.h:265:5:`` and ``$PREFIX/include/tesseract/unichar.h:164:10:`` ff.
+
+If tesserocr fails with an error about ``LSTM``/``CUBE``, you have a
+mismatch between tesseract header/data/pkg-config versions. ``apt policy
+libtesseract-dev`` lists the apt-installable versions, keep it consistent. Make
+sure there are no spurious pkg-config artifacts, e.g. in
+``/usr/local/lib/pkgconfig/tesseract.pc``. The same goes for language models.
+
diff --git a/ocrd-tool.json b/ocrd-tool.json
@@ -0,0 +1,24 @@
+{
+  "git_url": "https://github.com/ocr-d/ocrd-tesserocr",
+  "dockerhub": "ocrd/ocrd-tesserocr",
+  "tools": [
+    {
+      "tags": ["Layouterkennung"],
+      "description": "Segment regions into lines with tesseract",
+      "binary": "ocrd_tesserocr_segment_line",
+      "step": "segment-line"
+    },
+    {
+      "tags": ["Layouterkennung"],
+      "description": "Segment page into regions with tesseract",
+      "binary": "ocrd_tesserocr_segment_region",
+      "step": "segment-region"
+    },
+    {
+      "tags": ["Texterkennung"],
+      "description": "Recognize text in lines with tesseract",
+      "binary": "ocrd_tesserocr_recognize",
+      "step": "recognize"
+    }
+  ]
+}
diff --git a/ocrd_tesserocr/__init__.py b/ocrd_tesserocr/__init__.py
@@ -0,0 +1,3 @@
+from .recognize import TesserocrRecognize
+from .segment_line import TesserocrSegmentLine
+from .segment_region import TesserocrSegmentRegion
diff --git a/ocrd_tesserocr/cli.py b/ocrd_tesserocr/cli.py
@@ -0,0 +1,21 @@
+import click
+
+from ocrd.decorators import ocrd_cli_options, ocrd_cli_wrap_processor
+from ocrd_tesserocr.recognize import TesserocrRecognize
+from ocrd_tesserocr.segment_region import TesserocrSegmentRegion
+from ocrd_tesserocr.segment_line import TesserocrSegmentLine
+
+@click.command()
+@ocrd_cli_options
+def ocrd_tesserocr_segment_region(*args, **kwargs):
+    return ocrd_cli_wrap_processor(TesserocrSegmentRegion, *args, **kwargs)
+
+@click.command()
+@ocrd_cli_options
+def ocrd_tesserocr_segment_line(*args, **kwargs):
+    return ocrd_cli_wrap_processor(TesserocrSegmentLine, *args, **kwargs)
+
+@click.command()
+@ocrd_cli_options
+def ocrd_tesserocr_recognize(*args, **kwargs):
+    return ocrd_cli_wrap_processor(TesserocrRecognize, *args, **kwargs)
diff --git a/ocrd_tesserocr/config.py b/ocrd_tesserocr/config.py
@@ -0,0 +1,3 @@
+import os
+import tesserocr
+TESSDATA_PREFIX = os.environ['TESSDATA_PREFIX'] if 'TESSDATA_PREFIX' in os.environ else tesserocr.get_languages()[0]
diff --git a/ocrd_tesserocr/recognize.py b/ocrd_tesserocr/recognize.py
@@ -0,0 +1,40 @@
+from __future__ import absolute_import
+import tesserocr
+from ocrd.utils import getLogger, mets_file_id
+from ocrd import Processor, OcrdPage, MIMETYPE_PAGE
+from .config import TESSDATA_PREFIX
+
+log = getLogger('processor.TesserocrRecognize')
+
+DEFAULT_MODEL = tesserocr.get_languages()[1][-1]
+
+class TesserocrRecognize(Processor):
+
+    def process(self):
+        """
+        Performs the (text) recognition.
+        """
+        with tesserocr.PyTessBaseAPI(path=TESSDATA_PREFIX, lang=DEFAULT_MODEL) as tessapi:
+            log.info("Using model %s in %s for recognition", tesserocr.get_languages()[0], tesserocr.get_languages()[1][-1])
+            tessapi.SetPageSegMode(tesserocr.PSM.SINGLE_LINE)
+            for (n, input_file) in enumerate(self.input_files):
+                log.info("INPUT FILE %i / %s", n, input_file)
+                self.workspace.download_file(input_file)
+                page = OcrdPage.from_file(input_file)
+                image_url = page.imageFileName
+                log.info("page %s", page)
+                for region in page.list_textregions():
+                    textlines = region.list_textlines()
+                    log.info("About to recognize text in %i lines of region '%s'", len(textlines), region.ID)
+                    for (line_no, line) in enumerate(textlines):
+                        log.debug("Recognizing text in region '%s' line '%s'", region.ID, line_no)
+                        # xTODO use binarized / gray
+                        image = self.workspace.resolve_image_as_pil(image_url, line.coords)
+                        tessapi.SetImage(image)
+                        line.textequiv = tessapi.GetUTF8Text()
+                self.add_output_file(
+                    ID=mets_file_id(self.outputGrp, n),
+                    input_file=input_file,
+                    mimetype=MIMETYPE_PAGE,
+                    content=page.to_xml()
+                )
diff --git a/ocrd_tesserocr/segment_line.py b/ocrd_tesserocr/segment_line.py
@@ -0,0 +1,30 @@
+from __future__ import absolute_import
+import tesserocr
+from ocrd.utils import getLogger, mets_file_id
+from ocrd import Processor, OcrdPage, MIMETYPE_PAGE
+from .config import TESSDATA_PREFIX
+
+log = getLogger('processor.TesserocrSegmentLine')
+
+class TesserocrSegmentLine(Processor):
+
+    def process(self):
+        """
+        Performs the line segmentation.
+        """
+        with tesserocr.PyTessBaseAPI(path=TESSDATA_PREFIX) as tessapi:
+            for (n, input_file) in enumerate(self.input_files):
+                page = OcrdPage.from_file(self.workspace.download_file(input_file))
+                image_url = page.imageFileName
+                for region in page.list_textregions():
+                    log.debug("Detecting lines in %s with tesseract", region)
+                    image = self.workspace.resolve_image_as_pil(image_url, region.coords)
+                    tessapi.SetImage(image)
+                    for component in tessapi.GetComponentImages(tesserocr.RIL.TEXTLINE, True):
+                        region.add_textline(coords=component[1])
+                self.add_output_file(
+                    ID=mets_file_id(self.outputGrp, n),
+                    input_file=input_file,
+                    mimetype=MIMETYPE_PAGE,
+                    content=page.to_xml()
+                )
diff --git a/ocrd_tesserocr/segment_region.py b/ocrd_tesserocr/segment_region.py
@@ -0,0 +1,32 @@
+from __future__ import absolute_import
+import tesserocr
+from ocrd.utils import getLogger, mets_file_id
+from ocrd import Processor, OcrdPage, MIMETYPE_PAGE
+from .config import TESSDATA_PREFIX
+
+log = getLogger('processor.TesserocrSegmentRegion')
+
+class TesserocrSegmentRegion(Processor):
+
+    def process(self):
+        """
+        Performs the region segmentation.
+        """
+        with tesserocr.PyTessBaseAPI(path=TESSDATA_PREFIX) as tessapi:
+            for (n, input_file) in enumerate(self.input_files):
+                page = OcrdPage.from_file(self.workspace.download_file(input_file))
+                image = self.workspace.resolve_image_as_pil(page.imageFileName)
+                log.debug("Detecting regions with tesseract")
+                tessapi.SetImage(image)
+                for component in tessapi.GetComponentImages(tesserocr.RIL.BLOCK, True):
+                    box, index = component[1], component[2]
+                    # the region reference in the reading order element
+                    ID = "r%i" % index
+                    page.add_reading_order_ref(ID, index)
+                    page.add_textregion(ID, box)
+                self.add_output_file(
+                    ID=mets_file_id(self.outputGrp, n),
+                    input_file=input_file,
+                    mimetype=MIMETYPE_PAGE,
+                    content=page.to_xml()
+                )
diff --git a/requirements.txt b/requirements.txt
@@ -0,0 +1,4 @@
+click
+git+https://github.com/sirfz/tesserocr#egg=tesserocr
+git+https://github.com/OCR-D/pyocrd#egg=ocrd
+pyaml
diff --git a/requirements_dev.txt b/requirements_dev.txt
@@ -0,0 +1,14 @@
+-e .
+-e ../pyocrd
+
+click
+requests
+lxml
+git+https://github.com/smarnach/pyexiftool.git#egg=pyexiftool
+git+https://github.com/sirfz/tesserocr#egg=tesserocr
+lxml
+jsonschema
+Pillow
+numpy
+opencv-python
+Flask
diff --git a/requirements_test.txt b/requirements_test.txt
@@ -0,0 +1 @@
+pytest
diff --git a/setup.py b/setup.py
@@ -0,0 +1,38 @@
+# -*- coding: utf-8 -*-
+"""
+Installs three binaries:
+
+    - ocrd_tesserocr_segment_line
+    - ocrd_tesserocr_segment_region
+    - ocrd_tesserocr_recognize
+"""
+import codecs
+
+from setuptools import setup, find_packages
+
+with codecs.open('README.rst', encoding='utf-8') as f:
+    README = f.read()
+
+with codecs.open('LICENSE', encoding='utf-8') as f:
+    LICENSE = f.read().encode('utf-8')
+
+setup(
+    name='ocrd_tesserocr',
+    version='0.0.1',
+    description='Tesserocr bindings',
+    long_description=README,
+    author='Konstantin Baierer',
+    author_email='[email protected]',
+    url='https://github.com/kba/ocrd_tesserocr',
+    license=LICENSE,
+    packages=find_packages(exclude=('tests', 'docs')),
+    install_requires=[
+    ],
+    entry_points={
+        'console_scripts': [
+            'ocrd_tesserocr_segment_region=ocrd_tesserocr.cli:ocrd_tesserocr_segment_region',
+            'ocrd_tesserocr_segment_line=ocrd_tesserocr.cli:ocrd_tesserocr_segment_line',
+            'ocrd_tesserocr_recognize=ocrd_tesserocr.cli:ocrd_tesserocr_recognize',
+        ]
+    },
+)
diff --git a/test/__init__.py b/test/__init__.py