forked from OCR-D/ocrd_tesserocr
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Segment/Recognize with tesserocr, sample processor app
- Loading branch information
0 parents
commit d5f26cd
Showing
25 changed files
with
487 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,8 @@ | ||
* | ||
!Makefile | ||
!setup.py | ||
!requirements.txt | ||
!LICENSE | ||
!README.rst | ||
|
||
!ocrd_tesserocr |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,5 @@ | ||
venv3 | ||
.pytest_cache | ||
__pycache__ | ||
*.pyc | ||
*.egg-info |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,13 @@ | ||
[MASTER] | ||
extension-pkg-whitelist=lxml | ||
ignored-modules=cv2,tesserocr | ||
|
||
[MESSAGES CONTROL] | ||
disable = | ||
missing-docstring, | ||
no-self-use, | ||
too-many-arguments, | ||
superfluous-parens, | ||
invalid-name, | ||
line-too-long, | ||
too-few-public-methods, |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,12 @@ | ||
language: python | ||
python: | ||
# - 2.7 | ||
- 3.6 | ||
before_install: | ||
- sudo add-apt-repository -y ppa:alex-p/tesseract-ocr | ||
- sudo apt-get -qq update | ||
- sudo make deps-ubuntu | ||
install: | ||
- make deps-pip test-deps-pip | ||
script: | ||
- export TESSDATA_PREFIX="/usr/share/tesseract-ocr/4.00/tessdata"; make test TESSDATA_PREFIX="/usr/share/tesseract-ocr/4.00/tessdata" |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,24 @@ | ||
# Image providing the ocrd_tesserocr command line processors on top of the
# OCR-D core image.
# NOTE(review): the base image is untagged (implicitly :latest); pin a tag or
# digest once a known-good one has been chosen, for reproducible builds.
FROM ocrd/pyocrd
LABEL maintainer="OCR-D"

# Only needed while packages are installed; an ARG keeps it out of the
# environment of running containers.
ARG DEBIAN_FRONTEND=noninteractive

ENV PYTHONIOENCODING=utf8 \
    LC_ALL=C.UTF-8 \
    LANG=C.UTF-8

WORKDIR /build-ocrd
COPY setup.py requirements.txt README.rst LICENSE Makefile ./

# `make deps-ubuntu` installs further apt packages, so it has to run in the
# same layer as `apt-get update`, before the package lists are removed.
RUN apt-get update && \
    apt-get -y install --no-install-recommends \
        ca-certificates \
        git \
        make && \
    make deps-ubuntu && \
    rm -rf /var/lib/apt/lists/*

COPY ocrd_tesserocr ./ocrd_tesserocr
RUN pip3 install --no-cache-dir --upgrade pip && \
    make deps-pip install

# Arguments given to `docker run` are interpreted as a shell command line.
ENTRYPOINT ["/bin/sh", "-c"]
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,20 @@ | ||
Copyright © 2018 Konstantin Baierer | ||
|
||
Permission is hereby granted, free of charge, to any person obtaining | ||
a copy of this software and associated documentation files (the "Software"), | ||
to deal in the Software without restriction, including without limitation | ||
the rights to use, copy, modify, merge, publish, distribute, sublicense, | ||
and/or sell copies of the Software, and to permit persons to whom the | ||
Software is furnished to do so, subject to the following conditions: | ||
|
||
The above copyright notice and this permission notice shall be included | ||
in all copies or substantial portions of the Software. | ||
|
||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, | ||
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES | ||
OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. | ||
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, | ||
DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, | ||
TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE | ||
OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,46 @@ | ||
# Export all make variables to the environment of the recipes
export

SHELL = /bin/bash
PYTHON = python
PYTHONPATH := .:$(PYTHONPATH)
PIP = pip
LOG_LEVEL = INFO
PYTHONIOENCODING=utf8

# Docker container tag
DOCKER_TAG = 'ocrd/ocrd_tesserocr'

# None of the targets produce a file of the same name
.PHONY: help deps-ubuntu deps-pip test-deps-pip install docker test

# BEGIN-EVAL makefile-parser --make-help Makefile

help:
	@echo ""
	@echo " Targets"
	@echo ""
	@echo " deps-ubuntu Dependencies for deployment in an ubuntu/debian linux"
	@echo " deps-pip Install python deps via pip"

# END-EVAL

# Dependencies for deployment in an ubuntu/debian linux
# (apt-get rather than apt: the latter has no stable interface for scripts)
deps-ubuntu:
	apt-get install -y \
		libtesseract-dev \
		libleptonica-dev \
		tesseract-ocr-eng

# Install python deps via pip
deps-pip:
	$(PIP) install -r requirements.txt

# Install python test deps via pip
# (invoked by the CI configuration; pytest is what the `test` target runs)
test-deps-pip:
	$(PIP) install pytest

# Install
install:
	$(PIP) install .

# Build docker image
docker:
	docker build -t $(DOCKER_TAG) .

# Run test
test:
	$(PYTHON) -m pytest test
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,42 @@ | ||
ocrd_processor_tesserocr | ||
======================== | ||
|
||
Segment region, line, recognize with tesserocr | ||
|
||
.. image:: https://travis-ci.org/OCR-D/ocrd_tesserocr.svg?branch=master | ||
:target: https://travis-ci.org/OCR-D/ocrd_tesserocr | ||
|
||
.. image:: https://img.shields.io/docker/automated/ocrd/ocrd_tesserocr.svg | ||
:target: https://hub.docker.com/r/ocrd/ocrd_tesserocr/tags/ | ||
:alt: Docker Automated build | ||
|
||
|
||
Installation | ||
------------ | ||
|
||
Required ubuntu packages: | ||
|
||
* Tesseract headers (``libtesseract-dev``) | ||
* Some tesseract language models (``tesseract-ocr-{eng,deu,deu-frak,...}``) | ||
* Leptonica headers (``libleptonica-dev``) | ||
|
||
:: | ||
|
||
pip install -r requirements.txt | ||
pip install . | ||
|
||
If tesserocr fails to compile with an error:: | ||
|
||
$PREFIX/include/tesseract/unicharset.h:241:10: error: ‘string’ does not name a type; did you mean ‘stdin’? | ||
static string CleanupString(const char* utf8_str) { | ||
^~~~~~ | ||
stdin | ||
|
||
This is due to some inconsistencies in the installed tesseract C headers. Replace ``string`` with ``std::string`` in ``$PREFIX/include/tesseract/unicharset.h:265:5:`` and ``$PREFIX/include/tesseract/unichar.h:164:10:`` ff. | ||
|
||
If tesserocr fails with an error about ``LSTM``/``CUBE``, you have a | ||
mismatch between tesseract header/data/pkg-config versions. ``apt policy | ||
libtesseract-dev`` lists the apt-installable versions, keep it consistent. Make | ||
sure there are no spurious pkg-config artifacts, e.g. in | ||
``/usr/local/lib/pkgconfig/tesseract.pc``. The same goes for language models. | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,24 @@ | ||
{ | ||
"git_url": "https://github.com/OCR-D/ocrd_tesserocr", | ||
"dockerhub": "ocrd/ocrd_tesserocr", | ||
"tools": [ | ||
{ | ||
"tags": ["Layouterkennung"], | ||
"description": "Segment regions into lines with tesseract", | ||
"binary": "ocrd_tesserocr_segment_line", | ||
"step": "segment-line" | ||
}, | ||
{ | ||
"tags": ["Layouterkennung"], | ||
"description": "Segment page into regions with tesseract", | ||
"binary": "ocrd_tesserocr_segment_region", | ||
"step": "segment-region" | ||
}, | ||
{ | ||
"tags": ["Texterkennung"], | ||
"description": "Recognize text in lines with tesseract", | ||
"binary": "ocrd_tesserocr_recognize", | ||
"step": "recognize" | ||
} | ||
] | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,3 @@ | ||
from .recognize import TesserocrRecognize | ||
from .segment_line import TesserocrSegmentLine | ||
from .segment_region import TesserocrSegmentRegion |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,21 @@ | ||
import click | ||
|
||
from ocrd.decorators import ocrd_cli_options, ocrd_cli_wrap_processor | ||
from ocrd_tesserocr.recognize import TesserocrRecognize | ||
from ocrd_tesserocr.segment_region import TesserocrSegmentRegion | ||
from ocrd_tesserocr.segment_line import TesserocrSegmentLine | ||
|
||
# Command line entry point for the region segmentation processor.
# (Documented with comments only: a docstring would become the click help text.)
@click.command()
@ocrd_cli_options
def ocrd_tesserocr_segment_region(*args, **kwargs):
    processor_class = TesserocrSegmentRegion
    return ocrd_cli_wrap_processor(processor_class, *args, **kwargs)
|
||
# Command line entry point for the line segmentation processor.
# (Documented with comments only: a docstring would become the click help text.)
@click.command()
@ocrd_cli_options
def ocrd_tesserocr_segment_line(*args, **kwargs):
    processor_class = TesserocrSegmentLine
    return ocrd_cli_wrap_processor(processor_class, *args, **kwargs)
|
||
# Command line entry point for the text recognition processor.
# (Documented with comments only: a docstring would become the click help text.)
@click.command()
@ocrd_cli_options
def ocrd_tesserocr_recognize(*args, **kwargs):
    processor_class = TesserocrRecognize
    return ocrd_cli_wrap_processor(processor_class, *args, **kwargs)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,3 @@ | ||
import os | ||
import tesserocr | ||
# Directory holding the tesseract models: the TESSDATA_PREFIX environment
# variable wins; tesserocr is only queried for its path when it is not set.
if 'TESSDATA_PREFIX' in os.environ:
    TESSDATA_PREFIX = os.environ['TESSDATA_PREFIX']
else:
    TESSDATA_PREFIX = tesserocr.get_languages()[0]
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,40 @@ | ||
from __future__ import absolute_import | ||
import tesserocr | ||
from ocrd.utils import getLogger, mets_file_id | ||
from ocrd import Processor, OcrdPage, MIMETYPE_PAGE | ||
from .config import TESSDATA_PREFIX | ||
|
||
log = getLogger('processor.TesserocrRecognize') | ||
|
||
DEFAULT_MODEL = tesserocr.get_languages()[1][-1] | ||
|
||
class TesserocrRecognize(Processor):
    """
    Recognizes the text of all text lines of the input pages with tesseract.
    """

    def process(self):
        """
        Performs the (text) recognition.

        For every input file, the image of each text line of each text region
        is passed to tesseract in single-line mode and the result is stored as
        the text equivalent of that line. One PAGE output file is added per
        input file.
        """
        with tesserocr.PyTessBaseAPI(path=TESSDATA_PREFIX, lang=DEFAULT_MODEL) as tessapi:
            # Report the model and data path the API was actually set up with
            log.info("Using model %s in %s for recognition", DEFAULT_MODEL, TESSDATA_PREFIX)
            tessapi.SetPageSegMode(tesserocr.PSM.SINGLE_LINE)
            for (n, input_file) in enumerate(self.input_files):
                log.info("INPUT FILE %i / %s", n, input_file)
                # NOTE(review): the return value is ignored here while the
                # segmentation processors pass it to OcrdPage.from_file -- confirm
                # that download_file updates input_file in place.
                self.workspace.download_file(input_file)
                page = OcrdPage.from_file(input_file)
                image_url = page.imageFileName
                log.info("page %s", page)
                for region in page.list_textregions():
                    textlines = region.list_textlines()
                    log.info("About to recognize text in %i lines of region '%s'", len(textlines), region.ID)
                    for (line_no, line) in enumerate(textlines):
                        log.debug("Recognizing text in region '%s' line '%s'", region.ID, line_no)
                        # TODO use binarized / gray
                        image = self.workspace.resolve_image_as_pil(image_url, line.coords)
                        tessapi.SetImage(image)
                        line.textequiv = tessapi.GetUTF8Text()
                self.add_output_file(
                    ID=mets_file_id(self.outputGrp, n),
                    input_file=input_file,
                    mimetype=MIMETYPE_PAGE,
                    content=page.to_xml()
                )
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,30 @@ | ||
from __future__ import absolute_import | ||
import tesserocr | ||
from ocrd.utils import getLogger, mets_file_id | ||
from ocrd import Processor, OcrdPage, MIMETYPE_PAGE | ||
from .config import TESSDATA_PREFIX | ||
|
||
log = getLogger('processor.TesserocrSegmentLine') | ||
|
||
class TesserocrSegmentLine(Processor):

    def process(self):
        """
        Performs the line segmentation.

        Every text region of every input page is handed to tesseract, each
        text line component found is added to the region, and one PAGE output
        file is added per input file.
        """
        with tesserocr.PyTessBaseAPI(path=TESSDATA_PREFIX) as api:
            for file_no, page_file in enumerate(self.input_files):
                local_file = self.workspace.download_file(page_file)
                page = OcrdPage.from_file(local_file)
                page_image_url = page.imageFileName
                for text_region in page.list_textregions():
                    log.debug("Detecting lines in %s with tesseract", text_region)
                    region_image = self.workspace.resolve_image_as_pil(page_image_url, text_region.coords)
                    api.SetImage(region_image)
                    line_components = api.GetComponentImages(tesserocr.RIL.TEXTLINE, True)
                    for line_component in line_components:
                        # the second element of a component is used as the line coordinates
                        text_region.add_textline(coords=line_component[1])
                self.add_output_file(
                    ID=mets_file_id(self.outputGrp, file_no),
                    input_file=page_file,
                    mimetype=MIMETYPE_PAGE,
                    content=page.to_xml()
                )
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,32 @@ | ||
from __future__ import absolute_import | ||
import tesserocr | ||
from ocrd.utils import getLogger, mets_file_id | ||
from ocrd import Processor, OcrdPage, MIMETYPE_PAGE | ||
from .config import TESSDATA_PREFIX | ||
|
||
log = getLogger('processor.TesserocrSegmentRegion') | ||
|
||
class TesserocrSegmentRegion(Processor):

    def process(self):
        """
        Performs the region segmentation.

        The image of every input page is handed to tesseract, each block
        component found is added to the page as a text region together with a
        reading order reference, and one PAGE output file is added per input
        file.
        """
        with tesserocr.PyTessBaseAPI(path=TESSDATA_PREFIX) as api:
            for file_no, page_file in enumerate(self.input_files):
                local_file = self.workspace.download_file(page_file)
                page = OcrdPage.from_file(local_file)
                page_image = self.workspace.resolve_image_as_pil(page.imageFileName)
                log.debug("Detecting regions with tesseract")
                api.SetImage(page_image)
                for block in api.GetComponentImages(tesserocr.RIL.BLOCK, True):
                    box = block[1]
                    index = block[2]
                    # the region reference in the reading order element
                    region_id = "r%i" % index
                    page.add_reading_order_ref(region_id, index)
                    page.add_textregion(region_id, box)
                self.add_output_file(
                    ID=mets_file_id(self.outputGrp, file_no),
                    input_file=page_file,
                    mimetype=MIMETYPE_PAGE,
                    content=page.to_xml()
                )
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,4 @@ | ||
click | ||
git+https://github.com/sirfz/tesserocr#egg=tesserocr | ||
git+https://github.com/OCR-D/pyocrd#egg=ocrd | ||
pyaml |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,14 @@ | ||
-e . | ||
-e ../pyocrd | ||
|
||
click | ||
requests | ||
lxml | ||
git+https://github.com/smarnach/pyexiftool.git#egg=pyexiftool | ||
git+https://github.com/sirfz/tesserocr#egg=tesserocr | ||
lxml | ||
jsonschema | ||
Pillow | ||
numpy | ||
opencv-python | ||
Flask |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
pytest |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,38 @@ | ||
# -*- coding: utf-8 -*-
"""
Installs three binaries:
- ocrd_tesserocr_segment_line
- ocrd_tesserocr_segment_region
- ocrd_tesserocr_recognize
"""
import codecs

from setuptools import setup, find_packages

# Both files are read as text; codecs.open decodes them from UTF-8.
with codecs.open('README.rst', encoding='utf-8') as f:
    README = f.read()

# Kept as text: re-encoding to bytes would put a bytes object into the
# package metadata on Python 3.
with codecs.open('LICENSE', encoding='utf-8') as f:
    LICENSE = f.read()

setup(
    name='ocrd_tesserocr',
    version='0.0.1',
    description='Tesserocr bindings',
    long_description=README,
    author='Konstantin Baierer',
    author_email='[email protected]',
    url='https://github.com/kba/ocrd_tesserocr',
    license=LICENSE,
    packages=find_packages(exclude=('tests', 'docs')),
    install_requires=[
    ],
    # One console script per processor, implemented in ocrd_tesserocr.cli
    entry_points={
        'console_scripts': [
            'ocrd_tesserocr_segment_region=ocrd_tesserocr.cli:ocrd_tesserocr_segment_region',
            'ocrd_tesserocr_segment_line=ocrd_tesserocr.cli:ocrd_tesserocr_segment_line',
            'ocrd_tesserocr_recognize=ocrd_tesserocr.cli:ocrd_tesserocr_recognize',
        ]
    },
)
Empty file.
Oops, something went wrong.