Skip to content

Commit

Permalink
Add support for specifying OCR language in Tika
Browse files Browse the repository at this point in the history
This commit addresses issue #302, which requires Tika to be able to run
OCR in a language other than English.

The changes ensure that the application can specify the language
configuration for Tesseract OCR.

Close #302
  • Loading branch information
tagliala committed Sep 27, 2024
1 parent d3237a5 commit be54481
Show file tree
Hide file tree
Showing 11 changed files with 145 additions and 1 deletion.
3 changes: 3 additions & 0 deletions config/app.yml
Original file line number Diff line number Diff line change
Expand Up @@ -29,4 +29,7 @@ libreoffice_path: <%= ENV['LIBREOFFICE_PATH'] %>
tesseract_path: <%= ENV['TESSERACT_PATH'] %>
tika_path: <%= ENV['TIKA_PATH'] %>
wkhtmltopdf_path: <%= ENV['WKHTMLTOPDF_PATH'] %>

# Other settings
tika_config_directory: <%= ENV['TIKA_CONFIG_DIRECTORY'] %>
wkhtmltopdf_params: '-d 100 --encoding UTF-8'
1 change: 1 addition & 0 deletions lib/colore.rb
Original file line number Diff line number Diff line change
Expand Up @@ -9,3 +9,4 @@
require_relative 'document'
require_relative 'heathen'
require_relative 'sidekiq_workers'
require_relative 'tika_config'
4 changes: 4 additions & 0 deletions lib/config.rb
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,8 @@ class C_
attr_accessor :tika_path
# @return [String] Path to the wkhtmltopdf binary. Defaults to `"wkhtmltopdf"`
attr_accessor :wkhtmltopdf_path
# @return [String] Path to the writable Tika config directory, resolved relative to the `lib` directory. Defaults to `"../tmp/tika"`
attr_accessor :tika_config_directory
# @return [String] Params for wkhtmltopdf
attr_accessor :wkhtmltopdf_params

Expand All @@ -65,6 +67,8 @@ def self.config
c.tesseract_path = yaml['tesseract_path'] || 'tesseract'
c.tika_path = yaml['tika_path'] || 'tika'
c.wkhtmltopdf_path = yaml['wkhtmltopdf_path'] || 'wkhtmltopdf'

c.tika_config_directory = yaml['tika_config_directory'] || '../tmp/tika'
c.wkhtmltopdf_params = yaml['wkhtmltopdf_params'] || ''

c
Expand Down
1 change: 1 addition & 0 deletions lib/heathen/processor_methods/libreoffice.rb
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,7 @@ def libreoffice(format:)
if to_suffix == 'txt'
executioner.execute(
Colore::C_.tika_path,
"--config=#{Colore::TikaConfig.path_for(job.language)}",
'--text',
job.content_file,
binary: true
Expand Down
1 change: 1 addition & 0 deletions lib/heathen/processor_methods/pdftotext.rb
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ def pdftotext

executioner.execute(
Colore::C_.tika_path,
"--config=#{Colore::TikaConfig.path_for(job.language)}",
'--text',
job.content_file,
binary: true
Expand Down
61 changes: 61 additions & 0 deletions lib/tika_config.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
# frozen_string_literal: true

require 'fileutils'
require 'pathname'

module Colore
  # Locates, and generates on first use, the Tika configuration files that
  # tell the Tesseract OCR parser which language to recognise.
  module TikaConfig
    # Version of the configuration template. It is part of the directory path
    # of every generated file.
    VERSION = 'v1'

    # Language used whenever the requested language cannot be resolved
    DEFAULT_LANGUAGE = 'eng'

    # Template of a generated configuration file. `language_alpha3` is filled
    # in with the three-letter language code.
    TEMPLATE = <<~XML
      <?xml version="1.0" encoding="UTF-8"?>
      <properties>
      <parsers>
      <parser class="org.apache.tika.parser.DefaultParser"></parser>
      <parser class="org.apache.tika.parser.ocr.TesseractOCRParser">
      <params>
      <param name="language" type="string">%<language_alpha3>s</param>
      </params>
      </parser>
      </parsers>
      </properties>
    XML

    # Returns the path of the Tika configuration file used to perform OCR in
    # the given language, creating the file when it is not there yet.
    #
    # @param [String] language The language code, handed to
    #   `Colore::Utils.language_alpha3` to obtain its three-letter form.
    #
    # @return [Pathname] The configuration file for the resolved language, or
    #   the one for DEFAULT_LANGUAGE when the language cannot be resolved.
    def self.path_for(language)
      path_for!(Colore::Utils.language_alpha3(language) || DEFAULT_LANGUAGE)
    end

    # Absolute location of the configured Tika config directory. The configured
    # value is expanded relative to the directory containing this file.
    #
    # @return [Pathname]
    def self.tika_config_path
      Pathname.new(File.expand_path(Colore::C_.tika_config_directory, __dir__))
    end
    private_class_method :tika_config_path

    # Returns the configuration file for a three-letter language code, writing
    # it from TEMPLATE only when no such file exists.
    #
    # @param [String] language_alpha3 The three-letter language code.
    #
    # @return [Pathname]
    def self.path_for!(language_alpha3)
      version_directory = tika_config_path.join('ocr', VERSION)
      config_file = version_directory.join("tika.#{language_alpha3}.xml")

      unless config_file.file?
        FileUtils.mkdir_p(version_directory)
        File.write(config_file, format(TEMPLATE, language_alpha3: language_alpha3))
      end

      config_file
    end
    private_class_method :path_for!
  end
end
Binary file added spec/fixtures/heathen/quickfox.ar.jpg
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added spec/fixtures/heathen/quickfox.ar.pdf
Binary file not shown.
1 change: 1 addition & 0 deletions spec/fixtures/heathen/quickfox.ar.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
الثعلب البني السريع مفتون بالكلاب الكسولة
15 changes: 14 additions & 1 deletion spec/heathen/processor_methods/pdftotext_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,8 @@

RSpec.describe Heathen::Processor do
let(:content) { fixture('heathen/quickfox.pdf').read }
let(:job) { Heathen::Job.new 'foo', content, 'en' }
let(:job) { Heathen::Job.new 'foo', content, language }
let(:language) { 'en' }
let(:processor) { described_class.new job: job, logger: spec_logger }

after do
Expand All @@ -14,7 +15,19 @@
describe '#pdftotext' do
it 'converts PDF to TXT' do
processor.pdftotext
expect(job.content).to eq 'The quick brown fox jumps lazily over the dog'
expect(job.content.mime_type).to eq 'text/plain; charset=us-ascii'
end

context 'with Arabic files' do
let(:content) { fixture('heathen/quickfox.ar.pdf').read }
let(:language) { 'ar' }

it 'extracts Arabic text from images' do
processor.pdftotext
expect(job.content).to eq fixture('heathen/quickfox.ar.txt').read.strip.force_encoding(Encoding::ASCII_8BIT)
expect(job.content.mime_type).to eq 'text/plain; charset=utf-8'
end
end
end
end
59 changes: 59 additions & 0 deletions spec/lib/tika_config_spec.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
# frozen_string_literal: true

require 'spec_helper'
require 'fileutils'
require 'pathname'

# Specs for Colore::TikaConfig.path_for, run against a throw-away config
# directory so that the application's real one is never touched.
RSpec.describe Colore::TikaConfig do
  # Value handed to the application config. Colore::TikaConfig expands it
  # relative to the `lib` directory.
  let(:tika_config_directory) { '../tmp/tika-test' }
  # The same directory as an absolute path, computed from this spec file's
  # own location (`spec/lib`).
  let(:tika_test_config_path) { Pathname.new(File.expand_path('../../tmp/tika-test', __dir__)) }

  before do
    allow(Colore::C_.config).to receive(:tika_config_directory).and_return tika_config_directory
    # Start from a clean slate. `rm_rf` ignores a missing directory, so there
    # is no need to create it first.
    FileUtils.rm_rf tika_test_config_path
  end

  after do
    FileUtils.rm_rf tika_test_config_path
  end

  describe '.path_for' do
    subject(:path_for) { described_class.path_for(language) }

    context 'when the language is found' do
      let(:language) { 'fr' }

      before do
        allow(Colore::Utils).to receive(:language_alpha3).with('fr').and_return('fra')
      end

      it 'returns the correct configuration file path' do
        expect(path_for).to eq tika_test_config_path.join('ocr', described_class::VERSION, 'tika.fra.xml')
      end

      it 'writes the resolved language into the configuration file' do
        expect(path_for.read).to include '<param name="language" type="string">fra</param>'
      end
    end

    context 'when the language is not found' do
      let(:language) { 'unknown' }

      it 'returns the default configuration file path' do
        expect(path_for).to eq tika_test_config_path.join('ocr', described_class::VERSION, "tika.#{described_class::DEFAULT_LANGUAGE}.xml")
      end
    end

    context 'when the configuration file is already present' do
      let(:language) { 'en' }

      before do
        # Spy on File.write while still letting it create the file, so the
        # second call finds the file in place.
        allow(File).to receive(:write)
          .with(tika_test_config_path.join('ocr', described_class::VERSION, 'tika.eng.xml'), an_instance_of(String))
          .and_call_original
      end

      it 'does not overwrite it' do
        2.times { described_class.path_for(language) }
        expect(File).to have_received(:write).once
      end
    end
  end
end

0 comments on commit be54481

Please sign in to comment.