-
Notifications
You must be signed in to change notification settings - Fork 3
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Add support for specifying OCR language in Tika
This change ensures that Tika can specify the language configuration for its internal Tesseract OCR Parser. Close #302
- Loading branch information
Showing
11 changed files
with
145 additions
and
1 deletion.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,61 @@ | ||
# frozen_string_literal: true | ||
|
||
require 'fileutils' | ||
require 'pathname' | ||
|
||
module Colore
  # Locates, and creates on first use, the Tika configuration files that set
  # the language of Tika's Tesseract OCR parser.
  module TikaConfig
    # Version tag of the configuration template. It is used as a directory
    # name in the generated file's path.
    VERSION = 'v1'

    # Language code used when the requested language cannot be resolved.
    DEFAULT_LANGUAGE = 'eng'

    # Tika configuration skeleton. The +language_alpha3+ placeholder is
    # filled in through Kernel#format when a file is generated.
    TEMPLATE = <<~XML
      <?xml version="1.0" encoding="UTF-8"?>
      <properties>
        <parsers>
          <parser class="org.apache.tika.parser.DefaultParser"></parser>
          <parser class="org.apache.tika.parser.ocr.TesseractOCRParser">
            <params>
              <param name="language" type="string">%<language_alpha3>s</param>
            </params>
          </parser>
        </parsers>
      </properties>
    XML

    # Gives the location of the Tika configuration file that performs OCR in
    # the requested language. The file is generated if it is not on disk yet.
    #
    # @param [String] language A language code in ISO 639-1 (two-letter) or
    #   ISO 639-2 (three-letter) form, as accepted by
    #   Colore::Utils.language_alpha3.
    #
    # @return [Pathname] Path of the configuration file for that language, or
    #   of the DEFAULT_LANGUAGE one when the lookup yields nothing.
    def self.path_for(language)
      path_for!(Colore::Utils.language_alpha3(language) || DEFAULT_LANGUAGE)
    end

    # Directory holding the Tika configuration files: the configured
    # +tika_config_directory+ expanded relative to this file's directory.
    #
    # @return [Pathname]
    def self.tika_config_path
      configured = Colore::C_.tika_config_directory
      Pathname.new(File.expand_path(configured, __dir__))
    end
    private_class_method :tika_config_path

    # Returns the configuration file for a three-letter language code,
    # rendering TEMPLATE to disk first when no regular file exists there.
    # An existing file is never rewritten.
    #
    # @param [String] language_alpha3 Three-letter language code.
    #
    # @return [Pathname]
    def self.path_for!(language_alpha3)
      directory = tika_config_path.join('ocr', VERSION)
      target = directory.join("tika.#{language_alpha3}.xml")

      unless target.file?
        FileUtils.mkdir_p(directory)
        File.write(target, format(TEMPLATE, language_alpha3: language_alpha3))
      end

      target
    end
    private_class_method :path_for!
  end
end
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file not shown.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
الثعلب البني السريع مفتون بالكلاب الكسولة |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,59 @@ | ||
# frozen_string_literal: true | ||
|
||
require 'spec_helper' | ||
require 'fileutils' | ||
require 'pathname' | ||
|
||
RSpec.describe Colore::TikaConfig do
  # Value returned by the stubbed application configuration.
  let(:configured_directory) { '../tmp/tika-test' }
  # Absolute location where the generated files are expected to appear.
  let(:sandbox) { Pathname.new(File.expand_path('../../tmp/tika-test', __dir__)) }

  # Expected configuration file for a three-letter language code.
  def config_file_for(alpha3)
    sandbox.join('ocr', described_class::VERSION, "tika.#{alpha3}.xml")
  end

  before do
    allow(Colore::C_.config).to receive(:tika_config_directory).and_return(configured_directory)
    # Create then remove: leaves the parent directory in place and the
    # sandbox itself absent before each example.
    FileUtils.mkdir_p(sandbox)
    FileUtils.rm_rf(sandbox)
  end

  after { FileUtils.rm_rf(sandbox) }

  describe '.path_for' do
    subject(:resolved_path) { described_class.path_for(language) }

    context 'when the language is found' do
      let(:language) { 'fr' }

      before do
        allow(Colore::Utils).to receive(:language_alpha3).with('fr').and_return('fra')
      end

      it 'returns the correct configuration file path' do
        expect(resolved_path).to eq config_file_for('fra')
      end
    end

    context 'when the language is not found' do
      let(:language) { 'unknown' }

      it 'returns the default configuration file path' do
        expect(resolved_path).to eq config_file_for(described_class::DEFAULT_LANGUAGE)
      end
    end

    context 'when the configuration file is already present' do
      let(:language) { 'en' }

      before do
        # Spy on the write while still letting it create the file for real.
        allow(File).to receive(:write)
          .with(config_file_for('eng'), an_instance_of(String))
          .and_call_original
      end

      it 'does not overwrite it' do
        described_class.path_for(language)
        described_class.path_for(language)

        expect(File).to have_received(:write).once
      end
    end
  end
end