Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Specify language for text extraction #314

Merged
merged 1 commit into from
Sep 27, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions config/app.yml
Original file line number Diff line number Diff line change
Expand Up @@ -29,4 +29,7 @@ libreoffice_path: <%= ENV['LIBREOFFICE_PATH'] %>
tesseract_path: <%= ENV['TESSERACT_PATH'] %>
tika_path: <%= ENV['TIKA_PATH'] %>
wkhtmltopdf_path: <%= ENV['WKHTMLTOPDF_PATH'] %>

# Other settings
tika_config_directory: <%= ENV['TIKA_CONFIG_DIRECTORY'] %>
wkhtmltopdf_params: '-d 100 --encoding UTF-8'
1 change: 1 addition & 0 deletions lib/colore.rb
Original file line number Diff line number Diff line change
Expand Up @@ -9,3 +9,4 @@
require_relative 'document'
require_relative 'heathen'
require_relative 'sidekiq_workers'
require_relative 'tika_config'
4 changes: 4 additions & 0 deletions lib/config.rb
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,8 @@ class C_
attr_accessor :tika_path
# @return [String] Path to the wkhtmltopdf binary. Defaults to `"wkhtmltopdf"`
attr_accessor :wkhtmltopdf_path
# @return [String] Path to the writable Tika config directory; a relative value is resolved against the `lib` directory. Defaults to `"../tmp/tika"`
attr_accessor :tika_config_directory
# @return [String] Params for wkhtmltopdf
attr_accessor :wkhtmltopdf_params

Expand All @@ -65,6 +67,8 @@ def self.config
c.tesseract_path = yaml['tesseract_path'] || 'tesseract'
c.tika_path = yaml['tika_path'] || 'tika'
c.wkhtmltopdf_path = yaml['wkhtmltopdf_path'] || 'wkhtmltopdf'

c.tika_config_directory = yaml['tika_config_directory'] || '../tmp/tika'
c.wkhtmltopdf_params = yaml['wkhtmltopdf_params'] || ''

c
Expand Down
1 change: 1 addition & 0 deletions lib/heathen/processor_methods/libreoffice.rb
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,7 @@ def libreoffice(format:)
if to_suffix == 'txt'
executioner.execute(
Colore::C_.tika_path,
"--config=#{Colore::TikaConfig.path_for(job.language)}",
'--text',
job.content_file,
binary: true
Expand Down
1 change: 1 addition & 0 deletions lib/heathen/processor_methods/pdftotext.rb
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ def pdftotext

executioner.execute(
Colore::C_.tika_path,
"--config=#{Colore::TikaConfig.path_for(job.language)}",
'--text',
job.content_file,
binary: true
Expand Down
61 changes: 61 additions & 0 deletions lib/tika_config.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
# frozen_string_literal: true

require 'fileutils'
require 'pathname'

module Colore
  # Locates, and generates on first use, the Tika configuration files that
  # select the OCR language for text extraction.
  module TikaConfig
    # Version of the configuration template; it is part of the generated path.
    VERSION = 'v1'

    # Three-letter language code used when the requested language cannot be resolved.
    DEFAULT_LANGUAGE = 'eng'

    # Tika configuration template. It declares the default parser and the
    # Tesseract OCR parser; the `language_alpha3` reference is filled in with
    # Kernel#format when a file is generated.
    TEMPLATE = <<~XML
      <?xml version="1.0" encoding="UTF-8"?>
      <properties>
      <parsers>
      <parser class="org.apache.tika.parser.DefaultParser"></parser>
      <parser class="org.apache.tika.parser.ocr.TesseractOCRParser">
      <params>
      <param name="language" type="string">%<language_alpha3>s</param>
      </params>
      </parser>
      </parsers>
      </properties>
    XML

    class << self
      # Returns the path of the Tika configuration file that performs OCR in
      # the given language, generating the file if it does not exist yet.
      #
      # @param [String] language The language code, as accepted by
      #   Colore::Utils.language_alpha3.
      #
      # @return [Pathname] The configuration file for the resolved language, or
      #   the one for DEFAULT_LANGUAGE when the language cannot be resolved.
      def path_for(language)
        path_for!(Colore::Utils.language_alpha3(language) || DEFAULT_LANGUAGE)
      end

      private

      # Root directory for generated configuration files. The configured value
      # is expanded against this file's directory, so an absolute setting is
      # used as-is.
      def tika_config_path
        Pathname.new(File.expand_path(Colore::C_.tika_config_directory, __dir__))
      end

      # Returns the configuration file for a three-letter language code,
      # writing it from TEMPLATE only when it is not already on disk.
      #
      # NOTE(review): the file is written in place, so a concurrent caller may
      # see it before its content is complete - confirm whether workers can
      # race here.
      def path_for!(language_alpha3)
        directory = tika_config_path.join('ocr', VERSION)
        config_file = directory.join("tika.#{language_alpha3}.xml")

        unless config_file.file?
          FileUtils.mkdir_p(directory)
          File.write(config_file, format(TEMPLATE, language_alpha3: language_alpha3))
        end

        config_file
      end
    end
  end
end
Binary file added spec/fixtures/heathen/quickfox.ar.jpg
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added spec/fixtures/heathen/quickfox.ar.pdf
Binary file not shown.
1 change: 1 addition & 0 deletions spec/fixtures/heathen/quickfox.ar.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
الثعلب البني السريع مفتون بالكلاب الكسولة
15 changes: 14 additions & 1 deletion spec/heathen/processor_methods/pdftotext_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,8 @@

RSpec.describe Heathen::Processor do
let(:content) { fixture('heathen/quickfox.pdf').read }
let(:job) { Heathen::Job.new 'foo', content, 'en' }
let(:job) { Heathen::Job.new 'foo', content, language }
let(:language) { 'en' }
let(:processor) { described_class.new job: job, logger: spec_logger }

after do
Expand All @@ -14,7 +15,19 @@
describe '#pdftotext' do
it 'converts PDF to TXT' do
processor.pdftotext
expect(job.content).to eq 'The quick brown fox jumps lazily over the dog'
expect(job.content.mime_type).to eq 'text/plain; charset=us-ascii'
end

context 'with Arabic files' do
let(:content) { fixture('heathen/quickfox.ar.pdf').read }
let(:language) { 'ar' }

it 'extracts Arabic text from images' do
processor.pdftotext
expect(job.content).to eq fixture('heathen/quickfox.ar.txt').read.strip.force_encoding(Encoding::ASCII_8BIT)
expect(job.content.mime_type).to eq 'text/plain; charset=utf-8'
end
end
end
end
59 changes: 59 additions & 0 deletions spec/lib/tika_config_spec.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
# frozen_string_literal: true

require 'spec_helper'
require 'fileutils'
require 'pathname'

RSpec.describe Colore::TikaConfig do
  # Directory setting returned by the stubbed config, and the absolute
  # location it resolves to from the lib directory.
  let(:tika_config_directory) { '../tmp/tika-test' }
  let(:tika_test_config_path) { Pathname.new(File.expand_path('../../tmp/tika-test', __dir__)) }

  # Expected location of the generated config file for a three-letter code.
  def config_file_for(alpha3)
    tika_test_config_path.join('ocr', described_class::VERSION, "tika.#{alpha3}.xml")
  end

  before do
    allow(Colore::C_.config).to receive(:tika_config_directory).and_return tika_config_directory
    # Start every example without a test config directory on disk.
    FileUtils.mkdir_p tika_test_config_path
    FileUtils.rm_rf tika_test_config_path
  end

  after do
    FileUtils.rm_rf tika_test_config_path
  end

  describe '.path_for' do
    subject(:path_for) { described_class.path_for(language) }

    context 'when the language is found' do
      let(:language) { 'fr' }

      before do
        allow(Colore::Utils).to receive(:language_alpha3).with('fr').and_return('fra')
      end

      it 'returns the correct configuration file path' do
        expect(path_for).to eq config_file_for('fra')
      end
    end

    context 'when the language is not found' do
      let(:language) { 'unknown' }

      it 'returns the default configuration file path' do
        expect(path_for).to eq config_file_for(described_class::DEFAULT_LANGUAGE)
      end
    end

    context 'when the configuration file is already present' do
      let(:language) { 'en' }

      before do
        # Spy on the write while still letting it create the file for real.
        allow(File).to receive(:write)
          .with(config_file_for('eng'), an_instance_of(String))
          .and_call_original
      end

      it 'does not overwrite it' do
        described_class.path_for(language)
        described_class.path_for(language)

        expect(File).to have_received(:write).once
      end
    end
  end
end