Skip to content

Commit

Permalink
Add Language Detection Endpoint
Browse files Browse the repository at this point in the history
- Implement a new endpoint for detecting the language of files.
- Update tests and documentation related to the new endpoint.

Closes #247
  • Loading branch information
tagliala committed Sep 27, 2024
1 parent f4ebc97 commit a92d5e4
Show file tree
Hide file tree
Showing 12 changed files with 170 additions and 6 deletions.
7 changes: 6 additions & 1 deletion .rubocop_todo.yml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

24 changes: 24 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -343,6 +343,30 @@ Response:

... PDF document body ...

### Detect language

This is a foreground document language detection request. The detected language
will be returned as the response body.

POST /convert

Params *(suggest using `multipart/form-data`)*:

* `file` - the file whose language should be detected
* `action` - `detect_language`

#### Example:

POST /convert
file=... foo.docx ...
action=detect_language

Response:

Content-Type: text/plain

en

## Callbacks

When a document conversion is completed, an attempt will be made to POST a
Expand Down
1 change: 1 addition & 0 deletions config/app.yml
Original file line number Diff line number Diff line change
Expand Up @@ -33,3 +33,4 @@ wkhtmltopdf_path: <%= ENV['WKHTMLTOPDF_PATH'] %>
# Other settings
tika_config_directory: <%= ENV['TIKA_CONFIG_DIRECTORY'] %>
wkhtmltopdf_params: '-d 100 --encoding UTF-8'
tesseract_available_languages: <%= ENV['TESSERACT_AVAILABLE_LANGUAGES'] %>
22 changes: 22 additions & 0 deletions lib/app.rb
Original file line number Diff line number Diff line change
Expand Up @@ -192,6 +192,28 @@ class App < Sinatra::Base
respond_with_error e
end

#
# Detect document language
#
# POST params:
#   file - the uploaded file whose language should be detected
#
# Responds with the converter output as the body, using the MIME type the
# converted content reports. Responds with 400 when the file parameter is
# missing or is not an upload exposing a readable :tempfile. Any other
# StandardError is handed to respond_with_error.
post '/detect-language' do
  upload = params[:file]
  return respond 400, "missing file parameter" unless upload

  # Only hash-like uploads carry a :tempfile; anything else is rejected below.
  tempfile = upload.fetch(:tempfile, nil) if upload.respond_to?(:fetch)
  return respond 400, "invalid file parameter" unless tempfile.respond_to?(:read)

  file_content = tempfile.read
  # The action name must match the registered Heathen task ('detect_language',
  # with an underscore); the hyphenated form is only the URL path.
  content = Converter.new(logger: @logger).convert_file('detect_language', file_content)
  content_type content.mime_type
  content
rescue StandardError => e
  respond_with_error e
end

# Legacy method to convert files
# Brought over from Heathen
#
Expand Down
3 changes: 3 additions & 0 deletions lib/config.rb
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,8 @@ class C_
attr_accessor :tika_config_directory
# @return [String] Params for wkhtmltopdf
attr_accessor :wkhtmltopdf_params
# @return [Array<String>] Languages available to Tesseract for OCR. Defaults to `["eng"]`
attr_accessor :tesseract_available_languages

def self.config_file_path
Pathname.new File.expand_path('../config/app.yml', __dir__)
Expand All @@ -70,6 +72,7 @@ def self.config

c.tika_config_directory = yaml['tika_config_directory'] || '../tmp/tika'
c.wkhtmltopdf_params = yaml['wkhtmltopdf_params'] || ''
c.tesseract_available_languages = (yaml['tesseract_available_languages'] || 'eng').split(',')

c
end
Expand Down
1 change: 1 addition & 0 deletions lib/heathen.rb
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
require_relative 'heathen/processor'

require_relative 'heathen/processor_methods/convert_image'
require_relative 'heathen/processor_methods/detect_language'
require_relative 'heathen/processor_methods/htmltotext'
require_relative 'heathen/processor_methods/libreoffice'
require_relative 'heathen/processor_methods/pdftotext'
Expand Down
18 changes: 18 additions & 0 deletions lib/heathen/processor_methods/detect_language.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
# frozen_string_literal: true

module Heathen
  class Processor
    # Runs the Tika executable with `--language` against the job's content
    # file, using the Tika configuration built for language detection, and
    # stores the command's standard output as the new job content.
    #
    # @raise [ConversionFailed] when the command exits with a non-zero status
    def detect_language
      tika_executable = Colore::C_.tika_path
      config_path = Colore::TikaConfig.path_for_language_detection

      executioner.execute(
        tika_executable,
        "--config=#{config_path}",
        '--language',
        job.content_file,
        binary: true
      )

      exit_status = executioner.last_exit_status
      raise ConversionFailed unless exit_status == 0

      job.content = executioner.stdout
    end
  end
end
4 changes: 4 additions & 0 deletions lib/heathen/task.rb
Original file line number Diff line number Diff line change
Expand Up @@ -103,3 +103,7 @@ def task_key(action, mime_type)
# Registers the 'doc' action with the '.*' pattern; its handler runs the
# 'msoffice' task.
Heathen::Task.register 'doc', '.*' do
perform_task 'msoffice'
end

# Registers the 'detect_language' action with the '.*' pattern; its handler
# calls detect_language.
# NOTE(review): the second argument looks like a MIME type pattern, so '.*'
# would accept any input type - confirm against Task.register.
Heathen::Task.register 'detect_language', '.*' do
detect_language
end
18 changes: 13 additions & 5 deletions lib/tika_config.rb
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ module TikaConfig
<parser class="org.apache.tika.parser.DefaultParser"></parser>
<parser class="org.apache.tika.parser.ocr.TesseractOCRParser">
<params>
<param name="language" type="string">%<language_alpha3>s</param>
<param name="language" type="string">%<alpha3_languages>s</param>
</params>
</parser>
</parsers>
Expand All @@ -34,12 +34,12 @@ def tika_config_path
Pathname.new File.expand_path(Colore::C_.tika_config_directory, __dir__)
end

def path_for!(language_alpha3)
file = tika_config_path.join('ocr', VERSION, "tika.#{language_alpha3}.xml")
def path_for!(alpha3_languages)
file = tika_config_path.join('ocr', VERSION, "tika.#{alpha3_languages.sort.join('-')}.xml")
return file if file.file?

FileUtils.mkdir_p(tika_config_path.join('ocr', VERSION))
file.write format(TEMPLATE, language_alpha3: language_alpha3)
file.write format(TEMPLATE, alpha3_languages: alpha3_languages.join('+'))
file
end
end
Expand All @@ -55,7 +55,15 @@ def path_for!(language_alpha3)
def self.path_for(language)
language_alpha3 = Colore::Utils.language_alpha3(language) || DEFAULT_LANGUAGE

path_for!(language_alpha3)
path_for!([language_alpha3])
end

# Resolves the Tika configuration file used for language detection by
# passing the configured Tesseract languages
# (Colore::C_.tesseract_available_languages) to path_for!.
#
# @return [Pathname] The path to the Tika configuration file for language detection
def self.path_for_language_detection
  available_languages = Colore::C_.tesseract_available_languages
  path_for!(available_languages)
end
end
end
52 changes: 52 additions & 0 deletions spec/heathen/processor_methods/detect_language_spec.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
# frozen_string_literal: true

require 'spec_helper'

# Exercises Heathen::Processor#detect_language against image fixtures.
RSpec.describe Heathen::Processor do
# NOTE(review): this default fixture is overridden by the `let(:content)` in
# the `#detect_language` group below, the only example group in this spec.
let(:content) { fixture('heathen/quickfox.ar.jpg').read }
let(:job) { Heathen::Job.new 'foo', content }
let(:processor) { described_class.new job: job, logger: spec_logger }

before do
setup_tika_config
end

after do
processor.clean_up
delete_tika_config
end

describe '#detect_language' do
let(:content) { fixture('heathen/quickfox.jpg').read }
let(:tesseract_available_languages) { %w[eng] }

# Stub the configured languages, then run the detection, so that nested
# contexts change the outcome only by overriding the `let` values above.
before do
allow(Colore::C_.config).to receive(:tesseract_available_languages).and_return(tesseract_available_languages)

processor.detect_language
end

it 'detects document language' do
expect(job.content).to eq 'en'
expect(job.content.mime_type).to eq 'text/plain; charset=us-ascii'
end

context 'with Arabic documents' do
let(:content) { fixture('heathen/quickfox.ar.jpg').read }

# Uses the default language list (`eng` only) defined above.
context 'when Arabic is not available in Tesseract' do
it 'does not detect Arabic' do
expect(job.content).not_to eq 'ar'
end
end

context 'when Arabic is available in Tesseract' do
let(:tesseract_available_languages) { %w[eng ara] }

it 'detects Arabic' do
expect(job.content).to eq 'ar'
end
end
end
end
end
8 changes: 8 additions & 0 deletions spec/integration/standard_tasks_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,14 @@
end
end

# Smoke test: runs the 'detect_language' action on a JPEG fixture and checks
# only the MIME type of the result, not the detected language itself.
describe 'detect_language' do
it 'runs' do
content = fixture('heathen/quickfox.jpg').read
new_content = converter.convert 'detect_language', content
expect(new_content.mime_type).to eq 'text/plain; charset=us-ascii'
end
end

describe 'ocr_text' do
it 'converts jpeg' do
content = fixture('heathen/quickfox.jpg').read
Expand Down
18 changes: 18 additions & 0 deletions spec/lib/tika_config_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -51,4 +51,22 @@
end
end
end

# Checks the configuration file name derived from the configured Tesseract
# languages.
describe '.path_for_language_detection' do
subject(:path_for_language_detection) { described_class.path_for_language_detection }

# NOTE(review): expects 'tika.eng.xml', presumably because the spec setup
# leaves the language list at its `eng` default - confirm.
it 'returns the correct configuration file path' do
expect(path_for_language_detection).to eq tmp_tika_config_dir.join('ocr', described_class::VERSION, 'tika.eng.xml')
end

context 'when multiple languages are available' do
before do
# Languages are given in non-alphabetical order; the expected file name
# below lists the codes sorted and joined with '-'.
allow(Colore::C_.config).to receive(:tesseract_available_languages).and_return(%w[fra eng])
end

it 'returns the correct configuration file path' do
expect(path_for_language_detection).to eq tmp_tika_config_dir.join('ocr', described_class::VERSION, 'tika.eng-fra.xml')
end
end
end
end

0 comments on commit a92d5e4

Please sign in to comment.