From a383694fd59801039cab2f803dc52b5d19edc157 Mon Sep 17 00:00:00 2001 From: Geremia Taglialatela Date: Fri, 27 Sep 2024 16:37:28 +0200 Subject: [PATCH] Add Language Detection endpoint Close #247 Test --- .rubocop_todo.yml | 3 +- README.md | 24 +++++++++ config/app.yml | 1 + lib/app.rb | 22 +++++++++ lib/config.rb | 3 ++ lib/heathen.rb | 1 + .../processor_methods/detect_language.rb | 18 +++++++ lib/heathen/task.rb | 4 ++ lib/tika_config.rb | 18 +++++-- .../processor_methods/detect_language_spec.rb | 49 +++++++++++++++++++ spec/integration/standard_tasks_spec.rb | 8 +++ spec/lib/tika_config_spec.rb | 18 +++++++ 12 files changed, 163 insertions(+), 6 deletions(-) create mode 100644 lib/heathen/processor_methods/detect_language.rb create mode 100644 spec/heathen/processor_methods/detect_language_spec.rb diff --git a/.rubocop_todo.yml b/.rubocop_todo.yml index 41bc962..dce6afb 100644 --- a/.rubocop_todo.yml +++ b/.rubocop_todo.yml @@ -87,7 +87,7 @@ Metrics/BlockLength: # Configuration parameters: CountComments, CountAsOne. Metrics/ClassLength: - Max: 173 + Max: 187 # Configuration parameters: AllowedMethods, AllowedPatterns. Metrics/CyclomaticComplexity: @@ -220,6 +220,7 @@ Style/Documentation: - 'lib/errors.rb' - 'lib/heathen/filename.rb' - 'lib/heathen/processor_methods/convert_image.rb' + - 'lib/heathen/processor_methods/detect_language.rb' - 'lib/heathen/processor_methods/htmltotext.rb' - 'lib/heathen/processor_methods/libreoffice.rb' - 'lib/heathen/processor_methods/pdftotext.rb' diff --git a/README.md b/README.md index d47507d..88358b7 100644 --- a/README.md +++ b/README.md @@ -343,6 +343,30 @@ Response: ... PDF document body ... +### Detect language + +This is a foreground document language detection request. The detected language +will be returned as the response body. + + POST /convert + +Params *(suggest using `multipart/form-data`)*: + +* `file` - the file whose language should be detected +* `action` - `detect_language` + +#### Example: + + POST /convert + file=... foo.docx ...
+ action=detect_language + +Response: + + Content-Type: text/plain + + en + ## Callbacks When a document conversion is completed, an attempt will be made to POST a diff --git a/config/app.yml b/config/app.yml index 8ad03fc..66c86e5 100644 --- a/config/app.yml +++ b/config/app.yml @@ -33,3 +33,4 @@ wkhtmltopdf_path: <%= ENV['WKHTMLTOPDF_PATH'] %> # Other settings tika_config_directory: <%= ENV['TIKA_CONFIG_DIRECTORY'] %> wkhtmltopdf_params: '-d 100 --encoding UTF-8' +tesseract_available_languages: <%= ENV['TESSERACT_AVAILABLE_LANGUAGES'] %> diff --git a/lib/app.rb b/lib/app.rb index 99bcd00..f754716 100644 --- a/lib/app.rb +++ b/lib/app.rb @@ -192,6 +192,28 @@ class App < Sinatra::Base respond_with_error e end + # + # Detect document language + # + # POST params: + # file - the file whose language should be detected + post '/detect-language' do + unless params[:file] + return respond 400, "missing file parameter" + end + + unless params[:file].respond_to?(:fetch) && params[:file].fetch(:tempfile, nil).respond_to?(:read) + return respond 400, "invalid file parameter" + end + + body = params[:file][:tempfile].read + content = Converter.new(logger: @logger).convert_file('detect_language', body) + content_type content.mime_type + content + rescue StandardError => e + respond_with_error e + end + # Legacy method to convert files # Brought over from Heathen # diff --git a/lib/config.rb b/lib/config.rb index e94d90b..e3cd166 100644 --- a/lib/config.rb +++ b/lib/config.rb @@ -45,6 +45,8 @@ class C_ attr_accessor :tika_config_directory # @return [String] Params for wkhtmltopdf attr_accessor :wkhtmltopdf_params + # @return [Array] Languages available to Tesseract for OCR.
Defaults to `["eng"]` + attr_accessor :tesseract_available_languages def self.config_file_path Pathname.new File.expand_path('../config/app.yml', __dir__) @@ -70,6 +72,7 @@ def self.config c.tika_config_directory = yaml['tika_config_directory'] || '../tmp/tika' c.wkhtmltopdf_params = yaml['wkhtmltopdf_params'] || '' + c.tesseract_available_languages = (yaml['tesseract_available_languages'] || 'eng').split(',').map(&:strip).reject(&:empty?) c end diff --git a/lib/heathen.rb b/lib/heathen.rb index 14f5054..965e851 100644 --- a/lib/heathen.rb +++ b/lib/heathen.rb @@ -12,6 +12,7 @@ require_relative 'heathen/processor' require_relative 'heathen/processor_methods/convert_image' +require_relative 'heathen/processor_methods/detect_language' require_relative 'heathen/processor_methods/htmltotext' require_relative 'heathen/processor_methods/libreoffice' require_relative 'heathen/processor_methods/pdftotext' diff --git a/lib/heathen/processor_methods/detect_language.rb b/lib/heathen/processor_methods/detect_language.rb new file mode 100644 index 0000000..4d7730f --- /dev/null +++ b/lib/heathen/processor_methods/detect_language.rb @@ -0,0 +1,18 @@ +# frozen_string_literal: true + +module Heathen + class Processor + def detect_language + executioner.execute( + Colore::C_.tika_path, + "--config=#{Colore::TikaConfig.path_for_language_detection}", + '--language', + job.content_file, + binary: true + ) + raise ConversionFailed.new if executioner.last_exit_status != 0 + + job.content = executioner.stdout + end + end +end diff --git a/lib/heathen/task.rb b/lib/heathen/task.rb index 666b493..e421a5f 100644 --- a/lib/heathen/task.rb +++ b/lib/heathen/task.rb @@ -103,3 +103,7 @@ def task_key(action, mime_type) Heathen::Task.register 'doc', '.*' do perform_task 'msoffice' end + +Heathen::Task.register 'detect_language', '.*' do + detect_language +end diff --git a/lib/tika_config.rb b/lib/tika_config.rb index 4232914..10eb244 100644 --- a/lib/tika_config.rb +++ b/lib/tika_config.rb @@ -20,7 +20,7 @@ module TikaConfig
- %s + %s @@ -34,12 +34,12 @@ def tika_config_path Pathname.new File.expand_path(Colore::C_.tika_config_directory, __dir__) end - def path_for!(language_alpha3) - file = tika_config_path.join('ocr', VERSION, "tika.#{language_alpha3}.xml") + def path_for!(alpha3_languages) + file = tika_config_path.join('ocr', VERSION, "tika.#{alpha3_languages.sort.join('-')}.xml") return file if file.file? FileUtils.mkdir_p(tika_config_path.join('ocr', VERSION)) - file.write format(TEMPLATE, language_alpha3: language_alpha3) + file.write format(TEMPLATE, alpha3_languages: alpha3_languages.join('+')) file end end @@ -55,7 +55,15 @@ def path_for!(language_alpha3) def self.path_for(language) language_alpha3 = Colore::Utils.language_alpha3(language) || DEFAULT_LANGUAGE - path_for!(language_alpha3) + path_for!([language_alpha3]) + end + + # Returns the file path of the Tika configuration for performing language + # detection. + # + # @return [Pathname] The path to the Tika configuration file for language detection + def self.path_for_language_detection + path_for!(Colore::C_.tesseract_available_languages) end end end diff --git a/spec/heathen/processor_methods/detect_language_spec.rb b/spec/heathen/processor_methods/detect_language_spec.rb new file mode 100644 index 0000000..e26f8b9 --- /dev/null +++ b/spec/heathen/processor_methods/detect_language_spec.rb @@ -0,0 +1,49 @@ +# frozen_string_literal: true + +require 'spec_helper' + +RSpec.describe Heathen::Processor do + let(:content) { fixture('heathen/quickfox.ar.jpg').read } + let(:job) { Heathen::Job.new 'foo', content } + let(:processor) { described_class.new job: job, logger: spec_logger } + + before do + setup_tika_config + end + + after do + processor.clean_up + delete_tika_config + end + + describe '#detect_language' do + let(:content) { fixture('heathen/quickfox.jpg').read } + + before do + processor.detect_language + end + + it 'detects document language' do + expect(job.content).to eq 'en' + expect(job.content.mime_type).to
eq 'text/plain; charset=us-ascii' + end + + context 'with Arabic documents' do + context 'when Arabic is not available in Tesseract' do + it 'does not detect Arabic' do + expect(job.content).not_to eq 'ar' + end + end + + context 'when Arabic is available in Tesseract' do + before do + allow(Colore::C_.config).to receive(:tesseract_available_languages).and_return(%w[ara eng]) + end + + it 'detects Arabic' do + expect(job.content).to eq 'ar' + end + end + end + end +end diff --git a/spec/integration/standard_tasks_spec.rb b/spec/integration/standard_tasks_spec.rb index 4d317fc..1372e20 100644 --- a/spec/integration/standard_tasks_spec.rb +++ b/spec/integration/standard_tasks_spec.rb @@ -21,6 +21,14 @@ end end + describe 'detect_language' do + it 'runs' do + content = fixture('heathen/quickfox.jpg').read + new_content = converter.convert 'detect_language', content + expect(new_content.mime_type).to eq 'text/plain; charset=us-ascii' + end + end + describe 'ocr_text' do it 'converts jpeg' do content = fixture('heathen/quickfox.jpg').read diff --git a/spec/lib/tika_config_spec.rb b/spec/lib/tika_config_spec.rb index 46ca73b..e7b5b6b 100644 --- a/spec/lib/tika_config_spec.rb +++ b/spec/lib/tika_config_spec.rb @@ -51,4 +51,22 @@ end end end + + describe '.path_for_language_detection' do + subject(:path_for_language_detection) { described_class.path_for_language_detection } + + it 'returns the correct configuration file path' do + expect(path_for_language_detection).to eq tmp_tika_config_dir.join('ocr', described_class::VERSION, 'tika.eng.xml') + end + + context 'when multiple languages are available' do + before do + allow(Colore::C_.config).to receive(:tesseract_available_languages).and_return(%w[fra eng]) + end + + it 'returns the correct configuration file path' do + expect(path_for_language_detection).to eq tmp_tika_config_dir.join('ocr', described_class::VERSION, 'tika.eng-fra.xml') + end + end + end end