From da9fac62addc7676c37101a7c8a6b70f3dcb8fba Mon Sep 17 00:00:00 2001 From: Michael Roudnitski Date: Thu, 26 Sep 2024 18:15:02 -0400 Subject: [PATCH 1/7] Add watsonx_translator --- config/locales/en.yml | 9 ++ lib/i18n/tasks/configuration.rb | 3 + lib/i18n/tasks/translation.rb | 3 + .../tasks/translators/watsonx_translator.rb | 140 ++++++++++++++++++ 4 files changed, 155 insertions(+) create mode 100644 lib/i18n/tasks/translators/watsonx_translator.rb diff --git a/config/locales/en.yml b/config/locales/en.yml index 2e72475a..a464e5bc 100644 --- a/config/locales/en.yml +++ b/config/locales/en.yml @@ -130,6 +130,15 @@ en: none: Every translation is in use. usages: none: No key usages found. + watsonx_translate: + errors: + no_api_key: >- + Set watsonx API key via WATSONX_API_KEY environment variable or translation.watsonx_api_key + in config/i18n-tasks.yml. Get the key at https://www.ibm.com/products/watsonx-ai. + no_project_id: >- + Set watsonx Project ID via WATSONX_PROJECT_ID environment variable or translation.watsonx_project_id + in config/i18n-tasks.yml. Get the project ID at https://www.ibm.com/products/watsonx-ai. + no_results: watsonx returned no results. 
yandex_translate: errors: no_api_key: >- diff --git a/lib/i18n/tasks/configuration.rb b/lib/i18n/tasks/configuration.rb index 1bc3c4a4..963f10aa 100644 --- a/lib/i18n/tasks/configuration.rb +++ b/lib/i18n/tasks/configuration.rb @@ -70,6 +70,9 @@ def translation_config # rubocop:disable Metrics/AbcSize conf[:deepl_version] = ENV['DEEPL_VERSION'] if ENV.key?('DEEPL_VERSION') conf[:openai_api_key] = ENV['OPENAI_API_KEY'] if ENV.key?('OPENAI_API_KEY') conf[:openai_model] = ENV['OPENAI_MODEL'] if ENV.key?('OPENAI_MODEL') + conf[:watsonx_api_key] = ENV['WATSONX_API_KEY'] if ENV.key?('WATSONX_API_KEY') + conf[:watsonx_project_id] = ENV['WATSONX_PROJECT_ID'] if ENV.key?('WATSONX_PROJECT_ID') + conf[:watsonx_model] = ENV['WATSONX_MODEL'] if ENV.key?('WATSONX_MODEL') conf[:yandex_api_key] = ENV['YANDEX_API_KEY'] if ENV.key?('YANDEX_API_KEY') conf end diff --git a/lib/i18n/tasks/translation.rb b/lib/i18n/tasks/translation.rb index b73bfcba..99c00abb 100644 --- a/lib/i18n/tasks/translation.rb +++ b/lib/i18n/tasks/translation.rb @@ -3,6 +3,7 @@ require 'i18n/tasks/translators/deepl_translator' require 'i18n/tasks/translators/google_translator' require 'i18n/tasks/translators/openai_translator' +require 'i18n/tasks/translators/watsonx_translator' require 'i18n/tasks/translators/yandex_translator' module I18n::Tasks @@ -19,6 +20,8 @@ def translate_forest(forest, from:, backend:) Translators::GoogleTranslator.new(self).translate_forest(forest, from) when :openai Translators::OpenAiTranslator.new(self).translate_forest(forest, from) + when :watsonx + Translators::WatsonxTranslator.new(self).translate_forest(forest, from) when :yandex Translators::YandexTranslator.new(self).translate_forest(forest, from) else diff --git a/lib/i18n/tasks/translators/watsonx_translator.rb b/lib/i18n/tasks/translators/watsonx_translator.rb new file mode 100644 index 00000000..26c37103 --- /dev/null +++ b/lib/i18n/tasks/translators/watsonx_translator.rb @@ -0,0 +1,140 @@ +# frozen_string_literal: true + 
+require 'i18n/tasks/translators/base_translator' +require 'active_support/core_ext/string/filters' + +module I18n::Tasks::Translators + class WatsonxTranslator < BaseTranslator + # max allowed texts per request + BATCH_SIZE = 50 + DEFAULT_SYSTEM_PROMPT = <<~PROMPT.squish + <|eot_id|><|start_header_id|>user<|end_header_id|> + You are a professional translator that translates content from the %{from} locale + to the %{to} locale in an i18n locale array. + + The array has a structured format and contains multiple strings. Your task is to translate + each of these strings and create a new array with the translated strings. + + HTML markups (enclosed in < and > characters) must not be changed under any circumstance. + Variables (starting with %%{ and ending with }) must not be changed under any circumstance. + + Keep in mind the context of all the strings for a more accurate translation. + PROMPT + + def options_for_translate_values(from:, to:, **options) + options.merge( + from: from, + to: to + ) + end + + def options_for_html + {} + end + + def options_for_plain + {} + end + + def no_results_error_message + I18n.t('i18n_tasks.watsonx_translate.errors.no_results') + end + + private + + def translator + @translator ||= WatsonxClient.new(project_id: project_id, key: api_key) + end + + def api_key + @api_key ||= begin + key = @i18n_tasks.translation_config[:watsonx_api_key] + fail ::I18n::Tasks::CommandError, I18n.t('i18n_tasks.watsonx_translate.errors.no_api_key') if key.blank? + + key + end + end + + def project_id + @project_id ||= begin + key = @i18n_tasks.translation_config[:watsonx_project_id] + fail ::I18n::Tasks::CommandError, I18n.t('i18n_tasks.watsonx_translate.errors.no_project_id') if key.blank? 
+ + project_id + end + end + + def model + @model ||= @i18n_tasks.translation_config[:watsonx_model].presence || 'meta-llama/llama-3-1-70b-instruct' + end + + def system_prompt + @system_prompt ||= @i18n_tasks.translation_config[:watsonx_system_prompt].presence || DEFAULT_SYSTEM_PROMPT + end + + def translate_values(list, from:, to:) + results = [] + + list.each_slice(BATCH_SIZE) do |batch| + translations = translate(batch, from, to) + + results << JSON.parse(translations) + end + + results.flatten + end + + def translate(values, from, to) + prompt = [ + format(system_prompt, from: from, to: to), + "<|eot_id|><|start_header_id|>user<|end_header_id|>Translate this array: \n\n\n", + "<|eot_id|><|start_header_id|>user<|end_header_id|>#{values.to_json}", + "<|eot_id|><|start_header_id|>assistant<|end_header_id|>" + ].concat + + response = translator.generate_text( + { + model: model, + project_id: project_id, + input: prompt + } + ) + + response.dig('results', 0, 'generated_text') + end + end +end + +class WatsonxClient + WATSONX_BASE_URL = "https://us-south.ml.cloud.ibm.com/ml/" + IBM_CLOUD_IAM_URL = "https://iam.cloud.ibm.com/identity/token" + + def initialize(project_id:, key:) + @project_id = project_id + + @http = Faraday.new(url: WATSONX_BASE_URL) do |conn| + conn.use Faraday::Response::RaiseError + conn.request :json + conn.response :json + conn.options.timeout = 600 + conn.request :authorization, :Bearer, token(key) + end + end + + def generate_text(**opts) + @http.post("v1/text/generation?version=2024-05-20", opts).body + end + + private + + def token(key) + Faraday.new(url: IBM_CLOUD_IAM_URL) do |conn| + conn.use Faraday::Response::RaiseError + conn.response :json + conn.params = { + grant_type: "urn:ibm:params:oauth:grant-type:apikey", + apikey: key + } + end.post.body["access_token"] + end +end From c61e0edc0149f45663a3ca890e6dbcd4baec7bc5 Mon Sep 17 00:00:00 2001 From: Michael Roudnitski Date: Thu, 26 Sep 2024 21:40:32 -0400 Subject: [PATCH 2/7] Improved 
watsonx_translator --- .../tasks/translators/watsonx_translator.rb | 57 +++++++++++-------- 1 file changed, 33 insertions(+), 24 deletions(-) diff --git a/lib/i18n/tasks/translators/watsonx_translator.rb b/lib/i18n/tasks/translators/watsonx_translator.rb index 26c37103..aa949052 100644 --- a/lib/i18n/tasks/translators/watsonx_translator.rb +++ b/lib/i18n/tasks/translators/watsonx_translator.rb @@ -8,17 +8,20 @@ class WatsonxTranslator < BaseTranslator # max allowed texts per request BATCH_SIZE = 50 DEFAULT_SYSTEM_PROMPT = <<~PROMPT.squish - <|eot_id|><|start_header_id|>user<|end_header_id|> - You are a professional translator that translates content from the %{from} locale + You are a helpful assistant that translates content from the %{from} locale to the %{to} locale in an i18n locale array. + You always preserve the structure and formatting exactly as it is. The array has a structured format and contains multiple strings. Your task is to translate each of these strings and create a new array with the translated strings. - HTML markups (enclosed in < and > characters) must not be changed under any circumstance. - Variables (starting with %%{ and ending with }) must not be changed under any circumstance. - - Keep in mind the context of all the strings for a more accurate translation. + Reminder: + - Translate only the text, preserving the structure and formatting. + - Do not translate any URLs. + - Do not translate HTML tags like `
` and ``. + - HTML markups (enclosed in < and > characters) must not be changed under any circumstance. + - Variables (starting with %%{ and ending with }) must not be changed under any circumstance. + - Output only the result, without any additional information or comments. PROMPT def options_for_translate_values(from:, to:, **options) @@ -43,7 +46,7 @@ def no_results_error_message private def translator - @translator ||= WatsonxClient.new(project_id: project_id, key: api_key) + @translator ||= WatsonxClient.new(key: api_key) end def api_key @@ -57,8 +60,11 @@ def api_key def project_id @project_id ||= begin - key = @i18n_tasks.translation_config[:watsonx_project_id] - fail ::I18n::Tasks::CommandError, I18n.t('i18n_tasks.watsonx_translate.errors.no_project_id') if key.blank? + project_id = @i18n_tasks.translation_config[:watsonx_project_id] + if project_id.blank? + fail ::I18n::Tasks::CommandError, + I18n.t('i18n_tasks.watsonx_translate.errors.no_project_id') + end project_id end @@ -86,32 +92,35 @@ def translate_values(list, from:, to:) def translate(values, from, to) prompt = [ + '<|eot_id|><|start_header_id|>system<|end_header_id|>', format(system_prompt, from: from, to: to), - "<|eot_id|><|start_header_id|>user<|end_header_id|>Translate this array: \n\n\n", + '<|eot_id|><|start_header_id|>user<|end_header_id|>Translate this array:', "<|eot_id|><|start_header_id|>user<|end_header_id|>#{values.to_json}", - "<|eot_id|><|start_header_id|>assistant<|end_header_id|>" - ].concat + '<|eot_id|><|start_header_id|>assistant<|end_header_id|>' + ].join response = translator.generate_text( - { - model: model, - project_id: project_id, - input: prompt + model_id: model, + project_id: project_id, + input: prompt, + parameters: { + decoding_method: :greedy, + max_new_tokens: 1000, + repetition_penalty: 1 } ) - response.dig('results', 0, 'generated_text') end end end class WatsonxClient - WATSONX_BASE_URL = "https://us-south.ml.cloud.ibm.com/ml/" - IBM_CLOUD_IAM_URL = 
"https://iam.cloud.ibm.com/identity/token" + require 'faraday' - def initialize(project_id:, key:) - @project_id = project_id + WATSONX_BASE_URL = 'https://us-south.ml.cloud.ibm.com/ml/' + IBM_CLOUD_IAM_URL = 'https://iam.cloud.ibm.com/identity/token' + def initialize(key:) @http = Faraday.new(url: WATSONX_BASE_URL) do |conn| conn.use Faraday::Response::RaiseError conn.request :json @@ -122,7 +131,7 @@ def initialize(project_id:, key:) end def generate_text(**opts) - @http.post("v1/text/generation?version=2024-05-20", opts).body + @http.post('v1/text/generation?version=2023-05-29', **opts).body end private @@ -132,9 +141,9 @@ def token(key) conn.use Faraday::Response::RaiseError conn.response :json conn.params = { - grant_type: "urn:ibm:params:oauth:grant-type:apikey", + grant_type: 'urn:ibm:params:oauth:grant-type:apikey', apikey: key } - end.post.body["access_token"] + end.post.body['access_token'] end end From 2beb91752e4e5bcf4970db2c54242173566ee646 Mon Sep 17 00:00:00 2001 From: Michael Roudnitski Date: Thu, 26 Sep 2024 21:45:26 -0400 Subject: [PATCH 3/7] Update readme --- README.md | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/README.md b/README.md index 99bf2ae3..cae18e7e 100644 --- a/README.md +++ b/README.md @@ -100,6 +100,7 @@ Available backends: - `deepl` - [DeepL Pro](#deepl-translation-config) - `yandex` - [Yandex Translate](#yandex-translation-config) - `openai` - [OpenAI](#openai-translation-config) +- `watsonx` - [watsonx](#watsonx-translation-config) ### Find usages @@ -483,6 +484,28 @@ OPENAI_API_KEY= OPENAI_MODEL= ``` + +### watsonx Translate + +`i18n-tasks translate-missing` requires a watsonx project and api key, get it at [IBM watsonx](https://www.ibm.com/watsonx/). 
+ +```yaml +# config/i18n-tasks.yml +translation: + backend: watsonx + watsonx_api_key: + watsonx_project_id: + watsonx_model: +``` + +or via environment variable: + +```bash +WATSONX_API_KEY= +WATSONX_PROJECT_ID= +WATSONX_MODEL= +``` + ### Contextual Rails Parser There is an experimental feature to parse Rails with more context. `i18n-tasks` will support: From 6883365e6af4e5162fadbeb909e9d714cf19c467 Mon Sep 17 00:00:00 2001 From: Michael Roudnitski Date: Thu, 26 Sep 2024 21:48:31 -0400 Subject: [PATCH 4/7] Token limit --- lib/i18n/tasks/translators/watsonx_translator.rb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/i18n/tasks/translators/watsonx_translator.rb b/lib/i18n/tasks/translators/watsonx_translator.rb index aa949052..3fc7dcdc 100644 --- a/lib/i18n/tasks/translators/watsonx_translator.rb +++ b/lib/i18n/tasks/translators/watsonx_translator.rb @@ -105,7 +105,7 @@ def translate(values, from, to) input: prompt, parameters: { decoding_method: :greedy, - max_new_tokens: 1000, + max_new_tokens: 2048, repetition_penalty: 1 } ) From 94da605ddb3773bd88537d799fa33aef6d969944 Mon Sep 17 00:00:00 2001 From: Michael Roudnitski Date: Fri, 27 Sep 2024 09:51:07 -0400 Subject: [PATCH 5/7] Fix tests --- config/locales/ru.yml | 9 +++++++++ lib/i18n/tasks/command/options/locales.rb | 2 +- lib/i18n/tasks/translators/watsonx_translator.rb | 10 +++++++--- 3 files changed, 17 insertions(+), 4 deletions(-) diff --git a/config/locales/ru.yml b/config/locales/ru.yml index c6e7e4b2..e928191d 100644 --- a/config/locales/ru.yml +++ b/config/locales/ru.yml @@ -131,6 +131,15 @@ ru: none: Все переводы используются. usages: none: Не найдено использований. + watsonx_translate: + errors: + no_api_key: >- + Установите ключ API watsonx через переменную среды WATSONX_API_KEY или translation.watsonx_api_key + в config/i18n-tasks.yml. Получите ключ на https://www.ibm.com/products/watsonx-ai. 
+ no_project_id: >- + Установите идентификатор проекта watsonx через переменную среды WATSONX_PROJECT_ID или translation.watsonx_project_id + в config/i18n-tasks.yml. Получите идентификатор проекта на https://www.ibm.com/products/watsonx-ai. + no_results: watsonx не вернул результатов. yandex_translate: errors: no_api_key: |- diff --git a/lib/i18n/tasks/command/options/locales.rb b/lib/i18n/tasks/command/options/locales.rb index 1d06127e..aadf5877 100644 --- a/lib/i18n/tasks/command/options/locales.rb +++ b/lib/i18n/tasks/command/options/locales.rb @@ -32,7 +32,7 @@ module Locales parser: OptionParsers::Locale::Parser, default: 'base' - TRANSLATION_BACKENDS = %w[google deepl yandex openai].freeze + TRANSLATION_BACKENDS = %w[google deepl yandex openai watsonx].freeze arg :translation_backend, '-b', '--backend BACKEND', diff --git a/lib/i18n/tasks/translators/watsonx_translator.rb b/lib/i18n/tasks/translators/watsonx_translator.rb index 3fc7dcdc..3b93e2e2 100644 --- a/lib/i18n/tasks/translators/watsonx_translator.rb +++ b/lib/i18n/tasks/translators/watsonx_translator.rb @@ -115,12 +115,16 @@ def translate(values, from, to) end class WatsonxClient - require 'faraday' - WATSONX_BASE_URL = 'https://us-south.ml.cloud.ibm.com/ml/' IBM_CLOUD_IAM_URL = 'https://iam.cloud.ibm.com/identity/token' def initialize(key:) + begin + require 'faraday' + rescue LoadError + raise ::I18n::Tasks::CommandError, "Add gem 'faraday' to your Gemfile to use this command" + end + @http = Faraday.new(url: WATSONX_BASE_URL) do |conn| conn.use Faraday::Response::RaiseError conn.request :json @@ -131,7 +135,7 @@ def initialize(key:) end def generate_text(**opts) - @http.post('v1/text/generation?version=2023-05-29', **opts).body + @http.post('v1/text/generation?version=2024-05-20', **opts).body end private From 1a1ae193aeb3a67357adaa5618d954e60308ee0d Mon Sep 17 00:00:00 2001 From: Michael Roudnitski Date: Fri, 27 Sep 2024 09:56:57 -0400 Subject: [PATCH 6/7] Use latest llama --- 
lib/i18n/tasks/translators/watsonx_translator.rb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/i18n/tasks/translators/watsonx_translator.rb b/lib/i18n/tasks/translators/watsonx_translator.rb index 3b93e2e2..8ef01bcf 100644 --- a/lib/i18n/tasks/translators/watsonx_translator.rb +++ b/lib/i18n/tasks/translators/watsonx_translator.rb @@ -71,7 +71,7 @@ def project_id end def model - @model ||= @i18n_tasks.translation_config[:watsonx_model].presence || 'meta-llama/llama-3-1-70b-instruct' + @model ||= @i18n_tasks.translation_config[:watsonx_model].presence || 'meta-llama/llama-3-2-90b-vision-instruct' end def system_prompt From 0879d4a25d5361059db35fd6c682d206380b74a9 Mon Sep 17 00:00:00 2001 From: Michael Roudnitski Date: Sat, 28 Sep 2024 12:03:08 -0400 Subject: [PATCH 7/7] Disabled rubocop warnings --- lib/i18n/tasks/configuration.rb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/i18n/tasks/configuration.rb b/lib/i18n/tasks/configuration.rb index 963f10aa..938bb1ab 100644 --- a/lib/i18n/tasks/configuration.rb +++ b/lib/i18n/tasks/configuration.rb @@ -60,7 +60,7 @@ def data_config # translation config # @return [Hash{String => String,Hash,Array}] - def translation_config # rubocop:disable Metrics/AbcSize + def translation_config # rubocop:disable Metrics/AbcSize, Metrics/CyclomaticComplexity, Metrics/PerceivedComplexity @config_sections[:translation] ||= begin conf = (config[:translation] || {}).with_indifferent_access conf[:backend] ||= DEFAULTS[:translation_backend]