From 2329a8c6583c79f11338c548128a929edc1c638d Mon Sep 17 00:00:00 2001 From: Michael Polidori Date: Tue, 28 Nov 2023 10:43:54 -0500 Subject: [PATCH] Switch from IBM to Google Translate; Add ignore list functionality --- README.md | 38 ++++++--- ckanext/translate/logic/action.py | 126 ++++++++++++++++++++++++------ requirements.txt | 1 + 3 files changed, 131 insertions(+), 34 deletions(-) diff --git a/README.md b/README.md index 9bba5d6..86a4efc 100644 --- a/README.md +++ b/README.md @@ -1,13 +1,16 @@ [![CKAN](https://img.shields.io/badge/ckan-2.7-orange.svg?style=flat-square)](https://github.com/ckan/ckan/tree/2.7) [![CKAN](https://img.shields.io/badge/ckan-2.8-orange.svg?style=flat-square)](https://github.com/ckan/ckan/tree/2.8) [![CKAN](https://img.shields.io/badge/ckan-2.9-orange.svg?style=flat-square)](https://github.com/ckan/ckan/tree/2.9) - Note: This plugin is tested with CKAN 2.8 or later version but hasn't been tested on earlier version. + Note: This plugin has been tested with CKAN 2.8+. It hasn't been tested on earlier versions. # ckanext-translate -This extension provides an REST API that translates the provided text into the given languages. At the moment, it uses [IBM Watson Language Translator](https://www.ibm.com/cloud/watson-language-translator) APIs on backend for translation. In the future, other third-part services can be integrated. + +This extension provides a REST API that translates the provided text into the given languages. At the moment, it uses [Google Translate API v3](https://cloud.google.com/translate/docs/advanced/translate-text-advance) APIs on backend for translation. In the future, other third-party services can be integrated. ## Installation -To install ckanext-translate extension. + +To install ckanext-translate extension. + 1. Activate your CKAN virtual environment, for example: ``` . /usr/lib/ckan/default/bin/activate @@ -27,23 +30,39 @@ To install ckanext-translate extension. ## Config settings - Following environment variables must be added. To get a IBM Watson Translator API key and url, you need to sign up for IBM cloud account and create [Language translator](https://www.ibm.com/cloud/watson-language-translator) instance. + + The following environment variables must be added. To get a Google Translate API key, follow the instructions [here](https://cloud.google.com/translate/docs/setup). + + ``` + ckanext.translate.google_service_account_file=/path/to/service_account.json + ckanext.translate.google_project_id=project_id # e.g. ckan-auto-translate + ckanext.translate.google_location=project_location # e.g. global + ``` + + There's also an optional variable to provide a list of stopwords to be ignored during translation (`ckanext.translate.ignore_list_path`). This can be helpful if you need to retain the original language for certain words or phrases. For example, if you want to keep the phrase "CKAN is awesome" in English for all languages, you can add it to a `.txt` file with one word or phrase per line: ``` - ckanext.translate.ibm_url = - ckanext.translate.ibm_key = + CKAN is awesome + Another phrase to keep ``` + Let's assume you saved this file as `/srv/app/ignore_list.txt`. You can then add the following environment variable: + + ``` + ckanext.translate.ignore_list_path=/srv/app/ignore_list.txt + ``` ## API Documentation + **API Endpont:** `/api/3/action/translate` -`input` You can pass keys values text for the translate. \ -`from` Parameter to specify the language code of the language you want to translate from. \ -`to` Parameter to specify the language code of the language you want to translate. +`input` Parameter to specify the text to be translated. +`from` Parameter to specify the language code of the language you want to translate from. +`to` Parameter to specify the language code of the language you want to translate to. Request example: + ```json { "input": { @@ -71,6 +90,7 @@ Response example: ## Developer installation + To install ckanext-translate for development, activate your CKAN virtualenv and do: diff --git a/ckanext/translate/logic/action.py b/ckanext/translate/logic/action.py index 1d634eb..25d8d55 100644 --- a/ckanext/translate/logic/action.py +++ b/ckanext/translate/logic/action.py @@ -1,47 +1,123 @@ +from google.cloud import translate_v3 as translate_v3 +from google.oauth2 import service_account +import logging import json -import requests -from ckan.common import config +import re +import hashlib +import io + import ckan.plugins.toolkit as tk import ckanext.translate.logic.schema as schema +from ckan.common import config + + +log = logging.getLogger(__name__) + + +def _get_variables(): + translate_vars = { + "project_id": config.get("ckanext.translate.google_project_id"), + "location": config.get("ckanext.translate.google_location"), + "service_account_file": config.get( + "ckanext.translate.google_service_account_file" + ), + } + + if not all(translate_vars.values()): + prettier_vars = json.dumps(translate_vars, indent=4) + raise Exception( + "Missing variables in config. Please add the following variables to your config:\n\n{vars}".format( + vars=prettier_vars + ) + ) + + return translate_vars + + +def _get_client(service_account_file): + credentials = service_account.Credentials.from_service_account_file( + service_account_file + ) + client = translate_v3.TranslationServiceClient(credentials=credentials) + + return client + + +def _ignore_terms(translate_values): + ignore_list_path = config.get("ckanext.translate.ignore_list_path") + terms = [] + + if ignore_list_path: + with io.open(ignore_list_path, "r", encoding="utf-8") as terms_file: + terms = terms_file.read().splitlines() + + hash_to_original = {} + translate_values = list(translate_values) + + for index, translate_value in enumerate(translate_values): + for term in terms: + pattern = re.compile(re.escape(term), re.IGNORECASE | re.UNICODE) + + def replace_with_hash(match): + original_text = match.group() + hash_value = hashlib.sha256(original_text.encode("utf-8")).hexdigest() + + if hash_value not in hash_to_original: + hash_to_original[hash_value] = original_text + + return hash_value + + translate_value = pattern.sub(replace_with_hash, translate_value) + + translate_values[index] = translate_value + + return translate_values, hash_to_original def translate(context, data_dict): - ibm_url = config.get('ckanext.translate.ibm_url') - ibm_api_key = config.get('ckanext.translate.ibm_key') + translate_vars = _get_variables() + client = _get_client(translate_vars["service_account_file"]) + project_id = translate_vars["project_id"] + parent = client.location_path(project_id, translate_vars["location"]) tk.check_access("translate", context, data_dict) - data, errors = tk.navl_validate( - data_dict, schema.translate(), context) + + data, errors = tk.navl_validate(data_dict, schema.translate(), context) if errors: raise tk.ValidationError(errors) translate_keys, translate_values = zip(*data["input"].items()) - - translate_req_dict= { - "text": list(translate_values), - "source": data_dict['from'], - "target": data_dict['to'], - } + translate_values, hash_to_original = _ignore_terms(translate_values) try: - response = requests.post('{}/v3/translate?{}'.format(ibm_url, 'version=2018-05-01'), - auth=('apikey', ibm_api_key), - headers= {"Content-Type": "application/json"}, - data=json.dumps(translate_req_dict) - ) + response = client.translate_text( + contents=translate_values, + source_language_code=data_dict["from"], + target_language_code=data_dict["to"], + parent=parent, + mime_type="text/plain", + ) - response.raise_for_status() - except requests.HTTPError as e: - raise tk.ValidationError({'message': '%s' % e}) + except Exception as e: + raise tk.ValidationError({"message": str(e)}) translated_dict = {} - for index, translated_item in enumerate(response.json()['translations']): - translated_dict.update({ list(translate_keys)[index]: translated_item['translation'] }) - return {"output" : translated_dict} + for index, translated_item in enumerate(response.translations): + for hash_value, original_text in hash_to_original.items(): + pattern = re.compile(re.escape(hash_value)) + translated_item.translated_text = pattern.sub( + lambda _: original_text, translated_item.translated_text + ) + translated_dict.update( + {list(translate_keys)[index]: translated_item.translated_text} + ) + + return {"output": translated_dict} + def get_actions(): return { - 'translate': translate, - } + "translate": translate, + } diff --git a/requirements.txt b/requirements.txt index e69de29..076ccdd 100644 --- a/requirements.txt +++ b/requirements.txt @@ -0,0 +1 @@ +google-cloud-translate==2.0.1 \ No newline at end of file