Skip to content

Commit

Permalink
Switch from IBM to Google Translate; Add ignore list functionality
Browse files Browse the repository at this point in the history
  • Loading branch information
mpolidori committed Nov 28, 2023
1 parent 7e6515f commit 2329a8c
Show file tree
Hide file tree
Showing 3 changed files with 131 additions and 34 deletions.
38 changes: 29 additions & 9 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,13 +1,16 @@
[![CKAN](https://img.shields.io/badge/ckan-2.7-orange.svg?style=flat-square)](https://github.com/ckan/ckan/tree/2.7) [![CKAN](https://img.shields.io/badge/ckan-2.8-orange.svg?style=flat-square)](https://github.com/ckan/ckan/tree/2.8) [![CKAN](https://img.shields.io/badge/ckan-2.9-orange.svg?style=flat-square)](https://github.com/ckan/ckan/tree/2.9)

Note: This plugin is tested with CKAN 2.8 or later version but hasn't been tested on earlier version.
Note: This plugin has been tested with CKAN 2.8+. It hasn't been tested on earlier versions.

# ckanext-translate
This extension provides an REST API that translates the provided text into the given languages. At the moment, it uses [IBM Watson Language Translator](https://www.ibm.com/cloud/watson-language-translator) APIs on backend for translation. In the future, other third-part services can be integrated.

This extension provides a REST API that translates the provided text into the given languages. At the moment, it uses the [Google Translate API v3](https://cloud.google.com/translate/docs/advanced/translate-text-advance) on the backend for translation. In the future, other third-party services can be integrated.


## Installation
To install ckanext-translate extension.

To install the ckanext-translate extension:

1. Activate your CKAN virtual environment, for example:
```
. /usr/lib/ckan/default/bin/activate
Expand All @@ -27,23 +30,39 @@ To install ckanext-translate extension.
## Config settings
Following environment variables must be added. To get a IBM Watson Translator API key and url, you need to sign up for IBM cloud account and create [Language translator](https://www.ibm.com/cloud/watson-language-translator) instance.
The following options must be added to your CKAN config. To create the Google Cloud service account key file they refer to, follow the instructions [here](https://cloud.google.com/translate/docs/setup).
```
ckanext.translate.google_service_account_file=/path/to/service_account.json
ckanext.translate.google_project_id=project_id # e.g. ckan-auto-translate
ckanext.translate.google_location=project_location # e.g. global
```
There's also an optional variable to provide a list of stopwords to be ignored during translation (`ckanext.translate.ignore_list_path`). This can be helpful if you need to retain the original language for certain words or phrases. For example, if you want to keep the phrase "CKAN is awesome" in English for all languages, you can add it to a `.txt` file with one word or phrase per line:
```
ckanext.translate.ibm_url = <IBM watson translator url>
ckanext.translate.ibm_key = <IBM watson translator API key>
CKAN is awesome
Another phrase to keep
```
Let's assume you saved this file as `/srv/app/ignore_list.txt`. You can then add the following config option:
```
ckanext.translate.ignore_list_path=/srv/app/ignore_list.txt
```
## API Documentation
**API Endpoint:** `/api/3/action/translate`
`input` You can pass keys values text for the translate. \
`from` Parameter to specify the language code of the language you want to translate from. \
`to` Parameter to specify the language code of the language you want to translate.
`input` Parameter to specify the text to be translated.
`from` Parameter to specify the language code of the language you want to translate from.
`to` Parameter to specify the language code of the language you want to translate to.
Request example:
```json
{
"input": {
Expand Down Expand Up @@ -71,6 +90,7 @@ Response example:


## Developer installation

To install ckanext-translate for development, activate your CKAN virtualenv and
do:

Expand Down
126 changes: 101 additions & 25 deletions ckanext/translate/logic/action.py
Original file line number Diff line number Diff line change
@@ -1,47 +1,123 @@
from google.cloud import translate_v3 as translate_v3
from google.oauth2 import service_account
import logging
import json
import requests
from ckan.common import config
import re
import hashlib
import io

import ckan.plugins.toolkit as tk
import ckanext.translate.logic.schema as schema
from ckan.common import config


log = logging.getLogger(__name__)


def _get_variables():
    """Read the Google Translate settings from the CKAN config.

    Returns:
        dict: the keys ``project_id``, ``location`` and
        ``service_account_file``, each holding the value of the matching
        ``ckanext.translate.google_*`` config option.

    Raises:
        Exception: if any of the three options is unset or empty. The
        message lists the names of the missing config options.
    """
    option_names = {
        "project_id": "ckanext.translate.google_project_id",
        "location": "ckanext.translate.google_location",
        "service_account_file": "ckanext.translate.google_service_account_file",
    }
    translate_vars = {
        name: config.get(option) for name, option in option_names.items()
    }

    missing = sorted(
        option
        for name, option in option_names.items()
        if not translate_vars[name]
    )
    if missing:
        # Report the real config option names (and only the missing ones) so
        # the operator knows exactly what to add; the values of the options
        # that *are* set are deliberately kept out of the error message.
        raise Exception(
            "Missing variables in config. Please add the following variables to your config:\n\n{vars}".format(
                vars=json.dumps(missing, indent=4)
            )
        )

    return translate_vars


def _get_client(service_account_file):
    """Build a Translation v3 service client from a service account file.

    Args:
        service_account_file: path passed to
            ``service_account.Credentials.from_service_account_file``.

    Returns:
        A ``translate_v3.TranslationServiceClient`` created with those
        credentials.
    """
    return translate_v3.TranslationServiceClient(
        credentials=service_account.Credentials.from_service_account_file(
            service_account_file
        )
    )


def _ignore_terms(translate_values):
    """Replace ignore-list terms in the input texts with SHA-256 placeholders.

    The terms are read, one per line, from the file named by the
    ``ckanext.translate.ignore_list_path`` config option. Every
    case-insensitive occurrence of a term is replaced by the hex SHA-256
    digest of the matched text, so the caller can swap the original text
    back in afterwards.

    Args:
        translate_values: iterable of strings to be translated.

    Returns:
        tuple: ``(values, hash_to_original)`` where ``values`` is a list of
        the input strings with terms replaced, and ``hash_to_original`` maps
        each inserted digest to the exact text it replaced. When no ignore
        list is configured the values are returned unchanged with an empty
        mapping.
    """
    ignore_list_path = config.get("ckanext.translate.ignore_list_path")
    terms = []

    if ignore_list_path:
        with io.open(ignore_list_path, "r", encoding="utf-8") as terms_file:
            # Skip blank lines: an empty term matches at every position and
            # would inject a placeholder between every character.
            terms = [
                term for term in terms_file.read().splitlines() if term.strip()
            ]

    translate_values = list(translate_values)
    hash_to_original = {}

    if not terms:
        return translate_values, hash_to_original

    # A single alternation, longest term first, applied in one pass:
    #  * a phrase wins over a shorter term it contains, and
    #  * no term can match inside a placeholder inserted for another term
    #    (placeholders are hex, so terms like "a" or "1" would otherwise hit).
    ordered_terms = sorted(set(terms), key=lambda term: (-len(term), term))
    pattern = re.compile(
        "|".join(re.escape(term) for term in ordered_terms),
        re.IGNORECASE | re.UNICODE,
    )

    def replace_with_hash(match):
        original_text = match.group()
        hash_value = hashlib.sha256(original_text.encode("utf-8")).hexdigest()
        hash_to_original[hash_value] = original_text
        return hash_value

    masked_values = [
        pattern.sub(replace_with_hash, value) for value in translate_values
    ]

    return masked_values, hash_to_original


def translate(context, data_dict):
    """Translate the values of ``data_dict["input"]`` with Google Translate.

    Args:
        context: CKAN action context, passed to ``check_access`` and
            ``navl_validate``.
        data_dict: request data. ``input`` is a mapping of key -> text to
            translate; ``from`` and ``to`` are the source and target
            language codes.

    Returns:
        dict: ``{"output": {key: translated_text, ...}}`` with the same keys
        as ``input``.

    Raises:
        tk.ValidationError: if schema validation fails, or if the
            translation request raises (the error text is put under
            ``message``).
        Exception: from ``_get_variables`` when the Google config options
            are missing.
    """
    # Authorize and validate first, so unauthorized or malformed requests
    # never cause the service account credentials to be loaded.
    tk.check_access("translate", context, data_dict)

    data, errors = tk.navl_validate(data_dict, schema.translate(), context)
    if errors:
        raise tk.ValidationError(errors)

    if not data["input"]:
        # zip(*...) over an empty mapping cannot be unpacked into two names.
        return {"output": {}}

    translate_keys, translate_values = zip(*data["input"].items())

    translate_vars = _get_variables()
    client = _get_client(translate_vars["service_account_file"])
    parent = client.location_path(
        translate_vars["project_id"], translate_vars["location"]
    )

    # Swap ignore-list terms for placeholders before sending the text out.
    translate_values, hash_to_original = _ignore_terms(translate_values)

    try:
        response = client.translate_text(
            contents=translate_values,
            source_language_code=data_dict["from"],
            target_language_code=data_dict["to"],
            parent=parent,
            mime_type="text/plain",
        )
    except Exception as e:
        log.exception("Translation request failed")
        raise tk.ValidationError({"message": str(e)})

    translated_dict = {}
    for key, translated_item in zip(translate_keys, response.translations):
        text = translated_item.translated_text
        # Put the original ignore-list text back in place of its placeholder.
        for hash_value, original_text in hash_to_original.items():
            text = text.replace(hash_value, original_text)
        translated_dict[key] = text

    return {"output": translated_dict}


def get_actions():
    """Return the mapping of action names to the functions implementing them."""
    return {
        "translate": translate,
    }
1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
google-cloud-translate==2.0.1

0 comments on commit 2329a8c

Please sign in to comment.