From 56961f826f6a3f0d3ba45269b72f8d9772f74504 Mon Sep 17 00:00:00 2001 From: Joel Klinger Date: Tue, 22 Dec 2020 16:23:18 +0000 Subject: [PATCH] [347] Turn off translate (#350) * Turn off translate * rmd dangling print statements --- nesta/core/luigihacks/elasticsearchplus.py | 59 +++++--- .../tests/test_elasticsearchplus.py | 126 +++++++++--------- nesta/packages/nlp_utils/ngrammer.py | 7 +- nesta/packages/nlp_utils/preprocess.py | 10 +- 4 files changed, 112 insertions(+), 90 deletions(-) diff --git a/nesta/core/luigihacks/elasticsearchplus.py b/nesta/core/luigihacks/elasticsearchplus.py index 7371308f..2a927ec7 100644 --- a/nesta/core/luigihacks/elasticsearchplus.py +++ b/nesta/core/luigihacks/elasticsearchplus.py @@ -29,6 +29,7 @@ LANGS_TAG = "terms_iso2lang_entity" PUNCTUATION = re.compile(r'[a-zA-Z\d\s:]').sub('', string.printable) + def sentence_chunks(text, chunksize=2000, delim='. '): """Split a string into chunks, but breaking only on the specified delimiter. @@ -53,17 +54,21 @@ def sentence_chunks(text, chunksize=2000, delim='. '): class MLStripper(HTMLParser): """Taken from https://stackoverflow.com/questions/753052/strip-html-from-strings-in-python. Tested in _sanitize_html.""" + def __init__(self): self.reset() self.strict = False - self.convert_charrefs= True + self.convert_charrefs = True self.fed = [] super().__init__() + def handle_data(self, d): self.fed.append(d) + def get_data(self): return ''.join(self.fed) + def strip_tags(html): """Taken from https://stackoverflow.com/questions/753052/strip-html-from-strings-in-python. Tested in _sanitize_html""" s = MLStripper() @@ -86,7 +91,8 @@ def translate(text, translator, chunksize=2000): {text, langs} ({str, set}): Translated text and set of detected languages. """ - chunks = [strip_tags(t) for t in sentence_chunks(text, chunksize=chunksize)] + chunks = [strip_tags(t) + for t in sentence_chunks(text, chunksize=chunksize)] texts, langs = [], set() for t in translator.translate(chunks, dest='en'): texts.append(t.text.capitalize()) # GT uncapitalizes chunks @@ -133,10 +139,10 @@ def _ngram_and_tokenize(row, ngrammer, ngram_fields): processed_tokens = ngrammer.process_document(text) tokens += [t.replace('_', ' ') for tokens in processed_tokens - for t in tokens] + for t in tokens] _row['terms_tokens_entity'] = tokens return _row - + def _sanitize_html(row): """Strips out any html encoding. Note: nothing clever is done @@ -154,6 +160,7 @@ def _sanitize_html(row): _row[k] = strip_tags(v) return _row + def _clean_bad_unicode_conversion(row): """Removes sequences of ??? from strings, which normally occur due to bad unicode conversion. Note this is a hack: @@ -172,12 +179,13 @@ def _clean_bad_unicode_conversion(row): elif "??" not in v: continue while "???" in v: - v = v.replace("???","") + v = v.replace("???", "") while "??" in v: - v = v.replace("??","") + v = v.replace("??", "") _row[k] = v return _row + def _nullify_pairs(row, null_pairs={}): """Nullify any value if it's 'parent' is also null. 
For example for null_pairs={'parent': 'child'} @@ -201,6 +209,7 @@ def _nullify_pairs(row, null_pairs={}): _row[child] = None return _row + def _remove_padding(row): """Remove padding from text or list text @@ -218,6 +227,7 @@ def _remove_padding(row): for item in v] return _row + def _caps_to_camel_case_by_value(v): if type(v) is not str: return v @@ -227,6 +237,7 @@ def _caps_to_camel_case_by_value(v): return v return v.lower().title() + def _caps_to_camel_case(row): """Convert CAPITAL TERMS to Camel Case @@ -275,11 +286,13 @@ def _clean_up_lists(row, do_sort=True): _row[k] = v return _row + def _add_entity_type(row, entity_type): _row = deepcopy(row) _row['type_of_entity'] = entity_type return _row + def _null_empty_str(row): """Nullify values if they are empty strings. @@ -326,6 +339,7 @@ def _coordinates_as_floats(row): _row[k] = __floatify_coord(v) return _row + @lru_cache() def _country_lookup(): """Extract country/nationality --> iso2 code lookup @@ -334,7 +348,7 @@ def _country_lookup(): Returns: lookup (dict): country/nationality --> iso2 code lookup. """ - df = pd.read_csv(COUNTRY_LOOKUP, encoding='latin', na_filter = False) + df = pd.read_csv(COUNTRY_LOOKUP, encoding='latin', na_filter=False) lookup = defaultdict(list) for _, row in df.iterrows(): iso2 = row.pop("ISO 3166 Code") @@ -344,6 +358,7 @@ def _country_lookup(): lookup[v].append(iso2) return lookup + def _country_detection(row, country_tag=COUNTRY_TAG): """Append a list of countries detected from keywords discovered in all text fields. The new field name @@ -391,6 +406,7 @@ def _guess_delimiter(item, threshold=0.25): if score < threshold: return p + def _listify_terms(row, delimiters=None): """Split any 'terms' fields by a guessed delimiter if the field is a string. @@ -410,7 +426,8 @@ def _listify_terms(row, delimiters=None): if _type is list: continue elif _type is not str: - raise TypeError(f"Type for '{k}' is '{_type}' but expected 'str' or 'list'.") + raise TypeError( + f"Type for '{k}' is '{_type}' but expected 'str' or 'list'.") # Now determine the delimiter if delimiters is None: delimiter = _guess_delimiter(v) @@ -521,6 +538,7 @@ class ElasticsearchPlus(Elasticsearch): do_sort (bool): Sort all lists? {args, kwargs}: (kw)args for the core :obj:`Elasticsearch` API. 
""" + def __init__(self, entity_type, aws_auth_region, no_commit=False, @@ -578,28 +596,29 @@ def __init__(self, entity_type, # Convert items which SHOULD be lists to lists if listify_terms: - self.transforms.append(lambda row: _listify_terms(row, terms_delimiters)) + self.transforms.append( + lambda row: _listify_terms(row, terms_delimiters)) # Convert upper case text to camel case if caps_to_camel_case: self.transforms.append(_caps_to_camel_case) - # Translate any text to english - if auto_translate: - # URLs to load balance Google Translate - urls = list(f"translate.google.{ext}" - for ext in ('com', 'co.uk', 'co.kr', 'at', - 'ru', 'fr', 'de', 'ch', 'es')) - self.transforms.append(lambda row: _auto_translate(row, translator=None, - service_urls=urls, - **auto_translate_kwargs)) + # # Translate any text to english + # if auto_translate: + # # URLs to load balance Google Translate + # urls = list(f"translate.googleapis.{ext}" + # for ext in ('com', 'co.uk', 'co.kr', 'at', + # 'ru', 'fr', 'de', 'ch', 'es')) + # self.transforms.append(lambda row: _auto_translate(row, translator=None, + # service_urls=urls, + # **auto_translate_kwargs)) # Extract any ngrams and split into tokens if len(ngram_fields) > 0: # Setup ngrammer - if 'MYSQLDBCONF' not in os.environ: + if 'MYSQLDBCONF' not in os.environ: os.environ['MYSQLDBCONF'] = 'mysqldb.config' - ngrammer = Ngrammer(database="production") + ngrammer = Ngrammer(database="production") self.transforms.append(lambda row: _ngram_and_tokenize(row, ngrammer, ngram_fields)) diff --git a/nesta/core/luigihacks/tests/test_elasticsearchplus.py b/nesta/core/luigihacks/tests/test_elasticsearchplus.py index 3bbddfdc..ae97b75e 100644 --- a/nesta/core/luigihacks/tests/test_elasticsearchplus.py +++ b/nesta/core/luigihacks/tests/test_elasticsearchplus.py @@ -100,70 +100,70 @@ def test_sentence_chunks(): chunksize=i)) == text -def test_auto_translate_true_short(row): - """The translator shouldn't be applied for short pieces of text""" - time.sleep(2) - _row = _auto_translate(row, TRANSLATOR, 1000) - assert not _row.pop(TRANS_TAG) - assert len(_row.pop(LANGS_TAG)) == 0 - assert row['korean'] == _row['korean'] - assert row['mixed_lang'] == _row['mixed_lang'] - assert row == _row - -def test_auto_translate_true_long_small_chunks(row): - time.sleep(2) - _row_1 = _auto_translate(row, TRANSLATOR, 10, chunksize=1) - time.sleep(2) - _row_2 = _auto_translate(row, TRANSLATOR, 10, chunksize=10000) - assert _row_1.pop('mixed_lang') != _row_2.pop('mixed_lang') +# def test_auto_translate_true_short(row): +# """The translator shouldn't be applied for short pieces of text""" +# time.sleep(2) +# _row = _auto_translate(row, TRANSLATOR, 1000) +# assert not _row.pop(TRANS_TAG) +# assert len(_row.pop(LANGS_TAG)) == 0 +# assert row['korean'] == _row['korean'] +# assert row['mixed_lang'] == _row['mixed_lang'] +# assert row == _row + +# def test_auto_translate_true_long_small_chunks(row): +# time.sleep(2) +# _row_1 = _auto_translate(row, TRANSLATOR, 10, chunksize=1) +# time.sleep(2) +# _row_2 = _auto_translate(row, TRANSLATOR, 10, chunksize=10000) +# assert _row_1.pop('mixed_lang') != _row_2.pop('mixed_lang') - # Test the translation itself - # Constraints rather than fixed assertions since - # translate algorithm may change over time - # so the two chunk sizes aren't guaranteed to give the same results - k1 = _row_1.pop('korean').upper() - k2 = _row_2.pop('korean').upper() - assert len(k1) > 10 - assert len(k2) > 10 - assert (len(k1) - len(k2))/len(k1) < 0.95 - assert (len(set(k1)) - 
len(set(k2)))/len(set(k1)) < 0.95 - assert sum((Counter(k1) - Counter(k2)).values())/len(k1+k2) < 0.95 - assert sum((Counter(k2) - Counter(k1)).values())/len(k1+k2) < 0.95 - - # Confirm that the languages are the same - langs_1 = _row_1.pop(LANGS_TAG) - langs_2 = _row_2.pop(LANGS_TAG) - assert len(langs_1) == len(langs_2) - assert set(langs_1) == set(langs_2) - - # Confirm that nothing else has changed - assert _row_1 == _row_2 - -def test_auto_translate_true_long(row): - time.sleep(2) - _row = _auto_translate(row, TRANSLATOR, 10) - assert row.pop('korean') != _row['korean'] - assert row.pop('mixed_lang') != _row['mixed_lang'] - assert _row.pop(TRANS_TAG) - trans_korean = _row.pop('korean') - assert all(term in trans_korean.lower() - for term in ('brown', 'fox', 'jump', - 'over', 'lazy', 'dog')) - trans_mixed = _row.pop('mixed_lang') - assert all(term in trans_mixed.lower() - for term in ('brown', 'fox', - 'something', 'english')) - assert set(_row.pop(LANGS_TAG)) == {'ko', 'en'} - assert row == _row - -def test_auto_translate_false(row): - row.pop('korean') - row.pop('mixed_lang') - time.sleep(2) - _row = _auto_translate(row, TRANSLATOR) - assert not _row.pop(TRANS_TAG) - assert _row.pop(LANGS_TAG) == ['en'] - assert row == _row +# # Test the translation itself +# # Constraints rather than fixed assertions since +# # translate algorithm may change over time +# # so the two chunk sizes aren't guaranteed to give the same results +# k1 = _row_1.pop('korean').upper() +# k2 = _row_2.pop('korean').upper() +# assert len(k1) > 10 +# assert len(k2) > 10 +# assert (len(k1) - len(k2))/len(k1) < 0.95 +# assert (len(set(k1)) - len(set(k2)))/len(set(k1)) < 0.95 +# assert sum((Counter(k1) - Counter(k2)).values())/len(k1+k2) < 0.95 +# assert sum((Counter(k2) - Counter(k1)).values())/len(k1+k2) < 0.95 + +# # Confirm that the languages are the same +# langs_1 = _row_1.pop(LANGS_TAG) +# langs_2 = _row_2.pop(LANGS_TAG) +# assert len(langs_1) == len(langs_2) +# assert set(langs_1) == set(langs_2) + +# # Confirm that nothing else has changed +# assert _row_1 == _row_2 + +# def test_auto_translate_true_long(row): +# time.sleep(2) +# _row = _auto_translate(row, TRANSLATOR, 10) +# assert row.pop('korean') != _row['korean'] +# assert row.pop('mixed_lang') != _row['mixed_lang'] +# assert _row.pop(TRANS_TAG) +# trans_korean = _row.pop('korean') +# assert all(term in trans_korean.lower() +# for term in ('brown', 'fox', 'jump', +# 'over', 'lazy', 'dog')) +# trans_mixed = _row.pop('mixed_lang') +# assert all(term in trans_mixed.lower() +# for term in ('brown', 'fox', +# 'something', 'english')) +# assert set(_row.pop(LANGS_TAG)) == {'ko', 'en'} +# assert row == _row + +# def test_auto_translate_false(row): +# row.pop('korean') +# row.pop('mixed_lang') +# time.sleep(2) +# _row = _auto_translate(row, TRANSLATOR) +# assert not _row.pop(TRANS_TAG) +# assert _row.pop(LANGS_TAG) == ['en'] +# assert row == _row def test_sanitize_html(row): _row = _sanitize_html(row) diff --git a/nesta/packages/nlp_utils/ngrammer.py b/nesta/packages/nlp_utils/ngrammer.py index bb0e8202..f27c2ec3 100644 --- a/nesta/packages/nlp_utils/ngrammer.py +++ b/nesta/packages/nlp_utils/ngrammer.py @@ -27,6 +27,7 @@ class Ngrammer: variable 'MYSQLDBCONF' database (str): Database name """ + def __init__(self, config_filepath=None, database="dev"): if config_filepath is not None: os.environ["MYSQLDBCONF"] = config_filepath @@ -67,7 +68,8 @@ def find_and_replace(self, sentence, size): return True return False - def process_document(self, raw_text, 
remove_stops=True): + def process_document(self, raw_text, remove_stops=True, + keep_quasi_numeric=True): """Tokenize and insert n-grams into documents. Args: @@ -77,7 +79,8 @@ def process_document(self, raw_text, remove_stops=True): processed_doc (list): Iterable ready for word embedding """ # Tokenize and clean up the text first - text = tokenize_document(raw_text) + text = tokenize_document( + raw_text, keep_quasi_numeric=keep_quasi_numeric) # Replace large n-grams first, then small n-grams for size in sorted(self.ngrams, reverse=True): for sentence in text: diff --git a/nesta/packages/nlp_utils/preprocess.py b/nesta/packages/nlp_utils/preprocess.py index b264e268..eda4df0c 100644 --- a/nesta/packages/nlp_utils/preprocess.py +++ b/nesta/packages/nlp_utils/preprocess.py @@ -30,7 +30,7 @@ re.VERBOSE | re.IGNORECASE) -def tokenize_document(text, remove_stops=False): +def tokenize_document(text, remove_stops=False, keep_quasi_numeric=True): """Preprocess a whole raw document. Args: text (str): Raw string of text. @@ -38,11 +38,11 @@ def tokenize_document(text, remove_stops=False): Return: List of preprocessed and tokenized documents """ - return [clean_and_tokenize(sentence, remove_stops) + return [clean_and_tokenize(sentence, remove_stops, keep_quasi_numeric) for sentence in nltk.sent_tokenize(text)] -def clean_and_tokenize(text, remove_stops): +def clean_and_tokenize(text, remove_stops, keep_quasi_numeric=False): """Preprocess a raw string/sentence of text. Args: text (str): Raw string of text. @@ -56,14 +56,14 @@ def clean_and_tokenize(text, remove_stops): filtered_tokens = [token.replace('-', '_') for token in _tokens if not (remove_stops and len(token) <= 2) and (not remove_stops or token not in stop_words) - and not any(x in token for x in string.digits) + and (keep_quasi_numeric or not any(x in token for x in string.digits)) and any(x in token for x in string.ascii_lowercase)] return filtered_tokens def filter_by_idf(documents, lower_idf_limit, upper_idf_limit): """Remove (from documents) terms which are in a range of IDF values. - + Args: documents (list): Either a :obj:`list` of :obj:`str` or a :obj:`list` of :obj:`list` of :obj:`str` to be
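
A minimal, self-contained sketch (not part of the patch) of the token-filtering rule that the new keep_quasi_numeric flag toggles in clean_and_tokenize: with the flag off, any token containing a digit is dropped; with it on, quasi-numeric tokens such as 'covid-19' survive provided they also contain at least one ASCII letter. The filter_tokens helper and the tiny STOP_WORDS set below are illustrative stand-ins only, assuming a naive pre-tokenized input rather than the package's own tokenizer and stop-word list.

# Sketch of the keep_quasi_numeric filtering behaviour (illustrative only).
import string

STOP_WORDS = {'the', 'over'}  # stand-in for the package's real stop-word list

def filter_tokens(tokens, remove_stops, keep_quasi_numeric=False):
    """Apply the same keep/drop rules as the patched clean_and_tokenize,
    on an already-tokenized sentence."""
    return [token.replace('-', '_') for token in tokens
            if not (remove_stops and len(token) <= 2)
            and (not remove_stops or token not in STOP_WORDS)
            and (keep_quasi_numeric or not any(x in token for x in string.digits))
            and any(x in token for x in string.ascii_lowercase)]

tokens = ['the', 'quick', 'covid-19', 'fox', '2020']
print(filter_tokens(tokens, remove_stops=True))
# ['quick', 'fox']            <- digit-bearing token dropped by default
print(filter_tokens(tokens, remove_stops=True, keep_quasi_numeric=True))
# ['quick', 'covid_19', 'fox'] <- quasi-numeric token kept; pure number '2020' still dropped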