From 56961f826f6a3f0d3ba45269b72f8d9772f74504 Mon Sep 17 00:00:00 2001 From: Joel Klinger Date: Tue, 22 Dec 2020 16:23:18 +0000 Subject: [PATCH] [347] Turn off translate (#350) * Turn off translate * rmd dangling print statements --- nesta/core/luigihacks/elasticsearchplus.py | 59 +++++--- .../tests/test_elasticsearchplus.py | 126 +++++++++--------- nesta/packages/nlp_utils/ngrammer.py | 7 +- nesta/packages/nlp_utils/preprocess.py | 10 +- 4 files changed, 112 insertions(+), 90 deletions(-) diff --git a/nesta/core/luigihacks/elasticsearchplus.py b/nesta/core/luigihacks/elasticsearchplus.py index 7371308f..2a927ec7 100644 --- a/nesta/core/luigihacks/elasticsearchplus.py +++ b/nesta/core/luigihacks/elasticsearchplus.py @@ -29,6 +29,7 @@ LANGS_TAG = "terms_iso2lang_entity" PUNCTUATION = re.compile(r'[a-zA-Z\d\s:]').sub('', string.printable) + def sentence_chunks(text, chunksize=2000, delim='. '): """Split a string into chunks, but breaking only on the specified delimiter. @@ -53,17 +54,21 @@ def sentence_chunks(text, chunksize=2000, delim='. '): class MLStripper(HTMLParser): """Taken from https://stackoverflow.com/questions/753052/strip-html-from-strings-in-python. Tested in _sanitize_html.""" + def __init__(self): self.reset() self.strict = False - self.convert_charrefs= True + self.convert_charrefs = True self.fed = [] super().__init__() + def handle_data(self, d): self.fed.append(d) + def get_data(self): return ''.join(self.fed) + def strip_tags(html): """Taken from https://stackoverflow.com/questions/753052/strip-html-from-strings-in-python. Tested in _sanitize_html""" s = MLStripper() @@ -86,7 +91,8 @@ def translate(text, translator, chunksize=2000): {text, langs} ({str, set}): Translated text and set of detected languages. """ - chunks = [strip_tags(t) for t in sentence_chunks(text, chunksize=chunksize)] + chunks = [strip_tags(t) + for t in sentence_chunks(text, chunksize=chunksize)] texts, langs = [], set() for t in translator.translate(chunks, dest='en'): texts.append(t.text.capitalize()) # GT uncapitalizes chunks @@ -133,10 +139,10 @@ def _ngram_and_tokenize(row, ngrammer, ngram_fields): processed_tokens = ngrammer.process_document(text) tokens += [t.replace('_', ' ') for tokens in processed_tokens - for t in tokens] + for t in tokens] _row['terms_tokens_entity'] = tokens return _row - + def _sanitize_html(row): """Strips out any html encoding. Note: nothing clever is done @@ -154,6 +160,7 @@ def _sanitize_html(row): _row[k] = strip_tags(v) return _row + def _clean_bad_unicode_conversion(row): """Removes sequences of ??? from strings, which normally occur due to bad unicode conversion. Note this is a hack: @@ -172,12 +179,13 @@ def _clean_bad_unicode_conversion(row): elif "??" not in v: continue while "???" in v: - v = v.replace("???","") + v = v.replace("???", "") while "??" in v: - v = v.replace("??","") + v = v.replace("??", "") _row[k] = v return _row + def _nullify_pairs(row, null_pairs={}): """Nullify any value if it's 'parent' is also null. 
For example for null_pairs={'parent': 'child'} @@ -201,6 +209,7 @@ def _nullify_pairs(row, null_pairs={}): _row[child] = None return _row + def _remove_padding(row): """Remove padding from text or list text @@ -218,6 +227,7 @@ def _remove_padding(row): for item in v] return _row + def _caps_to_camel_case_by_value(v): if type(v) is not str: return v @@ -227,6 +237,7 @@ def _caps_to_camel_case_by_value(v): return v return v.lower().title() + def _caps_to_camel_case(row): """Convert CAPITAL TERMS to Camel Case @@ -275,11 +286,13 @@ def _clean_up_lists(row, do_sort=True): _row[k] = v return _row + def _add_entity_type(row, entity_type): _row = deepcopy(row) _row['type_of_entity'] = entity_type return _row + def _null_empty_str(row): """Nullify values if they are empty strings. @@ -326,6 +339,7 @@ def _coordinates_as_floats(row): _row[k] = __floatify_coord(v) return _row + @lru_cache() def _country_lookup(): """Extract country/nationality --> iso2 code lookup @@ -334,7 +348,7 @@ def _country_lookup(): Returns: lookup (dict): country/nationality --> iso2 code lookup. """ - df = pd.read_csv(COUNTRY_LOOKUP, encoding='latin', na_filter = False) + df = pd.read_csv(COUNTRY_LOOKUP, encoding='latin', na_filter=False) lookup = defaultdict(list) for _, row in df.iterrows(): iso2 = row.pop("ISO 3166 Code") @@ -344,6 +358,7 @@ def _country_lookup(): lookup[v].append(iso2) return lookup + def _country_detection(row, country_tag=COUNTRY_TAG): """Append a list of countries detected from keywords discovered in all text fields. The new field name @@ -391,6 +406,7 @@ def _guess_delimiter(item, threshold=0.25): if score < threshold: return p + def _listify_terms(row, delimiters=None): """Split any 'terms' fields by a guessed delimiter if the field is a string. @@ -410,7 +426,8 @@ def _listify_terms(row, delimiters=None): if _type is list: continue elif _type is not str: - raise TypeError(f"Type for '{k}' is '{_type}' but expected 'str' or 'list'.") + raise TypeError( + f"Type for '{k}' is '{_type}' but expected 'str' or 'list'.") # Now determine the delimiter if delimiters is None: delimiter = _guess_delimiter(v) @@ -521,6 +538,7 @@ class ElasticsearchPlus(Elasticsearch): do_sort (bool): Sort all lists? {args, kwargs}: (kw)args for the core :obj:`Elasticsearch` API. 
""" + def __init__(self, entity_type, aws_auth_region, no_commit=False, @@ -578,28 +596,29 @@ def __init__(self, entity_type, # Convert items which SHOULD be lists to lists if listify_terms: - self.transforms.append(lambda row: _listify_terms(row, terms_delimiters)) + self.transforms.append( + lambda row: _listify_terms(row, terms_delimiters)) # Convert upper case text to camel case if caps_to_camel_case: self.transforms.append(_caps_to_camel_case) - # Translate any text to english - if auto_translate: - # URLs to load balance Google Translate - urls = list(f"translate.google.{ext}" - for ext in ('com', 'co.uk', 'co.kr', 'at', - 'ru', 'fr', 'de', 'ch', 'es')) - self.transforms.append(lambda row: _auto_translate(row, translator=None, - service_urls=urls, - **auto_translate_kwargs)) + # # Translate any text to english + # if auto_translate: + # # URLs to load balance Google Translate + # urls = list(f"translate.googleapis.{ext}" + # for ext in ('com', 'co.uk', 'co.kr', 'at', + # 'ru', 'fr', 'de', 'ch', 'es')) + # self.transforms.append(lambda row: _auto_translate(row, translator=None, + # service_urls=urls, + # **auto_translate_kwargs)) # Extract any ngrams and split into tokens if len(ngram_fields) > 0: # Setup ngrammer - if 'MYSQLDBCONF' not in os.environ: + if 'MYSQLDBCONF' not in os.environ: os.environ['MYSQLDBCONF'] = 'mysqldb.config' - ngrammer = Ngrammer(database="production") + ngrammer = Ngrammer(database="production") self.transforms.append(lambda row: _ngram_and_tokenize(row, ngrammer, ngram_fields)) diff --git a/nesta/core/luigihacks/tests/test_elasticsearchplus.py b/nesta/core/luigihacks/tests/test_elasticsearchplus.py index 3bbddfdc..ae97b75e 100644 --- a/nesta/core/luigihacks/tests/test_elasticsearchplus.py +++ b/nesta/core/luigihacks/tests/test_elasticsearchplus.py @@ -100,70 +100,70 @@ def test_sentence_chunks(): chunksize=i)) == text -def test_auto_translate_true_short(row): - """The translator shouldn't be applied for short pieces of text""" - time.sleep(2) - _row = _auto_translate(row, TRANSLATOR, 1000) - assert not _row.pop(TRANS_TAG) - assert len(_row.pop(LANGS_TAG)) == 0 - assert row['korean'] == _row['korean'] - assert row['mixed_lang'] == _row['mixed_lang'] - assert row == _row - -def test_auto_translate_true_long_small_chunks(row): - time.sleep(2) - _row_1 = _auto_translate(row, TRANSLATOR, 10, chunksize=1) - time.sleep(2) - _row_2 = _auto_translate(row, TRANSLATOR, 10, chunksize=10000) - assert _row_1.pop('mixed_lang') != _row_2.pop('mixed_lang') +# def test_auto_translate_true_short(row): +# """The translator shouldn't be applied for short pieces of text""" +# time.sleep(2) +# _row = _auto_translate(row, TRANSLATOR, 1000) +# assert not _row.pop(TRANS_TAG) +# assert len(_row.pop(LANGS_TAG)) == 0 +# assert row['korean'] == _row['korean'] +# assert row['mixed_lang'] == _row['mixed_lang'] +# assert row == _row + +# def test_auto_translate_true_long_small_chunks(row): +# time.sleep(2) +# _row_1 = _auto_translate(row, TRANSLATOR, 10, chunksize=1) +# time.sleep(2) +# _row_2 = _auto_translate(row, TRANSLATOR, 10, chunksize=10000) +# assert _row_1.pop('mixed_lang') != _row_2.pop('mixed_lang') - # Test the translation itself - # Constraints rather than fixed assertions since - # translate algorithm may change over time - # so the two chunk sizes aren't guaranteed to give the same results - k1 = _row_1.pop('korean').upper() - k2 = _row_2.pop('korean').upper() - assert len(k1) > 10 - assert len(k2) > 10 - assert (len(k1) - len(k2))/len(k1) < 0.95 - assert (len(set(k1)) - 
len(set(k2)))/len(set(k1)) < 0.95 - assert sum((Counter(k1) - Counter(k2)).values())/len(k1+k2) < 0.95 - assert sum((Counter(k2) - Counter(k1)).values())/len(k1+k2) < 0.95 - - # Confirm that the languages are the same - langs_1 = _row_1.pop(LANGS_TAG) - langs_2 = _row_2.pop(LANGS_TAG) - assert len(langs_1) == len(langs_2) - assert set(langs_1) == set(langs_2) - - # Confirm that nothing else has changed - assert _row_1 == _row_2 - -def test_auto_translate_true_long(row): - time.sleep(2) - _row = _auto_translate(row, TRANSLATOR, 10) - assert row.pop('korean') != _row['korean'] - assert row.pop('mixed_lang') != _row['mixed_lang'] - assert _row.pop(TRANS_TAG) - trans_korean = _row.pop('korean') - assert all(term in trans_korean.lower() - for term in ('brown', 'fox', 'jump', - 'over', 'lazy', 'dog')) - trans_mixed = _row.pop('mixed_lang') - assert all(term in trans_mixed.lower() - for term in ('brown', 'fox', - 'something', 'english')) - assert set(_row.pop(LANGS_TAG)) == {'ko', 'en'} - assert row == _row - -def test_auto_translate_false(row): - row.pop('korean') - row.pop('mixed_lang') - time.sleep(2) - _row = _auto_translate(row, TRANSLATOR) - assert not _row.pop(TRANS_TAG) - assert _row.pop(LANGS_TAG) == ['en'] - assert row == _row +# # Test the translation itself +# # Constraints rather than fixed assertions since +# # translate algorithm may change over time +# # so the two chunk sizes aren't guaranteed to give the same results +# k1 = _row_1.pop('korean').upper() +# k2 = _row_2.pop('korean').upper() +# assert len(k1) > 10 +# assert len(k2) > 10 +# assert (len(k1) - len(k2))/len(k1) < 0.95 +# assert (len(set(k1)) - len(set(k2)))/len(set(k1)) < 0.95 +# assert sum((Counter(k1) - Counter(k2)).values())/len(k1+k2) < 0.95 +# assert sum((Counter(k2) - Counter(k1)).values())/len(k1+k2) < 0.95 + +# # Confirm that the languages are the same +# langs_1 = _row_1.pop(LANGS_TAG) +# langs_2 = _row_2.pop(LANGS_TAG) +# assert len(langs_1) == len(langs_2) +# assert set(langs_1) == set(langs_2) + +# # Confirm that nothing else has changed +# assert _row_1 == _row_2 + +# def test_auto_translate_true_long(row): +# time.sleep(2) +# _row = _auto_translate(row, TRANSLATOR, 10) +# assert row.pop('korean') != _row['korean'] +# assert row.pop('mixed_lang') != _row['mixed_lang'] +# assert _row.pop(TRANS_TAG) +# trans_korean = _row.pop('korean') +# assert all(term in trans_korean.lower() +# for term in ('brown', 'fox', 'jump', +# 'over', 'lazy', 'dog')) +# trans_mixed = _row.pop('mixed_lang') +# assert all(term in trans_mixed.lower() +# for term in ('brown', 'fox', +# 'something', 'english')) +# assert set(_row.pop(LANGS_TAG)) == {'ko', 'en'} +# assert row == _row + +# def test_auto_translate_false(row): +# row.pop('korean') +# row.pop('mixed_lang') +# time.sleep(2) +# _row = _auto_translate(row, TRANSLATOR) +# assert not _row.pop(TRANS_TAG) +# assert _row.pop(LANGS_TAG) == ['en'] +# assert row == _row def test_sanitize_html(row): _row = _sanitize_html(row) diff --git a/nesta/packages/nlp_utils/ngrammer.py b/nesta/packages/nlp_utils/ngrammer.py index bb0e8202..f27c2ec3 100644 --- a/nesta/packages/nlp_utils/ngrammer.py +++ b/nesta/packages/nlp_utils/ngrammer.py @@ -27,6 +27,7 @@ class Ngrammer: variable 'MYSQLDBCONF' database (str): Database name """ + def __init__(self, config_filepath=None, database="dev"): if config_filepath is not None: os.environ["MYSQLDBCONF"] = config_filepath @@ -67,7 +68,8 @@ def find_and_replace(self, sentence, size): return True return False - def process_document(self, raw_text, 
remove_stops=True): + def process_document(self, raw_text, remove_stops=True, + keep_quasi_numeric=True): """Tokenize and insert n-grams into documents. Args: @@ -77,7 +79,8 @@ def process_document(self, raw_text, remove_stops=True): processed_doc (list): Iterable ready for word embedding """ # Tokenize and clean up the text first - text = tokenize_document(raw_text) + text = tokenize_document( + raw_text, keep_quasi_numeric=keep_quasi_numeric) # Replace large n-grams first, then small n-grams for size in sorted(self.ngrams, reverse=True): for sentence in text: diff --git a/nesta/packages/nlp_utils/preprocess.py b/nesta/packages/nlp_utils/preprocess.py index b264e268..eda4df0c 100644 --- a/nesta/packages/nlp_utils/preprocess.py +++ b/nesta/packages/nlp_utils/preprocess.py @@ -30,7 +30,7 @@ re.VERBOSE | re.IGNORECASE) -def tokenize_document(text, remove_stops=False): +def tokenize_document(text, remove_stops=False, keep_quasi_numeric=True): """Preprocess a whole raw document. Args: text (str): Raw string of text. @@ -38,11 +38,11 @@ def tokenize_document(text, remove_stops=False): Return: List of preprocessed and tokenized documents """ - return [clean_and_tokenize(sentence, remove_stops) + return [clean_and_tokenize(sentence, remove_stops, keep_quasi_numeric) for sentence in nltk.sent_tokenize(text)] -def clean_and_tokenize(text, remove_stops): +def clean_and_tokenize(text, remove_stops, keep_quasi_numeric=False): """Preprocess a raw string/sentence of text. Args: text (str): Raw string of text. @@ -56,14 +56,14 @@ def clean_and_tokenize(text, remove_stops): filtered_tokens = [token.replace('-', '_') for token in _tokens if not (remove_stops and len(token) <= 2) and (not remove_stops or token not in stop_words) - and not any(x in token for x in string.digits) + and (keep_quasi_numeric or not any(x in token for x in string.digits)) and any(x in token for x in string.ascii_lowercase)] return filtered_tokens def filter_by_idf(documents, lower_idf_limit, upper_idf_limit): """Remove (from documents) terms which are in a range of IDF values. - + Args: documents (list): Either a :obj:`list` of :obj:`str` or a :obj:`list` of :obj:`list` of :obj:`str` to be
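
A minimal, self-contained sketch (not part of the patch) of the token-filtering rule that the new keep_quasi_numeric flag toggles in clean_and_tokenize: with the flag off, any token containing a digit is dropped; with it on, quasi-numeric tokens such as 'covid-19' survive provided they also contain at least one ASCII letter. The filter_tokens helper and the tiny STOP_WORDS set below are illustrative stand-ins only, assuming a naive pre-tokenized input rather than the package's own tokenizer and stop-word list.

# Sketch of the keep_quasi_numeric filtering behaviour (illustrative only).
import string

STOP_WORDS = {'the', 'over'}  # stand-in for the package's real stop-word list

def filter_tokens(tokens, remove_stops, keep_quasi_numeric=False):
    """Apply the same keep/drop rules as the patched clean_and_tokenize,
    on an already-tokenized sentence."""
    return [token.replace('-', '_') for token in tokens
            if not (remove_stops and len(token) <= 2)
            and (not remove_stops or token not in STOP_WORDS)
            and (keep_quasi_numeric or not any(x in token for x in string.digits))
            and any(x in token for x in string.ascii_lowercase)]

tokens = ['the', 'quick', 'covid-19', 'fox', '2020']
print(filter_tokens(tokens, remove_stops=True))
# ['quick', 'fox']            <- digit-bearing token dropped by default
print(filter_tokens(tokens, remove_stops=True, keep_quasi_numeric=True))
# ['quick', 'covid_19', 'fox'] <- quasi-numeric token kept; pure number '2020' still dropped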