diff --git a/requirements.txt b/requirements.txt
index a22e542..d0c2526 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -2,3 +2,4 @@ future==0.16.0
 Cython==0.27.2
 cysignals==1.6.5
 pyfasttext==0.4.3
+regex==2019.3.12
diff --git a/whatthelang/predict_lang.py b/whatthelang/predict_lang.py
index 7b6dba5..32f4ee6 100644
--- a/whatthelang/predict_lang.py
+++ b/whatthelang/predict_lang.py
@@ -1,6 +1,6 @@
 from pyfasttext import FastText
 from os import path
-import re
+import regex as re
 
 MODEL_FILE = path.join(path.dirname(__file__), 'model', 'lid.176.ftz')
 
@@ -14,7 +14,9 @@ def load_model(self):
         return FastText(self.model_file)
 
     def _clean_up(self,txt):
-        txt = re.sub(r"\b\d+\b", "", txt)
+        """Return txt without standalone digit runs or punctuation, with newlines turned into spaces."""
+        # Newlines become a space (not '') so words on adjacent lines are not fused together.
+        txt = re.sub(r'\b\d+\b|\p{P}', '', txt).replace('\n', ' ')
         return txt
 
     def _flatten(self,pred):