Change the import of kenlm from github to pypi (huggingface#19770)

* Change the import of kenlm from github to pypi * Change the import of kenlm from github to pypi in circleci config * Fix code quality issues * Fix isort issue, add kenlm in extras for audio * Add kenlm to deps * Add kenlm to deps * Commit 'make fixup' changes * Remove version from kenlm deps * commit make fixup changes * Remove manual installation of kenlm * Remove manual installation of kenlm * Remove manual installation of kenlm
ymwangg · Oct 26, 2022 · 7829c89 · 7829c89
1 parent aeae978
commit 7829c89
Show file tree

Hide file tree

Showing 9 changed files with 25 additions and 16 deletions.
diff --git a/.circleci/create_circleci_config.py b/.circleci/create_circleci_config.py
@@ -127,7 +127,6 @@ def job_name(self):
         "pip install .[sklearn,tf-cpu,torch,testing,sentencepiece,torch-speech,vision]",
         TORCH_SCATTER_INSTALL,
         "pip install tensorflow_probability",
-        "pip install https://github.com/kpu/kenlm/archive/master.zip",
         "pip install git+https://github.com/huggingface/accelerate",
     ],
     marker="is_pt_tf_cross_test",
@@ -143,7 +142,6 @@ def job_name(self):
         "pip install --upgrade pip",
         "pip install .[sklearn,flax,torch,testing,sentencepiece,torch-speech,vision]",
         TORCH_SCATTER_INSTALL,
-        "pip install https://github.com/kpu/kenlm/archive/master.zip",
         "pip install git+https://github.com/huggingface/accelerate",
     ],
     marker="is_pt_flax_cross_test",
@@ -158,7 +156,6 @@ def job_name(self):
         "pip install --upgrade pip",
         "pip install .[sklearn,torch,testing,sentencepiece,torch-speech,vision,timm]",
         TORCH_SCATTER_INSTALL,
-        "pip install https://github.com/kpu/kenlm/archive/master.zip",
         "pip install git+https://github.com/huggingface/accelerate",
     ],
     pytest_num_workers=3,
@@ -172,7 +169,6 @@ def job_name(self):
         "pip install --upgrade pip",
         "pip install .[sklearn,tf-cpu,testing,sentencepiece,tf-speech,vision]",
         "pip install tensorflow_probability",
-        "pip install https://github.com/kpu/kenlm/archive/master.zip",
     ],
     pytest_options={"rA": None},
 )
@@ -184,7 +180,6 @@ def job_name(self):
         "sudo apt-get -y update && sudo apt-get install -y libsndfile1-dev espeak-ng",
         "pip install --upgrade pip",
         "pip install .[flax,testing,sentencepiece,flax-speech,vision]",
-        "pip install https://github.com/kpu/kenlm/archive/master.zip",
     ],
     pytest_options={"rA": None},
 )
@@ -197,7 +192,6 @@ def job_name(self):
         "pip install --upgrade pip",
         "pip install .[sklearn,torch,testing,sentencepiece,torch-speech,vision,timm]",
         TORCH_SCATTER_INSTALL,
-        "pip install https://github.com/kpu/kenlm/archive/master.zip",
     ],
     pytest_options={"rA": None},
     tests_to_run="tests/pipelines/"

diff --git a/docker/transformers-all-latest-gpu/Dockerfile b/docker/transformers-all-latest-gpu/Dockerfile
@@ -46,7 +46,7 @@ RUN python3 -m pip install --no-cache-dir git+https://github.com/onnx/tensorflow
 RUN python3 -m pip install --no-cache-dir torch-scatter -f https://data.pyg.org/whl/torch-$(python3 -c "from torch import version; print(version.__version__.split('+')[0])")+$CUDA.html
 RUN python3 -m pip install --no-cache-dir intel_extension_for_pytorch==$INTEL_TORCH_EXT+cpu -f https://software.intel.com/ipex-whl-stable
 
-RUN python3 -m pip install --no-cache-dir git+https://github.com/facebookresearch/detectron2.git pytesseract https://github.com/kpu/kenlm/archive/master.zip
+RUN python3 -m pip install --no-cache-dir git+https://github.com/facebookresearch/detectron2.git pytesseract
 RUN python3 -m pip install -U "itsdangerous<2.1.0"
 
 RUN python3 -m pip install --no-cache-dir git+https://github.com/huggingface/accelerate@main#egg=accelerate

diff --git a/docker/transformers-doc-builder/Dockerfile b/docker/transformers-doc-builder/Dockerfile
@@ -11,7 +11,7 @@ RUN apt-get -y update && apt-get install -y libsndfile1-dev && apt install -y te
 RUN python3 -m pip install --no-cache-dir ./transformers[deepspeed]
 
 RUN python3 -m pip install --no-cache-dir torch-scatter -f https://data.pyg.org/whl/torch-$(python -c "from torch import version; print(version.__version__.split('+')[0])")+cpu.html
-RUN python3 -m pip install --no-cache-dir torchvision git+https://github.com/facebookresearch/detectron2.git pytesseract https://github.com/kpu/kenlm/archive/master.zip
+RUN python3 -m pip install --no-cache-dir torchvision git+https://github.com/facebookresearch/detectron2.git pytesseract
 RUN python3 -m pip install --no-cache-dir pytorch-quantization --extra-index-url https://pypi.ngc.nvidia.com
 RUN python3 -m pip install -U "itsdangerous<2.1.0"
 

diff --git a/docker/transformers-pytorch-gpu/Dockerfile b/docker/transformers-pytorch-gpu/Dockerfile
@@ -23,7 +23,7 @@ RUN [ ${#TORCH_AUDIO} -gt 0 ] && VERSION='torchaudio=='TORCH_AUDIO'.*' ||  VERSI
 RUN python3 -m pip uninstall -y tensorflow flax
 
 RUN python3 -m pip install --no-cache-dir torch-scatter -f https://data.pyg.org/whl/torch-$(python3 -c "from torch import version; print(version.__version__.split('+')[0])")+cu113.html
-RUN python3 -m pip install --no-cache-dir git+https://github.com/facebookresearch/detectron2.git pytesseract https://github.com/kpu/kenlm/archive/master.zip
+RUN python3 -m pip install --no-cache-dir git+https://github.com/facebookresearch/detectron2.git pytesseract
 RUN python3 -m pip install -U "itsdangerous<2.1.0"
 
 # When installing in editable mode, `transformers` is not recognized as a package.

diff --git a/setup.py b/setup.py
@@ -123,6 +123,7 @@
     "jax>=0.2.8,!=0.3.2,<=0.3.6",
     "jaxlib>=0.1.65,<=0.3.6",
     "jieba",
+    "kenlm",
     "nltk",
     "numpy>=1.17",
     "onnxconverter-common",
@@ -274,7 +275,7 @@ def run(self):
 extras["integrations"] = extras["optuna"] + extras["ray"] + extras["sigopt"]
 
 extras["serving"] = deps_list("pydantic", "uvicorn", "fastapi", "starlette")
-extras["audio"] = deps_list("librosa", "pyctcdecode", "phonemizer")
+extras["audio"] = deps_list("librosa", "pyctcdecode", "phonemizer", "kenlm")
 # `pip install ".[speech]"` is deprecated and `pip install ".[torch-speech]"` should be used instead
 extras["speech"] = deps_list("torchaudio") + extras["audio"]
 extras["torch-speech"] = deps_list("torchaudio") + extras["audio"]

diff --git a/src/transformers/dependency_versions_table.py b/src/transformers/dependency_versions_table.py
@@ -29,6 +29,7 @@
     "jax": "jax>=0.2.8,!=0.3.2,<=0.3.6",
     "jaxlib": "jaxlib>=0.1.65,<=0.3.6",
     "jieba": "jieba",
+    "kenlm": "kenlm",
     "nltk": "nltk",
     "numpy": "numpy>=1.17",
     "onnxconverter-common": "onnxconverter-common",

diff --git a/src/transformers/pipelines/__init__.py b/src/transformers/pipelines/__init__.py
@@ -36,7 +36,14 @@
 from ..models.auto.tokenization_auto import TOKENIZER_MAPPING, AutoTokenizer
 from ..tokenization_utils import PreTrainedTokenizer
 from ..tokenization_utils_fast import PreTrainedTokenizerFast
-from ..utils import HUGGINGFACE_CO_RESOLVE_ENDPOINT, is_tf_available, is_torch_available, logging
+from ..utils import (
+    HUGGINGFACE_CO_RESOLVE_ENDPOINT,
+    is_kenlm_available,
+    is_pyctcdecode_available,
+    is_tf_available,
+    is_torch_available,
+    logging,
+)
 from .audio_classification import AudioClassificationPipeline
 from .automatic_speech_recognition import AutomaticSpeechRecognitionPipeline
 from .base import (
@@ -837,11 +844,12 @@ def pipeline(
 
                     kwargs["decoder"] = decoder
                 except ImportError as e:
-                    logger.warning(
-                        f"Could not load the `decoder` for {model_name}. Defaulting to raw CTC. Try to install"
-                        " `pyctcdecode` and `kenlm`: (`pip install pyctcdecode`, `pip install"
-                        f" https://github.com/kpu/kenlm/archive/master.zip`): Error: {e}"
-                    )
+                    logger.warning(f"Could not load the `decoder` for {model_name}. Defaulting to raw CTC. Error: {e}")
+                    if not is_kenlm_available():
+                        logger.warning("Try to install `kenlm`: `pip install kenlm")
+
+                    if not is_pyctcdecode_available():
+                        logger.warning("Try to install `pyctcdecode`: `pip install pyctcdecode")
 
     if task == "translation" and model.config.task_specific_params:
         for key in model.config.task_specific_params:

diff --git a/src/transformers/utils/__init__.py b/src/transformers/utils/__init__.py
@@ -108,6 +108,7 @@
     is_in_notebook,
     is_ipex_available,
     is_jumanpp_available,
+    is_kenlm_available,
     is_librosa_available,
     is_more_itertools_available,
     is_ninja_available,

diff --git a/src/transformers/utils/import_utils.py b/src/transformers/utils/import_utils.py
@@ -271,6 +271,10 @@
 TORCH_ONNX_DICT_INPUTS_MINIMUM_VERSION = version.parse("1.8")
 
 
+def is_kenlm_available():
+    return importlib.util.find_spec("kenlm") is not None
+
+
 def is_torch_available():
     return _torch_available