From f442989d60a7ce35afec79fcb8b66ec4b2c7534b Mon Sep 17 00:00:00 2001 From: Dima Veselov Date: Wed, 13 Jan 2021 12:47:02 +0300 Subject: [PATCH] Added mypy and isort linting, updated rusentiment corpus reader --- .../{python-package.yml => test-and-lint.yml} | 10 +++++- bin/dostoevsky | 3 +- dostoevsky/__main__.py | 3 +- dostoevsky/corpora.py | 33 +++++-------------- dostoevsky/data/__init__.py | 5 ++- dostoevsky/models.py | 19 ++++++----- dostoevsky/tokenization.py | 2 +- mypy.ini | 2 ++ requirements/tests.txt | 2 ++ setup.py | 6 ++-- tests/conftest.py | 12 ++----- tests/test_corpora.py | 8 ++--- tests/test_model.py | 0 tests/test_tokenization.py | 2 +- 14 files changed, 47 insertions(+), 60 deletions(-) rename .github/workflows/{python-package.yml => test-and-lint.yml} (87%) create mode 100644 mypy.ini create mode 100644 tests/test_model.py diff --git a/.github/workflows/python-package.yml b/.github/workflows/test-and-lint.yml similarity index 87% rename from .github/workflows/python-package.yml rename to .github/workflows/test-and-lint.yml index e791e04..7fd4c4f 100644 --- a/.github/workflows/python-package.yml +++ b/.github/workflows/test-and-lint.yml @@ -1,7 +1,7 @@ # This workflow will install Python dependencies, run tests and lint with a variety of Python versions # For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions -name: Python package +name: Test & Lint on: push: @@ -34,6 +34,14 @@ jobs: - name: Lint with black run: | black -S -l 120 --check dostoevsky/ + + - name: Lint with isort + run: | + isort --check dostoevsky/ + + - name: Lint with mypy + run: | + mypy dostoevsky/ - name: Test with pytest run: | diff --git a/bin/dostoevsky b/bin/dostoevsky index d12d718..c3e5a82 100644 --- a/bin/dostoevsky +++ b/bin/dostoevsky @@ -4,8 +4,7 @@ import os import sys import typing -from dostoevsky.data import DataDownloader, DATA_BASE_PATH, AVAILABLE_FILES - +from dostoevsky.data import 
AVAILABLE_FILES, DATA_BASE_PATH, DataDownloader if __name__ == '__main__': command: str = sys.argv[1] diff --git a/dostoevsky/__main__.py b/dostoevsky/__main__.py index 1c56876..2c53279 100644 --- a/dostoevsky/__main__.py +++ b/dostoevsky/__main__.py @@ -2,8 +2,7 @@ import sys import typing -from dostoevsky.data import DataDownloader, DATA_BASE_PATH, AVAILABLE_FILES - +from dostoevsky.data import AVAILABLE_FILES, DATA_BASE_PATH, DataDownloader if __name__ == '__main__': if '--dry-run' in sys.argv: diff --git a/dostoevsky/corpora.py b/dostoevsky/corpora.py index 489952d..0254631 100644 --- a/dostoevsky/corpora.py +++ b/dostoevsky/corpora.py @@ -1,49 +1,34 @@ import csv +from abc import ABC, abstractmethod +from typing import Generator, List, Optional, Tuple -from typing import Generator, Optional, List, Tuple -from dostoevsky.tokenization import BaseTokenizer - - -class BaseCorpusContainer: - def get_prepared_data(self) -> Generator[Tuple[List[List[float]], List[int]], None, None]: +class BaseCorpus(ABC): + @abstractmethod + def get_data(self) -> Generator[Tuple[str, str], None, None]: raise NotImplementedError -class RusentimentCorpus(BaseCorpusContainer): +class RusentimentCorpus(BaseCorpus): CSV_DELIMITER: str = ',' CSV_QUOTECHAR: str = '"' - UNKNOWN_LABEL: str = 'unknown' - LABELS: List[str] = [ 'positive', 'negative', 'neutral', 'skip', 'speech', - UNKNOWN_LABEL, ] def __init__( self, data_path: Optional[str], - tokenizer: BaseTokenizer, - lemmatize: bool = True, ): self.data_path = data_path - self.tokenizer = tokenizer - self.lemmatize = lemmatize - self.label_encoder = self.get_label_encoder() - - def get_label_encoder(self): - from sklearn.preprocessing import LabelBinarizer - - label_encoder = LabelBinarizer() - return label_encoder.fit(self.LABELS) - def get_prepared_data(self) -> Generator[Tuple[List[List[float]], List[int]], None, None]: + def get_data(self) -> Generator[Tuple[str, str], None, None]: if not self.data_path: raise ValueError('data_path is 
None') with open(self.data_path, encoding='utf8') as source: @@ -55,6 +40,4 @@ def get_prepared_data(self) -> Generator[Tuple[List[List[float]], List[int]], No for i, (label, text) in enumerate(reader): if i == 0: # skip headers continue - encoded_label, *_ = self.label_encoder.transform([label]) - tokens = self.tokenizer.split(text, lemmatize=self.lemmatize) - yield tokens, encoded_label + yield text, label diff --git a/dostoevsky/data/__init__.py b/dostoevsky/data/__init__.py index cbbd581..a725199 100644 --- a/dostoevsky/data/__init__.py +++ b/dostoevsky/data/__init__.py @@ -1,11 +1,10 @@ -import os import lzma +import os import ssl -import typing import tarfile +import typing import urllib.request - DATA_BASE_PATH: str = os.path.dirname(os.path.abspath(__file__)) STORAGE_BASE_URL: str = 'https://storage.b-labs.pro/' diff --git a/dostoevsky/models.py b/dostoevsky/models.py index 148f7c4..dd41092 100644 --- a/dostoevsky/models.py +++ b/dostoevsky/models.py @@ -1,21 +1,21 @@ import os - -from typing import List, Dict, Optional +from abc import ABC, abstractmethod +from typing import Dict, List, Optional from fasttext import load_model as load_fasttext_model -from dostoevsky.tokenization import BaseTokenizer -from dostoevsky.corpora import BaseCorpusContainer +from dostoevsky.corpora import BaseCorpus from dostoevsky.data import DATA_BASE_PATH +from dostoevsky.tokenization import BaseTokenizer -class BaseModel: +class BaseModel(ABC): def __init__( self, tokenizer: BaseTokenizer, lemmatize: bool = True, model_path: Optional[str] = None, - corpus: Optional[BaseCorpusContainer] = None, + corpus: Optional[BaseCorpus] = None, ): self.model_path = model_path self.tokenizer = tokenizer @@ -23,13 +23,15 @@ def __init__( self.corpus = corpus self.model = self.get_compiled_model() if self.model_path else self.get_raw_model() + @abstractmethod def get_compiled_model(self): raise NotImplementedError def preprocess_input(self, sentences: List[str]): raise NotImplementedError - 
def predict(self, sentences: List[str]): + @abstractmethod + def predict(self, sentences: List[str], **kwargs) -> List[Dict[str, float]]: raise NotImplementedError def get_raw_model(self): @@ -63,7 +65,8 @@ def preprocess_input(self, sentences: List[str]) -> List[str]: for sentence in sentences ] - def predict(self, sentences: List[str], k: int = -1) -> List[Dict[str, float]]: + def predict(self, sentences: List[str], **kwargs) -> List[Dict[str, float]]: + k = kwargs.get('k', -1) X = self.preprocess_input(sentences) Y = (self.model.predict(sentence, k=k) for sentence in X) return [dict(zip((label.replace('__label__', '') for label in labels), scores)) for labels, scores in Y] diff --git a/dostoevsky/tokenization.py b/dostoevsky/tokenization.py index e1a34c9..8b99837 100644 --- a/dostoevsky/tokenization.py +++ b/dostoevsky/tokenization.py @@ -1,5 +1,5 @@ from abc import ABC, abstractmethod -from typing import List, Tuple, Optional +from typing import List, Optional, Tuple from razdel import tokenize as regex_tokenize diff --git a/mypy.ini b/mypy.ini new file mode 100644 index 0000000..976ba02 --- /dev/null +++ b/mypy.ini @@ -0,0 +1,2 @@ +[mypy] +ignore_missing_imports = True diff --git a/requirements/tests.txt b/requirements/tests.txt index 08703d7..73f6184 100644 --- a/requirements/tests.txt +++ b/requirements/tests.txt @@ -4,3 +4,5 @@ wheel==0.36.2 pytest==6.2.1 black==20.8b1 scikit-learn==0.24.0 +isort==5.7.0 +mypy==0.790 diff --git a/setup.py b/setup.py index 58f3263..2cbfd61 100644 --- a/setup.py +++ b/setup.py @@ -1,8 +1,6 @@ from typing import List -from setuptools import ( - setup, - find_packages, -) + +from setuptools import find_packages, setup def get_long_description() -> str: diff --git a/tests/conftest.py b/tests/conftest.py index 8420593..abf048b 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1,12 +1,10 @@ import os -import pytest +import pytest -from dostoevsky.tokenization import ( - RegexTokenizer, -) from dostoevsky.corpora import 
RusentimentCorpus -from dostoevsky.data import DataDownloader, DATA_BASE_PATH +from dostoevsky.data import DATA_BASE_PATH, DataDownloader +from dostoevsky.tokenization import RegexTokenizer @pytest.fixture(scope='session') @@ -58,20 +56,16 @@ def rusentiment_test_corpus_path(rusentiment_corpus_data): @pytest.fixture(scope='session') def rusentiment_corpus( rusentiment_corpus_path, - regex_tokenizer, ): return RusentimentCorpus( data_path=rusentiment_corpus_path, - tokenizer=regex_tokenizer, ) @pytest.fixture(scope='session') def rusentiment_test_corpus( rusentiment_test_corpus_path, - regex_tokenizer, ): return RusentimentCorpus( data_path=rusentiment_test_corpus_path, - tokenizer=regex_tokenizer, ) diff --git a/tests/test_corpora.py b/tests/test_corpora.py index 152cf2e..5d71432 100644 --- a/tests/test_corpora.py +++ b/tests/test_corpora.py @@ -1,11 +1,11 @@ -def test_rusentiment_corpus_get_prepared_data( +def test_rusentiment_corpus_get_data( rusentiment_corpus, ): X_train, y_train = [], [] - for i, (vectors, label) in enumerate( - rusentiment_corpus.get_prepared_data() + for i, (text, label) in enumerate( + rusentiment_corpus.get_data() ): - X_train.append(vectors) + X_train.append(text) y_train.append(label) if i >= 9: break diff --git a/tests/test_model.py b/tests/test_model.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/test_tokenization.py b/tests/test_tokenization.py index 76a88cd..fe75f6f 100644 --- a/tests/test_tokenization.py +++ b/tests/test_tokenization.py @@ -1,4 +1,4 @@ -from typing import List, Tuple, Optional +from typing import List, Optional, Tuple def test_regex_tokenizer_base_case(regex_tokenizer):