From f442989d60a7ce35afec79fcb8b66ec4b2c7534b Mon Sep 17 00:00:00 2001 From: Dima Veselov Date: Wed, 13 Jan 2021 12:47:02 +0300 Subject: [PATCH] Added mypy and isort linting, updated rusentiment corpus reader --- .../{python-package.yml => test-and-lint.yml} | 10 +++++- bin/dostoevsky | 3 +- dostoevsky/__main__.py | 3 +- dostoevsky/corpora.py | 33 +++++-------------- dostoevsky/data/__init__.py | 5 ++- dostoevsky/models.py | 19 ++++++----- dostoevsky/tokenization.py | 2 +- mypy.ini | 2 ++ requirements/tests.txt | 2 ++ setup.py | 6 ++-- tests/conftest.py | 12 ++----- tests/test_corpora.py | 8 ++--- tests/test_model.py | 0 tests/test_tokenization.py | 2 +- 14 files changed, 47 insertions(+), 60 deletions(-) rename .github/workflows/{python-package.yml => test-and-lint.yml} (87%) create mode 100644 mypy.ini create mode 100644 tests/test_model.py diff --git a/.github/workflows/python-package.yml b/.github/workflows/test-and-lint.yml similarity index 87% rename from .github/workflows/python-package.yml rename to .github/workflows/test-and-lint.yml index e791e04..7fd4c4f 100644 --- a/.github/workflows/python-package.yml +++ b/.github/workflows/test-and-lint.yml @@ -1,7 +1,7 @@ # This workflow will install Python dependencies, run tests and lint with a variety of Python versions # For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions -name: Python package +name: Test & Lint on: push: @@ -34,6 +34,14 @@ jobs: - name: Lint with black run: | black -S -l 120 --check dostoevsky/ + + - name: Lint with isort + run: | + isort --check dostoevsky/ + + - name: Lint with mypy + run: | + mypy dostoevsky/ - name: Test with pytest run: | diff --git a/bin/dostoevsky b/bin/dostoevsky index d12d718..c3e5a82 100644 --- a/bin/dostoevsky +++ b/bin/dostoevsky @@ -4,8 +4,7 @@ import os import sys import typing -from dostoevsky.data import DataDownloader, DATA_BASE_PATH, AVAILABLE_FILES - +from dostoevsky.data import 
AVAILABLE_FILES, DATA_BASE_PATH, DataDownloader if __name__ == '__main__': command: str = sys.argv[1] diff --git a/dostoevsky/__main__.py b/dostoevsky/__main__.py index 1c56876..2c53279 100644 --- a/dostoevsky/__main__.py +++ b/dostoevsky/__main__.py @@ -2,8 +2,7 @@ import sys import typing -from dostoevsky.data import DataDownloader, DATA_BASE_PATH, AVAILABLE_FILES - +from dostoevsky.data import AVAILABLE_FILES, DATA_BASE_PATH, DataDownloader if __name__ == '__main__': if '--dry-run' in sys.argv: diff --git a/dostoevsky/corpora.py b/dostoevsky/corpora.py index 489952d..0254631 100644 --- a/dostoevsky/corpora.py +++ b/dostoevsky/corpora.py @@ -1,49 +1,34 @@ import csv +from abc import ABC, abstractmethod +from typing import Generator, List, Optional, Tuple -from typing import Generator, Optional, List, Tuple -from dostoevsky.tokenization import BaseTokenizer - - -class BaseCorpusContainer: - def get_prepared_data(self) -> Generator[Tuple[List[List[float]], List[int]], None, None]: +class BaseCorpus(ABC): + @abstractmethod + def get_data(self) -> Generator[Tuple[str, str], None, None]: raise NotImplementedError -class RusentimentCorpus(BaseCorpusContainer): +class RusentimentCorpus(BaseCorpus): CSV_DELIMITER: str = ',' CSV_QUOTECHAR: str = '"' - UNKNOWN_LABEL: str = 'unknown' - LABELS: List[str] = [ 'positive', 'negative', 'neutral', 'skip', 'speech', - UNKNOWN_LABEL, ] def __init__( self, data_path: Optional[str], - tokenizer: BaseTokenizer, - lemmatize: bool = True, ): self.data_path = data_path - self.tokenizer = tokenizer - self.lemmatize = lemmatize - self.label_encoder = self.get_label_encoder() - - def get_label_encoder(self): - from sklearn.preprocessing import LabelBinarizer - - label_encoder = LabelBinarizer() - return label_encoder.fit(self.LABELS) - def get_prepared_data(self) -> Generator[Tuple[List[List[float]], List[int]], None, None]: + def get_data(self) -> Generator[Tuple[str, str], None, None]: if not self.data_path: raise ValueError('data_path is 
None') with open(self.data_path, encoding='utf8') as source: @@ -55,6 +40,4 @@ def get_prepared_data(self) -> Generator[Tuple[List[List[float]], List[int]], No for i, (label, text) in enumerate(reader): if i == 0: # skip headers continue - encoded_label, *_ = self.label_encoder.transform([label]) - tokens = self.tokenizer.split(text, lemmatize=self.lemmatize) - yield tokens, encoded_label + yield text, label diff --git a/dostoevsky/data/__init__.py b/dostoevsky/data/__init__.py index cbbd581..a725199 100644 --- a/dostoevsky/data/__init__.py +++ b/dostoevsky/data/__init__.py @@ -1,11 +1,10 @@ -import os import lzma +import os import ssl -import typing import tarfile +import typing import urllib.request - DATA_BASE_PATH: str = os.path.dirname(os.path.abspath(__file__)) STORAGE_BASE_URL: str = 'https://storage.b-labs.pro/' diff --git a/dostoevsky/models.py b/dostoevsky/models.py index 148f7c4..dd41092 100644 --- a/dostoevsky/models.py +++ b/dostoevsky/models.py @@ -1,21 +1,21 @@ import os - -from typing import List, Dict, Optional +from abc import ABC, abstractmethod +from typing import Dict, List, Optional from fasttext import load_model as load_fasttext_model -from dostoevsky.tokenization import BaseTokenizer -from dostoevsky.corpora import BaseCorpusContainer +from dostoevsky.corpora import BaseCorpus from dostoevsky.data import DATA_BASE_PATH +from dostoevsky.tokenization import BaseTokenizer -class BaseModel: +class BaseModel(ABC): def __init__( self, tokenizer: BaseTokenizer, lemmatize: bool = True, model_path: Optional[str] = None, - corpus: Optional[BaseCorpusContainer] = None, + corpus: Optional[BaseCorpus] = None, ): self.model_path = model_path self.tokenizer = tokenizer @@ -23,13 +23,15 @@ def __init__( self.corpus = corpus self.model = self.get_compiled_model() if self.model_path else self.get_raw_model() + @abstractmethod def get_compiled_model(self): raise NotImplementedError def preprocess_input(self, sentences: List[str]): raise NotImplementedError - 
def predict(self, sentences: List[str]): + @abstractmethod + def predict(self, sentences: List[str], **kwargs) -> List[Dict[str, float]]: raise NotImplementedError def get_raw_model(self): @@ -63,7 +65,8 @@ def preprocess_input(self, sentences: List[str]) -> List[str]: for sentence in sentences ] - def predict(self, sentences: List[str], k: int = -1) -> List[Dict[str, float]]: + def predict(self, sentences: List[str], **kwargs) -> List[Dict[str, float]]: + k = kwargs.get('k', -1) X = self.preprocess_input(sentences) Y = (self.model.predict(sentence, k=k) for sentence in X) return [dict(zip((label.replace('__label__', '') for label in labels), scores)) for labels, scores in Y] diff --git a/dostoevsky/tokenization.py b/dostoevsky/tokenization.py index e1a34c9..8b99837 100644 --- a/dostoevsky/tokenization.py +++ b/dostoevsky/tokenization.py @@ -1,5 +1,5 @@ from abc import ABC, abstractmethod -from typing import List, Tuple, Optional +from typing import List, Optional, Tuple from razdel import tokenize as regex_tokenize diff --git a/mypy.ini b/mypy.ini new file mode 100644 index 0000000..976ba02 --- /dev/null +++ b/mypy.ini @@ -0,0 +1,2 @@ +[mypy] +ignore_missing_imports = True diff --git a/requirements/tests.txt b/requirements/tests.txt index 08703d7..73f6184 100644 --- a/requirements/tests.txt +++ b/requirements/tests.txt @@ -4,3 +4,5 @@ wheel==0.36.2 pytest==6.2.1 black==20.8b1 scikit-learn==0.24.0 +isort==5.7.0 +mypy==0.790 diff --git a/setup.py b/setup.py index 58f3263..2cbfd61 100644 --- a/setup.py +++ b/setup.py @@ -1,8 +1,6 @@ from typing import List -from setuptools import ( - setup, - find_packages, -) + +from setuptools import find_packages, setup def get_long_description() -> str: diff --git a/tests/conftest.py b/tests/conftest.py index 8420593..abf048b 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1,12 +1,10 @@ import os -import pytest +import pytest -from dostoevsky.tokenization import ( - RegexTokenizer, -) from dostoevsky.corpora import 
RusentimentCorpus -from dostoevsky.data import DataDownloader, DATA_BASE_PATH +from dostoevsky.data import DATA_BASE_PATH, DataDownloader +from dostoevsky.tokenization import RegexTokenizer @pytest.fixture(scope='session') @@ -58,20 +56,16 @@ def rusentiment_test_corpus_path(rusentiment_corpus_data): @pytest.fixture(scope='session') def rusentiment_corpus( rusentiment_corpus_path, - regex_tokenizer, ): return RusentimentCorpus( data_path=rusentiment_corpus_path, - tokenizer=regex_tokenizer, ) @pytest.fixture(scope='session') def rusentiment_test_corpus( rusentiment_test_corpus_path, - regex_tokenizer, ): return RusentimentCorpus( data_path=rusentiment_test_corpus_path, - tokenizer=regex_tokenizer, ) diff --git a/tests/test_corpora.py b/tests/test_corpora.py index 152cf2e..5d71432 100644 --- a/tests/test_corpora.py +++ b/tests/test_corpora.py @@ -1,11 +1,11 @@ -def test_rusentiment_corpus_get_prepared_data( +def test_rusentiment_corpus_get_data( rusentiment_corpus, ): X_train, y_train = [], [] - for i, (vectors, label) in enumerate( - rusentiment_corpus.get_prepared_data() + for i, (text, label) in enumerate( + rusentiment_corpus.get_data() ): - X_train.append(vectors) + X_train.append(text) y_train.append(label) if i >= 9: break diff --git a/tests/test_model.py b/tests/test_model.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/test_tokenization.py b/tests/test_tokenization.py index 76a88cd..fe75f6f 100644 --- a/tests/test_tokenization.py +++ b/tests/test_tokenization.py @@ -1,4 +1,4 @@ -from typing import List, Tuple, Optional +from typing import List, Optional, Tuple def test_regex_tokenizer_base_case(regex_tokenizer):