Skip to content
This repository has been archived by the owner on Oct 31, 2023. It is now read-only.

Commit

Permalink
Added mypy and isort linting, updated rusentiment corpus reader
Browse files: browse the repository at this point in the history
  • Loading branch information
dveselov committed Jan 13, 2021
1 parent 0137031 commit f442989
Show file tree
Hide file tree
Showing 14 changed files with 47 additions and 60 deletions.
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
# This workflow will install Python dependencies, run tests and lint with a variety of Python versions
# For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions

name: Python package
name: Test & Lint

on:
push:
Expand Down Expand Up @@ -34,6 +34,14 @@ jobs:
- name: Lint with black
run: |
black -S -l 120 --check dostoevsky/
- name: Lint with isort
run: |
isort --check dostoevsky/
- name: Lint with mypy
run: |
mypy dostoevsky/
- name: Test with pytest
run: |
Expand Down
3 changes: 1 addition & 2 deletions bin/dostoevsky
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,7 @@ import os
import sys
import typing

from dostoevsky.data import DataDownloader, DATA_BASE_PATH, AVAILABLE_FILES

from dostoevsky.data import AVAILABLE_FILES, DATA_BASE_PATH, DataDownloader

if __name__ == '__main__':
command: str = sys.argv[1]
Expand Down
3 changes: 1 addition & 2 deletions dostoevsky/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,7 @@
import sys
import typing

from dostoevsky.data import DataDownloader, DATA_BASE_PATH, AVAILABLE_FILES

from dostoevsky.data import AVAILABLE_FILES, DATA_BASE_PATH, DataDownloader

if __name__ == '__main__':
if '--dry-run' in sys.argv:
Expand Down
33 changes: 8 additions & 25 deletions dostoevsky/corpora.py
Original file line number Diff line number Diff line change
@@ -1,49 +1,34 @@
import csv
from abc import ABC, abstractmethod
from typing import Generator, List, Optional, Tuple

from typing import Generator, Optional, List, Tuple

from dostoevsky.tokenization import BaseTokenizer


class BaseCorpusContainer:
def get_prepared_data(self) -> Generator[Tuple[List[List[float]], List[int]], None, None]:
class BaseCorpus(ABC):
@abstractmethod
def get_data(self) -> Generator[Tuple[str, str], None, None]:
raise NotImplementedError


class RusentimentCorpus(BaseCorpusContainer):
class RusentimentCorpus(BaseCorpus):

CSV_DELIMITER: str = ','
CSV_QUOTECHAR: str = '"'

UNKNOWN_LABEL: str = 'unknown'

LABELS: List[str] = [
'positive',
'negative',
'neutral',
'skip',
'speech',
UNKNOWN_LABEL,
]

def __init__(
self,
data_path: Optional[str],
tokenizer: BaseTokenizer,
lemmatize: bool = True,
):
self.data_path = data_path
self.tokenizer = tokenizer
self.lemmatize = lemmatize
self.label_encoder = self.get_label_encoder()

def get_label_encoder(self):
from sklearn.preprocessing import LabelBinarizer

label_encoder = LabelBinarizer()
return label_encoder.fit(self.LABELS)

def get_prepared_data(self) -> Generator[Tuple[List[List[float]], List[int]], None, None]:
def get_data(self) -> Generator[Tuple[str, str], None, None]:
if not self.data_path:
raise ValueError('data_path is None')
with open(self.data_path, encoding='utf8') as source:
Expand All @@ -55,6 +40,4 @@ def get_prepared_data(self) -> Generator[Tuple[List[List[float]], List[int]], No
for i, (label, text) in enumerate(reader):
if i == 0: # skip headers
continue
encoded_label, *_ = self.label_encoder.transform([label])
tokens = self.tokenizer.split(text, lemmatize=self.lemmatize)
yield tokens, encoded_label
yield text, label
5 changes: 2 additions & 3 deletions dostoevsky/data/__init__.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,10 @@
import os
import lzma
import os
import ssl
import typing
import tarfile
import typing
import urllib.request


DATA_BASE_PATH: str = os.path.dirname(os.path.abspath(__file__))
STORAGE_BASE_URL: str = 'https://storage.b-labs.pro/'

Expand Down
19 changes: 11 additions & 8 deletions dostoevsky/models.py
Original file line number Diff line number Diff line change
@@ -1,35 +1,37 @@
import os

from typing import List, Dict, Optional
from abc import ABC, abstractmethod
from typing import Dict, List, Optional

from fasttext import load_model as load_fasttext_model

from dostoevsky.tokenization import BaseTokenizer
from dostoevsky.corpora import BaseCorpusContainer
from dostoevsky.corpora import BaseCorpus
from dostoevsky.data import DATA_BASE_PATH
from dostoevsky.tokenization import BaseTokenizer


class BaseModel:
class BaseModel(ABC):
def __init__(
self,
tokenizer: BaseTokenizer,
lemmatize: bool = True,
model_path: Optional[str] = None,
corpus: Optional[BaseCorpusContainer] = None,
corpus: Optional[BaseCorpus] = None,
):
self.model_path = model_path
self.tokenizer = tokenizer
self.lemmatize = lemmatize
self.corpus = corpus
self.model = self.get_compiled_model() if self.model_path else self.get_raw_model()

@abstractmethod
def get_compiled_model(self):
raise NotImplementedError

def preprocess_input(self, sentences: List[str]):
raise NotImplementedError

def predict(self, sentences: List[str]):
@abstractmethod
def predict(self, sentences: List[str], **kwargs) -> List[Dict[str, float]]:
raise NotImplementedError

def get_raw_model(self):
Expand Down Expand Up @@ -63,7 +65,8 @@ def preprocess_input(self, sentences: List[str]) -> List[str]:
for sentence in sentences
]

def predict(self, sentences: List[str], k: int = -1) -> List[Dict[str, float]]:
def predict(self, sentences: List[str], **kwargs) -> List[Dict[str, float]]:
k = kwargs.get('k', -1)
X = self.preprocess_input(sentences)
Y = (self.model.predict(sentence, k=k) for sentence in X)
return [dict(zip((label.replace('__label__', '') for label in labels), scores)) for labels, scores in Y]
Expand Down
2 changes: 1 addition & 1 deletion dostoevsky/tokenization.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
from abc import ABC, abstractmethod
from typing import List, Tuple, Optional
from typing import List, Optional, Tuple

from razdel import tokenize as regex_tokenize

Expand Down
2 changes: 2 additions & 0 deletions mypy.ini
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
[mypy]
ignore_missing_imports = True
2 changes: 2 additions & 0 deletions requirements/tests.txt
Original file line number Diff line number Diff line change
Expand Up @@ -4,3 +4,5 @@ wheel==0.36.2
pytest==6.2.1
black==20.8b1
scikit-learn==0.24.0
isort==5.7.0
mypy==0.790
6 changes: 2 additions & 4 deletions setup.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,6 @@
from typing import List
from setuptools import (
setup,
find_packages,
)

from setuptools import find_packages, setup


def get_long_description() -> str:
Expand Down
12 changes: 3 additions & 9 deletions tests/conftest.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,10 @@
import os
import pytest

import pytest

from dostoevsky.tokenization import (
RegexTokenizer,
)
from dostoevsky.corpora import RusentimentCorpus
from dostoevsky.data import DataDownloader, DATA_BASE_PATH
from dostoevsky.data import DATA_BASE_PATH, DataDownloader
from dostoevsky.tokenization import RegexTokenizer


@pytest.fixture(scope='session')
Expand Down Expand Up @@ -58,20 +56,16 @@ def rusentiment_test_corpus_path(rusentiment_corpus_data):
@pytest.fixture(scope='session')
def rusentiment_corpus(
rusentiment_corpus_path,
regex_tokenizer,
):
return RusentimentCorpus(
data_path=rusentiment_corpus_path,
tokenizer=regex_tokenizer,
)


@pytest.fixture(scope='session')
def rusentiment_test_corpus(
rusentiment_test_corpus_path,
regex_tokenizer,
):
return RusentimentCorpus(
data_path=rusentiment_test_corpus_path,
tokenizer=regex_tokenizer,
)
8 changes: 4 additions & 4 deletions tests/test_corpora.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,11 @@
def test_rusentiment_corpus_get_prepared_data(
def test_rusentiment_corpus_get_data(
rusentiment_corpus,
):
X_train, y_train = [], []
for i, (vectors, label) in enumerate(
rusentiment_corpus.get_prepared_data()
for i, (text, label) in enumerate(
rusentiment_corpus.get_data()
):
X_train.append(vectors)
X_train.append(text)
y_train.append(label)
if i >= 9:
break
Expand Down
Empty file added tests/test_model.py
Empty file.
2 changes: 1 addition & 1 deletion tests/test_tokenization.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from typing import List, Tuple, Optional
from typing import List, Optional, Tuple


def test_regex_tokenizer_base_case(regex_tokenizer):
Expand Down

0 comments on commit f442989

Please sign in to comment.