Commit

my variant sberbank-ai#2
Somewater committed Aug 9, 2018
1 parent 5b70738 commit 840fc98
Showing 8 changed files with 255 additions and 111 deletions.
8 changes: 6 additions & 2 deletions create_image.py
@@ -23,5 +23,9 @@ def zip_files(zip, source_dir, files_pattern = '*', exclude_pattern = '__pycache
output_filename = os.path.join(root_path, 'my-%s.zip' % now)
with zipfile.ZipFile(output_filename, "w", zipfile.ZIP_DEFLATED) as zip:
    zip_files(zip, os.path.join(root_path, 'my'), '*')
    zip_files(zip, os.path.join(root_path, 'data'), ['stop_words.csv', 'classic_poems.json'])
    zip.write(os.path.join(root_path, 'my', 'metadata.json'), 'metadata.json')
    zip_files(zip, os.path.join(root_path, 'data'), ['stop_words.csv', 'words_accent.json.bz2'])
    zip_files(zip, os.path.join(root_path, 'tmp'), ['wiki_corpus_w2v.bin',
                                                    'wiki_corpus_w2v.bin.trainables.syn1neg.npy',
                                                    'wiki_corpus_w2v.bin.wv.vectors.npy'])
    zip.write(os.path.join(root_path, 'my', 'metadata.json'), 'metadata.json')
    zip.write(os.path.join(root_path, 'my', 'server.py'), 'server.py')
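A quick way to sanity-check the resulting archive (hypothetical snippet, not part of the commit; the archive name is assumed, since the real one embeds the current timestamp):

import zipfile

# Hypothetical check: list the archive produced by create_image.py and
# confirm the model files added in this commit made it in.
archive_name = 'my-2018-08-09.zip'   # assumed name; create_image.py uses 'my-%s.zip' % now
with zipfile.ZipFile(archive_name) as archive:
    names = archive.namelist()
    for expected in ('metadata.json', 'server.py', 'wiki_corpus_w2v.bin'):
        status = 'found' if any(n.endswith(expected) for n in names) else 'MISSING'
        print(expected, status)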
1 change: 0 additions & 1 deletion data/words_accent.json

This file was deleted.

25 changes: 23 additions & 2 deletions my/corpus_w2v.py
@@ -8,6 +8,8 @@
from my.utils import stem, lemma
import os
from gensim.corpora import WikiCorpus
from scipy.spatial.distance import cosine
import numpy as np

class CorpusW2v(object):
    def __init__(self, corpus: Corpus, reader: DataReader, vector_size: int = 100):
@@ -27,7 +29,6 @@ def sentences(self, stemm: bool = False, lemmatize: bool = False) -> Iterator[It
            if stemm:
                tokens = [stem(w) for w in tokens]
            i += 1
            #if i % 100000 == 0: print(i, 'topics iterated')
            yield tokens

    def train(self):
@@ -39,6 +40,7 @@ def train(self):
    def load(self):
        self.model = Word2Vec.load(self.model_filepath)
        self.model.init_sims(replace=True)
        #self.lemma2word = {word.split('_')[0]: word for word in self.model.wv.index2word}

    def find_similar_words(self, words: List[str], stemmer: Callable[[str], str] = None) -> Iterator[str]:
        word_in_corpus = []
@@ -50,7 +52,6 @@ def find_similar_words(self, words: List[str], stemmer: Callable[[str], str] = N
        if word_in_corpus:
            for w, score in self.model.wv.most_similar(positive=word_in_corpus, topn=1000):
                yield w
        #return .most_similar(positive=[])

    def accuracy(self) -> Tuple[float, Dict[str, Tuple[int, int]]]:
        topics = self.reader.read_check_topics()
@@ -66,6 +67,26 @@ def accuracy(self) -> Tuple[float, Dict[str, Tuple[int, int]]]:
            result_acc.append(union / n)
        return sum(result_acc) / len(result_acc), result_data

    def word_vector(self, word):
        word = lemma(word)
        #word = self.lemma2word.get(lemma)
        return self.model[word] if word in self.model else None

    def text_vector(self, text):
        """Text vector: the average of the vectors of all words in the text."""
        word_vectors = [
            self.word_vector(token)
            for token in get_cyrillic_words(text.lower())
            if len(token) > 2 and not (token in self.stop_words)
        ]
        word_vectors = [vec for vec in word_vectors if vec is not None]
        return np.mean(word_vectors, axis=0)

    def distance(self, vec1, vec2):
        if vec1 is None or vec2 is None:
            return 2
        return cosine(vec1, vec2)

    @staticmethod
    def create_fasttext_model():
        return FastText.load_fasttext_format(os.path.join('data', 'fasttext', 'ru'))
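The new word_vector, text_vector and distance methods turn CorpusW2v into a simple averaged-word2vec text similarity helper. A minimal usage sketch (hypothetical: it assumes Corpus lives in my.model with a (name, read_callback) constructor, as the OpCorpus subclass in data_reader.py suggests, that read_wiki exists on DataReader, and that a trained model is already on disk):

from my.corpus_w2v import CorpusW2v
from my.data_reader import DataReader
from my.model import Corpus                                 # assumed: Corpus is defined in my.model

reader = DataReader()
w2v = CorpusW2v(Corpus('wiki', reader.read_wiki), reader)   # read_wiki is an assumed reader method
w2v.load()                                                  # loads the trained model from w2v.model_filepath

# Average the word vectors of two short texts and compare them.
v1 = w2v.text_vector('мороз и солнце день чудесный')
v2 = w2v.text_vector('буря мглою небо кроет')
print(w2v.distance(v1, v2))                                 # cosine distance; 2 when either vector is missing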
38 changes: 35 additions & 3 deletions my/data_reader.py
@@ -1,22 +1,30 @@
from typing import List, Iterator, Set, Dict, Tuple
from my.model import *
from my.utils import stem, lemma
from my.utils import *
import os
import json
import bz2
from lxml import etree
import csv
import re
from collections import defaultdict
import itertools
import nltk

BigLetters = re.compile('[A-Z]+')

class DataReader:
    DATASETS_PATH = os.environ.get('DATASETS_PATH', 'data')

    def read_classic_poems(self) -> List[Poem]:
        poems: List[Poem] = []
        with open(os.path.join('data', 'classic_poems.json')) as f:
        with open(os.path.join(DataReader.DATASETS_PATH, 'classic_poems.json')) as f:
            for entry in json.load(f):
                poem: Poem = Poem(Poet.by_poet_id(entry['poet_id']), entry['title'], entry['content'])
                prepared_content = "\n".join([
                    unify_chars(line)
                    for line in get_lines(entry['content'])
                ])
                poem: Poem = Poem(Poet.by_poet_id(entry['poet_id']), entry['title'], prepared_content)
                poems.append(poem)
        return poems

@@ -221,6 +229,30 @@ def read_check_topics(self) -> Dict[str, List[str]]:
            result[name] = lemms
        return result

    def form_dictionary_from_csv(self, phonetic: 'Phonetic', column='paragraph', max_docs=30000):
        """Load a dictionary of words from a CSV file of texts, indexed by word form.
        Returns a dictionary of the form:
            {form: {set, of, candidate, words, ...}}
        where a form is (<syllable_count>, <stressed_syllable_index>).
        """
        corpora_tokens = []
        with open(os.path.join(DataReader.DATASETS_PATH, 'sdsj2017_sberquad.csv')) as fin:
            reader = csv.DictReader(fin)
            for row in itertools.islice(reader, max_docs):
                paragraph = row[column]
                paragraph_tokens = nltk.tokenize.word_tokenize(paragraph.lower())
                corpora_tokens += paragraph_tokens

        word_by_form = defaultdict(set)
        for token in corpora_tokens:
            if token.isalpha():
                word_syllables = phonetic.syllables_count(token)
                word_accent = phonetic.accent_syllable(token)
                form = (word_syllables, word_accent)
                word_by_form[form].add(token)

        return word_by_form

class OpCorpus(Corpus):
    def __init__(self, reader: DataReader):
        super().__init__('opcorpora', reader.read_opcorpora)
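form_dictionary_from_csv groups corpus words by rhythmic form, so candidates with a given syllable count and stress position can be looked up directly. A small usage sketch (hypothetical: it assumes a Phonetic helper with syllables_count and accent_syllable methods, as the code above uses, importable from my.phonetic):

from my.data_reader import DataReader
from my.phonetic import Phonetic        # assumed location of the Phonetic helper

reader = DataReader()
phonetic = Phonetic()
word_by_form = reader.form_dictionary_from_csv(phonetic, max_docs=1000)

# Words with three syllables and a given stressed syllable
# (the index convention depends on Phonetic.accent_syllable).
candidates = word_by_form[(3, 2)]       # a set, possibly empty
print(len(candidates), sorted(candidates)[:10])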
