-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathrun.py
99 lines (75 loc) · 2.69 KB
/
run.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
# -*- encoding: utf-8 -*-
import os
import ast
import pandas as pd
import logging
import numpy as np
from sklearn.decomposition import PCA
import gensim
from gensim.models import KeyedVectors
import tensorflow as tf
import numpy as np
from gensim.models import Word2Vec
from dotenv import load_dotenv, find_dotenv
# loading model name, language, project path from env
# (override=True makes values in the .env file win over pre-existing env vars)
load_dotenv(find_dotenv(), override=True)
# NOTE: os.environ.get returns None when a variable is unset — downstream
# code does not guard against that, so a complete .env is assumed.
project_path = os.environ.get("PROJECT_PATH")  # stored on W2vmodel; not otherwise used here
model_name = os.environ.get("MODEL_NAME")      # prefix of the saved "<name><dim>.model" file
filepath = os.environ.get("FILEPATH")          # path to the preprocessed corpus (one sentence per line)
language = os.environ.get("LANGUAGE")          # corpus language tag (informational)
dimension = 100  # word-vector dimensionality used for training and the saved-model filename
class W2vmodel:
    """Train, save and query a gensim Word2Vec model built from a
    newline-delimited, space-tokenised text corpus."""

    def __init__(self, model_name, language, data, iteration, epoch, windows_size, dimension):
        """
        :param model_name: prefix used when saving the trained vectors
        :param language: language tag of the corpus (informational only)
        :param data: path to the preprocessed corpus file, one sentence per line
        :param iteration: iterations passed to the Word2Vec constructor (``iter``)
        :param epoch: epochs for the explicit ``model.train()`` call
        :param windows_size: context window size
        :param dimension: size of the learned word vectors
        """
        self.model_name = model_name
        self.preprocessed_Data = data
        self.language = language
        self.iteration = iteration
        self.epoch = epoch
        self.windows_size = windows_size
        self.trainable_data = []
        self.unique_words = []
        self.dimension = dimension
        self.project_path = project_path  # module-level env setting

    def get_list_of_list(self):
        """Read the corpus and populate ``self.trainable_data`` (a list of
        token lists, one per corpus line) and ``self.unique_words`` (the
        vocabulary as a set).

        Fixes vs. original: the file handle is closed via a context manager,
        and each line is split into tokens — Word2Vec expects an iterable of
        token lists; feeding raw strings makes it train on single characters,
        which also contradicted the space-split used for ``unique_words``.
        """
        with open(self.preprocessed_Data, "r") as corpus:
            lines = corpus.read().split("\n")
        self.trainable_data = [line.split(" ") for line in lines]
        self.unique_words = set(
            word for sentence in self.trainable_data for word in sentence)

    def train_model(self):
        """Train a Word2Vec model on the corpus and save its word vectors.

        :return: filename the trained word vectors were saved under
            (``<model_name><dimension>.model``) — NOT a model object.
        """
        logging.basicConfig(format="%(asctime)s : %(levelname)s : %(message)s", level=logging.INFO)
        self.get_list_of_list()
        # gensim 3.x parameter names (size/iter); gensim 4.x renamed them to
        # vector_size/epochs — keep in sync with the installed gensim version.
        model = gensim.models.Word2Vec(
            self.trainable_data,
            size=self.dimension,
            window=self.windows_size,
            min_count=2,
            workers=10,
            iter=self.iteration)
        model.train(self.trainable_data, total_examples=len(self.trainable_data), epochs=self.epoch)
        model_name = self.model_name + str(self.dimension) + ".model"
        model.wv.save(model_name)
        return model_name

    def load_gensim_model(self, name, test_similarity=None, binary=None):
        """Load previously saved word vectors.

        :param name: path of the saved vectors
        :param test_similarity: unused; kept for backward compatibility
        :param binary: truthy -> load word2vec C binary format; otherwise a
            gensim-native save, memory-mapped read-only
        :return: the loaded KeyedVectors
        """
        if binary:
            model = KeyedVectors.load_word2vec_format(name, binary=True)
        else:
            model = KeyedVectors.load(name, mmap='r')
        # original had a SyntaxError here ("retun model") that broke the module
        return model

    def return_similar(self, word, model):
        """Return the entries most similar to *word* under *model*."""
        return model.most_similar(word)
if __name__ == '__main__':
    # Train a model from the env-configured corpus, then reload the saved
    # vectors to run a sample similarity query.  train_model() returns the
    # *filename* of the saved vectors, not a model object, so it must be
    # loaded before querying — the original passed the filename straight to
    # most_similar(), which would raise AttributeError on the str.
    trainer = W2vmodel(model_name=model_name, language=language, data=filepath,
                       iteration=1000, epoch=12, windows_size=5, dimension=dimension)
    saved_name = trainer.train_model()
    vectors = trainer.load_gensim_model(saved_name)
    print(trainer.return_similar("भारतस्थित", vectors))