import os
from collections import Counter

import nltk
import numpy as np

# Indices reserved for the sentence-end and unknown-word tokens.
END = 0
UNK = 1

class TrainingData:
    def __init__(self):
        # Per-instance state; class-level mutable defaults would be
        # shared across every instance of TrainingData.
        self.raw_text = []
        self.dictionary = {}
        self.X = []
        self.Y = []

    def load_input_data(self, file):
        """Read non-empty lines from `file` into self.raw_text."""
        with open(file, 'r') as f:
            for line in f:
                if line != "\n":
                    self.raw_text.append(line)

    def create_dictionary(self):
        """Build a word -> index mapping over the loaded raw text."""
        print("Creating dictionary...")
        words = []
        for sentence in self.raw_text:
            words.extend(nltk.wordpunct_tokenize(sentence))
        if len(words) > 10000:
            # Large corpus: keep only the 10,000 most frequent words;
            # every other word will be encoded as <UNK>.
            most_common = Counter(words).most_common(10000)
            words = [word for word, _ in most_common]
            print(f"Vocabulary capped at {len(words)} words")
        else:
            words = sorted(set(words))
        # Indices 0 and 1 are reserved for the special tokens.
        dictionary = {"<END>": END, "<UNK>": UNK}
        for i, word in enumerate(words):
            dictionary[word] = i + 2
        self.dictionary = dictionary

    def _one_hot(self, word):
        """Return a one-hot vector for `word`, falling back to <UNK>."""
        vector = np.zeros(len(self.dictionary))
        vector[self.dictionary.get(word, UNK)] = 1
        return vector

    def prepare_training_data(self):
        """Encode the corpus as sequences of one-hot vectors.

        Lines are assumed to alternate input/response: even-numbered
        lines go to X, odd-numbered lines to Y, and every sentence is
        terminated with <END>.
        """
        print("Preparing training data...")
        for i, line in enumerate(self.raw_text):
            words = nltk.wordpunct_tokenize(line) + ["<END>"]
            encoded = [self._one_hot(word) for word in words]
            if i % 2 == 0:
                self.X.append(encoded)
            else:
                self.Y.append(encoded)

    def load_training_data(self, file):
        """Load a corpus file and build the full training set."""
        self.load_input_data(file)
        self.create_dictionary()
        self.prepare_training_data()
        print("Training data is ready!")

    def loadOpenSubtitles(self):
        """Load every file in the `data` directory as raw text."""
        for filename in os.listdir('data'):
            with open(os.path.join('data', filename), 'r') as f:
                for line in f:
                    self.raw_text.append(line)
        self.create_dictionary()
        self.prepare_training_data()
        print("Training data is ready!")
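

# A minimal usage sketch, not part of the original module: "corpus.txt" is a
# hypothetical file of alternating input/response lines, which is the layout
# prepare_training_data assumes. Each element of data.X / data.Y is a list of
# one-hot NumPy vectors, one per token.
if __name__ == "__main__":
    data = TrainingData()
    data.load_training_data("corpus.txt")
    print("Vocabulary size:", len(data.dictionary))
    print("Input/response pairs:", len(data.X), len(data.Y))
    # Shape of the one-hot vector for the first token of the first input.
    print("Token vector shape:", data.X[0][0].shape)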