-
Notifications
You must be signed in to change notification settings - Fork 69
/
Copy pathdata_utils.py
43 lines (38 loc) · 1.17 KB
/
data_utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
# *-* coding:utf-8 *-*
'''
@author: ioiogoo
@date: 2018/1/31 19:30
'''
def preprocess_file(Config):
    """Load the poetry corpus and build its character vocabulary.

    The file at ``Config.poetry_file`` is read as UTF-8, one poem per line
    in ``title:content`` form. For every line, "]" is appended to the
    stripped text and the part between the first and second ":" is taken
    as the poem. A poem is kept only when it is longer than five characters
    and its sixth character (index 5) is "," -- presumably this selects
    poems made of five-character lines; confirm against the corpus.

    Characters occurring two times or fewer in the kept text are dropped
    from the vocabulary. The remaining characters are ordered by descending
    frequency (ties broken by character order) and a single space is
    appended as the last entry; its index doubles as the id returned for
    characters that are not in the vocabulary.

    Lines without a ":" separator (for example blank lines) are skipped
    instead of raising ``IndexError``, and a corpus in which no character
    is frequent enough yields the vocabulary ``(" ",)`` instead of raising
    ``ValueError``.

    Args:
        Config: Object exposing a ``poetry_file`` attribute holding the
            path of the corpus file.

    Returns:
        A 4-tuple ``(word2numF, num2word, words, files_content)``:

        * ``word2numF`` -- callable mapping a character to its id, falling
          back to ``len(words) - 1`` for unknown characters.
        * ``num2word`` -- dict mapping each id to its character.
        * ``words`` -- tuple of vocabulary characters, ending with " ".
        * ``files_content`` -- concatenation of all kept poems.
    """
    from collections import Counter

    # Corpus text: collect the kept poems and join once at the end.
    kept_poems = []
    with open(Config.poetry_file, 'r', encoding='UTF-8') as f:
        for line in f:
            fields = (line.strip() + "]").split(":")
            if len(fields) == 1:
                # No "title:content" separator on this line; nothing to use.
                continue
            poem = fields[1]
            if len(poem) > 5 and poem[5] == ',':
                kept_poems.append(poem)
    files_content = ''.join(kept_poems)

    # Drop low-frequency characters (seen two times or fewer).
    char_counts = Counter(files_content)
    frequent = [(ch, n) for ch, n in char_counts.items() if n > 2]
    # Most frequent first; equal counts fall back to character order so the
    # resulting ids are deterministic.
    frequent.sort(key=lambda pair: (-pair[1], pair[0]))

    # The trailing space is the last vocabulary entry and the "unknown" slot.
    words = tuple(ch for ch, _ in frequent) + (" ",)

    # Character <-> id mappings.
    word2num = {ch: idx for idx, ch in enumerate(words)}
    num2word = dict(enumerate(words))
    unknown_id = len(words) - 1

    def word2numF(x):
        """Return the id of character ``x``, or the unknown id if absent."""
        return word2num.get(x, unknown_id)

    return word2numF, num2word, words, files_content