-
Notifications
You must be signed in to change notification settings - Fork 0
/
2020-12-3 RNN文本分类.py
113 lines (92 loc) · 3.78 KB
/
2020-12-3 RNN文本分类.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
'''
Text-classification dataset:
https://pan.baidu.com/s/1hugrfRu 密码:qfud
Reference articles:
https://blog.csdn.net/qq_36047533/article/details/88094385
https://blog.csdn.net/weixin_40931845/article/details/83865877
'''
# Preprocessing pipeline implemented below:
#   - word segmentation (jieba): raw text -> words
#   - word -> id, to be looked up in an embedding matrix [|V|, embed_size]
#     e.g. word A -> id 5
#   - vocabulary file (word frequencies)
#   - label -> id (category file)
import os
import sys
from collections import Counter

import jieba  # pip install jieba
# Input files: one labeled document per line, formatted '<label>\t<content>'.
# NOTE(review): hard-coded Windows paths to one user's desktop — consider
# making these configurable (argparse / environment variable).
train_file = r'C:\Users\ext_renqq\Desktop\文本分类数据\cnews.train.txt'
val_file = r'C:\Users\ext_renqq\Desktop\文本分类数据\cnews.val.txt'
test_file = r'C:\Users\ext_renqq\Desktop\文本分类数据\cnews.test.txt'
# Output files: segmented corpora, vocabulary, and category list.
seg_train_file = r'C:\Users\ext_renqq\Desktop\文本分类数据\cnews.train.seg.txt'
seg_val_file = r'C:\Users\ext_renqq\Desktop\文本分类数据\cnews.val.seg.txt'
seg_test_file = r'C:\Users\ext_renqq\Desktop\文本分类数据\cnews.test.seg.txt'
vocab_file = r'C:\Users\ext_renqq\Desktop\文本分类数据\cnews.vocab.txt'
category_file = r'C:\Users\ext_renqq\Desktop\文本分类数据\cnews.category.txt'
# Ad-hoc sanity check (debug leftover): decode the first line of the
# validation file, confirm the '<label>\t<content>' layout, and try jieba
# segmentation on that single line. Neither result is used below.
with open(val_file, 'rb') as f:
    lines = f.readlines()
label, content = lines[0].decode('utf-8').strip('\r\n').split('\t')
word_iter = jieba.cut(content)  # lazy generator; never consumed here
def generate_seg_file(input_file, output_seg_file):
    """Word-segment each line of a labeled corpus file.

    Every input line is expected as '<label>\t<content>'; the output keeps
    the label and replaces the content with space-separated jieba tokens.

    Args:
        input_file: path to the raw corpus (utf-8 bytes, one doc per line).
        output_seg_file: path the segmented corpus is written to (utf-8).
    """
    with open(input_file, 'rb') as fin:
        lines = fin.readlines()
    with open(output_seg_file, 'w', encoding='utf-8') as fout:
        for line in lines:
            # maxsplit=1: only the first tab separates label from content,
            # so documents that themselves contain tabs no longer crash.
            label, content = line.decode('utf-8').strip('\r\n').split('\t', 1)
            # Single join instead of quadratic '+=' string accumulation;
            # drop tokens that are pure whitespace, as the original did.
            stripped = (w.strip(' ') for w in jieba.cut(content))
            word_content = ' '.join(w for w in stripped if w)
            fout.write('%s\t%s\n' % (label, word_content))
# Segment all three corpus splits (train / validation / test).
generate_seg_file(train_file, seg_train_file)
generate_seg_file(val_file, seg_val_file)
generate_seg_file(test_file, seg_test_file)
def generate_vocab_file(input_seg_file, output_vocab_file):
    """Build a frequency-sorted vocabulary from a segmented corpus.

    Input lines look like '<label>\t<word word ...>'. The output contains
    one '<word>\t<count>' per line, most frequent first, preceded by a
    sentinel '<UNK>' entry with an artificially huge count so it always
    sorts to the top (i.e. maps to id 0).

    Args:
        input_seg_file: path to a segmented corpus (utf-8 text).
        output_vocab_file: path the vocabulary is written to (utf-8).
    """
    word_counts = Counter()
    # Stream the file line by line instead of materializing readlines().
    with open(input_seg_file, 'r', encoding='utf-8') as f:
        for line in f:
            # maxsplit=1: only the first tab separates label from content.
            _label, content = line.strip('\r\n').split('\t', 1)
            word_counts.update(content.split())
    with open(output_vocab_file, 'w', encoding='utf-8') as f:
        f.write('<UNK>\t10000000\n')
        # most_common() yields (word, count) pairs sorted by count, desc.
        for word, count in word_counts.most_common():
            f.write('%s\t%d\n' % (word, count))
# Vocabulary is built from the segmented TRAINING split only.
generate_vocab_file(seg_train_file, vocab_file)
def generate_category_dict(input_file, category_file):
    """Collect the label set of a corpus and write one label per line.

    Reads '<label>\t<content>' lines from input_file (utf-8 bytes) and
    writes each distinct label, in first-seen order, to category_file.
    Also prints each label with its line count as a sanity check.

    Args:
        input_file: path to the raw labeled corpus.
        category_file: path the category list is written to (utf-8).
    """
    category_counts = Counter()
    # Stream the file line by line instead of materializing readlines().
    with open(input_file, 'rb') as f:
        for line in f:
            # maxsplit=1: only the first tab separates label from content.
            label, _content = line.decode('utf-8').strip('\r\n').split('\t', 1)
            category_counts[label] += 1
    print('category_dict', dict(category_counts))
    with open(category_file, 'w', encoding='utf-8') as f:
        # Counter preserves first-seen insertion order, like the original dict.
        for category, count in category_counts.items():
            print('%s\t%d' % (category, count))
            f.write('%s\n' % category)
# Category list is derived from the raw training file (label is column 1).
generate_category_dict(train_file, category_file)