-
Notifications
You must be signed in to change notification settings - Fork 53
/
Copy pathtext_categorization.py
169 lines (158 loc) · 5.95 KB
/
text_categorization.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
# -*-coding:utf-8-*-
import jieba.posseg as pesg
import codecs
import sys
from gensim import corpora,models,similarities
import os
allFileNum = 0
reload(sys)
sys.setdefaultencoding('utf8')
class XiangSi():
def __init__(self):
#构建停用词表
self.stop_words='D:/' + u'py程序' + '/stopwords.txt'
self.stopwords=codecs.open(self.stop_words,'r',encoding='utf-8').readlines()
self.stopwords=[w.strip()for w in self.stopwords]
self.stop_flag=['x', 'c', 'u','d', 'p', 't', 'uj', 'm', 'f', 'r']
#对文章进行分词、去停用词
def tokenzation(self,filename):
result=[]
with open(filename,'r') as f:
text=f.read()
words=pesg.cut(text)#
for word,flag in words:
if flag not in self.stop_flag and word not in self.stopwords:
result.append(word)
return result
def wenzhang(self,files,example):
filenames=files
corpus=[]
for each in filenames:
corpus.append(self.tokenzation(each))
#建立词袋模型
dictionary=corpora.Dictionary(corpus)
doc_vectors=[dictionary.doc2bow(text) for text in corpus]
#建立TF-IDF模型
tfidf=models.TfidfModel(doc_vectors)
tfidf_vectors=tfidf[doc_vectors]
#指定主题数
lsi=models.LsiModel(tfidf_vectors,id2word=dictionary,num_topics=5)
lsi = models.LsiModel(tfidf_vectors, id2word=dictionary)
lsi_vector=lsi[tfidf_vectors]
#构建训练样本
query=self.tokenzation('D:/' +'ZNdaolun/Sun/' + example)
query_bow=dictionary.doc2bow(query)
query_lsi=lsi[query_bow]
index=similarities.MatrixSimilarity(lsi_vector)
sims=index[query_lsi]
#print list(enumerate(sims))
return list(enumerate(sims))
def analyse_nr(self,filepath):
listfiles=[]
pathDir = os.listdir(filepath)
for allDir in pathDir:
child = os.path.join('%s%s' % (filepath, allDir))
child.decode('utf-8')
listfiles.append(child)
return listfiles
#print child.decode('utf-8') # .decode('gbk')是解决中文显示乱码问题
def maopao(self,list1):
j = 0
for i in range(len(list1)):
for j in range(len(list1) - i - 1):
if (list1[j][1] < list1[j + 1][1]):
t = list1[j]
list1[j] = list1[j + 1]
list1[j + 1] = t
print list1
def token1(self,text,topic):
result=[]
words=pesg.cut(text)#
for word in words:
if word in self.stopwords:
result.append(word)
return result
def printPath(self,level, path):
global allFileNum
# 所有文件夹,第一个字段是次目录的级别
dirList = []
# 所有文件
fileList = []
# 返回一个列表,其中包含在目录条目的名称(google翻译)
files = os.listdir(path)
# 先添加目录级别
dirList.append(str(level))
for f in files:
if (os.path.isdir(path + '/' + f)):
# 排除隐藏文件夹。因为隐藏文件夹过多
if (f[0] == '.'):
pass
else:
# 添加非隐藏文件夹
dirList.append(f)
if (os.path.isfile(path + '/' + f)):
# 添加文件
fileList.append(f)
# 当一个标志使用,文件夹列表第一个级别不打印
i_dl = 0
for dl in dirList:
if (i_dl == 0):
i_dl = i_dl + 1
else:
# 打印至控制台,不是第一个的目录
print '-' * (int(dirList[0])), dl
# 打印目录下的所有文件夹和文件,目录级别+1
self.printPath((int(dirList[0]) + 1), path + '/' + dl)
# for fl in fileList:
# # 打印文件
# print '-' * (int(dirList[0])), fl
# # 随便计算一下有多少个文件
# allFileNum = allFileNum + 1
return fileList
def change1(self,list1, list2, str1):
for i in range(len(list1)):
newnum = list1[i][1]
newstr = str(list1[i][0]) +'-'+str(list2[i])+'-'+ str1
list1[i] = (newstr, newnum)
return list1
if __name__=='__main__':
xs=XiangSi()
listfile=xs.printPath(1, 'D:/ZNdaolun/Sun/text')
count=1
list6=[]
#files=['D:/' + u'py程序' + '/answer/1.txt','D:/' + u'py程序' + '/answer/people_top10.txt','D:/' + u'py程序' + '/answer/p_location.txt']
files=xs.analyse_nr('D:/' +'ZNdaolun' + '/Sun/text/')
#print files
#区分类别,根据需要更改类别文件
list1=xs.wenzhang(files,'computer.txt')
list1=xs.change1(list1,listfile,"computer")
list2 = xs.wenzhang(files, 'educate.txt')
list2=xs.change1(list2, listfile, "educate")
list3 = xs.wenzhang(files, 'sport.txt')
list3=xs.change1(list3, listfile, "sport")
list4 = xs.wenzhang(files, 'war.txt')
list4=xs.change1(list4, listfile, "war")
list5 = xs.wenzhang(files, 'weather.txt')
list5=xs.change1(list5, listfile, "weather")
for i in range(len(list1)):
if list1[i][1]<list2[i][1]:
list6.append(list2[i])
else:
list6.append(list1[i])
for i in range(len(list1)):
if list6[i][1] < list3[i][1]:
list6[i]=list3[i]
for i in range(len(list1)):
if list6[i][1] < list4[i][1]:
list6[i] = list4[i]
for i in range(len(list1)):
if list6[i][1] < list5[i][1]:
list6[i] = list5[i]
# count+=1
# print list1,u'高血压','\n',list2,'ios','\n',list3,'android','\n',list4,'lee','\n',list5,'zhou'
# print xs.printPath(1, 'D:/ZNdaolun/text')
for i in range(len(list6)):
print list6[i][1]
if list6[i][1]<0.3:
list6[i]=(list6[i][0],"no")
print list6