# -*- coding: utf-8 -*-
from sklearn import metrics
from sklearn.cluster import (KMeans, SpectralClustering, SpectralBiclustering,
                             MiniBatchKMeans, SpectralCoclustering, BisectingKMeans)
from transformers import BertTokenizer, BertModel
from node2vec import Node2Vec
from gensim.models import Word2Vec
import networkx as nx
import pandas as pd
import numpy as np
import torch
### Build a dictionary mapping each API to its top-level category
def getdictofcategory():
    '''
    Return a dict mapping API name -> top-level category.
    '''
    API = pd.read_csv("./datasets/APIs.csv")
    # Clean the data: drop the rows whose Categories field is NaN
    API = API.dropna(subset=["Categories"])
    API = API.reset_index(drop=True)
    # Split the Categories field and keep only the top-level label
    API['newCategories'] = API['Categories'].str.split("###").str[0]
    keys = list(API['APIName'])
    values = list(API['newCategories'])
    dictofAPICategories = dict(zip(keys, values))
    return dictofAPICategories
category_dict = getdictofcategory()
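# Quick sanity check on the mapping (informational only).
print(f"{len(category_dict)} APIs have a top-level category")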
##### Data preprocessing #####
API = pd.read_csv("./datasets/APIs.csv")
# Clean the data: drop the rows whose Categories field is NaN
API = API.dropna(subset=["Categories"])
API = API.reset_index(drop=True)
# Clean the data: drop the rows whose Description field is NaN
API = API.dropna(subset=["Description"])
API = API.reset_index(drop=True)
# Split the Categories field and keep only the top-level label
API['newCategories'] = API['Categories'].str.split("###").str[0]
# Rebuild the graph from the saved edge list
edges = []
with open('PSO_edges_noweights.txt', 'r', encoding='utf-8') as f:
    for line in f:
        a = line.strip().split('\t')
        # a[2] = int(a[2])  # edge weights are unused in this unweighted variant
        edges.append(tuple(a))
G = nx.Graph()
G.add_edges_from(edges)
# Sort nodes by degree (descending) and export the 500 highest-degree nodes
nodes_sorted_by_degree = sorted(G.nodes(), key=lambda x: G.degree(x), reverse=True)
sub_G = nx.subgraph(G, nodes_sorted_by_degree[:500])
nx.write_graphml(sub_G, "PSO.graphml")
# Basic graph statistics
av = nx.average_clustering(G)
assort = nx.degree_assortativity_coefficient(G)
# Average degree over all nodes
degrees = dict(G.degree())
avg_degree = sum(degrees.values()) / len(degrees)
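# These statistics are otherwise unused; report them for inspection.
print(f"avg clustering={av:.4f}, assortativity={assort:.4f}, avg degree={avg_degree:.2f}")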
nodes = list(G.nodes)
# Preprocessing: remove any graph node that does not appear in the API dataframe
full_API = set(API["APIName"])
for none_node in nodes:
    if none_node not in full_API:
        G.remove_node(none_node)
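# Report the effect of the pruning step (informational only).
print(f"pruned graph: {G.number_of_nodes()} nodes, {G.number_of_edges()} edges")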
# Collect each node's description into node_description
node_description = []
node_label = []
# new_nodes is the pruned node list; it is the one to use, because every node
# in it has both a description and a category
new_nodes = list(G.nodes)
# ## Optionally save new_nodes locally for later reuse
# with open('newnodes.txt', 'w', encoding='utf-8') as f:
#     for item in new_nodes:
#         f.write("%s\n" % item)
for no in new_nodes:
    aa = API[API.APIName == no].index.tolist()[0]
    node_label.append(API['newCategories'][aa])
    node_description.append(API['Description'][aa])
#### End of preprocessing ####
#### Goal: use the BERT model to obtain a text feature vector for every API
##### Deep-learning pass over the preprocessed data
max_length = 32
embedding_size = 768
# Load the BERT model and tokenizer
tokenizer = BertTokenizer.from_pretrained('E:/IntelligentServiceLab/huggingface/bert-base-uncased')
# model = BertForSequenceClassification.from_pretrained('bert-base-uncased', mirror='https://mirrors.tuna.tsinghua.edu.cn/hugging-face-models')
model = BertModel.from_pretrained('E:/IntelligentServiceLab/huggingface/bert-base-uncased')
# Build the node feature matrix
node_feature = []
for sentence in node_description:
    inputs = tokenizer(sentence, padding=True, truncation=True, max_length=max_length, return_tensors='pt')
    with torch.no_grad():
        outputs = model(**inputs)
    # Use the hidden state of the first ([CLS]) token as the sentence embedding
    embedding = outputs.last_hidden_state[:, 0, :]
    # Pad the embedding to embedding_size (a no-op for bert-base, which is already 768-d)
    embedding = torch.nn.functional.pad(embedding, (0, embedding_size - embedding.size(1)), 'constant', 0)
    node_feature.append(embedding.numpy()[0])
##### text_features holds the embedding of each API's description text
text_features = torch.Tensor(np.array(node_feature))
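# Encoding one description at a time is slow. A batched variant would look
# roughly like the sketch below (the batch size of 32 is an assumption):
# for start in range(0, len(node_description), 32):
#     batch = node_description[start:start + 32]
#     enc = tokenizer(batch, padding='max_length', truncation=True,
#                     max_length=max_length, return_tensors='pt')
#     with torch.no_grad():
#         out = model(**enc)
#     node_feature.extend(out.last_hidden_state[:, 0, :].numpy())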
####### End of API text feature vector construction
# features = dict(zip(new_nodes, node_feature))
# nx.set_node_attributes(G, features, "features")
#
# G1 = nx.convert_node_labels_to_integers(G)
###### Goal: use the Node2vec model to obtain a structural feature vector for every API
model_X2 = Word2Vec.load('Node2Vec_PSO16_noweights.model')
dict_vec = model_X2.wv.key_to_index
vec = model_X2.wv.vectors
node2vec_features = []
### Align the Node2vec embeddings with the new_nodes ordering used for the text
### features: dict_vec starts with Twitter while new_nodes starts with Google,
### so the rows must be re-ordered
for i in new_nodes:
    node2vec_features.append(vec[dict_vec[i]])
node2vec_features = torch.Tensor(np.array(node2vec_features))
######## End of Node2vec structural feature vector construction
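# If the pretrained model file loaded above is missing, it could be regenerated
# roughly as follows (a sketch: walk_length, num_walks, window and workers are
# assumptions; only dimensions=16 is implied by the filename):
# import os
# if not os.path.exists('Node2Vec_PSO16_noweights.model'):
#     n2v = Node2Vec(G, dimensions=16, walk_length=30, num_walks=200, workers=4)
#     n2v_model = n2v.fit(window=10, min_count=1)
#     n2v_model.save('Node2Vec_PSO16_noweights.model')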
# Concatenate text features (768-d) and structural features into the fused representation
test_X = torch.cat([text_features, node2vec_features], dim=1)
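# Sanity-check the fused representation; expected width is 768 text dims plus
# the Node2vec dims (16, inferred from the model filename).
print("test_X shape:", tuple(test_X.shape))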
# Ground-truth labels, in the same new_nodes order as the feature rows above.
# (Building them from dict_vec.keys() would not match that row order.)
X2_truelabel = [category_dict[i] for i in new_nodes]
all_label_category = list(set(X2_truelabel))
label = [all_label_category.index(i) for i in X2_truelabel]
# Run each baseline clustering algorithm with the number of ground-truth categories
n_clusters = 329
cluster2 = KMeans(n_clusters=n_clusters, random_state=9).fit(node2vec_features)
pre_label = cluster2.labels_
pre_label_spect = SpectralClustering(n_clusters=n_clusters, gamma=0.1).fit_predict(test_X)
spectbi = SpectralBiclustering(n_clusters=n_clusters, random_state=0).fit(node2vec_features)
pre_label_spectbi = spectbi.row_labels_
minibatchkmeans = MiniBatchKMeans(n_clusters=n_clusters, random_state=0, batch_size=6).fit(node2vec_features)
pre_label_minibatch = minibatchkmeans.labels_
# spectco = SpectralCoclustering(n_clusters=n_clusters, random_state=0).fit(test_X)
# pre_label_spectco = spectco.row_labels_
Bisect = BisectingKMeans(n_clusters=n_clusters, random_state=0).fit(test_X)
pre_label_Bisect = Bisect.labels_
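# Note: metrics.accuracy_score below compares raw cluster IDs against label
# indices, which is only meaningful after matching clusters to classes. A
# common alternative (a sketch, not part of the original pipeline) is
# clustering accuracy via Hungarian matching; it could replace accuracy_score
# in clustering_scores below.
from scipy.optimize import linear_sum_assignment

def clustering_accuracy(y_true, y_pred):
    """Best-match clustering accuracy via the Hungarian algorithm."""
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    D = max(y_pred.max(), y_true.max()) + 1
    w = np.zeros((D, D), dtype=np.int64)
    for t, p in zip(y_true, y_pred):
        w[p, t] += 1  # contingency count: cluster p vs class t
    row_ind, col_ind = linear_sum_assignment(-w)  # maximize total agreement
    return w[row_ind, col_ind].sum() / y_true.size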
def clustering_scores(y_true, y_pred):
    """External clustering metrics reported for every algorithm."""
    return {
        'ACC': metrics.accuracy_score(y_true, y_pred),
        'NMI': metrics.normalized_mutual_info_score(y_true, y_pred),
        'ARI': metrics.adjusted_rand_score(y_true, y_pred),
        'AMI': metrics.adjusted_mutual_info_score(y_true, y_pred),
        'MI': metrics.mutual_info_score(y_true, y_pred),
        'Rand': metrics.rand_score(y_true, y_pred),
        'Completeness': metrics.completeness_score(y_true, y_pred),
    }

results = {
    'KMeans': clustering_scores(label, pre_label),
    'SpectralClustering': clustering_scores(label, pre_label_spect),
    'SpectralBiclustering': clustering_scores(label, pre_label_spectbi),
    'MiniBatchKMeans': clustering_scores(label, pre_label_minibatch),
    'BisectingKMeans': clustering_scores(label, pre_label_Bisect),
    # 'SpectralCoclustering': clustering_scores(label, pre_label_spectco),
}
print("aaaa")