-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathanalyze.py
179 lines (128 loc) · 4.57 KB
/
analyze.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
# This module conducts least path analysis and outputs best paths to output file
from generateGraph import *
import csv
# Generate graph
map = generateGraph()
# Set up weights of eig influence, degree centrality influence and maximum path length
eig_influence = 0.5
central_influence = 0.5
max_path = 10
# Helper method that checks if element is in list
def inList(elem, lis):
for l in lis:
if (elem in l) or (l in elem):
return True
return False
# Get target artest and starting collaborators
def getLists():
# Get top 100 artists based on number of followers, as well as their genres
ids = list(map.keys())
ids.sort(key=lambda id: map[id].followers, reverse=True)
popular_artists = ids[0:100]
genres = [map[id].genre for id in popular_artists]
# Get starting artists by filtering out artists with more than 4000 followers
# Then we get rank these by eigenvector centrality and betweenness (both have equal influence on ranking)
starting_artists = list(ids)
starting_artists = filter(lambda id: (map[id].followers <= 4000), starting_artists)
starting_artists = list(starting_artists)
starting_artists.sort(key=lambda id: (eig_influence*map[id].eig) + ((1-eig_influence)*map[id].between), reverse=True)
starting_artists = list(starting_artists)
# Then we filter out the artists in this list who do not belong to the genres of the top 100 artists
starting_artists = filter(lambda id: inList(map[id].genre, genres), starting_artists)
starting_artists = list(starting_artists)
# Then we get top 50% of these artists as our starting collaborators
length = len(starting_artists)
middle = length//2
starting_artists = starting_artists[0:middle]
# Return starting collaborators and target artists
return (starting_artists, popular_artists)
# Use dfs to generate all least paths (with a maximum size of 10) between each starting artist and any of the target artists
def getAllPaths(start, end):
res = []
hasVisited = {}
for key in map:
hasVisited[key] = False
# Initialize stack
stack = [start]
paths = [[start]]
hasVisited[start] = True
pathDicts = [hasVisited]
# DFS traversal through graph
while stack:
artist = stack.pop(-1)
artist = map[artist]
path = paths.pop(-1)
pathDict = pathDicts.pop(-1)
if (artist.id in end):
res.append(path)
continue
if len(path) > max_path:
continue
# As traversing through graph, only visit artists who have not been visited as part of that path (to avoid cycles)
# Only visit artists who have shared genre
children = artist.links
for child in children:
(target, weight) = child
if (map[target].genre in artist.genre) or (artist.genre in map[target].genre):
if (pathDict[target] == False):
stack.append(target)
newPath = list(path)
newPath.append(target)
paths.append(newPath)
newPathDict = dict(pathDict)
newPathDict[target] = True
pathDicts.append(newPathDict)
# Return paths
return res
# Helper method to score each path based on sum of inverse of each edge's weight
def scorePath(path):
score = 0.0
for i in range(0, len(path)-1):
result = [child[0] for child in map[path[i]].links].index(path[i+1])
score += (1/float(map[path[i]].links[result][1]))
return score
# Helper method to score each path based on sum of weighted degrees
def scoreCentrality(path):
scoreCentral = 0.0
for id in path:
scoreCentral += map[id].wDegree
return central_influence
def getPathStr(path):
res = ""
for i in range(0, len(path)):
at = map[path[i]]
res += at.name + "; " + str(at.followers)
if (i < len(path)-1):
res += " =======> "
return res
res = getLists()
(starts, ends) = res
allPaths = []
for start in starts:
paths = getAllPaths(start, ends)
for path in paths:
allPaths.append(path)
# Sort paths based on highest sum of inverted edge weights
# Get bottom 50% of these paths
allPaths.sort(key=lambda path: scorePath(path), reverse=False)
length = len(allPaths)
middle = length//2
allPaths = allPaths[0:middle]
# Sort paths based on highest sum of weighted degrees
# Get top 50% of these paths
allPaths.sort(key=lambda path: scoreCentrality(path), reverse=True)
allPaths.pop(0)
# Create file to output best paths
a = open("files/best_paths.csv", "w")
writer = csv.writer(a)
writer.writerow(["paths"])
for path in allPaths:
writer.writerow(path)
a.close()
# Create file to output best path scores
b = open("best_paths_scores.csv", "w")
writer = csv.writer(b)
writer.writerow(["inverted weights score", "sum centralities score"])
for path in allPaths:
writer.writerow([scorePath(path), scoreCentrality(path)])
b.close()