media_prepare_data.py
#!/usr/bin/python3.6
# coding: utf-8
# This script gathers entities and media data to create the
# JSON file loaded front-end to render the main dashboard
# Libraries we need
import requests
import pandas as pd
import json
import re
import os
import time
import datetime
import scipy.stats as ss
from bs4 import BeautifulSoup
from pprint import pprint
from dateutil.parser import parse
import urllib.parse
from collections import OrderedDict
import math
print('')
print('/// Libraries correctly imported')
# Global variables we will use
context = 'dev'
print('Current environment:',context)
relevance_threshold = 0.35
confidence_threshold = 5
count_media_min = 0
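# Entities whose confidence or relevance falls below these thresholds are ignored,
# and entities cited by fewer than count_media_min media are filtered out further down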
media_list_URL = '[GOOGLE-SHEET-CSV-URL]'
entities_to_ignore_URL = '[GOOGLE-SHEET-CSV-URL]'
search_url = {
    'path': 'https://www.google.fr/search?q=site:',
    'timeParam': '&tbs=qdr:w'
}
this_week = datetime.datetime.now().isocalendar()[1]
this_week = str(datetime.datetime.now().year) + '.' + str(this_week)
this_month = str(datetime.datetime.now().year) + '.' + str(datetime.datetime.now().month)
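# Period identifiers in the form 'YYYY.WW' (year + ISO week) and 'YYYY.M' (year + month)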
data_path = '[LOCAL-PATH]'
if context == 'prod':
    data_path = '[SERVER-PATH]'
analyses_timespans = ['current', 'previous']
print('')
print('/// Global variables ready')
# Functions we need
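# get_csv_content() loads a CSV from a URL and writes a local backup copy; if the URL
# cannot be loaded (or the table looks malformed), it falls back to the last saved copy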
def get_csv_content(url, sep, file, index):
    try:
        table = pd.read_csv(url, delimiter=sep, index_col=index)
        print('URL successfully loaded:', url)
        if pd.isna(table.iloc[0, 0]):
            print('Error in table found at URL:', url)
            print('Using latest saved version of:', file + '.csv')
            table = pd.read_csv(data_path + 'csv/' + file + '.csv', index_col=index)
        else:
            print('Writing CSV file for backup:', file + '.csv')
            table.to_csv(data_path + 'csv/' + file + '.csv', index_label=index)
    except:
        print('Error loading URL:', url)
        print('Using latest saved version of:', file + '.csv')
        table = pd.read_csv(data_path + 'csv/' + file + '.csv', index_col=index)
    return table
print('')
print('/// Functions correctly defined')
media_list = get_csv_content(media_list_URL, ',' , 'media_list', 'code')
media_list['code'] = media_list.index.values
media_list = media_list.reindex(media_list.index.rename('id'))
media_list = media_list[media_list['scrap'] == 'oui']
media_list = media_list[media_list['show'] == 'oui']
if context == 'dev':
    media_list = media_list[:500]
# Prepare dictionary with media info
media = []
for row in media_list.iterrows():
    medium = {
        'code': row[0],
        'name': row[1]['media'],
        'rss': row[1]['rss'],
        'domain': row[1]['domaine'],
    }
    media.append(medium)
nb_media = len(media)
print('Media dictionary ready,', len(media), 'media found')
print('')
print('/// Media list loaded and media object ready')
# Load existing analyses for each media
for i, medium in enumerate(media):
    # clear_all_outputs()
    print('Loading existing content for media', (i + 1), '/', nb_media, ":", medium['name'])
    medium['analysesBy7days'] = {}
    for ts in analyses_timespans:
        try:
            file = open(data_path + '/' + medium['code'] + '/analysesBy7days/' + ts + '.json').read()
            content = json.loads(file)
            medium['analysesBy7days'][ts] = content
        except:
            print('No file', ts + '.json', 'found or file error for media', (i + 1), '/', nb_media, ":", medium['name'])
            medium['analysesBy7days'][ts] = {
                'entities': []
            }
print('')
print('/// Existing analyses loaded for all media')
# Load list of entities to ignore
entities_to_ignore = get_csv_content(entities_to_ignore_URL, ',', 'ignore','id')
ignore_list = []
for row in entities_to_ignore.iterrows():
    ignore_list.append(row[1]['nom'])
print('')
print('/// Entities to ignore list loaded')
# Get entity ranks for the current/previous analysis
all_entities = {
    'current': {},
    'previous': {}
}
ranked_entities = {
    'current': [],
    'previous': []
}
for timespan in analyses_timespans:
    for medium in media:
        media_entities = medium['analysesBy7days'][timespan]['entities']
        for entity in media_entities:
            if entity['confidence'] >= confidence_threshold and entity['relevance'] >= relevance_threshold and entity['name'] not in ignore_list:
                if entity['code'] not in all_entities[timespan]:
                    all_entities[timespan][entity['code']] = {
                        'code': entity['code'],
                        'media': [],
                        'mediaCount': 0,
                        'totalScoreRank': 0,
                        'totalScore': 0
                    }
                all_entities[timespan][entity['code']]['media'].append(medium['code'])
                all_entities[timespan][entity['code']]['totalScore'] += entity['relevance']
    for entity_code, data in all_entities[timespan].items():
        media_count = len(all_entities[timespan][entity_code]['media'])
        all_entities[timespan][entity_code]['mediaCount'] = media_count
        all_entities[timespan][entity_code]['totalScore'] = round(all_entities[timespan][entity_code]['totalScore'], 3)
    for entity_code, data in all_entities[timespan].items():
        ranked_entities[timespan].append(data)
    # Rank entities by total relevance score; entities with equal scores share the same rank
    ranked_entities[timespan].sort(key=lambda x: x['totalScore'], reverse=True)
    rank = 1
    equally = 0
    previous_entity_score = ranked_entities[timespan][0]['totalScore']
    for entity in ranked_entities[timespan]:
        current_entity_score = entity['totalScore']
        if current_entity_score < previous_entity_score:
            rank += equally
            equally = 1
        if current_entity_score == previous_entity_score:
            equally += 1
        entity['totalScoreRank'] = rank
        previous_entity_score = current_entity_score
print('')
print('/// All entities listed and ranked')
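# ranked_entities now holds, for each timespan, every retained entity sorted by totalScore,
# with tied scores sharing the same totalScoreRank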
# Prepare global data file
data = {
    'entities': {},
    'media': {},
    'analyse': {}
}
data['analyse']['countMediaMin'] = count_media_min
data['analyse']['relevanceThreshold'] = relevance_threshold
# Load entities and media into the global data object
for i, medium in enumerate(media):
    # Add media to list
    data['media'][medium['code']] = {
        'code': medium['code'],
        'name': medium['name'],
        'domain': medium['domain']
    }
    data['media'][medium['code']]['analyse'] = {}
    # Add info about last analysis for each medium
    try:
        data['media'][medium['code']]['analyse']['nbStories'] = medium['analysesBy7days']['current']['nbStories']
        data['media'][medium['code']]['analyse']['lastAnalyse'] = medium['analysesBy7days']['current']['lastAnalyse']
    except:
        data['media'][medium['code']]['analyse']['nbStories'] = 'nc'
        data['media'][medium['code']]['analyse']['lastAnalyse'] = 'nc'
    # Add entities
    try:
        all_entities = medium['analysesBy7days']['current']['entities']
        print('7days analysis for', medium['name'], ':', len(all_entities), 'entities')
        for entity in all_entities:
            if entity['confidence'] >= confidence_threshold and entity['relevance'] >= relevance_threshold and entity['name'] not in ignore_list:
                # Create entity medium
                entity_medium = {
                    'code': medium['code'],
                    'name': medium['name'],
                    'relevance': entity['relevance'],
                    'confidence': entity['confidence'],
                }
                # Try to find previous relevance score
                previous_entities = medium['analysesBy7days']['previous']['entities']
                entity_medium['previousRelevance'] = '-'
                entity_medium['previousRelevanceEvol'] = 'none'
                for previous_entity in previous_entities:
                    if previous_entity['code'] == entity['code']:
                        entity_medium['previousRelevance'] = previous_entity['relevance']
                        if entity_medium['previousRelevance'] <= entity_medium['relevance']:
                            entity_medium['previousRelevanceEvol'] = 'more'
                        else:
                            entity_medium['previousRelevanceEvol'] = 'less'
                # Add entity to dict if not existing
                if entity['code'] not in data['entities']:
                    data['entities'][entity['code']] = {
                        'code': entity['code'],
                        'name': entity['name'],
                        'media': []
                    }
                    # Get search terms for this entity
                    try:
                        data['entities'][entity['code']]['terms'] = entity['terms']
                    except:
                        data['entities'][entity['code']]['terms'] = []
                else:
                    # Add search terms to entity data if not already there
                    try:
                        terms = entity['terms']
                    except:
                        terms = []
                    current_terms = data['entities'][entity['code']]['terms']
                    for term in terms:
                        if term not in current_terms:
                            data['entities'][entity['code']]['terms'].append(term)
                # Add entity medium to entity data
                data['entities'][entity['code']]['media'].append(entity_medium)
    except:
        print('Could not find entities for 7days analysis of', medium['name'])
print('')
print('/// Media and entities selection completed')
# Build search query for each entity
for name, info in data['entities'].items():
    terms = data['entities'][name]['terms']
    search = ''
    for i, term in enumerate(terms):
        if i == 0:
            search += ' '
        if i > 0:
            search += ' OR '
        search += '"' + term + '"'
    data['entities'][name]['search'] = search
# Remove terms array to save file space
for name, info in data['entities'].items():
    del data['entities'][name]['terms']
print('')
print('/// Search queries OK')
# Add search URL for each media in each entity
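# Each entity/medium pair gets a Google search URL restricted to the medium's domain
# (the 'site:' operator) and to the past week (the '&tbs=qdr:w' parameter)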
for name, info in data['entities'].items():
    entity_media = data['entities'][name]['media']
    for entity_medium in entity_media:
        domain = data['media'][entity_medium['code']]['domain']
        medium_search_url = search_url['path'] + domain + urllib.parse.quote_plus(data['entities'][name]['search']) + search_url['timeParam']
        entity_medium['searchURL'] = medium_search_url
# Remove search query to save file space
for name, info in data['entities'].items():
    del data['entities'][name]['search']
print('')
print('/// Search URLs ready')
# Add averages and other calculations for each medium in each entity
for name, info in data['entities'].items():
    entity_media = data['entities'][name]['media']
    data['entities'][name]['mediaCount'] = len(entity_media)
    total_score = 0
    for entity_medium in entity_media:
        total_score += entity_medium['relevance']
    data['entities'][name]['averageRelevance'] = round(total_score / len(entity_media), 4)
    for entity_medium in entity_media:
        entity_medium['spreadAverageType'] = 'none'
        # Spread vs. the entity's average relevance, as an absolute percentage
        entity_medium['spreadAverage'] = round((entity_medium['relevance'] - data['entities'][name]['averageRelevance']) / data['entities'][name]['averageRelevance'] * 100, 1)
        # print(entity_medium['spreadAverage'])
        if entity_medium['spreadAverage'] > 0:
            entity_medium['spreadAverageType'] = 'more'
        else:
            entity_medium['spreadAverageType'] = 'less'
        entity_medium['spreadAverage'] = math.fabs(entity_medium['spreadAverage'])
        # Spread vs. the previous relevance score, when one exists
        entity_medium['spreadRelevance'] = '-'
        if entity_medium['previousRelevance'] != '-':
            entity_medium['spreadRelevance'] = round((entity_medium['relevance'] - entity_medium['previousRelevance']) / entity_medium['previousRelevance'] * 100, 1)
print('')
print('/// Calculation done')
# Remove data to save space
for name, info in data['entities'].items():
    entity_media = data['entities'][name]['media']
    for entity_medium in entity_media:
        del entity_medium['confidence']
for name, info in data['media'].items():
    del data['media'][name]['domain']
print('')
print('/// Data removed for smaller file')
# Sort media for each entity by relevance, highest first
for name, info in data['entities'].items():
    entity_media = data['entities'][name]['media']
    entity_media.sort(key=lambda x: x['relevance'], reverse=True)
# Make array with entities
entities_sort = []
for name, info in data['entities'].items():
    entities_sort.append(info)
# Keep only entities whose media count is at or above the threshold
entities_filtered = []
for entity in entities_sort:
    if entity['mediaCount'] >= count_media_min:
        entities_filtered.append(entity)
data['entities'] = entities_filtered
print('')
print('/// Entities filtered and sorted')
# Add entity rank
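# currentRank/previousRank come from the global ranking above; rankDiff is rendered as
# '=' (no change), '▼ n' (dropped n places) or '▲ n' (gained n places), and entities
# without a previous rank are flagged 'nouveau'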
for entity in data['entities']:
    entity_code = entity['code']
    for timespan in analyses_timespans:
        for ranked_entity in ranked_entities[timespan]:
            if ranked_entity['code'] == entity_code:
                entity[timespan + 'Rank'] = ranked_entity['totalScoreRank']
for entity in data['entities']:
    try:
        diff_rank = entity['currentRank'] - entity['previousRank']
        if diff_rank == 0:
            entity['rankDiff'] = '='
        if diff_rank > 0:
            entity['rankDiff'] = '▼ ' + str(diff_rank)
        if diff_rank < 0:
            entity['rankDiff'] = '▲ ' + str(abs(diff_rank))
    except:
        entity['rankDiff'] = 'nouveau'
    try:
        del entity['previousRank']
    except:
        pass
data['entities'].sort(key=lambda x: x['currentRank'], reverse=False)
print('')
print('/// Rank added for all entities')
# Write file with all results
data_to_file = {}
data_to_file['7days'] = data
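# The resulting data.json is shaped roughly like:
# { "7days": { "analyse": {...}, "media": { "<code>": {...}, ... }, "entities": [ {...}, ... ] } }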
with open(data_path + 'data.json', 'w+') as fp:
    json.dump(data_to_file, fp)
print('')
print('/// JSON global file ready')