CreateNews.py
from functions import *
import pandas as pd
import operator
def find_top_group(data):
    # Rank the tweets four ways (favourites, author followers, retweets and a
    # combined score) and give each top-10 membership a fixed weight.
    data_top_by_favorite = data.sort_values('favorite_count', ascending=False)
    top_tweets_by_favorite = {k: 2 for k in data_top_by_favorite['id'].head(10)}
    data_top_by_user_followers = data.sort_values('user_followers', ascending=False)
    top_tweets_by_user_followers = {k: 3 for k in data_top_by_user_followers['id'].head(10)}
    data_top_by_retweet_count = data.sort_values('retweet_count', ascending=False)
    top_tweets_by_retweet_count = {k: 1 for k in data_top_by_retweet_count['id'].head(10)}
    data_score = data.copy()
    data_score['score'] = data_score['user_followers'] * data_score['retweet_count'] * data_score['favorite_count']
    data_score = data_score.sort_values('score', ascending=False)
    top_tweets_by_score = {k: 4 for k in data_score['id'].head(10)}
    # Sum the weights of every candidate tweet across the four rankings.
    ids = set(top_tweets_by_favorite.keys()) | set(top_tweets_by_user_followers.keys()) | set(top_tweets_by_retweet_count.keys()) | set(top_tweets_by_score.keys())
    headlines = {}
    for id in ids:
        fa = get_value(top_tweets_by_favorite, id)
        uf = get_value(top_tweets_by_user_followers, id)
        rc = get_value(top_tweets_by_retweet_count, id)
        sc = get_value(top_tweets_by_score, id)
        headlines[id] = fa + uf + rc + sc
    # Keep the ten best-scored candidates.
    top_headlines = sorted(headlines.items(), key=operator.itemgetter(1), reverse=True)[:10]
    top_headlines_dic = {i[0]: i[1] for i in top_headlines}
    data_headlines = data[data.id.isin(list(top_headlines_dic.keys()))].copy()
    data_headlines['score'] = [top_headlines_dic[id] for id in data_headlines['id']]
    top_data_headlines = data_headlines.sort_values('score', ascending=False)
    return top_data_headlines
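# Illustrative sketch (hypothetical helper, not called by the pipeline): the rank-fusion
# step above sums fixed weights per ranking. Plain dict.get stands in for the project's
# get_value helper, which is assumed to default missing ids to 0.
def _rank_fusion_example():
    by_favorite = {101: 2, 102: 2}   # weight 2 per tweet in the favourites top 10
    by_followers = {103: 3}          # weight 3 per tweet in the followers top 10
    by_retweets = {102: 1}           # weight 1 per tweet in the retweets top 10
    by_score = {101: 4}              # weight 4 per tweet in the combined-score top 10
    ids = set(by_favorite) | set(by_followers) | set(by_retweets) | set(by_score)
    combined = {i: by_favorite.get(i, 0) + by_followers.get(i, 0)
                   + by_retweets.get(i, 0) + by_score.get(i, 0) for i in ids}
    return combined  # {101: 6, 102: 3, 103: 3}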
def eval_top_words(text, set_words):
    # Count how many lemmatized tokens of the cleaned text appear in the word set.
    text_clean = clean_text(text)
    tokens = [lemmatize(token) for token in text_clean.split()]
    c = 0
    for token in tokens:
        if token in set_words:
            c += 1
    return c
def tag_news(text):
    # Pick the glossary with the most matches as the tag; matches against the
    # 'zbad' glossary subtract points. Returns '<tag> <points>'.
    glossaries = get_glossaries()
    point = 0
    tag = 'otros'
    for key in glossaries:
        if key != 'zbad':
            c = eval_top_words(text, glossaries[key])
            if c > point:
                point = c
                tag = key
        else:
            c = eval_top_words(text, glossaries[key])
            point = point - c
    if point <= 0:
        tag = 'otros'
    return tag + ' ' + str(point)
# Find headlines for a trend
def find_headlines(trend_path):
    # Read the trend json file
    trend_name = trend_path.split('\\')[-1].split('.')[0]
    data = pd.read_json(trend_path, lines=True)
    data['url'] = data['text'].apply(get_url)
    data['text'] = data['text'].apply(clean_url)
    data = data.drop_duplicates(subset=['text'])
    # Find the group of potential headlines
    top_data_headlines = find_top_group(data)
    # Tag the candidates and identify the headline
    top_data_headlines['tag'] = top_data_headlines['text'].apply(tag_news)
    top_data_headlines['tag_point'] = top_data_headlines['tag'].apply(lambda text: int(text.split()[1]))
    top_data_headlines['tag'] = top_data_headlines['tag'].apply(lambda text: text.split()[0])
    top_data_headlines['sum_p'] = top_data_headlines['score'] + top_data_headlines['tag_point']
    # Drop untagged ('otros') tweets and duplicates, then rank by combined score
    idx_drop = top_data_headlines[top_data_headlines['tag'] == 'otros'].index
    top_data_headlines_final = top_data_headlines.drop(idx_drop)
    top_data_headlines_final = top_data_headlines_final.drop_duplicates(subset=['text'])
    top_data_headlines_final = top_data_headlines_final.sort_values('sum_p', ascending=False)
    print('[✓] ' + trend_name + ' news found successfully.')
    return top_data_headlines_final, data
# News item
def create_new(row_data_news):
    # Build the news dict for one headline row.
    new = {}
    new['id'] = row_data_news['id']
    new['title'] = row_data_news['text']
    new['date'] = str(row_data_news['date'])
    new['screen_name'] = row_data_news['screen_name']
    new['name'] = row_data_news['name']
    new['img'] = row_data_news['img']
    new['url'] = row_data_news['url']
    new['category'] = row_data_news['tag']
    new['score'] = row_data_news['sum_p']
    new['sentiment'] = 'sentiment/' + str(row_data_news['id']) + '.jpg'
    new['wordcloud'] = 'wordcloud/' + str(row_data_news['id']) + '.jpg'
    new['entities'] = row_data_news['entities']
    return new
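# Minimal usage sketch, assuming a line-delimited trend JSON (as read by find_headlines)
# exists at the hypothetical path below and that its rows carry the columns create_new
# reads (screen_name, name, img, date, entities, ...).
if __name__ == '__main__':
    trend_file = 'trends\\ExampleTrend.json'  # hypothetical path; adjust to your layout
    top_headlines, raw_data = find_headlines(trend_file)
    # Convert each ranked headline row into the news dict used downstream.
    news_items = [create_new(row) for _, row in top_headlines.iterrows()]
    for item in news_items:
        print(item['category'], '-', item['title'])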