# data_prep.py
import datasets
# async HTTP client
from aiohttp import ClientSession
import asyncio
# dataframes
import pandas as pd
# file system utilities and subprocess (used by gen_temp_file)
from glob import glob
import os
import subprocess
# HTML parsing, parallel execution, progress bars, CSV streaming
from bs4 import BeautifulSoup
from joblib import Parallel, delayed
from tqdm import tqdm
import csv

# per-article summary statistics; expected to exist next to this script and to be
# row-aligned with the UCI Online News Popularity CSV
summary_df = pd.read_csv('text_summary_stats.csv')
def gen_temp_file():
    """Merge the per-batch CSVs in text_datasets/ into one file and return its path."""
    # if no batch files exist yet, run this script once to generate them
    if not glob('text_datasets/*.csv'):
        try:
            subprocess.run(['python', 'data_prep.py'])
        except FileNotFoundError:
            subprocess.run(['python3', 'data_prep.py'])
    csv_files = glob('text_datasets/*.csv')
    out_dir = os.path.join(os.getcwd(), 'online_news_popularity_data')
    os.makedirs(out_dir, exist_ok=True)
    fpath = os.path.join(out_dir, 'online_news_popularity_data.csv')
    with open(fpath, 'w', newline='') as f:
        # take the column layout from the first batch file
        fieldnames = pd.read_csv(csv_files[0]).columns.tolist()
        writer = csv.DictWriter(f, fieldnames=fieldnames)
        writer.writeheader()
        for csv_file in tqdm(csv_files):
            with open(csv_file) as csvfile:
                reader = csv.DictReader(csvfile)
                for row in reader:
                    writer.writerow({fieldname: row[fieldname] for fieldname in fieldnames})
    return fpath
def get_texts(urls, shares):
    # synchronous convenience wrapper around the async downloader
    df = asyncio.run(download_all_data(urls, shares))
    return df
async def text_download(url, session):
    """Fetch one article; return (title, text), or (None, None) on any failure."""
    async with session.get(url) as resp:
        if resp.status != 200:
            return None, None
        try:
            html = await resp.read()
            soup = BeautifulSoup(html, 'html.parser')
            # page titles look like 'Headline | Site'; keep the headline only
            title = soup.find('title').text.split(' | ')[0]
            paragraphs = soup.find_all('p')
            # drop gallery-widget paragraphs
            texts = [p.text for p in paragraphs if 'nggallery' not in p.text]
            text = '\n'.join(texts)
        except Exception:
            title, text = None, None
        return title, text
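# Illustrative only: a minimal standalone check of text_download. The URL below is
# a placeholder assumption, not taken from the dataset; uncomment to try it.
#
#     async def _demo():
#         async with ClientSession() as session:
#             return await text_download('https://example.com/some-article', session)
#
#     title, text = asyncio.run(_demo())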
async def download_all_data(urls, shares):
    """Download all articles concurrently; return a (title, content, shares) frame."""
    async with ClientSession() as session:
        tasks = [text_download(url, session) for url in urls]
        full_texts = await asyncio.gather(*tasks)
    titles = [text[0] for text in full_texts]
    contents = [text[1] for text in full_texts]
    df = pd.DataFrame(zip(titles, contents, shares), columns=['title', 'content', 'shares'])
    return df
def save_text_csv(urls, shares, file_name, summary):
    """Download one batch of articles and write it, joined with its summary rows, to a CSV."""
    try:
        dataset_dir = os.path.join(os.getcwd(), 'text_datasets')
        os.makedirs(dataset_dir, exist_ok=True)
        df = asyncio.run(download_all_data(urls, shares))
        # align positionally before the column-wise concat
        df = pd.concat([df.reset_index(drop=True), summary.reset_index(drop=True)], axis=1)
        df.to_csv(os.path.join(dataset_dir, file_name), index=False)
    except Exception:
        # a failed batch is skipped; gen_temp_file merges whatever succeeded
        pass
    return None
if __name__ == '__main__':
    # fetch the UCI Online News Popularity archive and read its CSV
    dl_manager = datasets.DownloadManager()
    _DOWNLOAD_URL = 'https://archive.ics.uci.edu/ml/machine-learning-databases/00332/OnlineNewsPopularity.zip'
    archive = dl_manager.download(_DOWNLOAD_URL)
    for path, f in dl_manager.iter_archive(archive):
        if path.endswith('.csv'):
            df = pd.read_csv(f)
            df.columns = df.columns.str.strip()
            urls = df.url.str.replace('http://', 'https://')
            shares = df.shares
    N = df.shape[0]
    batch_size = 100
    print('starting batched download')
    # scrape in batches of 100 across all cores; the summary rows are sliced with
    # .iloc so they stay aligned with the positional url/shares slices (.loc would
    # also include the end label, overlapping consecutive batches)
    res = Parallel(n_jobs=-1)(
        delayed(save_text_csv)(
            urls[i * batch_size:(i + 1) * batch_size],
            shares[i * batch_size:(i + 1) * batch_size],
            f"dataset_{i}.csv",
            summary_df.iloc[i * batch_size:(i + 1) * batch_size, :],
        )
        # ceil(N / batch_size) batches; avoids a spurious empty final batch
        for i in tqdm(range((N + batch_size - 1) // batch_size))
    )
    # merge the batch files, then drop rows where any field failed to download
    path = gen_temp_file()
    df = pd.read_csv(path)
    df = df.dropna().reset_index(drop=True)
    df.to_csv(path, index=False)
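# Assumed workflow, inferred from the code above (a sketch, not documented by the
# source): text_summary_stats.csv must sit alongside this script, and running
#
#     python data_prep.py
#
# downloads the UCI archive, scrapes each article into text_datasets/dataset_<i>.csv,
# and merges the surviving batches into
# online_news_popularity_data/online_news_popularity_data.csv via gen_temp_file().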