-
Notifications
You must be signed in to change notification settings - Fork 11
/
Copy pathreddit_scaper.py
59 lines (46 loc) · 2.27 KB
/
reddit_scaper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
#!/usr/bin/env python
#https://github.com/nicholasserra/reddit-simple-media-scrape/blob/master/reddit_scrape.py
#https://crypto.stackexchange.com/questions/26336/sha-512-faster-than-sha-256
from argparse import ArgumentParser
import requests
import hashlib
import sys
import os
import re
class RedditScrape(object):
PROFILE_URL = 'https://www.reddit.com/user/{}/submitted.json'
def __init__(self, username, directory):
assert os.path.isdir(directory), RuntimeError('Invalid directory')
self.username = username
self.directory = directory
def scrape(self):
directory = self.directory + os.sep + self.username
if not os.path.exists(directory):
os.mkdir(directory)
count = 0
user_url = self.PROFILE_URL.format(self.username)
user_agent = {'User-agent':'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.8; rv:21.0) Gecko/20100101 Firefox/21.0'}
while True:
response = requests.get(user_url, headers=user_agent).text
hash_rsp = hashlib.sha512(str.encode(response)).hexdigest() # <-- long story to explain why i did this silly thing and why i first thought that would be a clever thing to do, lol.
if not os.path.exists(directory + os.sep + hash_rsp + '.json'):
file = open(f'{directory}{os.sep}{hash_rsp}.json', 'w+')
file .write(response)
file .close()
after_id = re.search('(?<="after": ").*?(?=")', response)
if after_id : user_url = self.PROFILE_URL.format(self.username) + '?after={}'.format(after_id.group(0))
else : break
count += 1
sys.stdout.write('.')
sys.stdout.flush()
print('\n{} files saved.'.format(count))
def main():
parser = ArgumentParser (description="Reddit simple media scraper.")
parser.add_argument ('username' , nargs='?', type=str , help='Username of the user you want to get the data')
parser.add_argument ('directory', nargs='?', default=os.path.dirname(os.path.realpath(__file__)))
args = parser.parse_args()
scraper = RedditScrape (args.username, args.directory)
scraper.scrape ()
if __name__ == '__main__':
main()
#python reddit_scrapper.py username