-
Notifications
You must be signed in to change notification settings - Fork 70
/
scan_pages_history_to_big_list.py
51 lines (38 loc) · 1.29 KB
/
scan_pages_history_to_big_list.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
import json, random, os
from tqdm import tqdm
from extstats.CONSTS import SITEMAP_FILE, PAGES_DIRECTORY
from extstats.store_infos_history import latest_good, TO_RM
# Directory listed below to obtain the extension ids to scan.
DIR = PAGES_DIRECTORY

ext_ids = os.listdir(DIR)
# Visit the ids in random order.
random.shuffle(ext_ids)
exts = []

# Map the last path segment of every sitemap URL to the full URL; that segment
# is what gets looked up by extension id below. The context manager closes the
# sitemap file as soon as it has been parsed instead of leaking the handle.
with open(SITEMAP_FILE) as sitemap_file:
    urls = {url.split('/')[-1]: url for url in json.load(sitemap_file)}

for ext_id in tqdm(ext_ids):
    latest = latest_good(ext_id)
    if latest and 'content' in latest:
        content = latest['content']
        content['ext_id'] = ext_id
        if 'url' not in content:
            # Prefer the sitemap URL; otherwise build a web store detail URL
            # from the id alone.
            content['url'] = urls.get(
                ext_id,
                "https://chrome.google.com/webstore/detail/_/" + ext_id,
            )
        content['not_in_sitemap'] = ext_id not in urls
        exts.append(content)
    # Print the size of TO_RM whenever it equals 10 modulo 100.
    # NOTE(review): presumably latest_good() appends to TO_RM - confirm.
    if len(TO_RM) % 100 == 10:
        print(len(TO_RM))

print(len(exts), 'extensions')
def safeint(n):
    """Convert *n* to an int, returning -1 when it cannot be converted.

    Args:
        n: Any value to pass to ``int()``.

    Returns:
        ``int(n)``, or ``-1`` if the conversion raises ``TypeError``,
        ``ValueError`` or ``OverflowError``.
    """
    try:
        return int(n)
    except (TypeError, ValueError, OverflowError):
        # Only conversion failures are swallowed; a bare ``except`` would also
        # trap KeyboardInterrupt and SystemExit.
        return -1
# Highest user_count first; entries whose user_count is missing or cannot be
# converted get -1 from safeint and therefore sort last.
exts = sorted(exts, key=lambda x: -safeint(x.get('user_count')))

# Write through a context manager so the output file is flushed and closed
# before the summary below is printed.
with open('data/PAGES.json', 'w') as pages_file:
    json.dump(exts, pages_file, indent=2, sort_keys=True)

# Summary: the first 20 entries of the sorted list. user_count is read with
# .get(), as in the sort key above, so a missing key does not abort the report.
for x in exts[:20]:
    print(x['name'], x.get('user_count'))

print()
print('deleted:')
# Up to 10 entries that carry a truthy 'deleted' value.
for x in [x for x in exts if x.get('deleted')][:10]:
    print(x['name'], x.get('user_count'))

# One TO_RM entry per line.
print('\n'.join(TO_RM))