-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathclean.py
43 lines (37 loc) · 1.51 KB
/
clean.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
from bs4 import BeautifulSoup as soup
import os
import re
import requests
# Directory holding the saved HTML pages; every file in it is converted to a
# CSV file with the same base name.
htmlDir = 'html'
# Output directories. Paths are built with os.path.join so the script is not
# tied to the Windows path separator.
csvDir = 'csv'
jpgDir = 'jpg'

# (pattern, replacement) pairs applied in this exact order to a row's text.
# The earlier rules insert or drop spaces around specific keywords and turn
# '%' into a space; the last rule converts every remaining space to a comma,
# so the order of the rules matters.
_SPACING_RULES = (
    (r'(Exotic)', r'\1 '),
    (r'(Legendary|Gun|Launcher|Glaive|Rifle|Sword|Cannon|Bow|Shotgun)', r'\1 '),
    (r' (Bow) ', r'\1 '),
    (r'%', ' '),
    (r'(Exotic|Legendary)', r' \1'),
    (r' (Gun|Launcher|Rifle|Sidearm) ', r'\1 '),
    (r' ', ','),
)


def _row_to_csv_line(row_text):
    """Return *row_text* rewritten as one comma-separated line.

    The first space in the text is removed, then each rule in
    ``_SPACING_RULES`` is applied in order.
    """
    # count is passed by keyword: passing it positionally is deprecated in
    # recent Python versions.
    line = re.sub(r' ', '', row_text, count=1)
    for pattern, replacement in _SPACING_RULES:
        line = re.sub(pattern, replacement, line)
    return line


def _extract_csv_lines(html_text):
    """Parse *html_text* and return one cleaned line per ``<tr>`` row.

    For each row, the row's text is concatenated with the base file name of
    the ``src`` attribute of a tag inside that row (the last such tag wins if
    there are several) and passed through ``_row_to_csv_line``. The first
    collected entry is dropped before returning (presumably the table's
    header row -- TODO confirm).
    """
    parsed = soup(html_text, 'html.parser')
    csv_lines = []
    # Not reset per row: a row without any src-bearing tag reuses the value
    # from the previous row ('' until the first one is seen).
    # NOTE(review): confirm that repeating the previous row is intended.
    row_text_with_icon = ''
    for row in parsed.find_all('tr'):
        for tag in row.find_all(lambda el: el.has_attr('src')):
            filename = os.path.basename(tag['src'])
            row_text_with_icon = row.text + filename
            # The icon directory is still created even though the download
            # below is disabled.
            os.makedirs(jpgDir, exist_ok=True)
            # Download is commented out so the script doesn't fetch lots of
            # unused jpegs:
            # with open(os.path.join(jpgDir, filename), 'wb') as jpgFile:
            #     jpgFile.write(requests.get(tag['src']).content)
        csv_lines.append(_row_to_csv_line(row_text_with_icon))
    # Slicing instead of pop(0) so a page with no <tr> rows yields an empty
    # result rather than raising IndexError and aborting the whole run.
    return csv_lines[1:]


# Make sure the output directory exists before writing any CSV file.
os.makedirs(csvDir, exist_ok=True)

for htmlFile in os.listdir(htmlDir):
    # Context manager closes the input file promptly instead of leaking the
    # handle until garbage collection.
    with open(os.path.join(htmlDir, htmlFile), 'r') as htmlHandle:
        contents = htmlHandle.read()
    table_rows_text = _extract_csv_lines(contents)
    csv_filename = os.path.splitext(htmlFile)[0] + ".csv"
    with open(os.path.join(csvDir, csv_filename), 'w') as csvFile:
        csvFile.writelines(row + '\n' for row in table_rows_text)