-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathwebscraping.py
87 lines (77 loc) · 2.94 KB
/
webscraping.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
import os
import re
from xml.sax.saxutils import escape, quoteattr

import requests
from bs4 import BeautifulSoup
# Ask for the page URL and the book name up front.  Pasted text often
# carries stray leading/trailing whitespace, which would otherwise end up in
# the request URL and in the output file name, so it is stripped here.
url_link = input("Paste url : ").strip()
book_name = input("Enter the name of the book: ").strip()
# method to validate chapter_count
def chapterCount(message):
    """Prompt until the user enters a positive whole number and return it.

    Args:
        message: Prompt text shown to the user on every attempt.

    Returns:
        The entered chapter count as an ``int`` (always >= 1).
    """
    while True:
        raw = input(message)
        try:
            count = int(raw)
        except ValueError:
            print("You have to type a number! try again..")
            continue
        # A count below 1 would silently produce a book without chapters,
        # so it is rejected just like non-numeric input.
        if count < 1:
            print("The number of chapters must be at least 1! try again..")
            continue
        return count
# method to validate abbr_name
def abbrName(message):
    """Prompt until the user enters a non-blank abbreviation and return it.

    ``input`` always returns a string, so there is no conversion that could
    fail; the only invalid answer is an empty or whitespace-only line.

    Args:
        message: Prompt text shown to the user on every attempt.

    Returns:
        The text exactly as typed (surrounding whitespace is not removed).
    """
    while True:
        abbreviation = input(message)
        if abbreviation.strip():
            return abbreviation
        print("You have to type character! try again..")
abbr_name = abbrName("Enter short abbr for the book: ")
chapter_count = chapterCount("How many chapters in this book?: ")

# The chapter number is inserted directly in front of the last ".html" of the
# pasted URL (".../GEN.html" -> ".../GEN1.html", ".../GEN2.html", ...).
# Plain string partitioning is used so that "." is not treated as a regex
# wildcard and no other part of the URL can be rewritten by accident.
URL_SUFFIX = '.html'
url_head, suffix_found, url_tail = url_link.rpartition(URL_SUFFIX)
if not suffix_found:
    raise SystemExit(f'The url must contain "{URL_SUFFIX}": {url_link!r}')

# create a folder for the book in current directory
directory = './tdm_simple/'
file_name = f'{book_name}.xml'
file_path = os.path.join(directory, file_name)
# create the folder unless it already exists
os.makedirs(directory, exist_ok=True)

# CSS classes of the elements whose <span> children are scanned for text.
CONTENT_CLASSES = ['p', 'q1', 'q2', 's1', 'm']
# A verse marker is either a range ("3-4") or a single number ("3").
VERSE_NUMBER = re.compile(r'\d+-\d+|\d+')
VERSE_CLOSE = '</VERS>'
# Seconds to wait for the server before giving up on a chapter page.
REQUEST_TIMEOUT = 30

# NOTE: the file is opened in append mode, so running the script again for
# the same book adds a second <BIBLEBOOK> element to the existing file.
with open(file_path, encoding='utf-8', mode='a') as f:
    f.write(
        f'<BIBLEBOOK bnumber="" bname={quoteattr(book_name)} '
        f'bsname={quoteattr(abbr_name)}>'
    )
    for chapter in range(1, chapter_count + 1):
        response = requests.get(
            f'{url_head}{chapter}{URL_SUFFIX}{url_tail}',
            timeout=REQUEST_TIMEOUT,
        )
        # Fail with a clear HTTP error instead of parsing an error page.
        response.raise_for_status()

        # parsing the html ("html.parser" also works but is slower)
        soup = BeautifulSoup(response.content, 'lxml')

        # getting the chapter label; fall back to the running chapter number
        # when the page has no <div class="c">
        heading = soup.find('div', class_='c')
        if heading is not None:
            chapter_label = heading.get_text(strip=True)
        else:
            chapter_label = str(chapter)

        # select every element carrying one of the content classes
        blocks = soup.find_all(attrs={'class': CONTENT_CLASSES})
        # remove unwanted <span class="cf"> tags before reading any text
        for block in blocks:
            for unwanted in block.find_all('span', class_='cf'):
                unwanted.extract()

        # writing start here
        print(f'{book_name}{chapter}.xml start writing .........')
        f.write('\n')
        f.write(f'\t\t<CHAPTER cnumber={quoteattr(chapter_label)}>')

        pieces = []
        for block in blocks:
            for span in block.find_all('span'):
                for node in span:
                    # Escape first so the scraped text cannot inject markup;
                    # the tags added below are the only markup in the output.
                    text = escape(node.get_text(strip=True))
                    found = VERSE_NUMBER.search(text)
                    if found:
                        # NOTE(review): every number in this node is replaced
                        # using the FIRST match, so a number inside running
                        # verse text also becomes a verse marker -- confirm
                        # against the site's markup.
                        text = VERSE_NUMBER.sub(
                            f'{VERSE_CLOSE}\n\t\t\t'
                            f'<VERS vnumber="{found.group()}">',
                            text,
                        )
                    pieces.append(text)

        chapter_body = ''.join(pieces)
        if VERSE_CLOSE in chapter_body:
            # Every marker was written as "close previous verse, open next":
            # drop the very first close tag (nothing is open yet) and add
            # the close tag for the last verse.
            chapter_body = chapter_body.replace(VERSE_CLOSE, '', 1) + VERSE_CLOSE
        f.write(f'{chapter_body}\n\t\t</CHAPTER>')
    f.write('\n</BIBLEBOOK>')
    print("Done writing.")
print('file saved!!!')