-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathextractsetopati.py
56 lines (47 loc) · 1.9 KB
/
extractsetopati.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
#!/usr/bin/python3
# Program to extract article content from Setopati
# for building a Nepali n-gram model.
# http://virtualanup.com/nepali-ngram-models/
import urllib3
from bs4 import BeautifulSoup
import os
# Module-level connection pool manager; the extraction loop below issues every
# article request through it.
http = urllib3.PoolManager()
def checkDir(directory):
    """Ensure that ``directory`` exists, creating it (and parents) if needed.

    Args:
        directory: Path of the directory to create when it is missing.

    Raises:
        OSError: If the directory cannot be created, including
            FileExistsError when the path exists but is not a directory.
    """
    # exist_ok=True makes the call a no-op for an existing directory and
    # avoids the race of a separate os.path.exists() check before creation.
    os.makedirs(directory, exist_ok=True)
# Directory that receives one output file per article id.
outputdirname = "setopatioutput"
# Base URL; the numeric article id is appended to it.
setopatiurl = "http://setopati.com/bichar/"

# Make sure the output directory exists.
checkDir(outputdirname)

# Iterate through the news articles.
for i in range(2000, 12000):
    filename = os.path.join(outputdirname, str(i))

    # If the output file for the news article already exists, skip it. This
    # avoids redoing work when the script is interrupted in the middle of
    # extraction and started again.
    if os.path.exists(filename):
        print("Skipping ", i)
        continue

    # Try to get the HTML content of the URL.
    articleurl = setopatiurl + str(i)
    print("Extracting content from ", i)
    try:
        r = http.request('GET', articleurl)
    except urllib3.exceptions.HTTPError as err:
        # A network failure should not abort the whole crawl. No output file
        # is created, so a later run will retry this article.
        print("Request failed for ", i, ":", err)
        continue

    # Create the output file even when nothing can be extracted: the (possibly
    # empty) file marks this id as processed so that a re-run skips it. The
    # context manager guarantees the file is closed on every path.
    with open(filename, 'wb') as outputfile:
        if r.status != 200:
            continue
        try:
            # Success. Now try to extract the news portion using BeautifulSoup.
            # The parser is named explicitly so the result does not depend on
            # which optional parsers happen to be installed.
            extractor = BeautifulSoup(r.data, "html.parser")
            # The content inside the division with ID 'newsbox' is the main
            # content. find() returns None when the page has no such division.
            newsbox = extractor.find("div", {"id": "newsbox"})
            if newsbox is not None and len(newsbox) > 1:
                # If there is news inside the news box, its length is > 1.
                # Remove the content inside strong, h1, span and h2 tags.
                for htmltag in ['strong', 'h1', 'span', 'h2']:
                    for tag in newsbox.find_all(htmltag):
                        tag.decompose()
                # Put the whole article on one line and encode once at the
                # I/O boundary.
                content = newsbox.get_text().replace('\n', ' ')
                outputfile.write(content.encode('UTF-8'))
        except Exception as err:
            # Best effort: a page that cannot be parsed leaves an empty file,
            # but the failure is reported instead of being silently dropped.
            print("Failed to extract ", i, ":", err)