# webscraping.py
from bs4 import BeautifulSoup
from bs4.element import Comment
from googlesearch import search  # the caller passes search() results in as response_object
import re
from time import sleep
from nltk.tokenize import sent_tokenize  # needs nltk.download('punkt') once
# make more sentences for Audrey


# Tell visible elements from invisible ones manually: bs4 text nodes have no
# Selenium-style is_displayed(), so filter by the parent tag instead.
def mask_visible(element):
    if element.parent.name in ['style', 'script', 'head', 'title', 'meta', '[document]']:
        return False
    if isinstance(element, Comment):
        return False
    return True

def text_from_html(html, query):
    """Extract the visible text from raw HTML and split it into sentences."""
    soup = BeautifulSoup(html, 'html.parser')
    texts = soup.find_all(string=True)  # every text node, rendered or not
    print('html text areas found:', len(texts))
    output = ''
    for t in texts:
        if mask_visible(t):
            output += ' {}'.format(t)
    # Split on '.', newlines, and the CJK full stop, then let NLTK
    # re-tokenize the joined pieces into proper sentences.
    output = [i.strip() for i in re.split(r'\.|\n|。', output)]
    output = sent_tokenize(" ".join(output))
    print(type(output), len(output))
    return output
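
# A minimal sanity check for text_from_html (illustrative, not part of the
# original script): script/style/title text should be dropped while body text
# survives. Assumes the NLTK 'punkt' data is already downloaded.
def _demo_text_from_html():
    sample = ('<html><head><title>ignored</title>'
              '<script>var hidden = 1;</script></head>'
              '<body><p>Solar panels convert sunlight. They are efficient.</p>'
              '</body></html>')
    sentences = text_from_html(sample, 'solar')
    print(sentences)  # expect only the two <p> sentences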

def parse_another_site(response_object, driver, f, query):
    """Fetch the next search-result URL, scrape its sentences, and log them to f."""
    url = next(response_object)
    driver.get(url)
    sleep(3)  # give the page a moment to render before reading the DOM
    text = text_from_html(driver.page_source, query)
    print('\n gotten text: ', type(text), 'Sentence count: ', len(text), url, '\n')
    for i in text:
        f.write('"' + i + '",\n')
    # Keep only the sentences that actually mention the query term.
    output = [i for i in text if (query.lower() in i.lower())]
    print('how many acceptable sentences were found: ', len(output))
    return output, url
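
# A minimal usage sketch, not from the original file: it assumes Selenium's
# Chrome driver and a googlesearch package whose search() yields result URLs
# one at a time (matching next(response_object) above). The query string,
# output filename, and result count are illustrative placeholders.
if __name__ == '__main__':
    from selenium import webdriver

    query = 'solar energy'       # hypothetical example query
    results = search(query)      # generator of result URLs
    driver = webdriver.Chrome()
    try:
        with open('sentences.csv', 'w') as f:  # hypothetical output file
            for _ in range(3):                 # scrape a few results
                matches, url = parse_another_site(results, driver, f, query)
                print(url, '->', len(matches), 'matching sentences')
    finally:
        driver.quit()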