# html_extractor.py
import requests
from bs4 import BeautifulSoup
from tqdm import tqdm
from flask_socketio import SocketIO, emit
from progress import *  # expected to provide `socketio` and `progress_bar_once`
# socketio = SocketIO(app)
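
# A minimal sketch of how `socketio` could be bound to a Flask app if the
# `progress` module does not already export one (hypothetical wiring; `app`
# is an assumed Flask application object, not defined in this module):
#
#     from flask import Flask
#     app = Flask(__name__)
#     socketio = SocketIO(app)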


def is_english(text):
    """
    Heuristic check: treat the text as English when at least half of its
    characters are ASCII (code point below 128).
    """
    english_count = sum(1 for char in text if ord(char) < 128)
    non_english_count = len(text) - english_count
    return english_count >= non_english_count
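
# Illustrative behaviour of the heuristic (hypothetical sample strings):
# >>> is_english("Hello, world!")
# True
# >>> is_english("こんにちは、世界。ようこそ。")
# False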

def check_failure(text, failed_txt):
    """
    Return True if any failure string in `failed_txt` occurs in `text`
    (case-insensitive substring match).
    """
    return any(failure_str.lower() in text.lower() for failure_str in failed_txt)
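
# Illustrative behaviour (hypothetical inputs):
# >>> check_failure("Oops! Page not found.", ["not found", "error"])
# True
# >>> check_failure("All good here.", ["not found", "error"])
# False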


def get_html(urls, mode_of_search=None):
    """
    Fetch a list of URLs and return the visible text content of each page.

    Args:
        urls: List of URLs to fetch.
        mode_of_search: Optional mode flag. When set to "Search Bar Scrape",
            pages whose text contains a known failure string are skipped.

    Returns:
        website_content: Dictionary {url: extracted_text}.
        failed_fetch: Number of URLs that could not be fetched.
    """
    website_content = {}  # Extracted text for each successfully fetched URL
    failed_fetch = 0      # Counter for failed fetch attempts
    failed_txt = ["Sorry", "not found", "oops", "try again", "failed", "error"]
    items_completed = 0
    total_items = len(urls)
    num_of_output_progress = 10
    # Emit a progress update roughly every tenth of the way through
    progress_step = max(1, total_items // num_of_output_progress)
    # Iterate through each URL in the list
    for website in tqdm(urls):
        items_completed += 1
        # Send an HTTP GET request; treat network errors as failed fetches
        try:
            response = requests.get(website, timeout=30)
        except requests.RequestException:
            response = None
        # Check if the request was successful (status code 200)
        if response is not None and response.status_code == 200:
            # Parse the HTML content using BeautifulSoup
            soup = BeautifulSoup(response.content, 'html.parser')
            text = soup.get_text()
            # Keep only pages whose text is mostly English characters
            if is_english(text):
                # Split into lines, drop empty ones, and rejoin the rest
                lines = text.splitlines()
                non_empty_lines = [line for line in lines if line.strip()]
                result = '\n'.join(non_empty_lines)
                if mode_of_search == "Search Bar Scrape":
                    # Skip pages that look like error or "not found" placeholders
                    if not check_failure(result, failed_txt):
                        website_content[website] = result
                else:
                    # Store the result in the dictionary with the URL as the key
                    website_content[website] = result
        else:
            # Failed request or non-200 response: increment the failed_fetch counter
            failed_fetch += 1
        if items_completed % progress_step == 0:
            percentage_complete = (items_completed / total_items) * 100
            try:
                # `socketio` is expected to come from `progress`; if it is not
                # available in this context, skip the progress broadcast
                socketio.emit('print_output', {'output': f"{progress_bar_once(word='Completed', percentage=round(percentage_complete, 2), num=30)}"})
            except Exception:
                pass
    return website_content, failed_fetch


# Usage example:
if __name__ == "__main__":
    text2, failed_fetch_html = get_html(["https://timesofindia.indiatimes.com/india/whole-world-waiting-for-22nd-january-pm-modi-in-ayodhya/articleshow/106388185.cms"])
    print(text2)