Proper error handling (#4) when providers are not completely unreachable but the page format has changed. Verbose logging messages are printed for AttributeErrors, KeyErrors and generic Exceptions as part of the parse_proxyList method.
pgaref committed Aug 8, 2017
1 parent 8921459 commit ec84df5
Showing 4 changed files with 166 additions and 134 deletions.
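All four parsers now share the same error-handling shape: the whole fetch-and-parse body sits inside a try block, AttributeError and KeyError (the usual symptoms of a changed page layout, e.g. soup.find() returning None or a missing column) are logged with the provider id, any other exception is logged as an unknown error, and the finally clause returns whatever proxies were collected before the failure. The following stand-alone sketch illustrates that pattern; the function name, provider id, URL and table id are hypothetical and not part of this commit.

import logging

import requests
from bs4 import BeautifulSoup

logger = logging.getLogger(__name__)


def parse_provider(url, timeout=None, provider_id="example-provider"):
    # Sketch of the pattern applied to each parse_proxyList method in this commit.
    proxies = []
    try:
        response = requests.get(url, timeout=timeout)
        if not response.ok:
            logger.warning("Proxy Provider url failed: {}".format(url))
            return []
        soup = BeautifulSoup(response.content, "html.parser")
        table = soup.find("table", attrs={"id": "proxylisttable"})
        # If the page layout changed, table is None and the call below raises AttributeError.
        for row in table.find_all("tr")[1:]:
            cells = [td.get_text() for td in row.find_all("td")]
            if len(cells) >= 2:
                proxies.append("{0}:{1}".format(cells[0], cells[1]))
    except AttributeError as e:
        logger.error("Provider {0} failed with Attribute error: {1}".format(provider_id, e))
    except KeyError as e:
        logger.error("Provider {0} failed with Key error: {1}".format(provider_id, e))
    except Exception as e:
        logger.error("Provider {0} failed with Unknown error: {1}".format(provider_id, e))
    finally:
        return proxies

Because the return sits in the finally clause, even an unexpected exception yields the partial list instead of propagating to the caller; the trade-off is that callers only learn about failures through the error log.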
53 changes: 30 additions & 23 deletions http_request_randomizer/requests/parsers/FreeProxyParser.py
@@ -16,33 +16,40 @@ def __init__(self, id, web_url, timeout=None):

    def parse_proxyList(self):
        curr_proxy_list = []
        try:
            response = requests.get(self.get_url(), timeout=self.timeout)
            if not response.ok:
                logger.warn("Proxy Provider url failed: {}".format(self.get_url()))
                return []

            content = response.content
            soup = BeautifulSoup(content, "html.parser")
            table = soup.find("table", attrs={"id": "proxylisttable"})

            # The first tr contains the field names.
            headings = [th.get_text() for th in table.find("tr").find_all("th")]

            datasets = []
            for row in table.find_all("tr")[1:]:
                dataset = zip(headings, (td.get_text() for td in row.find_all("td")))
                if dataset:
                    datasets.append(dataset)

            for dataset in datasets:
                proxy_obj = self.create_proxy_object(dataset)
                # Make sure it is a Valid Proxy Address
                if proxy_obj is not None and UrlParser.valid_ip_port(proxy_obj.get_address()):
                    curr_proxy_list.append(proxy_obj)
                else:
                    logger.debug("Proxy Invalid: {}".format(dataset))
        except AttributeError as e:
            logger.error("Provider {0} failed with Attribute error: {1}".format(self.id, e))
        except KeyError as e:
            logger.error("Provider {0} failed with Key error: {1}".format(self.id, e))
        except Exception as e:
            logger.error("Provider {0} failed with Unknown error: {1}".format(self.id, e))
        finally:
            return curr_proxy_list

    def create_proxy_object(self, dataset):
        # Check Field[0] for tags and field[1] for values!
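For callers, the visible change is that a provider whose page layout has changed now degrades gracefully instead of raising. A hypothetical usage sketch follows; the class name is taken from the file name above, and the constructor argument values and URL are illustrative assumptions.

from http_request_randomizer.requests.parsers.FreeProxyParser import FreeProxyParser

parser = FreeProxyParser('FreeProxy', 'https://free-proxy-list.net', timeout=1.0)
proxies = parser.parse_proxyList()
# Before this commit a renamed table id meant an uncaught AttributeError here;
# now the failure is logged via logger.error and an empty list is returned.
print("Parsed {} proxies".format(len(proxies)))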
51 changes: 30 additions & 21 deletions http_request_randomizer/requests/parsers/ProxyForEuParser.py
@@ -16,32 +16,41 @@ def __init__(self, id, web_url, bandwithdh=None, timeout=None):

    def parse_proxyList(self):
        curr_proxy_list = []
        try:
            response = requests.get(self.get_url(), timeout=self.timeout)

            if not response.ok:
                logger.warn("Proxy Provider url failed: {}".format(self.get_url()))
                return []

            content = response.content
            soup = BeautifulSoup(content, "html.parser")
            table = soup.find("table", attrs={"class": "proxy_list"})

            # The first tr contains the field names.
            headings = [th.get_text() for th in table.find("tr").find_all("th")]

            datasets = []
            for row in table.find_all("tr")[1:]:
                dataset = zip(headings, (td.get_text() for td in row.find_all("td")))
                datasets.append(dataset)

            for dataset in datasets:
                # Avoid Straggler proxies and make sure it is a Valid Proxy Address
                proxy_obj = self.create_proxy_object(dataset)
                if proxy_obj is not None and UrlParser.valid_ip_port(proxy_obj.get_address()):
                    curr_proxy_list.append(proxy_obj)
                else:
                    logger.debug("Proxy Invalid: {}".format(dataset))
        except AttributeError as e:
            logger.error("Provider {0} failed with Attribute error: {1}".format(self.id, e))
        except KeyError as e:
            logger.error("Provider {0} failed with Key error: {1}".format(self.id, e))
        except Exception as e:
            logger.error("Provider {0} failed with Unknown error: {1}".format(self.id, e))
        finally:
            return curr_proxy_list

    def create_proxy_object(self, dataset):
        ip = ""
108 changes: 58 additions & 50 deletions http_request_randomizer/requests/parsers/RebroWeeblyParser.py
@@ -18,60 +18,68 @@ def __init__(self, id, web_url, timeout=None):

    def parse_proxyList(self, use_top15k=False):
        curr_proxy_list = []
        try:
            response = requests.get(self.get_url() + "/" + self.top_proxy_path, timeout=self.timeout)

            if not response.ok:
                logger.warn("Proxy Provider url failed: {}".format(self.get_url()))
                return []

            content = response.content
            soup = BeautifulSoup(content, "html.parser")
            all_divs = soup.findAll("div", attrs={"class": "paragraph", 'style': "text-align:left;"})
            # address_table = soup.find("div", attrs={"class": "paragraph", 'style': "text-align:left;"})
            # .find('font', attrs={'color': '#33a27f'})
            # Parse Top Proxy List page
            address_list = []
            country_list = []
            anonymity_list = []
            for div in all_divs:
                address_div = div.find('font', attrs={'color': '#33a27f'})
                if address_div is not None:
                    for row in [x for x in address_div.contents if getattr(x, 'name', None) != 'br']:
                        address_list.append(str(row))
                curr_div = div.findAll('font', attrs={'size': '2'})
                if curr_div[0] is not None:
                    row_data = []
                    # font -> strong -> font
                    title = curr_div[0].contents[0].contents[0].contents[0]
                    for row in [x for x in curr_div[-1].contents if getattr(x, 'name', None) != 'br']:
                        row_data.append(str(row))
                    if 'Country' in str(title):
                        country_list.extend(row_data)
                    if 'Status' in str(title):
                        anonymity_list.extend(row_data)
            for address, country, anonymity in zip(address_list, country_list, anonymity_list):
                # Make sure it is a Valid Proxy Address
                proxy_obj = self.create_proxy_object(address, country, anonymity)
                if proxy_obj is not None and UrlParser.valid_ip_port(proxy_obj.get_address()):
                    curr_proxy_list.append(proxy_obj)
                else:
                    logger.debug("Proxy Invalid: {}".format(row))
            # Usually these proxies are stale
            if use_top15k:
                # Parse 15k Nodes Text file (named *-all-*.txt)
                content = requests.get(self.get_url() + "/" + self.txt_proxy_path).content
                soup = BeautifulSoup(content, "html.parser")
                table = soup.find("div", attrs={"class": "wsite-multicol-table-wrap"})
                for link in table.findAll('a'):
                    current_link = link.get('href')
                    if current_link is not None and "all" in current_link:
                        self.txt_proxy_path = current_link
                more_content = requests.get(self.get_url() + self.txt_proxy_path).text
                for proxy_address in more_content.split():
                    if UrlParser.valid_ip_port(proxy_address):
                        proxy_obj = self.create_proxy_object(row)
                        curr_proxy_list.append(proxy_obj)
        except AttributeError as e:
            logger.error("Provider {0} failed with Attribute error: {1}".format(self.id, e))
        except KeyError as e:
            logger.error("Provider {0} failed with Key error: {1}".format(self.id, e))
        except Exception as e:
            logger.error("Provider {0} failed with Unknown error: {1}".format(self.id, e))
        finally:
            return curr_proxy_list

    def create_proxy_object(self, address, country, anonymity):
        # Make sure it is a Valid IP
88 changes: 48 additions & 40 deletions http_request_randomizer/requests/parsers/SamairProxyParser.py
@@ -18,47 +18,55 @@ def __init__(self, id, web_url, timeout=None):

    def parse_proxyList(self):
        curr_proxy_list = []
        try:
            # Parse all proxy pages -> format: /list/{num}.htm
            # Get the pageRange from the 'pagination' table
            page_set = self.get_pagination_set()
            logger.debug("Pages: {}".format(page_set))
            for page in page_set:
                response = requests.get("{0}{1}".format(self.get_url(), page), timeout=self.timeout)
                if not response.ok:
                    # Could not parse ANY page - Let user know
                    if not curr_proxy_list:
                        logger.warn("Proxy Provider url failed: {}".format(self.get_url()))
                    # Return proxies parsed so far
                    return curr_proxy_list
                content = response.content
                soup = BeautifulSoup(content, "html.parser")
                # css provides the port number so we reverse it
                # for href in soup.findAll('link'):
                # if '/styles/' in href.get('href'):
                # style = "http://www.samair.ru" + href.get('href')
                # break
                # css = requests.get(style).content.split('\n')
                # css.pop()
                # ports = {}
                # for l in css:
                # p = l.split(' ')
                # key = p[0].split(':')[0][1:]
                # value = p[1].split('\"')[1]
                # ports[key] = value

                table = soup.find("div", attrs={"id": "proxylist"})
                # The first tr contains the field names.
                headings = [th.get_text() for th in table.find("tr").find_all("th")]
                for row in table.find_all("tr")[1:]:
                    td_row = row.find("td")
                    # curr_proxy_list.append('http://' + row.text + ports[row['class'][0]])
                    proxy_obj = self.create_proxy_object(row)
                    # Make sure it is a Valid Proxy Address
                    if proxy_obj is not None and UrlParser.valid_ip_port(td_row.text):
                        curr_proxy_list.append(proxy_obj)
                    else:
                        logger.debug("Proxy Invalid: {}".format(td_row.text))
        except AttributeError as e:
            logger.error("Provider {0} failed with Attribute error: {1}".format(self.id, e))
        except KeyError as e:
            logger.error("Provider {0} failed with Key error: {1}".format(self.id, e))
        except Exception as e:
            logger.error("Provider {0} failed with Unknown error: {1}".format(self.id, e))
        finally:
            return curr_proxy_list

    def get_pagination_set(self):
        response = requests.get(self.get_url(), timeout=self.timeout)
