diff --git a/http_request_randomizer/requests/parsers/FreeProxyParser.py b/http_request_randomizer/requests/parsers/FreeProxyParser.py
index 1ae0251..71b3873 100644
--- a/http_request_randomizer/requests/parsers/FreeProxyParser.py
+++ b/http_request_randomizer/requests/parsers/FreeProxyParser.py
@@ -16,33 +16,40 @@ def __init__(self, id, web_url, timeout=None):
 
     def parse_proxyList(self):
         curr_proxy_list = []
-        response = requests.get(self.get_url(), timeout=self.timeout)
+        try:
+            response = requests.get(self.get_url(), timeout=self.timeout)
+            if not response.ok:
+                logger.warn("Proxy Provider url failed: {}".format(self.get_url()))
+                return []
 
-        if not response.ok:
-            logger.warn("Proxy Provider url failed: {}".format(self.get_url()))
-            return []
+            content = response.content
+            soup = BeautifulSoup(content, "html.parser")
+            table = soup.find("table", attrs={"id": "proxylisttable"})
 
-        content = response.content
-        soup = BeautifulSoup(content, "html.parser")
-        table = soup.find("table", attrs={"id": "proxylisttable"})
+            # The first tr contains the field names.
+            headings = [th.get_text() for th in table.find("tr").find_all("th")]
 
-        # The first tr contains the field names.
-        headings = [th.get_text() for th in table.find("tr").find_all("th")]
+            datasets = []
+            for row in table.find_all("tr")[1:]:
+                dataset = zip(headings, (td.get_text() for td in row.find_all("td")))
+                if dataset:
+                    datasets.append(dataset)
 
-        datasets = []
-        for row in table.find_all("tr")[1:]:
-            dataset = zip(headings, (td.get_text() for td in row.find_all("td")))
-            if dataset:
-                datasets.append(dataset)
-
-        for dataset in datasets:
-            proxy_obj = self.create_proxy_object(dataset)
-            # Make sure it is a Valid Proxy Address
-            if proxy_obj is not None and UrlParser.valid_ip_port(proxy_obj.get_address()):
-                curr_proxy_list.append(proxy_obj)
-            else:
-                logger.debug("Proxy Invalid: {}".format(dataset))
-        return curr_proxy_list
+            for dataset in datasets:
+                proxy_obj = self.create_proxy_object(dataset)
+                # Make sure it is a Valid Proxy Address
+                if proxy_obj is not None and UrlParser.valid_ip_port(proxy_obj.get_address()):
+                    curr_proxy_list.append(proxy_obj)
+                else:
+                    logger.debug("Proxy Invalid: {}".format(dataset))
+        except AttributeError as e:
+            logger.error("Provider {0} failed with Attribute error: {1}".format(self.id, e))
+        except KeyError as e:
+            logger.error("Provider {0} failed with Key error: {1}".format(self.id, e))
+        except Exception as e:
+            logger.error("Provider {0} failed with Unknown error: {1}".format(self.id, e))
+        finally:
+            return curr_proxy_list
 
     def create_proxy_object(self, dataset):
         # Check Field[0] for tags and field[1] for values!
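The same try/except/finally wrapper recurs in the three parsers below. As a minimal, hypothetical sketch of just that control flow (not code from this diff: `provider_id` and `parse_body` are made-up placeholders standing in for the provider id and the provider-specific scraping code):

```python
import logging

logger = logging.getLogger(__name__)


def parse_proxy_list_safely(provider_id, parse_body):
    """Illustrative wrapper mirroring the pattern these parsers adopt."""
    curr_proxy_list = []
    try:
        # parse_body stands in for the provider-specific scraping code and
        # appends whatever proxies it manages to parse into the list.
        parse_body(curr_proxy_list)
    except AttributeError as e:
        logger.error("Provider {0} failed with Attribute error: {1}".format(provider_id, e))
    except KeyError as e:
        logger.error("Provider {0} failed with Key error: {1}".format(provider_id, e))
    except Exception as e:
        logger.error("Provider {0} failed with Unknown error: {1}".format(provider_id, e))
    finally:
        # The return inside finally runs last: it overrides any earlier return
        # and suppresses an exception that would otherwise escape, so callers
        # always receive a (possibly empty) list.
        return curr_proxy_list
```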
diff --git a/http_request_randomizer/requests/parsers/ProxyForEuParser.py b/http_request_randomizer/requests/parsers/ProxyForEuParser.py
index 918967e..4047dd6 100644
--- a/http_request_randomizer/requests/parsers/ProxyForEuParser.py
+++ b/http_request_randomizer/requests/parsers/ProxyForEuParser.py
@@ -16,32 +16,41 @@ def __init__(self, id, web_url, bandwithdh=None, timeout=None):
 
     def parse_proxyList(self):
         curr_proxy_list = []
-        response = requests.get(self.get_url(), timeout=self.timeout)
+        try:
+            response = requests.get(self.get_url(), timeout=self.timeout)
 
-        if not response.ok:
-            logger.warn("Proxy Provider url failed: {}".format(self.get_url()))
-            return []
+            if not response.ok:
+                logger.warn("Proxy Provider url failed: {}".format(self.get_url()))
+                return []
 
-        content = response.content
-        soup = BeautifulSoup(content, "html.parser")
-        table = soup.find("table", attrs={"class": "proxy_list"})
+            content = response.content
+            soup = BeautifulSoup(content, "html.parser")
+            table = soup.find("table", attrs={"class": "proxy_list"})
 
-        # The first tr contains the field names.
-        headings = [th.get_text() for th in table.find("tr").find_all("th")]
+            # The first tr contains the field names.
+            headings = [th.get_text() for th in table.find("tr").find_all("th")]
 
-        datasets = []
-        for row in table.find_all("tr")[1:]:
-            dataset = zip(headings, (td.get_text() for td in row.find_all("td")))
-            datasets.append(dataset)
+            datasets = []
+            for row in table.find_all("tr")[1:]:
+                dataset = zip(headings, (td.get_text() for td in row.find_all("td")))
+                datasets.append(dataset)
+
+            for dataset in datasets:
+                # Avoid Straggler proxies and make sure it is a Valid Proxy Address
+                proxy_obj = self.create_proxy_object(dataset)
+                if proxy_obj is not None and UrlParser.valid_ip_port(proxy_obj.get_address()):
+                    curr_proxy_list.append(proxy_obj)
+                else:
+                    logger.debug("Proxy Invalid: {}".format(dataset))
+        except AttributeError as e:
+            logger.error("Provider {0} failed with Attribute error: {1}".format(self.id, e))
+        except KeyError as e:
+            logger.error("Provider {0} failed with Key error: {1}".format(self.id, e))
+        except Exception as e:
+            logger.error("Provider {0} failed with Unknown error: {1}".format(self.id, e))
+        finally:
+            return curr_proxy_list
 
-        for dataset in datasets:
-            # Avoid Straggler proxies and make sure it is a Valid Proxy Address
-            proxy_obj = self.create_proxy_object(dataset)
-            if proxy_obj is not None and UrlParser.valid_ip_port(proxy_obj.get_address()):
-                curr_proxy_list.append(proxy_obj)
-            else:
-                logger.debug("Proxy Invalid: {}".format(dataset))
-        return curr_proxy_list
 
     def create_proxy_object(self, dataset):
         ip = ""
diff --git a/http_request_randomizer/requests/parsers/RebroWeeblyParser.py b/http_request_randomizer/requests/parsers/RebroWeeblyParser.py
index 673d104..3a3ed9c 100644
--- a/http_request_randomizer/requests/parsers/RebroWeeblyParser.py
+++ b/http_request_randomizer/requests/parsers/RebroWeeblyParser.py
@@ -18,60 +18,68 @@ def __init__(self, id, web_url, timeout=None):
 
     def parse_proxyList(self, use_top15k=False):
         curr_proxy_list = []
-        response = requests.get(self.get_url() + "/" + self.top_proxy_path, timeout=self.timeout)
+        try:
+            response = requests.get(self.get_url() + "/" + self.top_proxy_path, timeout=self.timeout)
 
-        if not response.ok:
-            logger.warn("Proxy Provider url failed: {}".format(self.get_url()))
-            return []
+            if not response.ok:
+                logger.warn("Proxy Provider url failed: {}".format(self.get_url()))
+                return []
 
-        content = response.content
-        soup = BeautifulSoup(content, "html.parser")
-        all_divs = soup.findAll("div", attrs={"class": "paragraph", 'style': "text-align:left;"})
-        # address_table = soup.find("div", attrs={"class": "paragraph", 'style': "text-align:left;"})
-        # .find('font', attrs={'color': '#33a27f'})
-        # Parse Top Proxy List page
-        address_list = []
-        country_list = []
-        anonymity_list = []
-        for div in all_divs:
-            address_div = div.find('font', attrs={'color': '#33a27f'})
-            if address_div is not None:
-                for row in [x for x in address_div.contents if getattr(x, 'name', None) != 'br']:
-                    address_list.append(str(row))
-            curr_div = div.findAll('font', attrs={'size': '2'})
-            if curr_div[0] is not None:
-                row_data = []
-                # font -> strong -> font
-                title = curr_div[0].contents[0].contents[0].contents[0]
-                for row in [x for x in curr_div[-1].contents if getattr(x, 'name', None) != 'br']:
-                    row_data.append(str(row))
-                if 'Country' in str(title):
-                    country_list.extend(row_data)
-                if 'Status' in str(title):
-                    anonymity_list.extend(row_data)
-        for address, country, anonymity in zip(address_list, country_list, anonymity_list):
-            # Make sure it is a Valid Proxy Address
-            proxy_obj = self.create_proxy_object(address, country, anonymity)
-            if proxy_obj is not None and UrlParser.valid_ip_port(proxy_obj.get_address()):
-                curr_proxy_list.append(proxy_obj)
-            else:
-                logger.debug("Proxy Invalid: {}".format(row))
-        # Usually these proxies are stale
-        if use_top15k:
-            # Parse 15k Nodes Text file (named *-all-*.txt)
-            content = requests.get(self.get_url() + "/" + self.txt_proxy_path).content
+            content = response.content
             soup = BeautifulSoup(content, "html.parser")
-            table = soup.find("div", attrs={"class": "wsite-multicol-table-wrap"})
-            for link in table.findAll('a'):
-                current_link = link.get('href')
-                if current_link is not None and "all" in current_link:
-                    self.txt_proxy_path = current_link
-            more_content = requests.get(self.get_url() + self.txt_proxy_path).text
-            for proxy_address in more_content.split():
-                if UrlParser.valid_ip_port(proxy_address):
-                    proxy_obj = self.create_proxy_object(row)
+            all_divs = soup.findAll("div", attrs={"class": "paragraph", 'style': "text-align:left;"})
+            # address_table = soup.find("div", attrs={"class": "paragraph", 'style': "text-align:left;"})
+            # .find('font', attrs={'color': '#33a27f'})
+            # Parse Top Proxy List page
+            address_list = []
+            country_list = []
+            anonymity_list = []
+            for div in all_divs:
+                address_div = div.find('font', attrs={'color': '#33a27f'})
+                if address_div is not None:
+                    for row in [x for x in address_div.contents if getattr(x, 'name', None) != 'br']:
+                        address_list.append(str(row))
+                curr_div = div.findAll('font', attrs={'size': '2'})
+                if curr_div[0] is not None:
+                    row_data = []
+                    # font -> strong -> font
+                    title = curr_div[0].contents[0].contents[0].contents[0]
+                    for row in [x for x in curr_div[-1].contents if getattr(x, 'name', None) != 'br']:
+                        row_data.append(str(row))
+                    if 'Country' in str(title):
+                        country_list.extend(row_data)
+                    if 'Status' in str(title):
+                        anonymity_list.extend(row_data)
+            for address, country, anonymity in zip(address_list, country_list, anonymity_list):
+                # Make sure it is a Valid Proxy Address
+                proxy_obj = self.create_proxy_object(address, country, anonymity)
+                if proxy_obj is not None and UrlParser.valid_ip_port(proxy_obj.get_address()):
                     curr_proxy_list.append(proxy_obj)
-        return curr_proxy_list
+                else:
+                    logger.debug("Proxy Invalid: {}".format(row))
+            # Usually these proxies are stale
+            if use_top15k:
+                # Parse 15k Nodes Text file (named *-all-*.txt)
+                content = requests.get(self.get_url() + "/" + self.txt_proxy_path).content
+                soup = BeautifulSoup(content, "html.parser")
+                table = soup.find("div", attrs={"class": "wsite-multicol-table-wrap"})
+                for link in table.findAll('a'):
+                    current_link = link.get('href')
+                    if current_link is not None and "all" in current_link:
+                        self.txt_proxy_path = current_link
+                more_content = requests.get(self.get_url() + self.txt_proxy_path).text
+                for proxy_address in more_content.split():
+                    if UrlParser.valid_ip_port(proxy_address):
+                        proxy_obj = self.create_proxy_object(row)
+                        curr_proxy_list.append(proxy_obj)
+        except AttributeError as e:
+            logger.error("Provider {0} failed with Attribute error: {1}".format(self.id, e))
+        except KeyError as e:
+            logger.error("Provider {0} failed with Key error: {1}".format(self.id, e))
+        except Exception as e:
+            logger.error("Provider {0} failed with Unknown error: {1}".format(self.id, e))
+        finally:
+            return curr_proxy_list
 
     def create_proxy_object(self, address, country, anonymity):
         # Make sure it is a Valid IP
diff --git a/http_request_randomizer/requests/parsers/SamairProxyParser.py b/http_request_randomizer/requests/parsers/SamairProxyParser.py
index eccc9d1..0a074cd 100644
--- a/http_request_randomizer/requests/parsers/SamairProxyParser.py
+++ b/http_request_randomizer/requests/parsers/SamairProxyParser.py
@@ -18,47 +18,55 @@ def __init__(self, id, web_url, timeout=None):
 
     def parse_proxyList(self):
         curr_proxy_list = []
-        # Parse all proxy pages -> format: /list/{num}.htm
-        # Get the pageRange from the 'pagination' table
-        page_set = self.get_pagination_set()
-        logger.debug("Pages: {}".format(page_set))
-        for page in page_set:
-            response = requests.get("{0}{1}".format(self.get_url(), page), timeout=self.timeout)
-            if not response.ok:
-                # Could not parse ANY page - Let user know
-                if not curr_proxy_list:
-                    logger.warn("Proxy Provider url failed: {}".format(self.get_url()))
-                # Return proxies parsed so far
-                return curr_proxy_list
-            content = response.content
-            soup = BeautifulSoup(content, "html.parser")
-            # css provides the port number so we reverse it
-            # for href in soup.findAll('link'):
-            #     if '/styles/' in href.get('href'):
-            #         style = "http://www.samair.ru" + href.get('href')
-            #         break
-            # css = requests.get(style).content.split('\n')
-            # css.pop()
-            # ports = {}
-            # for l in css:
-            #     p = l.split(' ')
-            #     key = p[0].split(':')[0][1:]
-            #     value = p[1].split('\"')[1]
-            #     ports[key] = value
+        try:
+            # Parse all proxy pages -> format: /list/{num}.htm
+            # Get the pageRange from the 'pagination' table
+            page_set = self.get_pagination_set()
+            logger.debug("Pages: {}".format(page_set))
+            for page in page_set:
+                response = requests.get("{0}{1}".format(self.get_url(), page), timeout=self.timeout)
+                if not response.ok:
+                    # Could not parse ANY page - Let user know
+                    if not curr_proxy_list:
+                        logger.warn("Proxy Provider url failed: {}".format(self.get_url()))
+                    # Return proxies parsed so far
+                    return curr_proxy_list
+                content = response.content
+                soup = BeautifulSoup(content, "html.parser")
+                # css provides the port number so we reverse it
+                # for href in soup.findAll('link'):
+                #     if '/styles/' in href.get('href'):
+                #         style = "http://www.samair.ru" + href.get('href')
+                #         break
+                # css = requests.get(style).content.split('\n')
+                # css.pop()
+                # ports = {}
+                # for l in css:
+                #     p = l.split(' ')
+                #     key = p[0].split(':')[0][1:]
+                #     value = p[1].split('\"')[1]
+                #     ports[key] = value
 
-            table = soup.find("div", attrs={"id": "proxylist"})
-            # The first tr contains the field names.
-            headings = [th.get_text() for th in table.find("tr").find_all("th")]
-            for row in table.find_all("tr")[1:]:
-                td_row = row.find("td")
-                # curr_proxy_list.append('http://' + row.text + ports[row['class'][0]])
-                proxy_obj = self.create_proxy_object(row)
-                # Make sure it is a Valid Proxy Address
-                if proxy_obj is not None and UrlParser.valid_ip_port(td_row.text):
-                    curr_proxy_list.append(proxy_obj)
-                else:
-                    logger.debug("Proxy Invalid: {}".format(td_row.text))
-        return curr_proxy_list
+                table = soup.find("div", attrs={"id": "proxylist"})
+                # The first tr contains the field names.
+                headings = [th.get_text() for th in table.find("tr").find_all("th")]
+                for row in table.find_all("tr")[1:]:
+                    td_row = row.find("td")
+                    # curr_proxy_list.append('http://' + row.text + ports[row['class'][0]])
+                    proxy_obj = self.create_proxy_object(row)
+                    # Make sure it is a Valid Proxy Address
+                    if proxy_obj is not None and UrlParser.valid_ip_port(td_row.text):
+                        curr_proxy_list.append(proxy_obj)
+                    else:
+                        logger.debug("Proxy Invalid: {}".format(td_row.text))
+        except AttributeError as e:
+            logger.error("Provider {0} failed with Attribute error: {1}".format(self.id, e))
+        except KeyError as e:
+            logger.error("Provider {0} failed with Key error: {1}".format(self.id, e))
+        except Exception as e:
+            logger.error("Provider {0} failed with Unknown error: {1}".format(self.id, e))
+        finally:
+            return curr_proxy_list
 
     def get_pagination_set(self):
         response = requests.get(self.get_url(), timeout=self.timeout)
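For callers, the net effect across all four parsers is that a provider failure degrades to an empty result instead of an unhandled exception. A hedged usage sketch (the constructor signature follows the parsers above, but the provider id and URL here are made up for illustration):

```python
from http_request_randomizer.requests.parsers.FreeProxyParser import FreeProxyParser

# Hypothetical provider id and URL, purely for illustration.
parser = FreeProxyParser('FreeProxy', 'http://free-proxy-list.net', timeout=5)

# Network or markup errors are logged inside parse_proxyList, and the call
# still returns a list, so downstream code only needs to handle emptiness.
proxies = parser.parse_proxyList()
print("Parsed {} proxies".format(len(proxies)))
```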