Proper error handling (#4) when providers are not completely unreachable but the page format has changed. Verbose logging messages are printed for AttributeErrors, KeyErrors and generic Exceptions as part of the parse_proxyList method.
pgaref committed Aug 8, 2017
1 parent 8921459 commit ec84df5
Showing 4 changed files with 166 additions and 134 deletions.
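All four parsers now share the same error-handling shape: the whole fetch-and-parse body sits inside a try block, AttributeError and KeyError (the usual symptoms of a changed page layout, e.g. soup.find() returning None or a missing column) are logged with the provider id, any other exception is logged as an unknown error, and the finally clause returns whatever proxies were collected before the failure. The following stand-alone sketch illustrates that pattern; the function name, provider id, URL and table id are hypothetical and not part of this commit.

import logging

import requests
from bs4 import BeautifulSoup

logger = logging.getLogger(__name__)


def parse_provider(url, timeout=None, provider_id="example-provider"):
    # Sketch of the pattern applied to each parse_proxyList method in this commit.
    proxies = []
    try:
        response = requests.get(url, timeout=timeout)
        if not response.ok:
            logger.warning("Proxy Provider url failed: {}".format(url))
            return []
        soup = BeautifulSoup(response.content, "html.parser")
        table = soup.find("table", attrs={"id": "proxylisttable"})
        # If the page layout changed, table is None and the call below raises AttributeError.
        for row in table.find_all("tr")[1:]:
            cells = [td.get_text() for td in row.find_all("td")]
            if len(cells) >= 2:
                proxies.append("{0}:{1}".format(cells[0], cells[1]))
    except AttributeError as e:
        logger.error("Provider {0} failed with Attribute error: {1}".format(provider_id, e))
    except KeyError as e:
        logger.error("Provider {0} failed with Key error: {1}".format(provider_id, e))
    except Exception as e:
        logger.error("Provider {0} failed with Unknown error: {1}".format(provider_id, e))
    finally:
        return proxies

Because the return sits in the finally clause, even an unexpected exception yields the partial list instead of propagating to the caller; the trade-off is that callers only learn about failures through the error log.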
53 changes: 30 additions & 23 deletions http_request_randomizer/requests/parsers/FreeProxyParser.py
@@ -16,33 +16,40 @@ def __init__(self, id, web_url, timeout=None):

    def parse_proxyList(self):
        curr_proxy_list = []
        try:
            response = requests.get(self.get_url(), timeout=self.timeout)
            if not response.ok:
                logger.warn("Proxy Provider url failed: {}".format(self.get_url()))
                return []

            content = response.content
            soup = BeautifulSoup(content, "html.parser")
            table = soup.find("table", attrs={"id": "proxylisttable"})

            # The first tr contains the field names.
            headings = [th.get_text() for th in table.find("tr").find_all("th")]

            datasets = []
            for row in table.find_all("tr")[1:]:
                dataset = zip(headings, (td.get_text() for td in row.find_all("td")))
                if dataset:
                    datasets.append(dataset)

            for dataset in datasets:
                proxy_obj = self.create_proxy_object(dataset)
                # Make sure it is a Valid Proxy Address
                if proxy_obj is not None and UrlParser.valid_ip_port(proxy_obj.get_address()):
                    curr_proxy_list.append(proxy_obj)
                else:
                    logger.debug("Proxy Invalid: {}".format(dataset))
        except AttributeError as e:
            logger.error("Provider {0} failed with Attribute error: {1}".format(self.id, e))
        except KeyError as e:
            logger.error("Provider {0} failed with Key error: {1}".format(self.id, e))
        except Exception as e:
            logger.error("Provider {0} failed with Unknown error: {1}".format(self.id, e))
        finally:
            return curr_proxy_list

    def create_proxy_object(self, dataset):
        # Check Field[0] for tags and field[1] for values!
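For callers, the visible change is that a provider whose page layout has changed now degrades gracefully instead of raising. A hypothetical usage sketch follows; the class name is taken from the file name above, and the constructor argument values and URL are illustrative assumptions.

from http_request_randomizer.requests.parsers.FreeProxyParser import FreeProxyParser

parser = FreeProxyParser('FreeProxy', 'https://free-proxy-list.net', timeout=1.0)
proxies = parser.parse_proxyList()
# Before this commit a renamed table id meant an uncaught AttributeError here;
# now the failure is logged via logger.error and an empty list is returned.
print("Parsed {} proxies".format(len(proxies)))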
51 changes: 30 additions & 21 deletions http_request_randomizer/requests/parsers/ProxyForEuParser.py
@@ -16,32 +16,41 @@ def __init__(self, id, web_url, bandwithdh=None, timeout=None):

    def parse_proxyList(self):
        curr_proxy_list = []
        try:
            response = requests.get(self.get_url(), timeout=self.timeout)

            if not response.ok:
                logger.warn("Proxy Provider url failed: {}".format(self.get_url()))
                return []

            content = response.content
            soup = BeautifulSoup(content, "html.parser")
            table = soup.find("table", attrs={"class": "proxy_list"})

            # The first tr contains the field names.
            headings = [th.get_text() for th in table.find("tr").find_all("th")]

            datasets = []
            for row in table.find_all("tr")[1:]:
                dataset = zip(headings, (td.get_text() for td in row.find_all("td")))
                datasets.append(dataset)

            for dataset in datasets:
                # Avoid Straggler proxies and make sure it is a Valid Proxy Address
                proxy_obj = self.create_proxy_object(dataset)
                if proxy_obj is not None and UrlParser.valid_ip_port(proxy_obj.get_address()):
                    curr_proxy_list.append(proxy_obj)
                else:
                    logger.debug("Proxy Invalid: {}".format(dataset))
        except AttributeError as e:
            logger.error("Provider {0} failed with Attribute error: {1}".format(self.id, e))
        except KeyError as e:
            logger.error("Provider {0} failed with Key error: {1}".format(self.id, e))
        except Exception as e:
            logger.error("Provider {0} failed with Unknown error: {1}".format(self.id, e))
        finally:
            return curr_proxy_list

    def create_proxy_object(self, dataset):
        ip = ""
108 changes: 58 additions & 50 deletions http_request_randomizer/requests/parsers/RebroWeeblyParser.py
@@ -18,60 +18,68 @@ def __init__(self, id, web_url, timeout=None):

    def parse_proxyList(self, use_top15k=False):
        curr_proxy_list = []
        try:
            response = requests.get(self.get_url() + "/" + self.top_proxy_path, timeout=self.timeout)

            if not response.ok:
                logger.warn("Proxy Provider url failed: {}".format(self.get_url()))
                return []

            content = response.content
            soup = BeautifulSoup(content, "html.parser")
            all_divs = soup.findAll("div", attrs={"class": "paragraph", 'style': "text-align:left;"})
            # address_table = soup.find("div", attrs={"class": "paragraph", 'style': "text-align:left;"})
            # .find('font', attrs={'color': '#33a27f'})
            # Parse Top Proxy List page
            address_list = []
            country_list = []
            anonymity_list = []
            for div in all_divs:
                address_div = div.find('font', attrs={'color': '#33a27f'})
                if address_div is not None:
                    for row in [x for x in address_div.contents if getattr(x, 'name', None) != 'br']:
                        address_list.append(str(row))
                curr_div = div.findAll('font', attrs={'size': '2'})
                if curr_div[0] is not None:
                    row_data = []
                    # font -> strong -> font
                    title = curr_div[0].contents[0].contents[0].contents[0]
                    for row in [x for x in curr_div[-1].contents if getattr(x, 'name', None) != 'br']:
                        row_data.append(str(row))
                    if 'Country' in str(title):
                        country_list.extend(row_data)
                    if 'Status' in str(title):
                        anonymity_list.extend(row_data)
            for address, country, anonymity in zip(address_list, country_list, anonymity_list):
                # Make sure it is a Valid Proxy Address
                proxy_obj = self.create_proxy_object(address, country, anonymity)
                if proxy_obj is not None and UrlParser.valid_ip_port(proxy_obj.get_address()):
                    curr_proxy_list.append(proxy_obj)
                else:
                    logger.debug("Proxy Invalid: {}".format(row))
            # Usually these proxies are stale
            if use_top15k:
                # Parse 15k Nodes Text file (named *-all-*.txt)
                content = requests.get(self.get_url() + "/" + self.txt_proxy_path).content
                soup = BeautifulSoup(content, "html.parser")
                table = soup.find("div", attrs={"class": "wsite-multicol-table-wrap"})
                for link in table.findAll('a'):
                    current_link = link.get('href')
                    if current_link is not None and "all" in current_link:
                        self.txt_proxy_path = current_link
                more_content = requests.get(self.get_url() + self.txt_proxy_path).text
                for proxy_address in more_content.split():
                    if UrlParser.valid_ip_port(proxy_address):
                        proxy_obj = self.create_proxy_object(row)
                        curr_proxy_list.append(proxy_obj)
        except AttributeError as e:
            logger.error("Provider {0} failed with Attribute error: {1}".format(self.id, e))
        except KeyError as e:
            logger.error("Provider {0} failed with Key error: {1}".format(self.id, e))
        except Exception as e:
            logger.error("Provider {0} failed with Unknown error: {1}".format(self.id, e))
        finally:
            return curr_proxy_list

    def create_proxy_object(self, address, country, anonymity):
        # Make sure it is a Valid IP
88 changes: 48 additions & 40 deletions http_request_randomizer/requests/parsers/SamairProxyParser.py
@@ -18,47 +18,55 @@ def __init__(self, id, web_url, timeout=None):

    def parse_proxyList(self):
        curr_proxy_list = []
        try:
            # Parse all proxy pages -> format: /list/{num}.htm
            # Get the pageRange from the 'pagination' table
            page_set = self.get_pagination_set()
            logger.debug("Pages: {}".format(page_set))
            for page in page_set:
                response = requests.get("{0}{1}".format(self.get_url(), page), timeout=self.timeout)
                if not response.ok:
                    # Could not parse ANY page - Let user know
                    if not curr_proxy_list:
                        logger.warn("Proxy Provider url failed: {}".format(self.get_url()))
                    # Return proxies parsed so far
                    return curr_proxy_list
                content = response.content
                soup = BeautifulSoup(content, "html.parser")
                # css provides the port number so we reverse it
                # for href in soup.findAll('link'):
                # if '/styles/' in href.get('href'):
                # style = "http://www.samair.ru" + href.get('href')
                # break
                # css = requests.get(style).content.split('\n')
                # css.pop()
                # ports = {}
                # for l in css:
                # p = l.split(' ')
                # key = p[0].split(':')[0][1:]
                # value = p[1].split('\"')[1]
                # ports[key] = value

                table = soup.find("div", attrs={"id": "proxylist"})
                # The first tr contains the field names.
                headings = [th.get_text() for th in table.find("tr").find_all("th")]
                for row in table.find_all("tr")[1:]:
                    td_row = row.find("td")
                    # curr_proxy_list.append('http://' + row.text + ports[row['class'][0]])
                    proxy_obj = self.create_proxy_object(row)
                    # Make sure it is a Valid Proxy Address
                    if proxy_obj is not None and UrlParser.valid_ip_port(td_row.text):
                        curr_proxy_list.append(proxy_obj)
                    else:
                        logger.debug("Proxy Invalid: {}".format(td_row.text))
        except AttributeError as e:
            logger.error("Provider {0} failed with Attribute error: {1}".format(self.id, e))
        except KeyError as e:
            logger.error("Provider {0} failed with Key error: {1}".format(self.id, e))
        except Exception as e:
            logger.error("Provider {0} failed with Unknown error: {1}".format(self.id, e))
        finally:
            return curr_proxy_list

    def get_pagination_set(self):
        response = requests.get(self.get_url(), timeout=self.timeout)
