fix cleaning the wrong top node #896

Open · wants to merge 14 commits into base: master
26 changes: 18 additions & 8 deletions newspaper/article.py
@@ -213,19 +213,22 @@ def parse(self):
        self.throw_if_not_downloaded_verbose()

        self.doc = self.config.get_parser().fromstring(self.html)
        self.clean_doc = copy.deepcopy(self.doc)

        if self.doc is None:
            # `parse` call failed, return nothing
            return

        document_cleaner = DocumentCleaner(self.config)
        output_formatter = OutputFormatter(self.config)

        self.clean_doc = copy.deepcopy(self.doc)
        # Before any computations on the body, clean DOM object
        self.clean_doc = document_cleaner.clean(self.clean_doc)

        # TODO: Fix this, sync in our fix_url() method
        parse_candidate = self.get_parse_candidate()
        self.link_hash = parse_candidate.link_hash  # MD5

        document_cleaner = DocumentCleaner(self.config)
        output_formatter = OutputFormatter(self.config)

        title = self.extractor.get_title(self.clean_doc)
        self.set_title(title)

@@ -267,16 +270,23 @@ def parse(self):
            self.url,
            self.clean_doc)

        # Before any computations on the body, clean DOM object
        self.doc = document_cleaner.clean(self.doc)

        self.top_node = self.extractor.calculate_best_node(self.doc)
        if self.top_node is None:
            self.top_node = self.extractor.calculate_best_node(self.clean_doc)
        if self.top_node is None:
            self.top_node = self.extractor.parser.getElementById(self.doc, 'content')
        if self.top_node is None:
            for tag in ['article', 'main']:
                nodes = self.extractor.parser.getElementsByTag(self.doc, tag=tag)
                if len(nodes) > 0:
                    self.top_node = nodes[0]
                    break
        if self.top_node is not None:
            video_extractor = VideoExtractor(self.config, self.top_node)
            self.set_movies(video_extractor.get_videos())

            self.top_node = self.extractor.post_cleanup(self.top_node)
            self.clean_top_node = copy.deepcopy(self.top_node)
            self.clean_top_node = self.extractor.post_cleanup(self.clean_top_node)

            text, article_html = output_formatter.get_formatted(
                self.top_node)
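
Taken together, the two hunks above replace a single cleaned-document lookup with a fallback chain for the top node. A minimal standalone sketch of that ordering, using plain lxml and a stand-in scorer (`calculate_best_node` below is a hypothetical callable, not newspaper's actual extractor API):

```python
# Sketch of the PR's fallback chain, assuming an lxml tree and a
# hypothetical scorer; only the ordering mirrors the diff above.
from lxml import html


def pick_top_node(doc, clean_doc, calculate_best_node):
    # 1. Try the scorer on the raw document first.
    top = calculate_best_node(doc)
    # 2. Fall back to the cleaned copy.
    if top is None:
        top = calculate_best_node(clean_doc)
    # 3. Fall back to an element with id="content".
    if top is None:
        hits = doc.xpath('//*[@id="content"]')
        top = hits[0] if hits else None
    # 4. Finally try semantic <article>/<main> containers.
    if top is None:
        for tag in ('article', 'main'):
            nodes = doc.findall('.//' + tag)
            if nodes:
                top = nodes[0]
                break
    return top


doc = html.fromstring('<html><body><main><p>body text</p></main></body></html>')
print(pick_top_node(doc, doc, lambda d: None).tag)  # main
```

Scoring the raw document first means an aggressive cleaner can no longer delete the node the scorer would have chosen, which is the bug the PR title refers to.
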
4 changes: 4 additions & 0 deletions newspaper/configuration.py
@@ -57,6 +57,10 @@ def __init__(self):
        # Fail for error responses (e.g. 404 page)
        self.http_success_only = True

        # Allow redirects (enabled by default)
        self.allow_redirects = True

        self.ignored_images_suffix_list = []
        # English is the fallback
        self._language = 'en'

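
Both new options are plain attributes on `Config`. A usage sketch with placeholder values (the URL is illustrative only):

```python
from newspaper import Article, Config

config = Config()
config.allow_redirects = False                       # new in this PR; defaults to True
config.ignored_images_suffix_list = [r'(.*)\.ico']   # regexes matched on image basenames

# Placeholder URL; any article works.
article = Article('https://example.com/post.html', config=config)
article.download()   # the underlying requests.get() now receives allow_redirects=False
```
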
27 changes: 22 additions & 5 deletions newspaper/extractors.py
@@ -13,6 +13,7 @@

import copy
import logging
import os.path
import re
from collections import defaultdict
@@ -449,26 +450,34 @@ def get_meta_img_url(self, article_url, doc):
"""
top_meta_image, try_one, try_two, try_three, try_four = [None] * 5
try_one = self.get_meta_content(doc, 'meta[property="og:image"]')
try_one = None if self.image_is_ignored(try_one) else try_one
if not try_one:
link_img_src_kwargs = \
{'tag': 'link', 'attr': 'rel', 'value': 'img_src|image_src'}
elems = self.parser.getElementsByTag(doc, use_regex=True, **link_img_src_kwargs)
try_two = elems[0].get('href') if elems else None

try_two = None if self.image_is_ignored(try_two) else try_two
if not try_two:
try_three = self.get_meta_content(doc, 'meta[name="og:image"]')

try_three = None if self.image_is_ignored(try_three) else try_three
if not try_three:
link_icon_kwargs = {'tag': 'link', 'attr': 'rel', 'value': 'icon'}
elems = self.parser.getElementsByTag(doc, **link_icon_kwargs)
try_four = elems[0].get('href') if elems else None
try_four = None if self.image_is_ignored(try_four) else try_four

top_meta_image = try_one or try_two or try_three or try_four

if top_meta_image:
return urljoin(article_url, top_meta_image)
return ''

def image_is_ignored(self, image):
return any([True for x in self.config.ignored_images_suffix_list if image and image != '' and self.match_image(x, os.path.basename(image))])

def match_image(self, pattern, image):
return re.search(pattern, image) is not None

def get_meta_type(self, doc):
"""Returns meta type of article, open graph protocol
"""
@@ -575,6 +584,7 @@ def get_img_urls(self, article_url, doc):
                for img_tag in img_tags if img_tag.get('src')]
        img_links = set([urljoin(article_url, url)
                         for url in urls])
        img_links = set([x for x in img_links if not self.image_is_ignored(x)])
        return img_links

    def get_first_img_url(self, article_url, top_node):
@@ -1014,9 +1024,16 @@ def nodes_to_check(self, doc):
        on like paragraphs and tables
        """
        nodes_to_check = []
        for tag in ['p', 'pre', 'td']:
            items = self.parser.getElementsByTag(doc, tag=tag)
            nodes_to_check += items
        articles = self.parser.getElementsByTag(doc, tag='article')
        if len(articles) > 0 and self.get_meta_site_name(doc) == 'Medium':
            # Specific heuristic for Medium articles
            sections = self.parser.getElementsByTag(articles[0], tag='section')
            if len(sections) > 1:
                nodes_to_check = sections
        if len(nodes_to_check) == 0:
            for tag in ['p', 'pre', 'td', 'ol', 'ul']:
                items = self.parser.getElementsByTag(doc, tag=tag)
                nodes_to_check += items
        return nodes_to_check

    def is_table_and_no_para_exist(self, e):
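
The Medium branch in `nodes_to_check` is easier to see in isolation. A sketch using plain lxml, where an `og:site_name` lookup stands in for `get_meta_site_name` (an assumption about that helper, not its actual code):

```python
from lxml import html


def nodes_to_check(doc):
    # Medium wraps content blocks in <section> tags inside <article>;
    # prefer those when the page identifies itself as Medium.
    articles = doc.findall('.//article')
    site = doc.xpath('string(//meta[@property="og:site_name"]/@content)')
    if articles and site == 'Medium':
        sections = articles[0].findall('.//section')
        if len(sections) > 1:
            return list(sections)
    # Generic path; the PR also adds <ol>/<ul> to the candidate tags.
    nodes = []
    for tag in ('p', 'pre', 'td', 'ol', 'ul'):
        nodes += doc.findall('.//' + tag)
    return nodes


page = ('<html><head><meta property="og:site_name" content="Medium"/></head>'
        '<body><article><section>a</section><section>b</section></article></body></html>')
print(len(nodes_to_check(html.fromstring(page))))  # 2
```
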
9 changes: 5 additions & 4 deletions newspaper/network.py
@@ -21,15 +21,15 @@
FAIL_ENCODING = 'ISO-8859-1'


def get_request_kwargs(timeout, useragent, proxies, headers):
def get_request_kwargs(timeout, useragent, proxies, headers, allow_redirects):
"""This Wrapper method exists b/c some values in req_kwargs dict
are methods which need to be called every time we make a request
"""
return {
'headers': headers if headers else {'User-Agent': useragent},
'cookies': cj(),
'timeout': timeout,
'allow_redirects': True,
'allow_redirects': allow_redirects,
'proxies': proxies
}

@@ -55,12 +55,13 @@ def get_html_2XX_only(url, config=None, response=None):
    timeout = config.request_timeout
    proxies = config.proxies
    headers = config.headers
    allow_redirects = config.allow_redirects

    if response is not None:
        return _get_html_from_response(response, config)

    response = requests.get(
        url=url, **get_request_kwargs(timeout, useragent, proxies, headers))
        url=url, **get_request_kwargs(timeout, useragent, proxies, headers, allow_redirects))

    html = _get_html_from_response(response, config)

@@ -107,7 +108,7 @@ def __init__(self, url, config=None):
    def send(self):
        try:
            self.resp = requests.get(self.url, **get_request_kwargs(
                self.timeout, self.useragent, self.proxies, self.headers))
                self.timeout, self.useragent, self.proxies, self.headers, self.config.allow_redirects))
            if self.config.http_success_only:
                self.resp.raise_for_status()
        except requests.exceptions.RequestException as e:
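
With the flag threaded through, a caller can stop `requests` from following redirects. A sketch of the kwargs shape, trimmed of the cookie jar the real helper also sets (the URL is a placeholder that answers with a 302):

```python
import requests


def get_request_kwargs(timeout, useragent, proxies, headers, allow_redirects):
    # Same shape as the patched helper, minus the cookie jar.
    return {
        'headers': headers if headers else {'User-Agent': useragent},
        'timeout': timeout,
        'allow_redirects': allow_redirects,
        'proxies': proxies,
    }


kwargs = get_request_kwargs(7, 'newspaper-test', None, None, allow_redirects=False)
resp = requests.get('https://httpbin.org/redirect/1', **kwargs)
print(resp.status_code)  # 302: the redirect is reported, not followed
```
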
3 changes: 2 additions & 1 deletion newspaper/outputformatters.py
@@ -9,6 +9,7 @@

from html import unescape
import logging
import copy

from .text import innerTrim

Expand Down Expand Up @@ -42,7 +43,7 @@ def get_formatted(self, top_node):
"""Returns the body text of an article, and also the body article
html if specified. Returns in (text, html) form
"""
self.top_node = top_node
self.top_node = copy.deepcopy(top_node)
html, text = '', ''

self.remove_negativescores_nodes()
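
The added `deepcopy` matters because `get_formatted` mutates the tree it is handed (for example via `remove_negativescores_nodes`). A toy demonstration of the aliasing the copy prevents, with a stand-in destructive pass:

```python
import copy
from lxml import html

SNIPPET = '<div><p class="junk">ad</p><p>keep me</p></div>'


def format_in_place(node):
    # Stand-in for the formatter's destructive cleanup passes.
    for junk in node.xpath('//p[@class="junk"]'):
        junk.getparent().remove(junk)


# Without the copy, the caller's top_node loses a child:
doc = html.fromstring(SNIPPET)
format_in_place(doc)
print(len(doc.findall('.//p')))  # 1

# With the PR's deepcopy, the original tree survives formatting:
doc = html.fromstring(SNIPPET)
format_in_place(copy.deepcopy(doc))
print(len(doc.findall('.//p')))  # 2
```
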
2 changes: 1 addition & 1 deletion requirements.txt
@@ -3,7 +3,7 @@ cssselect>=0.9.2
feedfinder2>=0.0.4
feedparser>=5.2.1
jieba3k>=0.35.1
lxml>=3.6.0
lxml==5.1.0 # https://lxml.de/5.2/changes-5.2.0.html
nltk>=3.2.1
Pillow>=3.3.0
pythainlp>=1.7.2
23 changes: 17 additions & 6 deletions tests/unit_tests.py
@@ -24,7 +24,7 @@
URLS_FILE = os.path.join(TEST_DIR, 'data', 'fulltext_url_list.txt')

import newspaper
from newspaper import Article, fulltext, Source, ArticleException, news_pool
from newspaper import Article, Config, fulltext, Source, ArticleException, news_pool
from newspaper.article import ArticleDownloadState
from newspaper.configuration import Configuration
from newspaper.urls import get_domain
@@ -406,9 +406,9 @@ def test_get_top_image_from_meta(self):
        html = '<meta property="og:image" content="https://example.com/meta_img_filename.jpg" />' \
               '<meta name="og:image" content="https://example.com/meta_another_img_filename.jpg"/>'
        html_empty_og_content = '<meta property="og:image" content="" />' \
                                '<meta name="og:image" content="https://example.com/meta_another_img_filename.jpg"/>'
        html_empty_all = '<meta property="og:image" content="" />' \
                         '<meta name="og:image" />'
        html_rel_img_src = html_empty_all + '<link rel="img_src" href="https://example.com/meta_link_image.jpg" />'
        html_rel_img_src2 = html_empty_all + '<link rel="image_src" href="https://example.com/meta_link_image2.jpg" />'
        html_rel_icon = html_empty_all + '<link rel="icon" href="https://example.com/meta_link_rel_icon.ico" />'
@@ -544,7 +544,6 @@ def test_valid_urls(self):
                print('\t\turl: %s is supposed to be %s' % (url, truth_val))
                raise

    @print_test
    def test_pubdate(self):
        """Checks that irrelevant data in url isn't considered as publishing date"""
@@ -568,7 +567,6 @@ def test_pubdate(self):
                print('\t\tpublishing date in %s should not be present' % (url))
                raise

    @unittest.skip("Need to write an actual test")
    @print_test
    def test_prepare_url(self):
@@ -635,9 +633,9 @@ class ConfigBuildTestCase(unittest.TestCase):
    NOTE: No need to mock responses as we are just initializing the
    objects, not actually calling download(..)
    """

    @print_test
    def test_article_default_params(self):
        a = Article(url='http://www.cnn.com/2013/11/27/'
                        'travel/weather-thanksgiving/index.html')
        self.assertEqual('en', a.config.language)
@@ -767,6 +765,19 @@ def test_article_pdf_fetching(self):
        a.download()
        self.assertNotEqual('%PDF-', a.html)


class TestIgnoreImages(unittest.TestCase):

    @print_test
    def test_config_ignore_images(self):
        config = Config()
        config.ignored_images_suffix_list = ['think.png', r'(.*)\.ico']
        a = Article('https://www.reillywood.com/blog/why-nu/', config=config)
        a.download()
        a.parse()
        self.assertEqual('https://d33wubrfki0l68.cloudfront.net/'
                         '77d3013f91800257b3ca2adfb995ae24e49fff4e/b3086/img/main/headshot.jpg',
                         a.top_img)


if __name__ == '__main__':
    argv = list(sys.argv)
    if 'fulltext' in argv: