From 7bb0c76a99fdc7cd74bc79f896ce3940a05246fa Mon Sep 17 00:00:00 2001
From: Ido Shamun
Date: Thu, 17 Jun 2021 17:55:36 +0300
Subject: [PATCH 01/14] fix cleaning the wrong top node

Before this change, `top_node` was cleaned in place and then copied into
`clean_top_node`. I believe this was not the original intent and should be
fixed: it creates confusion, and it leaves no way to access the raw
`top_node`.
---
 newspaper/article.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/newspaper/article.py b/newspaper/article.py
index df0d9c435..d7c2cd349 100644
--- a/newspaper/article.py
+++ b/newspaper/article.py
@@ -275,8 +275,8 @@ def parse(self):
             video_extractor = VideoExtractor(self.config, self.top_node)
             self.set_movies(video_extractor.get_videos())
 
-        self.top_node = self.extractor.post_cleanup(self.top_node)
         self.clean_top_node = copy.deepcopy(self.top_node)
+        self.clean_top_node = self.extractor.post_cleanup(self.clean_top_node)
 
         text, article_html = output_formatter.get_formatted(
             self.top_node)

From e4beb8f1531155d7c9fff1e8f93dfcf7c1f42697 Mon Sep 17 00:00:00 2001
From: Ido Shamun
Date: Wed, 28 Jul 2021 13:55:03 +0300
Subject: [PATCH 02/14] add ul and ol to the list of nodes to check

---
 newspaper/extractors.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/newspaper/extractors.py b/newspaper/extractors.py
index 962554014..aa81f7fc0 100644
--- a/newspaper/extractors.py
+++ b/newspaper/extractors.py
@@ -1014,7 +1014,7 @@ def nodes_to_check(self, doc):
         on like paragraphs and tables
         """
         nodes_to_check = []
-        for tag in ['p', 'pre', 'td']:
+        for tag in ['p', 'pre', 'td', 'ol', 'ul']:
             items = self.parser.getElementsByTag(doc, tag=tag)
             nodes_to_check += items
         return nodes_to_check

From a626a41032015fd32b49bf03471654b44fae6044 Mon Sep 17 00:00:00 2001
From: Ido Shamun
Date: Wed, 28 Jul 2021 13:56:15 +0300
Subject: [PATCH 03/14] deep copy top node in the output formatter

---
 newspaper/outputformatters.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/newspaper/outputformatters.py b/newspaper/outputformatters.py
index 47a76467c..620dce631 100644
--- a/newspaper/outputformatters.py
+++ b/newspaper/outputformatters.py
@@ -9,6 +9,7 @@
 from html import unescape
 import logging
+import copy
 
 from .text import innerTrim
 
@@ -42,7 +43,7 @@ def get_formatted(self, top_node):
         """Returns the body text of an article, and also the body article
         html if specified.
         Returns in (text, html) form
         """
-        self.top_node = top_node
+        self.top_node = copy.deepcopy(top_node)
         html, text = '', ''
 
         self.remove_negativescores_nodes()

From a5b15ff3d057618f770353e084d399bf251db626 Mon Sep 17 00:00:00 2001
From: Ido Shamun
Date: Tue, 19 Oct 2021 17:44:49 +0300
Subject: [PATCH 04/14] add support for section-based articles

---
 newspaper/extractors.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/newspaper/extractors.py b/newspaper/extractors.py
index aa81f7fc0..31eee5744 100644
--- a/newspaper/extractors.py
+++ b/newspaper/extractors.py
@@ -1017,6 +1017,10 @@ def nodes_to_check(self, doc):
         for tag in ['p', 'pre', 'td', 'ol', 'ul']:
             items = self.parser.getElementsByTag(doc, tag=tag)
             nodes_to_check += items
+        for tag in ['section']:
+            items = self.parser.getElementsByTag(doc, tag=tag)
+            if len(items) > 1:
+                nodes_to_check = items
         return nodes_to_check
 
     def is_table_and_no_para_exist(self, e):

From a737681a1a1bdfd17bd97bd78deea7791b431ffe Mon Sep 17 00:00:00 2001
From: Ido Shamun
Date: Mon, 25 Oct 2021 16:04:06 +0300
Subject: [PATCH 05/14] fix Medium heuristic for finding the content node

---
 newspaper/extractors.py | 17 ++++++++++-------
 1 file changed, 10 insertions(+), 7 deletions(-)

diff --git a/newspaper/extractors.py b/newspaper/extractors.py
index 31eee5744..f6f9406ae 100644
--- a/newspaper/extractors.py
+++ b/newspaper/extractors.py
@@ -1014,13 +1014,16 @@ def nodes_to_check(self, doc):
         on like paragraphs and tables
         """
         nodes_to_check = []
-        for tag in ['p', 'pre', 'td', 'ol', 'ul']:
-            items = self.parser.getElementsByTag(doc, tag=tag)
-            nodes_to_check += items
-        for tag in ['section']:
-            items = self.parser.getElementsByTag(doc, tag=tag)
-            if len(items) > 1:
-                nodes_to_check = items
+        articles = self.parser.getElementsByTag(doc, tag='article')
+        if len(articles) > 0:
+            # Specific heuristic for Medium articles
+            sections = self.parser.getElementsByTag(articles[0], tag='section')
+            if len(sections) > 1:
+                nodes_to_check = sections
+        if len(nodes_to_check) == 0:
+            for tag in ['p', 'pre', 'td', 'ol', 'ul']:
+                items = self.parser.getElementsByTag(doc, tag=tag)
+                nodes_to_check += items
         return nodes_to_check
 
     def is_table_and_no_para_exist(self, e):

From a169a56c9845b1b60c4b62bc164cffacbade5857 Mon Sep 17 00:00:00 2001
From: Ido Shamun
Date: Tue, 26 Oct 2021 12:33:07 +0300
Subject: [PATCH 06/14] don't clean the default doc variable; use clean_doc
 instead

---
 newspaper/article.py | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/newspaper/article.py b/newspaper/article.py
index d7c2cd349..85c5f3622 100644
--- a/newspaper/article.py
+++ b/newspaper/article.py
@@ -213,19 +213,22 @@ def parse(self):
         self.throw_if_not_downloaded_verbose()
 
         self.doc = self.config.get_parser().fromstring(self.html)
-        self.clean_doc = copy.deepcopy(self.doc)
 
         if self.doc is None:
             # `parse` call failed, return nothing
             return
 
+        document_cleaner = DocumentCleaner(self.config)
+        output_formatter = OutputFormatter(self.config)
+
+        self.clean_doc = copy.deepcopy(self.doc)
+        # Before any computations on the body, clean DOM object
+        self.clean_doc = document_cleaner.clean(self.clean_doc)
+
         # TODO: Fix this, sync in our fix_url() method
         parse_candidate = self.get_parse_candidate()
         self.link_hash = parse_candidate.link_hash  # MD5
 
-        document_cleaner = DocumentCleaner(self.config)
-        output_formatter = OutputFormatter(self.config)
-
         title = self.extractor.get_title(self.clean_doc)
         self.set_title(title)
 
@@ -267,9 +270,6 @@ def parse(self):
             self.url,
             self.clean_doc)
 
-        # Before any computations on the body, clean DOM object
-        self.doc = document_cleaner.clean(self.doc)
-
         self.top_node = self.extractor.calculate_best_node(self.doc)
         if self.top_node is not None:
             video_extractor = VideoExtractor(self.config, self.top_node)
             self.set_movies(video_extractor.get_videos())

From c01bdec73c950660b88e8301412d215a236918c9 Mon Sep 17 00:00:00 2001
From: Ido Shamun
Date: Tue, 26 Oct 2021 16:25:03 +0300
Subject: [PATCH 07/14] force Medium heuristic on Medium articles only

---
 newspaper/extractors.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/newspaper/extractors.py b/newspaper/extractors.py
index f6f9406ae..28d1dd5b6 100644
--- a/newspaper/extractors.py
+++ b/newspaper/extractors.py
@@ -1015,7 +1015,7 @@ def nodes_to_check(self, doc):
         """
         nodes_to_check = []
         articles = self.parser.getElementsByTag(doc, tag='article')
-        if len(articles) > 0:
+        if len(articles) > 0 and self.get_meta_site_name(doc) == 'Medium':
             # Specific heuristic for Medium articles
             sections = self.parser.getElementsByTag(articles[0], tag='section')
             if len(sections) > 1:

From a4dcbe45b84839d9ece18be8f698f8f50daa6aa3 Mon Sep 17 00:00:00 2001
From: Ido Shamun
Date: Tue, 26 Oct 2021 16:47:46 +0300
Subject: [PATCH 08/14] fall back to the clean doc if it wasn't possible to
 find the top node

---
 newspaper/article.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/newspaper/article.py b/newspaper/article.py
index 85c5f3622..620cb0970 100644
--- a/newspaper/article.py
+++ b/newspaper/article.py
@@ -271,6 +271,8 @@ def parse(self):
             self.url,
             self.clean_doc)
 
         self.top_node = self.extractor.calculate_best_node(self.doc)
+        if self.top_node is None:
+            self.top_node = self.extractor.calculate_best_node(self.clean_doc)
         if self.top_node is not None:
             video_extractor = VideoExtractor(self.config, self.top_node)
             self.set_movies(video_extractor.get_videos())

From 7b95eb6c51800d27bedafc3ece94bad148dc9832 Mon Sep 17 00:00:00 2001
From: Ido Shamun
Date: Wed, 27 Oct 2021 17:04:45 +0300
Subject: [PATCH 09/14] add fallback heuristics for when the top node can't be
 found

---
 newspaper/article.py | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/newspaper/article.py b/newspaper/article.py
index 620cb0970..dd3a25e52 100644
--- a/newspaper/article.py
+++ b/newspaper/article.py
@@ -273,6 +273,14 @@ def parse(self):
         self.top_node = self.extractor.calculate_best_node(self.doc)
         if self.top_node is None:
             self.top_node = self.extractor.calculate_best_node(self.clean_doc)
+        if self.top_node is None:
+            self.top_node = self.extractor.parser.getElementById(self.doc, 'content')
+        if self.top_node is None:
+            for tag in ['article', 'main']:
+                nodes = self.extractor.parser.getElementsByTag(self.doc, tag=tag)
+                if len(nodes) > 0:
+                    self.top_node = nodes[0]
+                    break
         if self.top_node is not None:
             video_extractor = VideoExtractor(self.config, self.top_node)
             self.set_movies(video_extractor.get_videos())

From 06551769ffaaba3edd5ecf327c40705d0cb2f750 Mon Sep 17 00:00:00 2001
From: vpol
Date: Thu, 14 Jul 2022 07:52:29 +0100
Subject: [PATCH 10/14] allow_redirects config option (#2)

---
 newspaper/configuration.py | 3 +++
 newspaper/network.py       | 9 +++++----
 2 files changed, 8 insertions(+), 4 deletions(-)

diff --git a/newspaper/configuration.py b/newspaper/configuration.py
index 94688e705..bcaf52cbf 100644
--- a/newspaper/configuration.py
+++ b/newspaper/configuration.py
@@ -57,6 +57,9 @@ def __init__(self):
         # Fail for error responses (e.g. 404 page)
         self.http_success_only = True
 
+        # Allow redirects (enabled by default)
+        self.allow_redirects = True
+
         # English is the fallback
         self._language = 'en'
 
diff --git a/newspaper/network.py b/newspaper/network.py
index 29f0e699d..8ba02aa78 100644
--- a/newspaper/network.py
+++ b/newspaper/network.py
@@ -21,7 +21,7 @@
 FAIL_ENCODING = 'ISO-8859-1'
 
 
-def get_request_kwargs(timeout, useragent, proxies, headers):
+def get_request_kwargs(timeout, useragent, proxies, headers, allow_redirects):
     """This Wrapper method exists b/c some values in req_kwargs dict
     are methods which need to be called every time we make a request
     """
@@ -29,7 +29,7 @@ def get_request_kwargs(timeout, useragent, proxies, headers):
         'headers': headers if headers else {'User-Agent': useragent},
         'cookies': cj(),
         'timeout': timeout,
-        'allow_redirects': True,
+        'allow_redirects': allow_redirects,
         'proxies': proxies
     }
@@ -55,12 +55,13 @@ def get_html_2XX_only(url, config=None, response=None):
     timeout = config.request_timeout
     proxies = config.proxies
     headers = config.headers
+    allow_redirects = config.allow_redirects
 
     if response is not None:
         return _get_html_from_response(response, config)
 
     response = requests.get(
-        url=url, **get_request_kwargs(timeout, useragent, proxies, headers))
+        url=url, **get_request_kwargs(timeout, useragent, proxies, headers, allow_redirects))
 
     html = _get_html_from_response(response, config)
 
@@ -107,7 +108,7 @@ def __init__(self, url, config=None):
     def send(self):
         try:
             self.resp = requests.get(self.url, **get_request_kwargs(
-                self.timeout, self.useragent, self.proxies, self.headers))
+                self.timeout, self.useragent, self.proxies, self.headers, self.config.allow_redirects))
             if self.config.http_success_only:
                 self.resp.raise_for_status()
         except requests.exceptions.RequestException as e:

From e149436510ac32dfa34e150373246400aaf5370f Mon Sep 17 00:00:00 2001
From: Viktor Poluksht
Date: Thu, 13 Apr 2023 12:05:10 +0100
Subject: [PATCH 11/14] feat: allow ignoring certain basename regexes (#3)

Signed-off-by: Viktor Poluksht
---
 newspaper/configuration.py |  1 +
 newspaper/extractors.py    | 13 +++++++++++--
 2 files changed, 12 insertions(+), 2 deletions(-)

diff --git a/newspaper/configuration.py b/newspaper/configuration.py
index bcaf52cbf..301dd1790 100644
--- a/newspaper/configuration.py
+++ b/newspaper/configuration.py
@@ -60,6 +60,7 @@ def __init__(self):
         # Allow redirects (enabled by default)
         self.allow_redirects = True
+        self.ignored_images_suffix_list = []
 
         # English is the fallback
         self._language = 'en'

diff --git a/newspaper/extractors.py b/newspaper/extractors.py
index 28d1dd5b6..d47eb488d 100644
--- a/newspaper/extractors.py
+++ b/newspaper/extractors.py
@@ -13,6 +13,7 @@
 import copy
 import logging
+import os.path
 import re
 from collections import defaultdict
 
@@ -449,19 +450,21 @@ def get_meta_img_url(self, article_url, doc):
         """
         top_meta_image, try_one, try_two, try_three, try_four = [None] * 5
         try_one = self.get_meta_content(doc, 'meta[property="og:image"]')
+        try_one = None if self.image_is_ignored(try_one) else try_one
         if not try_one:
             link_img_src_kwargs = \
                 {'tag': 'link', 'attr': 'rel', 'value': 'img_src|image_src'}
             elems = self.parser.getElementsByTag(doc, use_regex=True, **link_img_src_kwargs)
             try_two = elems[0].get('href') if elems else None
-
+        try_two = None if self.image_is_ignored(try_two) else try_two
         if not try_two:
             try_three = self.get_meta_content(doc, 'meta[name="og:image"]')
-
+        try_three = None if self.image_is_ignored(try_three) else try_three
         if not try_three:
link_icon_kwargs = {'tag': 'link', 'attr': 'rel', 'value': 'icon'} elems = self.parser.getElementsByTag(doc, **link_icon_kwargs) try_four = elems[0].get('href') if elems else None + try_four = None if self.image_is_ignored(try_four) else try_four top_meta_image = try_one or try_two or try_three or try_four @@ -469,6 +472,12 @@ def get_meta_img_url(self, article_url, doc): return urljoin(article_url, top_meta_image) return '' + def image_is_ignored(self, image): + return len([True for x in self.config.ignored_images_suffix_list if image and self.match_image(x, os.path.basename(image))]) > 0 + + def match_image(self, pattern, image): + return re.search(pattern, image) is not None + def get_meta_type(self, doc): """Returns meta type of article, open graph protocol """ From 7d34fc99ed3fbe7636cbf4d156bcb8c1e9abd5e4 Mon Sep 17 00:00:00 2001 From: Viktor Poluksht Date: Thu, 13 Apr 2023 15:29:17 +0100 Subject: [PATCH 12/14] fix: more places where we have to check images Signed-off-by: Viktor Poluksht --- newspaper/extractors.py | 3 ++- tests/unit_tests.py | 23 +++++++++++++++++------ 2 files changed, 19 insertions(+), 7 deletions(-) diff --git a/newspaper/extractors.py b/newspaper/extractors.py index d47eb488d..cfc2bca5a 100644 --- a/newspaper/extractors.py +++ b/newspaper/extractors.py @@ -473,7 +473,7 @@ def get_meta_img_url(self, article_url, doc): return '' def image_is_ignored(self, image): - return len([True for x in self.config.ignored_images_suffix_list if image and self.match_image(x, os.path.basename(image))]) > 0 + return any([True for x in self.config.ignored_images_suffix_list if image and image != '' and self.match_image(x, os.path.basename(image))]) def match_image(self, pattern, image): return re.search(pattern, image) is not None @@ -584,6 +584,7 @@ def get_img_urls(self, article_url, doc): for img_tag in img_tags if img_tag.get('src')] img_links = set([urljoin(article_url, url) for url in urls]) + img_links = [x for x in img_links if not self.image_is_ignored(x)] return img_links def get_first_img_url(self, article_url, top_node): diff --git a/tests/unit_tests.py b/tests/unit_tests.py index 69c05adfa..76e54f9c4 100644 --- a/tests/unit_tests.py +++ b/tests/unit_tests.py @@ -24,7 +24,7 @@ URLS_FILE = os.path.join(TEST_DIR, 'data', 'fulltext_url_list.txt') import newspaper -from newspaper import Article, fulltext, Source, ArticleException, news_pool +from newspaper import Article, Config, fulltext, Source, ArticleException, news_pool from newspaper.article import ArticleDownloadState from newspaper.configuration import Configuration from newspaper.urls import get_domain @@ -406,9 +406,9 @@ def test_get_top_image_from_meta(self): html = '' \ '' html_empty_og_content = '' \ - '' + '' html_empty_all = '' \ - '' + '' html_rel_img_src = html_empty_all + '' html_rel_img_src2 = html_empty_all + '' html_rel_icon = html_empty_all + '' @@ -544,7 +544,6 @@ def test_valid_urls(self): print('\t\turl: %s is supposed to be %s' % (url, truth_val)) raise - @print_test def test_pubdate(self): """Checks that irrelevant data in url isn't considered as publishing date""" @@ -568,7 +567,6 @@ def test_pubdate(self): print('\t\tpublishing date in %s should not be present' % (url)) raise - @unittest.skip("Need to write an actual test") @print_test def test_prepare_url(self): @@ -635,9 +633,9 @@ class ConfigBuildTestCase(unittest.TestCase): NOTE: No need to mock responses as we are just initializing the objects, not actually calling download(..) 
""" + @print_test def test_article_default_params(self): - a = Article(url='http://www.cnn.com/2013/11/27/' 'travel/weather-thanksgiving/index.html') self.assertEqual('en', a.config.language) @@ -767,6 +765,19 @@ def test_article_pdf_fetching(self): a.download() self.assertNotEqual('%PDF-', a.html) + +class TestIgnoreImages(unittest.TestCase): + + @print_test + def test_config_ignore_images(self): + config = Config() + config.ignored_images_suffix_list = ['think.png', '(.*)\.ico'] + a = Article('https://www.reillywood.com/blog/why-nu/', config=config) + a.download() + a.parse() + self.assertEqual('https://d33wubrfki0l68.cloudfront.net/77d3013f91800257b3ca2adfb995ae24e49fff4e/b3086/img/main/headshot.jpg', a.top_img) + + if __name__ == '__main__': argv = list(sys.argv) if 'fulltext' in argv: From b75385a7d26378a7d8b04b6a243706d6042b4c1c Mon Sep 17 00:00:00 2001 From: Viktor Poluksht Date: Tue, 25 Apr 2023 08:57:02 +0100 Subject: [PATCH 13/14] fix: type conversion list -> set Signed-off-by: Viktor Poluksht --- newspaper/extractors.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/newspaper/extractors.py b/newspaper/extractors.py index cfc2bca5a..d1caada0c 100644 --- a/newspaper/extractors.py +++ b/newspaper/extractors.py @@ -584,7 +584,7 @@ def get_img_urls(self, article_url, doc): for img_tag in img_tags if img_tag.get('src')] img_links = set([urljoin(article_url, url) for url in urls]) - img_links = [x for x in img_links if not self.image_is_ignored(x)] + img_links = set([x for x in img_links if not self.image_is_ignored(x)]) return img_links def get_first_img_url(self, article_url, top_node): From d25229e2dd87d8b9fc243621acaf83150226af55 Mon Sep 17 00:00:00 2001 From: denis <54359969+denisb0@users.noreply.github.com> Date: Wed, 3 Apr 2024 08:54:15 +0300 Subject: [PATCH 14/14] chore: set lxml version to supported (#6) --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 619746017..0ea8c095a 100644 --- a/requirements.txt +++ b/requirements.txt @@ -3,7 +3,7 @@ cssselect>=0.9.2 feedfinder2>=0.0.4 feedparser>=5.2.1 jieba3k>=0.35.1 -lxml>=3.6.0 +lxml==5.1.0 # https://lxml.de/5.2/changes-5.2.0.html nltk>=3.2.1 Pillow>=3.3.0 pythainlp>=1.7.2