fix cleaning the wrong top node #896

Open · wants to merge 14 commits into base: master
26 changes: 18 additions & 8 deletions newspaper/article.py
@@ -213,19 +213,22 @@ def parse(self):
        self.throw_if_not_downloaded_verbose()

        self.doc = self.config.get_parser().fromstring(self.html)
        self.clean_doc = copy.deepcopy(self.doc)

        if self.doc is None:
            # `parse` call failed, return nothing
            return

        document_cleaner = DocumentCleaner(self.config)
        output_formatter = OutputFormatter(self.config)

        self.clean_doc = copy.deepcopy(self.doc)
        # Before any computations on the body, clean DOM object
        self.clean_doc = document_cleaner.clean(self.clean_doc)

        # TODO: Fix this, sync in our fix_url() method
        parse_candidate = self.get_parse_candidate()
        self.link_hash = parse_candidate.link_hash  # MD5

        document_cleaner = DocumentCleaner(self.config)
        output_formatter = OutputFormatter(self.config)

        title = self.extractor.get_title(self.clean_doc)
        self.set_title(title)

@@ -267,16 +270,23 @@ def parse(self):
            self.url,
            self.clean_doc)

        # Before any computations on the body, clean DOM object
        self.doc = document_cleaner.clean(self.doc)

        self.top_node = self.extractor.calculate_best_node(self.doc)
        if self.top_node is None:
            self.top_node = self.extractor.calculate_best_node(self.clean_doc)
        if self.top_node is None:
            self.top_node = self.extractor.parser.getElementById(self.doc, 'content')
        if self.top_node is None:
            for tag in ['article', 'main']:
                nodes = self.extractor.parser.getElementsByTag(self.doc, tag=tag)
                if len(nodes) > 0:
                    self.top_node = nodes[0]
                    break
        if self.top_node is not None:
            video_extractor = VideoExtractor(self.config, self.top_node)
            self.set_movies(video_extractor.get_videos())

            self.top_node = self.extractor.post_cleanup(self.top_node)
            self.clean_top_node = copy.deepcopy(self.top_node)
            self.clean_top_node = self.extractor.post_cleanup(self.clean_top_node)

            text, article_html = output_formatter.get_formatted(
                self.top_node)
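
Taken together, the two hunks above replace a single cleaned-document lookup with a fallback chain for the top node. A minimal standalone sketch of that ordering, using plain lxml and a stand-in scorer (`calculate_best_node` below is a hypothetical callable, not newspaper's actual extractor API):

```python
# Sketch of the PR's fallback chain, assuming an lxml tree and a
# hypothetical scorer; only the ordering mirrors the diff above.
from lxml import html


def pick_top_node(doc, clean_doc, calculate_best_node):
    # 1. Try the scorer on the raw document first.
    top = calculate_best_node(doc)
    # 2. Fall back to the cleaned copy.
    if top is None:
        top = calculate_best_node(clean_doc)
    # 3. Fall back to an element with id="content".
    if top is None:
        hits = doc.xpath('//*[@id="content"]')
        top = hits[0] if hits else None
    # 4. Finally try semantic <article>/<main> containers.
    if top is None:
        for tag in ('article', 'main'):
            nodes = doc.findall('.//' + tag)
            if nodes:
                top = nodes[0]
                break
    return top


doc = html.fromstring('<html><body><main><p>body text</p></main></body></html>')
print(pick_top_node(doc, doc, lambda d: None).tag)  # main
```

Scoring the raw document first means an aggressive cleaner can no longer delete the node the scorer would have chosen, which is the bug the PR title refers to.
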
4 changes: 4 additions & 0 deletions newspaper/configuration.py
@@ -57,6 +57,10 @@ def __init__(self):
        # Fail for error responses (e.g. 404 page)
        self.http_success_only = True

        # Allow redirects (enabled by default)
        self.allow_redirects = True

        self.ignored_images_suffix_list = []
        # English is the fallback
        self._language = 'en'

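
Both new options are plain attributes on `Config`. A usage sketch with placeholder values (the URL is illustrative only):

```python
from newspaper import Article, Config

config = Config()
config.allow_redirects = False                       # new in this PR; defaults to True
config.ignored_images_suffix_list = [r'(.*)\.ico']   # regexes matched on image basenames

# Placeholder URL; any article works.
article = Article('https://example.com/post.html', config=config)
article.download()   # the underlying requests.get() now receives allow_redirects=False
```
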
27 changes: 22 additions & 5 deletions newspaper/extractors.py
@@ -13,6 +13,7 @@

import copy
import logging
import os.path
import re
from collections import defaultdict
@@ -449,26 +450,34 @@ def get_meta_img_url(self, article_url, doc):
"""
top_meta_image, try_one, try_two, try_three, try_four = [None] * 5
try_one = self.get_meta_content(doc, 'meta[property="og:image"]')
try_one = None if self.image_is_ignored(try_one) else try_one
if not try_one:
link_img_src_kwargs = \
{'tag': 'link', 'attr': 'rel', 'value': 'img_src|image_src'}
elems = self.parser.getElementsByTag(doc, use_regex=True, **link_img_src_kwargs)
try_two = elems[0].get('href') if elems else None

try_two = None if self.image_is_ignored(try_two) else try_two
if not try_two:
try_three = self.get_meta_content(doc, 'meta[name="og:image"]')

try_three = None if self.image_is_ignored(try_three) else try_three
if not try_three:
link_icon_kwargs = {'tag': 'link', 'attr': 'rel', 'value': 'icon'}
elems = self.parser.getElementsByTag(doc, **link_icon_kwargs)
try_four = elems[0].get('href') if elems else None
try_four = None if self.image_is_ignored(try_four) else try_four

top_meta_image = try_one or try_two or try_three or try_four

if top_meta_image:
return urljoin(article_url, top_meta_image)
return ''

def image_is_ignored(self, image):
return any([True for x in self.config.ignored_images_suffix_list if image and image != '' and self.match_image(x, os.path.basename(image))])

def match_image(self, pattern, image):
return re.search(pattern, image) is not None

def get_meta_type(self, doc):
"""Returns meta type of article, open graph protocol
"""
@@ -575,6 +584,7 @@ def get_img_urls(self, article_url, doc):
                for img_tag in img_tags if img_tag.get('src')]
        img_links = set([urljoin(article_url, url)
                         for url in urls])
        img_links = set([x for x in img_links if not self.image_is_ignored(x)])
        return img_links

    def get_first_img_url(self, article_url, top_node):
@@ -1014,9 +1024,16 @@ def nodes_to_check(self, doc):
        on like paragraphs and tables
        """
        nodes_to_check = []
        for tag in ['p', 'pre', 'td']:
            items = self.parser.getElementsByTag(doc, tag=tag)
            nodes_to_check += items
        articles = self.parser.getElementsByTag(doc, tag='article')
        if len(articles) > 0 and self.get_meta_site_name(doc) == 'Medium':
            # Specific heuristic for Medium articles
            sections = self.parser.getElementsByTag(articles[0], tag='section')
            if len(sections) > 1:
                nodes_to_check = sections
        if len(nodes_to_check) == 0:
            for tag in ['p', 'pre', 'td', 'ol', 'ul']:
                items = self.parser.getElementsByTag(doc, tag=tag)
                nodes_to_check += items
        return nodes_to_check

    def is_table_and_no_para_exist(self, e):
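
The Medium branch in `nodes_to_check` is easier to see in isolation. A sketch using plain lxml, where an `og:site_name` lookup stands in for `get_meta_site_name` (an assumption about that helper, not its actual code):

```python
from lxml import html


def nodes_to_check(doc):
    # Medium wraps content blocks in <section> tags inside <article>;
    # prefer those when the page identifies itself as Medium.
    articles = doc.findall('.//article')
    site = doc.xpath('string(//meta[@property="og:site_name"]/@content)')
    if articles and site == 'Medium':
        sections = articles[0].findall('.//section')
        if len(sections) > 1:
            return list(sections)
    # Generic path; the PR also adds <ol>/<ul> to the candidate tags.
    nodes = []
    for tag in ('p', 'pre', 'td', 'ol', 'ul'):
        nodes += doc.findall('.//' + tag)
    return nodes


page = ('<html><head><meta property="og:site_name" content="Medium"/></head>'
        '<body><article><section>a</section><section>b</section></article></body></html>')
print(len(nodes_to_check(html.fromstring(page))))  # 2
```
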
9 changes: 5 additions & 4 deletions newspaper/network.py
@@ -21,15 +21,15 @@
FAIL_ENCODING = 'ISO-8859-1'


def get_request_kwargs(timeout, useragent, proxies, headers):
def get_request_kwargs(timeout, useragent, proxies, headers, allow_redirects):
"""This Wrapper method exists b/c some values in req_kwargs dict
are methods which need to be called every time we make a request
"""
return {
'headers': headers if headers else {'User-Agent': useragent},
'cookies': cj(),
'timeout': timeout,
'allow_redirects': True,
'allow_redirects': allow_redirects,
'proxies': proxies
}

@@ -55,12 +55,13 @@ def get_html_2XX_only(url, config=None, response=None):
    timeout = config.request_timeout
    proxies = config.proxies
    headers = config.headers
    allow_redirects = config.allow_redirects

    if response is not None:
        return _get_html_from_response(response, config)

    response = requests.get(
        url=url, **get_request_kwargs(timeout, useragent, proxies, headers))
        url=url, **get_request_kwargs(timeout, useragent, proxies, headers, allow_redirects))

    html = _get_html_from_response(response, config)

@@ -107,7 +108,7 @@ def __init__(self, url, config=None):
    def send(self):
        try:
            self.resp = requests.get(self.url, **get_request_kwargs(
                self.timeout, self.useragent, self.proxies, self.headers))
                self.timeout, self.useragent, self.proxies, self.headers, self.config.allow_redirects))
            if self.config.http_success_only:
                self.resp.raise_for_status()
        except requests.exceptions.RequestException as e:
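
With the flag threaded through, a caller can stop `requests` from following redirects. A sketch of the kwargs shape, trimmed of the cookie jar the real helper also sets (the URL is a placeholder that answers with a 302):

```python
import requests


def get_request_kwargs(timeout, useragent, proxies, headers, allow_redirects):
    # Same shape as the patched helper, minus the cookie jar.
    return {
        'headers': headers if headers else {'User-Agent': useragent},
        'timeout': timeout,
        'allow_redirects': allow_redirects,
        'proxies': proxies,
    }


kwargs = get_request_kwargs(7, 'newspaper-test', None, None, allow_redirects=False)
resp = requests.get('https://httpbin.org/redirect/1', **kwargs)
print(resp.status_code)  # 302: the redirect is reported, not followed
```
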
3 changes: 2 additions & 1 deletion newspaper/outputformatters.py
@@ -9,6 +9,7 @@

from html import unescape
import logging
import copy

from .text import innerTrim

Expand Down Expand Up @@ -42,7 +43,7 @@ def get_formatted(self, top_node):
"""Returns the body text of an article, and also the body article
html if specified. Returns in (text, html) form
"""
self.top_node = top_node
self.top_node = copy.deepcopy(top_node)
html, text = '', ''

self.remove_negativescores_nodes()
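
The added `deepcopy` matters because `get_formatted` mutates the tree it is handed (for example via `remove_negativescores_nodes`). A toy demonstration of the aliasing the copy prevents, with a stand-in destructive pass:

```python
import copy
from lxml import html

SNIPPET = '<div><p class="junk">ad</p><p>keep me</p></div>'


def format_in_place(node):
    # Stand-in for the formatter's destructive cleanup passes.
    for junk in node.xpath('//p[@class="junk"]'):
        junk.getparent().remove(junk)


# Without the copy, the caller's top_node loses a child:
doc = html.fromstring(SNIPPET)
format_in_place(doc)
print(len(doc.findall('.//p')))  # 1

# With the PR's deepcopy, the original tree survives formatting:
doc = html.fromstring(SNIPPET)
format_in_place(copy.deepcopy(doc))
print(len(doc.findall('.//p')))  # 2
```
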
2 changes: 1 addition & 1 deletion requirements.txt
@@ -3,7 +3,7 @@ cssselect>=0.9.2
feedfinder2>=0.0.4
feedparser>=5.2.1
jieba3k>=0.35.1
lxml>=3.6.0
lxml==5.1.0 # https://lxml.de/5.2/changes-5.2.0.html
nltk>=3.2.1
Pillow>=3.3.0
pythainlp>=1.7.2
23 changes: 17 additions & 6 deletions tests/unit_tests.py
@@ -24,7 +24,7 @@
URLS_FILE = os.path.join(TEST_DIR, 'data', 'fulltext_url_list.txt')

import newspaper
from newspaper import Article, fulltext, Source, ArticleException, news_pool
from newspaper import Article, Config, fulltext, Source, ArticleException, news_pool
from newspaper.article import ArticleDownloadState
from newspaper.configuration import Configuration
from newspaper.urls import get_domain
@@ -406,9 +406,9 @@ def test_get_top_image_from_meta(self):
        html = '<meta property="og:image" content="https://example.com/meta_img_filename.jpg" />' \
               '<meta name="og:image" content="https://example.com/meta_another_img_filename.jpg"/>'
        html_empty_og_content = '<meta property="og:image" content="" />' \
                                '<meta name="og:image" content="https://example.com/meta_another_img_filename.jpg"/>'
        html_empty_all = '<meta property="og:image" content="" />' \
                         '<meta name="og:image" />'
        html_rel_img_src = html_empty_all + '<link rel="img_src" href="https://example.com/meta_link_image.jpg" />'
        html_rel_img_src2 = html_empty_all + '<link rel="image_src" href="https://example.com/meta_link_image2.jpg" />'
        html_rel_icon = html_empty_all + '<link rel="icon" href="https://example.com/meta_link_rel_icon.ico" />'
@@ -544,7 +544,6 @@ def test_valid_urls(self):
                print('\t\turl: %s is supposed to be %s' % (url, truth_val))
                raise

    @print_test
    def test_pubdate(self):
        """Checks that irrelevant data in url isn't considered as publishing date"""
@@ -568,7 +567,6 @@ def test_pubdate(self):
                print('\t\tpublishing date in %s should not be present' % (url))
                raise

    @unittest.skip("Need to write an actual test")
    @print_test
    def test_prepare_url(self):
@@ -635,9 +633,9 @@ class ConfigBuildTestCase(unittest.TestCase):
    NOTE: No need to mock responses as we are just initializing the
    objects, not actually calling download(..)
    """

    @print_test
    def test_article_default_params(self):
        a = Article(url='http://www.cnn.com/2013/11/27/'
                        'travel/weather-thanksgiving/index.html')
        self.assertEqual('en', a.config.language)
@@ -767,6 +765,19 @@ def test_article_pdf_fetching(self):
        a.download()
        self.assertNotEqual('%PDF-', a.html)


class TestIgnoreImages(unittest.TestCase):

    @print_test
    def test_config_ignore_images(self):
        config = Config()
        config.ignored_images_suffix_list = ['think.png', r'(.*)\.ico']
        a = Article('https://www.reillywood.com/blog/why-nu/', config=config)
        a.download()
        a.parse()
        self.assertEqual('https://d33wubrfki0l68.cloudfront.net/'
                         '77d3013f91800257b3ca2adfb995ae24e49fff4e/b3086/img/main/headshot.jpg',
                         a.top_img)


if __name__ == '__main__':
    argv = list(sys.argv)
    if 'fulltext' in argv: