From 94b1a20854cee30f2d4b282de1ca68c3ba591c73 Mon Sep 17 00:00:00 2001 From: Shevchenko Vitaliy Date: Wed, 8 Apr 2015 19:14:47 +0300 Subject: [PATCH 01/44] Add python3 support --- goose/__init__.py | 9 ++++++--- goose/cleaners.py | 2 ++ goose/configuration.py | 9 +++++++-- goose/extractors/content.py | 2 +- goose/extractors/images.py | 6 +++++- goose/extractors/metas.py | 6 ++++-- goose/image.py | 4 ++++ goose/network.py | 17 +++++++++-------- goose/outputformatters.py | 6 +++++- goose/parsers.py | 7 +++++-- goose/text.py | 13 ++++++++----- goose/utils/__init__.py | 13 ++++++++++--- goose/utils/encoding.py | 28 +++++++++++++++------------- goose/utils/images.py | 10 +++++++--- setup.py | 2 +- tests/extractors/authors.py | 20 +++++++++++++++++--- tests/extractors/base.py | 11 +++++++++-- tests/extractors/content.py | 4 +++- tests/extractors/images.py | 6 ++++-- tests/extractors/links.py | 4 +++- tests/extractors/metas.py | 4 +++- tests/extractors/opengraph.py | 4 +++- tests/extractors/publishdate.py | 4 +++- tests/extractors/tags.py | 4 +++- tests/extractors/title.py | 4 +++- tests/extractors/tweets.py | 3 ++- tests/extractors/videos.py | 4 +++- tests/parsers.py | 3 +++ 28 files changed, 148 insertions(+), 61 deletions(-) diff --git a/goose/__init__.py b/goose/__init__.py index 409b5732..f267fa34 100644 --- a/goose/__init__.py +++ b/goose/__init__.py @@ -64,9 +64,12 @@ def crawl(self, crawl_candiate): try: crawler = Crawler(self.config) article = crawler.crawl(crawl_candiate) - except (UnicodeDecodeError, ValueError): - self.config.parser_class = parsers[0] - return self.crawl(crawl_candiate) + except (UnicodeDecodeError, ValueError) as e: + if parsers: + self.config.parser_class = parsers[0] + return self.crawl(crawl_candiate) + else: + raise e return article def initialize(self): diff --git a/goose/cleaners.py b/goose/cleaners.py index c1384ee0..9ab45b6d 100644 --- a/goose/cleaners.py +++ b/goose/cleaners.py @@ -20,6 +20,8 @@ See the License for the specific language governing permissions and limitations under the License. 
""" +from __future__ import unicode_literals + from goose.utils import ReplaceSequence diff --git a/goose/configuration.py b/goose/configuration.py index fcfa5b9a..4913f699 100644 --- a/goose/configuration.py +++ b/goose/configuration.py @@ -22,6 +22,9 @@ """ import os import tempfile + +import six + from goose.text import StopWords from goose.parsers import Parser from goose.parsers import ParserSoup @@ -30,10 +33,12 @@ HTTP_DEFAULT_TIMEOUT = 30 AVAILABLE_PARSERS = { - 'lxml': Parser, - 'soup': ParserSoup, + 'lxml': Parser } +if six.PY2: + AVAILABLE_PARSERS['soup'] = ParserSoup + class Configuration(object): diff --git a/goose/extractors/content.py b/goose/extractors/content.py index e0703d55..afdc2c91 100644 --- a/goose/extractors/content.py +++ b/goose/extractors/content.py @@ -260,7 +260,7 @@ def update_score(self, node, addToScore): if score_string: current_score = int(score_string) - new_score = current_score + addToScore + new_score = current_score + int(addToScore) self.parser.setAttribute(node, "gravityScore", str(new_score)) def update_node_count(self, node, add_to_count): diff --git a/goose/extractors/images.py b/goose/extractors/images.py index 3af44f5f..2dd63786 100644 --- a/goose/extractors/images.py +++ b/goose/extractors/images.py @@ -23,7 +23,11 @@ import re import os -from urlparse import urlparse, urljoin +try: + from urlparse import urlparse, urljoin +except ImportError: + from urllib.parse import urlparse, urljoin + from goose.extractors import BaseExtractor from goose.image import Image diff --git a/goose/extractors/metas.py b/goose/extractors/metas.py index 95acadd5..ee7d520e 100644 --- a/goose/extractors/metas.py +++ b/goose/extractors/metas.py @@ -22,8 +22,10 @@ """ import re -from urlparse import urljoin -from urlparse import urlparse +try: + from urlparse import urlparse, urljoin +except ImportError: + from urllib.parse import urlparse, urljoin from goose.extractors import BaseExtractor diff --git a/goose/image.py b/goose/image.py index 351e3396..23026398 100644 --- a/goose/image.py +++ b/goose/image.py @@ -20,6 +20,10 @@ See the License for the specific language governing permissions and limitations under the License. """ +try: + long +except NameError: + long = int class Image(object): diff --git a/goose/network.py b/goose/network.py index 666a7d61..2b8265ad 100644 --- a/goose/network.py +++ b/goose/network.py @@ -20,7 +20,12 @@ See the License for the specific language governing permissions and limitations under the License. """ -import urllib2 +import six + +try: + from urllib2 import urlopen, Request +except ImportError: + from urllib.request import urlopen, Request class HtmlFetcher(object): @@ -39,18 +44,14 @@ def get_url(self): def get_html(self, url): # utf-8 encode unicode url - if isinstance(url, unicode): + if isinstance(url, six.text_type) and six.PY2: url = url.encode('utf-8') # set request - self.request = urllib2.Request( - url, - headers=self.headers) + self.request = Request(url, headers=self.headers) # do request try: - self.result = urllib2.urlopen( - self.request, - timeout=self.config.http_timeout) + self.result = urlopen(self.request, timeout=self.config.http_timeout) except Exception: self.result = None diff --git a/goose/outputformatters.py b/goose/outputformatters.py index 1f8ba4bd..21dab451 100644 --- a/goose/outputformatters.py +++ b/goose/outputformatters.py @@ -20,7 +20,11 @@ See the License for the specific language governing permissions and limitations under the License. 
""" -from HTMLParser import HTMLParser +try: + from HTMLParser import HTMLParser +except ImportError: + from html.parser import HTMLParser + from goose.text import innerTrim diff --git a/goose/parsers.py b/goose/parsers.py index a43e9b47..c0b091a9 100644 --- a/goose/parsers.py +++ b/goose/parsers.py @@ -21,7 +21,9 @@ limitations under the License. """ import lxml.html -from lxml.html import soupparser + +import six + from lxml import etree from copy import deepcopy from goose.text import innerTrim @@ -56,7 +58,7 @@ def fromstring(self, html): @classmethod def nodeToString(self, node): - return etree.tostring(node) + return etree.tostring(node, encoding=six.text_type) @classmethod def replaceTag(self, node, tag): @@ -239,6 +241,7 @@ class ParserSoup(Parser): @classmethod def fromstring(self, html): + from lxml.html import soupparser html = encodeValue(html) self.doc = soupparser.fromstring(html) return self.doc diff --git a/goose/text.py b/goose/text.py index 3ef63d6b..02846e20 100644 --- a/goose/text.py +++ b/goose/text.py @@ -23,6 +23,9 @@ import os import re import string + +import six + from goose.utils import FileHelper from goose.utils.encoding import smart_unicode from goose.utils.encoding import smart_str @@ -32,7 +35,7 @@ def innerTrim(value): - if isinstance(value, (unicode, str)): + if isinstance(value, (six.text_type, six.string_types)): # remove tab and white space value = re.sub(TABSSPACE, ' ', value) value = ''.join(value.splitlines()) @@ -87,7 +90,6 @@ def set_word_count(self, cnt): class StopWords(object): PUNCTUATION = re.compile("[^\\p{Ll}\\p{Lu}\\p{Lt}\\p{Lo}\\p{Nd}\\p{Pc}\\s]") - TRANS_TABLE = string.maketrans('', '') _cached_stop_words = {} def __init__(self, language='en'): @@ -106,9 +108,10 @@ def __init__(self, language='en'): def remove_punctuation(self, content): # code taken form # http://stackoverflow.com/questions/265960/best-way-to-strip-punctuation-from-a-string-in-python - if isinstance(content, unicode): - content = content.encode('utf-8') - return content.translate(self.TRANS_TABLE, string.punctuation) + if not isinstance(content, six.text_type): + content = content.decode('utf-8') + tbl = dict.fromkeys(ord(x) for x in string.punctuation) + return content.translate(tbl) def candiate_words(self, stripped_input): return stripped_input.split(' ') diff --git a/goose/utils/__init__.py b/goose/utils/__init__.py index 5a1de7d4..41cf9c95 100644 --- a/goose/utils/__init__.py +++ b/goose/utils/__init__.py @@ -26,7 +26,13 @@ import os import goose import codecs -import urlparse + +import six + +try: + from urlparse import urlparse +except ImportError: + from urllib.parse import urlparse class BuildURL(object): @@ -89,7 +95,7 @@ def __init__(self, urlString, link_hash): class RawHelper(object): @classmethod def get_parsing_candidate(self, url, raw_html): - if isinstance(raw_html, unicode): + if isinstance(raw_html, six.text_type): raw_html = raw_html.encode('utf-8') link_hash = '%s.%s' % (hashlib.md5(raw_html).hexdigest(), time.time()) return ParsingCandidate(url, link_hash) @@ -101,7 +107,8 @@ def get_parsing_candidate(self, url_to_crawl): # replace shebang is urls final_url = url_to_crawl.replace('#!', '?_escaped_fragment_=') \ if '#!' 
in url_to_crawl else url_to_crawl - link_hash = '%s.%s' % (hashlib.md5(final_url).hexdigest(), time.time()) + url = final_url.encode("utf-8") if isinstance(final_url, six.text_type) else final_url + link_hash = '%s.%s' % (hashlib.md5(url).hexdigest(), time.time()) return ParsingCandidate(final_url, link_hash) diff --git a/goose/utils/encoding.py b/goose/utils/encoding.py index 4dc23ca7..eb98917c 100644 --- a/goose/utils/encoding.py +++ b/goose/utils/encoding.py @@ -1,6 +1,8 @@ # -*- coding: utf-8 -*- -import types import datetime + +import six + from decimal import Decimal @@ -45,8 +47,8 @@ def is_protected_type(obj): force_unicode(strings_only=True). """ return isinstance(obj, ( - types.NoneType, - int, long, + type(None), + six.integer_types, datetime.datetime, datetime.date, datetime.time, float, Decimal) ) @@ -62,17 +64,17 @@ def force_unicode(s, encoding='utf-8', strings_only=False, errors='strict'): # Handle the common case first, saves 30-40% in performance when s # is an instance of unicode. This function gets called often in that # setting. - if isinstance(s, unicode): + if isinstance(s, six.text_type): return s if strings_only and is_protected_type(s): return s try: - if not isinstance(s, basestring,): + if not isinstance(s, six.string_types,): if hasattr(s, '__unicode__'): - s = unicode(s) + s = s.__unicode__() else: try: - s = unicode(str(s), encoding, errors) + s = six.text_type(s, encoding, errors) except UnicodeEncodeError: if not isinstance(s, Exception): raise @@ -84,12 +86,12 @@ def force_unicode(s, encoding='utf-8', strings_only=False, errors='strict'): # output should be. s = u' '.join([force_unicode(arg, encoding, strings_only, errors) for arg in s]) - elif not isinstance(s, unicode): + elif not isinstance(s, six.text_type): # Note: We use .decode() here, instead of unicode(s, encoding, # errors), so that if s is a SafeString, it ends up being a # SafeUnicode at the end. s = s.decode(encoding, errors) - except UnicodeDecodeError, e: + except UnicodeDecodeError as e: if not isinstance(s, Exception): raise DjangoUnicodeDecodeError(s, *e.args) else: @@ -109,11 +111,11 @@ def smart_str(s, encoding='utf-8', strings_only=False, errors='strict'): If strings_only is True, don't convert (some) non-string-like objects. """ - if strings_only and isinstance(s, (types.NoneType, int)): + if strings_only and isinstance(s, (type(None), int)): return s # if isinstance(s, Promise): # return unicode(s).encode(encoding, errors) - if not isinstance(s, basestring): + if not isinstance(s, six.string_types): try: return str(s) except UnicodeEncodeError: @@ -123,8 +125,8 @@ def smart_str(s, encoding='utf-8', strings_only=False, errors='strict'): # further exception. 
return ' '.join([smart_str(arg, encoding, strings_only, errors) for arg in s]) - return unicode(s).encode(encoding, errors) - elif isinstance(s, unicode): + return six.text_type(s).encode(encoding, errors) + elif isinstance(s, six.text_type): return s.encode(encoding, errors) elif s and encoding != 'utf-8': return s.decode('utf-8', errors).encode(encoding, errors) diff --git a/goose/utils/images.py b/goose/utils/images.py index 388d5c85..76a8c72f 100644 --- a/goose/utils/images.py +++ b/goose/utils/images.py @@ -22,8 +22,12 @@ """ import hashlib import os -import urllib2 +try: + from urllib2 import urlopen, Request +except ImportError: + from urllib.request import urlopen, Request from PIL import Image + from goose.utils.encoding import smart_str from goose.image import ImageDetails from goose.image import LocallyStoredImage @@ -115,8 +119,8 @@ def clean_src_string(self, src): @classmethod def fetch(self, http_client, src): try: - req = urllib2.Request(src) - f = urllib2.urlopen(req) + req = Request(src) + f = urlopen(req) data = f.read() return data except Exception: diff --git a/setup.py b/setup.py index ebad2547..fbe60081 100644 --- a/setup.py +++ b/setup.py @@ -66,6 +66,6 @@ packages=find_packages(), include_package_data=True, zip_safe=False, - install_requires=['Pillow', 'lxml', 'cssselect', 'jieba', 'beautifulsoup', 'nltk'], + install_requires=['Pillow', 'lxml', 'cssselect', 'jieba', 'nltk'], test_suite="tests" ) diff --git a/tests/extractors/authors.py b/tests/extractors/authors.py index 709040c1..a21d362e 100644 --- a/tests/extractors/authors.py +++ b/tests/extractors/authors.py @@ -21,12 +21,26 @@ limitations under the License. """ -from base import TestExtractionBase +from __future__ import absolute_import + +from .base import TestExtractionBase class TestArticleAuthor(TestExtractionBase): def test_author_schema(self): article = self.getArticle() - fields = ['authors'] - self.runArticleAssertions(article=article, fields=fields) + field = 'authors' + + # Do not call self.runArticleAssertions because need to sort results, + # because set not save ordering, so test failed; + + expected_value = self.data['expected'][field] + result_value = getattr(article, field, None) + + expected_value.sort() + result_value.sort() + + # default assertion + msg = u"Error %s \nexpected: %s\nresult: %s" % (field, expected_value, result_value) + self.assertEqual(expected_value, result_value, msg=msg) diff --git a/tests/extractors/base.py b/tests/extractors/base.py index e19d20e0..72d4c601 100644 --- a/tests/extractors/base.py +++ b/tests/extractors/base.py @@ -22,11 +22,18 @@ """ import os import json -import urllib2 import unittest import socket -from StringIO import StringIO +try: + import urllib2 +except ImportError: + import urllib.request as urllib2 + +try: + from StringIO import StringIO +except ImportError: + from io import StringIO from goose import Goose from goose.utils import FileHelper diff --git a/tests/extractors/content.py b/tests/extractors/content.py index 30dc2754..854c4bd1 100644 --- a/tests/extractors/content.py +++ b/tests/extractors/content.py @@ -20,7 +20,9 @@ See the License for the specific language governing permissions and limitations under the License. 
""" -from base import TestExtractionBase +from __future__ import absolute_import + +from .base import TestExtractionBase from goose.text import StopWordsChinese from goose.text import StopWordsArabic diff --git a/tests/extractors/images.py b/tests/extractors/images.py index e47a1dde..896d6985 100644 --- a/tests/extractors/images.py +++ b/tests/extractors/images.py @@ -20,13 +20,15 @@ See the License for the specific language governing permissions and limitations under the License. """ +from __future__ import absolute_import + import os import json import hashlib import unittest -from base import MockResponse -from base import TestExtractionBase +from .base import MockResponse +from .base import TestExtractionBase from goose.configuration import Configuration from goose.image import Image diff --git a/tests/extractors/links.py b/tests/extractors/links.py index 8539465e..ea15a459 100644 --- a/tests/extractors/links.py +++ b/tests/extractors/links.py @@ -21,7 +21,9 @@ limitations under the License. """ -from base import TestExtractionBase +from __future__ import absolute_import + +from .base import TestExtractionBase class TestArticleLinks(TestExtractionBase): diff --git a/tests/extractors/metas.py b/tests/extractors/metas.py index fd45915a..a4eef74c 100644 --- a/tests/extractors/metas.py +++ b/tests/extractors/metas.py @@ -21,7 +21,9 @@ limitations under the License. """ -from base import TestExtractionBase +from __future__ import absolute_import + +from .base import TestExtractionBase class TestMetas(TestExtractionBase): diff --git a/tests/extractors/opengraph.py b/tests/extractors/opengraph.py index 415a784c..a0616227 100644 --- a/tests/extractors/opengraph.py +++ b/tests/extractors/opengraph.py @@ -21,7 +21,9 @@ limitations under the License. """ -from base import TestExtractionBase +from __future__ import absolute_import + +from .base import TestExtractionBase class TestOpenGraph(TestExtractionBase): diff --git a/tests/extractors/publishdate.py b/tests/extractors/publishdate.py index 8d2a13b9..355250d5 100644 --- a/tests/extractors/publishdate.py +++ b/tests/extractors/publishdate.py @@ -21,7 +21,9 @@ limitations under the License. """ -from base import TestExtractionBase +from __future__ import absolute_import + +from .base import TestExtractionBase class TestPublishDate(TestExtractionBase): diff --git a/tests/extractors/tags.py b/tests/extractors/tags.py index 22b17129..2f5562ba 100644 --- a/tests/extractors/tags.py +++ b/tests/extractors/tags.py @@ -21,7 +21,9 @@ limitations under the License. """ -from base import TestExtractionBase +from __future__ import absolute_import + +from .base import TestExtractionBase class TestArticleTags(TestExtractionBase): diff --git a/tests/extractors/title.py b/tests/extractors/title.py index 09170205..c6f7813c 100644 --- a/tests/extractors/title.py +++ b/tests/extractors/title.py @@ -21,7 +21,9 @@ limitations under the License. """ -from base import TestExtractionBase +from __future__ import absolute_import + +from .base import TestExtractionBase class TestTitle(TestExtractionBase): diff --git a/tests/extractors/tweets.py b/tests/extractors/tweets.py index 50300f43..3f72a604 100644 --- a/tests/extractors/tweets.py +++ b/tests/extractors/tweets.py @@ -20,8 +20,9 @@ See the License for the specific language governing permissions and limitations under the License. 
""" +from __future__ import absolute_import -from base import TestExtractionBase +from .base import TestExtractionBase class TestArticleTweet(TestExtractionBase): diff --git a/tests/extractors/videos.py b/tests/extractors/videos.py index 10be15ff..0350c8c3 100644 --- a/tests/extractors/videos.py +++ b/tests/extractors/videos.py @@ -20,7 +20,9 @@ See the License for the specific language governing permissions and limitations under the License. """ -from base import TestExtractionBase +from __future__ import absolute_import + +from .base import TestExtractionBase class ImageExtractionTests(TestExtractionBase): diff --git a/tests/parsers.py b/tests/parsers.py index 6614368d..7b47d89e 100644 --- a/tests/parsers.py +++ b/tests/parsers.py @@ -22,6 +22,7 @@ """ import os import unittest +import sys from goose.utils import FileHelper from goose.parsers import Parser @@ -260,5 +261,7 @@ class TestParser(ParserBase): class TestParserSoup(ParserBase): + + @unittest.skipIf(sys.version_info.major != 2, "supported only in python2") def setUp(self): self.parser = ParserSoup From 6d9156595ffa093febad6e137f877add162e269a Mon Sep 17 00:00:00 2001 From: Shevchenko Vitaliy Date: Wed, 8 Apr 2015 19:19:53 +0300 Subject: [PATCH 02/44] Update requirements --- requirements.txt | 3 ++- setup.py | 9 ++++++++- 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/requirements.txt b/requirements.txt index 7e6a6c09..bbd377ee 100644 --- a/requirements.txt +++ b/requirements.txt @@ -2,5 +2,6 @@ Pillow lxml cssselect jieba -beautifulsoup +beautifulsoup # Only on python2 nltk +six diff --git a/setup.py b/setup.py index fbe60081..df367682 100644 --- a/setup.py +++ b/setup.py @@ -22,6 +22,8 @@ """ import os +import sys + from setuptools import setup, find_packages from imp import load_source @@ -53,6 +55,11 @@ except Exception: long_description = description +requirements = ['Pillow', 'lxml', 'cssselect', 'jieba', 'nltk', 'six'] +if sys.version_info.major == 2: + requirements.append('beautifulsoup') + + setup(name='goose-extractor', version=version.__version__, description=description, @@ -66,6 +73,6 @@ packages=find_packages(), include_package_data=True, zip_safe=False, - install_requires=['Pillow', 'lxml', 'cssselect', 'jieba', 'nltk'], + install_requires=requirements, test_suite="tests" ) From 79a12dd349efdb77c2762e231b8dc6e5c4166a2b Mon Sep 17 00:00:00 2001 From: Shevchenko Vitaliy Date: Thu, 9 Apr 2015 11:58:29 +0300 Subject: [PATCH 03/44] Add python3 to CLASSIFIERS --- setup.py | 1 + 1 file changed, 1 insertion(+) diff --git a/setup.py b/setup.py index df367682..ab03c825 100644 --- a/setup.py +++ b/setup.py @@ -42,6 +42,7 @@ 'Programming Language :: Python :: 2', 'Programming Language :: Python :: 2.6', 'Programming Language :: Python :: 2.7', + 'Programming Language :: Python :: 3', 'Topic :: Internet', 'Topic :: Utilities', 'Topic :: Software Development :: Libraries :: Python Modules'] From 76af358635237859917fec2e8f81cb17c6004934 Mon Sep 17 00:00:00 2001 From: Shevchenko Vitaliy Date: Thu, 9 Apr 2015 12:21:19 +0300 Subject: [PATCH 04/44] Optimize imports --- goose/__init__.py | 1 - goose/extractors/images.py | 6 +----- goose/extractors/metas.py | 6 ++---- goose/image.py | 8 ++------ goose/outputformatters.py | 5 +---- goose/utils/images.py | 7 +++---- goose/video.py | 1 + tests/extractors/base.py | 5 +---- 8 files changed, 11 insertions(+), 28 deletions(-) diff --git a/goose/__init__.py b/goose/__init__.py index f267fa34..d1cd6da8 100644 --- a/goose/__init__.py +++ b/goose/__init__.py @@ -21,7 +21,6 @@ 
limitations under the License. """ import os -import platform from tempfile import mkstemp from goose.version import version_info, __version__ diff --git a/goose/extractors/images.py b/goose/extractors/images.py index 2dd63786..f258aead 100644 --- a/goose/extractors/images.py +++ b/goose/extractors/images.py @@ -23,11 +23,7 @@ import re import os -try: - from urlparse import urlparse, urljoin -except ImportError: - from urllib.parse import urlparse, urljoin - +from six.moves.urllib.parse import urlparse, urljoin from goose.extractors import BaseExtractor from goose.image import Image diff --git a/goose/extractors/metas.py b/goose/extractors/metas.py index ee7d520e..5a65aa16 100644 --- a/goose/extractors/metas.py +++ b/goose/extractors/metas.py @@ -22,10 +22,8 @@ """ import re -try: - from urlparse import urlparse, urljoin -except ImportError: - from urllib.parse import urlparse, urljoin + +from six.moves.urllib.parse import urlparse, urljoin from goose.extractors import BaseExtractor diff --git a/goose/image.py b/goose/image.py index 23026398..58ddd021 100644 --- a/goose/image.py +++ b/goose/image.py @@ -20,10 +20,6 @@ See the License for the specific language governing permissions and limitations under the License. """ -try: - long -except NameError: - long = int class Image(object): @@ -50,7 +46,7 @@ def __init__(self): self.extraction_type = "NA" # stores how many bytes this image is. - self.bytes = long(0) + self.bytes = 0 def get_src(self): return self.src @@ -91,7 +87,7 @@ def set_mime_type(self, mime_type): class LocallyStoredImage(object): def __init__(self, src='', local_filename='', - link_hash='', bytes=long(0), file_extension='', height=0, width=0): + link_hash='', bytes=0, file_extension='', height=0, width=0): self.src = src self.local_filename = local_filename self.link_hash = link_hash diff --git a/goose/outputformatters.py b/goose/outputformatters.py index 21dab451..808f2eee 100644 --- a/goose/outputformatters.py +++ b/goose/outputformatters.py @@ -20,10 +20,7 @@ See the License for the specific language governing permissions and limitations under the License. """ -try: - from HTMLParser import HTMLParser -except ImportError: - from html.parser import HTMLParser +from six.moves.html_parser import HTMLParser from goose.text import innerTrim diff --git a/goose/utils/images.py b/goose/utils/images.py index 76a8c72f..92d5a133 100644 --- a/goose/utils/images.py +++ b/goose/utils/images.py @@ -22,10 +22,9 @@ """ import hashlib import os -try: - from urllib2 import urlopen, Request -except ImportError: - from urllib.request import urlopen, Request + +from six.moves.urllib.request import urlopen, Request + from PIL import Image from goose.utils.encoding import smart_str diff --git a/goose/video.py b/goose/video.py index 8509bba0..0691ac96 100644 --- a/goose/video.py +++ b/goose/video.py @@ -21,6 +21,7 @@ limitations under the License. 
""" + class Video(object): """\ Video object diff --git a/tests/extractors/base.py b/tests/extractors/base.py index 72d4c601..a0849e35 100644 --- a/tests/extractors/base.py +++ b/tests/extractors/base.py @@ -30,10 +30,7 @@ except ImportError: import urllib.request as urllib2 -try: - from StringIO import StringIO -except ImportError: - from io import StringIO +from six import StringIO from goose import Goose from goose.utils import FileHelper From f44c2af9e6eee9ac21f45612f1a6b76ee3682cd0 Mon Sep 17 00:00:00 2001 From: Shevchenko Vitaliy Date: Thu, 9 Apr 2015 12:37:33 +0300 Subject: [PATCH 05/44] Restore python 2.6 support --- setup.py | 2 +- tests/parsers.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/setup.py b/setup.py index ab03c825..c046ed82 100644 --- a/setup.py +++ b/setup.py @@ -57,7 +57,7 @@ long_description = description requirements = ['Pillow', 'lxml', 'cssselect', 'jieba', 'nltk', 'six'] -if sys.version_info.major == 2: +if sys.version_info[0] == 2: requirements.append('beautifulsoup') diff --git a/tests/parsers.py b/tests/parsers.py index 7b47d89e..812cc6b2 100644 --- a/tests/parsers.py +++ b/tests/parsers.py @@ -22,7 +22,7 @@ """ import os import unittest -import sys +import six from goose.utils import FileHelper from goose.parsers import Parser @@ -262,6 +262,6 @@ class TestParser(ParserBase): class TestParserSoup(ParserBase): - @unittest.skipIf(sys.version_info.major != 2, "supported only in python2") + @unittest.skipIf(six.PY3, "supported only in python2") def setUp(self): self.parser = ParserSoup From 2e18083b9f52a903e53e5bbe30b55d28b37d181b Mon Sep 17 00:00:00 2001 From: Shevchenko Vitaliy Date: Thu, 9 Apr 2015 13:06:08 +0300 Subject: [PATCH 06/44] Try to fix tests in python 2.6 --- setup.py | 2 ++ tests/parsers.py | 6 +++++- 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/setup.py b/setup.py index c046ed82..bce19c5c 100644 --- a/setup.py +++ b/setup.py @@ -59,6 +59,8 @@ requirements = ['Pillow', 'lxml', 'cssselect', 'jieba', 'nltk', 'six'] if sys.version_info[0] == 2: requirements.append('beautifulsoup') + if sys.version_info[1] < 7: + requirements.append('unittest2') setup(name='goose-extractor', diff --git a/tests/parsers.py b/tests/parsers.py index 812cc6b2..41aa5934 100644 --- a/tests/parsers.py +++ b/tests/parsers.py @@ -21,7 +21,11 @@ limitations under the License. 
""" import os -import unittest +try: + import unittest2 as unittest # Need to support skipIf in python 2.6 +except ImportError: + import unittest + import six from goose.utils import FileHelper From b7884f1c3e219dd2c4fa473a10f0aba80618e628 Mon Sep 17 00:00:00 2001 From: Shevchenko Vitaliy Date: Wed, 15 Apr 2015 17:09:05 +0300 Subject: [PATCH 07/44] Fix smart_str --- goose/utils/encoding.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/goose/utils/encoding.py b/goose/utils/encoding.py index eb98917c..25022704 100644 --- a/goose/utils/encoding.py +++ b/goose/utils/encoding.py @@ -115,9 +115,13 @@ def smart_str(s, encoding='utf-8', strings_only=False, errors='strict'): return s # if isinstance(s, Promise): # return unicode(s).encode(encoding, errors) - if not isinstance(s, six.string_types): + if isinstance(s, six.text_type): + return s.encode(encoding, errors) + elif not isinstance(s, (six.binary_type, six.string_types)): try: - return str(s) + if six.PY2: + return str(s) + return str(s).encode(encoding, errors) except UnicodeEncodeError: if isinstance(s, Exception): # An Exception subclass containing non-ASCII data that doesn't @@ -126,8 +130,6 @@ def smart_str(s, encoding='utf-8', strings_only=False, errors='strict'): return ' '.join([smart_str(arg, encoding, strings_only, errors) for arg in s]) return six.text_type(s).encode(encoding, errors) - elif isinstance(s, six.text_type): - return s.encode(encoding, errors) elif s and encoding != 'utf-8': return s.decode('utf-8', errors).encode(encoding, errors) else: From 90287612f6517ab07655acfe28cf43fd16ef8a1b Mon Sep 17 00:00:00 2001 From: Shevchenko Vitaliy Date: Wed, 15 Apr 2015 17:10:30 +0300 Subject: [PATCH 08/44] Fix ValueError if we get document with set encoding. Add test case for this. --- goose/parsers.py | 3 +-- goose/text.py | 34 +++++++++++++++++++++++++++++++--- tests/parsers.py | 15 +++++++++++++++ 3 files changed, 47 insertions(+), 5 deletions(-) diff --git a/goose/parsers.py b/goose/parsers.py index c0b091a9..61d6510e 100644 --- a/goose/parsers.py +++ b/goose/parsers.py @@ -26,8 +26,7 @@ from lxml import etree from copy import deepcopy -from goose.text import innerTrim -from goose.text import encodeValue +from goose.text import innerTrim, encodeValue, get_encodings_from_content class Parser(object): diff --git a/goose/text.py b/goose/text.py index 02846e20..960d0608 100644 --- a/goose/text.py +++ b/goose/text.py @@ -34,6 +34,28 @@ TABSSPACE = re.compile(r'[\s\t]+') +def get_encodings_from_content(content): + """ + Code from: + https://github.com/sigmavirus24/requests-toolbelt/blob/master/requests_toolbelt/utils/deprecated.py + Return encodings from given content string. + :param content: string to extract encodings from. 
+ """ + find_charset = re.compile( + r']', flags=re.I + ).findall + + find_pragma = re.compile( + r']', flags=re.I + ).findall + + find_xml = re.compile( + r'^<\?xml.*?encoding=["\']*(.+?)["\'>]' + ).findall + + return find_charset(content) + find_pragma(content) + find_xml(content) + + def innerTrim(value): if isinstance(value, (six.text_type, six.string_types)): # remove tab and white space @@ -46,9 +68,15 @@ def innerTrim(value): def encodeValue(value): string_org = value try: - value = smart_unicode(value) - except (UnicodeEncodeError, DjangoUnicodeDecodeError): - value = smart_str(value) + encoding = get_encodings_from_content(value) + if encoding: + # If encoding is set we must pass bytes to lxml.html.fromstring or will get exception; + value = smart_str(value) + else: + try: + value = smart_unicode(value) + except (UnicodeEncodeError, DjangoUnicodeDecodeError): + value = smart_str(value) except Exception: value = string_org return value diff --git a/tests/parsers.py b/tests/parsers.py index 41aa5934..e5f17164 100644 --- a/tests/parsers.py +++ b/tests/parsers.py @@ -259,6 +259,21 @@ def test_delAttribute(self): # remove an unexistant attribute self.parser.delAttribute(div, attr="bla") + def test_encoding(self): + """ + If pass unicode string to lxml.html.fromstring with encoding set in document will receive: + "ValueError: Unicode strings with encoding declaration are not supported. + Please use bytes input or XML fragments without declaration." + Test for this case. + """ + html = u""" + + """ + html += u'' + html += u'
Я рядочок
' + html += u'' + self.parser.fromstring(html) + class TestParser(ParserBase): pass From 74743ab334af9c9c64e17c92227ceaca2fbc3ba9 Mon Sep 17 00:00:00 2001 From: Shevchenko Vitaliy Date: Wed, 15 Apr 2015 18:55:11 +0300 Subject: [PATCH 09/44] Add py 3.4 to travis.yml --- .travis.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.travis.yml b/.travis.yml index 2f2c722e..4b341e25 100644 --- a/.travis.yml +++ b/.travis.yml @@ -3,6 +3,7 @@ language: python python: - 2.6 - 2.7 + - 3.4 install: - pip install -r requirements.txt --use-mirrors From 5fbc788bd4340edaeb64012b1e6c1b8141bbf1cf Mon Sep 17 00:00:00 2001 From: Shevchenko Vitaliy Date: Wed, 15 Apr 2015 22:32:37 +0300 Subject: [PATCH 10/44] Remove install from requirements file in travis.yml as no way to avoid install bs3 under py3 --- .travis.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.travis.yml b/.travis.yml index 4b341e25..a242d0ac 100644 --- a/.travis.yml +++ b/.travis.yml @@ -6,7 +6,7 @@ python: - 3.4 install: - - pip install -r requirements.txt --use-mirrors + - pip install jieba - python setup.py install script: python setup.py test From 1d029324a11305d9c93eae5181c33fda17c757cc Mon Sep 17 00:00:00 2001 From: Shevchenko Vitaliy Date: Wed, 15 Apr 2015 22:32:57 +0300 Subject: [PATCH 11/44] Close image file after use --- goose/utils/images.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/goose/utils/images.py b/goose/utils/images.py index 92d5a133..31a55d61 100644 --- a/goose/utils/images.py +++ b/goose/utils/images.py @@ -38,9 +38,9 @@ class ImageUtils(object): def get_image_dimensions(self, identify_program, path): image_details = ImageDetails() try: - image = Image.open(path) - image_details.set_mime_type(image.format) - width, height = image.size + with Image.open(path) as image: + image_details.set_mime_type(image.format) + width, height = image.size image_details.set_width(width) image_details.set_height(height) except IOError: From 9091e3827cae539b3bacd2cabd5eb3f4c39225ea Mon Sep 17 00:00:00 2001 From: Shevchenko Vitaliy Date: Wed, 15 Apr 2015 22:35:10 +0300 Subject: [PATCH 12/44] Fix tests --- goose/parsers.py | 13 ++++++++++--- goose/text.py | 12 +++--------- 2 files changed, 13 insertions(+), 12 deletions(-) diff --git a/goose/parsers.py b/goose/parsers.py index 61d6510e..fab3eb31 100644 --- a/goose/parsers.py +++ b/goose/parsers.py @@ -26,7 +26,7 @@ from lxml import etree from copy import deepcopy -from goose.text import innerTrim, encodeValue, get_encodings_from_content +from goose.text import innerTrim, encodeValue, get_encodings_from_content, smart_str class Parser(object): @@ -51,8 +51,15 @@ def css_select(self, node, selector): @classmethod def fromstring(self, html): - html = encodeValue(html) - self.doc = lxml.html.fromstring(html) + encoding = get_encodings_from_content(html) + encoding = encoding and encoding[0] or None + if not encoding: + html = encodeValue(html) + self.doc = lxml.html.fromstring(html) + else: + html = smart_str(html, encoding=encoding) + parser = lxml.html.HTMLParser(encoding=encoding) + self.doc = lxml.html.fromstring(html, parser=parser) return self.doc @classmethod diff --git a/goose/text.py b/goose/text.py index 960d0608..343fdbc2 100644 --- a/goose/text.py +++ b/goose/text.py @@ -68,15 +68,9 @@ def innerTrim(value): def encodeValue(value): string_org = value try: - encoding = get_encodings_from_content(value) - if encoding: - # If encoding is set we must pass bytes to lxml.html.fromstring or will get exception; - value = smart_str(value) 
-        else:
-            try:
-                value = smart_unicode(value)
-            except (UnicodeEncodeError, DjangoUnicodeDecodeError):
-                value = smart_str(value)
+        value = smart_unicode(value)
+    except (UnicodeEncodeError, DjangoUnicodeDecodeError):
+        value = smart_str(value)
     except Exception:
         value = string_org
     return value

From 8fa55b4ebc41b2ebda2edfb309c01719769c9549 Mon Sep 17 00:00:00 2001
From: Shevchenko Vitaliy
Date: Thu, 16 Apr 2015 21:57:27 +0300
Subject: [PATCH 13/44] Fix encoding detection

---
 goose/text.py           | 36 ++++++++++++++++++++++++------------
 goose/utils/encoding.py |  4 +---
 2 files changed, 25 insertions(+), 15 deletions(-)

diff --git a/goose/text.py b/goose/text.py
index 343fdbc2..31070cf0 100644
--- a/goose/text.py
+++ b/goose/text.py
@@ -41,18 +41,30 @@ def get_encodings_from_content(content):
     Return encodings from given content string.
     :param content: string to extract encodings from.
     """
-    find_charset = re.compile(
-        r'<meta.*?charset=["\']*(.+?)["\'>]', flags=re.I
-    ).findall
-
-    find_pragma = re.compile(
-        r'<meta.*?content=["\']*;?charset=(.+?)["\'>]', flags=re.I
-    ).findall
-
-    find_xml = re.compile(
-        r'^<\?xml.*?encoding=["\']*(.+?)["\'>]'
-    ).findall
-
+    if isinstance(content, six.binary_type) and six.PY3:
+        find_charset = re.compile(
+            br'<meta.*?charset=["\']*(.+?)["\'>]', flags=re.I
+        ).findall
+
+        find_pragma = re.compile(
+            br'<meta.*?content=["\']*;?charset=(.+?)["\'>]', flags=re.I
+        ).findall
+
+        find_xml = re.compile(
+            br'^<\?xml.*?encoding=["\']*(.+?)["\'>]'
+        ).findall
+    else:
+        find_charset = re.compile(
+            r'<meta.*?charset=["\']*(.+?)["\'>]', flags=re.I
+        ).findall
+
+        find_pragma = re.compile(
+            r'<meta.*?content=["\']*;?charset=(.+?)["\'>]', flags=re.I
+        ).findall
+
+        find_xml = re.compile(
+            r'^<\?xml.*?encoding=["\']*(.+?)["\'>]'
+        ).findall
     return find_charset(content) + find_pragma(content) + find_xml(content)
diff --git a/goose/utils/encoding.py b/goose/utils/encoding.py
index 25022704..f94f476e 100644
--- a/goose/utils/encoding.py
+++ b/goose/utils/encoding.py
@@ -117,7 +117,7 @@ def smart_str(s, encoding='utf-8', strings_only=False, errors='strict'):
     # return unicode(s).encode(encoding, errors)
     if isinstance(s, six.text_type):
         return s.encode(encoding, errors)
-    elif not isinstance(s, (six.binary_type, six.string_types)):
+    elif not isinstance(s, six.binary_type):
         try:
             if six.PY2:
                 return str(s)
@@ -130,7 +130,5 @@ def smart_str(s, encoding='utf-8', strings_only=False, errors='strict'):
                         for arg in s])
             return six.text_type(s).encode(encoding, errors)
-    elif s and encoding != 'utf-8':
-        return s.decode('utf-8', errors).encode(encoding, errors)
     else:
         return s

From 1ef277b5784887e93c952a4a7cf5ce3a5fb993b4 Mon Sep 17 00:00:00 2001
From: Shevchenko Vitaliy
Date: Thu, 16 Apr 2015 21:57:44 +0300
Subject: [PATCH 14/44] Fix test runner under py3

---
 tests/extractors/base.py   | 10 +++++++---
 tests/extractors/images.py |  2 +-
 2 files changed, 8 insertions(+), 4 deletions(-)

diff --git a/tests/extractors/base.py b/tests/extractors/base.py
index a0849e35..93b3c075 100644
--- a/tests/extractors/base.py
+++ b/tests/extractors/base.py
@@ -30,7 +30,8 @@ except ImportError:
     import urllib.request as urllib2
 
-from six import StringIO
+import six
+from six import StringIO, BytesIO
 
 from goose import Goose
 from goose.utils import FileHelper
@@ -51,13 +52,16 @@ class MockResponse():
     def __init__(self, cls):
         self.cls = cls
 
-    def content(self):
+    def content(self, req):
         return "response"
 
     def response(self, req):
         data = self.content(req)
         url = req.get_full_url()
-        resp = urllib2.addinfourl(StringIO(data), data, url)
+        if isinstance(data, six.binary_type):
+            resp = urllib2.addinfourl(BytesIO(data), data, url)
+        else:
+            resp =
urllib2.addinfourl(StringIO(data), data, url)
         resp.code = self.code
         resp.msg = self.msg
         return resp
diff --git a/tests/extractors/images.py b/tests/extractors/images.py
index 896d6985..9a9712a1 100644
--- a/tests/extractors/images.py
+++ b/tests/extractors/images.py
@@ -43,7 +43,7 @@ class MockResponseImage(MockResponse):
     def image_content(self, req):
-        md5_hash = hashlib.md5(req.get_full_url()).hexdigest()
+        md5_hash = hashlib.md5(req.get_full_url().encode("utf-8")).hexdigest()
         current_test = self.cls._get_current_testname()
         path = os.path.join(
             os.path.dirname(CURRENT_PATH),

From 964eb4806699b58da790124d8a0d5a84c77a1ce1 Mon Sep 17 00:00:00 2001
From: Lol4to
Date: Fri, 13 Nov 2015 18:27:36 +0300
Subject: [PATCH 15/44] Fix unicode processing + `&nbsp;` support

* As STOP_WORDS are stored in unicode format, we should keep our word
  candidates in unicode as well, to be able to compare candidates against
  the dictionary correctly
* With some languages, short stopwords are linked to the next word in the
  sentence with a non-breaking space. To detect those stop words we should
  support nbsp when tokenizing. Russian is an example. So this fixes #223
---
 goose/text.py | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/goose/text.py b/goose/text.py
index 3ef63d6b..040adf4e 100644
--- a/goose/text.py
+++ b/goose/text.py
@@ -28,6 +28,7 @@ from goose.utils.encoding import smart_str
 from goose.utils.encoding import DjangoUnicodeDecodeError
 
+SPACE_SYMBOLS = re.compile(ur'[\s\xa0\t]')
 TABSSPACE = re.compile(r'[\s\t]+')
 
@@ -106,12 +107,14 @@ def __init__(self, language='en'):
     def remove_punctuation(self, content):
         # code taken form
         # http://stackoverflow.com/questions/265960/best-way-to-strip-punctuation-from-a-string-in-python
+        translate = lambda data: data.translate(self.TRANS_TABLE, string.punctuation)
         if isinstance(content, unicode):
-            content = content.encode('utf-8')
-        return content.translate(self.TRANS_TABLE, string.punctuation)
+            return translate(content.encode('utf-8')).decode('utf-8')  # Don't forget to decode back if encoded
+        else:
+            return translate(content)
 
     def candiate_words(self, stripped_input):
-        return stripped_input.split(' ')
+        return re.split(SPACE_SYMBOLS, stripped_input)
 
     def get_stopword_count(self, content):
         if not content:

From 72929331d44309f9002ae0dd3cd268cfddb0e43f Mon Sep 17 00:00:00 2001
From: Lol4to
Date: Wed, 13 Jan 2016 12:31:51 +0300
Subject: [PATCH 16/44] Move to requests as network lib

HTML fetching is now done with requests. Using requests allows writing
high-level code that encapsulates the network & HTML level (decoding
gzip, etc.)
---
 goose/crawler.py |  4 ----
 goose/network.py | 39 +++++++++++++++------------------------
 requirements.txt |  1 +
 setup.py         |  2 +-
 4 files changed, 17 insertions(+), 29 deletions(-)

diff --git a/goose/crawler.py b/goose/crawler.py
index 34daf048..8838bc0f 100644
--- a/goose/crawler.py
+++ b/goose/crawler.py
@@ -213,10 +213,6 @@ def get_html(self, crawl_candidate, parsing_candidate):
         # fetch HTML
         html = self.htmlfetcher.get_html(parsing_candidate.url)
-        self.article.additional_data.update({
-            'request': self.htmlfetcher.request,
-            'result': self.htmlfetcher.result,
-        })
         return html
 
     def get_metas_extractor(self):
diff --git a/goose/network.py b/goose/network.py
index 2b8265ad..41be8238 100644
--- a/goose/network.py
+++ b/goose/network.py
@@ -21,41 +21,32 @@ limitations under the License.
""" import six - -try: - from urllib2 import urlopen, Request -except ImportError: - from urllib.request import urlopen, Request +import requests class HtmlFetcher(object): def __init__(self, config): self.config = config - # set header - self.headers = {'User-agent': self.config.browser_user_agent} + self._connection = requests.Session() + self._connection.headers = {'User-agent': self.config.browser_user_agent} + + self._url = None def get_url(self): - # if we have a result - # get the final_url - if self.result is not None: - return self.result.geturl() - return None + return self._url def get_html(self, url): # utf-8 encode unicode url if isinstance(url, six.text_type) and six.PY2: url = url.encode('utf-8') - # set request - self.request = Request(url, headers=self.headers) - # do request - try: - self.result = urlopen(self.request, timeout=self.config.http_timeout) - except Exception: - self.result = None - - # read the result content - if self.result is not None: - return self.result.read() - return None + response = self._connection.get(url) + if response.ok: + self._url = response.url + text = response.text + else: + self._url = None + text = None + + return text diff --git a/requirements.txt b/requirements.txt index bbd377ee..8d153935 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,3 +1,4 @@ +requests Pillow lxml cssselect diff --git a/setup.py b/setup.py index bce19c5c..66ee40f4 100644 --- a/setup.py +++ b/setup.py @@ -56,7 +56,7 @@ except Exception: long_description = description -requirements = ['Pillow', 'lxml', 'cssselect', 'jieba', 'nltk', 'six'] +requirements = ['Pillow', 'lxml', 'cssselect', 'jieba', 'nltk', 'six', 'requests'] if sys.version_info[0] == 2: requirements.append('beautifulsoup') if sys.version_info[1] < 7: From 87808d2e84e1d8e0fe870f6609ef460e6809a391 Mon Sep 17 00:00:00 2001 From: Lol4to Date: Wed, 13 Jan 2016 12:32:55 +0300 Subject: [PATCH 17/44] Draft new release 1.0.28: * Move to requests as network library --- goose/version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/goose/version.py b/goose/version.py index fedcbb6d..ee492dcd 100644 --- a/goose/version.py +++ b/goose/version.py @@ -21,5 +21,5 @@ limitations under the License. 
""" -version_info = (1, 0, 25) +version_info = (1, 0, 28) __version__ = ".".join(map(str, version_info)) From 5b4ef12986f1ce64d2623fdf17baa78ef2a3305d Mon Sep 17 00:00:00 2001 From: Lol4to Date: Wed, 13 Jan 2016 17:53:55 +0300 Subject: [PATCH 18/44] Allow multiple 'special tags' Some special tags can be false positive, so we had to porcess them all to select best top node --- goose/cleaners.py | 3 +-- goose/crawler.py | 5 ++++- goose/extractors/content.py | 11 ++++++----- 3 files changed, 11 insertions(+), 8 deletions(-) diff --git a/goose/cleaners.py b/goose/cleaners.py index 9ab45b6d..791b232f 100644 --- a/goose/cleaners.py +++ b/goose/cleaners.py @@ -68,8 +68,7 @@ def __init__(self, config, article): .append("\t")\ .append("^\\s+$") - def clean(self): - doc_to_clean = self.article.doc + def clean(self, doc_to_clean): doc_to_clean = self.clean_body_classes(doc_to_clean) doc_to_clean = self.clean_article_tags(doc_to_clean) doc_to_clean = self.clean_em_tags(doc_to_clean) diff --git a/goose/crawler.py b/goose/crawler.py index 8838bc0f..d5caeb58 100644 --- a/goose/crawler.py +++ b/goose/crawler.py @@ -161,7 +161,10 @@ def crawl(self, crawl_candidate): self.article.doc = article_body # before we do any calcs on the body itself let's clean up the document - self.article.doc = self.cleaner.clean() + if not isinstance(self.article.doc, list): + self.article.doc = [self.cleaner.clean(self.article.doc)] + else: + self.article.doc = map(lambda doc1: self.cleaner.clean(doc1), self.article.doc) # big stuff self.article.top_node = self.extractor.calculate_best_node() diff --git a/goose/extractors/content.py b/goose/extractors/content.py index afdc2c91..a9d176e4 100644 --- a/goose/extractors/content.py +++ b/goose/extractors/content.py @@ -52,7 +52,7 @@ def get_known_article_tags(self): self.article.doc, **item) if len(nodes): - return nodes[0] + return nodes return None def is_articlebody(self, node): @@ -315,16 +315,17 @@ def get_node_gravity_score(self, node): return None return int(grvScoreString) - def nodes_to_check(self, doc): + def nodes_to_check(self, docs): """\ returns a list of nodes we want to search on like paragraphs and tables """ nodes_to_check = [] - for tag in ['p', 'pre', 'td']: - items = self.parser.getElementsByTag(doc, tag=tag) - nodes_to_check += items + for doc in docs: + for tag in ['p', 'pre', 'td']: + items = self.parser.getElementsByTag(doc, tag=tag) + nodes_to_check += items return nodes_to_check def is_table_and_no_para_exist(self, e): From 15aeb23579ff91b7d660ec18e4e6d27b497eae89 Mon Sep 17 00:00:00 2001 From: Lol4to Date: Wed, 13 Jan 2016 17:55:45 +0300 Subject: [PATCH 19/44] Do not parse binary to text with requests Requests uses headers-preferred content encoding, but for HTML better choise is TAGS-preferred content encoding --- goose/network.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/goose/network.py b/goose/network.py index 41be8238..c1b15e6b 100644 --- a/goose/network.py +++ b/goose/network.py @@ -44,7 +44,7 @@ def get_html(self, url): response = self._connection.get(url) if response.ok: self._url = response.url - text = response.text + text = response.content else: self._url = None text = None From f7eee8b5b44beaa6e5aba4b22c32b546a6a39bad Mon Sep 17 00:00:00 2001 From: Lol4to Date: Wed, 13 Jan 2016 18:44:04 +0300 Subject: [PATCH 20/44] Fix `map only iterable once' in py3 issue --- goose/crawler.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/goose/crawler.py b/goose/crawler.py index d5caeb58..7db1266b 100644 --- 
a/goose/crawler.py +++ b/goose/crawler.py @@ -164,7 +164,7 @@ def crawl(self, crawl_candidate): if not isinstance(self.article.doc, list): self.article.doc = [self.cleaner.clean(self.article.doc)] else: - self.article.doc = map(lambda doc1: self.cleaner.clean(doc1), self.article.doc) + self.article.doc = list(map(lambda doc1: self.cleaner.clean(doc1), self.article.doc)) # big stuff self.article.top_node = self.extractor.calculate_best_node() From 21bc2b2ea88dbe3798a5b574044134e33ce39626 Mon Sep 17 00:00:00 2001 From: Lol4to Date: Wed, 13 Jan 2016 18:44:26 +0300 Subject: [PATCH 21/44] Additional content extraction filter --- goose/extractors/content.py | 1 + 1 file changed, 1 insertion(+) diff --git a/goose/extractors/content.py b/goose/extractors/content.py index a9d176e4..17011154 100644 --- a/goose/extractors/content.py +++ b/goose/extractors/content.py @@ -28,6 +28,7 @@ KNOWN_ARTICLE_CONTENT_TAGS = [ {'attr': 'itemprop', 'value': 'articleBody'}, {'attr': 'class', 'value': 'post-content'}, + {'attr': 'class', 'value': 'g-content'}, {'tag': 'article'}, ] From 6849ce668947d449855b6014278a6c0824b945b2 Mon Sep 17 00:00:00 2001 From: Lol4to Date: Thu, 14 Jan 2016 16:05:01 +0300 Subject: [PATCH 22/44] Fix clean issue --- goose/crawler.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/goose/crawler.py b/goose/crawler.py index 7db1266b..4b972af3 100644 --- a/goose/crawler.py +++ b/goose/crawler.py @@ -164,7 +164,7 @@ def crawl(self, crawl_candidate): if not isinstance(self.article.doc, list): self.article.doc = [self.cleaner.clean(self.article.doc)] else: - self.article.doc = list(map(lambda doc1: self.cleaner.clean(doc1), self.article.doc)) + self.article.doc = list(map(lambda doc1: self.cleaner.clean(deepcopy(doc1)), self.article.doc)) # big stuff self.article.top_node = self.extractor.calculate_best_node() From 024760c6f57d5c5c37fffed02a3030d5d2965647 Mon Sep 17 00:00:00 2001 From: Lol4to Date: Thu, 14 Jan 2016 16:46:39 +0300 Subject: [PATCH 23/44] Do not stop on first found text candidate --- goose/extractors/content.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/goose/extractors/content.py b/goose/extractors/content.py index 17011154..e4f9f679 100644 --- a/goose/extractors/content.py +++ b/goose/extractors/content.py @@ -26,6 +26,7 @@ KNOWN_ARTICLE_CONTENT_TAGS = [ + {'attr': 'class', 'value': 'short-story'}, {'attr': 'itemprop', 'value': 'articleBody'}, {'attr': 'class', 'value': 'post-content'}, {'attr': 'class', 'value': 'g-content'}, @@ -48,12 +49,13 @@ def get_language(self): return self.config.target_language def get_known_article_tags(self): + nodes = [] for item in KNOWN_ARTICLE_CONTENT_TAGS: - nodes = self.parser.getElementsByTag( - self.article.doc, - **item) - if len(nodes): - return nodes + nodes.extend(self.parser.getElementsByTag( + self.article.doc, + **item)) + if len(nodes): + return nodes return None def is_articlebody(self, node): From a5bd141b350360361b8fc1147ce1b27b95fe7727 Mon Sep 17 00:00:00 2001 From: Lol4to Date: Thu, 14 Jan 2016 16:47:10 +0300 Subject: [PATCH 24/44] Add all sort of disclaimers to the trash --- goose/cleaners.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/goose/cleaners.py b/goose/cleaners.py index 791b232f..2ad975d0 100644 --- a/goose/cleaners.py +++ b/goose/cleaners.py @@ -50,7 +50,7 @@ def __init__(self, config, article): "|communitypromo|runaroundLeft|subscribe|vcard|articleheadings" "|date|^print$|popup|author-dropdown|tools|socialtools|byline" 
"|konafilter|KonaFilter|breadcrumbs|^fn$|wp-caption-text" - "|legende|ajoutVideo|timestamp|js_replies" + "|legende|ajoutVideo|timestamp|js_replies|disclaim" ) self.regexp_namespace = "http://exslt.org/regular-expressions" self.nauthy_ids_re = "//*[re:test(@id, '%s', 'i')]" % self.remove_nodes_re From da6cc1d3f5f23d51d3cf448d2a7e0ed956a191ae Mon Sep 17 00:00:00 2001 From: Lol4to Date: Wed, 20 Jan 2016 18:25:45 +0300 Subject: [PATCH 25/44] Fix tests crash with requests Moving to requests as http library made test mocks, that used urllib mocking, incorrect This commit fixes tests by using mock_requests library for mocking, instead of urllib one. --- requirements.txt | 1 + setup.py | 2 +- tests/extractors/base.py | 18 +++++++++++++++--- 3 files changed, 17 insertions(+), 4 deletions(-) diff --git a/requirements.txt b/requirements.txt index 8d153935..5c57cc97 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,5 @@ requests +requests_mock Pillow lxml cssselect diff --git a/setup.py b/setup.py index 66ee40f4..f83b0960 100644 --- a/setup.py +++ b/setup.py @@ -56,7 +56,7 @@ except Exception: long_description = description -requirements = ['Pillow', 'lxml', 'cssselect', 'jieba', 'nltk', 'six', 'requests'] +requirements = ['Pillow', 'lxml', 'cssselect', 'jieba', 'nltk', 'six', 'requests', 'requests_mock'] if sys.version_info[0] == 2: requirements.append('beautifulsoup') if sys.version_info[1] < 7: diff --git a/tests/extractors/base.py b/tests/extractors/base.py index 93b3c075..10be4f49 100644 --- a/tests/extractors/base.py +++ b/tests/extractors/base.py @@ -24,6 +24,7 @@ import json import unittest import socket +import requests_mock try: import urllib2 @@ -42,7 +43,7 @@ # Response -class MockResponse(): +class MockResponse: """\ Base mock response class """ @@ -140,6 +141,14 @@ class TestExtractionBase(BaseMockTests): """ callback = MockResponseExtractors + def setUp(self): + # patch DNS + self.original_getaddrinfo = socket.getaddrinfo + socket.getaddrinfo = self.new_getaddrinfo + + def tearDown(self): + socket.getaddrinfo = self.original_getaddrinfo + def getRawHtml(self): test, suite, module, cls, func = self.id().split('.') path = os.path.join( @@ -211,8 +220,11 @@ def runArticleAssertions(self, article, fields): self.assertEqual(expected_value, result_value, msg=msg) def extract(self, instance): - article = instance.extract(url=self.data['url']) - return article + url = self.data['url'] + with requests_mock.mock() as m: + m.get(url, content=self.getRawHtml().encode('utf-8')) + article = instance.extract(url=url) + return article def getConfig(self): config = Configuration() From c64b245a8a6267c900365cc26070344afad30959 Mon Sep 17 00:00:00 2001 From: Lol4to Date: Wed, 20 Jan 2016 18:27:12 +0300 Subject: [PATCH 26/44] Fix deprecated class warning --- goose/outputformatters.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/goose/outputformatters.py b/goose/outputformatters.py index 808f2eee..00df6c3c 100644 --- a/goose/outputformatters.py +++ b/goose/outputformatters.py @@ -20,7 +20,9 @@ See the License for the specific language governing permissions and limitations under the License. 
""" +import html from six.moves.html_parser import HTMLParser +import sys from goose.text import innerTrim @@ -67,12 +69,14 @@ def get_formatted_text(self): self.remove_fewwords_paragraphs() return self.convert_to_text() + _text_parser = HTMLParser() if sys.version_info[0] == 2 else html + def convert_to_text(self): txts = [] for node in list(self.get_top_node()): txt = self.parser.getText(node) if txt: - txt = HTMLParser().unescape(txt) + txt = self._text_parser.unescape(txt) txt_lis = innerTrim(txt).split(r'\n') txts.extend(txt_lis) return '\n\n'.join(txts) From c7a207d0bb0ca2dab2af5a455ec73aca598ec2c9 Mon Sep 17 00:00:00 2001 From: Lol4to Date: Wed, 20 Jan 2016 19:16:21 +0300 Subject: [PATCH 27/44] Use requests for image extraction --- goose/utils/images.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/goose/utils/images.py b/goose/utils/images.py index 31a55d61..77cd0ac3 100644 --- a/goose/utils/images.py +++ b/goose/utils/images.py @@ -22,8 +22,7 @@ """ import hashlib import os - -from six.moves.urllib.request import urlopen, Request +import requests from PIL import Image @@ -118,9 +117,8 @@ def clean_src_string(self, src): @classmethod def fetch(self, http_client, src): try: - req = Request(src) - f = urlopen(req) - data = f.read() + f = requests.get(src) + data = f.content return data except Exception: return None From 320fa7da429bad2c65afa6b0c5733f53a910fa27 Mon Sep 17 00:00:00 2001 From: Lol4to Date: Wed, 20 Jan 2016 19:16:52 +0300 Subject: [PATCH 28/44] Fix image extraction mocker --- tests/extractors/base.py | 57 ++++++-------------------------------- tests/extractors/images.py | 34 +++++++++++++---------- 2 files changed, 28 insertions(+), 63 deletions(-) diff --git a/tests/extractors/base.py b/tests/extractors/base.py index 10be4f49..a154babc 100644 --- a/tests/extractors/base.py +++ b/tests/extractors/base.py @@ -53,48 +53,8 @@ class MockResponse: def __init__(self, cls): self.cls = cls - def content(self, req): - return "response" - - def response(self, req): - data = self.content(req) - url = req.get_full_url() - if isinstance(data, six.binary_type): - resp = urllib2.addinfourl(BytesIO(data), data, url) - else: - resp = urllib2.addinfourl(StringIO(data), data, url) - resp.code = self.code - resp.msg = self.msg - return resp - - -class MockHTTPHandler(urllib2.HTTPHandler, urllib2.HTTPSHandler): - """\ - Mocked HTTPHandler in order to query APIs locally - """ - cls = None - - def https_open(self, req): - return self.http_open(req) - - def http_open(self, req): - r = self.cls.callback(self.cls) - return r.response(req) - - @staticmethod - def patch(cls): - opener = urllib2.build_opener(MockHTTPHandler) - urllib2.install_opener(opener) - # dirty ! 
- for h in opener.handlers: - if isinstance(h, MockHTTPHandler): - h.cls = cls - return [h for h in opener.handlers if isinstance(h, MockHTTPHandler)][0] - - @staticmethod - def unpatch(): - # urllib2 - urllib2._opener = None + def contents(self): + pass class BaseMockTests(unittest.TestCase): @@ -107,10 +67,8 @@ def setUp(self): # patch DNS self.original_getaddrinfo = socket.getaddrinfo socket.getaddrinfo = self.new_getaddrinfo - MockHTTPHandler.patch(self) def tearDown(self): - MockHTTPHandler.unpatch() # DNS socket.getaddrinfo = self.original_getaddrinfo @@ -122,7 +80,7 @@ def _get_current_testname(self): class MockResponseExtractors(MockResponse): - def content(self, req): + def contents(self): test, suite, module, cls, func = self.cls.id().split('.') path = os.path.join( os.path.dirname(CURRENT_PATH), @@ -132,7 +90,7 @@ def content(self, req): "%s.html" % func) path = os.path.abspath(path) content = FileHelper.loadResourceFile(path) - return content + yield self.cls.data['url'], content.encode('utf-8') class TestExtractionBase(BaseMockTests): @@ -220,10 +178,11 @@ def runArticleAssertions(self, article, fields): self.assertEqual(expected_value, result_value, msg=msg) def extract(self, instance): - url = self.data['url'] + article_url = self.data['url'] with requests_mock.mock() as m: - m.get(url, content=self.getRawHtml().encode('utf-8')) - article = instance.extract(url=url) + for url, content in self.callback(self).contents(): + m.get(url, content=content) + article = instance.extract(url=article_url) return article def getConfig(self): diff --git a/tests/extractors/images.py b/tests/extractors/images.py index 9a9712a1..5fce71b5 100644 --- a/tests/extractors/images.py +++ b/tests/extractors/images.py @@ -42,8 +42,8 @@ class MockResponseImage(MockResponse): - def image_content(self, req): - md5_hash = hashlib.md5(req.get_full_url().encode("utf-8")).hexdigest() + def image_content(self, url): + md5_hash = hashlib.md5(url.encode('utf-8')).hexdigest() current_test = self.cls._get_current_testname() path = os.path.join( os.path.dirname(CURRENT_PATH), @@ -53,12 +53,15 @@ def image_content(self, req): current_test, md5_hash) path = os.path.abspath(path) - f = open(path, 'rb') - content = f.read() - f.close() - return content - - def html_content(self, req): + try: + f = open(path, 'rb') + content = f.read() + f.close() + return content + except Exception: + return None + + def html_content(self): current_test = self.cls._get_current_testname() path = os.path.join( os.path.dirname(CURRENT_PATH), @@ -68,12 +71,15 @@ def html_content(self, req): current_test, "%s.html" % current_test) path = os.path.abspath(path) - return FileHelper.loadResourceFile(path) - - def content(self, req): - if self.cls.data['url'] == req.get_full_url(): - return self.html_content(req) - return self.image_content(req) + return FileHelper.loadResourceFile(path).encode('utf-8') + + def contents(self): + yield self.cls.data['url'], self.html_content() + img_url = self.cls.data['expected']['top_image']['src'] + if img_url: + print(img_url) + yield img_url, self.image_content(img_url) + # self.image_content() class ImageExtractionTests(TestExtractionBase): From c0d6b69c1b937a0412d3a317a9ad781069b8eeca Mon Sep 17 00:00:00 2001 From: Lol4t0 Date: Wed, 20 Jan 2016 20:38:50 +0300 Subject: [PATCH 29/44] Move test dependencies out of main deps --- requirements.txt | 1 - setup.py | 34 +++++++++++++++++----------------- 2 files changed, 17 insertions(+), 18 deletions(-) diff --git a/requirements.txt b/requirements.txt index 
5c57cc97..8d153935 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,4 @@ requests -requests_mock Pillow lxml cssselect diff --git a/setup.py b/setup.py index f83b0960..ba134fb0 100644 --- a/setup.py +++ b/setup.py @@ -56,26 +56,26 @@ except Exception: long_description = description -requirements = ['Pillow', 'lxml', 'cssselect', 'jieba', 'nltk', 'six', 'requests', 'requests_mock'] +requirements = ['Pillow', 'lxml', 'cssselect', 'jieba', 'nltk', 'six', 'requests'] if sys.version_info[0] == 2: requirements.append('beautifulsoup') if sys.version_info[1] < 7: requirements.append('unittest2') - setup(name='goose-extractor', - version=version.__version__, - description=description, - long_description=long_description, - keywords='scrapping, extractor, web scrapping', - classifiers=CLASSIFIERS, - author='Xavier Grangier', - author_email='grangier@gmail.com', - url='https://github.com/grangier/python-goose', - license='Apache', - packages=find_packages(), - include_package_data=True, - zip_safe=False, - install_requires=requirements, - test_suite="tests" -) + version=version.__version__, + description=description, + long_description=long_description, + keywords='scrapping, extractor, web scrapping', + classifiers=CLASSIFIERS, + author='Xavier Grangier', + author_email='grangier@gmail.com', + url='https://github.com/grangier/python-goose', + license='Apache', + packages=find_packages(), + include_package_data=True, + zip_safe=False, + install_requires=requirements, + test_suite="tests", + tests_require=['requests_mock'] + ) From 4724f829e50a6b4ec029a87ca94a3224435180cc Mon Sep 17 00:00:00 2001 From: Lol4t0 Date: Wed, 20 Jan 2016 21:31:02 +0300 Subject: [PATCH 30/44] Undo wrong edit --- goose/outputformatters.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/goose/outputformatters.py b/goose/outputformatters.py index 00df6c3c..d2cb5019 100644 --- a/goose/outputformatters.py +++ b/goose/outputformatters.py @@ -20,9 +20,7 @@ See the License for the specific language governing permissions and limitations under the License. """ -import html from six.moves.html_parser import HTMLParser -import sys from goose.text import innerTrim @@ -69,7 +67,7 @@ def get_formatted_text(self): self.remove_fewwords_paragraphs() return self.convert_to_text() - _text_parser = HTMLParser() if sys.version_info[0] == 2 else html + _text_parser = HTMLParser() def convert_to_text(self): txts = [] From 0398cd1c3110075e05d7eed56962f387c12bc990 Mon Sep 17 00:00:00 2001 From: Lol4t0 Date: Wed, 20 Jan 2016 21:32:50 +0300 Subject: [PATCH 31/44] Drop xml declaration in test data It is not clear why it was there in the first place, as valid HTML does not contain such a header. Again, this is unrelated to the test itself. --- tests/parsers.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/parsers.py b/tests/parsers.py index e5f17164..6e5e1986 100644 --- a/tests/parsers.py +++ b/tests/parsers.py @@ -266,7 +266,7 @@ def test_encoding(self): Please use bytes input or XML fragments without declaration." Test for this case.
""" - html = u""" + html = u""" """ html += u'' From 397465cd84347612256122563b97405da5f03870 Mon Sep 17 00:00:00 2001 From: Lol4t0 Date: Wed, 20 Jan 2016 21:37:41 +0300 Subject: [PATCH 32/44] Move `unittest2` to test dependencies --- setup.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/setup.py b/setup.py index ba134fb0..6592f0d8 100644 --- a/setup.py +++ b/setup.py @@ -57,10 +57,11 @@ long_description = description requirements = ['Pillow', 'lxml', 'cssselect', 'jieba', 'nltk', 'six', 'requests'] +test_requirements = ['requests_mock'] if sys.version_info[0] == 2: requirements.append('beautifulsoup') if sys.version_info[1] < 7: - requirements.append('unittest2') + test_requirements.append('unittest2') setup(name='goose-extractor', version=version.__version__, @@ -77,5 +78,5 @@ zip_safe=False, install_requires=requirements, test_suite="tests", - tests_require=['requests_mock'] + tests_require=test_requirements ) From 2900678e479b636ca64573a5c4dc7040116ae018 Mon Sep 17 00:00:00 2001 From: Lol4t0 Date: Wed, 20 Jan 2016 21:38:58 +0300 Subject: [PATCH 33/44] Try to build with python 3.5 --- .travis.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.travis.yml b/.travis.yml index a242d0ac..0eefe3a2 100644 --- a/.travis.yml +++ b/.travis.yml @@ -4,6 +4,7 @@ python: - 2.6 - 2.7 - 3.4 + - 3.5 install: - pip install jieba From fd0712a100c879184d1a66e7ee2733098cc007d7 Mon Sep 17 00:00:00 2001 From: Lol4t0 Date: Wed, 20 Jan 2016 22:08:00 +0300 Subject: [PATCH 34/44] Use same http session for text & images This benefits to automatic cookie handling, keep alive connection and may be some other features --- goose/crawler.py | 12 +++++++----- goose/extractors/images.py | 7 +++---- goose/network.py | 4 ++-- goose/utils/images.py | 3 +-- 4 files changed, 13 insertions(+), 13 deletions(-) diff --git a/goose/crawler.py b/goose/crawler.py index 4b972af3..e5713b57 100644 --- a/goose/crawler.py +++ b/goose/crawler.py @@ -39,7 +39,7 @@ from goose.cleaners import StandardDocumentCleaner from goose.outputformatters import StandardOutputFormatter -from goose.network import HtmlFetcher +from goose.network import NetworkFetcher class CrawlCandidate(object): @@ -99,11 +99,13 @@ def __init__(self, config): # title extractor self.title_extractor = self.get_title_extractor() + # html fetcher + self.fetcher = NetworkFetcher(self.config) + # image extrator self.image_extractor = self.get_image_extractor() - # html fetcher - self.htmlfetcher = HtmlFetcher(self.config) + # TODO : log prefix self.logPrefix = "crawler:" @@ -215,7 +217,7 @@ def get_html(self, crawl_candidate, parsing_candidate): return crawl_candidate.raw_html # fetch HTML - html = self.htmlfetcher.get_html(parsing_candidate.url) + html = self.fetcher.fetch(parsing_candidate.url) return html def get_metas_extractor(self): @@ -243,7 +245,7 @@ def get_title_extractor(self): return TitleExtractor(self.config, self.article) def get_image_extractor(self): - return ImageExtractor(self.config, self.article) + return ImageExtractor(self.fetcher, self.config, self.article) def get_video_extractor(self): return VideoExtractor(self.config, self.article) diff --git a/goose/extractors/images.py b/goose/extractors/images.py index f258aead..ebaf6935 100644 --- a/goose/extractors/images.py +++ b/goose/extractors/images.py @@ -48,9 +48,10 @@ def __init__(self, node, parent_depth, sibling_depth): class ImageExtractor(BaseExtractor): - def __init__(self, config, article): + def __init__(self, fetcher, config, article): super(ImageExtractor, 
--- goose/crawler.py | 12 +++++++----- goose/extractors/images.py | 7 +++---- goose/network.py | 4 ++-- goose/utils/images.py | 3 +-- 4 files changed, 13 insertions(+), 13 deletions(-) diff --git a/goose/crawler.py b/goose/crawler.py index 4b972af3..e5713b57 100644 --- a/goose/crawler.py +++ b/goose/crawler.py @@ -39,7 +39,7 @@ from goose.cleaners import StandardDocumentCleaner from goose.outputformatters import StandardOutputFormatter -from goose.network import HtmlFetcher +from goose.network import NetworkFetcher class CrawlCandidate(object): @@ -99,11 +99,13 @@ def __init__(self, config): # title extractor self.title_extractor = self.get_title_extractor() + # html fetcher + self.fetcher = NetworkFetcher(self.config) + # image extrator self.image_extractor = self.get_image_extractor() - # html fetcher - self.htmlfetcher = HtmlFetcher(self.config) + # TODO : log prefix self.logPrefix = "crawler:" @@ -215,7 +217,7 @@ def get_html(self, crawl_candidate, parsing_candidate): return crawl_candidate.raw_html # fetch HTML - html = self.htmlfetcher.get_html(parsing_candidate.url) + html = self.fetcher.fetch(parsing_candidate.url) return html def get_metas_extractor(self): @@ -243,7 +245,7 @@ def get_title_extractor(self): return TitleExtractor(self.config, self.article) def get_image_extractor(self): - return ImageExtractor(self.config, self.article) + return ImageExtractor(self.fetcher, self.config, self.article) def get_video_extractor(self): return VideoExtractor(self.config, self.article) diff --git a/goose/extractors/images.py b/goose/extractors/images.py index f258aead..ebaf6935 100644 --- a/goose/extractors/images.py +++ b/goose/extractors/images.py @@ -48,9 +48,10 @@ def __init__(self, node, parent_depth, sibling_depth): class ImageExtractor(BaseExtractor): - def __init__(self, config, article): + def __init__(self, fetcher, config, article): super(ImageExtractor, self).__init__(config, article) + self.fetcher = fetcher self.custom_site_mapping = {} self.load_customesite_mapping() @@ -333,9 +334,7 @@ def get_local_image(self, src): """\ returns the bytes of the image file on disk """ - local_image = ImageUtils.store_image(None, - self.link_hash, src, self.config) - return local_image + return ImageUtils.store_image(self.fetcher, self.link_hash, src, self.config) def get_clean_domain(self): if self.article.domain: diff --git a/goose/network.py b/goose/network.py index c1b15e6b..c2bb8969 100644 --- a/goose/network.py +++ b/goose/network.py @@ -24,7 +24,7 @@ import requests -class HtmlFetcher(object): +class NetworkFetcher(object): def __init__(self, config): self.config = config @@ -36,7 +36,7 @@ def __init__(self, config): def get_url(self): return self._url - def get_html(self, url): + def fetch(self, url): # utf-8 encode unicode url if isinstance(url, six.text_type) and six.PY2: url = url.encode('utf-8') diff --git a/goose/utils/images.py b/goose/utils/images.py index 77cd0ac3..a98c5af4 100644 --- a/goose/utils/images.py +++ b/goose/utils/images.py @@ -22,7 +22,6 @@ """ import hashlib import os -import requests from PIL import Image @@ -117,7 +116,7 @@ def clean_src_string(self, src): @classmethod def fetch(self, http_client, src): try: - f = requests.get(src) + f = http_client.get(src) data = f.content return data except Exception: return None From 60ab80ae10d97dbea58c3544c5dc6d25d341763c Mon Sep 17 00:00:00 2001 From: Lol4t0 Date: Wed, 20 Jan 2016 22:54:52 +0300 Subject: [PATCH 35/44] Fix function name --- goose/utils/images.py | 2 +- tests/extractors/images.py | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/goose/utils/images.py b/goose/utils/images.py index a98c5af4..cb92b9b3 100644 --- a/goose/utils/images.py +++ b/goose/utils/images.py @@ -116,7 +116,7 @@ def clean_src_string(self, src): @classmethod def fetch(self, http_client, src): try: - f = http_client.get(src) + f = http_client.fetch(src) data = f.content return data except Exception: return None diff --git a/tests/extractors/images.py b/tests/extractors/images.py index 5fce71b5..9c089fe2 100644 --- a/tests/extractors/images.py +++ b/tests/extractors/images.py @@ -77,7 +77,6 @@ def contents(self): yield self.cls.data['url'], self.html_content() img_url = self.cls.data['expected']['top_image']['src'] if img_url: - print(img_url) yield img_url, self.image_content(img_url) # self.image_content() From b61bd4c942062b0c79faf3d55a1482769e135717 Mon Sep 17 00:00:00 2001 From: Lol4t0 Date: Wed, 20 Jan 2016 23:21:35 +0300 Subject: [PATCH 36/44] Fix API --- goose/utils/images.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/goose/utils/images.py b/goose/utils/images.py index cb92b9b3..9c12a1f8 100644 --- a/goose/utils/images.py +++ b/goose/utils/images.py @@ -116,8 +116,6 @@ def clean_src_string(self, src): @classmethod def fetch(self, http_client, src): try: - f = http_client.fetch(src) - data = f.content - return data + return http_client.fetch(src) except Exception: return None From f712600a02218d7fe5e7b938153d82ce46cdeb09 Mon Sep 17 00:00:00 2001 From: Lol4to Date: Thu, 21 Jan 2016 11:55:03 +0300 Subject: [PATCH 37/44] Remove obsolete known issues After moving to the requests HTTP backend, cookies are handled correctly.
The test URL http://www.nytimes.com/2013/08/18/world/middleeast/pressure-by-us-failed-to-sway-egypts-leaders.html?hp was checked and works. --- README.rst | 20 +------------------- 1 file changed, 1 insertion(+), 19 deletions(-) diff --git a/README.rst b/README.rst index 5dc8ab0b..bf12c3dd 100644 --- a/README.rst +++ b/README.rst @@ -180,7 +180,7 @@ class. Goose in Korean ----------------- +--------------- In order to use Goose in Korean you have to use the StopWordsKorean class. @@ -197,24 +197,6 @@ class. 14년째 세계 각국의 통신·안전·전파 규격 시험과 인증 한 우물만 파고 있는 이 회사 박채규 대표가 만나기로 한 주인공이다. 그는 전기전자·무선통신·자동차 전장품 분야에 - -Known issues ------------- - -- There are some issues with unicode URLs. -- Cookie handling : Some websites need cookie handling. At the moment the only work around is to use the raw_html extraction. For instance: - - >>> import urllib2 - >>> import goose - >>> url = "http://www.nytimes.com/2013/08/18/world/middleeast/pressure-by-us-failed-to-sway-egypts-leaders.html?hp" - >>> opener = urllib2.build_opener(urllib2.HTTPCookieProcessor()) - >>> response = opener.open(url) - >>> raw_html = response.read() - >>> g = goose.Goose() - >>> a = g.extract(raw_html=raw_html) - >>> a.cleaned_text - u'CAIRO \u2014 For a moment, at least, American and European diplomats trying to defuse the volatile standoff in Egypt thought they had a breakthrough.\n\nAs t' - TODO ---- From abd427bf2e1a3405fefeac6532b5f13563ed450b Mon Sep 17 00:00:00 2001 From: Lol4to Date: Thu, 21 Jan 2016 11:56:01 +0300 Subject: [PATCH 38/44] Adjust classifiers Python 3.4 and Python 3.5 added. --- setup.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/setup.py b/setup.py index 6592f0d8..c4d1fabf 100644 --- a/setup.py +++ b/setup.py @@ -43,6 +43,8 @@ 'Programming Language :: Python :: 2.6', 'Programming Language :: Python :: 2.7', 'Programming Language :: Python :: 3', + 'Programming Language :: Python :: 3.4', + 'Programming Language :: Python :: 3.5', 'Topic :: Internet', 'Topic :: Utilities', 'Topic :: Software Development :: Libraries :: Python Modules'] From 9632746b3f5e31a84af903a8055cf45d392441b2 Mon Sep 17 00:00:00 2001 From: Lol4to Date: Thu, 21 Jan 2016 12:05:50 +0300 Subject: [PATCH 39/44] Draft new release 1.0.29 * requests is used for images; the same HTTP session is used for all requests. * Analyze all possible text root nodes and select the best one; do not stop at the first text root node candidate. * Improve text selection filters. --- goose/version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/goose/version.py b/goose/version.py index ee492dcd..4f2a84c1 100644 --- a/goose/version.py +++ b/goose/version.py @@ -21,5 +21,5 @@ limitations under the License.
""" -version_info = (1, 0, 28) +version_info = (1, 0, 29) __version__ = ".".join(map(str, version_info)) From 8644cfe6bfa6d8e136ea9c91993f68d218edb09b Mon Sep 17 00:00:00 2001 From: Lol4to Date: Thu, 21 Jan 2016 14:55:53 +0300 Subject: [PATCH 40/44] Remove unused import --- tests/extractors/base.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/tests/extractors/base.py b/tests/extractors/base.py index a154babc..cdf6cb32 100644 --- a/tests/extractors/base.py +++ b/tests/extractors/base.py @@ -31,9 +31,6 @@ except ImportError: import urllib.request as urllib2 -import six -from six import StringIO, BytesIO - from goose import Goose from goose.utils import FileHelper from goose.configuration import Configuration From 118d220323d75963a27aef26e9299d7cab18451e Mon Sep 17 00:00:00 2001 From: Lol4to Date: Thu, 21 Jan 2016 15:30:52 +0300 Subject: [PATCH 41/44] Make list of known DOM nodes configurable Config parameter is `known_context_patterns' Default: { 'known_context_patterns': [ {'attr': 'class', 'value': 'short-story'}, {'attr': 'itemprop', 'value': 'articleBody'}, {'attr': 'class', 'value': 'post-content'}, {'attr': 'class', 'value': 'g-content'}, {'tag': 'article'}, ] } --- goose/configuration.py | 11 +++++++++++ goose/extractors/content.py | 13 ++----------- 2 files changed, 13 insertions(+), 11 deletions(-) diff --git a/goose/configuration.py b/goose/configuration.py index 4913f699..e2db3fd7 100644 --- a/goose/configuration.py +++ b/goose/configuration.py @@ -39,6 +39,14 @@ if six.PY2: AVAILABLE_PARSERS['soup'] = ParserSoup +KNOWN_ARTICLE_CONTENT_PATTERNS = [ + {'attr': 'class', 'value': 'short-story'}, + {'attr': 'itemprop', 'value': 'articleBody'}, + {'attr': 'class', 'value': 'post-content'}, + {'attr': 'class', 'value': 'g-content'}, + {'tag': 'article'}, +] + class Configuration(object): @@ -104,6 +112,9 @@ def __init__(self): # http timeout self.http_timeout = HTTP_DEFAULT_TIMEOUT + # known context patterns. 
--- goose/configuration.py | 11 +++++++++++ goose/extractors/content.py | 13 ++----------- 2 files changed, 13 insertions(+), 11 deletions(-) diff --git a/goose/configuration.py b/goose/configuration.py index 4913f699..e2db3fd7 100644 --- a/goose/configuration.py +++ b/goose/configuration.py @@ -39,6 +39,14 @@ if six.PY2: AVAILABLE_PARSERS['soup'] = ParserSoup +KNOWN_ARTICLE_CONTENT_PATTERNS = [ + {'attr': 'class', 'value': 'short-story'}, + {'attr': 'itemprop', 'value': 'articleBody'}, + {'attr': 'class', 'value': 'post-content'}, + {'attr': 'class', 'value': 'g-content'}, + {'tag': 'article'}, +] + class Configuration(object): @@ -104,6 +112,9 @@ def __init__(self): # http timeout self.http_timeout = HTTP_DEFAULT_TIMEOUT + # known context patterns. Goose at first will search context at dom nodes, qualifying these patterns + self.known_context_patterns = KNOWN_ARTICLE_CONTENT_PATTERNS + def get_parser(self): return AVAILABLE_PARSERS[self.parser_class] diff --git a/goose/extractors/content.py b/goose/extractors/content.py index e4f9f679..433ed0c9 100644 --- a/goose/extractors/content.py +++ b/goose/extractors/content.py @@ -25,15 +25,6 @@ from goose.extractors import BaseExtractor -KNOWN_ARTICLE_CONTENT_TAGS = [ - {'attr': 'class', 'value': 'short-story'}, - {'attr': 'itemprop', 'value': 'articleBody'}, - {'attr': 'class', 'value': 'post-content'}, - {'attr': 'class', 'value': 'g-content'}, - {'tag': 'article'}, -] - - class ContentExtractor(BaseExtractor): def get_language(self): @@ -50,7 +41,7 @@ def get_language(self): def get_known_article_tags(self): nodes = [] - for item in KNOWN_ARTICLE_CONTENT_TAGS: + for item in self.config.known_context_patterns: nodes.extend(self.parser.getElementsByTag( self.article.doc, **item)) @@ -59,7 +50,7 @@ def is_articlebody(self, node): - for item in KNOWN_ARTICLE_CONTENT_TAGS: + for item in self.config.known_context_patterns: # attribute if "attr" in item and "value" in item: if self.parser.getAttribute(node, item['attr']) == item['value']: From 28de450c00ab1e242f2acf6a8c686592678c0eff Mon Sep 17 00:00:00 2001 From: Lol4to Date: Thu, 21 Jan 2016 15:33:09 +0300 Subject: [PATCH 42/44] Use http timeout provided by configuration When performing network requests, use the request timeout provided by the goose configuration. --- goose/network.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/goose/network.py b/goose/network.py index c2bb8969..793d14fe 100644 --- a/goose/network.py +++ b/goose/network.py @@ -41,7 +41,7 @@ def fetch(self, url): if isinstance(url, six.text_type) and six.PY2: url = url.encode('utf-8') - response = self._connection.get(url) + response = self._connection.get(url, timeout=self.config.http_timeout) if response.ok: self._url = response.url text = response.content From 67858eecea593b49baebe78a8d2eb1cb4ca19a0a Mon Sep 17 00:00:00 2001 From: Lol4to Date: Tue, 26 Jan 2016 17:17:46 +0300 Subject: [PATCH 43/44] Raise exception on network error in strict mode Swallowing errors makes it difficult to understand whether something went wrong with the network, with goose, or with the target resource. So a strict mode (now the default) is introduced: in this mode goose raises an exception instead of returning an empty response.
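For illustration only (an editor's sketch, not part of this patch): how a caller might handle the new behavior, using the NetworkError and strict flag introduced below. The URL is a placeholder:

    from goose import Goose
    from goose.configuration import Configuration
    from goose.exceptions import NetworkError

    config = Configuration()
    config.strict = True  # the new default; set False for the old behavior
    g = Goose(config)
    try:
        article = g.extract(url='http://example.com/missing-page.html')
    except NetworkError as e:
        print('fetch failed: %s %s' % (e.status_code, e.reason))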
--- goose/configuration.py | 3 +++ goose/exceptions.py | 3 +++ goose/network.py | 8 ++++++++ 3 files changed, 14 insertions(+) create mode 100644 goose/exceptions.py diff --git a/goose/configuration.py b/goose/configuration.py index e2db3fd7..7d83a34f 100644 --- a/goose/configuration.py +++ b/goose/configuration.py @@ -115,6 +115,9 @@ def __init__(self): # known context patterns. Goose at first will search context at dom nodes, qualifying these patterns self.known_context_patterns = KNOWN_ARTICLE_CONTENT_PATTERNS + # Strict mode. Generate exceptions on errors instead of swallowing them + self.strict = True + def get_parser(self): return AVAILABLE_PARSERS[self.parser_class] diff --git a/goose/exceptions.py b/goose/exceptions.py new file mode 100644 index 00000000..b75f3183 --- /dev/null +++ b/goose/exceptions.py @@ -0,0 +1,3 @@ +from .network import NetworkError + +__all__ = ['NetworkError'] diff --git a/goose/network.py b/goose/network.py index 793d14fe..928fd4b5 100644 --- a/goose/network.py +++ b/goose/network.py @@ -24,6 +24,12 @@ import requests +class NetworkError(RuntimeError): + def __init__(self, status_code, reason): + self.reason = reason + self.status_code = status_code + + class NetworkFetcher(object): def __init__(self, config): @@ -48,5 +54,7 @@ def fetch(self, url): else: self._url = None text = None + if self.config.strict: + raise NetworkError(response.status_code, response.reason) return text From 69e5d80bb52eccba0b8c05cac3634b0837f26c4d Mon Sep 17 00:00:00 2001 From: Lol4to Date: Tue, 26 Jan 2016 17:35:08 +0300 Subject: [PATCH 44/44] Do not remove other headers when setting `User-agent` --- goose/network.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/goose/network.py b/goose/network.py index 928fd4b5..2aca4873 100644 --- a/goose/network.py +++ b/goose/network.py @@ -35,7 +35,7 @@ class NetworkFetcher(object): def __init__(self, config): self.config = config self._connection = requests.Session() - self._connection.headers = {'User-agent': self.config.browser_user_agent} + self._connection.headers['User-agent'] = self.config.browser_user_agent self._url = None
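Editor's note on this last patch, for illustration only: a requests.Session() starts with useful default headers (User-Agent, Accept-Encoding, Accept, Connection). Assigning a whole new dict discards them, while assigning a single key keeps them. A minimal sketch using plain requests, with a made-up user-agent string; no goose API involved:

    import requests

    s = requests.Session()
    s.headers['User-agent'] = 'Goose/1.0.29'  # hypothetical UA string
    assert 'Accept-Encoding' in s.headers  # other defaults survive

    s_old = requests.Session()
    s_old.headers = {'User-agent': 'Goose/1.0.29'}  # the old assignment
    assert 'Accept-Encoding' not in s_old.headers  # defaults lost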