diff --git a/.travis.yml b/.travis.yml index 2f2c722e..a242d0ac 100644 --- a/.travis.yml +++ b/.travis.yml @@ -3,9 +3,10 @@ language: python python: - 2.6 - 2.7 + - 3.4 install: - - pip install -r requirements.txt --use-mirrors + - pip install jieba - python setup.py install script: python setup.py test diff --git a/goose/__init__.py b/goose/__init__.py index 409b5732..d1cd6da8 100644 --- a/goose/__init__.py +++ b/goose/__init__.py @@ -21,7 +21,6 @@ limitations under the License. """ import os -import platform from tempfile import mkstemp from goose.version import version_info, __version__ @@ -64,9 +63,12 @@ def crawl(self, crawl_candiate): try: crawler = Crawler(self.config) article = crawler.crawl(crawl_candiate) - except (UnicodeDecodeError, ValueError): - self.config.parser_class = parsers[0] - return self.crawl(crawl_candiate) + except (UnicodeDecodeError, ValueError) as e: + if parsers: + self.config.parser_class = parsers[0] + return self.crawl(crawl_candiate) + else: + raise e return article def initialize(self): diff --git a/goose/cleaners.py b/goose/cleaners.py index c1384ee0..9ab45b6d 100644 --- a/goose/cleaners.py +++ b/goose/cleaners.py @@ -20,6 +20,8 @@ See the License for the specific language governing permissions and limitations under the License. 
""" +from __future__ import unicode_literals + from goose.utils import ReplaceSequence diff --git a/goose/configuration.py b/goose/configuration.py index fcfa5b9a..4913f699 100644 --- a/goose/configuration.py +++ b/goose/configuration.py @@ -22,6 +22,9 @@ """ import os import tempfile + +import six + from goose.text import StopWords from goose.parsers import Parser from goose.parsers import ParserSoup @@ -30,10 +33,12 @@ HTTP_DEFAULT_TIMEOUT = 30 AVAILABLE_PARSERS = { - 'lxml': Parser, - 'soup': ParserSoup, + 'lxml': Parser } +if six.PY2: + AVAILABLE_PARSERS['soup'] = ParserSoup + class Configuration(object): diff --git a/goose/extractors/content.py b/goose/extractors/content.py index e0703d55..afdc2c91 100644 --- a/goose/extractors/content.py +++ b/goose/extractors/content.py @@ -260,7 +260,7 @@ def update_score(self, node, addToScore): if score_string: current_score = int(score_string) - new_score = current_score + addToScore + new_score = current_score + int(addToScore) self.parser.setAttribute(node, "gravityScore", str(new_score)) def update_node_count(self, node, add_to_count): diff --git a/goose/extractors/images.py b/goose/extractors/images.py index 3af44f5f..f258aead 100644 --- a/goose/extractors/images.py +++ b/goose/extractors/images.py @@ -23,7 +23,7 @@ import re import os -from urlparse import urlparse, urljoin +from six.moves.urllib.parse import urlparse, urljoin from goose.extractors import BaseExtractor from goose.image import Image diff --git a/goose/extractors/metas.py b/goose/extractors/metas.py index 95acadd5..5a65aa16 100644 --- a/goose/extractors/metas.py +++ b/goose/extractors/metas.py @@ -22,8 +22,8 @@ """ import re -from urlparse import urljoin -from urlparse import urlparse + +from six.moves.urllib.parse import urlparse, urljoin from goose.extractors import BaseExtractor diff --git a/goose/image.py b/goose/image.py index 351e3396..58ddd021 100644 --- a/goose/image.py +++ b/goose/image.py @@ -46,7 +46,7 @@ def __init__(self): 
self.extraction_type = "NA" # stores how many bytes this image is. - self.bytes = long(0) + self.bytes = 0 def get_src(self): return self.src @@ -87,7 +87,7 @@ def set_mime_type(self, mime_type): class LocallyStoredImage(object): def __init__(self, src='', local_filename='', - link_hash='', bytes=long(0), file_extension='', height=0, width=0): + link_hash='', bytes=0, file_extension='', height=0, width=0): self.src = src self.local_filename = local_filename self.link_hash = link_hash diff --git a/goose/network.py b/goose/network.py index 666a7d61..2b8265ad 100644 --- a/goose/network.py +++ b/goose/network.py @@ -20,7 +20,12 @@ See the License for the specific language governing permissions and limitations under the License. """ -import urllib2 +import six + +try: + from urllib2 import urlopen, Request +except ImportError: + from urllib.request import urlopen, Request class HtmlFetcher(object): @@ -39,18 +44,14 @@ def get_url(self): def get_html(self, url): # utf-8 encode unicode url - if isinstance(url, unicode): + if isinstance(url, six.text_type) and six.PY2: url = url.encode('utf-8') # set request - self.request = urllib2.Request( - url, - headers=self.headers) + self.request = Request(url, headers=self.headers) # do request try: - self.result = urllib2.urlopen( - self.request, - timeout=self.config.http_timeout) + self.result = urlopen(self.request, timeout=self.config.http_timeout) except Exception: self.result = None diff --git a/goose/outputformatters.py b/goose/outputformatters.py index 1f8ba4bd..808f2eee 100644 --- a/goose/outputformatters.py +++ b/goose/outputformatters.py @@ -20,7 +20,8 @@ See the License for the specific language governing permissions and limitations under the License. 
""" -from HTMLParser import HTMLParser +from six.moves.html_parser import HTMLParser + from goose.text import innerTrim diff --git a/goose/parsers.py b/goose/parsers.py index a43e9b47..fab3eb31 100644 --- a/goose/parsers.py +++ b/goose/parsers.py @@ -21,11 +21,12 @@ limitations under the License. """ import lxml.html -from lxml.html import soupparser + +import six + from lxml import etree from copy import deepcopy -from goose.text import innerTrim -from goose.text import encodeValue +from goose.text import innerTrim, encodeValue, get_encodings_from_content, smart_str class Parser(object): @@ -50,13 +51,20 @@ def css_select(self, node, selector): @classmethod def fromstring(self, html): - html = encodeValue(html) - self.doc = lxml.html.fromstring(html) + encoding = get_encodings_from_content(html) + encoding = encoding and encoding[0] or None + if not encoding: + html = encodeValue(html) + self.doc = lxml.html.fromstring(html) + else: + html = smart_str(html, encoding=encoding) + parser = lxml.html.HTMLParser(encoding=encoding) + self.doc = lxml.html.fromstring(html, parser=parser) return self.doc @classmethod def nodeToString(self, node): - return etree.tostring(node) + return etree.tostring(node, encoding=six.text_type) @classmethod def replaceTag(self, node, tag): @@ -239,6 +247,7 @@ class ParserSoup(Parser): @classmethod def fromstring(self, html): + from lxml.html import soupparser html = encodeValue(html) self.doc = soupparser.fromstring(html) return self.doc diff --git a/goose/text.py b/goose/text.py index 3ef63d6b..31070cf0 100644 --- a/goose/text.py +++ b/goose/text.py @@ -23,6 +23,9 @@ import os import re import string + +import six + from goose.utils import FileHelper from goose.utils.encoding import smart_unicode from goose.utils.encoding import smart_str @@ -31,8 +34,42 @@ TABSSPACE = re.compile(r'[\s\t]+') +def get_encodings_from_content(content): + """ + Code from: + 
https://github.com/sigmavirus24/requests-toolbelt/blob/master/requests_toolbelt/utils/deprecated.py + Return encodings from given content string. + :param content: string to extract encodings from. + """ + if isinstance(content, six.binary_type) and six.PY3: + find_charset = re.compile( + br'<meta.*?charset=["\']*(.+?)["\'>]', flags=re.I + ).findall + + find_pragma = re.compile( + br'<meta.*?content=["\']*;?charset=(.+?)["\'>]', flags=re.I + ).findall + + find_xml = re.compile( + br'^<\?xml.*?encoding=["\']*(.+?)["\'>]' + ).findall + else: + find_charset = re.compile( + r'<meta.*?charset=["\']*(.+?)["\'>]', flags=re.I + ).findall + + find_pragma = re.compile( + r'<meta.*?content=["\']*;?charset=(.+?)["\'>]', flags=re.I + ).findall + + find_xml = re.compile( + r'^<\?xml.*?encoding=["\']*(.+?)["\'>]' + ).findall + return find_charset(content) + find_pragma(content) + find_xml(content) + + def innerTrim(value): - if isinstance(value, (unicode, str)): + if isinstance(value, (six.text_type, six.string_types)): # remove tab and white space value = re.sub(TABSSPACE, ' ', value) value = ''.join(value.splitlines()) @@ -87,7 +124,6 @@ def set_word_count(self, cnt): class StopWords(object): PUNCTUATION = re.compile("[^\\p{Ll}\\p{Lu}\\p{Lt}\\p{Lo}\\p{Nd}\\p{Pc}\\s]") - TRANS_TABLE = string.maketrans('', '') _cached_stop_words = {} def __init__(self, language='en'): @@ -106,9 +142,10 @@ def __init__(self, language='en'): def remove_punctuation(self, content): # code taken form # http://stackoverflow.com/questions/265960/best-way-to-strip-punctuation-from-a-string-in-python - if isinstance(content, unicode): - content = content.encode('utf-8') - return content.translate(self.TRANS_TABLE, string.punctuation) + if not isinstance(content, six.text_type): + content = content.decode('utf-8') + tbl = dict.fromkeys(ord(x) for x in string.punctuation) + return content.translate(tbl) def candiate_words(self, stripped_input): return stripped_input.split(' ') diff --git a/goose/utils/__init__.py b/goose/utils/__init__.py index 5a1de7d4..41cf9c95 100644 --- a/goose/utils/__init__.py +++ b/goose/utils/__init__.py @@ -26,7 +26,13 @@ 
import os import goose import codecs -import urlparse + +import six + +try: + from urlparse import urlparse +except ImportError: + from urllib.parse import urlparse class BuildURL(object): @@ -89,7 +95,7 @@ def __init__(self, urlString, link_hash): class RawHelper(object): @classmethod def get_parsing_candidate(self, url, raw_html): - if isinstance(raw_html, unicode): + if isinstance(raw_html, six.text_type): raw_html = raw_html.encode('utf-8') link_hash = '%s.%s' % (hashlib.md5(raw_html).hexdigest(), time.time()) return ParsingCandidate(url, link_hash) @@ -101,7 +107,8 @@ def get_parsing_candidate(self, url_to_crawl): # replace shebang is urls final_url = url_to_crawl.replace('#!', '?_escaped_fragment_=') \ if '#!' in url_to_crawl else url_to_crawl - link_hash = '%s.%s' % (hashlib.md5(final_url).hexdigest(), time.time()) + url = final_url.encode("utf-8") if isinstance(final_url, six.text_type) else final_url + link_hash = '%s.%s' % (hashlib.md5(url).hexdigest(), time.time()) return ParsingCandidate(final_url, link_hash) diff --git a/goose/utils/encoding.py b/goose/utils/encoding.py index 4dc23ca7..f94f476e 100644 --- a/goose/utils/encoding.py +++ b/goose/utils/encoding.py @@ -1,6 +1,8 @@ # -*- coding: utf-8 -*- -import types import datetime + +import six + from decimal import Decimal @@ -45,8 +47,8 @@ def is_protected_type(obj): force_unicode(strings_only=True). """ return isinstance(obj, ( - types.NoneType, - int, long, + type(None), + six.integer_types, datetime.datetime, datetime.date, datetime.time, float, Decimal) ) @@ -62,17 +64,17 @@ def force_unicode(s, encoding='utf-8', strings_only=False, errors='strict'): # Handle the common case first, saves 30-40% in performance when s # is an instance of unicode. This function gets called often in that # setting. 
- if isinstance(s, unicode): + if isinstance(s, six.text_type): return s if strings_only and is_protected_type(s): return s try: - if not isinstance(s, basestring,): + if not isinstance(s, six.string_types,): if hasattr(s, '__unicode__'): - s = unicode(s) + s = s.__unicode__() else: try: - s = unicode(str(s), encoding, errors) + s = six.text_type(s, encoding, errors) except UnicodeEncodeError: if not isinstance(s, Exception): raise @@ -84,12 +86,12 @@ def force_unicode(s, encoding='utf-8', strings_only=False, errors='strict'): # output should be. s = u' '.join([force_unicode(arg, encoding, strings_only, errors) for arg in s]) - elif not isinstance(s, unicode): + elif not isinstance(s, six.text_type): # Note: We use .decode() here, instead of unicode(s, encoding, # errors), so that if s is a SafeString, it ends up being a # SafeUnicode at the end. s = s.decode(encoding, errors) - except UnicodeDecodeError, e: + except UnicodeDecodeError as e: if not isinstance(s, Exception): raise DjangoUnicodeDecodeError(s, *e.args) else: @@ -109,13 +111,17 @@ def smart_str(s, encoding='utf-8', strings_only=False, errors='strict'): If strings_only is True, don't convert (some) non-string-like objects. """ - if strings_only and isinstance(s, (types.NoneType, int)): + if strings_only and isinstance(s, (type(None), int)): return s # if isinstance(s, Promise): # return unicode(s).encode(encoding, errors) - if not isinstance(s, basestring): + if isinstance(s, six.text_type): + return s.encode(encoding, errors) + elif not isinstance(s, six.binary_type): try: - return str(s) + if six.PY2: + return str(s) + return str(s).encode(encoding, errors) except UnicodeEncodeError: if isinstance(s, Exception): # An Exception subclass containing non-ASCII data that doesn't @@ -123,10 +129,6 @@ def smart_str(s, encoding='utf-8', strings_only=False, errors='strict'): # further exception. 
return ' '.join([smart_str(arg, encoding, strings_only, errors) for arg in s]) - return unicode(s).encode(encoding, errors) - elif isinstance(s, unicode): - return s.encode(encoding, errors) - elif s and encoding != 'utf-8': - return s.decode('utf-8', errors).encode(encoding, errors) + return six.text_type(s).encode(encoding, errors) else: return s diff --git a/goose/utils/images.py b/goose/utils/images.py index 388d5c85..31a55d61 100644 --- a/goose/utils/images.py +++ b/goose/utils/images.py @@ -22,8 +22,11 @@ """ import hashlib import os -import urllib2 + +from six.moves.urllib.request import urlopen, Request + from PIL import Image + from goose.utils.encoding import smart_str from goose.image import ImageDetails from goose.image import LocallyStoredImage @@ -35,9 +38,9 @@ class ImageUtils(object): def get_image_dimensions(self, identify_program, path): image_details = ImageDetails() try: - image = Image.open(path) - image_details.set_mime_type(image.format) - width, height = image.size + with Image.open(path) as image: + image_details.set_mime_type(image.format) + width, height = image.size image_details.set_width(width) image_details.set_height(height) except IOError: @@ -115,8 +118,8 @@ def clean_src_string(self, src): @classmethod def fetch(self, http_client, src): try: - req = urllib2.Request(src) - f = urllib2.urlopen(req) + req = Request(src) + f = urlopen(req) data = f.read() return data except Exception: diff --git a/goose/video.py b/goose/video.py index 8509bba0..0691ac96 100644 --- a/goose/video.py +++ b/goose/video.py @@ -21,6 +21,7 @@ limitations under the License. 
""" + class Video(object): """\ Video object diff --git a/requirements.txt b/requirements.txt index 7e6a6c09..bbd377ee 100644 --- a/requirements.txt +++ b/requirements.txt @@ -2,5 +2,6 @@ Pillow lxml cssselect jieba -beautifulsoup +beautifulsoup # Only on python2 nltk +six diff --git a/setup.py b/setup.py index ebad2547..bce19c5c 100644 --- a/setup.py +++ b/setup.py @@ -22,6 +22,8 @@ """ import os +import sys + from setuptools import setup, find_packages from imp import load_source @@ -40,6 +42,7 @@ 'Programming Language :: Python :: 2', 'Programming Language :: Python :: 2.6', 'Programming Language :: Python :: 2.7', + 'Programming Language :: Python :: 3', 'Topic :: Internet', 'Topic :: Utilities', 'Topic :: Software Development :: Libraries :: Python Modules'] @@ -53,6 +56,13 @@ except Exception: long_description = description +requirements = ['Pillow', 'lxml', 'cssselect', 'jieba', 'nltk', 'six'] +if sys.version_info[0] == 2: + requirements.append('beautifulsoup') + if sys.version_info[1] < 7: + requirements.append('unittest2') + + setup(name='goose-extractor', version=version.__version__, description=description, @@ -66,6 +76,6 @@ packages=find_packages(), include_package_data=True, zip_safe=False, - install_requires=['Pillow', 'lxml', 'cssselect', 'jieba', 'beautifulsoup', 'nltk'], + install_requires=requirements, test_suite="tests" ) diff --git a/tests/extractors/authors.py b/tests/extractors/authors.py index 709040c1..a21d362e 100644 --- a/tests/extractors/authors.py +++ b/tests/extractors/authors.py @@ -21,12 +21,26 @@ limitations under the License. 
""" -from base import TestExtractionBase +from __future__ import absolute_import + +from .base import TestExtractionBase class TestArticleAuthor(TestExtractionBase): def test_author_schema(self): article = self.getArticle() - fields = ['authors'] - self.runArticleAssertions(article=article, fields=fields) + field = 'authors' + + # Do not call self.runArticleAssertions because need to sort results, + # because set not save ordering, so test failed; + + expected_value = self.data['expected'][field] + result_value = getattr(article, field, None) + + expected_value.sort() + result_value.sort() + + # default assertion + msg = u"Error %s \nexpected: %s\nresult: %s" % (field, expected_value, result_value) + self.assertEqual(expected_value, result_value, msg=msg) diff --git a/tests/extractors/base.py b/tests/extractors/base.py index e19d20e0..93b3c075 100644 --- a/tests/extractors/base.py +++ b/tests/extractors/base.py @@ -22,11 +22,16 @@ """ import os import json -import urllib2 import unittest import socket -from StringIO import StringIO +try: + import urllib2 +except ImportError: + import urllib.request as urllib2 + +import six +from six import StringIO, BytesIO from goose import Goose from goose.utils import FileHelper @@ -47,13 +52,16 @@ class MockResponse(): def __init__(self, cls): self.cls = cls - def content(self): + def content(self, req): return "response" def response(self, req): data = self.content(req) url = req.get_full_url() - resp = urllib2.addinfourl(StringIO(data), data, url) + if isinstance(data, six.binary_type): + resp = urllib2.addinfourl(BytesIO(data), data, url) + else: + resp = urllib2.addinfourl(StringIO(data), data, url) resp.code = self.code resp.msg = self.msg return resp diff --git a/tests/extractors/content.py b/tests/extractors/content.py index 30dc2754..854c4bd1 100644 --- a/tests/extractors/content.py +++ b/tests/extractors/content.py @@ -20,7 +20,9 @@ See the License for the specific language governing permissions and limitations under 
the License. """ -from base import TestExtractionBase +from __future__ import absolute_import + +from .base import TestExtractionBase from goose.text import StopWordsChinese from goose.text import StopWordsArabic diff --git a/tests/extractors/images.py b/tests/extractors/images.py index e47a1dde..9a9712a1 100644 --- a/tests/extractors/images.py +++ b/tests/extractors/images.py @@ -20,13 +20,15 @@ See the License for the specific language governing permissions and limitations under the License. """ +from __future__ import absolute_import + import os import json import hashlib import unittest -from base import MockResponse -from base import TestExtractionBase +from .base import MockResponse +from .base import TestExtractionBase from goose.configuration import Configuration from goose.image import Image @@ -41,7 +43,7 @@ class MockResponseImage(MockResponse): def image_content(self, req): - md5_hash = hashlib.md5(req.get_full_url()).hexdigest() + md5_hash = hashlib.md5(req.get_full_url().encode("utf-8")).hexdigest() current_test = self.cls._get_current_testname() path = os.path.join( os.path.dirname(CURRENT_PATH), diff --git a/tests/extractors/links.py b/tests/extractors/links.py index 8539465e..ea15a459 100644 --- a/tests/extractors/links.py +++ b/tests/extractors/links.py @@ -21,7 +21,9 @@ limitations under the License. """ -from base import TestExtractionBase +from __future__ import absolute_import + +from .base import TestExtractionBase class TestArticleLinks(TestExtractionBase): diff --git a/tests/extractors/metas.py b/tests/extractors/metas.py index fd45915a..a4eef74c 100644 --- a/tests/extractors/metas.py +++ b/tests/extractors/metas.py @@ -21,7 +21,9 @@ limitations under the License. 
""" -from base import TestExtractionBase +from __future__ import absolute_import + +from .base import TestExtractionBase class TestMetas(TestExtractionBase): diff --git a/tests/extractors/opengraph.py b/tests/extractors/opengraph.py index 415a784c..a0616227 100644 --- a/tests/extractors/opengraph.py +++ b/tests/extractors/opengraph.py @@ -21,7 +21,9 @@ limitations under the License. """ -from base import TestExtractionBase +from __future__ import absolute_import + +from .base import TestExtractionBase class TestOpenGraph(TestExtractionBase): diff --git a/tests/extractors/publishdate.py b/tests/extractors/publishdate.py index 8d2a13b9..355250d5 100644 --- a/tests/extractors/publishdate.py +++ b/tests/extractors/publishdate.py @@ -21,7 +21,9 @@ limitations under the License. """ -from base import TestExtractionBase +from __future__ import absolute_import + +from .base import TestExtractionBase class TestPublishDate(TestExtractionBase): diff --git a/tests/extractors/tags.py b/tests/extractors/tags.py index 22b17129..2f5562ba 100644 --- a/tests/extractors/tags.py +++ b/tests/extractors/tags.py @@ -21,7 +21,9 @@ limitations under the License. """ -from base import TestExtractionBase +from __future__ import absolute_import + +from .base import TestExtractionBase class TestArticleTags(TestExtractionBase): diff --git a/tests/extractors/title.py b/tests/extractors/title.py index 09170205..c6f7813c 100644 --- a/tests/extractors/title.py +++ b/tests/extractors/title.py @@ -21,7 +21,9 @@ limitations under the License. """ -from base import TestExtractionBase +from __future__ import absolute_import + +from .base import TestExtractionBase class TestTitle(TestExtractionBase): diff --git a/tests/extractors/tweets.py b/tests/extractors/tweets.py index 50300f43..3f72a604 100644 --- a/tests/extractors/tweets.py +++ b/tests/extractors/tweets.py @@ -20,8 +20,9 @@ See the License for the specific language governing permissions and limitations under the License. 
""" +from __future__ import absolute_import -from base import TestExtractionBase +from .base import TestExtractionBase class TestArticleTweet(TestExtractionBase): diff --git a/tests/extractors/videos.py b/tests/extractors/videos.py index 10be15ff..0350c8c3 100644 --- a/tests/extractors/videos.py +++ b/tests/extractors/videos.py @@ -20,7 +20,9 @@ See the License for the specific language governing permissions and limitations under the License. """ -from base import TestExtractionBase +from __future__ import absolute_import + +from .base import TestExtractionBase class ImageExtractionTests(TestExtractionBase): diff --git a/tests/parsers.py b/tests/parsers.py index 6614368d..e5f17164 100644 --- a/tests/parsers.py +++ b/tests/parsers.py @@ -21,7 +21,12 @@ limitations under the License. """ import os -import unittest +try: + import unittest2 as unittest # Need to support skipIf in python 2.6 +except ImportError: + import unittest + +import six from goose.utils import FileHelper from goose.parsers import Parser @@ -254,11 +259,28 @@ def test_delAttribute(self): # remove an unexistant attribute self.parser.delAttribute(div, attr="bla") + def test_encoding(self): + """ + If pass unicode string to lxml.html.fromstring with encoding set in document will receive: + "ValueError: Unicode strings with encoding declaration are not supported. + Please use bytes input or XML fragments without declaration." + Test for this case. + """ + html = u""" + + """ + html += u'' + html += u'

Я рядочок

' + html += u'' + self.parser.fromstring(html) + class TestParser(ParserBase): pass class TestParserSoup(ParserBase): + + @unittest.skipIf(six.PY3, "supported only in python2") def setUp(self): self.parser = ParserSoup