Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add python 3 support #220

Open
wants to merge 21 commits into
base: develop
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 9 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 6 additions & 3 deletions goose/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,9 +64,12 @@ def crawl(self, crawl_candiate):
try:
crawler = Crawler(self.config)
article = crawler.crawl(crawl_candiate)
except (UnicodeDecodeError, ValueError):
self.config.parser_class = parsers[0]
return self.crawl(crawl_candiate)
except (UnicodeDecodeError, ValueError) as e:
if parsers:
self.config.parser_class = parsers[0]
return self.crawl(crawl_candiate)
else:
raise e
return article

def initialize(self):
Expand Down
2 changes: 2 additions & 0 deletions goose/cleaners.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,8 @@
See the License for the specific language governing permissions and
limitations under the License.
"""
from __future__ import unicode_literals

from goose.utils import ReplaceSequence


Expand Down
9 changes: 7 additions & 2 deletions goose/configuration.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,9 @@
"""
import os
import tempfile

import six

from goose.text import StopWords
from goose.parsers import Parser
from goose.parsers import ParserSoup
Expand All @@ -30,10 +33,12 @@
HTTP_DEFAULT_TIMEOUT = 30

AVAILABLE_PARSERS = {
'lxml': Parser,
'soup': ParserSoup,
'lxml': Parser
}

if six.PY2:
AVAILABLE_PARSERS['soup'] = ParserSoup


class Configuration(object):

Expand Down
2 changes: 1 addition & 1 deletion goose/extractors/content.py
Original file line number Diff line number Diff line change
Expand Up @@ -260,7 +260,7 @@ def update_score(self, node, addToScore):
if score_string:
current_score = int(score_string)

new_score = current_score + addToScore
new_score = current_score + int(addToScore)
self.parser.setAttribute(node, "gravityScore", str(new_score))

def update_node_count(self, node, add_to_count):
Expand Down
6 changes: 5 additions & 1 deletion goose/extractors/images.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,11 @@
import re
import os

from urlparse import urlparse, urljoin
try:
from urlparse import urlparse, urljoin
except ImportError:
from urllib.parse import urlparse, urljoin


from goose.extractors import BaseExtractor
from goose.image import Image
Expand Down
6 changes: 4 additions & 2 deletions goose/extractors/metas.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,8 +22,10 @@
"""

import re
from urlparse import urljoin
from urlparse import urlparse
try:
from urlparse import urlparse, urljoin
except ImportError:
from urllib.parse import urlparse, urljoin

from goose.extractors import BaseExtractor

Expand Down
4 changes: 4 additions & 0 deletions goose/image.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,10 @@
See the License for the specific language governing permissions and
limitations under the License.
"""
# Compatibility shim: Python 2 has a builtin `long` type that Python 3
# removed (Python 3's plain `int` is arbitrary-precision).
try:
    long  # probe the builtin — raises NameError on Python 3
except NameError:
    long = int  # Python 3: alias so code referencing `long` keeps working


class Image(object):
Expand Down
17 changes: 9 additions & 8 deletions goose/network.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,12 @@
See the License for the specific language governing permissions and
limitations under the License.
"""
import urllib2
import six

try:
from urllib2 import urlopen, Request
except ImportError:
from urllib.request import urlopen, Request


class HtmlFetcher(object):
Expand All @@ -39,18 +44,14 @@ def get_url(self):

def get_html(self, url):
# utf-8 encode unicode url
if isinstance(url, unicode):
if isinstance(url, six.text_type) and six.PY2:
url = url.encode('utf-8')

# set request
self.request = urllib2.Request(
url,
headers=self.headers)
self.request = Request(url, headers=self.headers)
# do request
try:
self.result = urllib2.urlopen(
self.request,
timeout=self.config.http_timeout)
self.result = urlopen(self.request, timeout=self.config.http_timeout)
except Exception:
self.result = None

Expand Down
6 changes: 5 additions & 1 deletion goose/outputformatters.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,11 @@
See the License for the specific language governing permissions and
limitations under the License.
"""
from HTMLParser import HTMLParser
try:
from HTMLParser import HTMLParser
except ImportError:
from html.parser import HTMLParser

from goose.text import innerTrim


Expand Down
7 changes: 5 additions & 2 deletions goose/parsers.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,9 @@
limitations under the License.
"""
import lxml.html
from lxml.html import soupparser

import six

from lxml import etree
from copy import deepcopy
from goose.text import innerTrim
Expand Down Expand Up @@ -56,7 +58,7 @@ def fromstring(self, html):

@classmethod
def nodeToString(self, node):
return etree.tostring(node)
return etree.tostring(node, encoding=six.text_type)

@classmethod
def replaceTag(self, node, tag):
Expand Down Expand Up @@ -239,6 +241,7 @@ class ParserSoup(Parser):

@classmethod
def fromstring(self, html):
    # Parse *html* with the BeautifulSoup-backed lxml parser and cache the
    # resulting document tree on the class.
    # Imported lazily: the 'soup' backend is only registered on Python 2
    # (see AVAILABLE_PARSERS in configuration.py), so a module-level import
    # could fail where BeautifulSoup 3 is absent.
    from lxml.html import soupparser
    html = encodeValue(html)
    # NOTE(review): `self` is the class here (@classmethod), so `doc` is
    # stored class-wide and successive parses overwrite it — confirm callers
    # do not rely on per-instance state.
    self.doc = soupparser.fromstring(html)
    return self.doc
13 changes: 8 additions & 5 deletions goose/text.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,9 @@
import os
import re
import string

import six

from goose.utils import FileHelper
from goose.utils.encoding import smart_unicode
from goose.utils.encoding import smart_str
Expand All @@ -32,7 +35,7 @@


def innerTrim(value):
if isinstance(value, (unicode, str)):
if isinstance(value, (six.text_type, six.string_types)):
# remove tab and white space
value = re.sub(TABSSPACE, ' ', value)
value = ''.join(value.splitlines())
Expand Down Expand Up @@ -87,7 +90,6 @@ def set_word_count(self, cnt):
class StopWords(object):

PUNCTUATION = re.compile("[^\\p{Ll}\\p{Lu}\\p{Lt}\\p{Lo}\\p{Nd}\\p{Pc}\\s]")
TRANS_TABLE = string.maketrans('', '')
_cached_stop_words = {}

def __init__(self, language='en'):
Expand All @@ -106,9 +108,10 @@ def __init__(self, language='en'):
def remove_punctuation(self, content):
# code taken form
# http://stackoverflow.com/questions/265960/best-way-to-strip-punctuation-from-a-string-in-python
if isinstance(content, unicode):
content = content.encode('utf-8')
return content.translate(self.TRANS_TABLE, string.punctuation)
if not isinstance(content, six.text_type):
content = content.decode('utf-8')
tbl = dict.fromkeys(ord(x) for x in string.punctuation)
return content.translate(tbl)

def candiate_words(self, stripped_input):
    """Return the candidate word list: *stripped_input* split on single spaces.

    Note: unlike ``str.split()`` with no argument, ``split(' ')`` keeps the
    empty strings produced by consecutive spaces.
    """
    words = stripped_input.split(' ')
    return words
Expand Down
13 changes: 10 additions & 3 deletions goose/utils/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,13 @@
import os
import goose
import codecs
import urlparse

import six

try:
from urlparse import urlparse
except ImportError:
from urllib.parse import urlparse


class BuildURL(object):
Expand Down Expand Up @@ -89,7 +95,7 @@ def __init__(self, urlString, link_hash):
class RawHelper(object):
@classmethod
def get_parsing_candidate(self, url, raw_html):
if isinstance(raw_html, unicode):
if isinstance(raw_html, six.text_type):
raw_html = raw_html.encode('utf-8')
link_hash = '%s.%s' % (hashlib.md5(raw_html).hexdigest(), time.time())
return ParsingCandidate(url, link_hash)
Expand All @@ -101,7 +107,8 @@ def get_parsing_candidate(self, url_to_crawl):
# replace shebang is urls
final_url = url_to_crawl.replace('#!', '?_escaped_fragment_=') \
if '#!' in url_to_crawl else url_to_crawl
link_hash = '%s.%s' % (hashlib.md5(final_url).hexdigest(), time.time())
url = final_url.encode("utf-8") if isinstance(final_url, six.text_type) else final_url
link_hash = '%s.%s' % (hashlib.md5(url).hexdigest(), time.time())
return ParsingCandidate(final_url, link_hash)


Expand Down
28 changes: 15 additions & 13 deletions goose/utils/encoding.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
# -*- coding: utf-8 -*-
import types
import datetime

import six

from decimal import Decimal


Expand Down Expand Up @@ -45,8 +47,8 @@ def is_protected_type(obj):
force_unicode(strings_only=True).
"""
return isinstance(obj, (
types.NoneType,
int, long,
type(None),
six.integer_types,
datetime.datetime, datetime.date, datetime.time,
float, Decimal)
)
Expand All @@ -62,17 +64,17 @@ def force_unicode(s, encoding='utf-8', strings_only=False, errors='strict'):
# Handle the common case first, saves 30-40% in performance when s
# is an instance of unicode. This function gets called often in that
# setting.
if isinstance(s, unicode):
if isinstance(s, six.text_type):
return s
if strings_only and is_protected_type(s):
return s
try:
if not isinstance(s, basestring,):
if not isinstance(s, six.string_types,):
if hasattr(s, '__unicode__'):
s = unicode(s)
s = s.__unicode__()
else:
try:
s = unicode(str(s), encoding, errors)
s = six.text_type(s, encoding, errors)
except UnicodeEncodeError:
if not isinstance(s, Exception):
raise
Expand All @@ -84,12 +86,12 @@ def force_unicode(s, encoding='utf-8', strings_only=False, errors='strict'):
# output should be.
s = u' '.join([force_unicode(arg, encoding, strings_only,
errors) for arg in s])
elif not isinstance(s, unicode):
elif not isinstance(s, six.text_type):
# Note: We use .decode() here, instead of unicode(s, encoding,
# errors), so that if s is a SafeString, it ends up being a
# SafeUnicode at the end.
s = s.decode(encoding, errors)
except UnicodeDecodeError, e:
except UnicodeDecodeError as e:
if not isinstance(s, Exception):
raise DjangoUnicodeDecodeError(s, *e.args)
else:
Expand All @@ -109,11 +111,11 @@ def smart_str(s, encoding='utf-8', strings_only=False, errors='strict'):

If strings_only is True, don't convert (some) non-string-like objects.
"""
if strings_only and isinstance(s, (types.NoneType, int)):
if strings_only and isinstance(s, (type(None), int)):
return s
# if isinstance(s, Promise):
# return unicode(s).encode(encoding, errors)
if not isinstance(s, basestring):
if not isinstance(s, six.string_types):
try:
return str(s)
except UnicodeEncodeError:
Expand All @@ -123,8 +125,8 @@ def smart_str(s, encoding='utf-8', strings_only=False, errors='strict'):
# further exception.
return ' '.join([smart_str(arg, encoding, strings_only,
errors) for arg in s])
return unicode(s).encode(encoding, errors)
elif isinstance(s, unicode):
return six.text_type(s).encode(encoding, errors)
elif isinstance(s, six.text_type):
return s.encode(encoding, errors)
elif s and encoding != 'utf-8':
return s.decode('utf-8', errors).encode(encoding, errors)
Expand Down
10 changes: 7 additions & 3 deletions goose/utils/images.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,8 +22,12 @@
"""
import hashlib
import os
import urllib2
try:
from urllib2 import urlopen, Request
except ImportError:
from urllib.request import urlopen, Request
from PIL import Image

from goose.utils.encoding import smart_str
from goose.image import ImageDetails
from goose.image import LocallyStoredImage
Expand Down Expand Up @@ -115,8 +119,8 @@ def clean_src_string(self, src):
@classmethod
def fetch(self, http_client, src):
try:
req = urllib2.Request(src)
f = urllib2.urlopen(req)
req = Request(src)
f = urlopen(req)
data = f.read()
return data
except Exception:
Expand Down
3 changes: 2 additions & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -2,5 +2,6 @@ Pillow
lxml
cssselect
jieba
beautifulsoup
beautifulsoup # Only on python2

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

so your code will not work in python 3?

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is not working in Python 3.5.4, as it does not support beautifulsoup. Even after changing requirements.txt to beautifulsoup4, the install didn't work. Error file attached:
error_goose_extract.txt

nltk
six
9 changes: 8 additions & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,8 @@
"""

import os
import sys

from setuptools import setup, find_packages
from imp import load_source

Expand Down Expand Up @@ -53,6 +55,11 @@
except Exception:
long_description = description

# Dependencies needed on every supported interpreter.
requirements = ['Pillow', 'lxml', 'cssselect', 'jieba', 'nltk', 'six']
# BeautifulSoup 3 (the 'soup' parser backend) only exists for Python 2.
if sys.version_info.major == 2:
    requirements += ['beautifulsoup']


setup(name='goose-extractor',
version=version.__version__,
description=description,
Expand All @@ -66,6 +73,6 @@
packages=find_packages(),
include_package_data=True,
zip_safe=False,
install_requires=['Pillow', 'lxml', 'cssselect', 'jieba', 'beautifulsoup', 'nltk'],
install_requires=requirements,
test_suite="tests"
)
Loading