Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add python 3 support #220

Open
wants to merge 21 commits into
base: develop
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 9 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 6 additions & 3 deletions goose/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,9 +64,12 @@ def crawl(self, crawl_candiate):
try:
crawler = Crawler(self.config)
article = crawler.crawl(crawl_candiate)
except (UnicodeDecodeError, ValueError):
self.config.parser_class = parsers[0]
return self.crawl(crawl_candiate)
except (UnicodeDecodeError, ValueError) as e:
if parsers:
self.config.parser_class = parsers[0]
return self.crawl(crawl_candiate)
else:
raise e
return article

def initialize(self):
Expand Down
2 changes: 2 additions & 0 deletions goose/cleaners.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,8 @@
See the License for the specific language governing permissions and
limitations under the License.
"""
from __future__ import unicode_literals

from goose.utils import ReplaceSequence


Expand Down
9 changes: 7 additions & 2 deletions goose/configuration.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,9 @@
"""
import os
import tempfile

import six

from goose.text import StopWords
from goose.parsers import Parser
from goose.parsers import ParserSoup
Expand All @@ -30,10 +33,12 @@
HTTP_DEFAULT_TIMEOUT = 30

AVAILABLE_PARSERS = {
'lxml': Parser,
'soup': ParserSoup,
'lxml': Parser
}

if six.PY2:
AVAILABLE_PARSERS['soup'] = ParserSoup


class Configuration(object):

Expand Down
2 changes: 1 addition & 1 deletion goose/extractors/content.py
Original file line number Diff line number Diff line change
Expand Up @@ -260,7 +260,7 @@ def update_score(self, node, addToScore):
if score_string:
current_score = int(score_string)

new_score = current_score + addToScore
new_score = current_score + int(addToScore)
self.parser.setAttribute(node, "gravityScore", str(new_score))

def update_node_count(self, node, add_to_count):
Expand Down
6 changes: 5 additions & 1 deletion goose/extractors/images.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,11 @@
import re
import os

from urlparse import urlparse, urljoin
try:
from urlparse import urlparse, urljoin
except ImportError:
from urllib.parse import urlparse, urljoin


from goose.extractors import BaseExtractor
from goose.image import Image
Expand Down
6 changes: 4 additions & 2 deletions goose/extractors/metas.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,8 +22,10 @@
"""

import re
from urlparse import urljoin
from urlparse import urlparse
try:
from urlparse import urlparse, urljoin
except ImportError:
from urllib.parse import urlparse, urljoin

from goose.extractors import BaseExtractor

Expand Down
4 changes: 4 additions & 0 deletions goose/image.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,10 @@
See the License for the specific language governing permissions and
limitations under the License.
"""
# Compatibility shim: Python 2 has a builtin `long` type that Python 3
# removed (Python 3's plain `int` is arbitrary-precision).
try:
    long  # probe the builtin — raises NameError on Python 3
except NameError:
    long = int  # Python 3: alias so code referencing `long` keeps working


class Image(object):
Expand Down
17 changes: 9 additions & 8 deletions goose/network.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,12 @@
See the License for the specific language governing permissions and
limitations under the License.
"""
import urllib2
import six

try:
from urllib2 import urlopen, Request
except ImportError:
from urllib.request import urlopen, Request


class HtmlFetcher(object):
Expand All @@ -39,18 +44,14 @@ def get_url(self):

def get_html(self, url):
# utf-8 encode unicode url
if isinstance(url, unicode):
if isinstance(url, six.text_type) and six.PY2:
url = url.encode('utf-8')

# set request
self.request = urllib2.Request(
url,
headers=self.headers)
self.request = Request(url, headers=self.headers)
# do request
try:
self.result = urllib2.urlopen(
self.request,
timeout=self.config.http_timeout)
self.result = urlopen(self.request, timeout=self.config.http_timeout)
except Exception:
self.result = None

Expand Down
6 changes: 5 additions & 1 deletion goose/outputformatters.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,11 @@
See the License for the specific language governing permissions and
limitations under the License.
"""
from HTMLParser import HTMLParser
try:
from HTMLParser import HTMLParser
except ImportError:
from html.parser import HTMLParser

from goose.text import innerTrim


Expand Down
7 changes: 5 additions & 2 deletions goose/parsers.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,9 @@
limitations under the License.
"""
import lxml.html
from lxml.html import soupparser

import six

from lxml import etree
from copy import deepcopy
from goose.text import innerTrim
Expand Down Expand Up @@ -56,7 +58,7 @@ def fromstring(self, html):

@classmethod
def nodeToString(self, node):
return etree.tostring(node)
return etree.tostring(node, encoding=six.text_type)

@classmethod
def replaceTag(self, node, tag):
Expand Down Expand Up @@ -239,6 +241,7 @@ class ParserSoup(Parser):

@classmethod
def fromstring(self, html):
    # Parse *html* with the BeautifulSoup-backed lxml parser and cache the
    # resulting document tree on the class.
    # Imported lazily: the 'soup' backend is only registered on Python 2
    # (see AVAILABLE_PARSERS in configuration.py), so a module-level import
    # could fail where BeautifulSoup 3 is absent.
    from lxml.html import soupparser
    html = encodeValue(html)
    # NOTE(review): `self` is the class here (@classmethod), so `doc` is
    # stored class-wide and successive parses overwrite it — confirm callers
    # do not rely on per-instance state.
    self.doc = soupparser.fromstring(html)
    return self.doc
13 changes: 8 additions & 5 deletions goose/text.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,9 @@
import os
import re
import string

import six

from goose.utils import FileHelper
from goose.utils.encoding import smart_unicode
from goose.utils.encoding import smart_str
Expand All @@ -32,7 +35,7 @@


def innerTrim(value):
if isinstance(value, (unicode, str)):
if isinstance(value, (six.text_type, six.string_types)):
# remove tab and white space
value = re.sub(TABSSPACE, ' ', value)
value = ''.join(value.splitlines())
Expand Down Expand Up @@ -87,7 +90,6 @@ def set_word_count(self, cnt):
class StopWords(object):

PUNCTUATION = re.compile("[^\\p{Ll}\\p{Lu}\\p{Lt}\\p{Lo}\\p{Nd}\\p{Pc}\\s]")
TRANS_TABLE = string.maketrans('', '')
_cached_stop_words = {}

def __init__(self, language='en'):
Expand All @@ -106,9 +108,10 @@ def __init__(self, language='en'):
def remove_punctuation(self, content):
# code taken form
# http://stackoverflow.com/questions/265960/best-way-to-strip-punctuation-from-a-string-in-python
if isinstance(content, unicode):
content = content.encode('utf-8')
return content.translate(self.TRANS_TABLE, string.punctuation)
if not isinstance(content, six.text_type):
content = content.decode('utf-8')
tbl = dict.fromkeys(ord(x) for x in string.punctuation)
return content.translate(tbl)

def candiate_words(self, stripped_input):
    """Return the candidate word list: *stripped_input* split on single spaces.

    Note: unlike ``str.split()`` with no argument, ``split(' ')`` keeps the
    empty strings produced by consecutive spaces.
    """
    words = stripped_input.split(' ')
    return words
Expand Down
13 changes: 10 additions & 3 deletions goose/utils/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,13 @@
import os
import goose
import codecs
import urlparse

import six

try:
from urlparse import urlparse
except ImportError:
from urllib.parse import urlparse


class BuildURL(object):
Expand Down Expand Up @@ -89,7 +95,7 @@ def __init__(self, urlString, link_hash):
class RawHelper(object):
@classmethod
def get_parsing_candidate(self, url, raw_html):
if isinstance(raw_html, unicode):
if isinstance(raw_html, six.text_type):
raw_html = raw_html.encode('utf-8')
link_hash = '%s.%s' % (hashlib.md5(raw_html).hexdigest(), time.time())
return ParsingCandidate(url, link_hash)
Expand All @@ -101,7 +107,8 @@ def get_parsing_candidate(self, url_to_crawl):
# replace shebang is urls
final_url = url_to_crawl.replace('#!', '?_escaped_fragment_=') \
if '#!' in url_to_crawl else url_to_crawl
link_hash = '%s.%s' % (hashlib.md5(final_url).hexdigest(), time.time())
url = final_url.encode("utf-8") if isinstance(final_url, six.text_type) else final_url
link_hash = '%s.%s' % (hashlib.md5(url).hexdigest(), time.time())
return ParsingCandidate(final_url, link_hash)


Expand Down
28 changes: 15 additions & 13 deletions goose/utils/encoding.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
# -*- coding: utf-8 -*-
import types
import datetime

import six

from decimal import Decimal


Expand Down Expand Up @@ -45,8 +47,8 @@ def is_protected_type(obj):
force_unicode(strings_only=True).
"""
return isinstance(obj, (
types.NoneType,
int, long,
type(None),
six.integer_types,
datetime.datetime, datetime.date, datetime.time,
float, Decimal)
)
Expand All @@ -62,17 +64,17 @@ def force_unicode(s, encoding='utf-8', strings_only=False, errors='strict'):
# Handle the common case first, saves 30-40% in performance when s
# is an instance of unicode. This function gets called often in that
# setting.
if isinstance(s, unicode):
if isinstance(s, six.text_type):
return s
if strings_only and is_protected_type(s):
return s
try:
if not isinstance(s, basestring,):
if not isinstance(s, six.string_types,):
if hasattr(s, '__unicode__'):
s = unicode(s)
s = s.__unicode__()
else:
try:
s = unicode(str(s), encoding, errors)
s = six.text_type(s, encoding, errors)
except UnicodeEncodeError:
if not isinstance(s, Exception):
raise
Expand All @@ -84,12 +86,12 @@ def force_unicode(s, encoding='utf-8', strings_only=False, errors='strict'):
# output should be.
s = u' '.join([force_unicode(arg, encoding, strings_only,
errors) for arg in s])
elif not isinstance(s, unicode):
elif not isinstance(s, six.text_type):
# Note: We use .decode() here, instead of unicode(s, encoding,
# errors), so that if s is a SafeString, it ends up being a
# SafeUnicode at the end.
s = s.decode(encoding, errors)
except UnicodeDecodeError, e:
except UnicodeDecodeError as e:
if not isinstance(s, Exception):
raise DjangoUnicodeDecodeError(s, *e.args)
else:
Expand All @@ -109,11 +111,11 @@ def smart_str(s, encoding='utf-8', strings_only=False, errors='strict'):

If strings_only is True, don't convert (some) non-string-like objects.
"""
if strings_only and isinstance(s, (types.NoneType, int)):
if strings_only and isinstance(s, (type(None), int)):
return s
# if isinstance(s, Promise):
# return unicode(s).encode(encoding, errors)
if not isinstance(s, basestring):
if not isinstance(s, six.string_types):
try:
return str(s)
except UnicodeEncodeError:
Expand All @@ -123,8 +125,8 @@ def smart_str(s, encoding='utf-8', strings_only=False, errors='strict'):
# further exception.
return ' '.join([smart_str(arg, encoding, strings_only,
errors) for arg in s])
return unicode(s).encode(encoding, errors)
elif isinstance(s, unicode):
return six.text_type(s).encode(encoding, errors)
elif isinstance(s, six.text_type):
return s.encode(encoding, errors)
elif s and encoding != 'utf-8':
return s.decode('utf-8', errors).encode(encoding, errors)
Expand Down
10 changes: 7 additions & 3 deletions goose/utils/images.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,8 +22,12 @@
"""
import hashlib
import os
import urllib2
try:
from urllib2 import urlopen, Request
except ImportError:
from urllib.request import urlopen, Request
from PIL import Image

from goose.utils.encoding import smart_str
from goose.image import ImageDetails
from goose.image import LocallyStoredImage
Expand Down Expand Up @@ -115,8 +119,8 @@ def clean_src_string(self, src):
@classmethod
def fetch(self, http_client, src):
try:
req = urllib2.Request(src)
f = urllib2.urlopen(req)
req = Request(src)
f = urlopen(req)
data = f.read()
return data
except Exception:
Expand Down
3 changes: 2 additions & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -2,5 +2,6 @@ Pillow
lxml
cssselect
jieba
beautifulsoup
beautifulsoup # Only on python2

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

so your code will not work in python 3?

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is not working in Python 3.5.4, as it does not support beautifulsoup. Even after changing requirements.txt to beautifulsoup4, the install didn't work. Error file attached:
error_goose_extract.txt

nltk
six
9 changes: 8 additions & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,8 @@
"""

import os
import sys

from setuptools import setup, find_packages
from imp import load_source

Expand Down Expand Up @@ -53,6 +55,11 @@
except Exception:
long_description = description

# Dependencies needed on every supported interpreter.
requirements = ['Pillow', 'lxml', 'cssselect', 'jieba', 'nltk', 'six']
# BeautifulSoup 3 (the 'soup' parser backend) only exists for Python 2.
if sys.version_info.major == 2:
    requirements += ['beautifulsoup']


setup(name='goose-extractor',
version=version.__version__,
description=description,
Expand All @@ -66,6 +73,6 @@
packages=find_packages(),
include_package_data=True,
zip_safe=False,
install_requires=['Pillow', 'lxml', 'cssselect', 'jieba', 'beautifulsoup', 'nltk'],
install_requires=requirements,
test_suite="tests"
)
Loading