Fixed unicode handling, Python 3 support, Requests as network backend, better content root extraction and other awesome features #248

Open
wants to merge 53 commits into base: develop

Commits (53)
8e2f875
Merge branch 'release/v1.0.19'
Jun 29, 2014
93e8239
Merge branch 'release/1.0.20'
Jul 14, 2014
a275c45
Merge branch 'master' of github.com:grangier/python-goose
Jul 14, 2014
fba20fd
Merge branch 'release/1.0.21'
Sep 14, 2014
f5dc260
Merge branch 'release/1.0.22'
Sep 14, 2014
3b058f9
Merge remote-tracking branch 'origin/master' into develop
pistolero Jan 6, 2015
4478e69
Merge remote-tracking branch 'upstream/develop' into develop
vetal4444 Apr 8, 2015
94b1a20
Add python3 support
vetal4444 Apr 8, 2015
6d91565
Update requirements
vetal4444 Apr 8, 2015
79a12dd
Add python3 to CLASSIFIERS
vetal4444 Apr 9, 2015
76af358
Optimize imports
vetal4444 Apr 9, 2015
f44c2af
Restore python 2.6 support
vetal4444 Apr 9, 2015
2e18083
Try to fix tests in python 2.6
vetal4444 Apr 9, 2015
b7884f1
Fix smart_str
vetal4444 Apr 15, 2015
9028761
Fix ValueError if we get document with set encoding. Add test case fo…
vetal4444 Apr 15, 2015
74743ab
Add py 3.4 to travis.yml
vetal4444 Apr 15, 2015
5fbc788
Remove install from requirements file in travis.yml as no way to avoi…
vetal4444 Apr 15, 2015
1d02932
Close image file after use
vetal4444 Apr 15, 2015
9091e38
Fix tests
vetal4444 Apr 15, 2015
8fa55b4
Fix encoding detection
vetal4444 Apr 16, 2015
1ef277b
Fix test runner under py3
vetal4444 Apr 16, 2015
964eb48
Fix unicode processing + ` ` support
Lol4t0 Nov 13, 2015
dbbbb7f
Merge into python3 support from Vetal4444
Lol4t0 Jan 11, 2016
40cdd84
Merge pull request #1 from Lol4t0/python_3
Lol4t0 Jan 12, 2016
7292933
Move to requests as network lib
Lol4t0 Jan 13, 2016
87808d2
Draft new release
Lol4t0 Jan 13, 2016
5b4ef12
Allow multiple 'special tags'
Lol4t0 Jan 13, 2016
15aeb23
Do not parse binary to text with requests
Lol4t0 Jan 13, 2016
f7eee8b
Fix `map only iterable once` in py3 issue
Lol4t0 Jan 13, 2016
21bc2b2
Additional content extraction filter
Lol4t0 Jan 13, 2016
6849ce6
Fix clean issue
Lol4t0 Jan 14, 2016
024760c
Do not stop on first found text candidate
Lol4t0 Jan 14, 2016
a5bd141
Add all sort of disclaimers to the trash
Lol4t0 Jan 14, 2016
da6cc1d
Fix tests crash with requests
Lol4t0 Jan 20, 2016
c64b245
Fix deprecated class warning
Lol4t0 Jan 20, 2016
c7a207d
Use requests for image extraction
Lol4t0 Jan 20, 2016
320fa7d
Fix image extraction mocker
Lol4t0 Jan 20, 2016
c0d6b69
Move test dependencies out of main deps
Lol4t0 Jan 20, 2016
4724f82
Undo wrong edit
Lol4t0 Jan 20, 2016
0398cd1
Drop xml declaration in test data
Lol4t0 Jan 20, 2016
397465c
Move `unittest2` to test dependencies
Lol4t0 Jan 20, 2016
2900678
Try to build with python 3.5
Lol4t0 Jan 20, 2016
fd0712a
Use same http session for text & images
Lol4t0 Jan 20, 2016
60ab80a
Fix function name
Lol4t0 Jan 20, 2016
b61bd4c
Fix API
Lol4t0 Jan 20, 2016
f712600
Remove obsolete known issues
Lol4t0 Jan 21, 2016
abd427b
Adjust classifiers
Lol4t0 Jan 21, 2016
9632746
Draft new release 1.0.29
Lol4t0 Jan 21, 2016
8644cfe
Remove unused import
Lol4t0 Jan 21, 2016
118d220
Make list of known DOM nodes configurable
Lol4t0 Jan 21, 2016
28de450
Use http timeout provided by configuration
Lol4t0 Jan 21, 2016
67858ee
Raise exception on network error in strict mode
Lol4t0 Jan 26, 2016
69e5d80
Do not remove other headers when setting `User-agent`
Lol4t0 Jan 26, 2016
4 changes: 3 additions & 1 deletion .travis.yml
@@ -3,9 +3,11 @@ language: python
 python:
 - 2.6
 - 2.7
+- 3.4
+- 3.5

 install:
-- pip install -r requirements.txt --use-mirrors
 - pip install jieba
+- python setup.py install

 script: python setup.py test
20 changes: 1 addition & 19 deletions README.rst
@@ -180,7 +180,7 @@ class.


Goose in Korean
-----------------
+---------------

In order to use Goose in Korean you have to use the StopWordsKorean
class.
@@ -197,24 +197,6 @@ class.
14년째 세계 각국의 통신·안전·전파 규격 시험과 인증 한 우물만 파고 있는 이 회사 박채규 대표가 만나기로 한 주인공이다.
그는 전기전자·무선통신·자동차 전장품 분야에


Known issues
------------

- There are some issues with unicode URLs.
- Cookie handling : Some websites need cookie handling. At the moment the only work around is to use the raw_html extraction. For instance:

>>> import urllib2
>>> import goose
>>> url = "http://www.nytimes.com/2013/08/18/world/middleeast/pressure-by-us-failed-to-sway-egypts-leaders.html?hp"
>>> opener = urllib2.build_opener(urllib2.HTTPCookieProcessor())
>>> response = opener.open(url)
>>> raw_html = response.read()
>>> g = goose.Goose()
>>> a = g.extract(raw_html=raw_html)
>>> a.cleaned_text
u'CAIRO \u2014 For a moment, at least, American and European diplomats trying to defuse the volatile standoff in Egypt thought they had a breakthrough.\n\nAs t'

TODO
----

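This whole workaround disappears because the PR moves networking to Requests, where every Session carries its own cookie jar. A small sketch of session-level cookie handling (the cookie name and value are illustrative; requires the requests package):

```python
import requests

# A Session persists cookies across requests on its own, so the
# urllib2.HTTPCookieProcessor plumbing above is unnecessary with
# the new backend.
session = requests.Session()
session.cookies.set('sessionid', 'abc123', domain='example.com')
stored = session.cookies.get('sessionid')
```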
10 changes: 6 additions & 4 deletions goose/__init__.py
@@ -21,7 +21,6 @@
limitations under the License.
"""
import os
import platform
from tempfile import mkstemp

from goose.version import version_info, __version__
@@ -64,9 +63,12 @@ def crawl(self, crawl_candiate):
         try:
             crawler = Crawler(self.config)
             article = crawler.crawl(crawl_candiate)
-        except (UnicodeDecodeError, ValueError):
-            self.config.parser_class = parsers[0]
-            return self.crawl(crawl_candiate)
+        except (UnicodeDecodeError, ValueError) as e:
+            if parsers:
+                self.config.parser_class = parsers[0]
+                return self.crawl(crawl_candiate)
+            else:
+                raise e
         return article

def initialize(self):
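The new except branch retries with the next available parser and re-raises once the list is exhausted. A standalone sketch of that control flow (parse_with and the parser names are stand-ins, not the real Crawler API):

```python
def crawl_with_fallback(parsers, parse_with, candidate):
    """Try each parser in order; fall back on decode errors, re-raise when none remain."""
    try:
        return parse_with(parsers[0], candidate)
    except (UnicodeDecodeError, ValueError):
        remaining = parsers[1:]
        if remaining:
            return crawl_with_fallback(remaining, parse_with, candidate)
        raise

def parse_with(parser, candidate):
    # toy stand-in: 'lxml' chokes on this input, 'soup' succeeds
    if parser == 'lxml':
        raise ValueError("could not parse")
    return "article via %s" % parser

result = crawl_with_fallback(['lxml', 'soup'], parse_with, "<html/>")
```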
7 changes: 4 additions & 3 deletions goose/cleaners.py
@@ -20,6 +20,8 @@
See the License for the specific language governing permissions and
limitations under the License.
"""
from __future__ import unicode_literals

from goose.utils import ReplaceSequence


@@ -48,7 +50,7 @@ def __init__(self, config, article):
"|communitypromo|runaroundLeft|subscribe|vcard|articleheadings"
"|date|^print$|popup|author-dropdown|tools|socialtools|byline"
"|konafilter|KonaFilter|breadcrumbs|^fn$|wp-caption-text"
-            "|legende|ajoutVideo|timestamp|js_replies"
+            "|legende|ajoutVideo|timestamp|js_replies|disclaim"
)
self.regexp_namespace = "http://exslt.org/regular-expressions"
self.nauthy_ids_re = "//*[re:test(@id, '%s', 'i')]" % self.remove_nodes_re
@@ -66,8 +68,7 @@ def __init__(self, config, article):
.append("\t")\
.append("^\\s+$")

-    def clean(self):
-        doc_to_clean = self.article.doc
+    def clean(self, doc_to_clean):
doc_to_clean = self.clean_body_classes(doc_to_clean)
doc_to_clean = self.clean_article_tags(doc_to_clean)
doc_to_clean = self.clean_em_tags(doc_to_clean)
23 changes: 21 additions & 2 deletions goose/configuration.py
@@ -22,6 +22,9 @@
"""
import os
import tempfile

import six

from goose.text import StopWords
from goose.parsers import Parser
from goose.parsers import ParserSoup
@@ -30,10 +33,20 @@
HTTP_DEFAULT_TIMEOUT = 30

 AVAILABLE_PARSERS = {
-    'lxml': Parser,
-    'soup': ParserSoup,
+    'lxml': Parser
 }

+if six.PY2:
+    AVAILABLE_PARSERS['soup'] = ParserSoup

KNOWN_ARTICLE_CONTENT_PATTERNS = [
{'attr': 'class', 'value': 'short-story'},
{'attr': 'itemprop', 'value': 'articleBody'},
{'attr': 'class', 'value': 'post-content'},
{'attr': 'class', 'value': 'g-content'},
{'tag': 'article'},
]


class Configuration(object):

@@ -99,6 +112,12 @@ def __init__(self):
# http timeout
self.http_timeout = HTTP_DEFAULT_TIMEOUT

# known context patterns. Goose at first will search context at dom nodes, qualifying these patterns
self.known_context_patterns = KNOWN_ARTICLE_CONTENT_PATTERNS

# Strict mode. Generate exceptions on errors instead of swallowing them
self.strict = True

def get_parser(self):
return AVAILABLE_PARSERS[self.parser_class]

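Pulling the new options together, configuring a crawl might look like the sketch below. The attribute names (known_context_patterns, strict, http_timeout) come from the diff above; the trimmed Configuration class and the 'entry-content' pattern are illustrative, not the full API:

```python
# Pattern list mirroring KNOWN_ARTICLE_CONTENT_PATTERNS from the diff
KNOWN_ARTICLE_CONTENT_PATTERNS = [
    {'attr': 'class', 'value': 'short-story'},
    {'attr': 'itemprop', 'value': 'articleBody'},
    {'attr': 'class', 'value': 'post-content'},
    {'attr': 'class', 'value': 'g-content'},
    {'tag': 'article'},
]

class Configuration(object):
    def __init__(self):
        self.http_timeout = 30            # seconds, passed through to the fetcher
        self.strict = True                # raise on errors instead of swallowing them
        self.known_context_patterns = list(KNOWN_ARTICLE_CONTENT_PATTERNS)

config = Configuration()
config.strict = False                     # tolerate network errors
config.http_timeout = 10
# add a site-specific content pattern (hypothetical class name)
config.known_context_patterns.append({'attr': 'class', 'value': 'entry-content'})
```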
21 changes: 11 additions & 10 deletions goose/crawler.py
@@ -39,7 +39,7 @@
from goose.cleaners import StandardDocumentCleaner
from goose.outputformatters import StandardOutputFormatter

-from goose.network import HtmlFetcher
+from goose.network import NetworkFetcher


class CrawlCandidate(object):
@@ -99,11 +99,13 @@ def __init__(self, config):
# title extractor
self.title_extractor = self.get_title_extractor()

+        # html fetcher
+        self.fetcher = NetworkFetcher(self.config)
+
         # image extrator
         self.image_extractor = self.get_image_extractor()

-        # html fetcher
-        self.htmlfetcher = HtmlFetcher(self.config)


# TODO : log prefix
self.logPrefix = "crawler:"
@@ -161,7 +163,10 @@ def crawl(self, crawl_candidate):
self.article.doc = article_body

# before we do any calcs on the body itself let's clean up the document
-        self.article.doc = self.cleaner.clean()
+        if not isinstance(self.article.doc, list):
+            self.article.doc = [self.cleaner.clean(self.article.doc)]
+        else:
+            self.article.doc = list(map(lambda doc1: self.cleaner.clean(deepcopy(doc1)), self.article.doc))

# big stuff
self.article.top_node = self.extractor.calculate_best_node()
@@ -212,11 +217,7 @@ def get_html(self, crawl_candidate, parsing_candidate):
return crawl_candidate.raw_html

# fetch HTML
-        html = self.htmlfetcher.get_html(parsing_candidate.url)
-        self.article.additional_data.update({
-            'request': self.htmlfetcher.request,
-            'result': self.htmlfetcher.result,
-        })
+        html = self.fetcher.fetch(parsing_candidate.url)
return html

def get_metas_extractor(self):
@@ -244,7 +245,7 @@ def get_title_extractor(self):
return TitleExtractor(self.config, self.article)

def get_image_extractor(self):
-        return ImageExtractor(self.config, self.article)
+        return ImageExtractor(self.fetcher, self.config, self.article)

def get_video_extractor(self):
return VideoExtractor(self.config, self.article)
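The cleaning step above now accepts either a single document or a list of candidate documents, deep-copying each list entry so the originals survive cleaning. A minimal sketch of that normalization (clean here is a dummy standing in for StandardDocumentCleaner.clean, operating on dicts rather than lxml trees):

```python
from copy import deepcopy

def clean(doc):
    # dummy cleaner: mark the document as cleaned
    return {'cleaned': True, 'tag': doc['tag']}

def clean_all(doc_or_docs):
    """Always return a list of cleaned documents, copying list entries first."""
    if not isinstance(doc_or_docs, list):
        return [clean(doc_or_docs)]
    return [clean(deepcopy(d)) for d in doc_or_docs]

single = clean_all({'tag': 'article'})
many = clean_all([{'tag': 'article'}, {'tag': 'div'}])
```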
3 changes: 3 additions & 0 deletions goose/exceptions.py
@@ -0,0 +1,3 @@
from .network import NetworkError

__all__ = ['NetworkError']
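Together with the strict flag added in goose/configuration.py, the intended contract is: network failures raise (the NetworkError re-exported here) in strict mode and are swallowed otherwise. A sketch of that behavior, with NetworkError and do_request as local stand-ins for the real goose/network.py code:

```python
class NetworkError(Exception):
    """Stand-in for the exception defined in goose/network.py."""

def fetch(url, strict=True):
    def do_request(u):
        raise IOError("connection refused")   # simulate a network failure
    try:
        return do_request(url)
    except IOError as e:
        if strict:
            raise NetworkError(str(e))        # strict mode: surface the error
        return None                           # lenient mode: swallow it

lenient = fetch("http://example.com", strict=False)
try:
    fetch("http://example.com", strict=True)
    raised = False
except NetworkError:
    raised = True
```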
33 changes: 14 additions & 19 deletions goose/extractors/content.py
@@ -25,13 +25,6 @@
from goose.extractors import BaseExtractor


KNOWN_ARTICLE_CONTENT_TAGS = [
{'attr': 'itemprop', 'value': 'articleBody'},
{'attr': 'class', 'value': 'post-content'},
{'tag': 'article'},
]


class ContentExtractor(BaseExtractor):

def get_language(self):
@@ -47,16 +40,17 @@ def get_language(self):
return self.config.target_language

def get_known_article_tags(self):
-        for item in KNOWN_ARTICLE_CONTENT_TAGS:
-            nodes = self.parser.getElementsByTag(
-                self.article.doc,
-                **item)
-            if len(nodes):
-                return nodes[0]
+        nodes = []
+        for item in self.config.known_context_patterns:
+            nodes.extend(self.parser.getElementsByTag(
+                self.article.doc,
+                **item))
+        if len(nodes):
+            return nodes
         return None
return None

def is_articlebody(self, node):
-        for item in KNOWN_ARTICLE_CONTENT_TAGS:
+        for item in self.config.known_context_patterns:
# attribute
if "attr" in item and "value" in item:
if self.parser.getAttribute(node, item['attr']) == item['value']:
@@ -260,7 +254,7 @@ def update_score(self, node, addToScore):
if score_string:
current_score = int(score_string)

-        new_score = current_score + addToScore
+        new_score = current_score + int(addToScore)
self.parser.setAttribute(node, "gravityScore", str(new_score))

def update_node_count(self, node, add_to_count):
@@ -315,16 +309,17 @@ def get_node_gravity_score(self, node):
return None
return int(grvScoreString)

-    def nodes_to_check(self, doc):
+    def nodes_to_check(self, docs):
         """\
         returns a list of nodes we want to search
         on like paragraphs and tables
         """
         nodes_to_check = []

-        for tag in ['p', 'pre', 'td']:
-            items = self.parser.getElementsByTag(doc, tag=tag)
-            nodes_to_check += items
+        for doc in docs:
+            for tag in ['p', 'pre', 'td']:
+                items = self.parser.getElementsByTag(doc, tag=tag)
+                nodes_to_check += items
         return nodes_to_check

def is_table_and_no_para_exist(self, e):
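The reworked get_known_article_tags() collects every node matching any configured pattern instead of stopping at the first hit, which is what lets the crawler score multiple content candidates. A self-contained sketch (find_nodes is a toy stand-in for parser.getElementsByTag, working on dicts instead of lxml nodes):

```python
def find_nodes(doc, tag=None, attr=None, value=None):
    """Toy stand-in for parser.getElementsByTag over a list of dict 'nodes'."""
    def matches(node):
        if tag is not None and node.get('tag') != tag:
            return False
        if attr is not None and node.get(attr) != value:
            return False
        return True
    return [n for n in doc if matches(n)]

def get_known_article_tags(doc, patterns):
    # collect ALL matches across the configured patterns, not just the first
    nodes = []
    for item in patterns:
        nodes.extend(find_nodes(doc, **item))
    return nodes if nodes else None

doc = [
    {'tag': 'article'},
    {'tag': 'div', 'class': 'post-content'},
    {'tag': 'div', 'class': 'sidebar'},
]
patterns = [{'attr': 'class', 'value': 'post-content'}, {'tag': 'article'}]
found = get_known_article_tags(doc, patterns)
```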
9 changes: 4 additions & 5 deletions goose/extractors/images.py
@@ -23,7 +23,7 @@
import re
import os

-from urlparse import urlparse, urljoin
+from six.moves.urllib.parse import urlparse, urljoin

from goose.extractors import BaseExtractor
from goose.image import Image
@@ -48,9 +48,10 @@

class ImageExtractor(BaseExtractor):

-    def __init__(self, config, article):
+    def __init__(self, fetcher, config, article):
         super(ImageExtractor, self).__init__(config, article)

+        self.fetcher = fetcher
self.custom_site_mapping = {}

self.load_customesite_mapping()
@@ -333,9 +334,7 @@ def get_local_image(self, src):
"""\
returns the bytes of the image file on disk
"""
-        local_image = ImageUtils.store_image(None,
-            self.link_hash, src, self.config)
-        return local_image
+        return ImageUtils.store_image(self.fetcher, self.link_hash, src, self.config)

def get_clean_domain(self):
if self.article.domain:
4 changes: 2 additions & 2 deletions goose/extractors/metas.py
@@ -22,8 +22,8 @@
"""

import re
-from urlparse import urljoin
-from urlparse import urlparse
+
+from six.moves.urllib.parse import urlparse, urljoin

from goose.extractors import BaseExtractor

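six.moves gives one import path that resolves to the urlparse module on Python 2 and urllib.parse on Python 3. A quick demonstration of the two calls Goose uses (the try/except fallback is only so the snippet also runs without six installed):

```python
try:
    from six.moves.urllib.parse import urlparse, urljoin
except ImportError:  # no six available: Python 3 stdlib location
    from urllib.parse import urlparse, urljoin

base = "http://example.com/news/story.html"
absolute = urljoin(base, "/img/photo.jpg")   # resolve a relative image src
domain = urlparse(absolute).netloc           # extract the host
```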
4 changes: 2 additions & 2 deletions goose/image.py
@@ -46,7 +46,7 @@ def __init__(self):
self.extraction_type = "NA"

# stores how many bytes this image is.
-        self.bytes = long(0)
+        self.bytes = 0

def get_src(self):
return self.src
@@ -87,7 +87,7 @@ def set_mime_type(self, mime_type):
class LocallyStoredImage(object):

def __init__(self, src='', local_filename='',
-                 link_hash='', bytes=long(0), file_extension='', height=0, width=0):
+                 link_hash='', bytes=0, file_extension='', height=0, width=0):
self.src = src
self.local_filename = local_filename
self.link_hash = link_hash
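Python 3 removed the long type (int is unbounded there), so the long(0) initializers are replaced with plain 0, which behaves the same on both interpreters. When code still needs an "any integer" type check across versions, the usual pattern is the one six.integer_types provides, inlined here to stay dependency-free:

```python
import sys

# six.integer_types does exactly this check
if sys.version_info[0] >= 3:
    integer_types = (int,)
else:
    integer_types = (int, long)  # noqa: F821 -- long exists only on Python 2

nbytes = 0  # replaces long(0); ints are unbounded on Python 3
is_integer = isinstance(nbytes, integer_types)
```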