Skip to content

Commit

Permalink
Merge pull request #72 from scrapinghub/load-tldextract-lazily
Browse files Browse the repository at this point in the history
Import tldextract lazily
  • Loading branch information
sibiryakov committed Sep 29, 2015
2 parents 141f233 + 5544d5a commit 065b69a
Show file tree
Hide file tree
Showing 10 changed files with 116 additions and 25 deletions.
15 changes: 15 additions & 0 deletions docs/source/topics/frontera-settings.rst
Original file line number Diff line number Diff line change
Expand Up @@ -225,6 +225,21 @@ Default: ``frontera.utils.fingerprint.sha1``
The function used to calculate the ``domain`` fingerprint.


.. setting:: TLDEXTRACT_DOMAIN_INFO

TLDEXTRACT_DOMAIN_INFO
----------------------

Default: ``False``

If set to ``True``, will use `tldextract`_ to attach extra domain information
(second-level, top-level and subdomain) to the ``meta`` field (see :ref:`frontier-objects-additional-data`).


.. _tldextract: https://pypi.python.org/pypi/tldextract



Default settings
================

Expand Down
3 changes: 3 additions & 0 deletions docs/source/topics/frontier-objects.rst
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,9 @@ An example of a generated fingerprint for a :class:`Request <frontera.core.model
'198d99a8b2284701d6c147174cd69a37a7dea90f'


.. _frontier-objects-additional-data:


Adding additional data to objects
=================================

Expand Down
41 changes: 21 additions & 20 deletions frontera/contrib/middlewares/domain.py
Original file line number Diff line number Diff line change
@@ -1,24 +1,7 @@
import re

from frontera.core.components import Middleware
from frontera.utils.url import parse_domain_from_url_fast


def parse_domain_info(url, test_mode=False):
    """Parse *url* into a dict of domain components.

    In test mode the "URL" is expected to be a capitalised word; only its
    first letter is used as the domain name and every other field is a
    placeholder. Otherwise the components come from
    parse_domain_from_url_fast.

    :param url: URL string (or test-mode token) to parse.
    :param test_mode: when True, use the simplified test-mode parsing.
    :return: dict with keys netloc, name, scheme, sld, tld, subdomain.
    """
    if test_mode:
        # Raw string: '\w' inside a plain literal is an invalid escape
        # sequence (DeprecationWarning today, a SyntaxError in future
        # CPython versions).
        match = re.match(r'([A-Z])\w+', url)
        netloc = name = match.groups()[0] if match else '?'
        scheme = sld = tld = subdomain = '-'
    else:
        netloc, name, scheme, sld, tld, subdomain = parse_domain_from_url_fast(url)
    return {
        'netloc': netloc,
        'name': name,
        'scheme': scheme,
        'sld': sld,
        'tld': tld,
        'subdomain': subdomain,
    }
from frontera.utils.url import parse_domain_from_url_fast, parse_domain_from_url


class DomainMiddleware(Middleware):
Expand Down Expand Up @@ -74,6 +57,8 @@ class DomainMiddleware(Middleware):

def __init__(self, manager):
    """Store the manager and pick the domain-parsing backend once.

    When the TLDEXTRACT_DOMAIN_INFO setting is truthy, the slower
    tldextract-based parser is used; otherwise the fast urlparse-based
    one is.
    """
    self.manager = manager
    if manager.settings.get('TLDEXTRACT_DOMAIN_INFO', False):
        self.parse_domain_func = parse_domain_from_url
    else:
        self.parse_domain_func = parse_domain_from_url_fast

@classmethod
def from_manager(cls, manager):
Expand All @@ -99,8 +84,24 @@ def request_error(self, request, error):
return self._add_domain(request)

def _add_domain(self, obj):
    """Attach parsed domain info to *obj*.meta and return *obj*.

    The diff text contained both the pre- and post-change assignment
    lines; this keeps only the new version, which dispatches through
    self.parse_domain_info so the configured backend is used.
    Redirect URLs, when present, each get their own domain dict.
    """
    obj.meta['domain'] = self.parse_domain_info(obj.url, self.manager.test_mode)
    if 'redirect_urls' in obj.meta:
        obj.meta['redirect_domains'] = [self.parse_domain_info(url, self.manager.test_mode)
                                        for url in obj.meta['redirect_urls']]
    return obj

def parse_domain_info(self, url, test_mode=False):
    """Parse *url* into a dict of domain components.

    In test mode the "URL" is expected to be a capitalised word; only its
    first letter is used as the domain name and every other field is a
    placeholder. Otherwise the components come from the parser selected
    at construction time (fast urlparse-based or tldextract-based).

    :param url: URL string (or test-mode token) to parse.
    :param test_mode: when True, use the simplified test-mode parsing.
    :return: dict with keys netloc, name, scheme, sld, tld, subdomain.
    """
    if test_mode:
        # Raw string: '\w' inside a plain literal is an invalid escape
        # sequence (DeprecationWarning today, a SyntaxError in future
        # CPython versions).
        match = re.match(r'([A-Z])\w+', url)
        netloc = name = match.groups()[0] if match else '?'
        scheme = sld = tld = subdomain = '-'
    else:
        netloc, name, scheme, sld, tld, subdomain = self.parse_domain_func(url)
    return {
        'netloc': netloc,
        'name': name,
        'scheme': scheme,
        'sld': sld,
        'tld': tld,
        'subdomain': subdomain,
    }
5 changes: 5 additions & 0 deletions frontera/settings/default_settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,11 @@
URL_FINGERPRINT_FUNCTION = 'frontera.utils.fingerprint.sha1'
DOMAIN_FINGERPRINT_FUNCTION = 'frontera.utils.fingerprint.sha1'

#--------------------------------------------------------
# Domain mw
#--------------------------------------------------------
TLDEXTRACT_DOMAIN_INFO = False

#--------------------------------------------------------
# Logging
#--------------------------------------------------------
Expand Down
63 changes: 63 additions & 0 deletions frontera/tests/test_domain_mware.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
import unittest
from frontera.contrib.middlewares.domain import DomainMiddleware
from frontera.core.manager import FrontierManager
from frontera.core.models import Request


class FakeManager(object):
    """Bare-bones stand-in for FrontierManager in middleware tests.

    Exposes only the two attributes DomainMiddleware reads.
    """
    test_mode = False  # parse URLs normally, not with the test-mode shortcut
    settings = {}      # empty settings -> middleware falls back to defaults


class DomainMiddlewareTest(unittest.TestCase):
    """Tests for DomainMiddleware domain-info attachment.

    Uses assertEqual throughout: assertEquals is a deprecated alias that
    was removed in Python 3.12.
    """

    def setUp(self):
        self.fake_manager = FakeManager()

    def test_create(self):
        # Construction with default settings must not raise.
        DomainMiddleware(self.fake_manager)

    def test_should_parse_domain_info(self):
        seeds = [
            Request('http://example.com'),
            Request('https://www.google.com'),
        ]

        mware = DomainMiddleware(self.fake_manager)
        result = mware.add_seeds(seeds)

        self.assertEqual(len(result), len(seeds))

        for r in result:
            self.assertIn('domain', r.meta, 'Missing domain info for %r' % r)

        # Fast parser leaves the tldextract-only fields empty.
        expected = [
            {'name': 'example.com', 'netloc': 'example.com', 'scheme': 'http',
             'sld': '', 'subdomain': '', 'tld': ''},
            {'name': 'www.google.com', 'netloc': 'www.google.com', 'scheme': 'https',
             'sld': '', 'subdomain': '', 'tld': ''},
        ]
        self.assertEqual(expected, [r.meta['domain'] for r in result])

    def test_should_parse_tldextract_extra_domain_info(self):
        seeds = [
            Request('http://example.com'),
            Request('https://www.google.com'),
        ]

        # Opt in to the tldextract-based parser.
        self.fake_manager.settings = {'TLDEXTRACT_DOMAIN_INFO': True}

        mware = DomainMiddleware(self.fake_manager)
        result = mware.add_seeds(seeds)

        self.assertEqual(len(result), len(seeds))

        for r in result:
            self.assertIn('domain', r.meta, 'Missing domain info for %r' % r)

        # tldextract fills sld/tld/subdomain and strips the subdomain
        # from 'name'.
        expected = [
            {'name': 'example.com', 'netloc': 'example.com', 'scheme': 'http',
             'sld': 'example', 'subdomain': '', 'tld': 'com'},
            {'name': 'google.com', 'netloc': 'www.google.com', 'scheme': 'https',
             'sld': 'google', 'subdomain': 'www', 'tld': 'com'},
        ]
        self.assertEqual(expected, [r.meta['domain'] for r in result])
2 changes: 1 addition & 1 deletion frontera/utils/url.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@
import hashlib
from six import moves
from w3lib.util import unicode_to_str
import tldextract


# Python 2.x urllib.always_safe become private in Python 3.x;
Expand Down Expand Up @@ -39,6 +38,7 @@ def parse_domain_from_url(url):
https://google.es/mail google.es google.es https google es
-------------------------------------------------------------------------------------------------------
"""
import tldextract
extracted = tldextract.extract(url)
scheme, _, _, _, _, _ = parse_url(url)

Expand Down
3 changes: 1 addition & 2 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
six>=1.8.0
w3lib>=1.10.0
tldextract>=1.5.1
SQLAlchemy>=0.9.8
SQLAlchemy>=0.9.8
1 change: 1 addition & 0 deletions requirements/tests.txt
Original file line number Diff line number Diff line change
Expand Up @@ -3,3 +3,4 @@ MySQL-python>=1.2.5
PyMySQL>=0.6.3
psycopg2>=2.5.4
scrapy>=0.24
-r tldextract.txt
1 change: 1 addition & 0 deletions requirements/tldextract.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
tldextract>=1.5.1
7 changes: 5 additions & 2 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,6 @@
install_requires=[
'six>=1.8.0',
'w3lib>=1.10.0',
'tldextract>=1.5.1',
'SQLAlchemy>=0.9.8'
],
extras_require={
Expand All @@ -49,12 +48,16 @@
'logging': [
"colorlog>=2.4.0",
],
'tldextract': [
'tldextract>=1.5.1',
]
},
tests_require=[
"pytest>=2.6.4",
"MySQL-python>=1.2.5",
"PyMySQL>=0.6.3",
"psycopg2>=2.5.4",
"scrapy>=0.24"
"scrapy>=0.24",
"tldextract>=1.5.1",
]
)

0 comments on commit 065b69a

Please sign in to comment.