Custom spider argument support #14

Open · wants to merge 2 commits into master
40 changes: 37 additions & 3 deletions arachnado/handlers.py
@@ -76,6 +76,19 @@ def find_spider_cls(spider_name, spider_packages):
return spider_cls


def set_spider_class_args(spider_cls, **kwargs):
""" Creates new spider class based on given spider class to keep the
original spider class consistent.

:param spider_cls: Original spider class
:param kwargs: Custom spider attributes to be set for this particular crawl
"""
if kwargs:
return type(
'{}Customized'.format(spider_cls.__name__), (spider_cls,), kwargs)
return spider_cls
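
For context, a minimal sketch of how this helper behaves; ExampleSpider and the max_depth argument below are invented for illustration and are not part of the patch:

import scrapy

class ExampleSpider(scrapy.Spider):
    name = 'example'

# With keyword arguments, a throwaway subclass carries them as class attributes:
customized = set_spider_class_args(ExampleSpider, max_depth=3)
assert customized.__name__ == 'ExampleSpiderCustomized'
assert issubclass(customized, ExampleSpider)
assert customized.max_depth == 3
assert not hasattr(ExampleSpider, 'max_depth')  # original class untouched

# Without keyword arguments, the original class is returned unchanged:
assert set_spider_class_args(ExampleSpider) is ExampleSpider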


class BaseRequestHandler(RequestHandler):

def initialize(self, crawler_process, opts):
@@ -91,6 +104,25 @@ def render(self, *args, **kwargs):
kwargs['initial_process_stats_json'] = json_encode(proc_stats)
return super(BaseRequestHandler, self).render(*args, **kwargs)

def get_custom_spider_arguments(self, ignore=None):
""" Returns custom spider arguments embedded in request

:param ignore: List of argument names to ignore. Defaults to ['domain'].
"""
ignore = ignore or ['domain']
if self.json_args:
args = self.json_args
getter = self.json_args.get
else:
content_type = self.request.headers.get('Content-Type', '').lower()
if 'x-www-form-urlencoded' in content_type:
args = self.request.body_arguments
getter = self.get_body_argument
else:
args = self.request.query_arguments
getter = self.get_query_argument
return {arg: getter(arg) for arg in args if arg not in ignore}
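
To illustrate the three request styles this method handles, here is a hypothetical client-side sketch; the http://localhost:8888/crawler/start URL and the max_depth argument are assumptions used only for illustration (the actual route depends on Arachnado's URL configuration):

import requests

url = 'http://localhost:8888/crawler/start'

# 1. JSON body -> read from self.json_args
requests.post(url, json={'domain': 'example.com', 'max_depth': 3})

# 2. Form-encoded body -> read via self.get_body_argument
requests.post(url, data={'domain': 'example.com', 'max_depth': 3})

# 3. Query string -> read via self.get_query_argument
requests.post(url, params={'domain': 'example.com', 'max_depth': 3})

# In each case get_custom_spider_arguments() returns {'max_depth': ...}
# and drops 'domain', which is in the default ignore list.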


class Index(NoEtagsMixin, BaseRequestHandler):

@@ -109,7 +141,7 @@ class StartCrawler(ApiHandler, BaseRequestHandler):
"""
This endpoint starts crawling for a domain.
"""
def crawl(self, domain):
def crawl(self, domain, **kwargs):
storage_opts = self.opts['arachnado.storage']
settings = {
'MOTOR_PIPELINE_ENABLED': storage_opts['enabled'],
@@ -120,22 +152,24 @@ def crawl(self, domain):
spider_cls = get_spider_cls(domain, self._get_spider_package_names())

if spider_cls is not None:
spider_cls = set_spider_class_args(spider_cls, **kwargs)
self.crawler = create_crawler(settings, spider_cls=spider_cls)
self.crawler_process.crawl(self.crawler, domain=domain)
return True
return False

def post(self):
spider_args = self.get_custom_spider_arguments()
if self.is_json:
domain = self.json_args['domain']
if self.crawl(domain):
if self.crawl(domain, **spider_args):
self.write({"status": "ok",
"job_id": self.crawler.spider.crawl_id})
else:
self.write({"status": "error"})
else:
domain = self.get_body_argument('domain')
if self.crawl(domain):
if self.crawl(domain, **spider_args):
self.redirect("/")
else:
raise HTTPError(400)
28 changes: 28 additions & 0 deletions arachnado/mixins.py
@@ -0,0 +1,28 @@
import logging
import datetime


class ArachnadoSpiderMixin(object):
"""
An Arachnado spider mixin that contains common attributes and utilities
for all Arachnado spiders.
"""
crawl_id = None
domain = None
motor_job_id = None

def __init__(self, *args, **kwargs):
super(ArachnadoSpiderMixin, self).__init__(*args, **kwargs)
# don't log scraped items
logging.getLogger("scrapy.core.scraper").setLevel(logging.INFO)

def get_page_item(self, response, type_='page'):
return {
'crawled_at': datetime.datetime.utcnow(),
'url': response.url,
'status': response.status,
'headers': response.headers,
'body': response.body_as_unicode(),
'meta': response.meta,
'_type': type_,
}
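
As an illustration of how the mixin is meant to be composed into a spider (the DocsSpider name and its start_urls are hypothetical, not part of the patch):

import scrapy
from arachnado.mixins import ArachnadoSpiderMixin

class DocsSpider(scrapy.Spider, ArachnadoSpiderMixin):
    name = 'docs'
    start_urls = ['http://example.com/']

    def parse(self, response):
        # get_page_item() builds the standard Arachnado item dict:
        # crawled_at, url, status, headers, body, meta and _type.
        yield self.get_page_item(response)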
32 changes: 7 additions & 25 deletions arachnado/spider.py
@@ -1,14 +1,14 @@
# -*- coding: utf-8 -*-
from __future__ import absolute_import
import datetime
import logging

import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.http.response.html import HtmlResponse
from scrapy.spiders.crawl import CrawlSpider

from .utils import MB, add_scheme_if_missing, get_netloc
from .crawler_process import ArachnadoCrawler
from .mixins import ArachnadoSpiderMixin


DEFAULT_SETTINGS = {
@@ -68,30 +68,12 @@ def create_crawler(settings=None, spider_cls=None):
return ArachnadoCrawler(spider_cls, _settings)


class ArachnadoSpider(scrapy.Spider):
"""
A base spider that contains common attributes and utilities for all
Arachnado spiders
"""
crawl_id = None
domain = None
motor_job_id = None
class ArachnadoSpider(scrapy.Spider, ArachnadoSpiderMixin):
pass

def __init__(self, *args, **kwargs):
super(ArachnadoSpider, self).__init__(*args, **kwargs)
# don't log scraped items
logging.getLogger("scrapy.core.scraper").setLevel(logging.INFO)

def get_page_item(self, response, type_='page'):
return {
'crawled_at': datetime.datetime.utcnow(),
'url': response.url,
'status': response.status,
'headers': response.headers,
'body': response.body_as_unicode(),
'meta': response.meta,
'_type': type_,
}

class ArachnadoCrawlSpider(CrawlSpider, ArachnadoSpiderMixin):
pass


class CrawlWebsiteSpider(ArachnadoSpider):
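
To tie the pieces together, a hypothetical spider built on the new ArachnadoCrawlSpider, showing how an argument passed through set_spider_class_args ends up as a plain class attribute for a single crawl; DocsCrawlSpider and max_pages are invented names, not part of the patch:

from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import Rule

from arachnado.handlers import set_spider_class_args
from arachnado.spider import ArachnadoCrawlSpider


class DocsCrawlSpider(ArachnadoCrawlSpider):
    name = 'docs-crawl'
    rules = (Rule(LinkExtractor(), callback='parse_page', follow=True),)
    max_pages = 100  # default; can be overridden per crawl via the API

    def parse_page(self, response):
        yield self.get_page_item(response)


# When StartCrawler receives max_pages in the request, it does roughly:
customized = set_spider_class_args(DocsCrawlSpider, max_pages=500)
assert customized.max_pages == 500        # applies to this crawl only
assert DocsCrawlSpider.max_pages == 100   # the shared class is untouched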