From e432b1f9611e569643f0dc29e91f4ece3da21658 Mon Sep 17 00:00:00 2001 From: Ubasic Date: Mon, 31 Mar 2014 10:45:46 +0800 Subject: [PATCH] Init Commit --- .gitignore | 2 + docs/Makefile | 177 +++++++++++++++++++ docs/conf.py | 343 ++++++++++++++++++++++++++++++++++++ docs/index.rst | 324 ++++++++++++++++++++++++++++++++++ docs/modules/grapy.core.rst | 70 ++++++++ docs/modules/grapy.rst | 45 +++++ docs/modules/modules.rst | 7 + grapy/__init__.py | 5 + grapy/core/__init__.py | 9 + grapy/core/base_sched.py | 35 ++++ grapy/core/base_spider.py | 31 ++++ grapy/core/engine.py | 150 ++++++++++++++++ grapy/core/exceptions.py | 16 ++ grapy/core/item.py | 170 ++++++++++++++++++ grapy/core/request.py | 122 +++++++++++++ grapy/core/response.py | 209 ++++++++++++++++++++++ grapy/logging.py | 2 + grapy/sched.py | 44 +++++ grapy/utils.py | 60 +++++++ setup.py | 24 +++ 20 files changed, 1845 insertions(+) create mode 100644 .gitignore create mode 100644 docs/Makefile create mode 100644 docs/conf.py create mode 100644 docs/index.rst create mode 100644 docs/modules/grapy.core.rst create mode 100644 docs/modules/grapy.rst create mode 100644 docs/modules/modules.rst create mode 100644 grapy/__init__.py create mode 100644 grapy/core/__init__.py create mode 100644 grapy/core/base_sched.py create mode 100644 grapy/core/base_spider.py create mode 100644 grapy/core/engine.py create mode 100644 grapy/core/exceptions.py create mode 100644 grapy/core/item.py create mode 100644 grapy/core/request.py create mode 100644 grapy/core/response.py create mode 100644 grapy/logging.py create mode 100644 grapy/sched.py create mode 100644 grapy/utils.py create mode 100755 setup.py diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..ec85cda --- /dev/null +++ b/.gitignore @@ -0,0 +1,2 @@ +*.swp +*.py[co] diff --git a/docs/Makefile b/docs/Makefile new file mode 100644 index 0000000..bff84ec --- /dev/null +++ b/docs/Makefile @@ -0,0 +1,177 @@ +# Makefile for Sphinx documentation +# + +# You can set these variables from the command line. +SPHINXOPTS = +SPHINXBUILD = sphinx-build +PAPER = +BUILDDIR = _build + +# User-friendly check for sphinx-build +ifeq ($(shell which $(SPHINXBUILD) >/dev/null 2>&1; echo $$?), 1) +$(error The '$(SPHINXBUILD)' command was not found. Make sure you have Sphinx installed, then set the SPHINXBUILD environment variable to point to the full path of the '$(SPHINXBUILD)' executable. Alternatively you can add the directory with the executable to your PATH. If you don't have Sphinx installed, grab it from http://sphinx-doc.org/) +endif + +# Internal variables. +PAPEROPT_a4 = -D latex_paper_size=a4 +PAPEROPT_letter = -D latex_paper_size=letter +ALLSPHINXOPTS = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) . +# the i18n builder cannot share the environment and doctrees with the others +I18NSPHINXOPTS = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) . 
+ +.PHONY: help clean html dirhtml singlehtml pickle json htmlhelp qthelp devhelp epub latex latexpdf text man changes linkcheck doctest gettext + +help: + @echo "Please use \`make ' where is one of" + @echo " html to make standalone HTML files" + @echo " dirhtml to make HTML files named index.html in directories" + @echo " singlehtml to make a single large HTML file" + @echo " pickle to make pickle files" + @echo " json to make JSON files" + @echo " htmlhelp to make HTML files and a HTML help project" + @echo " qthelp to make HTML files and a qthelp project" + @echo " devhelp to make HTML files and a Devhelp project" + @echo " epub to make an epub" + @echo " latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter" + @echo " latexpdf to make LaTeX files and run them through pdflatex" + @echo " latexpdfja to make LaTeX files and run them through platex/dvipdfmx" + @echo " text to make text files" + @echo " man to make manual pages" + @echo " texinfo to make Texinfo files" + @echo " info to make Texinfo files and run them through makeinfo" + @echo " gettext to make PO message catalogs" + @echo " changes to make an overview of all changed/added/deprecated items" + @echo " xml to make Docutils-native XML files" + @echo " pseudoxml to make pseudoxml-XML files for display purposes" + @echo " linkcheck to check all external links for integrity" + @echo " doctest to run all doctests embedded in the documentation (if enabled)" + +clean: + rm -rf $(BUILDDIR)/* + +html: + $(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html + @echo + @echo "Build finished. The HTML pages are in $(BUILDDIR)/html." + +dirhtml: + $(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml + @echo + @echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml." + +singlehtml: + $(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml + @echo + @echo "Build finished. The HTML page is in $(BUILDDIR)/singlehtml." + +pickle: + $(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle + @echo + @echo "Build finished; now you can process the pickle files." + +json: + $(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json + @echo + @echo "Build finished; now you can process the JSON files." + +htmlhelp: + $(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp + @echo + @echo "Build finished; now you can run HTML Help Workshop with the" \ + ".hhp project file in $(BUILDDIR)/htmlhelp." + +qthelp: + $(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp + @echo + @echo "Build finished; now you can run "qcollectiongenerator" with the" \ + ".qhcp project file in $(BUILDDIR)/qthelp, like this:" + @echo "# qcollectiongenerator $(BUILDDIR)/qthelp/crawl.qhcp" + @echo "To view the help file:" + @echo "# assistant -collectionFile $(BUILDDIR)/qthelp/crawl.qhc" + +devhelp: + $(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp + @echo + @echo "Build finished." + @echo "To view the help file:" + @echo "# mkdir -p $$HOME/.local/share/devhelp/crawl" + @echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/crawl" + @echo "# devhelp" + +epub: + $(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub + @echo + @echo "Build finished. The epub file is in $(BUILDDIR)/epub." + +latex: + $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex + @echo + @echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex." + @echo "Run \`make' in that directory to run these through (pdf)latex" \ + "(use \`make latexpdf' here to do that automatically)." 
+ +latexpdf: + $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex + @echo "Running LaTeX files through pdflatex..." + $(MAKE) -C $(BUILDDIR)/latex all-pdf + @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." + +latexpdfja: + $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex + @echo "Running LaTeX files through platex and dvipdfmx..." + $(MAKE) -C $(BUILDDIR)/latex all-pdf-ja + @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." + +text: + $(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text + @echo + @echo "Build finished. The text files are in $(BUILDDIR)/text." + +man: + $(SPHINXBUILD) -b man $(ALLSPHINXOPTS) $(BUILDDIR)/man + @echo + @echo "Build finished. The manual pages are in $(BUILDDIR)/man." + +texinfo: + $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo + @echo + @echo "Build finished. The Texinfo files are in $(BUILDDIR)/texinfo." + @echo "Run \`make' in that directory to run these through makeinfo" \ + "(use \`make info' here to do that automatically)." + +info: + $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo + @echo "Running Texinfo files through makeinfo..." + make -C $(BUILDDIR)/texinfo info + @echo "makeinfo finished; the Info files are in $(BUILDDIR)/texinfo." + +gettext: + $(SPHINXBUILD) -b gettext $(I18NSPHINXOPTS) $(BUILDDIR)/locale + @echo + @echo "Build finished. The message catalogs are in $(BUILDDIR)/locale." + +changes: + $(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes + @echo + @echo "The overview file is in $(BUILDDIR)/changes." + +linkcheck: + $(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck + @echo + @echo "Link check complete; look for any errors in the above output " \ + "or in $(BUILDDIR)/linkcheck/output.txt." + +doctest: + $(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest + @echo "Testing of doctests in the sources finished, look at the " \ + "results in $(BUILDDIR)/doctest/output.txt." + +xml: + $(SPHINXBUILD) -b xml $(ALLSPHINXOPTS) $(BUILDDIR)/xml + @echo + @echo "Build finished. The XML files are in $(BUILDDIR)/xml." + +pseudoxml: + $(SPHINXBUILD) -b pseudoxml $(ALLSPHINXOPTS) $(BUILDDIR)/pseudoxml + @echo + @echo "Build finished. The pseudo-XML files are in $(BUILDDIR)/pseudoxml." diff --git a/docs/conf.py b/docs/conf.py new file mode 100644 index 0000000..7bd0e7f --- /dev/null +++ b/docs/conf.py @@ -0,0 +1,343 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +# +# grapy documentation build configuration file, created by +# sphinx-quickstart on Thu Dec 5 10:47:15 2013. +# +# This file is execfile()d with the current directory set to its +# containing dir. +# +# Note that not all possible configuration values are present in this +# autogenerated file. +# +# All configuration values have a default; values that are commented out +# serve to show the default. + +import sys +import os + +# If extensions (or modules to document with autodoc) are in another directory, +# add these directories to sys.path here. If the directory is relative to the +# documentation root, use os.path.abspath to make it absolute, like shown here. +#sys.path.insert(0, os.path.abspath('.')) + +# -- General configuration ------------------------------------------------ + +# If your documentation needs a minimal Sphinx version, state it here. +#needs_sphinx = '1.0' + +# Add any Sphinx extension module names here, as strings. They can be +# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom +# ones. 
+extensions = [ + 'sphinx.ext.autodoc', + 'sphinx.ext.doctest', + 'sphinx.ext.intersphinx', + 'sphinx.ext.todo', + 'sphinx.ext.coverage', + 'sphinx.ext.pngmath', + # 'sphinx.ext.mathjax', + 'sphinx.ext.ifconfig', + 'sphinx.ext.viewcode', +] + +# Add any paths that contain templates here, relative to this directory. +templates_path = ['_templates'] + +# The suffix of source filenames. +source_suffix = '.rst' + +# The encoding of source files. +#source_encoding = 'utf-8-sig' + +# The master toctree document. +master_doc = 'index' + +# General information about the project. +project = 'grapy' +copyright = '2013, Li Meng Jun' + +# The version info for the project you're documenting, acts as replacement for +# |version| and |release|, also used in various other places throughout the +# built documents. +# +# The short X.Y version. +version = '0.1.5' +# The full version, including alpha/beta/rc tags. +release = '0.1.5' + +# The language for content autogenerated by Sphinx. Refer to documentation +# for a list of supported languages. +#language = None + +# There are two options for replacing |today|: either, you set today to some +# non-false value, then it is used: +#today = '' +# Else, today_fmt is used as the format for a strftime call. +#today_fmt = '%B %d, %Y' + +# List of patterns, relative to source directory, that match files and +# directories to ignore when looking for source files. +exclude_patterns = ['_build'] + +# The reST default role (used for this markup: `text`) to use for all +# documents. +#default_role = None + +# If true, '()' will be appended to :func: etc. cross-reference text. +#add_function_parentheses = True + +# If true, the current module name will be prepended to all description +# unit titles (such as .. function::). +#add_module_names = True + +# If true, sectionauthor and moduleauthor directives will be shown in the +# output. They are ignored by default. +#show_authors = False + +# The name of the Pygments (syntax highlighting) style to use. +pygments_style = 'sphinx' + +# A list of ignored prefixes for module index sorting. +#modindex_common_prefix = [] + +# If true, keep warnings as "system message" paragraphs in the built documents. +#keep_warnings = False + + +# -- Options for HTML output ---------------------------------------------- + +# The theme to use for HTML and HTML Help pages. See the documentation for +# a list of builtin themes. +html_theme = 'default' + +# Theme options are theme-specific and customize the look and feel of a theme +# further. For a list of options available for each theme, see the +# documentation. +#html_theme_options = {} + +# Add any paths that contain custom themes here, relative to this directory. +#html_theme_path = [] + +# The name for this set of Sphinx documents. If None, it defaults to +# " v documentation". +#html_title = None + +# A shorter title for the navigation bar. Default is the same as html_title. +#html_short_title = None + +# The name of an image file (relative to this directory) to place at the top +# of the sidebar. +#html_logo = None + +# The name of an image file (within the static path) to use as favicon of the +# docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32 +# pixels large. +#html_favicon = None + +# Add any paths that contain custom static files (such as style sheets) here, +# relative to this directory. They are copied after the builtin static files, +# so a file named "default.css" will overwrite the builtin "default.css". 
+html_static_path = ['_static'] + +# Add any extra paths that contain custom files (such as robots.txt or +# .htaccess) here, relative to this directory. These files are copied +# directly to the root of the documentation. +#html_extra_path = [] + +# If not '', a 'Last updated on:' timestamp is inserted at every page bottom, +# using the given strftime format. +#html_last_updated_fmt = '%b %d, %Y' + +# If true, SmartyPants will be used to convert quotes and dashes to +# typographically correct entities. +#html_use_smartypants = True + +# Custom sidebar templates, maps document names to template names. +#html_sidebars = {} + +# Additional templates that should be rendered to pages, maps page names to +# template names. +#html_additional_pages = {} + +# If false, no module index is generated. +#html_domain_indices = True + +# If false, no index is generated. +#html_use_index = True + +# If true, the index is split into individual pages for each letter. +#html_split_index = False + +# If true, links to the reST sources are added to the pages. +#html_show_sourcelink = True + +# If true, "Created using Sphinx" is shown in the HTML footer. Default is True. +#html_show_sphinx = True + +# If true, "(C) Copyright ..." is shown in the HTML footer. Default is True. +#html_show_copyright = True + +# If true, an OpenSearch description file will be output, and all pages will +# contain a tag referring to it. The value of this option must be the +# base URL from which the finished HTML is served. +#html_use_opensearch = '' + +# This is the file name suffix for HTML files (e.g. ".xhtml"). +#html_file_suffix = None + +# Output file base name for HTML help builder. +htmlhelp_basename = 'grapydoc' + + +# -- Options for LaTeX output --------------------------------------------- + +latex_elements = { +# The paper size ('letterpaper' or 'a4paper'). +#'papersize': 'letterpaper', + +# The font size ('10pt', '11pt' or '12pt'). +#'pointsize': '10pt', + +# Additional stuff for the LaTeX preamble. +#'preamble': '', +} + +# Grouping the document tree into LaTeX files. List of tuples +# (source start file, target name, title, +# author, documentclass [howto, manual, or own class]). +latex_documents = [ + ('index', 'grapy.tex', 'grapy Documentation', + 'Li Meng Jun', 'manual'), +] + +# The name of an image file (relative to this directory) to place at the top of +# the title page. +#latex_logo = None + +# For "manual" documents, if this is true, then toplevel headings are parts, +# not chapters. +#latex_use_parts = False + +# If true, show page references after internal links. +#latex_show_pagerefs = False + +# If true, show URL addresses after external links. +#latex_show_urls = False + +# Documents to append as an appendix to all manuals. +#latex_appendices = [] + +# If false, no module index is generated. +#latex_domain_indices = True + + +# -- Options for manual page output --------------------------------------- + +# One entry per manual page. List of tuples +# (source start file, name, description, authors, manual section). +man_pages = [ + ('index', 'grapy', 'grapy Documentation', + ['Li Meng Jun'], 1) +] + +# If true, show URL addresses after external links. +#man_show_urls = False + + +# -- Options for Texinfo output ------------------------------------------- + +# Grouping the document tree into Texinfo files. 
List of tuples +# (source start file, target name, title, author, +# dir menu entry, description, category) +texinfo_documents = [ + ('index', 'grapy', 'grapy Documentation', + 'Li Meng Jun', 'grapy', 'One line description of project.', + 'Miscellaneous'), +] + +# Documents to append as an appendix to all manuals. +#texinfo_appendices = [] + +# If false, no module index is generated. +#texinfo_domain_indices = True + +# How to display URL addresses: 'footnote', 'no', or 'inline'. +#texinfo_show_urls = 'footnote' + +# If true, do not generate a @detailmenu in the "Top" node's menu. +#texinfo_no_detailmenu = False + + +# -- Options for Epub output ---------------------------------------------- + +# Bibliographic Dublin Core info. +epub_title = 'Grapy' +epub_author = 'Li Meng Jun' +epub_publisher = 'Li Meng Jun' +epub_copyright = '2013, Li Meng Jun' + +# The basename for the epub file. It defaults to the project name. +#epub_basename = 'grapy' + +# The HTML theme for the epub output. Since the default themes are not optimized +# for small screen space, using the same theme for HTML and epub output is +# usually not wise. This defaults to 'epub', a theme designed to save visual +# space. +#epub_theme = 'epub' + +# The language of the text. It defaults to the language option +# or en if the language is not set. +#epub_language = '' + +# The scheme of the identifier. Typical schemes are ISBN or URL. +#epub_scheme = '' + +# The unique identifier of the text. This can be a ISBN number +# or the project homepage. +#epub_identifier = '' + +# A unique identification for the text. +#epub_uid = '' + +# A tuple containing the cover image and cover page html template filenames. +#epub_cover = () + +# A sequence of (type, uri, title) tuples for the guide element of content.opf. +#epub_guide = () + +# HTML files that should be inserted before the pages created by sphinx. +# The format is a list of tuples containing the path and title. +#epub_pre_files = [] + +# HTML files shat should be inserted after the pages created by sphinx. +# The format is a list of tuples containing the path and title. +#epub_post_files = [] + +# A list of files that should not be packed into the epub file. +#epub_exclude_files = [] + +# The depth of the table of contents in toc.ncx. +#epub_tocdepth = 3 + +# Allow duplicate toc entries. +#epub_tocdup = True + +# Choose between 'default' and 'includehidden'. +#epub_tocscope = 'default' + +# Fix unsupported image types using the PIL. +#epub_fix_images = False + +# Scale large images. +#epub_max_image_width = 0 + +# How to display URL addresses: 'footnote', 'no', or 'inline'. +#epub_show_urls = 'inline' + +# If false, no index is generated. +#epub_use_index = True + + +# Example configuration for intersphinx: refer to the Python standard library. +intersphinx_mapping = {'http://docs.python.org/': None} diff --git a/docs/index.rst b/docs/index.rst new file mode 100644 index 0000000..f9012cc --- /dev/null +++ b/docs/index.rst @@ -0,0 +1,324 @@ +.. grapy documentation master file, created by + sphinx-quickstart on Thu Dec 5 10:47:15 2013. + You can adapt this file completely to your liking, but it should at least + contain the root `toctree` directive. + +Welcome to grapy's documentation! +================================= + +Contents: + +.. 
toctree::
+   :maxdepth: 2
+
+
+
+Indices and tables
+==================
+
+* :ref:`genindex`
+* :ref:`modindex`
+* :ref:`search`
+
+===============
+Grapy Tutorial
+===============
+
+In this tutorial, we'll assume that Grapy is already installed on your system.
+If that's not the case, see :ref:`intro-install`.
+
+We are going to use `Open directory project (dmoz) <http://www.dmoz.org/>`_ as
+our example domain to scrape.
+
+This tutorial will walk you through these tasks:
+
+1. Creating a new Grapy project
+2. Defining the Items you will extract
+3. Writing a :ref:`spider <topics-spiders>` to crawl a site and extract
+   :ref:`Items <topics-items>`
+4. Writing an :ref:`Item Pipeline <topics-item-pipeline>` to store the
+   extracted Items
+
+Grapy is written in Python_. If you're new to the language you might want to
+start by getting an idea of what the language is like, to get the most out of
+Grapy. If you're already familiar with other languages and want to learn
+Python quickly, we recommend `Learn Python The Hard Way`_. If you're new to programming
+and want to start with Python, take a look at `this list of Python resources
+for non-programmers`_.
+
+.. _Python: http://www.python.org
+.. _this list of Python resources for non-programmers: http://wiki.python.org/moin/BeginnersGuide/NonProgrammers
+.. _Learn Python The Hard Way: http://learnpythonthehardway.org/book/
+
+Creating a project
+==================
+
+Before you start crawling, you will have to set up a new Grapy project. Enter a
+directory where you'd like to store your code and then run::
+
+    mkdir tutorial
+    mkdir tutorial/spiders
+    touch tutorial/__init__.py
+    touch tutorial/items.py
+    touch tutorial/pipelines.py
+    touch tutorial/middlewares.py
+    touch tutorial/spiders/__init__.py
+    touch config.py
+    touch main.py
+
+These are basically:
+
+* ``config.py``: the project configuration file
+* ``tutorial/``: the project's Python module; you'll later import your code from
+  here.
+* ``tutorial/items.py``: the project's items file.
+* ``tutorial/pipelines.py``: the project's pipelines file.
+* ``tutorial/middlewares.py``: the project's middlewares file.
+* ``tutorial/spiders/``: a directory where you'll later put your spiders.
+
+Defining our Item
+=================
+
+``Item`` objects are containers that will be loaded with the crawled data; they work
+like simple Python dicts but provide additional protection against populating
+undeclared fields, to prevent typos.
+
+They are declared by subclassing :class:`grapy.core.Item` and listing the fields in
+:attr:`grapy.core.Item._fields`, much like you would in an ORM
+(don't worry if you're not familiar with ORMs; you will see that this is an
+easy task).
+
+We begin by modeling the item that we will use to hold the site data obtained
+from dmoz.org. As we want to capture the name, url and description of the
+sites, we define fields for each of these three attributes. To do that, we edit
+items.py, found in the ``tutorial`` directory. Our Item class looks like this::
+
+    from grapy.core import Item
+
+    class DmozItem(Item):
+        _fields = [
+            {'name': 'title', 'type': 'str'},
+            {'name': 'link', 'type': 'str'},
+            {'name': 'desc', 'type': 'str'}
+        ]
+
+This may seem complicated at first, but defining the item allows you to use other handy
+components of Grapy that need to know what your item looks like.
+
+Our first Spider
+================
+
+Spiders are user-written classes used to crawl information from a domain (or group
+of domains).
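+
+For example, here is a rough sketch (the spider name and URL are placeholders,
+and it only relies on the ``BaseSpider``, ``Request`` and ``Response.soup``
+APIs added later in this patch) of a tiny spider that simply follows every
+absolute link it finds::
+
+    from grapy.core import BaseSpider, Request
+
+    class LinkSpider(BaseSpider):
+        name = "links"                        # placeholder, must be unique
+        start_urls = ["http://example.com/"]  # placeholder start URL
+
+        def parse(self, response):
+            # response.soup is a BeautifulSoup object; yield a Request for
+            # every absolute link so the engine schedules it and calls
+            # parse() on the downloaded page as well
+            for a in response.soup.find_all('a', href=True):
+                href = a.get('href')
+                if href.startswith('http'):
+                    yield Request(href)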
+
+They define an initial list of URLs to download, how to follow links, and how
+to parse the contents of those pages to extract :ref:`items <topics-items>`.
+
+To create a Spider, you must subclass :class:`grapy.core.BaseSpider`, and
+define the three main, mandatory attributes:
+
+* :attr:`~grapy.core.BaseSpider.name`: identifies the Spider. It must be
+  unique, that is, you can't set the same name for different Spiders.
+
+* :attr:`~grapy.core.BaseSpider.start_urls`: is a list of URLs where the
+  Spider will begin to crawl from. So, the first pages downloaded will be those
+  listed here. The subsequent URLs will be generated successively from data
+  contained in the start URLs.
+
+* :meth:`~grapy.core.BaseSpider.parse` is a method of the spider, which will
+  be called with the downloaded :class:`~grapy.core.Response` object of each
+  start URL. The response is passed to the method as the first and only
+  argument.
+
+  This method is responsible for parsing the response data and extracting
+  crawled data (as crawled items) and more URLs to follow.
+
+  The :meth:`~grapy.core.BaseSpider.parse` method is in charge of processing
+  the response and returning crawled data (as :class:`~grapy.core.Item`
+  objects) and more URLs to follow (as :class:`~grapy.core.Request` objects).
+
+This is the code for our first Spider; save it in a file named
+``dmoz_spider.py`` under the ``tutorial/spiders`` directory::
+
+    from grapy.core import BaseSpider
+
+    class DmozSpider(BaseSpider):
+        name = "dmoz"
+        start_urls = [
+            "http://www.dmoz.org/Computers/Programming/Languages/Python/Books/",
+            "http://www.dmoz.org/Computers/Programming/Languages/Python/Resources/"
+        ]
+
+        def parse(self, response):
+            filename = response.url.split("/")[-2]
+            open(filename, 'wb').write(response.content)
+
+
+Crawling
+========
+
+To put our spider to work, go to the project's top level directory and edit ``main.py``::
+
+    from grapy import engine
+    from grapy.sched import Scheduler
+    from tutorial.spiders.dmoz_spider import DmozSpider
+
+    sched = Scheduler()
+    engine.set_sched(sched)
+    engine.set_spiders([DmozSpider()])
+
+    engine.start()
+
+then::
+
+    python3 main.py
+
+But more interestingly, as our ``parse`` method instructs, two files have been
+created: *Books* and *Resources*, with the content of both URLs.
+
+What just happened under the hood?
+==================================
+
+Grapy creates :class:`grapy.core.Request` objects for each URL in the
+``start_urls`` attribute of the Spider, and assigns them the ``parse`` method of
+the spider as their callback function.
+
+These Requests are scheduled, then executed, and
+:class:`grapy.core.Response` objects are returned and then fed back to the
+spider, through the :meth:`~grapy.core.BaseSpider.parse` method.
+
+Extracting Items
+================
+
+There are several ways to extract data from web pages.
+Grapy uses :attr:`~grapy.core.Response.soup` and
+:meth:`~grapy.core.Response.select`, which are based on `BeautifulSoup`_.
+
+Let's add this code to our spider::
+
+    from grapy.core import BaseSpider
+
+    class DmozSpider(BaseSpider):
+        name = "dmoz"
+        start_urls = [
+            "http://www.dmoz.org/Computers/Programming/Languages/Python/Books/",
+            "http://www.dmoz.org/Computers/Programming/Languages/Python/Resources/"
+        ]
+
+        def parse(self, response):
+            for site in response.select('ul li'):
+                elem = site.find('a')
+                if elem:
+                    title = elem.get_text()
+                    link = elem.get('href')
+                    desc = site.get_text()
+                    print(title, link, desc)
+
+Now try crawling the dmoz.org domain again and you'll see sites being printed
+in your output; run::
+
+    python3 main.py
+
+Using our item
+==============
+
+:class:`~grapy.core.Item` objects are custom Python dicts; you can access the
+values of their fields (attributes of the class we defined earlier) using the
+standard dict syntax like::
+
+    >>> item = DmozItem()
+    >>> item['title'] = 'Example title'
+    >>> item['title']
+    'Example title'
+    >>> item.title
+    'Example title'
+
+Spiders are expected to return their crawled data inside
+:class:`~grapy.core.Item` objects. So, in order to return the data we've
+crawled so far, the final code for our Spider would be like this::
+
+    from grapy.core import BaseSpider
+    from tutorial.items import DmozItem
+
+    class DmozSpider(BaseSpider):
+        name = "dmoz"
+        start_urls = [
+            "http://www.dmoz.org/Computers/Programming/Languages/Python/Books/",
+            "http://www.dmoz.org/Computers/Programming/Languages/Python/Resources/"
+        ]
+
+        def parse(self, response):
+            items = []
+            for site in response.select('ul li'):
+                elem = site.find('a')
+                if elem:
+                    item = DmozItem()
+                    item['title'] = elem.get_text()
+                    item['link'] = elem.get('href')
+                    item['desc'] = site.get_text()
+                    items.append(item)
+
+            return items
+
+Next steps
+==========
+
+This tutorial covers only the basics of Grapy, but there's a lot of other
+features not mentioned here.
+
+.. _intro-install:
+
+==================
+Installation guide
+==================
+
+Pre-requisites
+==============
+
+The installation steps assume that you have the following things installed:
+
+* `Python`_ 3.3
+* `asyncio`_ Python 3 async library
+* `aiohttp`_ http client/server for asyncio
+* `BeautifulSoup`_ Beautiful Soup: We called him Tortoise because he taught us
+* `aiogear`_ Gearman client/worker for asyncio
+* `pip`_ or `easy_install`_ Python package managers
+* `Gearman`_ Gearman Job Server
+
+Installing Grapy
+================
+
+To install from source::
+
+    git clone ssh://gitlab@gitlab.widget-inc.com:65422/pinbot-grapy/grapy.git
+    cd grapy
+    python3 setup.py install
+
+.. _Python: http://www.python.org
+.. _asyncio: https://code.google.com/p/tulip/
+.. _aiohttp: https://github.com/fafhrd91/aiohttp
+.. _BeautifulSoup: http://www.crummy.com/software/BeautifulSoup/
+.. _Gearman: http://gearman.org/
+.. _aiogear: https://github.com/Lupino/aiogear
+.. _pip: http://www.pip-installer.org/en/latest/installing.html
+.. _easy_install: http://pypi.python.org/pypi/setuptools
+
+.. _topics-spiders:
+
+==================
+Spider
+==================
+
+.. _topics-items:
+
+==================
+Item
+==================
+
+..
_topics-item-pipeline: + +================== +Pipeline +================== + diff --git a/docs/modules/grapy.core.rst b/docs/modules/grapy.core.rst new file mode 100644 index 0000000..e965b3f --- /dev/null +++ b/docs/modules/grapy.core.rst @@ -0,0 +1,70 @@ +grapy.core package +================== + +Submodules +---------- + +grapy.core.base_sched module +---------------------------- + +.. automodule:: grapy.core.base_sched + :members: + :undoc-members: + :show-inheritance: + +grapy.core.base_spider module +----------------------------- + +.. automodule:: grapy.core.base_spider + :members: + :undoc-members: + :show-inheritance: + +grapy.core.engine module +------------------------ + +.. automodule:: grapy.core.engine + :members: + :undoc-members: + :show-inheritance: + +grapy.core.exceptions module +---------------------------- + +.. automodule:: grapy.core.exceptions + :members: + :undoc-members: + :show-inheritance: + +grapy.core.item module +---------------------- + +.. automodule:: grapy.core.item + :members: + :undoc-members: + :show-inheritance: + +grapy.core.request module +------------------------- + +.. automodule:: grapy.core.request + :members: + :undoc-members: + :show-inheritance: + +grapy.core.response module +-------------------------- + +.. automodule:: grapy.core.response + :members: + :undoc-members: + :show-inheritance: + + +Module contents +--------------- + +.. automodule:: grapy.core + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/modules/grapy.rst b/docs/modules/grapy.rst new file mode 100644 index 0000000..a4a9ed6 --- /dev/null +++ b/docs/modules/grapy.rst @@ -0,0 +1,45 @@ +grapy package +============= + +Subpackages +----------- + +.. toctree:: + + grapy.core + +Submodules +---------- + +grapy.logging module +-------------------- + +.. automodule:: grapy.logging + :members: + :undoc-members: + :show-inheritance: + +grapy.sched module +------------------ + +.. automodule:: grapy.sched + :members: + :undoc-members: + :show-inheritance: + +grapy.utils module +------------------ + +.. automodule:: grapy.utils + :members: + :undoc-members: + :show-inheritance: + + +Module contents +--------------- + +.. automodule:: grapy + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/modules/modules.rst b/docs/modules/modules.rst new file mode 100644 index 0000000..f8c8050 --- /dev/null +++ b/docs/modules/modules.rst @@ -0,0 +1,7 @@ +.. +== + +.. 
toctree:: + :maxdepth: 4 + + grapy diff --git a/grapy/__init__.py b/grapy/__init__.py new file mode 100644 index 0000000..1d0280e --- /dev/null +++ b/grapy/__init__.py @@ -0,0 +1,5 @@ +from .core import Engine + +__all__ = ['engine'] + +engine = Engine() diff --git a/grapy/core/__init__.py b/grapy/core/__init__.py new file mode 100644 index 0000000..0562bf7 --- /dev/null +++ b/grapy/core/__init__.py @@ -0,0 +1,9 @@ +from .engine import Engine +from .base_spider import BaseSpider +from .base_sched import BaseScheduler +from .request import Request +from .response import Response +from .item import Item, dump_item, load_item + +__all__ = ['Engine', 'BaseSpider', 'BaseScheduler', 'Request', 'Response', + 'Item', 'dump_item', 'load_item'] diff --git a/grapy/core/base_sched.py b/grapy/core/base_sched.py new file mode 100644 index 0000000..bf7fc31 --- /dev/null +++ b/grapy/core/base_sched.py @@ -0,0 +1,35 @@ +import asyncio + +class BaseScheduler(object): + def __init__(self): + self.engine = None + self.is_running = False + + def push_req(self, req): + ''' + push the request + ''' + raise NotImplementedError('you must rewrite at sub class') + + def push_item(self, item): + yield from self.submit_item(item) + + def submit_req(self, req): + yield from self.engine.process(req) + + def submit_item(self, item): + yield from self.engine.process_item(item) + + @asyncio.coroutine + def run(self): + ''' + run the scheduler + ''' + raise NotImplementedError('you must rewrite at sub class') + + def start(self): + if self.is_running: + return + + self.is_running = True + return asyncio.Task(self.run()) diff --git a/grapy/core/base_spider.py b/grapy/core/base_spider.py new file mode 100644 index 0000000..fa05223 --- /dev/null +++ b/grapy/core/base_spider.py @@ -0,0 +1,31 @@ +from .request import Request + +__all__ = ['BaseSpider'] + +class BaseSpider(object): + '''The BaseSpider, all the spider recommend to extends this''' + + __slots__ = ['name', 'start_urls'] + + def __init__(self, name=None, start_urls=[]): + ''' + @name: the spider name, unique + + @start_urls: the start request url + ''' + self.name = name + self.start_urls = start_urls + + def start_request(self): + '''you can rewrite it for custem start request''' + for url in self.start_urls: + req = Request(url) + req.unique = False + yield req + + def parse(self, response): + ''' + the default spider parse function. + you must rewrite on a sub class. 
+ ''' + raise NotImplementedError('you must rewrite at sub class') diff --git a/grapy/core/engine.py b/grapy/core/engine.py new file mode 100644 index 0000000..9d45c9b --- /dev/null +++ b/grapy/core/engine.py @@ -0,0 +1,150 @@ +import asyncio +from .request import Request +import inspect +from .item import Item +from ..logging import logger +from .exceptions import EngineError + +__all__ = ['Engine'] + +class Engine(object): + + __slots__ = ['pipelines', 'spiders', 'middlewares', 'sched', 'loop'] + + def __init__(self, loop=None): + self.pipelines = [] + self.spiders = {} + self.middlewares = [] + self.sched = None + self.loop = loop + if not self.loop: + self.loop = asyncio.get_event_loop() + + def set_spiders(self, spiders): + self.spiders = {} + if isinstance(spiders, dict): + for name, spider in spiders.items(): + self.spiders[name] = spider + + else: + self.add_spiders(spiders) + + def add_spiders(self, spiders): + for spider in spiders: + self.add_spider(spider) + + def add_spider(self, spider): + if spider.name in self.spiders.keys(): + raise EngineError('Spider[%s] is already exists'%spider.name) + self.spiders[spider.name] = spider + + def remove_spider(self, spider_name): + self.spiders.pop(spider_name) + + def get_spider(self, name): + spider = self.spiders.get(name) + if spider: + return spider + else: + raise EngineError('Spider[%s] is not found'%name) + + def set_pipelines(self, pipelines): + self.pipelines = pipelines + + def set_middlewares(self, middlewares): + self.middlewares = middlewares + + def set_sched(self, sched): + self.sched = sched + self.sched.engine = self + + @asyncio.coroutine + def process(self, req): + req = yield from self.process_middleware('before_process_request', req) + + rsp = yield from req.request() + + rsp.req = req + + rsp = yield from self.process_middleware('after_process_response', rsp) + + yield from self.process_response(rsp) + + @asyncio.coroutine + def process_middleware(self, name, obj): + for mid in self.middlewares: + if hasattr(mid, name): + func = getattr(mid, name) + obj = func(obj) + if isinstance(obj, asyncio.Future) or inspect.isgenerator(obj): + obj = yield from obj + + return obj + + @asyncio.coroutine + def process_item(self, item, pipelines=None): + if not pipelines: + pipelines = self.pipelines + + for pip in pipelines: + + item = pip.process(item) + if isinstance(item, asyncio.Future) or inspect.isgenerator(item): + item = yield from item + + @asyncio.coroutine + def process_response(self, rsp): + spider_name = rsp.req.spider + callback = rsp.req.callback + args = rsp.req.callback_args + spider = self.get_spider(spider_name) + func = getattr(spider, callback) + items = func(rsp, *args) + if items is None: + return + for item in items: + if isinstance(item, Request): + item.spider = spider.name + logger.info('Find url[{}] on requset[{}] by spider[{}]'.\ + format(item.url, rsp.url, spider.name)) + + item.group = rsp.req.group + item.ref = rsp.req.req_id + + yield from self.push_req(item) + elif isinstance(item, Item): + yield from self.push_item(item) + else: + raise EngineError('Unknow type') + + @asyncio.coroutine + def push_req(self, req, middleware=True): + if middleware: + req = yield from self.process_middleware('before_push_request', req) + + req = self.sched.push_req(req) + if isinstance(req, asyncio.Future) or inspect.isgenerator(req): + req = yield from req + + @asyncio.coroutine + def push_item(self, item): + ret = self.sched.push_item(item) + if isinstance(ret, asyncio.Future) or inspect.isgenerator(ret): + ret = 
yield from ret + + def start_request(self): + for spider in self.spiders.values(): + for req in spider.start_request(): + req.spider = spider.name + yield from self.push_req(req) + + def run(self): + yield from self.start_request() + self.sched.start() + + def start(self): + asyncio.Task(self.run()) + self.loop.run_forever() + + def shutdown(self): + self.loop.close() diff --git a/grapy/core/exceptions.py b/grapy/core/exceptions.py new file mode 100644 index 0000000..14af66d --- /dev/null +++ b/grapy/core/exceptions.py @@ -0,0 +1,16 @@ +__all__ = ['EngineError', 'DropItem', 'IgnoreRequest', 'RetryRequest', 'ItemError'] + +class EngineError(Exception): + pass + +class DropItem(Exception): + pass + +class IgnoreRequest(Exception): + pass + +class RetryRequest(Exception): + pass + +class ItemError(Exception): + pass diff --git a/grapy/core/item.py b/grapy/core/item.py new file mode 100644 index 0000000..20b4a98 --- /dev/null +++ b/grapy/core/item.py @@ -0,0 +1,170 @@ +import json +import re +from .exceptions import ItemError +from ..utils import import_module +from uuid import uuid1 as uuid + +__all__ = ['Item', 'load_item', 'dump_item'] + +class Item(object): + _null_char = '\x01' + + _extra_field = {'name': 'extra', 'type': 'json'} + + _fields = [ + {'name': 'extra', 'type': 'json'} + ] + + __slots__ = ['__dict__'] + + def __init__(self, payload = {}): + + if self._extra_field not in self._fields: + self._fields.append(self._extra_field) + + if not isinstance(payload, dict): + payload = self.unpack(payload) + + self.update(payload) + + def __getitem__(self, key, default=None): + '''x.__getitem__(y) <==> x[y]''' + return getattr(self, key, default) + + def __setitem__(self, key, val): + '''x.__setitem__(i, y) <==> x[i]=y''' + if isinstance(val, str): + val = val.strip() + setattr(self, key, val) + + def keys(self): + '''D.keys() -> a set-like object providing a view on D's keys''' + return self.__dict__.keys() + + def values(self): + '''D.values() -> an object providing a view on D's values''' + return self.__dict__.values() + + def items(self): + return self.__dict__.items() + + def pop(self, key, default=None): + ''' + D.pop(k[,d]) -> v, remove specified key and return the corresponding value. + If key is not found, d is returned if given, otherwise KeyError is raised + ''' + return self.__dict__.pop(key, default) + + def get(self, key, default=None): + '''D.get(k[,d]) -> D[k] if k in D, else d. d defaults to None.''' + return self.__dict__.get(key, default) + + def update(self, item): + ''' + D.update([E, ]**F) -> None. + * Update D from dict/iterable E and F. + * If E present and has a .keys() method, does: for k in E: D[k] = E[k] + * If E present and lacks .keys() method, does: for (k, v) in E: D[k] = v + * In either case, this is followed by: for k in F: D[k] = F[k] + ''' + for k, v in item.items(): + if isinstance(v, str): + item[k] = v.strip() + return self.__dict__.update(item) + + def copy(self): + return self.__dict__.copy() + + def pack(self): + '''D.pack() -> a bytes object. 
pack item''' + payload = dict(self) + keys = list(map(lambda x: x['name'], self._fields)) + tps = list(map(lambda x: x['type'], self._fields)) + tps = dict(zip(keys, tps)) + + none_keys = list(filter(lambda x: not payload[x], payload.keys())) + list(map(payload.pop, none_keys)) + + other_keys = filter(lambda x : x not in keys, payload.keys()) + other = dict(zip(other_keys, map(lambda x: payload[x], other_keys))) + + payload[self._extra_field['name']] = other + + def _pack(key): + val = payload.get(key, '') + tp = tps[key] + + if val: + if tp == 'json': + val = json.dumps(val) + else: + val = '' + if not isinstance(val, str): + val = str(val) + return val + return self._null_char.join(map(_pack, keys)) + + def unpack(self, payload): + '''unpack item''' + if isinstance(payload, bytes): + payload = str(payload, 'utf-8') + + keys = list(map(lambda x: x['name'], self._fields)) + tps = list(map(lambda x: x['type'], self._fields)) + tps = dict(zip(keys, tps)) + + payload = payload.split(self._null_char) + + def _unpack(pack): + key, val = pack + tp = tps[key] + if not val: + return key, val + if tp == 'json': + val = json.loads(val) + elif tp == 'int': + val = int(val) + elif tp == 'float': + val = float(val) + elif tp == 'bool': + val = bool(val) + return key, val + + payload = dict(map(_unpack, zip(keys, payload))) + + if payload.get(self._extra_field['name']): + other = payload.pop(self._extra_field['name']) + if isinstance(other, dict): + payload.update(other) + + return payload + + def __str__(self): + return json.dumps(self.__dict__, indent=2) + + def __bytes__(self): + return bytes(self.pack(), 'utf-8') + + @property + def unique(self): + return str(uuid()) + +NULL_CHAR = '\x02\x00\x00' +def dump_item(klass, *args, **kwargs): + '''dump the Item''' + cls = klass.__class__ + cls_name = re.search("'([^']+)'", str(cls)).group(1) + if not isinstance(klass, Item): + raise ItemError( + 'ItemError: %s is not instance crawl.core.item.Item'%cls_name) + retval = NULL_CHAR.join([cls_name, klass.pack()]) + return retval + +def load_item(string): + '''load the Item''' + cls_name, data = string.split(NULL_CHAR) + klass = import_module(cls_name, data) + if not isinstance(klass, Item): + raise ItemError( + 'ItemError: %s is not instance crawl.core.item.Item'%cls_name) + return klass diff --git a/grapy/core/request.py b/grapy/core/request.py new file mode 100644 index 0000000..812f318 --- /dev/null +++ b/grapy/core/request.py @@ -0,0 +1,122 @@ +import json +import re +import asyncio +import aiohttp +from ..logging import logger +from urllib.parse import urljoin +from .response import Response +from .exceptions import IgnoreRequest, RetryRequest + +__all__ = ['Request'] + +class Request(object): + ''' + the Request object + ''' + + _keys = ['url', 'method', 'callback', 'callback_args', 'kwargs', 'spider', + 'req_id', 'group'] + _default = [{}, (), 'get', None, [], 'default'] + + _json_keys = ['callback_args', 'kwargs'] + + _null_char = '\x01' + + __slots__ = ['url', 'method', 'callback', 'callback_args', 'kwargs', + 'spider', 'unique', 'req_id', 'ref', 'group'] + + def __init__(self, url, method='get', + callback='parse', callback_args = [], **kwargs): + self.url = url + self.method = method + self.callback = callback + self.callback_args = callback_args + self.kwargs = kwargs + self.spider = 'default' + self.unique = True + self.req_id = 0 + self.ref = 0 + self.group = 0 + + def pack(self): + ''' + pack the Request object on bytes + ''' + def _pack(key): + val = getattr(self, key, '') + if val not in 
self._default: + if key in self._json_keys: + val = json.dumps(val) + else: + val = '' + if not isinstance(val, str): + val = str(val) + return val + return bytes(self._null_char.join(map(_pack, self._keys)), 'utf-8') + + def unpack(self, payload): + ''' + unpack the Request payload + ''' + payload = str(payload, 'utf-8') + payload = payload.split(self._null_char) + payload = dict(zip(self._keys, payload)) + + for json_key in self._json_keys: + if payload[json_key]: + payload[json_key] = json.loads(payload[json_key]) + + return payload + + def __bytes__(self): + return self.pack() + + @classmethod + def build(cls, payload): + ''' + build a Request + ''' + req = Request('') + payload = req.unpack(payload) + for key, val in payload.items(): + if val: + if hasattr(req, key): + setattr(req, key, val) + return req + + @asyncio.coroutine + def request(self): + ''' + do request default timeout is 300s + + >>> req = Request('http://example.com') + >>> rsp = yield from req.request() + ''' + method = self.method.lower() + kwargs = { + 'timeout': 300 + } + kwargs.update(self.kwargs.copy()) + url = self.url + + try: + rsp = yield from aiohttp.request(method, url, **kwargs) + ct = rsp.get('content-type', '') + logger.info('Request: {} {} {} {}'.format(method.upper(), url, rsp.status, ct)) + yield from asyncio.sleep(5) + if rsp.status >= 400 and rsp.status < 500: + raise IgnoreRequest(url) + if rsp.status == 200: + if re.search('html|json|text|xml|rss', ct, re.I): + content = yield from rsp.read() + rsp.close() + return Response(urljoin(url, rsp.url), content, rsp) + else: + raise IgnoreRequest(url) + else: + logger.error('Request fail: {} {}'.format(url, rsp.status)) + raise RetryRequest(url) + + except (aiohttp.IncompleteRead, aiohttp.BadStatusLine) as exc: + logger.error(str(exc) + ': ' + url) + raise RetryRequest(exc) diff --git a/grapy/core/response.py b/grapy/core/response.py new file mode 100644 index 0000000..e46de7b --- /dev/null +++ b/grapy/core/response.py @@ -0,0 +1,209 @@ +import re +from bs4 import BeautifulSoup +import json + +RE_XML = re.compile('<\?xml.+encoding=["\']([^\'"]+?)["\'].+\?>', re.I) +RE_HTML = re.compile('', re.I) + +__all__ = ['Response'] + +class Response(object): + + __slots__ = ['url', 'raw', 'encoding', 'content', '_soup', 'req'] + + def __init__(self, url, content, raw): + self.raw = raw + self.url = url + self._soup = None + self.encoding = None + self.content = content + self.req = None + + @property + def text(self): + 'return the unicode document' + content = self.content + if self.encoding: + return str(content, self.encoding, errors = 'ignore') + + charset = self._get_charset(content) + if charset: + self.encoding = charset + return str(content, charset, errors = 'ignore') + else: + try: + self.encoding = 'GBK' + return str(content, 'GBK') + except UnicodeDecodeError: + self.encoding = 'UTF-8' + return str(content, 'UTF-8', errors = 'ignore') + + def json(self): + '''return json document, maybe raise''' + # ct = self.raw.get('content-type', '').lower() + # if ct == 'application/json': + data = self.content + data = json.loads(data.decode('utf-8')) + return data + + @property + def headers(self): + 'return the request headers' + return self.raw.items() + + @property + def soup(self): + '''return the instance of BeautifulSoup''' + if self._soup is None: + text = self.text + self._soup = BeautifulSoup(text) + return self._soup + + def _get_charset(self, content): + + def map_charset(charset): + if charset: + charset = charset.upper() + if charset == 'GB2312': + 
charset = 'GBK'
+            return charset
+
+        ct = ''
+        try:
+            ct = self.raw.get('content-type', '').lower()
+        except:
+            pass
+        p = re.search('charset=(.+)$', ct)
+        if p:
+            charset = p.group(1)
+            return map_charset(charset)
+
+        content = str(content, 'utf-8', errors='ignore')
+        xml = RE_XML.search(content)
+        if xml:
+            charset = xml.group(1)
+            return map_charset(charset)
+
+        html = RE_HTML.search(content)
+        if html:
+            charset = html.group(1)
+            return map_charset(charset)
+
+        return None
+
+    def select(self, selector):
+        '''
+        select elements using the css selector
+        '''
+        soup = self.soup
+        re_tag = re.compile('^[a-z0-9]+$', re.I | re.U)
+        re_attribute = re.compile('^(?P<tag>\w+)?\[(?P<attribute>[a-z\-_]+)(?P<operator>[=~\|\^\$\*]?)=?"?(?P<value>[^\]"]*)"?\]$')
+
+        def attribute_checker(operator, attribute, value = ''):
+            """
+            Takes an operator, attribute and optional value; returns a function
+            that will return True for elements that match that combination.
+            """
+
+            return {
+                '=': lambda el: el.get(attribute) == value,
+                # attribute includes value as one of a set of space separated tokens
+                '~': lambda el: value in el.get(attribute, '').split(),
+                # attribute starts with value
+                '^': lambda el: el.get(attribute, '').startswith(value),
+                # attribute ends with value
+                '$': lambda el: el.get(attribute, '').endswith(value),
+                # attribute contains value
+                '*': lambda el: value in el.get(attribute, ''),
+                # attribute is either exactly value or starts with value-
+                '|': lambda el: el.get(attribute, '') == value \
+                    or el.get(attribute, '').startswith('%s-' % value),
+            }.get(operator, lambda el: el.has_attr(attribute))
+
+        tokens = selector.split()
+        current_context = [soup]
+
+        for index, token in enumerate(tokens):
+            if tokens[index - 1] == '>':
+                continue
+
+            m = re_attribute.match(token)
+            if m:
+                # Attribute selector
+                tag, attribute, operator, value = m.groups()
+
+                if not tag:
+                    tag = True
+
+                checker = attribute_checker(operator, attribute, value)
+
+                found = []
+                for context in current_context:
+                    found.extend([el for el in context.find_all(tag) if checker(el)])
+
+                current_context = found
+                continue
+
+            if '#' in token:
+                # ID selector
+                tag, id = token.split('#', 1)
+                if not tag:
+                    tag = True
+
+                el = current_context[0].find(tag, {'id': id})
+                if not el:
+                    return []
+
+                current_context = [el]
+                continue
+
+            if '.'
in token: + # Class selector + tag, klass = token.split('.', 1) + if not tag: + tag = True + + klasses = set(klass.split('.')) + found = [] + for context in current_context: + found.extend( + context.find_all(tag, {'class': lambda attr: + attr and klasses.issubset(attr.split())}) + ) + + current_context = found + continue + + if '*' in token: + # Star selector + found = [] + for context in current_context: + found.extend(context.find_all(True)) + + current_context = found + continue + + if token == '>': + # Child selector + tag = tokens[index + 1] + if not tag: + tag = True + + found = [] + for context in current_context: + found.extend(context.find_all(tag, recursive=False)) + + current_context = found + continue + + # Here we should just have a regular tag + if not re_tag.match(token): + return [] + + found = [] + for context in current_context: + found.extend(context.find_all(token)) + + current_context = found + + return current_context diff --git a/grapy/logging.py b/grapy/logging.py new file mode 100644 index 0000000..b0f805a --- /dev/null +++ b/grapy/logging.py @@ -0,0 +1,2 @@ +import logging +logger = logging.getLogger('crawl') diff --git a/grapy/sched.py b/grapy/sched.py new file mode 100644 index 0000000..663b042 --- /dev/null +++ b/grapy/sched.py @@ -0,0 +1,44 @@ +from .core import BaseScheduler +import hashlib +import asyncio + +__all__ = ['Scheduler'] + +def hash_url(url): + h = hashlib.sha1() + h.update(bytes(url, 'utf-8')) + return h.hexdigest() + +class Scheduler(BaseScheduler): + def __init__(self, stroage = {}, queue=[], max_tasks=5): + BaseScheduler.__init__(self) + self._stroage = stroage + self._queue = queue + self._sem = asyncio.Semaphore(max_tasks) + + def push_req(self, req): + key = hash_url(req.url) + if key in self._stroage: + return + + self._queue.insert(0, req) + self._stroage[key] = {'key': key, 'req': req, 'crawled': False} + + self.start() + + def run(self): + while True: + if len(self._queue) == 0: + break + + req = self._queue.pop() + yield from self._sem.acquire() + task = asyncio.Task(self.submit_req(req)) + task.add_done_callback(lambda t: self._sem.release()) + + self.is_running = False + + def submit_req(self, req): + yield from BaseScheduler.submit_req(self, req) + key = hash_url(req.url) + self._stroage[key] = {'key': key, 'req': req, 'crawled': True} diff --git a/grapy/utils.py b/grapy/utils.py new file mode 100644 index 0000000..bc4d6c6 --- /dev/null +++ b/grapy/utils.py @@ -0,0 +1,60 @@ +from importlib import import_module as _import_module +from .logging import logger + +__all__ = ['import_module', 'import_pipelines', 'import_middlewares', + 'import_spiders'] + +def import_module(module_name, *args, **kwargs): + ''' + import the module and init it + ''' + logger.debug('import module[%s]'%module_name) + idx = module_name.rfind('.') + module = _import_module(module_name[:idx]) + obj = getattr(module, module_name[idx+1:]) + return obj(*args, **kwargs) + +def import_pipelines(pipelines): + ''' + import module from a list like:: + + [ + {'class_or_method:index': args}, + {'class_or_method:index': kwargs}, + {'class_or_method:index': None} + ] + + the index is a number or string for order + ''' + retval = [] + for module_name, values in pipelines.items(): + args = [] + kwargs = {} + idx = module_name.find(':') + order = 0 + if idx > -1: + order = int(module_name[idx+1:]) + module_name = module_name[:idx] + tp = type(values) + if tp == tuple or tp == list: + args = values + elif tp == dict: + keys = values.keys() + if 'args' in keys or 'kwargs' in 
keys: + args = values.get('args', ()) + kwargs = values.get('kwargs', {}) + else: + kwargs = values + + elif values is not None: + args.append(values) + + retval.append((import_module(module_name, *args, **kwargs), order)) + + retval = [ret[0] for ret in sorted(retval, key=lambda x: x[1])] + + return retval + +import_middlewares = import_pipelines +import_spiders = import_pipelines + diff --git a/setup.py b/setup.py new file mode 100755 index 0000000..e5da175 --- /dev/null +++ b/setup.py @@ -0,0 +1,24 @@ +try: + from setuptools import setup +except ImportError: + from distutils.core import setup + +packages = [ + 'grapy', + 'grapy.core', +] + +requires = ['asyncio', 'aiohttp', 'beautifulsoup4'] + +setup( + name='grapy', + version='0.1.5', + description='a scrapy like model', + author='Li Meng Jun', + author_email='lmjubuntu@gmail.com', + url='http://lupino.me', + packages=packages, + package_dir={'grapy': 'grapy'}, + include_package_data=True, + install_requires=requires, +)
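As a closing illustration, here is a rough sketch of how the pieces introduced in this patch fit together in a project's ``main.py``. The ``SavePipeline`` and ``HeaderMiddleware`` names are hypothetical, and passing ``headers`` through ``req.kwargs`` assumes ``aiohttp.request`` accepts a ``headers`` keyword. Pipelines only need a ``process(item)`` method, and middlewares may define the ``before_process_request``, ``after_process_response`` and ``before_push_request`` hooks that ``Engine.process_middleware`` looks up::

    # main.py -- hypothetical wiring based on the Engine, Scheduler and
    # tutorial spider shown above
    from grapy import engine
    from grapy.sched import Scheduler
    from tutorial.spiders.dmoz_spider import DmozSpider

    class SavePipeline(object):
        '''called by Engine.process_item for every crawled item'''
        def process(self, item):
            print('saving', item['title'], item['link'])
            return item

    class HeaderMiddleware(object):
        '''called by Engine.process before each request is performed'''
        def before_process_request(self, req):
            req.kwargs['headers'] = {'User-Agent': 'grapy-tutorial'}
            return req

    engine.set_sched(Scheduler())
    engine.set_pipelines([SavePipeline()])
    engine.set_middlewares([HeaderMiddleware()])
    engine.set_spiders([DmozSpider()])

    engine.start()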