Skip to content
Paul Tremberth edited this page Jun 25, 2013 · 16 revisions

Welcome to the parslepy wiki!

Usage

  1. Instantiate a parslepy.Parselet with a Python dict containing extraction rules
  2. parslepy.Parselet works with lxml-processed documents:
  • either use .extract(document) if you've already parsed your document with lxml,
  • or use .parse(fp[, parser]) to let your Parselet instance do the parsing, using lxml's default HTML parser, or optionally pass another lxml parser. fp must be a file-like object, not a string (to parse a string, wrap it in a StringIO object first).

Sample document for our examples and parsing it with lxml

>>> import lxml.etree
>>> import parslepy
>>> html = """
... <!DOCTYPE html>
... <html>
... <head>
...     <title>Sample document to test parslepy</title>
...     <meta http-equiv="content-type" content="text/html;charset=utf-8" />
... </head>
... <body>
... <h1 id="main">What&rsquo;s new</h1>
... <ul>
...     <li class="newsitem"><a href="/article-001.html">This is the first article</a></li>
...     <li class="newsitem"><a href="/article-002.html">A second report on something</a></li>
...     <li class="newsitem"><a href="/article-003.html">Python is great!</a> <span class="fresh">New!</span></li>
... </ul>
... </body>
... </html>
... """
>>> html_parser = lxml.etree.HTMLParser()
>>> doc = lxml.etree.fromstring(html, parser=html_parser)
>>> doc
<Element html at 0x7f5fb1fce9b0>

Basic rules

>>> rules = {"title": "title"}
>>> p = parslepy.Parselet(rules)
>>> p.extract(doc)
{'title': u'Sample document to test parslepy'}
>>> rules = {"heading": "h1"}
>>> p = parslepy.Parselet(rules)
>>> p.extract(doc)
{'heading': u'What\u2019s new'}

Mixing CSS selectors and XPath expressions

>>> rules = {
...     "headingcss": "#main",
...     "headingxpath": "//h1[@id='main']"
... }
>>> p = parslepy.Parselet(rules)
>>> p.extract(doc)
{'headingcss': u'What\u2019s new', 'headingxpath': u'What\u2019s new'}

Nested object lists

Nest your object rules inside a []

>>> rules = {
...     "heading": "h1#main",
...     "news(li.newsitem)": [{
...         "title": ".",
...         "url": "a/@href"
...     }],
... }
>>> p = parslepy.Parselet(rules)
>>> import pprint
>>> pprint.pprint(p.extract(doc))
{'heading': u'What\u2019s new',
 'news': [{'title': u'This is the first article', 'url': '/article-001.html'},
          {'title': u'A second report on something',
           'url': '/article-002.html'},
          {'title': u'Python is great!', 'url': '/article-003.html'}]}

Broken rules and Exceptions

Lenient mode (default mode)

Keys whose rules match nothing are simply omitted from the output; if no rule matches at all, the output is {}

>>> rules = {
...     "heading1": "h1#main",
...     "heading2": "h2#main",
... }
>>> p = parslepy.Parselet(rules)
>>> pprint.pprint(p.extract(doc))
{'heading1': u'What\u2019s new'}    # only 1 key in output
>>> rules = {
...     "heading2": "h2#main"
... }
>>> p = parslepy.Parselet(rules)
>>> pprint.pprint(p.extract(doc))
{}    # nothing in output, no selector rule matched anything

Strict mode

Non-matching rules will raise a NonMatchingNonOptionalKey exception

>>> rules = {
...     "heading1": "h1#main",
...     "heading2": "h2#main",
... }
>>> p = parslepy.Parselet(rules, strict=True)
>>> p.extract(doc)
Traceback (most recent call last):
  File "<stdin>", line 1, in <module>
  File "parslepy/base.py", line 501, in extract
    return self._extract(self.parselet_tree, document)
  File "parslepy/base.py", line 582, in _extract
    document.getroottree().getpath(document),v
parslepy.base.NonMatchingNonOptionalKey: key "heading2" is required but yield nothing
Current path: /html/(<Selector: inner=<CSSSelector 20a2758 for 'h2#main'>>)

Optional keys in strict mode

Add ? to the keys that may not match

>>> rules = {
...     "heading1": "h1#main",
...     "heading2?": "h2#main",
... }
>>> p = parslepy.Parselet(rules, strict=True)
>>> p.extract(doc)
{'heading1': u'What\u2019s new'}

Optional keys in nested object lists

In our sample document, one (and only one) LI contains a SPAN element with class fresh

In strict mode, adding a "fresh" key to our item extraction rules raises an exception

>>> rules = {
...     "heading": "h1#main",
...     "news(li.newsitem)": [{
...         "title": ".",
...         "url": "a/@href",
...         "fresh": ".fresh"
...     }],
... }
>>> p = parslepy.Parselet(rules, strict=True)
>>> p.extract(doc)
Traceback (most recent call last):
  File "<stdin>", line 1, in <module>
  File "parslepy/base.py", line 501, in extract
    return self._extract(self.parselet_tree, document)
  File "parslepy/base.py", line 532, in _extract
    parse_result = self._extract(v, elem, level=level+1)
  File "parslepy/base.py", line 582, in _extract
    document.getroottree().getpath(document),v
parslepy.base.NonMatchingNonOptionalKey: key "fresh" is required but yield nothing
Current path: /html/body/ul/li[1]/(<Selector: inner=<CSSSelector 2096e60 for '.fresh'>>)

In non-strict/default mode, this will simply omit the "fresh" key for the items that do not have this SPAN

>>> p = parslepy.Parselet(rules)
>>> p.extract(doc)
{'news': [{'url': '/article-001.html', 'title': u'This is the first article'}, {'url': '/article-002.html', 'title': u'A second report on something'}, {'url': '/article-003.html', 'fresh': u'New!', 'title': u'Python is great! New!'}], 'heading': u'What\u2019s new'}
>>> pprint.pprint(p.extract(doc))
{'heading': u'What\u2019s new',
 'news': [{'title': u'This is the first article', 'url': '/article-001.html'},
          {'title': u'A second report on something',
           'url': '/article-002.html'},
          {'fresh': u'New!',
           'title': u'Python is great! New!',
           'url': '/article-003.html'}]}
>>> 

Or, in strict mode, you would need to set the "fresh" rule as optional to get the same output

>>> rules = {
...     "heading": "h1#main",
...     "news(li.newsitem)": [{
...         "title": ".",
...         "url": "a/@href",
...         "fresh?": ".fresh"
...     }],
... }
>>> p = parslepy.Parselet(rules, strict=True)
>>> pprint.pprint(p.extract(doc))
{'heading': u'What\u2019s new',
 'news': [{'title': u'This is the first article', 'url': '/article-001.html'},
          {'title': u'A second report on something',
           'url': '/article-002.html'},
          {'fresh': u'New!',
           'title': u'Python is great! New!',
           'url': '/article-003.html'}]}
>>> 

Script syntax errors

>>> rules = {
...     "heading!": "h1#main",
... }
>>> p = parslepy.Parselet(rules, strict=True)
Traceback (most recent call last):
  File "<stdin>", line 1, in <module>
  File "parslepy/base.py", line 325, in __init__
    self.compile()
  File "parslepy/base.py", line 393, in compile
    self.parselet_tree = self._compile(self.parselet)
  File "parslepy/base.py", line 432, in _compile
    raise InvalidKeySyntax("Key %s is not valid" % k)
parslepy.base.InvalidKeySyntax: Key heading! is not valid
>>> p = parslepy.Parselet({"heading@": "#main"})
Traceback (most recent call last):
  File "<stdin>", line 1, in <module>
  File "parslepy/base.py", line 325, in __init__
    self.compile()
  File "parslepy/base.py", line 393, in compile
    self.parselet_tree = self._compile(self.parselet)
  File "parslepy/base.py", line 432, in _compile
    raise InvalidKeySyntax("Key %s is not valid" % k)
parslepy.base.InvalidKeySyntax: Key heading@ is not valid
>>> p = parslepy.Parselet({"heading{": "#main"})
Traceback (most recent call last):
  File "<stdin>", line 1, in <module>
  File "parslepy/base.py", line 325, in __init__
    self.compile()
  File "parslepy/base.py", line 393, in compile
    self.parselet_tree = self._compile(self.parselet)
  File "parslepy/base.py", line 432, in _compile
    raise InvalidKeySyntax("Key %s is not valid" % k)
parslepy.base.InvalidKeySyntax: Key heading{ is not valid

When your keys are valid but one of your selectors is not, you get an lxml XPathSyntaxError exception instead

>>> p = parslepy.Parselet({"heading": "#main#"})
Traceback (most recent call last):
  File "<stdin>", line 1, in <module>
  File "parslepy/base.py", line 325, in __init__
    self.compile()
  File "parslepy/base.py", line 393, in compile
    self.parselet_tree = self._compile(self.parselet)
  File "parslepy/base.py", line 471, in _compile
    child_tree = self._compile(v, level=level+1)
  File "parslepy/base.py", line 489, in _compile
    return self.selector_handler.make(parselet_node)
  File "parslepy/base.py", line 201, in make
    extensions = cls.XPATH_EXTENSIONS)
  File "xpath.pxi", line 438, in lxml.etree.XPath.__init__ (src/lxml/lxml.etree.c:134866)
  File "xpath.pxi", line 215, in lxml.etree._XPathEvaluatorBase._raise_parse_error (src/lxml/lxml.etree.c:132490)
lxml.etree.XPathSyntaxError: Invalid expression