Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Restore pyrdfa #1503

Closed
wants to merge 11 commits into from
69 changes: 69 additions & 0 deletions rdflib/plugin.py
Original file line number Diff line number Diff line change
Expand Up @@ -435,6 +435,75 @@ def plugins(
"JsonLDParser",
)

# Parsers for RDF embedded in HTML/XML documents (Turtle-in-HTML, RDFa,
# Microdata). Each row is (plugin name, module path, class name); the rows
# are registered in order, all of them as ``Parser`` plugins.
_structured_data_module = "rdflib.plugins.parsers.structureddata"
_embedded_parsers = (
    ("hturtle", "rdflib.plugins.parsers.hturtle", "HTurtleParser"),
    ("rdfa", _structured_data_module, "RDFaParser"),
    ("mdata", _structured_data_module, "MicrodataParser"),
    ("microdata", _structured_data_module, "MicrodataParser"),
    # A convenience for using the RDFa 1.0 syntax (the parse method can
    # also be invoked with an rdfa_version keyword).
    ("rdfa1.0", _structured_data_module, "RDFa10Parser"),
    # Just for completeness, in case the user asks for this name.
    ("rdfa1.1", _structured_data_module, "RDFaParser"),
    # An HTML file may contain microdata, RDFa, or Turtle. If the user
    # wants them all, this parser simply invokes all of them.
    ("html", _structured_data_module, "StructuredDataParser"),
    # Some media types are also bound to RDFa.
    ("application/svg+xml", _structured_data_module, "RDFaParser"),
    ("application/xhtml+xml", _structured_data_module, "RDFaParser"),
    # The 'text/html' media type should be equivalent to "html".
    ("text/html", _structured_data_module, "StructuredDataParser"),
)
for _plugin_name, _module_path, _class_name in _embedded_parsers:
    register(_plugin_name, Parser, _module_path, _class_name)
# Drop the temporaries so they do not linger in the module namespace.
del _plugin_name, _module_path, _class_name
del _embedded_parsers, _structured_data_module


# Register Quad Parsers
register(
"application/n-quads",
Expand Down
122 changes: 122 additions & 0 deletions rdflib/plugins/parsers/hturtle.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,122 @@
# -*- coding: utf-8 -*-
"""
Extraction parser for RDF embedded verbatim into HTML or XML files. This is
based on:

* The specification on embedding turtle into html:
http://www.w3.org/TR/turtle/#in-html

For SVG (and currently SVG only) the method also extracts embedded RDF/XML
data, per the SVG specification

License: W3C Software License,
http://www.w3.org/Consortium/Legal/copyright-software
Author: Ivan Herman
Copyright: W3C
"""

from rdflib.parser import Parser
from pyRdfa import pyRdfa
from pyRdfa.options import Options
from pyRdfa.state import ExecutionContext
from pyRdfa.embeddedRDF import handle_embeddedRDF
from .structureddata import _get_orig_source, _check_error

# Probe for html5lib once at import time. After this block the module-level
# name ``html5lib`` is a plain boolean availability flag, not the module.
try:
    import html5lib
except ImportError:
    import warnings

    warnings.warn(
        "html5lib not found! RDFa and Microdata parsers will not be available."
    )
    html5lib = False
else:
    # Reference the imported name before rebinding it (presumably kept so
    # linters do not flag the import as unused).
    assert html5lib
    html5lib = True


class HTurtle(pyRdfa):
    """
    The RDFa 1.1 processor repurposed so that it only extracts RDF embedded
    in the document (hturtle extraction).
    """

    def __init__(self, options=None, base="", media_type=""):
        # The underlying processor is always pinned to RDFa version 1.1.
        pyRdfa.__init__(
            self,
            options=options,
            base=base,
            media_type=media_type,
            rdfa_version="1.1",
        )

    def graph_from_DOM(self, dom, graph, pgraph=None):
        """
        Replacement for the parsing function of the original class that
        performs embedded-RDF extraction only.

        @param dom: DOM document whose ``documentElement`` is the root of the
        traversal
        @param graph: graph handed to ``handle_embeddedRDF`` for every visited
        element
        @keyword pgraph: if not None, receives a copy of the triples and
        namespace bindings of the processor graph held by the options
        """

        def _visit(element, context):
            # A truthy result means RDF content was extracted into the
            # graph, so the subtree below this element is not explored.
            if handle_embeddedRDF(element, graph, context):
                return
            # Otherwise descend into the child elements only (other node
            # types are skipped).
            for child in element.childNodes:
                if child.nodeType == element.ELEMENT_NODE:
                    _visit(child, context)

        root = dom.documentElement
        context = ExecutionContext(
            root,
            graph,
            base=self.base,
            options=self.options,
            rdfa_version="1.1",
        )
        _visit(root, context)

        if pgraph is None:
            return

        # Mirror the processor graph (triples first, then prefix bindings)
        # into the caller-supplied graph.
        processor_graph = self.options.processor_graph.graph
        for triple in processor_graph:
            pgraph.add(triple)
        for prefix, namespace in processor_graph.namespaces():
            pgraph.bind(prefix, namespace)


# This is the parser interface as it would look when called from the rest of
# RDFLib


class HTurtleParser(Parser):
    """
    RDFLib parser plugin that extracts RDF embedded verbatim in a markup
    document by delegating to the ``HTurtle`` processor.
    """

    def parse(self, source, graph, pgraph=None, media_type=""):
        """
        @param source: one of the input sources that the RDFLib package defined
        @type source: InputSource class instance
        @param graph: target graph for the triples; output graph, in RDFa spec.
        parlance
        @type graph: RDFLib Graph
        @keyword pgraph: optional graph that receives a copy of the processor
        graph; None (the default) means no copy is made
        @type pgraph: RDFLib Graph
        @keyword media_type: explicit setting of the preferred media type
        (a.k.a. content type) of the RDFa source. None means the content
        type of the HTTP result is used, or a guess is made based on the
        suffix of a file
        @type media_type: string
        @raise ImportError: if html5lib is not installed
        """
        if html5lib is False:
            raise ImportError(
                "html5lib is not installed, cannot use RDFa and Microdata parsers."
            )

        (baseURI, orig_source) = _get_orig_source(source)
        # Everything after ``orig_source`` is passed by keyword: the previous
        # call passed ``pgraph`` as an extra positional argument, which did
        # not match the signature of ``_process`` and raised TypeError on
        # every parse.
        self._process(
            graph, baseURI, orig_source, media_type=media_type, pgraph=pgraph
        )

    def _process(self, graph, baseURI, orig_source, media_type="", pgraph=None):
        """
        Run the ``HTurtle`` processor over ``orig_source`` and store the
        extracted triples in ``graph``.

        @param graph: target graph for the extracted triples
        @param baseURI: base URI handed to the processor
        @param orig_source: source handed to ``graph_from_source``
        @keyword media_type: preferred media type; None is treated as ""
        @keyword pgraph: optional processor graph forwarded to
        ``graph_from_source``; a trailing keyword so that existing
        three- and four-argument callers keep working
        """
        self.options = Options(
            output_processor_graph=None,
            embedded_rdf=True,
            vocab_expansion=False,
            vocab_cache=False,
        )

        if media_type is None:
            media_type = ""
        processor = HTurtle(self.options, base=baseURI, media_type=media_type)
        processor.graph_from_source(
            orig_source, graph=graph, pgraph=pgraph, rdfOutput=False
        )
        # get possible error triples to raise exceptions
        _check_error(graph)
Loading