From 9ad094e2278b1abf7e9d41bb5e5406820be0a8e6 Mon Sep 17 00:00:00 2001 From: Graham Higgins Date: Mon, 13 Dec 2021 19:32:56 +0000 Subject: [PATCH 1/5] restore pyRdfa and pyMicrodata --- rdflib/plugin.py | 70 +++++ rdflib/plugins/parsers/hturtle.py | 122 ++++++++ rdflib/plugins/parsers/structureddata.py | 352 +++++++++++++++++++++++ setup.cfg | 2 + test/pymicrodata/minischema.html | 12 + test/pymicrodata/minischema.ttl | 6 + test/pymicrodata/schema.html | 25 ++ test/pymicrodata/schema.ttl | 20 ++ test/pymicrodata/test1.html | 15 + test/pymicrodata/test1.ttl | 8 + test/pymicrodata/test2.html | 15 + test/pymicrodata/test2.ttl | 4 + test/pymicrodata/test3.html | 13 + test/pymicrodata/test3.ttl | 4 + test/test_pymicrodata.py | 31 ++ 15 files changed, 699 insertions(+) create mode 100644 rdflib/plugins/parsers/hturtle.py create mode 100644 rdflib/plugins/parsers/structureddata.py create mode 100644 test/pymicrodata/minischema.html create mode 100644 test/pymicrodata/minischema.ttl create mode 100644 test/pymicrodata/schema.html create mode 100644 test/pymicrodata/schema.ttl create mode 100644 test/pymicrodata/test1.html create mode 100644 test/pymicrodata/test1.ttl create mode 100644 test/pymicrodata/test2.html create mode 100644 test/pymicrodata/test2.ttl create mode 100644 test/pymicrodata/test3.html create mode 100644 test/pymicrodata/test3.ttl create mode 100644 test/test_pymicrodata.py diff --git a/rdflib/plugin.py b/rdflib/plugin.py index b7edbc624..b6f08a1b1 100644 --- a/rdflib/plugin.py +++ b/rdflib/plugin.py @@ -435,6 +435,76 @@ def plugins( "JsonLDParser", ) +# The basic parsers: RDFa (by default, 1.1), +# microdata, and embedded turtle (a.k.a. 
hturtle) +register( + "hturtle", + Parser, + "rdflib.plugins.parsers.hturtle", + "HTurtleParser", +) +register( + "rdfa", + Parser, + "rdflib.plugins.parsers.structureddata", + "RDFaParser", +) +register( + "mdata", + Parser, + "rdflib.plugins.parsers.structureddata", + "MicrodataParser", +) +register( + "microdata", + Parser, + "rdflib.plugins.parsers.structureddata", + "MicrodataParser", +) +# A convenience to use the RDFa 1.0 syntax (although the parse method can +# be invoked with an rdfa_version keyword, too) +register( + "rdfa1.0", + Parser, + "rdflib.plugins.parsers.structureddata", + "RDFa10Parser", +) +# Just for the completeness, if the user uses this +register( + "rdfa1.1", + Parser, + "rdflib.plugins.parsers.structureddata", + "RDFaParser", +) +# An HTML file may contain both microdata, rdfa, or turtle. If the user +# wants them all, the parser below simply invokes all: +register( + "html", + Parser, + "rdflib.plugins.parsers.structureddata", + "StructuredDataParser", +) +# Some media types are also bound to RDFa +register( + "application/svg+xml", + Parser, + "rdflib.plugins.parsers.structureddata", + "RDFaParser", +) +register( + "application/xhtml+xml", + Parser, + "rdflib.plugins.parsers.structureddata", + "RDFaParser", +) +# 'text/html' media type should be equivalent to html: +register( + "text/html", + Parser, + "rdflib.plugins.parsers.structureddata", + "StructuredDataParser", +) + # Register Quad Parsers register( "application/n-quads", diff --git a/rdflib/plugins/parsers/hturtle.py b/rdflib/plugins/parsers/hturtle.py new file mode 100644 index 000000000..e319f6a30 --- /dev/null +++ b/rdflib/plugins/parsers/hturtle.py @@ -0,0 +1,122 @@ +# -*- coding: utf-8 -*- +""" +Extraction parser RDF embedded verbatim into HTML or XML files. 
This is based +on: + +* The specification on embedding turtle into html: + http://www.w3.org/TR/turtle/#in-html + +For SVG (and currently SVG only) the method also extracts an embedded RDF/XML +data, per SVG specification + +License: W3C Software License, +http://www.w3.org/Consortium/Legal/copyright-software +Author: Ivan Herman +Copyright: W3C +""" + +from rdflib.parser import Parser +from pyRdfa import pyRdfa +from pyRdfa.options import Options +from pyRdfa.state import ExecutionContext +from pyRdfa.embeddedRDF import handle_embeddedRDF +from .structureddata import _get_orig_source, _check_error + +try: + import html5lib + + assert html5lib + html5lib = True +except ImportError: + import warnings + + warnings.warn( + "html5lib not found! RDFa and Microdata parsers will not be available." + ) + html5lib = False + + +class HTurtle(pyRdfa): + """ + Bastardizing the RDFa 1.1 parser to do a hturtle extractions + """ + + def __init__(self, options=None, base="", media_type=""): + pyRdfa.__init__( + self, options=options, base=base, media_type=media_type, rdfa_version="1.1" + ) + + def graph_from_DOM(self, dom, graph, pgraph=None): + """ + Stealing the parsing function from the original class, to do + turtle extraction only + """ + + def copyGraph(tog, fromg): + for t in fromg: + tog.add(t) + for k, ns in fromg.namespaces(): + tog.bind(k, ns) + + def _process_one_node(node, graph, state): + if handle_embeddedRDF(node, graph, state): + # we got an RDF content that has been extracted into Graph; + # the recursion should stop + return + else: + # recurse through all the child elements of the current node + for n in node.childNodes: + if n.nodeType == node.ELEMENT_NODE: + _process_one_node(n, graph, state) + + topElement = dom.documentElement + state = ExecutionContext( + topElement, graph, base=self.base, options=self.options, rdfa_version="1.1" + ) + _process_one_node(topElement, graph, state) + if pgraph is not None: + copyGraph(pgraph, 
self.options.processor_graph.graph) + + +# This is the parser interface as it would look when called from the rest of +# RDFLib + + +class HTurtleParser(Parser): + def parse(self, source, graph, pgraph=None, media_type=""): + """ + @param source: one of the input sources that the RDFLib package defined + @type source: InputSource class instance + @param graph: target graph for the triples; output graph, in RDFa spec. + parlance + @type graph: RDFLib Graph + @keyword media_type: explicit setting of the preferred media type + (a.k.a. content type) of the RDFa source. None means the content + type of the HTTP result is used, or a guess is made based on the + suffix of a file + @type media_type: string + """ + if html5lib is False: + raise ImportError( + "html5lib is not installed, cannot " + "use RDFa and Microdata parsers." + ) + + (baseURI, orig_source) = _get_orig_source(source) + self._process(graph, baseURI, orig_source, media_type=media_type) + + def _process(self, graph, baseURI, orig_source, media_type=""): + self.options = Options( + output_processor_graph=None, + embedded_rdf=True, + vocab_expansion=False, + vocab_cache=False, + ) + + if media_type is None: + media_type = "" + processor = HTurtle(self.options, base=baseURI, media_type=media_type) + processor.graph_from_source( + orig_source, graph=graph, pgraph=None, rdfOutput=False + ) + # get possible error triples to raise exceptions + _check_error(graph) diff --git a/rdflib/plugins/parsers/structureddata.py b/rdflib/plugins/parsers/structureddata.py new file mode 100644 index 000000000..489641269 --- /dev/null +++ b/rdflib/plugins/parsers/structureddata.py @@ -0,0 +1,352 @@ +#!/usr/bin/env python +""" +Extraction parsers for structured data embedded into HTML or XML files. +The former may include RDFa or microdata. 
The syntax and the extraction +procedures are based on: + +* The RDFa specifications: http://www.w3.org/TR/#tr_RDFa +* The microdata specification: http://www.w3.org/TR/microdata/ +* The specification of the microdata to RDF conversion: +http://www.w3.org/TR/microdata-rdf/ + +License: W3C Software License, +http://www.w3.org/Consortium/Legal/copyright-software +Author: Ivan Herman +Copyright: W3C + +""" + +from rdflib.parser import Parser, StringInputSource, URLInputSource, FileInputSource + +try: + import html5lib + + assert html5lib + html5lib = True +except ImportError: + import warnings + + warnings.warn( + "html5lib not found! RDFa and Microdata parsers will not be available." + ) + html5lib = False + + +def _get_orig_source(source): + """ + A bit of a hack; the RDFa/microdata parsers need more than what the + upper layers of RDFLib provide... + This method returns the original source references. + """ + if isinstance(source, StringInputSource): + orig_source = source.getByteStream() + elif isinstance(source, URLInputSource): + orig_source = source.url + elif isinstance(source, FileInputSource): + orig_source = source.file.name + source.file.close() + else: + orig_source = source.getByteStream() + baseURI = source.getPublicId() + return (baseURI, orig_source) + + +def _check_error(graph): + from pyRdfa import RDFA_Error, ns_rdf + from pyRdfa.options import ns_dc + + for (s, p, o) in graph.triples((None, ns_rdf["type"], RDFA_Error)): + for (x, y, msg) in graph.triples((s, ns_dc["description"], None)): + raise Exception("RDFa parsing Error! %s" % msg) + + +# This is the parser interface as it would look when called from the +# rest of RDFLib +class RDFaParser(Parser): + """ + Wrapper around the RDFa 1.1 parser. For further details on the RDFa 1.1 + processing, see the relevant W3C documents at + http://www.w3.org/TR/#tr_RDFa. RDFa 1.1 is defined for XHTML, HTML5, SVG + and, in general, for any XML language. 
+ + Note that the parser can also handle RDFa 1.0 if the extra parameter is + used and/or the input source uses RDFa 1.0 specific @version or DTD-s. + """ + + def parse( + self, + source, + graph, + pgraph=None, + media_type="", + rdfa_version=None, + embedded_rdf=False, + space_preserve=True, + vocab_expansion=False, + vocab_cache=False, + refresh_vocab_cache=False, + vocab_cache_report=False, + check_lite=False, + ): + """ + @param source: one of the input sources that the RDFLib package defined + @type source: InputSource class instance + @param graph: target graph for the triples; output graph, in RDFa spec. + parlance + @type graph: RDFLib Graph + @keyword pgraph: target for error and warning triples; processor graph, + in RDFa spec. parlance. If set to None, these triples are ignored + @type pgraph: RDFLib Graph + @keyword media_type: explicit setting of the preferred media type + (a.k.a. content type) of the the RDFa source. None means the content + type of the HTTP result is used, or a guess is made based on the + suffix of a file + @type media_type: string + @keyword rdfa_version: 1.0 or 1.1. If the value is "", then, by + default, 1.1 is used unless the source has explicit signals to use + 1.0 (e.g., using a @version attribute, using a DTD set up for 1.0, etc) + @type rdfa_version: string + @keyword embedded_rdf: some formats allow embedding RDF in other + formats: (X)HTML can contain turtle in a special