Skip to content

Commit

Permalink
Version 0.3; namespace support, entity testing, dom parser
Browse files Browse the repository at this point in the history
  • Loading branch information
Gavin Kistner committed Feb 15, 2013
1 parent 7f2b082 commit b56ea86
Show file tree
Hide file tree
Showing 4 changed files with 174 additions and 95 deletions.
60 changes: 45 additions & 15 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,9 +1,12 @@
# SLAXML
SLAXML is a pure-Lua SAX-like streaming XML parser. It is more robust than
many (simpler) pattern-based parsers that exist ([such as mine][1]), properly supporting code like `<expr test="5 > 7" />`, CDATA nodes, comments, and processing instructions.
many (simpler) pattern-based parsers that exist ([such as mine][1]), properly
supporting code like `<expr test="5 > 7" />`, CDATA nodes, comments, namespaces,
and processing instructions.

It is currently not a truly valid XML parser, however, as it allows completely invalid XML such
as `<foo></bar>` to be parsed (and reported) as such. It is also not namespace-aware. See the "Limitations / TODO" section below for more details.
It is currently not a truly valid XML parser, however, as it allows some invalid
XML such as `<foo></bar>` to be parsed (and reported) as such.
See the "Limitations / TODO" section below for more details.

[1]: http://phrogz.net/lua/AKLOMParser.lua

Expand All @@ -14,12 +17,13 @@ as `<foo></bar>` to be parsed (and reported) as such. It is also not namespace-a

-- Specify as many/few of these as you like
parser = SLAXML:parser{
startElement = function(name) end, -- When "<foo" is seen
attribute = function(name,value) end, -- attribute found
closeElement = function(name) end, -- When "</foo" or "/>" is seen
text = function(text) end, -- text and CDATA nodes
comment = function(content) end, -- comments
pi = function(target,content) end, -- processing instructions e.g. "<?yes mon?>"
startElement = function(name,nsURI) end, -- When "<foo" or "<x:foo" is seen
attribute = function(name,value,nsURI) end, -- attribute found on current element
closeElement = function(name) end, -- When "</foo" or "/>" is seen
text = function(text) end, -- text and CDATA nodes
comment = function(content) end, -- comments
pi = function(target,content) end, -- processing instructions e.g. "<?yes mon?>"
namespace = function(nsURI) end, -- when xmlns="..." is seen (after startElement)
}

myxml = io.open('my.xml'):read('*all')
Expand All @@ -32,8 +36,39 @@ If you just want to see if it parses your document correctly, you can also use j

…which will cause SLAXML to use its built-in callbacks that print the results as seen.

If you want to build a table object model from your XML (with simple collections like
`.kids` and `.attr` for navigating the hierarchy) then you can alternatively:

require 'slaxdom'
local doc = SLAXML:dom(myxml)
print( doc.root.name )
print( doc.root.nsURI )
print( doc.root.attr['version'] )
for i,node in ipairs(doc.root.kids) do
-- includes elements, comments, textnodes and PIs
print("Child #",i,"is",node.type,node.name)
end
for i,el in ipairs(doc.root.el) do
-- includes only elements
print("Element #",i,"is",el.name)
for name,value in pairs(el.attr) do
print("",name,"=",value)
end
end


# History

## v0.3 2013-Feb-15
### Features
+ Support namespaces for elements and attributes
+ `<foo xmlns="bar">` will call `startElement("foo",nil)` followed by `namespace("bar")`
+ Child elements inheriting the default namespace will call `startElement("child","bar")`
+ `<xy:foo>` will call `startElement("foo","uri-for-xy-namespace")` or error if not found
+ `<foo xy:bar="yay">` will call `attribute("bar","yay","uri-for-xy-namespace")` or error if not found
+ Add (optional) DOM parser that validates hierarchy and supports namespaces
- Except that namespaced attributes with the same name will collide

## v0.2 2013-Feb-15
### Features
+ Supports expanding numeric entities e.g. `&#34;` -> `"`
Expand All @@ -49,13 +84,8 @@ If you just want to see if it parses your document correctly, you can also use j

### Limitations / TODO
- Does not require or enforce well-formed XML (or report/fail on invalid)
- No support for namespaces:
- `xmlns="…"` attributes look like any other
- `xmlns:foo="…"` attributes will report name as "xmlns:foo"
- `<foo:bar>` elements will report name as "foo:bar"
- `foo:bar="…"` attributes will report name as "foo:bar"
- No support for entity expansion other than
`&lt; &gt; &quot; &apos; &amp;`
`&lt; &gt; &quot; &apos; &amp;` and numeric ASCII entities like `&#10;`
- XML Declarations (`<?xml version="1.x"?>`) are incorrectly reported
as Processing Instructions
- No support for DTDs
Expand Down
65 changes: 65 additions & 0 deletions slaxdom.lua
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
-- Optional parser that creates a flat DOM from parsing
require 'slaxml'
-- Parse `xml` and build a table-based object model from the parser callbacks.
--
-- Parameters:
--   xml              - the XML source text handed to the parser.
--   ignoreWhitespace - stored into SLAXML.ignoreWhitespace before parsing
--                      (note: this overwrites the module-wide setting).
--
-- Returns the document table:
--   { type="document", name="#doc", kids={...}, root=<first top-level element> }
-- Element nodes carry: type, name, nsURI, kids (all child nodes), el (child
-- elements only), attr (name -> value map) and text (concatenation of the
-- element's direct text children, if any).
--
-- Raises an error for a second root element, an attribute outside of an
-- element, or a close notification that does not match the open element.
function SLAXML:dom(xml,ignoreWhitespace)
  SLAXML.ignoreWhitespace = ignoreWhitespace
  local push, pop = table.insert, table.remove
  local stack = {} -- open elements only; the document itself is never pushed
  local doc = { type="document", name="#doc", kids={} }
  local current = doc
  local builder = SLAXML:parser{
    startElement = function(name,nsURI)
      local el = { type="element", name=name, kids={}, el={}, attr={}, nsURI=nsURI }
      if current==doc then
        if doc.root then
          error(("Encountered element '%s' when the document already has a root '%s' element"):format(name,doc.root.name))
        else
          doc.root = el
        end
      end
      if current.type~="element" and current.type~="document" then
        error(("Encountered an element inside of a %s"):format(current.type))
      else
        push(current.kids,el)
        -- The document node has no `el` list, only elements do
        if current.el then push(current.el,el) end
      end
      current = el
      push(stack,el)
    end,
    namespace = function(nsURI)
      -- Reported after startElement: a default xmlns applies to the element itself
      current.nsURI = nsURI
    end,
    attribute = function(name,value,nsURI)
      if not current or current.type~="element" then
        error(("Encountered an attribute %s=%s but I wasn't inside an element"):format(name,value))
      else
        -- TODO: differentiate namespaced attributes
        current.attr[name] = value
      end
    end,
    closeElement = function(name)
      if current.name~=name or current.type~="element" then
        error(("Received a close element notification for '%s' but was inside a '%s' %s"):format(name,current.name,current.type))
      end
      pop(stack)
      -- When the root element closes the stack is empty; fall back to the
      -- document so that trailing comments/PIs/text and stray elements are
      -- handled by the callbacks instead of indexing a nil `current`.
      current = stack[#stack] or doc
    end,
    text = function(value)
      -- Text directly under the document (outside the root) is dropped
      if current.type~='document' then
        if current.type~="element" then
          error(("Received a text notification '%s' but was inside a %s"):format(value,current.type))
        else
          push(current.kids,{type='text',name='#text',value=value,text=value})
          if current.text then current.text = current.text..value else current.text=value end
        end
      end
    end,
    comment = function(value)
      push(current.kids,{type='comment',name='#comment',value=value,text=value})
    end,
    pi = function(name,value)
      push(current.kids,{type='pi',name=name,value=value})
    end
  }
  builder:parse(xml)
  return doc
end
134 changes: 57 additions & 77 deletions slaxml.lua
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
--[=====================================================================[
v0.2 Copyright © 2013 Gavin Kistner <[email protected]>; MIT Licensed
v0.3 Copyright © 2013 Gavin Kistner <[email protected]>; MIT Licensed
See http://github.com/Phrogz/SLAXML for details.
--]=====================================================================]
SLAXML = {
VERSION = "0.2",
VERSION = "0.3",
ignoreWhitespace = true,
_call = {
pi = function(target,content)
Expand All @@ -12,18 +12,21 @@ SLAXML = {
comment = function(content)
print(string.format("<!-- %s -->",content))
end,
startElement = function(name)
print(string.format("<%s>",name))
startElement = function(name,nsURI)
print(string.format("<%s %s>",name,nsURI or "-"))
end,
attribute = function(name,value)
print(string.format(" %s=%q",name,value))
attribute = function(name,value,nsURI)
print(string.format(" %s=%q (%s)",name,value,nsURI or "-"))
end,
text = function(text)
print(string.format(" text: %q",text))
end,
closeElement = function(name)
print(string.format("</%s>",name))
end,
namespace = function(nsURI) -- applies a default namespace to the current element
print(string.format(" (xmlns=%s)",nsURI))
end,
}
}

Expand All @@ -33,13 +36,14 @@ end

function SLAXML:parse(xml)
-- Cache references for maximum speed
local find, sub, gsub, char = string.find, string.sub, string.gsub, string.char
local find, sub, gsub, char, push, pop = string.find, string.sub, string.gsub, string.char, table.insert, table.remove
-- local sub, gsub, find, push, pop, unescape = string.sub, string.gsub, string.find, table.insert, table.remove, unescape
local first, last, match1, match2, pos2
local first, last, match1, match2, match3, pos2, nsURI
local pos = 1
local state = "text"
local textStart = 1
local currentElement
local nsStack = {}

local entityMap = { ["lt"]="<", ["gt"]=">", ["amp"]="&", ["quot"]='"', ["apos"]="'" }
local entitySwap = function(orig,n,s) return entityMap[s] or n=="#" and char(s) or orig end
Expand Down Expand Up @@ -79,13 +83,29 @@ function SLAXML:parse(xml)
end
end

-- Look up the namespace URI bound to `prefix`, searching the namespace
-- scopes from the innermost (top of nsStack) outwards.
-- Raises an error when no enclosing scope declares the prefix.
local function nsForPrefix(prefix)
  local depth = #nsStack
  while depth >= 1 do
    local uri = nsStack[depth][prefix]
    if uri then return uri end
    depth = depth - 1
  end
  error(("Cannot find namespace for prefix %s"):format(prefix))
end

local function startElement()
first, last, match1 = find( xml, '^<([:%a_][:%w_.-]*)', pos )
first, last, match1 = find( xml, '^<([%a_][%w_.-]*)', pos )
if first then
nsURI = nil
finishText()
currentElement = match1
if self._call.startElement then self._call.startElement(match1) end
pos = last+1
first,last,match2 = find(xml, '^:([%a_][%w_.-]*)', pos )
if first then
nsURI = nsForPrefix(match1)
currentElement = match2
match1 = match2
pos = last+1
else
currentElement = match1
for i=#nsStack,1,-1 do if nsStack[i]['!'] then nsURI = nsStack[i]['!']; break end end
end
if self._call.startElement then self._call.startElement(match1,nsURI) end
push(nsStack,{})
return true
end
end
Expand All @@ -96,18 +116,34 @@ function SLAXML:parse(xml)
pos2 = last+1
first, last, match2 = find( xml, '^"([^<"]+)"', pos2 ) -- FIXME: disallow non-entity ampersands
if first then
if self._call.attribute then self._call.attribute(match1,unescape(match2)) end
pos = last+1
return true
match2 = unescape(match2)
else
first, last, match2 = find( xml, "^'([^<']+)'", pos2 ) -- FIXME: disallow non-entity ampersands
if first then
-- TODO: unescape entities in match2
if self._call.attribute then self._call.attribute(match1,unescape(match2)) end
pos = last+1
return true
match2 = unescape(match2)
end
end
end
if match1 and match2 then
nsURI = nil
local prefix,name = string.match(match1,'^([^:]+):([^:]+)$')
if prefix then
if prefix=='xmlns' then
nsStack[#nsStack][name] = match2
else
nsURI = nsForPrefix(prefix)
match1 = name
end
else
if match1=='xmlns' then
nsStack[#nsStack]['!'] = match2
if self._call.namespace then self._call.namespace(match2) end
end
end
if self._call.attribute then self._call.attribute(match1,match2,nsURI) end
return true
end
end

Expand All @@ -128,7 +164,10 @@ function SLAXML:parse(xml)
state = "text"
pos = last+1
textStart = pos
if match1=="/" and self._call.closeElement then self._call.closeElement(currentElement) end
if match1=="/" then
pop(nsStack)
if self._call.closeElement then self._call.closeElement(currentElement) end
end
return true
end
end
Expand All @@ -140,6 +179,7 @@ function SLAXML:parse(xml)
if self._call.closeElement then self._call.closeElement(match1) end
pos = last+1
textStart = pos
pop(nsStack)
return true
end
end
Expand All @@ -162,64 +202,4 @@ function SLAXML:parse(xml)
end
end
end
end

-- Parse `xml` and build a table-based object model from the parser callbacks.
--
-- Parameters:
--   xml              - the XML source text handed to the parser.
--   ignoreWhitespace - stored into SLAXML.ignoreWhitespace before parsing
--                      (note: this overwrites the module-wide setting).
--   slim             - accepted but not used anywhere in this function.
--
-- Returns the document table:
--   { type="document", name="#doc", kids={...}, root=<first top-level element> }
-- Element nodes carry: type, name, kids (all child nodes), el (child elements
-- only), attr (name -> value map) and text (concatenation of the element's
-- direct text children, if any).
--
-- Raises an error for a second root element, an attribute outside of an
-- element, or a close notification that does not match the open element.
function SLAXML:dom(xml,ignoreWhitespace,slim)
  SLAXML.ignoreWhitespace = ignoreWhitespace
  local push, pop = table.insert, table.remove
  local stack = {} -- open elements only; the document itself is never pushed
  local doc = { type="document", name="#doc", kids={} }
  local current = doc
  local builder = SLAXML:parser{
    startElement = function(name)
      local el = { type="element", name=name, kids={}, el={}, attr={} }
      if current==doc then
        if doc.root then
          error(("Encountered element '%s' when the document already has a root '%s' element"):format(name,doc.root.name))
        else
          doc.root = el
        end
      end
      if current.type~="element" and current.type~="document" then
        error(("Encountered an element inside of a %s"):format(current.type))
      else
        push(current.kids,el)
        -- The document node has no `el` list, only elements do
        if current.el then push(current.el,el) end
      end
      current = el
      push(stack,el)
    end,
    attribute = function(name,value)
      if not current or current.type~="element" then
        error(("Encountered an attribute %s=%s but I wasn't inside an element"):format(name,value))
      else
        current.attr[name] = value
      end
    end,
    closeElement = function(name)
      if current.name~=name or current.type~="element" then
        error(("Received a close element notification for '%s' but was inside a '%s' %s"):format(name,current.name,current.type))
      end
      pop(stack)
      -- When the root element closes the stack is empty; fall back to the
      -- document so that trailing comments/PIs/text and stray elements are
      -- handled by the callbacks instead of indexing a nil `current`.
      current = stack[#stack] or doc
    end,
    text = function(value)
      -- Text directly under the document (outside the root) is dropped
      if current.type~='document' then
        if current.type~="element" then
          error(("Received a text notification '%s' but was inside a %s"):format(value,current.type))
        else
          push(current.kids,{type='text',name='#text',value=value,text=value})
          if current.text then current.text = current.text..value else current.text=value end
        end
      end
    end,
    comment = function(value)
      push(current.kids,{type='comment',name='#comment',value=value,text=value})
    end,
    pi = function(name,value)
      push(current.kids,{type='pi',name=name,value=value})
    end
  }
  builder:parse(xml)
  return doc
end
Loading

0 comments on commit b56ea86

Please sign in to comment.