first commit
This commit is contained in:
437
lua/XmlParser.lua
Normal file
437
lua/XmlParser.lua
Normal file
@@ -0,0 +1,437 @@
|
||||
--- @module Class providing the actual XML parser.
|
||||
-- Available options are:
|
||||
-- * stripWS
|
||||
-- Strip non-significant whitespace (leading/trailing)
|
||||
-- and do not generate events for empty text elements
|
||||
--
|
||||
-- * expandEntities
|
||||
-- Expand entities (standard entities + single char
|
||||
-- numeric entities only currently - could be extended
|
||||
-- at runtime if suitable DTD parser added elements
|
||||
-- to table (see obj._ENTITIES). May also be possible
|
||||
-- to expand multibyte entities for UTF-8 only
|
||||
--
|
||||
-- * errorHandler
|
||||
-- Custom error handler function
|
||||
--
|
||||
-- NOTE: Boolean options must be set to 'nil' not '0'
|
||||
|
||||
--- Turns a decimal character reference into the character it denotes.
-- Codes in the range 0..255 are returned as the single byte with that
-- value; any other code is handed back unexpanded as "&#code;".
--@param code the decimal character code (a number or a string of digits)
--@return a one-byte string, or the "&#code;" reference text
local function decimalToHtmlChar(code)
    local value = tonumber(code)
    local fitsInByte = value >= 0 and value < 256

    if not fitsInByte then
        return "&#" .. code .. ";"
    end

    return string.char(value)
end
|
||||
|
||||
--- Turns a hexadecimal character reference into the character it denotes.
-- Codes in the range 0x00..0xFF are returned as the single byte with that
-- value; any other code is handed back unexpanded as "&#xcode;".
--@param code the hexadecimal character code, as a string of hex digits
--@return a one-byte string, or the "&#xcode;" reference text
local function hexadecimalToHtmlChar(code)
    local value = tonumber(code, 16)
    local fitsInByte = value >= 0 and value < 256

    if not fitsInByte then
        return "&#x" .. code .. ";"
    end

    return string.char(value)
end
|
||||
|
||||
--- Prototype table for parser instances: holds the Lua patterns, the error
-- messages and the entity table shared by every parser object.
local XmlParser = {
    -- Private attributes/functions

    -- Next tag: captures the text before "<", an optional "/" right after
    -- "<", the tag body up to the first ">", and an optional "/" before ">".
    _XML = '^([^<]*)<(%/?)([^>]-)(%/?)>',
    -- name="value" and name='value' attributes. The hyphen is escaped
    -- because mixing a class (%w) with a bare "-" inside a set is not
    -- defined by the Lua manual.
    _ATTR1 = '([%w%-:_]+)%s*=%s*"(.-)"',
    _ATTR2 = '([%w%-:_]+)%s*=%s*\'(.-)\'',
    _CDATA = '<%!%[CDATA%[(.-)%]%]>',
    _PI = '<%?(.-)%?>',
    _COMMENT = '<!%-%-(.-)%-%->',
    -- Captures everything before the first whitespace (the tag name).
    _TAG = '^(.-)%s.*',
    _LEADINGWS = '^%s+',
    _TRAILINGWS = '%s+$',
    _WS = '^%s*$',
    -- DOCTYPE forms: SYSTEM/PUBLIC with an internal subset (1, 2), a
    -- catch-all up to the first ">" (3), SYSTEM/PUBLIC without subset (4, 5).
    _DTD1 = '<!DOCTYPE%s+(.-)%s+(SYSTEM)%s+["\'](.-)["\']%s*(%b[])%s*>',
    _DTD2 = '<!DOCTYPE%s+(.-)%s+(PUBLIC)%s+["\'](.-)["\']%s+["\'](.-)["\']%s*(%b[])%s*>',
    --_DTD3 = '<!DOCTYPE%s+(.-)%s*(%b[])%s*>',
    _DTD3 = '<!DOCTYPE%s.->',
    _DTD4 = '<!DOCTYPE%s+(.-)%s+(SYSTEM)%s+["\'](.-)["\']%s*>',
    _DTD5 = '<!DOCTYPE%s+(.-)%s+(PUBLIC)%s+["\'](.-)["\']%s+["\'](.-)["\']%s*>',

    -- Intended to match an attribute whose double quotes are not closed.
    -- NOTE(review): Lua patterns have no lazy "+?" quantifier; here "=+" is
    -- followed by a separate "?" pattern item. Confirm that these two
    -- patterns trigger on the inputs they are meant to catch.
    _ATTRERR1 = '=+?%s*"[^"]*$',
    -- Same as above, for single quotes.
    _ATTRERR2 = '=+?%s*\'[^\']*$',
    -- Matches the end of a tag: ">" optionally preceded by "/".
    _TAGEXT = '(%/?)>',

    _errstr = {
        xmlErr = "Error Parsing XML",
        declErr = "Error Parsing XMLDecl",
        declStartErr = "XMLDecl not at start of document",
        declAttrErr = "Invalid XMLDecl attributes",
        piErr = "Error Parsing Processing Instruction",
        commentErr = "Error Parsing Comment",
        cdataErr = "Error Parsing CDATA",
        dtdErr = "Error Parsing DTD",
        endTagErr = "End Tag Attributes Invalid",
        unmatchedTagErr = "Unbalanced Tag",
        incompleteXmlErr = "Incomplete XML Document",
    },

    -- Maps a Lua pattern for an entity reference to its replacement (a
    -- string, or a function receiving the pattern's capture). The keys are
    -- the five predefined XML entities plus numeric character references.
    _ENTITIES = {
        ["&lt;"] = "<",
        ["&gt;"] = ">",
        ["&amp;"] = "&",
        ["&quot;"] = '"',
        ["&apos;"] = "'",
        ["&#(%d+);"] = decimalToHtmlChar,
        ["&#x(%x+);"] = hexadecimalToHtmlChar,
    },
}
|
||||
|
||||
--- Instantiates a XmlParser object.
--@param _handler Handler module to be used to convert the XML string
--       to another format. See the available handlers at the handler directory.
--       Usually you get an instance to a handler module using, for instance:
--       local handler = require("xmlhandler/tree").
--@param _options Options for this XmlParser instance. May be omitted, in
--       which case an empty options table is used (no whitespace stripping,
--       no entity expansion, no error handler).
--@see XmlParser.options
--@return the new parser object, whose metatable is XmlParser
function XmlParser.new(_handler, _options)
    local obj = {
        handler = _handler,
        -- The helpers index self.options unconditionally, so it must
        -- always be a table.
        options = _options or {},
        _stack = {}
    }

    -- Methods are found through the metatable's __index field, so the
    -- instance does not need an __index field of its own.
    return setmetatable(obj, XmlParser)
end
|
||||
|
||||
--- Tells whether a field is reachable from a table or from its metatable chain.
-- The value itself is indexed first; if that yields nil the search moves on
-- to the value's metatable, then to that metatable's metatable, and so on.
--@param table the table to inspect (nil is accepted and yields false)
--@param elementName the name of the function/field to look for
--@return true if the function/field exists, false otherwise
local function fexists(table, elementName)
    local candidate = table

    while candidate ~= nil do
        if candidate[elementName] ~= nil then
            return true
        end
        candidate = getmetatable(candidate)
    end

    return false
end
|
||||
|
||||
--- Reports a parsing error through the user supplied error handler.
-- Nothing happens when the parser has no options table or no
-- options.errorHandler; parsing then simply carries on.
--@param self the parser object
--@param err the error message
--@param pos the position in the XML string that was being parsed
local function err(self, err, pos)
    local options = self.options
    local handler = options and options.errorHandler

    if handler then
        handler(err, pos)
    end
end
|
||||
|
||||
--- Removes leading and trailing whitespace when the stripWS option is set.
-- The string is returned unchanged when the option is not set or the
-- parser has no options table.
--@param self the parser object
--@param s the string to strip
--@return the (possibly stripped) string
local function stripWS(self, s)
    local options = self.options

    if options and options.stripWS then
        -- Parentheses drop gsub's second result (the substitution count).
        s = (string.gsub(s, '^%s+', ''))
        s = (string.gsub(s, '%s+$', ''))
    end

    return s
end
|
||||
|
||||
--- Expands entity references when the expandEntities option is set.
-- Every "&...;" reference in the string is looked up once in
-- self._ENTITIES (pattern -> replacement string or function). The expanded
-- text is never scanned again, so "&amp;lt;" yields "&lt;" rather than "<",
-- and the result does not depend on the traversal order of the entity table.
-- References that match no entry are left untouched.
--@param self the parser object
--@param s the string whose entities should be expanded
--@return the string with known entities expanded, or s unchanged when the
--        option is not set
local function parseEntities(self, s)
    local options = self.options
    if not (options and options.expandEntities) then
        return s
    end

    local entities = self._ENTITIES

    -- Expands a single reference; returning nil makes gsub keep the
    -- original text.
    local function expand(reference)
        for pattern, replacement in pairs(entities) do
            local expanded, count = string.gsub(reference, "^" .. pattern .. "$", replacement)
            if count > 0 then
                return expanded
            end
        end
        return nil
    end

    -- An entity reference is "&", a run of characters other than "&", ";"
    -- and whitespace, then ";".
    return (string.gsub(s, "&[^&;%s]+;", expand))
end
|
||||
|
||||
--- Parses a string representing a tag.
--@param self the parser object
--@param s String containing tag text
--@return a {name, attrs} table
-- where name is the name of the tag and attrs
-- is a table containing the attributes of the tag
-- (attrs is nil when the tag has no attributes)
local function parseTag(self, s)
    local tag = {
        -- Parentheses keep only the substituted string (gsub also returns
        -- the substitution count).
        name = (string.gsub(s, self._TAG, '%1')),
        attrs = {}
    }

    -- A local flag records whether any attribute was found; storing a
    -- marker inside tag.attrs would clobber a real attribute of that name.
    local hasAttrs = false

    local function collect(k, v)
        tag.attrs[k] = parseEntities(self, v)
        hasAttrs = true
    end

    -- gsub is used only to iterate over the matches; its result is discarded.
    string.gsub(s, self._ATTR1, collect)
    string.gsub(s, self._ATTR2, collect)

    if not hasAttrs then
        tag.attrs = nil
    end

    return tag
end
|
||||
|
||||
--- Parses the XML declaration (<?xml ... ?>) found from f.pos onwards.
-- Stores the match positions and the declaration text in f, reports
-- problems through err() and forwards the parsed tag to handler:decl when
-- the handler defines it.
--@param self the parser object
--@param xml the XML string being parsed
--@param f table holding the current string.find results and positions
--@return the parsed {name, attrs} tag, or nil when no declaration matched
local function parseXmlDeclaration(self, xml, f)
    -- XML Declaration
    f.match, f.endMatch, f.text = string.find(xml, self._PI, f.pos)
    if not f.match then
        err(self, self._errstr.declErr, f.pos)
        -- There is no text to parse; going on would index nil values.
        return nil
    end

    if f.match ~= 1 then
        -- Must be at start of doc if present
        err(self, self._errstr.declStartErr, f.pos)
    end

    local tag = parseTag(self, f.text)
    -- TODO: Check if attributes are valid
    -- Check for version (mandatory). A declaration without any attribute
    -- has tag.attrs == nil and lacks the version as well.
    if not tag.attrs or tag.attrs.version == nil then
        err(self, self._errstr.declAttrErr, f.pos)
    end

    if fexists(self.handler, 'decl') then
        self.handler:decl(tag, f.match, f.endMatch)
    end

    return tag
end
|
||||
|
||||
--- Parses a processing instruction (<? ... ?>) found from f.pos onwards.
-- Stores the match positions and the PI text in f. When the handler
-- defines pi, the PI is parsed into a tag whose attrs._text holds the text
-- following the PI name, and the tag is passed to handler:pi.
--@param self the parser object
--@param xml the XML string being parsed
--@param f table holding the current string.find results and positions
--@return the parsed tag; an empty table when the handler has no pi
--        function or when no processing instruction matched
local function parseXmlProcessingInstruction(self, xml, f)
    local tag = {}

    -- XML Processing Instruction (PI)
    f.match, f.endMatch, f.text = string.find(xml, self._PI, f.pos)
    if not f.match then
        err(self, self._errstr.piErr, f.pos)
        -- There is no text to parse; going on would index nil values.
        return tag
    end

    if fexists(self.handler, 'pi') then
        -- Parse PI attributes & text
        tag = parseTag(self, f.text)
        local pi = string.sub(f.text, string.len(tag.name)+1)
        if pi ~= "" then
            if tag.attrs then
                tag.attrs._text = pi
            else
                tag.attrs = { _text = pi }
            end
        end
        self.handler:pi(tag, f.match, f.endMatch)
    end

    return tag
end
|
||||
|
||||
--- Parses a comment (<!-- ... -->) found from f.pos onwards.
-- Stores the match positions and the comment text in f. When the handler
-- defines comment, the text goes through stripWS and parseEntities and is
-- passed to handler:comment.
--@param self the parser object
--@param xml the XML string being parsed
--@param f table holding the current string.find results and positions
local function parseComment(self, xml, f)
    f.match, f.endMatch, f.text = string.find(xml, self._COMMENT, f.pos)
    if not f.match then
        err(self, self._errstr.commentErr, f.pos)
        -- There is no comment text to process.
        return
    end

    if fexists(self.handler, 'comment') then
        f.text = parseEntities(self, stripWS(self, f.text))
        -- The second argument is nil, as in the text and cdata callbacks
        -- (the original passed the global function `next` here).
        self.handler:comment(f.text, nil, f.match, f.endMatch)
    end
end
|
||||
|
||||
--- Tries the known DOCTYPE patterns against the XML string.
-- The specific forms (SYSTEM/PUBLIC, with or without an internal subset)
-- are tried first; the catch-all _DTD3, which has no captures, is the last
-- resort so that it cannot hide the more specific ones.
--@param self the parser object
--@param xml the XML string being parsed
--@param pos the position where the search starts
--@return start and end positions of the match plus a table with the
--        fields _root, _type, _name, _uri and _internal that the matching
--        pattern captures (others stay nil); nil when nothing matches
local function _parseDtd(self, xml, pos)
    -- Each entry pairs a pattern with the names of its captures, in order.
    local dtdForms = {
        { self._DTD1, { "_root", "_type", "_uri", "_internal" } },
        { self._DTD2, { "_root", "_type", "_name", "_uri", "_internal" } },
        { self._DTD4, { "_root", "_type", "_uri" } },
        { self._DTD5, { "_root", "_type", "_name", "_uri" } },
        { self._DTD3, {} },
    }

    for _, form in ipairs(dtdForms) do
        local found = { string.find(xml, form[1], pos) }
        if found[1] then
            local attrs = {}
            for index, field in ipairs(form[2]) do
                -- Captures follow the two match positions.
                attrs[field] = found[index + 2]
            end
            return found[1], found[2], attrs
        end
    end

    return nil
end
|
||||
|
||||
--- Parses a DOCTYPE declaration found from f.pos onwards.
-- Stores the match positions in f and, when the handler defines dtd,
-- passes it a tag named "DOCTYPE" whose value is the text between
-- "<!DOCTYPE " and the closing ">".
--@param self the parser object
--@param xml the XML string being parsed
--@param f table holding the current string.find results and positions
local function parseDtd(self, xml, f)
    -- The attribute table returned by _parseDtd is not used here, so it is
    -- not stored (the original assigned it to a global named attrs).
    f.match, f.endMatch = _parseDtd(self, xml, f.pos)
    if not f.match then
        err(self, self._errstr.dtdErr, f.pos)
        -- No positions are available to extract the DOCTYPE text from.
        return
    end

    if fexists(self.handler, 'dtd') then
        -- f.match+10 skips the 10 characters of "<!DOCTYPE" plus the
        -- whitespace character that follows it.
        local tag = {name="DOCTYPE", value=string.sub(xml, f.match+10, f.endMatch-1)}
        self.handler:dtd(tag, f.match, f.endMatch)
    end
end
|
||||
|
||||
--- Parses a CDATA section (<![CDATA[ ... ]]>) found from f.pos onwards.
-- Stores the match positions and the raw section content in f and passes
-- the content to handler:cdata when the handler defines it.
--@param self the parser object
--@param xml the XML string being parsed
--@param f table holding the current string.find results and positions
local function parseCdata(self, xml, f)
    f.match, f.endMatch, f.text = string.find(xml, self._CDATA, f.pos)
    if not f.match then
        err(self, self._errstr.cdataErr, f.pos)
        -- Nothing matched, so there is no content to hand to the handler.
        return
    end

    if fexists(self.handler, 'cdata') then
        self.handler:cdata(f.text, nil, f.match, f.endMatch)
    end
end
|
||||
|
||||
--- Parse a Normal tag
-- Need check for embedded '>' in attribute value and extend
-- match recursively if necessary eg. <tag attr="123>456">
-- Start tags are pushed on self._stack, end tags (and self-closing tags)
-- pop it, and the handler's starttag/endtag functions are called when the
-- handler defines them.
--@param self the parser object
--@param xml the XML string being parsed
--@param f table holding the current string.find results and positions;
--       f.tagstr, f.endt1, f.endt2, f.match and f.endMatch must be set
--@return the parsed {name, attrs} tag
local function parseNormalTag(self, xml, f)
    --Check for errors
    while true do
        --If there isn't an attribute without closing quotes (single or double quotes)
        --then breaks to follow the normal processing of the tag.
        --Otherwise, try to find where the quotes close.
        f.errStart, f.errEnd = string.find(f.tagstr, self._ATTRERR1)

        if f.errEnd == nil then
            f.errStart, f.errEnd = string.find(f.tagstr, self._ATTRERR2)
            if f.errEnd == nil then
                break
            end
        end

        f.extStart, f.extEnd, f.endt2 = string.find(xml, self._TAGEXT, f.endMatch+1)
        if not f.extEnd then
            -- No further ">" exists: the tag cannot be extended, so report
            -- the error and process the tag text gathered so far.
            err(self, self._errstr.xmlErr, f.pos)
            break
        end
        f.tagstr = f.tagstr .. string.sub(xml, f.endMatch, f.extEnd-1)
        f.endMatch = f.extEnd
    end

    -- Extract tag name and attrs
    local tag = parseTag(self, f.tagstr)

    if (f.endt1=="/") then
        -- The stack is maintained and the end tag validated whether or not
        -- the handler defines endtag, mirroring the unconditional push done
        -- for start tags.
        if tag.attrs then
            -- Shouldn't have any attributes in endtag
            err(self, string.format("%s (/%s)", self._errstr.endTagErr, tag.name), f.pos)
        end
        if table.remove(self._stack) ~= tag.name then
            err(self, string.format("%s (/%s)", self._errstr.unmatchedTagErr, tag.name), f.pos)
        end
        if fexists(self.handler, 'endtag') then
            self.handler:endtag(tag, f.match, f.endMatch)
        end
    else
        table.insert(self._stack, tag.name)
        if fexists(self.handler, 'starttag') then
            self.handler:starttag(tag, f.match, f.endMatch)
        end
        --TODO: Self-closing tags are being returned as a table, which
        --makes it harder for the NCLua app to handle them. An empty
        --string field should be returned instead.

        -- Self-Closing Tag
        if (f.endt2=="/") then
            table.remove(self._stack)
            if fexists(self.handler, 'endtag') then
                self.handler:endtag(tag, f.match, f.endMatch)
            end
        end
    end

    return tag
end
|
||||
|
||||
--- Dispatches the tag held in f.tagstr to the parser for its kind.
-- The kind is decided from the first characters of the tag text: XML
-- declaration, processing instruction, comment, DOCTYPE, CDATA section or,
-- failing all of those, a normal tag.
--@param self the parser object
--@param xml the XML string being parsed
--@param f table holding the current string.find results and positions
local function parseTagType(self, xml, f)
    local tagText = f.tagstr
    local first8 = string.sub(tagText, 1, 8)
    local parser

    -- Test for tag type
    if string.find(string.sub(tagText, 1, 5), "?xml%s") then
        parser = parseXmlDeclaration
    elseif string.sub(tagText, 1, 1) == "?" then
        parser = parseXmlProcessingInstruction
    elseif string.sub(tagText, 1, 3) == "!--" then
        parser = parseComment
    elseif first8 == "!DOCTYPE" then
        parser = parseDtd
    elseif first8 == "![CDATA[" then
        parser = parseCdata
    else
        parser = parseNormalTag
    end

    parser(self, xml, f)
end
|
||||
|
||||
--- Locates the next tag, starting at f.pos (first pass; the tag-specific
-- parsers refine the result afterwards).
-- The string.find results are stored in f: match, endMatch, the text that
-- precedes the tag, the "/" markers and the tag body. When nothing matches,
-- an error is reported unless only whitespace is left and the tag stack is
-- empty.
--@param self the parser object
--@param xml the XML string being parsed
--@param f table holding the current string.find results and positions
--@return true if the next tag could be got, false otherwise
local function getNextTag(self, xml, f)
    local first, last, leadingText, slashBefore, body, slashAfter =
        string.find(xml, self._XML, f.pos)

    f.match, f.endMatch = first, last
    f.text, f.endt1, f.tagstr, f.endt2 = leadingText, slashBefore, body, slashAfter

    if first == nil then
        local onlyWhitespaceLeft = string.find(xml, self._WS, f.pos)

        if not onlyWhitespaceLeft then
            -- Unparsable text
            err(self, self._errstr.xmlErr, f.pos)
        elseif #self._stack == 0 then
            -- No more text and every tag was closed
            return false
        else
            -- No more text but some tags are still open
            err(self, self._errstr.incompleteXmlErr, f.pos)
        end
    end

    -- Fill in defaults so that later code can use these fields safely
    f.match = f.match or 0
    f.tagstr = f.tagstr or ''
    f.text = f.text or ''

    return f.endMatch ~= nil
end
|
||||
|
||||
--- Main function which starts the XML parsing process.
-- Walks the XML string tag by tag; the text that precedes each tag is
-- passed to handler:text (when defined and not empty) and the tag itself is
-- dispatched by kind. Parsing stops when no further tag is found or when a
-- tag could not be matched.
--@param xml the XML string to parse
--@param parseAttributes indicates if tag attributes should be parsed or not.
--       If omitted, the default value is true.
function XmlParser:parse(xml, parseAttributes)
    if type(self) ~= "table" or getmetatable(self) ~= XmlParser then
        error("You must call xmlparser:parse(parameters) instead of xmlparser.parse(parameters)")
    end

    if parseAttributes == nil then
        parseAttributes = true
    end

    self.handler.parseAttributes = parseAttributes

    --Stores string.find results and parameters
    --and other auxiliary variables
    local f = {
        --string.find return
        match = 0,
        endMatch = 0,
        -- text, end1, tagstr, end2,

        --string.find parameters and auxiliary variables
        pos = 1,
        -- startText, endText,
        -- errStart, errEnd, extStart, extEnd,
    }

    while f.match do
        if not getNextTag(self, xml, f) then
            break
        end

        -- Handle leading text
        f.startText = f.match
        f.endText = f.match + string.len(f.text) - 1
        -- From here on f.match points at the tag itself
        f.match = f.match + string.len(f.text)
        f.text = parseEntities(self, stripWS(self, f.text))
        if f.text ~= "" and fexists(self.handler, 'text') then
            -- Report the span of the text: f.match already points past it.
            self.handler:text(f.text, nil, f.startText, f.endText)
        end

        parseTagType(self, xml, f)
        if not f.endMatch then
            -- The tag-specific parser found no match (and reported it), so
            -- there is no position to resume from.
            break
        end
        f.pos = f.endMatch + 1
    end
end
|
||||
|
||||
-- XmlParser.new sets XmlParser as the metatable of each parser object, so
-- this makes fields missing from an object be looked up in XmlParser.
XmlParser.__index = XmlParser
|
||||
-- The XmlParser table is the value of this chunk.
return XmlParser
|
||||
Reference in New Issue
Block a user