##
# Low level AST parser that supports both XML and HTML.
#
# Note that this parser itself does not deal with special HTML void elements.
# It requires every tag to have a closing tag. As such you'll need to enable
# HTML parsing mode when parsing HTML. This can be done as follows:
#
#     parser = Oga::XML::Parser.new(data, :html => true)
#
class Oga::XML::Parser

# Terminal symbols. Tokens are supplied by #next_token in the inner section.
token T_STRING T_TEXT
token T_DOCTYPE_START T_DOCTYPE_END T_DOCTYPE_TYPE T_DOCTYPE_NAME
token T_CDATA_START T_CDATA_END
token T_COMMENT_START T_COMMENT_END
token T_ELEM_START T_ELEM_NAME T_ELEM_NS T_ELEM_END T_ATTR
token T_XML_DECL_START T_XML_DECL_END

# With this option the value of a rule is the value of the last expression in
# its action block instead of an explicit `result` variable.
options no_result_var

rule
  # Root of the AST: a :document node wrapping zero or more expressions.
  document
    : expressions { s(:document, val[0]) }
    | /* none */  { s(:document) }
    ;

  # A sequence of expressions. Nested Arrays produced by the recursion are
  # flattened by #s when the enclosing node is built.
  expressions
    : expressions expression { val.compact }
    | expression             { val[0] }
    | /* none */             { nil }
    ;

  expression
    : doctype
    | cdata
    | comment
    | element
    | text
    | xmldecl
    ;

  # Doctypes

  doctype
    # <!DOCTYPE html>
    : T_DOCTYPE_START T_DOCTYPE_NAME T_DOCTYPE_END { s(:doctype, val[1]) }

    # <!DOCTYPE html PUBLIC>
    | T_DOCTYPE_START T_DOCTYPE_NAME T_DOCTYPE_TYPE T_DOCTYPE_END
      {
        s(:doctype, val[1], val[2])
      }

    # <!DOCTYPE html PUBLIC "foo">
    | T_DOCTYPE_START T_DOCTYPE_NAME T_DOCTYPE_TYPE T_STRING T_DOCTYPE_END
      {
        s(:doctype, val[1], val[2], val[3])
      }

    # <!DOCTYPE html PUBLIC "foo" "bar">
    | T_DOCTYPE_START T_DOCTYPE_NAME T_DOCTYPE_TYPE T_STRING T_STRING T_DOCTYPE_END
      {
        s(:doctype, val[1], val[2], val[3], val[4])
      }
    ;

  # CDATA tags

  cdata
    # <![CDATA[]]>
    : T_CDATA_START T_CDATA_END { s(:cdata) }

    # <![CDATA[foo]]>
    | T_CDATA_START T_TEXT T_CDATA_END { s(:cdata, val[1]) }
    ;

  # Comments

  comment
    # <!---->
    : T_COMMENT_START T_COMMENT_END { s(:comment) }

    # <!-- foo -->
    | T_COMMENT_START T_TEXT T_COMMENT_END { s(:comment, val[1]) }
    ;

  # Elements

  # An element node's children are, in order: namespace (or nil), name, the
  # optional :attributes node and the child expressions.
  element
    : element_open attributes expressions T_ELEM_END
      {
        s(:element, val[0], val[1], val[2])
      }
    ;

  # Produces a [namespace, name] pair; the namespace is nil when absent.
  element_open
    # <p>
    : T_ELEM_START T_ELEM_NAME { [nil, val[1]] }

    # <foo:p>
    | T_ELEM_START T_ELEM_NS T_ELEM_NAME { [val[1], val[2]] }
    ;

  # Attributes

  attributes
    : attributes_ { s(:attributes, val[0]) }
    | /* none */  { nil }
    ;

  # One or more attributes. The nested Arrays built here are flattened by #s.
  attributes_
    : attributes_ attribute { val }
    | attribute             { val }
    ;

  attribute
    # foo
    : T_ATTR { s(:attribute, val[0]) }

    # foo="bar"
    | T_ATTR T_STRING { s(:attribute, val[0], val[1]) }
    ;

  # XML declarations

  xmldecl
    # Declaration without attributes.
    : T_XML_DECL_START T_XML_DECL_END { s(:xml_decl) }

    # Declaration with attributes.
    | T_XML_DECL_START attributes T_XML_DECL_END { s(:xml_decl, val[1]) }
    ;

  # Plain text

  text
    : T_TEXT { s(:text, val[0]) }
    ;
end

---- inner
##
# Creates a parser for the given input and sets up the lexer that feeds it.
#
# @param [String] data The input to parse.
#
# @param [Hash] options Options passed as-is to the lexer.
# @see Oga::XML::Lexer#initialize
#
def initialize(data, options = {})
  @data  = data
  @lexer = Lexer.new(data, options)

  # Start with well-defined line tracking state so that nodes built before the
  # lexer has reported a line number do not end up with a nil line.
  reset
end

##
# Resets the internal state of the parser: clears `@lines` and sets the
# current line number back to 1.
#
def reset
  @lines = []
  @line  = 1
end

##
# Builds a new AST node tagged with the line the parser is currently on.
#
# @param [Symbol] type The type of the node.
# @param [Array] children The child values of the node. Nested Arrays are
#  flattened into a single list; nil values are kept.
#
# @return [AST::Node]
#
def s(type, *children)
  AST::Node.new(type, children.flatten, :line => @line)
end

##
# Returns the next token from the lexer and records the line it was found on.
#
# The current line is only updated when the lexer reports one, so the last
# known line is kept for tokens without line information.
#
# @return [Array] A `[type, value]` pair, or `[false, false]` once the lexer
#  has no more tokens.
#
def next_token
  type, value, line = @lexer.advance

  @line = line if line

  type ? [type, value] : [false, false]
end

##
# Raises a parse error showing the offending token together with the
# surrounding lines of input.
#
# @param [Integer] type The type of token the error occurred on.
# @param [String] value The value of the token.
# @param [Array] stack The current stack of parsed nodes (unused).
# @raise [Racc::ParseError]
#
def on_error(type, value, stack)
  name = token_to_str(type)

  # Fall back to the first line when no line number has been recorded yet.
  current = @line || 1
  lines   = @data.lines.to_a

  # Show up to 5 lines before and after the offending line (if they exist).
  context = (-5..5).map do |offset|
    number = current + offset

    # Line numbers start at 1. Skipping anything lower also prevents a negative
    # Array index from wrapping around to the end of the input.
    next unless number > 0

    line = lines[number - 1]

    next unless line

    # Both prefixes are 3 characters wide so the line numbers stay aligned.
    prefix = offset == 0 ? '=> ' : '   '

    "#{prefix}#{number}: #{line.strip}\n"
  end

  code = context.compact.join

  raise Racc::ParseError, <<-EOF.strip
Unexpected #{name} with value #{value.inspect} on line #{current}:

#{code}
  EOF
end

##
# Parses the input and returns the corresponding AST. The parser state is
# reset once parsing has finished successfully.
#
# @example
#  parser = Oga::XML::Parser.new('<foo>bar</foo>')
#  ast    = parser.parse
#
# @return [AST::Node]
#
def parse
  ast = do_parse

  reset

  ast
end

# vim: set ft=racc: