diff --git a/lib/oga/lexer.rl b/lib/oga/lexer.rl
index c41dc35..7124d7e 100644
--- a/lib/oga/lexer.rl
+++ b/lib/oga/lexer.rl
@@ -108,11 +108,59 @@ module Oga
       doctype = smaller whitespace* bang whitespace* 'DOCTYPE'i whitespace*
         'HTML'i whitespace* any* greater;
 
+      # CDATA
+      #
+      # http://www.w3.org/TR/html-markup/syntax.html#cdata-sections
+      #
+      # CDATA tags are broken up into 3 parts: the start, the content and the
+      # end tag.
+      #
+      # In HTML CDATA tags have no meaning/are not supported. Oga does support
+      # them but treats their contents as plain text.
+      #
+      cdata_start = smaller bang lbracket 'CDATA' lbracket;
+      cdata_end = rbracket rbracket greater;
+
       main := |*
         whitespace => { t(:T_SPACE) };
         newline => { t(:T_NEWLINE); advance_line };
 
         doctype => { t(:T_DOCTYPE) };
+
+        # CDATA
+        #
+        # When processing CDATA patterns we'll emit tokens for the start tag,
+        # the content and the end tag.
+        #
+        # The actions below are embedded (not `=>` token actions), so each
+        # token's boundaries are passed to `t` explicitly as (start, stop).
+        #
+        cdata_start
+          # Leaving action: `p` is at the first content character, so
+          # @ts...p covers exactly the start tag.
+          %{
+            @cdata_start = p
+            t(:T_CDATA_START, @ts, p)
+          }
+
+          # Consume everything except ], which is the start of the ending tag.
+          # NOTE: this also means content containing a literal ] is not matched.
+          (any - rbracket)+
+          %{
+            t(:T_TEXT, @cdata_start, p)
+
+            @cdata_start = nil
+          }
+
+          cdata_end
+          # Entering action: `p` is at the first ]. The end tag is always 3
+          # characters long, so the token stops at p + 3. `pe` (the end of the
+          # input) is only a correct stop when nothing follows the CDATA tag.
+          >{
+            t(:T_CDATA_END, p, p + 3)
+          };
+
+        # General rules and actions.
         smaller => { t(:T_SMALLER) };
         greater => { t(:T_GREATER) };
         slash => { t(:T_SLASH) };
diff --git a/spec/oga/lexer/cdata_spec.rb b/spec/oga/lexer/cdata_spec.rb
index 5244211..8a0f913 100644
--- a/spec/oga/lexer/cdata_spec.rb
+++ b/spec/oga/lexer/cdata_spec.rb
@@ -4,15 +4,17 @@ describe Oga::Lexer do
   context 'cdata tags' do
     example 'lex a cdata tag' do
       lex('<![CDATA[foo]]>').should == [
-        [:T_SMALLER, '<', 1, 1],
-        [:T_BANG, '!', 1, 2],
-        [:T_LBRACKET, '[', 1, 3],
-        [:T_TEXT, 'CDATA', 1, 4],
-        [:T_LBRACKET, '[', 1, 9],
+        [:T_CDATA_START, '<![CDATA[', 1, 1],
         [:T_TEXT, 'foo', 1, 10],
-        [:T_RBRACKET, ']', 1, 13],
-        [:T_RBRACKET, ']', 1, 14],
-        [:T_GREATER, '>', 1, 15],
+        [:T_CDATA_END, ']]>', 1, 13]
+      ]
+    end
+
+    example 'lex tags inside CDATA tags as regular text' do
+      lex('<![CDATA[<p>Foo</p>]]>').should == [
+        [:T_CDATA_START, '<![CDATA[', 1, 1],
+        [:T_TEXT, '<p>Foo</p>', 1, 10],
+        [:T_CDATA_END, ']]>', 1, 20]
       ]
     end
   end