diff --git a/ext/c/lexer.rl b/ext/c/lexer.rl index 2fdf16b..db0037d 100644 --- a/ext/c/lexer.rl +++ b/ext/c/lexer.rl @@ -90,7 +90,9 @@ VALUE oga_xml_lexer_advance(VALUE self, VALUE data_block) ID id_advance_line = rb_intern("advance_line"); ID id_on_attribute = rb_intern("on_attribute"); ID id_on_attribute_ns = rb_intern("on_attribute_ns"); - ID id_on_cdata = rb_intern("on_cdata"); + ID id_on_cdata_start = rb_intern("on_cdata_start"); + ID id_on_cdata_body = rb_intern("on_cdata_body"); + ID id_on_cdata_end = rb_intern("on_cdata_end"); ID id_on_comment = rb_intern("on_comment"); ID id_on_doctype_end = rb_intern("on_doctype_end"); ID id_on_doctype_inline = rb_intern("on_doctype_inline"); diff --git a/ext/java/org/liboga/xml/Lexer.rl b/ext/java/org/liboga/xml/Lexer.rl index ae3a6eb..738ce85 100644 --- a/ext/java/org/liboga/xml/Lexer.rl +++ b/ext/java/org/liboga/xml/Lexer.rl @@ -104,7 +104,9 @@ public class Lexer extends RubyObject String id_advance_line = "advance_line"; String id_on_attribute = "on_attribute"; String id_on_attribute_ns = "on_attribute_ns"; - String id_on_cdata = "on_cdata"; + String id_on_cdata_start = "on_cdata_start"; + String id_on_cdata_body = "on_cdata_body"; + String id_on_cdata_end = "on_cdata_end"; String id_on_comment = "on_comment"; String id_on_doctype_end = "on_doctype_end"; String id_on_doctype_inline = "on_doctype_inline"; diff --git a/ext/ragel/base_lexer.rl b/ext/ragel/base_lexer.rl index a04140f..a0721e7 100644 --- a/ext/ragel/base_lexer.rl +++ b/ext/ragel/base_lexer.rl @@ -83,12 +83,35 @@ cdata_start = ''; - cdata = cdata_start (any* -- cdata_end) cdata_end; + + # Everything except "]" OR a single "]" + cdata_allowed = (^']'+ | ']') $count_newlines; action start_cdata { - callback(id_on_cdata, data, encoding, ts + 9, te - 3); + callback_simple(id_on_cdata_start); + + fnext cdata_body; } + cdata_body := |* + cdata_allowed => { + callback(id_on_cdata_body, data, encoding, ts, te); + + if ( lines > 0 ) + { + advance_line(lines); + + lines = 0; + } + }; + + cdata_end => { + callback_simple(id_on_cdata_end); + + fnext main; + }; + *|; + # Processing Instructions # # http://www.w3.org/TR/xpath/#section-Processing-Instruction-Nodes @@ -439,7 +462,7 @@ doctype_start => start_doctype; xml_decl_start => start_xml_decl; comment => start_comment; - cdata => start_cdata; + cdata_start => start_cdata; proc_ins_start => start_proc_ins; element_start => start_element; element_end => close_element; diff --git a/lib/oga/xml/lexer.rb b/lib/oga/xml/lexer.rb index c84e90f..d5f51d2 100644 --- a/lib/oga/xml/lexer.rb +++ b/lib/oga/xml/lexer.rb @@ -262,10 +262,26 @@ module Oga end ## - # Called on a CDATA tag. + # Called on the open CDATA tag. # - def on_cdata(value) - add_token(:T_CDATA, value) + def on_cdata_start + add_token(:T_CDATA_START) + end + + ## + # Called on the closing CDATA tag. + # + def on_cdata_end + add_token(:T_CDATA_END) + end + + ## + # Called for the body of a CDATA tag. + # + # @param [String] value + # + def on_cdata_body(value) + add_token(:T_CDATA_BODY, value) end ## diff --git a/lib/oga/xml/parser.rll b/lib/oga/xml/parser.rll index a676a49..fd33903 100644 --- a/lib/oga/xml/parser.rll +++ b/lib/oga/xml/parser.rll @@ -24,7 +24,8 @@ %terminals T_TEXT T_STRING_SQUOTE T_STRING_DQUOTE T_STRING_BODY; %terminals T_DOCTYPE_START T_DOCTYPE_END T_DOCTYPE_TYPE T_DOCTYPE_NAME; -%terminals T_DOCTYPE_INLINE T_CDATA T_COMMENT; +%terminals T_DOCTYPE_INLINE T_COMMENT; +%terminals T_CDATA_START T_CDATA_BODY T_CDATA_END; %terminals T_ELEM_START T_ELEM_NAME T_ELEM_NS T_ELEM_END T_ATTR T_ATTR_NS; %terminals T_XML_DECL_START T_XML_DECL_END; %terminals T_PROC_INS_START T_PROC_INS_NAME T_PROC_INS_END; @@ -93,7 +94,12 @@ doctype_types # CDATA tags cdata - = T_CDATA { on_cdata(val[0]) } + = T_CDATA_START cdata_body T_CDATA_END { on_cdata(val[1]) } + ; + +cdata_body + = T_CDATA_BODY cdata_body { val[0] + val[1] } + | _ { '' } ; # Comments diff --git a/spec/oga/xml/lexer/cdata_spec.rb b/spec/oga/xml/lexer/cdata_spec.rb index 82c304e..48f873d 100644 --- a/spec/oga/xml/lexer/cdata_spec.rb +++ b/spec/oga/xml/lexer/cdata_spec.rb @@ -1,30 +1,107 @@ require 'spec_helper' describe Oga::XML::Lexer do - describe 'cdata tags' do - it 'lexes a cdata tag' do - lex('').should == [[:T_CDATA, 'foo', 1]] + describe 'CDATA tags' do + it 'lexes a CDATA tag' do + lex('').should == [ + [:T_CDATA_START, nil, 1], + [:T_CDATA_BODY, 'foo', 1], + [:T_CDATA_END, nil, 1] + ] end it 'lexes tags inside CDATA tags as regular text' do - lex('Foo
]]>').should == [[:T_CDATA, 'Foo
', 1]] + lex('Foo]]>').should == [ + [:T_CDATA_START, nil, 1], + [:T_CDATA_BODY, 'Foo
', 1], + [:T_CDATA_END, nil, 1] + ] + end + + it 'lexes a single bracket inside a CDATA tag' do + lex('').should == [ + [:T_CDATA_START, nil, 1], + [:T_CDATA_BODY, ']', 1], + [:T_CDATA_END, nil, 1] + ] end it 'lexes double brackets inside a CDATA tag' do - lex('').should == [[:T_CDATA, ']]', 1]] + lex('').should == [ + [:T_CDATA_START, nil, 1], + [:T_CDATA_BODY, ']', 1], + [:T_CDATA_BODY, ']', 1], + [:T_CDATA_END, nil, 1] + ] end it 'lexes two CDATA tags following each other' do lex('').should == [ [:T_ELEM_START, nil, 1], [:T_ELEM_NAME, 'a', 1], - [:T_CDATA, 'foo', 1], + [:T_CDATA_START, nil, 1], + [:T_CDATA_BODY, 'foo', 1], + [:T_CDATA_END, nil, 1], [:T_ELEM_START, nil, 1], [:T_ELEM_NAME, 'b', 1], - [:T_CDATA, 'bar', 1], + [:T_CDATA_START, nil, 1], + [:T_CDATA_BODY, 'bar', 1], + [:T_CDATA_END, nil, 1], [:T_ELEM_END, nil, 1], [:T_ELEM_END, nil, 1] ] end + + it 'lexes a CDATA tag containing a newline after the open tag' do + lex("").should == [ + [:T_CDATA_START, nil, 1], + [:T_CDATA_BODY, "\nfoo", 1], + [:T_CDATA_END, nil, 2] + ] + end + + it 'lexes a CDATA tag containing a newline before the closing tag' do + lex("").should == [ + [:T_CDATA_START, nil, 1], + [:T_CDATA_BODY, "foo\n", 1], + [:T_CDATA_END, nil, 2] + ] + end + + it 'lexes a CDATA tag with the body surrounded by newlines' do + lex("").should == [ + [:T_CDATA_START, nil, 1], + [:T_CDATA_BODY, "\nfoo\n", 1], + [:T_CDATA_END, nil, 3] + ] + end + + describe 'using an IO as input' do + it 'lexes a CDATA tag containing a newline after the open tag' do + lex_stringio("").should == [ + [:T_CDATA_START, nil, 1], + [:T_CDATA_BODY, "\n", 1], + [:T_CDATA_BODY, "foo", 2], + [:T_CDATA_END, nil, 2] + ] + end + + it 'lexes a CDATA tag containing a newline before the closing tag' do + lex_stringio("").should == [ + [:T_CDATA_START, nil, 1], + [:T_CDATA_BODY, "foo\n", 1], + [:T_CDATA_END, nil, 2] + ] + end + + it 'lexes a CDATA tag with the body surrounded by newlines' do + lex_stringio("").should == [ + [:T_CDATA_START, nil, 1], + [:T_CDATA_BODY, "\n", 1], + [:T_CDATA_BODY, "foo\n", 2], + [:T_CDATA_END, nil, 3] + ] + end + end end end diff --git a/spec/oga/xml/lexer/inline_javascript_spec.rb b/spec/oga/xml/lexer/inline_javascript_spec.rb index 9ffc944..e1d7295 100644 --- a/spec/oga/xml/lexer/inline_javascript_spec.rb +++ b/spec/oga/xml/lexer/inline_javascript_spec.rb @@ -30,7 +30,9 @@ describe Oga::XML::Lexer do [:T_ELEM_START, nil, 1], [:T_ELEM_NAME, 'script', 1], [:T_TEXT, @javascript, 1], - [:T_CDATA, 'foo', 1], + [:T_CDATA_START, nil, 1], + [:T_CDATA_BODY, 'foo', 1], + [:T_CDATA_END, nil, 1], [:T_ELEM_END, nil, 1] ] end diff --git a/spec/support/parsing_helpers.rb b/spec/support/parsing_helpers.rb index d5f07d6..a570dee 100644 --- a/spec/support/parsing_helpers.rb +++ b/spec/support/parsing_helpers.rb @@ -29,6 +29,13 @@ module Oga return Oga::XML::Lexer.new(input, options).lex end + ## + # @see [#lex] + # + def lex_stringio(input, options = {}) + return lex(StringIO.new(input), options) + end + ## # Lexes an XPath expression. #