diff --git a/ext/c/lexer.rl b/ext/c/lexer.rl index 22d15bc..8513c32 100644 --- a/ext/c/lexer.rl +++ b/ext/c/lexer.rl @@ -109,6 +109,7 @@ VALUE oga_xml_lexer_advance(VALUE self, VALUE data_block) ID id_on_proc_ins_end = rb_intern("on_proc_ins_end"); ID id_on_proc_ins_name = rb_intern("on_proc_ins_name"); ID id_on_proc_ins_start = rb_intern("on_proc_ins_start"); + ID id_on_proc_ins_body = rb_intern("on_proc_ins_body"); ID id_on_string_body = rb_intern("on_string_body"); ID id_on_string_dquote = rb_intern("on_string_dquote"); ID id_on_string_squote = rb_intern("on_string_squote"); diff --git a/ext/java/org/liboga/xml/Lexer.rl b/ext/java/org/liboga/xml/Lexer.rl index 7ef24e0..e04679e 100644 --- a/ext/java/org/liboga/xml/Lexer.rl +++ b/ext/java/org/liboga/xml/Lexer.rl @@ -123,6 +123,7 @@ public class Lexer extends RubyObject String id_on_proc_ins_end = "on_proc_ins_end"; String id_on_proc_ins_name = "on_proc_ins_name"; String id_on_proc_ins_start = "on_proc_ins_start"; + String id_on_proc_ins_body = "on_proc_ins_body"; String id_on_string_body = "on_string_body"; String id_on_string_dquote = "on_string_dquote"; String id_on_string_squote = "on_string_squote"; diff --git a/ext/ragel/base_lexer.rl b/ext/ragel/base_lexer.rl index bcabab4..b46a540 100644 --- a/ext/ragel/base_lexer.rl +++ b/ext/ragel/base_lexer.rl @@ -149,26 +149,33 @@ proc_ins_start = ''; + # Everything except "?" OR a single "?" + proc_ins_allowed = (^'?'+ | '?') $count_newlines; + action start_proc_ins { callback_simple(id_on_proc_ins_start); callback(id_on_proc_ins_name, data, encoding, ts + 2, te); - mark = te; - fnext proc_ins_body; } proc_ins_body := |* - proc_ins_end => { - callback(id_on_text, data, encoding, mark, ts); - callback_simple(id_on_proc_ins_end); + proc_ins_allowed => { + callback(id_on_proc_ins_body, data, encoding, ts, te); - mark = 0; + if ( lines > 0 ) + { + advance_line(lines); + + lines = 0; + } + }; + + proc_ins_end => { + callback_simple(id_on_proc_ins_end); fnext main; }; - - any; *|; # Strings diff --git a/lib/oga/xml/lexer.rb b/lib/oga/xml/lexer.rb index a0fc1c8..e9ea474 100644 --- a/lib/oga/xml/lexer.rb +++ b/lib/oga/xml/lexer.rb @@ -344,6 +344,15 @@ module Oga add_token(:T_PROC_INS_NAME, value) end + ## + # Called on the body of a processing instruction. + # + # @param [String] value + # + def on_proc_ins_body(value) + add_token(:T_PROC_INS_BODY, value) + end + ## # Called on the end of a processing instruction. # diff --git a/lib/oga/xml/parser.rll b/lib/oga/xml/parser.rll index b2c7d0a..2af01e8 100644 --- a/lib/oga/xml/parser.rll +++ b/lib/oga/xml/parser.rll @@ -29,7 +29,7 @@ %terminals T_CDATA_START T_CDATA_BODY T_CDATA_END; %terminals T_ELEM_START T_ELEM_NAME T_ELEM_NS T_ELEM_END T_ATTR T_ATTR_NS; %terminals T_XML_DECL_START T_XML_DECL_END; -%terminals T_PROC_INS_START T_PROC_INS_NAME T_PROC_INS_END; +%terminals T_PROC_INS_START T_PROC_INS_NAME T_PROC_INS_BODY T_PROC_INS_END; document = expressions { on_document(val[0]) } @@ -117,12 +117,17 @@ comment_body # Processing Instructions proc_ins - = T_PROC_INS_START T_PROC_INS_NAME T_TEXT? T_PROC_INS_END + = T_PROC_INS_START T_PROC_INS_NAME proc_ins_body T_PROC_INS_END { on_proc_ins(val[1], val[2]) } ; +proc_ins_body + = T_PROC_INS_BODY proc_ins_body { val[0] + val[1] } + | _ { '' } + ; + # Elements element_name_ns diff --git a/spec/oga/xml/lexer/processing_instruction_spec.rb b/spec/oga/xml/lexer/processing_instruction_spec.rb index dd69059..0803032 100644 --- a/spec/oga/xml/lexer/processing_instruction_spec.rb +++ b/spec/oga/xml/lexer/processing_instruction_spec.rb @@ -2,7 +2,7 @@ require 'spec_helper' describe Oga::XML::Lexer do describe 'processing instructions' do - it 'lexes a processing instruction' do + it 'lexes an instruction' do lex('').should == [ [:T_PROC_INS_START, nil, 1], [:T_PROC_INS_NAME, 'foo', 1], @@ -10,13 +10,95 @@ describe Oga::XML::Lexer do ] end - it 'lexes a processing instruction containing text' do + it 'lexes an instruction containing text' do lex('').should == [ [:T_PROC_INS_START, nil, 1], [:T_PROC_INS_NAME, 'foo', 1], - [:T_TEXT, ' bar ', 1], + [:T_PROC_INS_BODY, ' bar ', 1], [:T_PROC_INS_END, nil, 1] ] end + + it 'lexes an instruction containing a ?' do + lex('').should == [ + [:T_PROC_INS_START, nil, 1], + [:T_PROC_INS_NAME, 'foo', 1], + [:T_PROC_INS_BODY, ' ', 1], + [:T_PROC_INS_BODY, '?', 1], + [:T_PROC_INS_BODY, ' ', 1], + [:T_PROC_INS_END, nil, 1] + ] + end + + it 'lexes two instructions following each other' do + lex('').should == [ + [:T_PROC_INS_START, nil, 1], + [:T_PROC_INS_NAME, 'foo', 1], + [:T_PROC_INS_BODY, ' bar ', 1], + [:T_PROC_INS_END, nil, 1], + [:T_PROC_INS_START, nil, 1], + [:T_PROC_INS_NAME, 'foo', 1], + [:T_PROC_INS_BODY, ' baz ', 1], + [:T_PROC_INS_END, nil, 1] + ] + end + + it 'lexes an instruction with a newline after the name' do + lex("").should == [ + [:T_PROC_INS_START, nil, 1], + [:T_PROC_INS_NAME, 'foo', 1], + [:T_PROC_INS_BODY, "\nbar", 1], + [:T_PROC_INS_END, nil, 2] + ] + end + + it 'lexes an instruction with a newline before the closing tag' do + lex("").should == [ + [:T_PROC_INS_START, nil, 1], + [:T_PROC_INS_NAME, 'foo', 1], + [:T_PROC_INS_BODY, " bar\n", 1], + [:T_PROC_INS_END, nil, 2] + ] + end + + it 'lexes an instruction with the body surrounded by newlines' do + lex("").should == [ + [:T_PROC_INS_START, nil, 1], + [:T_PROC_INS_NAME, 'foo', 1], + [:T_PROC_INS_BODY, "\nbar\n", 1], + [:T_PROC_INS_END, nil, 3] + ] + end + + describe 'using an IO as input' do + it 'lexes an instruction with a newline after the name' do + lex_stringio("").should == [ + [:T_PROC_INS_START, nil, 1], + [:T_PROC_INS_NAME, 'foo', 1], + [:T_PROC_INS_BODY, "\n", 1], + [:T_PROC_INS_BODY, "bar", 2], + [:T_PROC_INS_END, nil, 2] + ] + end + + it 'lexes an instruction with a newline before the closing tag' do + lex_stringio("").should == [ + [:T_PROC_INS_START, nil, 1], + [:T_PROC_INS_NAME, 'foo', 1], + [:T_PROC_INS_BODY, " bar\n", 1], + [:T_PROC_INS_END, nil, 2] + ] + end + + it 'lexes an instruction with the body surrounded by newlines' do + lex_stringio("").should == [ + [:T_PROC_INS_START, nil, 1], + [:T_PROC_INS_NAME, 'foo', 1], + [:T_PROC_INS_BODY, "\n", 1], + [:T_PROC_INS_BODY, "bar\n", 2], + [:T_PROC_INS_END, nil, 3] + ] + end + end end end