From cd0f3380c4660e1f5872c59d448ca8f042b1ee06 Mon Sep 17 00:00:00 2001
From: Yorick Peterse
Date: Mon, 19 May 2014 09:35:35 +0200
Subject: [PATCH] Merge multiple CDATA tokens into a single token.
The tokens T_CDATA_START, T_TEXT and T_CDATA_END have been merged together into
T_CDATA.
---
ext/ragel/base_lexer.rl | 19 ++++---------------
lib/oga/xml/lexer.rb | 13 +++----------
lib/oga/xml/parser.y | 7 ++-----
spec/oga/xml/lexer/cdata_spec.rb | 18 +++---------------
4 files changed, 12 insertions(+), 45 deletions(-)
diff --git a/ext/ragel/base_lexer.rl b/ext/ragel/base_lexer.rl
index 6c0c333..3d00f56 100644
--- a/ext/ragel/base_lexer.rl
+++ b/ext/ragel/base_lexer.rl
@@ -120,21 +120,6 @@
cdata_start = '<![CDATA[';
cdata_end = ']]>';
- action start_cdata {
- callback_simple("on_cdata_start");
- fcall cdata;
- }
-
- # Machine that for processing the contents of CDATA tags. Everything
- # inside a CDATA tag is treated as plain text.
- cdata := |*
- any* cdata_end => {
- callback("on_text", data, encoding, ts, te - 3);
- callback_simple("on_cdata_end");
- fret;
- };
- *|;
-
# Comments
#
# http://www.w3.org/TR/html-markup/syntax.html#comments
@@ -248,6 +233,10 @@
callback("on_comment", data, encoding, ts + 4, te - 3);
};
+ cdata_start any* cdata_end => {
+ callback("on_cdata", data, encoding, ts + 9, te - 3);
+ };
+
# Enter the body of the tag. If HTML mode is enabled and the current
# element is a void element we'll close it and bail out.
'>' => {
diff --git a/lib/oga/xml/lexer.rb b/lib/oga/xml/lexer.rb
index bf15f4b..e58c452 100644
--- a/lib/oga/xml/lexer.rb
+++ b/lib/oga/xml/lexer.rb
@@ -203,17 +203,10 @@ module Oga
end
##
- # Called on the start of a CDATA tag.
+ # Called on a CDATA tag.
#
- def on_cdata_start
- add_token(:T_CDATA_START)
- end
-
- ##
- # Called on the end of a CDATA tag.
- #
- def on_cdata_end
- add_token(:T_CDATA_END)
+ def on_cdata(value)
+ add_token(:T_CDATA, value)
end
##
diff --git a/lib/oga/xml/parser.y b/lib/oga/xml/parser.y
index d444512..118c906 100644
--- a/lib/oga/xml/parser.y
+++ b/lib/oga/xml/parser.y
@@ -12,7 +12,7 @@ class Oga::XML::Parser
token T_STRING T_TEXT
token T_DOCTYPE_START T_DOCTYPE_END T_DOCTYPE_TYPE T_DOCTYPE_NAME
token T_DOCTYPE_INLINE
-token T_CDATA_START T_CDATA_END T_COMMENT
+token T_CDATA T_COMMENT
token T_ELEM_START T_ELEM_NAME T_ELEM_NS T_ELEM_END T_ATTR
token T_XML_DECL_START T_XML_DECL_END
@@ -81,11 +81,8 @@ rule
# CDATA tags
cdata
- #
- : T_CDATA_START T_CDATA_END { on_cdata }
-
#
- | T_CDATA_START T_TEXT T_CDATA_END { on_cdata(val[1]) }
+ | T_CDATA { on_cdata(val[0]) }
;
# Comments
diff --git a/spec/oga/xml/lexer/cdata_spec.rb b/spec/oga/xml/lexer/cdata_spec.rb
index f4465dd..0e0887b 100644
--- a/spec/oga/xml/lexer/cdata_spec.rb
+++ b/spec/oga/xml/lexer/cdata_spec.rb
@@ -3,27 +3,15 @@ require 'spec_helper'
describe Oga::XML::Lexer do
context 'cdata tags' do
example 'lex a cdata tag' do
- lex('<![CDATA[foo]]>').should == [
- [:T_CDATA_START, nil, 1],
- [:T_TEXT, 'foo', 1],
- [:T_CDATA_END, nil, 1]
- ]
+ lex('<![CDATA[foo]]>').should == [[:T_CDATA, 'foo', 1]]
end
example 'lex tags inside CDATA tags as regular text' do
- lex('<![CDATA[<p>Foo</p>]]>').should == [
- [:T_CDATA_START, nil, 1],
- [:T_TEXT, '<p>Foo</p>', 1],
- [:T_CDATA_END, nil, 1]
- ]
+ lex('<![CDATA[<p>Foo</p>]]>').should == [[:T_CDATA, '<p>Foo</p>', 1]]
end
example 'lex double brackets inside a CDATA tag' do
- lex('<![CDATA[]]]]>').should == [
- [:T_CDATA_START, nil, 1],
- [:T_TEXT, ']]', 1],
- [:T_CDATA_END, nil, 1]
- ]
+ lex('<![CDATA[]]]]>').should == [[:T_CDATA, ']]', 1]]
end
end
end