From cd0f3380c4660e1f5872c59d448ca8f042b1ee06 Mon Sep 17 00:00:00 2001
From: Yorick Peterse <yorickpeterse@gmail.com>
Date: Mon, 19 May 2014 09:35:35 +0200
Subject: [PATCH] Merge multiple CDATA tokens into a single token.

The tokens T_CDATA_START, T_TEXT and T_CDATA_END have been merged together into
T_CDATA.
---
 ext/ragel/base_lexer.rl          | 19 ++++---------------
 lib/oga/xml/lexer.rb             | 13 +++----------
 lib/oga/xml/parser.y             |  7 ++-----
 spec/oga/xml/lexer/cdata_spec.rb | 18 +++---------------
 4 files changed, 12 insertions(+), 45 deletions(-)
diff --git a/ext/ragel/base_lexer.rl b/ext/ragel/base_lexer.rl
index 6c0c333..3d00f56 100644
--- a/ext/ragel/base_lexer.rl
+++ b/ext/ragel/base_lexer.rl
@@ -120,21 +120,6 @@
     cdata_start = '<![CDATA[';
     cdata_end   = ']]>';
 
-    action start_cdata {
-        callback_simple("on_cdata_start");
-        fcall cdata;
-    }
-
-    # Machine that for processing the contents of CDATA tags. Everything
-    # inside a CDATA tag is treated as plain text.
-    cdata := |*
-        any* cdata_end => {
-            callback("on_text", data, encoding, ts, te - 3);
-            callback_simple("on_cdata_end");
-            fret;
-        };
-    *|;
-
     # Comments
     #
     # http://www.w3.org/TR/html-markup/syntax.html#comments
@@ -248,6 +233,10 @@
             callback("on_comment", data, encoding, ts + 4, te - 3);
         };
 
+        cdata_start any* cdata_end => {
+            callback("on_cdata", data, encoding, ts + 9, te - 3);
+        };
+
         # Enter the body of the tag. If HTML mode is enabled and the current
         # element is a void element we'll close it and bail out.
         '>' => {
diff --git a/lib/oga/xml/lexer.rb b/lib/oga/xml/lexer.rb
index bf15f4b..e58c452 100644
--- a/lib/oga/xml/lexer.rb
+++ b/lib/oga/xml/lexer.rb
@@ -203,17 +203,10 @@ module Oga
       end
 
       ##
-      # Called on the start of a CDATA tag.
+      # Called on a CDATA tag with the text between its delimiters.
       #
-      def on_cdata_start
-        add_token(:T_CDATA_START)
-      end
-
-      ##
-      # Called on the end of a CDATA tag.
-      #
-      def on_cdata_end
-        add_token(:T_CDATA_END)
+      def on_cdata(value)
+        add_token(:T_CDATA, value)
       end
 
       ##
diff --git a/lib/oga/xml/parser.y b/lib/oga/xml/parser.y
index d444512..118c906 100644
--- a/lib/oga/xml/parser.y
+++ b/lib/oga/xml/parser.y
@@ -12,7 +12,7 @@ class Oga::XML::Parser
 token T_STRING T_TEXT
 token T_DOCTYPE_START T_DOCTYPE_END T_DOCTYPE_TYPE T_DOCTYPE_NAME
 token T_DOCTYPE_INLINE
-token T_CDATA_START T_CDATA_END T_COMMENT
+token T_CDATA T_COMMENT
 token T_ELEM_START T_ELEM_NAME T_ELEM_NS T_ELEM_END T_ATTR
 token T_XML_DECL_START T_XML_DECL_END
 
@@ -81,11 +81,8 @@ rule
   # CDATA tags
 
   cdata
-    # <![CDATA[]]>
-    : T_CDATA_START T_CDATA_END { on_cdata }
-
     # <![CDATA[foo]]>
-    | T_CDATA_START T_TEXT T_CDATA_END { on_cdata(val[1]) }
+    : T_CDATA { on_cdata(val[0]) }
     ;
 
   # Comments
diff --git a/spec/oga/xml/lexer/cdata_spec.rb b/spec/oga/xml/lexer/cdata_spec.rb
index f4465dd..0e0887b 100644
--- a/spec/oga/xml/lexer/cdata_spec.rb
+++ b/spec/oga/xml/lexer/cdata_spec.rb
@@ -3,27 +3,15 @@ require 'spec_helper'
 describe Oga::XML::Lexer do
   context 'cdata tags' do
     example 'lex a cdata tag' do
-      lex('<![CDATA[foo]]>').should == [
-        [:T_CDATA_START, nil, 1],
-        [:T_TEXT, 'foo', 1],
-        [:T_CDATA_END, nil, 1]
-      ]
+      lex('<![CDATA[foo]]>').should == [[:T_CDATA, 'foo', 1]]
     end
 
     example 'lex tags inside CDATA tags as regular text' do
-      lex('<![CDATA[<p>Foo</p>]]>').should == [
-        [:T_CDATA_START, nil, 1],
-        [:T_TEXT, '<p>Foo</p>', 1],
-        [:T_CDATA_END, nil, 1]
-      ]
+      lex('<![CDATA[<p>Foo</p>]]>').should == [[:T_CDATA, '<p>Foo</p>', 1]]
     end
 
     example 'lex double brackets inside a CDATA tag' do
-      lex('<![CDATA[]]]]>').should == [
-        [:T_CDATA_START, nil, 1],
-        [:T_TEXT, ']]', 1],
-        [:T_CDATA_END, nil, 1]
-      ]
+      lex('<![CDATA[]]]]>').should == [[:T_CDATA, ']]', 1]]
     end
   end
 end