From 5182d0c488759efb96d85a399de29550faea3efe Mon Sep 17 00:00:00 2001 From: Yorick Peterse Date: Sat, 23 May 2015 09:59:50 +0200 Subject: [PATCH] Correct closing of unclosed, nested HTML elements Previously, HTML such as this would be lexed incorrectly:
inside div
outside div The lexer would see this as the following instead:
outside div
This commit exposes the name of the closing tag to XML::Lexer#on_element_end (omitted for self closing tags). This can be used to automatically close nested tags that were left open, ensuring the above HTML is lexed correctly. The new setup ignores namespace prefixes as these are not used in HTML; XML in turn won't even run the code to begin with since it doesn't allow one to leave out closing tags. --- ext/ragel/base_lexer.rl | 13 ++++-- lib/oga/xml/lexer.rb | 12 ++++- spec/oga/html/lexer/closing_mismatch_spec.rb | 13 ++++++ spec/oga/html/lexer/closing_rules/ul_spec.rb | 49 ++++++++++++++++++++ 4 files changed, 82 insertions(+), 5 deletions(-) create mode 100644 spec/oga/html/lexer/closing_mismatch_spec.rb create mode 100644 spec/oga/html/lexer/closing_rules/ul_spec.rb diff --git a/ext/ragel/base_lexer.rl b/ext/ragel/base_lexer.rl index 8a6e469..1546f2b 100644 --- a/ext/ragel/base_lexer.rl +++ b/ext/ragel/base_lexer.rl @@ -363,16 +363,15 @@ # body of an element is lexed using the `main` machine. # - element_start = '<' ident_char; - element_end = ''; - action start_element { fhold; fnext element_name; } action close_element { - callback_simple(id_on_element_end); + callback(id_on_element_end, data, encoding, mark, te - 1); + + mark = 0; } action close_element_fnext_main { @@ -381,6 +380,12 @@ fnext main; } + element_start = '<' ident_char; + + element_end = '' + | '' + ; + # Machine used for lexing the name/namespace of an element. element_name := |* identifier ':' => { diff --git a/lib/oga/xml/lexer.rb b/lib/oga/xml/lexer.rb index 460b167..e2c724d 100644 --- a/lib/oga/xml/lexer.rb +++ b/lib/oga/xml/lexer.rb @@ -476,9 +476,19 @@ module Oga ## # Called on the closing tag of an element. # - def on_element_end + # @param [String] ns_name The name of the element (minus namespace + # prefix). This is not set for self closing tags. + # + def on_element_end(name = nil) return if @elements.empty? + if html? 
and name and @elements.include?(name) + while current_element != name + add_token(:T_ELEM_END) + @elements.pop + end + end + add_token(:T_ELEM_END) @elements.pop diff --git a/spec/oga/html/lexer/closing_mismatch_spec.rb b/spec/oga/html/lexer/closing_mismatch_spec.rb new file mode 100644 index 0000000..f177745 --- /dev/null +++ b/spec/oga/html/lexer/closing_mismatch_spec.rb @@ -0,0 +1,13 @@ +require 'spec_helper' + +describe Oga::XML::Lexer do + describe 'closing HTML elements with mismatched closing tags' do + it 'lexes a

element closed using a element' do + lex_html('

foo').should == [ + [:T_ELEM_NAME, 'p', 1], + [:T_TEXT, 'foo', 1], + [:T_ELEM_END, nil, 1] + ] + end + end +end diff --git a/spec/oga/html/lexer/closing_rules/ul_spec.rb b/spec/oga/html/lexer/closing_rules/ul_spec.rb new file mode 100644 index 0000000..39ae1a8 --- /dev/null +++ b/spec/oga/html/lexer/closing_rules/ul_spec.rb @@ -0,0 +1,49 @@ +require 'spec_helper' + +describe Oga::XML::Lexer do + describe 'using HTML