From dd138981f68a606eff5d5a01e990f04398087dc4 Mon Sep 17 00:00:00 2001 From: Yorick Peterse Date: Sat, 3 Sep 2016 21:24:19 +0200 Subject: [PATCH] Generate XML without relying on recursion While using recursion is an easy way of generating XML it can lead to the call stack overflowing when serialising documents with lots of nested nodes. Generally there are two ways of working around this: 1. Use an explicit stack (e.g. an array or a queue of sorts) instead of relying on the call stack. 2. Use an algorithm that doesn't use a stack at all (e.g. Morris traversal). This commit introduces the XML::Generator class which can serialize documents back to XML without using a stack at all. This class takes advantage of XML nodes having access to not only their child nodes, but also their siblings and their parents. All XML serialisation logic now resides in the XML::Generator class. In turn the various "to_xml" methods just use this class and serialize everything starting at "self". --- CHANGELOG.md | 11 ++ lib/oga.rb | 2 + lib/oga/xml/attribute.rb | 14 +- lib/oga/xml/cdata.rb | 6 - lib/oga/xml/character_node.rb | 5 - lib/oga/xml/comment.rb | 6 - lib/oga/xml/doctype.rb | 16 +-- lib/oga/xml/document.rb | 23 +-- lib/oga/xml/element.rb | 32 ++--- lib/oga/xml/generator.rb | 198 ++++++++++++++++++++++++++ lib/oga/xml/node.rb | 1 + lib/oga/xml/processing_instruction.rb | 5 - lib/oga/xml/text.rb | 12 +- lib/oga/xml/to_xml.rb | 12 ++ lib/oga/xml/xml_declaration.rb | 17 +-- spec/oga/xml/character_node_spec.rb | 6 - spec/oga/xml/document_spec.rb | 6 + spec/oga/xml/element_spec.rb | 10 ++ spec/oga/xml/generator_spec.rb | 51 +++++++ 19 files changed, 311 insertions(+), 122 deletions(-) create mode 100644 lib/oga/xml/generator.rb create mode 100644 lib/oga/xml/to_xml.rb create mode 100644 spec/oga/xml/generator_spec.rb diff --git a/CHANGELOG.md b/CHANGELOG.md index e053d68..a4a3faa 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,6 +3,17 @@ This document contains details of the various 
releases and their release dates. Dates are in the format `yyyy-mm-dd`. +## 2.4 - Unreleased + +### Serialising Large Documents + +Oga can now serialise large documents without causing the call stack to overflow +thanks to the new `Oga::XML::Generator` class. This class can generate XML +without using a stack at all. + +See issue for more +information. + ## 2.3 - 2016-07-13 Thanks to various changes provided by Erik Michaels-Ober Oga can now be used to diff --git a/lib/oga.rb b/lib/oga.rb index d87cc87..3bd9081 100644 --- a/lib/oga.rb +++ b/lib/oga.rb @@ -23,6 +23,7 @@ if RUBY_PLATFORM == 'java' end #:nocov: +require 'oga/xml/to_xml' require 'oga/xml/html_void_elements' require 'oga/xml/entities' require 'oga/xml/querying' @@ -42,6 +43,7 @@ require 'oga/xml/default_namespace' require 'oga/xml/attribute' require 'oga/xml/element' require 'oga/xml/node_set' +require 'oga/xml/generator' require 'oga/xml/sax_parser' require 'oga/xml/pull_parser' diff --git a/lib/oga/xml/attribute.rb b/lib/oga/xml/attribute.rb index c0496e1..7ae5222 100644 --- a/lib/oga/xml/attribute.rb +++ b/lib/oga/xml/attribute.rb @@ -3,6 +3,7 @@ module Oga # Class for storing information about a single XML attribute. class Attribute include ExpandedName + include ToXML # The name of the attribute. # @return [String] @@ -81,19 +82,6 @@ module Oga alias_method :to_s, :text - # @return [String] - def to_xml - if namespace_name - full_name = "#{namespace_name}:#{name}" - else - full_name = name - end - - enc_value = value ? Entities.encode_attribute(value) : nil - - %Q(#{full_name}="#{enc_value}") - end - # @return [String] def inspect segments = [] diff --git a/lib/oga/xml/cdata.rb b/lib/oga/xml/cdata.rb index 22180bc..439b096 100644 --- a/lib/oga/xml/cdata.rb +++ b/lib/oga/xml/cdata.rb @@ -2,12 +2,6 @@ module Oga module XML # Class used for storing information about CDATA tags. class Cdata < CharacterNode - # Converts the node back to XML. 
- # - # @return [String] - def to_xml - "<![CDATA[#{text}]]>" - end end # Cdata end # XML end # Oga diff --git a/lib/oga/xml/character_node.rb b/lib/oga/xml/character_node.rb index 5206aa8..8387028 100644 --- a/lib/oga/xml/character_node.rb +++ b/lib/oga/xml/character_node.rb @@ -15,11 +15,6 @@ module Oga @text = options[:text] end - # @return [String] - def to_xml - text.to_s - end - # @return [String] def inspect "#{self.class.to_s.split('::').last}(#{text.inspect})" diff --git a/lib/oga/xml/comment.rb b/lib/oga/xml/comment.rb index 31fed66..b5a3f90 100644 --- a/lib/oga/xml/comment.rb +++ b/lib/oga/xml/comment.rb @@ -2,12 +2,6 @@ module Oga module XML # Class used for storing information about XML comments. class Comment < CharacterNode - # Converts the node back to XML. - # - # @return [String] - def to_xml - "<!--#{text}-->" - end end # Comment end # XML end # Oga diff --git a/lib/oga/xml/doctype.rb b/lib/oga/xml/doctype.rb index feb8593..c5f5bba 100644 --- a/lib/oga/xml/doctype.rb +++ b/lib/oga/xml/doctype.rb @@ -2,6 +2,8 @@ module Oga module XML # Class used for storing information about Doctypes. class Doctype + include ToXML + # The name of the doctype (e.g. "HTML"). # @return [String] attr_accessor :name @@ -39,20 +41,6 @@ module Oga @inline_rules = options[:inline_rules] end - # Converts the doctype back to XML. - # - # @return [String] - def to_xml - segments = "<!DOCTYPE #{name}" - - segments << " #{type}" if type - segments << %Q{ "#{public_id}"} if public_id - segments << %Q{ "#{system_id}"} if system_id - segments << " [#{inline_rules}]" if inline_rules - - segments + '>' - end - # Inspects the doctype. # # @return [String] diff --git a/lib/oga/xml/document.rb b/lib/oga/xml/document.rb index b2357da..fd4c0ee 100644 --- a/lib/oga/xml/document.rb +++ b/lib/oga/xml/document.rb @@ -5,6 +5,7 @@ module Oga class Document include Querying include Traversal + include ToXML # @return [Oga::XML::Doctype] attr_accessor :doctype @@ -56,23 +57,6 @@ module Oga self end - # Converts the document and its child nodes to XML. 
- # - # @return [String] - def to_xml - xml = children.map(&:to_xml).join('') - - if doctype - xml = doctype.to_xml + "\n" + xml.strip - end - - if xml_declaration - xml = xml_declaration.to_xml + "\n" + xml.strip - end - - xml - end - # @return [TrueClass|FalseClass] def html? type.equal?(:html) @@ -99,6 +83,11 @@ Document( ) EOF end + + # @return [FalseClass] + def literal_html_name? + false + end end # Document end # XML end # Oga diff --git a/lib/oga/xml/element.rb b/lib/oga/xml/element.rb index c513f3c..a2732d1 100644 --- a/lib/oga/xml/element.rb +++ b/lib/oga/xml/element.rb @@ -211,30 +211,6 @@ module Oga @children = NodeSet.new([text_node], self) end - # Converts the element and its child elements to XML. - # - # @return [String] - def to_xml - if namespace_name - full_name = "#{namespace_name}:#{name}" - else - full_name = name - end - - body = children.map(&:to_xml).join('') - attrs = '' - - attributes.each do |attr| - attrs << " #{attr.to_xml}" - end - - if self_closing? - return "<#{full_name}#{attrs} />" - else - return "<#{full_name}#{attrs}>#{body}</#{full_name}>" - end - end - # @return [String] def inspect segments = [] @@ -323,6 +299,14 @@ module Oga end end + # Returns true if the current element name is the name of one of the + # literal HTML elements. + # + # @return [TrueClass|FalseClass] + def literal_html_name? + Lexer::LITERAL_HTML_ELEMENTS.allow?(name) + end + private # Registers namespaces based on any "xmlns" attributes. diff --git a/lib/oga/xml/generator.rb b/lib/oga/xml/generator.rb new file mode 100644 index 0000000..f47d959 --- /dev/null +++ b/lib/oga/xml/generator.rb @@ -0,0 +1,198 @@ +module Oga + module XML + # Class for generating XML as a String based on an existing document. 
+ # + # Basic usage: + # + # element = Oga::XML::Element.new(name: 'root') + # element.inner_text = 'hello' + # + # gen = Oga::XML::Generator.new(element) + # + # gen.to_xml # => "<root>hello</root>" + # + # @private + class Generator + # @param [Oga::XML::Document|Oga::XML::Node] start The node to serialise. + def initialize(root) + @start = root + + if @start.respond_to?(:root_node) + @html_mode = @start.root_node.html? + else + @html_mode = false + end + end + + # Returns the XML for the current root node. + # + # @return [String] + def to_xml + current = @start + output = '' + + while current + children = false + + # Determine what callback to use for the current node. The order of + # this statement is based on how likely it is for an arm to match. + case current + when Oga::XML::Element + callback = :on_element + children = true + when Oga::XML::Text + callback = :on_text + when Oga::XML::Cdata + callback = :on_cdata + when Oga::XML::Comment + callback = :on_comment + when Oga::XML::Attribute + callback = :on_attribute + when Oga::XML::ProcessingInstruction + callback = :on_processing_instruction + when Oga::XML::Doctype + callback = :on_doctype + when Oga::XML::XmlDeclaration + callback = :on_xml_declaration + when Oga::XML::Document + callback = :on_document + children = true + else + raise TypeError, "Can't serialize #{current.class} to XML" + end + + send(callback, current, output) + + if child_node = children && current.children[0] + current = child_node + else + until next_node = current.is_a?(Node) && current.next + if current.is_a?(Node) && current != @start + current = current.parent + end + + send(:after_element, current, output) if current.is_a?(Element) + + break if current == @start + end + + current = next_node + end + end + + output + end + + # @param [Oga::XML::Text] node + # @param [String] output + def on_text(node, output) + if @html_mode && (parent = node.parent) && parent.literal_html_name? 
+ output << node.text + else + output << Entities.encode(node.text) + end + end + + # @param [Oga::XML::Cdata] node + # @param [String] output + def on_cdata(node, output) + output << "<![CDATA[#{node.text}]]>" + end + + # @param [Oga::XML::Comment] node + # @param [String] output + def on_comment(node, output) + output << "<!--#{node.text}-->" + end + + # @param [Oga::XML::ProcessingInstruction] node + # @param [String] output + def on_processing_instruction(node, output) + output << "<?#{node.name}#{node.text}?>" + end + + # @param [Oga::XML::Element] element + # @param [String] body The content of the element. + def on_element(element, output) + name = element.expanded_name + attrs = '' + + element.attributes.each do |attr| + attrs << ' ' + on_attribute(attr, attrs) + end + + if self_closing?(element) + output << "<#{name}#{attrs} />" + else + output << "<#{name}#{attrs}>" + end + end + + # @param [Oga::XML::Element] element + # @param [String] output + def after_element(element, output) + output << "</#{element.expanded_name}>" unless self_closing?(element) + end + + # @param [Oga::XML::Attribute] attr + # @param [String] output + def on_attribute(attr, output) + name = attr.expanded_name + enc_value = attr.value ? Entities.encode_attribute(attr.value) : nil + + output << %Q(#{name}="#{enc_value}") + end + + # @param [Oga::XML::Doctype] node + # @param [String] output + def on_doctype(node, output) + output << "<!DOCTYPE #{node.name}" + + output << " #{node.type}" if node.type + output << %Q{ "#{node.public_id}"} if node.public_id + output << %Q{ "#{node.system_id}"} if node.system_id + output << " [#{node.inline_rules}]" if node.inline_rules + output << '>' + end + + # @param [Oga::XML::Document] node + # @param [String] output + def on_document(doc, output) + if doc.xml_declaration + on_xml_declaration(doc.xml_declaration, output) + output << "\n" + end + + if doc.doctype + on_doctype(doc.doctype, output) + output << "\n" + end + end + + # @param [Oga::XML::XmlDeclaration] node + # @param [String] output + def on_xml_declaration(node, output) + output << '<?xml' + + [:version, :encoding, :standalone].each do |getter| + value = node.send(getter) + + output << %Q{ #{getter}="#{value}"} if value + end + + output << ' ?>' + end + + # @param [Oga::XML::Element] element + # @return [TrueClass|FalseClass] + def self_closing?(element) + if @html_mode && !HTML_VOID_ELEMENTS.allow?(element.name) + false + else + element.children.empty? 
+ end + end + end + end +end diff --git a/lib/oga/xml/node.rb b/lib/oga/xml/node.rb index 457882e..004b236 100644 --- a/lib/oga/xml/node.rb +++ b/lib/oga/xml/node.rb @@ -5,6 +5,7 @@ module Oga # nodes. class Node include Traversal + include ToXML # @return [Oga::XML::NodeSet] attr_reader :node_set diff --git a/lib/oga/xml/processing_instruction.rb b/lib/oga/xml/processing_instruction.rb index be05af7..c225c23 100644 --- a/lib/oga/xml/processing_instruction.rb +++ b/lib/oga/xml/processing_instruction.rb @@ -15,11 +15,6 @@ module Oga @name = options[:name] end - # @return [String] - def to_xml - "<?#{name}#{text}?>" - end - # @return [String] def inspect "ProcessingInstruction(name: #{name.inspect} text: #{text.inspect})" diff --git a/lib/oga/xml/text.rb b/lib/oga/xml/text.rb index 2ee1734..20e013c 100644 --- a/lib/oga/xml/text.rb +++ b/lib/oga/xml/text.rb @@ -28,15 +28,6 @@ module Oga @text end - # @see [Oga::XML::CharacterNode#to_xml] - def to_xml - return super if inside_literal_html? - - Entities.encode(super) - end - - private - # @return [TrueClass|FalseClass] def decode_entities? !@decoded && !inside_literal_html? @@ -46,8 +37,7 @@ module Oga def inside_literal_html? node = parent - node.is_a?(Element) && html? && - Lexer::LITERAL_HTML_ELEMENTS.allow?(node.name) + node && html? && node.literal_html_name? end end # Text end # XML diff --git a/lib/oga/xml/to_xml.rb b/lib/oga/xml/to_xml.rb new file mode 100644 index 0000000..fd56ffe --- /dev/null +++ b/lib/oga/xml/to_xml.rb @@ -0,0 +1,12 @@ +module Oga + module XML + # Module that provides a `#to_xml` method that serialises the current node + # back to XML. + module ToXML + # @return [String] + def to_xml + Generator.new(self).to_xml + end + end + end +end diff --git a/lib/oga/xml/xml_declaration.rb b/lib/oga/xml/xml_declaration.rb index 2ad6e16..f5b6268 100644 --- a/lib/oga/xml/xml_declaration.rb +++ b/lib/oga/xml/xml_declaration.rb @@ -2,6 +2,8 @@ module Oga module XML # Class containing information about an XML declaration tag. 
 class XmlDeclaration + include ToXML + # @return [String] attr_accessor :version @@ -23,21 +25,6 @@ module Oga @standalone = options[:standalone] end - # Converts the declaration tag to XML. - # - # @return [String] - def to_xml - pairs = [] - - [:version, :encoding, :standalone].each do |getter| - value = send(getter) - - pairs << %Q{#{getter}="#{value}"} if value - end - - "<?xml #{pairs.join(' ')} ?>" - end - # @return [String] def inspect segments = [] diff --git a/spec/oga/xml/character_node_spec.rb b/spec/oga/xml/character_node_spec.rb index e21c681..69ad9a5 100644 --- a/spec/oga/xml/character_node_spec.rb +++ b/spec/oga/xml/character_node_spec.rb @@ -14,12 +14,6 @@ describe Oga::XML::CharacterNode do end end - describe '#to_xml' do - it 'converts the node to XML' do - described_class.new(:text => 'a').to_xml.should == 'a' - end - end - describe '#inspect' do it 'returns the inspect value' do described_class.new(:text => 'a').inspect.should == 'CharacterNode("a")' diff --git a/spec/oga/xml/document_spec.rb b/spec/oga/xml/document_spec.rb index 119d600..6c2ed33 100644 --- a/spec/oga/xml/document_spec.rb +++ b/spec/oga/xml/document_spec.rb @@ -142,4 +142,10 @@ Document( EOF end end + + describe '#literal_html_name?' do + it 'returns false' do + described_class.new.literal_html_name?.should == false + end + end end diff --git a/spec/oga/xml/element_spec.rb b/spec/oga/xml/element_spec.rb index d43c6e5..9bfd17f 100644 --- a/spec/oga/xml/element_spec.rb +++ b/spec/oga/xml/element_spec.rb @@ -636,4 +636,14 @@ describe Oga::XML::Element do end end end + + describe '#literal_html_name?' 
do + it 'returns true for an element name matching one of the literal HTML elements' do + described_class.new(:name => 'script').literal_html_name?.should == true + end + + it 'returns false for an element name not matching one of the literal HTML elements' do + described_class.new(:name => 'foo').literal_html_name?.should == false + end + end end diff --git a/spec/oga/xml/generator_spec.rb b/spec/oga/xml/generator_spec.rb new file mode 100644 index 0000000..683daea --- /dev/null +++ b/spec/oga/xml/generator_spec.rb @@ -0,0 +1,51 @@ +require 'spec_helper' + +describe Oga::XML::Generator do + describe '#to_xml' do + describe 'using an unsupported root type' do + it 'raises TypeError' do + -> { described_class.new(:foo).to_xml }.should raise_error(TypeError) + end + end + + describe 'using an Element as the root node' do + it 'returns a String' do + element = Oga::XML::Element.new(name: 'foo') + element.set('attr', 'value') + + output = described_class.new(element).to_xml + + output.should == '<foo attr="value" />' + end + end + + describe 'using a Document as the root node' do + it 'returns a String' do + element = Oga::XML::Element.new(name: 'foo') + doc = Oga::XML::Document.new(children: [element]) + output = described_class.new(doc).to_xml + + output.should == '<foo />' + end + end + + describe 'using Element nodes with siblings' do + it 'returns a String' do + root = Oga::XML::Element.new( + name: 'root', + children: [ + Oga::XML::Element.new(name: 'a'), + Oga::XML::Element.new( + name: 'b', + children: [Oga::XML::Element.new(name: 'c')] + ) + ] + ) + + output = described_class.new(root).to_xml + + output.should == '<root><a /><b><c /></b></root>' + end + end + end +end