diff --git a/lib/oga.rb b/lib/oga.rb
index 7b6b11e..19ca643 100644
--- a/lib/oga.rb
+++ b/lib/oga.rb
@@ -2,6 +2,8 @@ require 'set'
 
 require_relative 'oga/xml/lexer'
 require_relative 'oga/xml/parser'
+require_relative 'oga/xml/pull_parser'
+
 require_relative 'oga/xml/node'
 require_relative 'oga/xml/element'
 require_relative 'oga/xml/document'
diff --git a/lib/oga/xml/pull_parser.rb b/lib/oga/xml/pull_parser.rb
new file mode 100644
index 0000000..457625e
--- /dev/null
+++ b/lib/oga/xml/pull_parser.rb
@@ -0,0 +1,93 @@
+module Oga
+  module XML
+    ##
+    # The PullParser class can be used to parse an XML document incrementally
+    # instead of parsing it as a whole. This results in lower memory usage and
+    # potentially faster parsing times. The downside is that pull parsers are
+    # typically more difficult to use compared to DOM parsers.
+    #
+    # Basic parsing using this class works as follows:
+    #
+    #     parser = Oga::XML::PullParser.new('... xml here ...')
+    #
+    #     parser.parse do |node|
+    #       if node.is_a?(Oga::XML::Element)
+    #         puts node.name
+    #       end
+    #     end
+    #
+    # This parser yields proper XML instances such as {Oga::XML::Element}.
+    # Doctypes and XML declarations are ignored by this parser.
+    #
+    class PullParser < Parser
+      ##
+      # Names of the parser callbacks that are redefined as no-ops.
+      #
+      # @return [Array<Symbol>]
+      #
+      DISABLED_CALLBACKS = [
+        :on_document,
+        :on_doctype,
+        :on_xml_decl,
+        :on_element_children
+      ].freeze
+
+      ##
+      # Names of the parser callbacks whose return value is passed to the
+      # block given to {#parse}.
+      #
+      # @return [Array<Symbol>]
+      #
+      BLOCK_CALLBACKS = [
+        :on_cdata,
+        :on_comment,
+        :on_text,
+        :on_element
+      ].freeze
+
+      ##
+      # @see Oga::XML::Parser#reset
+      #
+      def reset
+        super
+
+        @block = nil
+      end
+
+      ##
+      # Parses the input and yields every node to the supplied block. The
+      # parser is reset afterwards, even when parsing raises.
+      #
+      # @yieldparam [Oga::XML::Node]
+      #
+      def parse(&block)
+        @block = block
+
+        yyparse(self, :yield_next_token)
+
+        return
+      ensure
+        reset
+      end
+
+      # eval is a heck of a lot faster than define_method on both Rubinius and
+      # JRuby. A plain `def` also allows the argument-less `super` used below,
+      # which define_method does not support.
+      DISABLED_CALLBACKS.each do |method|
+        eval <<-EOF, nil, __FILE__, __LINE__ + 1
+        def #{method}(*_)
+          return
+        end
+        EOF
+      end
+
+      BLOCK_CALLBACKS.each do |method|
+        eval <<-EOF, nil, __FILE__, __LINE__ + 1
+        def #{method}(*args)
+          @block.call(super)
+        end
+        EOF
+      end
+    end # PullParser
+  end # XML
+end # Oga
diff --git a/spec/oga/xml/pull_parser/doctype_spec.rb b/spec/oga/xml/pull_parser/doctype_spec.rb
new file mode 100644
index 0000000..43bb99b
--- /dev/null
+++ b/spec/oga/xml/pull_parser/doctype_spec.rb
@@ -0,0 +1,19 @@
+require 'spec_helper'
+
+describe Oga::XML::PullParser do
+  context 'doctypes' do
+    before :all do
+      @parser = described_class.new('<!DOCTYPE html>')
+    end
+
+    example 'ignore doctypes' do
+      amount = 0
+
+      @parser.parse do
+        amount += 1
+      end
+
+      amount.should == 0
+    end
+  end
+end
diff --git a/spec/oga/xml/pull_parser/element_spec.rb b/spec/oga/xml/pull_parser/element_spec.rb
new file mode 100644
index 0000000..4b3b0c7
--- /dev/null
+++ b/spec/oga/xml/pull_parser/element_spec.rb
@@ -0,0 +1,29 @@
+require 'spec_helper'
+
+describe Oga::XML::PullParser do
+  context 'elements' do
+    before :all do
+      @parser = described_class.new('<person>Alice</person>')
+    end
+
+    example 'parse an element' do
+      name = nil
+
+      @parser.parse do |node|
+        name = node.name if node.is_a?(Oga::XML::Element)
+      end
+
+      name.should == 'person'
+    end
+
+    example 'parse the text of an element' do
+      text = nil
+
+      @parser.parse do |node|
+        text = node.text if node.is_a?(Oga::XML::Text)
+      end
+
+      text.should == 'Alice'
+    end
+  end
+end