From 5bfc2d50f2a3d387cb9fc28826d1f3d5a2d9d224 Mon Sep 17 00:00:00 2001
From: Yorick Peterse
Date: Tue, 9 Feb 2016 19:51:53 +0100
Subject: [PATCH] Preserve entities that can't be decoded

Certain entities when decoded will produce a String with an invalid
encoding. This commit ensures that instead of raising an EncodingError
further down the line (e.g. when calling "inspect" on a document) the
entities are preserved as-is.

Fixes #143
---
 lib/oga/xml/entities.rb       | 19 +++++++++++++++----
 spec/oga/xml/entities_spec.rb |  8 ++++++++
 2 files changed, 23 insertions(+), 4 deletions(-)

diff --git a/lib/oga/xml/entities.rb b/lib/oga/xml/entities.rb
index 6bff83e..0b53260 100644
--- a/lib/oga/xml/entities.rb
+++ b/lib/oga/xml/entities.rb
@@ -74,14 +74,14 @@ module Oga
         input = input.gsub(REGULAR_ENTITY, mapping)
 
         if input.include?(AMPERSAND)
-          input = input.gsub(NUMERIC_CODE_POINT_ENTITY) do
-            [Integer($1, 10)].pack('U*')
+          input = input.gsub(NUMERIC_CODE_POINT_ENTITY) do |found|
+            pack_string($1, 10) || found
           end
         end
 
         if input.include?(AMPERSAND)
-          input = input.gsub(HEX_CODE_POINT_ENTITY) do
-            [Integer($1, 16)].pack('U*')
+          input = input.gsub(HEX_CODE_POINT_ENTITY) do |found|
+            pack_string($1, 16) || found
           end
         end
 
@@ -104,6 +104,17 @@ module Oga
       def self.encode_attribute(input)
         input.gsub(ENCODE_ATTRIBUTE_REGEXP, ENCODE_ATTRIBUTE_MAPPING)
       end
+
+      private
+
+      # @param [String] input
+      # @param [Fixnum] base
+      # @return [String]
+      def self.pack_string(input, base)
+        packed = [Integer(input, base)].pack('U*')
+
+        packed.valid_encoding? ? packed : nil
+      end
     end # Entities
   end # XML
 end # Oga
diff --git a/spec/oga/xml/entities_spec.rb b/spec/oga/xml/entities_spec.rb
index 3a31162..f015243 100644
--- a/spec/oga/xml/entities_spec.rb
+++ b/spec/oga/xml/entities_spec.rb
@@ -85,6 +85,14 @@ describe Oga::XML::Entities do
     it 'preserves entity-like letters in non-hex mode' do
       described_class.decode('&#123A;').should == '&#123A;'
     end
+
+    it "preserves numeric entities when they can't be decoded" do
+      described_class.decode('&#55296;').should == '&#55296;'
+    end
+
+    it "preserves hex entities when they can't be decoded" do
+      described_class.decode('&#xD800;').should == '&#xD800;'
+    end
   end
 
   describe 'encode' do