Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
77 changes: 77 additions & 0 deletions benchmark/parse_doctype.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,77 @@
# Benchmark definition for DOCTYPE entity expansion in REXML.
# NOTE(review): the key layout (loop_count / contexts / prelude / benchmark)
# looks like benchmark-driver's YAML format -- confirm against the runner.
loop_count: 100
# One entry per REXML build being measured: the released 3.2.6 gem and the
# local checkout (loaded from ./lib), each without and with YJIT enabled.
contexts:
- gems:
rexml: 3.2.6
require: false
prelude: require 'rexml'
- name: master
prelude: |
$LOAD_PATH.unshift(File.expand_path("lib"))
require 'rexml'
- name: 3.2.6(YJIT)
gems:
rexml: 3.2.6
require: false
prelude: |
require 'rexml'
RubyVM::YJIT.enable
- name: master(YJIT)
prelude: |
$LOAD_PATH.unshift(File.expand_path("lib"))
require 'rexml'
RubyVM::YJIT.enable

# Shared setup: builds the four XML fixtures and the Listener class that the
# benchmark entries below refer to.
prelude: |
require 'rexml/document'
require 'rexml/parsers/sax2parser'
require 'rexml/parsers/streamparser'
require 'rexml/streamlistener'

# Single entity reference: <!ENTITY foo "bar"> -> &foo;
single_entity_xml = <<~XML
<!DOCTYPE root [
<!ENTITY word "hello">
]>
<root>&word;</root>
XML

# Chained entity references: a -> b -> c -> ... -> value (n_depth levels deep)
n_depth = 100
chained_entities_xml = <<~XML
<!DOCTYPE root [
#{(1..n_depth).map {|i| i == n_depth ? "<!ENTITY e#{i} \"value\">" : "<!ENTITY e#{i} \"&e#{i+1};\">" }.join("\n")}
]>
<root>&e1;</root>
XML

# Many distinct entities referenced in document content
n_entities = 100
many_entities_xml = <<~XML
<!DOCTYPE root [
#{(1..n_entities).map {|i| "<!ENTITY ent#{i} \"value#{i}\">" }.join("\n")}
]>
<root>#{(1..n_entities).map {|i| "&ent#{i};" }.join}</root>
XML

# Same entity referenced repeatedly, once in an attribute value and once in
# the element text of each of the n_items <item> elements
n_items = 100
repeated_entity_xml = <<~XML
<!DOCTYPE root [
<!ENTITY word "hello">
]>
<root>#{'<item val="&word;">&word;</item>' * n_items}</root>
XML

# Listener that only mixes in REXML::StreamListener and overrides nothing.
class Listener
include REXML::StreamListener
end

# Every fixture is measured twice: "(dom)" builds a REXML::Document and reads
# the root element's text, "(stream)" runs the StreamParser with a Listener.
benchmark:
'single_entity(dom)' : REXML::Document.new(single_entity_xml).root.text
'single_entity(stream)' : REXML::Parsers::StreamParser.new(single_entity_xml, Listener.new).parse
'chained_entities(dom)' : REXML::Document.new(chained_entities_xml).root.text
'chained_entities(stream)': REXML::Parsers::StreamParser.new(chained_entities_xml, Listener.new).parse
'many_entities(dom)' : REXML::Document.new(many_entities_xml).root.text
'many_entities(stream)' : REXML::Parsers::StreamParser.new(many_entities_xml, Listener.new).parse
'repeated_entity(dom)' : REXML::Document.new(repeated_entity_xml).root.text
'repeated_entity(stream)' : REXML::Parsers::StreamParser.new(repeated_entity_xml, Listener.new).parse
4 changes: 2 additions & 2 deletions lib/rexml/doctype.rb
Original file line number Diff line number Diff line change
Expand Up @@ -174,8 +174,8 @@ def context
@parent&.context
end

# Looks up +name+ in @entities and returns that entity's unnormalized value;
# the safe-navigation call yields nil when the lookup itself returns nil.
# NOTE(review): this span is a rendered diff hunk -- the first signature/body
# pair appears to be the replaced version, the second the new one. The new
# version forwards +expanding+ unchanged to the entity's #unnormalized.
def entity( name )
@entities[name]&.unnormalized
def entity( name, expanding: nil )
@entities[name]&.unnormalized(expanding: expanding)
end

def add child
Expand Down
14 changes: 11 additions & 3 deletions lib/rexml/entity.rb
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
# frozen_string_literal: false
require 'set'
require_relative 'child'
require_relative 'source'
require_relative 'xmltokens'
Expand Down Expand Up @@ -70,13 +71,20 @@ def Entity::matches? string

# Evaluates to the unnormalized value of this entity; that is, replacing
# &ent; entities.
#
# expanding:: Set of entity names whose expansion is currently in progress;
#             a fresh Set is created when nil is given.
#
# Records one entity expansion on the owning document (when there is one),
# returns nil when @value is nil, and raises a RuntimeError with the message
# "Detected an entity reference loop: <name>" when this entity's name is
# already present in +expanding+.
# NOTE(review): this span is a rendered diff hunk -- the bare
# +def unnormalized+ line and the +@unnormalized = ...+ statement appear to
# be the replaced lines; confirm against the actual file.
def unnormalized
def unnormalized(expanding: nil)
document&.record_entity_expansion

return nil if @value.nil?

@unnormalized = Text::unnormalize(@value, parent,
entity_expansion_text_limit: document&.entity_expansion_text_limit)
expanding ||= Set.new
raise "Detected an entity reference loop: #{@name}" if expanding.include?(@name)

# Mark this entity as in progress for the duration of the nested expansion,
# so a reference back to it from inside its own value trips the check above.
expanding.add(@name)
result = Text::unnormalize(@value, parent,
entity_expansion_text_limit: document&.entity_expansion_text_limit,
expanding: expanding)
# Expansion finished: remove the mark so later, non-nested references to
# the same entity are not reported as loops.
expanding.delete(@name)
result
end

#once :unnormalized
Expand Down
14 changes: 10 additions & 4 deletions lib/rexml/parsers/baseparser.rb
Original file line number Diff line number Diff line change
Expand Up @@ -535,14 +535,20 @@ def pull_event
end
private :pull_event

# Resolves the entity named +reference+ from the +entities+ table and returns
# its unnormalized value.
#
# Returns nil when +entities+ is nil/false or has no value for +reference+.
# +expanding+ is the Set of entity names currently being expanded (created
# here when nil); a RuntimeError with the message
# "Detected an entity reference loop: <reference>" is raised when
# +reference+ is already in it.
# NOTE(review): this span is a rendered diff hunk -- the two-argument +def+
# line and the two-argument +unnormalize+ call appear to be the replaced
# lines; confirm against the actual file.
def entity( reference, entities )
def entity( reference, entities, expanding = nil )
return unless entities

value = entities[ reference ]
return if value.nil?

expanding ||= Set.new
raise "Detected an entity reference loop: #{reference}" if expanding.include?(reference)

record_entity_expansion
unnormalize( value, entities )
# Keep +reference+ marked only while its own value is being expanded, so
# that nested references back to it are caught by the check above.
expanding.add(reference)
result = unnormalize( value, entities, nil, expanding )
expanding.delete(reference)
result
end

# Escapes all possible entities
Expand All @@ -562,7 +568,7 @@ def normalize( input, entities=nil, entity_filter=nil )
end

# Unescapes all possible entities
def unnormalize( string, entities=nil, filter=nil )
def unnormalize( string, entities=nil, filter=nil, expanding=nil )
if string.include?("\r")
rv = string.gsub( Private::CARRIAGE_RETURN_NEWLINE_PATTERN, "\n" )
else
Expand All @@ -588,7 +594,7 @@ def unnormalize( string, entities=nil, filter=nil )
if matches.size > 0
matches.tally.each do |entity_reference, n|
entity_expansion_count_before = @entity_expansion_count
entity_value = entity( entity_reference, entities )
entity_value = entity( entity_reference, entities, expanding )
if entity_value
if n > 1
entity_expansion_count_delta =
Expand Down
8 changes: 4 additions & 4 deletions lib/rexml/text.rb
Original file line number Diff line number Diff line change
Expand Up @@ -384,11 +384,11 @@ def Text::normalize( input, doctype=nil, entity_filter=nil )
end

# Unescapes all possible entities
def Text::unnormalize( string, doctype=nil, filter=nil, illegal=nil, entity_expansion_text_limit: nil )
def Text::unnormalize( string, doctype=nil, filter=nil, illegal=nil, entity_expansion_text_limit: nil, expanding: nil )
entity_expansion_text_limit ||= Security.entity_expansion_text_limit
sum = 0
string.gsub( /\r\n?/, "\n" ).gsub( REFERENCE ) {
s = Text.expand($&, doctype, filter)
s = Text.expand($&, doctype, filter, expanding: expanding)
if sum + s.bytesize > entity_expansion_text_limit
raise "entity expansion has grown too large"
else
Expand All @@ -398,7 +398,7 @@ def Text::unnormalize( string, doctype=nil, filter=nil, illegal=nil, entity_expa
}
end

def Text.expand(ref, doctype, filter)
def Text.expand(ref, doctype, filter, expanding: nil)
if ref[1] == ?#
if ref[2] == ?x
[ref[3...-1].to_i(16)].pack('U*')
Expand All @@ -410,7 +410,7 @@ def Text.expand(ref, doctype, filter)
elsif filter and filter.include?( ref[1...-1] )
ref
elsif doctype
doctype.entity( ref[1...-1] ) or ref
doctype.entity( ref[1...-1], expanding: expanding ) or ref
else
entity_value = DocType::DEFAULT_ENTITIES[ ref[1...-1] ]
entity_value ? entity_value.value : ref
Expand Down
93 changes: 93 additions & 0 deletions test/test_entity.rb
Original file line number Diff line number Diff line change
@@ -1,7 +1,10 @@
# frozen_string_literal: false

require 'rexml/document'
require 'rexml/entity'
require 'rexml/source'
require 'rexml/parsers/streamparser'
require 'rexml/streamlistener'

module REXMLTests
class EntityTester < Test::Unit::TestCase
Expand Down Expand Up @@ -241,6 +244,96 @@ def test_single_pass_unnormalization # ticket 123
assert_equal '&amp;&', REXML::Text::unnormalize('&#38;amp;&amp;')
end

# Stream parsing: an entity whose value refers to itself (x -> x) must raise
# a RuntimeError whose message names the looping entity.
def test_entity_direct_circular_reference
  xml = <<~XML
    <!DOCTYPE root [
    <!ENTITY x "&x;">
    ]>
    <root>&x;</root>
  XML
  listener_class = Class.new { include REXML::StreamListener }
  stream_parser = REXML::Parsers::StreamParser.new(xml, listener_class.new)
  raised = assert_raise(RuntimeError) { stream_parser.parse }
  assert_match(/Detected an entity reference loop: x/, raised.message)
end

# Stream parsing: two entities that refer to each other (a -> b -> a) must
# raise a RuntimeError reporting an entity reference loop.
def test_entity_indirect_circular_reference
  xml = <<~XML
    <!DOCTYPE root [
    <!ENTITY a "&b;">
    <!ENTITY b "&a;">
    ]>
    <root>&a;</root>
  XML
  listener_class = Class.new { include REXML::StreamListener }
  stream_parser = REXML::Parsers::StreamParser.new(xml, listener_class.new)
  raised = assert_raise(RuntimeError) { stream_parser.parse }
  assert_match(/Detected an entity reference loop/, raised.message)
end

# Stream parsing: the self-reference sits behind 100 "&amp;" references in
# the entity value; the expected failure is still the loop error naming x.
def test_entity_circular_reference_with_long_value
  xml = <<~XML
    <!DOCTYPE root [
    <!ENTITY x "#{"&amp;" * 100}&x;">
    ]>
    <root>&x;</root>
  XML
  listener_class = Class.new { include REXML::StreamListener }
  stream_parser = REXML::Parsers::StreamParser.new(xml, listener_class.new)
  raised = assert_raise(RuntimeError) { stream_parser.parse }
  assert_match(/Detected an entity reference loop: x/, raised.message)
end

# DOM API: the document is built first, and reading the root element's text
# is what must raise the loop error for the self-referencing entity x.
def test_entity_direct_circular_reference_via_dom
  xml = <<~XML
    <!DOCTYPE root [
    <!ENTITY x "&x;">
    ]>
    <root>&x;</root>
  XML
  document = REXML::Document.new(xml)
  raised = assert_raise(RuntimeError) { document.root.text }
  assert_match(/Detected an entity reference loop: x/, raised.message)
end

# DOM API: with mutually referencing entities (a -> b -> a), reading the root
# element's text must raise a RuntimeError reporting an entity reference loop.
def test_entity_indirect_circular_reference_via_dom
  xml = <<~XML
    <!DOCTYPE root [
    <!ENTITY a "&b;">
    <!ENTITY b "&a;">
    ]>
    <root>&a;</root>
  XML
  document = REXML::Document.new(xml)
  raised = assert_raise(RuntimeError) { document.root.text }
  assert_match(/Detected an entity reference loop/, raised.message)
end

# Entity API: the document body never references x, so the loop is only hit
# by calling #unnormalized directly on the entity taken from the doctype.
def test_entity_unnormalized_direct_circular_reference
  xml = <<~XML
    <!DOCTYPE root [
    <!ENTITY x "&x;">
    ]>
    <root/>
  XML
  x_entity = REXML::Document.new(xml).doctype.entities["x"]
  raised = assert_raise(RuntimeError) { x_entity.unnormalized }
  assert_match(/Detected an entity reference loop: x/, raised.message)
end

def test_entity_filter
document = REXML::Document.new(<<-XML)
<!DOCTYPE root [
Expand Down
Loading