Skip to content
92 changes: 33 additions & 59 deletions lib/rexml/parsers/baseparser.rb
Original file line number Diff line number Diff line change
Expand Up @@ -158,6 +158,9 @@ module Private
DEFAULT_ENTITIES_PATTERNS[term] = /&#{term};/
end
XML_PREFIXED_NAMESPACE = "http://www.w3.org/XML/1998/namespace"
EXTERNAL_ID_PUBLIC_PATTERN = /\s+#{PUBIDLITERAL}\s+#{SYSTEMLITERAL}/um
EXTERNAL_ID_SYSTEM_PATTERN = /\s+#{SYSTEMLITERAL}/um
PUBLIC_ID_PATTERN = /\s+#{PUBIDLITERAL}/um
end
private_constant :Private

Expand Down Expand Up @@ -307,7 +310,6 @@ def pull_event
@source.ensure_buffer
else
id = parse_id(base_error_message,
accept_external_id: true,
accept_public_id: false)
if id[0] == "SYSTEM"
# For backward compatibility
Expand Down Expand Up @@ -409,7 +411,6 @@ def pull_event
end
name = parse_name(base_error_message)
id = parse_id(base_error_message,
accept_external_id: true,
accept_public_id: true)
@source.skip_spaces
unless @source.match?(">", true)
Expand Down Expand Up @@ -667,68 +668,41 @@ def parse_name(base_error_message)
end

def parse_id(base_error_message,
accept_external_id:,
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We can remove this keyword argument because it's always true, right?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes.

id = parse_id(base_error_message,
accept_external_id: true,

id = parse_id(base_error_message,
accept_external_id: true,

accept_public_id:)
if accept_external_id and (md = @source.match(EXTERNAL_ID_PUBLIC, true))
pubid = system = nil
pubid_literal = md[1]
pubid = pubid_literal[1..-2] if pubid_literal # Remove quote
system_literal = md[2]
system = system_literal[1..-2] if system_literal # Remove quote
["PUBLIC", pubid, system]
elsif accept_public_id and (md = @source.match(PUBLIC_ID, true))
pubid = system = nil
pubid_literal = md[1]
pubid = pubid_literal[1..-2] if pubid_literal # Remove quote
["PUBLIC", pubid, nil]
elsif accept_external_id and (md = @source.match(EXTERNAL_ID_SYSTEM, true))
system = nil
system_literal = md[1]
system = system_literal[1..-2] if system_literal # Remove quote
["SYSTEM", nil, system]
else
details = parse_id_invalid_details(accept_external_id: accept_external_id,
accept_public_id: accept_public_id)
message = "#{base_error_message}: #{details}"
raise REXML::ParseException.new(message, @source)
end
end

def parse_id_invalid_details(accept_external_id:,
accept_public_id:)
public = /\A\s*PUBLIC/um
system = /\A\s*SYSTEM/um
if (accept_external_id or accept_public_id) and @source.match?(/#{public}/um)
if @source.match?(/#{public}(?:\s+[^'"]|\s*[\[>])/um)
return "public ID literal is missing"
end
unless @source.match?(/#{public}\s+#{PUBIDLITERAL}/um)
return "invalid public ID literal"
end
if accept_public_id
if @source.match?(/#{public}\s+#{PUBIDLITERAL}\s+[^'"]/um)
return "system ID literal is missing"
end
unless @source.match?(/#{public}\s+#{PUBIDLITERAL}\s+#{SYSTEMLITERAL}/um)
return "invalid system literal"
end
"garbage after system literal"
@source.skip_spaces
if @source.match?("PUBLIC", true)
if (md = @source.match(Private::EXTERNAL_ID_PUBLIC_PATTERN, true))
pubid = system = nil
pubid_literal = md[1]
pubid = pubid_literal[1..-2] if pubid_literal # Remove quote
system_literal = md[2]
system = system_literal[1..-2] if system_literal # Remove quote
["PUBLIC", pubid, system]
elsif accept_public_id and (md = @source.match(Private::PUBLIC_ID_PATTERN, true))
pubid = system = nil
pubid_literal = md[1]
pubid = pubid_literal[1..-2] if pubid_literal # Remove quote
["PUBLIC", pubid, nil]
elsif @source.match?(/(?:\s+[^'"]|\s*[\[>])/um)
raise REXML::ParseException.new("#{base_error_message}: public ID literal is missing", @source)
elsif !@source.match?(Private::PUBLIC_ID_PATTERN)
raise REXML::ParseException.new("#{base_error_message}: invalid public ID literal", @source)
else
"garbage after public ID literal"
raise REXML::ParseException.new("#{base_error_message}: garbage after public ID literal", @source)
end
elsif accept_external_id and @source.match?(/#{system}/um)
if @source.match?(/#{system}(?:\s+[^'"]|\s*[\[>])/um)
return "system literal is missing"
end
unless @source.match?(/#{system}\s+#{SYSTEMLITERAL}/um)
return "invalid system literal"
elsif @source.match?("SYSTEM", true)
if (md = @source.match(Private::EXTERNAL_ID_SYSTEM_PATTERN, true))
system = nil
system_literal = md[1]
system = system_literal[1..-2] if system_literal # Remove quote
["SYSTEM", nil, system]
elsif @source.match?(/(?:\s+[^'"]|\s*[\[>])/um)
raise REXML::ParseException.new("#{base_error_message}: system literal is missing", @source)
else
raise REXML::ParseException.new("#{base_error_message}: invalid system literal", @source)
end
"garbage after system literal"
else
unless @source.match?(/\A\s*(?:PUBLIC|SYSTEM)\s/um)
return "invalid ID type"
end
"ID type is missing"
raise REXML::ParseException.new("#{base_error_message}: invalid ID type", @source)
end
end

Expand Down
40 changes: 35 additions & 5 deletions test/parse/test_document_type_declaration.rb
Original file line number Diff line number Diff line change
Expand Up @@ -153,7 +153,22 @@ def test_no_literal
Line: 3
Position: 26
Last 80 unconsumed characters:
SYSTEM> <r/>
> <r/>
DETAIL
end

def test_garbage_invalid_system_literal
exception = assert_raise(REXML::ParseException) do
parse(<<-DOCTYPE)
<!DOCTYPE r SYSTEM '
DOCTYPE
end
assert_equal(<<-DETAIL.chomp, exception.to_s)
Malformed DOCTYPE: invalid system literal
Line: 3
Position: 27
Last 80 unconsumed characters:
' <r/>
DETAIL
end

Expand All @@ -165,10 +180,10 @@ def test_garbage_after_literal
end
assert_equal(<<-DETAIL.chomp, exception.to_s)
Malformed DOCTYPE: garbage after external ID
Line: 3
Position: 36
Line: 1
Position: 29
Last 80 unconsumed characters:
x'> <r/>
x'>
DETAIL
end

Expand Down Expand Up @@ -200,7 +215,7 @@ def test_content_double_quote
Line: 3
Position: 62
Last 80 unconsumed characters:
PUBLIC 'double quote " is invalid' "r.dtd"> <r/>
'double quote " is invalid' "r.dtd"> <r/>
DETAIL
end

Expand All @@ -220,6 +235,21 @@ def test_double_quote
end

class TestSystemLiteral < self
def test_garbage_after_public_ID_literal
exception = assert_raise(REXML::ParseException) do
parse(<<-DOCTYPE)
<!DOCTYPE r PUBLIC "public-id-literal" 'system>
DOCTYPE
end
assert_equal(<<-DETAIL.chomp, exception.to_s)
Malformed DOCTYPE: garbage after public ID literal
Line: 3
Position: 54
Last 80 unconsumed characters:
"public-id-literal" 'system> <r/>
DETAIL
end

def test_garbage_after_literal
exception = assert_raise(REXML::ParseException) do
parse(<<-DOCTYPE)
Expand Down
38 changes: 19 additions & 19 deletions test/parse/test_notation_declaration.rb
Original file line number Diff line number Diff line change
Expand Up @@ -32,10 +32,10 @@ def test_no_name
end
assert_equal(<<-DETAIL.chomp, exception.to_s)
Malformed notation declaration: name is missing
Line: 5
Position: 72
Line: 2
Position: 62
Last 80 unconsumed characters:
<!NOTATION> ]> <r/>
<!NOTATION>
DETAIL
end

Expand All @@ -62,10 +62,10 @@ def test_no_id_type
end
assert_equal(<<-DETAIL.chomp, exception.to_s)
Malformed notation declaration: invalid ID type
Line: 5
Position: 77
Line: 2
Position: 67
Last 80 unconsumed characters:
> ]> <r/>
>
DETAIL
end

Expand All @@ -77,10 +77,10 @@ def test_invalid_id_type
end
assert_equal(<<-DETAIL.chomp, exception.to_s)
Malformed notation declaration: invalid ID type
Line: 5
Position: 85
Line: 2
Position: 75
Last 80 unconsumed characters:
INVALID> ]> <r/>
INVALID>
DETAIL
end
end
Expand All @@ -98,7 +98,7 @@ def test_no_literal
Line: 5
Position: 84
Last 80 unconsumed characters:
SYSTEM> ]> <r/>
> ]> <r/>
DETAIL
end

Expand All @@ -110,10 +110,10 @@ def test_garbage_after_literal
end
assert_equal(<<-DETAIL.chomp, exception.to_s)
Malformed notation declaration: garbage before end >
Line: 5
Position: 103
Line: 2
Position: 93
Last 80 unconsumed characters:
x'> ]> <r/>
x'>
DETAIL
end

Expand Down Expand Up @@ -145,7 +145,7 @@ def test_content_double_quote
Line: 5
Position: 129
Last 80 unconsumed characters:
PUBLIC 'double quote " is invalid' "system-literal"> ]> <r/>
'double quote " is invalid' "system-literal"> ]> <r/>
DETAIL
end

Expand Down Expand Up @@ -173,10 +173,10 @@ def test_garbage_after_literal
end
assert_equal(<<-DETAIL.chomp, exception.to_s)
Malformed notation declaration: garbage before end >
Line: 5
Position: 123
Line: 2
Position: 113
Last 80 unconsumed characters:
x'> ]> <r/>
x'>
DETAIL
end

Expand Down Expand Up @@ -229,7 +229,7 @@ def test_no_literal
Line: 5
Position: 84
Last 80 unconsumed characters:
PUBLIC> ]> <r/>
> ]> <r/>
DETAIL
end

Expand All @@ -244,7 +244,7 @@ def test_literal_content_double_quote
Line: 5
Position: 128
Last 80 unconsumed characters:
PUBLIC 'double quote \" is invalid in PubidLiteral'> ]> <r/>
'double quote \" is invalid in PubidLiteral'> ]> <r/>
DETAIL
end

Expand Down
Loading