Skip to content

Commit 5474ab2

Browse files
authored
Use more StringScanner based API for parse_id (#299)
## Why? Improve maintainability by optimizing the process so that the parsing process proceeds using StringScanner.
1 parent 39e7e28 commit 5474ab2

File tree

3 files changed

+87
-83
lines changed

3 files changed

+87
-83
lines changed

lib/rexml/parsers/baseparser.rb

Lines changed: 33 additions & 59 deletions
Original file line numberDiff line numberDiff line change
@@ -158,6 +158,9 @@ module Private
158158
DEFAULT_ENTITIES_PATTERNS[term] = /&#{term};/
159159
end
160160
XML_PREFIXED_NAMESPACE = "http://www.w3.org/XML/1998/namespace"
161+
EXTERNAL_ID_PUBLIC_PATTERN = /\s+#{PUBIDLITERAL}\s+#{SYSTEMLITERAL}/um
162+
EXTERNAL_ID_SYSTEM_PATTERN = /\s+#{SYSTEMLITERAL}/um
163+
PUBLIC_ID_PATTERN = /\s+#{PUBIDLITERAL}/um
161164
end
162165
private_constant :Private
163166

@@ -307,7 +310,6 @@ def pull_event
307310
@source.ensure_buffer
308311
else
309312
id = parse_id(base_error_message,
310-
accept_external_id: true,
311313
accept_public_id: false)
312314
if id[0] == "SYSTEM"
313315
# For backward compatibility
@@ -409,7 +411,6 @@ def pull_event
409411
end
410412
name = parse_name(base_error_message)
411413
id = parse_id(base_error_message,
412-
accept_external_id: true,
413414
accept_public_id: true)
414415
@source.skip_spaces
415416
unless @source.match?(">", true)
@@ -667,68 +668,41 @@ def parse_name(base_error_message)
667668
end
668669

669670
def parse_id(base_error_message,
670-
accept_external_id:,
671671
accept_public_id:)
672-
if accept_external_id and (md = @source.match(EXTERNAL_ID_PUBLIC, true))
673-
pubid = system = nil
674-
pubid_literal = md[1]
675-
pubid = pubid_literal[1..-2] if pubid_literal # Remove quote
676-
system_literal = md[2]
677-
system = system_literal[1..-2] if system_literal # Remove quote
678-
["PUBLIC", pubid, system]
679-
elsif accept_public_id and (md = @source.match(PUBLIC_ID, true))
680-
pubid = system = nil
681-
pubid_literal = md[1]
682-
pubid = pubid_literal[1..-2] if pubid_literal # Remove quote
683-
["PUBLIC", pubid, nil]
684-
elsif accept_external_id and (md = @source.match(EXTERNAL_ID_SYSTEM, true))
685-
system = nil
686-
system_literal = md[1]
687-
system = system_literal[1..-2] if system_literal # Remove quote
688-
["SYSTEM", nil, system]
689-
else
690-
details = parse_id_invalid_details(accept_external_id: accept_external_id,
691-
accept_public_id: accept_public_id)
692-
message = "#{base_error_message}: #{details}"
693-
raise REXML::ParseException.new(message, @source)
694-
end
695-
end
696-
697-
def parse_id_invalid_details(accept_external_id:,
698-
accept_public_id:)
699-
public = /\A\s*PUBLIC/um
700-
system = /\A\s*SYSTEM/um
701-
if (accept_external_id or accept_public_id) and @source.match?(/#{public}/um)
702-
if @source.match?(/#{public}(?:\s+[^'"]|\s*[\[>])/um)
703-
return "public ID literal is missing"
704-
end
705-
unless @source.match?(/#{public}\s+#{PUBIDLITERAL}/um)
706-
return "invalid public ID literal"
707-
end
708-
if accept_public_id
709-
if @source.match?(/#{public}\s+#{PUBIDLITERAL}\s+[^'"]/um)
710-
return "system ID literal is missing"
711-
end
712-
unless @source.match?(/#{public}\s+#{PUBIDLITERAL}\s+#{SYSTEMLITERAL}/um)
713-
return "invalid system literal"
714-
end
715-
"garbage after system literal"
672+
@source.skip_spaces
673+
if @source.match?("PUBLIC", true)
674+
if (md = @source.match(Private::EXTERNAL_ID_PUBLIC_PATTERN, true))
675+
pubid = system = nil
676+
pubid_literal = md[1]
677+
pubid = pubid_literal[1..-2] if pubid_literal # Remove quote
678+
system_literal = md[2]
679+
system = system_literal[1..-2] if system_literal # Remove quote
680+
["PUBLIC", pubid, system]
681+
elsif accept_public_id and (md = @source.match(Private::PUBLIC_ID_PATTERN, true))
682+
pubid = system = nil
683+
pubid_literal = md[1]
684+
pubid = pubid_literal[1..-2] if pubid_literal # Remove quote
685+
["PUBLIC", pubid, nil]
686+
elsif @source.match?(/(?:\s+[^'"]|\s*[\[>])/um)
687+
raise REXML::ParseException.new("#{base_error_message}: public ID literal is missing", @source)
688+
elsif !@source.match?(Private::PUBLIC_ID_PATTERN)
689+
raise REXML::ParseException.new("#{base_error_message}: invalid public ID literal", @source)
716690
else
717-
"garbage after public ID literal"
691+
raise REXML::ParseException.new("#{base_error_message}: garbage after public ID literal", @source)
718692
end
719-
elsif accept_external_id and @source.match?(/#{system}/um)
720-
if @source.match?(/#{system}(?:\s+[^'"]|\s*[\[>])/um)
721-
return "system literal is missing"
722-
end
723-
unless @source.match?(/#{system}\s+#{SYSTEMLITERAL}/um)
724-
return "invalid system literal"
693+
elsif @source.match?("SYSTEM", true)
694+
if (md = @source.match(Private::EXTERNAL_ID_SYSTEM_PATTERN, true))
695+
system = nil
696+
system_literal = md[1]
697+
system = system_literal[1..-2] if system_literal # Remove quote
698+
["SYSTEM", nil, system]
699+
elsif @source.match?(/(?:\s+[^'"]|\s*[\[>])/um)
700+
raise REXML::ParseException.new("#{base_error_message}: system literal is missing", @source)
701+
else
702+
raise REXML::ParseException.new("#{base_error_message}: invalid system literal", @source)
725703
end
726-
"garbage after system literal"
727704
else
728-
unless @source.match?(/\A\s*(?:PUBLIC|SYSTEM)\s/um)
729-
return "invalid ID type"
730-
end
731-
"ID type is missing"
705+
raise REXML::ParseException.new("#{base_error_message}: invalid ID type", @source)
732706
end
733707
end
734708

test/parse/test_document_type_declaration.rb

Lines changed: 35 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -153,7 +153,22 @@ def test_no_literal
153153
Line: 3
154154
Position: 26
155155
Last 80 unconsumed characters:
156-
SYSTEM> <r/>
156+
> <r/>
157+
DETAIL
158+
end
159+
160+
def test_garbage_invalid_system_literal
161+
exception = assert_raise(REXML::ParseException) do
162+
parse(<<-DOCTYPE)
163+
<!DOCTYPE r SYSTEM '
164+
DOCTYPE
165+
end
166+
assert_equal(<<-DETAIL.chomp, exception.to_s)
167+
Malformed DOCTYPE: invalid system literal
168+
Line: 3
169+
Position: 27
170+
Last 80 unconsumed characters:
171+
' <r/>
157172
DETAIL
158173
end
159174

@@ -165,10 +180,10 @@ def test_garbage_after_literal
165180
end
166181
assert_equal(<<-DETAIL.chomp, exception.to_s)
167182
Malformed DOCTYPE: garbage after external ID
168-
Line: 3
169-
Position: 36
183+
Line: 1
184+
Position: 29
170185
Last 80 unconsumed characters:
171-
x'> <r/>
186+
x'>
172187
DETAIL
173188
end
174189

@@ -200,7 +215,7 @@ def test_content_double_quote
200215
Line: 3
201216
Position: 62
202217
Last 80 unconsumed characters:
203-
PUBLIC 'double quote " is invalid' "r.dtd"> <r/>
218+
'double quote " is invalid' "r.dtd"> <r/>
204219
DETAIL
205220
end
206221

@@ -220,6 +235,21 @@ def test_double_quote
220235
end
221236

222237
class TestSystemLiteral < self
238+
def test_garbage_after_public_ID_literal
239+
exception = assert_raise(REXML::ParseException) do
240+
parse(<<-DOCTYPE)
241+
<!DOCTYPE r PUBLIC "public-id-literal" 'system>
242+
DOCTYPE
243+
end
244+
assert_equal(<<-DETAIL.chomp, exception.to_s)
245+
Malformed DOCTYPE: garbage after public ID literal
246+
Line: 3
247+
Position: 54
248+
Last 80 unconsumed characters:
249+
"public-id-literal" 'system> <r/>
250+
DETAIL
251+
end
252+
223253
def test_garbage_after_literal
224254
exception = assert_raise(REXML::ParseException) do
225255
parse(<<-DOCTYPE)

test/parse/test_notation_declaration.rb

Lines changed: 19 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -32,10 +32,10 @@ def test_no_name
3232
end
3333
assert_equal(<<-DETAIL.chomp, exception.to_s)
3434
Malformed notation declaration: name is missing
35-
Line: 5
36-
Position: 72
35+
Line: 2
36+
Position: 62
3737
Last 80 unconsumed characters:
38-
<!NOTATION> ]> <r/>
38+
<!NOTATION>
3939
DETAIL
4040
end
4141

@@ -62,10 +62,10 @@ def test_no_id_type
6262
end
6363
assert_equal(<<-DETAIL.chomp, exception.to_s)
6464
Malformed notation declaration: invalid ID type
65-
Line: 5
66-
Position: 77
65+
Line: 2
66+
Position: 67
6767
Last 80 unconsumed characters:
68-
> ]> <r/>
68+
>
6969
DETAIL
7070
end
7171

@@ -77,10 +77,10 @@ def test_invalid_id_type
7777
end
7878
assert_equal(<<-DETAIL.chomp, exception.to_s)
7979
Malformed notation declaration: invalid ID type
80-
Line: 5
81-
Position: 85
80+
Line: 2
81+
Position: 75
8282
Last 80 unconsumed characters:
83-
INVALID> ]> <r/>
83+
INVALID>
8484
DETAIL
8585
end
8686
end
@@ -98,7 +98,7 @@ def test_no_literal
9898
Line: 5
9999
Position: 84
100100
Last 80 unconsumed characters:
101-
SYSTEM> ]> <r/>
101+
> ]> <r/>
102102
DETAIL
103103
end
104104

@@ -110,10 +110,10 @@ def test_garbage_after_literal
110110
end
111111
assert_equal(<<-DETAIL.chomp, exception.to_s)
112112
Malformed notation declaration: garbage before end >
113-
Line: 5
114-
Position: 103
113+
Line: 2
114+
Position: 93
115115
Last 80 unconsumed characters:
116-
x'> ]> <r/>
116+
x'>
117117
DETAIL
118118
end
119119

@@ -145,7 +145,7 @@ def test_content_double_quote
145145
Line: 5
146146
Position: 129
147147
Last 80 unconsumed characters:
148-
PUBLIC 'double quote " is invalid' "system-literal"> ]> <r/>
148+
'double quote " is invalid' "system-literal"> ]> <r/>
149149
DETAIL
150150
end
151151

@@ -173,10 +173,10 @@ def test_garbage_after_literal
173173
end
174174
assert_equal(<<-DETAIL.chomp, exception.to_s)
175175
Malformed notation declaration: garbage before end >
176-
Line: 5
177-
Position: 123
176+
Line: 2
177+
Position: 113
178178
Last 80 unconsumed characters:
179-
x'> ]> <r/>
179+
x'>
180180
DETAIL
181181
end
182182

@@ -229,7 +229,7 @@ def test_no_literal
229229
Line: 5
230230
Position: 84
231231
Last 80 unconsumed characters:
232-
PUBLIC> ]> <r/>
232+
> ]> <r/>
233233
DETAIL
234234
end
235235

@@ -244,7 +244,7 @@ def test_literal_content_double_quote
244244
Line: 5
245245
Position: 128
246246
Last 80 unconsumed characters:
247-
PUBLIC 'double quote \" is invalid in PubidLiteral'> ]> <r/>
247+
'double quote \" is invalid in PubidLiteral'> ]> <r/>
248248
DETAIL
249249
end
250250

0 commit comments

Comments
 (0)