Skip to content

Commit c084a66

Browse files
[3.13] gh-79986: Add parsing for References/In-Reply-To email headers (GH-137201) (#142574)
gh-79986: Add parsing for References/In-Reply-To email headers (GH-137201). This is a follow-up to 46d88a1 (GH-13397), which added parsing for Message-ID. Similar handling is needed for the other two identification headers. (cherry picked from commit 79aa43a) Co-authored-by: elenril <anton@khirnov.net>
1 parent 9b7a628 commit c084a66

File tree

5 files changed

+137
-0
lines changed

5 files changed

+137
-0
lines changed

Lib/email/_header_value_parser.py

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -874,6 +874,12 @@ class MessageID(MsgID):
874874
class InvalidMessageID(MessageID):
875875
token_type = 'invalid-message-id'
876876

877+
class MessageIDList(TokenList):
    """Token list produced when parsing a header holding several msg-ids."""

    token_type = 'message-id-list'

    @property
    def message_ids(self):
        """Return the child tokens whose token_type is exactly 'msg-id'.

        Children with any other token_type (for example the whitespace
        terminal inserted in place of a comma, or an 'invalid-message-id'
        token) are left out of the returned list.
        """
        found = []
        for tok in self:
            if tok.token_type == 'msg-id':
                found.append(tok)
        return found
877883

878884
class Header(TokenList):
879885
token_type = 'header'
@@ -2171,6 +2177,32 @@ def parse_message_id(value):
21712177

21722178
return message_id
21732179

2180+
def parse_message_ids(value):
    """Parse the value of an In-Reply-To or References header.

        in-reply-to = "In-Reply-To:" 1*msg-id CRLF
        references  = "References:" 1*msg-id CRLF

    Returns a MessageIDList.  Each successfully parsed msg-id is appended
    as its own token.  A comma between msg-ids is recorded as an
    InvalidHeaderDefect and replaced by a single-space whitespace token.
    If a msg-id cannot be parsed, the rest of the value is wrapped in an
    InvalidMessageID token, a defect is recorded, and parsing stops.
    """
    id_list = MessageIDList()
    remaining = value
    while remaining:
        if remaining[0] == ',':
            # message id list separated with commas - this is invalid,
            # but happens rather frequently in the wild
            comma_defect = errors.InvalidHeaderDefect("comma in msg-id list")
            id_list.defects.append(comma_defect)
            placeholder = WhiteSpaceTerminal(' ', 'invalid-comma-replacement')
            id_list.append(placeholder)
            remaining = remaining[1:]
            continue
        try:
            token, remaining = get_msg_id(remaining)
        except errors.HeaderParseError as ex:
            # Keep everything that is left as one unstructured, invalid
            # token so that no input text is dropped, then give up.
            leftover = get_unstructured(remaining)
            id_list.append(InvalidMessageID(leftover))
            description = "Invalid msg-id: {!r}".format(ex)
            id_list.defects.append(errors.InvalidHeaderDefect(description))
            break
        else:
            id_list.append(token)
    return id_list
2205+
21742206
#
21752207
# XXX: As I begin to add additional header parsers, I'm realizing we probably
21762208
# have two levels of parser routines: the get_XXX methods that get a token in

Lib/email/headerregistry.py

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -534,6 +534,18 @@ def parse(cls, value, kwds):
534534
kwds['defects'].extend(parse_tree.all_defects)
535535

536536

537+
class ReferencesHeader:
    """Header class for values that are lists of msg-ids.

    Registered in the default header map for both 'in-reply-to' and
    'references'.
    """

    # NOTE(review): presumably caps how many times the header may occur in
    # a message -- confirm against the code that consumes max_count.
    max_count = 1
    value_parser = staticmethod(parser.parse_message_ids)

    @classmethod
    def parse(cls, value, kwds):
        """Parse *value* and store the results in the *kwds* dict.

        Sets kwds['parse_tree'] to the parser's result and kwds['decoded']
        to its string form, and extends kwds['defects'] with every defect
        reported by the parse tree.
        """
        tree = cls.value_parser(value)
        kwds['parse_tree'] = tree
        kwds['decoded'] = str(tree)
        kwds['defects'].extend(tree.all_defects)
547+
548+
537549
# The header factory #
538550

539551
_default_header_map = {
@@ -557,6 +569,8 @@ def parse(cls, value, kwds):
557569
'content-disposition': ContentDispositionHeader,
558570
'content-transfer-encoding': ContentTransferEncodingHeader,
559571
'message-id': MessageIDHeader,
572+
'in-reply-to': ReferencesHeader,
573+
'references': ReferencesHeader,
560574
}
561575

562576
class HeaderRegistry:

Lib/test/test_email/test__header_value_parser.py

Lines changed: 75 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2867,6 +2867,81 @@ def test_get_msg_id_ws_only_local(self):
28672867
)
28682868
self.assertEqual(msg_id.token_type, 'msg-id')
28692869

2870+
def test_parse_message_ids_valid(self):
    # Two msg-ids separated by a space: the string form and the value
    # both equal the input, and no defects are expected.
    source = "<foo@bar> <bar@foo>"
    expected_defects = []
    message_ids = self._test_parse_x(
        parser.parse_message_ids,
        source,
        "<foo@bar> <bar@foo>",
        "<foo@bar> <bar@foo>",
        expected_defects,
    )
    self.assertEqual(message_ids.token_type, 'message-id-list')
2879+
2880+
def test_parse_message_ids_empty(self):
    # A whitespace-only value contains no msg-id: the text is preserved
    # and exactly one InvalidHeaderDefect is expected.
    source = " "
    expected_defects = [errors.InvalidHeaderDefect]
    message_ids = self._test_parse_x(
        parser.parse_message_ids,
        source,
        " ",
        " ",
        expected_defects,
    )
    self.assertEqual(message_ids.token_type, 'message-id-list')
2889+
2890+
def test_parse_message_ids_comment(self):
    # A trailing comment is kept in the string form, while the expected
    # value has a single space in its place; no defects are expected.
    source = "<foo@bar> (foo's message from \"bar\")"
    message_ids = self._test_parse_x(
        parser.parse_message_ids,
        source,
        "<foo@bar> (foo's message from \"bar\")",
        "<foo@bar> ",
        [],
    )
    first_id = message_ids.message_ids[0]
    self.assertEqual(first_id.value, '<foo@bar> ')
    self.assertEqual(message_ids.token_type, 'message-id-list')
2900+
2901+
def test_parse_message_ids_no_sep(self):
    # Adjacent msg-ids with nothing between them are split into two
    # separate msg-id tokens, and no defects are expected.
    source = "<foo@bar><bar@foo>"
    message_ids = self._test_parse_x(
        parser.parse_message_ids,
        source,
        "<foo@bar><bar@foo>",
        "<foo@bar><bar@foo>",
        [],
    )
    parsed_ids = message_ids.message_ids
    self.assertEqual(parsed_ids[0].value, '<foo@bar>')
    self.assertEqual(parsed_ids[1].value, '<bar@foo>')
    self.assertEqual(message_ids.token_type, 'message-id-list')
2912+
2913+
def test_parse_message_ids_comma_sep(self):
    # A comma between msg-ids is expected to yield one
    # InvalidHeaderDefect and to appear as a space in the output.
    source = "<foo@bar>,<bar@foo>"
    expected_defects = [errors.InvalidHeaderDefect]
    message_ids = self._test_parse_x(
        parser.parse_message_ids,
        source,
        "<foo@bar> <bar@foo>",
        "<foo@bar> <bar@foo>",
        expected_defects,
    )
    parsed_ids = message_ids.message_ids
    self.assertEqual(parsed_ids[0].value, '<foo@bar>')
    self.assertEqual(parsed_ids[1].value, '<bar@foo>')
    self.assertEqual(message_ids.token_type, 'message-id-list')
2924+
2925+
def test_parse_message_ids_invalid_id(self):
    # Angle brackets around text that is not a msg-id: the text is
    # preserved unchanged and two InvalidHeaderDefects are expected.
    source = "<Date: Wed, 08 Jun 2002 09:78:58 +0600>"
    expected_defects = [errors.InvalidHeaderDefect]*2
    message_ids = self._test_parse_x(
        parser.parse_message_ids,
        source,
        "<Date: Wed, 08 Jun 2002 09:78:58 +0600>",
        "<Date: Wed, 08 Jun 2002 09:78:58 +0600>",
        expected_defects,
    )
    self.assertEqual(message_ids.token_type, 'message-id-list')
2934+
2935+
def test_parse_message_ids_broken_ang(self):
    # A valid msg-id followed by text with a stray '>': the text is
    # preserved unchanged and one InvalidHeaderDefect is expected.
    source = "<foo@bar> >bar@foo"
    expected_defects = [errors.InvalidHeaderDefect]*1
    message_ids = self._test_parse_x(
        parser.parse_message_ids,
        source,
        "<foo@bar> >bar@foo",
        "<foo@bar> >bar@foo",
        expected_defects,
    )
    self.assertEqual(message_ids.token_type, 'message-id-list')
2944+
28702945

28712946

28722947
@parameterize

Lib/test/test_email/test_headerregistry.py

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1813,5 +1813,18 @@ def test_message_id_header_is_not_folded(self):
18131813
h.fold(policy=policy.default.clone(max_line_length=20)),
18141814
'Message-ID:\n <ईमेलfromMessage@wők.com>\n')
18151815

1816+
def test_fold_references(self):
    # Both msg-ids are longer than the 20-character max_line_length used
    # here.  The expected folded output keeps each msg-id whole and
    # breaks the line only at the whitespace between them.
    header_value = (
        '<referenceid1thatislongerthan@maxlinelength.com> '
        '<referenceid2thatislongerthan@maxlinelength.com>'
    )
    expected = (
        'References: '
        '<referenceid1thatislongerthan@maxlinelength.com>\n'
        ' <referenceid2thatislongerthan@maxlinelength.com>\n'
    )
    h = self.make_header('References', header_value)
    narrow_policy = policy.default.clone(max_line_length=20)
    self.assertEqual(h.fold(policy=narrow_policy), expected)
1827+
1828+
18161829
if __name__ == '__main__':
18171830
unittest.main()
Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
Add parsing for ``References`` and ``In-Reply-To`` headers to the :mod:`email`
2+
library that parses the header content as lists of message id tokens. This
3+
prevents them from being folded incorrectly.

0 commit comments

Comments
 (0)