Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
120 changes: 60 additions & 60 deletions src/mailparser/const.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,66 +18,66 @@

import re

# Compiled pattern for a dotted-quad shape: four runs of 1-3 decimal digits
# separated by literal dots. Octet values are NOT range-checked, so a string
# such as "999.999.999.999" also matches.
REGXIP = re.compile(r"\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}")

# Regex source (not compiled) matching one or more consecutive characters
# from the set: space, "(", ")", "[", "]", tab, newline.
JUNK_PATTERN = r"[ \(\)\[\]\t\n]+"

# Regex sources for the clauses of a Received header. Every entry defines
# exactly one named group, named after the clause it captures; the closing
# alternation of each entry lists the tokens that end that clause's value.
RECEIVED_PATTERNS = [
    # 'from': accepted only at the very start of the header or at the start
    # of a continuation line (newline + optional whitespace). This keeps the
    # sender address of a "for <email> from <email>" construct (seen in IBM
    # gateway headers) from being reported as a second 'from' clause.
    (
        r"(?:(?:^|\n\s*)from\s+(?P<from>.+?)"
        r"(?:\s*[(]?envelope-from|\s*[(]?envelope-sender"
        r"|\s+by|\s+with(?! cipher)|\s+id|\s+via|;))"
    ),
    # 'by': lazily collects whitespace-separated words and stops in front of
    # 'with', so the server name and the protocol end up in separate fields.
    (
        r"(?:(?:^|\s)by\s+(?P<by>[^\s]+(?:\s+[^\s]+)*?)"
        r"(?:\s+with(?! cipher)"
        r"|\s*[(]?envelope-from|\s*[(]?envelope-sender"
        r"|\s+id|\s+for|\s+via|;))"
    ),
    # 'with': "with cipher" is excluded by the negative lookahead.
    (
        r"(?:(?:^|\s)with(?! cipher)\s+(?P<with>.+?)"
        r"(?:\s*[(]?envelope-from|\s*[(]?envelope-sender"
        r"|\s+id|\s+for|\s+via|;))"
    ),
    # 'id'
    (
        r"(?:(?:^|\s)id\s+(?P<id>.+?)"
        r"(?:\s*[(]?envelope-from|\s*[(]?envelope-sender"
        r"|\s+for|\s+via|;))"
    ),
    # 'for': either an <angle-bracketed> address or a single bare word; a
    # following 'from' keyword also terminates the value, which covers the
    # "for <email> from <email>" construct.
    (
        r"(?:(?:^|\s)for\s+(?P<for><[^>]+>|[^\s]+)"
        r"(?:\s+from"
        r"|\s*[(]?envelope-from|\s*[(]?envelope-sender"
        r"|\s+via|;))"
    ),
    # 'via'
    (
        r"(?:(?:^|\s)via\s+(?P<via>.+?)"
        r"(?:\s*[(]?envelope-from|\s*[(]?envelope-sender|;))"
    ),
    # envelope addresses: only matched when wrapped in <>
    r"(?:envelope-from\s+<(?P<envelope_from>.+?)>)",
    r"(?:envelope-sender\s+<(?P<envelope_sender>.+?)>)",
    # date: everything that follows a ';'
    r";\s*(?P<date>.*)",
    # SendGrid-style date, recognised by the trailing "m=+<n>.<n>" token
    (
        r"(?P<date>\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}\.\d{9} \+0000 UTC)"
        r" m=\+\d+\.\d+"
    ),
]

# The same patterns compiled case-insensitively, with '.' matching newlines.
RECEIVED_COMPILED_LIST = [
    re.compile(source, re.IGNORECASE | re.DOTALL)
    for source in RECEIVED_PATTERNS
]
# IPv4 pattern - dotted quad with every octet limited to 0-255 (RFC 791).
#
# The lookbehind/lookahead guards keep the pattern from matching *inside* a
# longer run of digits and dots. Without them the range check is defeated by
# partial matches: "999.1.1.1" would yield "99.1.1.1" and "1.1.1.256" would
# yield "1.1.1.25". With them, such strings produce no match at all.
REGXIP = re.compile(
    r"(?<![\d.])"  # not preceded by a digit or a dot
    r"(?:(?:25[0-5]|2[0-4]\d|[01]?\d\d?)\.){3}"
    r"(?:25[0-5]|2[0-4]\d|[01]?\d\d?)"
    r"(?!\.?\d)"  # not followed by a digit or by ".<digit>"
)

# IPv6 pattern - matches the full and "::"-compressed text forms described
# in RFC 4291 section 2.2 (the RFC 5952 canonical form is a subset of them).
#
# Two details matter for correctness:
# * Alternatives are ordered from the most groups allowed after "::" down to
#   the fewest. Python's alternation is leftmost-first, not longest-match,
#   so trying the "trailing ::" form early would cut "2001:db8::1" down to
#   "2001:db8::".
# * The lookbehind refuses to start in the middle of an alphanumeric run, so
#   the "6" of an "IPv6:" tag is not swallowed as the first address group.
#
# Zone ids ("%eth0") and dotted-quad tails ("::ffff:192.0.2.1") are not part
# of the match.
REGXIP6 = re.compile(
    r"(?<![0-9A-Za-z])"
    r"(?:(?:[0-9a-fA-F]{1,4}:){7}[0-9a-fA-F]{1,4}"  # full form, 8 groups
    r"|:(?::[0-9a-fA-F]{1,4}){1,7}"  # ::x, ::x:x, ...
    r"|[0-9a-fA-F]{1,4}:(?::[0-9a-fA-F]{1,4}){1,6}"
    r"|(?:[0-9a-fA-F]{1,4}:){1,2}(?::[0-9a-fA-F]{1,4}){1,5}"
    r"|(?:[0-9a-fA-F]{1,4}:){1,3}(?::[0-9a-fA-F]{1,4}){1,4}"
    r"|(?:[0-9a-fA-F]{1,4}:){1,4}(?::[0-9a-fA-F]{1,4}){1,3}"
    r"|(?:[0-9a-fA-F]{1,4}:){1,5}(?::[0-9a-fA-F]{1,4}){1,2}"
    r"|(?:[0-9a-fA-F]{1,4}:){1,6}:[0-9a-fA-F]{1,4}"  # :: with 1 group after
    r"|(?:[0-9a-fA-F]{1,4}:){1,7}:"  # trailing ::
    r"|::)"  # just ::
)

# Regex source (not compiled) matching one or more consecutive tab or
# newline characters. Intended for whitespace normalization: presumably the
# caller replaces each run with a single space (the substitution itself is
# not visible here - confirm at the call site).
# Spaces, parentheses and square brackets are not in the character class, so
# parenthesized comments and bracketed IPs are left untouched.
JUNK_PATTERN = r"[\t\n]+"

# ------------------------------------------------------------------ #
# Received header parsing — simplified from the trace-information
# grammar of RFC 5321 §4.4:
#
#   Received       = "Received:" *( received-token / comment ) ";" date-time
#   received-token = "from" domain / "by" domain / "via" atom
#                  / "with" atom / "id" atom / "for" addr-spec
#
# Strategy: tokenize on clause keywords, then extract values per clause.
# This eliminates the duplicated boundary lookaheads of the old
# per-clause pattern list and follows the clause structure of the RFC.
# ------------------------------------------------------------------ #

# Clause-keyword splitter for the part of a Received header that precedes
# the date.
#
# Each match covers ONE clause keyword plus the whitespace around it; the
# keyword is the only capturing group, so ``_CLAUSE_SPLITTER.split(text)``
# returns ``[preamble, keyword1, value1, keyword2, value2, ...]`` where each
# value is the text lying between one keyword and the next.
#
# Keywords: from, by, via, with, id, for, plus the non-standard
# envelope-from and envelope-sender. A keyword only counts when it
# * starts the string or follows whitespace, and is followed by whitespace;
# * is not "with" introducing "with cipher ..." (TLS detail, not a clause);
# * is not inside a parenthesized comment: the final lookahead rejects a
#   keyword when a ")" is reached before any "(", as in
#   "(Postfix, from userid 1000)", which would otherwise be reported as a
#   'from' clause. Nested comments are not tracked.
_CLAUSE_SPLITTER = re.compile(
    r"(?:^|\s+)"
    r"(from|by|via|with(?!\s+cipher)|id|for|envelope-from|envelope-sender)"
    r"\s+"
    r"(?![^()]*\))",
    re.I,
)

# Captures (group 1) the text between a "<" and the next ">". The pattern
# itself does not look for an "envelope-from" keyword; presumably it is
# applied to the value of an envelope-from / envelope-sender clause to pull
# out the address in "<addr>" - confirm at the call site.
_ENVELOPE_FROM_RE = re.compile(r"<([^>]+)>")

# Date after semicolon (standard RFC 5321). When used with search(), group 1
# is everything that follows the first ";" (leading whitespace skipped);
# DOTALL lets the capture run across line breaks to the end of the string.
_DATE_RE = re.compile(r";\s*(.*)", re.DOTALL)

# SendGrid non-standard date format (no semicolon). Group 1 is a timestamp
# shaped like "YYYY-MM-DD HH:MM:SS.<9 digits> +0000 UTC"; it only matches
# when followed by whitespace and an "m=+<digits>.<digits>" token.
_SENDGRID_DATE_RE = re.compile(
r"(\d{4}-\d{2}-\d{2}\s+\d{2}:\d{2}:\d{2}\.\d{9}\s+\+0000\s+UTC)"
r"\s+m=\+\d+\.\d+",
re.I,
)

# Set of defect names, given as strings. The single entry looks like the name
# of a defect class from the stdlib ``email.errors`` module; how the set is
# consumed is not visible here.
EPILOGUE_DEFECTS = {"StartBoundaryNotFoundDefect"}

Expand Down
12 changes: 10 additions & 2 deletions src/mailparser/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@
import logging
import os

from mailparser.const import ADDRESSES_HEADERS, EPILOGUE_DEFECTS, REGXIP
from mailparser.const import ADDRESSES_HEADERS, EPILOGUE_DEFECTS, REGXIP, REGXIP6
from mailparser.utils import (
convert_mail_date,
decode_header_part,
Expand Down Expand Up @@ -510,14 +510,22 @@ def get_server_ipaddress(self, trust):
def _extract_ip(self, received_header):
"""
Extract the IP address from the received header if it is not private.
Supports both IPv4 (RFC 791) and IPv6 (RFC 5952) addresses.

Args:
received_header (string): The received header string

Returns:
string with the ip address or None
"""
check = REGXIP.findall(received_header[0 : received_header.find("by")])
by_idx = received_header.find("by")
from_part = received_header[:by_idx] if by_idx != -1 else received_header

# Try IPv4 first, then IPv6
check = REGXIP.findall(from_part)
if not check:
check = REGXIP6.findall(from_part)

if check:
try:
ip_str = str(check[-1])
Expand Down
112 changes: 71 additions & 41 deletions src/mailparser/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,10 @@
ADDRESSES_HEADERS,
JUNK_PATTERN,
OTHERS_PARTS,
RECEIVED_COMPILED_LIST,
_CLAUSE_SPLITTER,
_DATE_RE,
_ENVELOPE_FROM_RE,
_SENDGRID_DATE_RE,
)
from mailparser.exceptions import MailParserOSError, MailParserReceivedParsingError

Expand Down Expand Up @@ -240,8 +243,11 @@ def msgconvert(email):

def parse_received(received):
"""
Parse a single received header.
Return a dictionary of values by clause.
Parse a single received header by tokenizing on RFC 5321 §4.4 keywords.

Uses a keyword-based splitter to divide the header into clauses
(from, by, via, with, id, for, envelope-from, envelope-sender),
then extracts the date from after the semicolon.

Arguments:
received {str} -- single received header
Expand All @@ -255,47 +261,71 @@ def parse_received(received):
"""

values_by_clause = {}
for pattern in RECEIVED_COMPILED_LIST:
matches = [match for match in pattern.finditer(received)]

if len(matches) == 0:
# no matches for this clause, but it's ok! keep going!
log.debug("No matches found for %s in %s" % (pattern.pattern, received))
elif len(matches) > 1:
# uh, can't have more than one of each clause in a received.
# so either there's more than one or the current regex is wrong
msg = "More than one match found for %s in %s" % (pattern.pattern, received)
log.error(msg)
raise MailParserReceivedParsingError(msg)

# --- Step 1: Extract date (after semicolon, or SendGrid format) ---
date_match = _DATE_RE.search(received)
if date_match:
values_by_clause["date"] = date_match.group(1)
# Work only on the part before the semicolon for clause parsing
header_body = received[: date_match.start()]
else:
# Try SendGrid non-standard date
sg_match = _SENDGRID_DATE_RE.search(received)
if sg_match:
values_by_clause["date"] = sg_match.group(1)
header_body = received[: sg_match.start()]
else:
header_body = received

# --- Step 2: Tokenize on clause keywords ---
# _CLAUSE_SPLITTER.split gives: [preamble, kw1, val1, kw2, val2, ...]
parts = _CLAUSE_SPLITTER.split(header_body)

# parts[0] is preamble (before first keyword), then alternating kw/value
i = 1 # skip preamble
while i + 1 < len(parts):
keyword = parts[i].lower()
value = parts[i + 1].strip()
i += 2

if keyword in ("envelope-from", "envelope-sender"):
# Extract email from angle brackets
m = _ENVELOPE_FROM_RE.search(value)
if m:
values_by_clause[keyword.replace("-", "_")] = m.group(1)
elif keyword == "for":
values_by_clause[keyword] = value
elif keyword == "from":
# RFC 5321: only one 'from' clause per received header.
# Only accept the first occurrence; subsequent ones come from
# IBM-style "for <addr> from <sender>" constructs.
if "from" not in values_by_clause:
values_by_clause[keyword] = value
else:
# otherwise we have one matching clause!
log.debug("Found one match for %s in %s" % (pattern.pattern, received))
match = matches[0].groupdict()
key = list(match.keys())[0]
value = list(match.values())[0]
values_by_clause[key] = value

if len(values_by_clause) == 0:
# we weren't able to match anything...
values_by_clause[keyword] = value

# --- Step 3: Extract envelope-from/sender from within clause values ---
# Some MTAs embed envelope-from inside parenthesized comments in the
# 'by' clause, e.g.: "by host.com (envelope-from <addr>)"
for clause_key in ("by", "from", "with"):
clause_val = values_by_clause.get(clause_key, "")
for env_key, env_name in (
("envelope_from", "envelope-from"),
("envelope_sender", "envelope-sender"),
):
if env_key not in values_by_clause and env_name in clause_val.lower():
m = re.search(
r"(?i)" + re.escape(env_name) + r"\s+<([^>]+)>",
clause_val,
)
if m:
values_by_clause[env_key] = m.group(1)

if not values_by_clause:
msg = "Unable to match any clauses in %s" % (received)

# Modification #1: the log call below is commented out because
# the exception raised here is caught above, and the raw header
# is then put into the response instead.
# We don't want to get so many errors in our error logger, as
# we are not even trying to parse the received headers.
# We wanted to make this configurable via settings, but this
# package does not depend on Django, and a configurable setting
# would make it Django-dependent, so it is better to keep it
# working with plain Python under any Python framework.
# It is commented out just for our use.

# log.error(msg)

raise MailParserReceivedParsingError(msg)

log.debug("Parsed clauses: %s", list(values_by_clause.keys()))
return values_by_clause


Expand Down
Loading
Loading