SpamScope · fedelemantuano · Mar 16, 2026
diff --git a/src/mailparser/const.py b/src/mailparser/const.py
@@ -18,66 +18,66 @@
 
 import re
 
-REGXIP = re.compile(r"\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}")
-
-JUNK_PATTERN = r"[ \(\)\[\]\t\n]+"
-
-# Patterns for receiveds
-RECEIVED_PATTERNS = [
-    # FIXED: More restrictive 'from' clause
-    # Only matches 'from' at the beginning of the header (^) or after
-    # newline/whitespace to avoid matching within "for <email> from <email>"
-    # constructs which caused duplicate matches in IBM gateway headers
-    (
-        r"(?:(?:^|\n\s*)from\s+(?P<from>.+?)(?:\s*[(]?"
-        r"envelope-from|\s*[(]?envelope-sender|\s+"
-        r"by|\s+with(?! cipher)|\s+id|\s+via|;))"
-    ),
-    # IMPROVED: More precise 'by' clause
-    # Modified to not consume 'with' clause, allowing proper separation
-    # of 'by' (server name) and 'with' (protocol) fields
-    (
-        r"(?:(?:^|\s)by\s+(?P<by>[^\s]+(?:\s+[^\s]+)*?)"
-        r"(?:\s+with(?! cipher)|\s*[(]?envelope-from|\s*"
-        r"[(]?envelope-sender|\s+id|\s+for|\s+via|;))"
-    ),
-    # IMPROVED: 'with' clause with better boundary detection
-    (
-        r"(?:(?:^|\s)with(?! cipher)\s+(?P<with>.+?)"
-        r"(?:\s*[(]?envelope-from|\s*[(]?"
-        r"envelope-sender|\s+id|\s+for|\s+via|;))"
-    ),
-    # IMPROVED: 'id' clause with cleaner boundaries
-    (
-        r"(?:(?:^|\s)id\s+(?P<id>.+?)(?:\s*[(]?envelope-from|\s*"
-        r"[(]?envelope-sender|\s+for|\s+via|;))"
-    ),
-    # IMPROVED: 'for' clause - handles "for <email> from <email>" pattern
-    # Stops before 'from' keyword to prevent the 'from' pattern from
-    # matching the sender email in this construct
-    (
-        r"(?:(?:^|\s)for\s+(?P<for><[^>]+>|[^\s]+)"
-        r"(?:\s+from|\s*[(]?envelope-from|\s*[(]?"
-        r"envelope-sender|\s+via|;))"
-    ),
-    # IMPROVED: 'via' clause with better termination
-    (
-        r"(?:(?:^|\s)via\s+(?P<via>.+?)(?:\s*[(]?"
-        r"envelope-from|\s*[(]?envelope-sender|;))"
-    ),
-    # assumes emails are always inside <>
-    r"(?:envelope-from\s+<(?P<envelope_from>.+?)>)",
-    r"(?:envelope-sender\s+<(?P<envelope_sender>.+?)>)",
-    # datetime comes after ; at the end
-    r";\s*(?P<date>.*)",
-    # sendgrid datetime
-    (
-        r"(?P<date>\d{4}-\d{2}-\d{2} \d{2}:\d{2}:"
-        r"\d{2}\.\d{9} \+0000 UTC) m=\+\d+\.\d+"
-    ),
-]
-
-RECEIVED_COMPILED_LIST = [re.compile(i, re.I | re.DOTALL) for i in RECEIVED_PATTERNS]
+# IPv4 pattern - octets limited to 0-255; lookarounds stop partial matches
+REGXIP = re.compile(
+    r"(?<![\d.])(?:(?:25[0-5]|2[0-4]\d|[01]?\d\d?)\.){3}"
+    r"(?:25[0-5]|2[0-4]\d|[01]?\d\d?)(?!\.?\d)"
+)
+
+# IPv6 pattern - full/compressed text forms (RFC 4291 §2.2); edges anchored
+REGXIP6 = re.compile(
+    r"(?:(?<=[Ii][Pp][Vv]6:)|(?<![0-9A-Za-z:]))(?:"  # left edge, "IPv6:" tag ok
+    r"(?:[0-9a-fA-F]{1,4}:){7}[0-9a-fA-F]{1,4}"  # full form
+    r"|(?:[0-9a-fA-F]{1,4}:){1,7}:"  # trailing ::
+    r"|(?:[0-9a-fA-F]{1,4}:){1,6}:[0-9a-fA-F]{1,4}"  # :: with 1 group after
+    r"|(?:[0-9a-fA-F]{1,4}:){1,5}(?::[0-9a-fA-F]{1,4}){1,2}"
+    r"|(?:[0-9a-fA-F]{1,4}:){1,4}(?::[0-9a-fA-F]{1,4}){1,3}"
+    r"|(?:[0-9a-fA-F]{1,4}:){1,3}(?::[0-9a-fA-F]{1,4}){1,4}"
+    r"|(?:[0-9a-fA-F]{1,4}:){1,2}(?::[0-9a-fA-F]{1,4}){1,5}"
+    r"|[0-9a-fA-F]{1,4}:(?::[0-9a-fA-F]{1,4}){1,6}"
+    r"|:(?::[0-9a-fA-F]{1,4}){1,7}|::)(?![0-9a-fA-F:])"  # ::x:x..., bare ::
+)
+
+# Whitespace junk to normalize: one or more tabs/newlines (spaces excluded).
+# Parentheses and square brackets are deliberately not part of the class.
+JUNK_PATTERN = r"[\t\n]+"
+
+# ------------------------------------------------------------------ #
+# Received header parsing — simplified paraphrase of the RFC 5321 §4.4 grammar:
+#
+#   Received     = "Received:" *( received-token / comment ) ";" date-time
+#   received-token = "from" domain / "by" domain / "via" atom
+#                  / "with" atom  / "id"  atom   / "for" addr-spec
+#
+# Strategy: tokenize on clause keywords, then extract values per clause.
+# This eliminates the duplicated boundary lookaheads of the old
+# per-clause pattern list and matches the RFC grammar directly.
+# ------------------------------------------------------------------ #
+
+# Finds clause keywords (case-insensitive): from, by, via, with (unless
+# followed by "cipher"), id, for, envelope-from, envelope-sender.  A keyword
+# must sit between start-of-string/whitespace and whitespace.  The pattern
+# itself is unaware of "(...)" comments.  One capture group, so re.split()
+# returns [preamble, kw1, val1, kw2, val2, ...].
+_CLAUSE_SPLITTER = re.compile(
+    r"(?:^|\s+)"
+    r"(from|by|via|with(?!\s+cipher)|id|for|envelope-from|envelope-sender)"
+    r"\s+",
+    re.I,
+)
+
+# Any "<...>" in a string; group 1 is the text between the angle brackets.
+_ENVELOPE_FROM_RE = re.compile(r"<([^>]+)>")
+
+# ";" plus optional whitespace; group 1 is the rest of the string (DOTALL).
+_DATE_RE = re.compile(r";\s*(.*)", re.DOTALL)
+
+# SendGrid-style timestamp "YYYY-MM-DD HH:MM:SS.nnnnnnnnn +0000 UTC m=+N.N"
+_SENDGRID_DATE_RE = re.compile(
+    r"(\d{4}-\d{2}-\d{2}\s+\d{2}:\d{2}:\d{2}\.\d{9}\s+\+0000\s+UTC)"
+    r"\s+m=\+\d+\.\d+",
+    re.I,
+)
 
 EPILOGUE_DEFECTS = {"StartBoundaryNotFoundDefect"}
 

diff --git a/src/mailparser/core.py b/src/mailparser/core.py
@@ -23,7 +23,7 @@
 import logging
 import os
 
-from mailparser.const import ADDRESSES_HEADERS, EPILOGUE_DEFECTS, REGXIP
+from mailparser.const import ADDRESSES_HEADERS, EPILOGUE_DEFECTS, REGXIP, REGXIP6
 from mailparser.utils import (
     convert_mail_date,
     decode_header_part,
@@ -510,14 +510,22 @@ def get_server_ipaddress(self, trust):
     def _extract_ip(self, received_header):
         """
         Extract the IP address from the received header if it is not private.
+        Supports both IPv4 (RFC 791) and IPv6 (RFC 5952) addresses.
 
         Args:
             received_header (string): The received header string
 
         Returns:
             string with the ip address or None
         """
-        check = REGXIP.findall(received_header[0 : received_header.find("by")])
+        # Cut at the first standalone "by" token, not at "by" inside a hostname
+        words = received_header.split()
+        lowered = [word.lower() for word in words]
+        cut = lowered.index("by") if "by" in lowered else len(words)
+        from_part = " ".join(words[:cut])
+        # Prefer IPv4; fall back to IPv6 only when no IPv4 literal was found
+        check = REGXIP.findall(from_part) or REGXIP6.findall(from_part)
+
         if check:
             try:
                 ip_str = str(check[-1])

diff --git a/src/mailparser/utils.py b/src/mailparser/utils.py
@@ -39,7 +39,10 @@
     ADDRESSES_HEADERS,
     JUNK_PATTERN,
     OTHERS_PARTS,
-    RECEIVED_COMPILED_LIST,
+    _CLAUSE_SPLITTER,
+    _DATE_RE,
+    _ENVELOPE_FROM_RE,
+    _SENDGRID_DATE_RE,
 )
 from mailparser.exceptions import MailParserOSError, MailParserReceivedParsingError
 
@@ -240,8 +243,11 @@ def msgconvert(email):
 
 def parse_received(received):
     """
-    Parse a single received header.
-    Return a dictionary of values by clause.
+    Parse a single received header by tokenizing on RFC 5321 §4.4 keywords.
+
+    Uses a keyword-based splitter to divide the header into clauses
+    (from, by, via, with, id, for, envelope-from, envelope-sender),
+    then extracts the date from after the semicolon.
 
     Arguments:
         received {str} -- single received header
@@ -255,47 +261,71 @@ def parse_received(received):
     """
 
     values_by_clause = {}
-    for pattern in RECEIVED_COMPILED_LIST:
-        matches = [match for match in pattern.finditer(received)]
-
-        if len(matches) == 0:
-            # no matches for this clause, but it's ok! keep going!
-            log.debug("No matches found for %s in %s" % (pattern.pattern, received))
-        elif len(matches) > 1:
-            # uh, can't have more than one of each clause in a received.
-            # so either there's more than one or the current regex is wrong
-            msg = "More than one match found for %s in %s" % (pattern.pattern, received)
-            log.error(msg)
-            raise MailParserReceivedParsingError(msg)
+
+    # --- Step 1: Split off the date ---
+    # Standard headers put it after the first ";"; SendGrid appends a
+    # timestamp without a semicolon, so that form is tried second.
+    stamp = _DATE_RE.search(received) or _SENDGRID_DATE_RE.search(received)
+    header_body = received
+    if stamp:
+        values_by_clause["date"] = stamp.group(1)
+        # Clause parsing only looks at the text before the date
+        header_body = received[: stamp.start()]
+
+    # --- Step 2: Locate clause keywords outside "(...)" comments ---
+    # Comments are not clauses: "(Postfix, from userid 1000)" must neither
+    # start a 'from' clause nor cut the preceding 'by' value short.  Comment
+    # text is blanked with "_" (same length) so that match offsets found in
+    # the masked copy can slice the values out of the untouched header_body.
+    depth = 0
+    masked = []
+    for char in header_body:
+        if char == "(":
+            depth += 1
+        masked.append("_" if depth else char)
+        if char == ")" and depth:
+            depth -= 1
+    # Unbalanced "(": masking would hide real clauses, so scan the raw text
+    scan_text = header_body if depth else "".join(masked)
+    hits = list(_CLAUSE_SPLITTER.finditer(scan_text))
+
+    for pos, hit in enumerate(hits):
+        keyword = hit.group(1).lower()
+        # A clause value runs up to the next keyword (or the end of the body)
+        stop = hits[pos + 1].start() if pos + 1 < len(hits) else len(header_body)
+        value = header_body[hit.end() : stop].strip()
+
+        if keyword in ("envelope-from", "envelope-sender"):
+            # Keep only the address between the angle brackets
+            m = _ENVELOPE_FROM_RE.search(value)
+            if m:
+                values_by_clause[keyword.replace("-", "_")] = m.group(1)
+        elif keyword == "from" and "from" in values_by_clause:
+            # RFC 5321 allows one 'from' clause; later ones come from IBM-style
+            # "for <addr> from <sender>" constructs, so the first one is kept.
+            log.debug("Ignoring additional 'from' clause: %s", value)
         else:
-            # otherwise we have one matching clause!
-            log.debug("Found one match for %s in %s" % (pattern.pattern, received))
-            match = matches[0].groupdict()
-            key = list(match.keys())[0]
-            value = list(match.values())[0]
-            values_by_clause[key] = value
-
-    if len(values_by_clause) == 0:
-        # we weren't able to match anything...
+            # Any other clause: a later duplicate overwrites an earlier one
+            values_by_clause[keyword] = value
+
+    # --- Step 3: envelope-from/sender given inside a comment ---
+    # Some MTAs write e.g. "by host.com (envelope-from <addr>)".  The whole
+    # header body is searched, so the comment may follow any clause.
+    for env_name in ("envelope-from", "envelope-sender"):
+        env_key = env_name.replace("-", "_")
+        if env_key in values_by_clause:
+            continue
+        m = re.search(
+            r"(?i)" + re.escape(env_name) + r"\s+<([^>]+)>", header_body
+        )
+        if m:
+            values_by_clause[env_key] = m.group(1)
+
+    if not values_by_clause:
         msg = "Unable to match any clauses in %s" % (received)
-
-        # Modification #1: Commenting the following log as
-        # this raised exception is caught above and then
-        # raw header is updated in response
-        # We dont want to get so many errors in our error
-        # logger as we are not even trying to parse the
-        # received headers
-        # Wanted to make it configurable via settiings,
-        # but this package does not depend on django and
-        # making configurable setting
-        # will make it django dependent,
-        # so better to keep it working with only python
-        # dependent and on any framework of python
-        # commenting it just for our use
-
-        # log.error(msg)
-
         raise MailParserReceivedParsingError(msg)
+
+    log.debug("Parsed clauses: %s", list(values_by_clause.keys()))
     return values_by_clause