Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,7 @@
**v0.54.1**
* [[TeamMsgExtractor #462](https://github.com/TeamMsgExtractor/msg-extractor/issues/462)] Fix potential issue where child MSG might have incompatible encoding to parent MSG when trying to grab a stream from the parent.
* Added code to attempt to significantly improve RTF deencapsulation times. This tries to strip away unneeded data before passing it to `RTFDE`. This shows improvements on all files that take more than one second. Currently, this also fixes some files that previously produced incorrect output from `RTFDE` when deencapsulating the HTML body, specifically around non-breaking spaces sometimes not transferring over.

**v0.54.0**
* [[TeamMsgExtractor #456](https://github.com/TeamMsgExtractor/msg-extractor/issues/456)] Changed the prepared html output to use plainly encoded HTML instead of prettified, since current prettification options used mangles the output and causes the output to sometimes be very large.

Expand Down
4 changes: 2 additions & 2 deletions README.rst
Original file line number Diff line number Diff line change
Expand Up @@ -260,8 +260,8 @@ your access to the newest major version of extract-msg.
.. |License: GPL v3| image:: https://img.shields.io/badge/License-GPLv3-blue.svg
:target: LICENSE.txt

.. |PyPI3| image:: https://img.shields.io/badge/pypi-0.54.0-blue.svg
:target: https://pypi.org/project/extract-msg/0.54.0/
.. |PyPI3| image:: https://img.shields.io/badge/pypi-0.54.1-blue.svg
:target: https://pypi.org/project/extract-msg/0.54.1/

.. |PyPI2| image:: https://img.shields.io/badge/python-3.8+-brightgreen.svg
:target: https://www.python.org/downloads/release/python-3810/
Expand Down
4 changes: 2 additions & 2 deletions extract_msg/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,8 +27,8 @@
# along with this program. If not, see <http://www.gnu.org/licenses/>.

__author__ = 'Destiny Peterson & Matthew Walker'
__date__ = '2025-03-23'
__version__ = '0.54.0'
__date__ = '2025-04-10'
__version__ = '0.54.1'

__all__ = [
# Modules:
Expand Down
12 changes: 12 additions & 0 deletions extract_msg/constants/re.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,9 @@
'HTML_SAN_SPACE',
'INVALID_FILENAME_CHARS',
'INVALID_OLE_PATH',
'RTF_BODY_STRIP_INIT',
'RTF_BODY_STRIP_PRE_CLOSE',
'RTF_BODY_STRIP_PRE_OPEN',
'RTF_ENC_BODY_START',
]

Expand Down Expand Up @@ -40,3 +43,12 @@
# invalid.
INVALID_OLE_PATH: Final[_RE_STR_TYPE] = re.compile(r'[:/\\!]')

# Used as the initial step in stripping RTF files for deencapsulation. Finds
# ignored sections that do not contain groups *and* finds HTML tag sections
# that are entirely empty. It also finds sections of data that can be merged
# together without affecting the results. One alternative per literal below:
#   1. An ignored (htmlrtf ... htmlrtf0) section containing no braces.
#   2. An htmltag destination group with nothing after its number.
#   3. An htmlrtf0 immediately followed by another htmlrtf/htmlrtf1.
#   4. An ignored section containing only an empty group.
#   5. An ignored section containing only a single \'hh hex escape.
RTF_BODY_STRIP_INIT: Final[_RE_BYTES_TYPE] = re.compile(
    rb'(\\htmlrtf[^0{}][^{}]*?\\htmlrtf0 ?)'
    rb'|(\{\\\*\\htmltag[0-9]+\})'
    rb'|(\\htmlrtf0 ?\\htmlrtf1? ?)'
    rb'|(\\htmlrtf1? ?\{\}\\htmlrtf0 ?)'
    rb'|(\\htmlrtf1? ?\\\'[a-fA-F0-9]{2}\\htmlrtf0 ?)'
)

# Preprocessing steps to simplify the RTF. The "close" pattern finds an
# ignored section that ends with a group-closing brace (optionally preceded
# by brace-free content); the "open" pattern finds an ignored section that
# starts with a group-opening brace.
RTF_BODY_STRIP_PRE_CLOSE: Final[_RE_BYTES_TYPE] = re.compile(
    rb'(\\htmlrtf1? ?}\\htmlrtf0 ?)'
    rb'|(\\htmlrtf1? ?[^0{}][^{}]*?} ?\\htmlrtf0 ?)'
)
RTF_BODY_STRIP_PRE_OPEN: Final[_RE_BYTES_TYPE] = re.compile(
    rb'\\htmlrtf1? ?{[^{}]*?\\htmlrtf0 ?'
)
8 changes: 7 additions & 1 deletion extract_msg/msg_classes/message_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,8 @@
from ..utils import (
addNumToDir, addNumToZipDir, createZipOpen, decodeRfc2047, findWk,
htmlSanitize, inputToBytes, inputToString, isEncapsulatedRtf,
prepareFilename, rtfSanitizeHtml, rtfSanitizePlain, validateHtml
prepareFilename, rtfSanitizeHtml, rtfSanitizePlain, stripRtf,
validateHtml
)


Expand Down Expand Up @@ -1012,6 +1013,11 @@ def deencapsulatedRtf(self) -> Optional[RTFDE.DeEncapsulator]:
while body and body[-1] != 125:
body = body[:-1]

# Some files take a long time due to how they are structured and
# how RTFDE works. The longer a file would normally take, the
# better this fix works:
body = stripRtf(body)

try:
deencapsultor = RTFDE.DeEncapsulator(body)
deencapsultor.deencapsulate()
Expand Down
22 changes: 19 additions & 3 deletions extract_msg/msg_classes/msg.py
Original file line number Diff line number Diff line change
Expand Up @@ -203,7 +203,23 @@ def __init__(self, path, **kwargs):
self.__overrideEncoding = overrideEncoding

if prefix and not filename:
filename = self.getStringStream(prefixl[:-1] + ['__substg1.0_3001'], prefix = False)
# We actually need to get this from the parent.
msg = None
parentNeedsClose = False
if self.__parentMsg:
msg = self.__parentMsg()
if msg is None:
# We *NEED* the parent here, so we're going to do something
# dumb and just generate it *manually*, grab what we need,
# and then immediately close it.
#
# We don't need anything more advanced than MSGFile.
msg = MSGFile(path, prefix = prefixl[:-2], delayAttachments = True)
parentNeedsClose = True
# Now that we know we have the parent, grab the stream.
filename = msg.getStringStream(prefixl[:-1] + ['__substg1.0_3001'], prefix = False)
# Now if we opened the parent, close it.
if parentNeedsClose:
msg.close()
if filename:
self.filename = filename
elif hasattr(path, '__len__'):
Expand Down Expand Up @@ -492,7 +508,7 @@ def export(self, path, allowBadEmbed: bool = False) -> None:

:param path: A path-like object (including strings and ``pathlib.Path``
objects) or an IO device with a write method which accepts bytes.
:param allowBadEmbed: If True, attempts to skip steps that will fail if
:param allowBadEmbed: If True, attempts to skip steps that will fail if
the embedded MSG file violates standards. It will also attempt to repair the data to try to ensure it can open in Outlook.
"""
from ..ole_writer import OleWriter
Expand All @@ -507,7 +523,7 @@ def exportBytes(self, allowBadEmbed: bool = False) -> bytes:
"""
Saves a new copy of the MSG file, returning the bytes.

:param allowBadEmbed: If True, attempts to skip steps that will fail if
:param allowBadEmbed: If True, attempts to skip steps that will fail if
the embedded MSG file violates standards. It will also attempt to repair the data to try to ensure it can open in Outlook.
"""
out = io.BytesIO()
Expand Down
59 changes: 59 additions & 0 deletions extract_msg/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@
'rtfSanitizeHtml',
'rtfSanitizePlain',
'setupLogging',
'stripRtf',
'tryGetMimetype',
'unsignedToSignedInt',
'unwrapMsg',
Expand All @@ -61,6 +62,7 @@
import logging.config
import os
import pathlib
import re
import shutil
import struct
import sys
Expand Down Expand Up @@ -1012,6 +1014,63 @@ def setupLogging(defaultPath = None, defaultLevel = logging.WARN, logfile = None
return True


def stripRtf(rtfBody: bytes) -> bytes:
    """
    Cleans up RTF before sending it to RTFDE.

    Runs a fixed sequence of regular expression substitutions over the data:
    the two pre-strip patterns (whose replacement text is chosen by the
    open/close helper callbacks), followed by two passes of the initial strip
    pattern, which deletes whatever it matches.

    :param rtfBody: The raw RTF data to simplify.

    :returns: The RTF data after all substitutions have been applied.
    """
    patterns = constants.re

    # Pre-strip: simplify ignored sections as much as possible first.
    stripped = patterns.RTF_BODY_STRIP_PRE_OPEN.sub(_stripRtfOpenHelper, rtfBody)
    stripped = patterns.RTF_BODY_STRIP_PRE_CLOSE.sub(_stripRtfCloseHelper, stripped)

    # Initial strip, run twice: removing one match can leave behind new text
    # that only becomes matchable on a second pass.
    for _ in range(2):
        stripped = patterns.RTF_BODY_STRIP_INIT.sub(b'', stripped)

    # TODO: Further processing...

    return stripped

def _stripRtfCloseHelper(match: re.Match) -> bytes:
    """
    Substitution callback used with ``RTF_BODY_STRIP_PRE_CLOSE``.

    The matched text is returned unchanged when it contains more than one
    ``htmlrtf0`` control word or contains a backslash immediately followed by
    ``f``. Otherwise it is replaced by a minimal ignored section holding only
    the closing brace.

    :param match: The match produced by the pre-close pattern.

    :returns: The bytes to substitute for the match.
    """
    matched = match.group(0)

    keepAsIs = matched.count(b'\\htmlrtf0') > 1 or b'\\f' in matched
    if keepAsIs:
        return matched

    return b'\\htmlrtf}\\htmlrtf0 '


def _stripRtfOpenHelper(match: re.Match) -> bytes:
    """
    Substitution callback used with ``RTF_BODY_STRIP_PRE_OPEN``.

    The matched text is returned unchanged when it contains a backslash
    immediately followed by ``f``. Otherwise it is replaced by a minimal
    ignored section holding only the opening brace.

    :param match: The match produced by the pre-open pattern.

    :returns: The bytes to substitute for the match.
    """
    matched = match.group(0)

    if b'\\f' not in matched:
        return b'\\htmlrtf{\\htmlrtf0 '

    return matched


def _stripRtfHelper(match: re.Match) -> bytes:
    """
    Substitution callback scaffold for further RTF stripping.

    Operates on the text of the match itself (``match.group(0)``), not on
    ``match.string``: the latter is the entire buffer being searched, and
    returning it from a substitution callback would splice the whole input
    into every match.

    The processing is not implemented yet, so every path currently returns
    the matched text unchanged. The guards below mark the cases in which no
    processing is to be attempted.

    :param match: The match for the section being considered.

    :returns: The bytes to substitute for the match.
    """
    res = match.group(0)

    # If the group markers are unbalanced or absent, don't even try.
    openCount = res.count(b'{')
    if openCount != res.count(b'}') or openCount == 0:
        return res

    # If any group markers are prefixed by a backslash, give up.
    if b'\\{' in res or b'\\}' in res:
        return res

    # Last little bit of processing to validate everything. We know the {}
    # match, but let's be *absolutely* sure.
    # TODO

    return res




def tryGetMimetype(att: AttachmentBase, mimetype: Union[str, None]) -> Union[str, None]:
"""
Uses an optional dependency to try and get the mimetype of an attachment.
Expand Down