Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import struct
import zipfile
from io import BytesIO
from typing import BinaryIO
Expand Down Expand Up @@ -115,6 +116,39 @@ def _pre_process_math(content: bytes) -> bytes:
return str(soup).encode()


def _fix_zip_name_casing(input_docx: BinaryIO) -> BinaryIO:
input_docx.seek(0)
raw = bytearray(input_docx.read())
patched = False

with zipfile.ZipFile(BytesIO(raw), mode="r") as zip_input:
for info in zip_input.infolist():
offset = info.header_offset
if raw[offset : offset + 4] != b"PK\x03\x04":
continue

name_len = struct.unpack_from("<H", raw, offset + 26)[0]
name_start = offset + 30
name_end = name_start + name_len
local_name = raw[name_start:name_end]
encoding = "utf-8" if info.flag_bits & 0x800 else "cp437"
central_name = info.filename.encode(encoding)

if (
local_name != central_name
and len(local_name) == len(central_name)
and local_name.lower() == central_name.lower()
):
raw[name_start:name_end] = central_name
patched = True

if patched:
return BytesIO(bytes(raw))

input_docx.seek(0)
return input_docx


def pre_process_docx(input_docx: BinaryIO) -> BinaryIO:
"""
Pre-processes a DOCX file with provided steps.
Expand All @@ -129,6 +163,7 @@ def pre_process_docx(input_docx: BinaryIO) -> BinaryIO:
Returns:
BinaryIO: A binary output stream representing the processed DOCX file.
"""
input_docx = _fix_zip_name_casing(input_docx)
output_docx = BytesIO()
# The files that need to be pre-processed from .docx
pre_process_enable_files = [
Expand Down
30 changes: 30 additions & 0 deletions packages/markitdown/tests/test_module_misc.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,12 @@
import os
import re
import shutil
import struct
import zipfile
import pytest
from unittest.mock import MagicMock

from markitdown.converter_utils.docx.pre_process import pre_process_docx
from markitdown._uri_utils import parse_data_uri, file_uri_to_path

from markitdown import (
Expand Down Expand Up @@ -274,6 +277,33 @@ def test_docx_equations() -> None:
assert block_equations, "No block equations found in the document."


def test_docx_preprocess_repairs_case_mismatched_zip_names() -> None:
    """
    pre_process_docx must make a member readable again when its local header
    name differs from the central directory name only by letter case.
    """
    # Build a minimal, well-formed archive in memory.
    archive = io.BytesIO()
    with zipfile.ZipFile(archive, mode="w") as writer:
        writer.writestr("[Content_Types].xml", b"<Types/>")
        writer.writestr("word/document.xml", b"<w:document/>")
        writer.writestr("customXml/item2.xml", b"<item/>")

    payload = bytearray(archive.getvalue())
    with zipfile.ZipFile(io.BytesIO(payload), mode="r") as reader:
        header_at = reader.getinfo("customXml/item2.xml").header_offset

    # Corrupt only the local header copy of the name (Xml -> XML).
    (stored_len,) = struct.unpack_from("<H", payload, header_at + 26)
    first = header_at + 30
    name_span = slice(first, first + stored_len)
    assert payload[name_span] == b"customXml/item2.xml"
    payload[name_span] = b"customXML/item2.xml"

    # Sanity check: the stock reader rejects the tampered archive.
    with zipfile.ZipFile(io.BytesIO(payload), mode="r") as reader:
        with pytest.raises(zipfile.BadZipFile):
            reader.read("customXml/item2.xml")

    # After pre-processing, both members must be readable.
    repaired = pre_process_docx(io.BytesIO(payload))
    with zipfile.ZipFile(repaired, mode="r") as reader:
        assert reader.read("customXml/item2.xml") == b"<item/>"
        assert reader.read("word/document.xml")


def test_input_as_strings() -> None:
markitdown = MarkItDown()

Expand Down