microsoft · he-yufeng · May 23, 2026
diff --git a/packages/markitdown/src/markitdown/converter_utils/docx/pre_process.py b/packages/markitdown/src/markitdown/converter_utils/docx/pre_process.py
@@ -1,3 +1,4 @@
+import struct
 import zipfile
 from io import BytesIO
 from typing import BinaryIO
@@ -115,6 +116,39 @@ def _pre_process_math(content: bytes) -> bytes:
     return str(soup).encode()
 
 
+def _fix_zip_name_casing(input_docx: BinaryIO) -> BinaryIO:
+    """Align local ZIP header names with the central directory's casing.
+
+    zipfile refuses to read a member whose local-header name differs from its
+    central-directory name. Where the two differ only by ASCII case, the
+    local name is overwritten in place with the central one.
+
+    Returns the rewound input when nothing was patched, else a new BytesIO.
+    """
+    input_docx.seek(0)
+    raw = bytearray(input_docx.read())
+    patched = False
+    with zipfile.ZipFile(BytesIO(raw), mode="r") as zip_input:
+        for info in zip_input.infolist():
+            offset = info.header_offset
+            if raw[offset : offset + 4] != b"PK\x03\x04":  # not a local header
+                continue
+            name_len = struct.unpack_from("<H", raw, offset + 26)[0]  # name length
+            name_span = slice(offset + 30, offset + 30 + name_len)  # past fixed part
+            local = raw[name_span]
+            encoding = "utf-8" if info.flag_bits & 0x800 else "cp437"  # bit 11: UTF-8
+            # zipfile validates against orig_filename, not the sanitized filename.
+            central = info.orig_filename.encode(encoding)
+            # Equal after lowercasing implies equal length: no bytes can shift.
+            if local != central and local.lower() == central.lower():
+                raw[name_span] = central
+                patched = True
+    if patched:
+        return BytesIO(bytes(raw))
+    input_docx.seek(0)
+    return input_docx
+
+
 def pre_process_docx(input_docx: BinaryIO) -> BinaryIO:
     """
     Pre-processes a DOCX file with provided steps.
@@ -129,6 +163,7 @@ def pre_process_docx(input_docx: BinaryIO) -> BinaryIO:
     Returns:
         BinaryIO: A binary output stream representing the processed DOCX file.
     """
+    input_docx = _fix_zip_name_casing(input_docx)
     output_docx = BytesIO()
     # The files that need to be pre-processed from .docx
     pre_process_enable_files = [

diff --git a/packages/markitdown/tests/test_module_misc.py b/packages/markitdown/tests/test_module_misc.py
@@ -3,9 +3,12 @@
 import os
 import re
 import shutil
+import struct
+import zipfile
 import pytest
 from unittest.mock import MagicMock
 
+from markitdown.converter_utils.docx.pre_process import pre_process_docx
 from markitdown._uri_utils import parse_data_uri, file_uri_to_path
 
 from markitdown import (
@@ -274,6 +277,33 @@ def test_docx_equations() -> None:
     assert block_equations, "No block equations found in the document."
 
 
+def test_docx_preprocess_repairs_case_mismatched_zip_names() -> None:
+    """A local header name differing only by case is repaired by pre-processing."""
+    member = "customXml/item2.xml"
+    buffer = io.BytesIO()
+    with zipfile.ZipFile(buffer, mode="w") as archive:
+        archive.writestr("[Content_Types].xml", b"<Types/>")
+        archive.writestr("word/document.xml", b"<w:document/>")
+        archive.writestr(member, b"<item/>")
+    data = bytearray(buffer.getvalue())
+    with zipfile.ZipFile(io.BytesIO(data), mode="r") as archive:
+        header = archive.getinfo(member).header_offset
+
+    # Corrupt only the local header; the central directory keeps the right name.
+    (length,) = struct.unpack_from("<H", data, header + 26)
+    span = slice(header + 30, header + 30 + length)
+    assert data[span] == b"customXml/item2.xml"
+    data[span] = b"customXML/item2.xml"
+    with zipfile.ZipFile(io.BytesIO(data), mode="r") as archive:
+        with pytest.raises(zipfile.BadZipFile):
+            archive.read(member)
+
+    fixed = pre_process_docx(io.BytesIO(data))
+    with zipfile.ZipFile(fixed, mode="r") as archive:
+        assert archive.read(member) == b"<item/>"
+        assert archive.read("word/document.xml")
+
+
 def test_input_as_strings() -> None:
     markitdown = MarkItDown()