Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
48 changes: 25 additions & 23 deletions packages/markitdown/src/markitdown/_markitdown.py
Original file line number Diff line number Diff line change
Expand Up @@ -144,6 +144,8 @@ def enable_builtins(self, **kwargs) -> None:
This method should only be called once, if built-ins were initially disabled.
"""
if not self._builtins_enabled:
disabled_converters = set(kwargs.get("disabled_converters") or [])

# TODO: Move these into converter constructors
self._llm_client = kwargs.get("llm_client")
self._llm_model = kwargs.get("llm_model")
Expand Down Expand Up @@ -178,30 +180,30 @@ def enable_builtins(self, **kwargs) -> None:
# Register converters for successful browsing operations
# Later registrations are tried first / take higher priority than earlier registrations
# To this end, the most specific converters should appear below the most generic converters
self.register_converter(
PlainTextConverter(), priority=PRIORITY_GENERIC_FILE_FORMAT
)
self.register_converter(
ZipConverter(markitdown=self), priority=PRIORITY_GENERIC_FILE_FORMAT
)
self.register_converter(
HtmlConverter(), priority=PRIORITY_GENERIC_FILE_FORMAT
def maybe_register(converter: DocumentConverter, priority: float) -> None:
if type(converter).__name__ not in disabled_converters:
self.register_converter(converter, priority=priority)

maybe_register(PlainTextConverter(), PRIORITY_GENERIC_FILE_FORMAT)
maybe_register(
ZipConverter(markitdown=self), PRIORITY_GENERIC_FILE_FORMAT
)
self.register_converter(RssConverter())
self.register_converter(WikipediaConverter())
self.register_converter(YouTubeConverter())
self.register_converter(BingSerpConverter())
self.register_converter(DocxConverter())
self.register_converter(XlsxConverter())
self.register_converter(XlsConverter())
self.register_converter(PptxConverter())
self.register_converter(AudioConverter())
self.register_converter(ImageConverter())
self.register_converter(IpynbConverter())
self.register_converter(PdfConverter())
self.register_converter(OutlookMsgConverter())
self.register_converter(EpubConverter())
self.register_converter(CsvConverter())
maybe_register(HtmlConverter(), PRIORITY_GENERIC_FILE_FORMAT)
maybe_register(RssConverter(), PRIORITY_SPECIFIC_FILE_FORMAT)
maybe_register(WikipediaConverter(), PRIORITY_SPECIFIC_FILE_FORMAT)
maybe_register(YouTubeConverter(), PRIORITY_SPECIFIC_FILE_FORMAT)
maybe_register(BingSerpConverter(), PRIORITY_SPECIFIC_FILE_FORMAT)
maybe_register(DocxConverter(), PRIORITY_SPECIFIC_FILE_FORMAT)
maybe_register(XlsxConverter(), PRIORITY_SPECIFIC_FILE_FORMAT)
maybe_register(XlsConverter(), PRIORITY_SPECIFIC_FILE_FORMAT)
maybe_register(PptxConverter(), PRIORITY_SPECIFIC_FILE_FORMAT)
maybe_register(AudioConverter(), PRIORITY_SPECIFIC_FILE_FORMAT)
maybe_register(ImageConverter(), PRIORITY_SPECIFIC_FILE_FORMAT)
maybe_register(IpynbConverter(), PRIORITY_SPECIFIC_FILE_FORMAT)
maybe_register(PdfConverter(), PRIORITY_SPECIFIC_FILE_FORMAT)
maybe_register(OutlookMsgConverter(), PRIORITY_SPECIFIC_FILE_FORMAT)
maybe_register(EpubConverter(), PRIORITY_SPECIFIC_FILE_FORMAT)
maybe_register(CsvConverter(), PRIORITY_SPECIFIC_FILE_FORMAT)

# Register Document Intelligence converter at the top of the stack if endpoint is provided
docintel_endpoint = kwargs.get("docintel_endpoint")
Expand Down
27 changes: 27 additions & 0 deletions packages/markitdown/tests/test_module_misc.py
Original file line number Diff line number Diff line change
Expand Up @@ -107,6 +107,10 @@ def validate_strings(result, expected_strings, exclude_strings=None):
assert string not in text_content


def registered_converter_names(markitdown):
    """Return the class names of the converters registered on *markitdown*.

    Reads ``markitdown._converters`` and, for each registration found there,
    takes the class name of its ``converter`` attribute. The names are returned
    as a list, in the same order the registrations are stored.
    """
    return [
        type(registration.converter).__name__
        for registration in markitdown._converters
    ]


def test_stream_info_operations() -> None:
"""Test operations performed on StreamInfo objects."""

Expand Down Expand Up @@ -456,6 +460,27 @@ def test_markitdown_llm_parameters() -> None:
assert messages[0]["content"][0]["text"] == test_prompt


def test_markitdown_disabled_converters() -> None:
    """Test that converters listed in ``disabled_converters`` are not registered."""
    markitdown = MarkItDown(
        disabled_converters=["ZipConverter", "AudioConverter"],
    )

    converter_names = registered_converter_names(markitdown)

    # The converters named in disabled_converters must be absent ...
    for disabled in ("ZipConverter", "AudioConverter"):
        assert disabled not in converter_names, f"{disabled} should be disabled"

    # ... while converters that were not named are still registered.
    for kept in ("PlainTextConverter", "PdfConverter"):
        assert kept in converter_names, f"{kept} should still be registered"


def test_enable_builtins_with_disabled_converters() -> None:
    """Test that ``enable_builtins()`` honors ``disabled_converters`` when called directly."""
    # Start with built-ins off so the explicit enable_builtins() call below is
    # the one that performs the registrations.
    markitdown = MarkItDown(enable_builtins=False)
    markitdown.enable_builtins(disabled_converters=["PdfConverter"])

    converter_names = registered_converter_names(markitdown)
    assert "PdfConverter" not in converter_names, "PdfConverter should be disabled"
    assert (
        "PlainTextConverter" in converter_names
    ), "PlainTextConverter should still be registered"


@pytest.mark.skipif(
skip_llm,
reason="do not run llm tests without a key",
Expand Down Expand Up @@ -496,6 +521,8 @@ def test_markitdown_llm() -> None:
test_doc_rlink,
test_markitdown_exiftool,
test_markitdown_llm_parameters,
test_markitdown_disabled_converters,
test_enable_builtins_with_disabled_converters,
test_markitdown_llm,
]:
print(f"Running {test.__name__}...", end="")
Expand Down