Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
30 changes: 27 additions & 3 deletions packages/markitdown/src/markitdown/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -119,6 +119,20 @@ def main():
help="Comma-separated list of file types to route to Content Understanding (e.g., pdf,jpeg,mp4). If omitted, all supported types are routed.",
)

parser.add_argument(
"--docintel-model-id",
type=str,
default=None,
help="Document Intelligence model ID (e.g., 'prebuilt-layout', 'prebuilt-invoice', or a custom model ID). Defaults to 'prebuilt-layout'.",
)

parser.add_argument(
"--docintel-query-fields",
type=str,
default=None,
help="Comma-separated list of field names to extract via the Document Intelligence queryFields add-on (OCR file types only).",
)

parser.add_argument(
"-p",
"--use-plugins",
Expand Down Expand Up @@ -208,9 +222,19 @@ def main():
elif args.filename is None:
_exit_with_error("Filename is required when using Document Intelligence.")

markitdown = MarkItDown(
enable_plugins=args.use_plugins, docintel_endpoint=args.endpoint
)
docintel_kwargs: Dict[str, Any] = {
"docintel_endpoint": args.endpoint,
}
if args.docintel_model_id:
docintel_kwargs["docintel_model_id"] = args.docintel_model_id
if args.docintel_query_fields:
fields = [
f.strip() for f in args.docintel_query_fields.split(",") if f.strip()
]
if fields:
docintel_kwargs["docintel_query_fields"] = fields

markitdown = MarkItDown(enable_plugins=args.use_plugins, **docintel_kwargs)
elif args.use_cu:
if args.cu_endpoint is None:
_exit_with_error(
Expand Down
8 changes: 8 additions & 0 deletions packages/markitdown/src/markitdown/_markitdown.py
Original file line number Diff line number Diff line change
Expand Up @@ -222,6 +222,14 @@ def enable_builtins(self, **kwargs) -> None:
if docintel_version is not None:
docintel_args["api_version"] = docintel_version

docintel_model_id = kwargs.get("docintel_model_id")
if docintel_model_id is not None:
docintel_args["model_id"] = docintel_model_id

docintel_query_fields = kwargs.get("docintel_query_fields")
if docintel_query_fields is not None:
docintel_args["query_fields"] = docintel_query_fields

self.register_converter(
DocumentIntelligenceConverter(**docintel_args),
)
Expand Down
209 changes: 203 additions & 6 deletions packages/markitdown/src/markitdown/converters/_doc_intel_converter.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,16 @@
import sys
import re
import os
from typing import BinaryIO, Any, List
from datetime import date, datetime, time
from typing import BinaryIO, Any, List, Optional
from enum import Enum

from .._base_converter import DocumentConverter, DocumentConverterResult
from .._stream_info import StreamInfo
from .._exceptions import MissingDependencyException
from .. import __version__ as _markitdown_version

_USER_AGENT = f"markitdown-docintel/{_markitdown_version}"

# Try loading optional (but in this case, required) dependencies
# Save reporting of any exceptions for later
Expand Down Expand Up @@ -127,15 +131,181 @@ def _get_file_extensions(types: List[DocumentIntelligenceFileType]) -> List[str]
return extensions


def _field_value(field: Any) -> Any:
"""
Extract a serializable Python value from a Document Intelligence DocumentField.

Returns the most specific typed value when available, falling back to the
raw ``content`` string. Returns ``None`` when nothing usable is present.
"""
if field is None:
return None

# Typed scalar values (in rough order of specificity).
for attr in (
"value_string",
"value_boolean",
"value_integer",
"value_number",
"value_date",
"value_time",
"value_phone_number",
"value_country_region",
"value_selection_mark",
"value_signature",
):
v = getattr(field, attr, None)
if v is not None:
if isinstance(v, (date, datetime, time)):
return v.isoformat()
return v

# Currency: { amount, currencySymbol, currencyCode }
cur = getattr(field, "value_currency", None)
if cur is not None:
amount = getattr(cur, "amount", None)
code = getattr(cur, "currency_code", None) or getattr(
cur, "currency_symbol", None
)
if amount is not None and code:
return f"{amount} {code}"
if amount is not None:
return amount

# Address: serialize to its content/string form.
addr = getattr(field, "value_address", None)
if addr is not None:
return getattr(field, "content", None) or str(addr)

# Array of fields -> list of values.
arr = getattr(field, "value_array", None)
if arr is not None:
return [_field_value(item) for item in arr]

# Object of fields -> dict of values.
obj = getattr(field, "value_object", None)
if obj is not None:
return {k: _field_value(v) for k, v in obj.items()}

# Last resort: the raw extracted text.
return getattr(field, "content", None)


def _yaml_scalar(value: Any) -> str:
"""Render a scalar value as a YAML string."""
if value is None:
return "null"
if isinstance(value, bool):
return "true" if value else "false"
if isinstance(value, (int, float)):
return repr(value)
s = str(value)
# Quote when necessary: contains special chars, leading/trailing whitespace,
# or characters that would confuse a YAML parser.
if (
s == ""
or s != s.strip()
or any(c in s for c in ":#&*!|>'\"%@`\n\r\t")
or s.lower() in ("null", "true", "false", "yes", "no", "~")
):
# Escape backslashes and double quotes; collapse newlines.
escaped = (
s.replace("\\", "\\\\")
.replace('"', '\\"')
.replace("\n", "\\n")
.replace("\r", "\\r")
.replace("\t", "\\t")
)
return f'"{escaped}"'
return s


def _yaml_dump(value: Any, indent: int = 0) -> str:
"""Minimal YAML emitter for scalars, lists, and dicts of scalars/lists/dicts."""
pad = " " * indent
if isinstance(value, dict):
if not value:
return f"{pad}{{}}"
lines: List[str] = []
for k, v in value.items():
key = _yaml_scalar(k)
if isinstance(v, (dict, list)) and v:
lines.append(f"{pad}{key}:")
lines.append(_yaml_dump(v, indent + 1))
else:
lines.append(
f"{pad}{key}: {_yaml_scalar(v) if not isinstance(v, (dict, list)) else ('{}' if isinstance(v, dict) else '[]')}"
)
return "\n".join(lines)
if isinstance(value, list):
if not value:
return f"{pad}[]"
lines = []
for item in value:
if isinstance(item, (dict, list)) and item:
lines.append(f"{pad}-")
lines.append(_yaml_dump(item, indent + 1))
else:
lines.append(
f"{pad}- {_yaml_scalar(item) if not isinstance(item, (dict, list)) else ('{}' if isinstance(item, dict) else '[]')}"
)
return "\n".join(lines)
return f"{pad}{_yaml_scalar(value)}"


def _fields_to_front_matter(documents: Any, model_id: Optional[str] = None) -> str:
"""
Build a YAML front matter block from ``AnalyzeResult.documents[*].fields``.

Returns an empty string when there are no documents or no non-empty fields.
Multiple documents are merged into a single ``fields`` mapping; on duplicate
keys, the value from the later document wins.

The shape mirrors the Content Understanding converter's front matter so that
downstream consumers (e.g., LLM pipelines) can parse both uniformly:

---
modelId: prebuilt-invoice
fields:
VendorName: Contoso Ltd.
InvoiceTotal: 1250.0
---
"""
if not documents:
return ""

merged: dict = {}
for doc in documents:
fields = getattr(doc, "fields", None) or {}
for name, field in fields.items():
value = _field_value(field)
if value is None or value == "" or value == [] or value == {}:
continue
merged[name] = value

if not merged:
return ""

payload: dict = {}
if model_id:
payload["modelId"] = model_id
payload["fields"] = merged

body = _yaml_dump(payload)
return f"---\n{body}\n---\n\n"


class DocumentIntelligenceConverter(DocumentConverter):
"""Specialized DocumentConverter that uses Document Intelligence to extract text from documents."""

def __init__(
self,
*,
endpoint: str,
api_version: str = "2024-07-31-preview",
api_version: str = "2024-11-30",
credential: AzureKeyCredential | TokenCredential | None = None,
model_id: str = "prebuilt-layout",
query_fields: Optional[List[str]] = None,
file_types: List[DocumentIntelligenceFileType] = [
DocumentIntelligenceFileType.DOCX,
DocumentIntelligenceFileType.PPTX,
Expand All @@ -152,13 +322,19 @@ def __init__(

Args:
endpoint (str): The endpoint for the Document Intelligence service.
api_version (str): The API version to use. Defaults to "2024-07-31-preview".
api_version (str): The API version to use. Defaults to "2024-11-30" (GA).
credential (AzureKeyCredential | TokenCredential | None): The credential to use for authentication.
model_id (str): The Document Intelligence model ID to use (e.g., "prebuilt-layout",
"prebuilt-invoice", "prebuilt-receipt", or a custom model ID). Defaults to "prebuilt-layout".
query_fields (List[str] | None): Optional list of field names to extract via the DI
``queryFields`` add-on. Only applied to OCR-supported file types (PDF/images).
file_types (List[DocumentIntelligenceFileType]): The file types to accept. Defaults to all supported file types.
"""

super().__init__()
self._file_types = file_types
self._model_id = model_id
self._query_fields = list(query_fields) if query_fields else None

# Raise an error if the dependencies are not available.
# This is different than other converters since this one isn't even instantiated
Expand All @@ -184,6 +360,7 @@ def __init__(
endpoint=self.endpoint,
api_version=self.api_version,
credential=credential,
user_agent=_USER_AGENT,
)

def accepts(
Expand Down Expand Up @@ -228,27 +405,47 @@ def _analysis_features(self, stream_info: StreamInfo) -> List[str]:
if mimetype.startswith(prefix):
return []

return [
features = [
DocumentAnalysisFeature.FORMULAS, # enable formula extraction
DocumentAnalysisFeature.OCR_HIGH_RESOLUTION, # enable high resolution OCR
DocumentAnalysisFeature.STYLE_FONT, # enable font style extraction
]
if self._query_fields:
features.append(DocumentAnalysisFeature.QUERY_FIELDS)
return features

def convert(
    self,
    file_stream: BinaryIO,
    stream_info: StreamInfo,
    **kwargs: Any,  # Options to pass to the converter
) -> DocumentConverterResult:
    """
    Analyze ``file_stream`` with Document Intelligence and return Markdown.

    The whole stream is read and sent to the configured model
    (``self._model_id``) with the analysis features selected for this
    ``stream_info``. HTML comments are stripped from the returned content,
    and when the result carries structured document fields a YAML front
    matter block is prepended to the Markdown.

    Args:
        file_stream: Binary stream holding the document to analyze.
        stream_info: Stream metadata used to choose the analysis features.
        **kwargs: Accepted for interface compatibility; not used here.

    Returns:
        A ``DocumentConverterResult`` whose ``markdown`` is the (optional)
        front matter followed by the cleaned content.
    """
    # Compute the feature list once and reuse it, so that query_fields is
    # only sent when the QUERY_FIELDS feature is enabled for this file type.
    features = self._analysis_features(stream_info)
    extra: dict = {}
    if self._query_fields and DocumentAnalysisFeature.QUERY_FIELDS in features:
        extra["query_fields"] = self._query_fields

    # Extract the text using Azure Document Intelligence. Each keyword is
    # passed exactly once (a repeated keyword argument is a SyntaxError).
    poller = self.doc_intel_client.begin_analyze_document(
        model_id=self._model_id,
        body=AnalyzeDocumentRequest(bytes_source=file_stream.read()),
        features=features,
        output_content_format=CONTENT_FORMAT,  # TODO: replace with "ContentFormat.MARKDOWN" when the bug is fixed
        **extra,
    )
    result: AnalyzeResult = poller.result()

    # Remove HTML comments from the Markdown generated by Document
    # Intelligence. Defensive: a missing content is treated as empty text
    # rather than letting re.sub raise a TypeError.
    markdown_text = re.sub(
        r"<!--.*?-->", "", result.content or "", flags=re.DOTALL
    )

    # Prepend YAML front matter when DI returned structured fields (e.g., from
    # prebuilt-invoice/-receipt, custom models, or queryFields).
    front_matter = _fields_to_front_matter(
        getattr(result, "documents", None), model_id=self._model_id
    )
    if front_matter:
        markdown_text = front_matter + markdown_text

    return DocumentConverterResult(markdown=markdown_text)
Loading
Loading