microsoft · chienyuanchang · May 8, 2026 · May 26, 2026 · May 26, 2026
diff --git a/packages/markitdown/src/markitdown/__main__.py b/packages/markitdown/src/markitdown/__main__.py
@@ -119,6 +119,20 @@ def main():
         help="Comma-separated list of file types to route to Content Understanding (e.g., pdf,jpeg,mp4). If omitted, all supported types are routed.",
     )
 
+    parser.add_argument(
+        "--docintel-model-id",
+        type=str,
+        default=None,
+        help="Document Intelligence model ID (e.g., 'prebuilt-layout', 'prebuilt-invoice', or a custom model ID). Defaults to 'prebuilt-layout'.",
+    )
+
+    parser.add_argument(
+        "--docintel-query-fields",
+        type=str,
+        default=None,
+        help="Comma-separated list of field names to extract via the Document Intelligence queryFields add-on (OCR file types only).",
+    )
+
     parser.add_argument(
         "-p",
         "--use-plugins",
@@ -208,9 +222,19 @@ def main():
         elif args.filename is None:
             _exit_with_error("Filename is required when using Document Intelligence.")
 
-        markitdown = MarkItDown(
-            enable_plugins=args.use_plugins, docintel_endpoint=args.endpoint
-        )
+        docintel_kwargs: Dict[str, Any] = {
+            "docintel_endpoint": args.endpoint,
+        }
+        if args.docintel_model_id:
+            docintel_kwargs["docintel_model_id"] = args.docintel_model_id
+        if args.docintel_query_fields:
+            fields = [
+                f.strip() for f in args.docintel_query_fields.split(",") if f.strip()
+            ]
+            if fields:
+                docintel_kwargs["docintel_query_fields"] = fields
+
+        markitdown = MarkItDown(enable_plugins=args.use_plugins, **docintel_kwargs)
     elif args.use_cu:
         if args.cu_endpoint is None:
             _exit_with_error(

diff --git a/packages/markitdown/src/markitdown/_markitdown.py b/packages/markitdown/src/markitdown/_markitdown.py
@@ -222,6 +222,14 @@ def enable_builtins(self, **kwargs) -> None:
                 if docintel_version is not None:
                     docintel_args["api_version"] = docintel_version
 
+                docintel_model_id = kwargs.get("docintel_model_id")
+                if docintel_model_id is not None:
+                    docintel_args["model_id"] = docintel_model_id
+
+                docintel_query_fields = kwargs.get("docintel_query_fields")
+                if docintel_query_fields is not None:
+                    docintel_args["query_fields"] = docintel_query_fields
+
                 self.register_converter(
                     DocumentIntelligenceConverter(**docintel_args),
                 )

diff --git a/packages/markitdown/src/markitdown/converters/_doc_intel_converter.py b/packages/markitdown/src/markitdown/converters/_doc_intel_converter.py
@@ -1,12 +1,16 @@
 import sys
 import re
 import os
-from typing import BinaryIO, Any, List
+from datetime import date, datetime, time
+from typing import BinaryIO, Any, List, Optional
 from enum import Enum
 
 from .._base_converter import DocumentConverter, DocumentConverterResult
 from .._stream_info import StreamInfo
 from .._exceptions import MissingDependencyException
+from .. import __version__ as _markitdown_version
+
+_USER_AGENT = f"markitdown-docintel/{_markitdown_version}"
 
 # Try loading optional (but in this case, required) dependencies
 # Save reporting of any exceptions for later
@@ -127,15 +131,181 @@ def _get_file_extensions(types: List[DocumentIntelligenceFileType]) -> List[str]
     return extensions
 
 
+def _field_value(field: Any) -> Any:
+    """
+    Convert a Document Intelligence ``DocumentField`` into a plain Python value.
+
+    Typed values are probed in a fixed order and the first one that is set wins:
+    the scalar ``value_*`` attributes, then currency, address, array and object.
+    Dates and times are returned as ISO 8601 strings; arrays and objects are
+    converted recursively into lists and dicts. When no typed value is set the
+    field's ``content`` text is returned, or ``None`` if that is missing as well.
+    """
+    if field is None:
+        return None
+
+    scalar_attrs = (
+        "value_string",
+        "value_boolean",
+        "value_integer",
+        "value_number",
+        "value_date",
+        "value_time",
+        "value_phone_number",
+        "value_country_region",
+        "value_selection_mark",
+        "value_signature",
+    )
+    for name in scalar_attrs:
+        scalar = getattr(field, name, None)
+        if scalar is None:
+            continue
+        # date/datetime/time objects are emitted as ISO 8601 text instead.
+        if isinstance(scalar, (date, datetime, time)):
+            return scalar.isoformat()
+        return scalar
+
+    # Currency: "<amount> <code or symbol>", or the bare amount when neither is set.
+    money = getattr(field, "value_currency", None)
+    if money is not None:
+        amount = getattr(money, "amount", None)
+        unit = getattr(money, "currency_code", None)
+        unit = unit or getattr(money, "currency_symbol", None)
+        if amount is not None:
+            return f"{amount} {unit}" if unit else amount
+
+    # Address: use the field's extracted text, else the address object's str().
+    address = getattr(field, "value_address", None)
+    if address is not None:
+        return getattr(field, "content", None) or str(address)
+
+    items = getattr(field, "value_array", None)
+    if items is not None:
+        return [_field_value(entry) for entry in items]
+
+    members = getattr(field, "value_object", None)
+    if members is not None:
+        return {key: _field_value(member) for key, member in members.items()}
+
+    # Nothing typed was set: fall back to the raw extracted text (may be None).
+    return getattr(field, "content", None)
+
+
+def _yaml_scalar(value: Any) -> str:
+    """Render a scalar as YAML, double-quoting strings a parser would misread.
+
+    Quoted: special characters, number/bool/null look-alikes, leading indicators.
+    """
+    if value is None:
+        return "null"
+    if isinstance(value, bool):  # checked before int because bool subclasses int
+        return "true" if value else "false"
+    if isinstance(value, (int, float)):
+        return repr(value)
+    s = str(value)
+    numeric = s[:2].lower() in ("0x", "0o", "0b")  # hex/octal/binary-style prefix
+    try:
+        float(s)  # e.g. "00123" or "+14255550100" would otherwise load as numbers
+        numeric = True
+    except ValueError:
+        pass
+    if (
+        s == "" or s != s.strip() or numeric or s[0] in "-?[]{},"
+        or any(c in s for c in ":#&*!|>'\"%@`\n\r\t")
+        or s.lower() in ("null", "true", "false", "yes", "no", "on", "off", "~")
+    ):
+        escaped = s.replace("\\", "\\\\").replace('"', '\\"')
+        escaped = escaped.replace("\n", "\\n").replace("\r", "\\r").replace("\t", "\\t")
+        return f'"{escaped}"'
+    return s
+
+
+def _yaml_dump(value: Any, indent: int = 0) -> str:
+    """Emit ``value`` as block-style YAML, indented two spaces per ``indent`` level.
+
+    Handles scalars plus arbitrarily nested dicts and lists. Empty containers are
+    written inline as ``{}`` / ``[]``; non-empty ones go on the following lines.
+    """
+    prefix = "  " * indent
+    if not isinstance(value, (dict, list)):
+        return f"{prefix}{_yaml_scalar(value)}"
+    if not value:
+        return prefix + ("{}" if isinstance(value, dict) else "[]")
+
+    if isinstance(value, dict):
+        # Each entry is (text that opens the line, child value).
+        entries = [(f"{_yaml_scalar(name)}:", child) for name, child in value.items()]
+    else:
+        entries = [("-", child) for child in value]
+
+    out: List[str] = []
+    for head, child in entries:
+        if isinstance(child, (dict, list)):
+            if child:
+                # Non-empty container: header line, then the nested block.
+                out.append(f"{prefix}{head}")
+                out.append(_yaml_dump(child, indent + 1))
+                continue
+            inline = "{}" if isinstance(child, dict) else "[]"
+        else:
+            inline = _yaml_scalar(child)
+        out.append(f"{prefix}{head} {inline}")
+    return "\n".join(out)
+
+
+def _fields_to_front_matter(documents: Any, model_id: Optional[str] = None) -> str:
+    """
+    Render the extracted fields of analyzed documents as YAML front matter.
+
+    ``documents`` is meant to be ``AnalyzeResult.documents``. Each document's
+    ``fields`` mapping is read, and all of them are flattened into one ``fields``
+    mapping; when a name occurs in several documents the last one wins. Fields
+    whose value comes out as ``None``, ``""``, ``[]`` or ``{}`` are omitted.
+
+    Returns ``""`` if there are no documents or no field is left after that
+    filtering. Otherwise returns the block below followed by a blank line, with
+    ``modelId`` included only when ``model_id`` is truthy. The layout is intended
+    to match the Content Understanding converter's front matter so downstream
+    consumers can parse both the same way:
+
+        ---
+        modelId: prebuilt-invoice
+        fields:
+          VendorName: Contoso Ltd.
+          InvoiceTotal: 1250.0
+        ---
+    """
+    collected: dict = {}
+    for document in documents or ():
+        per_document = getattr(document, "fields", None) or {}
+        for field_name, raw in per_document.items():
+            extracted = _field_value(raw)
+            # Skip empty values; 0 and False are meaningful and are kept.
+            if extracted is not None and extracted not in ("", [], {}):
+                collected[field_name] = extracted
+
+    if not collected:
+        return ""
+
+    # Insert modelId first so it precedes "fields" in the emitted YAML.
+    front: dict = {"modelId": model_id} if model_id else {}
+    front["fields"] = collected
+
+    rendered = _yaml_dump(front)
+    return f"---\n{rendered}\n---\n\n"
+
+
 class DocumentIntelligenceConverter(DocumentConverter):
     """Specialized DocumentConverter that uses Document Intelligence to extract text from documents."""
 
     def __init__(
         self,
         *,
         endpoint: str,
-        api_version: str = "2024-07-31-preview",
+        api_version: str = "2024-11-30",
         credential: AzureKeyCredential | TokenCredential | None = None,
+        model_id: str = "prebuilt-layout",
+        query_fields: Optional[List[str]] = None,
         file_types: List[DocumentIntelligenceFileType] = [
             DocumentIntelligenceFileType.DOCX,
             DocumentIntelligenceFileType.PPTX,
@@ -152,13 +322,19 @@ def __init__(
 
         Args:
             endpoint (str): The endpoint for the Document Intelligence service.
-            api_version (str): The API version to use. Defaults to "2024-07-31-preview".
+            api_version (str): The API version to use. Defaults to "2024-11-30" (GA).
             credential (AzureKeyCredential | TokenCredential | None): The credential to use for authentication.
+            model_id (str): The Document Intelligence model ID to use (e.g., "prebuilt-layout",
+                "prebuilt-invoice", "prebuilt-receipt", or a custom model ID). Defaults to "prebuilt-layout".
+            query_fields (List[str] | None): Optional list of field names to extract via the DI
+                ``queryFields`` add-on. Only applied to OCR-supported file types (PDF/images).
             file_types (List[DocumentIntelligenceFileType]): The file types to accept. Defaults to all supported file types.
         """
 
         super().__init__()
         self._file_types = file_types
+        self._model_id = model_id
+        self._query_fields = list(query_fields) if query_fields else None
 
         # Raise an error if the dependencies are not available.
         # This is different than other converters since this one isn't even instantiated
@@ -184,6 +360,7 @@ def __init__(
             endpoint=self.endpoint,
             api_version=self.api_version,
             credential=credential,
+            user_agent=_USER_AGENT,
         )
 
     def accepts(
@@ -228,27 +405,47 @@ def _analysis_features(self, stream_info: StreamInfo) -> List[str]:
             if mimetype.startswith(prefix):
                 return []
 
-        return [
+        features = [
             DocumentAnalysisFeature.FORMULAS,  # enable formula extraction
             DocumentAnalysisFeature.OCR_HIGH_RESOLUTION,  # enable high resolution OCR
             DocumentAnalysisFeature.STYLE_FONT,  # enable font style extraction
         ]
+        if self._query_fields:
+            features.append(DocumentAnalysisFeature.QUERY_FIELDS)
+        return features
 
     def convert(
         self,
         file_stream: BinaryIO,
         stream_info: StreamInfo,
         **kwargs: Any,  # Options to pass to the converter
     ) -> DocumentConverterResult:
+        # Build optional kwargs so that we only pass query_fields when the
+        # QUERY_FIELDS feature is actually enabled for this file type.
+        features = self._analysis_features(stream_info)
+        extra: dict = {}
+        if self._query_fields and DocumentAnalysisFeature.QUERY_FIELDS in features:
+            extra["query_fields"] = self._query_fields
+
         # Extract the text using Azure Document Intelligence
         poller = self.doc_intel_client.begin_analyze_document(
-            model_id="prebuilt-layout",
+            model_id=self._model_id,
             body=AnalyzeDocumentRequest(bytes_source=file_stream.read()),
-            features=self._analysis_features(stream_info),
+            features=features,
             output_content_format=CONTENT_FORMAT,  # TODO: replace with "ContentFormat.MARKDOWN" when the bug is fixed
+            **extra,
         )
         result: AnalyzeResult = poller.result()
 
         # remove comments from the markdown content generated by Doc Intelligence and append to markdown string
         markdown_text = re.sub(r"<!--.*?-->", "", result.content, flags=re.DOTALL)
+
+        # Prepend YAML front matter when DI returned structured fields (e.g., from
+        # prebuilt-invoice/-receipt, custom models, or queryFields).
+        front_matter = _fields_to_front_matter(
+            getattr(result, "documents", None), model_id=self._model_id
+        )
+        if front_matter:
+            markdown_text = front_matter + markdown_text
+
         return DocumentConverterResult(markdown=markdown_text)