Extralit · priyankeshh · Sep 25, 2025 · Sep 30, 2025 · Sep 30, 2025 · Sep 30, 2025
@@ -0,0 +1,34 @@
+# Copyright 2024-present, Extralit Labs, Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import Any, Optional
+
+from pydantic import BaseModel
+
+
+class Block(BaseModel):
+    """
+    A single block inside a page of a document layout.
+
+    Every field has a default, so an instance can be built from partial data.
+    """
+
+    # Block category label; falls back to "unknown" when not supplied.
+    type: str = "unknown"
+    # Presumably the block's bounding box; element types and coordinate
+    # convention are not constrained here (list[Any]) — confirm with the producer.
+    bbox: list[Any] = []
+    # Text content of the block; empty string when not supplied.
+    content: str = ""
+    # Block identifier; empty string when not supplied.
+    id: str = ""
+    # Optional numeric score (presumably a confidence value — TODO confirm); None when absent.
+    score: Optional[float] = None
+
+
+class Page(BaseModel):
+    """A page of a layout: a page number plus the blocks found on it. Both fields are required."""
+
+    # Page number; whether numbering is 0- or 1-based is not established here — TODO confirm.
+    page: int
+    # Blocks belonging to this page.
+    blocks: list[Block]
+
+
+class Layout(BaseModel):
+    """Top-level layout container: the list of pages that make up a document."""
+
+    # Pages of the document; required.
+    pages: list[Page]
@@ -0,0 +1,39 @@
+# Copyright 2024-present, Extralit Labs, Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import Literal
+
+from pydantic import Field
+from pydantic_settings import BaseSettings, SettingsConfigDict
+
+
+class PDFOCRSettings(BaseSettings):
+    """
+    PDF OCR settings that can be configured via environment variables.
+
+    Each field is read from the environment variable formed by prefixing the
+    field name with ``OCR_`` (for example ``OCR_RUN_MODE`` or
+    ``OCR_MARKER_MODAL_BASE_URL``).
+    """
+
+    # env_prefix makes pydantic-settings look up "OCR_<FIELD_NAME>" for every field below.
+    model_config = SettingsConfigDict(env_prefix="OCR_")
+
+    # Run mode; only "marker" or "local" are accepted, defaulting to "local".
+    run_mode: Literal["marker", "local"] = "local"
+
+    # Unset (None) by default.
+    marker_modal_base_url: str | None = Field(default=None, description="Base URL for Modal-hosted Marker service")
+
+    # Defaults to 600 seconds.
+    marker_modal_timeout_secs: int = Field(
+        default=600, description="Timeout in seconds for requests to Modal-hosted Marker service"
+    )
+
+
+# Module-level settings instance, constructed once when this module is imported;
+# environment variables are therefore read at import time.
+settings = PDFOCRSettings()
@@ -0,0 +1,77 @@
+# Copyright 2024-present, Extralit Labs, Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from pathlib import Path
+from typing import Any, Optional
+
+import aiofiles
+import httpx
+from dotenv import load_dotenv
+
+from extralit_server.api.handlers.v1.models import client
+from extralit_server.contexts.document.layout import PDFOCRSettings
+
+# Load variables from a .env file into the process environment. This must run
+# before PDFOCRSettings() is constructed below so that OCR_* values defined in
+# the .env file are visible to it.
+load_dotenv()
+
+# Settings instance used by the functions in this module, built at import time.
+ocr_settings = PDFOCRSettings()
+
+
+def get_modal_base_url() -> str:
+    base_url = ocr_settings.marker_modal_base_url
+    if not base_url:
+        raise RuntimeError("OCR_MARKER_MODAL_BASE_URL is not set. Set it to your Modal endpoint URL.")
+    return base_url.rstrip("/")
+
+
+async def convert_document_via_modal(
+    pdf_path: Path,
+    output_format: str = "json",
+    page_range: Optional[str] = None,
+    force_ocr: bool = False,
+    paginate_output: bool = False,
+    use_llm: bool = False,
+    timeout: Optional[int] = None,
+    extra_headers: Optional[dict[str, str]] = None,
+) -> dict[str, Any]:
+    """
+    Calls the Modal-hosted Marker /convert endpoint and returns the JSON response.
+    """
+    base_url = get_modal_base_url()
+    url = f"{base_url}/convert"
+
+    if not pdf_path.exists():
+        raise FileNotFoundError(f"PDF file not found: {pdf_path}")
+
+    # httpx requires files as (name, file, content_type)
+    async with aiofiles.open(pdf_path, "rb") as f:
+        file_bytes = await f.read()
+    files = {"file": (pdf_path.name, file_bytes, "application/pdf")}
+    data = {
+        "output_format": output_format,
+        "page_range": page_range,
+        "force_ocr": str(bool(force_ocr)).lower(),
+        "paginate_output": str(bool(paginate_output)).lower(),
+        "use_llm": str(bool(use_llm)).lower(),
+    }
+    data = {k: v for k, v in data.items() if v not in (None, "", "none", "null")}
+
+    headers = extra_headers or {}
+    t = timeout if timeout is not None else ocr_settings.marker_modal_timeout_secs
+    try:
+        resp = await client.post(url, files=files, data=data, headers=headers, timeout=t)
+        resp.raise_for_status()
+    except httpx.HTTPStatusError as e:
+        raise RuntimeError(f"Modal Marker conversion failed: {e}; body={resp.text[:1000]}") from e
+    return resp.json()