Skip to content
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
# Copyright 2024-present, Extralit Labs, Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from typing import Any, Optional

from pydantic import BaseModel


class Block(BaseModel):
    # One block of a page layout. Every field has a default, so a Block can
    # be built from partial data.

    # Block category label; "unknown" when none is supplied.
    type: str = "unknown"
    # Bounding-box values for the block; element type is unconstrained (Any).
    bbox: list[Any] = []
    # Text content of the block.
    content: str = ""
    # Identifier of the block.
    id: str = ""
    # Optional score attached to the block; None when absent.
    score: Optional[float] = None


class Page(BaseModel):
    # A single page of a layout: a page number plus its blocks.
    # Both fields are required.

    # Page number (indexing base is not established here).
    page: int
    # Blocks belonging to this page.
    blocks: list[Block]


class Layout(BaseModel):
    # Top-level layout container: the list of pages (required).
    pages: list[Page]
39 changes: 39 additions & 0 deletions extralit-server/src/extralit_server/contexts/document/layout.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
# Copyright 2024-present, Extralit Labs, Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from typing import Literal

from pydantic import Field
from pydantic_settings import BaseSettings, SettingsConfigDict


class PDFOCRSettings(BaseSettings):
    """
    PDF OCR settings that can be configured via environment variables.

    All settings have the OCR_ prefix.
    """

    # Environment variable names are the field names prefixed with "OCR_".
    model_config = SettingsConfigDict(env_prefix="OCR_")

    # Which OCR backend to use; restricted to the two literal values.
    run_mode: Literal["marker", "local"] = "local"

    # Endpoint of the remote Marker service; None means "not configured".
    marker_modal_base_url: str | None = Field(
        default=None,
        description="Base URL for Modal-hosted Marker service",
    )

    # Default request timeout (seconds) for calls to the remote Marker service.
    marker_modal_timeout_secs: int = Field(
        default=600,
        description="Timeout in seconds for requests to Modal-hosted Marker service",
    )


# Module-level settings instance, constructed once when this module is imported.
settings = PDFOCRSettings()
Original file line number Diff line number Diff line change
@@ -0,0 +1,77 @@
# Copyright 2024-present, Extralit Labs, Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from pathlib import Path
from typing import Any, Optional

import aiofiles
import httpx
from dotenv import load_dotenv

from extralit_server.api.handlers.v1.models import client
from extralit_server.contexts.document.layout import PDFOCRSettings

# Load variables from a .env file (when one is found) into the process
# environment. This runs before the settings object below is constructed so
# that values defined in the .env file are visible to it.
load_dotenv()

# Module-level OCR settings instance read by the helpers in this module.
ocr_settings = PDFOCRSettings()


def get_modal_base_url() -> str:
    """
    Return the configured Marker base URL with trailing slashes removed.

    Raises:
        RuntimeError: If ``ocr_settings.marker_modal_base_url`` is unset or empty.
    """
    configured_url = ocr_settings.marker_modal_base_url
    if configured_url:
        # Strip trailing "/" so callers can append a path starting with "/".
        return configured_url.rstrip("/")
    raise RuntimeError("OCR_MARKER_MODAL_BASE_URL is not set. Set it to your Modal endpoint URL.")


async def convert_document_via_modal(
    pdf_path: Path,
    output_format: str = "json",
    page_range: Optional[str] = None,
    force_ocr: bool = False,
    paginate_output: bool = False,
    use_llm: bool = False,
    timeout: Optional[int] = None,
    extra_headers: Optional[dict[str, str]] = None,
) -> dict[str, Any]:
    """
    Upload a PDF to the Modal-hosted Marker /convert endpoint and return the decoded JSON body.

    Args:
        pdf_path: Path of the PDF to upload. The file is read fully into memory.
        output_format: Sent as the "output_format" form field.
        page_range: Sent as the "page_range" form field; omitted when None.
        force_ocr: Sent as the "force_ocr" form field ("true" / "false").
        paginate_output: Sent as the "paginate_output" form field ("true" / "false").
        use_llm: Sent as the "use_llm" form field ("true" / "false").
        timeout: Request timeout handed to the HTTP client. When None, falls
            back to ``ocr_settings.marker_modal_timeout_secs``.
        extra_headers: Headers to send with the request; none when omitted.

    Returns:
        The response body parsed as JSON.

    Raises:
        RuntimeError: If the base URL is not configured, or the service
            answers with an error status (the first 1000 characters of the
            response body are included in the message).
        FileNotFoundError: If ``pdf_path`` does not exist.
    """
    # Resolve the endpoint first so a missing configuration is reported
    # before the file system is touched.
    endpoint = get_modal_base_url() + "/convert"

    if not pdf_path.exists():
        raise FileNotFoundError(f"PDF file not found: {pdf_path}")

    async with aiofiles.open(pdf_path, "rb") as pdf_file:
        pdf_bytes = await pdf_file.read()

    # Multipart upload entry: (filename, content, content type).
    upload = {"file": (pdf_path.name, pdf_bytes, "application/pdf")}

    candidate_fields = {
        "output_format": output_format,
        "page_range": page_range,
        "force_ocr": "true" if force_ocr else "false",
        "paginate_output": "true" if paginate_output else "false",
        "use_llm": "true" if use_llm else "false",
    }
    # Fields whose value is one of these placeholders are not sent at all.
    placeholders = (None, "", "none", "null")
    form_fields = {}
    for field_name, field_value in candidate_fields.items():
        if field_value not in placeholders:
            form_fields[field_name] = field_value

    request_headers = extra_headers if extra_headers else {}
    if timeout is None:
        request_timeout = ocr_settings.marker_modal_timeout_secs
    else:
        request_timeout = timeout

    try:
        resp = await client.post(
            endpoint,
            files=upload,
            data=form_fields,
            headers=request_headers,
            timeout=request_timeout,
        )
        resp.raise_for_status()
    except httpx.HTTPStatusError as e:
        raise RuntimeError(f"Modal Marker conversion failed: {e}; body={resp.text[:1000]}") from e
    return resp.json()
Loading
Loading