Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion docs/docs/extraction/extraction-charts-infographics.md
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
# Charts and infographics

Charts and infographic regions are classified as graphic elements and processed with the corresponding NVIDIA NIM workflows (for example, **yolox-graphic-elements** in current releases). Outputs use the same metadata schema as other extracted objects.
Chart and infographic regions are detected by page-element detection and processed by NeMo Retriever Library's OCR workflow. Outputs use the same metadata schema as other extracted objects.

**Related**

Expand Down
2 changes: 0 additions & 2 deletions nemo_retriever/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -151,7 +151,6 @@ python -m nemo_retriever.examples.graph_pipeline \
/your-example-dir \
--lancedb-uri lancedb \
--page-elements-invoke-url https://ai.api.nvidia.com/v1/cv/nvidia/nemotron-page-elements-v3 \
--graphic-elements-invoke-url https://ai.api.nvidia.com/v1/cv/nvidia/nemotron-graphic-elements-v1 \
--ocr-invoke-url https://ai.api.nvidia.com/v1/cv/nvidia/nemoretriever-ocr-v1 \
--ocr-version v1 \
--table-structure-invoke-url https://ai.api.nvidia.com/v1/cv/nvidia/nemotron-table-structure-v1 \
Expand Down Expand Up @@ -493,7 +492,6 @@ ingestor = (
.extract(
# for self hosted NIMs, your URLs will depend on your NIM container DNS settings
page_elements_invoke_url="https://ai.api.nvidia.com/v1/cv/nvidia/nemotron-page-elements-v3",
graphic_elements_invoke_url="https://ai.api.nvidia.com/v1/cv/nvidia/nemotron-graphic-elements-v1",
ocr_invoke_url="https://ai.api.nvidia.com/v1/cv/nvidia/nemoretriever-ocr-v1",
table_structure_invoke_url="https://ai.api.nvidia.com/v1/cv/nvidia/nemotron-table-structure-v1"
)
Expand Down
2 changes: 0 additions & 2 deletions nemo_retriever/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -82,7 +82,6 @@ local = [
"timm==1.0.22",
"albumentations==2.0.8",
"nemotron-page-elements-v3>=0.dev0",
"nemotron-graphic-elements-v1>=0.dev0",
"nemotron-table-structure-v1>=0.dev0",
# Stay on the 1.0.2 OCR dev train and exclude older PyPI finals.
"nemotron-ocr>=1.0.2.dev0,<1.0.2a0; sys_platform == 'linux'",
Expand Down Expand Up @@ -150,7 +149,6 @@ version = {attr = "nemo_retriever.version.get_build_version"}
nv-ingest-api = { path = "../api/", editable = true }
nv-ingest-client = { path = "../client/", editable = true }
nemotron-page-elements-v3 = { index = "test-pypi" }
nemotron-graphic-elements-v1 = { index = "test-pypi" }
nemotron-table-structure-v1 = { index = "test-pypi" }
nemotron-ocr = { index = "test-pypi" }
# On Linux, resolve torch/torchvision from the CUDA wheel index.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,6 @@ def stage_names_from_flags(
use_table_structure: bool = False,
table_output_format: str = "pseudo_markdown",
extract_charts: bool = False,
use_graphic_elements: bool = False,
embed_text: bool = False,
) -> Iterable[str]:
validate_table_structure_flags(use_table_structure, table_output_format)
Expand All @@ -62,9 +61,7 @@ def stage_names_from_flags(
yield "enrich_table_structure"
elif extract_tables:
yield "enrich_table"
if extract_charts and use_graphic_elements:
yield "enrich_graphic_elements"
elif extract_charts:
if extract_charts:
yield "enrich_chart"
if embed_text:
yield "embed_text"
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,6 @@

import pandas as pd

from nemo_retriever.chart.chart_detection import graphic_elements_ocr_page_elements
from nemo_retriever.chart.processor import extract_chart_data_from_primitives_df
from nemo_retriever.infographic.processor import extract_infographic_data_from_primitives_df
from nemo_retriever.table.processor import extract_table_data_from_primitives_df
Expand All @@ -24,19 +23,12 @@ def _enrich_table_structure(df: pd.DataFrame, **kwargs: Any) -> tuple[pd.DataFra
return out_df, {}


def _enrich_graphic_elements(df: pd.DataFrame, **kwargs: Any) -> tuple[pd.DataFrame, Dict[str, Any]]:
"""Thin wrapper so ``graphic_elements_ocr_page_elements`` matches the ``(df, info)`` contract."""
out_df = graphic_elements_ocr_page_elements(df, **kwargs)
return out_df, {}


# Registry can be extended by future package integrations without changing
# pipeline orchestration internals.
STAGE_REGISTRY: Dict[str, StageHandler] = {
"enrich_infographic": extract_infographic_data_from_primitives_df,
"enrich_table": extract_table_data_from_primitives_df,
"enrich_table_structure": _enrich_table_structure,
"enrich_chart": extract_chart_data_from_primitives_df,
"enrich_graphic_elements": _enrich_graphic_elements,
"embed_text": embed_text_from_primitives_df,
}
61 changes: 0 additions & 61 deletions nemo_retriever/src/nemo_retriever/chart/chart_detection.py

This file was deleted.

61 changes: 0 additions & 61 deletions nemo_retriever/src/nemo_retriever/chart/commands.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,66 +72,5 @@ def run(
console.print(f"[green]Done[/green] wrote={output_path} rows={len(out_df)}")


@app.command("graphic-elements")
def render_graphic_elements(
config: Optional[Path] = typer.Option(
None,
"--config",
exists=True,
dir_okay=False,
file_okay=True,
help="Optional YAML config file. If set, values are loaded from YAML; "
"explicitly passed CLI flags override YAML.",
),
input_dir: Optional[Path] = typer.Option(
None,
"--input-dir",
exists=True,
file_okay=False,
dir_okay=True,
help="Directory to scan recursively for *.pdf (can be provided via --config).",
),
method: str = typer.Option("pdfium", "--method", help="PDF extraction method."),
auth_token: Optional[str] = typer.Option(None, "--auth-token", help="Auth token for NIM-backed services."),
yolox_grpc_endpoint: Optional[str] = typer.Option(None, "--yolox-grpc-endpoint"),
yolox_http_endpoint: Optional[str] = typer.Option(None, "--yolox-http-endpoint"),
nemotron_parse_grpc_endpoint: Optional[str] = typer.Option(None, "--nemotron-parse-grpc-endpoint"),
nemotron_parse_http_endpoint: Optional[str] = typer.Option(None, "--nemotron-parse-http-endpoint"),
nemotron_parse_model_name: Optional[str] = typer.Option(None, "--nemotron-parse-model-name"),
extract_text: bool = typer.Option(True, "--extract-text/--no-extract-text"),
extract_images: bool = typer.Option(False, "--extract-images/--no-extract-images"),
extract_tables: bool = typer.Option(False, "--extract-tables/--no-extract-tables"),
extract_charts: bool = typer.Option(False, "--extract-charts/--no-extract-charts"),
extract_infographics: bool = typer.Option(False, "--extract-infographics/--no-extract-infographics"),
extract_page_as_image: bool = typer.Option(False, "--extract-page-as-image/--no-extract-page-as-image"),
text_depth: str = typer.Option("page", "--text-depth"),
write_json_outputs: bool = typer.Option(True, "--write-json-outputs/--no-write-json-outputs"),
json_output_dir: Optional[Path] = typer.Option(None, "--json-output-dir", file_okay=False, dir_okay=True),
limit: Optional[int] = typer.Option(None, "--limit", help="Optionally limit number of PDFs processed."),
) -> None:
_ = (
config,
input_dir,
method,
auth_token,
yolox_grpc_endpoint,
yolox_http_endpoint,
nemotron_parse_grpc_endpoint,
nemotron_parse_http_endpoint,
nemotron_parse_model_name,
extract_text,
extract_images,
extract_tables,
extract_charts,
extract_infographics,
extract_page_as_image,
text_depth,
write_json_outputs,
json_output_dir,
limit,
)
typer.echo("graphic-elements command is not implemented yet.")


def main() -> None:
app()
18 changes: 0 additions & 18 deletions nemo_retriever/src/nemo_retriever/chart/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,21 +29,3 @@ def load_chart_extractor_schema_from_dict(cfg: Dict[str, Any]) -> ChartExtractor
class ChartExtractionStageConfig:
batch_size: int = 64
stage_name: str = "chart_extraction"


@dataclass(frozen=True)
class GraphicElementsOCRStageConfig:
graphic_elements_invoke_url: str = ""
ocr_invoke_url: str = ""
api_key: str = ""
request_timeout_s: float = 60.0


def load_graphic_elements_ocr_config_from_dict(cfg: Dict[str, Any]) -> GraphicElementsOCRStageConfig:
cfg = dict(cfg or {})
return GraphicElementsOCRStageConfig(
graphic_elements_invoke_url=str(cfg.get("graphic_elements_invoke_url") or ""),
ocr_invoke_url=str(cfg.get("ocr_invoke_url") or ""),
api_key=str(cfg.get("api_key") or ""),
request_timeout_s=float(cfg.get("request_timeout_s", 60.0)),
)
134 changes: 0 additions & 134 deletions nemo_retriever/src/nemo_retriever/chart/cpu_actor.py

This file was deleted.

Loading
Loading