Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
36 changes: 18 additions & 18 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -106,7 +106,7 @@ import os
import asyncio
import sys

from anyparser_core import Anyparser, AnyparserOption, OcrLanguage, OCRPreset
from anyparser_core import Anyparser, AnyparserOption, OcrLanguage, OcrPreset

single_file = "docs/document.png"

Expand All @@ -116,7 +116,7 @@ options = AnyparserOption(
model="ocr",
format="markdown",
ocr_language=[OcrLanguage.JAPANESE],
ocr_preset=OCRPreset.SCAN,
ocr_preset=OcrPreset.SCAN,
)

parser = Anyparser(options)
Expand Down Expand Up @@ -226,7 +226,7 @@ The `Anyparser` class utilizes the `AnyparserOption` dataclass for flexible conf
from dataclasses import dataclass
from typing import List, Literal, Optional, Union

from anyparser_core import OcrLanguage, OCRPreset
from anyparser_core import OcrLanguage, OcrPreset

@dataclass
class AnyparserOption:
Expand Down Expand Up @@ -255,7 +255,7 @@ class AnyparserOption:

# OCR Configuration
ocr_language: Optional[List[OcrLanguage]] = None # Languages for OCR processing
ocr_preset: Optional[OCRPreset] = None # Preset configuration for OCR
ocr_preset: Optional[OcrPreset] = None # Preset configuration for OCR

# Crawler Configuration
max_depth: Optional[int] = None # Maximum crawl depth
Expand All @@ -278,7 +278,7 @@ class AnyparserOption:
| `files` | `Optional[Union[str, List[str]]]` | `None` | Input files to process |
| `url` | `Optional[str]` | `None` | URL for crawler model |
| `ocr_language` | `Optional[List[OcrLanguage]]` | `None` | Languages for OCR processing |
| `ocr_preset` | `Optional[OCRPreset]` | `None` | Preset configuration for OCR |
| `ocr_preset` | `Optional[OcrPreset]` | `None` | Preset configuration for OCR |
| `max_depth` | `Optional[int]` | `None` | Maximum crawl depth for crawler model |
| `max_executions` | `Optional[int]` | `None` | Maximum number of pages to crawl |
| `strategy` | `Optional[str]` | `None` | Crawling strategy: `"LIFO"` or `"FIFO"` |
Expand All @@ -288,19 +288,19 @@ class AnyparserOption:

The following OCR presets are available for optimized document processing:

- `OCRPreset.DOCUMENT` - General document processing
- `OCRPreset.HANDWRITING` - Handwritten text recognition
- `OCRPreset.SCAN` - Scanned document processing
- `OCRPreset.RECEIPT` - Receipt processing
- `OCRPreset.MAGAZINE` - Magazine/article processing
- `OCRPreset.INVOICE` - Invoice processing
- `OCRPreset.BUSINESS_CARD` - Business card processing
- `OCRPreset.PASSPORT` - Passport document processing
- `OCRPreset.DRIVER_LICENSE` - Driver's license processing
- `OCRPreset.IDENTITY_CARD` - ID card processing
- `OCRPreset.LICENSE_PLATE` - License plate recognition
- `OCRPreset.MEDICAL_REPORT` - Medical document processing
- `OCRPreset.BANK_STATEMENT` - Bank statement processing
- `OcrPreset.DOCUMENT` - General document processing
- `OcrPreset.HANDWRITING` - Handwritten text recognition
- `OcrPreset.SCAN` - Scanned document processing
- `OcrPreset.RECEIPT` - Receipt processing
- `OcrPreset.MAGAZINE` - Magazine/article processing
- `OcrPreset.INVOICE` - Invoice processing
- `OcrPreset.BUSINESS_CARD` - Business card processing
- `OcrPreset.PASSPORT` - Passport document processing
- `OcrPreset.DRIVER_LICENSE` - Driver's license processing
- `OcrPreset.IDENTITY_CARD` - ID card processing
- `OcrPreset.LICENSE_PLATE` - License plate recognition
- `OcrPreset.MEDICAL_REPORT` - Medical document processing
- `OcrPreset.BANK_STATEMENT` - Bank statement processing

**Model Types for AI Data Pipelines:**

Expand Down
6 changes: 3 additions & 3 deletions anyparser_core/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from .config.hardcoded import OcrLanguage, OCRPreset
from .config.hardcoded import OcrLanguage, OcrPreset
from .form import build_form
from .options import AnyparserOption, AnyparserParsedOption, UploadedFile
from .parser import (
Expand All @@ -15,8 +15,8 @@
AnyparserUrl,
)
from .validator import validate_and_parse, validate_option, validate_path
from .version import __version__

__version__ = "1.0.1"
__all__ = [
"Anyparser",
"AnyparserCrawlDirective",
Expand All @@ -35,6 +35,6 @@
"validate_option",
"build_form",
"Anyparser",
"OCRPreset",
"OcrPreset",
"OcrLanguage",
]
2 changes: 1 addition & 1 deletion anyparser_core/config/hardcoded.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@
]


class OCRPreset(Enum):
class OcrPreset(Enum):
"""Enumeration of supported OCR presets for document processing."""

DOCUMENT = "document"
Expand Down
10 changes: 5 additions & 5 deletions anyparser_core/form.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,18 +54,18 @@ def add_field(name: str, value: Any) -> None:
if parsed.model == "ocr":
if parsed.ocr_language:
add_field(
"ocrLanguage", ",".join([lang.value for lang in parsed.ocr_language])
"ocr_language", ",".join([lang.value for lang in parsed.ocr_language])
)

if parsed.ocr_preset:
add_field("ocrPreset", parsed.ocr_preset.value)
add_field("ocr_preset", parsed.ocr_preset.value)

if parsed.model == "crawler":
add_field("url", parsed.url)
add_field("maxDepth", parsed.max_depth)
add_field("maxExecutions", parsed.max_executions)
add_field("max_depth", parsed.max_depth)
add_field("max_executions", parsed.max_executions)
add_field("strategy", parsed.strategy)
add_field("traversalScope", parsed.traversal_scope)
add_field("traversal_scope", parsed.traversal_scope)
else:
# Add files to the form
for file in parsed.files:
Expand Down
8 changes: 4 additions & 4 deletions anyparser_core/options.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
from dataclasses import dataclass, field
from typing import List, Literal, Optional, TypedDict, Union

from anyparser_core.config.hardcoded import OcrLanguage, OCRPreset
from anyparser_core.config.hardcoded import OcrLanguage, OcrPreset

# Type aliases for better readability
AnyparserFormatType = Literal["json", "markdown", "html"]
Expand All @@ -26,7 +26,7 @@ class AnyparserOption:
table: Optional[bool] = None
files: Optional[Union[str, List[str]]] = None
ocr_language: Optional[List[OcrLanguage]] = None
ocr_preset: Optional[OCRPreset] = None
ocr_preset: Optional[OcrPreset] = None
url: Optional[str] = None
max_depth: Optional[int] = None
max_executions: Optional[int] = None
Expand Down Expand Up @@ -54,7 +54,7 @@ class AnyparserParsedOption:
image: Optional[bool] = None
table: Optional[bool] = None
ocr_language: Optional[List[OcrLanguage]] = None
ocr_preset: Optional[OCRPreset] = None
ocr_preset: Optional[OcrPreset] = None
url: Optional[str] = None
max_depth: Optional[int] = None
max_executions: Optional[int] = None
Expand All @@ -72,7 +72,7 @@ class DefaultOptions(TypedDict):
image: Optional[bool]
table: Optional[bool]
ocr_language: Optional[List[OcrLanguage]]
ocr_preset: Optional[OCRPreset]
ocr_preset: Optional[OcrPreset]
url: Optional[str]
max_depth: Optional[int]
max_executions: Optional[int]
Expand Down
5 changes: 4 additions & 1 deletion anyparser_core/parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
from .options import AnyparserOption
from .request import async_request
from .validator import validate_and_parse
from .version import __version__


@dataclass
Expand Down Expand Up @@ -81,6 +82,7 @@ class AnyparserUrl:
images: List[AnyparserImageReference] = field(default_factory=list)
text: Optional[str] = field(default=None)


@dataclass
class AnyparserPdfPage:
"""Represents a parsed PDF page with extracted content."""
Expand Down Expand Up @@ -152,7 +154,8 @@ async def parse(

# Set up the headers, using the same boundary
headers: Dict[str, str] = {
"Content-Type": f"multipart/form-data; boundary={boundary}"
"Content-Type": f"multipart/form-data; boundary={boundary}",
"User-Agent": f"anyparser_core@{__version__}",
}

if parsed.api_key:
Expand Down
5 changes: 1 addition & 4 deletions anyparser_core/validator/path.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,10 +17,7 @@ async def validate_path(file_paths: Union[str, List[str]]) -> PathValidationResu
Validates file paths exist and are accessible
"""
if not file_paths or (isinstance(file_paths, str) and not file_paths.strip()):
return InvalidPathValidationResult(
error=FileNotFoundError("No files provided")
)

return InvalidPathValidationResult(error=FileNotFoundError("No files provided"))

if isinstance(file_paths, (str, Path)):
files = [file_paths]
Expand Down
1 change: 1 addition & 0 deletions anyparser_core/version.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
__version__ = "1.0.2"
26 changes: 26 additions & 0 deletions changelogs/v1.0.2-changelog.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
# Release anyparser-core@1.0.2

## Changes

**User Agent**

- Added a User-Agent header.
- Moved the `__version__` literal into its own module (`anyparser_core/version.py`) so that `parser.py` can import it without creating a circular import through `anyparser_core/__init__.py`.

**Rename "OCRPreset" to "OcrPreset"**

This release renames the `OCRPreset` class to `OcrPreset` across the codebase so that its casing matches `OcrLanguage`.

- Renamed `OCRPreset` to `OcrPreset` in `README.md`, `anyparser_core/__init__.py`, `anyparser_core/config/hardcoded.py`, `anyparser_core/options.py`, and the examples.
- Updated variable names and documentation to reflect the new class name.
- Modified test files to use the updated class.

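The rename itself does not change runtime behavior, but it is a breaking change for any code that imports `OCRPreset` (see "Breaking Changes" below). Note that this release also changes the multipart form field names built in `anyparser_core/form.py` from camelCase to snake_case (for example, `ocrLanguage` is now `ocr_language` and `maxDepth` is now `max_depth`), which does alter the request body that is produced.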

## Breaking Changes

The class `OCRPreset` has been renamed to `OcrPreset` to maintain consistency in naming conventions.

## Migration Guide

Search and replace all instances of `OCRPreset` with `OcrPreset` in your codebase.
15 changes: 3 additions & 12 deletions examples/03_one_liner.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,17 +4,8 @@

sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "..")))

from anyparser_core import Anyparser

multiple_files = ["docs/sample.docx", "docs/sample.pdf"]
# ------------------------------------------------------------------------------

result = asyncio.run(Anyparser().parse(multiple_files))

for item in result:
print("-" * 100)
print("File:", item.original_filename)
print("Checksum:", item.checksum)
print("Total characters:", item.total_characters)
print("Markdown:", item.markdown)
from anyparser_core import Anyparser

print("-" * 100)
print(asyncio.run(Anyparser().parse(["docs/sample.docx", "docs/sample.pdf"])))
4 changes: 2 additions & 2 deletions examples/04_ocr.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@

sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "..")))

from anyparser_core import Anyparser, AnyparserOption, OcrLanguage, OCRPreset
from anyparser_core import Anyparser, AnyparserOption, OcrLanguage, OcrPreset

single_file = "docs/document.png"

Expand All @@ -14,7 +14,7 @@
model="ocr",
format="markdown",
ocr_language=[OcrLanguage.JAPANESE],
ocr_preset=OCRPreset.SCAN,
ocr_preset=OcrPreset.SCAN,
)

parser = Anyparser(options)
Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ build-backend = "poetry.core.masonry.api"

[project]
name = "anyparser-core"
version = "1.0.1"
version = "1.0.2"
description = "Anyparser SDK for Python"
readme = "README.md"
requires-python = ">=3.9"
Expand Down
20 changes: 10 additions & 10 deletions tests/test_form.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@

sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "..")))

from anyparser_core import OcrLanguage, OCRPreset
from anyparser_core import OcrLanguage, OcrPreset
from anyparser_core.form import build_form
from anyparser_core.options import AnyparserParsedOption, UploadedFile

Expand Down Expand Up @@ -55,15 +55,15 @@ def test_build_form_with_ocr_options(basic_parsed_option):
"""Test form building with OCR options"""
basic_parsed_option.model = "ocr"
basic_parsed_option.ocr_language = [OcrLanguage.ENGLISH, OcrLanguage.SPANISH]
basic_parsed_option.ocr_preset = OCRPreset.DOCUMENT
basic_parsed_option.ocr_preset = OcrPreset.DOCUMENT

boundary = "boundary123"
form_data = build_form(basic_parsed_option, boundary)
form_str = form_data.decode("utf-8")

assert 'Content-Disposition: form-data; name="ocrLanguage"' in form_str
assert 'Content-Disposition: form-data; name="ocr_language"' in form_str
assert "eng,spa" in form_str
assert 'Content-Disposition: form-data; name="ocrPreset"' in form_str
assert 'Content-Disposition: form-data; name="ocr_preset"' in form_str
assert "document" in form_str


Expand Down Expand Up @@ -132,12 +132,12 @@ def test_build_form_ocr():
format="json",
model="ocr",
ocr_language=[OcrLanguage.JAPANESE],
ocr_preset=OCRPreset.SCAN,
ocr_preset=OcrPreset.SCAN,
files=[],
)
form_data = build_form(option, "boundary")
assert b'name="ocrLanguage"' in form_data
assert b'name="ocrPreset"' in form_data
assert b'name="ocr_language"' in form_data
assert b'name="ocr_preset"' in form_data


def test_build_form_crawler():
Expand All @@ -155,10 +155,10 @@ def test_build_form_crawler():
)
form_data = build_form(option, "boundary")
assert b'name="url"' in form_data
assert b'name="maxDepth"' in form_data
assert b'name="maxExecutions"' in form_data
assert b'name="max_depth"' in form_data
assert b'name="max_executions"' in form_data
assert b'name="strategy"' in form_data
assert b'name="traversalScope"' in form_data
assert b'name="traversal_scope"' in form_data


def test_build_form_with_files(tmp_path):
Expand Down
12 changes: 6 additions & 6 deletions tests/test_options.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@

sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "..")))

from anyparser_core.config.hardcoded import OcrLanguage, OCRPreset
from anyparser_core.config.hardcoded import OcrLanguage, OcrPreset
from anyparser_core.options import (
AnyparserOption,
AnyparserParsedOption,
Expand Down Expand Up @@ -61,7 +61,7 @@ def test_build_options_with_custom_options(mock_api_key, monkeypatch):
image=False,
table=False,
ocr_language=[OcrLanguage.ENGLISH],
ocr_preset=OCRPreset.DOCUMENT,
ocr_preset=OcrPreset.DOCUMENT,
)

options = build_options(custom_options)
Expand All @@ -72,7 +72,7 @@ def test_build_options_with_custom_options(mock_api_key, monkeypatch):
assert options["image"] is False
assert options["table"] is False
assert options["ocr_language"] == [OcrLanguage.ENGLISH]
assert options["ocr_preset"] == OCRPreset.DOCUMENT
assert options["ocr_preset"] == OcrPreset.DOCUMENT


def test_anyparser_parsed_file():
Expand All @@ -94,7 +94,7 @@ def test_anyparser_parsed_option():
image=False,
table=True,
ocr_language=[OcrLanguage.ENGLISH],
ocr_preset=OCRPreset.HANDWRITING,
ocr_preset=OcrPreset.HANDWRITING,
)

assert len(parsed_option.files) == 1
Expand All @@ -105,7 +105,7 @@ def test_anyparser_parsed_option():
assert parsed_option.image is False
assert parsed_option.table is True
assert parsed_option.ocr_language == [OcrLanguage.ENGLISH]
assert parsed_option.ocr_preset == OCRPreset.HANDWRITING
assert parsed_option.ocr_preset == OcrPreset.HANDWRITING


def test_anyparser_option_validation():
Expand All @@ -117,7 +117,7 @@ def test_anyparser_option_validation():

# Test invalid OCR preset
with pytest.raises(ValueError):
options = build_options(AnyparserOption(ocr_preset=OCRPreset("invalid")))
options = build_options(AnyparserOption(ocr_preset=OcrPreset("invalid")))
validate_option(options)

# Test missing API URL
Expand Down
Loading