Skip to content

Commit 9627f48

Browse files
✨ add support for crop & split extractions (#395)
1 parent 99a1c0a commit 9627f48

30 files changed

+535
-56
lines changed

.github/workflows/cron.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,5 +9,5 @@ jobs:
99
uses: mindee/mindee-api-python/.github/workflows/_test-regressions.yml@main
1010
secrets: inherit
1111
test-code-samples:
12-
uses: mindee/mindee-api-python/.github/workflows/_smoke_test.yml@main
12+
uses: mindee/mindee-api-python/.github/workflows/_smoke-test.yml@main
1313
secrets: inherit

examples/auto_multi_receipts_extraction_example.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,9 @@ def parse_receipts(input_path):
1616
extracted_receipts = extract_receipts(input_doc, result_split.document.inference)
1717

1818
for idx, receipt in enumerate(extracted_receipts, 1):
19-
result_receipt = mindee_client.parse(product.ReceiptV5, receipt.as_source())
19+
result_receipt = mindee_client.parse(
20+
product.ReceiptV5, receipt.as_input_source()
21+
)
2022
print(f"Receipt {idx}:")
2123
print(result_receipt.document)
2224
print("-" * 40)

mindee/extraction/common/extracted_image.py

Lines changed: 19 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
import io
22
from pathlib import Path
3-
from typing import Optional
3+
from typing import Optional, Union
44

55
from PIL import Image
66

@@ -17,6 +17,8 @@ class ExtractedImage:
1717
"""Id of the page the image was extracted from."""
1818
_element_id: int
1919
"""Id of the element on a given page."""
20+
filename: str
21+
"""Name of the file the image was extracted from."""
2022

2123
def __init__(
2224
self, input_source: LocalInputSource, page_id: int, element_id: int
@@ -30,6 +32,7 @@ def __init__(
3032
"""
3133
self.buffer = io.BytesIO(input_source.file_object.read())
3234
self.buffer.name = input_source.filename
35+
self.filename = input_source.filename
3336
if input_source.is_pdf():
3437
extension = "jpg"
3538
else:
@@ -43,7 +46,9 @@ def __init__(
4346
self._page_id = page_id
4447
self._element_id = 0 if element_id is None else element_id
4548

46-
def save_to_file(self, output_path: str, file_format: Optional[str] = None):
49+
def save_to_file(
50+
self, output_path: Union[Path, str], file_format: Optional[str] = None
51+
):
4752
"""
4853
Saves the document to a file.
4954
@@ -56,20 +61,27 @@ def save_to_file(self, output_path: str, file_format: Optional[str] = None):
5661
if not file_format:
5762
if len(resolved_path.suffix) < 1:
5863
raise ValueError("Invalid file format.")
59-
file_format = (
60-
resolved_path.suffix.upper()
61-
) # technically redundant since PIL applies an upper operation
62-
# to the parameter , but older versions may not do so.
64+
# Let PIL infer format from filename extension
6365
self.buffer.seek(0)
6466
image = Image.open(self.buffer)
65-
image.save(resolved_path, format=file_format)
67+
if file_format:
68+
image.save(resolved_path, format=file_format)
69+
else:
70+
image.save(resolved_path)
6671
logger.info("File saved successfully to '%s'.", resolved_path)
6772
except TypeError as exc:
6873
raise MindeeError("Invalid path/filename provided.") from exc
6974
except Exception as exc:
75+
print(exc)
7076
raise MindeeError(f"Could not save file {Path(output_path).name}.") from exc
7177

7278
def as_source(self) -> FileInput:
79+
"""
80+
Deprecated. Use ``as_input_source`` instead.
81+
"""
82+
return self.as_input_source()
83+
84+
def as_input_source(self) -> FileInput:
7385
"""
7486
Return the file as a Mindee-compatible BufferInput source.
7587

mindee/extraction/common/image_extractor.py

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,13 @@
11
import io
2-
from typing import BinaryIO, List
2+
from typing import BinaryIO, List, Union
33

44
import pypdfium2 as pdfium
55
from PIL import Image
66

77
from mindee.error.mindee_error import MindeeError
88
from mindee.extraction.common.extracted_image import ExtractedImage
99
from mindee.geometry.point import Point
10-
from mindee.geometry.polygon import get_min_max_x, get_min_max_y
10+
from mindee.geometry.polygon import Polygon, get_min_max_x, get_min_max_y
1111
from mindee.input.sources.bytes_input import BytesInput
1212
from mindee.input.sources.local_input_source import LocalInputSource
1313

@@ -114,7 +114,9 @@ def get_file_extension(file_format: str):
114114

115115

116116
def extract_multiple_images_from_source(
117-
input_source: LocalInputSource, page_id: int, polygons: List[List[Point]]
117+
input_source: LocalInputSource,
118+
page_id: int,
119+
polygons: List[Union[Polygon, List[Point]]],
118120
) -> List[ExtractedImage]:
119121
"""
120122
Extracts elements from a page based on a list of bounding boxes.

mindee/extraction/pdf_extractor/extracted_pdf.py

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
from pathlib import Path
2-
from typing import BinaryIO
2+
from typing import BinaryIO, Union
33

44
import pypdfium2 as pdfium
55

@@ -28,6 +28,10 @@ def get_page_count(self) -> int:
2828
) from exc
2929

3030
def write_to_file(self, output_path: str):
31+
"""Deprecated. Use ``save_to_file`` instead."""
32+
self.save_to_file(output_path)
33+
34+
def save_to_file(self, output_path: Union[Path, str]):
3135
"""
3236
Writes the contents of the current PDF object to a file.
3337
@@ -40,6 +44,7 @@ def write_to_file(self, output_path: str):
4044
raise MindeeError("Invalid save path provided {}.")
4145
if out_path.suffix.lower() != "pdf":
4246
out_path = out_path.parent / (out_path.stem + "." + "pdf")
47+
self.pdf_bytes.seek(0)
4348
with open(out_path, "wb") as out_file:
4449
out_file.write(self.pdf_bytes.read())
4550

mindee/mindee_http/mindee_api_v2.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,9 @@
44
import requests
55

66
from mindee.error.mindee_error import MindeeApiV2Error
7-
from mindee.input import LocalInputSource, UrlInputSource, BaseParameters
7+
from mindee.input.base_parameters import BaseParameters
8+
from mindee.input.sources.local_input_source import LocalInputSource
9+
from mindee.input.sources.url_input_source import UrlInputSource
810
from mindee.logger import logger
911
from mindee.mindee_http.base_settings import USER_AGENT
1012
from mindee.mindee_http.settings_mixin import SettingsMixin

mindee/mindee_http/workflow_endpoint.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,9 @@
22

33
import requests
44

5-
from mindee.input import LocalInputSource, UrlInputSource, WorkflowOptions
5+
from mindee.input.sources.local_input_source import LocalInputSource
6+
from mindee.input.sources.url_input_source import UrlInputSource
7+
from mindee.input.workflow_options import WorkflowOptions
68
from mindee.mindee_http.base_endpoint import BaseEndpoint
79
from mindee.mindee_http.workflow_settings import WorkflowSettings
810

mindee/v2/__init__.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,8 @@
1+
from mindee.v2.file_operations.crop import (
2+
extract_crops,
3+
extract_single_crop,
4+
)
5+
from mindee.v2.file_operations.split import extract_splits
16
from mindee.v2.product.classification.classification_parameters import (
27
ClassificationParameters,
38
)
@@ -12,6 +17,10 @@
1217
from mindee.v2.product.split.split_response import SplitResponse
1318

1419
__all__ = [
20+
"extract_crops",
21+
"extract_splits",
22+
"extract_crops",
23+
"extract_single_crop",
1524
"ClassificationResponse",
1625
"ClassificationParameters",
1726
"CropResponse",
Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
from mindee.v2.file_operations.crop import (
2+
extract_crops,
3+
extract_single_crop,
4+
)
5+
from mindee.v2.file_operations.split import extract_splits
6+
7+
# Names exported by a star-import of this module; each name is listed once.
__all__ = ["extract_crops", "extract_splits", "extract_single_crop"]

mindee/v2/file_operations/crop.py

Lines changed: 51 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,51 @@
1+
from typing import List, Union
2+
3+
from mindee.error import MindeeError
4+
from mindee.extraction import ExtractedImage, extract_multiple_images_from_source
5+
from mindee.geometry import Point, Polygon
6+
from mindee.input.sources.local_input_source import LocalInputSource
7+
from mindee.parsing.v2.field import FieldLocation
8+
from mindee.v2.file_operations.crop_files import CropFiles
9+
from mindee.v2.product.crop.crop_box import CropBox
10+
11+
12+
def extract_single_crop(
    input_source: LocalInputSource, crop: FieldLocation
) -> ExtractedImage:
    """
    Extract the image region described by a single crop location.

    :param input_source: Local input source the crop is taken from.
    :param crop: Location of the region to extract; its ``page`` and ``polygon``
        attributes are used.
    :return: The first image extracted for the crop's polygon.
    """
    crop_polygons: List[Union[Polygon, List[Point]]] = [crop.polygon]
    extracted = extract_multiple_images_from_source(
        input_source, crop.page, crop_polygons
    )
    return extracted[0]
25+
26+
27+
def extract_crops(input_source: LocalInputSource, crops: List[CropBox]) -> CropFiles:
    """
    Extract every cropped region from a document as individual images.

    Crop polygons are grouped by the page they belong to, then each page is
    passed to ``extract_multiple_images_from_source`` along with its polygons.

    :param input_source: Local input source to extract the crops from.
    :param crops: Crop boxes to extract; each one provides ``location.page``
        and ``location.polygon``.
    :return: The extracted images, wrapped in a ``CropFiles``.
    :raises MindeeError: If ``crops`` is empty.
    """
    if not crops:
        raise MindeeError("No possible candidates found for Crop extraction.")

    # One bucket of polygons per page, indexed by the crop's page value.
    polygons_by_page: List[List[Union[Polygon, List[Point]]]] = [
        [] for _ in range(input_source.page_count)
    ]
    for crop in crops:
        polygons_by_page[crop.location.page].append(crop.location.polygon)

    images: List[ExtractedImage] = []
    for page_id, page_polygons in enumerate(polygons_by_page):
        images.extend(
            extract_multiple_images_from_source(input_source, page_id, page_polygons)
        )
    return CropFiles(images)

0 commit comments

Comments
 (0)