Skip to content

Commit 16bcb10

Browse files
🐛 fix split feature adding additional pages to output files (#397)
1 parent 3f7a5ad commit 16bcb10

File tree

7 files changed: +30 −25 lines changed

7 files changed: +30 −25 lines changed

mindee/extraction/common/extracted_image.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -61,7 +61,6 @@ def save_to_file(
6161
if not file_format:
6262
if len(resolved_path.suffix) < 1:
6363
raise ValueError("Invalid file format.")
64-
# Let PIL infer format from filename extension
6564
self.buffer.seek(0)
6665
image = Image.open(self.buffer)
6766
if file_format:

mindee/v2/file_operations/crop_files.py

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -7,14 +7,15 @@
77
class CropFiles(List[ExtractedImage]):
88
"""Crop files."""
99

10-
def save_all_to_disk(self, path: Union[Path, str]):
10+
def save_all_to_disk(self, path: Union[Path, str], prefix: str = "crop"):
1111
"""
1212
Save all extracted crops to disk.
1313
14-
:param path: Path to save the extracted splits to
14+
:param path: Path to save the extracted splits to.
15+
:param prefix: Prefix to add to the filename, defaults to 'crop'.
1516
"""
1617
if isinstance(path, str):
1718
path = Path(path)
1819
path.mkdir(parents=True, exist_ok=True)
1920
for idx, split in enumerate(self, start=1):
20-
split.save_to_file(path / f"crop_{idx:03}.jpg")
21+
split.save_to_file(path / f"{prefix}_{idx:03}.jpg")

mindee/v2/file_operations/split.py

Lines changed: 17 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1,15 +1,28 @@
11
from typing import List, Union
22

33
from mindee.error import MindeeError
4-
from mindee.extraction import PdfExtractor
4+
from mindee.extraction.pdf_extractor.extracted_pdf import ExtractedPdf
5+
from mindee.extraction.pdf_extractor.pdf_extractor import PdfExtractor
56
from mindee.input.sources.local_input_source import LocalInputSource
67
from mindee.v2.file_operations.split_files import SplitFiles
7-
from mindee.v2.product.split.split_range import SplitRange
8+
9+
10+
def extract_single_split(
11+
input_source: LocalInputSource, split: List[int]
12+
) -> ExtractedPdf:
13+
"""
14+
Extracts a single split as a complete PDF from the document.
15+
16+
:param input_source: Input source to split.
17+
:param split: List of pages to keep.
18+
:return: Extracted PDF
19+
"""
20+
return extract_splits(input_source, [split])[0]
821

922

1023
def extract_splits(
1124
input_source: LocalInputSource,
12-
splits: Union[List[SplitRange], List[List[int]]],
25+
splits: Union[List[List[int]]],
1326
) -> SplitFiles:
1427
"""
1528
Extracts splits as complete PDFs from the document.
@@ -21,13 +34,7 @@ def extract_splits(
2134
pdf_extractor = PdfExtractor(input_source)
2235
page_groups = []
2336
for split in splits:
24-
if isinstance(split, SplitRange):
25-
lower_bound = split.page_range[0]
26-
upper_bound = split.page_range[1]
27-
else:
28-
lower_bound = split[0]
29-
upper_bound = split[1]
30-
page_groups.append(list(range(lower_bound, upper_bound + 1)))
37+
page_groups.append(list(range(split[0], split[1] + 1)))
3138
if len(splits) < 1:
3239
raise MindeeError("No indexes provided.")
3340
return SplitFiles(pdf_extractor.extract_sub_documents(page_groups))

mindee/v2/file_operations/split_files.py

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -7,14 +7,15 @@
77
class SplitFiles(List[ExtractedPdf]):
88
"""Split files."""
99

10-
def save_all_to_disk(self, path: Union[str, Path]):
10+
def save_all_to_disk(self, path: Union[str, Path], prefix: str = "split"):
1111
"""
1212
Save all extracted splits to disk.
1313
14-
:param path: Path to save the extracted splits to
14+
:param path: Path to save the extracted splits to.
15+
:param prefix: Prefix to add to the filename, defaults to 'split'.
1516
"""
1617
if isinstance(path, str):
1718
path = Path(path)
1819
path.mkdir(parents=True, exist_ok=True)
1920
for idx, split in enumerate(self, start=1):
20-
split.save_to_file(path / f"split_{idx:03}.pdf")
21+
split.save_to_file(path / f"{prefix}_{idx:03}.pdf")

mindee/v2/product/split/split_range.py

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,9 @@
11
from typing import List
22

33
from mindee.extraction.pdf_extractor.extracted_pdf import ExtractedPdf
4-
from mindee.extraction.pdf_extractor.pdf_extractor import PdfExtractor
54
from mindee.input.sources.local_input_source import LocalInputSource
65
from mindee.parsing.common.string_dict import StringDict
6+
from mindee.v2.file_operations.split import extract_single_split
77

88

99
class SplitRange:
@@ -32,5 +32,4 @@ def extract_from_file(self, input_source: LocalInputSource) -> ExtractedPdf:
3232
:param input_source: Local file to apply the inference to
3333
:return: Extracted PDF
3434
"""
35-
pdf_extractor = PdfExtractor(input_source)
36-
return pdf_extractor.extract_sub_documents([self.page_range])[0]
35+
return extract_single_split(input_source, self.page_range)

tests/v2/file_operations/test_split_operation.py

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,6 @@
22

33
import pytest
44

5-
from mindee.v2.file_operations.split import extract_splits
65
from mindee.input.sources.path_input import PathInput
76
from mindee.v2.product.split.split_response import (
87
SplitResponse,
@@ -37,7 +36,7 @@ def test_single_page_split_split(splits_default, splits_single_page_json_path):
3736
with open(splits_single_page_json_path, "rb") as f:
3837
response = json.load(f)
3938
doc = SplitResponse(response)
40-
extracted_splits = extract_splits(input_sample, doc.inference.result.splits)
39+
extracted_splits = doc.extract_from_file(input_sample)
4140
assert len(extracted_splits) == 1
4241

4342
assert extracted_splits[0].get_page_count() == 1
@@ -48,7 +47,7 @@ def test_multi_page_receipt_split(splits_5p, splits_multi_page_json_path):
4847
with open(splits_multi_page_json_path, "rb") as f:
4948
response = json.load(f)
5049
doc = SplitResponse(response)
51-
extracted_splits = extract_splits(input_sample, doc.inference.result.splits)
50+
extracted_splits = doc.extract_from_file(input_sample)
5251
assert len(extracted_splits) == 3
5352

5453
assert extracted_splits[0].get_page_count() == 1

tests/v2/file_operations/test_split_operation_integration.py

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,6 @@
1010
SplitResponse,
1111
)
1212
from mindee.input.sources.path_input import PathInput
13-
from mindee.v2.file_operations.split import extract_splits
1413
from tests.utils import OUTPUT_DIR, V2_PRODUCT_DATA_DIR, cleanup_output_files
1514

1615

@@ -38,7 +37,7 @@ def test_pdf_should_extract_splits():
3837
)
3938
assert response.inference.file.page_count == 2
4039

41-
extracted_pdfs = extract_splits(split_input, response.inference.result.splits)
40+
extracted_pdfs = response.extract_from_file(split_input)
4241

4342
assert len(extracted_pdfs) == 2
4443
assert extracted_pdfs[0].filename == "default_sample_001-001.pdf"

0 commit comments

Comments
 (0)