Skip to content

Commit f5788a4

Browse files
committed
💥 use 0-based page indexes for file names
1 parent 5c7e168 commit f5788a4

4 files changed

Lines changed: 45 additions & 57 deletions

File tree

src/main/java/com/mindee/pdf/BasePDFExtractor.java

Lines changed: 35 additions & 47 deletions
Original file line numberDiff line numberDiff line change
@@ -55,17 +55,15 @@ public BasePDFExtractor(LocalInputSource source) throws IOException {
5555
}
5656
}
5757

58-
/**
59-
* Converts an array to a buffered image.
60-
*
61-
* @param byteArray Raw byte array.
62-
* @return a valid ImageIO buffer.
63-
* @throws IOException Throws if the file can't be accessed.
64-
*/
65-
private static BufferedImage byteArrayToBufferedImage(byte[] byteArray) throws IOException {
66-
try (ByteArrayInputStream stream = new ByteArrayInputStream(byteArray)) {
67-
return ImageIO.read(stream);
58+
public ExtractedPDF extractSinglePage(
59+
List<Integer> pageNumbers,
60+
boolean closeOriginal
61+
) throws IOException {
62+
if (pageNumbers.isEmpty()) {
63+
throw new MindeeException("Empty indexes not allowed for extraction.");
6864
}
65+
var pdfBytes = createPdfFromExistingPdf(this.sourcePdf, pageNumbers, closeOriginal);
66+
return new ExtractedPDF(pdfBytes, makeFilename(pageNumbers));
6967
}
7068

7169
/**
@@ -79,23 +77,37 @@ public ExtractedPDFs extractSubDocuments(List<List<Integer>> pageIndexes) throws
7977
var extractedPDFs = new ExtractedPDFs();
8078

8179
for (List<Integer> pageIndexElement : pageIndexes) {
82-
if (pageIndexElement.isEmpty()) {
83-
throw new MindeeException("Empty indexes not allowed for extraction.");
84-
}
85-
String[] splitName = InputSourceUtils.splitNameStrict(filename);
86-
String fieldFilename = splitName[0]
87-
+ String.format("_%3s", pageIndexElement.get(0) + 1).replace(" ", "0")
88-
+ "-"
89-
+ String
90-
.format("%3s", pageIndexElement.get(pageIndexElement.size() - 1) + 1)
91-
.replace(" ", "0")
92-
+ "."
93-
+ splitName[1];
94-
extractedPDFs.add(extractSinglePage(pageIndexElement, fieldFilename, false));
80+
extractedPDFs.add(extractSinglePage(pageIndexElement, false));
9581
}
9682
return extractedPDFs;
9783
}
9884

85+
/**
86+
* Converts an array to a buffered image.
87+
*
88+
* @param byteArray Raw byte array.
89+
* @return a valid ImageIO buffer.
90+
* @throws IOException Throws if the file can't be accessed.
91+
*/
92+
private static BufferedImage byteArrayToBufferedImage(byte[] byteArray) throws IOException {
93+
try (ByteArrayInputStream stream = new ByteArrayInputStream(byteArray)) {
94+
return ImageIO.read(stream);
95+
}
96+
}
97+
98+
/**
99+
* Make a nice filename for the split.
100+
*/
101+
private String makeFilename(List<Integer> pageNumbers) {
102+
String[] splitName = InputSourceUtils.splitNameStrict(filename);
103+
return splitName[0]
104+
+ String.format("_%3s", pageNumbers.get(0)).replace(" ", "0")
105+
+ "-"
106+
+ String.format("%3s", pageNumbers.get(pageNumbers.size() - 1)).replace(" ", "0")
107+
+ "."
108+
+ splitName[1];
109+
}
110+
99111
private static PDPage clonePage(PDPage page) {
100112

101113
COSDictionary pageDict = page.getCOSObject();
@@ -129,28 +141,4 @@ private static byte[] createPdfFromExistingPdf(
129141
outputStream.close();
130142
return output;
131143
}
132-
133-
public ExtractedPDF extractSinglePage(
134-
List<Integer> pageNumbers,
135-
String fieldFilename,
136-
boolean closeOriginal
137-
) throws IOException {
138-
var pdfBytes = createPdfFromExistingPdf(this.sourcePdf, pageNumbers, closeOriginal);
139-
return new ExtractedPDF(pdfBytes, fieldFilename);
140-
}
141-
142-
public ExtractedPDF extractSinglePage(
143-
List<Integer> pageNumbers,
144-
boolean closeOriginal
145-
) throws IOException {
146-
var pdfBytes = createPdfFromExistingPdf(this.sourcePdf, pageNumbers, closeOriginal);
147-
String[] splitName = InputSourceUtils.splitNameStrict(filename);
148-
String fieldFilename = splitName[0]
149-
+ String.format("_%3s", pageNumbers.get(0) + 1).replace(" ", "0")
150-
+ "-"
151-
+ String.format("%3s", pageNumbers.get(pageNumbers.size() - 1) + 1).replace(" ", "0")
152-
+ "."
153-
+ splitName[1];
154-
return new ExtractedPDF(pdfBytes, fieldFilename);
155-
}
156144
}

src/test/java/com/mindee/v1/fileOperations/InvoiceSplitterAutoExtractionIT.java

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -71,8 +71,8 @@ public void givenAPDF_shouldExtractInvoices() throws IOException, InterruptedExc
7171
List<ExtractedPDF> extractedPDFsStrict = extractor
7272
.extractInvoices(inference.getPrediction().getInvoicePageGroups(), false);
7373
Assertions.assertEquals(2, extractedPDFsStrict.size());
74-
Assertions.assertEquals("default_sample_001-001.pdf", extractedPDFsStrict.get(0).getFilename());
75-
Assertions.assertEquals("default_sample_002-002.pdf", extractedPDFsStrict.get(1).getFilename());
74+
Assertions.assertEquals("default_sample_000-000.pdf", extractedPDFsStrict.get(0).getFilename());
75+
Assertions.assertEquals("default_sample_001-001.pdf", extractedPDFsStrict.get(1).getFilename());
7676

7777
PredictResponse<InvoiceV4> invoice0 = getInvoicePrediction(
7878
extractedPDFsStrict.get(0).asInputSource()

src/test/java/com/mindee/v1/pdf/PDFExtractorTest.java

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -31,9 +31,9 @@ public void givenAPDF_shouldExtractInvoicesNoStrict() throws IOException {
3131
var extractedPDFSNoStrict = extractor
3232
.extractInvoices(inference.getPrediction().getInvoicePageGroups(), false);
3333
Assertions.assertEquals(3, extractedPDFSNoStrict.size());
34-
Assertions.assertEquals("invoice_5p_001-001.pdf", extractedPDFSNoStrict.get(0).getFilename());
35-
Assertions.assertEquals("invoice_5p_002-004.pdf", extractedPDFSNoStrict.get(1).getFilename());
36-
Assertions.assertEquals("invoice_5p_005-005.pdf", extractedPDFSNoStrict.get(2).getFilename());
34+
Assertions.assertEquals("invoice_5p_000-000.pdf", extractedPDFSNoStrict.get(0).getFilename());
35+
Assertions.assertEquals("invoice_5p_001-003.pdf", extractedPDFSNoStrict.get(1).getFilename());
36+
Assertions.assertEquals("invoice_5p_004-004.pdf", extractedPDFSNoStrict.get(2).getFilename());
3737
}
3838

3939
@Test
@@ -48,7 +48,7 @@ public void givenAPDF_shouldExtractInvoicesStrict() throws IOException {
4848
var extractedPDFStrict = extractor
4949
.extractInvoices(inference.getPrediction().getInvoicePageGroups(), true);
5050
Assertions.assertEquals(2, extractedPDFStrict.size());
51-
Assertions.assertEquals("invoice_5p_001-001.pdf", extractedPDFStrict.get(0).getFilename());
52-
Assertions.assertEquals("invoice_5p_002-005.pdf", extractedPDFStrict.get(1).getFilename());
51+
Assertions.assertEquals("invoice_5p_000-000.pdf", extractedPDFStrict.get(0).getFilename());
52+
Assertions.assertEquals("invoice_5p_001-004.pdf", extractedPDFStrict.get(1).getFilename());
5353
}
5454
}

src/test/java/com/mindee/v2/fileOperations/SplitTest.java

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@ void singlePage_splitsCorrectly() throws IOException {
2020
var extractedSplit = new Split(inputSample)
2121
.extractSingle(doc.getInference().getResult().getSplits().get(0));
2222

23-
assertEquals("default_sample_001-001.pdf", extractedSplit.getFilename());
23+
assertEquals("default_sample_000-000.pdf", extractedSplit.getFilename());
2424
var asInputSource = extractedSplit.asInputSource();
2525
assertEquals(1, asInputSource.getPageCount());
2626
}
@@ -38,12 +38,12 @@ void multiplePages_splitsCorrectly() throws IOException {
3838
assertEquals(2, extractedSplits.size());
3939

4040
var split0 = extractedSplits.get(0);
41-
assertEquals("default_sample_001-001.pdf", split0.getFilename());
41+
assertEquals("default_sample_000-000.pdf", split0.getFilename());
4242
var asInputSource0 = split0.asInputSource();
4343
assertEquals(1, asInputSource0.getPageCount());
4444

4545
var split1 = extractedSplits.get(1);
46-
assertEquals("default_sample_002-002.pdf", split1.getFilename());
46+
assertEquals("default_sample_001-001.pdf", split1.getFilename());
4747
var asInputSource1 = split1.asInputSource();
4848
assertEquals(1, asInputSource1.getPageCount());
4949
}

0 commit comments

Comments
 (0)