mindee-api-java/src/main/java/com/mindee/pdf/PDFUtils.java at f8241655f9d83b99dea77302002909fc09242bac · Hellozaq/mindee-api-java · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
package com.mindee.pdf;

import com.mindee.input.LocalInputSource;
import java.awt.Color;
import java.awt.image.BufferedImage;
import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import org.apache.pdfbox.Loader;
import org.apache.pdfbox.cos.COSDictionary;
import org.apache.pdfbox.cos.COSName;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.PDPageContentStream;
import org.apache.pdfbox.pdmodel.PDResources;
import org.apache.pdfbox.pdmodel.common.PDRectangle;
import org.apache.pdfbox.pdmodel.font.PDType1Font;
import org.apache.pdfbox.pdmodel.font.Standard14Fonts;
import org.apache.pdfbox.pdmodel.graphics.color.PDColor;
import org.apache.pdfbox.pdmodel.graphics.image.PDImageXObject;
import org.apache.pdfbox.rendering.ImageType;
import org.apache.pdfbox.rendering.PDFRenderer;
import org.apache.pdfbox.text.PDFTextStripper;
import org.apache.pdfbox.text.TextPosition;

/**
 * Utilities for working with PDFs.
 *
 * <p>All methods are static and the class cannot be instantiated. Every method that opens a
 * {@link PDDocument} itself also closes it before returning, including when an exception is
 * thrown part-way through.
 */
public final class PDFUtils {

  /** Pages whose bounding-box area (width x height) is below this are rendered at 300 DPI. */
  private static final float SMALL_PAGE_MAX_AREA = 200000f;

  /** Pages whose bounding-box area (width x height) is below this are rendered at 250 DPI. */
  private static final float MEDIUM_PAGE_MAX_AREA = 300000f;

  private static final int SMALL_PAGE_DPI = 300;
  private static final int MEDIUM_PAGE_DPI = 250;
  private static final int LARGE_PAGE_DPI = 200;

  private PDFUtils() {
  }

  /**
   * Get the number of pages in the PDF.
   *
   * @param inputSource The PDF file.
   * @return The number of pages in the document.
   * @throws IOException If the file cannot be loaded as a PDF.
   */
  public static int getNumberOfPages(LocalInputSource inputSource) throws IOException {
    // try-with-resources: the document is released even if reading the page count fails.
    try (PDDocument document = Loader.loadPDF(inputSource.getFile())) {
      return document.getNumberOfPages();
    }
  }

  /**
   * Build a new page from a copy of the given page's dictionary, with the annotations entry
   * ({@code COSName.ANNOTS}) removed from the copy. The source page itself is not modified.
   *
   * @param page The page to copy.
   * @return A new page backed by the copied dictionary.
   */
  private static PDPage clonePage(PDPage page) {
    COSDictionary newPageDict = new COSDictionary(page.getCOSObject());
    newPageDict.removeItem(COSName.ANNOTS);
    return new PDPage(newPageDict);
  }

  /**
   * Create a new PDF containing copies of the requested pages, in the order given.
   *
   * <p>Indexes greater than or equal to the page count are skipped. Negative indexes are not
   * filtered and are passed to {@link PDDocument#getPage(int)} as-is.
   *
   * @param document      The source document.
   * @param pageNumbers   Zero-based indexes of the pages to copy.
   * @param closeOriginal Whether {@code document} must be closed before returning. When
   *                      {@code true} it is closed on failure as well as on success.
   * @return The bytes of the newly created PDF.
   * @throws IOException If the new document cannot be saved or a document cannot be closed.
   */
  private static byte[] createPdfFromExistingPdf(
      PDDocument document, List<Integer> pageNumbers,
      boolean closeOriginal
  ) throws IOException {
    // Resources are closed in reverse declaration order, so the new document is closed first
    // and the original last, i.e. only after the new document has been saved.
    // A null resource is legal in try-with-resources and is simply not closed.
    try (
        PDDocument originalToClose = closeOriginal ? document : null;
        ByteArrayOutputStream outputStream = new ByteArrayOutputStream();
        PDDocument newDocument = new PDDocument()
    ) {
      int pageCount = document.getNumberOfPages();
      for (int pageIndex : pageNumbers) {
        if (pageIndex < pageCount) {
          newDocument.addPage(clonePage(document.getPage(pageIndex)));
        }
      }
      newDocument.save(outputStream);
      return outputStream.toByteArray();
    }
  }

  /**
   * Merge specified PDF pages together.
   *
   * @param file        The PDF file.
   * @param pageNumbers List of zero-based page indexes to merge together.
   * @return The bytes of a new PDF containing only the requested pages.
   * @throws IOException If the file cannot be loaded or the new PDF cannot be written.
   */
  public static byte[] mergePdfPages(File file, List<Integer> pageNumbers) throws IOException {
    // The document is opened here, so it is always closed by the helper.
    return createPdfFromExistingPdf(Loader.loadPDF(file), pageNumbers, true);
  }

  /**
   * Merge specified PDF pages together, closing the source document afterwards.
   *
   * @param document    The source document; it is closed by this call.
   * @param pageNumbers List of zero-based page indexes to merge together.
   * @return The bytes of a new PDF containing only the requested pages.
   * @throws IOException If the new PDF cannot be written.
   */
  public static byte[] mergePdfPages(PDDocument document, List<Integer> pageNumbers)
      throws IOException {
    return mergePdfPages(document, pageNumbers, true);
  }

  /**
   * Merge specified PDF pages together.
   *
   * @param document      The source document.
   * @param pageNumbers   List of zero-based page indexes to merge together.
   * @param closeOriginal Whether to close {@code document} once the new PDF has been created.
   * @return The bytes of a new PDF containing only the requested pages.
   * @throws IOException If the new PDF cannot be written.
   */
  public static byte[] mergePdfPages(
      PDDocument document, List<Integer> pageNumbers,
      boolean closeOriginal
  ) throws IOException {
    return createPdfFromExistingPdf(document, pageNumbers, closeOriginal);
  }

  /**
   * Check whether a PDF has no content, judged by its page resources.
   *
   * <p>The PDF is considered empty when no page declares any XObject or font in its resources.
   *
   * @param file The PDF file.
   * @return {@code true} if no page declares an XObject or a font.
   * @throws IOException If the file cannot be loaded as a PDF.
   */
  public static boolean isPdfEmpty(File file) throws IOException {
    return checkIfPdfIsEmpty(Loader.loadPDF(file));
  }

  /**
   * Check whether any page of the document declares an XObject or a font, then close the
   * document.
   *
   * @param document The document to inspect; it is always closed by this call.
   * @return {@code true} if no page declares an XObject or a font.
   * @throws IOException If the document cannot be closed.
   */
  private static boolean checkIfPdfIsEmpty(PDDocument document) throws IOException {
    try (PDDocument openDocument = document) {
      for (PDPage page : openDocument.getPages()) {
        PDResources resources = page.getResources();
        if (resources == null) {
          continue;
        }
        // hasNext() is used rather than the spliterator's exact size, which is reported as -1
        // when unknown and would make an empty iterable look non-empty.
        if (resources.getXObjectNames().iterator().hasNext()
            || resources.getFontNames().iterator().hasNext()) {
          return false;
        }
      }
      return true;
    }
  }

  /**
   * Render all pages of a PDF as images.
   * Converting PDFs with hundreds of pages may result in a heap space error.
   *
   * @param filePath The path to the PDF file.
   * @return List of all pages as images.
   * @throws IOException If the file cannot be loaded or a page cannot be rendered.
   */
  public static List<PdfPageImage> pdfToImages(String filePath) throws IOException {
    return pdfToImages(new LocalInputSource(filePath));
  }

  /**
   * Render all pages of a PDF as images.
   * Converting PDFs with hundreds of pages may result in a heap space error.
   *
   * @param source The PDF file.
   * @return List of all pages as images.
   * @throws IOException If the file cannot be loaded or a page cannot be rendered.
   */
  public static List<PdfPageImage> pdfToImages(LocalInputSource source) throws IOException {
    try (PDDocument document = Loader.loadPDF(source.getFile())) {
      PDFRenderer pdfRenderer = new PDFRenderer(document);
      int pageCount = document.getNumberOfPages();
      List<PdfPageImage> pdfPageImages = new ArrayList<>(pageCount);
      for (int i = 0; i < pageCount; i++) {
        BufferedImage imageBuffer = pdfPageToImageBuffer(i, document, pdfRenderer);
        pdfPageImages.add(new PdfPageImage(imageBuffer, i, source.getFilename(), "jpg"));
      }
      return pdfPageImages;
    }
  }

  /**
   * Render a single page of a PDF as an image.
   * Main use case is for processing PDFs with hundreds of pages.
   * If you need to only render some pages from the PDF, use <code>mergePdfPages</code> and
   * then <code>pdfToImages</code>.
   *
   * @param filePath   The path to the PDF file.
   * @param pageNumber The page number to render, first page is 1.
   * @return The page as an image.
   * @throws IOException If the file cannot be loaded or the page cannot be rendered.
   */
  public static PdfPageImage pdfPageToImage(String filePath, int pageNumber) throws IOException {
    return pdfPageToImage(new LocalInputSource(filePath), pageNumber);
  }

  /**
   * Render a single page of a PDF as an image.
   * Main use case is for processing PDFs with hundreds of pages.
   * If you need to only render some pages from the PDF, use <code>mergePdfPages</code> and
   * then <code>pdfToImages</code>.
   *
   * @param source     The PDF file.
   * @param pageNumber The page number to render, first page is 1.
   * @return The page as an image.
   * @throws IOException If the file cannot be loaded or the page cannot be rendered.
   */
  public static PdfPageImage pdfPageToImage(
      LocalInputSource source,
      int pageNumber
  ) throws IOException {
    // Callers use 1-based page numbers; the renderer and the resulting image use 0-based ones.
    int index = pageNumber - 1;
    BufferedImage imageBuffer;
    try (PDDocument document = Loader.loadPDF(source.getFile())) {
      imageBuffer = pdfPageToImageBuffer(index, document, new PDFRenderer(document));
    }
    return new PdfPageImage(imageBuffer, index, source.getFilename(), "jpg");
  }

  /**
   * Render one page to an RGB image, choosing the DPI from the area of the page's bounding box:
   * the larger the page, the lower the DPI (300, 250 or 200).
   *
   * @param index       Zero-based index of the page to render.
   * @param document    The document the page belongs to.
   * @param pdfRenderer A renderer created for {@code document}.
   * @return The rendered page.
   * @throws IOException If the page cannot be rendered.
   */
  private static BufferedImage pdfPageToImageBuffer(
      int index, PDDocument document,
      PDFRenderer pdfRenderer
  ) throws IOException {
    PDRectangle bbox = document.getPage(index).getBBox();
    float area = bbox.getWidth() * bbox.getHeight();
    int dpi;
    if (area < SMALL_PAGE_MAX_AREA) {
      dpi = SMALL_PAGE_DPI;
    } else if (area < MEDIUM_PAGE_MAX_AREA) {
      dpi = MEDIUM_PAGE_DPI;
    } else {
      dpi = LARGE_PAGE_DPI;
    }
    return pdfRenderer.renderImageWithDPI(index, dpi, ImageType.RGB);
  }

  /**
   * Serialize a document to a byte array.
   *
   * <p>The document is not closed by this method; that remains the caller's responsibility.
   *
   * @param document The document to save.
   * @return The bytes of the saved PDF.
   * @throws IOException If the document cannot be saved.
   */
  public static byte[] documentToBytes(PDDocument document) throws IOException {
    ByteArrayOutputStream outputStream = new ByteArrayOutputStream();
    document.save(outputStream);
    return outputStream.toByteArray();
  }

  /**
   * Extract the text of one page of {@code inputDoc} and draw it onto {@code contentStream},
   * keeping each run's font, font size, fill colour and position.
   *
   * <p>If a run cannot be shown with its original font, it is shown with Helvetica instead.
   *
   * @param inputDoc          The document to read text from.
   * @param contentStream     The content stream to draw the text on.
   * @param pageIndex         Zero-based index of the page to read.
   * @param disableSourceText When {@code true}, nothing is extracted or drawn.
   * @throws IOException If the text cannot be extracted or written.
   */
  public static void extractAndAddText(
      PDDocument inputDoc, PDPageContentStream contentStream,
      int pageIndex, boolean disableSourceText
  ) throws IOException {
    if (disableSourceText) {
      return;
    }

    PDFTextStripper stripper = new PDFTextStripper() {
      @Override
      protected void writeString(String text, List<TextPosition> textPositions) throws IOException {
        if (textPositions.isEmpty()) {
          return;
        }

        // The first glyph of the run supplies the font, size and position for the whole run.
        TextPosition firstPosition = textPositions.get(0);
        float fontSize = firstPosition.getFontSizeInPt();
        PDColor color = getGraphicsState().getNonStrokingColor();
        contentStream.beginText();
        contentStream.setFont(firstPosition.getFont(), fontSize);
        contentStream.setNonStrokingColor(convertToAwtColor(color));

        float x = firstPosition.getXDirAdj();
        // The adjusted Y is subtracted from the page height to obtain the drawing offset.
        float y = firstPosition.getPageHeight() - firstPosition.getYDirAdj();

        contentStream.newLineAtOffset(x, y);
        try {
          contentStream.showText(text);
        } catch (IllegalArgumentException | UnsupportedOperationException e) {
          // The source font could not show this text: retry with a standard font.
          contentStream.setFont(new PDType1Font(Standard14Fonts.FontName.HELVETICA), fontSize);
          contentStream.showText(text);
        }
        contentStream.endText();
      }
    };

    // The stripper uses 1-based page numbers; restrict it to the single requested page.
    stripper.setStartPage(pageIndex + 1);
    stripper.setEndPage(pageIndex + 1);
    stripper.getText(inputDoc);
  }

  /**
   * Convert a PDF colour to an AWT colour based on its number of components:
   * 1 is treated as grayscale, 3 as RGB, 4 as CMYK (simplified conversion).
   * Any other component count yields black.
   *
   * <p>Every channel is clamped to [0, 1] because {@link Color#Color(float, float, float)}
   * throws {@link IllegalArgumentException} for values outside that range.
   *
   * @param pdColor The PDF colour.
   * @return The corresponding AWT colour.
   */
  private static Color convertToAwtColor(PDColor pdColor) {
    float[] components = pdColor.getComponents();
    if (components.length == 1) {
      // Grayscale
      float gray = clampToUnitRange(components[0]);
      return new Color(gray, gray, gray);
    } else if (components.length == 3) {
      // RGB
      return new Color(
          clampToUnitRange(components[0]),
          clampToUnitRange(components[1]),
          clampToUnitRange(components[2])
      );
    } else if (components.length == 4) {
      // CMYK (simplified conversion)
      float c = components[0];
      float m = components[1];
      float y = components[2];
      float k = components[3];
      float r = clampToUnitRange(1 - Math.min(1, c + k));
      float g = clampToUnitRange(1 - Math.min(1, m + k));
      float b = clampToUnitRange(1 - Math.min(1, y + k));
      return new Color(r, g, b);
    }
    return Color.BLACK;
  }

  /** Restrict a colour channel to the [0, 1] range accepted by the AWT colour constructor. */
  private static float clampToUnitRange(float value) {
    return Math.max(0f, Math.min(1f, value));
  }

  /**
   * Draw an image at the origin of the page, stretched to the given page size.
   *
   * @param contentStream The content stream of the page to draw on.
   * @param pdImage       The image to draw.
   * @param pageSize      The rectangle whose width and height the image is scaled to.
   * @throws IOException If the image cannot be written to the content stream.
   */
  public static void addImageToPage(
      PDPageContentStream contentStream, PDImageXObject pdImage,
      PDRectangle pageSize
  ) throws IOException {
    contentStream.drawImage(pdImage, 0, 0, pageSize.getWidth(), pageSize.getHeight());
  }
}