Reading PDF file content in Java

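1. Add the Maven dependency

Note: the utility class below imports `org.apache.pdfbox.Loader`, which is part of the PDFBox 3.x API. With the 2.0.x artifact shown here, load documents with `PDDocument.load(...)` instead, or switch the dependency to a 3.x release that provides the `Loader` overload you need.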

<dependency>
    <groupId>org.apache.pdfbox</groupId>
    <artifactId>pdfbox</artifactId>
    <version>2.0.25</version>
</dependency>

2. Utility class code

package com.jiayou.peis.utils;

//import com.itextpdf.text.pdf.PdfReader;
//import com.itextpdf.text.pdf.parser.PdfTextExtractor;
//import com.itextpdf.text.pdf.parser.SimpleTextExtractionStrategy;

import com.google.common.collect.Lists;
import com.jiayou.peis.entity.ImageObject;
import org.apache.commons.io.FileUtils;
import org.apache.pdfbox.Loader;
import org.apache.pdfbox.cos.COSName;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDResources;
import org.apache.pdfbox.pdmodel.common.PDStream;
import org.apache.pdfbox.pdmodel.graphics.PDXObject;
import org.apache.pdfbox.pdmodel.graphics.image.PDImage;
import org.apache.pdfbox.pdmodel.graphics.image.PDImageXObject;
import org.apache.pdfbox.text.PDFTextStripper;

import javax.imageio.ImageIO;
import java.awt.image.BufferedImage;
import java.io.*;
import java.util.ArrayList;
import java.util.List;

/**
 * PDF processing
 *
 * @author Bob Ren (Copyright ? 2015-2029 Guizhou Jiayou Online Network Co., Ltd.)
 * @version 1.0.0
 * @date 2022-02-07 16:21
 */
public class PdfUtils {<!-- -->
    // /**
// * Use itextpdf to extract PDF text (parsing is unreliable)
// *
// * @param inputStream
// * @return
// * @throwsIOException
// */
// @Deprecated
// public static String toText(InputStream inputStream) throws IOException {<!-- -->
// try {<!-- -->
// StringBuilder buf = new StringBuilder();
// PdfReader reader = new PdfReader(inputStream);
// int pageNum = reader.getNumberOfPages();
// for(int i=1;i<=pageNum;i + + ){<!-- -->
// // Read the document content of page i
// buf.append(PdfTextExtractor.getTextFromPage(reader, i, new SimpleTextExtractionStrategy()));
// }
            return buf.toString();
// return StrUtils.removeReturnChar(buf.toString());
// } finally {<!-- -->
// CloseUtils.closeQuietly(inputStream);
// }
// }
    public static String text(byte[] data) throws IOException {<!-- -->
        return PdfUtils.text(data, true);
    }
    public static String text(byte[] data, boolean sortByPosition) throws IOException {<!-- -->
        ByteArrayInputStream inputStream = new ByteArrayInputStream(data);
        return PdfUtils.text(inputStream, sortByPosition);
    }
    /**
     * Use pdfbox to extract PDF text (parsing is normal and can be used)
     *
     * @param file
     * @return
     * @throwsIOException
     */
    public static String text(File file, boolean sortByPosition) throws IOException {<!-- -->
        InputStream inputStream = new FileInputStream(file);
        return PdfUtils.text(inputStream, sortByPosition);
    }
    public static String text(File file) throws IOException {<!-- -->
        return PdfUtils.text(file, true);
    }
    public static String text(InputStream inputStream) throws IOException {<!-- -->
        return text(inputStream, true);
    }
    /**
     * Use pdfbox to extract PDF text (parsing is normal and can be used)
     *
     * @param inputStream
     * @return
     * @throwsIOException
     */
    public static String text(InputStream inputStream, boolean sortByPosition) throws IOException {<!-- -->
        PDDocument document = null;
        try {<!-- -->
// document = PDDocument.load(inputStream);
            document = Loader.loadPDF(inputStream);
            PDFTextStripper textStripper = new PDFTextStripper();
            // Get total page count of the PDF document
            int numberOfPages = document.getNumberOfPages();
            //set the first page to be extracted
            textStripper.setStartPage(1);
            // set the last page to be extracted
            textStripper.setEndPage(numberOfPages);
            // Get text content
            textStripper.setSortByPosition(sortByPosition);
            textStripper.setShouldSeparateByBeads(true);
            return StrUtils.removeReturnChar(textStripper.getText(document));
        } finally {<!-- -->
            CloseUtils.closeQuietly(document, inputStream);
        }
    }

    /**
     * Use pdfbox to extract PDF text (parsing is normal and can be used)
     *
     * @param file
     * @return
     * @throwsIOException
     */
    public static List<ImageObject> images(File file) throws IOException {<!-- -->
        InputStream inputStream = new FileInputStream(file);
        return PdfUtils.images(inputStream);
    }

    public static List<ImageObject> images(byte[] data) throws IOException {<!-- -->
        ByteArrayInputStream inputStream = null;
        try {<!-- -->
            inputStream = new ByteArrayInputStream(data);
            return PdfUtils.images(inputStream);
        } finally {<!-- -->
            CloseUtils.closeQuietly(inputStream);
        }
    }

    /**
     * Use pdfbox to extract PDF image list
     *
     * @param inputStream
     * @return
     * @throwsIOException
     */
    public static List<ImageObject> images(InputStream inputStream) throws IOException {<!-- -->
        List<ImageObject> imageList = Lists.newArrayList();
        PDDocument document = null;
        try {<!-- -->
// document = PDDocument.load(inputStream);
            document = Loader.loadPDF(inputStream);
            // get resources for a page
            PDResources pdResources = document.getPage(0).getResources();
            int i = 0;
            for (COSName csName : pdResources.getXObjectNames()) {<!-- -->
// System.out.println(i + ":" + csName);
                PDXObject pdxObject = pdResources.getXObject(csName);
                if (pdxObject instanceof PDImageXObject) {<!-- -->
// i + + ;
                    PDStream pdStream = pdxObject.getStream();
                    PDImageXObject image = new PDImageXObject(pdStream, pdResources);
                    String imageSuffix = imageSuffix(image);
                    // image storage location and image name
                    BufferedImage bufferedImage = image.getImage();
                    ImageObject object = new ImageObject();
                    object.setIndex(i + + );
                    object.setImage(bufferedImage);
                    object.setSuffix(imageSuffix);
                    imageList.add(object);
                }
            }
        } finally {<!-- -->
            CloseUtils.closeQuietly(document, inputStream);
        }
        return imageList;
    }

    /**
     * Get image suffix
     *
     * @param pdImage
     * @return
     * @throwsIOException
     */
    private static String imageSuffix(PDImageXObject pdImage) throws IOException {<!-- -->
        String suffix = pdImage.getSuffix();
        if (suffix == null || "jb2".equals(suffix)) {<!-- -->
            suffix = "png";
        } else if ("jpx".equals(suffix)) {<!-- -->
            // use jp2 suffix for file because jpx not known by windows
            suffix = "jp2";
        }

        if (hasMasks(pdImage)) {<!-- -->
            // TIKA-3040, PDFBOX-4771: can't save ARGB as JPEG
            suffix = "png";
        }
        return suffix;
    }

    private static boolean hasMasks(PDImage pdImage) throws IOException {<!-- -->
        if (pdImage instanceof PDImageXObject) {<!-- -->
            PDImageXObject ximg = (PDImageXObject) pdImage;
            return ximg.getMask() != null || ximg.getSoftMask() != null;
        }
        return false;
    }

    /**
     * Save pictures to the specified folder
     *
     * @param imageList
     * @param dir
     * @param prefixName
     * @throwsIOException
     */
    public static void saveImage(List<ImageObject> imageList, String dir, String prefixName) throws IOException {<!-- -->
        File imgDir = new File(dir);
        FileUtils.forceMkdir(imgDir);
        for(ImageObject image:imageList){<!-- -->
            File imgFile = new File(dir, prefixName + "_" + image.getIndex() + "." + image.getSuffix());
            ImageIO.write(image.getImage(), image.getSuffix(), imgFile);
        }
    }
}

3. Alternative method (see the linked article)

https://blog.csdn.net/ThinkPet/article/details/131256428