1. Add the Maven dependency
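<!-- NOTE: the utility class below imports org.apache.pdfbox.Loader, which is part of the
     PDFBox 3.x API and is not available in the 2.0.x line. Either use a 3.x release here or
     switch the code back to PDDocument.load(...). -->
<dependency>
    <groupId>org.apache.pdfbox</groupId>
    <artifactId>pdfbox</artifactId>
    <version>2.0.25</version>
</dependency>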
2. Utility class code
package com.jiayou.peis.utils; //import com.itextpdf.text.pdf.PdfReader; //import com.itextpdf.text.pdf.parser.PdfTextExtractor; //import com.itextpdf.text.pdf.parser.SimpleTextExtractionStrategy; import com.google.common.collect.Lists; import com.jiayou.peis.entity.ImageObject; import org.apache.commons.io.FileUtils; import org.apache.pdfbox.Loader; import org.apache.pdfbox.cos.COSName; import org.apache.pdfbox.pdmodel.PDDocument; import org.apache.pdfbox.pdmodel.PDResources; import org.apache.pdfbox.pdmodel.common.PDStream; import org.apache.pdfbox.pdmodel.graphics.PDXObject; import org.apache.pdfbox.pdmodel.graphics.image.PDImage; import org.apache.pdfbox.pdmodel.graphics.image.PDImageXObject; import org.apache.pdfbox.text.PDFTextStripper; import javax.imageio.ImageIO; import java.awt.image.BufferedImage; import java.io.*; import java.util.ArrayList; import java.util.List; /** * PDF processing * * @author Bob Ren (Copyright ? 2015-2029 Guizhou Jiayou Online Network Co., Ltd.) * @version 1.0.0 * @date 2022-02-07 16:21 */ public class PdfUtils {<!-- --> // /** // * Use itextpdf to extract PDF text (parsing is unreliable) // * // * @param inputStream // * @return // * @throwsIOException // */ // @Deprecated // public static String toText(InputStream inputStream) throws IOException {<!-- --> // try {<!-- --> // StringBuilder buf = new StringBuilder(); // PdfReader reader = new PdfReader(inputStream); // int pageNum = reader.getNumberOfPages(); // for(int i=1;i<=pageNum;i + + ){<!-- --> // // Read the document content of page i // buf.append(PdfTextExtractor.getTextFromPage(reader, i, new SimpleTextExtractionStrategy())); // } return buf.toString(); // return StrUtils.removeReturnChar(buf.toString()); // } finally {<!-- --> // CloseUtils.closeQuietly(inputStream); // } // } public static String text(byte[] data) throws IOException {<!-- --> return PdfUtils.text(data, true); } public static String text(byte[] data, boolean sortByPosition) throws IOException {<!-- --> 
ByteArrayInputStream inputStream = new ByteArrayInputStream(data); return PdfUtils.text(inputStream, sortByPosition); } /** * Use pdfbox to extract PDF text (parsing is normal and can be used) * * @param file * @return * @throwsIOException */ public static String text(File file, boolean sortByPosition) throws IOException {<!-- --> InputStream inputStream = new FileInputStream(file); return PdfUtils.text(inputStream, sortByPosition); } public static String text(File file) throws IOException {<!-- --> return PdfUtils.text(file, true); } public static String text(InputStream inputStream) throws IOException {<!-- --> return text(inputStream, true); } /** * Use pdfbox to extract PDF text (parsing is normal and can be used) * * @param inputStream * @return * @throwsIOException */ public static String text(InputStream inputStream, boolean sortByPosition) throws IOException {<!-- --> PDDocument document = null; try {<!-- --> // document = PDDocument.load(inputStream); document = Loader.loadPDF(inputStream); PDFTextStripper textStripper = new PDFTextStripper(); // Get total page count of the PDF document int numberOfPages = document.getNumberOfPages(); //set the first page to be extracted textStripper.setStartPage(1); // set the last page to be extracted textStripper.setEndPage(numberOfPages); // Get text content textStripper.setSortByPosition(sortByPosition); textStripper.setShouldSeparateByBeads(true); return StrUtils.removeReturnChar(textStripper.getText(document)); } finally {<!-- --> CloseUtils.closeQuietly(document, inputStream); } } /** * Use pdfbox to extract PDF text (parsing is normal and can be used) * * @param file * @return * @throwsIOException */ public static List<ImageObject> images(File file) throws IOException {<!-- --> InputStream inputStream = new FileInputStream(file); return PdfUtils.images(inputStream); } public static List<ImageObject> images(byte[] data) throws IOException {<!-- --> ByteArrayInputStream inputStream = null; try {<!-- --> inputStream 
= new ByteArrayInputStream(data); return PdfUtils.images(inputStream); } finally {<!-- --> CloseUtils.closeQuietly(inputStream); } } /** * Use pdfbox to extract PDF image list * * @param inputStream * @return * @throwsIOException */ public static List<ImageObject> images(InputStream inputStream) throws IOException {<!-- --> List<ImageObject> imageList = Lists.newArrayList(); PDDocument document = null; try {<!-- --> // document = PDDocument.load(inputStream); document = Loader.loadPDF(inputStream); // get resources for a page PDResources pdResources = document.getPage(0).getResources(); int i = 0; for (COSName csName : pdResources.getXObjectNames()) {<!-- --> // System.out.println(i + ":" + csName); PDXObject pdxObject = pdResources.getXObject(csName); if (pdxObject instanceof PDImageXObject) {<!-- --> // i + + ; PDStream pdStream = pdxObject.getStream(); PDImageXObject image = new PDImageXObject(pdStream, pdResources); String imageSuffix = imageSuffix(image); // image storage location and image name BufferedImage bufferedImage = image.getImage(); ImageObject object = new ImageObject(); object.setIndex(i + + ); object.setImage(bufferedImage); object.setSuffix(imageSuffix); imageList.add(object); } } } finally {<!-- --> CloseUtils.closeQuietly(document, inputStream); } return imageList; } /** * Get image suffix * * @param pdImage * @return * @throwsIOException */ private static String imageSuffix(PDImageXObject pdImage) throws IOException {<!-- --> String suffix = pdImage.getSuffix(); if (suffix == null || "jb2".equals(suffix)) {<!-- --> suffix = "png"; } else if ("jpx".equals(suffix)) {<!-- --> // use jp2 suffix for file because jpx not known by windows suffix = "jp2"; } if (hasMasks(pdImage)) {<!-- --> // TIKA-3040, PDFBOX-4771: can't save ARGB as JPEG suffix = "png"; } return suffix; } private static boolean hasMasks(PDImage pdImage) throws IOException {<!-- --> if (pdImage instanceof PDImageXObject) {<!-- --> PDImageXObject ximg = (PDImageXObject) pdImage; 
return ximg.getMask() != null || ximg.getSoftMask() != null; } return false; } /** * Save pictures to the specified folder * * @param imageList * @param dir * @param prefixName * @throwsIOException */ public static void saveImage(List<ImageObject> imageList, String dir, String prefixName) throws IOException {<!-- --> File imgDir = new File(dir); FileUtils.forceMkdir(imgDir); for(ImageObject image:imageList){<!-- --> File imgFile = new File(dir, prefixName + "_" + image.getIndex() + "." + image.getSuffix()); ImageIO.write(image.getImage(), image.getSuffix(), imgFile); } } }
3. Alternative approach (method 2)
https://blog.csdn.net/ThinkPet/article/details/131256428