Reading text content and images from DOC, DOCX, PDF, PPT, and PPTX files in Java

A few preliminary remarks:

Friendly reminder: there are many ways to parse these common file types. The implementation below uses Apache POI and Apache PDFBox.

I searched the Internet for a long time for document-parsing code, but what I found was either too complicated or not suitable, and most of it only handled text. In the end I had no choice but to copy and paste pieces from various sources bit by bit until I had assembled the code below. I can't claim it is polished, but it should be enough to solve an urgent need. There are still many open issues with DOC and PDF documents in particular; corrections and optimizations in the comments below are very welcome.

Now for the main content:

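First, add the following Maven dependencies. (Note: the DOCX/PPTX classes used below, such as `XWPFDocument` and `XMLSlideShow`, ship in the `poi-ooxml` artifact, and the URL download uses Apache HttpClient, so add those as well if they are not already on your classpath.)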

 <dependency>
<groupId>org.apache.poi</groupId>
<artifactId>poi</artifactId>
<version>4.1.0</version>
</dependency>
        <dependency>
<groupId>org.apache.poi</groupId>
<artifactId>poi-scratchpad</artifactId>
<version>4.1.0</version>
</dependency>
        <dependency>
<groupId>org.apache.pdfbox</groupId>
<artifactId>pdfbox</artifactId>
<version>2.0.22</version>
</dependency>

If you want to test it, here is a sample document URL you can use (note that this online document does not contain any pictures):

public static void main(String[] args) throws IOException {
        String document = processDocumentFromFilePath("E:\VPN System User Manual.pptx", "E:\Temporary Picture");
        System.out.println(document);
        String documentFromUrl = processDocumentFromUrl("http://api.idocv.com/data/doc/manual.docx", "E:\Temporary picture");
        System.out.println(documentFromUrl);
    }

And now, the full code:

import com.alibaba.dubbo.common.utils.CollectionUtils;
import org.apache.http.HttpResponse;
import org.apache.http.client.HttpClient;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.text.PDFTextStripper;
import org.apache.poi.hslf.usermodel.*;
import org.apache.poi.hwpf.HWPFDocument;
import org.apache.poi.hwpf.extractor.WordExtractor;
import org.apache.poi.hwpf.usermodel.Picture;
import org.apache.poi.hwpf.usermodel.Range;
import org.apache.poi.sl.usermodel.TextParagraph;
import org.apache.poi.xslf.usermodel.*;
import org.apache.poi.xwpf.usermodel.*;

import java.io.*;
import java.util.Date;
import java.util.List;
import java.util.Locale;
import java.util.concurrent.atomic.AtomicLong;
import java.util.stream.Collectors;

public class FileProcessorUtils {

   
    /***
     * This method is for local files
     * Extract file information and return content
     * @param filePath file storage address
     * @param imgRoot image storage address
     * @return
     */
    public static String processDocumentFromFilePath(String filePath,String imgRoot) throws IOException {
        File file = new File(filePath);
        FileInputStream fileInputStream = new FileInputStream(file);

        // Call the appropriate processing method based on the file type
        switch (fileTypeName(filePath)) {
            case "doc":
                return processWordDocDocumentFromStream(fileInputStream,imgRoot);
            case "docx":
                return processWordDocxDocumentFromStream(fileInputStream,imgRoot);
            case "pdf":
                return processPdfDocumentFromStream(fileInputStream,imgRoot);
            case "ppt":
                return processPptDocumentFromStream(fileInputStream,imgRoot);
            case "pptx":
                return processPptxDocumentFromStream(fileInputStream,imgRoot);
            default:
                throw new RuntimeException("Unsupported file format, file parsing currently only supports (DOC/DOCX/PDF/PPT/PPTX)");
        }
    }


    /***
     * This method is for network files
     * Extract file information and return content
     * @param downloadUrl file download link
     * @param imgRoot image storage address
     * @return
     */
    public static String processDocumentFromUrl(String downloadUrl,String imgRoot) throws IOException {

        HttpClient httpClient = HttpClients.createDefault();
        HttpGet httpGet = new HttpGet(downloadUrl);
        HttpResponse response = httpClient.execute(httpGet);

        //Get file type
        // TODO: 2023/9/14 Not all download links here have suffix information. If you want to improve the robustness of the code, you can modify the code here to obtain the file type.
        String typeName = fileTypeName(downloadUrl);
        // Call the appropriate processing method based on the file type
        switch (typeName) {
            case "doc":
                return processWordDocDocumentFromStream(response.getEntity().getContent(),imgRoot);
            case "docx":
                return processWordDocxDocumentFromStream(response.getEntity().getContent(),imgRoot);
            case "pdf":
                return processPdfDocumentFromStream(response.getEntity().getContent(),imgRoot);
            case "ppt":
                return processPptDocumentFromStream(response.getEntity().getContent(),imgRoot);
            case "pptx":
                return processPptxDocumentFromStream(response.getEntity().getContent(),imgRoot);
            default:
                throw new RuntimeException("Unsupported file format, file parsing currently only supports (DOC/DOCX/PDF/PPT/PPTX)");
        }
    }

    /***
     * word (doc) file processing
     * @param inputStream(file stream)
     * @return
     */
    private static String processWordDocDocumentFromStream(InputStream inputStream,String imageRoot) throws IOException {
        HWPFDocument document = new HWPFDocument(inputStream);
        StringBuilder htmlText = new StringBuilder();
        WordExtractor extractor = new WordExtractor(document);

        try {
            String[] paragraphs = extractor.getParagraphText();
            for (int paragraphIndex = 0; paragraphIndex < paragraphs.length; paragraphIndex + + ) {
                String paragraphText = paragraphs[paragraphIndex];
                //Get text alignment
                String justification = getJustification(document.getRange().getParagraph(paragraphIndex).getJustification());
                //Add other HTML tags as needed
                htmlText.append("<p style='text-align:").append(justification).append("'><span>").append(paragraphText).append("</ span>").append("</p>");
            }

            //Extract image
            List<Picture> pictures = document.getPicturesTable().getAllPictures();
            for (int i = 0; i < pictures.size(); i + + ) {
                Picture picture = pictures.get(i);
                byte[] pictureData = picture.getContent();
                String newFileName = new Date().getTime() + i + "_image." + picture.suggestFileExtension(); // You can change the extension name as needed, and the suggestFileExtension() method automatically obtains the appropriate picture type
                String imgPath = saveImageToFile(pictureData, newFileName, imageRoot);
                htmlText.append("<p><img alt='' src='").append(imgPath).append("'></p>");
            }
        } finally {
            extractor.close();
            document.close();
        }
        return htmlText.toString();
    }


    /***
     * word (docx) file processing
     * @param inputStream(file stream)
     * @return
     */
    private static String processWordDocxDocumentFromStream(InputStream inputStream,String imageRoot) throws IOException {
        //Get file content
        XWPFDocument document = new XWPFDocument(inputStream);
        StringBuilder htmlText = new StringBuilder();
        try {
            //Get all elements
            List<XWPFParagraph> paragraphs = document.getParagraphs();
            //Append based on element type
            for (XWPFParagraph paragraph : paragraphs) {
                //Get text alignment
                ParagraphAlignment alignment = paragraph.getAlignment();
                htmlText.append("<p style='text-align:").append(alignment).append("'>");

                List<XWPFRun> runs = paragraph.getRuns();
                for (XWPFRun run : runs) {
                    // Process font size, style and other information
                    String fontSize = run.getFontSize() + "pt";
                    String fontFamily = run.getFontFamily();
                    //Add style information to HTML
                    htmlText.append("<span style='font-size:" + fontSize + "; font-family:" + fontFamily + ";'>" + run.text() + " </span>");
                }
                htmlText.append("</p>");

                // Check whether there is an image in the current line paragraph
                List<XWPFPicture> pictures = paragraph.getRuns().stream()
                        .flatMap(run -> run.getEmbeddedPictures().stream())
                        .collect(Collectors.toList());
                if(CollectionUtils.isNotEmpty(pictures)){
                    if(pictures.size()>0){
                        pictures.forEach( bean ->{
                            XWPFPictureData pictureData = bean.getPictureData();
                            String newFileName = new Date().getTime() + "_image." + pictureData.suggestFileExtension();
                            String imgPath = null;
                            try {
                                imgPath = saveImageToFile(pictureData.getData(), newFileName, imageRoot);
                            } catch (IOException e) {
                                throw new RuntimeException(e);
                            }
                            htmlText.append("<p style='text-align:center'><img src='").append(imgPath).append("'></p>");
                        });
                    }
                }
            }
        } finally {
            document.close();
        }
        return htmlText.toString();
    }

    /***
     * PDF file processing
     * @param inputStream(file stream)
     * @return
     */
    private static String processPdfDocumentFromStream(InputStream inputStream,String imageRoot) throws IOException {
        PDDocument pdfDocument = PDDocument.load(inputStream);
        PDFTextStripper textStripper = new PDFTextStripper();

        StringBuilder htmlText = new StringBuilder();

        String[] lines = textStripper.getText(pdfDocument).split("\\
");

        for (String line : lines) {
            htmlText.append("<p style='text-align:left'>").append(line).append("</p>");
        }

        pdfDocument.close();

        return htmlText.toString();
    }

    /**
     * Process PPT (.ppt) files
     * @param inputStream (file stream)
     * @return
     * @throwsIOException
     */
    private static String processPptDocumentFromStream(InputStream inputStream,String imageRoot) throws IOException {
        HSLFSlideShow ppt = new HSLFSlideShow(inputStream);
        StringBuilder pptText = new StringBuilder();
        try {
            //Extract text content
            for (HSLFSlide slide : ppt.getSlides()) {
                for (HSLFShape shape : slide.getShapes()) {
                    //If it is text processing text
                    if (shape instanceof HSLFTextShape) {
                        HSLFTextShape textShape = (HSLFTextShape) shape;
                        for (HSLFTextParagraph paragraph : textShape.getTextParagraphs()) {
                            //Get text alignment
                            TextParagraph.TextAlign textAlign = paragraph.getTextAlign();
                            pptText.append("<p style='text-align:").append(textAlign).append("'>");
                            for (HSLFTextRun run : paragraph.getTextRuns()) {
                                // Process font size, font style and other information
                                String fontSize = run.getFontSize() + "pt";
                                String fontFamily = run.getFontFamily();
                                run.getRawText();
                                //Add style information to HTML
                                pptText.append("<span style='font-size:" + fontSize + "; font-family:" + fontFamily + ";'>" + run.getRawText() + " </span>");
                            }
                            pptText.append("</p>"); // Line break processing


                        }
                    }else if (shape instanceof HSLPictureShape) {
                        // If it is a picture, process the picture
                        HSLPictureShape pictureShape = (HSLFPictureShape) shape;
                        HSLPictureData pictureData = pictureShape.getPictureData();
                        String contentType = pictureData.getContentType();
                        String newFileName = new Date().getTime() + "_image." + imageTypeName(contentType);
                        String imgPath = saveImageToFile(pictureData.getData(), newFileName, imageRoot);
                        pptText.append("<p style='text-align:center'><img src='").append(imgPath).append("'></p>");
                    }
                }
            }
        } finally {
            ppt.close();
        }

        return pptText.toString();
    }

    /**
     * Process PPTX (.pptx) files
     * @param inputStream (file stream)
     * @return
     * @throwsIOException
     */
    private static String processPptxDocumentFromStream(InputStream inputStream,String imageRoot) throws IOException {
        XMLSlideShow pptx = new XMLSlideShow(inputStream);
        StringBuilder pptxText = new StringBuilder();
        try {
            //Extract text content
            for (XSLFSlide slide : pptx.getSlides()) {
                for (XSLFShape shape : slide.getShapes()) {
                    if (shape instanceof XSLFTextShape) {
                        XSLFTextShape textShape = (XSLFTextShape) shape;
                        for (XSLFTextParagraph paragraph : textShape.getTextParagraphs()) {
                            //Get text alignment
                            TextParagraph.TextAlign textAlign = paragraph.getTextAlign();
                            pptxText.append("<p style='text-align:").append(textAlign).append("'>");
                            for (XSLFTextRun run : paragraph.getTextRuns()) {
                                // Process font size, font style and other information
                                String fontSize = run.getFontSize() + "pt";
                                String fontFamily = run.getFontFamily();

                                //Add style information to HTML
                                pptxText.append("<span style='font-size:" + fontSize + "; font-family:" + fontFamily + ";'>" + run.getRawText() + " </span>");
                            }
                            pptxText.append("</p>"); // Line break processing
                        }
                    }else if (shape instanceof XSLFPictureShape) {
                        // If it is a picture, process the picture
                        XSLFPictureShape pictureShape = (XSLFPictureShape) shape;
                        XSLFPictureData pictureData = pictureShape.getPictureData();
                        String newFileName = new Date().getTime() + "_image." + pictureData.suggestFileExtension();
                        String imgPath = saveImageToFile(pictureData.getData(), newFileName, imageRoot);
                        pptxText.append("<p style='text-align:center'><img src='").append(imgPath).append("'></p>");
                    }
                }
            }
        } finally {
            pptx.close();
        }
        return pptxText.toString();
    }

    /**
     * Save the image to the specified location and return the reference address
     * @param imageData
     * @param imageRoot
     * @return
     * @throwsIOException
     */
    public static String saveImageToFile(byte[] imageData, String imageFileName, String imageRoot) throws IOException {
        String imagePath = imageRoot + File.separator + imageFileName;
        File file = new File(imageRoot);
        if(!file.exists()){
            file.mkdir();
        }
        try (FileOutputStream fos = new FileOutputStream(imagePath)) {
            fos.write(imageData);
        }
        return imagePath;
    }

    /**
     * Form processing
     * @param table
     * @return
     */
    private static String getTableHtmlText(XWPFTable table) {
        StringBuilder tableHtml = new StringBuilder("<table>");
        for (XWPFTableRow row : table.getRows()) {
            tableHtml.append("<tr>");
            for (XWPFTableCell cell : row.getTableCells()) {
                tableHtml.append("<td>").append(cell.getText()).append("</td>");
            }
            tableHtml.append("</tr>");
        }
        tableHtml.append("</table>");
        return tableHtml.toString();
    }

    /***
     * Get file suffix
     * @param filePath
     * @return
     */
    private static String fileTypeName(String filePath) {
        int dotIndex = filePath.lastIndexOf(".");
        if (dotIndex > 0) {
            return filePath.substring(dotIndex + 1).toLowerCase();
        }
        return "";
    }

    /***
     * Get image type
     * @param imagePath
     * @return
     */
    private static String imageTypeName(String imagePath) {
        int dotIndex = imagePath.lastIndexOf("/");
        if (dotIndex > 0) {
            return imagePath.substring(dotIndex + 1).toLowerCase();
        }
        return "";
    }

    /***
     *doc document obtains the current line alignment, which is left aligned by default
     * @param type
     * @return
     */
    private static String getJustification(Integer type) {
        switch (type) {
            case 0:
                return "left";
            case 1:
                return "center";
            case 2:
                return "right";
            default:
                return "left";
        }
    }
}