Convert Word Documents to PDF, PDF to Image with Apache POI

Use Apache POI to fill a Word (.docx) template, documents4j to convert the Word document to PDF, and Apache PDFBox to render the PDF pages as images.

I found a lot of examples online, but most of them were overly complicated or simply did not work. It took me a day to sort everything out, so without further ado, here is the code.

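Required dependencies (note: the `pdfbox` artifact is declared twice below with different versions; the tool class imports `org.apache.pdfbox.Loader`, which only exists in PDFBox 3.x, so the 3.0.0-alpha2 declaration is the one that must be in effect)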

 <dependency>
<groupId>org.apache.pdfbox</groupId>
<artifactId>pdfbox</artifactId>
<version>2.0.27</version>
</dependency>
<dependency>
<groupId>org.apache.pdfbox</groupId>
<artifactId>pdfbox-tools</artifactId>
<version>2.0.27</version>
</dependency>

<!-- fill word-->
<dependency>
<groupId>org.apache.poi</groupId>
<artifactId>poi</artifactId>
<version>3.17</version>
</dependency>

<dependency>
<groupId>org.apache.poi</groupId>
<artifactId>poi-ooxml-schemas</artifactId>
<version>3.17</version>
</dependency>

<!--word to pdf, fill word-->
<dependency>
<groupId>org.apache.poi</groupId>
<artifactId>poi-ooxml</artifactId>
<version>3.17</version>
</dependency>

<!--word to pdf-->
<dependency>
<groupId>fr.opensagres.xdocreport</groupId>
<artifactId>fr.opensagres.poi.xwpf.converter.pdf-gae</artifactId>
<version>2.0.1</version>
</dependency>

<dependency>
<groupId>com.documents4j</groupId>
<artifactId>documents4j-local</artifactId>
<version>1.0.3</version>
</dependency>
<dependency>
<groupId>com.documents4j</groupId>
<artifactId>documents4j-transformer-msoffice-word</artifactId>
<version>1.0.3</version>
</dependency>

<!--pdf to picture-->
<dependency>
<groupId>org.apache.pdfbox</groupId>
<artifactId>pdfbox</artifactId>
<version>3.0.0-alpha2</version>
</dependency>
<dependency>
<groupId>org.apache.pdfbox</groupId>
<artifactId>fontbox</artifactId>
<version>3.0.0-alpha2</version>
</dependency>

Utility class

package com.xxx.common.utils;
import java.awt.image.BufferedImage;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import com.documents4j.api.DocumentType;
import com.documents4j.api.IConverter;
import com.documents4j.job.LocalConverter;
import lombok.extern.slf4j.Slf4j;
import org.apache.pdfbox.Loader;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.rendering.PDFRenderer;
import org.apache.poi.xwpf.usermodel.*;
import javax.imageio.ImageIO;
import java.io.*;
import java.util.Date;
import java.util.Iterator;
import java.util.List;
import java.util.Map;



/**
 * @description: tool class
 */
@Slf4j
public class WordUtils {<!-- -->

    /**
     * @Author: RedRush
     * @Date: 2023/2/20 17:42
     * @param path template path
     * @param outPath output path
     * @param dict The collection of information that needs to be replaced
     * @Return: boolean
     * @description: Compile the text and tables in the template according to the dict
     */
    public static void compile(String path, String outPath, Map<String, Object> dict) throws Exception{<!-- -->
        FileInputStream is = new FileInputStream(path);
        XWPFDocument document = new XWPFDocument(is);
        if (dict != null) {<!-- -->
            // Replace text outside the table (text only)
            WordUtils.compileText(document, dict);
            // replace the text object in the table
            WordUtils.compileTable(document, dict);
        }
        File f = new File(outPath. substring(0, outPath. lastIndexOf(File. separator)));
        if(!f.exists()){<!-- -->
            f. mkdirs();
        }
        FileOutputStream out = new FileOutputStream(outPath);
        document. write(out);


    }

    /***
     * @Description : replace paragraph text
     * @param document docx parsing object
     * @param dict The collection of information that needs to be replaced
     * @return void
     * @Date 2022/11/17 17:22
     */
    public static void compileText(XWPFDocument document, Map<String, Object> dict) {<!-- -->
        // Get the collection of paragraphs
        Iterator<XWPFParagraph> iterator = document. getParagraphsIterator();
        XWPFParagraph paragraph = null;
        while (iterator.hasNext()) {<!-- -->
            paragraph = iterator. next();
            // Determine whether this paragraph needs to be replaced
            if (checkText(paragraph. getText())) {<!-- -->
                replaceValue(paragraph, dict);
            }
        }
    }

    /***
     * @Description : replace the text in the table
     * @param document
     * @param dict The collection of information that needs to be replaced
     * @return void
     * @Date 2022/11/18 11:29
     */
    public static void compileTable(XWPFDocument document, Map<String, Object> dict) {<!-- -->
        // Get the form of the file
        Iterator<XWPFTable> tableList = document. getTablesIterator();
        XWPFTable table;
        List<XWPFTableRow> rows;
        List<XWPFTableCell> cells;
        // Loop through all the text that needs to be replaced and replace it
        while (tableList.hasNext()) {<!-- -->
            table = tableList. next();
            if (checkText(table.getText())) {<!-- -->
                rows = table. getRows();
                // Traverse the table and replace the template
                for (XWPFTableRow row : rows) {<!-- -->
                    cells = row. getTableCells();
                    for (XWPFTableCell cell : cells) {<!-- -->
                        // Determine whether the cell needs to be replaced
                        if (checkText(cell. getText())) {<!-- -->
                            List<XWPFParagraph> paragraphs = cell. getParagraphs();
                            for (XWPFParagraph paragraph : paragraphs) {<!-- -->
                                replaceValue(paragraph, dict);
                            }
                        }
                    }
                }
            }
        }
    }

    /**
     * @Author: RedRush
     * @Date: 2023/2/20 17:31
     * @param paragraph word text
     * @param dict The collection of information that needs to be replaced
     * @description: replacement string
     */
    private static void replaceValue(XWPFParagraph paragraph, Map<String, Object> dict) {<!-- -->
        String nextLine;
        List<XWPFRun> runs = paragraph. getRuns();
        for (int i = 0; i < runs. size(); i ++ ) {<!-- -->
            // read current line
            String readLine = runs. get(i). text();
// System.out.println("readLine:" + readLine);
            // Skip if empty or does not contain the target string
            if(StringUtils.isEmpty(readLine) || !readLine.contains("$")) continue;
            // Initialize the result set
            StringBuffer sb = new StringBuffer();
            // Loop through the template string of the current row
            while (readLine. contains("$")){<!-- -->
                // Get the string on the left side of the template string
                int left;
                if(readLine. contains("${")){<!-- -->
                    left = readLine. indexOf("${");
                } else {<!-- -->
                    if(runs. size() < i + 1){<!-- -->
                        break;
                    }
                    nextLine = runs. get(i + 1). text();
                    if(!nextLine.startsWith("{")) break;
                    readLine += nextLine;
                    paragraph. removeRun(i + 1);
                    left = readLine. indexOf("${");
                }
                sb.append(readLine.substring(0, left));
                // Get the right side of the template string
                while(runs.size() >= i + 1 & amp; & amp; !readLine.contains("}")){<!-- -->
                    nextLine = runs. get(i + 1). text();
                    readLine += nextLine;
                    paragraph. removeRun(i + 1);
                }
                int right = readLine. indexOf("}");
                if(right == -1) break;
                // Replace the template string [if it does not exist in the dictionary, it will be replaced with an empty string]
                sb.append(dict.getOrDefault(readLine.substring(left, right + 1), ""));
                if(right + 1 < readLine. length()){<!-- -->
                    sb. append(readLine. substring(right + 1));
                }
                readLine = sb. toString();
            }
            runs.get(i).setText(sb.toString(), 0);
        }
    }


    /***
     * @Description : Check if the text contains the specified character (here "$")
     * @param text
     * @return boolean
     * @Date 2022/11/17 17:22
     */
    private static boolean checkText(String text) {<!-- -->
        return text. contains("$");
    }



    /**
     * Convert word to pdf through documents4j
     *
     * @param sourcePath source file address such as /root/example.doc
     * @param targetPath target file address such as /root/example.pdf
     */
    public static void documents4jWordToPdf(String sourcePath, String targetPath) {<!-- -->
        File inputWord = new File(sourcePath);
        File outputFile = new File(targetPath);
        try {<!-- -->
            InputStream docxInputStream = new FileInputStream(inputWord);
            OutputStream outputStream = new FileOutputStream(outputFile);
            IConverter converter = LocalConverter.builder().build();
            boolean execute = converter. convert(docxInputStream)
                    .as(DocumentType.DOCX)
                    .to(outputStream)
                    .as(DocumentType.PDF).schedule().get();
            outputStream. close();
            docxInputStream. close();

            log.info("Conversion completed targetPath = {}", outputFile.getAbsolutePath());
            System.out.println("Conversion completed targetPath = " + outputFile.getAbsolutePath());
            converter. shutDown();
            return;
        } catch (Exception e) {<!-- -->
            log.error("[documents4J] word to pdf failed: {}", e.toString());
        }
    }

    /**
     * Convert the generated Word document to PDF format
     * @param wordPath Word document path
     * @param pdfPath generated PDF path
     */
    public static void convertToPDF(String wordPath , String pdfPath ) {<!-- -->
        documents4jWordToPdf(wordPath, pdfPath);
    }


    /**
     * Convert PDF files to pictures
     * @param sourcePath PDF file address
     */
    public void execute(String sourcePath) {<!-- -->
        File file = new File(sourcePath);
        String path = file. getAbsolutePath();
        String targetPathNoExt = path.substring(0, path.lastIndexOf("."));
        try {<!-- -->
            PDDocument doc = Loader.loadPDF(file);
            PDFRenderer renderer = new PDFRenderer(doc);
            int pageCount = doc. getNumberOfPages();
            for (int i = 0; i < pageCount; i ++ ) {<!-- -->
// System.out.println("Current page" + (i + 1));
                BufferedImage image = renderer.renderImageWithDPI(i, 296);
                // BufferedImage image = renderer. renderImage(i, 2.5f);
                ImageIO.write(image, "PNG", new File(targetPathNoExt + "_" + i + ".png"));
            }
        } catch (IOException e) {<!-- -->
            e.printStackTrace();
        }
    }
}

Test Code

 /**
     * Fill word and generate pdf
     * @throws Exception
     */
    @Test
    public void fillTemplate() throws Exception {<!-- -->
    //Fill word needs to be defined in the document: ${name}
        Map<String, Object> strData = new HashMap<>();
        strData.put("${name}", "xxx"); //Customer name
        strData.put("${createDate}", DateUtil.format(DateUtil.date(), DatePattern.CHINESE_DATE_PATTERN));
        strData.put("${openingAccountBalance}", 121);//The opening account balance (yuan)
        strData.put("${rechargeAmount}", 121);//New recharge amount (yuan)
        strData.put("${totalConsumptionAmount}", 121);//Consumption during this account period (yuan)
        strData.put("${balanceAfterSettlement}", 121);//End account balance (yuan)
        String readPath = "C:\Users\gukt\Desktop\template.docx"; //word template
        String outPath = "C:\Users\gukt\Desktop\output.docx"; // output fill word path
        String outPdfPath = "C:\Users\gukt\Desktop\output.pdf"; //Enter word to pdf path
        WordUtils.compile(readPath, outPath, strData); //Fill in word, if you only want to convert word to pdf, you don’t need it here
        
       // conversion operation, can be used alone
        WordUtils.convertToPDF(outPath,outPdfPath); //word to pdf
        WordUtils.execute(outPdfPath); //pdf to picture png
    }