Two ways to convert Word documents to HTML in Java

The key points are summarized below.

Method 1: Use the Apache POI toolkit (version 4.1.2).

Disadvantages: Font styles are not reproduced accurately; WMF formula images are converted inaccurately; the implementation in this article only supports the .doc format.

Advantages: Conversion is relatively fast, and local debugging is convenient.

Method 2: Use LibreOffice (version 7.5).

Download address: "Download LibreOffice" | LibreOffice Simplified Chinese official website – free office suite

Example of installing LibreOffice on Linux: "Installing LibreOffice on CentOS 7 – LibreOffice installation tutorial" (the_bog's blog, CSDN)

Disadvantages: Conversion is relatively slow.

Advantages: Font styles are reproduced very precisely. This article only covers doc, docx, and similar inputs; for converting to PDF and other formats, look up the relevant commands online (e.g. on Baidu).

Without further ado, let's get straight to the code!

Method 1 code implementation:

Required Maven dependencies:

 <!-- Apache POI core library -->
 <dependency>
      <groupId>org.apache.poi</groupId>
      <artifactId>poi</artifactId>
      <version>4.1.2</version>
    </dependency>

    <!-- Supplies the org.apache.poi.hwpf classes (HWPFDocument, WordToHtmlConverter) imported by Word2003Util -->
    <dependency>
      <groupId>org.apache.poi</groupId>
      <artifactId>poi-scratchpad</artifactId>
      <version>4.1.2</version>
    </dependency>

    <!-- NOTE(review): OOXML (.docx) support; not referenced by the code shown in this article - confirm it is needed -->
    <dependency>
      <groupId>org.apache.poi</groupId>
      <artifactId>poi-ooxml</artifactId>
      <version>4.1.2</version>
    </dependency>
  <!-- jsoup: used to post-process the generated HTML (readHtml / formatHtml) -->
  <dependency>
      <groupId>org.jsoup</groupId>
      <artifactId>jsoup</artifactId>
      <version>1.9.2</version>
    </dependency>
    <!-- NOTE(review): presumably used by SvgToPngUtil (not shown) for SVG to PNG rendering - confirm -->
    <dependency>
      <groupId>org.apache.xmlgraphics</groupId>
      <artifactId>batik-codec</artifactId>
      <version>1.7</version>
    </dependency>
    <!-- NOTE(review): presumably used by SvgToPngUtil (not shown) for WMF to SVG conversion - confirm -->
    <dependency>
      <groupId>net.arnx</groupId>
      <artifactId>wmf2svg</artifactId>
      <version>0.9.5</version>
    </dependency>

package cn.hls.winner.winner_problem_manage.utils;

import org.apache.poi.hwpf.HWPFDocument;
import org.apache.poi.hwpf.converter.PicturesManager;
import org.apache.poi.hwpf.converter.WordToHtmlConverter;
import org.apache.poi.hwpf.usermodel.PictureType;
import org.apache.poi.util.IOUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Attributes;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.util.FileCopyUtils;
import org.springframework.web.multipart.MultipartFile;
import org.w3c.dom.Document;

import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.transform.OutputKeys;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.stream.StreamResult;
import java.io.*;
import java.util.ArrayList;
import java.util.List;
import java.util.UUID;

/**
 * @author lhz
 * @description TODO
 * @date 2023/9/18 10:14
 */
public class Word2003Util {

    private static final Logger logger = LoggerFactory.getLogger(Word2003Util.class);


    /**
     *
     * @param multipartFile uploaded file
     * @param htmlFile html upload path
     * @param htmlFileImgUrl html image upload path
     * @param wordFileUrl word upload path
     * @return
     */
    public static String word2003ToHtml(MultipartFile multipartFile, String htmlFile, String htmlFileImgUrl, String wordFileUrl) {
        //Need to determine whether the file is doc, docx
        if (multipartFile == null) {
            return "The word document uploaded is empty!";
        }
        if (multipartFile.getOriginalFilename().endsWith("docx")) {
            return "The word document format is wrong, please upload it in doc format!";
        }
        logger.info("***** word2003ToHtml start file:{}", multipartFile);
        //Return server proxy address
        String htmlUrl = "";
        //Randomly name html files
        String uuid = UUID.randomUUID().toString();
        String htmlFileName = uuid + "." + "html";
        logger.info("====Initialization====(htmlFileName){parameter} " + htmlFileName);
        try {
            //Upload the local image address of the server
            logger.info("==== htmlFile{parameter} ====" + htmlFile);
            //The image address forwarded by nginx
            logger.info("==== htmlFileImgUrl{parameter} ====" + htmlFileImgUrl);
            //Generate the folder address of the web page
            String htmlFileUrl = htmlFile + uuid + "/";
            logger.info("==== htmlFileUrl{parameter} ==== " + htmlFileUrl);
            //Upload files to the server
            boolean flag = upload(multipartFile, wordFileUrl, uuid);
            if (!flag) {
                return "Word document upload failed!";
            }
            logger.info("====== Word document uploaded successfully!====");
            //Get file name
            String name = multipartFile.getOriginalFilename();
            String suffix = name.substring(name.lastIndexOf("."));//.Suffix name
            String filePath = wordFileUrl + uuid + suffix;
            logger.info("==== filePath ====" + filePath);
            File file = new File(filePath);
            // 1) Load word document to generate HWPFDocument object
            InputStream inputStream = new FileInputStream(file);
            HWPFDocument wordDocument = new HWPFDocument(inputStream);
            WordToHtmlConverter wordToHtmlConverter =
                    new WordToHtmlConverter(DocumentBuilderFactory.newInstance().newDocumentBuilder().newDocument());
            //The map's address
            String fileImg = htmlFileUrl + "images/";
            File htmlFile1 = new File(htmlFileUrl);
            if (!htmlFile1.exists()) {
                //create
                if (htmlFile1.mkdirs()) {
                    logger.info("Create" + htmlFileUrl + "Success");
                } else {
                    logger.info("Create" + htmlFileUrl + "Success");
                }
            }
            //html proxy address
            htmlUrl = htmlFileImgUrl + uuid + "/" + htmlFileName;
            //html generation path
            htmlFileName = htmlFileUrl + htmlFileName;
            logger.info("==== htmlFileName{ html ======== output address} " + htmlFileName);
            //Set the location where the image is stored
            String finalFileImg = fileImg;
            final int[] index = {1};
            //Process image address
            wordToHtmlConverter.setPicturesManager(new PicturesManager() {
                public String savePicture(byte[] content, PictureType pictureType, String suggestedName, float widthInches, float heightInches) {
                    File imgPath = new File(finalFileImg);
                    if (!imgPath.exists()) {//Create the image directory if it does not exist
                        imgPath.mkdirs();
                    }
                    String extension = pictureType.getExtension();
                    //Randomly generate image names
                    suggestedName = finalFileImg + "image" + index[0] + "." + extension;
                    File file = new File(suggestedName);
                    OutputStream os = null;
                    try {
                        os = new FileOutputStream(file);
                        os.write(content);
                        os.close();
                        //Process wmf formula pictures
// if (extension.equals("wmf") || extension.equals("svg")) {
// if (extension.equals("wmf")) {
// String svgFile = suggestedName.substring(0,
// suggestedName.lastIndexOf(".wmf"))
// + ".svg";
// SvgToPngUtil.wmfToSvg(suggestedName, svgFile);
// }
// String suggestedNameSVG = suggestedName.substring(0, suggestedName.lastIndexOf(".")) + ".svg";
                            String s = SvgToPngUtil.readToString(suggestedNameSVG);
                            String suggestedNamePng = suggestedName.substring(0, suggestedName.lastIndexOf(".")) + ".png";
                            SvgToPngUtil.convertToPng(s, suggestedNamePng);
                            String s1 = SvgToPngUtil.GetImageStr(suggestedNameSVG);
// //Delete useless pictures
                            deleteFile(suggestedNameSVG, suggestedName);
// suggestedName = suggestedNameSVG;
// }
                    } catch (FileNotFoundException e) {
                        throw new RuntimeException(e);
                    } catch (IOException e) {
                        throw new RuntimeException(e);
                    }
                    //Here you can specify the path of the picture in the word document.
                    String imgUlr = suggestedName.replace(htmlFile, htmlFileImgUrl);
                    index[0] + + ;
                    return imgUlr;
                }
            });
            wordToHtmlConverter.processDocument(wordDocument);
            Document htmlDocument = wordToHtmlConverter.getDocument();
            OutputStream outputStream = new FileOutputStream(htmlFileName);
            DOMSource domSource = new DOMSource(htmlDocument);
            StreamResult streamResult = new StreamResult(outputStream);
            TransformerFactory factory = TransformerFactory.newInstance();
            Transformer serializer = factory.newTransformer();
            serializer.setOutputProperty(OutputKeys.ENCODING, "utf-8");
            serializer.setOutputProperty(OutputKeys.INDENT, "yes");
            serializer.setOutputProperty(OutputKeys.METHOD, "html");
            serializer.transform(domSource, streamResult);
            outputStream.close();
            logger.info("====== Web page style conversion begins ====");
            String htmlContents = readHtml(htmlFileName);
            FileCopyUtils.copy(htmlContents.getBytes("utf-8"), new File(htmlFileName));
            logger.info("====== Web page style conversion completed ====");
        } catch (Exception e) {
            logger.error("word2003ToHtml====Exception");
            logger.error(e.getMessage());
            throw new RuntimeException(e);
        }
        //
        return htmlUrl;
    }

    //Get web content
    public static String readHtml(String htmlFileName) throws Exception {
        StringBuilder htmlContents1 = new StringBuilder();
        String htmlContents = "";
        //Read image web content
        BufferedReader buf = new BufferedReader(
                new InputStreamReader(new FileInputStream(htmlFileName), "utf-8"));
        String c = "";
        while ((c = buf.readLine()) != null) {
            htmlContents1.append(c + "\
");
        }

        buf.close();
        htmlContents = htmlContents1.toString();
        htmlContents = htmlContents.replace("hyphenate:auto;font-family:Times New Roman;", "hyphenate:auto;font-family:宋体;").replace("vertical-align:text-bottom;", "vertical -align: middle;").replace("'","'").replace(" & amp;rsquo;","'");
        org.jsoup.nodes.Document document = Jsoup.parse(htmlContents);
        formatHtml(document);
        htmlContents = document.toString();
        return htmlContents;
    }

    //Web page font style
    public static void formatHtml(org.jsoup.nodes.Document document) {
        Elements elements = document.getAllElements();
        String title = document.title();
        logger.info("==== formatHtml ====title" + title);
        for (Element element : elements) {
            if ("main".equals(element.className())) {
                continue;
            }
            if (title.contains("Physics") || title.contains("Mathematics") || title.contains("Chemistry")) {
                if (element.hasClass("s1")) {
                    element.attr("style", "font-family:Times New Roman;" + element.attr("style"));
                }
            }
            String[] attrs = element.attr("style").split(";");
            List<String> attrList = new ArrayList();
            for (String attr : attrs) {
                if (attr.contains("font-family")) {
                    attrList.add(attr);
                }
            }
            //Remove the class attributes b1 b2 in the <body> tag
            Elements bodies = element.getElementsByTag("body");
            for(Element body : bodies){
                System.out.println("=======className:" + body.className() + "==========");
                if("b1 b2".equals(body.className())){
                    body.attr("class","");
                }
            }
        }
    }

    public static void deleteFile(String... imgUrl) {
        for (String s : imgUrl) {
            File file = new File(s);
            try {
                if (file.isFile()) {
                    // Delete Files
                    if (file.delete()) {
                        logger.info("The file was deleted successfully ==== The name is: " + file.getName());
                    } else {
                    }
                } else {
                }
            } catch (Exception e) {
                logger.error("====== Failed to delete picture ======" + e.getMessage());
                throw new RuntimeException();
            }
        }
    }


    /**
     * @param file file
     * @param htmlFile file upload address
     * @param fileName file name
     * @return
     */
    public static boolean upload(MultipartFile file, String htmlFile, String fileName) {
        InputStream is = null;
        OutputStream os = null;
        try {
            File file1 = new File(htmlFile);
            if (!file1.exists()) {
                file1.mkdirs();
            }
            String name = file.getOriginalFilename();
            String suffix = name.substring(name.lastIndexOf("."));//.Suffix name
            is = file.getInputStream();
            os = new FileOutputStream(htmlFile + fileName + suffix);
            //Data copy
            IOUtils.copy(is, os);
            logger.info("==== File writing successful!====");
        } catch (IOException e) {
            logger.error("====== File upload failed ====" + e.getMessage());
            return false;
        } finally {
            if (null != is) {
                try {
                    is.close();
                } catch (IOException e) {
                    throw new RuntimeException(e);
                }
            }
            if (null != os) {
                try {
                    os.close();
                } catch (IOException e) {
                    throw new RuntimeException(e);
                }
            }
        }
        return true;
    }
}

Method 2 code implementation:

package com.hls.poi.service;


import com.hls.poi.controller.WordToHtmlController;
import org.apache.poi.util.IOUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.web.multipart.MultipartFile;

import java.io.*;
import java.util.UUID;

public class LibreOfficeCommandWordService {

    private static final Logger logger = LoggerFactory.getLogger(WordToHtmlController.class);

    /**
     * /opt/libreoffice7.5/program/soffice --headless --invisible --convert-to pdf /opt/a/1.docx --outdir /opt/a/
     * /opt/a/1.docx after –convert-to pdf is the original file path
     * –outdir /opt/a/ (the directory where the converted files are stored)
     * <p>
     * soffice --headless --invisible --convert-to html:HTML ffc75d91-3594-451d-a55f-a941325bc380.doc --outdir mmm
     */

    //You need to find the actual directory where LibreOffice is installed based on the actual situation.
    //Mac is installed to /usr/local/bin by default.
    //CentOS is installed in /usr/bin by default
    private final static String sofficeDir = "/opt/libreoffice7.6/program/";

    /**
     * @param multipartFile uploaded file
     * @param htmlFile html upload path
     * @param htmlFileImgUrl html image upload path
     * @param wordFileUrl word upload path
     * @param sofficeDir libreOffice installation address
     * @throwsException
     */
    public String word2html(MultipartFile multipartFile, String htmlFile, String htmlFileImgUrl, String wordFileUrl, String sofficeDir) throws Exception {
        try {
            logger.info("exec command:[{}]\
output: [{}]", "Enter word2pdf{} method");
            //Need to determine whether the file is doc, docx
            if (multipartFile == null) {
                return "The word document uploaded is empty!";
            }
            //Return server proxy address
            String htmlUrl = "";
            //Randomly name html files
            String uuid = UUID.randomUUID().toString();
            String htmlFileName = uuid + "." + "html";
            logger.info("====Initialization====(htmlFileName){parameter} " + htmlFileName);
            //Upload the local image address of the server
            logger.info("==== htmlFile{parameter} ====" + htmlFile);
            //The image address forwarded by nginx
            logger.info("==== htmlFileImgUrl{parameter} ====" + htmlFileImgUrl);
            //Generate the folder address of the web page
            String htmlFileUrl = htmlFile + uuid + "/";
            logger.info("==== htmlFileUrl{parameter} ==== " + htmlFileUrl);
            //Upload files to the server
            boolean flag = upload(multipartFile, wordFileUrl, uuid);
            if (!flag) {
                return "Word document upload failed!";
            }
            logger.info("====== Word document uploaded successfully!====");
            //Get file name
            String name = multipartFile.getOriginalFilename();
            String suffix = name.substring(name.lastIndexOf("."));//.Suffix name
            //Word document path after uploading /home/winnersoft/date/tomcat/html-root/office/word/8ea8aec0-7fb5-4fbc-b73c-6f0e47b2857e.doc
            String inPath = wordFileUrl + uuid + suffix;
            logger.info("==== inPath ====" + inPath);
            if (!new File(inPath).exists()) {
                return "The word document does not exist!";
            }
            //The map's address
            File htmlFile1 = new File(htmlFileUrl);
            if (!htmlFile1.exists()) {
                //create
                if (htmlFile1.mkdirs()) {
                    logger.info("Create" + htmlFileUrl + "Success");
                } else {
                    logger.info("Create" + htmlFileUrl + "Success");
                }
            }
            //html proxy address //http://172.18.222.25:82/office/html/8ea8aec0-7fb5-4fbc-b73c-6f0e47b2857e/8ea8aec0-7fb5-4fbc-b73c-6f0e47b2857e.html
            htmlUrl = htmlFileImgUrl + uuid + "/" + htmlFileName;
            //html generation path /home/winnersoft/date/tomcat/html-root/office/html/af7ac82f-71bc-498c-8866-8bf7ef325345/
            htmlFileName = htmlFileUrl;
            logger.info("==== outPath{ html ======== output address} " + htmlFileName);
            //Set the location where the image is stored
// String command = String.format("%s/soffice --convert-to pdf:writer_pdf_Export %s --outdir %s", sofficeDir, inPath, outPath);
            String command = String.format("%s/soffice --headless --invisible --convert-to html:HTML %s --outdir %s", sofficeDir, inPath, htmlFileName);
            logger.info("command==================================" + command);
            String output = this.executeCommand(command);
            logger.info("exec command:[{}]\
output: [{}]", command, output);
            return htmlUrl;
        } catch (IOException e) {
            logger.error("io exception" + e.getMessage());
            throw new RuntimeException(e);
        } catch (InterruptedException e) {
            throw new RuntimeException(e);
        }
    }

    protected String executeCommand(String command) throws IOException, InterruptedException {
        logger.info("executeCommand{} execute conversion");
        StringBuffer output = new StringBuffer();
        Process p;
        p = Runtime.getRuntime().exec(command);
        p.waitFor();
        try (
                InputStreamReader inputStreamReader = new InputStreamReader(p.getInputStream(), "UTF-8");
                BufferedReader reader = new BufferedReader(inputStreamReader)
        ) {
            String line = "";
            while ((line = reader.readLine()) != null) {
                output.append(line + "\
");
            }
        }
        // Destroy the child process
        p.destroy();
        return output.toString();
    }

    /**
     * @param file file
     * @param htmlFile file upload address
     * @param fileName file name
     * @return
     */
    public static boolean upload(MultipartFile file, String htmlFile, String fileName) {
        InputStream is = null;
        OutputStream os = null;
        try {
            File file1 = new File(htmlFile);
            if (!file1.exists()) {
                file1.mkdirs();
            }
            String name = file.getOriginalFilename();
            String suffix = name.substring(name.lastIndexOf("."));//.Suffix name
            is = file.getInputStream();
            os = new FileOutputStream(htmlFile + fileName + suffix);
            //Data copy
            IOUtils.copy(is, os);
            logger.info("==== File writing successful!====");
        } catch (IOException e) {
            logger.error("====== File upload failed ====" + e.getMessage());
            return false;
        } finally {
            if (null != is) {
                try {
                    is.close();
                } catch (IOException e) {
                    throw new RuntimeException(e);
                }
            }
            if (null != os) {
                try {
                    os.close();
                } catch (IOException e) {
                    throw new RuntimeException(e);
                }
            }
        }
        return true;
    }
}