The key points are highlighted below.
Method 1: Use the POI toolkit provided by Apache (POI version 4.1.2).
Disadvantages: Font styles are not processed accurately; WMF formula images are not converted accurately; the implementation in this document only supports the doc format.
Advantages: The conversion speed is relatively fast and local debugging is convenient
Method 2: Use LibreOffice (version 7.5).
Address:Download LibreOffice | LibreOffice Simplified Chinese official website – Free and free office suite
Example of installing LibreOffice on Linux: "Installing LibreOffice on CentOS 7 – LibreOffice installation tutorial" (the_bog's blog, CSDN)
Disadvantages: Relatively slow conversion speed
Advantages: Font styles are reproduced very precisely. This document only covers converting doc, docx, etc. to HTML; for converting to PDF and other related commands, search online (e.g. Baidu).
Without further ado, let's get straight to the code!
Method 1 code implementation:
Relevant Maven dependencies (jar packages):
<!-- Apache POI 4.1.2, the version used by Method 1 -->
<dependency>
    <groupId>org.apache.poi</groupId>
    <artifactId>poi</artifactId>
    <version>4.1.2</version>
</dependency>
<dependency>
    <groupId>org.apache.poi</groupId>
    <artifactId>poi-scratchpad</artifactId>
    <version>4.1.2</version>
</dependency>
<dependency>
    <groupId>org.apache.poi</groupId>
    <artifactId>poi-ooxml</artifactId>
    <version>4.1.2</version>
</dependency>
<!-- jsoup: used to post-process the generated HTML -->
<dependency>
    <groupId>org.jsoup</groupId>
    <artifactId>jsoup</artifactId>
    <version>1.9.2</version>
</dependency>
<!-- Presumably for the WMF/SVG formula-image handling; confirm against the project -->
<dependency>
    <groupId>org.apache.xmlgraphics</groupId>
    <artifactId>batik-codec</artifactId>
    <version>1.7</version>
</dependency>
<dependency>
    <groupId>net.arnx</groupId>
    <artifactId>wmf2svg</artifactId>
    <version>0.9.5</version>
</dependency>
package cn.hls.winner.winner_problem_manage.utils; import org.apache.poi.hwpf.HWPFDocument; import org.apache.poi.hwpf.converter.PicturesManager; import org.apache.poi.hwpf.converter.WordToHtmlConverter; import org.apache.poi.hwpf.usermodel.PictureType; import org.apache.poi.util.IOUtils; import org.jsoup.Jsoup; import org.jsoup.nodes.Attributes; import org.jsoup.nodes.Element; import org.jsoup.select.Elements; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.springframework.util.FileCopyUtils; import org.springframework.web.multipart.MultipartFile; import org.w3c.dom.Document; import javax.xml.parsers.DocumentBuilderFactory; import javax.xml.transform.OutputKeys; import javax.xml.transform.Transformer; import javax.xml.transform.TransformerFactory; import javax.xml.transform.dom.DOMSource; import javax.xml.transform.stream.StreamResult; import java.io.*; import java.util.ArrayList; import java.util.List; import java.util.UUID; /** * @author lhz * @description TODO * @date 2023/9/18 10:14 */ public class Word2003Util { private static final Logger logger = LoggerFactory.getLogger(Word2003Util.class); /** * * @param multipartFile uploaded file * @param htmlFile html upload path * @param htmlFileImgUrl html image upload path * @param wordFileUrl word upload path * @return */ public static String word2003ToHtml(MultipartFile multipartFile, String htmlFile, String htmlFileImgUrl, String wordFileUrl) { //Need to determine whether the file is doc, docx if (multipartFile == null) { return "The word document uploaded is empty!"; } if (multipartFile.getOriginalFilename().endsWith("docx")) { return "The word document format is wrong, please upload it in doc format!"; } logger.info("***** word2003ToHtml start file:{}", multipartFile); //Return server proxy address String htmlUrl = ""; //Randomly name html files String uuid = UUID.randomUUID().toString(); String htmlFileName = uuid + "." 
+ "html"; logger.info("====Initialization====(htmlFileName){parameter} " + htmlFileName); try { //Upload the local image address of the server logger.info("==== htmlFile{parameter} ====" + htmlFile); //The image address forwarded by nginx logger.info("==== htmlFileImgUrl{parameter} ====" + htmlFileImgUrl); //Generate the folder address of the web page String htmlFileUrl = htmlFile + uuid + "/"; logger.info("==== htmlFileUrl{parameter} ==== " + htmlFileUrl); //Upload files to the server boolean flag = upload(multipartFile, wordFileUrl, uuid); if (!flag) { return "Word document upload failed!"; } logger.info("====== Word document uploaded successfully!===="); //Get file name String name = multipartFile.getOriginalFilename(); String suffix = name.substring(name.lastIndexOf("."));//.Suffix name String filePath = wordFileUrl + uuid + suffix; logger.info("==== filePath ====" + filePath); File file = new File(filePath); // 1) Load word document to generate HWPFDocument object InputStream inputStream = new FileInputStream(file); HWPFDocument wordDocument = new HWPFDocument(inputStream); WordToHtmlConverter wordToHtmlConverter = new WordToHtmlConverter(DocumentBuilderFactory.newInstance().newDocumentBuilder().newDocument()); //The map's address String fileImg = htmlFileUrl + "images/"; File htmlFile1 = new File(htmlFileUrl); if (!htmlFile1.exists()) { //create if (htmlFile1.mkdirs()) { logger.info("Create" + htmlFileUrl + "Success"); } else { logger.info("Create" + htmlFileUrl + "Success"); } } //html proxy address htmlUrl = htmlFileImgUrl + uuid + "/" + htmlFileName; //html generation path htmlFileName = htmlFileUrl + htmlFileName; logger.info("==== htmlFileName{ html ======== output address} " + htmlFileName); //Set the location where the image is stored String finalFileImg = fileImg; final int[] index = {1}; //Process image address wordToHtmlConverter.setPicturesManager(new PicturesManager() { public String savePicture(byte[] content, PictureType pictureType, String 
suggestedName, float widthInches, float heightInches) { File imgPath = new File(finalFileImg); if (!imgPath.exists()) {//Create the image directory if it does not exist imgPath.mkdirs(); } String extension = pictureType.getExtension(); //Randomly generate image names suggestedName = finalFileImg + "image" + index[0] + "." + extension; File file = new File(suggestedName); OutputStream os = null; try { os = new FileOutputStream(file); os.write(content); os.close(); //Process wmf formula pictures // if (extension.equals("wmf") || extension.equals("svg")) { // if (extension.equals("wmf")) { // String svgFile = suggestedName.substring(0, // suggestedName.lastIndexOf(".wmf")) // + ".svg"; // SvgToPngUtil.wmfToSvg(suggestedName, svgFile); // } // String suggestedNameSVG = suggestedName.substring(0, suggestedName.lastIndexOf(".")) + ".svg"; String s = SvgToPngUtil.readToString(suggestedNameSVG); String suggestedNamePng = suggestedName.substring(0, suggestedName.lastIndexOf(".")) + ".png"; SvgToPngUtil.convertToPng(s, suggestedNamePng); String s1 = SvgToPngUtil.GetImageStr(suggestedNameSVG); // //Delete useless pictures deleteFile(suggestedNameSVG, suggestedName); // suggestedName = suggestedNameSVG; // } } catch (FileNotFoundException e) { throw new RuntimeException(e); } catch (IOException e) { throw new RuntimeException(e); } //Here you can specify the path of the picture in the word document. 
String imgUlr = suggestedName.replace(htmlFile, htmlFileImgUrl); index[0] + + ; return imgUlr; } }); wordToHtmlConverter.processDocument(wordDocument); Document htmlDocument = wordToHtmlConverter.getDocument(); OutputStream outputStream = new FileOutputStream(htmlFileName); DOMSource domSource = new DOMSource(htmlDocument); StreamResult streamResult = new StreamResult(outputStream); TransformerFactory factory = TransformerFactory.newInstance(); Transformer serializer = factory.newTransformer(); serializer.setOutputProperty(OutputKeys.ENCODING, "utf-8"); serializer.setOutputProperty(OutputKeys.INDENT, "yes"); serializer.setOutputProperty(OutputKeys.METHOD, "html"); serializer.transform(domSource, streamResult); outputStream.close(); logger.info("====== Web page style conversion begins ===="); String htmlContents = readHtml(htmlFileName); FileCopyUtils.copy(htmlContents.getBytes("utf-8"), new File(htmlFileName)); logger.info("====== Web page style conversion completed ===="); } catch (Exception e) { logger.error("word2003ToHtml====Exception"); logger.error(e.getMessage()); throw new RuntimeException(e); } // return htmlUrl; } //Get web content public static String readHtml(String htmlFileName) throws Exception { StringBuilder htmlContents1 = new StringBuilder(); String htmlContents = ""; //Read image web content BufferedReader buf = new BufferedReader( new InputStreamReader(new FileInputStream(htmlFileName), "utf-8")); String c = ""; while ((c = buf.readLine()) != null) { htmlContents1.append(c + "\ "); } buf.close(); htmlContents = htmlContents1.toString(); htmlContents = htmlContents.replace("hyphenate:auto;font-family:Times New Roman;", "hyphenate:auto;font-family:宋体;").replace("vertical-align:text-bottom;", "vertical -align: middle;").replace("'","'").replace(" & amp;rsquo;","'"); org.jsoup.nodes.Document document = Jsoup.parse(htmlContents); formatHtml(document); htmlContents = document.toString(); return htmlContents; } //Web page font style public static void 
formatHtml(org.jsoup.nodes.Document document) { Elements elements = document.getAllElements(); String title = document.title(); logger.info("==== formatHtml ====title" + title); for (Element element : elements) { if ("main".equals(element.className())) { continue; } if (title.contains("Physics") || title.contains("Mathematics") || title.contains("Chemistry")) { if (element.hasClass("s1")) { element.attr("style", "font-family:Times New Roman;" + element.attr("style")); } } String[] attrs = element.attr("style").split(";"); List<String> attrList = new ArrayList(); for (String attr : attrs) { if (attr.contains("font-family")) { attrList.add(attr); } } //Remove the class attributes b1 b2 in the <body> tag Elements bodies = element.getElementsByTag("body"); for(Element body : bodies){ System.out.println("=======className:" + body.className() + "=========="); if("b1 b2".equals(body.className())){ body.attr("class",""); } } } } public static void deleteFile(String... imgUrl) { for (String s : imgUrl) { File file = new File(s); try { if (file.isFile()) { // Delete Files if (file.delete()) { logger.info("The file was deleted successfully ==== The name is: " + file.getName()); } else { } } else { } } catch (Exception e) { logger.error("====== Failed to delete picture ======" + e.getMessage()); throw new RuntimeException(); } } } /** * @param file file * @param htmlFile file upload address * @param fileName file name * @return */ public static boolean upload(MultipartFile file, String htmlFile, String fileName) { InputStream is = null; OutputStream os = null; try { File file1 = new File(htmlFile); if (!file1.exists()) { file1.mkdirs(); } String name = file.getOriginalFilename(); String suffix = name.substring(name.lastIndexOf("."));//.Suffix name is = file.getInputStream(); os = new FileOutputStream(htmlFile + fileName + suffix); //Data copy IOUtils.copy(is, os); logger.info("==== File writing successful!===="); } catch (IOException e) { logger.error("====== File upload 
failed ====" + e.getMessage()); return false; } finally { if (null != is) { try { is.close(); } catch (IOException e) { throw new RuntimeException(e); } } if (null != os) { try { os.close(); } catch (IOException e) { throw new RuntimeException(e); } } } return true; } }
Method 2 code implementation:
package com.hls.poi.service; import com.hls.poi.controller.WordToHtmlController; import org.apache.poi.util.IOUtils; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.springframework.web.multipart.MultipartFile; import java.io.*; import java.util.UUID; public class LibreOfficeCommandWordService { private static final Logger logger = LoggerFactory.getLogger(WordToHtmlController.class); /** * /opt/libreoffice7.5/program/soffice --headless --invisible --convert-to pdf /opt/a/1.docx --outdir /opt/a/ * /opt/a/1.docx after –convert-to pdf is the original file path * –outdir /opt/a/ (the directory where the converted files are stored) * <p> * soffice --headless --invisible --convert-to html:HTML ffc75d91-3594-451d-a55f-a941325bc380.doc --outdir mmm */ //You need to find the actual directory where LibreOffice is installed based on the actual situation. //Mac is installed to /usr/local/bin by default. //CentOS is installed in /usr/bin by default private final static String sofficeDir = "/opt/libreoffice7.6/program/"; /** * @param multipartFile uploaded file * @param htmlFile html upload path * @param htmlFileImgUrl html image upload path * @param wordFileUrl word upload path * @param sofficeDir libreOffice installation address * @throwsException */ public String word2html(MultipartFile multipartFile, String htmlFile, String htmlFileImgUrl, String wordFileUrl, String sofficeDir) throws Exception { try { logger.info("exec command:[{}]\ output: [{}]", "Enter word2pdf{} method"); //Need to determine whether the file is doc, docx if (multipartFile == null) { return "The word document uploaded is empty!"; } //Return server proxy address String htmlUrl = ""; //Randomly name html files String uuid = UUID.randomUUID().toString(); String htmlFileName = uuid + "." 
+ "html"; logger.info("====Initialization====(htmlFileName){parameter} " + htmlFileName); //Upload the local image address of the server logger.info("==== htmlFile{parameter} ====" + htmlFile); //The image address forwarded by nginx logger.info("==== htmlFileImgUrl{parameter} ====" + htmlFileImgUrl); //Generate the folder address of the web page String htmlFileUrl = htmlFile + uuid + "/"; logger.info("==== htmlFileUrl{parameter} ==== " + htmlFileUrl); //Upload files to the server boolean flag = upload(multipartFile, wordFileUrl, uuid); if (!flag) { return "Word document upload failed!"; } logger.info("====== Word document uploaded successfully!===="); //Get file name String name = multipartFile.getOriginalFilename(); String suffix = name.substring(name.lastIndexOf("."));//.Suffix name //Word document path after uploading /home/winnersoft/date/tomcat/html-root/office/word/8ea8aec0-7fb5-4fbc-b73c-6f0e47b2857e.doc String inPath = wordFileUrl + uuid + suffix; logger.info("==== inPath ====" + inPath); if (!new File(inPath).exists()) { return "The word document does not exist!"; } //The map's address File htmlFile1 = new File(htmlFileUrl); if (!htmlFile1.exists()) { //create if (htmlFile1.mkdirs()) { logger.info("Create" + htmlFileUrl + "Success"); } else { logger.info("Create" + htmlFileUrl + "Success"); } } //html proxy address //http://172.18.222.25:82/office/html/8ea8aec0-7fb5-4fbc-b73c-6f0e47b2857e/8ea8aec0-7fb5-4fbc-b73c-6f0e47b2857e.html htmlUrl = htmlFileImgUrl + uuid + "/" + htmlFileName; //html generation path /home/winnersoft/date/tomcat/html-root/office/html/af7ac82f-71bc-498c-8866-8bf7ef325345/ htmlFileName = htmlFileUrl; logger.info("==== outPath{ html ======== output address} " + htmlFileName); //Set the location where the image is stored // String command = String.format("%s/soffice --convert-to pdf:writer_pdf_Export %s --outdir %s", sofficeDir, inPath, outPath); String command = String.format("%s/soffice --headless --invisible --convert-to html:HTML %s 
--outdir %s", sofficeDir, inPath, htmlFileName); logger.info("command==================================" + command); String output = this.executeCommand(command); logger.info("exec command:[{}]\ output: [{}]", command, output); return htmlUrl; } catch (IOException e) { logger.error("io exception" + e.getMessage()); throw new RuntimeException(e); } catch (InterruptedException e) { throw new RuntimeException(e); } } protected String executeCommand(String command) throws IOException, InterruptedException { logger.info("executeCommand{} execute conversion"); StringBuffer output = new StringBuffer(); Process p; p = Runtime.getRuntime().exec(command); p.waitFor(); try ( InputStreamReader inputStreamReader = new InputStreamReader(p.getInputStream(), "UTF-8"); BufferedReader reader = new BufferedReader(inputStreamReader) ) { String line = ""; while ((line = reader.readLine()) != null) { output.append(line + "\ "); } } // Destroy the child process p.destroy(); return output.toString(); } /** * @param file file * @param htmlFile file upload address * @param fileName file name * @return */ public static boolean upload(MultipartFile file, String htmlFile, String fileName) { InputStream is = null; OutputStream os = null; try { File file1 = new File(htmlFile); if (!file1.exists()) { file1.mkdirs(); } String name = file.getOriginalFilename(); String suffix = name.substring(name.lastIndexOf("."));//.Suffix name is = file.getInputStream(); os = new FileOutputStream(htmlFile + fileName + suffix); //Data copy IOUtils.copy(is, os); logger.info("==== File writing successful!===="); } catch (IOException e) { logger.error("====== File upload failed ====" + e.getMessage()); return false; } finally { if (null != is) { try { is.close(); } catch (IOException e) { throw new RuntimeException(e); } } if (null != os) { try { os.close(); } catch (IOException e) { throw new RuntimeException(e); } } } return true; } }