Parse html to generate Word document

Content: Read the text content in the html file, and then generate a Word document for export.

Use case: After a requirement has been developed, a document (a code modification list) needs to be written. The document contains all the code modified or added for this requirement: the paths of the changed files and the changed code snippets must be listed, with old and new code distinguished by different highlight colors.
examples:

It would be quite troublesome to copy, paste and annotate manually. So record it here, use code to simply process it, and generate the required documents.

Step 1: After the functional test is completed, merge the code into the new branch, and then enter the page to view the merge record.

Step 2: Save this page locally, and then execute the program to generate a Word document.

Function Implementation Instructions

Project: SpringBoot
HTML file source: Gitee. If the page comes from GitLab instead, the tag elements on the page may be different, so you will need to adjust how the code parses the HTML and reads the tags.

Code implementation logic: use jsoup to read and parse html files, and then use poi to generate Word documents.

To parse HTML, you need to first figure out the tag elements in HTML, and then read them with code.

Start coding
Maven dependencies:

 <!-- Apache POI (OOXML): used by the Java code below to generate the Word (.docx) document; the same artifact also reads and writes Excel documents -->
<dependency>
<groupId>org.apache.poi</groupId>
<artifactId>poi-ooxml</artifactId>
<version>3.15</version>
</dependency>
<!-- jsoup: used by the Java code below to parse the saved HTML page -->
<dependency>
<groupId>org.jsoup</groupId>
<artifactId>jsoup</artifactId>
<version>1.8.3</version>
</dependency>
<!-- NOTE: the Java code below also imports lombok.extern.slf4j.Slf4j; the Lombok dependency is not listed here and is assumed to be declared elsewhere in the project -->

Java code:

package com.example.demo16.util;

import lombok.extern.slf4j.Slf4j;
import org.apache.poi.xwpf.usermodel.ParagraphAlignment;
import org.apache.poi.xwpf.usermodel.XWPFDocument;
import org.apache.poi.xwpf.usermodel.XWPFParagraph;
import org.apache.poi.xwpf.usermodel.XWPFRun;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTShd;

import java.io.File;
import java.io.FileOutputStream;

@Slf4j
public class CodeChangeDetailUtil {

    public static void main(String[] args) throws Exception {
        String htmlPath="E:/Downloads/code_01.html";
        String saveWordPath="E:/Downloads/code_0112.docx";
        codeChangeDetailOutPutWorld(htmlPath, saveWordPath);
    }

    /**
     *Old code identifier: old
     * New code identifier: new
     */
    private static final String CODE_TYPE_OLD = "old";
    private static final String CODE_TYPE_NEW = "new";

    /**
     * Parse HTML file content to generate Word
     * @param htmlPath HTML file path
     * @param saveWordPath Word document saving path
     * @throwsException
     */
    public static void codeChangeDetailOutPutWorld(String htmlPath, String saveWordPath) throws Exception {
        XWPFDocument doc = new XWPFDocument();
        Document document = Jsoup.parse(new File(htmlPath), "utf8");
        Elements elements = document.getElementsByClass("files");
        // Collection of file elements
        Elements diffFileElements = elements.get(0).getElementsByClass("diff-file");
        createHeader(doc, "Modification list (quantity: " + diffFileElements.size() + ")");
        for (Element element : diffFileElements) {
            String headerText = element.getElementsByClass("header").get(0)
                    .getElementsByTag("a")
                    .get(0).text();
            createNullLine(doc, 1);
            createText(doc, headerText, "");
        }
        createNullLine(doc, 2);
        createHeader(doc, "Program modification record");
        for (Element element : diffFileElements) {
            //File header, file path
            String headerText = element.getElementsByClass("header").get(0)
                    .getElementsByTag("a")
                    .get(0).text();
            //log.info("headerText:{}",headerText);
            // Get file path/name
            createNullLine(doc, 2);
            createTextHeader(doc, headerText);
            // File content elements, codes are stored in the rows of the table, one line of code, here get all the rows of the table
            if (null == findTableElements(element)) {
                log.error("The content of this file may be folded, please search on the original page [Difference is collapsed, click to expand] Click to expand and then save the html file, file: {}", headerText);
                throw new Exception("The content of this file may be folded, please search for [Difference is folded, click to expand] on the original page, click to expand and then save the html file");
            }
            Elements trElements = findTableElements(element).get(0)
                    .getElementsByTag("tbody").get(0)
                    .getElementsByTag("tr");
            // Traverse all lines and get the file content
            for (Element tr : trElements) {
                String lineContent = tr.getElementsByClass("line_content").get(0).text();
                // old code
                if(tr.getElementsByClass("old").size()>0){
                    createText(doc, lineContent, CODE_TYPE_OLD);
                    continue;
                }
                // new code
                if(tr.getElementsByClass("new").size()>0){
                    createText(doc, lineContent, CODE_TYPE_NEW);
                    continue;
                }
                createText(doc, lineContent, "");
            }
        }
        FileOutputStream fileOutputStream = new FileOutputStream(saveWordPath);
        doc.write(fileOutputStream);
        fileOutputStream.close();
        log.info("Document generated successfully, storage path: {}", saveWordPath);
    }

    /**
     * Create title
     * @paramdoc
     * @param headerText header text
     */
    private static void createHeader(XWPFDocument doc, String headerText) {
        //Create title
        XWPFParagraph paragraph = doc.createParagraph();
        //Title level, 1,2,3...
        paragraph.setStyle("1");
        //Set alignment
        paragraph.setAlignment(ParagraphAlignment.LEFT);
        XWPFRun run = paragraph.createRun();
        run.setColor("000000");
        run.setText(headerText);
        run.setFontFamily("黑体");
        run.setFontSize(22);
        // bold
        run.setBold(true);
    }

    /**
     * Generate file path
     * @paramdoc
     * @param contentText text content
     */
    private static void createTextHeader(XWPFDocument doc, String contentText) {
        XWPFParagraph paragraph = doc.createParagraph();
        // align left
        paragraph.setAlignment(ParagraphAlignment.LEFT);
        XWPFRun contentRun = paragraph.createRun();
        contentRun.setFontSize(11);
        //Create a blank line before
        createNullLine(doc, 2);
        contentRun.setText(contentText);
        // bold
        contentRun.setBold(true);
    }

    /**
     * document content
     * @paramdoc
     * @param contentText text content
     * @param textType code content identifier
     */
    private static void createText(XWPFDocument doc, String contentText, String textType) {
        XWPFParagraph paragraph = doc.createParagraph();
        // align left
        paragraph.setAlignment(ParagraphAlignment.LEFT);
        XWPFRun contentRun = paragraph.createRun();
        contentRun.setFontFamily("Consolas");
        contentRun.setFontSize(9);
        contentRun.setText(contentText);
        // highlight
        CTShd ctShd = contentRun.getCTR().addNewRPr().addNewShd();
        // old: old code, new: new code
        if (CODE_TYPE_OLD.equals(textType)) {
            ctShd.setFill("FFEFD5");
        } else if (CODE_TYPE_NEW.equals(textType)) {
            ctShd.setFill("CCFF99");
        }
    }

    /**
     * Create a blank line
     * @paramdoc
     * @param lineNum line number
     */
    private static void createNullLine(XWPFDocument doc, int lineNum) {
        XWPFParagraph paragraph = doc.createParagraph();
        XWPFRun contentRun = paragraph.createRun();
        for (int i = 0; i <lineNum; i + + ) {
            contentRun.setText("\
");
        }
    }

    /**
     * Find table elements
     * @param element element object
     * @return
     */
    private static Elements findTableElements(Element element) {
        Elements trElements_temp = element.getElementsByClass("diff-content");
        for (Element element_t : trElements_temp) {
            int tableSize = element_t.getElementsByTag("table").size();
            if (tableSize > 0) {
                return element_t.getElementsByTag("table");
            }
        }
        return null;
    }
}

Each page has to be analyzed on its own terms, since the tag structure differs from page to page. The main idea is to use jsoup to read the HTML content and process it. For details on using jsoup, please refer to its official documentation.