/** * Normalize html using jsoup * * @param html html content * @return normalized html */ private static String formatHtml(String html) { org.jsoup.nodes.Document doc = Jsoup.parse(html); //Set the export format to keep spaces doc.outputSettings(new org.jsoup.nodes.Document.OutputSettings().prettyPrint(false)); // remove excessive width String style = doc.attr("style"); if (StringUtils.isNotEmpty(style) & amp; & amp; style.contains("width")) { doc.attr("style", ""); } Elements divs = doc.select("div"); for (Element div : divs) { String divStyle = div.attr("style"); if (StringUtils.isNotEmpty(divStyle) & amp; & amp; divStyle.contains("width")) { div.attr("style", ""); } //get text label Elements span = div.select("span"); for (Element element : span) { //replace spaces with \? //Note: You also need to add a space after \? element.html(element.html().replace(" ","\? ")); } } // jsoup generates closing tags doc.outputSettings().escapeMode(Entities.EscapeMode.xhtml); doc.outputSettings().syntax(org.jsoup.nodes.Document.OutputSettings.Syntax.xml); return doc.html(); }
Result after exporting: the underlined part is a space.