A method for using Apache POI to read the first table that follows a given piece of text in .doc and .docx files
- Add the following dependencies
Because I also use other features of easyexcel, I depend on that package here. It already includes the package needed to work with .docx files.
<!-- Alibaba easyexcel -->
<dependency>
    <groupId>com.alibaba</groupId>
    <artifactId>easyexcel</artifactId>
    <version>3.1.3</version>
</dependency>
<!-- For working with .doc files -->
<dependency>
    <groupId>org.apache.poi</groupId>
    <artifactId>poi-scratchpad</artifactId>
    <version>4.1.2</version>
</dependency>
- filePath is the file path, findStr is the text that needs to be located
public static void main(String[] args) {<!-- --> // testTable("C:\Users\zhouchuxiang\Desktop\\ ew DOCX document.docx", "11"); testTable("C:\Users\zhouchuxiang\Desktop\\ ew DOCX document (2).docx", "Gender"); } private static void testTable(String filePath, String findStr) {<!-- --> InputStream is = null; try {<!-- --> is = new FileInputStream(filePath); if (filePath.endsWith(".docx")) {<!-- --> handleDocx(is, findStr); } else if (filePath.endsWith(".doc")) {<!-- --> handleDoc(is, findStr); } } catch (Exception e) {<!-- --> e.printStackTrace(); } finally {<!-- --> try {<!-- --> if (is != null) {<!-- --> is.close(); } } catch (Exception e) {<!-- --> e.printStackTrace(); } } }
- The following shows how to handle .doc files
static void handleDoc(InputStream inputStream, String findStr) throws Exception {<!-- --> //word 2003: Picture will not be read HWPFDocument hwpf = new HWPFDocument(inputStream); //Get the reading range of the document Range range = hwpf.getRange(); //Whether it is possible to find the table element identifier boolean flagToFindTable = false; //Get the number of paragraphs int paraNum = range.numParagraphs(); for (int temp = 0; temp < paraNum; temp + + ) {<!-- --> //Read paragraph Paragraph paragraph = range.getParagraph(temp); //System.out.println("paragraph" + temp + "value:" + paragraph.text()); //The current paragraph contains the query text but the query text is not in the table if (paragraph.text().contains(findStr) & amp; & amp; !paragraph.isInTable() & amp; & amp; !flagToFindTable) {<!-- --> flagToFindTable = true; } //Find the query text and then execute it if (!flagToFindTable) {<!-- --> continue; } //Confirm that the paragraph is the first paragraph of the table if (paragraph.isInTable()) {<!-- --> Table tb = range.getTable(paragraph); //Iterate rows, starting from 0 by default for (int i = 0; i < tb.numRows(); i + + ) {<!-- --> TableRow tr = tb.getRow(i); //Iterate columns, starting from 0 by default for (int j = 0; j < tr.numCells(); j + + ) {<!-- --> //Get the cell TableCell td = tr.getCell(j); //Get the contents of the cell String tempStr = ""; for (int k = 0; k <td.numParagraphs(); k + + ) {<!-- --> Paragraph para = td.getParagraph(k); String s = para.text().trim(); tempStr + = s; } System.out.print(tempStr + "\t"); } System.out.println(); } break; } } }
- The following shows how to handle .docx files
static void handleDocx(InputStream inputStream, String findStr) throws Exception {<!-- --> XWPFDocument document = new XWPFDocument(inputStream); List<IBodyElement> bodyElements = document.getBodyElements(); //Whether it is possible to find the table element identifier boolean flagToFindTable = false; for (IBodyElement bodyElement : bodyElements) {<!-- --> BodyElementType elementType = bodyElement.getElementType(); //Loop in order, first check which previous paragraph contains the query text if (elementType == BodyElementType.PARAGRAPH & amp; & amp; !flagToFindTable) {<!-- --> //paragraph XWPFParagraph para = (XWPFParagraph) bodyElement; List<XWPFRun> runs = para.getRuns(); if (runs == null || runs.isEmpty()) {<!-- --> continue; } for (XWPFRun run : runs) {<!-- --> //If the fragment has no text, maybe the fragment is a picture if (StringUtils.isNotEmpty(run.text())) {<!-- --> //This fragment is not empty if (run.text().contains(findStr)) {<!-- --> //If the paragraph contains this query text flagToFindTable = true; } } } } if (elementType == BodyElementType.TABLE & amp; & amp; flagToFindTable) {<!-- --> //sheet XWPFTable table = (XWPFTable) bodyElement; List<XWPFTableRow> rows = table.getRows(); //Read each row of data for (int i = 0; i < rows.size(); i + + ) {<!-- --> XWPFTableRow row = rows.get(i); //Read each column of data List<XWPFTableCell> cells = row.getTableCells(); for (int j = 0; j < cells.size(); j + + ) {<!-- --> XWPFTableCell cell = cells.get(j); System.out.print(cell.getText() + "\t"); } System.out.println(); } break; } } if (!flagToFindTable) {<!-- --> System.out.println(String.format("This word document does not have [%s] text, please re-enter the search form", findStr)); } }
Note: This method only works on genuine .doc files. A .docx file that has merely been renamed with a .doc extension will not be recognized by this code.