Using Apache POI to read the nearest table below a given piece of text in DOC and DOCX files

This article shares a way to use Apache POI to locate a given piece of text in a DOC or DOCX file and read the first table that follows it.

  1. Add the following dependencies
    Because I also use other features of easyexcel, I depend on that package; it already brings in the POI classes needed to work with docx files. Legacy doc files additionally require poi-scratchpad.
 <!-- Alibaba easyexcel; used here for its other features, and the article relies on it for the docx (XWPF) classes -->
<dependency>
<groupId>com.alibaba</groupId>
<artifactId>easyexcel</artifactId>
<version>3.1.3</version>
</dependency>
<!-- Apache POI scratchpad: needed to read legacy .doc files (HWPFDocument and related classes) -->
<!-- NOTE(review): this version should match the POI version easyexcel pulls in transitively - confirm -->
<dependency>
<groupId>org.apache.poi</groupId>
<artifactId>poi-scratchpad</artifactId>
<version>4.1.2</version>
</dependency>
  1. Entry point: filePath is the path of the Word file, and findStr is the text to locate; the first table after that text is printed
public static void main(String[] args) {<!-- -->
// testTable("C:\Users\zhouchuxiang\Desktop\\
ew DOCX document.docx", "11");
        testTable("C:\Users\zhouchuxiang\Desktop\\
ew DOCX document (2).docx", "Gender");
    }

    private static void testTable(String filePath, String findStr) {<!-- -->
        InputStream is = null;
        try {<!-- -->
            is = new FileInputStream(filePath);
            if (filePath.endsWith(".docx")) {<!-- -->
                handleDocx(is, findStr);
            } else if (filePath.endsWith(".doc")) {<!-- -->
                handleDoc(is, findStr);
            }
        } catch (Exception e) {<!-- -->
            e.printStackTrace();
        } finally {<!-- -->
            try {<!-- -->
                if (is != null) {<!-- -->
                    is.close();
                }
            } catch (Exception e) {<!-- -->
                e.printStackTrace();
            }
        }
    }
  1. The following method handles legacy .doc files
static void handleDoc(InputStream inputStream, String findStr) throws Exception {<!-- -->
        //word 2003: Picture will not be read
        HWPFDocument hwpf = new HWPFDocument(inputStream);
        //Get the reading range of the document
        Range range = hwpf.getRange();
        //Whether it is possible to find the table element identifier
        boolean flagToFindTable = false;
        //Get the number of paragraphs
        int paraNum = range.numParagraphs();
        for (int temp = 0; temp < paraNum; temp + + ) {<!-- -->
            //Read paragraph
            Paragraph paragraph = range.getParagraph(temp);
            //System.out.println("paragraph" + temp + "value:" + paragraph.text());
            //The current paragraph contains the query text but the query text is not in the table
            if (paragraph.text().contains(findStr) & amp; & amp; !paragraph.isInTable() & amp; & amp; !flagToFindTable) {<!-- -->
                flagToFindTable = true;
            }
            //Find the query text and then execute it
            if (!flagToFindTable) {<!-- -->
                continue;
            }
            //Confirm that the paragraph is the first paragraph of the table
            if (paragraph.isInTable()) {<!-- -->
                Table tb = range.getTable(paragraph);
                //Iterate rows, starting from 0 by default
                for (int i = 0; i < tb.numRows(); i + + ) {<!-- -->
                    TableRow tr = tb.getRow(i);
                    //Iterate columns, starting from 0 by default
                    for (int j = 0; j < tr.numCells(); j + + ) {<!-- -->
                        //Get the cell
                        TableCell td = tr.getCell(j);
                        //Get the contents of the cell
                        String tempStr = "";
                        for (int k = 0; k <td.numParagraphs(); k + + ) {<!-- -->
                            Paragraph para = td.getParagraph(k);
                            String s = para.text().trim();
                            tempStr + = s;
                        }
                        System.out.print(tempStr + "\t");
                    }
                    System.out.println();
                }
                break;
            }
        }
    }
  1. The following method handles .docx files
 static void handleDocx(InputStream inputStream, String findStr) throws Exception {<!-- -->
        XWPFDocument document = new XWPFDocument(inputStream);
        List<IBodyElement> bodyElements = document.getBodyElements();
        //Whether it is possible to find the table element identifier
        boolean flagToFindTable = false;
        for (IBodyElement bodyElement : bodyElements) {<!-- -->
            BodyElementType elementType = bodyElement.getElementType();
            //Loop in order, first check which previous paragraph contains the query text
            if (elementType == BodyElementType.PARAGRAPH & amp; & amp; !flagToFindTable) {<!-- -->
                //paragraph
                XWPFParagraph para = (XWPFParagraph) bodyElement;
                List<XWPFRun> runs = para.getRuns();
                if (runs == null || runs.isEmpty()) {<!-- -->
                    continue;
                }
                for (XWPFRun run : runs) {<!-- -->
                    //If the fragment has no text, maybe the fragment is a picture
                    if (StringUtils.isNotEmpty(run.text())) {<!-- -->
                        //This fragment is not empty
                        if (run.text().contains(findStr)) {<!-- -->
                            //If the paragraph contains this query text
                            flagToFindTable = true;
                        }
                    }
                }
            }

            if (elementType == BodyElementType.TABLE & amp; & amp; flagToFindTable) {<!-- -->
                //sheet
                XWPFTable table = (XWPFTable) bodyElement;
                List<XWPFTableRow> rows = table.getRows();
                //Read each row of data
                for (int i = 0; i < rows.size(); i + + ) {<!-- -->
                    XWPFTableRow row = rows.get(i);
                    //Read each column of data
                    List<XWPFTableCell> cells = row.getTableCells();
                    for (int j = 0; j < cells.size(); j + + ) {<!-- -->
                        XWPFTableCell cell = cells.get(j);
                        System.out.print(cell.getText() + "\t");
                    }
                    System.out.println();
                }
                break;
            }
        }
        if (!flagToFindTable) {<!-- -->
            System.out.println(String.format("This word document does not have [%s] text, please re-enter the search form", findStr));
        }
    }

Note: The doc handling only works on genuine DOC files. If a DOCX file is merely renamed with a .doc extension, it is still a DOCX file internally, and the doc handling code will fail to read it.