调通解析Word文件

This commit is contained in:
chenxudong 2025-02-17 16:16:39 +08:00
parent 3c8d8b165e
commit 5b8dfe9577
2 changed files with 12 additions and 17 deletions

View File

@ -21,7 +21,7 @@ public class WordParse extends FileParse {
res = OfficeFileUtil.parseDocAllText(fileTmpPath);
}
} catch (Exception e) {
log.info("解析{}失败,原因{}", fileType, e.getMessage(), e);
log.error("解析{}失败,原因{}", fileType, e.getMessage(), e);
} finally {
FileUtil.del(fileTmpPath);
}

View File

@ -21,6 +21,7 @@ import org.apache.poi.xslf.usermodel.XMLSlideShow;
import org.apache.poi.xssf.usermodel.XSSFRow;
import org.apache.poi.xssf.usermodel.XSSFSheet;
import org.apache.poi.xssf.usermodel.XSSFWorkbook;
import org.apache.poi.xwpf.extractor.XWPFWordExtractor;
import org.apache.poi.xwpf.usermodel.XWPFDocument;
import org.apache.poi.xwpf.usermodel.XWPFParagraph;
@ -79,15 +80,12 @@ public class OfficeFileUtil {
/**
 * Extracts the text of a .docx (OOXML) Word file.
 *
 * <p>Opens the file at {@code wordPath}, loads it into an {@link XWPFDocument} and returns
 * whatever {@link XWPFWordExtractor#getText()} produces for it. The stream and the document
 * are managed by try-with-resources, so both are released even when loading or extraction
 * throws; the document is closed first, then the stream.
 *
 * @param wordPath file-system path of the .docx file to read
 * @return the text returned by the extractor
 * @throws IOException if the file cannot be opened or the document cannot be loaded
 */
public static String parseDocxAllText(String wordPath) throws IOException {
    log.info("Start parse docx file, path is {}", wordPath);
    // Resources close in reverse declaration order: document, then fis.
    try (InputStream fis = Files.newInputStream(Paths.get(wordPath));
         XWPFDocument document = new XWPFDocument(fis)) {
        // The extractor is intentionally not closed on its own; closing the
        // document it wraps is what releases the underlying resources here.
        XWPFWordExtractor extractor = new XWPFWordExtractor(document);
        return extractor.getText();
    }
}
public static String parsePdfAllText(String path) throws IOException {
@ -102,18 +100,15 @@ public class OfficeFileUtil {
return text;
}
public static String parseDocAllText(String path) throws IOException {
log.info("Start parse doc file, path is {}", path);
FileInputStream fis = new FileInputStream(path);
public static String parseDocAllText(String wordPath) throws IOException {
log.info("Start parse doc file, path is {}", wordPath);
InputStream fis = Files.newInputStream(Paths.get(wordPath));
HWPFDocument document = new HWPFDocument(fis);
WordExtractor wordExtractor = new WordExtractor(document);
String[] paragraphText = wordExtractor.getParagraphText();
StringBuilder stringBuilder = new StringBuilder();
for (String paragraph : paragraphText) {
stringBuilder.append(paragraph);
}
String text = wordExtractor.getText();
document.close();
fis.close();
return stringBuilder.toString();
return text;
}
public static String parseXlsxAllText(String path) throws IOException {