From 5b8dfe95775f4dd049a3d749373d3abc005e3b53 Mon Sep 17 00:00:00 2001 From: chenxudong Date: Mon, 17 Feb 2025 16:16:39 +0800 Subject: [PATCH] =?UTF-8?q?=E8=B0=83=E9=80=9A=E8=A7=A3=E6=9E=90Word?= =?UTF-8?q?=E6=96=87=E4=BB=B6?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../software/common/parse/WordParse.java | 2 +- .../software/common/util/OfficeFileUtil.java | 27 ++++++++----------- 2 files changed, 12 insertions(+), 17 deletions(-) diff --git a/electromagnetic-common/src/main/java/com/electromagnetic/industry/software/common/parse/WordParse.java b/electromagnetic-common/src/main/java/com/electromagnetic/industry/software/common/parse/WordParse.java index f31d8cc..5122506 100644 --- a/electromagnetic-common/src/main/java/com/electromagnetic/industry/software/common/parse/WordParse.java +++ b/electromagnetic-common/src/main/java/com/electromagnetic/industry/software/common/parse/WordParse.java @@ -21,7 +21,7 @@ public class WordParse extends FileParse { res = OfficeFileUtil.parseDocAllText(fileTmpPath); } } catch (Exception e) { - log.info("解析{}失败,原因{}", fileType, e.getMessage(), e); + log.error("解析{}失败,原因{}", fileType, e.getMessage(), e); } finally { FileUtil.del(fileTmpPath); } diff --git a/electromagnetic-common/src/main/java/com/electromagnetic/industry/software/common/util/OfficeFileUtil.java b/electromagnetic-common/src/main/java/com/electromagnetic/industry/software/common/util/OfficeFileUtil.java index 66773d8..496b19b 100644 --- a/electromagnetic-common/src/main/java/com/electromagnetic/industry/software/common/util/OfficeFileUtil.java +++ b/electromagnetic-common/src/main/java/com/electromagnetic/industry/software/common/util/OfficeFileUtil.java @@ -21,6 +21,7 @@ import org.apache.poi.xslf.usermodel.XMLSlideShow; import org.apache.poi.xssf.usermodel.XSSFRow; import org.apache.poi.xssf.usermodel.XSSFSheet; import org.apache.poi.xssf.usermodel.XSSFWorkbook; +import org.apache.poi.xwpf.extractor.XWPFWordExtractor; import org.apache.poi.xwpf.usermodel.XWPFDocument; import org.apache.poi.xwpf.usermodel.XWPFParagraph; @@ -79,15 +80,12 @@ public class OfficeFileUtil { public static String parseDocxAllText(String wordPath) throws IOException { log.info("Start parse docx file, path is {}", wordPath); InputStream fis = Files.newInputStream(Paths.get(wordPath)); - XWPFDocument document = new XWPFDocument(); - StringBuilder stringBuilder = new StringBuilder(); - List paragraphs = document.getParagraphs(); - for (XWPFParagraph paragraph : paragraphs) { - stringBuilder.append(paragraph.getText()); - } + XWPFDocument document = new XWPFDocument(fis); + XWPFWordExtractor extractor = new XWPFWordExtractor(document); + String text = extractor.getText(); document.close(); fis.close(); - return stringBuilder.toString(); + return text; } public static String parsePdfAllText(String path) throws IOException { @@ -102,18 +100,15 @@ public class OfficeFileUtil { return text; } - public static String parseDocAllText(String path) throws IOException { - log.info("Start parse doc file, path is {}", path); - FileInputStream fis = new FileInputStream(path); + public static String parseDocAllText(String wordPath) throws IOException { + log.info("Start parse doc file, path is {}", wordPath); + InputStream fis = Files.newInputStream(Paths.get(wordPath)); HWPFDocument document = new HWPFDocument(fis); WordExtractor wordExtractor = new WordExtractor(document); - String[] paragraphText = wordExtractor.getParagraphText(); - StringBuilder stringBuilder = new StringBuilder(); - for (String paragraph : paragraphText) { - stringBuilder.append(paragraph); - } + String text = wordExtractor.getText(); + document.close(); fis.close(); - return stringBuilder.toString(); + return text; } public static String parseXlsxAllText(String path) throws IOException {