diff --git a/electromagnetic-common/src/main/java/com/electromagnetic/industry/software/common/parse/WordParse.java b/electromagnetic-common/src/main/java/com/electromagnetic/industry/software/common/parse/WordParse.java index f31d8cc..5122506 100644 --- a/electromagnetic-common/src/main/java/com/electromagnetic/industry/software/common/parse/WordParse.java +++ b/electromagnetic-common/src/main/java/com/electromagnetic/industry/software/common/parse/WordParse.java @@ -21,7 +21,7 @@ public class WordParse extends FileParse { res = OfficeFileUtil.parseDocAllText(fileTmpPath); } } catch (Exception e) { - log.info("解析{}失败,原因{}", fileType, e.getMessage(), e); + log.error("解析{}失败,原因{}", fileType, e.getMessage(), e); } finally { FileUtil.del(fileTmpPath); } diff --git a/electromagnetic-common/src/main/java/com/electromagnetic/industry/software/common/util/OfficeFileUtil.java b/electromagnetic-common/src/main/java/com/electromagnetic/industry/software/common/util/OfficeFileUtil.java index 66773d8..496b19b 100644 --- a/electromagnetic-common/src/main/java/com/electromagnetic/industry/software/common/util/OfficeFileUtil.java +++ b/electromagnetic-common/src/main/java/com/electromagnetic/industry/software/common/util/OfficeFileUtil.java @@ -21,6 +21,7 @@ import org.apache.poi.xslf.usermodel.XMLSlideShow; import org.apache.poi.xssf.usermodel.XSSFRow; import org.apache.poi.xssf.usermodel.XSSFSheet; import org.apache.poi.xssf.usermodel.XSSFWorkbook; +import org.apache.poi.xwpf.extractor.XWPFWordExtractor; import org.apache.poi.xwpf.usermodel.XWPFDocument; import org.apache.poi.xwpf.usermodel.XWPFParagraph; @@ -79,15 +80,12 @@ public class OfficeFileUtil { public static String parseDocxAllText(String wordPath) throws IOException { log.info("Start parse docx file, path is {}", wordPath); InputStream fis = Files.newInputStream(Paths.get(wordPath)); - XWPFDocument document = new XWPFDocument(); - StringBuilder stringBuilder = new StringBuilder(); - List<XWPFParagraph> paragraphs = 
document.getParagraphs(); - for (XWPFParagraph paragraph : paragraphs) { - stringBuilder.append(paragraph.getText()); - } + XWPFDocument document = new XWPFDocument(fis); + XWPFWordExtractor extractor = new XWPFWordExtractor(document); + String text = extractor.getText(); document.close(); fis.close(); - return stringBuilder.toString(); + return text; } public static String parsePdfAllText(String path) throws IOException { @@ -102,18 +100,15 @@ public class OfficeFileUtil { return text; } - public static String parseDocAllText(String path) throws IOException { - log.info("Start parse doc file, path is {}", path); - FileInputStream fis = new FileInputStream(path); + public static String parseDocAllText(String wordPath) throws IOException { + log.info("Start parse doc file, path is {}", wordPath); + InputStream fis = Files.newInputStream(Paths.get(wordPath)); HWPFDocument document = new HWPFDocument(fis); WordExtractor wordExtractor = new WordExtractor(document); - String[] paragraphText = wordExtractor.getParagraphText(); - StringBuilder stringBuilder = new StringBuilder(); - for (String paragraph : paragraphText) { - stringBuilder.append(paragraph); - } + String text = wordExtractor.getText(); + document.close(); fis.close(); - return stringBuilder.toString(); + return text; } public static String parseXlsxAllText(String path) throws IOException {