调通解析Word文件

This commit is contained in:
chenxudong 2025-02-17 16:16:39 +08:00
parent 3c8d8b165e
commit 5b8dfe9577
2 changed files with 12 additions and 17 deletions

View File

@ -21,7 +21,7 @@ public class WordParse extends FileParse {
res = OfficeFileUtil.parseDocAllText(fileTmpPath); res = OfficeFileUtil.parseDocAllText(fileTmpPath);
} }
} catch (Exception e) { } catch (Exception e) {
log.info("解析{}失败,原因{}", fileType, e.getMessage(), e); log.error("解析{}失败,原因{}", fileType, e.getMessage(), e);
} finally { } finally {
FileUtil.del(fileTmpPath); FileUtil.del(fileTmpPath);
} }

View File

@ -21,6 +21,7 @@ import org.apache.poi.xslf.usermodel.XMLSlideShow;
import org.apache.poi.xssf.usermodel.XSSFRow; import org.apache.poi.xssf.usermodel.XSSFRow;
import org.apache.poi.xssf.usermodel.XSSFSheet; import org.apache.poi.xssf.usermodel.XSSFSheet;
import org.apache.poi.xssf.usermodel.XSSFWorkbook; import org.apache.poi.xssf.usermodel.XSSFWorkbook;
import org.apache.poi.xwpf.extractor.XWPFWordExtractor;
import org.apache.poi.xwpf.usermodel.XWPFDocument; import org.apache.poi.xwpf.usermodel.XWPFDocument;
import org.apache.poi.xwpf.usermodel.XWPFParagraph; import org.apache.poi.xwpf.usermodel.XWPFParagraph;
@ -79,15 +80,12 @@ public class OfficeFileUtil {
public static String parseDocxAllText(String wordPath) throws IOException { public static String parseDocxAllText(String wordPath) throws IOException {
log.info("Start parse docx file, path is {}", wordPath); log.info("Start parse docx file, path is {}", wordPath);
InputStream fis = Files.newInputStream(Paths.get(wordPath)); InputStream fis = Files.newInputStream(Paths.get(wordPath));
XWPFDocument document = new XWPFDocument(); XWPFDocument document = new XWPFDocument(fis);
StringBuilder stringBuilder = new StringBuilder(); XWPFWordExtractor extractor = new XWPFWordExtractor(document);
List<XWPFParagraph> paragraphs = document.getParagraphs(); String text = extractor.getText();
for (XWPFParagraph paragraph : paragraphs) {
stringBuilder.append(paragraph.getText());
}
document.close(); document.close();
fis.close(); fis.close();
return stringBuilder.toString(); return text;
} }
public static String parsePdfAllText(String path) throws IOException { public static String parsePdfAllText(String path) throws IOException {
@ -102,18 +100,15 @@ public class OfficeFileUtil {
return text; return text;
} }
public static String parseDocAllText(String path) throws IOException { public static String parseDocAllText(String wordPath) throws IOException {
log.info("Start parse doc file, path is {}", path); log.info("Start parse doc file, path is {}", wordPath);
FileInputStream fis = new FileInputStream(path); InputStream fis = Files.newInputStream(Paths.get(wordPath));
HWPFDocument document = new HWPFDocument(fis); HWPFDocument document = new HWPFDocument(fis);
WordExtractor wordExtractor = new WordExtractor(document); WordExtractor wordExtractor = new WordExtractor(document);
String[] paragraphText = wordExtractor.getParagraphText(); String text = wordExtractor.getText();
StringBuilder stringBuilder = new StringBuilder(); document.close();
for (String paragraph : paragraphText) {
stringBuilder.append(paragraph);
}
fis.close(); fis.close();
return stringBuilder.toString(); return text;
} }
public static String parseXlsxAllText(String path) throws IOException { public static String parseXlsxAllText(String path) throws IOException {