diff --git a/electromagnetic-common/pom.xml b/electromagnetic-common/pom.xml
index f992669..ecdbafb 100644
--- a/electromagnetic-common/pom.xml
+++ b/electromagnetic-common/pom.xml
@@ -65,6 +65,34 @@
documents4j-transformer-msoffice-word
1.0.3
+
+
+ org.apache.poi
+ poi-ooxml
+ 4.1.2
+
+
+
+
+ org.apache.poi
+ poi-ooxml-schemas
+ 4.1.0
+
+
+
+
+ org.apache.poi
+ poi-scratchpad
+ 4.1.2
+
+
+
+ org.apache.pdfbox
+ pdfbox
+ 2.0.24
+
+
+
\ No newline at end of file
diff --git a/electromagnetic-common/src/main/java/com/electromagnetic/industry/software/common/parse/ExcelParse.java b/electromagnetic-common/src/main/java/com/electromagnetic/industry/software/common/parse/ExcelParse.java
new file mode 100644
index 0000000..d4cc30b
--- /dev/null
+++ b/electromagnetic-common/src/main/java/com/electromagnetic/industry/software/common/parse/ExcelParse.java
@@ -0,0 +1,11 @@
+package com.electromagnetic.industry.software.common.parse;
+
+import java.io.InputStream;
+
+/**
+ * {@link FileParse} implementation for Excel workbooks.
+ *
+ * <p>Text extraction is not implemented: every call yields an empty string.
+ */
+public class ExcelParse extends FileParse {
+
+    /**
+     * Placeholder parser; neither argument is read.
+     *
+     * @param stream   workbook content (currently ignored)
+     * @param fileType file extension (currently ignored)
+     * @return always the empty string
+     */
+    @Override
+    public String parseContent(InputStream stream, String fileType) {
+        // No extraction logic exists for this format yet.
+        return "";
+    }
+}
diff --git a/electromagnetic-common/src/main/java/com/electromagnetic/industry/software/common/parse/FileParse.java b/electromagnetic-common/src/main/java/com/electromagnetic/industry/software/common/parse/FileParse.java
new file mode 100644
index 0000000..9b9d469
--- /dev/null
+++ b/electromagnetic-common/src/main/java/com/electromagnetic/industry/software/common/parse/FileParse.java
@@ -0,0 +1,18 @@
+package com.electromagnetic.industry.software.common.parse;
+
+import cn.hutool.core.util.IdUtil;
+
+import java.io.File;
+import java.io.InputStream;
+
+/**
+ * Base class for parsers that turn the bytes of a file into plain text.
+ */
+public abstract class FileParse {
+
+    /**
+     * Directory in which scratch copies of incoming streams are created.
+     *
+     * <p>The JVM temp directory is used rather than the class-path root: looking up
+     * {@code getResource("")} can return {@code null} when the application runs from a
+     * packaged jar (which would make class initialisation fail), its {@code getPath()} is
+     * URL-encoded, and scratch files would end up next to the compiled classes.
+     */
+    private static final String TMP_DIR = System.getProperty("java.io.tmpdir");
+
+    /**
+     * Extracts the textual content of a file.
+     *
+     * @param stream   raw file content
+     * @param fileType file extension without the leading dot
+     * @return the extracted text
+     */
+    public abstract String parseContent(InputStream stream, String fileType);
+
+    /**
+     * Builds a unique scratch-file path of the form {@code <tmpdir>/<uuid>.<fileType>}.
+     * The file itself is not created.
+     *
+     * @param fileType extension appended to the generated name
+     * @return path of a scratch file that does not exist yet
+     */
+    protected String createFileTmpPath(String fileType) {
+        String fileName = IdUtil.fastSimpleUUID() + "." + fileType;
+        // File(parent, child) inserts exactly one separator whether or not TMP_DIR ends with one.
+        return new File(TMP_DIR, fileName).getPath();
+    }
+}
diff --git a/electromagnetic-common/src/main/java/com/electromagnetic/industry/software/common/parse/PdfParse.java b/electromagnetic-common/src/main/java/com/electromagnetic/industry/software/common/parse/PdfParse.java
new file mode 100644
index 0000000..c9e92bb
--- /dev/null
+++ b/electromagnetic-common/src/main/java/com/electromagnetic/industry/software/common/parse/PdfParse.java
@@ -0,0 +1,21 @@
+package com.electromagnetic.industry.software.common.parse;
+
+import com.electromagnetic.industry.software.common.util.OfficeFileUtil;
+import lombok.extern.slf4j.Slf4j;
+
+import java.io.InputStream;
+
+/**
+ * {@link FileParse} implementation that extracts the text of a PDF document.
+ */
+@Slf4j
+public class PdfParse extends FileParse {
+
+    /**
+     * Copies {@code stream} to a scratch file, extracts its text with
+     * {@link OfficeFileUtil#parsePdf(String)} and removes the scratch file again.
+     *
+     * @param stream   PDF content; it is read but not closed by this method
+     * @param fileType extension used for the scratch file name
+     * @return the extracted text, or an empty string if anything fails (the failure is logged)
+     */
+    @Override
+    public String parseContent(InputStream stream, String fileType) {
+        java.io.File tmpFile = new java.io.File(createFileTmpPath(fileType));
+        try {
+            // The PDF parser works on a path, so the stream has to be written to disk first.
+            // (Fully-qualified names keep this change independent of the import list.)
+            java.nio.file.Files.copy(stream, tmpFile.toPath(),
+                    java.nio.file.StandardCopyOption.REPLACE_EXISTING);
+            return OfficeFileUtil.parsePdf(tmpFile.getPath());
+        } catch (Exception e) {
+            log.error("解析pdf文件失败{}", e.getMessage(), e);
+            return "";
+        } finally {
+            // Never leave the scratch copy behind, whether parsing succeeded or not.
+            if (tmpFile.exists() && !tmpFile.delete()) {
+                log.warn("Failed to delete temporary file {}", tmpFile);
+            }
+        }
+    }
+}
diff --git a/electromagnetic-common/src/main/java/com/electromagnetic/industry/software/common/parse/PptParse.java b/electromagnetic-common/src/main/java/com/electromagnetic/industry/software/common/parse/PptParse.java
new file mode 100644
index 0000000..4b81e5e
--- /dev/null
+++ b/electromagnetic-common/src/main/java/com/electromagnetic/industry/software/common/parse/PptParse.java
@@ -0,0 +1,11 @@
+package com.electromagnetic.industry.software.common.parse;
+
+import java.io.InputStream;
+
+/**
+ * {@link FileParse} implementation for PowerPoint presentations.
+ *
+ * <p>Text extraction is not implemented: every call yields an empty string.
+ */
+public class PptParse extends FileParse {
+
+    /**
+     * Placeholder parser; neither argument is read.
+     *
+     * @param stream   presentation content (currently ignored)
+     * @param fileType file extension (currently ignored)
+     * @return always the empty string
+     */
+    @Override
+    public String parseContent(InputStream stream, String fileType) {
+        // No extraction logic exists for this format yet.
+        return "";
+    }
+}
diff --git a/electromagnetic-common/src/main/java/com/electromagnetic/industry/software/common/parse/TextParse.java b/electromagnetic-common/src/main/java/com/electromagnetic/industry/software/common/parse/TextParse.java
new file mode 100644
index 0000000..ad29da7
--- /dev/null
+++ b/electromagnetic-common/src/main/java/com/electromagnetic/industry/software/common/parse/TextParse.java
@@ -0,0 +1,26 @@
+package com.electromagnetic.industry.software.common.parse;
+
+import cn.hutool.core.io.FileUtil;
+import lombok.extern.slf4j.Slf4j;
+
+import java.io.InputStream;
+import java.nio.charset.Charset;
+
+/**
+ * {@link FileParse} implementation that treats the input as plain text.
+ */
+@Slf4j
+public class TextParse extends FileParse {
+
+    /**
+     * Writes {@code stream} to a scratch file, reads that file back as a string using the
+     * platform default charset, and deletes the scratch file afterwards.
+     *
+     * @param stream   text content
+     * @param fileType extension used for the scratch file name and in the log message
+     * @return the file content, or an empty string if writing or reading fails
+     */
+    @Override
+    public String parseContent(InputStream stream, String fileType) {
+        final String scratchFile = createFileTmpPath(fileType);
+        try {
+            FileUtil.writeFromStream(stream, scratchFile);
+            return FileUtil.readString(scratchFile, Charset.defaultCharset());
+        } catch (Exception ex) {
+            log.info("解析文件 {} 错误, 原因 {}", fileType, ex.getMessage(), ex);
+            return "";
+        } finally {
+            // Runs on both the success and the failure path.
+            FileUtil.del(scratchFile);
+        }
+    }
+}
diff --git a/electromagnetic-common/src/main/java/com/electromagnetic/industry/software/common/parse/WordParse.java b/electromagnetic-common/src/main/java/com/electromagnetic/industry/software/common/parse/WordParse.java
new file mode 100644
index 0000000..76191e3
--- /dev/null
+++ b/electromagnetic-common/src/main/java/com/electromagnetic/industry/software/common/parse/WordParse.java
@@ -0,0 +1,28 @@
+package com.electromagnetic.industry.software.common.parse;
+
+import cn.hutool.core.io.FileUtil;
+import com.electromagnetic.industry.software.common.util.OfficeFileUtil;
+import lombok.extern.slf4j.Slf4j;
+
+import java.io.InputStream;
+
+/**
+ * {@link FileParse} implementation for Word documents (.doc and .docx).
+ */
+@Slf4j
+public class WordParse extends FileParse {
+
+    /**
+     * Writes {@code stream} to a scratch file and extracts its text, choosing the parser by
+     * extension: {@link OfficeFileUtil#parseDocx(String)} when {@code fileType} ends with
+     * "docx", {@link OfficeFileUtil#parseDoc(String)} otherwise. The scratch file is always
+     * deleted before returning.
+     *
+     * @param stream   document content
+     * @param fileType file extension; also used for the scratch file name
+     * @return the extracted text, or an empty string if anything fails (the failure is logged)
+     */
+    @Override
+    public String parseContent(InputStream stream, String fileType) {
+        String fileTmpPath = createFileTmpPath(fileType);
+        try {
+            FileUtil.writeFromStream(stream, fileTmpPath);
+            return fileType.endsWith("docx")
+                    ? OfficeFileUtil.parseDocx(fileTmpPath)
+                    : OfficeFileUtil.parseDoc(fileTmpPath);
+        } catch (Exception e) {
+            log.info("解析{}失败,原因{}", fileType, e.getMessage(), e);
+            return "";
+        } finally {
+            // Without this, every parsed document would leave a scratch copy on disk.
+            FileUtil.del(fileTmpPath);
+        }
+    }
+}
diff --git a/electromagnetic-common/src/main/java/com/electromagnetic/industry/software/common/util/EleCommonUtil.java b/electromagnetic-common/src/main/java/com/electromagnetic/industry/software/common/util/EleCommonUtil.java
index 50f26d1..8aeb340 100644
--- a/electromagnetic-common/src/main/java/com/electromagnetic/industry/software/common/util/EleCommonUtil.java
+++ b/electromagnetic-common/src/main/java/com/electromagnetic/industry/software/common/util/EleCommonUtil.java
@@ -4,6 +4,7 @@ import cn.hutool.core.io.FileUtil;
import cn.hutool.core.util.StrUtil;
import cn.hutool.crypto.symmetric.AES;
import com.electromagnetic.industry.software.common.exception.BizException;
+import com.electromagnetic.industry.software.common.parse.*;
import java.io.File;
import java.io.InputStream;
@@ -12,10 +13,26 @@ import java.nio.file.Files;
import java.nio.file.Paths;
import java.time.LocalDateTime;
import java.time.format.DateTimeFormatter;
+import java.util.HashMap;
+import java.util.Map;
import java.util.regex.Pattern;
public final class EleCommonUtil {
+    /**
+     * Content parsers keyed by lower-case file extension (without the dot).
+     * Filled once by the static initialiser below and only read afterwards.
+     */
+    private static final Map<String, FileParse> PARSE_MAP = new HashMap<>();
+
+    static {
+        PARSE_MAP.put("doc", new WordParse());
+        PARSE_MAP.put("docx", new WordParse());
+        PARSE_MAP.put("xls", new ExcelParse());
+        PARSE_MAP.put("xlsx", new ExcelParse());
+        PARSE_MAP.put("ppt", new PptParse());
+        PARSE_MAP.put("pptx", new PptParse());
+        // PDFs need their own parser; without this entry they would be read as plain text.
+        PARSE_MAP.put("pdf", new PdfParse());
+        PARSE_MAP.put("text", new TextParse());
+        PARSE_MAP.put("txt", new TextParse());
+        PARSE_MAP.put("py", new TextParse());
+    }
+
// 正则表达式模式,匹配中文字符、下划线、连字符、加号、数字和英文字符
private static final String PATTERN = "^[\\u4e00-\\u9fa5a-zA-Z0-9._\\-+]+$";
@@ -59,4 +76,15 @@ public final class EleCommonUtil {
}
}
+    /**
+     * Reports whether the JVM is running on a Windows operating system.
+     *
+     * @return {@code true} if the {@code os.name} system property starts with "win",
+     *         ignoring case; {@code false} otherwise, including when the property is unset
+     */
+    public static boolean isWinOs() {
+        String osName = System.getProperty("os.name", "");
+        // regionMatches compares case-insensitively per character, so the result does not
+        // depend on the default locale the way toLowerCase() does.
+        return osName.regionMatches(true, 0, "win", 0, 3);
+    }
+
+    /**
+     * Extracts the text content of a file using the parser registered for its extension.
+     *
+     * <p>The extension is matched case-insensitively. Extensions without a registered parser
+     * (and a {@code null} extension) are handled by {@link TextParse}.
+     *
+     * @param inputStream raw file content
+     * @param fileType    file extension without the leading dot, e.g. {@code "docx"}
+     * @return whatever the selected parser returns for the content
+     */
+    public static String parse(InputStream inputStream, String fileType) {
+        // Registry keys are lower case; normalise so that "DOCX" or "Pdf" still match.
+        String extension = fileType == null ? null : fileType.toLowerCase(java.util.Locale.ROOT);
+
+        FileParse fileParse = PARSE_MAP.get(extension);
+        if (fileParse == null) {
+            // Only allocate the fallback parser when it is actually needed.
+            fileParse = new TextParse();
+        }
+        return fileParse.parseContent(inputStream, extension);
+    }
+
}
diff --git a/electromagnetic-common/src/main/java/com/electromagnetic/industry/software/common/util/OfficeFileUtil.java b/electromagnetic-common/src/main/java/com/electromagnetic/industry/software/common/util/OfficeFileUtil.java
index ca9380f..415d5a9 100644
--- a/electromagnetic-common/src/main/java/com/electromagnetic/industry/software/common/util/OfficeFileUtil.java
+++ b/electromagnetic-common/src/main/java/com/electromagnetic/industry/software/common/util/OfficeFileUtil.java
@@ -5,32 +5,80 @@ import com.documents4j.api.DocumentType;
import com.documents4j.api.IConverter;
import com.documents4j.job.LocalConverter;
import lombok.extern.slf4j.Slf4j;
+import org.apache.pdfbox.pdmodel.PDDocument;
+import org.apache.pdfbox.text.PDFTextStripper;
+import org.apache.poi.hwpf.HWPFDocument;
+import org.apache.poi.hwpf.extractor.WordExtractor;
+import org.apache.poi.xwpf.usermodel.XWPFDocument;
+import org.apache.poi.xwpf.usermodel.XWPFParagraph;
import java.io.*;
import java.nio.file.Files;
+import java.nio.file.Paths;
+import java.util.List;
@Slf4j
public class OfficeFileUtil {
+    /**
+     * Converts a Word document to PDF through a documents4j {@link LocalConverter}.
+     *
+     * <p>The conversion only runs when {@link EleCommonUtil#isWinOs()} reports Windows; on
+     * any other system the call returns without doing anything. Failures are logged and not
+     * rethrown.
+     *
+     * @param wordPath path of the source document; it is submitted to the converter as DOCX
+     * @param pdfPath  path the resulting PDF is written to
+     */
public static void doc2pdf(String wordPath, String pdfPath) {
- File inputWord = new File(wordPath);
- File outputFile = new File(pdfPath);
- try {
- InputStream docxInputStream = Files.newInputStream(inputWord.toPath());
- OutputStream outputStream = Files.newOutputStream(outputFile.toPath());
- IConverter converter = LocalConverter.builder().build();
- boolean execute = converter.convert(docxInputStream)
- .as(DocumentType.DOCX)
- .to(outputStream)
- .as(DocumentType.PDF).schedule().get();
- Assert.isTrue(execute, "转换失败");
- outputStream.close();
- docxInputStream.close();
- log.info("转换完毕 targetPath = {}", outputFile.getAbsolutePath());
- converter.shutDown();
- } catch (Exception e) {
- log.error("[documents4J] word转pdf失败:{}", e.toString());
+        if (!EleCommonUtil.isWinOs()) {
+            // No conversion backend is wired up for non-Windows hosts.
+            return;
+        }
+        File inputWord = new File(wordPath);
+        File outputFile = new File(pdfPath);
+        // The converter is created per call and shut down in finally. A shared static
+        // instance that is shut down after the first conversion cannot serve later calls,
+        // and building it in a static initialiser would also run on hosts that never
+        // reach this branch.
+        IConverter converter = null;
+        try (InputStream docxInputStream = Files.newInputStream(inputWord.toPath());
+             OutputStream outputStream = Files.newOutputStream(outputFile.toPath())) {
+            converter = LocalConverter.builder().build();
+            boolean execute = converter.convert(docxInputStream)
+                    .as(DocumentType.DOCX)
+                    .to(outputStream)
+                    .as(DocumentType.PDF).schedule().get();
+            Assert.isTrue(execute, "转换失败");
+            log.info("转换完毕 targetPath = {}", outputFile.getAbsolutePath());
+        } catch (Exception e) {
+            if (e instanceof InterruptedException) {
+                // Preserve the interrupt for callers further up the stack.
+                Thread.currentThread().interrupt();
+            }
+            log.error("[documents4J] word转pdf失败:{}", e.toString(), e);
+        } finally {
+            if (converter != null) {
+                converter.shutDown();
+            }
}
}
+    /**
+     * Extracts the text of a .docx file by concatenating the text of the paragraphs
+     * returned by {@link XWPFDocument#getParagraphs()}, with no separator between them.
+     *
+     * @param wordPath path of the .docx file
+     * @return the concatenated paragraph text
+     * @throws IOException if the file cannot be opened or read
+     */
+    public static String parseDocx(String wordPath) throws IOException {
+        // The document must be built from the stream; a no-arg XWPFDocument is an empty
+        // document and would always produce an empty result.
+        try (InputStream fis = Files.newInputStream(Paths.get(wordPath));
+             XWPFDocument document = new XWPFDocument(fis)) {
+            StringBuilder stringBuilder = new StringBuilder();
+            for (XWPFParagraph paragraph : document.getParagraphs()) {
+                stringBuilder.append(paragraph.getText());
+            }
+            return stringBuilder.toString();
+        }
+    }
+
+    /**
+     * Extracts the text of a PDF file with PDFBox's {@link PDFTextStripper}.
+     *
+     * @param path path of the PDF file
+     * @return the text returned by the stripper for the whole document
+     * @throws IOException if the file cannot be loaded or its text cannot be extracted
+     */
+    public static String parsePdf(String path) throws IOException {
+        // try-with-resources closes the document even when text extraction throws.
+        try (PDDocument document = PDDocument.load(new File(path))) {
+            PDFTextStripper pdfStripper = new PDFTextStripper();
+            return pdfStripper.getText(document);
+        }
+    }
+
+    /**
+     * Extracts the text of a legacy .doc file by concatenating the entries returned by
+     * {@link WordExtractor#getParagraphText()}.
+     *
+     * @param path path of the .doc file
+     * @return the concatenated paragraph text
+     * @throws IOException if the file cannot be opened or read
+     */
+    public static String parseDoc(String path) throws IOException {
+        // All three resources are closed in reverse order, also when extraction fails.
+        try (InputStream fis = Files.newInputStream(Paths.get(path));
+             HWPFDocument document = new HWPFDocument(fis);
+             WordExtractor wordExtractor = new WordExtractor(document)) {
+            StringBuilder stringBuilder = new StringBuilder();
+            for (String paragraph : wordExtractor.getParagraphText()) {
+                stringBuilder.append(paragraph);
+            }
+            return stringBuilder.toString();
+        }
+    }
}