diff --git a/electromagnetic-common/pom.xml b/electromagnetic-common/pom.xml index f992669..ecdbafb 100644 --- a/electromagnetic-common/pom.xml +++ b/electromagnetic-common/pom.xml @@ -65,6 +65,34 @@ documents4j-transformer-msoffice-word 1.0.3 + + + org.apache.poi + poi-ooxml + 4.1.2 + + + + + org.apache.poi + poi-ooxml-schemas + 4.1.0 + + + + + org.apache.poi + poi-scratchpad + 4.1.2 + + + + org.apache.pdfbox + pdfbox + 2.0.24 + + + \ No newline at end of file diff --git a/electromagnetic-common/src/main/java/com/electromagnetic/industry/software/common/parse/ExcelParse.java b/electromagnetic-common/src/main/java/com/electromagnetic/industry/software/common/parse/ExcelParse.java new file mode 100644 index 0000000..d4cc30b --- /dev/null +++ b/electromagnetic-common/src/main/java/com/electromagnetic/industry/software/common/parse/ExcelParse.java @@ -0,0 +1,11 @@ +package com.electromagnetic.industry.software.common.parse; + +import java.io.InputStream; + +public class ExcelParse extends FileParse { + + @Override + public String parseContent(InputStream stream, String fileType) { + return ""; + } +} diff --git a/electromagnetic-common/src/main/java/com/electromagnetic/industry/software/common/parse/FileParse.java b/electromagnetic-common/src/main/java/com/electromagnetic/industry/software/common/parse/FileParse.java new file mode 100644 index 0000000..9b9d469 --- /dev/null +++ b/electromagnetic-common/src/main/java/com/electromagnetic/industry/software/common/parse/FileParse.java @@ -0,0 +1,18 @@ +package com.electromagnetic.industry.software.common.parse; + +import cn.hutool.core.util.IdUtil; + +import java.io.File; +import java.io.InputStream; + +public abstract class FileParse { + + private static String tmpPath = Thread.currentThread().getContextClassLoader().getResource("").getPath(); + + public abstract String parseContent(InputStream stream, String fileType); + + protected String createFileTmpPath(String fileType) { + String uuid = IdUtil.fastSimpleUUID(); + return tmpPath + File.separator + uuid + "." + fileType; + } +} diff --git a/electromagnetic-common/src/main/java/com/electromagnetic/industry/software/common/parse/PdfParse.java b/electromagnetic-common/src/main/java/com/electromagnetic/industry/software/common/parse/PdfParse.java new file mode 100644 index 0000000..c9e92bb --- /dev/null +++ b/electromagnetic-common/src/main/java/com/electromagnetic/industry/software/common/parse/PdfParse.java @@ -0,0 +1,21 @@ +package com.electromagnetic.industry.software.common.parse; + +import com.electromagnetic.industry.software.common.util.OfficeFileUtil; +import lombok.extern.slf4j.Slf4j; + +import java.io.InputStream; + +@Slf4j +public class PdfParse extends FileParse { + @Override + public String parseContent(InputStream stream, String fileType) { + String res = ""; + try { + String fileTmpPath = createFileTmpPath(fileType); + res = OfficeFileUtil.parsePdf(fileTmpPath); + } catch (Exception e) { + log.error("解析pdf文件失败{}", e.getMessage(), e); + } + return res; + } +} diff --git a/electromagnetic-common/src/main/java/com/electromagnetic/industry/software/common/parse/PptParse.java b/electromagnetic-common/src/main/java/com/electromagnetic/industry/software/common/parse/PptParse.java new file mode 100644 index 0000000..4b81e5e --- /dev/null +++ b/electromagnetic-common/src/main/java/com/electromagnetic/industry/software/common/parse/PptParse.java @@ -0,0 +1,11 @@ +package com.electromagnetic.industry.software.common.parse; + +import java.io.InputStream; + +public class PptParse extends FileParse { + + @Override + public String parseContent(InputStream stream, String fileType) { + return ""; + } +} diff --git a/electromagnetic-common/src/main/java/com/electromagnetic/industry/software/common/parse/TextParse.java b/electromagnetic-common/src/main/java/com/electromagnetic/industry/software/common/parse/TextParse.java new file mode 100644 index 0000000..ad29da7 --- /dev/null +++ b/electromagnetic-common/src/main/java/com/electromagnetic/industry/software/common/parse/TextParse.java @@ -0,0 +1,26 @@ +package com.electromagnetic.industry.software.common.parse; + +import cn.hutool.core.io.FileUtil; +import lombok.extern.slf4j.Slf4j; + +import java.io.InputStream; +import java.nio.charset.Charset; + +@Slf4j +public class TextParse extends FileParse { + + @Override + public String parseContent(InputStream stream, String fileType) { + String fileTmpPath = createFileTmpPath(fileType); + String res = ""; + try { + FileUtil.writeFromStream(stream, fileTmpPath); + res = FileUtil.readString(fileTmpPath, Charset.defaultCharset()); + } catch (Exception e) { + log.info("解析文件 {} 错误, 原因 {}", fileType, e.getMessage(), e); + } finally { + FileUtil.del(fileTmpPath); + } + return res; + } +} diff --git a/electromagnetic-common/src/main/java/com/electromagnetic/industry/software/common/parse/WordParse.java b/electromagnetic-common/src/main/java/com/electromagnetic/industry/software/common/parse/WordParse.java new file mode 100644 index 0000000..76191e3 --- /dev/null +++ b/electromagnetic-common/src/main/java/com/electromagnetic/industry/software/common/parse/WordParse.java @@ -0,0 +1,28 @@ +package com.electromagnetic.industry.software.common.parse; + +import cn.hutool.core.io.FileUtil; +import com.electromagnetic.industry.software.common.util.OfficeFileUtil; +import lombok.extern.slf4j.Slf4j; + +import java.io.InputStream; + +@Slf4j +public class WordParse extends FileParse { + + @Override + public String parseContent(InputStream stream, String fileType) { + String fileTmpPath = createFileTmpPath(fileType); + String res = ""; + try { + FileUtil.writeFromStream(stream, fileTmpPath); + if (fileType.endsWith("docx")) { + return OfficeFileUtil.parseDocx(fileTmpPath); + } else { + return OfficeFileUtil.parseDoc(fileTmpPath); + } + } catch (Exception e) { + log.info("解析{}失败,原因{}", fileType, e.getMessage(), e); + } + return res; + } +} diff --git a/electromagnetic-common/src/main/java/com/electromagnetic/industry/software/common/util/EleCommonUtil.java b/electromagnetic-common/src/main/java/com/electromagnetic/industry/software/common/util/EleCommonUtil.java index 50f26d1..8aeb340 100644 --- a/electromagnetic-common/src/main/java/com/electromagnetic/industry/software/common/util/EleCommonUtil.java +++ b/electromagnetic-common/src/main/java/com/electromagnetic/industry/software/common/util/EleCommonUtil.java @@ -4,6 +4,7 @@ import cn.hutool.core.io.FileUtil; import cn.hutool.core.util.StrUtil; import cn.hutool.crypto.symmetric.AES; import com.electromagnetic.industry.software.common.exception.BizException; +import com.electromagnetic.industry.software.common.parse.*; import java.io.File; import java.io.InputStream; @@ -12,10 +13,26 @@ import java.nio.file.Files; import java.nio.file.Paths; import java.time.LocalDateTime; import java.time.format.DateTimeFormatter; +import java.util.HashMap; +import java.util.Map; import java.util.regex.Pattern; public final class EleCommonUtil { + private static final Map PARSE_MAP = new HashMap<>(); + + static { + PARSE_MAP.put("doc", new WordParse()); + PARSE_MAP.put("docx", new WordParse()); + PARSE_MAP.put("xls", new ExcelParse()); + PARSE_MAP.put("xlsx", new ExcelParse()); + PARSE_MAP.put("ppt", new PptParse()); + PARSE_MAP.put("pptx", new PptParse()); + PARSE_MAP.put("text", new TextParse()); + PARSE_MAP.put("txt", new TextParse()); + PARSE_MAP.put("py", new TextParse()); + } + // 正则表达式模式,匹配中文字符、下划线、连字符、加号、数字和英文字符 private static final String PATTERN = "^[\\u4e00-\\u9fa5a-zA-Z0-9._\\-+]+$"; @@ -59,4 +76,15 @@ public final class EleCommonUtil { } } + public static boolean isWinOs() { + return System.getProperty("os.name").toLowerCase().startsWith("win"); + } + + public static String parse(InputStream inputStream, String fileType) { + + FileParse fileParse = PARSE_MAP.getOrDefault(fileType, new TextParse()); + + return fileParse.parseContent(inputStream, fileType); + } + } diff --git a/electromagnetic-common/src/main/java/com/electromagnetic/industry/software/common/util/OfficeFileUtil.java b/electromagnetic-common/src/main/java/com/electromagnetic/industry/software/common/util/OfficeFileUtil.java index ca9380f..415d5a9 100644 --- a/electromagnetic-common/src/main/java/com/electromagnetic/industry/software/common/util/OfficeFileUtil.java +++ b/electromagnetic-common/src/main/java/com/electromagnetic/industry/software/common/util/OfficeFileUtil.java @@ -5,32 +5,80 @@ import com.documents4j.api.DocumentType; import com.documents4j.api.IConverter; import com.documents4j.job.LocalConverter; import lombok.extern.slf4j.Slf4j; +import org.apache.pdfbox.pdmodel.PDDocument; +import org.apache.pdfbox.text.PDFTextStripper; +import org.apache.poi.hwpf.HWPFDocument; +import org.apache.poi.hwpf.extractor.WordExtractor; +import org.apache.poi.xwpf.usermodel.XWPFDocument; +import org.apache.poi.xwpf.usermodel.XWPFParagraph; import java.io.*; import java.nio.file.Files; +import java.nio.file.Paths; +import java.util.List; @Slf4j public class OfficeFileUtil { + private static final IConverter CONVERT = LocalConverter.builder().build(); + public static void doc2pdf(String wordPath, String pdfPath) { - File inputWord = new File(wordPath); - File outputFile = new File(pdfPath); - try { - InputStream docxInputStream = Files.newInputStream(inputWord.toPath()); - OutputStream outputStream = Files.newOutputStream(outputFile.toPath()); - IConverter converter = LocalConverter.builder().build(); - boolean execute = converter.convert(docxInputStream) - .as(DocumentType.DOCX) - .to(outputStream) - .as(DocumentType.PDF).schedule().get(); - Assert.isTrue(execute, "转换失败"); - outputStream.close(); - docxInputStream.close(); - log.info("转换完毕 targetPath = {}", outputFile.getAbsolutePath()); - converter.shutDown(); - } catch (Exception e) { - log.error("[documents4J] word转pdf失败:{}", e.toString()); + if (EleCommonUtil.isWinOs()) { + File inputWord = new File(wordPath); + File outputFile = new File(pdfPath); + try(InputStream docxInputStream = Files.newInputStream(inputWord.toPath()); + OutputStream outputStream = Files.newOutputStream(outputFile.toPath())) { + boolean execute = CONVERT.convert(docxInputStream) + .as(DocumentType.DOCX) + .to(outputStream) + .as(DocumentType.PDF).schedule().get(); + Assert.isTrue(execute, "转换失败"); + log.info("转换完毕 targetPath = {}", outputFile.getAbsolutePath()); + CONVERT.shutDown(); + } catch (Exception e) { + log.error("[documents4J] word转pdf失败:{}", e.toString()); + } + } else { + } + + } + public static String parseDocx(String wordPath) throws IOException { + InputStream fis = Files.newInputStream(Paths.get(wordPath)); + XWPFDocument document = new XWPFDocument(); + StringBuilder stringBuilder = new StringBuilder(); + List paragraphs = document.getParagraphs(); + for (XWPFParagraph paragraph : paragraphs) { + stringBuilder.append(paragraph.getText()); + } + document.close(); + fis.close(); + return stringBuilder.toString(); + } + + public static String parsePdf(String path) throws IOException { + // 加载PDF文档 + PDDocument document = PDDocument.load(new File(path)); + // 创建PDFTextStripper对象来解析文本 + PDFTextStripper pdfStripper = new PDFTextStripper(); + // 提取文本 + String text = pdfStripper.getText(document); + document.close(); + return text; + } + + public static String parseDoc(String path) throws IOException { + FileInputStream fis = new FileInputStream(path); + HWPFDocument document = new HWPFDocument(fis); + WordExtractor wordExtractor = new WordExtractor(document); + String[] paragraphText = wordExtractor.getParagraphText(); + StringBuilder stringBuilder = new StringBuilder(); + for (String paragraph : paragraphText) { + stringBuilder.append(paragraph); + } + fis.close(); + return stringBuilder.toString(); + } }