新增pdf文档的图片ocr解析

2025-04-09 12:06:23 +08:00 · 2025-04-09 12:06:23 +08:00 · 86cda0a547
parent b5da2e3082
commit 86cda0a547
4 changed files with 130 additions and 30 deletions
--- a/electrmangnetic/src/main/java/com/electromagnetic/industry/software/manage/service/serviceimpl/ChatService.java
+++ b/electrmangnetic/src/main/java/com/electromagnetic/industry/software/manage/service/serviceimpl/ChatService.java
@ -46,6 +46,7 @@ import java.io.File;
 import java.io.IOException;
 import java.nio.file.Files;
 import java.nio.file.Path;
+import java.nio.file.Paths;
 import java.util.*;

@Slf4j
@ -105,12 +106,13 @@ public class ChatService {
            return ElectromagneticResultUtil.success(fileMd5);
        }
        List<Document> documents = new ArrayList<>();
-
        switch (Objects.requireNonNull(fileType)) {
            case "txt", "csv", "text" -> {
-                Path tempFile = saveUploadedFileToTemp(file);
+                String tmpPath = elePropertyConfig.getEleTmpPath() + File.separator + IdUtil.fastSimpleUUID() + "." + fileType;
+                FileUtil.writeFromStream(file.getInputStream(), tmpPath);
+                Path tempFile = Paths.get(tmpPath);
                DocumentReader documentReader = new TextReader(String.valueOf(tempFile.toUri().toURL()));
-                documents = new TokenTextSplitter().apply(documentReader.get());
+                documents = documentReader.get();
                FileUtil.del(tempFile);
            }
            case "xls", "xlsx" -> {
@ -128,36 +130,35 @@ public class ChatService {
                }
                FileUtil.del(filePath);
            }
-            case "doc", "docx", "ppt", "pptx" -> {
+            case "doc", "docx", "ppt", "pptx", "pdf" -> {
+                List<PageFile> pageFiles;
                String pdfPath = elePropertyConfig.getEleTmpPath() + File.separator + fileMd5 + ".pdf";
                String srcPath = elePropertyConfig.getEleTmpPath() + File.separator + fileMd5 + "." + fileType;
+                if (StrUtil.equals(fileType, "pdf")) {
+                    FileUtil.writeFromStream(file.getInputStream(), pdfPath);
+                    pageFiles = OfficeFileUtil.parsePdfByPage(pdfPath);
+                } else {
+                    FileUtil.writeFromStream(file.getInputStream(), srcPath);
                    if (StrUtil.equals(fileType, "doc") || StrUtil.equals(fileType, "docx")) {
                        OfficeFileUtil.doc2pdf(srcPath, pdfPath);
                    } else {
                        OfficeFileUtil.ppt2pdf(srcPath, pdfPath);
                    }
-                Path path = new File(pdfPath).toPath();
-                PagePdfDocumentReader reader = new PagePdfDocumentReader(String.valueOf(path.toUri().toURL()));
-                List<Document> tmp = reader.get();
-                for (Document document : tmp) {
-                    Map<String, Object> metadata = document.getMetadata();
-                    Map<String, Object> metaData = Map.of("page_number", metadata.get("page_number"), "file_name", file.getOriginalFilename());
-                    Document doc = new Document(Objects.requireNonNull(document.getText()), metaData);
-                    documents.add(doc);
+                    pageFiles = OfficeFileUtil.parsePdfByPage(pdfPath);
+                }
+                for (PageFile pageFile : pageFiles) {
+                    Map<String, Object> metaData = Map.of("page_number", pageFile.getPageNumber(), "file_name", file.getOriginalFilename());
+                    Document document = new Document(pageFile.getContent(), metaData);
+                    documents.add(document);
                }
                FileUtil.del(pdfPath);
                FileUtil.del(srcPath);
            }
-            case "pdf" -> {
-                Path tempFile = saveUploadedFileToTemp(file);
-                PagePdfDocumentReader reader = new PagePdfDocumentReader(String.valueOf(tempFile.toUri().toURL()));
-                documents = reader.get();
-                FileUtil.del(tempFile);
-            }
            default -> {
                return ElectromagneticResultUtil.fail("-1", StrFormatter.format("当前格式 {} 不支持", fileType));
            }
        }
+        documents = new TokenTextSplitter().apply(documents);
        documents = documents.subList(0, Math.min(elePropertyConfig.getAiMaxDocs(), documents.size()));
        vectorStore.write(documents);
        for (Document document : documents) {
@ -170,13 +171,6 @@ public class ChatService {
        return ElectromagneticResultUtil.success(fileMd5);
    }

-    private Path saveUploadedFileToTemp(MultipartFile file) throws IOException {
-        Path tempDir = Files.createTempDirectory(IdUtil.simpleUUID());
-        Path tempFile = tempDir.resolve(Objects.requireNonNull(file.getOriginalFilename()));
-        file.transferTo(tempFile);
-        return tempFile;
-    }
-
    public Flux<String> chatStreamStr(QueryDTO queryDTO) {
        AiQuestionRecord record = new AiQuestionRecord()
                .setQuestion(queryDTO.getMsg())
--- a/electromagnetic-common/pom.xml
+++ b/electromagnetic-common/pom.xml
@ -102,6 +102,32 @@
            <version>2.3.3</version>
        </dependency>

+        <dependency>
+            <groupId>io.github.mymonstercat</groupId>
+            <artifactId>rapidocr</artifactId>
+            <version>0.0.7</version>
+        </dependency>
+
+        <!--  一般只需要引入一个，CPU端建议使用onnx，移动端建议使用ncnn     -->
+        <!--  可前往maven中央仓库https://central.sonatype.com/artifact/io.github.mymonstercat/rapidocr-onnx-platform/versions，查看版本      -->
+        <dependency>
+            <groupId>io.github.mymonstercat</groupId>
+            <artifactId>rapidocr-onnx-platform</artifactId>
+            <version>0.0.7</version>
+        </dependency>
+
+        <dependency>
+            <groupId>io.github.mymonstercat</groupId>
+            <artifactId>rapidocr-ncnn-platform</artifactId>
+            <version>0.0.7</version>
+        </dependency>
+
+        <dependency>
+            <groupId>org.apache.pdfbox</groupId>
+            <artifactId>jbig2-imageio</artifactId>
+            <version>3.0.2</version>
+        </dependency>
+
    </dependencies>

 </project>
--- a/electromagnetic-common/src/main/java/com/electromagnetic/industry/software/common/pojo/PageFile.java
+++ b/electromagnetic-common/src/main/java/com/electromagnetic/industry/software/common/pojo/PageFile.java
@ -14,6 +14,4 @@ public class PageFile {
    private String content;

    private String fileName;
-
-
 }
--- a/electromagnetic-common/src/main/java/com/electromagnetic/industry/software/common/util/OfficeFileUtil.java
+++ b/electromagnetic-common/src/main/java/com/electromagnetic/industry/software/common/util/OfficeFileUtil.java
@ -3,15 +3,23 @@ package com.electromagnetic.industry.software.common.util;
 import cn.hutool.core.io.FileUtil;
 import cn.hutool.core.lang.Assert;
 import cn.hutool.core.text.StrFormatter;
+import cn.hutool.core.util.IdUtil;
 import cn.hutool.core.util.RuntimeUtil;
+import com.benjaminwan.ocrlibrary.OcrResult;
 import com.documents4j.api.DocumentType;
 import com.documents4j.api.IConverter;
 import com.documents4j.job.LocalConverter;
 import com.electromagnetic.industry.software.common.exception.BizException;
 import com.electromagnetic.industry.software.common.pojo.PageFile;
+import io.github.mymonstercat.Model;
+import io.github.mymonstercat.ocr.InferenceEngine;
 import lombok.extern.slf4j.Slf4j;
 import org.apache.pdfbox.Loader;
+import org.apache.pdfbox.cos.COSName;
 import org.apache.pdfbox.pdmodel.PDDocument;
+import org.apache.pdfbox.pdmodel.PDPage;
+import org.apache.pdfbox.pdmodel.PDResources;
+import org.apache.pdfbox.pdmodel.graphics.image.PDImage;
 import org.apache.pdfbox.text.PDFTextStripper;
 import org.apache.poi.hslf.usermodel.HSLFSlideShow;
 import org.apache.poi.hwpf.HWPFDocument;
@ -25,6 +33,8 @@ import org.apache.poi.xssf.usermodel.XSSFWorkbook;
 import org.apache.poi.xwpf.extractor.XWPFWordExtractor;
 import org.apache.poi.xwpf.usermodel.XWPFDocument;

+import javax.imageio.ImageIO;
+import java.awt.image.BufferedImage;
 import java.io.File;
 import java.io.IOException;
 import java.io.InputStream;
@ -278,4 +288,76 @@ public class OfficeFileUtil {
        return pageFiles;
    }

+    /**
+     * Parses a PDF file page by page, combining each page's extracted text with the
+     * OCR text recognized from the images on that page.
+     *
+     * <p>An {@link IOException} raised while loading or reading the document is logged
+     * and not rethrown; in that case the pages collected so far (possibly none) are returned.
+     *
+     * @param filePath path of the PDF file to parse
+     * @return one {@link PageFile} per successfully processed page, in page order
+     * @throws IOException declared for callers; failures inside the parsing loop are logged instead
+     */
+    public static List<PageFile> parsePdfByPage(String filePath) throws IOException {
+        List<PageFile> pageFiles = new ArrayList<>();
+        String fileName = new File(filePath).getName();
+        try (PDDocument document = Loader.loadPDF(new File(filePath))) {
+            PDFTextStripper textStripper = new PDFTextStripper();
+            int pageCount = document.getNumberOfPages();
+            for (int pageIndex = 0; pageIndex < pageCount; pageIndex++) {
+                PDPage page = document.getPage(pageIndex);
+                int pageNumber = pageIndex + 1;
+                String text = extractTextFromPage(textStripper, document, pageIndex);
+                String imageText = extractImagesFromPage(page, pageNumber);
+                // Build the content from this page only. A buffer shared across iterations
+                // would make every page also contain the text of all preceding pages.
+                String pageContent = text + "\n" + imageText;
+                // NOTE(review): the zero-based pageIndex is stored here although a one-based
+                // pageNumber is computed above — confirm which value consumers of PageFile expect.
+                pageFiles.add(new PageFile(pageIndex, pageContent, fileName));
+            }
+        } catch (IOException e) {
+            log.error(e.getMessage(), e);
+        }
+        return pageFiles;
+    }
+
+
+    /**
+     * Extracts the text of a single page by restricting the stripper's page range to that page.
+     *
+     * @param textStripper stripper whose start/end page are overwritten by this call
+     * @param document     document to read from
+     * @param pageIndex    zero-based index of the page
+     * @return the text returned by the stripper for that page
+     * @throws IOException if the stripper fails to read the document
+     */
+    private static String extractTextFromPage(PDFTextStripper textStripper, PDDocument document, int pageIndex) throws IOException {
+        // The stripper's page numbers start at 1, so shift the zero-based index.
+        final int oneBasedPage = pageIndex + 1;
+        textStripper.setStartPage(oneBasedPage);
+        textStripper.setEndPage(oneBasedPage);
+        return textStripper.getText(document);
+    }
+
+    /**
+     * Runs OCR over every image XObject referenced by the page's resources and returns
+     * the recognized text, one image per line (terminated by {@code "\r\n"}).
+     *
+     * <p>Each image is written to a temporary PNG file, recognized, and the file is then
+     * deleted even when saving or recognition fails.
+     *
+     * @param page       page whose image resources are scanned
+     * @param pageNumber one-based page number; currently not used by this method
+     * @return the concatenated OCR text, or an empty string when the page has no resources
+     * @throws IOException if an XObject cannot be read or the temporary image cannot be written
+     */
+    private static String extractImagesFromPage(PDPage page, int pageNumber) throws IOException {
+        PDResources resources = page.getResources();
+        if (resources == null) {
+            return "";
+        }
+        StringBuilder ocrText = new StringBuilder();
+        // Walk every XObject resource and keep only the image ones.
+        for (COSName name : resources.getXObjectNames()) {
+            org.apache.pdfbox.pdmodel.graphics.PDXObject xObject = resources.getXObject(name);
+            if (!(xObject instanceof PDImage)) {
+                continue;
+            }
+            // Use the platform temp directory instead of a hard-coded "D:/" location,
+            // which only exists on Windows hosts that have a D: drive.
+            File tempImage = File.createTempFile(IdUtil.fastSimpleUUID(), ".png");
+            String filePath = tempImage.getAbsolutePath();
+            try {
+                saveImage((PDImage) xObject, filePath);
+                ocrText.append(getTextFromImage(filePath)).append("\r\n");
+            } finally {
+                // Always remove the temp image, including when saving or OCR throws.
+                FileUtil.del(filePath);
+            }
+        }
+        return ocrText.toString();
+    }
+
+    /**
+     * Writes a PDF image to disk as a PNG file, creating the parent directory when needed.
+     *
+     * @param image    image to render
+     * @param filename destination file path
+     * @throws IOException if the image cannot be decoded or written
+     */
+    private static void saveImage(PDImage image, String filename) throws IOException {
+        BufferedImage bufferedImage = image.getImage();
+        File outputFile = new File(filename);
+        // getParentFile() is null for a bare file name; only create directories when one is given.
+        File parentDir = outputFile.getParentFile();
+        if (parentDir != null) {
+            parentDir.mkdirs();
+        }
+        ImageIO.write(bufferedImage, "png", outputFile);
+    }
+
+    /**
+     * Recognizes the text contained in an image file using the {@code ONNX_PPOCR_V3} model.
+     *
+     * @param imagePath path of the image file to recognize
+     * @return the recognized text with leading and trailing whitespace trimmed
+     * @throws IOException declared for callers of this helper
+     */
+    private static String getTextFromImage(String imagePath) throws IOException {
+        OcrResult recognized = InferenceEngine.getInstance(Model.ONNX_PPOCR_V3).runOcr(imagePath);
+        String rawText = recognized.getStrRes();
+        return rawText.trim();
+    }
+
 }