From 86cda0a547be5481b1744303ecd8d5d95053543d Mon Sep 17 00:00:00 2001 From: chenxudong Date: Wed, 9 Apr 2025 12:06:23 +0800 Subject: [PATCH] =?UTF-8?q?=E6=96=B0=E5=A2=9Epdf=E6=96=87=E6=A1=A3?= =?UTF-8?q?=E7=9A=84=E5=9B=BE=E7=89=87orc=E8=A7=A3=E6=9E=90?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../service/serviceimpl/ChatService.java | 50 +++++------ electromagnetic-common/pom.xml | 26 ++++++ .../software/common/pojo/PageFile.java | 2 - .../software/common/util/OfficeFileUtil.java | 82 +++++++++++++++++++ 4 files changed, 130 insertions(+), 30 deletions(-) diff --git a/electrmangnetic/src/main/java/com/electromagnetic/industry/software/manage/service/serviceimpl/ChatService.java b/electrmangnetic/src/main/java/com/electromagnetic/industry/software/manage/service/serviceimpl/ChatService.java index d908d39..c5bc365 100644 --- a/electrmangnetic/src/main/java/com/electromagnetic/industry/software/manage/service/serviceimpl/ChatService.java +++ b/electrmangnetic/src/main/java/com/electromagnetic/industry/software/manage/service/serviceimpl/ChatService.java @@ -46,6 +46,7 @@ import java.io.File; import java.io.IOException; import java.nio.file.Files; import java.nio.file.Path; +import java.nio.file.Paths; import java.util.*; @Slf4j @@ -105,12 +106,13 @@ public class ChatService { return ElectromagneticResultUtil.success(fileMd5); } List documents = new ArrayList<>(); - switch (Objects.requireNonNull(fileType)) { case "txt", "csv", "text" -> { - Path tempFile = saveUploadedFileToTemp(file); + String tmpPath = elePropertyConfig.getEleTmpPath() + File.separator + IdUtil.fastSimpleUUID() + "." + fileType; + FileUtil.writeFromStream(file.getInputStream(), tmpPath); + Path tempFile = Paths.get(tmpPath); DocumentReader documentReader = new TextReader(String.valueOf(tempFile.toUri().toURL())); - documents = new TokenTextSplitter().apply(documentReader.get()); + documents = documentReader.get(); FileUtil.del(tempFile); } case "xls", "xlsx" -> { @@ -128,36 +130,35 @@ public class ChatService { } FileUtil.del(filePath); } - case "doc", "docx", "ppt", "pptx" -> { + case "doc", "docx", "ppt", "pptx", "pdf" -> { + List pageFiles; String pdfPath = elePropertyConfig.getEleTmpPath() + File.separator + fileMd5 + ".pdf"; String srcPath = elePropertyConfig.getEleTmpPath() + File.separator + fileMd5 + "." + fileType; - if (StrUtil.equals(fileType, "doc") || StrUtil.equals(fileType, "docx")) { - OfficeFileUtil.doc2pdf(srcPath, pdfPath); + if (StrUtil.equals(fileType, "pdf")) { + FileUtil.writeFromStream(file.getInputStream(), pdfPath); + pageFiles = OfficeFileUtil.parsePdfByPage(pdfPath); } else { - OfficeFileUtil.ppt2pdf(srcPath, pdfPath); + FileUtil.writeFromStream(file.getInputStream(), srcPath); + if (StrUtil.equals(fileType, "doc") || StrUtil.equals(fileType, "docx")) { + OfficeFileUtil.doc2pdf(srcPath, pdfPath); + } else { + OfficeFileUtil.ppt2pdf(srcPath, pdfPath); + } + pageFiles = OfficeFileUtil.parsePdfByPage(pdfPath); } - Path path = new File(pdfPath).toPath(); - PagePdfDocumentReader reader = new PagePdfDocumentReader(String.valueOf(path.toUri().toURL())); - List tmp = reader.get(); - for (Document document : tmp) { - Map metadata = document.getMetadata(); - Map metaData = Map.of("page_number", metadata.get("page_number"), "file_name", file.getOriginalFilename()); - Document doc = new Document(Objects.requireNonNull(document.getText()), metaData); - documents.add(doc); + for (PageFile pageFile : pageFiles) { + Map metaData = Map.of("page_number", pageFile.getPageNumber(), "file_name", file.getOriginalFilename()); + Document document = new Document(pageFile.getContent(), metaData); + documents.add(document); } FileUtil.del(pdfPath); FileUtil.del(srcPath); } - case "pdf" -> { - Path tempFile = saveUploadedFileToTemp(file); - PagePdfDocumentReader reader = new PagePdfDocumentReader(String.valueOf(tempFile.toUri().toURL())); - documents = reader.get(); - FileUtil.del(tempFile); - } default -> { return ElectromagneticResultUtil.fail("-1", StrFormatter.format("当前格式 {} 不支持", fileType)); } } + documents = new TokenTextSplitter().apply(documents); documents = documents.subList(0, Math.min(elePropertyConfig.getAiMaxDocs(), documents.size())); vectorStore.write(documents); for (Document document : documents) { @@ -170,13 +171,6 @@ public class ChatService { return ElectromagneticResultUtil.success(fileMd5); } - private Path saveUploadedFileToTemp(MultipartFile file) throws IOException { - Path tempDir = Files.createTempDirectory(IdUtil.simpleUUID()); - Path tempFile = tempDir.resolve(Objects.requireNonNull(file.getOriginalFilename())); - file.transferTo(tempFile); - return tempFile; - } - public Flux chatStreamStr(QueryDTO queryDTO) { AiQuestionRecord record = new AiQuestionRecord() .setQuestion(queryDTO.getMsg()) diff --git a/electromagnetic-common/pom.xml b/electromagnetic-common/pom.xml index b47683c..0bc08c8 100644 --- a/electromagnetic-common/pom.xml +++ b/electromagnetic-common/pom.xml @@ -102,6 +102,32 @@ 2.3.3 + + io.github.mymonstercat + rapidocr + 0.0.7 + + + + + + io.github.mymonstercat + rapidocr-onnx-platform + 0.0.7 + + + + io.github.mymonstercat + rapidocr-ncnn-platform + 0.0.7 + + + + org.apache.pdfbox + jbig2-imageio + 3.0.2 + + \ No newline at end of file diff --git a/electromagnetic-common/src/main/java/com/electromagnetic/industry/software/common/pojo/PageFile.java b/electromagnetic-common/src/main/java/com/electromagnetic/industry/software/common/pojo/PageFile.java index 16e49c9..835a3ef 100644 --- a/electromagnetic-common/src/main/java/com/electromagnetic/industry/software/common/pojo/PageFile.java +++ b/electromagnetic-common/src/main/java/com/electromagnetic/industry/software/common/pojo/PageFile.java @@ -14,6 +14,4 @@ public class PageFile { private String content; private String fileName; - - } diff --git a/electromagnetic-common/src/main/java/com/electromagnetic/industry/software/common/util/OfficeFileUtil.java b/electromagnetic-common/src/main/java/com/electromagnetic/industry/software/common/util/OfficeFileUtil.java index 8d897b1..9b6c7b9 100644 --- a/electromagnetic-common/src/main/java/com/electromagnetic/industry/software/common/util/OfficeFileUtil.java +++ b/electromagnetic-common/src/main/java/com/electromagnetic/industry/software/common/util/OfficeFileUtil.java @@ -3,15 +3,23 @@ package com.electromagnetic.industry.software.common.util; import cn.hutool.core.io.FileUtil; import cn.hutool.core.lang.Assert; import cn.hutool.core.text.StrFormatter; +import cn.hutool.core.util.IdUtil; import cn.hutool.core.util.RuntimeUtil; +import com.benjaminwan.ocrlibrary.OcrResult; import com.documents4j.api.DocumentType; import com.documents4j.api.IConverter; import com.documents4j.job.LocalConverter; import com.electromagnetic.industry.software.common.exception.BizException; import com.electromagnetic.industry.software.common.pojo.PageFile; +import io.github.mymonstercat.Model; +import io.github.mymonstercat.ocr.InferenceEngine; import lombok.extern.slf4j.Slf4j; import org.apache.pdfbox.Loader; +import org.apache.pdfbox.cos.COSName; import org.apache.pdfbox.pdmodel.PDDocument; +import org.apache.pdfbox.pdmodel.PDPage; +import org.apache.pdfbox.pdmodel.PDResources; +import org.apache.pdfbox.pdmodel.graphics.image.PDImage; import org.apache.pdfbox.text.PDFTextStripper; import org.apache.poi.hslf.usermodel.HSLFSlideShow; import org.apache.poi.hwpf.HWPFDocument; @@ -25,6 +33,8 @@ import org.apache.poi.xssf.usermodel.XSSFWorkbook; import org.apache.poi.xwpf.extractor.XWPFWordExtractor; import org.apache.poi.xwpf.usermodel.XWPFDocument; +import javax.imageio.ImageIO; +import java.awt.image.BufferedImage; import java.io.File; import java.io.IOException; import java.io.InputStream; @@ -278,4 +288,76 @@ public class OfficeFileUtil { return pageFiles; } + public static List parsePdfByPage(String filePath) throws IOException { + StringBuilder stringBuilder = new StringBuilder(); + List pageFiles = new ArrayList<>(); + String fileName = new File(filePath).getName(); + try (PDDocument document = Loader.loadPDF(new File(filePath))) { + // 创建 PDFTextStripper 对象 + PDFTextStripper textStripper = new PDFTextStripper(); + // 遍历每一页 + for (int pageIndex = 0; pageIndex < document.getNumberOfPages(); pageIndex++) { + PDPage page = document.getPage(pageIndex); + int pageNumber = pageIndex + 1; + // 提取文本 + String text = extractTextFromPage(textStripper, document, pageIndex); + // 提取图片 + String imageText = extractImagesFromPage(page, pageNumber); + stringBuilder.append(text).append("\n").append(imageText); + PageFile pageFile = new PageFile(pageIndex, stringBuilder.toString(), fileName); + pageFiles.add(pageFile); + } + } catch (IOException e) { + log.error(e.getMessage(), e); + } + return pageFiles; + } + + + // 提取文本 + private static String extractTextFromPage(PDFTextStripper textStripper, PDDocument document, int pageIndex) throws IOException { + textStripper.setStartPage(pageIndex + 1); // PDFBox 页码从 1 开始 + textStripper.setEndPage(pageIndex + 1); + return textStripper.getText(document); + } + + // 提取图片 + private static String extractImagesFromPage(PDPage page, int pageNumber) throws IOException { + PDResources resources = page.getResources(); + if (resources == null) { + return ""; + } + // 遍历所有 XObject 资源 + Iterable xObjectNames = resources.getXObjectNames(); + StringBuilder stringBuilder = new StringBuilder(); + for (COSName name : xObjectNames) { + // 获取 XObject 对象 + org.apache.pdfbox.pdmodel.graphics.PDXObject xObject = resources.getXObject(name); + // 检查是否为图片对象 + if (xObject instanceof PDImage) { + PDImage image = (PDImage) xObject; + String filePath = StrFormatter.format("D:/{}.png", IdUtil.fastSimpleUUID()); + saveImage(image, filePath); + String textFromImage = getTextFromImage(filePath); + stringBuilder.append(textFromImage).append("\r\n"); + FileUtil.del(filePath); + } + } + return stringBuilder.toString(); + } + + // 保存图片 + private static void saveImage(PDImage image, String filename) throws IOException { + BufferedImage bufferedImage = image.getImage(); + File outputFile = new File(filename); + outputFile.getParentFile().mkdirs(); // 确保目录存在 + ImageIO.write(bufferedImage, "png", outputFile); + } + + private static String getTextFromImage(String imagePath) throws IOException { + InferenceEngine engine = InferenceEngine.getInstance(Model.ONNX_PPOCR_V3); + OcrResult ocrResult = engine.runOcr(imagePath); + return ocrResult.getStrRes().trim(); + } + }