新增pdf文档的图片ocr解析

This commit is contained in:
chenxudong 2025-04-09 12:06:23 +08:00
parent b5da2e3082
commit 86cda0a547
4 changed files with 130 additions and 30 deletions

View File

@ -46,6 +46,7 @@ import java.io.File;
import java.io.IOException; import java.io.IOException;
import java.nio.file.Files; import java.nio.file.Files;
import java.nio.file.Path; import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.*; import java.util.*;
@Slf4j @Slf4j
@ -105,12 +106,13 @@ public class ChatService {
return ElectromagneticResultUtil.success(fileMd5); return ElectromagneticResultUtil.success(fileMd5);
} }
List<Document> documents = new ArrayList<>(); List<Document> documents = new ArrayList<>();
switch (Objects.requireNonNull(fileType)) { switch (Objects.requireNonNull(fileType)) {
case "txt", "csv", "text" -> { case "txt", "csv", "text" -> {
Path tempFile = saveUploadedFileToTemp(file); String tmpPath = elePropertyConfig.getEleTmpPath() + File.separator + IdUtil.fastSimpleUUID() + "." + fileType;
FileUtil.writeFromStream(file.getInputStream(), tmpPath);
Path tempFile = Paths.get(tmpPath);
DocumentReader documentReader = new TextReader(String.valueOf(tempFile.toUri().toURL())); DocumentReader documentReader = new TextReader(String.valueOf(tempFile.toUri().toURL()));
documents = new TokenTextSplitter().apply(documentReader.get()); documents = documentReader.get();
FileUtil.del(tempFile); FileUtil.del(tempFile);
} }
case "xls", "xlsx" -> { case "xls", "xlsx" -> {
@ -128,36 +130,35 @@ public class ChatService {
} }
FileUtil.del(filePath); FileUtil.del(filePath);
} }
case "doc", "docx", "ppt", "pptx" -> { case "doc", "docx", "ppt", "pptx", "pdf" -> {
List<PageFile> pageFiles;
String pdfPath = elePropertyConfig.getEleTmpPath() + File.separator + fileMd5 + ".pdf"; String pdfPath = elePropertyConfig.getEleTmpPath() + File.separator + fileMd5 + ".pdf";
String srcPath = elePropertyConfig.getEleTmpPath() + File.separator + fileMd5 + "." + fileType; String srcPath = elePropertyConfig.getEleTmpPath() + File.separator + fileMd5 + "." + fileType;
if (StrUtil.equals(fileType, "pdf")) {
FileUtil.writeFromStream(file.getInputStream(), pdfPath);
pageFiles = OfficeFileUtil.parsePdfByPage(pdfPath);
} else {
FileUtil.writeFromStream(file.getInputStream(), srcPath);
if (StrUtil.equals(fileType, "doc") || StrUtil.equals(fileType, "docx")) { if (StrUtil.equals(fileType, "doc") || StrUtil.equals(fileType, "docx")) {
OfficeFileUtil.doc2pdf(srcPath, pdfPath); OfficeFileUtil.doc2pdf(srcPath, pdfPath);
} else { } else {
OfficeFileUtil.ppt2pdf(srcPath, pdfPath); OfficeFileUtil.ppt2pdf(srcPath, pdfPath);
} }
Path path = new File(pdfPath).toPath(); pageFiles = OfficeFileUtil.parsePdfByPage(pdfPath);
PagePdfDocumentReader reader = new PagePdfDocumentReader(String.valueOf(path.toUri().toURL())); }
List<Document> tmp = reader.get(); for (PageFile pageFile : pageFiles) {
for (Document document : tmp) { Map<String, Object> metaData = Map.of("page_number", pageFile.getPageNumber(), "file_name", file.getOriginalFilename());
Map<String, Object> metadata = document.getMetadata(); Document document = new Document(pageFile.getContent(), metaData);
Map<String, Object> metaData = Map.of("page_number", metadata.get("page_number"), "file_name", file.getOriginalFilename()); documents.add(document);
Document doc = new Document(Objects.requireNonNull(document.getText()), metaData);
documents.add(doc);
} }
FileUtil.del(pdfPath); FileUtil.del(pdfPath);
FileUtil.del(srcPath); FileUtil.del(srcPath);
} }
case "pdf" -> {
Path tempFile = saveUploadedFileToTemp(file);
PagePdfDocumentReader reader = new PagePdfDocumentReader(String.valueOf(tempFile.toUri().toURL()));
documents = reader.get();
FileUtil.del(tempFile);
}
default -> { default -> {
return ElectromagneticResultUtil.fail("-1", StrFormatter.format("当前格式 {} 不支持", fileType)); return ElectromagneticResultUtil.fail("-1", StrFormatter.format("当前格式 {} 不支持", fileType));
} }
} }
documents = new TokenTextSplitter().apply(documents);
documents = documents.subList(0, Math.min(elePropertyConfig.getAiMaxDocs(), documents.size())); documents = documents.subList(0, Math.min(elePropertyConfig.getAiMaxDocs(), documents.size()));
vectorStore.write(documents); vectorStore.write(documents);
for (Document document : documents) { for (Document document : documents) {
@ -170,13 +171,6 @@ public class ChatService {
return ElectromagneticResultUtil.success(fileMd5); return ElectromagneticResultUtil.success(fileMd5);
} }
/**
 * Copies an uploaded file into a freshly created temporary directory.
 *
 * <p>The original filename is supplied by the client, so only its last path
 * segment is used and the resolved target must sit directly inside the new
 * temporary directory. This prevents names such as {@code ../../x} from
 * escaping it.
 *
 * @param file the uploaded file; its original filename must not be null
 * @return the path of the temporary copy
 * @throws IOException if the temporary directory cannot be created, the file
 *                     name is not usable, or the transfer fails
 */
private Path saveUploadedFileToTemp(MultipartFile file) throws IOException {
    // Validate the name before touching the file system so a null name leaves nothing behind.
    String originalName = Objects.requireNonNull(file.getOriginalFilename());
    // Keep only the last path segment of the client-supplied name.
    String safeName = new File(originalName).getName();

    Path tempDir = Files.createTempDirectory(IdUtil.simpleUUID());
    Path tempFile = tempDir.resolve(safeName).normalize();
    // Rejects empty names, "." and ".." which would resolve to the directory itself or above it.
    if (!tempDir.equals(tempFile.getParent())) {
        Files.deleteIfExists(tempDir);
        throw new IOException("Illegal upload file name: " + originalName);
    }
    file.transferTo(tempFile);
    return tempFile;
}
public Flux<String> chatStreamStr(QueryDTO queryDTO) { public Flux<String> chatStreamStr(QueryDTO queryDTO) {
AiQuestionRecord record = new AiQuestionRecord() AiQuestionRecord record = new AiQuestionRecord()
.setQuestion(queryDTO.getMsg()) .setQuestion(queryDTO.getMsg())

View File

@ -102,6 +102,32 @@
<version>2.3.3</version> <version>2.3.3</version>
</dependency> </dependency>
<dependency>
<groupId>io.github.mymonstercat</groupId>
<artifactId>rapidocr</artifactId>
<version>0.0.7</version>
</dependency>
<!-- Usually only one platform artifact is needed: ONNX is recommended on CPU, NCNN on mobile. -->
<!-- Available versions are listed on Maven Central: https://central.sonatype.com/artifact/io.github.mymonstercat/rapidocr-onnx-platform/versions -->
<dependency>
<groupId>io.github.mymonstercat</groupId>
<artifactId>rapidocr-onnx-platform</artifactId>
<version>0.0.7</version>
</dependency>
<dependency>
<groupId>io.github.mymonstercat</groupId>
<artifactId>rapidocr-ncnn-platform</artifactId>
<version>0.0.7</version>
</dependency>
<dependency>
<groupId>org.apache.pdfbox</groupId>
<artifactId>jbig2-imageio</artifactId>
<version>3.0.2</version>
</dependency>
</dependencies> </dependencies>
</project> </project>

View File

@ -14,6 +14,4 @@ public class PageFile {
private String content; private String content;
private String fileName; private String fileName;
} }

View File

@ -3,15 +3,23 @@ package com.electromagnetic.industry.software.common.util;
import cn.hutool.core.io.FileUtil; import cn.hutool.core.io.FileUtil;
import cn.hutool.core.lang.Assert; import cn.hutool.core.lang.Assert;
import cn.hutool.core.text.StrFormatter; import cn.hutool.core.text.StrFormatter;
import cn.hutool.core.util.IdUtil;
import cn.hutool.core.util.RuntimeUtil; import cn.hutool.core.util.RuntimeUtil;
import com.benjaminwan.ocrlibrary.OcrResult;
import com.documents4j.api.DocumentType; import com.documents4j.api.DocumentType;
import com.documents4j.api.IConverter; import com.documents4j.api.IConverter;
import com.documents4j.job.LocalConverter; import com.documents4j.job.LocalConverter;
import com.electromagnetic.industry.software.common.exception.BizException; import com.electromagnetic.industry.software.common.exception.BizException;
import com.electromagnetic.industry.software.common.pojo.PageFile; import com.electromagnetic.industry.software.common.pojo.PageFile;
import io.github.mymonstercat.Model;
import io.github.mymonstercat.ocr.InferenceEngine;
import lombok.extern.slf4j.Slf4j; import lombok.extern.slf4j.Slf4j;
import org.apache.pdfbox.Loader; import org.apache.pdfbox.Loader;
import org.apache.pdfbox.cos.COSName;
import org.apache.pdfbox.pdmodel.PDDocument; import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.PDResources;
import org.apache.pdfbox.pdmodel.graphics.image.PDImage;
import org.apache.pdfbox.text.PDFTextStripper; import org.apache.pdfbox.text.PDFTextStripper;
import org.apache.poi.hslf.usermodel.HSLFSlideShow; import org.apache.poi.hslf.usermodel.HSLFSlideShow;
import org.apache.poi.hwpf.HWPFDocument; import org.apache.poi.hwpf.HWPFDocument;
@ -25,6 +33,8 @@ import org.apache.poi.xssf.usermodel.XSSFWorkbook;
import org.apache.poi.xwpf.extractor.XWPFWordExtractor; import org.apache.poi.xwpf.extractor.XWPFWordExtractor;
import org.apache.poi.xwpf.usermodel.XWPFDocument; import org.apache.poi.xwpf.usermodel.XWPFDocument;
import javax.imageio.ImageIO;
import java.awt.image.BufferedImage;
import java.io.File; import java.io.File;
import java.io.IOException; import java.io.IOException;
import java.io.InputStream; import java.io.InputStream;
@ -278,4 +288,76 @@ public class OfficeFileUtil {
return pageFiles; return pageFiles;
} }
/**
 * Reads a PDF page by page and builds one {@link PageFile} per page whose
 * content is the page's extracted text followed by the OCR text of the
 * images found on that page.
 *
 * <p>An {@link IOException} raised while loading or reading the document is
 * logged and not rethrown; the pages collected up to that point are returned.
 *
 * @param filePath path of the PDF file to parse
 * @return the parsed pages in document order; possibly empty or partial when
 *         reading failed
 * @throws IOException declared for callers, although read failures are
 *                     handled inside this method
 */
public static List<PageFile> parsePdfByPage(String filePath) throws IOException {
    List<PageFile> pageFiles = new ArrayList<>();
    File pdfFile = new File(filePath);
    String fileName = pdfFile.getName();
    try (PDDocument document = Loader.loadPDF(pdfFile)) {
        PDFTextStripper textStripper = new PDFTextStripper();
        int pageCount = document.getNumberOfPages();
        for (int pageIndex = 0; pageIndex < pageCount; pageIndex++) {
            PDPage page = document.getPage(pageIndex);
            int pageNumber = pageIndex + 1;
            String text = extractTextFromPage(textStripper, document, pageIndex);
            String imageText = extractImagesFromPage(page, pageNumber);
            // Build the content per page: a builder shared across iterations would
            // make every page also carry the text of all pages before it.
            String pageContent = text + "\n" + imageText;
            // NOTE(review): the zero-based pageIndex is stored, not pageNumber — confirm
            // which one consumers of PageFile expect.
            pageFiles.add(new PageFile(pageIndex, pageContent, fileName));
        }
    } catch (IOException e) {
        log.error("Failed to parse PDF {}: {}", filePath, e.getMessage(), e);
    }
    return pageFiles;
}
/**
 * Extracts the text of a single page by restricting the stripper's page
 * range to that page.
 *
 * @param textStripper stripper to configure and run; its start and end page are overwritten
 * @param document     the document to read from
 * @param pageIndex    zero-based index of the page
 * @return the text the stripper produces for that page
 * @throws IOException if the stripper fails to read the document
 */
private static String extractTextFromPage(PDFTextStripper textStripper, PDDocument document, int pageIndex) throws IOException {
    // The stripper counts pages from 1, so shift the zero-based index once.
    final int oneBasedPage = pageIndex + 1;
    textStripper.setStartPage(oneBasedPage);
    textStripper.setEndPage(oneBasedPage);
    String pageText = textStripper.getText(document);
    return pageText;
}
/**
 * Runs OCR over every image XObject in the page's resources and returns the
 * recognised text, one image per line (each followed by {@code "\r\n"}).
 *
 * <p>Each image is written to a scratch PNG in the platform temporary
 * directory, so the method does not depend on a particular drive or working
 * directory. The scratch file is always removed, including when saving or
 * OCR fails.
 *
 * @param page       the page whose images are processed
 * @param pageNumber one-based page number; currently unused, kept for the existing call site
 * @return the concatenated OCR text, or an empty string when the page has no resources
 * @throws IOException if an XObject cannot be read, the image cannot be saved, or OCR fails
 */
private static String extractImagesFromPage(PDPage page, int pageNumber) throws IOException {
    PDResources resources = page.getResources();
    if (resources == null) {
        return "";
    }
    StringBuilder ocrText = new StringBuilder();
    for (COSName name : resources.getXObjectNames()) {
        org.apache.pdfbox.pdmodel.graphics.PDXObject xObject = resources.getXObject(name);
        // Only image XObjects are processed; other XObject kinds are skipped.
        if (!(xObject instanceof PDImage)) {
            continue;
        }
        PDImage image = (PDImage) xObject;
        String imagePath = File.createTempFile("pdf-ocr-", ".png").getAbsolutePath();
        try {
            saveImage(image, imagePath);
            ocrText.append(getTextFromImage(imagePath)).append("\r\n");
        } finally {
            // Clean up even when saving or OCR throws, so scratch files do not pile up.
            FileUtil.del(imagePath);
        }
    }
    return ocrText.toString();
}
/**
 * Writes a PDF image to disk as a PNG file, creating the parent directory
 * when the target path has one.
 *
 * @param image    the image to export
 * @param filename target file path
 * @throws IOException if the image cannot be decoded or written, or if no
 *                     PNG writer accepts the image
 */
private static void saveImage(PDImage image, String filename) throws IOException {
    BufferedImage bufferedImage = image.getImage();
    File outputFile = new File(filename);
    // A bare file name has no parent; only create directories when there is one.
    File parentDir = outputFile.getParentFile();
    if (parentDir != null) {
        parentDir.mkdirs();
    }
    // ImageIO.write reports "no suitable writer" through its return value, not an exception.
    if (!ImageIO.write(bufferedImage, "png", outputFile)) {
        throw new IOException("No PNG writer available for image: " + filename);
    }
}
/**
 * Runs OCR on an image file with the {@code ONNX_PPOCR_V3} model and returns
 * the recognised text with surrounding whitespace trimmed.
 *
 * @param imagePath path of the image file to recognise
 * @return the trimmed recognition result
 * @throws IOException declared for callers of this helper
 */
private static String getTextFromImage(String imagePath) throws IOException {
    OcrResult recognition = InferenceEngine.getInstance(Model.ONNX_PPOCR_V3).runOcr(imagePath);
    String rawText = recognition.getStrRes();
    return rawText.trim();
}
} }