新增pdf文档的图片ocr解析
This commit is contained in:
parent
b5da2e3082
commit
86cda0a547
|
|
@ -46,6 +46,7 @@ import java.io.File;
|
|||
import java.io.IOException;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
import java.nio.file.Paths;
|
||||
import java.util.*;
|
||||
|
||||
@Slf4j
|
||||
|
|
@ -105,12 +106,13 @@ public class ChatService {
|
|||
return ElectromagneticResultUtil.success(fileMd5);
|
||||
}
|
||||
List<Document> documents = new ArrayList<>();
|
||||
|
||||
switch (Objects.requireNonNull(fileType)) {
|
||||
case "txt", "csv", "text" -> {
|
||||
Path tempFile = saveUploadedFileToTemp(file);
|
||||
String tmpPath = elePropertyConfig.getEleTmpPath() + File.separator + IdUtil.fastSimpleUUID() + "." + fileType;
|
||||
FileUtil.writeFromStream(file.getInputStream(), tmpPath);
|
||||
Path tempFile = Paths.get(tmpPath);
|
||||
DocumentReader documentReader = new TextReader(String.valueOf(tempFile.toUri().toURL()));
|
||||
documents = new TokenTextSplitter().apply(documentReader.get());
|
||||
documents = documentReader.get();
|
||||
FileUtil.del(tempFile);
|
||||
}
|
||||
case "xls", "xlsx" -> {
|
||||
|
|
@ -128,36 +130,35 @@ public class ChatService {
|
|||
}
|
||||
FileUtil.del(filePath);
|
||||
}
|
||||
case "doc", "docx", "ppt", "pptx" -> {
|
||||
case "doc", "docx", "ppt", "pptx", "pdf" -> {
|
||||
List<PageFile> pageFiles;
|
||||
String pdfPath = elePropertyConfig.getEleTmpPath() + File.separator + fileMd5 + ".pdf";
|
||||
String srcPath = elePropertyConfig.getEleTmpPath() + File.separator + fileMd5 + "." + fileType;
|
||||
if (StrUtil.equals(fileType, "pdf")) {
|
||||
FileUtil.writeFromStream(file.getInputStream(), pdfPath);
|
||||
pageFiles = OfficeFileUtil.parsePdfByPage(pdfPath);
|
||||
} else {
|
||||
FileUtil.writeFromStream(file.getInputStream(), srcPath);
|
||||
if (StrUtil.equals(fileType, "doc") || StrUtil.equals(fileType, "docx")) {
|
||||
OfficeFileUtil.doc2pdf(srcPath, pdfPath);
|
||||
} else {
|
||||
OfficeFileUtil.ppt2pdf(srcPath, pdfPath);
|
||||
}
|
||||
Path path = new File(pdfPath).toPath();
|
||||
PagePdfDocumentReader reader = new PagePdfDocumentReader(String.valueOf(path.toUri().toURL()));
|
||||
List<Document> tmp = reader.get();
|
||||
for (Document document : tmp) {
|
||||
Map<String, Object> metadata = document.getMetadata();
|
||||
Map<String, Object> metaData = Map.of("page_number", metadata.get("page_number"), "file_name", file.getOriginalFilename());
|
||||
Document doc = new Document(Objects.requireNonNull(document.getText()), metaData);
|
||||
documents.add(doc);
|
||||
pageFiles = OfficeFileUtil.parsePdfByPage(pdfPath);
|
||||
}
|
||||
for (PageFile pageFile : pageFiles) {
|
||||
Map<String, Object> metaData = Map.of("page_number", pageFile.getPageNumber(), "file_name", file.getOriginalFilename());
|
||||
Document document = new Document(pageFile.getContent(), metaData);
|
||||
documents.add(document);
|
||||
}
|
||||
FileUtil.del(pdfPath);
|
||||
FileUtil.del(srcPath);
|
||||
}
|
||||
case "pdf" -> {
|
||||
Path tempFile = saveUploadedFileToTemp(file);
|
||||
PagePdfDocumentReader reader = new PagePdfDocumentReader(String.valueOf(tempFile.toUri().toURL()));
|
||||
documents = reader.get();
|
||||
FileUtil.del(tempFile);
|
||||
}
|
||||
default -> {
|
||||
return ElectromagneticResultUtil.fail("-1", StrFormatter.format("当前格式 {} 不支持", fileType));
|
||||
}
|
||||
}
|
||||
documents = new TokenTextSplitter().apply(documents);
|
||||
documents = documents.subList(0, Math.min(elePropertyConfig.getAiMaxDocs(), documents.size()));
|
||||
vectorStore.write(documents);
|
||||
for (Document document : documents) {
|
||||
|
|
@ -170,13 +171,6 @@ public class ChatService {
|
|||
return ElectromagneticResultUtil.success(fileMd5);
|
||||
}
|
||||
|
||||
private Path saveUploadedFileToTemp(MultipartFile file) throws IOException {
|
||||
Path tempDir = Files.createTempDirectory(IdUtil.simpleUUID());
|
||||
Path tempFile = tempDir.resolve(Objects.requireNonNull(file.getOriginalFilename()));
|
||||
file.transferTo(tempFile);
|
||||
return tempFile;
|
||||
}
|
||||
|
||||
public Flux<String> chatStreamStr(QueryDTO queryDTO) {
|
||||
AiQuestionRecord record = new AiQuestionRecord()
|
||||
.setQuestion(queryDTO.getMsg())
|
||||
|
|
|
|||
|
|
@ -102,6 +102,32 @@
|
|||
<version>2.3.3</version>
|
||||
</dependency>
|
||||
|
||||
<dependency>
|
||||
<groupId>io.github.mymonstercat</groupId>
|
||||
<artifactId>rapidocr</artifactId>
|
||||
<version>0.0.7</version>
|
||||
</dependency>
|
||||
|
||||
<!-- 一般只需要引入一个,CPU端建议使用onnx,移动端建议使用ncnn -->
|
||||
<!-- 可前往maven中央仓库https://central.sonatype.com/artifact/io.github.mymonstercat/rapidocr-onnx-platform/versions,查看版本 -->
|
||||
<dependency>
|
||||
<groupId>io.github.mymonstercat</groupId>
|
||||
<artifactId>rapidocr-onnx-platform</artifactId>
|
||||
<version>0.0.7</version>
|
||||
</dependency>
|
||||
|
||||
<dependency>
|
||||
<groupId>io.github.mymonstercat</groupId>
|
||||
<artifactId>rapidocr-ncnn-platform</artifactId>
|
||||
<version>0.0.7</version>
|
||||
</dependency>
|
||||
|
||||
<dependency>
|
||||
<groupId>org.apache.pdfbox</groupId>
|
||||
<artifactId>jbig2-imageio</artifactId>
|
||||
<version>3.0.2</version>
|
||||
</dependency>
|
||||
|
||||
</dependencies>
|
||||
|
||||
</project>
|
||||
|
|
@ -14,6 +14,4 @@ public class PageFile {
|
|||
private String content;
|
||||
|
||||
private String fileName;
|
||||
|
||||
|
||||
}
|
||||
|
|
|
|||
|
|
@ -3,15 +3,23 @@ package com.electromagnetic.industry.software.common.util;
|
|||
import cn.hutool.core.io.FileUtil;
|
||||
import cn.hutool.core.lang.Assert;
|
||||
import cn.hutool.core.text.StrFormatter;
|
||||
import cn.hutool.core.util.IdUtil;
|
||||
import cn.hutool.core.util.RuntimeUtil;
|
||||
import com.benjaminwan.ocrlibrary.OcrResult;
|
||||
import com.documents4j.api.DocumentType;
|
||||
import com.documents4j.api.IConverter;
|
||||
import com.documents4j.job.LocalConverter;
|
||||
import com.electromagnetic.industry.software.common.exception.BizException;
|
||||
import com.electromagnetic.industry.software.common.pojo.PageFile;
|
||||
import io.github.mymonstercat.Model;
|
||||
import io.github.mymonstercat.ocr.InferenceEngine;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
import org.apache.pdfbox.Loader;
|
||||
import org.apache.pdfbox.cos.COSName;
|
||||
import org.apache.pdfbox.pdmodel.PDDocument;
|
||||
import org.apache.pdfbox.pdmodel.PDPage;
|
||||
import org.apache.pdfbox.pdmodel.PDResources;
|
||||
import org.apache.pdfbox.pdmodel.graphics.image.PDImage;
|
||||
import org.apache.pdfbox.text.PDFTextStripper;
|
||||
import org.apache.poi.hslf.usermodel.HSLFSlideShow;
|
||||
import org.apache.poi.hwpf.HWPFDocument;
|
||||
|
|
@ -25,6 +33,8 @@ import org.apache.poi.xssf.usermodel.XSSFWorkbook;
|
|||
import org.apache.poi.xwpf.extractor.XWPFWordExtractor;
|
||||
import org.apache.poi.xwpf.usermodel.XWPFDocument;
|
||||
|
||||
import javax.imageio.ImageIO;
|
||||
import java.awt.image.BufferedImage;
|
||||
import java.io.File;
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
|
|
@ -278,4 +288,76 @@ public class OfficeFileUtil {
|
|||
return pageFiles;
|
||||
}
|
||||
|
||||
public static List<PageFile> parsePdfByPage(String filePath) throws IOException {
|
||||
StringBuilder stringBuilder = new StringBuilder();
|
||||
List<PageFile> pageFiles = new ArrayList<>();
|
||||
String fileName = new File(filePath).getName();
|
||||
try (PDDocument document = Loader.loadPDF(new File(filePath))) {
|
||||
// 创建 PDFTextStripper 对象
|
||||
PDFTextStripper textStripper = new PDFTextStripper();
|
||||
// 遍历每一页
|
||||
for (int pageIndex = 0; pageIndex < document.getNumberOfPages(); pageIndex++) {
|
||||
PDPage page = document.getPage(pageIndex);
|
||||
int pageNumber = pageIndex + 1;
|
||||
// 提取文本
|
||||
String text = extractTextFromPage(textStripper, document, pageIndex);
|
||||
// 提取图片
|
||||
String imageText = extractImagesFromPage(page, pageNumber);
|
||||
stringBuilder.append(text).append("\n").append(imageText);
|
||||
PageFile pageFile = new PageFile(pageIndex, stringBuilder.toString(), fileName);
|
||||
pageFiles.add(pageFile);
|
||||
}
|
||||
} catch (IOException e) {
|
||||
log.error(e.getMessage(), e);
|
||||
}
|
||||
return pageFiles;
|
||||
}
|
||||
|
||||
|
||||
// 提取文本
|
||||
private static String extractTextFromPage(PDFTextStripper textStripper, PDDocument document, int pageIndex) throws IOException {
|
||||
textStripper.setStartPage(pageIndex + 1); // PDFBox 页码从 1 开始
|
||||
textStripper.setEndPage(pageIndex + 1);
|
||||
return textStripper.getText(document);
|
||||
}
|
||||
|
||||
// 提取图片
|
||||
private static String extractImagesFromPage(PDPage page, int pageNumber) throws IOException {
|
||||
PDResources resources = page.getResources();
|
||||
if (resources == null) {
|
||||
return "";
|
||||
}
|
||||
// 遍历所有 XObject 资源
|
||||
Iterable<COSName> xObjectNames = resources.getXObjectNames();
|
||||
StringBuilder stringBuilder = new StringBuilder();
|
||||
for (COSName name : xObjectNames) {
|
||||
// 获取 XObject 对象
|
||||
org.apache.pdfbox.pdmodel.graphics.PDXObject xObject = resources.getXObject(name);
|
||||
// 检查是否为图片对象
|
||||
if (xObject instanceof PDImage) {
|
||||
PDImage image = (PDImage) xObject;
|
||||
String filePath = StrFormatter.format("D:/{}.png", IdUtil.fastSimpleUUID());
|
||||
saveImage(image, filePath);
|
||||
String textFromImage = getTextFromImage(filePath);
|
||||
stringBuilder.append(textFromImage).append("\r\n");
|
||||
FileUtil.del(filePath);
|
||||
}
|
||||
}
|
||||
return stringBuilder.toString();
|
||||
}
|
||||
|
||||
// 保存图片
|
||||
private static void saveImage(PDImage image, String filename) throws IOException {
|
||||
BufferedImage bufferedImage = image.getImage();
|
||||
File outputFile = new File(filename);
|
||||
outputFile.getParentFile().mkdirs(); // 确保目录存在
|
||||
ImageIO.write(bufferedImage, "png", outputFile);
|
||||
}
|
||||
|
||||
private static String getTextFromImage(String imagePath) throws IOException {
|
||||
InferenceEngine engine = InferenceEngine.getInstance(Model.ONNX_PPOCR_V3);
|
||||
OcrResult ocrResult = engine.runOcr(imagePath);
|
||||
return ocrResult.getStrRes().trim();
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
|||
Loading…
Reference in New Issue