解决AI中上传pdf文件解析的bug

This commit is contained in:
chenxudong 2025-04-03 16:30:57 +08:00
parent 9b177b5c1d
commit 2e1de849b2
8 changed files with 93 additions and 45 deletions

View File

@ -128,6 +128,10 @@
<artifactId>elasticsearch-java</artifactId> <artifactId>elasticsearch-java</artifactId>
<version>8.13.4</version> <version>8.13.4</version>
</dependency> </dependency>
<dependency>
<groupId>org.springframework.ai</groupId>
<artifactId>spring-ai-pdf-document-reader</artifactId>
</dependency>
</dependencies> </dependencies>
<build> <build>

View File

@ -10,7 +10,7 @@ import java.util.concurrent.Callable;
@AllArgsConstructor @AllArgsConstructor
@NoArgsConstructor @NoArgsConstructor
public class ChatTaskThread implements Callable<Flux<String>> { public class ChatTaskThread1 implements Callable<Flux<String>> {
private ChatService chatService; private ChatService chatService;
private QueryDTO queryDTO; private QueryDTO queryDTO;

View File

@ -0,0 +1,23 @@
package com.electromagnetic.industry.software.manage.ai;
import com.electromagnetic.industry.software.manage.pojo.req.QueryDTO;
import com.electromagnetic.industry.software.manage.service.serviceimpl.ChatService;
import lombok.AllArgsConstructor;
import lombok.NoArgsConstructor;
import org.springframework.ai.chat.model.ChatResponse;
import reactor.core.publisher.Flux;
import java.util.concurrent.Callable;
/**
 * {@link Callable} that asks a {@link ChatService} for the streamed
 * {@link ChatResponse} of a single {@link QueryDTO}.
 *
 * <p>The task only delegates to
 * {@link ChatService#chatStreamResponse(QueryDTO)}; it holds no other state.
 */
public class ChatTaskThread2 implements Callable<Flux<ChatResponse>> {

    /** Service the call is delegated to. */
    private ChatService chatService;

    /** Query forwarded to the service. */
    private QueryDTO queryDTO;

    /**
     * Creates a task with neither service nor query set. Both fields stay
     * {@code null}, so {@link #call()} fails with a
     * {@link NullPointerException} on an instance built this way.
     */
    public ChatTaskThread2() {
    }

    /**
     * Creates a task bound to the given service and query.
     *
     * @param chatService service that produces the streamed response
     * @param queryDTO    query to send to the service
     */
    public ChatTaskThread2(ChatService chatService, QueryDTO queryDTO) {
        this.chatService = chatService;
        this.queryDTO = queryDTO;
    }

    /**
     * Requests the streamed chat response for the stored query.
     *
     * @return the stream returned by {@code chatService.chatStreamResponse(queryDTO)}
     * @throws Exception declared by {@link Callable#call()}
     */
    @Override
    public Flux<ChatResponse> call() throws Exception {
        return this.chatService.chatStreamResponse(this.queryDTO);
    }
}

View File

@ -2,12 +2,14 @@ package com.electromagnetic.industry.software.manage.controller;
import cn.hutool.core.util.StrUtil; import cn.hutool.core.util.StrUtil;
import com.electromagnetic.industry.software.common.resp.ElectromagneticResult; import com.electromagnetic.industry.software.common.resp.ElectromagneticResult;
import com.electromagnetic.industry.software.manage.ai.ChatTaskThread; import com.electromagnetic.industry.software.manage.ai.ChatTaskThread1;
import com.electromagnetic.industry.software.manage.ai.ChatTaskThread2;
import com.electromagnetic.industry.software.manage.ai.ThreadUtil; import com.electromagnetic.industry.software.manage.ai.ThreadUtil;
import com.electromagnetic.industry.software.manage.pojo.req.QueryDTO; import com.electromagnetic.industry.software.manage.pojo.req.QueryDTO;
import com.electromagnetic.industry.software.manage.service.serviceimpl.ChatService; import com.electromagnetic.industry.software.manage.service.serviceimpl.ChatService;
import jakarta.annotation.Resource; import jakarta.annotation.Resource;
import lombok.extern.slf4j.Slf4j; import lombok.extern.slf4j.Slf4j;
import org.springframework.ai.chat.model.ChatResponse;
import org.springframework.http.MediaType; import org.springframework.http.MediaType;
import org.springframework.web.bind.annotation.*; import org.springframework.web.bind.annotation.*;
import org.springframework.web.multipart.MultipartFile; import org.springframework.web.multipart.MultipartFile;
@ -44,19 +46,19 @@ public class AiController {
if (StrUtil.isEmpty(queryDTO.getMsg())) { if (StrUtil.isEmpty(queryDTO.getMsg())) {
return Flux.empty(); return Flux.empty();
} }
ChatTaskThread chatTaskThread = new ChatTaskThread(chatService, queryDTO); ChatTaskThread1 chatTaskThread = new ChatTaskThread1(chatService, queryDTO);
Future<Flux<String>> future = ThreadUtil.getThreadPool().submit(chatTaskThread); Future<Flux<String>> future = ThreadUtil.getThreadPool().submit(chatTaskThread);
return future.get(); return future.get();
} }
// @PostMapping(path = "/chatStreamResp", produces = MediaType.TEXT_EVENT_STREAM_VALUE) @PostMapping(path = "/chatStreamResp", produces = MediaType.TEXT_EVENT_STREAM_VALUE)
// public Flux<ChatResponse> chatStreamResp(@RequestBody QueryDTO queryDTO) throws ExecutionException, InterruptedException { public Flux<ChatResponse> chatStreamResp(@RequestBody QueryDTO queryDTO) throws ExecutionException, InterruptedException {
// if (StrUtil.isEmpty(queryDTO.getMsg())) { if (StrUtil.isEmpty(queryDTO.getMsg())) {
// return Flux.empty(); return Flux.empty();
// } }
// ChatTaskThread<Flux<ChatResponse>> chatTaskThread = new ChatTaskThread<>(chatService, queryDTO); ChatTaskThread2 chatTaskThread2 = new ChatTaskThread2(chatService, queryDTO);
// Future<Flux<ChatResponse>> future = ThreadUtil.getThreadPool().submit(chatTaskThread); Future<Flux<ChatResponse>> future = ThreadUtil.getThreadPool().submit(chatTaskThread2);
// return future.get(); return future.get();
// } }
} }

View File

@ -1,16 +1,16 @@
package com.electromagnetic.industry.software.manage.service.serviceimpl; package com.electromagnetic.industry.software.manage.service.serviceimpl;
import cn.hutool.core.collection.ListUtil;
import cn.hutool.core.io.FileUtil; import cn.hutool.core.io.FileUtil;
import cn.hutool.core.util.IdUtil;
import cn.hutool.core.util.ObjectUtil; import cn.hutool.core.util.ObjectUtil;
import cn.hutool.core.util.StrUtil; import cn.hutool.core.util.StrUtil;
import cn.hutool.crypto.digest.DigestUtil; import cn.hutool.crypto.digest.DigestUtil;
import com.baomidou.mybatisplus.core.toolkit.Wrappers; import com.baomidou.mybatisplus.core.toolkit.Wrappers;
import com.electromagnetic.industry.software.common.enums.EffectFlagEnum; import com.electromagnetic.industry.software.common.enums.EffectFlagEnum;
import com.electromagnetic.industry.software.common.resp.ElectromagneticResult; import com.electromagnetic.industry.software.common.resp.ElectromagneticResult;
import com.electromagnetic.industry.software.common.util.EleCommonUtil;
import com.electromagnetic.industry.software.common.util.ElectromagneticResultUtil; import com.electromagnetic.industry.software.common.util.ElectromagneticResultUtil;
import com.electromagnetic.industry.software.common.util.IdWorker; import com.electromagnetic.industry.software.common.util.IdWorker;
import com.electromagnetic.industry.software.manage.config.ElePropertyConfig;
import com.electromagnetic.industry.software.manage.mapper.AiFileUploadRecordMapper; import com.electromagnetic.industry.software.manage.mapper.AiFileUploadRecordMapper;
import com.electromagnetic.industry.software.manage.pojo.models.AiFileUploadRecord; import com.electromagnetic.industry.software.manage.pojo.models.AiFileUploadRecord;
import com.electromagnetic.industry.software.manage.pojo.req.QueryDTO; import com.electromagnetic.industry.software.manage.pojo.req.QueryDTO;
@ -19,15 +19,24 @@ import lombok.extern.slf4j.Slf4j;
import org.springframework.ai.chat.client.ChatClient; import org.springframework.ai.chat.client.ChatClient;
import org.springframework.ai.chat.client.advisor.MessageChatMemoryAdvisor; import org.springframework.ai.chat.client.advisor.MessageChatMemoryAdvisor;
import org.springframework.ai.chat.client.advisor.QuestionAnswerAdvisor; import org.springframework.ai.chat.client.advisor.QuestionAnswerAdvisor;
import org.springframework.ai.chat.messages.UserMessage;
import org.springframework.ai.chat.prompt.Prompt;
import org.springframework.ai.document.Document; import org.springframework.ai.document.Document;
import org.springframework.ai.ollama.OllamaChatModel; import org.springframework.ai.ollama.OllamaChatModel;
import org.springframework.ai.reader.pdf.PagePdfDocumentReader;
import org.springframework.ai.reader.pdf.config.PdfDocumentReaderConfig;
import org.springframework.ai.vectorstore.VectorStore; import org.springframework.ai.vectorstore.VectorStore;
import org.springframework.ai.chat.model.ChatResponse;
import org.springframework.stereotype.Service; import org.springframework.stereotype.Service;
import org.springframework.transaction.annotation.Transactional; import org.springframework.transaction.annotation.Transactional;
import org.springframework.web.multipart.MultipartFile; import org.springframework.web.multipart.MultipartFile;
import reactor.core.publisher.Flux; import reactor.core.publisher.Flux;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.List; import java.util.List;
import java.util.Objects;
import java.util.Optional; import java.util.Optional;
import java.util.stream.Collectors; import java.util.stream.Collectors;
import java.util.stream.Stream; import java.util.stream.Stream;
@ -51,6 +60,9 @@ public class ChatService {
@Resource @Resource
private AiFileUploadRecordMapper aiFileUploadRecordMapper; private AiFileUploadRecordMapper aiFileUploadRecordMapper;
@Resource
private ElePropertyConfig elePropertyConfig;
public void add(String content) { public void add(String content) {
List<Document> documents = Stream.of(content).map(Document::new).collect(Collectors.toList()); List<Document> documents = Stream.of(content).map(Document::new).collect(Collectors.toList());
vectorStore.write(documents); vectorStore.write(documents);
@ -75,53 +87,55 @@ public class ChatService {
@Transactional(rollbackFor = Exception.class) @Transactional(rollbackFor = Exception.class)
public ElectromagneticResult<?> addFromUpload(MultipartFile file) throws Exception { public ElectromagneticResult<?> addFromUpload(MultipartFile file) throws Exception {
// 文件是否为空
if (file.isEmpty()) { if (file.isEmpty()) {
return ElectromagneticResultUtil.fail("-1", "文件为空"); return ElectromagneticResultUtil.fail("-1", "文件为空");
} }
// 当前仅支持pdf文件
String fileType = FileUtil.extName(file.getOriginalFilename()); String fileType = FileUtil.extName(file.getOriginalFilename());
if (!StrUtil.equals(fileType, "pdf")) { if (!StrUtil.equals(fileType, "pdf")) {
return ElectromagneticResultUtil.fail("-1", "当前仅支持pdf格式文件"); return ElectromagneticResultUtil.fail("-1", "当前仅支持pdf格式文件");
} }
String fileMd5 = DigestUtil.md5Hex(file.getInputStream());
// 通过md5值判断文件是否被上传过
String fileMd5 = DigestUtil.md5Hex(file.getInputStream());
Long count = aiFileUploadRecordMapper.selectCount(Wrappers.lambdaQuery(AiFileUploadRecord.class) Long count = aiFileUploadRecordMapper.selectCount(Wrappers.lambdaQuery(AiFileUploadRecord.class)
.eq(AiFileUploadRecord::getFileMd5, fileMd5)); .eq(AiFileUploadRecord::getFileMd5, fileMd5));
if (count > 0) { if (count > 0) {
return ElectromagneticResultUtil.success(fileMd5); return ElectromagneticResultUtil.success(fileMd5);
} }
String content = EleCommonUtil.parse(file.getInputStream(), "pdf"); Path tempFile = saveUploadedFileToTemp(file);
Document document = new Document(content); PdfDocumentReaderConfig config = PdfDocumentReaderConfig.builder().build();
vectorStore.write(ListUtil.of(document)); PagePdfDocumentReader reader = new PagePdfDocumentReader(String.valueOf(tempFile.toUri().toURL()), config);
aiFileUploadRecordMapper.insert(new AiFileUploadRecord().setId(IdWorker.getSnowFlakeIdString()) List<Document> documents = reader.get();
.setVectorId(document.getId()) Files.deleteIfExists(tempFile);
.setFileSize(file.getSize()) vectorStore.write(documents);
.setFileMd5(fileMd5) for (Document document : documents) {
.setFileName(file.getOriginalFilename())); aiFileUploadRecordMapper.insert(new AiFileUploadRecord().setId(IdWorker.getSnowFlakeIdString())
.setVectorId(document.getId())
.setFileSize(file.getSize())
.setFileMd5(fileMd5)
.setFileName(file.getOriginalFilename()));
}
return ElectromagneticResultUtil.success(fileMd5); return ElectromagneticResultUtil.success(fileMd5);
} }
// public String chat(String msg) { private Path saveUploadedFileToTemp(MultipartFile file) throws IOException {
// Path tempDir = Files.createTempDirectory(IdUtil.simpleUUID());
// log.info("Start call model to answer"); Path tempFile = tempDir.resolve(Objects.requireNonNull(file.getOriginalFilename()));
// file.transferTo(tempFile);
// return ChatClient.builder(model).defaultAdvisors(messageChatMemoryAdvisor, questionAnswerAdvisor).build().prompt() return tempFile;
// .user(msg) }
// .advisors(advisorSpec -> advisorSpec
//// .param(CHAT_MEMORY_CONVERSATION_ID_KEY, queryDTO.getUserId())
// .param(AbstractChatMemoryAdvisor.CHAT_MEMORY_RETRIEVE_SIZE_KEY, 100))
// .call()
// .content();
// }
//
// public Flux<ChatResponse> chatStreamResponse(String msg) {
// ChatClient.StreamResponseSpec stream = ChatClient.builder(model).defaultAdvisors(messageChatMemoryAdvisor, questionAnswerAdvisor).build().prompt(new Prompt(new UserMessage(msg))).stream();
// return stream.chatResponse();
// }
public Flux<String> chatStreamStr(QueryDTO queryDTO) { public Flux<String> chatStreamStr(QueryDTO queryDTO) {
return ChatClient.builder(model).defaultAdvisors(messageChatMemoryAdvisor, questionAnswerAdvisor).build().prompt(queryDTO.getMsg()).stream().content(); return ChatClient.builder(model).defaultAdvisors(messageChatMemoryAdvisor, questionAnswerAdvisor).build().prompt(queryDTO.getMsg()).stream().content();
} }
/**
 * Streams the model's reply to the message in {@code queryDTO} as
 * {@link ChatResponse} elements.
 *
 * <p>A new {@link ChatClient} is built on every call from {@code model},
 * with {@code messageChatMemoryAdvisor} and {@code questionAnswerAdvisor}
 * registered as default advisors. The message text is wrapped in a
 * {@link UserMessage} inside a {@link Prompt}.
 *
 * @param queryDTO request whose {@code getMsg()} value is sent as the user message
 * @return the response stream produced by the chat client
 */
public Flux<ChatResponse> chatStreamResponse(QueryDTO queryDTO) {
    // Build the client first, then the prompt, matching the original evaluation order.
    ChatClient client = ChatClient.builder(model)
            .defaultAdvisors(messageChatMemoryAdvisor, questionAnswerAdvisor)
            .build();
    Prompt userPrompt = new Prompt(new UserMessage(queryDTO.getMsg()));
    return client.prompt(userPrompt).stream().chatResponse();
}
} }

View File

@ -1,6 +1,8 @@
//import com.electromagnetic.industry.software.manage.Application; //import com.electromagnetic.industry.software.manage.Application;
//import jakarta.annotation.Resource; //import jakarta.annotation.Resource;
//import org.junit.jupiter.api.Test; //import org.junit.jupiter.api.Test;
//import org.springframework.ai.document.Document;
//import org.springframework.ai.reader.pdf.PagePdfDocumentReader;
//import org.springframework.ai.vectorstore.VectorStore; //import org.springframework.ai.vectorstore.VectorStore;
//import org.springframework.boot.test.context.SpringBootTest; //import org.springframework.boot.test.context.SpringBootTest;
// //
@ -14,8 +16,10 @@
// //
// @Test // @Test
// public void testTree() { // public void testTree() {
// String id = "c32666b2-36a5-40b5-9048-11349f090cd7"; // String path = "D:/wjj.pdf";
// vectorStore.delete(List.of(id)); // PagePdfDocumentReader pagePdfDocumentReader = new PagePdfDocumentReader(path);
// List<Document> read = pagePdfDocumentReader.read();
// System.out.println("read = " + read);
// } // }
// //
//} //}

View File

@ -89,7 +89,7 @@
<dependency> <dependency>
<groupId>org.apache.pdfbox</groupId> <groupId>org.apache.pdfbox</groupId>
<artifactId>pdfbox</artifactId> <artifactId>pdfbox</artifactId>
<version>2.0.24</version> <version>3.0.3</version>
</dependency> </dependency>
<dependency> <dependency>
<groupId>javax.xml.bind</groupId> <groupId>javax.xml.bind</groupId>

View File

@ -9,6 +9,7 @@ import com.documents4j.api.IConverter;
import com.documents4j.job.LocalConverter; import com.documents4j.job.LocalConverter;
import com.electromagnetic.industry.software.common.exception.BizException; import com.electromagnetic.industry.software.common.exception.BizException;
import lombok.extern.slf4j.Slf4j; import lombok.extern.slf4j.Slf4j;
import org.apache.pdfbox.Loader;
import org.apache.pdfbox.pdmodel.PDDocument; import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.text.PDFTextStripper; import org.apache.pdfbox.text.PDFTextStripper;
import org.apache.poi.hslf.usermodel.HSLFSlideShow; import org.apache.poi.hslf.usermodel.HSLFSlideShow;
@ -89,7 +90,7 @@ public class OfficeFileUtil {
public static String parsePdfAllText(String path) throws IOException { public static String parsePdfAllText(String path) throws IOException {
log.info("Start parse pdf file, path is {}", path); log.info("Start parse pdf file, path is {}", path);
// 加载PDF文档 // 加载PDF文档
PDDocument document = PDDocument.load(new File(path)); PDDocument document = Loader.loadPDF(new File(path));
// 创建PDFTextStripper对象来解析文本 // 创建PDFTextStripper对象来解析文本
PDFTextStripper pdfStripper = new PDFTextStripper(); PDFTextStripper pdfStripper = new PDFTextStripper();
// 提取文本 // 提取文本