解决AI中上传pdf文件解析的bug

This commit is contained in:
chenxudong 2025-04-03 16:30:57 +08:00
parent 9b177b5c1d
commit 2e1de849b2
8 changed files with 93 additions and 45 deletions

View File

@ -128,6 +128,10 @@
<artifactId>elasticsearch-java</artifactId>
<version>8.13.4</version>
</dependency>
<dependency>
<groupId>org.springframework.ai</groupId>
<artifactId>spring-ai-pdf-document-reader</artifactId>
</dependency>
</dependencies>
<build>

View File

@ -10,7 +10,7 @@ import java.util.concurrent.Callable;
@AllArgsConstructor
@NoArgsConstructor
public class ChatTaskThread implements Callable<Flux<String>> {
public class ChatTaskThread1 implements Callable<Flux<String>> {
private ChatService chatService;
private QueryDTO queryDTO;

View File

@ -0,0 +1,23 @@
package com.electromagnetic.industry.software.manage.ai;
import com.electromagnetic.industry.software.manage.pojo.req.QueryDTO;
import com.electromagnetic.industry.software.manage.service.serviceimpl.ChatService;
import lombok.AllArgsConstructor;
import lombok.NoArgsConstructor;
import org.springframework.ai.chat.model.ChatResponse;
import reactor.core.publisher.Flux;
import java.util.concurrent.Callable;
/**
 * {@link Callable} task that asks the {@link ChatService} for a streaming
 * chat answer and hands back the resulting {@code Flux<ChatResponse>}.
 *
 * <p>Both constructors (no-arg and all-args) are generated by Lombok.
 */
@NoArgsConstructor
@AllArgsConstructor
public class ChatTaskThread2 implements Callable<Flux<ChatResponse>> {

    /** Service that performs the streaming chat call. */
    private ChatService chatService;

    /** Query payload forwarded unchanged to the service. */
    private QueryDTO queryDTO;

    /**
     * Delegates to {@code ChatService#chatStreamResponse(QueryDTO)}.
     *
     * @return the response stream produced by the chat service
     * @throws Exception declared by {@link Callable#call()}
     */
    @Override
    public Flux<ChatResponse> call() throws Exception {
        Flux<ChatResponse> responseStream = this.chatService.chatStreamResponse(this.queryDTO);
        return responseStream;
    }
}

View File

@ -2,12 +2,14 @@ package com.electromagnetic.industry.software.manage.controller;
import cn.hutool.core.util.StrUtil;
import com.electromagnetic.industry.software.common.resp.ElectromagneticResult;
import com.electromagnetic.industry.software.manage.ai.ChatTaskThread;
import com.electromagnetic.industry.software.manage.ai.ChatTaskThread1;
import com.electromagnetic.industry.software.manage.ai.ChatTaskThread2;
import com.electromagnetic.industry.software.manage.ai.ThreadUtil;
import com.electromagnetic.industry.software.manage.pojo.req.QueryDTO;
import com.electromagnetic.industry.software.manage.service.serviceimpl.ChatService;
import jakarta.annotation.Resource;
import lombok.extern.slf4j.Slf4j;
import org.springframework.ai.chat.model.ChatResponse;
import org.springframework.http.MediaType;
import org.springframework.web.bind.annotation.*;
import org.springframework.web.multipart.MultipartFile;
@ -44,19 +46,19 @@ public class AiController {
if (StrUtil.isEmpty(queryDTO.getMsg())) {
return Flux.empty();
}
ChatTaskThread chatTaskThread = new ChatTaskThread(chatService, queryDTO);
ChatTaskThread1 chatTaskThread = new ChatTaskThread1(chatService, queryDTO);
Future<Flux<String>> future = ThreadUtil.getThreadPool().submit(chatTaskThread);
return future.get();
}
// @PostMapping(path = "/chatStreamResp", produces = MediaType.TEXT_EVENT_STREAM_VALUE)
// public Flux<ChatResponse> chatStreamResp(@RequestBody QueryDTO queryDTO) throws ExecutionException, InterruptedException {
// if (StrUtil.isEmpty(queryDTO.getMsg())) {
// return Flux.empty();
// }
// ChatTaskThread<Flux<ChatResponse>> chatTaskThread = new ChatTaskThread<>(chatService, queryDTO);
// Future<Flux<ChatResponse>> future = ThreadUtil.getThreadPool().submit(chatTaskThread);
// return future.get();
// }
/**
 * Streams the model's answer for the given query as server-sent events.
 *
 * <p>The chat call is submitted to the shared pool from {@code ThreadUtil}
 * and this method waits for the task to hand back its {@code Flux}.
 *
 * @param queryDTO request body; an empty message yields an empty stream
 * @return the stream of {@link ChatResponse} chunks, or an empty stream when there is no message
 * @throws ExecutionException   if the submitted task failed
 * @throws InterruptedException if the calling thread was interrupted while waiting
 */
@PostMapping(path = "/chatStreamResp", produces = MediaType.TEXT_EVENT_STREAM_VALUE)
public Flux<ChatResponse> chatStreamResp(@RequestBody QueryDTO queryDTO) throws ExecutionException, InterruptedException {
    boolean hasMessage = !StrUtil.isEmpty(queryDTO.getMsg());
    if (hasMessage) {
        ChatTaskThread2 task = new ChatTaskThread2(chatService, queryDTO);
        Future<Flux<ChatResponse>> pending = ThreadUtil.getThreadPool().submit(task);
        return pending.get();
    }
    return Flux.empty();
}
}

View File

@ -1,16 +1,16 @@
package com.electromagnetic.industry.software.manage.service.serviceimpl;
import cn.hutool.core.collection.ListUtil;
import cn.hutool.core.io.FileUtil;
import cn.hutool.core.util.IdUtil;
import cn.hutool.core.util.ObjectUtil;
import cn.hutool.core.util.StrUtil;
import cn.hutool.crypto.digest.DigestUtil;
import com.baomidou.mybatisplus.core.toolkit.Wrappers;
import com.electromagnetic.industry.software.common.enums.EffectFlagEnum;
import com.electromagnetic.industry.software.common.resp.ElectromagneticResult;
import com.electromagnetic.industry.software.common.util.EleCommonUtil;
import com.electromagnetic.industry.software.common.util.ElectromagneticResultUtil;
import com.electromagnetic.industry.software.common.util.IdWorker;
import com.electromagnetic.industry.software.manage.config.ElePropertyConfig;
import com.electromagnetic.industry.software.manage.mapper.AiFileUploadRecordMapper;
import com.electromagnetic.industry.software.manage.pojo.models.AiFileUploadRecord;
import com.electromagnetic.industry.software.manage.pojo.req.QueryDTO;
@ -19,15 +19,24 @@ import lombok.extern.slf4j.Slf4j;
import org.springframework.ai.chat.client.ChatClient;
import org.springframework.ai.chat.client.advisor.MessageChatMemoryAdvisor;
import org.springframework.ai.chat.client.advisor.QuestionAnswerAdvisor;
import org.springframework.ai.chat.messages.UserMessage;
import org.springframework.ai.chat.prompt.Prompt;
import org.springframework.ai.document.Document;
import org.springframework.ai.ollama.OllamaChatModel;
import org.springframework.ai.reader.pdf.PagePdfDocumentReader;
import org.springframework.ai.reader.pdf.config.PdfDocumentReaderConfig;
import org.springframework.ai.vectorstore.VectorStore;
import org.springframework.ai.chat.model.ChatResponse;
import org.springframework.stereotype.Service;
import org.springframework.transaction.annotation.Transactional;
import org.springframework.web.multipart.MultipartFile;
import reactor.core.publisher.Flux;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.List;
import java.util.Objects;
import java.util.Optional;
import java.util.stream.Collectors;
import java.util.stream.Stream;
@ -51,6 +60,9 @@ public class ChatService {
@Resource
private AiFileUploadRecordMapper aiFileUploadRecordMapper;
@Resource
private ElePropertyConfig elePropertyConfig;
public void add(String content) {
List<Document> documents = Stream.of(content).map(Document::new).collect(Collectors.toList());
vectorStore.write(documents);
@ -75,53 +87,55 @@ public class ChatService {
/**
 * Imports an uploaded PDF into the vector store and records the upload.
 *
 * <p>Steps: validate the upload, de-duplicate by MD5, copy the upload to a
 * temporary file, read it with {@link PagePdfDocumentReader}, write the
 * resulting documents to the vector store and insert one
 * {@link AiFileUploadRecord} per document.
 *
 * @param file the uploaded file; must be a non-empty PDF
 * @return a failure result for an empty or non-PDF upload, otherwise a
 *         success result carrying the file's MD5 (also when the same MD5
 *         was already recorded, in which case nothing is re-imported)
 * @throws Exception if hashing, temp-file handling, PDF reading or persistence fails
 */
@Transactional(rollbackFor = Exception.class)
public ElectromagneticResult<?> addFromUpload(MultipartFile file) throws Exception {
    // Reject empty uploads.
    if (file.isEmpty()) {
        return ElectromagneticResultUtil.fail("-1", "文件为空");
    }
    // Only PDF is supported for now; compare case-insensitively so "X.PDF" is accepted too.
    String fileType = FileUtil.extName(file.getOriginalFilename());
    if (!StrUtil.equalsIgnoreCase(fileType, "pdf")) {
        return ElectromagneticResultUtil.fail("-1", "当前仅支持pdf格式文件");
    }
    // Use the MD5 of the content to detect a file that was uploaded before.
    String fileMd5 = DigestUtil.md5Hex(file.getInputStream());
    Long count = aiFileUploadRecordMapper.selectCount(Wrappers.lambdaQuery(AiFileUploadRecord.class)
            .eq(AiFileUploadRecord::getFileMd5, fileMd5));
    if (count > 0) {
        return ElectromagneticResultUtil.success(fileMd5);
    }
    Path tempFile = saveUploadedFileToTemp(file);
    List<Document> documents;
    try {
        PdfDocumentReaderConfig config = PdfDocumentReaderConfig.builder().build();
        PagePdfDocumentReader reader = new PagePdfDocumentReader(String.valueOf(tempFile.toUri().toURL()), config);
        documents = reader.get();
    } finally {
        // Remove the temp copy even when the reader throws, so failed uploads do not pile up on disk.
        // NOTE(review): the temp directory that holds this file is not removed here.
        Files.deleteIfExists(tempFile);
    }
    vectorStore.write(documents);
    // One record per stored document, all sharing the file's MD5, size and name.
    for (Document document : documents) {
        aiFileUploadRecordMapper.insert(new AiFileUploadRecord().setId(IdWorker.getSnowFlakeIdString())
                .setVectorId(document.getId())
                .setFileSize(file.getSize())
                .setFileMd5(fileMd5)
                .setFileName(file.getOriginalFilename()));
    }
    return ElectromagneticResultUtil.success(fileMd5);
}
// public String chat(String msg) {
//
// log.info("Start call model to answer");
//
// return ChatClient.builder(model).defaultAdvisors(messageChatMemoryAdvisor, questionAnswerAdvisor).build().prompt()
// .user(msg)
// .advisors(advisorSpec -> advisorSpec
//// .param(CHAT_MEMORY_CONVERSATION_ID_KEY, queryDTO.getUserId())
// .param(AbstractChatMemoryAdvisor.CHAT_MEMORY_RETRIEVE_SIZE_KEY, 100))
// .call()
// .content();
// }
//
// public Flux<ChatResponse> chatStreamResponse(String msg) {
// ChatClient.StreamResponseSpec stream = ChatClient.builder(model).defaultAdvisors(messageChatMemoryAdvisor, questionAnswerAdvisor).build().prompt(new Prompt(new UserMessage(msg))).stream();
// return stream.chatResponse();
// }
/**
 * Copies the upload into a newly created temporary directory, keeping the
 * client-supplied file name.
 *
 * <p>The original file name comes from the client and is untrusted, so the
 * resolved target is verified to lie inside the new temporary directory
 * before anything is written.
 *
 * @param file the uploaded file; its original file name must not be {@code null}
 * @return path of the temporary copy
 * @throws IOException          if the name would escape the temporary directory,
 *                              or if creating the directory or writing the file fails
 * @throws NullPointerException if the upload carries no original file name
 */
private Path saveUploadedFileToTemp(MultipartFile file) throws IOException {
    // Checked before creating anything so a missing name leaves no empty directory behind.
    String originalName = Objects.requireNonNull(file.getOriginalFilename());
    // toRealPath() resolves symlinks and ".." in the temp root so the containment check is reliable.
    Path tempDir = Files.createTempDirectory(IdUtil.simpleUUID()).toRealPath();
    Path tempFile = tempDir.resolve(originalName).normalize();
    // Refuse names such as "../x.pdf" or absolute paths that would land outside tempDir.
    if (tempFile.equals(tempDir) || !tempFile.startsWith(tempDir)) {
        Files.deleteIfExists(tempDir);
        throw new IOException("Illegal upload file name: " + originalName);
    }
    file.transferTo(tempFile);
    return tempFile;
}
/**
 * Streams the model's answer to the query as plain text chunks.
 *
 * @param queryDTO query whose message is used as the prompt
 * @return stream of content fragments
 */
public Flux<String> chatStreamStr(QueryDTO queryDTO) {
    // A client is built per call with the memory and question-answer advisors attached.
    ChatClient client = ChatClient.builder(model)
            .defaultAdvisors(messageChatMemoryAdvisor, questionAnswerAdvisor)
            .build();
    return client.prompt(queryDTO.getMsg())
            .stream()
            .content();
}
/**
 * Streams the model's answer to the query as full {@link ChatResponse} objects.
 *
 * @param queryDTO query whose message is wrapped in a {@link UserMessage} prompt
 * @return stream of chat responses
 */
public Flux<ChatResponse> chatStreamResponse(QueryDTO queryDTO) {
    // A client is built per call with the memory and question-answer advisors attached.
    ChatClient client = ChatClient.builder(model)
            .defaultAdvisors(messageChatMemoryAdvisor, questionAnswerAdvisor)
            .build();
    Prompt userPrompt = new Prompt(new UserMessage(queryDTO.getMsg()));
    return client.prompt(userPrompt)
            .stream()
            .chatResponse();
}
}

View File

@ -1,6 +1,8 @@
//import com.electromagnetic.industry.software.manage.Application;
//import jakarta.annotation.Resource;
//import org.junit.jupiter.api.Test;
//import org.springframework.ai.document.Document;
//import org.springframework.ai.reader.pdf.PagePdfDocumentReader;
//import org.springframework.ai.vectorstore.VectorStore;
//import org.springframework.boot.test.context.SpringBootTest;
//
@ -14,8 +16,10 @@
//
// @Test
// public void testTree() {
// String id = "c32666b2-36a5-40b5-9048-11349f090cd7";
// vectorStore.delete(List.of(id));
// String path = "D:/wjj.pdf";
// PagePdfDocumentReader pagePdfDocumentReader = new PagePdfDocumentReader(path);
// List<Document> read = pagePdfDocumentReader.read();
// System.out.println("read = " + read);
// }
//
//}

View File

@ -89,7 +89,7 @@
<dependency>
<groupId>org.apache.pdfbox</groupId>
<artifactId>pdfbox</artifactId>
<version>2.0.24</version>
<version>3.0.3</version>
</dependency>
<dependency>
<groupId>javax.xml.bind</groupId>

View File

@ -9,6 +9,7 @@ import com.documents4j.api.IConverter;
import com.documents4j.job.LocalConverter;
import com.electromagnetic.industry.software.common.exception.BizException;
import lombok.extern.slf4j.Slf4j;
import org.apache.pdfbox.Loader;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.text.PDFTextStripper;
import org.apache.poi.hslf.usermodel.HSLFSlideShow;
@ -89,7 +90,7 @@ public class OfficeFileUtil {
public static String parsePdfAllText(String path) throws IOException {
log.info("Start parse pdf file, path is {}", path);
// 加载PDF文档
PDDocument document = PDDocument.load(new File(path));
PDDocument document = Loader.loadPDF(new File(path));
// 创建PDFTextStripper对象来解析文本
PDFTextStripper pdfStripper = new PDFTextStripper();
// 提取文本