解决AI中上传pdf文件解析的bug
This commit is contained in:
parent
9b177b5c1d
commit
2e1de849b2
|
|
@ -128,6 +128,10 @@
|
|||
<artifactId>elasticsearch-java</artifactId>
|
||||
<version>8.13.4</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.springframework.ai</groupId>
|
||||
<artifactId>spring-ai-pdf-document-reader</artifactId>
|
||||
</dependency>
|
||||
</dependencies>
|
||||
|
||||
<build>
|
||||
|
|
|
|||
|
|
@ -10,7 +10,7 @@ import java.util.concurrent.Callable;
|
|||
|
||||
@AllArgsConstructor
|
||||
@NoArgsConstructor
|
||||
public class ChatTaskThread implements Callable<Flux<String>> {
|
||||
public class ChatTaskThread1 implements Callable<Flux<String>> {
|
||||
|
||||
private ChatService chatService;
|
||||
private QueryDTO queryDTO;
|
||||
|
|
@ -0,0 +1,23 @@
|
|||
package com.electromagnetic.industry.software.manage.ai;
|
||||
|
||||
import com.electromagnetic.industry.software.manage.pojo.req.QueryDTO;
|
||||
import com.electromagnetic.industry.software.manage.service.serviceimpl.ChatService;
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.NoArgsConstructor;
|
||||
import org.springframework.ai.chat.model.ChatResponse;
|
||||
import reactor.core.publisher.Flux;
|
||||
|
||||
import java.util.concurrent.Callable;
|
||||
|
||||
@AllArgsConstructor
|
||||
@NoArgsConstructor
|
||||
public class ChatTaskThread2 implements Callable<Flux<ChatResponse>> {
|
||||
|
||||
private ChatService chatService;
|
||||
private QueryDTO queryDTO;
|
||||
|
||||
@Override
|
||||
public Flux<ChatResponse> call() throws Exception {
|
||||
return chatService.chatStreamResponse(queryDTO);
|
||||
}
|
||||
}
|
||||
|
|
@ -2,12 +2,14 @@ package com.electromagnetic.industry.software.manage.controller;
|
|||
|
||||
import cn.hutool.core.util.StrUtil;
|
||||
import com.electromagnetic.industry.software.common.resp.ElectromagneticResult;
|
||||
import com.electromagnetic.industry.software.manage.ai.ChatTaskThread;
|
||||
import com.electromagnetic.industry.software.manage.ai.ChatTaskThread1;
|
||||
import com.electromagnetic.industry.software.manage.ai.ChatTaskThread2;
|
||||
import com.electromagnetic.industry.software.manage.ai.ThreadUtil;
|
||||
import com.electromagnetic.industry.software.manage.pojo.req.QueryDTO;
|
||||
import com.electromagnetic.industry.software.manage.service.serviceimpl.ChatService;
|
||||
import jakarta.annotation.Resource;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
import org.springframework.ai.chat.model.ChatResponse;
|
||||
import org.springframework.http.MediaType;
|
||||
import org.springframework.web.bind.annotation.*;
|
||||
import org.springframework.web.multipart.MultipartFile;
|
||||
|
|
@ -44,19 +46,19 @@ public class AiController {
|
|||
if (StrUtil.isEmpty(queryDTO.getMsg())) {
|
||||
return Flux.empty();
|
||||
}
|
||||
ChatTaskThread chatTaskThread = new ChatTaskThread(chatService, queryDTO);
|
||||
ChatTaskThread1 chatTaskThread = new ChatTaskThread1(chatService, queryDTO);
|
||||
Future<Flux<String>> future = ThreadUtil.getThreadPool().submit(chatTaskThread);
|
||||
return future.get();
|
||||
}
|
||||
|
||||
// @PostMapping(path = "/chatStreamResp", produces = MediaType.TEXT_EVENT_STREAM_VALUE)
|
||||
// public Flux<ChatResponse> chatStreamResp(@RequestBody QueryDTO queryDTO) throws ExecutionException, InterruptedException {
|
||||
// if (StrUtil.isEmpty(queryDTO.getMsg())) {
|
||||
// return Flux.empty();
|
||||
// }
|
||||
// ChatTaskThread<Flux<ChatResponse>> chatTaskThread = new ChatTaskThread<>(chatService, queryDTO);
|
||||
// Future<Flux<ChatResponse>> future = ThreadUtil.getThreadPool().submit(chatTaskThread);
|
||||
// return future.get();
|
||||
// }
|
||||
@PostMapping(path = "/chatStreamResp", produces = MediaType.TEXT_EVENT_STREAM_VALUE)
|
||||
public Flux<ChatResponse> chatStreamResp(@RequestBody QueryDTO queryDTO) throws ExecutionException, InterruptedException {
|
||||
if (StrUtil.isEmpty(queryDTO.getMsg())) {
|
||||
return Flux.empty();
|
||||
}
|
||||
ChatTaskThread2 chatTaskThread2 = new ChatTaskThread2(chatService, queryDTO);
|
||||
Future<Flux<ChatResponse>> future = ThreadUtil.getThreadPool().submit(chatTaskThread2);
|
||||
return future.get();
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
|||
|
|
@ -1,16 +1,16 @@
|
|||
package com.electromagnetic.industry.software.manage.service.serviceimpl;
|
||||
|
||||
import cn.hutool.core.collection.ListUtil;
|
||||
import cn.hutool.core.io.FileUtil;
|
||||
import cn.hutool.core.util.IdUtil;
|
||||
import cn.hutool.core.util.ObjectUtil;
|
||||
import cn.hutool.core.util.StrUtil;
|
||||
import cn.hutool.crypto.digest.DigestUtil;
|
||||
import com.baomidou.mybatisplus.core.toolkit.Wrappers;
|
||||
import com.electromagnetic.industry.software.common.enums.EffectFlagEnum;
|
||||
import com.electromagnetic.industry.software.common.resp.ElectromagneticResult;
|
||||
import com.electromagnetic.industry.software.common.util.EleCommonUtil;
|
||||
import com.electromagnetic.industry.software.common.util.ElectromagneticResultUtil;
|
||||
import com.electromagnetic.industry.software.common.util.IdWorker;
|
||||
import com.electromagnetic.industry.software.manage.config.ElePropertyConfig;
|
||||
import com.electromagnetic.industry.software.manage.mapper.AiFileUploadRecordMapper;
|
||||
import com.electromagnetic.industry.software.manage.pojo.models.AiFileUploadRecord;
|
||||
import com.electromagnetic.industry.software.manage.pojo.req.QueryDTO;
|
||||
|
|
@ -19,15 +19,24 @@ import lombok.extern.slf4j.Slf4j;
|
|||
import org.springframework.ai.chat.client.ChatClient;
|
||||
import org.springframework.ai.chat.client.advisor.MessageChatMemoryAdvisor;
|
||||
import org.springframework.ai.chat.client.advisor.QuestionAnswerAdvisor;
|
||||
import org.springframework.ai.chat.messages.UserMessage;
|
||||
import org.springframework.ai.chat.prompt.Prompt;
|
||||
import org.springframework.ai.document.Document;
|
||||
import org.springframework.ai.ollama.OllamaChatModel;
|
||||
import org.springframework.ai.reader.pdf.PagePdfDocumentReader;
|
||||
import org.springframework.ai.reader.pdf.config.PdfDocumentReaderConfig;
|
||||
import org.springframework.ai.vectorstore.VectorStore;
|
||||
import org.springframework.ai.chat.model.ChatResponse;
|
||||
import org.springframework.stereotype.Service;
|
||||
import org.springframework.transaction.annotation.Transactional;
|
||||
import org.springframework.web.multipart.MultipartFile;
|
||||
import reactor.core.publisher.Flux;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
import java.util.List;
|
||||
import java.util.Objects;
|
||||
import java.util.Optional;
|
||||
import java.util.stream.Collectors;
|
||||
import java.util.stream.Stream;
|
||||
|
|
@ -51,6 +60,9 @@ public class ChatService {
|
|||
@Resource
|
||||
private AiFileUploadRecordMapper aiFileUploadRecordMapper;
|
||||
|
||||
@Resource
|
||||
private ElePropertyConfig elePropertyConfig;
|
||||
|
||||
public void add(String content) {
|
||||
List<Document> documents = Stream.of(content).map(Document::new).collect(Collectors.toList());
|
||||
vectorStore.write(documents);
|
||||
|
|
@ -75,53 +87,55 @@ public class ChatService {
|
|||
@Transactional(rollbackFor = Exception.class)
|
||||
public ElectromagneticResult<?> addFromUpload(MultipartFile file) throws Exception {
|
||||
|
||||
// 文件是否为空
|
||||
if (file.isEmpty()) {
|
||||
return ElectromagneticResultUtil.fail("-1", "文件为空");
|
||||
}
|
||||
|
||||
// 当前仅支持pdf文件
|
||||
String fileType = FileUtil.extName(file.getOriginalFilename());
|
||||
if (!StrUtil.equals(fileType, "pdf")) {
|
||||
return ElectromagneticResultUtil.fail("-1", "当前仅支持pdf格式文件");
|
||||
}
|
||||
String fileMd5 = DigestUtil.md5Hex(file.getInputStream());
|
||||
|
||||
// 通过md5值判断文件是否被上传过
|
||||
String fileMd5 = DigestUtil.md5Hex(file.getInputStream());
|
||||
Long count = aiFileUploadRecordMapper.selectCount(Wrappers.lambdaQuery(AiFileUploadRecord.class)
|
||||
.eq(AiFileUploadRecord::getFileMd5, fileMd5));
|
||||
if (count > 0) {
|
||||
return ElectromagneticResultUtil.success(fileMd5);
|
||||
}
|
||||
|
||||
String content = EleCommonUtil.parse(file.getInputStream(), "pdf");
|
||||
Document document = new Document(content);
|
||||
vectorStore.write(ListUtil.of(document));
|
||||
Path tempFile = saveUploadedFileToTemp(file);
|
||||
PdfDocumentReaderConfig config = PdfDocumentReaderConfig.builder().build();
|
||||
PagePdfDocumentReader reader = new PagePdfDocumentReader(String.valueOf(tempFile.toUri().toURL()), config);
|
||||
List<Document> documents = reader.get();
|
||||
Files.deleteIfExists(tempFile);
|
||||
vectorStore.write(documents);
|
||||
for (Document document : documents) {
|
||||
aiFileUploadRecordMapper.insert(new AiFileUploadRecord().setId(IdWorker.getSnowFlakeIdString())
|
||||
.setVectorId(document.getId())
|
||||
.setFileSize(file.getSize())
|
||||
.setFileMd5(fileMd5)
|
||||
.setFileName(file.getOriginalFilename()));
|
||||
|
||||
}
|
||||
return ElectromagneticResultUtil.success(fileMd5);
|
||||
}
|
||||
|
||||
// public String chat(String msg) {
|
||||
//
|
||||
// log.info("Start call model to answer");
|
||||
//
|
||||
// return ChatClient.builder(model).defaultAdvisors(messageChatMemoryAdvisor, questionAnswerAdvisor).build().prompt()
|
||||
// .user(msg)
|
||||
// .advisors(advisorSpec -> advisorSpec
|
||||
//// .param(CHAT_MEMORY_CONVERSATION_ID_KEY, queryDTO.getUserId())
|
||||
// .param(AbstractChatMemoryAdvisor.CHAT_MEMORY_RETRIEVE_SIZE_KEY, 100))
|
||||
// .call()
|
||||
// .content();
|
||||
// }
|
||||
//
|
||||
// public Flux<ChatResponse> chatStreamResponse(String msg) {
|
||||
// ChatClient.StreamResponseSpec stream = ChatClient.builder(model).defaultAdvisors(messageChatMemoryAdvisor, questionAnswerAdvisor).build().prompt(new Prompt(new UserMessage(msg))).stream();
|
||||
// return stream.chatResponse();
|
||||
// }
|
||||
private Path saveUploadedFileToTemp(MultipartFile file) throws IOException {
|
||||
Path tempDir = Files.createTempDirectory(IdUtil.simpleUUID());
|
||||
Path tempFile = tempDir.resolve(Objects.requireNonNull(file.getOriginalFilename()));
|
||||
file.transferTo(tempFile);
|
||||
return tempFile;
|
||||
}
|
||||
|
||||
public Flux<String> chatStreamStr(QueryDTO queryDTO) {
|
||||
return ChatClient.builder(model).defaultAdvisors(messageChatMemoryAdvisor, questionAnswerAdvisor).build().prompt(queryDTO.getMsg()).stream().content();
|
||||
}
|
||||
|
||||
public Flux<ChatResponse> chatStreamResponse(QueryDTO queryDTO) {
|
||||
return ChatClient.builder(model).defaultAdvisors(messageChatMemoryAdvisor, questionAnswerAdvisor).build()
|
||||
.prompt(new Prompt(new UserMessage(queryDTO.getMsg())))
|
||||
.stream().chatResponse();
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -1,6 +1,8 @@
|
|||
//import com.electromagnetic.industry.software.manage.Application;
|
||||
//import jakarta.annotation.Resource;
|
||||
//import org.junit.jupiter.api.Test;
|
||||
//import org.springframework.ai.document.Document;
|
||||
//import org.springframework.ai.reader.pdf.PagePdfDocumentReader;
|
||||
//import org.springframework.ai.vectorstore.VectorStore;
|
||||
//import org.springframework.boot.test.context.SpringBootTest;
|
||||
//
|
||||
|
|
@ -14,8 +16,10 @@
|
|||
//
|
||||
// @Test
|
||||
// public void testTree() {
|
||||
// String id = "c32666b2-36a5-40b5-9048-11349f090cd7";
|
||||
// vectorStore.delete(List.of(id));
|
||||
// String path = "D:/wjj.pdf";
|
||||
// PagePdfDocumentReader pagePdfDocumentReader = new PagePdfDocumentReader(path);
|
||||
// List<Document> read = pagePdfDocumentReader.read();
|
||||
// System.out.println("read = " + read);
|
||||
// }
|
||||
//
|
||||
//}
|
||||
|
|
|
|||
|
|
@ -89,7 +89,7 @@
|
|||
<dependency>
|
||||
<groupId>org.apache.pdfbox</groupId>
|
||||
<artifactId>pdfbox</artifactId>
|
||||
<version>2.0.24</version>
|
||||
<version>3.0.3</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>javax.xml.bind</groupId>
|
||||
|
|
|
|||
|
|
@ -9,6 +9,7 @@ import com.documents4j.api.IConverter;
|
|||
import com.documents4j.job.LocalConverter;
|
||||
import com.electromagnetic.industry.software.common.exception.BizException;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
import org.apache.pdfbox.Loader;
|
||||
import org.apache.pdfbox.pdmodel.PDDocument;
|
||||
import org.apache.pdfbox.text.PDFTextStripper;
|
||||
import org.apache.poi.hslf.usermodel.HSLFSlideShow;
|
||||
|
|
@ -89,7 +90,7 @@ public class OfficeFileUtil {
|
|||
public static String parsePdfAllText(String path) throws IOException {
|
||||
log.info("Start parse pdf file, path is {}", path);
|
||||
// 加载PDF文档
|
||||
PDDocument document = PDDocument.load(new File(path));
|
||||
PDDocument document = Loader.loadPDF(new File(path));
|
||||
// 创建PDFTextStripper对象来解析文本
|
||||
PDFTextStripper pdfStripper = new PDFTextStripper();
|
||||
// 提取文本
|
||||
|
|
|
|||
Loading…
Reference in New Issue