解决AI中上传pdf文件解析的bug
This commit is contained in:
parent
9b177b5c1d
commit
2e1de849b2
|
|
@ -128,6 +128,10 @@
|
||||||
<artifactId>elasticsearch-java</artifactId>
|
<artifactId>elasticsearch-java</artifactId>
|
||||||
<version>8.13.4</version>
|
<version>8.13.4</version>
|
||||||
</dependency>
|
</dependency>
|
||||||
|
<dependency>
|
||||||
|
<groupId>org.springframework.ai</groupId>
|
||||||
|
<artifactId>spring-ai-pdf-document-reader</artifactId>
|
||||||
|
</dependency>
|
||||||
</dependencies>
|
</dependencies>
|
||||||
|
|
||||||
<build>
|
<build>
|
||||||
|
|
|
||||||
|
|
@ -10,7 +10,7 @@ import java.util.concurrent.Callable;
|
||||||
|
|
||||||
@AllArgsConstructor
|
@AllArgsConstructor
|
||||||
@NoArgsConstructor
|
@NoArgsConstructor
|
||||||
public class ChatTaskThread implements Callable<Flux<String>> {
|
public class ChatTaskThread1 implements Callable<Flux<String>> {
|
||||||
|
|
||||||
private ChatService chatService;
|
private ChatService chatService;
|
||||||
private QueryDTO queryDTO;
|
private QueryDTO queryDTO;
|
||||||
|
|
@ -0,0 +1,23 @@
|
||||||
|
package com.electromagnetic.industry.software.manage.ai;
|
||||||
|
|
||||||
|
import com.electromagnetic.industry.software.manage.pojo.req.QueryDTO;
|
||||||
|
import com.electromagnetic.industry.software.manage.service.serviceimpl.ChatService;
|
||||||
|
import lombok.AllArgsConstructor;
|
||||||
|
import lombok.NoArgsConstructor;
|
||||||
|
import org.springframework.ai.chat.model.ChatResponse;
|
||||||
|
import reactor.core.publisher.Flux;
|
||||||
|
|
||||||
|
import java.util.concurrent.Callable;
|
||||||
|
|
||||||
|
@AllArgsConstructor
|
||||||
|
@NoArgsConstructor
|
||||||
|
public class ChatTaskThread2 implements Callable<Flux<ChatResponse>> {
|
||||||
|
|
||||||
|
private ChatService chatService;
|
||||||
|
private QueryDTO queryDTO;
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public Flux<ChatResponse> call() throws Exception {
|
||||||
|
return chatService.chatStreamResponse(queryDTO);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
@ -2,12 +2,14 @@ package com.electromagnetic.industry.software.manage.controller;
|
||||||
|
|
||||||
import cn.hutool.core.util.StrUtil;
|
import cn.hutool.core.util.StrUtil;
|
||||||
import com.electromagnetic.industry.software.common.resp.ElectromagneticResult;
|
import com.electromagnetic.industry.software.common.resp.ElectromagneticResult;
|
||||||
import com.electromagnetic.industry.software.manage.ai.ChatTaskThread;
|
import com.electromagnetic.industry.software.manage.ai.ChatTaskThread1;
|
||||||
|
import com.electromagnetic.industry.software.manage.ai.ChatTaskThread2;
|
||||||
import com.electromagnetic.industry.software.manage.ai.ThreadUtil;
|
import com.electromagnetic.industry.software.manage.ai.ThreadUtil;
|
||||||
import com.electromagnetic.industry.software.manage.pojo.req.QueryDTO;
|
import com.electromagnetic.industry.software.manage.pojo.req.QueryDTO;
|
||||||
import com.electromagnetic.industry.software.manage.service.serviceimpl.ChatService;
|
import com.electromagnetic.industry.software.manage.service.serviceimpl.ChatService;
|
||||||
import jakarta.annotation.Resource;
|
import jakarta.annotation.Resource;
|
||||||
import lombok.extern.slf4j.Slf4j;
|
import lombok.extern.slf4j.Slf4j;
|
||||||
|
import org.springframework.ai.chat.model.ChatResponse;
|
||||||
import org.springframework.http.MediaType;
|
import org.springframework.http.MediaType;
|
||||||
import org.springframework.web.bind.annotation.*;
|
import org.springframework.web.bind.annotation.*;
|
||||||
import org.springframework.web.multipart.MultipartFile;
|
import org.springframework.web.multipart.MultipartFile;
|
||||||
|
|
@ -44,19 +46,19 @@ public class AiController {
|
||||||
if (StrUtil.isEmpty(queryDTO.getMsg())) {
|
if (StrUtil.isEmpty(queryDTO.getMsg())) {
|
||||||
return Flux.empty();
|
return Flux.empty();
|
||||||
}
|
}
|
||||||
ChatTaskThread chatTaskThread = new ChatTaskThread(chatService, queryDTO);
|
ChatTaskThread1 chatTaskThread = new ChatTaskThread1(chatService, queryDTO);
|
||||||
Future<Flux<String>> future = ThreadUtil.getThreadPool().submit(chatTaskThread);
|
Future<Flux<String>> future = ThreadUtil.getThreadPool().submit(chatTaskThread);
|
||||||
return future.get();
|
return future.get();
|
||||||
}
|
}
|
||||||
|
|
||||||
// @PostMapping(path = "/chatStreamResp", produces = MediaType.TEXT_EVENT_STREAM_VALUE)
|
@PostMapping(path = "/chatStreamResp", produces = MediaType.TEXT_EVENT_STREAM_VALUE)
|
||||||
// public Flux<ChatResponse> chatStreamResp(@RequestBody QueryDTO queryDTO) throws ExecutionException, InterruptedException {
|
public Flux<ChatResponse> chatStreamResp(@RequestBody QueryDTO queryDTO) throws ExecutionException, InterruptedException {
|
||||||
// if (StrUtil.isEmpty(queryDTO.getMsg())) {
|
if (StrUtil.isEmpty(queryDTO.getMsg())) {
|
||||||
// return Flux.empty();
|
return Flux.empty();
|
||||||
// }
|
}
|
||||||
// ChatTaskThread<Flux<ChatResponse>> chatTaskThread = new ChatTaskThread<>(chatService, queryDTO);
|
ChatTaskThread2 chatTaskThread2 = new ChatTaskThread2(chatService, queryDTO);
|
||||||
// Future<Flux<ChatResponse>> future = ThreadUtil.getThreadPool().submit(chatTaskThread);
|
Future<Flux<ChatResponse>> future = ThreadUtil.getThreadPool().submit(chatTaskThread2);
|
||||||
// return future.get();
|
return future.get();
|
||||||
// }
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -1,16 +1,16 @@
|
||||||
package com.electromagnetic.industry.software.manage.service.serviceimpl;
|
package com.electromagnetic.industry.software.manage.service.serviceimpl;
|
||||||
|
|
||||||
import cn.hutool.core.collection.ListUtil;
|
|
||||||
import cn.hutool.core.io.FileUtil;
|
import cn.hutool.core.io.FileUtil;
|
||||||
|
import cn.hutool.core.util.IdUtil;
|
||||||
import cn.hutool.core.util.ObjectUtil;
|
import cn.hutool.core.util.ObjectUtil;
|
||||||
import cn.hutool.core.util.StrUtil;
|
import cn.hutool.core.util.StrUtil;
|
||||||
import cn.hutool.crypto.digest.DigestUtil;
|
import cn.hutool.crypto.digest.DigestUtil;
|
||||||
import com.baomidou.mybatisplus.core.toolkit.Wrappers;
|
import com.baomidou.mybatisplus.core.toolkit.Wrappers;
|
||||||
import com.electromagnetic.industry.software.common.enums.EffectFlagEnum;
|
import com.electromagnetic.industry.software.common.enums.EffectFlagEnum;
|
||||||
import com.electromagnetic.industry.software.common.resp.ElectromagneticResult;
|
import com.electromagnetic.industry.software.common.resp.ElectromagneticResult;
|
||||||
import com.electromagnetic.industry.software.common.util.EleCommonUtil;
|
|
||||||
import com.electromagnetic.industry.software.common.util.ElectromagneticResultUtil;
|
import com.electromagnetic.industry.software.common.util.ElectromagneticResultUtil;
|
||||||
import com.electromagnetic.industry.software.common.util.IdWorker;
|
import com.electromagnetic.industry.software.common.util.IdWorker;
|
||||||
|
import com.electromagnetic.industry.software.manage.config.ElePropertyConfig;
|
||||||
import com.electromagnetic.industry.software.manage.mapper.AiFileUploadRecordMapper;
|
import com.electromagnetic.industry.software.manage.mapper.AiFileUploadRecordMapper;
|
||||||
import com.electromagnetic.industry.software.manage.pojo.models.AiFileUploadRecord;
|
import com.electromagnetic.industry.software.manage.pojo.models.AiFileUploadRecord;
|
||||||
import com.electromagnetic.industry.software.manage.pojo.req.QueryDTO;
|
import com.electromagnetic.industry.software.manage.pojo.req.QueryDTO;
|
||||||
|
|
@ -19,15 +19,24 @@ import lombok.extern.slf4j.Slf4j;
|
||||||
import org.springframework.ai.chat.client.ChatClient;
|
import org.springframework.ai.chat.client.ChatClient;
|
||||||
import org.springframework.ai.chat.client.advisor.MessageChatMemoryAdvisor;
|
import org.springframework.ai.chat.client.advisor.MessageChatMemoryAdvisor;
|
||||||
import org.springframework.ai.chat.client.advisor.QuestionAnswerAdvisor;
|
import org.springframework.ai.chat.client.advisor.QuestionAnswerAdvisor;
|
||||||
|
import org.springframework.ai.chat.messages.UserMessage;
|
||||||
|
import org.springframework.ai.chat.prompt.Prompt;
|
||||||
import org.springframework.ai.document.Document;
|
import org.springframework.ai.document.Document;
|
||||||
import org.springframework.ai.ollama.OllamaChatModel;
|
import org.springframework.ai.ollama.OllamaChatModel;
|
||||||
|
import org.springframework.ai.reader.pdf.PagePdfDocumentReader;
|
||||||
|
import org.springframework.ai.reader.pdf.config.PdfDocumentReaderConfig;
|
||||||
import org.springframework.ai.vectorstore.VectorStore;
|
import org.springframework.ai.vectorstore.VectorStore;
|
||||||
|
import org.springframework.ai.chat.model.ChatResponse;
|
||||||
import org.springframework.stereotype.Service;
|
import org.springframework.stereotype.Service;
|
||||||
import org.springframework.transaction.annotation.Transactional;
|
import org.springframework.transaction.annotation.Transactional;
|
||||||
import org.springframework.web.multipart.MultipartFile;
|
import org.springframework.web.multipart.MultipartFile;
|
||||||
import reactor.core.publisher.Flux;
|
import reactor.core.publisher.Flux;
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.nio.file.Files;
|
||||||
|
import java.nio.file.Path;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
|
import java.util.Objects;
|
||||||
import java.util.Optional;
|
import java.util.Optional;
|
||||||
import java.util.stream.Collectors;
|
import java.util.stream.Collectors;
|
||||||
import java.util.stream.Stream;
|
import java.util.stream.Stream;
|
||||||
|
|
@ -51,6 +60,9 @@ public class ChatService {
|
||||||
@Resource
|
@Resource
|
||||||
private AiFileUploadRecordMapper aiFileUploadRecordMapper;
|
private AiFileUploadRecordMapper aiFileUploadRecordMapper;
|
||||||
|
|
||||||
|
@Resource
|
||||||
|
private ElePropertyConfig elePropertyConfig;
|
||||||
|
|
||||||
public void add(String content) {
|
public void add(String content) {
|
||||||
List<Document> documents = Stream.of(content).map(Document::new).collect(Collectors.toList());
|
List<Document> documents = Stream.of(content).map(Document::new).collect(Collectors.toList());
|
||||||
vectorStore.write(documents);
|
vectorStore.write(documents);
|
||||||
|
|
@ -75,53 +87,55 @@ public class ChatService {
|
||||||
@Transactional(rollbackFor = Exception.class)
|
@Transactional(rollbackFor = Exception.class)
|
||||||
public ElectromagneticResult<?> addFromUpload(MultipartFile file) throws Exception {
|
public ElectromagneticResult<?> addFromUpload(MultipartFile file) throws Exception {
|
||||||
|
|
||||||
|
// 文件是否为空
|
||||||
if (file.isEmpty()) {
|
if (file.isEmpty()) {
|
||||||
return ElectromagneticResultUtil.fail("-1", "文件为空");
|
return ElectromagneticResultUtil.fail("-1", "文件为空");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// 当前仅支持pdf文件
|
||||||
String fileType = FileUtil.extName(file.getOriginalFilename());
|
String fileType = FileUtil.extName(file.getOriginalFilename());
|
||||||
if (!StrUtil.equals(fileType, "pdf")) {
|
if (!StrUtil.equals(fileType, "pdf")) {
|
||||||
return ElectromagneticResultUtil.fail("-1", "当前仅支持pdf格式文件");
|
return ElectromagneticResultUtil.fail("-1", "当前仅支持pdf格式文件");
|
||||||
}
|
}
|
||||||
String fileMd5 = DigestUtil.md5Hex(file.getInputStream());
|
|
||||||
|
|
||||||
|
// 通过md5值判断文件是否被上传过
|
||||||
|
String fileMd5 = DigestUtil.md5Hex(file.getInputStream());
|
||||||
Long count = aiFileUploadRecordMapper.selectCount(Wrappers.lambdaQuery(AiFileUploadRecord.class)
|
Long count = aiFileUploadRecordMapper.selectCount(Wrappers.lambdaQuery(AiFileUploadRecord.class)
|
||||||
.eq(AiFileUploadRecord::getFileMd5, fileMd5));
|
.eq(AiFileUploadRecord::getFileMd5, fileMd5));
|
||||||
if (count > 0) {
|
if (count > 0) {
|
||||||
return ElectromagneticResultUtil.success(fileMd5);
|
return ElectromagneticResultUtil.success(fileMd5);
|
||||||
}
|
}
|
||||||
|
|
||||||
String content = EleCommonUtil.parse(file.getInputStream(), "pdf");
|
Path tempFile = saveUploadedFileToTemp(file);
|
||||||
Document document = new Document(content);
|
PdfDocumentReaderConfig config = PdfDocumentReaderConfig.builder().build();
|
||||||
vectorStore.write(ListUtil.of(document));
|
PagePdfDocumentReader reader = new PagePdfDocumentReader(String.valueOf(tempFile.toUri().toURL()), config);
|
||||||
aiFileUploadRecordMapper.insert(new AiFileUploadRecord().setId(IdWorker.getSnowFlakeIdString())
|
List<Document> documents = reader.get();
|
||||||
.setVectorId(document.getId())
|
Files.deleteIfExists(tempFile);
|
||||||
.setFileSize(file.getSize())
|
vectorStore.write(documents);
|
||||||
.setFileMd5(fileMd5)
|
for (Document document : documents) {
|
||||||
.setFileName(file.getOriginalFilename()));
|
aiFileUploadRecordMapper.insert(new AiFileUploadRecord().setId(IdWorker.getSnowFlakeIdString())
|
||||||
|
.setVectorId(document.getId())
|
||||||
|
.setFileSize(file.getSize())
|
||||||
|
.setFileMd5(fileMd5)
|
||||||
|
.setFileName(file.getOriginalFilename()));
|
||||||
|
}
|
||||||
return ElectromagneticResultUtil.success(fileMd5);
|
return ElectromagneticResultUtil.success(fileMd5);
|
||||||
}
|
}
|
||||||
|
|
||||||
// public String chat(String msg) {
|
private Path saveUploadedFileToTemp(MultipartFile file) throws IOException {
|
||||||
//
|
Path tempDir = Files.createTempDirectory(IdUtil.simpleUUID());
|
||||||
// log.info("Start call model to answer");
|
Path tempFile = tempDir.resolve(Objects.requireNonNull(file.getOriginalFilename()));
|
||||||
//
|
file.transferTo(tempFile);
|
||||||
// return ChatClient.builder(model).defaultAdvisors(messageChatMemoryAdvisor, questionAnswerAdvisor).build().prompt()
|
return tempFile;
|
||||||
// .user(msg)
|
}
|
||||||
// .advisors(advisorSpec -> advisorSpec
|
|
||||||
//// .param(CHAT_MEMORY_CONVERSATION_ID_KEY, queryDTO.getUserId())
|
|
||||||
// .param(AbstractChatMemoryAdvisor.CHAT_MEMORY_RETRIEVE_SIZE_KEY, 100))
|
|
||||||
// .call()
|
|
||||||
// .content();
|
|
||||||
// }
|
|
||||||
//
|
|
||||||
// public Flux<ChatResponse> chatStreamResponse(String msg) {
|
|
||||||
// ChatClient.StreamResponseSpec stream = ChatClient.builder(model).defaultAdvisors(messageChatMemoryAdvisor, questionAnswerAdvisor).build().prompt(new Prompt(new UserMessage(msg))).stream();
|
|
||||||
// return stream.chatResponse();
|
|
||||||
// }
|
|
||||||
|
|
||||||
public Flux<String> chatStreamStr(QueryDTO queryDTO) {
|
public Flux<String> chatStreamStr(QueryDTO queryDTO) {
|
||||||
return ChatClient.builder(model).defaultAdvisors(messageChatMemoryAdvisor, questionAnswerAdvisor).build().prompt(queryDTO.getMsg()).stream().content();
|
return ChatClient.builder(model).defaultAdvisors(messageChatMemoryAdvisor, questionAnswerAdvisor).build().prompt(queryDTO.getMsg()).stream().content();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public Flux<ChatResponse> chatStreamResponse(QueryDTO queryDTO) {
|
||||||
|
return ChatClient.builder(model).defaultAdvisors(messageChatMemoryAdvisor, questionAnswerAdvisor).build()
|
||||||
|
.prompt(new Prompt(new UserMessage(queryDTO.getMsg())))
|
||||||
|
.stream().chatResponse();
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -1,6 +1,8 @@
|
||||||
//import com.electromagnetic.industry.software.manage.Application;
|
//import com.electromagnetic.industry.software.manage.Application;
|
||||||
//import jakarta.annotation.Resource;
|
//import jakarta.annotation.Resource;
|
||||||
//import org.junit.jupiter.api.Test;
|
//import org.junit.jupiter.api.Test;
|
||||||
|
//import org.springframework.ai.document.Document;
|
||||||
|
//import org.springframework.ai.reader.pdf.PagePdfDocumentReader;
|
||||||
//import org.springframework.ai.vectorstore.VectorStore;
|
//import org.springframework.ai.vectorstore.VectorStore;
|
||||||
//import org.springframework.boot.test.context.SpringBootTest;
|
//import org.springframework.boot.test.context.SpringBootTest;
|
||||||
//
|
//
|
||||||
|
|
@ -14,8 +16,10 @@
|
||||||
//
|
//
|
||||||
// @Test
|
// @Test
|
||||||
// public void testTree() {
|
// public void testTree() {
|
||||||
// String id = "c32666b2-36a5-40b5-9048-11349f090cd7";
|
// String path = "D:/wjj.pdf";
|
||||||
// vectorStore.delete(List.of(id));
|
// PagePdfDocumentReader pagePdfDocumentReader = new PagePdfDocumentReader(path);
|
||||||
|
// List<Document> read = pagePdfDocumentReader.read();
|
||||||
|
// System.out.println("read = " + read);
|
||||||
// }
|
// }
|
||||||
//
|
//
|
||||||
//}
|
//}
|
||||||
|
|
|
||||||
|
|
@ -89,7 +89,7 @@
|
||||||
<dependency>
|
<dependency>
|
||||||
<groupId>org.apache.pdfbox</groupId>
|
<groupId>org.apache.pdfbox</groupId>
|
||||||
<artifactId>pdfbox</artifactId>
|
<artifactId>pdfbox</artifactId>
|
||||||
<version>2.0.24</version>
|
<version>3.0.3</version>
|
||||||
</dependency>
|
</dependency>
|
||||||
<dependency>
|
<dependency>
|
||||||
<groupId>javax.xml.bind</groupId>
|
<groupId>javax.xml.bind</groupId>
|
||||||
|
|
|
||||||
|
|
@ -9,6 +9,7 @@ import com.documents4j.api.IConverter;
|
||||||
import com.documents4j.job.LocalConverter;
|
import com.documents4j.job.LocalConverter;
|
||||||
import com.electromagnetic.industry.software.common.exception.BizException;
|
import com.electromagnetic.industry.software.common.exception.BizException;
|
||||||
import lombok.extern.slf4j.Slf4j;
|
import lombok.extern.slf4j.Slf4j;
|
||||||
|
import org.apache.pdfbox.Loader;
|
||||||
import org.apache.pdfbox.pdmodel.PDDocument;
|
import org.apache.pdfbox.pdmodel.PDDocument;
|
||||||
import org.apache.pdfbox.text.PDFTextStripper;
|
import org.apache.pdfbox.text.PDFTextStripper;
|
||||||
import org.apache.poi.hslf.usermodel.HSLFSlideShow;
|
import org.apache.poi.hslf.usermodel.HSLFSlideShow;
|
||||||
|
|
@ -89,7 +90,7 @@ public class OfficeFileUtil {
|
||||||
public static String parsePdfAllText(String path) throws IOException {
|
public static String parsePdfAllText(String path) throws IOException {
|
||||||
log.info("Start parse pdf file, path is {}", path);
|
log.info("Start parse pdf file, path is {}", path);
|
||||||
// 加载PDF文档
|
// 加载PDF文档
|
||||||
PDDocument document = PDDocument.load(new File(path));
|
PDDocument document = Loader.loadPDF(new File(path));
|
||||||
// 创建PDFTextStripper对象来解析文本
|
// 创建PDFTextStripper对象来解析文本
|
||||||
PDFTextStripper pdfStripper = new PDFTextStripper();
|
PDFTextStripper pdfStripper = new PDFTextStripper();
|
||||||
// 提取文本
|
// 提取文本
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue