问支持word,ppt,excel,text,csv等上传
This commit is contained in:
parent
b0fe94d2ee
commit
46d68f8d4b
|
|
@ -1,17 +1,20 @@
|
||||||
package com.electromagnetic.industry.software.manage.service.serviceimpl;
|
package com.electromagnetic.industry.software.manage.service.serviceimpl;
|
||||||
|
|
||||||
import cn.hutool.core.io.FileUtil;
|
import cn.hutool.core.io.FileUtil;
|
||||||
|
import cn.hutool.core.text.StrFormatter;
|
||||||
import cn.hutool.core.util.IdUtil;
|
import cn.hutool.core.util.IdUtil;
|
||||||
import cn.hutool.core.util.ObjectUtil;
|
import cn.hutool.core.util.ObjectUtil;
|
||||||
import cn.hutool.core.util.StrUtil;
|
|
||||||
import cn.hutool.crypto.digest.DigestUtil;
|
import cn.hutool.crypto.digest.DigestUtil;
|
||||||
import com.baomidou.mybatisplus.core.toolkit.Wrappers;
|
import com.baomidou.mybatisplus.core.toolkit.Wrappers;
|
||||||
import com.electromagnetic.industry.software.common.enums.EffectFlagEnum;
|
import com.electromagnetic.industry.software.common.enums.EffectFlagEnum;
|
||||||
|
import com.electromagnetic.industry.software.common.pojo.PageFile;
|
||||||
import com.electromagnetic.industry.software.common.pojo.UserLoginInfo;
|
import com.electromagnetic.industry.software.common.pojo.UserLoginInfo;
|
||||||
import com.electromagnetic.industry.software.common.resp.ElectromagneticResult;
|
import com.electromagnetic.industry.software.common.resp.ElectromagneticResult;
|
||||||
import com.electromagnetic.industry.software.common.util.ElectromagneticResultUtil;
|
import com.electromagnetic.industry.software.common.util.ElectromagneticResultUtil;
|
||||||
import com.electromagnetic.industry.software.common.util.IdWorker;
|
import com.electromagnetic.industry.software.common.util.IdWorker;
|
||||||
|
import com.electromagnetic.industry.software.common.util.OfficeFileUtil;
|
||||||
import com.electromagnetic.industry.software.common.util.UserThreadLocal;
|
import com.electromagnetic.industry.software.common.util.UserThreadLocal;
|
||||||
|
import com.electromagnetic.industry.software.manage.config.ElePropertyConfig;
|
||||||
import com.electromagnetic.industry.software.manage.mapper.AiFileUploadRecordMapper;
|
import com.electromagnetic.industry.software.manage.mapper.AiFileUploadRecordMapper;
|
||||||
import com.electromagnetic.industry.software.manage.mapper.AiQuestionRecordMapper;
|
import com.electromagnetic.industry.software.manage.mapper.AiQuestionRecordMapper;
|
||||||
import com.electromagnetic.industry.software.manage.pojo.models.AiFileUploadRecord;
|
import com.electromagnetic.industry.software.manage.pojo.models.AiFileUploadRecord;
|
||||||
|
|
@ -25,7 +28,9 @@ import org.springframework.ai.chat.client.advisor.QuestionAnswerAdvisor;
|
||||||
import org.springframework.ai.chat.messages.UserMessage;
|
import org.springframework.ai.chat.messages.UserMessage;
|
||||||
import org.springframework.ai.chat.prompt.Prompt;
|
import org.springframework.ai.chat.prompt.Prompt;
|
||||||
import org.springframework.ai.document.Document;
|
import org.springframework.ai.document.Document;
|
||||||
|
import org.springframework.ai.document.DocumentReader;
|
||||||
import org.springframework.ai.ollama.OllamaChatModel;
|
import org.springframework.ai.ollama.OllamaChatModel;
|
||||||
|
import org.springframework.ai.reader.TextReader;
|
||||||
import org.springframework.ai.reader.pdf.PagePdfDocumentReader;
|
import org.springframework.ai.reader.pdf.PagePdfDocumentReader;
|
||||||
import org.springframework.ai.vectorstore.VectorStore;
|
import org.springframework.ai.vectorstore.VectorStore;
|
||||||
import org.springframework.ai.chat.model.ChatResponse;
|
import org.springframework.ai.chat.model.ChatResponse;
|
||||||
|
|
@ -34,12 +39,11 @@ import org.springframework.transaction.annotation.Transactional;
|
||||||
import org.springframework.web.multipart.MultipartFile;
|
import org.springframework.web.multipart.MultipartFile;
|
||||||
import reactor.core.publisher.Flux;
|
import reactor.core.publisher.Flux;
|
||||||
|
|
||||||
|
import java.io.File;
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.nio.file.Files;
|
import java.nio.file.Files;
|
||||||
import java.nio.file.Path;
|
import java.nio.file.Path;
|
||||||
import java.util.List;
|
import java.util.*;
|
||||||
import java.util.Objects;
|
|
||||||
import java.util.Optional;
|
|
||||||
import java.util.stream.Collectors;
|
import java.util.stream.Collectors;
|
||||||
|
|
||||||
@Slf4j
|
@Slf4j
|
||||||
|
|
@ -64,6 +68,11 @@ public class ChatService {
|
||||||
@Resource
|
@Resource
|
||||||
private AiQuestionRecordMapper aiQuestionRecordMapper;
|
private AiQuestionRecordMapper aiQuestionRecordMapper;
|
||||||
|
|
||||||
|
@Resource
|
||||||
|
private ElePropertyConfig elePropertyConfig;
|
||||||
|
|
||||||
|
private static final List<String> ALLOWS_FILE_TYPES = List.of("doc", "docx", "txt", "csv", "xls", "xlsx", "pdf", "ppt", "pptx");
|
||||||
|
|
||||||
public ElectromagneticResult<?> deleteDocument(List<String> ids) {
|
public ElectromagneticResult<?> deleteDocument(List<String> ids) {
|
||||||
List<String> vectorIds = aiFileUploadRecordMapper.selectList(Wrappers.lambdaQuery(AiFileUploadRecord.class)
|
List<String> vectorIds = aiFileUploadRecordMapper.selectList(Wrappers.lambdaQuery(AiFileUploadRecord.class)
|
||||||
.in(AiFileUploadRecord::getId, ids))
|
.in(AiFileUploadRecord::getId, ids))
|
||||||
|
|
@ -88,10 +97,9 @@ public class ChatService {
|
||||||
return ElectromagneticResultUtil.fail("-1", "文件为空");
|
return ElectromagneticResultUtil.fail("-1", "文件为空");
|
||||||
}
|
}
|
||||||
|
|
||||||
// 当前仅支持pdf文件
|
|
||||||
String fileType = FileUtil.extName(file.getOriginalFilename());
|
String fileType = FileUtil.extName(file.getOriginalFilename());
|
||||||
if (!StrUtil.equals(fileType, "pdf")) {
|
if (!ALLOWS_FILE_TYPES.contains(fileType)) {
|
||||||
return ElectromagneticResultUtil.fail("-1", "当前仅支持pdf格式文件");
|
return ElectromagneticResultUtil.fail("-1", StrFormatter.format("当前格式 {} 不支持", fileType));
|
||||||
}
|
}
|
||||||
|
|
||||||
// 通过md5值判断文件是否被上传过
|
// 通过md5值判断文件是否被上传过
|
||||||
|
|
@ -101,11 +109,109 @@ public class ChatService {
|
||||||
if (count > 0) {
|
if (count > 0) {
|
||||||
return ElectromagneticResultUtil.success(fileMd5);
|
return ElectromagneticResultUtil.success(fileMd5);
|
||||||
}
|
}
|
||||||
|
List<Document> documents = new ArrayList<>();
|
||||||
|
|
||||||
|
switch (Objects.requireNonNull(fileType)) {
|
||||||
|
case "txt", "csv", "text" -> {
|
||||||
|
Path tempFile = saveUploadedFileToTemp(file);
|
||||||
|
DocumentReader documentReader = new TextReader(String.valueOf(tempFile.toUri().toURL()));
|
||||||
|
documents = documentReader.get();
|
||||||
|
Files.deleteIfExists(tempFile);
|
||||||
|
}
|
||||||
|
case "xls" -> {
|
||||||
|
String filePath = elePropertyConfig.getEleTmpPath() + File.separator + fileMd5 + ".xls";
|
||||||
|
List<PageFile> pageInfo = OfficeFileUtil.parseXlsByPage(filePath);
|
||||||
|
for (PageFile pageFile : pageInfo) {
|
||||||
|
Map<String, Object> metaData = Map.of("page_number", pageFile.getPageNumber(), "file_name", file.getOriginalFilename());
|
||||||
|
Document document = new Document(pageFile.getContent(), metaData);
|
||||||
|
documents.add(document);
|
||||||
|
}
|
||||||
|
FileUtil.del(filePath);
|
||||||
|
}
|
||||||
|
case "xlsx" -> {
|
||||||
|
String filePath = elePropertyConfig.getEleTmpPath() + File.separator + fileMd5 + ".xlsx";
|
||||||
|
List<PageFile> pageInfo = OfficeFileUtil.parseXlsxByPage(filePath);
|
||||||
|
for (PageFile pageFile : pageInfo) {
|
||||||
|
Map<String, Object> metaData = Map.of("page_number", pageFile.getPageNumber(), "file_name", file.getOriginalFilename());
|
||||||
|
Document document = new Document(pageFile.getContent(), metaData);
|
||||||
|
documents.add(document);
|
||||||
|
}
|
||||||
|
FileUtil.del(filePath);
|
||||||
|
}
|
||||||
|
case "doc" -> {
|
||||||
|
String wordPath = elePropertyConfig.getEleTmpPath() + File.separator + fileMd5 + ".doc";
|
||||||
|
String pdfPath = elePropertyConfig.getEleTmpPath() + File.separator + fileMd5 + ".pdf";
|
||||||
|
OfficeFileUtil.doc2pdf(wordPath, pdfPath);
|
||||||
|
Path path = new File(pdfPath).toPath();
|
||||||
|
PagePdfDocumentReader reader = new PagePdfDocumentReader(String.valueOf(path.toUri().toURL()));
|
||||||
|
List<Document> tmp = reader.get();
|
||||||
|
for (Document document : tmp) {
|
||||||
|
Map<String, Object> metadata = document.getMetadata();
|
||||||
|
Map<String, Object> metaData = Map.of("page_number", metadata.get("page_number"), "file_name", file.getOriginalFilename());
|
||||||
|
Document doc = new Document(document.getContent(), metaData);
|
||||||
|
documents.add(doc);
|
||||||
|
}
|
||||||
|
Files.deleteIfExists(path);
|
||||||
|
FileUtil.del(wordPath);
|
||||||
|
}
|
||||||
|
case "docx" -> {
|
||||||
|
String wordPath = elePropertyConfig.getEleTmpPath() + File.separator + fileMd5 + ".docx";
|
||||||
|
String pdfPath = elePropertyConfig.getEleTmpPath() + File.separator + fileMd5 + ".pdf";
|
||||||
|
OfficeFileUtil.doc2pdf(wordPath, pdfPath);
|
||||||
|
Path path = new File(pdfPath).toPath();
|
||||||
|
PagePdfDocumentReader reader = new PagePdfDocumentReader(String.valueOf(path.toUri().toURL()));
|
||||||
|
List<Document> tmp = reader.get();
|
||||||
|
for (Document document : tmp) {
|
||||||
|
Map<String, Object> metadata = document.getMetadata();
|
||||||
|
Map<String, Object> metaData = Map.of("page_number", metadata.get("page_number"), "file_name", file.getOriginalFilename());
|
||||||
|
Document doc = new Document(document.getContent(), metaData);
|
||||||
|
documents.add(doc);
|
||||||
|
}
|
||||||
|
Files.deleteIfExists(path);
|
||||||
|
FileUtil.del(wordPath);
|
||||||
|
}
|
||||||
|
case "ppt" -> {
|
||||||
|
String pptPath = elePropertyConfig.getEleTmpPath() + File.separator + fileMd5 + ".ppt";
|
||||||
|
String pdfPath = elePropertyConfig.getEleTmpPath() + File.separator + fileMd5 + ".pdf";
|
||||||
|
OfficeFileUtil.ppt2pdf(pptPath, pdfPath);
|
||||||
|
Path path = new File(pdfPath).toPath();
|
||||||
|
PagePdfDocumentReader reader = new PagePdfDocumentReader(String.valueOf(path.toUri().toURL()));
|
||||||
|
List<Document> tmp = reader.get();
|
||||||
|
for (Document document : tmp) {
|
||||||
|
Map<String, Object> metadata = document.getMetadata();
|
||||||
|
Map<String, Object> metaData = Map.of("page_number", metadata.get("page_number"), "file_name", file.getOriginalFilename());
|
||||||
|
Document doc = new Document(document.getContent(), metaData);
|
||||||
|
documents.add(doc);
|
||||||
|
}
|
||||||
|
Files.deleteIfExists(path);
|
||||||
|
FileUtil.del(pptPath);
|
||||||
|
}
|
||||||
|
case "pptx" -> {
|
||||||
|
String pptxPath = elePropertyConfig.getEleTmpPath() + File.separator + fileMd5 + ".pptx";
|
||||||
|
String pdfPath = elePropertyConfig.getEleTmpPath() + File.separator + fileMd5 + ".pdf";
|
||||||
|
OfficeFileUtil.ppt2pdf(pptxPath, pdfPath);
|
||||||
|
Path path = new File(pdfPath).toPath();
|
||||||
|
PagePdfDocumentReader reader = new PagePdfDocumentReader(String.valueOf(path.toUri().toURL()));
|
||||||
|
List<Document> tmp = reader.get();
|
||||||
|
for (Document document : tmp) {
|
||||||
|
Map<String, Object> metadata = document.getMetadata();
|
||||||
|
Map<String, Object> metaData = Map.of("page_number", metadata.get("page_number"), "file_name", file.getOriginalFilename());
|
||||||
|
Document doc = new Document(document.getContent(), metaData);
|
||||||
|
documents.add(doc);
|
||||||
|
}
|
||||||
|
Files.deleteIfExists(path);
|
||||||
|
FileUtil.del(pptxPath);
|
||||||
|
}
|
||||||
|
case "pdf" -> {
|
||||||
Path tempFile = saveUploadedFileToTemp(file);
|
Path tempFile = saveUploadedFileToTemp(file);
|
||||||
PagePdfDocumentReader reader = new PagePdfDocumentReader(String.valueOf(tempFile.toUri().toURL()));
|
PagePdfDocumentReader reader = new PagePdfDocumentReader(String.valueOf(tempFile.toUri().toURL()));
|
||||||
List<Document> documents = reader.get();
|
documents = reader.get();
|
||||||
Files.deleteIfExists(tempFile);
|
Files.deleteIfExists(tempFile);
|
||||||
|
}
|
||||||
|
default -> {
|
||||||
|
return ElectromagneticResultUtil.fail("-1", StrFormatter.format("当前格式 {} 不支持", fileType));
|
||||||
|
}
|
||||||
|
}
|
||||||
vectorStore.write(documents);
|
vectorStore.write(documents);
|
||||||
for (Document document : documents) {
|
for (Document document : documents) {
|
||||||
aiFileUploadRecordMapper.insert(new AiFileUploadRecord().setId(IdWorker.getSnowFlakeIdString())
|
aiFileUploadRecordMapper.insert(new AiFileUploadRecord().setId(IdWorker.getSnowFlakeIdString())
|
||||||
|
|
|
||||||
|
|
@ -0,0 +1,13 @@
|
||||||
|
package com.electromagnetic.industry.software.common.pojo;
|
||||||
|
|
||||||
|
import lombok.Data;
|
||||||
|
|
||||||
|
@Data
|
||||||
|
public class PageFile {
|
||||||
|
|
||||||
|
private int pageNumber;
|
||||||
|
|
||||||
|
private String content;
|
||||||
|
|
||||||
|
private String fileName;
|
||||||
|
}
|
||||||
|
|
@ -8,6 +8,7 @@ import com.documents4j.api.DocumentType;
|
||||||
import com.documents4j.api.IConverter;
|
import com.documents4j.api.IConverter;
|
||||||
import com.documents4j.job.LocalConverter;
|
import com.documents4j.job.LocalConverter;
|
||||||
import com.electromagnetic.industry.software.common.exception.BizException;
|
import com.electromagnetic.industry.software.common.exception.BizException;
|
||||||
|
import com.electromagnetic.industry.software.common.pojo.PageFile;
|
||||||
import lombok.extern.slf4j.Slf4j;
|
import lombok.extern.slf4j.Slf4j;
|
||||||
import org.apache.pdfbox.Loader;
|
import org.apache.pdfbox.Loader;
|
||||||
import org.apache.pdfbox.pdmodel.PDDocument;
|
import org.apache.pdfbox.pdmodel.PDDocument;
|
||||||
|
|
@ -30,6 +31,7 @@ import java.io.InputStream;
|
||||||
import java.io.OutputStream;
|
import java.io.OutputStream;
|
||||||
import java.nio.file.Files;
|
import java.nio.file.Files;
|
||||||
import java.nio.file.Paths;
|
import java.nio.file.Paths;
|
||||||
|
import java.util.List;
|
||||||
import java.util.Objects;
|
import java.util.Objects;
|
||||||
|
|
||||||
@Slf4j
|
@Slf4j
|
||||||
|
|
@ -46,10 +48,18 @@ public class OfficeFileUtil {
|
||||||
try (InputStream docxInputStream = Files.newInputStream(inputWord.toPath());
|
try (InputStream docxInputStream = Files.newInputStream(inputWord.toPath());
|
||||||
OutputStream outputStream = Files.newOutputStream(outputFile.toPath())) {
|
OutputStream outputStream = Files.newOutputStream(outputFile.toPath())) {
|
||||||
IConverter build = LocalConverter.builder().build();
|
IConverter build = LocalConverter.builder().build();
|
||||||
boolean execute = build.convert(docxInputStream)
|
boolean execute;
|
||||||
|
if (wordPath.endsWith(".docx")) {
|
||||||
|
execute = build.convert(docxInputStream)
|
||||||
.as(DocumentType.DOCX)
|
.as(DocumentType.DOCX)
|
||||||
.to(outputStream)
|
.to(outputStream)
|
||||||
.as(DocumentType.PDF).schedule().get();
|
.as(DocumentType.PDF).schedule().get();
|
||||||
|
} else {
|
||||||
|
execute = build.convert(docxInputStream)
|
||||||
|
.as(DocumentType.DOC)
|
||||||
|
.to(outputStream)
|
||||||
|
.as(DocumentType.PDF).schedule().get();
|
||||||
|
}
|
||||||
Assert.isTrue(execute, "转换失败");
|
Assert.isTrue(execute, "转换失败");
|
||||||
log.info("转换完毕 targetPath = {}", outputFile.getAbsolutePath());
|
log.info("转换完毕 targetPath = {}", outputFile.getAbsolutePath());
|
||||||
build.shutDown();
|
build.shutDown();
|
||||||
|
|
@ -76,6 +86,31 @@ public class OfficeFileUtil {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public static void ppt2pdf(String pptPath, String pdfPath) {
|
||||||
|
if (FileUtil.exist(pdfPath)) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
log.info("Start convert ppt file to pdf, word path: {}, pdf path: {}", pptPath, pdfPath);
|
||||||
|
if (EleCommonUtil.isWinOs()) {
|
||||||
|
throw new BizException("windows平台暂不支持");
|
||||||
|
} else {
|
||||||
|
try {
|
||||||
|
StringBuilder command = new StringBuilder();
|
||||||
|
command.append("unoconv -f pdf -o").append(" ").append(pdfPath).append(" ").append(pptPath);
|
||||||
|
log.info("convert word file to pdf, command: {}", command);
|
||||||
|
Process process = RuntimeUtil.exec(command.toString());
|
||||||
|
process.waitFor();
|
||||||
|
if (process.exitValue() != 0) {
|
||||||
|
String info = StrFormatter.format("ppt文档{}转换成pdf文档{}失败", pptPath, pdfPath);
|
||||||
|
throw new BizException(info);
|
||||||
|
}
|
||||||
|
} catch (Exception e) {
|
||||||
|
log.error(e.getMessage(), e);
|
||||||
|
throw new BizException(e.getMessage(), e);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
public static String parseDocxAllText(String wordPath) throws IOException {
|
public static String parseDocxAllText(String wordPath) throws IOException {
|
||||||
log.info("Start parse docx file, path is {}", wordPath);
|
log.info("Start parse docx file, path is {}", wordPath);
|
||||||
InputStream fis = Files.newInputStream(Paths.get(wordPath));
|
InputStream fis = Files.newInputStream(Paths.get(wordPath));
|
||||||
|
|
@ -194,4 +229,14 @@ public class OfficeFileUtil {
|
||||||
return EleCommonUtil.formateString(stringBuilder.toString());
|
return EleCommonUtil.formateString(stringBuilder.toString());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public static List<PageFile> parseXlsxByPage(String filePath) {
|
||||||
|
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
|
||||||
|
public static List<PageFile> parseXlsByPage(String filePath) {
|
||||||
|
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue