问支持word,ppt,excel,text,csv等上传

This commit is contained in:
chenxudong 2025-04-07 15:10:14 +08:00
parent b0fe94d2ee
commit 46d68f8d4b
3 changed files with 179 additions and 15 deletions

View File

@ -1,17 +1,20 @@
package com.electromagnetic.industry.software.manage.service.serviceimpl;
import cn.hutool.core.io.FileUtil;
import cn.hutool.core.text.StrFormatter;
import cn.hutool.core.util.IdUtil;
import cn.hutool.core.util.ObjectUtil;
import cn.hutool.core.util.StrUtil;
import cn.hutool.crypto.digest.DigestUtil;
import com.baomidou.mybatisplus.core.toolkit.Wrappers;
import com.electromagnetic.industry.software.common.enums.EffectFlagEnum;
import com.electromagnetic.industry.software.common.pojo.PageFile;
import com.electromagnetic.industry.software.common.pojo.UserLoginInfo;
import com.electromagnetic.industry.software.common.resp.ElectromagneticResult;
import com.electromagnetic.industry.software.common.util.ElectromagneticResultUtil;
import com.electromagnetic.industry.software.common.util.IdWorker;
import com.electromagnetic.industry.software.common.util.OfficeFileUtil;
import com.electromagnetic.industry.software.common.util.UserThreadLocal;
import com.electromagnetic.industry.software.manage.config.ElePropertyConfig;
import com.electromagnetic.industry.software.manage.mapper.AiFileUploadRecordMapper;
import com.electromagnetic.industry.software.manage.mapper.AiQuestionRecordMapper;
import com.electromagnetic.industry.software.manage.pojo.models.AiFileUploadRecord;
@ -25,7 +28,9 @@ import org.springframework.ai.chat.client.advisor.QuestionAnswerAdvisor;
import org.springframework.ai.chat.messages.UserMessage;
import org.springframework.ai.chat.prompt.Prompt;
import org.springframework.ai.document.Document;
import org.springframework.ai.document.DocumentReader;
import org.springframework.ai.ollama.OllamaChatModel;
import org.springframework.ai.reader.TextReader;
import org.springframework.ai.reader.pdf.PagePdfDocumentReader;
import org.springframework.ai.vectorstore.VectorStore;
import org.springframework.ai.chat.model.ChatResponse;
@ -34,12 +39,11 @@ import org.springframework.transaction.annotation.Transactional;
import org.springframework.web.multipart.MultipartFile;
import reactor.core.publisher.Flux;
import java.io.File;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.List;
import java.util.Objects;
import java.util.Optional;
import java.util.*;
import java.util.stream.Collectors;
@Slf4j
@ -64,6 +68,11 @@ public class ChatService {
@Resource
private AiQuestionRecordMapper aiQuestionRecordMapper;
@Resource
private ElePropertyConfig elePropertyConfig;
/**
 * File extensions (without the leading dot) that are accepted for upload.
 * The upload path rejects a file whose extension is not contained in this list;
 * the lookup uses {@link List#contains(Object)}, i.e. an exact, case-sensitive match.
 */
private static final List<String> ALLOWS_FILE_TYPES = List.of("doc", "docx", "txt", "csv", "xls", "xlsx", "pdf", "ppt", "pptx");
public ElectromagneticResult<?> deleteDocument(List<String> ids) {
List<String> vectorIds = aiFileUploadRecordMapper.selectList(Wrappers.lambdaQuery(AiFileUploadRecord.class)
.in(AiFileUploadRecord::getId, ids))
@ -88,10 +97,9 @@ public class ChatService {
return ElectromagneticResultUtil.fail("-1", "文件为空");
}
// 当前仅支持pdf文件
String fileType = FileUtil.extName(file.getOriginalFilename());
if (!StrUtil.equals(fileType, "pdf")) {
return ElectromagneticResultUtil.fail("-1", "当前仅支持pdf格式文件");
if (!ALLOWS_FILE_TYPES.contains(fileType)) {
return ElectromagneticResultUtil.fail("-1", StrFormatter.format("当前格式 {} 不支持", fileType));
}
// 通过md5值判断文件是否被上传过
@ -101,11 +109,109 @@ public class ChatService {
if (count > 0) {
return ElectromagneticResultUtil.success(fileMd5);
}
List<Document> documents = new ArrayList<>();
switch (Objects.requireNonNull(fileType)) {
case "txt", "csv", "text" -> {
Path tempFile = saveUploadedFileToTemp(file);
DocumentReader documentReader = new TextReader(String.valueOf(tempFile.toUri().toURL()));
documents = documentReader.get();
Files.deleteIfExists(tempFile);
}
case "xls" -> {
String filePath = elePropertyConfig.getEleTmpPath() + File.separator + fileMd5 + ".xls";
List<PageFile> pageInfo = OfficeFileUtil.parseXlsByPage(filePath);
for (PageFile pageFile : pageInfo) {
Map<String, Object> metaData = Map.of("page_number", pageFile.getPageNumber(), "file_name", file.getOriginalFilename());
Document document = new Document(pageFile.getContent(), metaData);
documents.add(document);
}
FileUtil.del(filePath);
}
case "xlsx" -> {
String filePath = elePropertyConfig.getEleTmpPath() + File.separator + fileMd5 + ".xlsx";
List<PageFile> pageInfo = OfficeFileUtil.parseXlsxByPage(filePath);
for (PageFile pageFile : pageInfo) {
Map<String, Object> metaData = Map.of("page_number", pageFile.getPageNumber(), "file_name", file.getOriginalFilename());
Document document = new Document(pageFile.getContent(), metaData);
documents.add(document);
}
FileUtil.del(filePath);
}
case "doc" -> {
String wordPath = elePropertyConfig.getEleTmpPath() + File.separator + fileMd5 + ".doc";
String pdfPath = elePropertyConfig.getEleTmpPath() + File.separator + fileMd5 + ".pdf";
OfficeFileUtil.doc2pdf(wordPath, pdfPath);
Path path = new File(pdfPath).toPath();
PagePdfDocumentReader reader = new PagePdfDocumentReader(String.valueOf(path.toUri().toURL()));
List<Document> tmp = reader.get();
for (Document document : tmp) {
Map<String, Object> metadata = document.getMetadata();
Map<String, Object> metaData = Map.of("page_number", metadata.get("page_number"), "file_name", file.getOriginalFilename());
Document doc = new Document(document.getContent(), metaData);
documents.add(doc);
}
Files.deleteIfExists(path);
FileUtil.del(wordPath);
}
case "docx" -> {
String wordPath = elePropertyConfig.getEleTmpPath() + File.separator + fileMd5 + ".docx";
String pdfPath = elePropertyConfig.getEleTmpPath() + File.separator + fileMd5 + ".pdf";
OfficeFileUtil.doc2pdf(wordPath, pdfPath);
Path path = new File(pdfPath).toPath();
PagePdfDocumentReader reader = new PagePdfDocumentReader(String.valueOf(path.toUri().toURL()));
List<Document> tmp = reader.get();
for (Document document : tmp) {
Map<String, Object> metadata = document.getMetadata();
Map<String, Object> metaData = Map.of("page_number", metadata.get("page_number"), "file_name", file.getOriginalFilename());
Document doc = new Document(document.getContent(), metaData);
documents.add(doc);
}
Files.deleteIfExists(path);
FileUtil.del(wordPath);
}
case "ppt" -> {
String pptPath = elePropertyConfig.getEleTmpPath() + File.separator + fileMd5 + ".ppt";
String pdfPath = elePropertyConfig.getEleTmpPath() + File.separator + fileMd5 + ".pdf";
OfficeFileUtil.ppt2pdf(pptPath, pdfPath);
Path path = new File(pdfPath).toPath();
PagePdfDocumentReader reader = new PagePdfDocumentReader(String.valueOf(path.toUri().toURL()));
List<Document> tmp = reader.get();
for (Document document : tmp) {
Map<String, Object> metadata = document.getMetadata();
Map<String, Object> metaData = Map.of("page_number", metadata.get("page_number"), "file_name", file.getOriginalFilename());
Document doc = new Document(document.getContent(), metaData);
documents.add(doc);
}
Files.deleteIfExists(path);
FileUtil.del(pptPath);
}
case "pptx" -> {
String pptxPath = elePropertyConfig.getEleTmpPath() + File.separator + fileMd5 + ".pptx";
String pdfPath = elePropertyConfig.getEleTmpPath() + File.separator + fileMd5 + ".pdf";
OfficeFileUtil.ppt2pdf(pptxPath, pdfPath);
Path path = new File(pdfPath).toPath();
PagePdfDocumentReader reader = new PagePdfDocumentReader(String.valueOf(path.toUri().toURL()));
List<Document> tmp = reader.get();
for (Document document : tmp) {
Map<String, Object> metadata = document.getMetadata();
Map<String, Object> metaData = Map.of("page_number", metadata.get("page_number"), "file_name", file.getOriginalFilename());
Document doc = new Document(document.getContent(), metaData);
documents.add(doc);
}
Files.deleteIfExists(path);
FileUtil.del(pptxPath);
}
case "pdf" -> {
Path tempFile = saveUploadedFileToTemp(file);
PagePdfDocumentReader reader = new PagePdfDocumentReader(String.valueOf(tempFile.toUri().toURL()));
List<Document> documents = reader.get();
documents = reader.get();
Files.deleteIfExists(tempFile);
}
default -> {
return ElectromagneticResultUtil.fail("-1", StrFormatter.format("当前格式 {} 不支持", fileType));
}
}
vectorStore.write(documents);
for (Document document : documents) {
aiFileUploadRecordMapper.insert(new AiFileUploadRecord().setId(IdWorker.getSnowFlakeIdString())

View File

@ -0,0 +1,13 @@
package com.electromagnetic.industry.software.common.pojo;
import lombok.Data;
/**
 * Simple data holder for one page of a parsed file.
 * Accessors and the other boilerplate methods are generated by Lombok's {@code @Data}.
 */
@Data
public class PageFile {
// Page number of this chunk; callers store it as the "page_number" metadata of a document.
private int pageNumber;
// Text content of the page; callers use it as the document body.
private String content;
// NOTE(review): presumably the name of the file this page was read from — confirm against the parsers that fill it.
private String fileName;
}

View File

@ -8,6 +8,7 @@ import com.documents4j.api.DocumentType;
import com.documents4j.api.IConverter;
import com.documents4j.job.LocalConverter;
import com.electromagnetic.industry.software.common.exception.BizException;
import com.electromagnetic.industry.software.common.pojo.PageFile;
import lombok.extern.slf4j.Slf4j;
import org.apache.pdfbox.Loader;
import org.apache.pdfbox.pdmodel.PDDocument;
@ -30,6 +31,7 @@ import java.io.InputStream;
import java.io.OutputStream;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.List;
import java.util.Objects;
@Slf4j
@ -46,10 +48,18 @@ public class OfficeFileUtil {
try (InputStream docxInputStream = Files.newInputStream(inputWord.toPath());
OutputStream outputStream = Files.newOutputStream(outputFile.toPath())) {
IConverter build = LocalConverter.builder().build();
boolean execute = build.convert(docxInputStream)
boolean execute;
if (wordPath.endsWith(".docx")) {
execute = build.convert(docxInputStream)
.as(DocumentType.DOCX)
.to(outputStream)
.as(DocumentType.PDF).schedule().get();
} else {
execute = build.convert(docxInputStream)
.as(DocumentType.DOC)
.to(outputStream)
.as(DocumentType.PDF).schedule().get();
}
Assert.isTrue(execute, "转换失败");
log.info("转换完毕 targetPath = {}", outputFile.getAbsolutePath());
build.shutDown();
@ -76,6 +86,31 @@ public class OfficeFileUtil {
}
}
/**
 * Converts a PowerPoint file to PDF by running the external {@code unoconv} command.
 *
 * <p>If a file already exists at {@code pdfPath} the conversion is skipped.
 * The conversion is not supported on Windows.
 *
 * @param pptPath path of the source presentation
 * @param pdfPath path of the PDF file to produce
 * @throws BizException when running on Windows, when the process cannot be started or
 *                      awaited, or when {@code unoconv} exits with a non-zero code
 */
public static void ppt2pdf(String pptPath, String pdfPath) {
    if (FileUtil.exist(pdfPath)) {
        return;
    }
    log.info("Start convert ppt file to pdf, ppt path: {}, pdf path: {}", pptPath, pdfPath);
    if (EleCommonUtil.isWinOs()) {
        throw new BizException("windows平台暂不支持");
    }

    // Pass the arguments as separate elements instead of one concatenated string,
    // so that paths containing spaces are not split into several arguments.
    String[] command = {"unoconv", "-f", "pdf", "-o", pdfPath, pptPath};
    log.info("convert ppt file to pdf, command: {}", String.join(" ", command));

    int exitCode;
    try {
        Process process = RuntimeUtil.exec(command);
        exitCode = process.waitFor();
    } catch (InterruptedException e) {
        // Restore the interrupt flag so callers further up can still observe it.
        Thread.currentThread().interrupt();
        String info = StrFormatter.format("ppt文档{}转换成pdf文档{}失败", pptPath, pdfPath);
        log.error(info, e);
        throw new BizException(info, e);
    } catch (Exception e) {
        log.error(e.getMessage(), e);
        throw new BizException(e.getMessage(), e);
    }

    // Checked outside the try block so the failure is not caught and re-wrapped.
    if (exitCode != 0) {
        String info = StrFormatter.format("ppt文档{}转换成pdf文档{}失败", pptPath, pdfPath);
        log.error(info);
        throw new BizException(info);
    }
}
public static String parseDocxAllText(String wordPath) throws IOException {
log.info("Start parse docx file, path is {}", wordPath);
InputStream fis = Files.newInputStream(Paths.get(wordPath));
@ -194,4 +229,14 @@ public class OfficeFileUtil {
return EleCommonUtil.formateString(stringBuilder.toString());
}
/**
 * Parses an {@code .xlsx} file into per-page content.
 *
 * <p>Parsing is not implemented yet. An empty, immutable list is returned instead of
 * {@code null} so that callers iterating over the result do not fail with a
 * {@link NullPointerException}.
 *
 * @param filePath path of the workbook to parse
 * @return the parsed pages; currently always an empty list
 */
public static List<PageFile> parseXlsxByPage(String filePath) {
    log.warn("parseXlsxByPage is not implemented yet, no content extracted from {}", filePath);
    return List.of();
}
/**
 * Parses an {@code .xls} file into per-page content.
 *
 * <p>Parsing is not implemented yet. An empty, immutable list is returned instead of
 * {@code null} so that callers iterating over the result do not fail with a
 * {@link NullPointerException}.
 *
 * @param filePath path of the workbook to parse
 * @return the parsed pages; currently always an empty list
 */
public static List<PageFile> parseXlsByPage(String filePath) {
    log.warn("parseXlsByPage is not implemented yet, no content extracted from {}", filePath);
    return List.of();
}
}