diff --git a/electrmangnetic/src/main/java/com/electromagnetic/industry/software/manage/service/serviceimpl/ChatService.java b/electrmangnetic/src/main/java/com/electromagnetic/industry/software/manage/service/serviceimpl/ChatService.java
index f267e0c..c8e3995 100644
--- a/electrmangnetic/src/main/java/com/electromagnetic/industry/software/manage/service/serviceimpl/ChatService.java
+++ b/electrmangnetic/src/main/java/com/electromagnetic/industry/software/manage/service/serviceimpl/ChatService.java
@@ -1,17 +1,20 @@
 package com.electromagnetic.industry.software.manage.service.serviceimpl;
 
 import cn.hutool.core.io.FileUtil;
+import cn.hutool.core.text.StrFormatter;
 import cn.hutool.core.util.IdUtil;
 import cn.hutool.core.util.ObjectUtil;
-import cn.hutool.core.util.StrUtil;
 import cn.hutool.crypto.digest.DigestUtil;
 import com.baomidou.mybatisplus.core.toolkit.Wrappers;
 import com.electromagnetic.industry.software.common.enums.EffectFlagEnum;
+import com.electromagnetic.industry.software.common.pojo.PageFile;
 import com.electromagnetic.industry.software.common.pojo.UserLoginInfo;
 import com.electromagnetic.industry.software.common.resp.ElectromagneticResult;
 import com.electromagnetic.industry.software.common.util.ElectromagneticResultUtil;
 import com.electromagnetic.industry.software.common.util.IdWorker;
+import com.electromagnetic.industry.software.common.util.OfficeFileUtil;
 import com.electromagnetic.industry.software.common.util.UserThreadLocal;
+import com.electromagnetic.industry.software.manage.config.ElePropertyConfig;
 import com.electromagnetic.industry.software.manage.mapper.AiFileUploadRecordMapper;
 import com.electromagnetic.industry.software.manage.mapper.AiQuestionRecordMapper;
 import com.electromagnetic.industry.software.manage.pojo.models.AiFileUploadRecord;
@@ -25,7 +28,9 @@ import org.springframework.ai.chat.client.advisor.QuestionAnswerAdvisor;
 import org.springframework.ai.chat.messages.UserMessage;
 import org.springframework.ai.chat.prompt.Prompt;
 import org.springframework.ai.document.Document;
+import org.springframework.ai.document.DocumentReader;
 import org.springframework.ai.ollama.OllamaChatModel;
+import org.springframework.ai.reader.TextReader;
 import org.springframework.ai.reader.pdf.PagePdfDocumentReader;
 import org.springframework.ai.vectorstore.VectorStore;
 import org.springframework.ai.chat.model.ChatResponse;
@@ -34,12 +39,11 @@ import org.springframework.transaction.annotation.Transactional;
 import org.springframework.web.multipart.MultipartFile;
 import reactor.core.publisher.Flux;
 
+import java.io.File;
 import java.io.IOException;
 import java.nio.file.Files;
 import java.nio.file.Path;
-import java.util.List;
-import java.util.Objects;
-import java.util.Optional;
+import java.util.*;
 import java.util.stream.Collectors;
 
 @Slf4j
@@ -64,6 +68,12 @@ public class ChatService {
     @Resource
     private AiQuestionRecordMapper aiQuestionRecordMapper;
 
+    @Resource
+    private ElePropertyConfig elePropertyConfig;
+
+    /** File extensions, in lower case, that pass the file-type check. */
+    private static final List<String> ALLOWS_FILE_TYPES = List.of("doc", "docx", "txt", "csv", "xls", "xlsx", "pdf", "ppt", "pptx");
+
     public ElectromagneticResult deleteDocument(List ids) {
         List vectorIds = aiFileUploadRecordMapper.selectList(Wrappers.lambdaQuery(AiFileUploadRecord.class)
                         .in(AiFileUploadRecord::getId, ids))
@@ -88,10 +98,10 @@ public class ChatService {
             return ElectromagneticResultUtil.fail("-1", "文件为空");
         }
 
-        // 当前仅支持pdf文件
         String fileType = FileUtil.extName(file.getOriginalFilename());
-        if (!StrUtil.equals(fileType, "pdf")) {
-            return ElectromagneticResultUtil.fail("-1", "当前仅支持pdf格式文件");
+        // List.of(...) throws NPE on contains(null), so a missing file name is rejected explicitly.
+        if (fileType == null || !ALLOWS_FILE_TYPES.contains(fileType.toLowerCase(Locale.ROOT))) {
+            return ElectromagneticResultUtil.fail("-1", StrFormatter.format("当前格式 {} 不支持", fileType));
         }
 
         // 通过md5值判断文件是否被上传过
@@ -101,11 +111,76 @@ public class ChatService {
         if (count > 0) {
             return ElectromagneticResultUtil.success(fileMd5);
         }
+        List<Document> documents = new ArrayList<>();
 
-        Path tempFile = saveUploadedFileToTemp(file);
-        PagePdfDocumentReader reader = new PagePdfDocumentReader(String.valueOf(tempFile.toUri().toURL()));
-        List documents = reader.get();
-        Files.deleteIfExists(tempFile);
+        // Extensions are compared in lower case so "REPORT.PDF" is handled like "report.pdf".
+        String normalizedType = fileType.toLowerCase(Locale.ROOT);
+        String originalName = file.getOriginalFilename();
+        // Office formats are staged in the configured temp directory under the content hash.
+        String stagedPath = elePropertyConfig.getEleTmpPath() + File.separator + fileMd5 + "." + normalizedType;
+        switch (normalizedType) {
+            case "txt", "csv" -> {
+                Path tempFile = saveUploadedFileToTemp(file);
+                try {
+                    DocumentReader textReader = new TextReader(String.valueOf(tempFile.toUri().toURL()));
+                    documents = textReader.get();
+                } finally {
+                    Files.deleteIfExists(tempFile);
+                }
+            }
+            case "pdf" -> {
+                Path tempFile = saveUploadedFileToTemp(file);
+                try {
+                    PagePdfDocumentReader reader = new PagePdfDocumentReader(String.valueOf(tempFile.toUri().toURL()));
+                    documents = reader.get();
+                } finally {
+                    Files.deleteIfExists(tempFile);
+                }
+            }
+            case "xls", "xlsx" -> {
+                try {
+                    // The sheet parsers read from disk, so the upload has to be written there first.
+                    file.transferTo(Path.of(stagedPath));
+                    List<PageFile> pages = "xls".equals(normalizedType)
+                            ? OfficeFileUtil.parseXlsByPage(stagedPath)
+                            : OfficeFileUtil.parseXlsxByPage(stagedPath);
+                    // A parser that yields nothing must not crash the upload with an NPE.
+                    if (pages != null) {
+                        for (PageFile page : pages) {
+                            Map<String, Object> metaData = Map.of("page_number", page.getPageNumber(), "file_name", originalName);
+                            documents.add(new Document(page.getContent(), metaData));
+                        }
+                    }
+                } finally {
+                    FileUtil.del(stagedPath);
+                }
+            }
+            case "doc", "docx", "ppt", "pptx" -> {
+                String pdfPath = elePropertyConfig.getEleTmpPath() + File.separator + fileMd5 + ".pdf";
+                try {
+                    // The converters read from disk, so the upload has to be written there first.
+                    file.transferTo(Path.of(stagedPath));
+                    if (normalizedType.startsWith("doc")) {
+                        OfficeFileUtil.doc2pdf(stagedPath, pdfPath);
+                    } else {
+                        OfficeFileUtil.ppt2pdf(stagedPath, pdfPath);
+                    }
+                    Path convertedPdf = new File(pdfPath).toPath();
+                    PagePdfDocumentReader reader = new PagePdfDocumentReader(String.valueOf(convertedPdf.toUri().toURL()));
+                    for (Document page : reader.get()) {
+                        // Keep only the page number and attach the name the user uploaded the file under.
+                        Map<String, Object> metaData = Map.of("page_number", page.getMetadata().get("page_number"), "file_name", originalName);
+                        documents.add(new Document(page.getContent(), metaData));
+                    }
+                } finally {
+                    FileUtil.del(pdfPath);
+                    FileUtil.del(stagedPath);
+                }
+            }
+            default -> {
+                return ElectromagneticResultUtil.fail("-1", StrFormatter.format("当前格式 {} 不支持", fileType));
+            }
+        }
         vectorStore.write(documents);
         for (Document document : documents) {
             aiFileUploadRecordMapper.insert(new AiFileUploadRecord().setId(IdWorker.getSnowFlakeIdString())
diff --git a/electromagnetic-common/src/main/java/com/electromagnetic/industry/software/common/pojo/PageFile.java b/electromagnetic-common/src/main/java/com/electromagnetic/industry/software/common/pojo/PageFile.java
new file mode 100644
index 0000000..b980fc7
--- /dev/null
+++ b/electromagnetic-common/src/main/java/com/electromagnetic/industry/software/common/pojo/PageFile.java
@@ -0,0 +1,14 @@
+package com.electromagnetic.industry.software.common.pojo;
+
+import lombok.Data;
+
+/** Holds the text content of one page of a file together with its page number and file name. */
+@Data
+public class PageFile {
+
+    private int pageNumber;
+
+    private String content;
+
+    private String fileName;
+}
diff --git a/electromagnetic-common/src/main/java/com/electromagnetic/industry/software/common/util/OfficeFileUtil.java b/electromagnetic-common/src/main/java/com/electromagnetic/industry/software/common/util/OfficeFileUtil.java
index c731f12..22537a4 100644
--- a/electromagnetic-common/src/main/java/com/electromagnetic/industry/software/common/util/OfficeFileUtil.java
+++ b/electromagnetic-common/src/main/java/com/electromagnetic/industry/software/common/util/OfficeFileUtil.java
@@ -8,6 +8,7 @@ import
com.documents4j.api.DocumentType;
 import com.documents4j.api.IConverter;
 import com.documents4j.job.LocalConverter;
 import com.electromagnetic.industry.software.common.exception.BizException;
+import com.electromagnetic.industry.software.common.pojo.PageFile;
 import lombok.extern.slf4j.Slf4j;
 import org.apache.pdfbox.Loader;
 import org.apache.pdfbox.pdmodel.PDDocument;
@@ -30,6 +31,7 @@ import java.io.InputStream;
 import java.io.OutputStream;
 import java.nio.file.Files;
 import java.nio.file.Paths;
+import java.util.List;
 import java.util.Objects;
 
 @Slf4j
@@ -46,10 +48,12 @@ public class OfficeFileUtil {
         try (InputStream docxInputStream = Files.newInputStream(inputWord.toPath());
              OutputStream outputStream = Files.newOutputStream(outputFile.toPath())) {
             IConverter build = LocalConverter.builder().build();
-            boolean execute = build.convert(docxInputStream)
-                    .as(DocumentType.DOCX)
-                    .to(outputStream)
-                    .as(DocumentType.PDF).schedule().get();
+            // The source format must be named explicitly; any path not ending in ".docx" is converted as DOC.
+            DocumentType sourceType = wordPath.endsWith(".docx") ? DocumentType.DOCX : DocumentType.DOC;
+            boolean execute = build.convert(docxInputStream)
+                    .as(sourceType)
+                    .to(outputStream)
+                    .as(DocumentType.PDF).schedule().get();
             Assert.isTrue(execute, "转换失败");
             log.info("转换完毕 targetPath = {}", outputFile.getAbsolutePath());
             build.shutDown();
@@ -76,6 +80,41 @@ public class OfficeFileUtil {
         }
     }
 
+    /**
+     * Converts a PowerPoint file to PDF by invoking the {@code unoconv} command line tool.
+     * Does nothing when {@code pdfPath} already exists.
+     *
+     * @param pptPath path of the source presentation
+     * @param pdfPath path the PDF is written to
+     * @throws BizException on Windows, when the command cannot be run, or when it exits non-zero
+     */
+    public static void ppt2pdf(String pptPath, String pdfPath) {
+        if (FileUtil.exist(pdfPath)) {
+            return;
+        }
+        log.info("Start convert ppt file to pdf, ppt path: {}, pdf path: {}", pptPath, pdfPath);
+        if (EleCommonUtil.isWinOs()) {
+            throw new BizException("windows平台暂不支持");
+        }
+        try {
+            // Arguments are passed one by one so that paths containing spaces reach unoconv intact.
+            log.info("convert ppt file to pdf, command: unoconv -f pdf -o {} {}", pdfPath, pptPath);
+            Process process = RuntimeUtil.exec("unoconv", "-f", "pdf", "-o", pdfPath, pptPath);
+            if (process.waitFor() != 0) {
+                String info = StrFormatter.format("ppt文档{}转换成pdf文档{}失败", pptPath, pdfPath);
+                throw new BizException(info);
+            }
+        } catch (InterruptedException e) {
+            // Restore the interrupt flag so code further up the stack can still observe it.
+            Thread.currentThread().interrupt();
+            log.error(e.getMessage(), e);
+            throw new BizException(e.getMessage(), e);
+        } catch (Exception e) {
+            log.error(e.getMessage(), e);
+            throw new BizException(e.getMessage(), e);
+        }
+    }
+
     public static String parseDocxAllText(String wordPath) throws IOException {
         log.info("Start parse docx file, path is {}", wordPath);
         InputStream fis = Files.newInputStream(Paths.get(wordPath));
@@ -194,4 +233,26 @@ public class OfficeFileUtil {
         return EleCommonUtil.formateString(stringBuilder.toString());
     }
 
+    /**
+     * Placeholder for page-wise parsing of an xlsx file; the parsing itself is not implemented yet.
+     *
+     * @param filePath path of the xlsx file
+     * @return currently always an empty list, never {@code null}
+     */
+    public static List<PageFile> parseXlsxByPage(String filePath) {
+        log.warn("parseXlsxByPage is not implemented yet, nothing extracted from {}", filePath);
+        return List.of();
+    }
+
+    /**
+     * Placeholder for page-wise parsing of an xls file; the parsing itself is not implemented yet.
+     *
+     * @param filePath path of the xls file
+     * @return currently always an empty list, never {@code null}
+     */
+    public static List<PageFile> parseXlsByPage(String filePath) {
+        log.warn("parseXlsByPage is not implemented yet, nothing extracted from {}", filePath);
+        return List.of();
+    }
+
 }