问支持word,ppt,excel，text,csv等上传

2025-04-07 15:10:14 +08:00 · 2025-04-07 15:10:14 +08:00 · 46d68f8d4b
parent b0fe94d2ee
commit 46d68f8d4b
3 changed files with 179 additions and 15 deletions
--- a/electrmangnetic/src/main/java/com/electromagnetic/industry/software/manage/service/serviceimpl/ChatService.java
+++ b/electrmangnetic/src/main/java/com/electromagnetic/industry/software/manage/service/serviceimpl/ChatService.java
@ -1,17 +1,20 @@
 package com.electromagnetic.industry.software.manage.service.serviceimpl;

 import cn.hutool.core.io.FileUtil;
+import cn.hutool.core.text.StrFormatter;
 import cn.hutool.core.util.IdUtil;
 import cn.hutool.core.util.ObjectUtil;
-import cn.hutool.core.util.StrUtil;
 import cn.hutool.crypto.digest.DigestUtil;
 import com.baomidou.mybatisplus.core.toolkit.Wrappers;
 import com.electromagnetic.industry.software.common.enums.EffectFlagEnum;
+import com.electromagnetic.industry.software.common.pojo.PageFile;
 import com.electromagnetic.industry.software.common.pojo.UserLoginInfo;
 import com.electromagnetic.industry.software.common.resp.ElectromagneticResult;
 import com.electromagnetic.industry.software.common.util.ElectromagneticResultUtil;
 import com.electromagnetic.industry.software.common.util.IdWorker;
+import com.electromagnetic.industry.software.common.util.OfficeFileUtil;
 import com.electromagnetic.industry.software.common.util.UserThreadLocal;
+import com.electromagnetic.industry.software.manage.config.ElePropertyConfig;
 import com.electromagnetic.industry.software.manage.mapper.AiFileUploadRecordMapper;
 import com.electromagnetic.industry.software.manage.mapper.AiQuestionRecordMapper;
 import com.electromagnetic.industry.software.manage.pojo.models.AiFileUploadRecord;
@ -25,7 +28,9 @@ import org.springframework.ai.chat.client.advisor.QuestionAnswerAdvisor;
 import org.springframework.ai.chat.messages.UserMessage;
 import org.springframework.ai.chat.prompt.Prompt;
 import org.springframework.ai.document.Document;
+import org.springframework.ai.document.DocumentReader;
 import org.springframework.ai.ollama.OllamaChatModel;
+import org.springframework.ai.reader.TextReader;
 import org.springframework.ai.reader.pdf.PagePdfDocumentReader;
 import org.springframework.ai.vectorstore.VectorStore;
 import org.springframework.ai.chat.model.ChatResponse;
@ -34,12 +39,11 @@ import org.springframework.transaction.annotation.Transactional;
 import org.springframework.web.multipart.MultipartFile;
 import reactor.core.publisher.Flux;

+import java.io.File;
 import java.io.IOException;
 import java.nio.file.Files;
 import java.nio.file.Path;
-import java.util.List;
-import java.util.Objects;
-import java.util.Optional;
+import java.util.*;
 import java.util.stream.Collectors;

@Slf4j
@ -64,6 +68,11 @@ public class ChatService {
    @Resource
    private AiQuestionRecordMapper aiQuestionRecordMapper;

+    @Resource
+    private ElePropertyConfig elePropertyConfig;
+
+    private static final List<String> ALLOWS_FILE_TYPES = List.of("doc", "docx", "txt", "csv", "xls", "xlsx", "pdf", "ppt", "pptx");
+
    public ElectromagneticResult<?> deleteDocument(List<String> ids) {
        List<String> vectorIds = aiFileUploadRecordMapper.selectList(Wrappers.lambdaQuery(AiFileUploadRecord.class)
                .in(AiFileUploadRecord::getId, ids))
@ -88,10 +97,9 @@ public class ChatService {
            return ElectromagneticResultUtil.fail("-1", "文件为空");
        }

-        // 当前仅支持pdf文件
        String fileType = FileUtil.extName(file.getOriginalFilename());
-        if (!StrUtil.equals(fileType, "pdf")) {
-            return ElectromagneticResultUtil.fail("-1", "当前仅支持pdf格式文件");
+        if (!ALLOWS_FILE_TYPES.contains(fileType)) {
+            return ElectromagneticResultUtil.fail("-1", StrFormatter.format("当前格式 {} 不支持", fileType));
        }

        // 通过md5值判断文件是否被上传过
@ -101,11 +109,109 @@ public class ChatService {
        if (count > 0) {
            return ElectromagneticResultUtil.success(fileMd5);
        }
+        List<Document> documents = new ArrayList<>();

+        switch (Objects.requireNonNull(fileType)) {
+            case "txt", "csv", "text" -> {
+                Path tempFile = saveUploadedFileToTemp(file);
+                DocumentReader documentReader = new TextReader(String.valueOf(tempFile.toUri().toURL()));
+                documents = documentReader.get();
+                Files.deleteIfExists(tempFile);
+            }
+            case "xls" -> {
+                String filePath = elePropertyConfig.getEleTmpPath() + File.separator + fileMd5 + ".xls";
+                List<PageFile> pageInfo = OfficeFileUtil.parseXlsByPage(filePath);
+                for (PageFile pageFile : pageInfo) {
+                    Map<String, Object> metaData = Map.of("page_number", pageFile.getPageNumber(), "file_name", file.getOriginalFilename());
+                    Document document = new Document(pageFile.getContent(), metaData);
+                    documents.add(document);
+                }
+                FileUtil.del(filePath);
+            }
+            case "xlsx" -> {
+                String filePath = elePropertyConfig.getEleTmpPath() + File.separator + fileMd5 + ".xlsx";
+                List<PageFile> pageInfo = OfficeFileUtil.parseXlsxByPage(filePath);
+                for (PageFile pageFile : pageInfo) {
+                    Map<String, Object> metaData = Map.of("page_number", pageFile.getPageNumber(), "file_name", file.getOriginalFilename());
+                    Document document = new Document(pageFile.getContent(), metaData);
+                    documents.add(document);
+                }
+                FileUtil.del(filePath);
+            }
+            case "doc" -> {
+                String wordPath = elePropertyConfig.getEleTmpPath() + File.separator + fileMd5 + ".doc";
+                String pdfPath = elePropertyConfig.getEleTmpPath() + File.separator + fileMd5 + ".pdf";
+                OfficeFileUtil.doc2pdf(wordPath, pdfPath);
+                Path path = new File(pdfPath).toPath();
+                PagePdfDocumentReader reader = new PagePdfDocumentReader(String.valueOf(path.toUri().toURL()));
+                List<Document> tmp = reader.get();
+                for (Document document : tmp) {
+                    Map<String, Object> metadata = document.getMetadata();
+                    Map<String, Object> metaData = Map.of("page_number", metadata.get("page_number"), "file_name", file.getOriginalFilename());
+                    Document doc = new Document(document.getContent(), metaData);
+                    documents.add(doc);
+                }
+                Files.deleteIfExists(path);
+                FileUtil.del(wordPath);
+            }
+            case "docx" -> {
+                String wordPath = elePropertyConfig.getEleTmpPath() + File.separator + fileMd5 + ".docx";
+                String pdfPath = elePropertyConfig.getEleTmpPath() + File.separator + fileMd5 + ".pdf";
+                OfficeFileUtil.doc2pdf(wordPath, pdfPath);
+                Path path = new File(pdfPath).toPath();
+                PagePdfDocumentReader reader = new PagePdfDocumentReader(String.valueOf(path.toUri().toURL()));
+                List<Document> tmp = reader.get();
+                for (Document document : tmp) {
+                    Map<String, Object> metadata = document.getMetadata();
+                    Map<String, Object> metaData = Map.of("page_number", metadata.get("page_number"), "file_name", file.getOriginalFilename());
+                    Document doc = new Document(document.getContent(), metaData);
+                    documents.add(doc);
+                }
+                Files.deleteIfExists(path);
+                FileUtil.del(wordPath);
+            }
+            case "ppt" -> {
+                String pptPath = elePropertyConfig.getEleTmpPath() + File.separator + fileMd5 + ".ppt";
+                String pdfPath = elePropertyConfig.getEleTmpPath() + File.separator + fileMd5 + ".pdf";
+                OfficeFileUtil.ppt2pdf(pptPath, pdfPath);
+                Path path = new File(pdfPath).toPath();
+                PagePdfDocumentReader reader = new PagePdfDocumentReader(String.valueOf(path.toUri().toURL()));
+                List<Document> tmp = reader.get();
+                for (Document document : tmp) {
+                    Map<String, Object> metadata = document.getMetadata();
+                    Map<String, Object> metaData = Map.of("page_number", metadata.get("page_number"), "file_name", file.getOriginalFilename());
+                    Document doc = new Document(document.getContent(), metaData);
+                    documents.add(doc);
+                }
+                Files.deleteIfExists(path);
+                FileUtil.del(pptPath);
+            }
+            case "pptx" -> {
+                String pptxPath = elePropertyConfig.getEleTmpPath() + File.separator + fileMd5 + ".pptx";
+                String pdfPath = elePropertyConfig.getEleTmpPath() + File.separator + fileMd5 + ".pdf";
+                OfficeFileUtil.ppt2pdf(pptxPath, pdfPath);
+                Path path = new File(pdfPath).toPath();
+                PagePdfDocumentReader reader = new PagePdfDocumentReader(String.valueOf(path.toUri().toURL()));
+                List<Document> tmp = reader.get();
+                for (Document document : tmp) {
+                    Map<String, Object> metadata = document.getMetadata();
+                    Map<String, Object> metaData = Map.of("page_number", metadata.get("page_number"), "file_name", file.getOriginalFilename());
+                    Document doc = new Document(document.getContent(), metaData);
+                    documents.add(doc);
+                }
+                Files.deleteIfExists(path);
+                FileUtil.del(pptxPath);
+            }
+            case "pdf" -> {
                Path tempFile = saveUploadedFileToTemp(file);
                PagePdfDocumentReader reader = new PagePdfDocumentReader(String.valueOf(tempFile.toUri().toURL()));
-        List<Document> documents = reader.get();
+                documents = reader.get();
                Files.deleteIfExists(tempFile);
+            }
+            default -> {
+                return ElectromagneticResultUtil.fail("-1", StrFormatter.format("当前格式 {} 不支持", fileType));
+            }
+        }
        vectorStore.write(documents);
        for (Document document : documents) {
            aiFileUploadRecordMapper.insert(new AiFileUploadRecord().setId(IdWorker.getSnowFlakeIdString())
--- a/electromagnetic-common/src/main/java/com/electromagnetic/industry/software/common/pojo/PageFile.java
+++ b/electromagnetic-common/src/main/java/com/electromagnetic/industry/software/common/pojo/PageFile.java
@ -0,0 +1,13 @@
+package com.electromagnetic.industry.software.common.pojo;
+
+import lombok.Data;
+
+/**
+ * Holder for the content of a single page of a parsed file.
+ * <p>
+ * Instances are the element type returned by the page-wise parsers in
+ * {@code OfficeFileUtil} ({@code parseXlsByPage} / {@code parseXlsxByPage}).
+ * Accessors are generated by Lombok's {@code @Data}.
+ */
+@Data
+public class PageFile {
+
+   // Number of the page this content belongs to.
+   // NOTE(review): whether numbering starts at 0 or 1 is not visible here - confirm with the parser.
+   private int pageNumber;
+
+   // Text content of the page.
+   private String content;
+
+   // NOTE(review): presumably the name of the file the page was read from - confirm against the producer.
+   private String fileName;
+}
--- a/electromagnetic-common/src/main/java/com/electromagnetic/industry/software/common/util/OfficeFileUtil.java
+++ b/electromagnetic-common/src/main/java/com/electromagnetic/industry/software/common/util/OfficeFileUtil.java
@ -8,6 +8,7 @@ import com.documents4j.api.DocumentType;
 import com.documents4j.api.IConverter;
 import com.documents4j.job.LocalConverter;
 import com.electromagnetic.industry.software.common.exception.BizException;
+import com.electromagnetic.industry.software.common.pojo.PageFile;
 import lombok.extern.slf4j.Slf4j;
 import org.apache.pdfbox.Loader;
 import org.apache.pdfbox.pdmodel.PDDocument;
@ -30,6 +31,7 @@ import java.io.InputStream;
 import java.io.OutputStream;
 import java.nio.file.Files;
 import java.nio.file.Paths;
+import java.util.List;
 import java.util.Objects;

@Slf4j
@ -46,10 +48,18 @@ public class OfficeFileUtil {
            try (InputStream docxInputStream = Files.newInputStream(inputWord.toPath());
                 OutputStream outputStream = Files.newOutputStream(outputFile.toPath())) {
                IConverter build = LocalConverter.builder().build();
-                boolean execute = build.convert(docxInputStream)
+                boolean execute;
+                if (wordPath.endsWith(".docx")) {
+                    execute = build.convert(docxInputStream)
                            .as(DocumentType.DOCX)
                            .to(outputStream)
                            .as(DocumentType.PDF).schedule().get();
+                } else {
+                    execute = build.convert(docxInputStream)
+                            .as(DocumentType.DOC)
+                            .to(outputStream)
+                            .as(DocumentType.PDF).schedule().get();
+                }
                Assert.isTrue(execute, "转换失败");
                log.info("转换完毕 targetPath = {}", outputFile.getAbsolutePath());
                build.shutDown();
@ -76,6 +86,31 @@ public class OfficeFileUtil {
        }
    }

+    /**
+     * Converts a PowerPoint file to PDF by running the external {@code unoconv} command.
+     * <p>
+     * Returns immediately when {@code pdfPath} already exists. Conversion is not supported
+     * on Windows.
+     *
+     * @param pptPath path of the source presentation file
+     * @param pdfPath path the generated PDF is written to
+     * @throws BizException on Windows, when the command cannot be started, when waiting for
+     *                      it is interrupted, or when it exits with a non-zero code
+     */
+    public static void ppt2pdf(String pptPath, String pdfPath) {
+        if (FileUtil.exist(pdfPath)) {
+            return;
+        }
+        log.info("Start convert ppt file to pdf, ppt path: {}, pdf path: {}", pptPath, pdfPath);
+        if (EleCommonUtil.isWinOs()) {
+            throw new BizException("windows平台暂不支持");
+        }
+
+        int exitCode;
+        try {
+            log.info("convert ppt file to pdf, command: unoconv -f pdf -o {} {}", pdfPath, pptPath);
+            // Pass each argument separately: a single command string is split on whitespace,
+            // which breaks paths that contain spaces.
+            Process process = RuntimeUtil.exec("unoconv", "-f", "pdf", "-o", pdfPath, pptPath);
+            exitCode = process.waitFor();
+        } catch (InterruptedException e) {
+            // Restore the interrupt flag so callers further up can still observe the interruption.
+            Thread.currentThread().interrupt();
+            log.error(e.getMessage(), e);
+            throw new BizException(e.getMessage(), e);
+        } catch (Exception e) {
+            log.error(e.getMessage(), e);
+            throw new BizException(e.getMessage(), e);
+        }
+
+        // Checked outside the try block so this failure is not caught and wrapped a second time.
+        if (exitCode != 0) {
+            String info = StrFormatter.format("ppt文档{}转换成pdf文档{}失败", pptPath, pdfPath);
+            log.error(info);
+            throw new BizException(info);
+        }
+    }
+
    public static String parseDocxAllText(String wordPath) throws IOException {
        log.info("Start parse docx file, path is {}", wordPath);
        InputStream fis = Files.newInputStream(Paths.get(wordPath));
@ -194,4 +229,14 @@ public class OfficeFileUtil {
        return EleCommonUtil.formateString(stringBuilder.toString());
    }

+    /**
+     * Parses an {@code .xlsx} file into per-page content.
+     * <p>
+     * TODO: parsing is not implemented yet. An empty list is returned instead of {@code null}
+     * so that callers which iterate the result directly do not fail with a
+     * {@link NullPointerException}.
+     *
+     * @param filePath path of the xlsx file to parse
+     * @return the parsed pages; currently always an empty, immutable list
+     */
+    public static List<PageFile> parseXlsxByPage(String filePath) {
+        log.warn("parseXlsxByPage is not implemented yet, no content parsed, path is {}", filePath);
+        return List.of();
+    }
+
+    /**
+     * Parses an {@code .xls} file into per-page content.
+     * <p>
+     * TODO: parsing is not implemented yet. An empty list is returned instead of {@code null}
+     * so that callers which iterate the result directly do not fail with a
+     * {@link NullPointerException}.
+     *
+     * @param filePath path of the xls file to parse
+     * @return the parsed pages; currently always an empty, immutable list
+     */
+    public static List<PageFile> parseXlsByPage(String filePath) {
+        log.warn("parseXlsByPage is not implemented yet, no content parsed, path is {}", filePath);
+        return List.of();
+    }
+
 }