From d1dead88027acbe305b45afd68ee89aaeb851d3f Mon Sep 17 00:00:00 2001 From: chenxudong Date: Wed, 9 Apr 2025 15:15:35 +0800 Subject: [PATCH] =?UTF-8?q?windows=E4=B8=8A=E8=B0=83=E9=80=9A=E4=BA=86?= =?UTF-8?q?=E5=B0=86pdf=E8=BD=AC=E6=88=90=E5=9B=BE=E7=89=87=EF=BC=8C?= =?UTF-8?q?=E5=86=8D=E4=BD=BF=E7=94=A8ocr=E8=AF=86=E5=88=AB=E3=80=82?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- electrmangnetic/pom.xml | 5 +- .../manage/config/DruidDataSourceConfig.java | 23 ++++++ .../service/serviceimpl/ChatService.java | 9 +-- electromagnetic-common/pom.xml | 6 +- .../software/common/util/OfficeFileUtil.java | 80 ++++--------------- 5 files changed, 48 insertions(+), 75 deletions(-) create mode 100644 electrmangnetic/src/main/java/com/electromagnetic/industry/software/manage/config/DruidDataSourceConfig.java diff --git a/electrmangnetic/pom.xml b/electrmangnetic/pom.xml index 9025132..24a2eb4 100644 --- a/electrmangnetic/pom.xml +++ b/electrmangnetic/pom.xml @@ -85,10 +85,11 @@ 5.1.2 + com.alibaba - druid - 1.1.10 + druid-spring-boot-starter + 1.2.6 diff --git a/electrmangnetic/src/main/java/com/electromagnetic/industry/software/manage/config/DruidDataSourceConfig.java b/electrmangnetic/src/main/java/com/electromagnetic/industry/software/manage/config/DruidDataSourceConfig.java new file mode 100644 index 0000000..e1b0531 --- /dev/null +++ b/electrmangnetic/src/main/java/com/electromagnetic/industry/software/manage/config/DruidDataSourceConfig.java @@ -0,0 +1,23 @@ +package com.electromagnetic.industry.software.manage.config; + +import com.alibaba.druid.pool.DruidDataSource; +import org.springframework.boot.autoconfigure.condition.ConditionalOnProperty; +import org.springframework.boot.context.properties.ConfigurationProperties; +import org.springframework.context.annotation.Bean; +import org.springframework.context.annotation.Configuration; + +import javax.sql.DataSource; + +@Configuration +public class DruidDataSourceConfig { + + /** + * 添加 DruidDataSource 组件到容器中,并绑定属性 + */ + @Bean + @ConfigurationProperties(prefix = "spring.datasource") + @ConditionalOnProperty(name = "spring.datasource.type", havingValue = "com.alibaba.druid.pool.DruidDataSource") + public DataSource druid(){ + return new DruidDataSource(); + } +} diff --git a/electrmangnetic/src/main/java/com/electromagnetic/industry/software/manage/service/serviceimpl/ChatService.java b/electrmangnetic/src/main/java/com/electromagnetic/industry/software/manage/service/serviceimpl/ChatService.java index c5bc365..c84fa39 100644 --- a/electrmangnetic/src/main/java/com/electromagnetic/industry/software/manage/service/serviceimpl/ChatService.java +++ b/electrmangnetic/src/main/java/com/electromagnetic/industry/software/manage/service/serviceimpl/ChatService.java @@ -146,11 +146,10 @@ public class ChatService { } pageFiles = OfficeFileUtil.parsePdfByPage(pdfPath); } - for (PageFile pageFile : pageFiles) { - Map metaData = Map.of("page_number", pageFile.getPageNumber(), "file_name", file.getOriginalFilename()); - Document document = new Document(pageFile.getContent(), metaData); - documents.add(document); - } + StringBuilder stringBuilder = new StringBuilder(); + pageFiles.forEach(page -> stringBuilder.append(page.getContent())); + Document document = new Document(stringBuilder.toString(), Map.of("file_name", file.getOriginalFilename())); + documents.add(document); FileUtil.del(pdfPath); FileUtil.del(srcPath); } diff --git a/electromagnetic-common/pom.xml b/electromagnetic-common/pom.xml index 0bc08c8..bc6e346 100644 --- a/electromagnetic-common/pom.xml +++ b/electromagnetic-common/pom.xml @@ -115,13 +115,11 @@ rapidocr-onnx-platform 0.0.7 - io.github.mymonstercat - rapidocr-ncnn-platform - 0.0.7 + rapidocr-onnx-linux-x86_64 + 1.2.2 - org.apache.pdfbox jbig2-imageio diff --git a/electromagnetic-common/src/main/java/com/electromagnetic/industry/software/common/util/OfficeFileUtil.java b/electromagnetic-common/src/main/java/com/electromagnetic/industry/software/common/util/OfficeFileUtil.java index 9b6c7b9..84c667e 100644 --- a/electromagnetic-common/src/main/java/com/electromagnetic/industry/software/common/util/OfficeFileUtil.java +++ b/electromagnetic-common/src/main/java/com/electromagnetic/industry/software/common/util/OfficeFileUtil.java @@ -15,11 +15,8 @@ import io.github.mymonstercat.Model; import io.github.mymonstercat.ocr.InferenceEngine; import lombok.extern.slf4j.Slf4j; import org.apache.pdfbox.Loader; -import org.apache.pdfbox.cos.COSName; import org.apache.pdfbox.pdmodel.PDDocument; -import org.apache.pdfbox.pdmodel.PDPage; -import org.apache.pdfbox.pdmodel.PDResources; -import org.apache.pdfbox.pdmodel.graphics.image.PDImage; +import org.apache.pdfbox.rendering.PDFRenderer; import org.apache.pdfbox.text.PDFTextStripper; import org.apache.poi.hslf.usermodel.HSLFSlideShow; import org.apache.poi.hwpf.HWPFDocument; @@ -288,73 +285,28 @@ public class OfficeFileUtil { return pageFiles; } - public static List parsePdfByPage(String filePath) throws IOException { - StringBuilder stringBuilder = new StringBuilder(); + public static List parsePdfByPage(String pdfPath) throws IOException { List pageFiles = new ArrayList<>(); - String fileName = new File(filePath).getName(); - try (PDDocument document = Loader.loadPDF(new File(filePath))) { - // 创建 PDFTextStripper 对象 - PDFTextStripper textStripper = new PDFTextStripper(); + String fileName = new File(pdfPath).getName(); + try (PDDocument document = Loader.loadPDF(new File(pdfPath))) { + PDFRenderer renderer = new PDFRenderer(document); // 遍历每一页 - for (int pageIndex = 0; pageIndex < document.getNumberOfPages(); pageIndex++) { - PDPage page = document.getPage(pageIndex); - int pageNumber = pageIndex + 1; - // 提取文本 - String text = extractTextFromPage(textStripper, document, pageIndex); - // 提取图片 - String imageText = extractImagesFromPage(page, pageNumber); - stringBuilder.append(text).append("\n").append(imageText); - PageFile pageFile = new PageFile(pageIndex, stringBuilder.toString(), fileName); - pageFiles.add(pageFile); + for (int page = 0; page < document.getNumberOfPages(); page++) { + // 渲染页面为图像(设置缩放比例,300 dpi) + BufferedImage image = renderer.renderImageWithDPI(page, 300); + // 生成输出路径 + String imagePath = UserThreadLocal.getUser().getPrjTmpDir() + File.separator + IdUtil.fastSimpleUUID() + ".png"; + // 保存图像 + ImageIO.write(image, "png", new File(imagePath)); + String content = getTextFromImage(imagePath); + pageFiles.add(new PageFile(page, content, fileName)); + FileUtil.del(imagePath); } - } catch (IOException e) { - log.error(e.getMessage(), e); } return pageFiles; } - - // 提取文本 - private static String extractTextFromPage(PDFTextStripper textStripper, PDDocument document, int pageIndex) throws IOException { - textStripper.setStartPage(pageIndex + 1); // PDFBox 页码从 1 开始 - textStripper.setEndPage(pageIndex + 1); - return textStripper.getText(document); - } - - // 提取图片 - private static String extractImagesFromPage(PDPage page, int pageNumber) throws IOException { - PDResources resources = page.getResources(); - if (resources == null) { - return ""; - } - // 遍历所有 XObject 资源 - Iterable xObjectNames = resources.getXObjectNames(); - StringBuilder stringBuilder = new StringBuilder(); - for (COSName name : xObjectNames) { - // 获取 XObject 对象 - org.apache.pdfbox.pdmodel.graphics.PDXObject xObject = resources.getXObject(name); - // 检查是否为图片对象 - if (xObject instanceof PDImage) { - PDImage image = (PDImage) xObject; - String filePath = StrFormatter.format("D:/{}.png", IdUtil.fastSimpleUUID()); - saveImage(image, filePath); - String textFromImage = getTextFromImage(filePath); - stringBuilder.append(textFromImage).append("\r\n"); - FileUtil.del(filePath); - } - } - return stringBuilder.toString(); - } - - // 保存图片 - private static void saveImage(PDImage image, String filename) throws IOException { - BufferedImage bufferedImage = image.getImage(); - File outputFile = new File(filename); - outputFile.getParentFile().mkdirs(); // 确保目录存在 - ImageIO.write(bufferedImage, "png", outputFile); - } - - private static String getTextFromImage(String imagePath) throws IOException { + private static String getTextFromImage(String imagePath) { InferenceEngine engine = InferenceEngine.getInstance(Model.ONNX_PPOCR_V3); OcrResult ocrResult = engine.runOcr(imagePath); return ocrResult.getStrRes().trim();