windows上调通了将pdf转成图片,再使用ocr识别。

This commit is contained in:
chenxudong 2025-04-09 15:15:35 +08:00
parent 86cda0a547
commit d1dead8802
5 changed files with 48 additions and 75 deletions

View File

@ -85,10 +85,11 @@
</exclusions> </exclusions>
<version>5.1.2</version> <version>5.1.2</version>
</dependency> </dependency>
<!--引入druid数据源-->
<dependency> <dependency>
<groupId>com.alibaba</groupId> <groupId>com.alibaba</groupId>
<artifactId>druid</artifactId> <artifactId>druid-spring-boot-starter</artifactId>
<version>1.1.10</version> <version>1.2.6</version>
</dependency> </dependency>
<dependency> <dependency>

View File

@ -0,0 +1,23 @@
package com.electromagnetic.industry.software.manage.config;
import com.alibaba.druid.pool.DruidDataSource;
import org.springframework.boot.autoconfigure.condition.ConditionalOnProperty;
import org.springframework.boot.context.properties.ConfigurationProperties;
import org.springframework.context.annotation.Bean;
import org.springframework.context.annotation.Configuration;
import javax.sql.DataSource;
/**
 * Spring configuration that contributes a Druid connection pool as the
 * application's {@link DataSource}.
 */
@Configuration
public class DruidDataSourceConfig {

    /**
     * Registers a {@link DruidDataSource} bean and binds the properties found
     * under the {@code spring.datasource} prefix onto it.
     *
     * <p>The bean is only created when {@code spring.datasource.type} is set to
     * {@code com.alibaba.druid.pool.DruidDataSource}.
     *
     * @return a freshly constructed {@link DruidDataSource}
     */
    @ConditionalOnProperty(name = "spring.datasource.type", havingValue = "com.alibaba.druid.pool.DruidDataSource")
    @ConfigurationProperties(prefix = "spring.datasource")
    @Bean
    public DataSource druid() {
        DruidDataSource pooledDataSource = new DruidDataSource();
        return pooledDataSource;
    }
}

View File

@ -146,11 +146,10 @@ public class ChatService {
} }
pageFiles = OfficeFileUtil.parsePdfByPage(pdfPath); pageFiles = OfficeFileUtil.parsePdfByPage(pdfPath);
} }
for (PageFile pageFile : pageFiles) { StringBuilder stringBuilder = new StringBuilder();
Map<String, Object> metaData = Map.of("page_number", pageFile.getPageNumber(), "file_name", file.getOriginalFilename()); pageFiles.forEach(page -> stringBuilder.append(page.getContent()));
Document document = new Document(pageFile.getContent(), metaData); Document document = new Document(stringBuilder.toString(), Map.of("file_name", file.getOriginalFilename()));
documents.add(document); documents.add(document);
}
FileUtil.del(pdfPath); FileUtil.del(pdfPath);
FileUtil.del(srcPath); FileUtil.del(srcPath);
} }

View File

@ -115,13 +115,11 @@
<artifactId>rapidocr-onnx-platform</artifactId> <artifactId>rapidocr-onnx-platform</artifactId>
<version>0.0.7</version> <version>0.0.7</version>
</dependency> </dependency>
<dependency> <dependency>
<groupId>io.github.mymonstercat</groupId> <groupId>io.github.mymonstercat</groupId>
<artifactId>rapidocr-ncnn-platform</artifactId> <artifactId>rapidocr-onnx-linux-x86_64</artifactId>
<version>0.0.7</version> <version>1.2.2</version>
</dependency> </dependency>
<dependency> <dependency>
<groupId>org.apache.pdfbox</groupId> <groupId>org.apache.pdfbox</groupId>
<artifactId>jbig2-imageio</artifactId> <artifactId>jbig2-imageio</artifactId>

View File

@ -15,11 +15,8 @@ import io.github.mymonstercat.Model;
import io.github.mymonstercat.ocr.InferenceEngine; import io.github.mymonstercat.ocr.InferenceEngine;
import lombok.extern.slf4j.Slf4j; import lombok.extern.slf4j.Slf4j;
import org.apache.pdfbox.Loader; import org.apache.pdfbox.Loader;
import org.apache.pdfbox.cos.COSName;
import org.apache.pdfbox.pdmodel.PDDocument; import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage; import org.apache.pdfbox.rendering.PDFRenderer;
import org.apache.pdfbox.pdmodel.PDResources;
import org.apache.pdfbox.pdmodel.graphics.image.PDImage;
import org.apache.pdfbox.text.PDFTextStripper; import org.apache.pdfbox.text.PDFTextStripper;
import org.apache.poi.hslf.usermodel.HSLFSlideShow; import org.apache.poi.hslf.usermodel.HSLFSlideShow;
import org.apache.poi.hwpf.HWPFDocument; import org.apache.poi.hwpf.HWPFDocument;
@ -288,73 +285,28 @@ public class OfficeFileUtil {
return pageFiles; return pageFiles;
} }
public static List<PageFile> parsePdfByPage(String filePath) throws IOException { public static List<PageFile> parsePdfByPage(String pdfPath) throws IOException {
StringBuilder stringBuilder = new StringBuilder();
List<PageFile> pageFiles = new ArrayList<>(); List<PageFile> pageFiles = new ArrayList<>();
String fileName = new File(filePath).getName(); String fileName = new File(pdfPath).getName();
try (PDDocument document = Loader.loadPDF(new File(filePath))) { try (PDDocument document = Loader.loadPDF(new File(pdfPath))) {
// 创建 PDFTextStripper 对象 PDFRenderer renderer = new PDFRenderer(document);
PDFTextStripper textStripper = new PDFTextStripper();
// 遍历每一页 // 遍历每一页
for (int pageIndex = 0; pageIndex < document.getNumberOfPages(); pageIndex++) { for (int page = 0; page < document.getNumberOfPages(); page++) {
PDPage page = document.getPage(pageIndex); // 渲染页面为图像设置缩放比例300 dpi
int pageNumber = pageIndex + 1; BufferedImage image = renderer.renderImageWithDPI(page, 300);
// 提取文本 // 生成输出路径
String text = extractTextFromPage(textStripper, document, pageIndex); String imagePath = UserThreadLocal.getUser().getPrjTmpDir() + File.separator + IdUtil.fastSimpleUUID() + ".png";
// 提取图片 // 保存图像
String imageText = extractImagesFromPage(page, pageNumber); ImageIO.write(image, "png", new File(imagePath));
stringBuilder.append(text).append("\n").append(imageText); String content = getTextFromImage(imagePath);
PageFile pageFile = new PageFile(pageIndex, stringBuilder.toString(), fileName); pageFiles.add(new PageFile(page, content, fileName));
pageFiles.add(pageFile); FileUtil.del(imagePath);
} }
} catch (IOException e) {
log.error(e.getMessage(), e);
} }
return pageFiles; return pageFiles;
} }
private static String getTextFromImage(String imagePath) {
/**
 * Extracts the text of a single page of a PDF document.
 *
 * @param textStripper stripper to use; its start and end page are overwritten
 * @param document     the document to read from
 * @param pageIndex    zero-based index of the page to extract
 * @return the text the stripper produces for that one page
 * @throws IOException if the stripper fails while reading the document
 */
private static String extractTextFromPage(PDFTextStripper textStripper, PDDocument document, int pageIndex) throws IOException {
    // PDFBox page numbers start at 1, so shift the zero-based index once.
    final int pdfPageNumber = pageIndex + 1;
    // Restrict the stripper to exactly this page.
    textStripper.setStartPage(pdfPageNumber);
    textStripper.setEndPage(pdfPageNumber);
    return textStripper.getText(document);
}
/**
 * Runs OCR over every image XObject referenced by a page and concatenates the
 * recognized text.
 *
 * @param page       the PDF page whose resources are scanned
 * @param pageNumber page number supplied by the caller; not used by this method
 * @return the OCR text of each image, each followed by {@code "\r\n"}, or an
 *         empty string when the page has no resources
 * @throws IOException if the scratch file cannot be created, or if saving the
 *                     image or running OCR fails with an I/O error
 */
private static String extractImagesFromPage(PDPage page, int pageNumber) throws IOException {
    PDResources resources = page.getResources();
    if (resources == null) {
        return "";
    }
    StringBuilder stringBuilder = new StringBuilder();
    // Walk every XObject resource of the page; only image XObjects are OCR'd.
    for (COSName name : resources.getXObjectNames()) {
        org.apache.pdfbox.pdmodel.graphics.PDXObject xObject = resources.getXObject(name);
        if (!(xObject instanceof PDImage)) {
            continue;
        }
        // Write the scratch image into the platform temp directory rather than a
        // hard-coded "D:/" path, which only exists on some Windows machines.
        String filePath = File.createTempFile("pdf-image-", ".png").getAbsolutePath();
        try {
            saveImage((PDImage) xObject, filePath);
            String textFromImage = getTextFromImage(filePath);
            stringBuilder.append(textFromImage).append("\r\n");
        } finally {
            // Remove the scratch image even when saving or OCR throws.
            FileUtil.del(filePath);
        }
    }
    return stringBuilder.toString();
}
/**
 * Writes a PDF image to disk as a PNG file, creating the parent directory
 * first when the path has one.
 *
 * @param image    the PDF image to decode and save
 * @param filename destination path of the PNG file
 * @throws IOException if the image cannot be decoded, the file cannot be
 *                     written, or no ImageIO writer accepts the image as PNG
 */
private static void saveImage(PDImage image, String filename) throws IOException {
    BufferedImage bufferedImage = image.getImage();
    File outputFile = new File(filename);
    // getParentFile() is null for a bare file name; only create directories
    // when there actually is a parent component.
    File parentDir = outputFile.getParentFile();
    if (parentDir != null) {
        parentDir.mkdirs();
    }
    // ImageIO.write signals "no suitable writer" by returning false instead of
    // throwing, so surface that case rather than leaving no file behind silently.
    if (!ImageIO.write(bufferedImage, "png", outputFile)) {
        throw new IOException("No ImageIO writer could encode the image as PNG: " + filename);
    }
}
private static String getTextFromImage(String imagePath) throws IOException {
InferenceEngine engine = InferenceEngine.getInstance(Model.ONNX_PPOCR_V3); InferenceEngine engine = InferenceEngine.getInstance(Model.ONNX_PPOCR_V3);
OcrResult ocrResult = engine.runOcr(imagePath); OcrResult ocrResult = engine.runOcr(imagePath);
return ocrResult.getStrRes().trim(); return ocrResult.getStrRes().trim();