windows上调通了将pdf转成图片,再使用ocr识别。
This commit is contained in:
parent
86cda0a547
commit
d1dead8802
|
|
@ -85,10 +85,11 @@
|
|||
</exclusions>
|
||||
<version>5.1.2</version>
|
||||
</dependency>
|
||||
<!--引入druid数据源-->
|
||||
<dependency>
|
||||
<groupId>com.alibaba</groupId>
|
||||
<artifactId>druid</artifactId>
|
||||
<version>1.1.10</version>
|
||||
<artifactId>druid-spring-boot-starter</artifactId>
|
||||
<version>1.2.6</version>
|
||||
</dependency>
|
||||
|
||||
<dependency>
|
||||
|
|
|
|||
|
|
@ -0,0 +1,23 @@
|
|||
package com.electromagnetic.industry.software.manage.config;
|
||||
|
||||
import com.alibaba.druid.pool.DruidDataSource;
|
||||
import org.springframework.boot.autoconfigure.condition.ConditionalOnProperty;
|
||||
import org.springframework.boot.context.properties.ConfigurationProperties;
|
||||
import org.springframework.context.annotation.Bean;
|
||||
import org.springframework.context.annotation.Configuration;
|
||||
|
||||
import javax.sql.DataSource;
|
||||
|
||||
@Configuration
|
||||
public class DruidDataSourceConfig {
|
||||
|
||||
/**
|
||||
* 添加 DruidDataSource 组件到容器中,并绑定属性
|
||||
*/
|
||||
@Bean
|
||||
@ConfigurationProperties(prefix = "spring.datasource")
|
||||
@ConditionalOnProperty(name = "spring.datasource.type", havingValue = "com.alibaba.druid.pool.DruidDataSource")
|
||||
public DataSource druid(){
|
||||
return new DruidDataSource();
|
||||
}
|
||||
}
|
||||
|
|
@ -146,11 +146,10 @@ public class ChatService {
|
|||
}
|
||||
pageFiles = OfficeFileUtil.parsePdfByPage(pdfPath);
|
||||
}
|
||||
for (PageFile pageFile : pageFiles) {
|
||||
Map<String, Object> metaData = Map.of("page_number", pageFile.getPageNumber(), "file_name", file.getOriginalFilename());
|
||||
Document document = new Document(pageFile.getContent(), metaData);
|
||||
StringBuilder stringBuilder = new StringBuilder();
|
||||
pageFiles.forEach(page -> stringBuilder.append(page.getContent()));
|
||||
Document document = new Document(stringBuilder.toString(), Map.of("file_name", file.getOriginalFilename()));
|
||||
documents.add(document);
|
||||
}
|
||||
FileUtil.del(pdfPath);
|
||||
FileUtil.del(srcPath);
|
||||
}
|
||||
|
|
|
|||
|
|
@ -115,13 +115,11 @@
|
|||
<artifactId>rapidocr-onnx-platform</artifactId>
|
||||
<version>0.0.7</version>
|
||||
</dependency>
|
||||
|
||||
<dependency>
|
||||
<groupId>io.github.mymonstercat</groupId>
|
||||
<artifactId>rapidocr-ncnn-platform</artifactId>
|
||||
<version>0.0.7</version>
|
||||
<artifactId>rapidocr-onnx-linux-x86_64</artifactId>
|
||||
<version>1.2.2</version>
|
||||
</dependency>
|
||||
|
||||
<dependency>
|
||||
<groupId>org.apache.pdfbox</groupId>
|
||||
<artifactId>jbig2-imageio</artifactId>
|
||||
|
|
|
|||
|
|
@ -15,11 +15,8 @@ import io.github.mymonstercat.Model;
|
|||
import io.github.mymonstercat.ocr.InferenceEngine;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
import org.apache.pdfbox.Loader;
|
||||
import org.apache.pdfbox.cos.COSName;
|
||||
import org.apache.pdfbox.pdmodel.PDDocument;
|
||||
import org.apache.pdfbox.pdmodel.PDPage;
|
||||
import org.apache.pdfbox.pdmodel.PDResources;
|
||||
import org.apache.pdfbox.pdmodel.graphics.image.PDImage;
|
||||
import org.apache.pdfbox.rendering.PDFRenderer;
|
||||
import org.apache.pdfbox.text.PDFTextStripper;
|
||||
import org.apache.poi.hslf.usermodel.HSLFSlideShow;
|
||||
import org.apache.poi.hwpf.HWPFDocument;
|
||||
|
|
@ -288,73 +285,28 @@ public class OfficeFileUtil {
|
|||
return pageFiles;
|
||||
}
|
||||
|
||||
public static List<PageFile> parsePdfByPage(String filePath) throws IOException {
|
||||
StringBuilder stringBuilder = new StringBuilder();
|
||||
public static List<PageFile> parsePdfByPage(String pdfPath) throws IOException {
|
||||
List<PageFile> pageFiles = new ArrayList<>();
|
||||
String fileName = new File(filePath).getName();
|
||||
try (PDDocument document = Loader.loadPDF(new File(filePath))) {
|
||||
// 创建 PDFTextStripper 对象
|
||||
PDFTextStripper textStripper = new PDFTextStripper();
|
||||
String fileName = new File(pdfPath).getName();
|
||||
try (PDDocument document = Loader.loadPDF(new File(pdfPath))) {
|
||||
PDFRenderer renderer = new PDFRenderer(document);
|
||||
// 遍历每一页
|
||||
for (int pageIndex = 0; pageIndex < document.getNumberOfPages(); pageIndex++) {
|
||||
PDPage page = document.getPage(pageIndex);
|
||||
int pageNumber = pageIndex + 1;
|
||||
// 提取文本
|
||||
String text = extractTextFromPage(textStripper, document, pageIndex);
|
||||
// 提取图片
|
||||
String imageText = extractImagesFromPage(page, pageNumber);
|
||||
stringBuilder.append(text).append("\n").append(imageText);
|
||||
PageFile pageFile = new PageFile(pageIndex, stringBuilder.toString(), fileName);
|
||||
pageFiles.add(pageFile);
|
||||
for (int page = 0; page < document.getNumberOfPages(); page++) {
|
||||
// 渲染页面为图像(设置缩放比例,300 dpi)
|
||||
BufferedImage image = renderer.renderImageWithDPI(page, 300);
|
||||
// 生成输出路径
|
||||
String imagePath = UserThreadLocal.getUser().getPrjTmpDir() + File.separator + IdUtil.fastSimpleUUID() + ".png";
|
||||
// 保存图像
|
||||
ImageIO.write(image, "png", new File(imagePath));
|
||||
String content = getTextFromImage(imagePath);
|
||||
pageFiles.add(new PageFile(page, content, fileName));
|
||||
FileUtil.del(imagePath);
|
||||
}
|
||||
} catch (IOException e) {
|
||||
log.error(e.getMessage(), e);
|
||||
}
|
||||
return pageFiles;
|
||||
}
|
||||
|
||||
|
||||
// 提取文本
|
||||
private static String extractTextFromPage(PDFTextStripper textStripper, PDDocument document, int pageIndex) throws IOException {
|
||||
textStripper.setStartPage(pageIndex + 1); // PDFBox 页码从 1 开始
|
||||
textStripper.setEndPage(pageIndex + 1);
|
||||
return textStripper.getText(document);
|
||||
}
|
||||
|
||||
// 提取图片
|
||||
private static String extractImagesFromPage(PDPage page, int pageNumber) throws IOException {
|
||||
PDResources resources = page.getResources();
|
||||
if (resources == null) {
|
||||
return "";
|
||||
}
|
||||
// 遍历所有 XObject 资源
|
||||
Iterable<COSName> xObjectNames = resources.getXObjectNames();
|
||||
StringBuilder stringBuilder = new StringBuilder();
|
||||
for (COSName name : xObjectNames) {
|
||||
// 获取 XObject 对象
|
||||
org.apache.pdfbox.pdmodel.graphics.PDXObject xObject = resources.getXObject(name);
|
||||
// 检查是否为图片对象
|
||||
if (xObject instanceof PDImage) {
|
||||
PDImage image = (PDImage) xObject;
|
||||
String filePath = StrFormatter.format("D:/{}.png", IdUtil.fastSimpleUUID());
|
||||
saveImage(image, filePath);
|
||||
String textFromImage = getTextFromImage(filePath);
|
||||
stringBuilder.append(textFromImage).append("\r\n");
|
||||
FileUtil.del(filePath);
|
||||
}
|
||||
}
|
||||
return stringBuilder.toString();
|
||||
}
|
||||
|
||||
// 保存图片
|
||||
private static void saveImage(PDImage image, String filename) throws IOException {
|
||||
BufferedImage bufferedImage = image.getImage();
|
||||
File outputFile = new File(filename);
|
||||
outputFile.getParentFile().mkdirs(); // 确保目录存在
|
||||
ImageIO.write(bufferedImage, "png", outputFile);
|
||||
}
|
||||
|
||||
private static String getTextFromImage(String imagePath) throws IOException {
|
||||
private static String getTextFromImage(String imagePath) {
|
||||
InferenceEngine engine = InferenceEngine.getInstance(Model.ONNX_PPOCR_V3);
|
||||
OcrResult ocrResult = engine.runOcr(imagePath);
|
||||
return ocrResult.getStrRes().trim();
|
||||
|
|
|
|||
Loading…
Reference in New Issue