From f9a9fee343514dcd53678a1776b57a402554ffd7 Mon Sep 17 00:00:00 2001 From: chenxudong Date: Tue, 18 Feb 2025 09:42:13 +0800 Subject: [PATCH] =?UTF-8?q?=E8=B0=83=E9=80=9A=E4=BA=86pdf=E6=96=87?= =?UTF-8?q?=E4=BB=B6=E4=B8=8A=E4=BC=A0?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../industry/software/common/parse/PdfParse.java | 1 + .../software/common/parse/TextParse.java | 14 +++++--------- .../software/common/util/EleCommonUtil.java | 16 ++++++++++++++++ .../software/common/util/OfficeFileUtil.java | 13 +++++++------ 4 files changed, 29 insertions(+), 15 deletions(-) diff --git a/electromagnetic-common/src/main/java/com/electromagnetic/industry/software/common/parse/PdfParse.java b/electromagnetic-common/src/main/java/com/electromagnetic/industry/software/common/parse/PdfParse.java index 67f690d..702cef2 100644 --- a/electromagnetic-common/src/main/java/com/electromagnetic/industry/software/common/parse/PdfParse.java +++ b/electromagnetic-common/src/main/java/com/electromagnetic/industry/software/common/parse/PdfParse.java @@ -13,6 +13,7 @@ public class PdfParse extends FileParse { String res = ""; String fileTmpPath = createFileTmpPath(fileType); try { + FileUtil.writeFromStream(stream, fileTmpPath); res = OfficeFileUtil.parsePdfAllText(fileTmpPath); } catch (Exception e) { log.error("解析pdf文件失败{}", e.getMessage(), e); diff --git a/electromagnetic-common/src/main/java/com/electromagnetic/industry/software/common/parse/TextParse.java b/electromagnetic-common/src/main/java/com/electromagnetic/industry/software/common/parse/TextParse.java index e537c12..8d61654 100644 --- a/electromagnetic-common/src/main/java/com/electromagnetic/industry/software/common/parse/TextParse.java +++ b/electromagnetic-common/src/main/java/com/electromagnetic/industry/software/common/parse/TextParse.java @@ -1,6 +1,7 @@ package com.electromagnetic.industry.software.common.parse; import cn.hutool.core.io.FileUtil; +import com.electromagnetic.industry.software.common.util.EleCommonUtil; import lombok.extern.slf4j.Slf4j; import java.io.InputStream; @@ -16,15 +17,7 @@ public class TextParse extends FileParse { try { FileUtil.writeFromStream(stream, fileTmpPath); res = FileUtil.readString(fileTmpPath, Charset.defaultCharset()); - StringBuilder stringBuilder = new StringBuilder(); - for (char c : res.toCharArray()) { - if (c <= 176 && c >= 32) { - stringBuilder.append(c); - } else { - stringBuilder.append(" "); - } - } - res = stringBuilder.toString(); + res = EleCommonUtil.formateString(res); } catch (Exception e) { log.info("解析文件 {} 错误, 原因 {}", fileType, e.getMessage(), e); } finally { @@ -32,4 +25,7 @@ public class TextParse extends FileParse { } return res; } + + + } diff --git a/electromagnetic-common/src/main/java/com/electromagnetic/industry/software/common/util/EleCommonUtil.java b/electromagnetic-common/src/main/java/com/electromagnetic/industry/software/common/util/EleCommonUtil.java index 59c9c7d..3fc5ba3 100644 --- a/electromagnetic-common/src/main/java/com/electromagnetic/industry/software/common/util/EleCommonUtil.java +++ b/electromagnetic-common/src/main/java/com/electromagnetic/industry/software/common/util/EleCommonUtil.java @@ -29,6 +29,7 @@ public final class EleCommonUtil { PARSE_MAP.put("ppt", new PptParse()); PARSE_MAP.put("pptx", new PptParse()); PARSE_MAP.put("text", new TextParse()); + PARSE_MAP.put("pdf", new PdfParse()); } // 正则表达式模式,匹配中文字符、下划线、连字符、加号、数字和英文字符 @@ -96,4 +97,19 @@ public final class EleCommonUtil { return fileParse.parseAllText(inputStream, fileType); } + private static boolean between(int start, int end, char value) { + return value >= start && value <= end; + } + + public static String formateString(String content) { + StringBuilder stringBuilder = new StringBuilder(); + for (char c : content.toCharArray()) { + if (between(0, 31, c) || between(127, 127, c)) { + stringBuilder.append(" "); + } else { + stringBuilder.append(c); + } + } + return stringBuilder.toString(); + } } diff --git a/electromagnetic-common/src/main/java/com/electromagnetic/industry/software/common/util/OfficeFileUtil.java b/electromagnetic-common/src/main/java/com/electromagnetic/industry/software/common/util/OfficeFileUtil.java index 496b19b..083d992 100644 --- a/electromagnetic-common/src/main/java/com/electromagnetic/industry/software/common/util/OfficeFileUtil.java +++ b/electromagnetic-common/src/main/java/com/electromagnetic/industry/software/common/util/OfficeFileUtil.java @@ -85,7 +85,7 @@ public class OfficeFileUtil { String text = extractor.getText(); document.close(); fis.close(); - return text; + return EleCommonUtil.formateString(text); } public static String parsePdfAllText(String path) throws IOException { @@ -97,7 +97,7 @@ public class OfficeFileUtil { // 提取文本 String text = pdfStripper.getText(document); document.close(); - return text; + return EleCommonUtil.formateString(text); } public static String parseDocAllText(String wordPath) throws IOException { @@ -108,7 +108,7 @@ public class OfficeFileUtil { String text = wordExtractor.getText(); document.close(); fis.close(); - return text; + return EleCommonUtil.formateString(text); } public static String parseXlsxAllText(String path) throws IOException { @@ -129,7 +129,7 @@ public class OfficeFileUtil { } } } - return stringBuilder.toString(); + return EleCommonUtil.formateString(stringBuilder.toString()); } public static String parseXlsAllText(String path) throws IOException { @@ -150,7 +150,7 @@ public class OfficeFileUtil { } } } - return stringBuilder.toString(); + return EleCommonUtil.formateString(stringBuilder.toString()); } private static String getCellValue(Cell cell) { @@ -187,6 +187,7 @@ public class OfficeFileUtil { InputStream input = Files.newInputStream(Paths.get(path)); String pptText = isPptx ? new SlideShowExtractor(new HSLFSlideShow(input)).getText() : new SlideShowExtractor(new XMLSlideShow(input)).getText(); stringBuilder.append(pptText); - return stringBuilder.toString(); + return EleCommonUtil.formateString(stringBuilder.toString()); } + }