调通了pdf文件上传
This commit is contained in:
parent
5b8dfe9577
commit
f9a9fee343
|
|
@ -13,6 +13,7 @@ public class PdfParse extends FileParse {
|
||||||
String res = "";
|
String res = "";
|
||||||
String fileTmpPath = createFileTmpPath(fileType);
|
String fileTmpPath = createFileTmpPath(fileType);
|
||||||
try {
|
try {
|
||||||
|
FileUtil.writeFromStream(stream, fileTmpPath);
|
||||||
res = OfficeFileUtil.parsePdfAllText(fileTmpPath);
|
res = OfficeFileUtil.parsePdfAllText(fileTmpPath);
|
||||||
} catch (Exception e) {
|
} catch (Exception e) {
|
||||||
log.error("解析pdf文件失败{}", e.getMessage(), e);
|
log.error("解析pdf文件失败{}", e.getMessage(), e);
|
||||||
|
|
|
||||||
|
|
@ -1,6 +1,7 @@
|
||||||
package com.electromagnetic.industry.software.common.parse;
|
package com.electromagnetic.industry.software.common.parse;
|
||||||
|
|
||||||
import cn.hutool.core.io.FileUtil;
|
import cn.hutool.core.io.FileUtil;
|
||||||
|
import com.electromagnetic.industry.software.common.util.EleCommonUtil;
|
||||||
import lombok.extern.slf4j.Slf4j;
|
import lombok.extern.slf4j.Slf4j;
|
||||||
|
|
||||||
import java.io.InputStream;
|
import java.io.InputStream;
|
||||||
|
|
@ -16,15 +17,7 @@ public class TextParse extends FileParse {
|
||||||
try {
|
try {
|
||||||
FileUtil.writeFromStream(stream, fileTmpPath);
|
FileUtil.writeFromStream(stream, fileTmpPath);
|
||||||
res = FileUtil.readString(fileTmpPath, Charset.defaultCharset());
|
res = FileUtil.readString(fileTmpPath, Charset.defaultCharset());
|
||||||
StringBuilder stringBuilder = new StringBuilder();
|
res = EleCommonUtil.formateString(res);
|
||||||
for (char c : res.toCharArray()) {
|
|
||||||
if (c <= 176 && c >= 32) {
|
|
||||||
stringBuilder.append(c);
|
|
||||||
} else {
|
|
||||||
stringBuilder.append(" ");
|
|
||||||
}
|
|
||||||
}
|
|
||||||
res = stringBuilder.toString();
|
|
||||||
} catch (Exception e) {
|
} catch (Exception e) {
|
||||||
log.info("解析文件 {} 错误, 原因 {}", fileType, e.getMessage(), e);
|
log.info("解析文件 {} 错误, 原因 {}", fileType, e.getMessage(), e);
|
||||||
} finally {
|
} finally {
|
||||||
|
|
@ -32,4 +25,7 @@ public class TextParse extends FileParse {
|
||||||
}
|
}
|
||||||
return res;
|
return res;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -29,6 +29,7 @@ public final class EleCommonUtil {
|
||||||
PARSE_MAP.put("ppt", new PptParse());
|
PARSE_MAP.put("ppt", new PptParse());
|
||||||
PARSE_MAP.put("pptx", new PptParse());
|
PARSE_MAP.put("pptx", new PptParse());
|
||||||
PARSE_MAP.put("text", new TextParse());
|
PARSE_MAP.put("text", new TextParse());
|
||||||
|
PARSE_MAP.put("pdf", new PdfParse());
|
||||||
}
|
}
|
||||||
|
|
||||||
// 正则表达式模式,匹配中文字符、下划线、连字符、加号、数字和英文字符
|
// 正则表达式模式,匹配中文字符、下划线、连字符、加号、数字和英文字符
|
||||||
|
|
@ -96,4 +97,19 @@ public final class EleCommonUtil {
|
||||||
return fileParse.parseAllText(inputStream, fileType);
|
return fileParse.parseAllText(inputStream, fileType);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Reports whether {@code value} lies inside the closed interval
 * {@code [start, end]}; both bounds are inclusive.
 *
 * @param start inclusive lower bound, compared against the char's numeric code
 * @param end   inclusive upper bound, compared against the char's numeric code
 * @param value character to test
 * @return {@code true} when {@code start <= value && value <= end}
 */
private static boolean between(int start, int end, char value) {
|
||||||
|
return value >= start && value <= end;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
 * Returns a copy of {@code content} in which every char with code 0-31 or
 * code 127 is replaced by a single space; all other chars are copied
 * unchanged, so the result has the same length as the input.
 *
 * <p>Codes 0-31 include tab, line feed and carriage return, so line
 * structure in the input is flattened to spaces as well.
 *
 * @param content text to clean; must not be {@code null}, because
 *                {@code toCharArray()} is invoked on it directly
 * @return the cleaned text
 */
public static String formateString(String content) {
|
||||||
|
StringBuilder stringBuilder = new StringBuilder();
|
||||||
|
for (char c : content.toCharArray()) {
|
||||||
|
// 0-31 and 127 are the only codes rewritten; everything else passes through.
if (between(0, 31, c) || between(127, 127, c)) {
|
||||||
|
stringBuilder.append(" ");
|
||||||
|
} else {
|
||||||
|
stringBuilder.append(c);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return stringBuilder.toString();
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -85,7 +85,7 @@ public class OfficeFileUtil {
|
||||||
String text = extractor.getText();
|
String text = extractor.getText();
|
||||||
document.close();
|
document.close();
|
||||||
fis.close();
|
fis.close();
|
||||||
return text;
|
return EleCommonUtil.formateString(text);
|
||||||
}
|
}
|
||||||
|
|
||||||
public static String parsePdfAllText(String path) throws IOException {
|
public static String parsePdfAllText(String path) throws IOException {
|
||||||
|
|
@ -97,7 +97,7 @@ public class OfficeFileUtil {
|
||||||
// 提取文本
|
// 提取文本
|
||||||
String text = pdfStripper.getText(document);
|
String text = pdfStripper.getText(document);
|
||||||
document.close();
|
document.close();
|
||||||
return text;
|
return EleCommonUtil.formateString(text);
|
||||||
}
|
}
|
||||||
|
|
||||||
public static String parseDocAllText(String wordPath) throws IOException {
|
public static String parseDocAllText(String wordPath) throws IOException {
|
||||||
|
|
@ -108,7 +108,7 @@ public class OfficeFileUtil {
|
||||||
String text = wordExtractor.getText();
|
String text = wordExtractor.getText();
|
||||||
document.close();
|
document.close();
|
||||||
fis.close();
|
fis.close();
|
||||||
return text;
|
return EleCommonUtil.formateString(text);
|
||||||
}
|
}
|
||||||
|
|
||||||
public static String parseXlsxAllText(String path) throws IOException {
|
public static String parseXlsxAllText(String path) throws IOException {
|
||||||
|
|
@ -129,7 +129,7 @@ public class OfficeFileUtil {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
return stringBuilder.toString();
|
return EleCommonUtil.formateString(stringBuilder.toString());
|
||||||
}
|
}
|
||||||
|
|
||||||
public static String parseXlsAllText(String path) throws IOException {
|
public static String parseXlsAllText(String path) throws IOException {
|
||||||
|
|
@ -150,7 +150,7 @@ public class OfficeFileUtil {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
return stringBuilder.toString();
|
return EleCommonUtil.formateString(stringBuilder.toString());
|
||||||
}
|
}
|
||||||
|
|
||||||
private static String getCellValue(Cell cell) {
|
private static String getCellValue(Cell cell) {
|
||||||
|
|
@ -187,6 +187,7 @@ public class OfficeFileUtil {
|
||||||
InputStream input = Files.newInputStream(Paths.get(path));
|
InputStream input = Files.newInputStream(Paths.get(path));
|
||||||
String pptText = isPptx ? new SlideShowExtractor(new HSLFSlideShow(input)).getText() : new SlideShowExtractor(new XMLSlideShow(input)).getText();
|
String pptText = isPptx ? new SlideShowExtractor(new HSLFSlideShow(input)).getText() : new SlideShowExtractor(new XMLSlideShow(input)).getText();
|
||||||
stringBuilder.append(pptText);
|
stringBuilder.append(pptText);
|
||||||
return stringBuilder.toString();
|
return EleCommonUtil.formateString(stringBuilder.toString());
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue