118 lines
2.8 KiB
Python
118 lines
2.8 KiB
Python
import re
|
|
from typing import Union
|
|
from typing import Optional
|
|
|
|
from zhconv import convert
|
|
|
|
|
|
def handle_blank_character(text: str, repl: Optional[str] = None) -> str:
    """Replace every run of whitespace characters in the text.

    Args:
        text: Input text.
        repl: Replacement for each whitespace run. ``None`` (the default)
            is treated as the empty string, so whitespace is removed.

    Returns:
        The text with each whitespace run replaced by ``repl``.
    """
    replacement = "" if repl is None else repl
    return re.sub(r"\s+", replacement, text)
|
|
|
|
|
|
def uppercase_to_lowercase(text: str) -> str:
    """Convert uppercase characters in the text to lowercase.

    Args:
        text: Input text.

    Returns:
        The result of ``text.lower()``.
    """
    lowered = text.lower()
    return lowered
|
|
|
|
|
|
def traditional_to_simplified(text: str) -> str:
    """Convert Traditional Chinese text to Simplified Chinese.

    Args:
        text: Input text.

    Returns:
        The output of ``zhconv.convert`` for the ``"zh-cn"`` locale.
    """
    target_locale = "zh-cn"
    return convert(text, target_locale)
|
|
|
|
|
|
# Translation table for ``str.translate``:
#   * U+3000 (ideographic / full-width space) -> U+0020 (ASCII space)
#   * U+FF01..U+FF5E (full-width forms)       -> the same code point minus
#     0xFEE0, i.e. the corresponding ASCII character '!'..'~'
_FULL_TO_HALF_TABLE = {0x3000: 0x20}
_FULL_TO_HALF_TABLE.update(
    (code, code - 0xFEE0) for code in range(0xFF01, 0xFF5F)
)


def full_angle_to_half_angle(text: str) -> str:
    """Convert full-width characters to their half-width equivalents.

    The full-width space (U+3000) becomes an ASCII space, and the
    full-width forms U+FF01 through U+FF5E are shifted down by 0xFEE0 to
    their ASCII counterparts. Every other character is left unchanged.

    Args:
        text: Input text.

    Returns:
        The converted text.
    """
    # A single translate pass replaces the per-character ``+=`` loop,
    # which re-allocates the result string on every iteration.
    return text.translate(_FULL_TO_HALF_TABLE)
|
|
|
|
|
|
def handle_substitute(text: str, ptn: str, repl: str) -> str:
    """Replace every match of a regular expression in the text.

    Args:
        text: Input text.
        ptn: Regular-expression pattern to search for.
        repl: Replacement passed to the pattern's ``sub`` method.

    Returns:
        The text with all matches of ``ptn`` replaced by ``repl``.
    """
    pattern = re.compile(ptn)
    return pattern.sub(repl, text)
|
|
|
|
|
|
def preprocess(
    data: Union[str, list],
    pipelines: Optional[list] = None,
    params: Optional[dict] = None,
) -> Union[str, list]:
    """Run a sequence of text-preprocessing steps over one or more texts.

    Args:
        data: A single string, or a list of strings.
        pipelines: Names of the steps to apply, in order. Defaults to
            ``["handle_blank_character", "uppercase_to_lowercase",
            "traditional_to_simplified", "full_angle_to_half_angle"]``.
            ``"handle_substitute"`` is also supported.
        params: Optional mapping from a step name to an iterable of extra
            positional arguments for that step, e.g.
            ``{"handle_substitute": [r"\\d+", ""]}``. Steps without an
            entry are called with the text only.

    Returns:
        The processed string when ``data`` is a string, otherwise a list
        with one processed item per input item.

    Raises:
        ValueError: If a name in ``pipelines`` is not a supported step.
    """
    supported_steps = (
        "handle_blank_character",
        "uppercase_to_lowercase",
        "traditional_to_simplified",
        "full_angle_to_half_angle",
        "handle_substitute",
    )
    if pipelines is None:
        pipelines = [
            "handle_blank_character",
            "uppercase_to_lowercase",
            "traditional_to_simplified",
            "full_angle_to_half_angle",
        ]
    if params is None:
        params = {}

    # isinstance (not ``type(data) == str``) so that str subclasses are
    # treated as a single text instead of being iterated per character.
    is_single_text = isinstance(data, str)
    texts = [data] if is_single_text else data

    module_namespace = globals()
    results = []
    for text in texts:
        for name in pipelines:
            if name not in supported_steps:
                raise ValueError(f"pipeline: {name} not support!")
            # The name has passed the whitelist check, so resolve the step
            # function from this module and call it directly. This avoids
            # eval() on a string built from caller-supplied params.
            step = module_namespace[name]
            if name in params:
                text = step(text, *params[name])
            else:
                text = step(text)
        results.append(text)

    return results[0] if is_single_text else results
|