rep1/python37/Lib/site-packages/zhconv/zhconv.py

479 lines
19 KiB
Python
Raw Permalink Normal View History

2024-11-21 09:00:42 +08:00
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
This module implements a simple conversion and localization between simplified and traditional Chinese using tables from MediaWiki.
It doesn't contain a segmentation function and uses maximal forward matching, so it's simple.
For a complete and accurate solution, see OpenCC.
For Chinese segmentation, see Jieba.
>>> print(convert('我幹什麼不干你事。', 'zh-cn'))
我干什么不干你事。
>>> print(convert('人体内存在很多微生物', 'zh-tw'))
人體內存在很多微生物
Supports MediaWiki's conversion format:
>>> print(convert_for_mw('在现代,机械计算-{}-机的应用已经完全被电子计算-{}-机所取代', 'zh-hk'))
在現代機械計算機的應用已經完全被電子計算機所取代
>>> print(convert_for_mw('-{zh-hant:資訊工程;zh-hans:计算机工程学;}-是电子工程的一个分支,主要研究计算机软硬件和二者间的彼此联系。', 'zh-tw'))
資訊工程是電子工程的一個分支主要研究計算機軟硬體和二者間的彼此聯繫
>>> print(convert_for_mw('張國榮曾在英國-{zh:利兹;zh-hans:利兹;zh-hk:列斯;zh-tw:里茲}-大学學習。', 'zh-sg'))
张国荣曾在英国利兹大学学习
"""
# Only Python3 can pass the doctest here due to unicode problems.
# Version string of this module.
__version__ = '1.4.3'
import os
import sys
import re
import json
# Choose how bundled data files are opened: through pkg_resources when it is
# importable, otherwise through a plain binary open() on a path built from
# this module's directory.
try:
    from pkg_resources import resource_stream
except ImportError:
    def get_module_res(*res):
        """Open the file at path components `res` next to this module ('rb')."""
        module_dir = os.path.dirname(__file__)
        full_path = os.path.join(os.getcwd(), module_dir, *res)
        return open(os.path.normpath(full_path), 'rb')
else:
    def get_module_res(*res):
        """Open the packaged resource at path components `res` as a stream."""
        return resource_stream(__name__, os.path.join(*res))
# Locale fallback order lookup dictionary.
# Each key is a target locale; the value lists the variant names to try, in
# order, when picking a value out of a per-variant mapping (see fallback()).
Locales = {
    'zh-cn': ('zh-cn', 'zh-hans', 'zh-sg', 'zh'),
    'zh-hk': ('zh-hk', 'zh-hant', 'zh-tw', 'zh'),
    'zh-tw': ('zh-tw', 'zh-hant', 'zh-hk', 'zh'),
    'zh-sg': ('zh-sg', 'zh-hans', 'zh-cn', 'zh'),
    'zh-my': ('zh-my', 'zh-sg', 'zh-hans', 'zh-cn', 'zh'),
    'zh-mo': ('zh-mo', 'zh-hk', 'zh-hant', 'zh-tw', 'zh'),
    'zh-hant': ('zh-hant', 'zh-tw', 'zh-hk', 'zh'),
    'zh-hans': ('zh-hans', 'zh-cn', 'zh-sg', 'zh'),
    'zh': ('zh',) # special value for no conversion
}
# File name of the bundled dictionary; loaddict() reads it through
# get_module_res() when asked for this exact name.
_DEFAULT_DICT = "zhcdict.json"
# Dictionary file used when loaddict() is triggered implicitly.
DICTIONARY = _DEFAULT_DICT
# Raw tables parsed from the JSON dictionary; None until loaddict() runs.
zhcdicts = None
# Per-locale merged conversion tables, built lazily by getdict().
dict_zhcn = None
dict_zhsg = None
dict_zhtw = None
dict_zhhk = None
# locale -> frozenset of every prefix of every key in that locale's table.
pfsdict = {}
# Splits on the '-{' and '}-' markers; the capturing group keeps the markers
# themselves in the split() result.
RE_langconv = re.compile(r'(-\{|\}-)')
# Separators used inside a '-{...}-' block, each with optional surrounding
# whitespace: '|', ';', '=>' and ':' respectively.
RE_splitflag = re.compile(r'\s*\|\s*')
RE_splitmap = re.compile(r'\s*;\s*')
RE_splituni = re.compile(r'\s*=>\s*')
RE_splitpair = re.compile(r'\s*:\s*')
def loaddict(filename=DICTIONARY):
    """
    Load the dictionary from a specific JSON file.

    Does nothing when a dictionary is already loaded, whatever `filename`
    is. When `filename` equals the default dictionary name the file is
    opened through `get_module_res`; any other name is opened as a regular
    path. The file content is decoded as UTF-8 JSON, and its 'SIMPONLY' and
    'TRADONLY' entries are turned into frozensets.

    :param filename: dictionary file name or path.
    :raises KeyError: if the JSON lacks 'SIMPONLY' or 'TRADONLY'; in that
        case the module-level `zhcdicts` is left untouched.
    """
    global zhcdicts
    if zhcdicts:
        return
    if filename == _DEFAULT_DICT:
        stream = get_module_res(filename)
    else:
        stream = open(filename, 'rb')
    # Close the stream in both cases; the packaged-resource stream used to be
    # left open.
    try:
        tables = json.loads(stream.read().decode('utf-8'))
    finally:
        stream.close()
    tables['SIMPONLY'] = frozenset(tables['SIMPONLY'])
    tables['TRADONLY'] = frozenset(tables['TRADONLY'])
    # Publish only once fully prepared, so a failure above cannot leave a
    # half-initialised global that later calls would treat as loaded.
    zhcdicts = tables
def getdict(locale):
    """
    Return the conversion dict for `locale`, building and caching it on
    first use. The raw dictionary is loaded on demand.

    'zh-hans' and 'zh-hant' return the raw script-level tables. The regional
    locales return a copy of the script-level table updated with the
    regional table ('zh-mo' shares the 'zh-hk' dict and 'zh-my' shares the
    'zh-sg' dict). Any other locale yields an empty dict.

    As a side effect, the prefix set for `locale` is stored in `pfsdict` if
    it is not there yet.
    """
    global zhcdicts, dict_zhcn, dict_zhsg, dict_zhtw, dict_zhhk, pfsdict
    if zhcdicts is None:
        loaddict(DICTIONARY)

    if locale == 'zh-hans':
        got = zhcdicts['zh2Hans']
    elif locale == 'zh-hant':
        got = zhcdicts['zh2Hant']
    elif locale == 'zh-cn':
        if not dict_zhcn:
            dict_zhcn = dict(zhcdicts['zh2Hans'])
            dict_zhcn.update(zhcdicts['zh2CN'])
        got = dict_zhcn
    elif locale == 'zh-tw':
        if not dict_zhtw:
            dict_zhtw = dict(zhcdicts['zh2Hant'])
            dict_zhtw.update(zhcdicts['zh2TW'])
        got = dict_zhtw
    elif locale in ('zh-hk', 'zh-mo'):
        if not dict_zhhk:
            dict_zhhk = dict(zhcdicts['zh2Hant'])
            dict_zhhk.update(zhcdicts['zh2HK'])
        got = dict_zhhk
    elif locale in ('zh-sg', 'zh-my'):
        if not dict_zhsg:
            dict_zhsg = dict(zhcdicts['zh2Hans'])
            dict_zhsg.update(zhcdicts['zh2SG'])
        got = dict_zhsg
    else:
        got = {}

    if locale not in pfsdict:
        pfsdict[locale] = getpfset(got)
    return got
def getpfset(convdict):
    """
    Return a frozenset holding every non-empty prefix (including the whole
    word) of every key in `convdict`.
    """
    return frozenset(
        word[:end]
        for word in convdict
        for end in range(1, len(word) + 1)
    )
def issimp(s, full=False):
    """
    Detect whether text is Simplified Chinese or Traditional Chinese.

    Returns True for Simplified; False for Traditional; None for unknown.
    With full=False the answer is decided by the first simplified-only or
    traditional-only character encountered, which gives a quick and rough
    identification. With full=True both kinds are counted over the whole
    text and the more frequent one wins (None on a tie).

    Use `is` (True/False/None) to check the result.
    `s` must be unicode (Python 2) or str (Python 3), or you'll get None.
    """
    if zhcdicts is None:
        loaddict(DICTIONARY)
    simp_only = zhcdicts['SIMPONLY']
    trad_only = zhcdicts['TRADONLY']

    if not full:
        # Quick mode: the first decisive character settles it.
        for char in s:
            if char in simp_only:
                return True
            if char in trad_only:
                return False
        return None

    simp_count = trad_count = 0
    for char in s:
        if char in simp_only:
            simp_count += 1
        elif char in trad_only:
            trad_count += 1
    if simp_count == trad_count:
        return None
    return simp_count > trad_count
def fallback(locale, mapping):
    """
    Pick the value of `mapping` that best fits `locale`.

    The variants listed for `locale` in `Locales` are tried in order and the
    first one present in `mapping` wins. If none is present, the first value
    of `mapping` is converted to `locale` with `convert` and returned.

    A locale missing from `Locales` is treated as having no preferred
    variants instead of raising KeyError. `convert` returns its input
    unchanged for such locales, so the first value comes back as is.

    :param locale: target locale name.
    :param mapping: dict of variant name -> text.
    :raises IndexError: if `mapping` is empty and no variant matched.
    """
    # .get() keeps unknown locales from raising KeyError here.
    for variant in Locales.get(locale, ()):
        if variant in mapping:
            return mapping[variant]
    return convert(tuple(mapping.values())[0], locale)
def convtable2dict(convtable, locale, update=None):
    """
    Convert a list of conversion dicts to a single dict for a certain locale.

    `update`, when given and non-empty, is copied and used as the starting
    content of the result; it is not modified.

    >>> sorted(convtable2dict([{'zh-hk': '列斯', 'zh-hans': '利兹', 'zh': '利兹', 'zh-tw': '里茲'}, {':uni': '巨集', 'zh-cn': ''}], 'zh-cn').items())
    [('列斯', '利兹'), ('利兹', '利兹'), ('巨集', ''), ('里茲', '利兹')]
    """
    rdict = update.copy() if update else {}
    for rule in convtable:
        if ':uni' in rule:
            # One-way rule: only the ':uni' source text maps to the target,
            # and only when this exact locale is listed.
            if locale in rule:
                rdict[rule[':uni']] = rule[locale]
            continue
        if locale[:-1] == 'zh-han':
            # 'zh-hans' / 'zh-hant': use the rule only if it names the
            # locale itself.
            if locale not in rule:
                continue
            target = rule[locale]
        else:
            target = fallback(locale, rule)
        # Two-way rule: every variant's text maps to the chosen target.
        for word in rule.values():
            rdict[word] = target
    return rdict
def tokenize(s, locale, update=None):
    """
    Tokenize `s` according to the corresponding locale dictionary.

    Scanning left to right, each token is the longest dictionary key that
    starts at the current position; where no key matches, the token is the
    single character at that position. Keys of `update`, if given, count as
    dictionary keys too.

    Don't use this for serious text processing.
    """
    zhdict = getdict(locale)
    pfset = pfsdict[locale]
    if update:
        zhdict = zhdict.copy()
        zhdict.update(update)
        extra_prefixes = set()
        for word in update:
            for end in range(1, len(word) + 1):
                extra_prefixes.add(word[:end])
        pfset = pfset | extra_prefixes

    tokens = []
    total = len(s)
    start = 0
    while start < total:
        longest = None
        longest_end = 0
        # Grow the candidate one character at a time while it is still a
        # prefix of some key, remembering the last candidate that is a key.
        for end in range(start + 1, total + 1):
            candidate = s[start:end]
            if candidate not in pfset:
                break
            if candidate in zhdict:
                longest = candidate
                longest_end = end
        if longest is None:
            longest = s[start]
            start += 1
        else:
            start = longest_end
        tokens.append(longest)
    return tokens
def convert(s, locale, update=None):
    """
    Main convert function.

    Scans `s` left to right, replacing the longest matching dictionary key
    at each position and copying characters that match nothing. When a
    fragment is a key of both `update` and the locale dictionary, the
    `update` value is used.

    :param s: must be `unicode` (Python 2) or `str` (Python 3).
    :param locale: should be one of ``('zh-hans', 'zh-hant', 'zh-cn', 'zh-sg',
                   'zh-tw', 'zh-hk', 'zh-my', 'zh-mo')``. ``'zh'`` or any
                   locale not in `Locales` returns `s` unchanged.
    :param update: a dict which updates the conversion table, eg.
                   ``{'from1': 'to1', 'from2': 'to2'}``

    >>> print(convert('我幹什麼不干你事。', 'zh-cn'))
    我干什么不干你事。
    >>> print(convert('我幹什麼不干你事。', 'zh-cn', {'不干': '不幹'}))
    我干什么不幹你事。
    >>> print(convert('人体内存在很多微生物', 'zh-tw'))
    人體內存在很多微生物
    """
    if locale == 'zh' or locale not in Locales:
        # "no conversion"
        return s
    zhdict = getdict(locale)
    pfset = pfsdict[locale]

    # Prefixes of the extra keys, kept separate so that the cached
    # per-locale table and prefix set are not copied on every call.
    extra_prefixes = set()
    if update:
        for word in update:
            for end in range(1, len(word) + 1):
                extra_prefixes.add(word[:end])

    pieces = []
    total = len(s)
    start = 0
    while start < total:
        replacement = None
        matched_end = 0
        for end in range(start + 1, total + 1):
            candidate = s[start:end]
            if candidate not in pfset and candidate not in extra_prefixes:
                break
            if update and candidate in update:
                replacement = update[candidate]
                matched_end = end
            elif candidate in zhdict:
                replacement = zhdict[candidate]
                matched_end = end
        if replacement is None:
            replacement = s[start]
            start += 1
        else:
            start = matched_end
        pieces.append(replacement)
    return ''.join(pieces)
def convert_for_mw(s, locale, update=None):
    """
    Recognizes MediaWiki's human conversion format.
    Use locale='zh' for no conversion.

    Text outside ``-{ ... }-`` markers is converted with `convert`; each
    outermost marker block is parsed into conversion rules and/or output
    text, depending on its flag.

    :param s: text that may contain ``-{ ... }-`` markup.
    :param locale: target locale name.
    :param update: optional dict of extra conversions; it is copied, not
                   modified.

    Reference: (all tests passed)
    https://zh.wikipedia.org/wiki/Help:高级字词转换语法
    https://www.mediawiki.org/wiki/Writing_systems/Syntax
    >>> print(convert_for_mw('在现代,机械计算-{}-机的应用已经完全被电子计算-{}-机所取代', 'zh-hk'))
    在現代機械計算機的應用已經完全被電子計算機所取代
    >>> print(convert_for_mw('-{zh-hant:資訊工程;zh-hans:计算机工程学;}-是电子工程的一个分支,主要研究计算机软硬件和二者间的彼此联系。', 'zh-tw'))
    資訊工程是電子工程的一個分支主要研究計算機軟硬體和二者間的彼此聯繫
    >>> print(convert_for_mw('張國榮曾在英國-{zh:利兹;zh-hans:利兹;zh-hk:列斯;zh-tw:里茲}-大学學習。', 'zh-hant'))
    張國榮曾在英國里茲大學學習
    >>> print(convert_for_mw('張國榮曾在英國-{zh:利兹;zh-hans:利兹;zh-hk:列斯;zh-tw:里茲}-大学學習。', 'zh-sg'))
    张国荣曾在英国利兹大学学习
    >>> convert_for_mw('-{zh-hant:;\\nzh-cn:}-', 'zh-tw') == ''
    True
    >>> print(convert_for_mw('毫米(毫公分)符號mm是長度單位和降雨量單位-{zh-hans:台湾作-{公釐}-或-{公厘}-;zh-hant:港澳和大陸稱為-{毫米}-(台灣亦有使用,但較常使用名稱為毫公分);zh-mo:台灣作-{公釐}-或-{公厘}-;zh-hk:台灣作-{公釐}-或-{公厘}-;}-。', 'zh-tw'))
    毫米(毫公分)符號mm是長度單位和降雨量單位港澳和大陸稱為毫米台灣亦有使用但較常使用名稱為毫公分
    >>> print(convert_for_mw('毫米(毫公分)符號mm是長度單位和降雨量單位-{zh-hans:台湾作-{公釐}-或-{公厘}-;zh-hant:港澳和大陸稱為-{毫米}-(台灣亦有使用,但較常使用名稱為毫公分);zh-mo:台灣作-{公釐}-或-{公厘}-;zh-hk:台灣作-{公釐}-或-{公厘}-;}-。', 'zh-cn'))
    毫米(毫公分)符号mm是长度单位和降雨量单位台湾作公釐或公厘
    >>> print(convert_for_mw('毫米(毫公分)符號mm是長度單位和降雨量單位-{zh-hans:台湾作-{公釐}-或-{公厘}-;zh-hant:港澳和大陸稱為-{毫米}-(台灣亦有使用,但較常使用名稱為毫公分);zh-mo:台灣作-{公釐}-或-{公厘}-;zh-hk:台灣作-{公釐}-或-{公厘', 'zh-hk')) # unbalanced test
    毫米(毫公分)符號mm是長度單位和降雨量單位台灣作公釐或公厘
    >>> print(convert_for_mw('报头的“-{參攷消息}-”四字摘自鲁迅笔迹-{zh-hans:,“-{參}-”是“-{参}-”的繁体字读音cān与简体的“-{参}-”字相同;;zh-hant:;}-“-{攷}-”是“考”的异体字读音kǎo与“考”字相同。', 'zh-tw'))
    報頭的參攷消息四字摘自魯迅筆跡的異體字讀音kǎo字相同
    >>> print(convert_for_mw('报头的“-{參攷消息}-”四字摘自鲁迅笔迹-{zh-hans:,“-{參}-”是“-{参}-”的繁体字读音cān与简体的“-{参}-”字相同;;zh-hant:;}-“-{攷}-”是“考”的异体字读音kǎo与“考”字相同。', 'zh-cn'))
    报头的參攷消息四字摘自鲁迅笔迹的繁体字读音cān与简体的字相同的异体字读音kǎo字相同
    >>> print(convert_for_mw('{{Col-break}}--&gt;', 'zh-hant'))
    {{Col-break}}--&gt;
    """
    # Output pieces, joined at the end.
    ch = []
    # Conversion rules collected so far from 'A'/'H' blocks.
    rules = []
    # Extra conversions applied to plain text outside marker blocks.
    ruledict = update.copy() if update else {}
    # Depth of currently open '-{' markers.
    nested = 0
    # Raw text of the outermost block being collected, markers included.
    block = ''
    # RE_langconv has a capturing group, so the '-{' and '}-' markers show up
    # as their own fragments between the text fragments.
    for frag in RE_langconv.split(s):
        if frag == '-{':
            nested += 1
            block += frag
        elif frag == '}-':
            if not nested:
                # bogus }-
                ch.append(frag)
                continue
            block += frag
            nested -= 1
            if nested:
                # Still inside an outer block; keep collecting.
                continue
            # The outermost block just closed: parse its content.
            newrules = []
            # Drop the enclosing markers, trim whitespace and ';', then split
            # on '|' to separate the optional flag part from the mapping.
            delim = RE_splitflag.split(block[2:-2].strip(' \t\n\r\f\v;'))
            if len(delim) == 1:
                flag = None
                mapping = RE_splitmap.split(delim[0])
            else:
                flag = RE_splitmap.split(delim[0].strip(' \t\n\r\f\v;'))
                mapping = RE_splitmap.split(delim[1])
            rule = {}
            for m in mapping:
                # 'source=>variant:text' is a one-way rule; its source text
                # is stored under the ':uni' key.
                uni = RE_splituni.split(m)
                if len(uni) == 1:
                    pair = RE_splitpair.split(uni[0])
                else:
                    if rule:
                        # A new one-way rule starts; close the previous one.
                        newrules.append(rule)
                        rule = {':uni': uni[0]}
                    else:
                        rule[':uni'] = uni[0]
                    pair = RE_splitpair.split(uni[1])
                # Values are processed recursively so nested markers resolve.
                # NOTE(review): only pair[0] and pair[1] are used, so text
                # after a second ':' in the entry is dropped.
                if len(pair) == 1:
                    rule['zh'] = convert_for_mw(pair[0], 'zh', ruledict)
                else:
                    rule[pair[0]] = convert_for_mw(pair[1], pair[0], ruledict)
            newrules.append(rule)
            if not flag:
                # No flag: output the variant that best fits the locale.
                ch.append(fallback(locale, newrules[0]))
            elif any(ch in flag for ch in 'ATRD-HN'):
                for f in flag:
                    # A: add rule for convert code (all text convert)
                    # H: Insert a conversion rule without output
                    if f in ('A', 'H'):
                        for r in newrules:
                            if not r in rules:
                                rules.append(r)
                        # NOTE(review): `r` below is whatever the loop above
                        # left bound (the last rule in newrules) - confirm
                        # this is intended.
                        if f == 'A':
                            if ':uni' in r:
                                if locale in r:
                                    ch.append(r[locale])
                                else:
                                    ch.append(convert(r[':uni'], locale))
                            else:
                                ch.append(fallback(locale, newrules[0]))
                    # -: remove convert
                    elif f == '-':
                        for r in newrules:
                            try:
                                rules.remove(r)
                            except ValueError:
                                pass
                    # D: convert description (useless)
                    #elif f == 'D':
                        #ch.append('; '.join(': '.join(x) for x in newrules[0].items()))
                    # T: title convert (useless)
                    # R: raw content (implied above)
                    # N: current variant name (useless)
                    #elif f == 'N':
                        #ch.append(locale)
                # Rebuild the dict used for the plain text that follows.
                ruledict = convtable2dict(rules, locale, update)
            else:
                # The flag part is a list of variant names: convert the block
                # text using only those of the collected rules' variants that
                # appear both in the flag list and in this locale's fallback
                # chain.
                fblimit = frozenset(flag) & frozenset(Locales[locale])
                limitedruledict = update.copy() if update else {}
                for r in rules:
                    if ':uni' in r:
                        if locale in r:
                            limitedruledict[r[':uni']] = r[locale]
                    else:
                        v = None
                        for l in Locales[locale]:
                            if l in r and l in fblimit:
                                v = r[l]
                                break
                        for word in r.values():
                            # A falsy v (including an empty string) falls
                            # back to the regular conversion of the word.
                            limitedruledict[word] = v if v else convert(word, locale)
                ch.append(convert(delim[1], locale, limitedruledict))
            block = ''
        elif nested:
            # Text inside an open block is collected, not converted yet.
            block += frag
        else:
            ch.append(convert(frag, locale, ruledict))
    if nested:
        # unbalanced
        # Close the missing markers and process the remainder as a block.
        ch.append(convert_for_mw(block + '}-'*nested, locale, ruledict))
    return ''.join(ch)
def test_convert_mw(locale, update=None):
    """
    Run `convert_for_mw` on a fixed sample of MediaWiki conversion markup
    and return the result for `locale`.

    The sample contains blocks without a flag, empty blocks, the H, A, -, T
    and D flags, and blocks whose flag part is a list of variant names.
    """
    sample_wikitext = (
        '英國-{zh:利兹;zh-hans:利兹;zh-hk:列斯;zh-tw:里茲}-大学\n'
        '-{zh-hans:计算机; zh-hant:電腦;}-\n'
        '-{H|巨集=>zh-cn:宏;}-\n'
        '测试:巨集、宏\n'
        '-{简体字繁體字}-\n'
        '北-{}-韓、北朝-{}-鲜\n'
        '-{H|zh-cn:博客; zh-hk:網誌; zh-tw:部落格;}-\n'
        '测试:博客、網誌、部落格\n'
        '-{A|zh-cn:博客; zh-hk:網誌; zh-tw:部落格;}-\n'
        '测试:博客、網誌、部落格\n'
        '-{H|zh-cn:博客; zh-hk:網誌; zh-tw:部落格;}-\n'
        '测试1博客、網誌、部落格\n'
        '-{-|zh-cn:博客; zh-hk:網誌; zh-tw:部落格;}-\n'
        '测试2博客、網誌、部落格\n'
        '-{T|zh-cn:汤姆·汉克斯; zh-hk:湯·漢斯; zh-tw:湯姆·漢克斯;}-\n'
        '-{D|zh-cn:汤姆·汉克斯; zh-hk:湯·漢斯; zh-tw:湯姆·漢克斯;}-\n'
        '-{H|zh-cn:博客; zh-hk:網誌; zh-tw:部落格;}-\n'
        '测试1-{zh;zh-hans;zh-hant|博客、網誌、部落格}-\n'
        '测试2-{zh;zh-cn;zh-hk|博客、網誌、部落格}-'
    )
    return convert_for_mw(sample_wikitext, locale, update)
def main():
    """
    Simple stdin/stdout interface.

    Usage: ``[-w] LOCALE < input > output``. Each line read from stdin is
    converted to LOCALE and printed. With ``-w`` the MediaWiki-aware
    `convert_for_mw` is used instead of `convert`. Exits with status 1 after
    printing a usage line when the arguments are not recognised.
    """
    args = sys.argv[1:]
    if len(args) == 1 and args[0] in Locales:
        locale, convertfunc = args[0], convert
    elif len(args) == 2 and args[0] == '-w' and args[1] in Locales:
        locale, convertfunc = args[1], convert_for_mw
    else:
        thisfile = __file__ if __name__ == '__main__' else 'python -mzhconv'
        print("usage: %s [-w] {zh-cn|zh-tw|zh-hk|zh-sg|zh-hans|zh-hant|zh} < input > output" % thisfile)
        sys.exit(1)

    loaddict()
    # On Python 2 the lines are byte strings: decode before converting and
    # encode again before printing.
    legacy_py = sys.version_info[0] < 3
    while True:
        ln = sys.stdin.readline()
        if not ln:
            break
        text = ln.rstrip('\r\n')
        if legacy_py:
            text = unicode(text, 'utf-8')
        res = convertfunc(text, locale)
        if legacy_py:
            print(res.encode('utf-8'))
        else:
            print(res)


if __name__ == '__main__':
    main()