#!/usr/bin/env python # -*- coding: utf-8 -*- """ This module implements a simple conversion and localization between simplified and traditional Chinese using tables from MediaWiki. It doesn't contains a segmentation function and uses maximal forward matching, so it's simple. For a complete and accurate solution, see OpenCC. For Chinese segmentation, see Jieba. >>> print(convert('我幹什麼不干你事。', 'zh-cn')) 我干什么不干你事。 >>> print(convert('人体内存在很多微生物', 'zh-tw')) 人體內存在很多微生物 Support MediaWiki's convertion format: >>> print(convert_for_mw('在现代,机械计算-{}-机的应用已经完全被电子计算-{}-机所取代', 'zh-hk')) 在現代,機械計算機的應用已經完全被電子計算機所取代 >>> print(convert_for_mw('-{zh-hant:資訊工程;zh-hans:计算机工程学;}-是电子工程的一个分支,主要研究计算机软硬件和二者间的彼此联系。', 'zh-tw')) 資訊工程是電子工程的一個分支,主要研究計算機軟硬體和二者間的彼此聯繫。 >>> print(convert_for_mw('張國榮曾在英國-{zh:利兹;zh-hans:利兹;zh-hk:列斯;zh-tw:里茲}-大学學習。', 'zh-sg')) 张国荣曾在英国利兹大学学习。 """ # Only Python3 can pass the doctest here due to unicode problems. __version__ = '1.4.3' import os import sys import re import json try: from pkg_resources import resource_stream get_module_res = lambda *res: resource_stream(__name__, os.path.join(*res)) except ImportError: get_module_res = lambda *res: open(os.path.normpath( os.path.join(os.getcwd(), os.path.dirname(__file__), *res)), 'rb') # Locale fallback order lookup dictionary Locales = { 'zh-cn': ('zh-cn', 'zh-hans', 'zh-sg', 'zh'), 'zh-hk': ('zh-hk', 'zh-hant', 'zh-tw', 'zh'), 'zh-tw': ('zh-tw', 'zh-hant', 'zh-hk', 'zh'), 'zh-sg': ('zh-sg', 'zh-hans', 'zh-cn', 'zh'), 'zh-my': ('zh-my', 'zh-sg', 'zh-hans', 'zh-cn', 'zh'), 'zh-mo': ('zh-mo', 'zh-hk', 'zh-hant', 'zh-tw', 'zh'), 'zh-hant': ('zh-hant', 'zh-tw', 'zh-hk', 'zh'), 'zh-hans': ('zh-hans', 'zh-cn', 'zh-sg', 'zh'), 'zh': ('zh',) # special value for no conversion } _DEFAULT_DICT = "zhcdict.json" DICTIONARY = _DEFAULT_DICT zhcdicts = None dict_zhcn = None dict_zhsg = None dict_zhtw = None dict_zhhk = None pfsdict = {} RE_langconv = re.compile(r'(-\{|\}-)') RE_splitflag = re.compile(r'\s*\|\s*') RE_splitmap = re.compile(r'\s*;\s*') RE_splituni = re.compile(r'\s*=>\s*') RE_splitpair = re.compile(r'\s*:\s*') def loaddict(filename=DICTIONARY): """ Load the dictionary from a specific JSON file. """ global zhcdicts if zhcdicts: return if filename == _DEFAULT_DICT: zhcdicts = json.loads(get_module_res(filename).read().decode('utf-8')) else: with open(filename, 'rb') as f: zhcdicts = json.loads(f.read().decode('utf-8')) zhcdicts['SIMPONLY'] = frozenset(zhcdicts['SIMPONLY']) zhcdicts['TRADONLY'] = frozenset(zhcdicts['TRADONLY']) def getdict(locale): """ Generate or get convertion dict cache for certain locale. Dictionaries are loaded on demand. """ global zhcdicts, dict_zhcn, dict_zhsg, dict_zhtw, dict_zhhk, pfsdict if zhcdicts is None: loaddict(DICTIONARY) if locale == 'zh-cn': if dict_zhcn: got = dict_zhcn else: dict_zhcn = zhcdicts['zh2Hans'].copy() dict_zhcn.update(zhcdicts['zh2CN']) got = dict_zhcn elif locale == 'zh-tw': if dict_zhtw: got = dict_zhtw else: dict_zhtw = zhcdicts['zh2Hant'].copy() dict_zhtw.update(zhcdicts['zh2TW']) got = dict_zhtw elif locale == 'zh-hk' or locale == 'zh-mo': if dict_zhhk: got = dict_zhhk else: dict_zhhk = zhcdicts['zh2Hant'].copy() dict_zhhk.update(zhcdicts['zh2HK']) got = dict_zhhk elif locale == 'zh-sg' or locale == 'zh-my': if dict_zhsg: got = dict_zhsg else: dict_zhsg = zhcdicts['zh2Hans'].copy() dict_zhsg.update(zhcdicts['zh2SG']) got = dict_zhsg elif locale == 'zh-hans': got = zhcdicts['zh2Hans'] elif locale == 'zh-hant': got = zhcdicts['zh2Hant'] else: got = {} if locale not in pfsdict: pfsdict[locale] = getpfset(got) return got def getpfset(convdict): pfset = [] for word in convdict: for ch in range(len(word)): pfset.append(word[:ch+1]) return frozenset(pfset) def issimp(s, full=False): """ Detect text is whether Simplified Chinese or Traditional Chinese. Returns True for Simplified; False for Traditional; None for unknown. If full=False, it returns once first simplified- or traditional-only character is encountered, so it's for quick and rough identification; else, it compares the count and returns the most likely one. Use `is` (True/False/None) to check the result. `s` must be unicode (Python 2) or str (Python 3), or you'll get None. """ if zhcdicts is None: loaddict(DICTIONARY) simp, trad = 0, 0 if full: for ch in s: if ch in zhcdicts['SIMPONLY']: simp += 1 elif ch in zhcdicts['TRADONLY']: trad += 1 if simp > trad: return True elif simp < trad: return False else: return None else: for ch in s: if ch in zhcdicts['SIMPONLY']: return True elif ch in zhcdicts['TRADONLY']: return False return None def fallback(locale, mapping): for l in Locales[locale]: if l in mapping: return mapping[l] return convert(tuple(mapping.values())[0], locale) def convtable2dict(convtable, locale, update=None): """ Convert a list of conversion dict to a dict for a certain locale. >>> sorted(convtable2dict([{'zh-hk': '列斯', 'zh-hans': '利兹', 'zh': '利兹', 'zh-tw': '里茲'}, {':uni': '巨集', 'zh-cn': '宏'}], 'zh-cn').items()) [('列斯', '利兹'), ('利兹', '利兹'), ('巨集', '宏'), ('里茲', '利兹')] """ rdict = update.copy() if update else {} for r in convtable: if ':uni' in r: if locale in r: rdict[r[':uni']] = r[locale] elif locale[:-1] == 'zh-han': if locale in r: for word in r.values(): rdict[word] = r[locale] else: v = fallback(locale, r) for word in r.values(): rdict[word] = v return rdict def tokenize(s, locale, update=None): """ Tokenize `s` according to corresponding locale dictionary. Don't use this for serious text processing. """ zhdict = getdict(locale) pfset = pfsdict[locale] if update: zhdict = zhdict.copy() zhdict.update(update) newset = set() for word in update: for ch in range(len(word)): newset.add(word[:ch+1]) pfset = pfset | newset ch = [] N = len(s) pos = 0 while pos < N: i = pos frag = s[pos] maxword = None maxpos = 0 while i < N and frag in pfset: if frag in zhdict: maxword = frag maxpos = i i += 1 frag = s[pos:i+1] if maxword is None: maxword = s[pos] pos += 1 else: pos = maxpos + 1 ch.append(maxword) return ch def convert(s, locale, update=None): """ Main convert function. :param s: must be `unicode` (Python 2) or `str` (Python 3). :param locale: should be one of ``('zh-hans', 'zh-hant', 'zh-cn', 'zh-sg' 'zh-tw', 'zh-hk', 'zh-my', 'zh-mo')``. :param update: a dict which updates the conversion table, eg. ``{'from1': 'to1', 'from2': 'to2'}`` >>> print(convert('我幹什麼不干你事。', 'zh-cn')) 我干什么不干你事。 >>> print(convert('我幹什麼不干你事。', 'zh-cn', {'不干': '不幹'})) 我干什么不幹你事。 >>> print(convert('人体内存在很多微生物', 'zh-tw')) 人體內存在很多微生物 """ if locale == 'zh' or locale not in Locales: # "no conversion" return s zhdict = getdict(locale) pfset = pfsdict[locale] newset = set() if update: # TODO: some sort of caching #zhdict = zhdict.copy() #zhdict.update(update) newset = set() for word in update: for ch in range(len(word)): newset.add(word[:ch+1]) #pfset = pfset | newset ch = [] N = len(s) pos = 0 while pos < N: i = pos frag = s[pos] maxword = None maxpos = 0 while i < N and (frag in pfset or frag in newset): if update and frag in update: maxword = update[frag] maxpos = i elif frag in zhdict: maxword = zhdict[frag] maxpos = i i += 1 frag = s[pos:i+1] if maxword is None: maxword = s[pos] pos += 1 else: pos = maxpos + 1 ch.append(maxword) return ''.join(ch) def convert_for_mw(s, locale, update=None): """ Recognizes MediaWiki's human conversion format. Use locale='zh' for no conversion. Reference: (all tests passed) https://zh.wikipedia.org/wiki/Help:高级字词转换语法 https://www.mediawiki.org/wiki/Writing_systems/Syntax >>> print(convert_for_mw('在现代,机械计算-{}-机的应用已经完全被电子计算-{}-机所取代', 'zh-hk')) 在現代,機械計算機的應用已經完全被電子計算機所取代 >>> print(convert_for_mw('-{zh-hant:資訊工程;zh-hans:计算机工程学;}-是电子工程的一个分支,主要研究计算机软硬件和二者间的彼此联系。', 'zh-tw')) 資訊工程是電子工程的一個分支,主要研究計算機軟硬體和二者間的彼此聯繫。 >>> print(convert_for_mw('張國榮曾在英國-{zh:利兹;zh-hans:利兹;zh-hk:列斯;zh-tw:里茲}-大学學習。', 'zh-hant')) 張國榮曾在英國里茲大學學習。 >>> print(convert_for_mw('張國榮曾在英國-{zh:利兹;zh-hans:利兹;zh-hk:列斯;zh-tw:里茲}-大学學習。', 'zh-sg')) 张国荣曾在英国利兹大学学习。 >>> convert_for_mw('-{zh-hant:;\\nzh-cn:}-', 'zh-tw') == '' True >>> print(convert_for_mw('毫米(毫公分),符號mm,是長度單位和降雨量單位,-{zh-hans:台湾作-{公釐}-或-{公厘}-;zh-hant:港澳和大陸稱為-{毫米}-(台灣亦有使用,但較常使用名稱為毫公分);zh-mo:台灣作-{公釐}-或-{公厘}-;zh-hk:台灣作-{公釐}-或-{公厘}-;}-。', 'zh-tw')) 毫米(毫公分),符號mm,是長度單位和降雨量單位,港澳和大陸稱為毫米(台灣亦有使用,但較常使用名稱為毫公分)。 >>> print(convert_for_mw('毫米(毫公分),符號mm,是長度單位和降雨量單位,-{zh-hans:台湾作-{公釐}-或-{公厘}-;zh-hant:港澳和大陸稱為-{毫米}-(台灣亦有使用,但較常使用名稱為毫公分);zh-mo:台灣作-{公釐}-或-{公厘}-;zh-hk:台灣作-{公釐}-或-{公厘}-;}-。', 'zh-cn')) 毫米(毫公分),符号mm,是长度单位和降雨量单位,台湾作公釐或公厘。 >>> print(convert_for_mw('毫米(毫公分),符號mm,是長度單位和降雨量單位,-{zh-hans:台湾作-{公釐}-或-{公厘}-;zh-hant:港澳和大陸稱為-{毫米}-(台灣亦有使用,但較常使用名稱為毫公分);zh-mo:台灣作-{公釐}-或-{公厘}-;zh-hk:台灣作-{公釐}-或-{公厘', 'zh-hk')) # unbalanced test 毫米(毫公分),符號mm,是長度單位和降雨量單位,台灣作公釐或公厘 >>> print(convert_for_mw('报头的“-{參攷消息}-”四字摘自鲁迅笔迹-{zh-hans:,“-{參}-”是“-{参}-”的繁体字,读音cān,与简体的“-{参}-”字相同;;zh-hant:,;}-“-{攷}-”是“考”的异体字,读音kǎo,与“考”字相同。', 'zh-tw')) 報頭的「參攷消息」四字摘自魯迅筆跡,「攷」是「考」的異體字,讀音kǎo,與「考」字相同。 >>> print(convert_for_mw('报头的“-{參攷消息}-”四字摘自鲁迅笔迹-{zh-hans:,“-{參}-”是“-{参}-”的繁体字,读音cān,与简体的“-{参}-”字相同;;zh-hant:,;}-“-{攷}-”是“考”的异体字,读音kǎo,与“考”字相同。', 'zh-cn')) 报头的“參攷消息”四字摘自鲁迅笔迹,“參”是“参”的繁体字,读音cān,与简体的“参”字相同;“攷”是“考”的异体字,读音kǎo,与“考”字相同。 >>> print(convert_for_mw('{{Col-break}}-->', 'zh-hant')) {{Col-break}}--> """ ch = [] rules = [] ruledict = update.copy() if update else {} nested = 0 block = '' for frag in RE_langconv.split(s): if frag == '-{': nested += 1 block += frag elif frag == '}-': if not nested: # bogus }- ch.append(frag) continue block += frag nested -= 1 if nested: continue newrules = [] delim = RE_splitflag.split(block[2:-2].strip(' \t\n\r\f\v;')) if len(delim) == 1: flag = None mapping = RE_splitmap.split(delim[0]) else: flag = RE_splitmap.split(delim[0].strip(' \t\n\r\f\v;')) mapping = RE_splitmap.split(delim[1]) rule = {} for m in mapping: uni = RE_splituni.split(m) if len(uni) == 1: pair = RE_splitpair.split(uni[0]) else: if rule: newrules.append(rule) rule = {':uni': uni[0]} else: rule[':uni'] = uni[0] pair = RE_splitpair.split(uni[1]) if len(pair) == 1: rule['zh'] = convert_for_mw(pair[0], 'zh', ruledict) else: rule[pair[0]] = convert_for_mw(pair[1], pair[0], ruledict) newrules.append(rule) if not flag: ch.append(fallback(locale, newrules[0])) elif any(ch in flag for ch in 'ATRD-HN'): for f in flag: # A: add rule for convert code (all text convert) # H: Insert a conversion rule without output if f in ('A', 'H'): for r in newrules: if not r in rules: rules.append(r) if f == 'A': if ':uni' in r: if locale in r: ch.append(r[locale]) else: ch.append(convert(r[':uni'], locale)) else: ch.append(fallback(locale, newrules[0])) # -: remove convert elif f == '-': for r in newrules: try: rules.remove(r) except ValueError: pass # D: convert description (useless) #elif f == 'D': #ch.append('; '.join(': '.join(x) for x in newrules[0].items())) # T: title convert (useless) # R: raw content (implied above) # N: current variant name (useless) #elif f == 'N': #ch.append(locale) ruledict = convtable2dict(rules, locale, update) else: fblimit = frozenset(flag) & frozenset(Locales[locale]) limitedruledict = update.copy() if update else {} for r in rules: if ':uni' in r: if locale in r: limitedruledict[r[':uni']] = r[locale] else: v = None for l in Locales[locale]: if l in r and l in fblimit: v = r[l] break for word in r.values(): limitedruledict[word] = v if v else convert(word, locale) ch.append(convert(delim[1], locale, limitedruledict)) block = '' elif nested: block += frag else: ch.append(convert(frag, locale, ruledict)) if nested: # unbalanced ch.append(convert_for_mw(block + '}-'*nested, locale, ruledict)) return ''.join(ch) def test_convert_mw(locale, update=None): s = ('英國-{zh:利兹;zh-hans:利兹;zh-hk:列斯;zh-tw:里茲}-大学\n' '-{zh-hans:计算机; zh-hant:電腦;}-\n' '-{H|巨集=>zh-cn:宏;}-\n' '测试:巨集、宏\n' '-{简体字繁體字}-\n' '北-{}-韓、北朝-{}-鲜\n' '-{H|zh-cn:博客; zh-hk:網誌; zh-tw:部落格;}-\n' '测试:博客、網誌、部落格\n' '-{A|zh-cn:博客; zh-hk:網誌; zh-tw:部落格;}-\n' '测试:博客、網誌、部落格\n' '-{H|zh-cn:博客; zh-hk:網誌; zh-tw:部落格;}-\n' '测试1:博客、網誌、部落格\n' '-{-|zh-cn:博客; zh-hk:網誌; zh-tw:部落格;}-\n' '测试2:博客、網誌、部落格\n' '-{T|zh-cn:汤姆·汉克斯; zh-hk:湯·漢斯; zh-tw:湯姆·漢克斯;}-\n' '-{D|zh-cn:汤姆·汉克斯; zh-hk:湯·漢斯; zh-tw:湯姆·漢克斯;}-\n' '-{H|zh-cn:博客; zh-hk:網誌; zh-tw:部落格;}-\n' '测试1:-{zh;zh-hans;zh-hant|博客、網誌、部落格}-\n' '测试2:-{zh;zh-cn;zh-hk|博客、網誌、部落格}-') return convert_for_mw(s, locale, update) def main(): """ Simple stdin/stdout interface. """ if len(sys.argv) == 2 and sys.argv[1] in Locales: locale = sys.argv[1] convertfunc = convert elif len(sys.argv) == 3 and sys.argv[1] == '-w' and sys.argv[2] in Locales: locale = sys.argv[2] convertfunc = convert_for_mw else: thisfile = __file__ if __name__ == '__main__' else 'python -mzhconv' print("usage: %s [-w] {zh-cn|zh-tw|zh-hk|zh-sg|zh-hans|zh-hant|zh} < input > output" % thisfile) sys.exit(1) loaddict() ln = sys.stdin.readline() while ln: l = ln.rstrip('\r\n') if sys.version_info[0] < 3: l = unicode(l, 'utf-8') res = convertfunc(l, locale) if sys.version_info[0] < 3: print(res.encode('utf-8')) else: print(res) ln = sys.stdin.readline() if __name__ == '__main__': main()