Picking up combinations of Pinyin and Chinese characters
It's time to check the combination of Pinyin and Chinese characters (Hanzi). First, I have created a new function named split_per_hanzi().
def split_per_hanzi(word_list):
    """Split a [hanzi_string, pinyin_string] pair into per-character pairs.

    Example: ['你好', 'ni3 hao3'] --> [['ni3', '你'], ['hao3', '好']]

    word_list -- a two-element list: the hanzi string first, the
                 space-separated pinyin string second.
    Returns a list of [pinyin, hanzi] pairs.
    Raises Exception when the input is not a two-element list, or when the
    number of pinyin syllables does not match the number of hanzi found.
    """
    if len(word_list) != 2:
        raise Exception('Invalid format: %s' % str(word_list))
    s_ping = split_multiple(word_list[1])
    s_hanz = []
    # Step through the hanzi string in chunks of three: a CJK character
    # occupies exactly 3 bytes in UTF-8.
    # NOTE(review): this assumes word_list[0] is a UTF-8 byte string, not
    # a unicode object (where each hanzi would be 1 character) -- confirm
    # against the CSV reading code.
    for pos in range(0, len(word_list[0]), 3):
        chunk = word_list[0][pos:pos + 3]
        # Skip separator characters.
        # NOTE(review): comparing a 3-byte str chunk against the 1-char
        # unicode literals u'−' / u',' can never match under Python 2;
        # the intended values were probably the UTF-8 byte sequences of
        # those separators -- verify against the source data.
        if chunk != u'−' and chunk != u',':
            s_hanz.append(chunk)
    if len(s_ping) != len(s_hanz):
        # Pinyin/hanzi counts disagree, so the two lists cannot be paired.
        raise Exception('Recovery failed: %s' % str(word_list))
    # Pair each pinyin syllable with its hanzi, preserving order.
    return [[ping, hanz] for ping, hanz in zip(s_ping, s_hanz)]
Note: This function is designed to be executed in a Unicode environment.
Now generate a list.
>>> import_file = codecs.open('/Users/ken/Documents/workspace/NLTK Learning/text files/ChineseWords.csv') >>> raw_data=[] >>> for row in csv.reader(import_file): ... raw_data.append(row) ... >>> ping_hanzi = [] >>> for elm in raw_data: ... ping_hanzi.append(split_per_hanzi(elm)) ... >>> ping_hanzi[:30] [[['an4', '\xe6\x8c\x89'], ['lai2', '\xe6\x9d\xa5']], [['ao2', '\xe7\x86\xac'], ['ye4', '\xe5\xa4\x9c']], [['ao4', '\xe6\x87\x8a'], ['nao3', '\xe6\x81\xbc']], [['ba1', '\xe5\xb7\xb4'], ['bu', '\xe4\xb8\x8d'], ['de', '\xe5\xbe\x97']], [['ba1', '\xe5\x90\xa7'], ['da1', '\xe5\x97\x92']], [['ba3', '\xe6\x8a\x8a'], ['wo4', '\xe6\x8f\xa1']], [['ba4', '\xe9\x9c\xb8']], [['bai2', '\xe7\x99\xbd'], ['tou2', '\xe5\xa4\xb4'], ['diao1', '\xe9\x9b\x95']], [['bai3', '\xe6\x91\x86'], ['tuo1', '\xe8\x84\xb1']], [['ban1', '\xe6\x90\xac'], ['yun4', '\xe8\xbf\x90'], ['gong1', '\xe5\xb7\xa5']], [['ban3', '\xe6\x9d\xbf'], ['lan2', '\xe8\x93\x9d'], ['gen1', '\xe6\xa0\xb9']], [['ban4', '\xe5\x8d\x8a'], ['ye4', '\xe5\xa4\x9c']], [['bang1', '\xe5\xb8\xae']], [['bao1', '\xe5\x8c\x85'], ['kuo4', '\xe6\x8b\xac']], [['bao1', '\xe5\x8c\x85'], ['zhuang1', '\xe8\xa3\x85']], [['bao1', '\xe8\xa4\x92'], ['bian3', '\xe8\xb4\xac']], [['bao3', '\xe5\xae\x9d'], ['bei4', '\xe8\xb4\x9d']], [['bao3', '\xe4\xbf\x9d'], ['chi2', '\xe6\x8c\x81']], [['bao3', '\xe4\xbf\x9d'], ['liu2', '\xe7\x95\x99']], [['bao3', '\xe4\xbf\x9d'], ['mu3', '\xe5\xa7\x86']], [['bao3', '\xe4\xbf\x9d'], ['shou3', '\xe5\xae\x88']], [['bao4', '\xe6\x8a\xb1'], ['yuan4', '\xe6\x80\xa8']], [['bei4', '\xe8\x83\x8c'], ['jing3', '\xe6\x99\xaf']], [['bei4', '\xe8\xa2\xab'], ['po4', '\xe8\xbf\xab']], [['ben3', '\xe6\x9c\xac'], ['fen4', '\xe5\x88\x86']], [['ben3', '\xe6\x9c\xac'], ['neng2', '\xe8\x83\xbd']], [['ben3', '\xe6\x9c\xac'], ['zhi4', '\xe8\xb4\xa8']], [['bi1', '\xe9\x80\xbc']], [['bi3', '\xe6\xaf\x94'], ['bi3', '\xe6\xaf\x94'], ['jie1', '\xe7\x9a\x86'], ['shi4', '\xe6\x98\xaf']], [['bi3', '\xe6\xaf\x94'], ['qiu1', '\xe4\xb8\x98'], ['ni2', 
'\xe5\xb0\xbc']]]
This time, I ignore the words themselves and just focus on each single Chinese character.
>>> hanzi_element = [] >>> for elm1 in ping_hanzi: ... for elm2 in elm1: ... hanzi_element.append(elm2) ... >>> hanzi_element[:10] [['an4', '\xe6\x8c\x89'], ['lai2', '\xe6\x9d\xa5'], ['ao2', '\xe7\x86\xac'], ['ye4', '\xe5\xa4\x9c'], ['ao4', '\xe6\x87\x8a'], ['nao3', '\xe6\x81\xbc'], ['ba1', '\xe5\xb7\xb4'], ['bu', '\xe4\xb8\x8d'], ['de', '\xe5\xbe\x97'], ['ba1', '\xe5\x90\xa7']]
Let's check how many Hanzi (Chinese characters) are included.
>>> len(hanzi_element) 11429
There must be duplicated entries. In this case, set(hanzi_element) cannot be used, because the elements of hanzi_element are lists, which are unhashable.
>>> hanzi_element_set = [] >>> for elm in hanzi_element: ... if elm not in hanzi_element_set: ... hanzi_element_set.append(elm) ... >>> len(hanzi_element_set) 2913
OK, then 2913 unique [pinyin, hanzi] pairs are found. In this result, some Hanzi might still be duplicated, because some Chinese characters have more than one pinyin reading — so-called duoyinzi (多音字).
Then group the characters per pinyin and print the Chinese characters.
>>> pings = [p for (p, w) in sorted(hanzi_element_set)] >>> for ping in pings: ... words = [w for (p, w) in sorted(hanzi_element_set) if p == ping] ... print ping, ':' ... for word in words: ... print word, ... print '' ... ? : 庆 役 饪 a : 啊 a1 : 阿 ai1 : 哀 哎 唉 挨 ai2 : 癌 ....
Due to a data quality problem in the original file, a strange entry (?) is displayed at the beginning. Anyway, most of the other parts seem reliable.
Note:
I faced an error due to a BOM (byte order mark). My workaround is like this:
.... s_ping = split_multiple(word_list[1]) <strong>if word_list[0].startswith('\xef\xbb\xbf'): word_list[0] = re.sub(r"\xef\xbb\xbf", "", word_list[0]).strip('"')</strong> for var in range (0, len(word_list[0]), 3): ....