Accessing Chinese word database
As I mentioned previously, I have a Chinese word database which I created while learning Chinese. The database includes 5,000+ words, mainly picked up from the HSK Level 6 vocabulary.
First I wrote some code to process Pinyin (pronunciation) and saved it as pingyin.py.
import re, sys def pingyin_spliter(pingyin): s_ping = re.findall(r"[0-9]|er|[aeiouv]+[n|ng]*|[^aeiouv]{1,2}", pingyin.lower()) try: # Check split results if len(s_ping) > 3: #too long, invalid raise Exception, 'Too long: %d elements found' % len(s_ping) elif len(s_ping) == 1 and not re.match(r"er|[aeiouv]+[n|ng]*", s_ping[0]): #too short, invaild raise Exception, 'Too short: only %d elements found' % len(s_ping) if s_ping[-1].isdigit() == False and len(s_ping) < 3: s_ping.append('0') #Qingsheng if s_ping[-1].isdigit() and len(s_ping) < 3: s_ping.append(s_ping[-1]) s_ping[1] = s_ping[0] s_ping[0] = '' #Final format check if not (re.match(r'[^aeiouv]{1,2}', s_ping[0]) or s_ping[0] == ''): raise Exception, 'MissingConsanant' if not re.match(r"er|[aeiouv]+[n|ng]*", s_ping[1]): raise Exception, 'MissingVowel' if not re.match(r"[0-9]", s_ping[2]): raise Exception, 'MissingTone' return s_ping except Exception, etext: info = sys.exc_info() raise info[0], info[1], info[2] def split_multiple(m_ping): m_ping = m_ping.lower() r_ping = m_ping.split() return r_ping def split_tone(pingyin): s_tone = re.findall(r"[0-9]$|[a-z]+", pingyin.lower()) try: if len(s_tone) == 1 and s_tone[-1].isdigit() == False: s_tone.append('0') if s_tone[0].isalpha and s_tone[1].isdigit == False: raise Exception, s_tone if len(s_tone) != 2: raise Exception, s_tone return s_tone except Exception, etext: info = sys.exc_info() raise info[0], info[1], info[2] ||< Frankly speaking the logic is much far from perfect, for example, it is not possible to handle "ng2". This is kind of exceptional one because this does not include vowel. Therefore some manual operations are still required. <strong>Step 1: Import the CSV file</strong> >|python| >>> import pingyin >>> from pingyin import * >>> import_file = codecs.open('/Users/ken/Documents/workspace/NLTK Learning/text files/ChineseWords.csv') >>> raw_data = [] >>> for row in csv.reader(import_file): ... raw_data.append(row) ... 
>>> raw_data[:30] [['\xef\xbb\xbf"\xe6\x8c\x89\xef\xbc\x8d\xe6\x9d\xa5\xef\xbc\x8d"', 'an4 lai2'], ['\xe7\x86\xac\xe5\xa4\x9c', 'ao2 ye4'], ['\xe6\x87\x8a\xe6\x81\xbc', 'ao4 nao3'], ['\xe5\xb7\xb4\xe4\xb8\x8d\xe5\xbe\x97', 'ba1 bu de'], ['\xe5\x90\xa7\xe5\x97\x92', 'ba1 da1'], ['\xe6\x8a\x8a\xe6\x8f\xa1', 'ba3 wo4'], ['\xe9\x9c\xb8', 'ba4'], ['\xe7\x99\xbd\xe5\xa4\xb4\xe9\x9b\x95', 'bai2 tou2 diao1'], ['\xe6\x91\x86\xe8\x84\xb1', 'bai3 tuo1'], ['\xe6\x90\xac\xe8\xbf\x90\xe5\xb7\xa5', 'ban1 yun4 gong1'], ['\xe6\x9d\xbf\xe8\x93\x9d\xe6\xa0\xb9', 'ban3 lan2 gen1'], ['\xe5\x8d\x8a\xe5\xa4\x9c', 'ban4 ye4'], ['\xe5\xb8\xae', 'bang1'], ['\xe5\x8c\x85\xe6\x8b\xac', 'bao1 kuo4'], ['\xe5\x8c\x85\xe8\xa3\x85', 'bao1 zhuang1'], ['\xe8\xa4\x92\xe8\xb4\xac', 'bao1 bian3'], ['\xe5\xae\x9d\xe8\xb4\x9d', 'bao3 bei4'], ['\xe4\xbf\x9d\xe6\x8c\x81', 'bao3 chi2'], ['\xe4\xbf\x9d\xe7\x95\x99', 'bao3 liu2'], ['\xe4\xbf\x9d\xe5\xa7\x86', 'bao3 mu3'], ['\xe4\xbf\x9d\xe5\xae\x88', 'bao3 shou3'], ['\xe6\x8a\xb1\xe6\x80\xa8', 'bao4 yuan4'], ['\xe8\x83\x8c\xe6\x99\xaf', 'bei4 jing3'], ['\xe8\xa2\xab\xe8\xbf\xab', 'bei4 po4'], ['\xe6\x9c\xac\xe5\x88\x86', 'ben3 fen4'], ['\xe6\x9c\xac\xe8\x83\xbd', 'ben3 neng2'], ['\xe6\x9c\xac\xe8\xb4\xa8', 'ben3 zhi4'], ['\xe9\x80\xbc', 'bi1'], ['\xe6\xaf\x94\xe6\xaf\x94\xe7\x9a\x86\xe6\x98\xaf', 'bi3 bi3 jie1 shi4'], ['\xe6\xaf\x94\xe4\xb8\x98\xe5\xb0\xbc', 'bi3 qiu1 ni2']]
Step2: Only pick up pingyin data
>>> raw_pingyin = [] >>> for row in raw_data: ... raw_pingyin.append(row[1]) ... >>> raw_pingyin[:30] ['an4 lai2', 'ao2 ye4', 'ao4 nao3', 'ba1 bu de', 'ba1 da1', 'ba3 wo4', 'ba4', 'bai2 tou2 diao1', 'bai3 tuo1', 'ban1 yun4 gong1', 'ban3 lan2 gen1', 'ban4 ye4', 'bang1', 'bao1 kuo4', 'bao1 zhuang1', 'bao1 bian3', 'bao3 bei4', 'bao3 chi2', 'bao3 liu2', 'bao3 mu3', 'bao3 shou3', 'bao4 yuan4', 'bei4 jing3', 'bei4 po4', 'ben3 fen4', 'ben3 neng2', 'ben3 zhi4', 'bi1', 'bi3 bi3 jie1 shi4', 'bi3 qiu1 ni2']
Step 3: Split the data into individual syllables
>>> split_ping = [] >>> for row in raw_pingyin: ... split_ping.append(split_multiple(row)) ... >>> split_ping[:30] [['an4', 'lai2'], ['ao2', 'ye4'], ['ao4', 'nao3'], ['ba1', 'bu', 'de'], ['ba1', 'da1'], ['ba3', 'wo4'], ['ba4'], ['bai2', 'tou2', 'diao1'], ['bai3', 'tuo1'], ['ban1', 'yun4', 'gong1'], ['ban3', 'lan2', 'gen1'], ['ban4', 'ye4'], ['bang1'], ['bao1', 'kuo4'], ['bao1', 'zhuang1'], ['bao1', 'bian3'], ['bao3', 'bei4'], ['bao3', 'chi2'], ['bao3', 'liu2'], ['bao3', 'mu3'], ['bao3', 'shou3'], ['bao4', 'yuan4'], ['bei4', 'jing3'], ['bei4', 'po4'], ['ben3', 'fen4'], ['ben3', 'neng2'], ['ben3', 'zhi4'], ['bi1'], ['bi3', 'bi3', 'jie1', 'shi4'], ['bi3', 'qiu1', 'ni2']]
Step 4: Merge into a single list, ignoring the word information this time...
>>> ping_list = [] >>> for row1 in split_ping: ... for elm in row1: ... ping_list.append(elm) ... >>> ping_list[:30] ['an4', 'lai2', 'ao2', 'ye4', 'ao4', 'nao3', 'ba1', 'bu', 'de', 'ba1', 'da1', 'ba3', 'wo4', 'ba4', 'bai2', 'tou2', 'diao1', 'bai3', 'tuo1', 'ban1', 'yun4', 'gong1', 'ban3', 'lan2', 'gen1', 'ban4', 'ye4', 'bang1', 'bao1', 'kuo4']
Step 5: Split each syllable into sound elements: consonants, vowels and tone (shengdiao). Errors caused by mistakes in the raw data need to be cleaned up by hand.
>>> ping_elements = [] >>> for elm in ping_list: ... ping_elements.append(pingyin_spliter(elm)) ... Traceback (most recent call last): File "<stdin>", line 2, in <module> File "/Users/ken/Documents/workspace/NLTK Learning/scripts/pingyin.py", line 8, in pingyin_spliter raise Exception, 'Too long: %d elements found' % len(s_ping) Exception: Too long: 7 elements found >>> len(ping_elements) 120 >>> ping_list[118:120 ... ... ] ['ru4', 'hu3'] >>> ping_list[118:121] ['ru4', 'hu3', 'xue2\xe3\x80\x80yan1'] >>> ping_list[121] 'de2' >>> ping_list[120] 'xue2\xe3\x80\x80yan1' >>> ping_list[120] = 'xue2' >>> ping_list.append('yan1') >>> ping_elements = [] >>> for elm in ping_list: ... ping_elements.append(pingyin_spliter(elm)) ... Traceback (most recent call last): File "<stdin>", line 2, in <module> File "/Users/ken/Documents/workspace/NLTK Learning/scripts/pingyin.py", line 23, in pingyin_spliter raise Exception, 'MissingVowel' Exception: MissingVowel >>> len(ping_elements) 6933 >>> ping_list[6933] 'ng2' >>> ping_list.pop(6933) 'ng2' >>> ping_elements = [] >>> for elm in ping_list: ... ping_elements.append(pingyin_spliter(elm)) ... >>> ping_elements[:30] [['', 'an', '4'], ['l', 'ai', '2'], ['', 'ao', '2'], ['y', 'e', '4'], ['', 'ao', '4'], ['n', 'ao', '3'], ['b', 'a', '1'], ['b', 'u', '0'], ['d', 'e', '0'], ['b', 'a', '1'], ['d', 'a', '1'], ['b', 'a', '3'], ['w', 'o', '4'], ['b', 'a', '4'], ['b', 'ai', '2'], ['t', 'ou', '2'], ['d', 'iao', '1'], ['b', 'ai', '3'], ['t', 'uo', '1'], ['b', 'an', '1'], ['y', 'un', '4'], ['g', 'ong', '1'], ['b', 'an', '3'], ['l', 'an', '2'], ['g', 'en', '1'], ['b', 'an', '4'], ['y', 'e', '4'], ['b', 'ang', '1'], ['b', 'ao', '1'], ['k', 'uo', '4']]
Done!
Step 6: Generate another analysis: pronunciation and tone
>>> ping_tone=[] >>> for elm in ping_list: ... ping_tone.append(split_tone(elm)) ... >>> ping_tone[:30] [['an', '4'], ['lai', '2'], ['ao', '2'], ['ye', '4'], ['ao', '4'], ['nao', '3'], ['ba', '1'], ['bu', '0'], ['de', '0'], ['ba', '1'], ['da', '1'], ['ba', '3'], ['wo', '4'], ['ba', '4'], ['bai', '2'], ['tou', '2'], ['diao', '1'], ['bai', '3'], ['tuo', '1'], ['ban', '1'], ['yun', '4'], ['gong', '1'], ['ban', '3'], ['lan', '2'], ['gen', '1'], ['ban', '4'], ['ye', '4'], ['bang', '1'], ['bao', '1'], ['kuo', '4']]
Take some stats for test:
>>> cw_tfd = nltk.FreqDist((sound, tone) for (sound, tone) in ping_tone) >>> cw_tfd.tabulate(10) ('shi', '4') ('li', '4') ('yi', '4') ('bu', '4') ('shi', '2') ('xin', '1') ('zhi', '4') ('cheng', '2') ('ji', '4') ('zi', '0') 144 103 102 92 87 84 82 73 71 63 >>> cwcv_tfd = nltk.FreqDist((c,v) for (c, v, tone) in ping_elements) >>> cwcv_tfd.tabulate(10) ('sh', 'i') ('j', 'i') ('y', 'i') ('zh', 'i') ('l', 'i') ('q', 'i') ('b', 'u') ('z', 'i') ('j', 'ie') ('f', 'u') 288 195 194 180 165 143 134 119 117 113
For most Chinese learners, remembering pronunciation and tones is the most difficult part.
Based on this data, I might be able to do some deeper analysis.
t.b.c...