Analysing Chinese words 2
I left out ConditionalFreqDist in the last article, so here it is.
>>> ccfd = nltk.ConditionalFreqDist((c,v) for (c, v, tone) in ping_elements)
>>> ccfd.conditions()
['', 'b', 'c', 'ch', 'd', 'f', 'g', 'h', 'j', 'k', 'l', 'm', 'n', 'p', 'q', 'r', 's', 'sh', 't', 'w', 'x', 'y', 'z', 'zh']
>>> ccfd['b']['a']
27
>>> ccfd['b']
<FreqDist with 16 samples and 613 outcomes>
>>> ccfd['b']['i']
61
>>> ccfd['b']['u']
134
>>> ccfd['b']['e']
0
>>> ccfd['b']['o']
33
>>> ccfd.tabulate()
a ai an ang ao e ei en eng er i ia ian iang iao ie in ing iong iu o ong ou u ua uai uan uang ue ui un uo v ve
2 29 29 1 7 13 0 1 0 48 0 0 0 0 0 0 0 0 0 0 1 0 7 0 0 0 0 0 0 0 0 0 0 0
....
>>> tcfd = nltk.ConditionalFreqDist((ping,tone) for (ping, tone) in ping_tone)
>>> tcfd.tabulate()
0 1 2 3 4
a 1 1 0 0 0
ai 0 4 1 2 22
....
After that I revised the code of pingyin_spliter().
import re
import sys  # not used below; kept because the original listing imports it

# Initials accepted by the final validity check.  'zh' was missing from the
# original list; the stray 'u' was dropped because the tokenizer can never
# produce it as an initial followed by a valid final.  'ng' covers the
# syllabic interjection.
_CONSONANTS = frozenset([
    'b', 'c', 'ch', 'd', 'f', 'g', 'h', 'j', 'k', 'l', 'm', 'n', 'p', 'q',
    'r', 's', 'sh', 't', 'w', 'x', 'y', 'z', 'zh', 'ng',
])

# Finals accepted by the final validity check ('v' stands for u-umlaut).
# 'an' and 'en' were missing from the original list.
_VOWELS = frozenset([
    'a', 'ai', 'an', 'ang', 'ao', 'e', 'ei', 'en', 'eng', 'er', 'i', 'ia',
    'ian', 'iang', 'iao', 'ie', 'in', 'ing', 'iong', 'iu', 'o', 'ong', 'ou',
    'u', 'ua', 'uai', 'uan', 'uang', 'ue', 'ui', 'un', 'uo', 'v', 've',
])

# Token order matters: a single tone digit, then the special final 'er',
# then a vowel run with an optional trailing 'n' or 'ng', then any other run
# (the initial).  The original wrote the nasal ending as the character class
# "[n|ng]*", which matches any run of 'n', '|' and 'g'.
_SYLLABLE_RE = re.compile(r"[0-9]|er|[aeiouv]+(?:ng?)?|[^aeiouv0-9]+")

# A trailing tone digit, or a run of ASCII letters.
_TONE_RE = re.compile(r"[0-9]$|[a-z]+")


def _invalid_pingyin(parts):
    """Build the error raised for a syllable that cannot be split."""
    return ValueError('Invalid Pingyin entered: %s' % str(parts))


def pingyin_spliter(pingyin):
    """Split one pinyin syllable into ``[initial, final, tone]``.

    The input is lower-cased before tokenizing.  A missing initial or final
    is returned as ``''`` and a missing tone digit as ``'0'`` (neutral tone),
    e.g. ``'zhong1' -> ['zh', 'ong', '1']`` and ``'a' -> ['', 'a', '0']``.

    Raises:
        ValueError: if the text does not tokenize into one to three parts or
            the parts are not a known initial/final combination.
    """
    parts = _SYLLABLE_RE.findall(pingyin.lower())

    if not parts or len(parts) > 3:
        raise _invalid_pingyin(parts)

    # Pad to exactly three slots: [initial-or-final, final, tone].
    if len(parts) == 1:
        if parts[0].isdigit():
            raise _invalid_pingyin(parts)
        parts += ['', '0']
    elif len(parts) == 2:
        if parts[1].isdigit():
            # Only one sound part plus a tone: leave the middle slot empty.
            parts.insert(1, '')
        else:
            # Initial + final without a tone digit: neutral tone.
            parts.append('0')

    initial, final, tone = parts
    if not tone.isdigit():
        raise _invalid_pingyin(parts)

    if initial in _VOWELS and final == '':
        # A bare final was tokenized into the first slot; move it over.
        return ['', initial, tone]
    if initial in _CONSONANTS and final in _VOWELS:
        return parts
    if initial == 'ng':
        # Syllabic 'ng': as in the original, the final slot is blanked.
        return ['ng', '', tone]
    raise _invalid_pingyin(parts)


def split_multiple(m_ping):
    """Lower-case *m_ping* and split it on whitespace into a list."""
    return m_ping.lower().split()


def split_tone(pingyin):
    """Split a pinyin syllable into ``[letters, tone]``.

    The input is lower-cased first.  When no trailing tone digit is present
    the tone defaults to ``'0'``, e.g. ``'ma3' -> ['ma', '3']`` and
    ``'ma' -> ['ma', '0']``.

    Raises:
        ValueError: carrying the token list, if the text does not reduce to
            exactly one letter run followed by one tone digit.
    """
    s_tone = _TONE_RE.findall(pingyin.lower())

    if len(s_tone) == 1 and not s_tone[0].isdigit():
        s_tone.append('0')

    # Check the length first so short inputs ('' or '3') raise the intended
    # error instead of IndexError; the methods are actually called here,
    # unlike the original "isalpha and isdigit == False" comparison.
    if (len(s_tone) != 2
            or not s_tone[0].isalpha()
            or not s_tone[1].isdigit()):
        raise ValueError(s_tone)
    return s_tone