Analysing Chinese words 2

I left out ConditionalFreqDist in the last article, so I cover it here.

>>> ccfd = nltk.ConditionalFreqDist((c,v) for (c, v, tone) in ping_elements)
>>> ccfd.conditions()
['', 'b', 'c', 'ch', 'd', 'f', 'g', 'h', 'j', 'k', 'l', 'm', 'n', 'p', 'q', 'r', 's', 'sh', 't', 'w', 'x', 'y', 'z', 'zh']
>>> ccfd['b']['a']
27
>>> ccfd['b']
<FreqDist with 16 samples and 613 outcomes>
>>> ccfd['b']['i']
61
>>> ccfd['b']['u']
134
>>> ccfd['b']['e']
0
>>> ccfd['b']['o']
33
>>> ccfd.tabulate()
      a   ai   an  ang   ao    e   ei   en  eng   er    i   ia  ian iang  iao   ie   in  ing iong   iu    o  ong   ou    u   ua  uai  uan uang   ue   ui   un   uo    v   ve
      2   29   29    1    7   13    0    1    0   48    0    0    0    0    0    0    0    0    0    0    1    0    7    0    0    0    0    0    0    0    0    0    0    0
....
>>> tcfd = nltk.ConditionalFreqDist((ping,tone) for (ping, tone) in ping_tone)
>>> tcfd.tabulate()
          0    1    2    3    4
     a    1    1    0    0    0
    ai    0    4    1    2   22
....

After that I revised the code of pingyin_spliter().

import re, sys

# Tokenizer for one syllable. Alternatives, in priority order:
#   a single tone digit | the special final "er" |
#   a vowel run with an optional nasal ending ("ng" or "n") |
#   a run of anything else (the candidate initial).
_PINGYIN_TOKEN_RE = re.compile(r"[0-9]|er|[aeiouv]+(?:ng|n)?|[^aeiouv0-9]+")

# Initials accepted by the final validation step.
_PINGYIN_CONSONANTS = frozenset([
    'b', 'c', 'ch', 'd', 'f', 'g', 'h', 'j', 'k',
    'l', 'm', 'n', 'p', 'q', 'r', 's', 'sh', 't',
    'w', 'x', 'y', 'z', 'zh', 'ng',
])

# Finals accepted by the final validation step ('v' stands in for u-umlaut).
_PINGYIN_VOWELS = frozenset([
    'a', 'ai', 'an', 'ang', 'ao', 'e', 'ei', 'en', 'eng', 'er', 'i',
    'ia', 'ian', 'iang', 'iao', 'ie', 'in', 'ing', 'iong',
    'iu', 'o', 'ong', 'ou', 'u', 'ua', 'uai', 'uan',
    'uang', 'ue', 'ui', 'un', 'uo', 'v', 've',
])


def _invalid_pingyin(tokens):
    """Build the exception raised for a syllable that cannot be split."""
    return ValueError('Invalid Pingyin entered: %s' % str(tokens))


def pingyin_spliter(pingyin):
    """Split one pinyin syllable into ``[consonant, vowel, tone]``.

    The input is lower-cased before it is tokenized. A missing consonant
    or vowel is returned as ``''`` and a missing tone digit as ``'0'``.

    Args:
        pingyin: A single syllable such as ``"zhong1"``, ``"a"`` or ``"er2"``.

    Returns:
        A list of three strings: consonant, vowel and tone digit.

    Raises:
        ValueError: If the syllable is empty, splits into more than three
            tokens, or its parts are not a recognised consonant/vowel/tone
            combination.
    """
    s_ping = _PINGYIN_TOKEN_RE.findall(pingyin.lower())
    count = len(s_ping)

    if count == 0 or count > 3:
        raise _invalid_pingyin(s_ping)

    # Pad to exactly three slots so the validation below sees one shape.
    if count == 1:
        if s_ping[0].isdigit():
            # A bare tone digit carries no syllable at all.
            raise _invalid_pingyin(s_ping)
        s_ping = [s_ping[0], '', '0']
    elif count == 2:
        if s_ping[1].isdigit():
            # "<letters><tone>": leave the middle slot empty for now.
            s_ping = [s_ping[0], '', s_ping[1]]
        else:
            # "<consonant><vowel>" without a tone digit.
            s_ping.append('0')

    first, second, tone = s_ping
    if not tone.isdigit():
        raise _invalid_pingyin(s_ping)

    if first in _PINGYIN_VOWELS and second == '':
        # Vowel-only syllable: move it from the consonant slot to the
        # vowel slot.
        return ['', first, tone]
    if first in _PINGYIN_CONSONANTS and second in _PINGYIN_VOWELS:
        return [first, second, tone]
    if first == 'ng':
        # "ng" is accepted on its own, with no vowel part.
        return ['ng', '', tone]

    raise _invalid_pingyin(s_ping)

def split_multiple(m_ping):
	"""Lower-case a string of syllables and split it on whitespace.

	Returns the list of syllables; an empty or all-whitespace string
	gives an empty list.
	"""
	return m_ping.lower().split()

def split_tone(pingyin):
    """Split one pinyin syllable into ``[letters, tone]``.

    The input is lower-cased first. A syllable with no trailing tone
    digit is given the tone ``'0'``.

    Args:
        pingyin: A single syllable such as ``"ma3"`` or ``"ma"``.

    Returns:
        A two-element list: the letter part and the tone digit.

    Raises:
        ValueError: If the input does not reduce to exactly one run of
            letters optionally followed by one trailing digit.
    """
    s_tone = re.findall(r"[0-9]$|[a-z]+", pingyin.lower())

    # A lone letter run has no tone mark; default it to tone '0'.
    if len(s_tone) == 1 and not s_tone[0].isdigit():
        s_tone.append('0')

    # Check the length first so empty or digit-only input is reported as
    # invalid instead of failing on an out-of-range index.
    if (len(s_tone) != 2
            or not s_tone[0].isalpha()
            or not s_tone[1].isdigit()):
        raise ValueError('Invalid Pingyin entered: %s' % str(s_tone))

    return s_tone