Picking up combinations of Pinyin and Chinese characters

It's time to check combinations of Pinyin and Chinese characters (Hanzi). First, I created a new function named split_per_hanzi().

def split_per_hanzi(word_list):
	# ['你好', 'ni3 hao3'] --> [['ni3', '你'], ['hao3', '好']]

	s_hanz = []
	ping_hanz = []

	try:
		if len(word_list) != 2:
			raise Exception('Invalid format: %s' % str(word_list))

		s_ping = split_multiple(word_list[1])
		# The Hanzi field is a UTF-8 byte string, so step through it
		# three bytes at a time (one CJK character per slice).
		for var in range(0, len(word_list[0]), 3):
			hanzi = word_list[0][var:var+3]
			# Compare against the separators as UTF-8 bytes, not unicode,
			# so the test actually matches the byte slices.
			if hanzi != u'−'.encode('utf-8') and hanzi != u','.encode('utf-8'):
				s_hanz.append(hanzi)

		if len(s_ping) == len(s_hanz):
			for var in range(0, len(s_ping)):
				ping_hanz.append([s_ping[var], s_hanz[var]])
		else:	# Pinyin and Hanzi counts do not match
			raise Exception('Recovery failed: %s' % str(word_list))

		return ping_hanz

	except Exception:
		# A bare raise re-raises the current exception with its traceback.
		raise
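
split_per_hanzi() depends on split_multiple(), which was defined in an earlier step. For readers following along, here is a minimal stand-in, assuming the Pinyin field only needs to be split on whitespace:

def split_multiple(ping_string):
	# 'ni3 hao3' --> ['ni3', 'hao3']
	return ping_string.split()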

Note: split_per_hanzi() operates on UTF-8 encoded byte strings rather than unicode objects; every common Chinese character encodes to exactly three bytes in UTF-8, which is why the loop slices three bytes at a time.
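
The three-byte assumption is easy to verify in the interpreter; two Hanzi come out as six bytes:

>>> u'你好'.encode('utf-8')
'\xe4\xbd\xa0\xe5\xa5\xbd'
>>> len(u'你好'.encode('utf-8'))
6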

Now generate a list.

>>> import csv
>>> import codecs
>>> import_file = codecs.open('/Users/ken/Documents/workspace/NLTK Learning/text files/ChineseWords.csv')
>>> raw_data=[]
>>> for row in csv.reader(import_file):
...     raw_data.append(row)
... 
>>> ping_hanzi = []
>>> for elm in raw_data:
...     ping_hanzi.append(split_per_hanzi(elm))
... 
>>> ping_hanzi[:30]
[[['an4', '\xe6\x8c\x89'], ['lai2', '\xe6\x9d\xa5']], [['ao2', '\xe7\x86\xac'], ['ye4', '\xe5\xa4\x9c']], [['ao4', '\xe6\x87\x8a'], ['nao3', '\xe6\x81\xbc']], [['ba1', '\xe5\xb7\xb4'], ['bu', '\xe4\xb8\x8d'], ['de', '\xe5\xbe\x97']], [['ba1', '\xe5\x90\xa7'], ['da1', '\xe5\x97\x92']], [['ba3', '\xe6\x8a\x8a'], ['wo4', '\xe6\x8f\xa1']], [['ba4', '\xe9\x9c\xb8']], [['bai2', '\xe7\x99\xbd'], ['tou2', '\xe5\xa4\xb4'], ['diao1', '\xe9\x9b\x95']], [['bai3', '\xe6\x91\x86'], ['tuo1', '\xe8\x84\xb1']], [['ban1', '\xe6\x90\xac'], ['yun4', '\xe8\xbf\x90'], ['gong1', '\xe5\xb7\xa5']], [['ban3', '\xe6\x9d\xbf'], ['lan2', '\xe8\x93\x9d'], ['gen1', '\xe6\xa0\xb9']], [['ban4', '\xe5\x8d\x8a'], ['ye4', '\xe5\xa4\x9c']], [['bang1', '\xe5\xb8\xae']], [['bao1', '\xe5\x8c\x85'], ['kuo4', '\xe6\x8b\xac']], [['bao1', '\xe5\x8c\x85'], ['zhuang1', '\xe8\xa3\x85']], [['bao1', '\xe8\xa4\x92'], ['bian3', '\xe8\xb4\xac']], [['bao3', '\xe5\xae\x9d'], ['bei4', '\xe8\xb4\x9d']], [['bao3', '\xe4\xbf\x9d'], ['chi2', '\xe6\x8c\x81']], [['bao3', '\xe4\xbf\x9d'], ['liu2', '\xe7\x95\x99']], [['bao3', '\xe4\xbf\x9d'], ['mu3', '\xe5\xa7\x86']], [['bao3', '\xe4\xbf\x9d'], ['shou3', '\xe5\xae\x88']], [['bao4', '\xe6\x8a\xb1'], ['yuan4', '\xe6\x80\xa8']], [['bei4', '\xe8\x83\x8c'], ['jing3', '\xe6\x99\xaf']], [['bei4', '\xe8\xa2\xab'], ['po4', '\xe8\xbf\xab']], [['ben3', '\xe6\x9c\xac'], ['fen4', '\xe5\x88\x86']], [['ben3', '\xe6\x9c\xac'], ['neng2', '\xe8\x83\xbd']], [['ben3', '\xe6\x9c\xac'], ['zhi4', '\xe8\xb4\xa8']], [['bi1', '\xe9\x80\xbc']], [['bi3', '\xe6\xaf\x94'], ['bi3', '\xe6\xaf\x94'], ['jie1', '\xe7\x9a\x86'], ['shi4', '\xe6\x98\xaf']], [['bi3', '\xe6\xaf\x94'], ['qiu1', '\xe4\xb8\x98'], ['ni2', '\xe5\xb0\xbc']]]

This time, I ignore the words themselves and just focus on each single Chinese character.

>>> hanzi_element = []
>>> for elm1 in ping_hanzi:
...     for elm2 in elm1:
...             hanzi_element.append(elm2)
... 
>>> hanzi_element[:10]
[['an4', '\xe6\x8c\x89'], ['lai2', '\xe6\x9d\xa5'], ['ao2', '\xe7\x86\xac'], ['ye4', '\xe5\xa4\x9c'], ['ao4', '\xe6\x87\x8a'], ['nao3', '\xe6\x81\xbc'], ['ba1', '\xe5\xb7\xb4'], ['bu', '\xe4\xb8\x8d'], ['de', '\xe5\xbe\x97'], ['ba1', '\xe5\x90\xa7']]
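
As a side note, the same flattening can be written with itertools, which avoids the nested loop:

>>> import itertools
>>> hanzi_element = list(itertools.chain.from_iterable(ping_hanzi))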

Let's check how many Hanzi (Chinese characters) are included.

>>> len(hanzi_element)
11429

There must be duplicate entries. In this case, set(hanzi_element) cannot be used, because each element of hanzi_element is a list, and lists are unhashable.

>>> hanzi_element_set = []
>>> for elm in hanzi_element:
...     if elm not in hanzi_element_set:
...             hanzi_element_set.append(elm)
... 
>>> len(hanzi_element_set)
2913
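
As an aside, converting each pair to a tuple makes it hashable, so a set can be used after all; this is much faster than the membership test above and yields the same count:

>>> len(set(tuple(elm) for elm in hanzi_element))
2913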

OK, so 2913 unique Pinyin-Hanzi pairs were found. In this result, some Hanzi may still appear more than once, because some Chinese characters have more than one Pinyin reading, the so-called duoyinzi (多音字); 长 (chang2/zhang3) is a typical example.
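
Such duoyinzi can be listed by grouping the unique pairs by character and keeping the characters with more than one reading; a quick sketch:

>>> from collections import defaultdict
>>> readings = defaultdict(set)
>>> for (p, w) in hanzi_element_set:
...     readings[w].add(p)
... 
>>> duoyinzi = [w for (w, ps) in readings.items() if len(ps) > 1]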

Next, group the pairs by Pinyin and print the Chinese characters.

>>> pings = sorted(set(p for (p, w) in hanzi_element_set))
>>> for ping in pings:
...     words = [w for (p, w) in sorted(hanzi_element_set) if p == ping]
...     print ping, ':'
...     for word in words:
...             print word,
...     print ''
... 
? :
庆 役 饪 
a :
啊 
a1 :
阿 
ai1 :
哀 哎 唉 挨 
ai2 :
癌 
....

Due to a quality problem in the original data, a strange entry (?) is displayed at the beginning. Anyway, most of the other parts seem reliable.

Note:
I ran into an error caused by a BOM (byte order mark) at the start of the file. My workaround inside split_per_hanzi() looks like this:

....
		s_ping = split_multiple(word_list[1])
		# Strip a leading UTF-8 BOM (requires "import re" at module level).
		if word_list[0].startswith('\xef\xbb\xbf'):
			word_list[0] = re.sub(r"\xef\xbb\xbf", "", word_list[0]).strip('"')
		for var in range(0, len(word_list[0]), 3):
....
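
An alternative would be to skip the BOM once at read time, before the rows ever reach split_per_hanzi(); a sketch, reusing the same file path:

>>> import_file = open('/Users/ken/Documents/workspace/NLTK Learning/text files/ChineseWords.csv', 'rb')
>>> if import_file.read(3) != '\xef\xbb\xbf':
...     import_file.seek(0)
... 
>>> raw_data = [row for row in csv.reader(import_file)]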