Incrementing dictionary values (5.3.5) - Deutschina's Tech Diary

Count the number of words for each tag.

>>> counts = nltk.defaultdict(int)
>>> from nltk.corpus import brown
>>> for (word, tag) in brown.tagged_words(categories='news'):
...     counts[tag] += 1
... 
>>> counts['N']
0

The result was different from the textbook.

>>> list(counts)
['BE', 'BEZ-HL', 'NP$', 'WQL', 'AT-TL', 'BEDZ*', 'WDT', 'JJ', 'NR-HL', 'AP$', 'RP', 'WPS+BEZ', 'JJ-NC', '(', 'PPSS+BER', ',', 'VBN-TL-HL', 'HVD-HL', 'PPSS+BEM', 'NPS-HL', 'RB', 'FW-PP$-NC', 'JJ-HL', 'NNS', 'WRB', 'MD-TL', 'NN-NC', 'DOD*', 'NN$', 'PPLS', ')-HL', 'BEZ*', 'RB-HL', 'NNS$', 'NPS-TL', 'NNS-HL', 'FW-IN+NN-TL', '--', 'BER-TL', 'OD', 'PP$$', 'CC-TL', 'FW-NN-TL', 'NP-TL-HL', 'AP-TL', 'PPSS+MD', 'FW-JJ', 'FW-DT', 'BER*', 'FW-WDT', 'NPS', 'DTI', 'BEN', 'BEM', 'EX+BEZ', 'HV', 'BEG', 'BED', 'HVD', 'BEZ', 'DTX', 'FW-VB-NC', 'VBZ', 'DTS', 'RB-TL', 'VB-TL', 'NNS-TL', 'FW-CC', 'CS-HL', 'NP$-TL', 'FW-CD', 'ABN-HL', 'IN-HL', 'JJT-HL', 'BED*', 'BEDZ', 'NN-TL-HL', 'PN', 'JJR-HL', 'FW-AT-TL', 'PPSS+HVD', 'VBD-HL', 'MD-HL', 'NNS-TL-HL', 'DTI-HL', 'EX', 'VBN-HL', 'NNS$-HL', 'PPSS-HL', 'MD', 'BE-HL', 'TO-TL', 'NN-HL', 'VBZ-HL', 'NR$-TL', 'DT$', 'WP$', 'N', 'MD+HV', 'TO-HL', 'PPS+BEZ', 'DT-HL', 'CD$', 'VBG', 'VBD', 'VBN-TL', 'DOZ*', 'VBN', 'DOD', 'UH-TL', 'DOZ', 'NR-TL', 'AP-HL', 'AT-HL', '.', 'FW-AT', 'NN', '(-HL', 'MD*-HL', '*', 'WPS', 'WPO', 'FW-NNS', 'NP', 'JJR-NC', 'NR', ':', 'BER-HL', 'MD*', '``', ':-HL', 'RP-HL', 'CC', 'PP$-TL', 'WDT+BEZ', 'CD-HL', 'NPS$-TL', 'CD', 'DT+BEZ', ',-HL', 'OD-HL', 'PPS+MD', 'CS', 'NN$-HL', 'NP-TL', 'QL-TL', 'DO*', 'PPS+BEZ-HL', 'VB-HL', 'DO-HL', 'HVN', 'JJT', 'JJS', 'JJR', 'HVG', 'HVZ', 'PN+HVZ', 'NNS$-TL', 'CC-HL', 'JJ-TL', 'HVZ*', 'VBG-TL', 'DO', 'FW-JJ-TL', 'FW-*', 'NP+BEZ', 'NP-HL', 'NPS$', 'NN-TL', 'PPSS', 'NR$', "''", 'BER', 'FW-VB', 'PN-HL', 'CD-TL', 'BEDZ-HL', 'DT', 'VBD-TL', 'PN$', 'VB+PPO', ')', 'VBG-HL', 'PPO', 'PPL', 'PPS', 'TO', 'RB$', 'FW-IN+NN', 'UH', 'VB', 'OD-TL', 'FW-IN', 'PP$', 'RBT', 'ABL', 'RBR', 'ABN', 'AP', 'PPSS+HV', 'AT', 'JJS-TL', 'IN', 'ABX', '*-HL', 'FW-AT-HL', 'HVD*', "'", 'JJR-TL', 'RB+BEZ', 'NN$-TL', 'FW-IN-TL', 'QLP', 'IN-TL', 'FW-NN', 'FW-IN+AT-TL', 'PPS+HVZ', 'QL', '.-HL']
>>> counts['BE']
525
>>>

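The Brown corpus has no tag named 'N'; it shows up in the list above only because looking up counts['N'] made the defaultdict insert it with the default value 0. The textbook probably used simplified tags, so let's reset counts to an empty defaultdict and change the loop a little to pass simplify_tags=True.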

>>> for (word, tag) in brown.tagged_words(categories='news', simplify_tags=True):
...     counts[tag] += 1
... 
>>> counts['N']
22226
>>> list(counts)
['', 'FW', 'DET', 'WH', "''", 'VBZ', 'VB+PPO', "'", ')', 'ADJ', 'PRO', '*', ',', '.', 'TO', 'NUM', 'NP', ':', 'ADV', '``', 'VD', 'VG', 'VN', 'N', 'P', 'EX', 'V', 'CNJ', 'UH', '(', 'MOD']

Now it works as expected!

>>> from operator import itemgetter
>>> sorted(counts.items(), key=itemgetter(1), reverse=True)
[('N', 22226), ('DET', 11602), ('P', 10845), ('NP', 8336), ('V', 6392), ('ADJ', 5435), (',', 5188), ('.', 4472), ('CNJ', 4227), ('PRO', 3408), ('ADV', 2770), ('VD', 2531), ('NUM', 2508), ('VN', 2410), ('VG', 1425), ('TO', 1244), ('WH', 1101), ('MOD', 1082), ('``', 732), ("''", 702), ('VBZ', 558), ('', 300), ('*', 257), (')', 171), ('(', 168), ('EX', 163), (':', 149), ('FW', 92), ("'", 46), ('UH', 13), ('VB+PPO', 1)]
>>> [t for t, c in sorted(counts.items(), key=itemgetter(1), reverse=True)]
['N', 'DET', 'P', 'NP', 'V', 'ADJ', ',', '.', 'CNJ', 'PRO', 'ADV', 'VD', 'NUM', 'VN', 'VG', 'TO', 'WH', 'MOD', '``', "''", 'VBZ', '', '*', ')', '(', 'EX', ':', 'FW', "'", 'UH', 'VB+PPO']

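itemgetter() is really new to me. itemgetter(1) returns a function that fetches the element at index 1 of whatever is passed to it, so here it sorts the (tag, count) pairs by count.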

>>> pair = ('NP', 9123)
>>> pair[1]
9123
>>> itemgetter(1)(pair)
9123

>>> last_letters = nltk.defaultdict(list)
>>> words = nltk.corpus.words.words('en')
>>> for word in words:
...     key = word[-2:]
...     last_letters[key].append(word)
... 
>>> last_letters['ly']
['abactinally', 'abandonedly', 'abasedly', 'abashedly', 'abashlessly', 
....
>>> last_letters['py']
['abdominoscopy', 'actinoscopy', 'actinostereoscopy', 'actinotherapy', 'aeolotropy',

What kind of processing is the next example doing?

>>> anagrams = nltk.defaultdict(list)
>>> for word in words:
...     key = ''.join(sorted(word))
...     anagrams[key].append(word)
... 
>>> anagrams['aeilnrt']
['entrail', 'latrine', 'ratline', 'reliant', 'retinal', 'trenail']

The easiest way to understand it is to look inside anagrams.

>>> anagrams
....
'eghlnt': ['length'], 'Kaelort': ['Keratol'], 'eeilnoorrstu': ['resolutioner']})

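The characters of each word are sorted into alphabetical order, and the sorted string is used as the key, so all words made of the same letters end up in the same list.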

>>> anagrams['aer']
['aer', 'are', 'ear', 'era', 'rea']
>>> anagrams['aber']
['bare', 'bear', 'brae']

The same thing can be done with nltk.Index().

>>> anagrams = nltk.Index((''.join(sorted(w)), w) for w in words)
>>> anagrams['aeilnrt']
['entrail', 'latrine', 'ratline', 'reliant', 'retinal', 'trenail']