Normalization of text (3.6)

Section 3.6 of the whale book ("Natural Language Processing with Python").

>>> raw = """DENNIS: Listen, stange women lying in ponds distributing swords 
... is no basis for a system of government. Supreme executive power derives from
... a mandate from the masses, not from some farcical aquatic ceremony."""
>>> raw 
'DENNIS: Listen, stange women lying in ponds distributing swords \nis no basis for a system of government. Supreme executive power derives from\na mandate from the masses, not from some farcical aquatic ceremony.'
>>> tokens = nltk.word_tokenize(raw)
>>> tokens
['DENNIS', ':', 'Listen', ',', 'stange', 'women', 'lying', 'in', 'ponds', 'distributing', 'swords', 'is', 'no', 'basis', 'for', 'a', 'system', 'of', 'government.', 'Supreme', 'executive', 'power', 'derives', 'from', 'a', 'mandate', 'from', 'the', 'masses', ',', 'not', 'from', 'some', 'farcical', 'aquatic', 'ceremony', '.']

Comparing PorterStemmer and LancasterStemmer.

>>> porter = nltk.PorterStemmer()
>>> lancaster = nltk.LancasterStemmer()
>>> [porter.stem(t) for t in tokens]
['DENNI', ':', 'Listen', ',', 'stang', 'women', 'lie', 'in', 'pond', 'distribut', 'sword', 'is', 'no', 'basi', 'for', 'a', 'system', 'of', 'government.', 'Suprem', 'execut', 'power', 'deriv', 'from', 'a', 'mandat', 'from', 'the', 'mass', ',', 'not', 'from', 'some', 'farcic', 'aquat', 'ceremoni', '.']
>>> [lancaster.stem(t) for t in tokens]
['den', ':', 'list', ',', 'stang', 'wom', 'lying', 'in', 'pond', 'distribut', 'sword', 'is', 'no', 'bas', 'for', 'a', 'system', 'of', 'government.', 'suprem', 'execut', 'pow', 'der', 'from', 'a', 'mand', 'from', 'the', 'mass', ',', 'not', 'from', 'som', 'farc', 'aqu', 'ceremony', '.']
>>> 

I got an error when running Example 3-1.

>>> class IndexedText(object):
...     def __init__(self, stemmer, text):
...             self._text = text
...             self._stemmer = stemmer
...             self._index = nltk.Index((self._stem(word), i) for (i, word) in enumerate(text))
...     def concordance(self, word, width=40):
...             key = self._stem(word)
...             wc = int(width/4)
...             for i in self._index[key]:
...                     lconttext = ' '.join(self._text[i-wc:i])
...                     rconttext = ' '.join(self._text[i:i+wc])
...                     ldisplay = '%*s' % (width, lconttext[-width:])
...                     rdisplay = '%-*s' % (width, rconttext[:width])
...                     print ldisplay, rdisplay
...     def _stem(self, word):
...             return self._stemmer.stem(word).lower()
... 
>>> porter = nltk.PorterStemmer()
>>> grail = nltk.corpus.webtext.words('grail.txt')
>>> text = IndexedText(porter, grail)
>>> text.concordance('lie')
Traceback (most recent call last):
  File "<stdin>", line 1, in <module>
  File "<stdin>", line 10, in concordance
  File "/Library/Python/2.7/site-packages/nltk/corpus/reader/util.py", line 249, in __getitem__
    return self._cache[2][start-offset:stop-offset]
TypeError: slice indices must be integers or None or have an __index__ method

It seems the slicing failed, so I typed the class in again and retried.

>>> class IndexedText(object):
...     def __init__(self, stemmer, text):
...             self._text = text
...             self._stemmer = stemmer
...             self._index = nltk.Index((self._stem(word), i) for (i, word) in enumerate(text))
...     def concordance(self, word, width=40):
...             key = self._stem(word)
...             wc = int(width/4)
...             for i in self._index[key]:
...                     lcontext = ' '.join(self._text[i-wc:i])
...                     rcontext = ' '.join(self._text[i:i+wc])
...                     ldisplay = '%*s' % (width, lcontext[-width:])
...                     rdisplay = '%-*s' % (width, rcontext[:width])
...                     print ldisplay, rdisplay
...     def _stem(self, word):
...             return self._stemmer.stem(word).lower()
...
>>> text = IndexedText(porter, grail)
>>> text.concordance('lie')
r king ! DENNIS : Listen , strange women lying in ponds distributing swords is no
 beat a very brave retreat . ROBIN : All lies ! MINSTREL : [ singing ] Bravest of
       Nay . Nay . Come . Come . You may lie here . Oh , but you are wounded !
doctors immediately ! No , no , please ! Lie down . [ clap clap ] PIGLET : Well
ere is much danger , for beyond the cave lies the Gorge of Eternal Peril , which
   you . Oh ... TIM : To the north there lies a cave -- the cave of Caerbannog --
h it and lived ! Bones of full fifty men lie strewn about its lair . So , bravek
not stop our fight ' til each one of you lies dead , and the Holy Grail returnst
>>>

The error is gone, and "lie", "lies" and "lying" are all picked up.

Lemmatizer example:

>>> wnl = nltk.WordNetLemmatizer()
>>> [wnl.lemmatize(t) for t in tokens]
['DENNIS', ':', 'Listen', ',', 'stange', 'woman', 'lying', 'in', 'pond', 'distributing', 'sword', 'is', 'no', 'basis', 'for', 'a', 'system', 'of', 'government.', 'Supreme', 'executive', 'power', 'derives', 'from', 'a', 'mandate', 'from', 'the', 'mass', ',', 'not', 'from', 'some', 'farcical', 'aquatic', 'ceremony', '.']
>>> 

I spent a long time analyzing the error (without reaching the root cause), so I will stop here for today.