Exercise chapter 2 (16-19) - Deutschina's Tech Diary

16.

>>> for category in nltk.corpus.brown.categories():  # one pass per Brown corpus category
...     token = len(nltk.corpus.brown.words(categories=category))  # total number of word tokens in the category
...     vocab = len(set(nltk.corpus.brown.words(categories=category)))  # number of distinct tokens (case-sensitive)
...     divst = token / vocab  # tokens per distinct word; the float results below imply true division is in effect
...     print category, token, vocab, divst
... 
adventure 69342 8874 7.81406355646
belles_lettres 173096 18421 9.39666684762
editorial 61604 9890 6.22891809909
fiction 68488 9302 7.36271769512
government 70117 8181 8.57071262682
hobbies 82345 11935 6.89945538333
humor 21695 5017 4.32429738888
learned 181888 16859 10.7887775076
lore 110299 14503 7.60525408536
mystery 57169 6982 8.18805499857
news 100554 14394 6.9858274281
religion 39399 6373 6.18217479994
reviews 40704 8626 4.71875724554
romance 70022 8452 8.28466635116
science_fiction 14470 3233 4.4757191463
>>>

17.

Run a quick test before writing the function.

>>> stopwords = nltk.corpus.stopwords.words('english')
>>> fdist = FreqDist([w for w in nltk.corpus.brown.words(categories="romance") if w not in stopwords])
>>> fdist
<FreqDist with 8331 samples and 44216 outcomes>
>>> fdist.items()[:50]
[(',', 3899), ('.', 3736), ('``', 1045), ("''", 1044), ('I', 951), ('?', 690), ('He', 366), ('said', 330), ('!', 316), ('--', 291), (';', 264), ('would', 244), ('She', 232), ('The', 230), ('could', 193), ('like', 185), ('one', 166), ('It', 144), ('But', 135), ('And', 129), ('back', 126), ('thought', 105), ('You', 102), ("didn't", 101), ('little', 99), ('time', 93), ('get', 92), ('got', 89), ('know', 88), ('man', 87), ('never', 84), ('way', 83), ('went', 82), ("I'm", 77), ('eyes', 76), ('go', 76), ('came', 75), ('see', 74), ('come', 73), ('even', 73), ('old', 73), ('looked', 72), ('They', 69), ('knew', 69), ('much', 69), ('around', 68), ('There', 66), ('good', 65), ('long', 65), ('away', 64)]
>>>

Let's check that the stopwords are really excluded by comparing with the unfiltered distribution.

>>> fdist2 = FreqDist([w for w in nltk.corpus.brown.words(categories="romance")])
>>> fdist2.items()[:50]
[(',', 3899), ('.', 3736), ('the', 2758), ('and', 1776), ('to', 1502), ('a', 1335), ('of', 1186), ('``', 1045), ("''", 1044), ('was', 993), ('I', 951), ('in', 875), ('he', 702), ('had', 692), ('?', 690), ('her', 651), ('that', 583), ('it', 573), ('his', 559), ('she', 496), ('with', 460), ('you', 456), ('for', 410), ('at', 402), ('He', 366), ('on', 362), ('him', 339), ('said', 330), ('!', 316), ('--', 291), ('be', 289), ('as', 282), (';', 264), ('have', 258), ('but', 252), ('not', 250), ('would', 244), ('She', 232), ('The', 230), ('out', 217), ('were', 214), ('up', 211), ('all', 209), ('from', 202), ('could', 193), ('me', 193), ('like', 185), ('been', 179), ('so', 174), ('there', 169)]
>>>

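That seems OK: lowercase stopwords such as 'the' and 'and' are gone, although capitalized forms such as 'He' and 'The' remain because the comparison is case-sensitive. Now create the function.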

>>> def FreqDistNSW(s_words):  # return the first 50 (word, count) pairs of s_words after dropping stopwords
...     stopwords = nltk.corpus.stopwords.words('english')  # NLTK's English stopword list
...     fdist = FreqDist([w for w in s_words if w not in stopwords])  # exact-match filter: 'He', 'The' etc. are kept in the results below
...     return fdist.items()[:50]  # the results below come out in decreasing order of count
... 
>>> FreqDistNSW(nltk.corpus.brown.words(categories="romance"))
[(',', 3899), ('.', 3736), ('``', 1045), ("''", 1044), ('I', 951), ('?', 690), ('He', 366), ('said', 330), ('!', 316), ('--', 291), (';', 264), ('would', 244), ('She', 232), ('The', 230), ('could', 193), ('like', 185), ('one', 166), ('It', 144), ('But', 135), ('And', 129), ('back', 126), ('thought', 105), ('You', 102), ("didn't", 101), ('little', 99), ('time', 93), ('get', 92), ('got', 89), ('know', 88), ('man', 87), ('never', 84), ('way', 83), ('went', 82), ("I'm", 77), ('eyes', 76), ('go', 76), ('came', 75), ('see', 74), ('come', 73), ('even', 73), ('old', 73), ('looked', 72), ('They', 69), ('knew', 69), ('much', 69), ('around', 68), ('There', 66), ('good', 65), ('long', 65), ('away', 64)]
>>> FreqDistNSW(nltk.corpus.brown.words(categories="editorial"))
[(',', 2766), ('.', 2481), ('The', 453), ('``', 396), ("''", 382), ('?', 294), ('I', 201), (';', 196), ('--', 192), ('would', 180), (':', 154), ('one', 150), ('But', 118), ('It', 113), ('Mr.', 110), ('He', 96), ('(', 95), (')', 95), ('In', 90), ('new', 80), ('American', 77), ('United', 76), ('people', 75), ('may', 74), ('time', 72), ('A', 68), ('This', 67), ('first', 66), ('world', 66), ('If', 65), ('us', 64), ('many', 63), ('years', 63), ('We', 60), ('last', 59), ('two', 59), ('States', 58), ('good', 58), ('public', 58), ('could', 56), ('man', 56), ('And', 55), ('even', 55), ('New', 54), ('much', 54), ('war', 54), ('West', 53), ('make', 53), ('must', 53), ('East', 52)]
>>>

18.

Try a few things before writing the final version.

>>> bigrams = nltk.bigrams(nltk.corpus.brown.words(categories="romance"))
>>> fdist = FreqDist(bigrams)
>>> fdist.items()[:50]
[(('.', '``'), 739), ((',', 'and'), 479), (("''", '.'), 422), (('?', '?'), 345), (('.', 'He'), 311), (("''", ','), 275), (('in', 'the'), 250), (('.', 'I'), 243), (('of', 'the'), 233), (('.', 'The'), 196), (("''", '?'), 187), (('.', 'She'), 184), ((',', 'but'), 174), ((',', 'he'), 168), (('!', '!'), 158), ((',', 'the'), 144), (('to', 'the'), 135), (('on', 'the'), 133), ((';', ';'), 132), (('.', 'It'), 125), ((',', 'she'), 120), (('``', 'I'), 119), (('at', 'the'), 114), ((',', 'I'), 113), ((',', '``'), 112), (('and', 'the'), 111), (('to', 'be'), 111), (('.', 'But'), 110), (('it', 'was'), 104), (('was', 'a'), 97), (('said', ','), 95), (('.', 'And'), 93), (('he', 'had'), 91), (('said', '.'), 91), (('had', 'been'), 90), (('?', '``'), 87), (('he', 'was'), 85), ((',', 'a'), 84), (("''", '!'), 79), (('in', 'a'), 78), (('It', 'was'), 75), (('I', 'was'), 71), (('it', '.'), 71), (('I', 'had'), 69), (('from', 'the'), 69), (('and', 'I'), 67), (('with', 'the'), 66), (('with', 'a'), 65), (('him', '.'), 63), (('into', 'the'), 63)]

Then exclude stopwords.

>>> fdist = FreqDist([w for w in bigrams if w not in stopwords])
>>> fdist.items()[:50]
[(('.', '``'), 739), ((',', 'and'), 479), (("''", '.'), 422), (('?', '?'), 345), (('.', 'He'), 311), (("''", ','), 275), (('in', 'the'), 250), (('.', 'I'), 243), (('of', 'the'), 233), (('.', 'The'), 196), (("''", '?'), 187), (('.', 'She'), 184), ((',', 'but'), 174), ((',', 'he'), 168), (('!', '!'), 158), ((',', 'the'), 144), (('to', 'the'), 135), (('on', 'the'), 133), ((';', ';'), 132), (('.', 'It'), 125), ((',', 'she'), 120), (('``', 'I'), 119), (('at', 'the'), 114), ((',', 'I'), 113), ((',', '``'), 112), (('and', 'the'), 111), (('to', 'be'), 111), (('.', 'But'), 110), (('it', 'was'), 104), (('was', 'a'), 97), (('said', ','), 95), (('.', 'And'), 93), (('he', 'had'), 91), (('said', '.'), 91), (('had', 'been'), 90), (('?', '``'), 87), (('he', 'was'), 85), ((',', 'a'), 84), (("''", '!'), 79), (('in', 'a'), 78), (('It', 'was'), 75), (('I', 'was'), 71), (('it', '.'), 71), (('I', 'had'), 69), (('from', 'the'), 69), (('and', 'I'), 67), (('with', 'the'), 66), (('with', 'a'), 65), (('him', '.'), 63), (('into', 'the'), 63)]

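No change? Something is wrong in the condition: each element of bigrams is a pair of words, and a pair never matches a single stopword string, so nothing is filtered out.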

>>> fdist = FreqDist([w for w in bigrams if w[0] not in stopwords])
>>> fdist.items()[:50]
[(('.', '``'), 739), ((',', 'and'), 479), (("''", '.'), 422), (('?', '?'), 345), (('.', 'He'), 311), (("''", ','), 275), (('.', 'I'), 243), (('.', 'The'), 196), (("''", '?'), 187), (('.', 'She'), 184), ((',', 'but'), 174), ((',', 'he'), 168), (('!', '!'), 158), ((',', 'the'), 144), ((';', ';'), 132), (('.', 'It'), 125), ((',', 'she'), 120), (('``', 'I'), 119), ((',', 'I'), 113), ((',', '``'), 112), (('.', 'But'), 110), (('said', ','), 95), (('.', 'And'), 93), (('said', '.'), 91), (('?', '``'), 87), ((',', 'a'), 84), (("''", '!'), 79), (('It', 'was'), 75), (('I', 'was'), 71), (('I', 'had'), 69), (('.', 'They'), 60), (('He', 'was'), 57), (('``', 'You'), 57), (('.', 'There'), 55), (('.', 'In'), 46), (('like', 'a'), 45), ((',', 'for'), 42), ((',', 'with'), 42), (('would', 'be'), 41), (("''", '--'), 40), (('.', 'His'), 40), ((',', 'in'), 37), (('.', 'Then'), 36), (('I', 'said'), 36), (('.', 'When'), 35), (('.', 'You'), 35), (('``', "I'm"), 35), (('going', 'to'), 35), ((',', 'it'), 34), ((',', 'then'), 34)]

This version excludes a bigram only when its first word is a stopword. To meet the requirement, both words need to be checked, as follows.

>>> fdist = FreqDist([w for w in bigrams if w[0] not in stopwords and w[1] not in stopwords])
>>> fdist.items()[:50]
[(('.', '``'), 739), (("''", '.'), 422), (('?', '?'), 345), (('.', 'He'), 311), (("''", ','), 275), (('.', 'I'), 243), (('.', 'The'), 196), (("''", '?'), 187), (('.', 'She'), 184), (('!', '!'), 158), ((';', ';'), 132), (('.', 'It'), 125), (('``', 'I'), 119), ((',', 'I'), 113), ((',', '``'), 112), (('.', 'But'), 110), (('said', ','), 95), (('.', 'And'), 93), (('said', '.'), 91), (('?', '``'), 87), (("''", '!'), 79), (('.', 'They'), 60), (('``', 'You'), 57), (('.', 'There'), 55), (('.', 'In'), 46), (("''", '--'), 40), (('.', 'His'), 40), (('.', 'Then'), 36), (('I', 'said'), 36), (('.', 'When'), 35), (('.', 'You'), 35), (('``', "I'm"), 35), (('I', 'could'), 34), (('.', 'A'), 33), (('?', 'She'), 32), (('.', 'This'), 29), (('``', 'What'), 28), (('I', "don't"), 27), (('?', 'He'), 26), (('I', "didn't"), 26), (('I', 'thought'), 26), (('.', 'Her'), 25), (('``', 'Well'), 25), (('.', 'At'), 24), ((':', '``'), 24), (('--', '``'), 23), (('.', 'We'), 23), (('?', 'I'), 23), (('Well', ','), 23), (('thought', ','), 23)]
>>>

This result does not seem very meaningful, does it? Most of the remaining bigrams involve punctuation.

19.

>>> genre_word = [(genre, word)
...     for genre in ['adventure', 'fiction', 'government', 'news']
...     for word in nltk.corpus.brown.words(categories=genre)]

>>> 
>>> len(genre_word)
308501
>>> cfd = nltk.ConditionalFreqDist(genre_word)
>>> cfd
<ConditionalFreqDist with 4 conditions>
>>> cfd.conditions()
['adventure', 'fiction', 'government', 'news']
>>> words = ['love', 'city', 'sun', 'train', 'car']
>>> cfd.tabulate(samples=words)
           love city  sun train  car
 adventure    9    5   21   10   36
   fiction   16   18    6   12   12
government    1    7    0    2    3
      news    3   49    2    4   50
>>>

Do adventurers take trains even on their journeys?