Vocabulary resources

Continuing O'Reilly's textbook chapter 2.4.1:

I created the following code and saved it under the name unusual_words.py.

def unusual_words(text, english_vocab=None):
    """Return the words of *text* that are missing from an English word list.

    Only purely alphabetic tokens are considered, and the comparison is
    case-insensitive: both the tokens and the word list are lower-cased
    before they are compared.

    Args:
        text: An iterable of string tokens (for example a corpus word list).
        english_vocab: Optional iterable of known words. When omitted, the
            NLTK ``words`` corpus is used, as in the original version.

    Returns:
        An alphabetically sorted list of the distinct lower-cased tokens
        that do not appear in the word list.
    """
    if english_vocab is None:
        # Imported lazily so NLTK is only required when the default word
        # list is actually used.
        from nltk.corpus import words
        english_vocab = words.words()

    text_vocab = {w.lower() for w in text if w.isalpha()}
    known_words = {w.lower() for w in english_vocab}
    # Set difference keeps the words found only in text_vocab.
    return sorted(text_vocab - known_words)

text_vocab.difference() returns the words that are in text_vocab but not in english_vocab.
Then I tried to import the module, but got an error.

>>> import nltk
>>> from nltk.corpus import brown
>>> from unusual_words import *

Traceback (most recent call last):
  File "<pyshell#3>", line 1, in <module>
    from unusual_words import *
ImportError: No module named unusual_words

The reason is quite clear: the module search path (sys.path) does not include the location of unusual_words.py. The last time I saved a file, the path seemed to be updated automatically when saving.

Let's check the path.

>>> import sys
>>> print sys.path
['', '/Users/xxx/Documents', '/Library/Python/2.7/site-packages/setuptools-0.6c11-py2.7.egg', '/Library/Python/2.7/site-packages/pip-1.3.1-py2.7.egg', '/Library/Python/2.7/site-packages/ipython-0.13.2-py2.7.egg', '/System/Library/Frameworks/Python.framework/Versions/2.7/Extras/lib/python', '/Library/Frameworks/Python.framework/Versions/2.7/lib/python27.zip', '/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7', '/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/plat-darwin', '/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/plat-mac', '/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/plat-mac/lib-scriptpackages', '/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/lib-tk', '/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/lib-old', '/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/lib-dynload', '/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages', '/Library/Python/2.7/site-packages']

My assumption was correct: the scripts folder is not in the list. So I added the path again.

>>> sys.path.append('/Users/ken/Documents/workspace/NLTK learning/scripts')
>>> print sys.path
['', '/Users/xxx/Documents', '/Library/Python/2.7/site-packages/setuptools-0.6c11-py2.7.egg', '/Library/Python/2.7/site-packages/pip-1.3.1-py2.7.egg', '/Library/Python/2.7/site-packages/ipython-0.13.2-py2.7.egg', '/System/Library/Frameworks/Python.framework/Versions/2.7/Extras/lib/python', '/Library/Frameworks/Python.framework/Versions/2.7/lib/python27.zip', '/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7', '/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/plat-darwin', '/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/plat-mac', '/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/plat-mac/lib-scriptpackages', '/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/lib-tk', '/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/lib-old', '/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/lib-dynload', '/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages', '/Library/Python/2.7/site-packages', '/Users/xxx/Documents/workspace/NLTK learning/scripts']

Now it's ok.

>>> from unusual_words import *
>>> 

Now ready to try "unusual_words".

>>> from unusual_words import *
>>> from nltk.corpus import gutenberg
>>> unusual_words(nltk.corpus.gutenberg.words('austen-sense.txt'))
['abbeyland', 'abhorred', 'abilities', 'abounded', 'abridgement', 'abused', 'abuses', 'accents', 
....
'workmen', 'worlds', 'wrapt', 'writes', 'yards', 'years', 'yielded', 'youngest']
>>> 
>>> unusual_words(nltk.corpus.nps_chat.words())
['aaaaaaaaaaaaaaaaa', 'aaahhhh', 'abortions', 'abou', 'abourted', 'abs', 'ack', 'acros', 'actualy',
....
'yup', 'yuuuuuuuuuuuummmmmmmmmmmm', 'yvw', 'yw', 'zebrahead', 'zoloft', 'zyban', 'zzzzzzzing', 'zzzzzzzz']
>>> 

I have already tried stopwords while working through another textbook as well.

>>> from nltk.corpus import stopwords
>>> stopwords.words('english')
['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', 'her', 'hers', 'herself', 'it', 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', 'too', 'very', 's', 't', 'can', 'will', 'just', 'don', 'should', 'now']
>>> 

This function calculates what fraction of the words in a text are not stopwords.

>>> def content_fraction(text):
...     stopwords = nltk.corpus.stopwords.words('english')
...     content = [w for w in text if w.lower() not in stopwords]
...     return len(content) / len(text)
... 
>>> content_fraction(nltk.corpus.reuters.words())
0
>>> from __future__ import division
>>> content_fraction(nltk.corpus.reuters.words())
0
>>> def content_fraction(text):
...     stopwords = nltk.corpus.stopwords.words('english')
...     content = [w for w in text if w.lower() not in stopwords]
...     return len(content) / len(text)
... 
>>> content_fraction(nltk.corpus.reuters.words())
0.7364374824583169
>>> 

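The first two calls returned 0 because "/" performs integer division in Python 2, and "from __future__ import division" only affects code compiled after the import, so I had to define the function again. The result was slightly different from the textbook. Anyway, I would say about 26% of the words are stopwords.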

EGIVRVONL puzzle:

>>> puzzle_letters = nltk.FreqDist('egivrvonl')
>>> obligatory = 'r'

>>> wordlist = nltk.corpus.words.words()
>>> [w for w in wordlist if len(w) >= 6
...     and obligatory in w
...     and nltk.FreqDist(w) <= puzzle_letters]
['glover', 'gorlin', 'govern', 'grovel', 'ignore', 'involver', 'lienor', 'linger', 'longer', 'lovering', 'noiler', 'overling', 'region', 'renvoi', 'revolving', 'ringle', 'roving', 'violer', 'virole']

How about words of 4 letters or more? I was afraid it might take a long time, as it took several seconds for words of 6 letters or more, but it finished in around 5 seconds.

>>> [w for w in wordlist if len(w) >= 4
...     and obligatory in w
...     and nltk.FreqDist(w) <= puzzle_letters]
['enrol', 'ergon', 'genro', 'girl', 'girn', 'giro', 'giver', 'glor', 'glore', 'glover', 'goer', 'goner', 'gore', 'gorlin', 'govern', 'grein', 'grin', 'groin', 'grove', 'grovel', 'ignore', 'inro', 'involver', 'iron', 'irone', 'levir', 'lienor', 'lier', 'liner', 'linger', 'lire', 'liver', 'livor', 'livre', 'loir', 'longer', 'lore', 'lori', 'lorn', 'lover', 'lovering', 'negro', 'nigre', 'noiler', 'noir', 'nori', 'norie', 'ogler', 'ogre', 'oiler', 'oner', 'oriel', 'orle', 'over', 'overling', 'regin', 'region', 'reign', 'rein', 'renvoi', 'reoil', 'revolving', 'rigol', 'rile', 'rine', 'ring', 'ringe', 'ringle', 'rive', 'rivel', 'riven', 'roil', 'role', 'rone', 'rove', 'roving', 'vergi', 'veri', 'vier', 'vigor', 'viner', 'violer', 'vire', 'vireo', 'virl', 'virole', 'viron', 'viver', 'girl', 'iron', 'over', 'ring']
>>> 

Using the names corpus, find the names that appear in both the male and the female lists.

>>> names = nltk.corpus.names
>>> names.fileids()
['female.txt', 'male.txt']
>>> male_names = names.words('male.txt')
>>> female_names = names.words('female.txt')
>>> [w for w in male_names if w in female_names]
['Abbey', 'Abbie', 'Abby', 'Addie', 'Adrian', 'Adrien', 'Ajay', 'Alex', 'Alexis', 'Alfie', 'Ali', 'Alix', 'Allie', 'Allyn', 'Andie', 'Andrea', 'Andy', 'Angel', 'Angie', 'Ariel', 'Ashley', 'Aubrey', 'Augustine', 'Austin', 'Averil', 'Barrie', 'Barry', 'Beau', 'Bennie', 'Benny', 'Bernie', 'Bert', 'Bertie', 'Bill', 'Billie', 'Billy', 'Blair', 'Blake', 'Bo', 'Bobbie', 'Bobby', 'Brandy', 'Brett', 'Britt', 'Brook', 'Brooke', 'Brooks', 'Bryn', 'Cal', 'Cam', 'Cammy', 'Carey', 'Carlie', 'Carlin', 'Carmine', 'Carroll', 'Cary', 'Caryl', 'Casey', 'Cass', 'Cat', 'Cecil', 'Chad', 'Chris', 'Chrissy', 'Christian', 'Christie', 'Christy', 'Clair', 'Claire', 'Clare', 'Claude', 'Clem', 'Clemmie', 'Cody', 'Connie', 'Constantine', 'Corey', 'Corrie', 'Cory', 'Courtney', 'Cris', 'Daffy', 'Dale', 'Dallas', 'Dana', 'Dani', 'Daniel', 'Dannie', 'Danny', 'Darby', 'Darcy', 'Darryl', 'Daryl', 'Deane', 'Del', 'Dell', 'Demetris', 'Dennie', 'Denny', 'Devin', 'Devon', 'Dion', 'Dionis', 'Dominique', 'Donnie', 'Donny', 'Dorian', 'Dory', 'Drew', 'Eddie', 'Eddy', 'Edie', 'Elisha', 'Emmy', 'Erin', 'Esme', 'Evelyn', 'Felice', 'Fran', 'Francis', 'Frank', 'Frankie', 'Franky', 'Fred', 'Freddie', 'Freddy', 'Gabriel', 'Gabriell', 'Gail', 'Gale', 'Gay', 'Gayle', 'Gene', 'George', 'Georgia', 'Georgie', 'Geri', 'Germaine', 'Gerri', 'Gerry', 'Gill', 'Ginger', 'Glen', 'Glenn', 'Grace', 'Gretchen', 'Gus', 'Haleigh', 'Haley', 'Hannibal', 'Harley', 'Hazel', 'Heath', 'Henrie', 'Hilary', 'Hillary', 'Holly', 'Ike', 'Ikey', 'Ira', 'Isa', 'Isador', 'Isadore', 'Jackie', 'Jaime', 'Jamie', 'Jan', 'Jean', 'Jere', 'Jermaine', 'Jerrie', 'Jerry', 'Jess', 'Jesse', 'Jessie', 'Jo', 'Jodi', 'Jodie', 'Jody', 'Joey', 'Jordan', 'Juanita', 'Jude', 'Judith', 'Judy', 'Julie', 'Justin', 'Karel', 'Kellen', 'Kelley', 'Kelly', 'Kelsey', 'Kerry', 'Kim', 'Kip', 'Kirby', 'Kit', 'Kris', 'Kyle', 'Lane', 'Lanny', 'Lauren', 'Laurie', 'Lee', 'Leigh', 'Leland', 'Lesley', 'Leslie', 'Lin', 'Lind', 'Lindsay', 'Lindsey', 'Lindy', 'Lonnie', 'Loren', 'Lorne', 
'Lorrie', 'Lou', 'Luce', 'Lyn', 'Lynn', 'Maddie', 'Maddy', 'Marietta', 'Marion', 'Marlo', 'Martie', 'Marty', 'Mattie', 'Matty', 'Maurise', 'Max', 'Maxie', 'Mead', 'Meade', 'Mel', 'Meredith', 'Merle', 'Merrill', 'Merry', 'Meryl', 'Michal', 'Michel', 'Michele', 'Mickie', 'Micky', 'Millicent', 'Morgan', 'Morlee', 'Muffin', 'Nat', 'Nichole', 'Nickie', 'Nicky', 'Niki', 'Nikki', 'Noel', 'Ollie', 'Page', 'Paige', 'Pat', 'Patrice', 'Patsy', 'Pattie', 'Patty', 'Pen', 'Pennie', 'Penny', 'Perry', 'Phil', 'Pooh', 'Quentin', 'Quinn', 'Randi', 'Randie', 'Randy', 'Ray', 'Regan', 'Reggie', 'Rene', 'Rey', 'Ricki', 'Rickie', 'Ricky', 'Rikki', 'Robbie', 'Robin', 'Ronnie', 'Ronny', 'Rory', 'Ruby', 'Sal', 'Sam', 'Sammy', 'Sandy', 'Sascha', 'Sasha', 'Saundra', 'Sayre', 'Scotty', 'Sean', 'Shaine', 'Shane', 'Shannon', 'Shaun', 'Shawn', 'Shay', 'Shayne', 'Shea', 'Shelby', 'Shell', 'Shelley', 'Sibyl', 'Simone', 'Sonnie', 'Sonny', 'Stacy', 'Sunny', 'Sydney', 'Tabbie', 'Tabby', 'Tallie', 'Tally', 'Tammie', 'Tammy', 'Tate', 'Ted', 'Teddie', 'Teddy', 'Terri', 'Terry', 'Theo', 'Tim', 'Timmie', 'Timmy', 'Tobe', 'Tobie', 'Toby', 'Tommie', 'Tommy', 'Tony', 'Torey', 'Trace', 'Tracey', 'Tracie', 'Tracy', 'Val', 'Vale', 'Valentine', 'Van', 'Vin', 'Vinnie', 'Vinny', 'Virgie', 'Wallie', 'Wallis', 'Wally', 'Whitney', 'Willi', 'Willie', 'Willy', 'Winnie', 'Winny', 'Wynn']
>>> 

When I write an e-mail, sometimes I am not sure whether the recipient is male or female. From that point of view, this analysis is interesting to me.

>>> cfd = nltk.ConditionalFreqDist(
...     (fileid, name[-1])
...     for fileid in names.fileids()
...     for name in names.words(fileid))
>>> cfd.plot()

figure_1

According to this plot, most names that end with 'k' are male. Let's see some samples.

>>> [name for name in names.words('male.txt') if name.endswith('k')]
['Adrick', 'Aleck', 'Beck', 'Berk', 'Brock', 'Broderick', 'Brook', 'Buck', 'Chadwick', 'Chuck', 'Clark', 'Darrick', 'Derek', 'Derick', 'Derk', 'Derrek', 'Derrick', 'Dick', 'Dirk', 'Dominick', 'Erek', 'Erick', 'Erik', 'Frank', 'Frederick', 'Frederik', 'Fredrick', 'Friedrick', 'Hank', 'Hendrick', 'Hendrik', 'Henrik', 'Herrick', 'Isaak', 'Izaak', 'Izak', 'Jack', 'Jock', 'John-Patrick', 'Kendrick', 'Kirk', 'Mack', 'Mark', 'Merrick', 'Mick', 'Murdock', 'Nick', 'Park', 'Patrick', 'Patrik', 'Rick', 'Rik', 'Rock', 'Roderick', 'Rodrick', 'Sherlock', 'Shurlock', 'Tedrick', 'Thedrick', 'Tuck', 'Ulick', 'Ulrick', 'Vick', 'Westbrook', 'Wilek', 'Yank', 'Zack', 'Zak', 'Zerk']
>>> 

How about 'o'? Many of them should be male, but some of them are still female.

>>> [name for name in names.words('male.txt') if name.endswith('o')]
['Adolfo', 'Adolpho', 'Aguinaldo', 'Alberto', 'Aldo', 'Alejandro', 'Alessandro', 'Alfonso', 'Alfonzo', 'Alfredo', 'Alonso', 'Alonzo', 'Alphonso', 'Ambrosio', 'Anatollo', 'Angelico', 'Angelo', 'Antonino', 'Antonio', 'Apollo', 'Arlo', 'Armando', 'Arnoldo', 'Arturo', 'Augusto', 'Bartholomeo', 'Bartolemo', 'Bartolomeo', 'Benito', 'Bernardo', 'Bo', 'Bruno', 'Carlo', 'Christiano', 'Claudio', 'Cosmo', 'Dario', 'Diego', 'Dino', 'Domenico', 'Edgardo', 'Eduardo', 'Emilio', 'Enrico', 'Ernesto', 'Fabio', 'Federico', 'Felicio', 'Fernando', 'Francesco', 'Francisco', 'Frederico', 'Gabriello', 'Geo', 'Geraldo', 'Giacomo', 'Giancarlo', 'Gilberto', 'Gino', 'Giordano', 'Giorgio', 'Giraldo', 'Gonzalo', 'Gregorio', 'Guido', 'Guillermo', 'Gustavo', 'Hernando', 'Hiro', 'Horacio', 'Horatio', 'Hugo', 'Iago', 'Ignacio', 'Ignazio', 'Inigo', 'Jethro', 'Jimbo', 'Jo', 'Joao', 'Julio', 'Kimmo', 'Lazaro', 'Lazlo', 'Leo', 'Leonardo', 'Lionello', 'Lorenzo', 'Luciano', 'Lucio', 'Marcello', 'Marcelo', 'Marcio', 'Marco', 'Mario', 'Marko', 'Marlo', 'Martino', 'Mateo', 'Matteo', 'Mauricio', 'Milo', 'Munmro', 'Mylo', 'Nero', 'Niccolo', 'Nico', 'Nunzio', 'Orazio', 'Orlando', 'Othello', 'Otho', 'Otto', 'Pablo', 'Paco', 'Paolo', 'Patricio', 'Paulo', 'Pedro', 'Pepillo', 'Pepito', 'Pietro', 'Plato', 'Raymundo', 'Renado', 'Renaldo', 'Renato', 'Ricardo', 'Riccardo', 'Richardo', 'Rinaldo', 'Roberto', 'Roderigo', 'Rodolfo', 'Rodrigo', 'Rolando', 'Rollo', 'Romeo', 'Rudolfo', 'Ruperto', 'Salomo', 'Sancho', 'Sandro', 'Sauncho', 'Sebastiano', 'Sergio', 'Shlomo', 'Silvano', 'Silvio', 'Spiro', 'Stefano', 'Taddeo', 'Teodorico', 'Teodoro', 'Terencio', 'Theo', 'Tito', 'Ugo', 'Ulberto', 'Umberto', 'Urbano', 'Virgilio', 'Vito', 'Waldo', 'Zorro']
>>> [name for name in names.words('female.txt') if name.endswith('o')]
['Amargo', 'Bo', 'Calypso', 'Cameo', 'Caro', 'Charo', 'Cleo', 'Clio', 'Clo', 'Consuelo', 'Dido', 'Doro', 'Flo', 'Jo', 'Jojo', 'Kameko', 'Koo', 'Margalo', 'Margo', 'Marijo', 'Marlo', 'Maryjo', 'Mikako', 'Nariko', 'Reiko', 'Rosario', 'Tamiko', 'Terri-Jo', 'Theo', 'Tomiko', 'Umeko', 'Yoko', 'Yoshiko']

In the female list, I can see several Japanese names. Indeed, Japanese names that end with 'o' and are at most 3 Japanese characters long (6 or 7 letters in the Latin alphabet) are usually female. (If a name is longer than 8 letters in the Latin alphabet, it should be a male name.)