Tokenizing texts from Gutenberg archive for analysis
Problem
I am writing a program to analyze books from the Gutenberg archive. The program takes a title and URL, downloads the text, and then tokenizes it. Here is the code:
```
from urllib import request
import nltk
import os.path

canon_titles = [
    'Moby Dick',
    'Great Expectations'
]

canon_urls = [
    'http://www.gutenberg.org/files/2701/2701-0.txt',  # Moby Dick
    'http://www.gutenberg.org/files/1400/1400-0.txt'
]

# canon_beginnings is the location of where the book actually begins in
# the text file (skips headers, publisher info, etc.)
canon_beginnings = [
    28876,
    886
]

# canon_endings exists just to grab a small amount of text for prototyping
canon_endings = [x + 500 for x in canon_beginnings]

canon_raw = [None] * len(canon_titles)
canon_tokens = [None] * len(canon_titles)
canon_words = [None] * len(canon_titles)
canon_words2tokens = [None] * len(canon_titles)
canon_pos = [None] * len(canon_titles)

# Now I combine all these together into a dictionary
canon_dict = {z[0]: list(z[1:]) for z in zip(canon_titles, canon_urls, canon_beginnings, canon_endings,
                                             canon_raw, canon_tokens, canon_words, canon_words2tokens, canon_pos)}

# Now I go through each title in the dict and see if I already have the text
# (I rerun this in Jupyter Notebook sometimes), and if not I grab it from online
for x in canon_dict:
    print("Working on {}".format(x))
    if canon_dict[x][3] == None:
        print("{} does not already exist, grabbing the text".format(x))
        url = canon_dict[x][0]
        response = request.urlopen(url)
        canon_text_draft = response.read().decode('utf8')
        canon_dict[x][3] = canon_text_draft[canon_dict[x][1]:canon_dict[x][2]]
    else:
        print("Already have this text, skipping")

# OK, now we'll tokenize, do parts of speech, etc.
def tokinze_text(raw_text):
    tokens = nltk.word_tokenize(raw_text)
    return tokens

# Now let's find the tokens
```
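The code stops just before the tokenizing step. As a rough sketch (not part of the original code) of how that last comment might be filled in, the loop below reuses the tokinze_text helper and the list positions defined above, where index 3 holds the raw text and index 4 the tokens; nltk.word_tokenize also assumes the punkt tokenizer data has already been downloaded with nltk.download('punkt').

```
# Sketch of a possible continuation (not in the original code): tokenize each
# title's raw text (index 3) and store the result in the tokens slot (index 4).
for x in canon_dict:
    if canon_dict[x][4] is None:
        canon_dict[x][4] = tokinze_text(canon_dict[x][3])
```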
Solution
This is the ideal place for a class. Each book is its own object with its own method of returning its tokens. I would make a method tokens, which I would make a property that fills itself on the first call to it. Something like this:
```
from urllib import request

import nltk


class Book(object):
    def __init__(self, title, url, start=0, end=-1):
        self.title = title
        self.url = url
        self.start = start
        self.end = end
        self._raw = None
        self._tokens = None
        # self.words = None
        # self.words2tokens = None
        # self.pos = None

    def __str__(self):
        return self.title

    @property
    def raw(self):
        if self._raw is None:
            response = request.urlopen(self.url)
            draft = response.read().decode('utf8')
            self._raw = draft[self.start:self.end]
        return self._raw

    @property
    def tokens(self):
        if self._tokens is None:
            self._tokens = nltk.word_tokenize(self.raw)
        return self._tokens
if __name__ == "__main__":
    books = [Book('Moby Dick', 'http://www.gutenberg.org/files/2701/2701-0.txt', 28876, 28876 + 500),
             Book('Great Expectations', 'http://www.gutenberg.org/files/1400/1400-0.txt', 886, 886 + 500)]

    for book in books:
        print(book)
        print(book.tokens)
```
I commented out the words, words2tokens and pos attributes as they are not currently needed.

Alternatively, if you don't want to insist on the delayed getting of the values, you can do it all already in the constructor:
```
class Book(object):
    def __init__(self, title, url, start=0, end=-1):
        self.title = title
        self.url = url
        self.start = start
        self.end = end
        self.raw = self.get_raw(url)
        self.tokens = nltk.word_tokenize(self.raw)
        # self.words = None
        # self.words2tokens = None
        # self.pos = None

    def __str__(self):
        return self.title

    def get_raw(self, url):
        response = request.urlopen(url)
        draft = response.read().decode('utf8')
        return draft[self.start:self.end]
```
Which you can use exactly the same way.
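If the commented-out attributes are needed later, the same lazy-property pattern extends naturally. As a sketch only (not part of the original answer), a pos property built on nltk.pos_tag could be added to the first, lazy version of the class; it assumes a self._pos = None line in __init__, and nltk.pos_tag requires the tagger data to be downloaded beforehand.

```
    # Sketch: an additional lazy property following the same pattern as `raw`
    # and `tokens`. Assumes `self._pos = None` was added to __init__.
    # nltk.pos_tag tags a list of tokens and needs the
    # 'averaged_perceptron_tagger' data (nltk.download('averaged_perceptron_tagger')).
    @property
    def pos(self):
        if self._pos is None:
            self._pos = nltk.pos_tag(self.tokens)
        return self._pos
```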
Context
StackExchange Code Review Q#160158, answer score: 6