HiveBrain v1.2.0
Get Started
← Back to all entries
pattern · python · Minor

Tokenizing texts from Gutenberg archive for analysis

Submitted by: @import:stackexchange-codereview
0
Viewed 0 times
Tags: texts, tokenizing, analysis, gutenberg, for, from, archive

Problem

I am writing a program to analyze books from the Gutenberg archive. The program takes the title and URL and finds the text and downloads it. Then it goes through the text and tokenizes it. Here is the code:

```
from urllib import request
import nltk
import os.path

# Titles and URLs of the books to analyze. These are parallel lists:
# index i in every canon_* list below refers to the same book.
canon_titles = [
    'Moby Dick',
    'Great Expectations'
]
canon_urls = [
    'http://www.gutenberg.org/files/2701/2701-0.txt', # Moby Dick
    'http://www.gutenberg.org/files/1400/1400-0.txt'
]
# Canon_beginnings is the location of where the book actually begins in
# the text file (skips headers, publisher info, etc.)
canon_beginnings = [
    28876,
    886
]
# canon endings exists just to grab a small amount of text for prototyping
canon_endings = [x + 500 for x in canon_beginnings]
# Placeholder slots (one per book) to be filled in later.
canon_raw = [None] * len(canon_titles)
canon_tokens = [None] * len(canon_titles)
canon_words = [None] * len(canon_titles)
canon_words2tokens = [None] * len(canon_titles)
canon_pos = [None] * len(canon_titles)

# Now I combine all these together into a dictionary keyed by title.
# Value layout: [url, beginning, ending, raw, tokens, words, words2tokens, pos]
canon_dict = {z[0]: list(z[1:]) for z in zip(canon_titles, canon_urls, canon_beginnings, canon_endings, canon_raw, canon_tokens,
                                             canon_words, canon_words2tokens, canon_pos)}

# Now I go through each title in the dict and see if I already have the text (I rerun this in Jupyter Notebook sometimes)
# And if not I grab it from online
for x in canon_dict:
    print("Working on {}".format(x))
    # Index 3 is the raw-text slot; compare to None with "is", not "==" (PEP 8)
    if canon_dict[x][3] is None:
        print("{} does not already exist, grabbing the text".format(x))
        url = canon_dict[x][0]
        response = request.urlopen(url)
        canon_text_draft = response.read().decode('utf8')
        canon_dict[x][3] = canon_text_draft[canon_dict[x][1]:canon_dict[x][2]]
    else:
        print("Already have this text, skipping")

# OK, now we'll tokenize, do parts of speech, etc.
def tokinze_text(raw_text):
    """Return the list of NLTK word tokens found in raw_text."""
    tokens = nltk.word_tokenize(raw_text)
    return tokens

# Now let's find the tokens

Solution

This is the ideal place for a class: each book becomes its own object with its own way of returning its tokens. I would add a `tokens` method, implemented as a property that computes and caches its value on the first access.

Something like this:

from urllib import request
import nltk

class Book(object):
    """A Project Gutenberg text identified by title and URL.

    The raw text and its token list are fetched lazily: nothing is
    downloaded or tokenized until the corresponding property is first
    read, and the result is cached for later accesses.
    """

    def __init__(self, title, url, start=0, end=-1):
        self.title = title
        self.url = url
        self.start = start
        self.end = end
        # Caches backing the lazy `raw` and `tokens` properties.
        self._raw = None
        self._tokens = None
        # self.words = None
        # self.words2tokens = None
        # self.pos = None

    def __str__(self):
        return self.title

    @property
    def raw(self):
        """Download the book on first access and return its [start:end] slice."""
        if self._raw is None:
            body = request.urlopen(self.url).read().decode('utf8')
            self._raw = body[self.start:self.end]
        return self._raw

    @property
    def tokens(self):
        """Tokenize the raw text on first access; cached afterwards."""
        if self._tokens is None:
            self._tokens = nltk.word_tokenize(self.raw)
        return self._tokens

if __name__ == "__main__":
    books = [Book('Moby Dick', 'http://www.gutenberg.org/files/2701/2701-0.txt', 28876, 28876 + 500),
             Book('Great Expectations', 'http://www.gutenberg.org/files/1400/1400-0.txt', 886, 886 + 500)]

    for book in books:
        print book
        print book.tokens


I commented out the words, words2tokens and pos attributes as they are not currently needed.

Alternatively, if you don't need the delayed (lazy) computation of the values, you can do all of it up front in the constructor:

class Book(object):
    """A Project Gutenberg text, fetched and tokenized eagerly.

    Unlike the lazy variant, the constructor immediately downloads the
    text and computes its tokens, so `raw` and `tokens` are plain
    attributes ready to use as soon as the object exists.
    """

    def __init__(self, title, url, start=0, end=-1):
        self.title = title
        self.url = url
        self.start = start
        self.end = end
        # Eager: download and tokenize right away in the constructor.
        self.raw = self.get_raw(url)
        self.tokens = nltk.word_tokenize(self.raw)
        # self.words = None
        # self.words2tokens = None
        # self.pos = None

    def __str__(self):
        return self.title

    def get_raw(self, url):
        """Download the text at *url* and return its [start:end] slice."""
        payload = request.urlopen(url).read()
        return payload.decode('utf8')[self.start:self.end]


Which you can use exactly the same way.

Code Snippets

from urllib import request
import nltk


class Book(object):
    """One Gutenberg book (title + URL) with lazily loaded content.

    Accessing `raw` triggers the download exactly once; accessing
    `tokens` tokenizes `raw` exactly once. Both results are cached in
    private attributes.
    """

    def __init__(self, title, url, start=0, end=-1):
        self.title = title
        self.url = url
        self.start = start
        self.end = end
        # Not yet fetched/computed; filled in by the properties below.
        self._raw = None
        self._tokens = None
        # self.words = None
        # self.words2tokens = None
        # self.pos = None

    def __str__(self):
        return self.title

    @property
    def raw(self):
        """Return the [start:end] slice of the downloaded text (cached)."""
        if self._raw is None:
            text = request.urlopen(self.url).read().decode('utf8')
            self._raw = text[self.start:self.end]
        return self._raw

    @property
    def tokens(self):
        """Return the NLTK word tokens of `raw` (computed once, cached)."""
        if self._tokens is None:
            self._tokens = nltk.word_tokenize(self.raw)
        return self._tokens


if __name__ == "__main__":
    books = [Book('Moby Dick', 'http://www.gutenberg.org/files/2701/2701-0.txt', 28876, 28876 + 500),
             Book('Great Expectations', 'http://www.gutenberg.org/files/1400/1400-0.txt', 886, 886 + 500)]

    for book in books:
        print book
        print book.tokens
class Book(object):
    """Eager variant of Book: everything is computed in __init__.

    The constructor downloads the text and tokenizes it immediately,
    so `raw` and `tokens` are ordinary attributes, not properties.
    """

    def __init__(self, title, url, start=0, end=-1):
        self.title = title
        self.url = url
        self.start = start
        self.end = end
        # Fetch and tokenize up front — no lazy caching here.
        self.raw = self.get_raw(url)
        self.tokens = nltk.word_tokenize(self.raw)
        # self.words = None
        # self.words2tokens = None
        # self.pos = None

    def __str__(self):
        return self.title

    def get_raw(self, url):
        """Fetch *url* and return the decoded text sliced to [start:end]."""
        body = request.urlopen(url).read()
        return body.decode('utf8')[self.start:self.end]

Context

StackExchange Code Review Q#160158, answer score: 6

Revisions (0)

No revisions yet.