Alternative to Python's Naive Bayes Classifier for Twitter Sentiment Mining
Problem
I am doing sentiment analysis on tweets. I have code that I developed by following an online tutorial (found here) and adding some parts myself, which looks like this:
```
#!/usr/bin/env python
import csv, string, HTMLParser, nltk, pickle
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB

test_file = 'Dataset/SmallSample.csv'
#test_file = 'Dataset/Dataset.csv'
csv_file = csv.DictReader(open(test_file, 'rb'), delimiter=',', quotechar='"')

pos_tweets = {}
neg_tweets = {}
for line in csv_file:
    if line['Sentiment'] == '1':
        pos_tweets[line['SentimentText']] = "positive"
    else:
        neg_tweets[line['SentimentText']] = "negative"

tweets = []
labeltweets = []
for (text, sentiment) in pos_tweets.items() + neg_tweets.items():
    # Unescape HTML entities, drop URLs and @mentions, strip punctuation,
    # lowercase, and keep only words of 3+ characters.
    text = HTMLParser.HTMLParser().unescape(text.decode('utf-8'))
    remove_punctuation_map = dict((ord(char), None) for char in string.punctuation)
    cleanedText = [e.translate(remove_punctuation_map).lower()
                   for e in text.split() if not e.startswith(('http', '@'))]
    shortenedText = [e for e in cleanedText if len(e) >= 3]
    tweets.append((shortenedText, sentiment))

# Produces a list of all words in the text of the tweets (including duplicates).
def get_words_in_tweets(tweets):
    all_words = []
    for (text, sentiment) in tweets:
        all_words.extend(text)
    return all_words

def get_word_features(wordlist):
    # Calculates the frequency distribution of all words in the tweets,
    # e.g. the word "love" appears 5 times, the word "dog" 3 times, etc.
    wordlist = nltk.FreqDist(wordlist)
    # The keys are the distinct words (in older NLTK versions, ordered
    # by number of occurrences).
    word_features = wordlist.keys()
    return word_features

word_features = get_word_features(get_words_in_tweets(tweets))

def extract_features(document):
    setOfDocument = set(document)
    features = {}
    for word in word_features:
        features['contains(%s)' % word] = (word in setOfDocument)
    return features
```
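The tutorial this code follows typically continues by building labelled feature sets and training NLTK's Naive Bayes classifier on them. A minimal sketch of that next step, assuming the `tweets` list and `extract_features` function defined above:

```
import nltk

# Build (features, label) pairs lazily; apply_features avoids holding
# every feature dict in memory at once, which matters on large datasets.
training_set = nltk.classify.apply_features(extract_features, tweets)

# Train NLTK's Naive Bayes classifier on the labelled feature sets.
classifier = nltk.NaiveBayesClassifier.train(training_set)

# Classify a new, already-tokenised tweet.
print(classifier.classify(extract_features(['love', 'this', 'song'])))
```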
Solution
Take a look at the answers to "Is there a rule-of-thumb for how to divide a dataset into training and validation sets?", in particular:
"If you have 100,000 instances, ... indeed you may choose to use less training data if your method is particularly computationally intensive)."
Good luck!
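To make that concrete for the code in the question: below is a minimal sketch of a held-out validation split using scikit-learn's `train_test_split`, reusing the `TfidfVectorizer` and `MultinomialNB` imports the question already has. The 80/20 ratio and `random_state` are illustrative choices, not prescriptions, and `tweets` is assumed to be the list of (words, sentiment) pairs built in the question:

```
from sklearn.cross_validation import train_test_split  # sklearn.model_selection in newer releases
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB

# Rejoin the tokenised tweets into strings for the vectorizer.
texts = [' '.join(words) for (words, sentiment) in tweets]
labels = [sentiment for (words, sentiment) in tweets]

# Hold out 20% for validation; the exact ratio is a judgment call,
# as the linked answers explain.
train_texts, test_texts, train_labels, test_labels = train_test_split(
    texts, labels, test_size=0.2, random_state=42)

# Fit TF-IDF on the training data only, then transform both splits.
vectorizer = TfidfVectorizer()
train_vectors = vectorizer.fit_transform(train_texts)
test_vectors = vectorizer.transform(test_texts)

classifier = MultinomialNB()
classifier.fit(train_vectors, train_labels)
print(classifier.score(test_vectors, test_labels))  # accuracy on the held-out set
```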
Context
StackExchange Code Review Q#29573, answer score: 2