
Commit

working sentiment detector
LukeDefeo committed Aug 20, 2013
1 parent dcd07bd commit e6d506b
Showing 11 changed files with 260 additions and 50 deletions.
28 changes: 28 additions & 0 deletions NLP_Engine/Common/helper.py
@@ -0,0 +1,28 @@

__author__ = 'Luke'

"Code from http://stackoverflow.com/questions/323750/how-to-access-previous-next-element-while-for-looping"
def neighborhood(iterable):
iterator = iter(iterable)
prev = None
item = iterator.next()
for next in iterator:
yield (prev, item, next)
prev = item
item = next
yield (prev, item, None)


def extract_tags(tagged_sent):
    tags = [0] * len(tag_index)
    for word, tag in tagged_sent:
        tags[tag_index[tag]] += 1

    return tags


tag_index = {'CC': 0, 'CD': 1, 'DT': 2, 'EX': 3, 'FW': 4, 'IN': 5, 'JJ': 6, 'JJR': 7, 'JJS': 8, 'LS': 9, 'MD': 10,
'NN': 11, 'NNS': 12, 'NNP': 13, 'NNPS': 14, 'PDT': 15, 'POS': 16, 'PRP': 17, 'PP$': 18, 'RB': 19,
'RBR': 20, 'RBS': 21, 'RP': 22, 'SYM': 23, 'TO': 24, 'UH': 25, 'VB': 26, 'VBD': 27, 'VBG': 28, 'VBN': 29,
'VBP': 30, 'VBZ': 31, 'WDT': 32, 'WP': 33, 'WP$': 34, 'WRB': 35, '#': 36, '$': 37, '.': 38, ',': 39,
':': 40, '(': 41, ')': 42, '"': 43, "'": 44, "``": 45, "''": 46, 'PRP$': 47}
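
A quick sketch of how these two helpers behave (Python 2; the sample inputs are hypothetical):

from NLP_Engine.Common.helper import neighborhood, extract_tags

# neighborhood() walks a sequence with a one-item window on each side.
for prev, item, next in neighborhood(['i', 'dont', 'like', 'rain']):
    print prev, item, next
# None i dont / i dont like / dont like rain / like rain None

# extract_tags() turns a POS-tagged sentence into a 48-bin tag-count vector.
print extract_tags([('rain', 'NN'), ('falls', 'VBZ')])
# all zeros except a 1 at index 11 (NN) and index 31 (VBZ)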
22 changes: 12 additions & 10 deletions NLP_Engine/Common/tokeniser.py
@@ -9,6 +9,7 @@

__author__ = 'Luke'

negations = {'no', 'not', 'never', "don't", 'dont',}

def contains_url(word):
    url_pattern = r'(\S+\.(com|co\.uk|ac|info|ly|net|org|edu|gov)(\/\S+)?)|http://'
@@ -35,13 +36,7 @@ def _ends_with_punct(word):


def contains_foreign_chars(word):
-    # pattern = r'![\w\s]'
-    # pattern2 = r'[^(\x20-\x7F)]'
-    # if re.match(pattern2, word):
-    #     return True
-    # else:
-    #     return False
-    exceptions = '£€'
+    exceptions = u'£€'
    for char in word:
        if ord(char) > 127:
            if char not in exceptions:
@@ -80,15 +75,22 @@ def delete_char(word, index):
    return word[:index] + word[index + 1:]


-def tokenise_tweet(tweet):
-
-    for word in tweet.split():
-        if contains_url(word):
-            continue


def tokenise(word):
    if contains_url(word):
-        return None
+        return ''
    # if contains_repeated_chars(word):
    #     return
    if word[0] == '@':
-        return None
+        return ''
    if '&' in word:
-        return None
+        return ''

    word = word.lower()
    word = strip_punctuation(word)
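
Returning '' instead of None keeps tokenise's output a string in every branch, so downstream set and dict code never sees None keys. A quick check (hypothetical inputs; strip_punctuation's exact behaviour is assumed):

from NLP_Engine.Common.tokeniser import tokenise

print repr(tokenise('http://t.co/abc'))  # ''  (URLs are dropped)
print repr(tokenise('@luke'))            # ''  (mentions are dropped)
print repr(tokenise('Rain!'))            # 'rain', assuming punctuation is stripped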
17 changes: 11 additions & 6 deletions NLP_Engine/NaiveBayes/preprocess_unigrams.py
@@ -1,4 +1,5 @@
from nltk.corpus import stopwords
from NLP_Engine.Common.helper import neighborhood

__author__ = 'Luke'
import time
@@ -17,16 +18,21 @@ def add_to_dict(word):
        word_dict[word] = 1


print "begin"
-with open("../../Data/Training/training-data-small.csv") as training_in:
+with open("../../Data/Training/training-data.csv") as training_in:
    for line in training_in:
        line = line.decode(encoding='latin1')
        sentiment, tweet_content = line.split('\t', 1)
        if contains_foreign_chars(tweet_content):
            continue

        tweets.append((tweet_content, sentiment))
-        for word in tweet_content.split():
-            add_to_dict(tokenise(word))
+        for prev, word, after in neighborhood(tweet_content.split()):
+            if prev in negations:
+                add_to_dict(tokenise(u'neg-' + word))
+            else:
+                add_to_dict(tokenise(word))

for key in word_dict.keys():
    if word_dict[key] < 5:
@@ -37,13 +43,12 @@ def add_to_dict(word):
    try:
        words.remove(word)
    except:
-        print "cant find " + word
+        pass

print "done " + str(time.time() - start_time) + ' seconds'
print "pickling"
-pickle.dump(tweets, open("../../Data/Training/tweets-small.obj", "wb"))
-pickle.dump(words, open("../../Data/Training/word_set-small.obj", "wb"))
+pickle.dump(tweets, open("../../Data/Training/tweets.obj", "wb"))
+pickle.dump(words, open("../../Data/Training/word_set.obj", "wb"))

print "done pickling " + str(time.time() - start_time) + ' seconds'

24 changes: 17 additions & 7 deletions NLP_Engine/NaiveBayes/process_unigrams.py
@@ -1,23 +1,33 @@
-from NLP_Engine.Common.tokeniser import tokenise
+from NLP_Engine.Common.helper import neighborhood
+from NLP_Engine.Common.tokeniser import tokenise, negations

__author__ = 'Luke'
import cPickle as pickle
import nltk
from nltk import NaiveBayesClassifier

mode = "big"


-tweets = pickle.load(open("../../Data/Training/tweets-small.obj"))
-word_set = pickle.load((open("../../Data/Training/word_set-small.obj")))
+tweets = pickle.load(open("../../Data/Training/tweets.obj"))
+word_set = pickle.load((open("../../Data/Training/word_set.obj")))

# tweets = pickle.load(open("../../Data/Training/tweets-small.obj"))
# word_set = pickle.load((open("../../Data/Training/word_set-small.obj")))
#

def tweet_features(tweet):
-    tweet_words = tweet.split()
-    tokenised_words = set([tokenise(word) for word in tweet_words])
+    tokenised_words = [tokenise(word) for word in tweet.split()]
    to_remove = set()
    # print tokenised_words
    for prev, word, next in neighborhood(tokenised_words):
        if prev in negations:
            to_remove.add(prev)
            to_remove.add(word)
            tokenised_words.append('neg-' + word)

    tokenised_words = set(tokenised_words)
    for word in to_remove:
        tokenised_words.remove(word)

    features = {}
    for word in tokenised_words:
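
The collapsed remainder of the file presumably builds NLTK featuresets from tweet_features and trains the classifier; a minimal sketch of that pattern (assumed, not the commit's exact code):

labelled = [(tweet_features(tweet), sentiment) for tweet, sentiment in tweets]
classifier = NaiveBayesClassifier.train(labelled)
print nltk.classify.accuracy(classifier, labelled)   # accuracy on the training set only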
25 changes: 25 additions & 0 deletions NLP_Engine/POSTagger/build_classifyer.py
@@ -0,0 +1,25 @@
import cPickle as pickle
from sklearn.svm import SVC
from NLP_Engine.Common.helper import extract_tags


__author__ = 'Luke'

source = pickle.load(open('../../Data/Training/sentiment_detector_training.obj'))

tagged_set, total_target = zip(*source)

total_data = [extract_tags(sent) for sent in tagged_set]
cut_off = int(0.85 * len(total_data))

training_data = total_data[:cut_off]
test_data = total_data[cut_off:]

training_target = total_target[:cut_off]
test_target = total_target[cut_off:]

svm = SVC()
svm.fit(training_data, training_target)

pickle.dump(svm, open('../../Data/Models/sentiment-classifyer-svm','wb'))
print svm.score(test_data, test_target)
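
Once unpickled, the model expects the same 48-bin tag-count vectors that extract_tags produces. A usage sketch (the tagged sentence is hypothetical):

import cPickle as pickle
from NLP_Engine.Common.helper import extract_tags

svm = pickle.load(open('../../Data/Models/sentiment-classifyer-svm'))
tagged = [(u'the', u'DT'), (u'film', u'NN'), (u'was', u'VBD'), (u'great', u'JJ')]
print svm.predict([extract_tags(tagged)])   # e.g. ['sub'] or ['obj']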
24 changes: 0 additions & 24 deletions NLP_Engine/POSTagger/post.py

This file was deleted.

56 changes: 56 additions & 0 deletions NLP_Engine/POSTagger/sentiment_detector.py
@@ -0,0 +1,56 @@
import random

from nltk.tag.stanford import POSTagger
from sklearn.svm import SVC
from NLP_Engine.Common.helper import extract_tags


__author__ = 'Luke'
import cPickle as pickle


def tokenise_tweet():
    pass



objective_tweets = pickle.load(open('../../Data/Training/objective-tweets.obj'))
subjective_tweets = pickle.load(open('../../Data/Training/subjective-tweets.obj'))

objective_tweets = [(tweet, u'obj') for tweet in objective_tweets]
subjective_tweets = [(tweet, u'sub') for tweet, sent in subjective_tweets]
#
# objective_tweets = objective_tweets[:100]
# subjective_tweets = subjective_tweets[:100]

total_set = objective_tweets + subjective_tweets
random.shuffle(total_set)
cut_off = int(0.85*len(total_set))

tagger = POSTagger('stanford-model.tagger', 'stanford-postagger.jar', encoding='utf8')

tagged_set = tagger.batch_tag([sent.split() for sent, label in total_set])

total_target = [label for sent, label in total_set]

to_disk = zip(tagged_set, total_target)
print len(tagged_set)
print len(total_set)
pickle.dump(to_disk, open('../../Data/Training/sentiment_detector_training.obj', 'wb'))

total_data = [extract_tags(sent) for sent in tagged_set]
training_data = total_data[:cut_off]
test_data = total_data[cut_off:]

training_target = total_target[:cut_off]
test_target = total_target[cut_off:]

svm = SVC()
svm.fit(training_data, training_target)
print svm.score(test_data,test_target)

print subjective_tweets[0]
print objective_tweets[0]
print len(objective_tweets)
print len(subjective_tweets)
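
For reference, the shapes flowing through this script (values illustrative; batch_tag returns one (token, tag) list per input sentence):

# tagged_set[0]    -> [(u'what', u'WP'), (u'is', u'VBZ'), (u'the', u'DT'), ...]
# total_target[0]  -> u'obj' or u'sub'
# extract_tags(tagged_set[0]) -> 48-bin tag-count vector fed to the SVC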

23 changes: 20 additions & 3 deletions NLP_Engine/POSTagger/standford.py
@@ -1,3 +1,4 @@
# coding=utf-8
from nltk.corpus import brown
from nltk.tag.stanford import POSTagger

@@ -7,11 +8,27 @@

# print tagger.tag("what is the airspeed of an unlaiden swallow?".split())

l = ["what is the airspeed of an unlaiden swallow?".split(), "Call me Luke.".split()]
l = ["what is the airspeed of an unlaiden swallow?".split(), "Call me Luke."]
print l
print tagger.batch_tag(l)

reviews_sent_untagged = brown.sents(categories='reviews')[0:20] + brown.sents(categories='news')[0:20]

print tagger.batch_tag(reviews_sent_untagged)

s = 'ok α'
u = u'ok α'


def whatisthis(s):
    if isinstance(s, str):
        print "ordinary string " + s
    elif isinstance(s, unicode):
        print "unicode string " + s
    else:
        print "not a string"


whatisthis(s)
whatisthis(u)
print tagger.tag(s)
print tagger.tag(u.encode('utf8'))
whatisthis(u.encode('utf8'))
whatisthis(unicode(s,'utf8'))
print
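
What the whatisthis() calls above print under Python 2 (assuming a UTF-8 source file, per the coding declaration):

# whatisthis(s)                   -> ordinary string ok α   (byte-string literal)
# whatisthis(u)                   -> unicode string ok α
# whatisthis(u.encode('utf8'))    -> ordinary string ok α   (.encode() yields bytes)
# whatisthis(unicode(s, 'utf8'))  -> unicode string ok α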
30 changes: 30 additions & 0 deletions NLP_Engine/POSTagger/test_classifyer.py
@@ -0,0 +1,30 @@
import cPickle as pickle
from nltk.tag.stanford import POSTagger
from NLP_Engine.Common.helper import extract_tags

__author__ = 'Luke'



test_set = []
with open("../../Data/Test/test-data.csv") as test_in:
    for line in test_in:
        sentiment, tweet_content = line.split('\t', 1)
        if sentiment == 'neg' or sentiment == 'pos':
            sentiment = 'sub'
        elif sentiment == 'neutral':
            sentiment = 'obj'
        else:
            print 'error'
        test_set.append((tweet_content, sentiment))

print len(test_set)
tagger = POSTagger('stanford-model.tagger', 'stanford-postagger.jar', encoding='utf8')

tagged_set = tagger.batch_tag([sent.split() for sent, label in test_set])
test_data = [extract_tags(sent) for sent in tagged_set]

targets = [label for sent, label in test_set]
svm = pickle.load(open('../../Data/Models/sentiment-classifyer-svm'))

print svm.score(test_data,targets)
30 changes: 30 additions & 0 deletions Scripts/CreateExtraSmallTrainingData.py
@@ -0,0 +1,30 @@
__author__ = 'Luke'


import os

__author__ = 'Luke'

print 'start '
counter = 0
with open("/Users/Luke/Documents/Project-Files/Training-Data/training.1600000.processed.noemoticon.csv") as input:
with open("/Users/Luke/Documents/PyCharmProjects/TwitterSentiment/Data/Training/training-data-extra-small.csv",'w') as output:
for each_line in input:
elements = each_line.split('","')
elements[0] = elements[0][1:]
elements[5] = elements[5][:-2]
if(elements[0] == '0'):
elements[0] = 'neg'
if(elements[0] == '2'):
elements[0] = 'neutral'
if(elements[0] == '4'):
elements[0] = 'pos'

if counter % 100 == 0:
output.write(elements[0] + "\t" + elements[5] + '\n')

counter += 1



print 'done'
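
The parsing assumes the Sentiment140 CSV layout of six double-quoted fields; a sketch of one input line and what survives (the sample line is illustrative):

# '"0","1467810369","Mon Apr 06 22:19:45 PDT 2009","NO_QUERY","some_user","is upset ..."\n'
# elements[0][1:]  -> '0'             (polarity: 0 = neg, 2 = neutral, 4 = pos)
# elements[5][:-2] -> 'is upset ...'  (tweet text with the trailing quote and newline cut)
# every 100th line is written out as: 'neg\tis upset ...\n'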
