
Commit

working sentiment detector
LukeDefeo committed Aug 20, 2013
1 parent dcd07bd commit e6d506b
Showing 11 changed files with 260 additions and 50 deletions.
28 changes: 28 additions & 0 deletions NLP_Engine/Common/helper.py
@@ -0,0 +1,28 @@

__author__ = 'Luke'

"Code from http://stackoverflow.com/questions/323750/how-to-access-previous-next-element-while-for-looping"
def neighborhood(iterable):
iterator = iter(iterable)
prev = None
item = iterator.next()
for next in iterator:
yield (prev, item, next)
prev = item
item = next
yield (prev, item, None)


def extract_tags(tagged_sent):
    tags = [0] * len(tag_index)
    for word, tag in tagged_sent:
        tags[tag_index[tag]] += 1

    return tags


tag_index = {'CC': 0, 'CD': 1, 'DT': 2, 'EX': 3, 'FW': 4, 'IN': 5, 'JJ': 6, 'JJR': 7, 'JJS': 8, 'LS': 9, 'MD': 10,
'NN': 11, 'NNS': 12, 'NNP': 13, 'NNPS': 14, 'PDT': 15, 'POS': 16, 'PRP': 17, 'PP$': 18, 'RB': 19,
'RBR': 20, 'RBS': 21, 'RP': 22, 'SYM': 23, 'TO': 24, 'UH': 25, 'VB': 26, 'VBD': 27, 'VBG': 28, 'VBN': 29,
'VBP': 30, 'VBZ': 31, 'WDT': 32, 'WP': 33, 'WP$': 34, 'WRB': 35, '#': 36, '$': 37, '.': 38, ',': 39,
':': 40, '(': 41, ')': 42, '"': 43, "'": 44, "``": 45, "''": 46, 'PRP$': 47}
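
A quick sketch of how these two helpers behave (Python 2; the sample inputs are hypothetical):

from NLP_Engine.Common.helper import neighborhood, extract_tags

# neighborhood() walks a sequence with a one-item window on each side.
for prev, item, next in neighborhood(['i', 'dont', 'like', 'rain']):
    print prev, item, next
# None i dont / i dont like / dont like rain / like rain None

# extract_tags() turns a POS-tagged sentence into a 48-bin tag-count vector.
print extract_tags([('rain', 'NN'), ('falls', 'VBZ')])
# all zeros except a 1 at index 11 (NN) and index 31 (VBZ)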
22 changes: 12 additions & 10 deletions NLP_Engine/Common/tokeniser.py
@@ -9,6 +9,7 @@

__author__ = 'Luke'

negations = {'no', 'not', 'never', "don't", 'dont',}

def contains_url(word):
    url_pattern = r'(\S+\.(com|co\.uk|ac|info|ly|net|org|edu|gov)(\/\S+)?)|http://'
@@ -35,13 +36,7 @@ def _ends_with_punct(word):


def contains_foreign_chars(word):
-    # pattern = r'![\w\s]'
-    # pattern2 = r'[^(\x20-\x7F)]'
-    # if re.match(pattern2, word):
-    #     return True
-    # else:
-    #     return False
-    exceptions = '£€'
+    exceptions = u'£€'
    for char in word:
        if ord(char) > 127:
            if char not in exceptions:
@@ -80,15 +75,22 @@ def delete_char(word, index):
    return word[:index] + word[index + 1:]


-def tokenise_tweet(tweet):
-
-    for word in tweet.split():
-        if contains_url(word):
-            continue


def tokenise(word):
    if contains_url(word):
-        return None
+        return ''
    # if contains_repeated_chars(word):
    #     return
    if word[0] == '@':
-        return None
+        return ''
    if '&' in word:
-        return None
+        return ''

    word = word.lower()
    word = strip_punctuation(word)
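
Returning '' instead of None keeps tokenise's output a string in every branch, so downstream set and dict code never sees None keys. A quick check (hypothetical inputs; strip_punctuation's exact behaviour is assumed):

from NLP_Engine.Common.tokeniser import tokenise

print repr(tokenise('http://t.co/abc'))  # ''  (URLs are dropped)
print repr(tokenise('@luke'))            # ''  (mentions are dropped)
print repr(tokenise('Rain!'))            # 'rain', assuming punctuation is stripped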
17 changes: 11 additions & 6 deletions NLP_Engine/NaiveBayes/preprocess_unigrams.py
@@ -1,4 +1,5 @@
from nltk.corpus import stopwords
from NLP_Engine.Common.helper import neighborhood

__author__ = 'Luke'
import time
@@ -17,16 +18,21 @@ def add_to_dict(word):
        word_dict[word] = 1


print "begin"
-with open("../../Data/Training/training-data-small.csv") as training_in:
+with open("../../Data/Training/training-data.csv") as training_in:
    for line in training_in:
        line = line.decode(encoding='latin1')
        sentiment, tweet_content = line.split('\t', 1)
        if contains_foreign_chars(tweet_content):
            continue

        tweets.append((tweet_content, sentiment))
-        for word in tweet_content.split():
-            add_to_dict(tokenise(word))
+        for prev, word, after in neighborhood(tweet_content.split()):
+            if prev in negations:
+                add_to_dict(tokenise(u'neg-' + word))
+            else:
+                add_to_dict(tokenise(word))

for key in word_dict.keys():
    if word_dict[key] < 5:
@@ -37,13 +43,12 @@ def add_to_dict(word):
    try:
        words.remove(word)
    except:
-        print "cant find " + word
+        pass

print "done " + str(time.time() - start_time) + ' seconds'
print "pickling"
-pickle.dump(tweets, open("../../Data/Training/tweets-small.obj", "wb"))
-pickle.dump(words, open("../../Data/Training/word_set-small.obj", "wb"))
+pickle.dump(tweets, open("../../Data/Training/tweets.obj", "wb"))
+pickle.dump(words, open("../../Data/Training/word_set.obj", "wb"))

print "done pickling " + str(time.time() - start_time) + ' seconds'

24 changes: 17 additions & 7 deletions NLP_Engine/NaiveBayes/process_unigrams.py
@@ -1,23 +1,33 @@
-from NLP_Engine.Common.tokeniser import tokenise
+from NLP_Engine.Common.helper import neighborhood
+from NLP_Engine.Common.tokeniser import tokenise, negations

__author__ = 'Luke'
import cPickle as pickle
import nltk
from nltk import NaiveBayesClassifier

mode = "big"


-tweets = pickle.load(open("../../Data/Training/tweets-small.obj"))
-word_set = pickle.load((open("../../Data/Training/word_set-small.obj")))
+tweets = pickle.load(open("../../Data/Training/tweets.obj"))
+word_set = pickle.load((open("../../Data/Training/word_set.obj")))

# tweets = pickle.load(open("../../Data/Training/tweets-small.obj"))
# word_set = pickle.load((open("../../Data/Training/word_set-small.obj")))
#

def tweet_features(tweet):
-    tweet_words = tweet.split()
-    tokenised_words = set([tokenise(word) for word in tweet_words])
+    tokenised_words = [tokenise(word) for word in tweet.split()]
    to_remove = set()
    # print tokenised_words
    for prev, word, next in neighborhood(tokenised_words):
        if prev in negations:
            to_remove.add(prev)
            to_remove.add(word)
            tokenised_words.append('neg-' + word)

    tokenised_words = set(tokenised_words)
    for word in to_remove:
        tokenised_words.remove(word)

    features = {}
    for word in tokenised_words:
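
The collapsed remainder of the file presumably builds NLTK featuresets from tweet_features and trains the classifier; a minimal sketch of that pattern (assumed, not the commit's exact code):

labelled = [(tweet_features(tweet), sentiment) for tweet, sentiment in tweets]
classifier = NaiveBayesClassifier.train(labelled)
print nltk.classify.accuracy(classifier, labelled)   # accuracy on the training set only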
25 changes: 25 additions & 0 deletions NLP_Engine/POSTagger/build_classifyer.py
@@ -0,0 +1,25 @@
import cPickle as pickle
from sklearn.svm import SVC
from NLP_Engine.Common.helper import extract_tags


__author__ = 'Luke'

source = pickle.load(open('../../Data/Training/sentiment_detector_training.obj'))

tagged_set, total_target = zip(*source)

total_data = [extract_tags(sent) for sent in tagged_set]
cut_off = int(0.85 * len(total_data))

training_data = total_data[:cut_off]
test_data = total_data[cut_off:]

training_target = total_target[:cut_off]
test_target = total_target[cut_off:]

svm = SVC()
svm.fit(training_data, training_target)

pickle.dump(svm, open('../../Data/Models/sentiment-classifyer-svm','wb'))
print svm.score(test_data, test_target)
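
Once unpickled, the model expects the same 48-bin tag-count vectors that extract_tags produces. A usage sketch (the tagged sentence is hypothetical):

import cPickle as pickle
from NLP_Engine.Common.helper import extract_tags

svm = pickle.load(open('../../Data/Models/sentiment-classifyer-svm'))
tagged = [(u'the', u'DT'), (u'film', u'NN'), (u'was', u'VBD'), (u'great', u'JJ')]
print svm.predict([extract_tags(tagged)])   # e.g. ['sub'] or ['obj']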
24 changes: 0 additions & 24 deletions NLP_Engine/POSTagger/post.py

This file was deleted.

56 changes: 56 additions & 0 deletions NLP_Engine/POSTagger/sentiment_detector.py
@@ -0,0 +1,56 @@
import random

from nltk.tag.stanford import POSTagger
from sklearn.svm import SVC
from NLP_Engine.Common.helper import extract_tags


__author__ = 'Luke'
import cPickle as pickle


def tokenise_tweet():
    pass



objective_tweets = pickle.load(open('../../Data/Training/objective-tweets.obj'))
subjective_tweets = pickle.load(open('../../Data/Training/subjective-tweets.obj'))

objective_tweets = [(tweet, u'obj') for tweet in objective_tweets]
subjective_tweets = [(tweet, u'sub') for tweet, sent in subjective_tweets]
#
# objective_tweets = objective_tweets[:100]
# subjective_tweets = subjective_tweets[:100]

total_set = objective_tweets + subjective_tweets
random.shuffle(total_set)
cut_off = int(0.85*len(total_set))

tagger = POSTagger('stanford-model.tagger', 'stanford-postagger.jar', encoding='utf8')

tagged_set = tagger.batch_tag([sent.split() for sent, label in total_set])

total_target = [label for sent, label in total_set]

to_disk = zip(tagged_set, total_target)
print len(tagged_set)
print len(total_set)
pickle.dump(to_disk, open('../../Data/Training/sentiment_detector_training.obj', 'wb'))

total_data = [extract_tags(sent) for sent in tagged_set]
training_data = total_data[:cut_off]
test_data = total_data[cut_off:]

training_target = total_target[:cut_off]
test_target = total_target[cut_off:]

svm = SVC()
svm.fit(training_data, training_target)
print svm.score(test_data,test_target)

print subjective_tweets[0]
print objective_tweets[0]
print len(objective_tweets)
print len(subjective_tweets)
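
For reference, the shapes flowing through this script (values illustrative; batch_tag returns one (token, tag) list per input sentence):

# tagged_set[0]    -> [(u'what', u'WP'), (u'is', u'VBZ'), (u'the', u'DT'), ...]
# total_target[0]  -> u'obj' or u'sub'
# extract_tags(tagged_set[0]) -> 48-bin tag-count vector fed to the SVC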

23 changes: 20 additions & 3 deletions NLP_Engine/POSTagger/standford.py
@@ -1,3 +1,4 @@
# coding=utf-8
from nltk.corpus import brown
from nltk.tag.stanford import POSTagger

@@ -7,11 +8,27 @@

# print tagger.tag("what is the airspeed of an unlaiden swallow?".split())

l = ["what is the airspeed of an unlaiden swallow?".split(), "Call me Luke.".split()]
l = ["what is the airspeed of an unlaiden swallow?".split(), "Call me Luke."]
print l
print tagger.batch_tag(l)

reviews_sent_untagged = brown.sents(categories='reviews')[0:20] + brown.sents(categories='news')[0:20]

print tagger.batch_tag(reviews_sent_untagged)

s = 'ok α'
u = u'ok α'


def whatisthis(s):
    if isinstance(s, str):
        print "ordinary string " + s
    elif isinstance(s, unicode):
        print "unicode string " + s
    else:
        print "not a string"


whatisthis(s)
whatisthis(u)
print tagger.tag(s)
print tagger.tag(u.encode('utf8'))
whatisthis(u.encode('utf8'))
whatisthis(unicode(s,'utf8'))
print
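
What the whatisthis() calls above print under Python 2 (assuming a UTF-8 source file, per the coding declaration):

# whatisthis(s)                   -> ordinary string ok α   (byte-string literal)
# whatisthis(u)                   -> unicode string ok α
# whatisthis(u.encode('utf8'))    -> ordinary string ok α   (.encode() yields bytes)
# whatisthis(unicode(s, 'utf8'))  -> unicode string ok α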
30 changes: 30 additions & 0 deletions NLP_Engine/POSTagger/test_classifyer.py
@@ -0,0 +1,30 @@
import cPickle as pickle
from nltk.tag.stanford import POSTagger
from NLP_Engine.Common.helper import extract_tags

__author__ = 'Luke'



test_set = []
with open("../../Data/Test/test-data.csv") as test_in:
    for line in test_in:
        sentiment, tweet_content = line.split('\t', 1)
        if sentiment == 'neg' or sentiment == 'pos':
            sentiment = 'sub'
        elif sentiment == 'neutral':
            sentiment = 'obj'
        else:
            print 'error'
        test_set.append((tweet_content, sentiment))

print len(test_set)
tagger = POSTagger('stanford-model.tagger', 'stanford-postagger.jar', encoding='utf8')

tagged_set = tagger.batch_tag([sent.split() for sent, label in test_set])
test_data = [extract_tags(sent) for sent in tagged_set]

targets = [label for sent, label in test_set]
svm = pickle.load(open('../../Data/Models/sentiment-classifyer-svm'))

print svm.score(test_data,targets)
30 changes: 30 additions & 0 deletions Scripts/CreateExtraSmallTrainingData.py
@@ -0,0 +1,30 @@
__author__ = 'Luke'


import os

__author__ = 'Luke'

print 'start '
counter = 0
with open("/Users/Luke/Documents/Project-Files/Training-Data/training.1600000.processed.noemoticon.csv") as input:
with open("/Users/Luke/Documents/PyCharmProjects/TwitterSentiment/Data/Training/training-data-extra-small.csv",'w') as output:
for each_line in input:
elements = each_line.split('","')
elements[0] = elements[0][1:]
elements[5] = elements[5][:-2]
if(elements[0] == '0'):
elements[0] = 'neg'
if(elements[0] == '2'):
elements[0] = 'neutral'
if(elements[0] == '4'):
elements[0] = 'pos'

if counter % 100 == 0:
output.write(elements[0] + "\t" + elements[5] + '\n')

counter += 1



print 'done'
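
The parsing assumes the Sentiment140 CSV layout of six double-quoted fields; a sketch of one input line and what survives (the sample line is illustrative):

# '"0","1467810369","Mon Apr 06 22:19:45 PDT 2009","NO_QUERY","some_user","is upset ..."\n'
# elements[0][1:]  -> '0'             (polarity: 0 = neg, 2 = neutral, 4 = pos)
# elements[5][:-2] -> 'is upset ...'  (tweet text with the trailing quote and newline cut)
# every 100th line is written out as: 'neg\tis upset ...\n'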
