First Commit
LukeDefeo committed Jul 19, 2013
1 parent 1dba68f commit 85bbb02
Showing 15 changed files with 227 additions and 0 deletions.
5 changes: 5 additions & 0 deletions .gitignore
@@ -33,3 +33,8 @@ nosetests.xml
.mr.developer.cfg
.project
.pydevproject

.idea/*
/Data/*

.DS_Store
1 change: 1 addition & 0 deletions Experimental/Common/__init__.py
@@ -0,0 +1 @@
__author__ = 'Luke'
16 changes: 16 additions & 0 deletions Experimental/Common/tokeniser.py
@@ -0,0 +1,16 @@
import re
import string

__author__ = 'Luke'



def contains_url(word):
    # matches tokens such as 'example.com/path', anchored at the start of the word
    url_pattern = r'(\S+\.(com|co\.uk|ac|info|ly|net|org|edu|gov)(/\S+)?)'
    return re.match(url_pattern, word) is not None

def strip_punctuation(word):
    # str.translate with a None table deletes the listed characters (Python 2 str API)
    return word.translate(None, string.punctuation)
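
A quick sanity check of the two helpers (a sketch, assuming the package root is on PYTHONPATH; Python 2, since str.translate is called with a deletion table):

    from Experimental.Common.tokeniser import contains_url, strip_punctuation

    print contains_url('twitpic.com/abc123')    # True
    print contains_url('hello')                 # False
    print strip_punctuation('lots?!')           # 'lots'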
78 changes: 78 additions & 0 deletions Experimental/NaiveBayes/NGrams/UniGrams.py
@@ -0,0 +1,78 @@
import time
import random
import string

__author__ = 'Luke'

import nltk
from nltk import NaiveBayesClassifier
from nltk.corpus import movie_reviews
from Experimental.Common.tokeniser import *



#read data

start_time = time.time()
tweets = []
i = 0

words = set()




def add_word_to_set(word):
    # skip URLs and @-mentions; strip punctuation before adding
    if contains_url(word):
        return
    if word[0] == '@':
        return

    word = word.translate(None, string.punctuation)
    words.add(word)


with open("/Users/Luke/Documents/PyCharmProjects/TwitterSentiment/Data/Training/training-data.csv") as training_in:
    for line in training_in:
        sentiment, tweet_content = line.split('\t', 1)
        tweets.append((sentiment, tweet_content))
        for word in tweet_content.split():
            add_word_to_set(word.lower())
        i += 1
        # if i == 1200:
        #     break

print words
print len(words)

print "done " + str(time.time() - start_time) + 's'


reviews = [(movie_reviews.words(fileid), category)
           for category in movie_reviews.categories()
           for fileid in movie_reviews.fileids(category)]

random.shuffle(reviews)

all_words = nltk.FreqDist(w.lower() for w in movie_reviews.words())
common_words = all_words.keys()[:2000]


def tweet_features(tweet):
    tweet_words = set(tweet)
    features = {}
    for word in common_words:
        features["contains(%s)" % word] = (word in tweet_words)

    return features

# generate feature-set tuples: (feature dict, category)
feature_set = [(tweet_features(doc), cat) for (doc, cat) in reviews]

cut_off = int(0.8 * len(feature_set))
train_set = feature_set[:cut_off]
test_set = feature_set[cut_off:]
classifier = NaiveBayesClassifier.train(train_set)

print nltk.classify.accuracy(classifier, test_set)
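
For one document, the extracted features form a bag-of-words presence dict over the 2,000 most frequent corpus words; a sketch of the shape (hypothetical words):

    tweet = ['great', 'movie', '!']
    # tweet_features(tweet) yields entries such as:
    # {'contains(great)': True, 'contains(awful)': False, ...}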
1 change: 1 addition & 0 deletions Experimental/NaiveBayes/NGrams/__init__.py
@@ -0,0 +1 @@
__author__ = 'Luke'
33 changes: 33 additions & 0 deletions Experimental/NaiveBayes/NGrams/gender.py
@@ -0,0 +1,33 @@
__author__ = 'Luke'
from nltk import NaiveBayesClassifier
from nltk.corpus import names
import nltk
import random


# Extract features from a name: a dict of key/value pairs;
# the values are what the classifier learns from.
def gender_features(word):
    return {'last_letter': word[-1],
            'first_letter': word[0],
            'length': len(word),
            'last_two': word[-2:],
            'last_three': word[-3:],
            'first_two': word[:2]}

# Get data: labelled (name, gender) pairs from the NLTK names corpus
labelled_names = ([(name, 'male') for name in names.words('male.txt')] +
                  [(name, 'female') for name in names.words('female.txt')])

random.shuffle(labelled_names)

# Feature set: tuples of (feature dict, correct label)
feature_set = [(gender_features(name), gender) for (name, gender) in labelled_names]

#Split into test and training data
cut_off = int(0.8 * len(feature_set))
training_data = feature_set[:cut_off]
test_data = feature_set[cut_off:]

classifier = NaiveBayesClassifier.train(training_data)
print nltk.classify.accuracy(classifier, test_data)

# show_most_informative_features prints its table itself (and returns None)
classifier.show_most_informative_features()
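
For a concrete name the feature dict looks like this ('Lucas' is an arbitrary example):

    print gender_features('Lucas')
    # {'last_letter': 's', 'first_letter': 'L', 'length': 5,
    #  'last_two': 'as', 'last_three': 'cas', 'first_two': 'Lu'}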



39 changes: 39 additions & 0 deletions Experimental/NaiveBayes/NGrams/movie_sent.py
@@ -0,0 +1,39 @@
__author__ = 'Luke'

import nltk
import random
from nltk import NaiveBayesClassifier
from nltk.corpus import movie_reviews



#Get data
reviews = [(movie_reviews.words(fileid), category)
           for category in movie_reviews.categories()
           for fileid in movie_reviews.fileids(category)]

random.shuffle(reviews)

all_words = nltk.FreqDist(w.lower() for w in movie_reviews.words())
# NLTK 2.x: FreqDist.keys() returns samples sorted by decreasing frequency
common_words = all_words.keys()[:2000]


def doc_features(doc):
    doc_words = set(doc)
    features = {}
    for word in common_words:
        features["contains(%s)" % word] = (word in doc_words)

    return features

# generate feature-set tuples: (feature dict, category)
feature_set = [(doc_features(doc), cat) for (doc, cat) in reviews]

cut_off = int(0.8 * len(feature_set))
train_set = feature_set[:cut_off]
test_set = feature_set[cut_off:]
classifier = NaiveBayesClassifier.train(train_set)

print nltk.classify.accuracy(classifier, test_set)
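
Taking the first 2,000 of FreqDist.keys() depends on the NLTK 2.x behaviour of keys() returning samples in decreasing-frequency order; under NLTK 3 the equivalent (a sketch, not part of this commit) would be:

    common_words = [w for (w, _) in all_words.most_common(2000)]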
1 change: 1 addition & 0 deletions Experimental/NaiveBayes/__init__.py
@@ -0,0 +1 @@
__author__ = 'Luke'
1 change: 1 addition & 0 deletions Experimental/POSTagger/__init__.py
@@ -0,0 +1 @@
__author__ = 'Luke'
1 change: 1 addition & 0 deletions Experimental/__init__.py
@@ -0,0 +1 @@
__author__ = 'Luke'
1 change: 1 addition & 0 deletions Production/__init__.py
@@ -0,0 +1 @@
__author__ = 'Luke'
26 changes: 26 additions & 0 deletions Scripts/CreateTrainingData.py
@@ -0,0 +1,26 @@
__author__ = 'Luke'

print 'start'

with open("/Users/Luke/Documents/Project-Files/Training-Data/training.1600000.processed.noemoticon.csv") as infile:
    with open("/Users/Luke/Documents/PyCharmProjects/TwitterSentiment/Data/Training/training-data.csv", 'w') as output:
        for each_line in infile:
            elements = each_line.split('","')
            elements[0] = elements[0][1:]    # drop the leading quote
            elements[5] = elements[5][:-2]   # drop the trailing quote and newline
            if elements[0] == '0':
                elements[0] = 'neg'
            elif elements[0] == '2':
                elements[0] = 'neutral'
            elif elements[0] == '4':
                elements[0] = 'pos'

            output.write(elements[0] + "\t" + elements[5] + '\n')




print 'done'
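
The input appears to be the Sentiment140 CSV, where field 0 is the polarity (0/2/4) and field 5 the tweet text; a hypothetical row and the tab-separated line it becomes:

    "0","1467810369","Mon Apr 06 22:19:45 PDT 2009","NO_QUERY","someuser","is upset about the weather"
    # becomes:
    neg	is upset about the weather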
1 change: 1 addition & 0 deletions Scripts/__init__.py
@@ -0,0 +1 @@
__author__ = 'Luke'
1 change: 1 addition & 0 deletions Test/__init__.py
@@ -0,0 +1 @@
__author__ = 'Luke'
22 changes: 22 additions & 0 deletions Test/test_tokeniser.py
@@ -0,0 +1,22 @@
__author__ = 'Luke'

from unittest import TestCase
import unittest
from Experimental.Common.tokeniser import *

class test_tokenizer(TestCase):

    def test_contains_url(self):
        urls = ['www.abc.com', 'http://www.cwac.co.uk/dwad', 'https://dwda.org/dwad', 'twitpic.com/wdadawdd']
        for url in urls:
            self.assertTrue(contains_url(url))

    def test_strip_punctuation(self):
        words = ['a sentence,', 'with.', 'lots?', 'of', 'strange!', 'strange', 'strange...', 'punctuation']
        for word in words:
            stripped = strip_punctuation(word)
            self.assertFalse(any(ch in string.punctuation for ch in stripped))


if __name__ == '__main__':
unittest.main()
