First Commit
LukeDefeo committed Jul 19, 2013
1 parent 1dba68f commit 85bbb02
Showing 15 changed files with 227 additions and 0 deletions.
5 changes: 5 additions & 0 deletions .gitignore
@@ -33,3 +33,8 @@ nosetests.xml
.mr.developer.cfg
.project
.pydevproject

.idea/*
/Data/*

.DS_Store
1 change: 1 addition & 0 deletions Experimental/Common/__init__.py
@@ -0,0 +1 @@
__author__ = 'Luke'
16 changes: 16 additions & 0 deletions Experimental/Common/tokeniser.py
@@ -0,0 +1,16 @@
import re
import string

__author__ = 'Luke'



def contains_url(word):
    # matches tokens such as 'example.com/path', anchored at the start of the word
    url_pattern = r'(\S+\.(com|co\.uk|ac|info|ly|net|org|edu|gov)(/\S+)?)'
    return re.match(url_pattern, word) is not None

def strip_punctuation(word):
    # str.translate with a None table deletes the listed characters (Python 2 str API)
    return word.translate(None, string.punctuation)
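
A quick sanity check of the two helpers (a sketch, assuming the package root is on PYTHONPATH; Python 2, since str.translate is called with a deletion table):

    from Experimental.Common.tokeniser import contains_url, strip_punctuation

    print contains_url('twitpic.com/abc123')    # True
    print contains_url('hello')                 # False
    print strip_punctuation('lots?!')           # 'lots'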
78 changes: 78 additions & 0 deletions Experimental/NaiveBayes/NGrams/UniGrams.py
@@ -0,0 +1,78 @@
import time
import random
import string

__author__ = 'Luke'

import nltk
from nltk import NaiveBayesClassifier
from nltk.corpus import movie_reviews
from Experimental.Common.tokeniser import *



#read data

start_time = time.time()
tweets = []
i = 0

words = set()




def add_word_to_set(word):
    # skip URLs and @-mentions; strip punctuation before adding
    if contains_url(word):
        return
    if word[0] == '@':
        return

    word = word.translate(None, string.punctuation)
    words.add(word)


with open("/Users/Luke/Documents/PyCharmProjects/TwitterSentiment/Data/Training/training-data.csv") as training_in:
    for line in training_in:
        sentiment, tweet_content = line.split('\t', 1)
        tweets.append((sentiment, tweet_content))
        for word in tweet_content.split():
            add_word_to_set(word.lower())
        i += 1
        # if i == 1200:
        #     break

print words
print len(words)

print "done " + str(time.time() - start_time) + 's'


reviews = [(movie_reviews.words(fileid), category)
           for category in movie_reviews.categories()
           for fileid in movie_reviews.fileids(category)]

random.shuffle(reviews)

all_words = nltk.FreqDist(w.lower() for w in movie_reviews.words())
common_words = all_words.keys()[:2000]


def tweet_features(tweet):
    tweet_words = set(tweet)
    features = {}
    for word in common_words:
        features["contains(%s)" % word] = (word in tweet_words)

    return features

# generate feature-set tuples: (feature dict, category)
feature_set = [(tweet_features(doc), cat) for (doc, cat) in reviews]

cut_off = int(0.8 * len(feature_set))
train_set = feature_set[:cut_off]
test_set = feature_set[cut_off:]
classifier = NaiveBayesClassifier.train(train_set)

print nltk.classify.accuracy(classifier, test_set)
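
For one document, the extracted features form a bag-of-words presence dict over the 2,000 most frequent corpus words; a sketch of the shape (hypothetical words):

    tweet = ['great', 'movie', '!']
    # tweet_features(tweet) yields entries such as:
    # {'contains(great)': True, 'contains(awful)': False, ...}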
1 change: 1 addition & 0 deletions Experimental/NaiveBayes/NGrams/__init__.py
@@ -0,0 +1 @@
__author__ = 'Luke'
33 changes: 33 additions & 0 deletions Experimental/NaiveBayes/NGrams/gender.py
@@ -0,0 +1,33 @@
__author__ = 'Luke'
from nltk import NaiveBayesClassifier
from nltk.corpus import names
import nltk
import random


# Extract features from a name: a dict of key/value pairs;
# the values are what the classifier learns from.
def gender_features(word):
    return {'last_letter': word[-1],
            'first_letter': word[0],
            'length': len(word),
            'last_two': word[-2:],
            'last_three': word[-3:],
            'first_two': word[:2]}

# Get data: labelled (name, gender) pairs from the NLTK names corpus
labelled_names = ([(name, 'male') for name in names.words('male.txt')] +
                  [(name, 'female') for name in names.words('female.txt')])

random.shuffle(labelled_names)

# Feature set: tuples of (feature dict, correct label)
feature_set = [(gender_features(name), gender) for (name, gender) in labelled_names]

#Split into test and training data
cut_off = int(0.8 * len(feature_set))
training_data = feature_set[:cut_off]
test_data = feature_set[cut_off:]

classifier = NaiveBayesClassifier.train(training_data)
print nltk.classify.accuracy(classifier, test_data)

# show_most_informative_features prints its table itself (and returns None)
classifier.show_most_informative_features()
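
For a concrete name the feature dict looks like this ('Lucas' is an arbitrary example):

    print gender_features('Lucas')
    # {'last_letter': 's', 'first_letter': 'L', 'length': 5,
    #  'last_two': 'as', 'last_three': 'cas', 'first_two': 'Lu'}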



39 changes: 39 additions & 0 deletions Experimental/NaiveBayes/NGrams/movie_sent.py
@@ -0,0 +1,39 @@
__author__ = 'Luke'

import nltk
import random
from nltk import NaiveBayesClassifier
from nltk.corpus import movie_reviews



#Get data
reviews = [(movie_reviews.words(fileid), category)
           for category in movie_reviews.categories()
           for fileid in movie_reviews.fileids(category)]

random.shuffle(reviews)

all_words = nltk.FreqDist(w.lower() for w in movie_reviews.words())
# NLTK 2.x: FreqDist.keys() returns samples sorted by decreasing frequency
common_words = all_words.keys()[:2000]


def doc_features(doc):
    doc_words = set(doc)
    features = {}
    for word in common_words:
        features["contains(%s)" % word] = (word in doc_words)

    return features

# generate feature-set tuples: (feature dict, category)
feature_set = [(doc_features(doc), cat) for (doc, cat) in reviews]

cut_off = int(0.8 * len(feature_set))
train_set = feature_set[:cut_off]
test_set = feature_set[cut_off:]
classifier = NaiveBayesClassifier.train(train_set)

print nltk.classify.accuracy(classifier, test_set)
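
Taking the first 2,000 of FreqDist.keys() depends on the NLTK 2.x behaviour of keys() returning samples in decreasing-frequency order; under NLTK 3 the equivalent (a sketch, not part of this commit) would be:

    common_words = [w for (w, _) in all_words.most_common(2000)]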
1 change: 1 addition & 0 deletions Experimental/NaiveBayes/__init__.py
@@ -0,0 +1 @@
__author__ = 'Luke'
1 change: 1 addition & 0 deletions Experimental/POSTagger/__init__.py
@@ -0,0 +1 @@
__author__ = 'Luke'
1 change: 1 addition & 0 deletions Experimental/__init__.py
@@ -0,0 +1 @@
__author__ = 'Luke'
1 change: 1 addition & 0 deletions Production/__init__.py
@@ -0,0 +1 @@
__author__ = 'Luke'
26 changes: 26 additions & 0 deletions Scripts/CreateTrainingData.py
@@ -0,0 +1,26 @@
__author__ = 'Luke'

print 'start'

with open("/Users/Luke/Documents/Project-Files/Training-Data/training.1600000.processed.noemoticon.csv") as infile:
    with open("/Users/Luke/Documents/PyCharmProjects/TwitterSentiment/Data/Training/training-data.csv", 'w') as output:
        for each_line in infile:
            elements = each_line.split('","')
            elements[0] = elements[0][1:]    # drop the leading quote
            elements[5] = elements[5][:-2]   # drop the trailing quote and newline
            if elements[0] == '0':
                elements[0] = 'neg'
            elif elements[0] == '2':
                elements[0] = 'neutral'
            elif elements[0] == '4':
                elements[0] = 'pos'

            output.write(elements[0] + "\t" + elements[5] + '\n')




print 'done'
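
The input appears to be the Sentiment140 CSV, where field 0 is the polarity (0/2/4) and field 5 the tweet text; a hypothetical row and the tab-separated line it becomes:

    "0","1467810369","Mon Apr 06 22:19:45 PDT 2009","NO_QUERY","someuser","is upset about the weather"
    # becomes:
    neg	is upset about the weather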
1 change: 1 addition & 0 deletions Scripts/__init__.py
@@ -0,0 +1 @@
__author__ = 'Luke'
1 change: 1 addition & 0 deletions Test/__init__.py
@@ -0,0 +1 @@
__author__ = 'Luke'
22 changes: 22 additions & 0 deletions Test/test_tokeniser.py
@@ -0,0 +1,22 @@
__author__ = 'Luke'

from unittest import TestCase
import unittest
from Experimental.Common.tokeniser import *

class test_tokenizer(TestCase):

    def test_contains_url(self):
        urls = ['www.abc.com', 'http://www.cwac.co.uk/dwad', 'https://dwda.org/dwad', 'twitpic.com/wdadawdd']
        for url in urls:
            self.assertTrue(contains_url(url))

    def test_strip_punctuation(self):
        words = ['a sentence,', 'with.', 'lots?', 'of', 'strange!', 'strange', 'strange...', 'punctuation']
        for word in words:
            stripped = strip_punctuation(word)
            self.assertFalse(any(ch in string.punctuation for ch in stripped))


if __name__ == '__main__':
unittest.main()
