-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
15 changed files
with
227 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -33,3 +33,8 @@ nosetests.xml | |
.mr.developer.cfg | ||
.project | ||
.pydevproject | ||
|
||
.idea/* | ||
/Data/* | ||
|
||
.DS_STORE |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
__author__ = 'Luke' |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,16 @@ | ||
import re | ||
import string | ||
|
||
__author__ = 'Luke' | ||
|
||
|
||
|
||
def contains_url(word):
    """Return True when *word* contains something that looks like a URL.

    A URL is recognised as a run of non-space characters, a dot, one of a
    fixed list of domain suffixes (com, co.uk, ac, info, ly, net, org, edu,
    gov) and an optional ``/path``.

    :param word: the token (or longer text) to inspect.
    :returns: ``True`` if a URL-like substring is found, otherwise ``False``.
    """
    # Raw string, so the regex escapes are not read as string escapes.
    # The \b after the suffix stops it matching the start of a longer word
    # (e.g. "foo.community" is not a ".com" address).
    url_pattern = r'\S+\.(com|co\.uk|ac|info|ly|net|org|edu|gov)\b(/\S+)?'
    # search() rather than match(): the URL may sit anywhere in the text.
    return re.search(url_pattern, word) is not None
|
||
def strip_punctuation(word):
    """Return *word* with every ASCII punctuation character removed.

    Implemented with a join rather than ``str.translate(None, ...)``: the
    two-argument translate form only exists for Python 2 byte strings and
    raises TypeError for unicode text.

    :param word: the text to clean.
    :returns: a new string without any character from ``string.punctuation``.
    """
    return ''.join(ch for ch in word if ch not in string.punctuation)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,78 @@ | ||
import sets | ||
import time | ||
|
||
__author__ = 'Luke' | ||
|
||
import nltk | ||
from Experimental.Common.tokeniser import * | ||
import random | ||
import string | ||
from nltk import NaiveBayesClassifier | ||
|
||
|
||
|
||
# Read data.
# Wall-clock start, used further down to report how long loading took.
start_time = time.time()

# (sentiment, tweet text) pairs, a running line counter, and the set of
# distinct words collected from the tweets.
tweets, i, words = [], 0, set()
|
||
|
||
|
||
|
||
def add_word_to_set(word):
    """Add *word*, with punctuation removed, to the module-level ``words`` set.

    Tokens that contain a URL (per ``contains_url``) or start with ``@`` are
    ignored, as are tokens that are empty once punctuation has been removed.

    :param word: a single token taken from a tweet.
    """
    if not word or contains_url(word) or word.startswith('@'):
        return

    # Strings are immutable: the stripped value has to be kept, otherwise
    # the word is stored with its punctuation still attached.
    word = strip_punctuation(word)
    if word:
        words.add(word)
|
||
|
||
# Each line of the training file is "<sentiment>\t<tweet text>".
with open("/Users/Luke/Documents/PyCharmProjects/TwitterSentiment/Data/Training/training-data.csv") as training_in:
    for line in training_in:
        # Drop the line terminator so it is not stored as part of the tweet.
        fields = line.rstrip('\r\n').split('\t', 1)
        if len(fields) != 2:
            # Blank or malformed line: there is no tab to split on.
            continue
        sentiment, tweet_content = fields
        tweets.append((sentiment, tweet_content))
        for word in tweet_content.split():
            add_word_to_set(word.lower())
        i += 1

print(words)
print(len(words))

print("done " + str(time.time() - start_time) + 's')
|
||
|
||
# movie_reviews is not imported at the top of this script; bring it into
# scope here so the statements below do not raise NameError.
from nltk.corpus import movie_reviews

# Pair the word list of every review with its category label.
reviews = [(movie_reviews.words(fileid), category)
           for category in movie_reviews.categories()
           for fileid in movie_reviews.fileids(category)]

random.shuffle(reviews)

all_words = nltk.FreqDist(w.lower() for w in movie_reviews.words())
try:
    # most_common() is ordered by frequency; keys() makes no such promise
    # on NLTK versions where FreqDist is a Counter.
    common_words = [word for word, _count in all_words.most_common(2000)]
except AttributeError:
    # NOTE(review): presumably an older NLTK without most_common(), whose
    # keys() is already frequency-sorted -- confirm against the installed
    # version.
    common_words = all_words.keys()[:2000]
|
||
|
||
def tweet_features(tweet, vocabulary=None):
    """Build a bag-of-words feature dict for one tweet.

    :param tweet: iterable of the words in the tweet.
    :param vocabulary: the words to test for; defaults to the module-level
        ``common_words`` list.
    :returns: dict mapping ``"contains(<word>)"`` to ``True`` or ``False``,
        one entry per vocabulary word.
    """
    if vocabulary is None:
        vocabulary = common_words
    # Set membership keeps each lookup O(1) regardless of tweet length.
    tweet_words = set(tweet)
    return dict(("contains(%s)" % word, word in tweet_words)
                for word in vocabulary)
|
||
# Generate the feature set: (feature dict, category) tuples, one per item.
# The feature extractor defined in this script is tweet_features.
feature_set = [(tweet_features(doc), cat) for (doc, cat) in reviews]

# The first 80% trains the classifier, the remaining 20% measures accuracy.
cut_off = int(0.8 * len(feature_set))
train_set = feature_set[:cut_off]
test_set = feature_set[cut_off:]
classifier = NaiveBayesClassifier.train(train_set)

print(nltk.classify.accuracy(classifier, test_set))
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
__author__ = 'Luke' |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,33 @@ | ||
__author__ = 'Luke' | ||
from nltk import NaiveBayesClassifier | ||
from nltk.corpus import names | ||
import nltk | ||
import random | ||
|
||
|
||
def gender_features(word):
    """Extract the features used to classify a name.

    The features are plain key/value pairs; the values are what the
    classifier works with.

    :param word: the name to describe.
    :returns: dict with the first and last letter, the first two letters,
        the last two and last three letters, and the length of *word*.
        Slices are used throughout, so an empty name yields empty strings
        instead of raising IndexError.
    """
    return {
        'last_letter': word[-1:],
        'first_letter': word[:1],
        'length': len(word),
        'last_two': word[-2:],
        # word[len(word) - 3:] wraps to a negative start for two-letter
        # names and keeps only the final letter; word[-3:] keeps the name.
        'last_three': word[-3:],
        'first_two': word[:2],
    }
|
||
# Get data: label every name in the corpus with its gender.
# Note that this rebinds ``names`` from the corpus reader to the list.
names = ([(name, 'male') for name in names.words('male.txt')] +
         [(name, 'female') for name in names.words('female.txt')])

random.shuffle(names)

# Feature set: (gender feature dict, correct label) tuples.
feature_set = [(gender_features(name), gender) for (name, gender) in names]

# Split into training (first 80%) and test (last 20%) data.
cut_off = int(0.8 * len(feature_set))
training_data = feature_set[:cut_off]
test_data = feature_set[cut_off:]

classifier = NaiveBayesClassifier.train(training_data)
print(nltk.classify.accuracy(classifier, test_data))

# show_most_informative_features() prints its own report and returns None,
# so it is called directly; printing its result would emit a stray "None".
classifier.show_most_informative_features()
|
||
|
||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,39 @@ | ||
__author__ = 'Luke' | ||
|
||
import nltk | ||
import random | ||
from nltk import NaiveBayesClassifier | ||
from nltk.corpus import names | ||
|
||
from nltk.corpus import movie_reviews | ||
|
||
|
||
|
||
# Get data: pair the word list of every review with its category label.
reviews = [(movie_reviews.words(fileid), category)
           for category in movie_reviews.categories()
           for fileid in movie_reviews.fileids(category)]

random.shuffle(reviews)

all_words = nltk.FreqDist(w.lower() for w in movie_reviews.words())
try:
    # most_common() is ordered by frequency; keys() makes no such promise
    # on NLTK versions where FreqDist is a Counter.
    common_words = [word for word, _count in all_words.most_common(2000)]
except AttributeError:
    # NOTE(review): presumably an older NLTK without most_common(), whose
    # keys() is already frequency-sorted -- confirm against the installed
    # version.
    common_words = all_words.keys()[:2000]
|
||
|
||
def doc_features(doc):
    """Describe a document by which of the common words it contains.

    :param doc: iterable of the words in the document.
    :returns: dict mapping ``"contains(<word>)"`` to a bool, one entry per
        word in the module-level ``common_words``.
    """
    present = set(doc)
    return dict(("contains(%s)" % term, term in present)
                for term in common_words)
|
||
# Generate the feature set: one (feature dict, category) tuple per review.
feature_set = []
for doc, cat in reviews:
    feature_set.append((doc_features(doc), cat))

# Hold out the last 20% of the feature set for measuring accuracy.
cut_off = int(len(feature_set) * 0.8)
train_set, test_set = feature_set[:cut_off], feature_set[cut_off:]
classifier = NaiveBayesClassifier.train(train_set)

print(nltk.classify.accuracy(classifier, test_set))
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
__author__ = 'Luke' |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
__author__ = 'Luke' |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
__author__ = 'Luke' |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
__author__ = 'Luke' |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,26 @@ | ||
import os | ||
|
||
__author__ = 'Luke' | ||
|
||
print('start ')

# Codes found in the first column of the raw file -> labels written out.
# Any other value is passed through unchanged.
SENTIMENT_LABELS = {'0': 'neg', '2': 'neutral', '4': 'pos'}

# Each raw line holds six double-quoted, comma-separated fields; the first
# is the sentiment code and the sixth is the tweet text.
with open("/Users/Luke/Documents/Project-Files/Training-Data/training.1600000.processed.noemoticon.csv") as raw_in:
    with open("/Users/Luke/Documents/PyCharmProjects/TwitterSentiment/Data/Training/training-data.csv", 'w') as output:
        for each_line in raw_in:
            # Only the first five separators delimit fields; capping the
            # split keeps a '","' inside the tweet text from truncating it.
            elements = each_line.rstrip('\r\n').split('","', 5)
            if len(elements) != 6:
                # Blank or malformed line: nothing usable to write.
                continue

            sentiment = elements[0][1:]  # drop the opening quote
            text = elements[5]
            if text.endswith('"'):
                text = text[:-1]  # drop the closing quote

            label = SENTIMENT_LABELS.get(sentiment, sentiment)
            output.write(label + "\t" + text + '\n')

print('done')
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
__author__ = 'Luke' |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
__author__ = 'Luke' |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,22 @@ | ||
__author__ = 'Luke' | ||
|
||
from unittest import TestCase | ||
import unittest | ||
from Experimental.Common.tokeniser import * | ||
|
||
class test_tokenizer(TestCase):
    """Unit tests for the helpers imported from Experimental.Common.tokeniser."""

    def test_contains_url(self):
        """Every sample URL must be recognised by contains_url."""
        urls = ['www.abc.com', 'http://www.cwac.co.uk/dwad', 'https://dwda.org/dwad', 'twitpic.com/wdadawdd']
        for url in urls:
            self.assertTrue(contains_url(url))

    def test_strip_punctuation(self):
        """No punctuation character may survive strip_punctuation."""
        words = ['a sentence,', 'with.', 'lots?', 'of', 'strange!', 'strange', 'stange...', 'punctuation']
        for word in words:
            stripped = strip_punctuation(word)
            # Inspect the returned string. A bare generator expression is
            # always truthy, so the check has to be reduced with any().
            self.assertFalse(any(ch in string.punctuation for ch in stripped))


if __name__ == '__main__':
    unittest.main()