-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
11 changed files
with
260 additions
and
50 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,28 @@ | ||
|
||
__author__ = 'Luke' | ||
|
||
"Code from http://stackoverflow.com/questions/323750/how-to-access-previous-next-element-while-for-looping" | ||
def neighborhood(iterable):
    """Yield ``(previous, current, following)`` triples for *iterable*.

    ``previous`` is ``None`` for the first element and ``following`` is
    ``None`` for the last one.  An empty iterable yields nothing.

    :param iterable: any iterable; it is consumed exactly once.
    :return: a generator of 3-tuples.
    """
    iterator = iter(iterable)
    try:
        # The built-in next() works on Python 2.6+ and 3.x, whereas the
        # ``iterator.next()`` method only exists on Python 2.
        item = next(iterator)
    except StopIteration:
        # Empty input: end the generator cleanly instead of leaking
        # StopIteration (a RuntimeError inside generators since PEP 479).
        return
    prev = None
    # The loop variable must not be called ``next``: that would make the
    # name local to this function and break the next() call above.
    for following in iterator:
        yield (prev, item, following)
        prev = item
        item = following
    yield (prev, item, None)
|
||
|
||
def extract_tags(tagged_sent):
    """Count how often each tag listed in ``tag_index`` occurs in a sentence.

    :param tagged_sent: iterable of ``(word, tag)`` pairs.
    :return: list with one counter per entry of ``tag_index``; the counter
        for a tag sits at position ``tag_index[tag]``.
    :raises KeyError: if a tag is not present in ``tag_index``.
    """
    counts = [0 for _ in tag_index]
    for _word, tag in tagged_sent:
        slot = tag_index[tag]
        counts[slot] = counts[slot] + 1
    return counts
|
||
|
||
# Maps each part-of-speech tag to its position in the count vector built by
# extract_tags().  The position is simply the tag's index in the list below,
# so the order of this list must not change.
tag_index = {
    tag: position
    for position, tag in enumerate([
        'CC', 'CD', 'DT', 'EX', 'FW', 'IN', 'JJ', 'JJR', 'JJS', 'LS', 'MD',
        'NN', 'NNS', 'NNP', 'NNPS', 'PDT', 'POS', 'PRP', 'PP$', 'RB',
        'RBR', 'RBS', 'RP', 'SYM', 'TO', 'UH', 'VB', 'VBD', 'VBG', 'VBN',
        'VBP', 'VBZ', 'WDT', 'WP', 'WP$', 'WRB', '#', '$', '.', ',',
        ':', '(', ')', '"', "'", "``", "''", 'PRP$',
    ])
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,25 @@ | ||
import cPickle as pickle | ||
from sklearn.svm import SVC | ||
from NLP_Engine.Common.helper import extract_tags | ||
|
||
|
||
__author__ = 'Luke' | ||
|
||
# Train an SVM on tag-count features and persist the fitted model.
#
# The training file is unpickled into a sequence of (tagged sentence, label)
# pairs.  NOTE: pickle must only be used on files you trust.
# Pickles are binary data, so the file is opened in 'rb' mode, and a context
# manager makes sure the handle is closed again.
with open('../../Data/Training/sentiment_detector_training.obj', 'rb') as training_file:
    source = pickle.load(training_file)

# Split the pairs into parallel sequences of sentences and labels.
tagged_set, total_target = zip(*source)

# One tag-count feature vector per sentence.
total_data = [extract_tags(sent) for sent in tagged_set]

# First 85% of the samples are used for fitting, the rest for scoring.
cut_off = int(0.85 * len(total_data))

training_data = total_data[:cut_off]
test_data = total_data[cut_off:]

training_target = total_target[:cut_off]
test_target = total_target[cut_off:]

svm = SVC()
svm.fit(training_data, training_target)

# Closing the file explicitly guarantees the model is flushed to disk.
with open('../../Data/Models/sentiment-classifyer-svm', 'wb') as model_file:
    pickle.dump(svm, model_file)

# Score on the held-out samples (single-argument print() works on Python 2 and 3).
print(svm.score(test_data, test_target))
This file was deleted.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,56 @@ | ||
import random | ||
|
||
from nltk.tag.stanford import POSTagger | ||
from sklearn.svm import SVC | ||
from NLP_Engine.Common.helper import extract_tags | ||
|
||
|
||
__author__ = 'Luke' | ||
import cPickle as pickle | ||
|
||
|
||
def tokenise_tweet():
    """Placeholder for tweet tokenisation; not implemented, returns ``None``."""
    return None
|
||
|
||
|
||
# Build the POS-tagged training set, store it on disk and fit a first SVM.
#
# NOTE: pickle must only be used on files you trust.  Pickles are binary
# data, so the files are opened in 'rb' mode, and context managers make sure
# every handle is closed again.
with open('../../Data/Training/objective-tweets.obj', 'rb') as objective_file:
    objective_tweets = pickle.load(objective_file)
with open('../../Data/Training/subjective-tweets.obj', 'rb') as subjective_file:
    subjective_tweets = pickle.load(subjective_file)

# Attach the class label to every tweet.  The subjective entries are
# (tweet, sentiment) pairs; the sentiment itself is dropped here.
objective_tweets = [(tweet, u'obj') for tweet in objective_tweets]
subjective_tweets = [(tweet, u'sub') for tweet, sent in subjective_tweets]

# Uncomment to run on a small sample while debugging:
# objective_tweets = objective_tweets[:100]
# subjective_tweets = subjective_tweets[:100]

total_set = objective_tweets + subjective_tweets
random.shuffle(total_set)

# First 85% of the shuffled samples are used for fitting, the rest for scoring.
cut_off = int(0.85 * len(total_set))

tagger = POSTagger('stanford-model.tagger', 'stanford-postagger.jar', encoding='utf8')

# Tweets are tokenised on whitespace before tagging.
tagged_set = tagger.batch_tag([sent.split() for sent, label in total_set])

total_target = [label for sent, label in total_set]

to_disk = zip(tagged_set, total_target)
# Both lengths are printed so a mismatch between tagger output and input
# can be spotted.
print(len(tagged_set))
print(len(total_set))

# Closing the file explicitly guarantees the data is flushed to disk.
with open('../../Data/Training/sentiment_detector_training.obj', 'wb') as training_file:
    pickle.dump(to_disk, training_file)

# One tag-count feature vector per tagged tweet.
total_data = [extract_tags(sent) for sent in tagged_set]
training_data = total_data[:cut_off]
test_data = total_data[cut_off:]

training_target = total_target[:cut_off]
test_target = total_target[cut_off:]

svm = SVC()
svm.fit(training_data, training_target)
print(svm.score(test_data, test_target))

print(subjective_tweets[0])
print(objective_tweets[0])
print(len(objective_tweets))
print(len(subjective_tweets))
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,30 @@ | ||
import cPickle as pickle | ||
from nltk.tag.stanford import POSTagger | ||
from NLP_Engine.Common.helper import extract_tags | ||
|
||
__author__ = 'Luke' | ||
|
||
|
||
|
||
# Score the stored SVM against the tab-separated test file.
#
# Each line of the test file is "<sentiment>\t<tweet>".  'neg' and 'pos' are
# folded into the 'sub' class and 'neutral' becomes 'obj'.
test_set = []
with open("../../Data/Test/test-data.csv") as test_in:
    for line in test_in:
        if not line.strip():
            # A blank line (e.g. at the end of the file) has no tab and
            # would make the unpacking below fail.
            continue
        sentiment, tweet_content = line.split('\t', 1)
        if sentiment in ('neg', 'pos'):
            sentiment = 'sub'
        elif sentiment == 'neutral':
            sentiment = 'obj'
        else:
            print('error')
            # Skip the row: its label is one the classifier can never
            # predict, so keeping it would only lower the reported score.
            continue
        test_set.append((tweet_content, sentiment))

print(len(test_set))
tagger = POSTagger('stanford-model.tagger', 'stanford-postagger.jar', encoding='utf8')

# Tweets are tokenised on whitespace before tagging.
tagged_set = tagger.batch_tag([sent.split() for sent, label in test_set])
test_data = [extract_tags(sent) for sent in tagged_set]

targets = [label for sent, label in test_set]

# NOTE: pickle must only be used on files you trust.  The model is binary
# data, so it is opened in 'rb' mode, and the handle is closed right after
# loading.
with open('../../Data/Models/sentiment-classifyer-svm', 'rb') as model_file:
    svm = pickle.load(model_file)

print(svm.score(test_data, targets))
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,30 @@ | ||
__author__ = 'Luke' | ||
|
||
|
||
import os | ||
|
||
__author__ = 'Luke' | ||
|
||
# Write every 100th row of the full training CSV to a small tab-separated
# file containing "<sentiment>\t<last field>".
print('start ')

# Numeric polarity codes found in the first CSV field and their names.
polarity_names = {'0': 'neg', '2': 'neutral', '4': 'pos'}

with open("/Users/Luke/Documents/Project-Files/Training-Data/training.1600000.processed.noemoticon.csv") as source, \
        open("/Users/Luke/Documents/PyCharmProjects/TwitterSentiment/Data/Training/training-data-extra-small.csv", 'w') as sample:
    for row_number, raw_row in enumerate(source):
        # Fields are separated by the three characters: quote, comma, quote.
        fields = raw_row.split('","')
        # Drop the first character of the first field and the last two
        # characters of the sixth field.
        fields[0] = fields[0][1:]
        fields[5] = fields[5][:-2]
        # Unknown codes are passed through unchanged.
        fields[0] = polarity_names.get(fields[0], fields[0])

        if row_number % 100 == 0:
            sample.write(fields[0] + "\t" + fields[5] + '\n')

print('done')
Oops, something went wrong.