Skip to content

Commit

Permalink
tokenization modified, figures added
Browse files Browse the repository at this point in the history
  • Loading branch information
mesutgurlek committed Nov 22, 2016
1 parent 7455f4f commit f6b0113
Show file tree
Hide file tree
Showing 19 changed files with 165 additions and 699 deletions.
Binary file modified .DS_Store
Binary file not shown.
Binary file modified preprocess/.DS_Store
Binary file not shown.
748 changes: 101 additions & 647 deletions preprocess/.idea/workspace.xml

Large diffs are not rendered by default.

Binary file added preprocess/ProcessedSubtitles/.DS_Store
Binary file not shown.
Binary file modified preprocess/__pycache__/preprocess.cpython-35.pyc
Binary file not shown.
Binary file not shown.
Binary file added preprocess/figures:category_word_freq/action.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added preprocess/figures:category_word_freq/comedy.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added preprocess/figures:category_word_freq/horror.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added preprocess/figures:category_word_freq/romance.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added preprocess/figures:category_word_freq/war.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added preprocess/figures:category_word_freq/war_f.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
116 changes: 64 additions & 52 deletions preprocess/tokenization.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,13 +4,13 @@
from sklearn.utils import shuffle
from nltk.corpus import stopwords
from nltk.stem.porter import *
import nltk
from sklearn.metrics import accuracy_score
from os import path
from os import mkdir
from os import listdir
import codecs
import re

import matplotlib.pyplot as plt
import numpy as np

categories = ['Action', 'Adventure', 'Comedy', 'Horror', 'Romance', 'War']

Expand All @@ -27,12 +27,6 @@ def stemming(text_array):
return result


def plot_data(data):
    """Plot the frequency distribution of *data* and return its top words.

    Builds an nltk FreqDist over the token sequence, draws a (blocking)
    matplotlib plot of the 30 most frequent tokens, and returns the 12
    most common (token, count) pairs.

    The original discarded the result of ``most_common(12)`` — a no-op;
    returning it keeps behaviour backward compatible (callers that
    ignored the previous ``None`` return are unaffected).
    """
    fd = nltk.FreqDist(data)
    fd.plot(30, cumulative=False)
    return fd.most_common(12)

def categorize_words(input_folder):
subtitles_path = path.relpath(input_folder)

Expand Down Expand Up @@ -131,71 +125,89 @@ def bag_of_words_and_tf(data):


def randomize(text, genre):
    """Shuffle the subtitle texts and their genre labels in unison.

    Returns new shuffled copies of (text, genre) with row correspondence
    preserved (sklearn.utils.shuffle).

    The diff residue showed an unreachable second return; the seeded
    variant (``random_state=100``) was deliberately dropped: the caller
    averages accuracy over 50 shuffles, and a fixed seed would make
    every shuffle identical, defeating the averaging.
    """
    return shuffle(text, genre)


def filter_words(text):
    """Lowercase each movie's text and strip noisy caption tokens in place.

    Removes sound-effect words commonly found in subtitles ("grunt",
    "music", ...) plus commas, mutating *text* and returning it.

    Fixes two defects present in both diff versions: one restarted from
    the original string each iteration (so only the last filter word was
    ever removed), the other discarded the ``str.replace`` result
    entirely (a no-op).
    """
    # NOTE: substring removal — because "grunt" precedes "grunts", an
    # occurrence of "grunts" is reduced to "s", never matched whole.
    to_be_filtered = ["grunt", "beep", "grunts", ",", "groan", "speak", "music"]

    for i, movie in enumerate(text):
        cleaned = movie.lower()
        for word in to_be_filtered:
            cleaned = cleaned.replace(word, "")
        text[i] = cleaned
    return text

# NOTE(review): this region was recovered from a rendered diff with lost
# indentation and interleaved pre-/post-commit lines.  The pre-commit
# statements (fixed-seed shuffle, single NB fit, 'Adventure' counting,
# per-sample accuracy printout) were superseded by the alpha sweep below
# and have been dropped.

# Exploratory step (disabled): per-genre word-frequency plots.
# category_dict = categorize_words(path.relpath("CategoryData"))
# for c in categories:
#     fd = nltk.FreqDist(stemming(clean_stopword(category_dict[c])))
#     print('Category: ', c)
#     print(fd.most_common(12))
#     fd.plot(12, cumulative=False)

# process_movie_subtitles(path.relpath("ProcessedSubtitles"), path.relpath("CategoryData"))

# Number of movies held out as the test split.
test_size = 150

# Load labelled subtitles, then stopword-clean and stem each movie's text.
text, genre = tag_subtitles(path.relpath('CategoryData'))
for i, mov in enumerate(text):
    clean = clean_stopword(mov)
    stem = stemming(clean)
    text[i] = " ".join(stem)

# Strip noisy caption tokens (sound-effect words and commas).
to_be_filtered = ['grunt', 'beep', 'grunts', ',', 'groan', 'speak', 'music']
for i in range(len(text)):
    for f in to_be_filtered:
        text[i] = text[i].replace(f, '')

# Sweep Naive Bayes smoothing values; for each alpha, average test
# accuracy over 50 random train/test shuffles.
acc_scores = []
alpha_values = [0.1, 0.5, 0.01, 0.05, 0.001, 0.005]
print(alpha_values)
for a in alpha_values:
    clf = MultinomialNB(alpha=a)
    acc = 0
    print('.', a)
    for i in range(50):
        text, genre = randomize(text, genre)
        # Re-vectorize after each shuffle so row order matches labels.
        bow_tf = bag_of_words_and_tf(text)
        clf.fit(bow_tf[test_size:], genre[test_size:])
        test_data = bow_tf[:test_size]
        test_genre = genre[:test_size]
        predicted = clf.predict(test_data)
        acc += accuracy_score(test_genre, predicted) * 100
    acc_scores.append(float(acc / 50))

print(acc_scores)

# Plot averaged accuracy against the alpha values tried.
plt.plot(alpha_values, acc_scores, 'o')
plt.axis([0, 5, -1, 100])
plt.xlabel('Alpha values')
plt.ylabel('Accuracy')
plt.legend(loc='upper right', numpoints=1)
plt.title("Accuracies / Alpha values")
plt.show()

0 comments on commit f6b0113

Please sign in to comment.