diff --git a/.DS_Store b/.DS_Store index 89845cff..fbc2a9cf 100644 Binary files a/.DS_Store and b/.DS_Store differ diff --git a/preprocess/.DS_Store b/preprocess/.DS_Store index 88d8ec62..1138ec35 100644 Binary files a/preprocess/.DS_Store and b/preprocess/.DS_Store differ diff --git a/preprocess/.idea/workspace.xml b/preprocess/.idea/workspace.xml index 55ce3850..ba90eff1 100644 --- a/preprocess/.idea/workspace.xml +++ b/preprocess/.idea/workspace.xml @@ -2,585 +2,8 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - @@ -599,11 +22,11 @@ - + - + @@ -615,7 +38,7 @@ - + @@ -627,8 +50,8 @@ - - + + @@ -658,8 +81,9 @@ + @@ -687,7 +111,6 @@ - @@ -708,10 +131,10 @@ + - @@ -917,6 +340,9 @@ + + + @@ -928,25 +354,25 @@ - + - + - - - - + + + + - - - + + + - - + + @@ -963,7 +389,7 @@ - @@ -972,96 +398,126 @@ - + - - - + + + + + - + - - - + + + + + - - - - + + + + + + + - + - - + + - + - - + + - + - + + + - + - - - + + + + + - + - - - + + - + - - - + + - - - - - + + + + + + + + + + - + + + + + + + + + + + + + + + + + - + @@ -1069,28 +525,26 @@ - + - - + + - + - - - - - + + + - - + + diff --git a/preprocess/ProcessedSubtitles/.DS_Store b/preprocess/ProcessedSubtitles/.DS_Store new file mode 100644 index 00000000..5008ddfc Binary files /dev/null and b/preprocess/ProcessedSubtitles/.DS_Store differ diff --git a/preprocess/__pycache__/preprocess.cpython-35.pyc b/preprocess/__pycache__/preprocess.cpython-35.pyc index 85d9368f..759d00a9 100644 Binary files a/preprocess/__pycache__/preprocess.cpython-35.pyc and b/preprocess/__pycache__/preprocess.cpython-35.pyc differ diff --git a/preprocess/figures:category_word_freq/.DS_Store b/preprocess/figures:category_word_freq/.DS_Store new file mode 100644 index 00000000..bd018071 Binary files /dev/null and b/preprocess/figures:category_word_freq/.DS_Store differ diff --git a/preprocess/figures:category_word_freq/action.png b/preprocess/figures:category_word_freq/action.png new file mode 100644 index 00000000..92da58bb Binary files /dev/null and b/preprocess/figures:category_word_freq/action.png differ diff --git a/preprocess/figures:category_word_freq/action_f.png b/preprocess/figures:category_word_freq/action_f.png new file mode 100644 index 00000000..683a30fa Binary files /dev/null and b/preprocess/figures:category_word_freq/action_f.png differ diff --git a/preprocess/figures:category_word_freq/adventure.png b/preprocess/figures:category_word_freq/adventure.png new file mode 100644 index 00000000..6a44b90c Binary files /dev/null and b/preprocess/figures:category_word_freq/adventure.png differ diff --git a/preprocess/figures:category_word_freq/adventure_f.png b/preprocess/figures:category_word_freq/adventure_f.png new file mode 100644 index 00000000..e88f175b Binary files /dev/null and b/preprocess/figures:category_word_freq/adventure_f.png differ diff --git a/preprocess/figures:category_word_freq/comedy.png b/preprocess/figures:category_word_freq/comedy.png new file mode 100644 index 00000000..0084ffd5 Binary files /dev/null and b/preprocess/figures:category_word_freq/comedy.png differ diff --git a/preprocess/figures:category_word_freq/comedy_f.png b/preprocess/figures:category_word_freq/comedy_f.png new file mode 100644 index 00000000..7a6a9d6d Binary files /dev/null and b/preprocess/figures:category_word_freq/comedy_f.png differ diff --git a/preprocess/figures:category_word_freq/horror.png b/preprocess/figures:category_word_freq/horror.png new file mode 100644 index 00000000..5a3569ba Binary files /dev/null and b/preprocess/figures:category_word_freq/horror.png differ diff --git a/preprocess/figures:category_word_freq/horror_f.png b/preprocess/figures:category_word_freq/horror_f.png new file mode 100644 index 00000000..1c835d9e Binary files /dev/null and b/preprocess/figures:category_word_freq/horror_f.png differ diff --git a/preprocess/figures:category_word_freq/romance.png b/preprocess/figures:category_word_freq/romance.png new file mode 100644 index 00000000..36cb58fc Binary files /dev/null and b/preprocess/figures:category_word_freq/romance.png differ diff --git a/preprocess/figures:category_word_freq/romance_f.png b/preprocess/figures:category_word_freq/romance_f.png new file mode 100644 index 00000000..a1b9b8d2 Binary files /dev/null and b/preprocess/figures:category_word_freq/romance_f.png differ diff --git a/preprocess/figures:category_word_freq/war.png b/preprocess/figures:category_word_freq/war.png new file mode 100644 index 00000000..8989ca95 Binary files /dev/null and b/preprocess/figures:category_word_freq/war.png differ diff --git a/preprocess/figures:category_word_freq/war_f.png b/preprocess/figures:category_word_freq/war_f.png new file mode 100644 index 00000000..4ed0d8a6 Binary files /dev/null and b/preprocess/figures:category_word_freq/war_f.png differ diff --git a/preprocess/tokenization.py b/preprocess/tokenization.py index 37df5ab4..e8c5beef 100644 --- a/preprocess/tokenization.py +++ b/preprocess/tokenization.py @@ -4,13 +4,13 @@ from sklearn.utils import shuffle from nltk.corpus import stopwords from nltk.stem.porter import * -import nltk +from sklearn.metrics import accuracy_score from os import path from os import mkdir from os import listdir import codecs -import re - +import matplotlib.pyplot as plt +import numpy as np categories = ['Action', 'Adventure', 'Comedy', 'Horror', 'Romance', 'War'] @@ -27,12 +27,6 @@ def stemming(text_array): return result -def plot_data(data): - fd = nltk.FreqDist(data) - fd.plot(30, cumulative=False) - fd.most_common(12) - - def categorize_words(input_folder): subtitles_path = path.relpath(input_folder) @@ -131,71 +125,89 @@ def bag_of_words_and_tf(data): def randomize(text, genre): - - return shuffle(text, genre, random_state=100) + return shuffle(text, genre) def filter_words(text): to_be_filtered = ["grunt", "beep", "grunts", ",", "groan", "speak", "music"] - - for i, movie in enumerate(text): - - for filter in to_be_filtered: - text[i] = movie.lower().replace( filter, "") - print(text[i]) - + for _i, movie in enumerate(text): + for f in to_be_filtered: + text[_i].lower().replace(f, '') return text - # Categorize words and plot them -category_dict = categorize_words(path.relpath("ProcessedSubtitles")) +# # Categorize words and plot them +# category_dict = categorize_words(path.relpath("CategoryData")) +# to_be_filtered = ['grunt', 'beep', 'grunts', ',', 'groan', 'speak', 'music'] +# +# # for i in categories: +# # for f in to_be_filtered: +# # category_dict[i] = category_dict[i].replace(f, '') +# +# for c in categories: +# cleaned_list = clean_stopword(category_dict[c]) +# stemmed_data = stemming(cleaned_list) +# +# fd = nltk.FreqDist(stemmed_data) +# print('Category: ', c) +# print(fd.most_common(12)) +# fd.plot(12, cumulative=False) +# process_movie_subtitles(path.relpath("ProcessedSubtitles"), path.relpath("CategoryData"))w -cleaned_list = clean_stopword( filter_words([category_dict["Adventure"]])[0]) -stemmed_data = stemming(cleaned_list) -plot_data(stemmed_data) - -#process_movie_subtitles(path.relpath("ProcessedSubtitles"), path.relpath("CategoryData")) -test_size = 50 +test_size = 150 text, genre = tag_subtitles(path.relpath('CategoryData')) -for i, mov in enumerate(text): - clean = clean_stopword(mov) - stem = stemming(clean) - text[i] = " ".join(stem) - -text = filter_words(text) -print(text) -text, genre = randomize(text,genre) +to_be_filtered = ['grunt', 'beep', 'grunts', ',', 'groan', 'speak', 'music'] +for i in range(len(text)): + for f in to_be_filtered: + text[i] = text[i].replace(f, '') -bow_tf = bag_of_words_and_tf(text) -clf = MultinomialNB().fit(bow_tf[test_size:], genre[test_size:]) +# Initialize naive bayes object -total_adv = 0 -for i in genre[test_size:]: - if i == 'Adventure': - total_adv +=1 +acc_scores = [] +alpha_values = np.arange(0.1, 2.0, 0.1) +alpha_values = [0.1, 0.5, 0.01, 0.05, 0.001, 0.005] +print(alpha_values) +for a in alpha_values: + clf = MultinomialNB(alpha=a) + acc = 0 + print('.', a) + for i in range(50): + text, genre = randomize(text, genre) -print( "total adv", total_adv) -print( "total movies", len(genre[test_size:])) + bow_tf = bag_of_words_and_tf(text) + clf.fit(bow_tf[test_size:], genre[test_size:]) + test_data = bow_tf[:test_size] + test_genre = genre[:test_size] -test_data = bow_tf[:test_size] -test_genre = genre[:test_size] -predicted = clf.predict(test_data) + predicted = clf.predict(test_data) + acc += accuracy_score(test_genre, predicted)*100 + acc_scores.append(float(acc/50)) -print(predicted) +print(acc_scores) -true_data = 0 -for i in range(test_size): - if predicted[i] == test_genre[i]: - true_data += 1 - print('Predicted: {}, Original: {}'.format(predicted[i], test_genre[i])) +plt.plot(alpha_values, acc_scores, 'o') +plt.axis([0, 5, -1, 100]) -print("overall accuracy: ", float(true_data/test_size)) +plt.xlabel('Alpha values') +plt.ylabel('Accuracy') +plt.legend(loc='upper right', numpoints=1) +plt.title("Accuracies / Alpha values") +#for k, accuracy in zip(k_values, accuracies): +# plt.text(k - 0.6, accuracy+1, str(k) + ", " + str(format(accuracy, '.1f')), fontsize=10) +plt.show() +# true_data = 0 +# for i in range(test_size): +# if predicted[i] == test_genre[i]: +# true_data += 1 +# print('Predicted: {}, Original: {}'.format(predicted[i], test_genre[i])) +# print("overall accuracy: ", float(true_data/test_size)) +#print('scikit learn score: ', accuracy_score(test_genre, predicted))