-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathnaive_bayes.py
147 lines (115 loc) · 5.39 KB
/
naive_bayes.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
import re
import numpy as np
from collections import Counter
import pandas as pd
# import random
# from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk import download
TRAIN_DATA_PATH = "data/data_train.pkl"
TEST_DATA_PATH = "data/data_test.pkl"
class Naive_Bayes:
def __init__(self, train_X, train_Y, ):
self.train_X = train_X
self.train_Y = train_Y
self.classes = list(set(train_Y))
self.stop_words_list = stopwords.words('english')
self.stemmer = PorterStemmer()
self.word_stats, self.num_unique_words_in_class, self.total_word_count_per_class = self.get_word_count_per_class()
self.priors = self.get_priors()
# pre-processing dataset
def preprocess_sentence(self, sentence):
# remove links if needed
# sentence = re.sub(r'^https?:\/\/.*[\r\n]*', ' website_link ', sentence)
# remove special characters and stop words
pattern = re.compile(r'\b\w\w+\b')
# for word in re.findall(pattern, sentence):
sentence = [self.stemmer.stem(word) for word in re.findall(pattern, sentence.lower()) if word not in self.stop_words_list]
return sentence
def get_priors(self):
priors = {}
total_examples = len(self.train_Y)
for class_ in self.classes:
priors[class_] = np.sum(self.train_Y == class_) / total_examples
return priors
# bag of word features - get dictionary of word count per class
def get_word_count_per_class(self):
word_stats, num_unique_words_in_class, words_count_per_class = {}, {}, {}
for class_ in self.classes:
# create a list of all words in the class
word_list = []
for review in self.train_X[self.train_Y == class_]:
word_list += self.preprocess_sentence(review)
# count the number of unique words and occurrences of each word
num_unique_words_in_class[class_] = len(set(word_list))
word_stats[class_] = Counter(word_list)
# get dictionary of total word count per class
words_count_per_class[class_] = len(word_list)
return word_stats, num_unique_words_in_class, words_count_per_class
# test accuracy
def accuracy(self, predicted, target):
return np.mean(predicted == target) * 100.00
def compute_probability(self, class_, word_list, laplacian, alpha):
prob = np.log(self.priors[class_])
# override alpha value by setting it 0 if it isn't laplacian
if not laplacian:
alpha = 0.0
denominator_count = self.total_word_count_per_class[class_] + alpha*self.num_unique_words_in_class[class_]
for word in word_list:
prob += np.log((self.word_stats[class_][word] + alpha) / denominator_count)
return prob
# predict
def predict_class(self, test_sentences_list, laplacian=True, alpha=0.25):
# Naive Bayes: for all classes, argmax P(c)*P(d|c)
predicted_classes = np.zeros(len(test_sentences_list), dtype=np.object)
for index, test_sentence in enumerate(test_sentences_list):
word_list = self.preprocess_sentence(test_sentence)
probabilities = np.zeros(len(self.classes))
for idx, class_ in enumerate(self.classes):
probabilities[idx] = self.compute_probability(class_, word_list, laplacian, alpha)
predicted_classes[index] = self.classes[np.argmax(probabilities)]
return predicted_classes
# create csv in desired submission format
def create_and_save_submission(predictions, file_name="submission.csv"):
ids = [i for i in range(len(predictions))]
sub_df = pd.DataFrame(data=list(zip(ids, predictions)), columns=["Id","Category"])
sub_df.to_csv(file_name, index=False)
# Return mean model performance after k-fold cross-validation
def evaluate_model(Model, X, y):
print("Evaluating model performance using k-fold...")
kfold = KFold(
n_splits=4,
shuffle=True,
random_state=42
)
accuracy = []
for train_index, test_index in kfold.split(X):
X_train, X_test = X[train_index], X[test_index]
y_train, y_test = y[train_index], y[test_index]
my_model = Model(X_train, y_train)
predictions = my_model.predict_class(X_test)
accuracy.append(my_model.accuracy(predictions, y_test))
return np.mean(accuracy)
# Train with full dataset; write test data label predictions to .csv file
def predict_test_labels(Model, X, y, X_test):
print("Predicting test labels and writing to submission.csv...")
my_model = Model(X, y)
predictions = my_model.predict_class(X_test)
create_and_save_submission(predictions)
def main():
print("Downloading stop-words..")
download('stopwords') # nltk
train_data = pd.read_pickle(TRAIN_DATA_PATH)
test_data = pd.read_pickle(TEST_DATA_PATH)
X = np.array(train_data[0])
y = np.array(train_data[1])
X_test = np.array(test_data)
# Print performance of model after k-fold cross-validation
accuracy = evaluate_model(Naive_Bayes, X, y)
print("Accuracy: %.2f"%accuracy)
# Train with full dataset; write test data label predictions to .csv file
predict_test_labels(Naive_Bayes, X, y, X_test)
if __name__ == "__main__":
main()