-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathdataClassifier.py
220 lines (168 loc) · 8.62 KB
/
dataClassifier.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
# dataClassifier.py
# -----------------
# Main execution file for machine learning algorithms in MLOCR.
#
# Chet Aldrich, Laura Biester
import naivebayes
import perceptron
import neuralnet
import loadFeatures
import argparse
import time
import operator
from util import Counter
def readCommand():
    """
    readCommand() parses the command line arguments and dispatches to the
    selected machine learning classifier.

    Command line arguments:
    -c        -- selects the classifier ("naivebayes", "perceptron", "neuralnet")
    -a        -- tune iterations/Laplace smoothing while training
    -u        -- use pre-learned weights (perceptron) or probabilities (Naive
                 Bayes) so results can be seen without waiting for training
    --pixels  -- chop 0-12 pixels off the image edges for faster processing
                 (may cost a little accuracy)
    --train   -- number of training samples to use (default 100)
    --test    -- number of test samples to use (default 100)
    -i        -- report the most frequent incorrect classifications
    """
    # default values used when the optional size/pixel arguments are omitted
    numTestValues = 100
    numTrainValues = 100
    pixels = 0
    parser = argparse.ArgumentParser()
    parser.add_argument("-c", type = str, choices=["naivebayes", "perceptron", "neuralnet"], help="selects the classifier for use with the MNIST data")
    parser.add_argument("-a", action="store_true", default=False, help="tune while training")
    parser.add_argument("-u", action="store_true", default=False, help="use pre-learned weights for perceptron or pre-learned probabilities for Naive Bayes")
    parser.add_argument("--pixels", type = int, choices=range(13), help="remove this many pixels from outside of photo for faster training")
    parser.add_argument("--train", type = int, help="selects the number of training data samples to be used by the classifier")
    parser.add_argument("--test", type = int, help="selects the number of testing data samples to be used by the classifier")
    parser.add_argument("-i", action="store_true", default=False, help="gives information about most frequent incorrect classifications")
    args = parser.parse_args()
    # Override the defaults only when the optional params were supplied.
    # Use identity comparison with None rather than != (PEP 8).
    if args.train is not None:
        numTrainValues = args.train
    if args.test is not None:
        numTestValues = args.test
    if args.pixels is not None:
        pixels = args.pixels
    # Dispatch on the -c parameter; an omitted/unknown choice does nothing.
    if args.c == "naivebayes":
        runNaiveBayes(numTrainValues, numTestValues, pixels, args.a, args.u, args.i)
    elif args.c == "perceptron":
        runPerceptron(numTrainValues, numTestValues, pixels, args.a, args.u, args.i)
    elif args.c == "neuralnet":
        runNeuralNet(numTrainValues, numTestValues, pixels, args.a, args.u, args.i)
def runNeuralNet(numTrainValues, numTestValues, pixels, tune, useTrainedWeights, info):
"""
runNeuralNet() runs the neural net machine learning algorithm on the MNIST
dataset.
"""
# TODO: Add the rest of the params to function argument.
t = time.clock()
neuralClassifier = neuralnet.NeuralNet(range(10))
print "Loading Testing Data....\n"
trainingData, trainingLabels, validationData, validationLabels, features = loadFeatures.loadTrainingData(numTrainValues, pixels, tune)
print "Loading Testing Data....\n"
testingData, testingLabels = loadFeatures.loadTestingData(numTestValues, pixels)
print "Testing Neural Net....\n"
classifiedData = neuralClassifier.classify(testingData)
test(classifiedData, testingLabels, info)
print "Total Time {0}".format(time.clock() - t)
def runPerceptron(numTrainValues, numTestValues, pixels, tune, useTrainedWeights, info):
"""
runPerceptron() runs the perceptron learning algorithm on the MNIST dataset.
It also prints associated analytics, including the accuracy and time taken
to run.
Keyword arguments:
numTrainValues -- number of training values to train the perceptron
numTestValues -- number of test values to test the trained perceptron
pixels -- number of pixels to chop from the margins of the image
tune -- a boolean for whether to tune to find the optimal number of iterations
useTrainedWeights -- boolean to use pretrained weights
info -- boolean to get information about common classification mistakes
"""
t = time.clock()
perceptronClassifier = perceptron.Perceptron(range(10), 3)
if useTrainedWeights:
perceptronClassifier.useTrainedWeights()
else:
print "Loading Testing Data....\n"
trainingData, trainingLabels, validationData, validationLabels, features = loadFeatures.loadTrainingData(numTrainValues, pixels, tune)
print "Training Perceptron....\n"
perceptronClassifier.train(trainingData, trainingLabels, validationData, validationLabels, tune)
print "Loading Testing Data....\n"
testingData, testingLabels = loadFeatures.loadTestingData(numTestValues, pixels)
print "Testing Perceptron....\n"
classifiedData = perceptronClassifier.classify(testingData)
test(classifiedData, testingLabels, info)
print "Total Time {0}".format(time.clock() - t)
def runNaiveBayes(numTrainValues, numTestValues, pixels, tune, useTrainedProbs, info):
"""
runNaiveBayes() runs the Naive Bayes learning algorithm on the MNIST dataset.
It also prints associated analytics, including the accuracy and time taken
to run.
Keyword arguments:
numTrainValues -- number of training values to train the perceptron
numTestValues -- number of test values to test the trained perceptron
pixels -- number of pixels to chop from the margins of the image
tune -- a boolean for whether to tune to find the optimal number of iterations
info -- boolean to get information about common classification mistakes
"""
t = time.clock()
naiveBayesClassifier = naivebayes.NaiveBayes(range(10))
if useTrainedProbs:
naiveBayesClassifier.useTrainedProbs(loadFeatures.getFeatureList())
else:
print "Loading Training Data....\n"
trainingData, trainingLabels, validationData, validationLabels, features = loadFeatures.loadTrainingData(numTrainValues, pixels, tune)
print "Training Naive Bayes Classifier....\n"
naiveBayesClassifier.train(trainingData, trainingLabels, validationData, validationLabels, features, tune)
print "Loading Testing Data....\n"
testingData, testingLabels = loadFeatures.loadTestingData(numTestValues, pixels)
print "Testing Naive Bayes Classifier....\n"
classifiedData = naiveBayesClassifier.classify(testingData)
test(classifiedData, testingLabels, info)
print "Total Time {0}".format(time.clock() - t)
def test(classifiedData, testingLabels, info):
"""
test() gets a classification for the test data and checks
if it matches the labels. It then returns a performance metric
on the test set.
Keyword Arguments:
classifiedData -- the labels outputted by the trained algorithm on the test set
testingLabels -- the correct labels associated with test set
info -- boolean to get information about common classification mistakes
"""
countCorrect = 0
problems = Counter()
# check if classification matches label
for i in range(len(testingLabels)):
if testingLabels[i] == classifiedData[i]:
countCorrect += 1
else:
problems[(testingLabels[i], classifiedData[i])] += 1
print "Number of Correct Classifications"
print "================================="
print countCorrect
print "Percent of Correct Classifications"
print "=================================="
print float(countCorrect) / len(testingLabels) * 100.0
if info:
getInfo(problems)
def getInfo(problems):
"""
getInfo() prints out, in order, all incorrect classifications
"""
print "Common Problems with Classification"
print "==================================="
# sort by number of problems of each type, in decreasing order
sorted_problems = sorted(problems.items(), key=operator.itemgetter(1))
sorted_problems.reverse()
# print out the top 10 issues
counter = 0
for problem in sorted_problems:
if counter > 10:
break
print "Label {0} Classified as {1} Occurrences {2}".format(problem[0][0], problem[0][1], problem[1])
counter += 1
if __name__ == "__main__":
    # Entry point: parse command line arguments and run the chosen classifier.
    readCommand()