forked from apachecn/Interview
-
Notifications
You must be signed in to change notification settings - Fork 16
/
Copy pathsvm-python3.6.py
223 lines (182 loc) · 8.24 KB
/
svm-python3.6.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
#!/usr/bin/python3
# coding: utf-8
'''
Created on 2017-10-26
Update on 2017-10-26
Author: 片刻
Github: https://github.com/apachecn/kaggle
PCA 主成分分析 (Principal Component Analysis)
'''
import csv
import time
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.svm import SVC
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
# Load the raw training and prediction data sets
def opencsv():
    """Read the digit-recognizer CSVs and return (features, labels, test features)."""
    print('Load Data...')
    # Read both files with pandas
    train_df = pd.read_csv('datasets/getting-started/digit-recognizer/input/train.csv')
    pre_df = pd.read_csv('datasets/getting-started/digit-recognizer/input/test.csv')
    trainData = train_df.values[:, 1:]   # every pixel column of the training set
    trainLabel = train_df.values[:, 0]   # first column is the digit label
    preData = pre_df.values[:, :]        # full test set (it has no label column)
    return trainData, trainLabel, preData
# Preprocessing: dimensionality reduction via PCA
def dRCsv(x_train, x_test, preData, COMPONENT_NUM):
    """Fit PCA on the training split and project all three data sets.

    COMPONENT_NUM follows sklearn's n_components semantics:
      >= 1       -> keep exactly that many components
      0 < x < 1  -> keep enough components to explain that variance ratio
    (usage notes: https://www.cnblogs.com/pinard/p/6243025.html)
    """
    print('dimensionality reduction...')
    train_arr = np.array(x_train)
    test_arr = np.array(x_test)
    pre_arr = np.array(preData)
    pca = PCA(n_components=COMPONENT_NUM, whiten=True)
    # Learn the projection from the training split only, then apply it everywhere
    pca.fit(train_arr)
    projected = [pca.transform(part) for part in (train_arr, test_arr, pre_arr)]
    # Report per-component variance, variance ratio, and the component count
    print(pca.explained_variance_, '\n', pca.explained_variance_ratio_, '\n', pca.n_components_)
    print(sum(pca.explained_variance_ratio_))
    return projected[0], projected[1], projected[2]
# Fit the classifier
def trainModel(trainData, trainLabel):
    """Train and return an RBF-kernel SVM (C=4) on the given features/labels."""
    print('Train SVM...')
    classifier = SVC(C=4, kernel='rbf')
    classifier.fit(trainData, trainLabel)
    return classifier
# Write prediction results to a submission CSV
def saveResult(result, csvName):
    """Write predictions to *csvName* as (ImageId, Label) rows.

    ImageId is 1-based, matching the Kaggle submission format.
    result: iterable of predicted labels coercible to int.
    """
    # newline='' is required for csv.writer: without it Windows emits an
    # extra blank line after every row.
    with open(csvName, 'w', newline='') as myFile:
        myWriter = csv.writer(myFile)
        myWriter.writerow(["ImageId", "Label"])
        # enumerate replaces the original manual index counter
        for index, r in enumerate(result, start=1):
            myWriter.writerow([index, int(r)])
    print('Saved successfully...')  # predictions saved
# Inspect the data: eigen-spectrum of the covariance matrix, used to judge how
# many principal components actually carry variance (helps pick a PCA dimension).
def analyse_data(dataMat):
    """Print, for the leading eigenvalues of dataMat's covariance matrix,
    each component's share of total variance and the cumulative share.

    dataMat: 2-D array-like, one sample per row, one feature per column.
    """
    meanVals = np.mean(dataMat, axis=0)   # per-column (per-feature) mean
    meanRemoved = dataMat - meanVals      # center each feature
    # Covariance with rowvar=0: columns are the variables; divisor is n-1,
    # giving the unbiased estimate.
    covMat = np.cov(meanRemoved, rowvar=0)
    # The covariance matrix is symmetric, so use eigh instead of eig: eigh
    # guarantees real eigenvalues (eig can return tiny spurious complex
    # parts, which would break float() below) and returns them sorted
    # ascending.
    eigvals, eigVects = np.linalg.eigh(covMat)
    eigValInd = np.argsort(eigvals)       # indices, ascending by eigenvalue
    topNfeat = 100                        # number of leading components to report
    # Walk the sorted indices backwards to get the largest topNfeat
    # eigenvalues, largest first.
    eigValInd = eigValInd[:-(topNfeat + 1):-1]
    cov_all_score = float(sum(eigvals))   # total variance
    sum_cov_score = 0
    for i in range(0, len(eigValInd)):
        line_cov_score = float(eigvals[eigValInd[i]])
        sum_cov_score += line_cov_score
        # Typically a large share of the eigenvalues is ~0: those features are
        # linear combinations of others and add no extra information. Only the
        # first handful of components dominates the variance, and it drops off
        # quickly. Small negative eigenvalues are numerical noise — read them
        # as 0.
        print('主成分:%s, 方差占比:%s%%, 累积方差占比:%s%%' % (
            format(i + 1, '2.0f'),
            format(line_cov_score / cov_all_score * 100, '4.2f'),
            format(sum_cov_score / cov_all_score * 100, '4.1f')))
# Search for the PCA dimensionality giving the best hold-out accuracy
def getOptimalAccuracy(trainData, trainLabel, preData):
    """Grid-search the PCA component count (30..44) for the best SVM accuracy.

    Splits off 10% of the training data as a validation set, then for each
    candidate dimensionality fits PCA + SVM and keeps the model with the
    lowest validation error (ties favour the larger dimensionality).

    Returns (best fitted SVC, PCA-reduced preData for that model).
    """
    # 分析数据 100个特征左右
    # analyse_data(trainData)
    x_train, x_test, y_train, y_test = train_test_split(trainData, trainLabel, test_size=0.1)
    lineLen, featureLen = np.shape(x_test)  # validation-set size and feature count
    minErr = 1
    minSumErr = 0
    optimalNum = 1
    optimalLabel = []
    optimalSVMClf = None
    pcaPreDataResult = None
    for i in range(30, 45, 1):
        # Reduce, train, and score this candidate dimensionality
        pcaTrainData, pcaTestData, pcaPreData = dRCsv(x_train, x_test, preData, i)
        svmClf = trainModel(pcaTrainData, y_train)
        svmtestLabel = svmClf.predict(pcaTestData)
        errArr = np.mat(np.ones((lineLen, 1)))
        sumErrArr = errArr[svmtestLabel != y_test].sum()  # misclassified count
        sumErr = sumErrArr / lineLen                      # validation error rate
        print('i=%s' % i, lineLen, sumErrArr, sumErr)
        if sumErr <= minErr:
            # New best candidate: remember its model and reduced preData
            minErr = sumErr
            minSumErr = sumErrArr
            optimalNum = i
            optimalSVMClf = svmClf
            optimalLabel = svmtestLabel
            pcaPreDataResult = pcaPreData
            print("i=%s >>>>> \t" % i, lineLen, int(minSumErr), 1-minErr)
    # precision / recall / f1-score / support report; see
    # http://scikit-learn.org/stable/modules/generated/sklearn.metrics.classification_report.html#sklearn.metrics.classification_report
    # classification_report pairs target_names with the labels in SORTED
    # order; bare set() iteration order is not guaranteed, so sort explicitly.
    # target_names = ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9']
    target_names = [str(i) for i in sorted(set(y_test))]
    print(target_names)
    print(classification_report(y_test, optimalLabel, target_names=target_names))
    print("特征数量= %s, 存在最优解:>>> \t" % optimalNum, lineLen, int(minSumErr), 1-minErr)
    return optimalSVMClf, pcaPreDataResult
# Persist a fitted model to disk
def storeModel(model, filename):
    """Pickle *model* into *filename*."""
    import pickle
    with open(filename, 'wb') as handle:
        pickle.dump(model, handle)
# Load a previously saved model
def getModel(filename):
    """Unpickle and return the object stored in *filename*.

    NOTE: pickle.load is only safe on trusted files — never call this on
    data from an untrusted source.
    """
    import pickle
    # Context manager ensures the handle is closed even on error
    # (the original opened the file and never closed it).
    with open(filename, 'rb') as fr:
        return pickle.load(fr)
def trainDRSVM():
    """End-to-end training: load data, search PCA+SVM, persist the results."""
    t0 = time.time()
    # Load the raw data
    trainData, trainLabel, preData = opencsv()
    # Train (preprocessing + dimensionality search happen inside)
    optimalSVMClf, pcaPreData = getOptimalAccuracy(trainData, trainLabel, preData)
    # Persist both the fitted model and the reduced prediction data.
    # NOTE(review): 'ouput' in the path looks misspelled, but it is used
    # consistently throughout this file — do not change it in isolation.
    out_dir = 'datasets/getting-started/digit-recognizer/ouput/'
    storeModel(optimalSVMClf, out_dir + 'Result_sklearn_SVM.model')
    storeModel(pcaPreData, out_dir + 'Result_sklearn_SVM.pcaPreData')
    print("finish!")
    print('TrainModel store time used:%f s' % (time.time() - t0))
def preDRSVM():
    """Load the persisted model and reduced data, predict, and save the CSV."""
    t0 = time.time()
    # 'ouput' spelling matches the rest of this file — keep as-is.
    base = 'datasets/getting-started/digit-recognizer/ouput/Result_sklearn_SVM'
    # Load the stored model and the pre-reduced prediction data
    optimalSVMClf = getModel(base + '.model')
    pcaPreData = getModel(base + '.pcaPreData')
    # Predict labels for the test set
    testLabel = optimalSVMClf.predict(pcaPreData)
    # print("testLabel = %f" % testscore)
    # Write the submission file
    saveResult(testLabel, base + '.csv')
    print("finish!")
    print('PreModel load time used:%f s' % (time.time() - t0))
if __name__ == '__main__':
    # Load the raw data once for the analysis step below
    trainData, trainLabel, preData = opencsv()
    # Train and persist the model (note: loads the data again internally)
    trainDRSVM()
    # Eigen-spectrum analysis of the training features
    analyse_data(trainData)
    # preDRSVM()  # generate predictions from the stored model