# model_gridsearch_load.py
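# Loads the trained LSTM and CNN models, combines their class probabilities with
# fixed ensemble weights, and evaluates the weighted ensemble on the independent
# test set (MCC, confusion matrix, sensitivity/specificity, ROC/AUC).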
import keras
from keras.models import Sequential
from keras.layers import Dense, Dropout, Flatten, Reshape, Lambda, LSTM
from keras.layers import Conv2D, MaxPooling2D
from keras.callbacks import ModelCheckpoint
import numpy as np
from Bio import SeqIO
from numpy import array
from numpy import argmax
from sklearn.model_selection import train_test_split
from imblearn.under_sampling import RandomUnderSampler
from sklearn.utils import shuffle
from keras.layers.embeddings import Embedding
from keras import backend as K
from keras.backend import expand_dims
import matplotlib.pyplot as plt
from keras.regularizers import l1, l2
from sklearn.metrics import roc_curve, auc, classification_report
from sklearn.metrics import accuracy_score
from numpy import tensordot
from numpy.linalg import norm
from scipy.optimize import differential_evolution
from keras.models import load_model
from itertools import product
from sklearn.metrics import matthews_corrcoef
from sklearn.metrics import confusion_matrix
r_test_x = []
r_test_y = []
r_test_x2 = []
r_test_y2 = []
posit_1 = 1
negat_0 = 0
win_size = 51 # actual window size
win_size1 = 39
win_size2 = 21
num_classes = 2
n_models = 2
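# Residues to trim from each side of the 51-residue window to obtain the
# model-specific input windows (39 residues for the CNN, 21 for the LSTM).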
cut_off1 = int((51 - win_size1)/2)
cut_off2 = int((51 - win_size2)/2)
# define universe of possible input values
alphabet = 'ARNDCQEGHILKMFPSTWYV-'
# define a mapping of chars to integers
char_to_int = dict((c, i) for i, c in enumerate(alphabet))
int_to_char = dict((i, c) for i, c in enumerate(alphabet))
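# e.g. 'A' -> 0, 'R' -> 1, ..., 'V' -> 19, '-' -> 20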
# Test Dataset for CNN -------------------------------------------------------------
def encode_fasta(path, cut_off, x_list, y_list, label):
    # Integer-encode every sequence in a FASTA file and append it with its label.
    # The central window is obtained by trimming cut_off residues from each side
    # of the 51-residue sequence; records containing characters outside the
    # alphabet are skipped.
    for seq_record in SeqIO.parse(path, "fasta"):
        data = seq_record.seq[cut_off:-cut_off]
        if any(char not in alphabet for char in data):
            continue
        x_list.append([char_to_int[char] for char in data])
        y_list.append(label)
# 39-residue windows for the CNN: positives labelled 1, negatives labelled 0
encode_fasta("test_s33_Pos_51.fasta", cut_off1, r_test_x2, r_test_y2, posit_1)
encode_fasta("test_s33_Neg_51.fasta", cut_off1, r_test_x2, r_test_y2, negat_0)
# Changing to array (matrix)
r_test_x2 = array(r_test_x2)
r_test_y2 = array(r_test_y2)
# Balance the CNN test set by random undersampling of the majority class
rus = RandomUnderSampler(random_state=7)
x_res4, y_res4 = rus.fit_resample(r_test_x2, r_test_y2)
#Shuffling
r_test_x2, r_test_y2 = shuffle(x_res4, y_res4, random_state=7)
r_test_x2 = np.array(r_test_x2)
r_test_y2 = np.array(r_test_y2)
# Test Dataset for LSTM ------------------------------------------------------
# 21-residue windows for the LSTM: positives labelled 1, negatives labelled 0
encode_fasta("test_s33_Pos_51.fasta", cut_off2, r_test_x, r_test_y, posit_1)
encode_fasta("test_s33_Neg_51.fasta", cut_off2, r_test_x, r_test_y, negat_0)
# Changing to array (matrix)
r_test_x = array(r_test_x)
r_test_y = array(r_test_y)
# Balance the LSTM test set by random undersampling of the majority class
rus = RandomUnderSampler(random_state=7)
x_res3, y_res3 = rus.fit_resample(r_test_x, r_test_y)
#Shuffling
r_test_x, r_test_y = shuffle(x_res3, y_res3, random_state=7)
r_test_x = np.array(r_test_x)
r_test_y = np.array(r_test_y)
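# NOTE: the ensemble below assumes the CNN (r_test_x2) and LSTM (r_test_x) test
# sets stay aligned sample-for-sample; a record skipped for one window size but
# not the other would break that alignment.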
###############################################################################
# Ensemble Part
def ensemble_final_pred(members, weights, testX, testX2):
    # make predictions with each ensemble member on its own input encoding
    yhats = []
    yhats.append(array(members[0].predict(testX)))   # LSTM model, 21-residue windows
    yhats.append(array(members[1].predict(testX2)))  # CNN model, 39-residue windows
    yhats = array(yhats)
    # weighted sum across ensemble members:
    # summed[i, c] = sum_m weights[m] * yhats[m, i, c]
    summed = tensordot(yhats, weights, axes=((0), (0)))
    # argmax across classes is taken by the caller
    #result = argmax(summed, axis=1)
    return summed
# Load models
models = [load_model('model_best_lstm.h5'), load_model('model_best_cnn.h5')]
for i in models:
    print(i.summary())
# grid search weights
#weights = grid_search(models, x_test, y_test)
#weights = grid_search(models, r_test_x, r_test_y, r_test_x2, r_test_y2)
weights = [0.16666667, 0.83333333]  # [LSTM weight, CNN weight]
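# The grid_search call above is commented out and is not defined in this script,
# so the fixed weights were presumably computed offline (they correspond to
# roughly 1/6 and 5/6). A hypothetical sketch of such a search, scoring each
# normalized weight pair by ensemble accuracy on the test data, might look like
# the helper below (grid_search_weights and its step size are illustrative, not
# the original implementation):
def grid_search_weights(members, testX, testY, testX2, steps=6):
    # evaluate weight pairs (w, 1 - w) on a coarse grid and keep the best one
    best_weights, best_acc = [0.5, 0.5], 0.0
    for i in range(steps + 1):
        w = [i / steps, 1.0 - i / steps]
        summed = ensemble_final_pred(members, w, testX, testX2)
        acc = accuracy_score(testY, argmax(summed, axis=1))
        if acc > best_acc:
            best_weights, best_acc = w, acc
    return best_weights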
# Independent Test
Y_pred = ensemble_final_pred(models, weights, r_test_x, r_test_x2)
t_pred2 = Y_pred[:, 1]  # positive-class score, kept for the ROC curve
# threshold, then take the predicted class index for each sample
Y_pred = (Y_pred > 0.5)
y_pred1 = [np.argmax(y, axis=None, out=None) for y in Y_pred]
y_pred1 = np.array(y_pred1)
print("Matthews Correlation : ",matthews_corrcoef(r_test_y, y_pred1))
print("Confusion Matrix : \n",confusion_matrix(r_test_y, y_pred1))
# Sensitivity and specificity from the confusion matrix rows [[TN, FP], [FN, TP]]
(tn, fp), (fn, tp) = confusion_matrix(r_test_y, y_pred1)
sp_2 = tn / (tn + fp)  # specificity
sn_2 = tp / (fn + tp)  # sensitivity
# ROC
fpr, tpr, _ = roc_curve(r_test_y, t_pred2)
roc_auc = auc(fpr, tpr)
print("AUC : ", roc_auc)
print(classification_report(r_test_y, y_pred1))
plt.figure()
lw = 2
plt.plot(fpr, tpr, color='darkorange',
         lw=lw, label='ROC curve (area = %0.2f)' % roc_auc)
plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC curve for ST')
plt.legend(loc="lower right")
plt.show()
print("Specificity = ",sp_2, " Sensitivity = ",sn_2)