# ForwStep1.R
ForwStep1 <- function(X, namesX, y, kstop, nNeg, nPos) {
# Forward stepwise regression
# D. Meko
# Last revised 2023-08-16
#
# Forward stepwise entry continues until all variables are in. The final model
# is then selected as the model at the step at which either adjusted R-squared
# (kstop=1) or the cross-validation RE (kstop=2) is a maximum.
#
# INPUT ARGUMENTS
# X [matrix] one or more cols of pool of potential predictors
# namesX [character] vector of ids of potential predictors (what's in cols of X)
# y [matrix] 1-col matrix of the predictand
# kstop [numeric] stopping criterion: 1=maximum adjusted R-squared,
#   2=maximum cross-validation RE
# nNeg [numeric] maximum negative lag on chronologies allowed in modeling
# nPos [numeric] maximum positive lag on chronologies allowed in modeling
#
# OUTPUT
# H: named list, with elements:
# c('Model','StopCriterion','StoppingStep','ColsInModel','Coefficients',
# 'ColsInOrderEntry','Rsquared','RsquaredAdj','Step',
# 'RsquaredAllSteps','RsquaredAdjAllSteps','Foverall',
# 'Fpvalue',
# 'CrossValidStorage','NleftOutinCV','REcv','RMSEcv',
# 'ModelTable')
# Model [lm object] special type of R object that holds many of the
# regression statistics and data, such as the residuals and predicted data.
# R has functions that operate on lm objects. For example,
# summary(H$Model) shows a summary table of the regression
# StopCriterion: criterion for picking the best model, stored as a string
#   ('Max R-squared adjusted' if kstop=1; 'Max RE' if kstop=2)
# StoppingStep: the step at which stepwise regression stopped according to StopCriterion
# ColsInModel: which columns of input X are in the final model; the order is important,
#   and matches the order of coefficients in H$Coefficients (after the intercept)
# Coefficients: the regression coefficients, starting with the intercept term and then
#   in the order of H$ColsInModel. The coefficients, combined with ColsInModel, allow
#   a reconstruction to be generated from matrix X.
# ColsInOrderEntry: order in which the columns of X entered the stepwise model. In general,
#   this differs from the order in H$ColsInModel
# Rsquared,RsquaredAdj: R-squared and adjusted R-squared of the final model
# Step, RsquaredAllSteps,RsquaredAdjAllSteps: step is a numeric vector (1,2, ...) of
# the step in the stepwise process; the other two variables are the R-squared and
# adjusted R-squared at each step
# Foverall,Fpvalue: overall-F of the final regression model and p-value of that F
# CrossValidStorage: a list of detailed cross-validation statistics returned by
#   CrossValid1(). See the opening comments of function CrossValid1 for details
# NleftOutinCV,REcv,RMSEcv: cross-validation statistics. NleftOutinCV is the number of
#   observations left out at each iteration of cross-validation. With lags, this is
#   greater than 1, and set to ensure that no tree-ring data are used both to provide
#   a cross-validation prediction in a specific year and to calibrate the model giving
#   that prediction. REcv is the reduction-of-error statistic. RMSEcv is the
#   root-mean-square error of cross-validation.
# ModelTable: a table object listing the same coefficients as in H$Coefficients, but also
#   including the corresponding id of the predictor, cross-referenced to columns of
#   input matrix X and indicating if lagged positively or negatively and by how much.
#   For example, X7 is the seventh column of X, no lags; and X2P2 is the second column
#   of X lagged +2 years forward from the predictand.
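#
# EXAMPLE (a hypothetical minimal sketch, not from the original author; assumes
# the global code_dir is set and CrossValid1.R is available there; data made up):
#   set.seed(1)
#   X <- matrix(rnorm(200), 50, 4)
#   y <- as.matrix(0.8*X[,1] - 0.5*X[,3] + rnorm(50))
#   H <- ForwStep1(X, paste0("X", 1:4), y, kstop=1, nNeg=0, nPos=0)
#   summary(H$Model); print(H$ModelTable)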
#
# Rev 2023-08-16: cosmetic, to make sure that summary(G) retains the correct col
#   name when just one predictor is in the model, on call to lm()
# Rev 2024-12-21: revised so that the global "code_dir" is used to access CrossValid1.R
source(paste(code_dir,"CrossValid1.R",sep="")) # code_dir must end with the path separator
#--- ALLOCATE AND INITIALIZE
Np<-dim(X)[2] # size of predictor pool
i1<-1:Np # index to predictors in pool
i2<-rep(NA,Np) # index in order of entry
Lin<-rep(FALSE,Np) # logical pointer to variables in model
R2<-rep(NA,Np) # regression R-squared at each step
R2a<-rep(NA,Np) # adjusted R-squared at each step
#--- FORWARD STEPWISE REGRESSION
for (j in 1:Np){
  if (j==1){
    # First step: pick the col of X most highly correlated (absolute correlation) with y
    colnames(X)<-namesX # re-initialize these
    r<-cor(y,X)
    H<-sort(abs(r),decreasing=TRUE,index.return=TRUE)
    iwinner<-H$ix[1]
    i2[j]<-iwinner
    Lin[iwinner]<-TRUE # turn on logical for variable just entered
    U<-X[,Lin,drop=FALSE] # cull matrix of those predictors
    # If only 1 predictor, next statement makes sure lm() receives the col name
    if (dim(U)[2]==1){
      colnames(U) <- colnames(X)[Lin]
    }
    G<-lm(y~U) # estimate model (G is an "lm" object)
    R2this<-summary(G)$r.squared
    R2adjthis<-summary(G)$adj.r.squared
    R2[j]<-R2this # store R-squared for this model
    R2a[j]<-R2adjthis # store adjusted R-squared for this model
    e<-as.matrix(G$residuals) # model residuals
  } else {
    # Iterations after the first. e holds the residuals of the previous step.
    # At each step, e is correlated with the cols of X not yet in the model.
    # The "chosen one" is the col of X with the highest absolute correlation.
    colnames(X)<-namesX # re-initialize these
    ibullpen<-i1[!Lin] # cols of X NOT yet in the model
    r<-cor(e,X[,ibullpen])
    H<-sort(abs(r),decreasing=TRUE,index.return=TRUE)
    iwinner<-H$ix[1] # pointer to highest correlated member of sub-matrix
    i2[j]<-ibullpen[iwinner] # corresponding column of that member in X
    ithis<-ibullpen[iwinner] # ... renamed for simplicity
    Lin[ithis]<-TRUE # logical of cols of X in current model to be estimated
    # Next statements exactly the same as those described for the first iteration
    U<-X[,Lin,drop=FALSE]
    G<-lm(y~U) # estimation
    R2this<-summary(G)$r.squared
    R2adjthis<-summary(G)$adj.r.squared
    R2[j]<-R2this
    R2a[j]<-R2adjthis
    e<-as.matrix(G$residuals)
  }
}
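# Illustration of the entry rule above (a hypothetical sketch; Xd and yd are
# made-up names, not part of this function):
#   Xd <- matrix(rnorm(90), 30, 3); yd <- Xd[,2] + rnorm(30)
#   e1 <- resid(lm(yd ~ Xd[,2]))   # residuals after the first variable enters
#   cor(e1, Xd[,c(1,3)])           # next entrant is the col with highest |r| here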
#--- LEAVE-M-OUT CROSS-VALIDATION, STEPWISE
#
# Do this regardless of the stopping rule. CV will be a list of cross-validation results
CV <- CrossValid1(X, y, nNeg, nPos, i2)
#--- FIND "BEST" MODEL AND RE-FIT IT
rm(H)
Lin<-rep(FALSE,Np) # re-initialize as no variables in model
if (kstop==1){
  # MAXIMUM ADJUSTED R-SQUARED VERSION
  # Note: for a large pool of potential predictors, especially one chosen by
  # correlation screening from a still larger set of possible predictors, this
  # criterion tends to over-fit the model
  s<-sort(R2a,decreasing=TRUE,index.return=TRUE) # adjusted R-squared sorted
  #   from highest to lowest
  iwinner<-s$ix[1] # step at which adjusted R-squared is highest
}else{
  # MAXIMUM RE VERSION
  iwinner <- CV$REmaxStep # step at which RE is highest
}
i2a<-i2[1:iwinner] # cols of X in model to be fit
Lin[i2a]<-TRUE # turn on logical for cols of X in model to be fit
#---- PULL SUBMATRIX OF PREDICTORS AND RE-ESTIMATE MODEL
U<-X[,Lin,drop=FALSE]
colnames(X)<-namesX # re-initialize these
colnames(U)<-colnames(X)[Lin]
G<-lm(y~U) # estimation
# Do not want variables listed as U1, U2, etc. Want the real predictor ids.
bbnms <- names(G$coefficients)
bbnms2 <- colnames(X)[Lin]
names(G$coefficients) <- c(bbnms[1],bbnms2)
rm(bbnms,bbnms2)
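# e.g., names(G$coefficients) might now read c("(Intercept)","X3","X1"), rather
# than the default c("(Intercept)","UX3","UX1") from lm(y~U) (illustrative ids only)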
#--- STATISTICS OF FINAL FITTED MODEL
kstep<-sum(Lin) # final step (number of predictors in the final model)
if (kstop==1){
  stopHow<-'Max R-squared adjusted'
}else{
  stopHow<-'Max RE'
}
H <- list()
H[[1]]<-G
H[[2]]<-stopHow
H[[3]]<-kstep
H[[4]]<-i1[Lin]
H[[5]]<-G$coefficients
H[[6]]<-i2[1:kstep]
H[[7]]<-summary(G)$r.squared
H[[8]]<-summary(G)$adj.r.squared
jstep<-(1:Np)
H[[9]]<-jstep
H[[10]]<-R2
H[[11]]<-R2a
H[[12]]<-summary(G)$fstatistic[1]
# function for p-value of F; from
# https://stackoverflow.com/questions/5587676/pull-out-p-values-and-r-squared-from-a-linear-regression
lmp <- function (modelobject) {
  if (!inherits(modelobject,"lm")) stop("Not an object of class 'lm'")
  f <- summary(modelobject)$fstatistic
  p <- pf(f[1],f[2],f[3],lower.tail=FALSE)
  attributes(p) <- NULL
  return(p)
}
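# Quick check of lmp (hypothetical, using the built-in cars data):
#   lmp(lm(dist ~ speed, data=cars)) returns the p-value of that model's overall F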
p<-lmp(G)
H[[13]]<-p # pvalue of overall F
H[[14]] <- CV
H[[15]] <- CV$LeftOut
if (kstop==2){
  H[[16]] <- CV$REcv
  H[[17]] <- CV$RMSEcv
}else{
  # CrossValid1 returns REcv and RMSEcv as the "best" according to maximum RE; but
  # if using maximum adjusted R-squared as the criterion, instead use index iwinner
  # to grab the applicable statistics
  H[[16]] <- CV$REcvAll[iwinner]
  H[[17]] <- CV$RMSEcvAll[iwinner]
}
# Build table with regression coefficients opposite variable names
tab<-matrix(t(G$coefficients))
colnames(tab)<-c('Model')
rownames(tab)<-c('Intercept',namesX[H[[4]]])
tab<- as.table(tab)
H[[18]]<-tab
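# e.g., print(tab) might show (illustrative names and values only):
#                Model
#   Intercept   0.1234
#   X3         -0.5678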
names(H)<-c('Model','StopCriterion','StoppingStep','ColsInModel','Coefficients',
'ColsInOrderEntry','Rsquared','RsquaredAdj','Step',
'RsquaredAllSteps','RsquaredAdjAllSteps','Foverall',
'Fpvalue',
'CrossValidStorage','NleftOutinCV','REcv','RMSEcv',
'ModelTable')
return(H)
}