-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathrandomization_tests.py
212 lines (183 loc) · 7.88 KB
/
randomization_tests.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
#%%
# Import libraries
from typing import List, Tuple
import numpy as np
import pandas as pd
import statsmodels.api as sm
from sklearn.linear_model import LinearRegression
from ucimlrepo import fetch_ucirepo
#%%
# Load the dataset
real_estate_valuation = fetch_ucirepo(id=477)
#%%
# Features and target variable
X = real_estate_valuation.data.features
y = real_estate_valuation.data.targets
#%%
# Define the permutation regression functions
def calculate_p_values(
X: pd.DataFrame,
y: pd.DataFrame,
permuted_coefs: List,
model_coefs: List,
precision: int = 3,
p_value_threshold_one: float = 0.05,
p_value_threshold_two: float = 0.01
) -> Tuple[List[str], List[str]]:
"""
Calculate p-values for a given model using both permutation and classical methods.
Parameters
----------
X : array-like of shape (n_samples, n_features)
The input data to fit the model.
y : array-like of shape (n_samples,)
The target values.
permuted_coefs : list of float
The coefficients obtained from the permutation test.
model_coefs : list of float
The original coefficients of the fitted model.
precision : int, optional (default=3)
The number of decimal places to round the p-values.
p_value_threshold_one : float, optional (default=0.05)
The threshold for the first level of significance.
p_value_threshold_two : float, optional (default=0.01)
The threshold for the second level of significance.
Returns
----------
permuted_p_values : list of str
The empirical p-values for each coefficient, formatted with significance stars if below the thresholds.
classic_p_values : list of str
The asymptotic p-values for each coefficient, formatted with significance stars if below the thresholds.
"""
permuted_p_values: List = []
classic_p_values: List = []
model = sm.OLS(y, sm.add_constant(X)).fit()
for i in range(len(model_coefs)):
p_value = (np.abs(np.array(permuted_coefs)[:, i]) >= np.abs(np.array(model_coefs)[i])).mean()
if p_value_threshold_two <= p_value < p_value_threshold_one:
p_value_str = str(np.round(p_value, precision)) + ' (*)'
elif p_value < p_value_threshold_two:
p_value_str = str(np.round(p_value, precision)) + ' (**)'
else:
p_value_str = str(np.round(p_value, precision)) + ' (ns)'
permuted_p_values.append(p_value_str)
for p_value in model.pvalues[1:]:
if p_value_threshold_two <= p_value < p_value_threshold_one:
p_value_str = str(np.round(p_value, precision)) + ' (*)'
elif p_value < p_value_threshold_two:
p_value_str = str(np.round(p_value, precision)) + ' (**)'
else:
p_value_str = str(np.round(p_value, precision)) + ' (ns)'
classic_p_values.append(p_value_str)
return permuted_p_values, classic_p_values
def permutation_test_regression(
X: pd.DataFrame,
y: pd.DataFrame,
n_permutations: int = 100_000,
precision: int = 3,
p_value_threshold_one: float = 0.05,
p_value_threshold_two: float = 0.01,
method: str = 'manly'
) -> Tuple[List[float], List[str], List[str], float, float]:
"""
Perform a permutation test for a multiple linear regression model to assess the significance of model
coefficients using the ter Braak (1992) or Manly (1997) methods.
Parameters
----------
X : array-like of shape (n_samples, n_features)
The input data to fit the model.
y : array-like of shape (n_samples,)
The target values.
n_permutations : int, optional (default=100_000)
The number of permutations to perform.
precision : int, optional (default=3)
The number of decimal places to round the p-values.
p_value_threshold_one : float, optional (default=0.05)
The threshold for the first level of significance.
p_value_threshold_two : float, optional (default=0.01)
The threshold for the second level of significance.
method : str, optional (default='manly')
Methods
----------
'terbraak' : Permutation test using the method described by ter Braak (1992).
'manly' : Permutation test using the method described by Manly (1997).
Returns
----------
model_coefs : list of float
The original coefficients of the fitted model.
permuted_p_values : list of str
The empirical p-values for each coefficient, formatted with significance stars if below the thresholds.
classic_p_values : list of str
The asymptotic p-values for each coefficient, formatted with significance stars if below the thresholds.
p_value_threshold_one : float
The threshold for the first level of significance.
p_value_threshold_two : float
The threshold for the second level of significance.
References
----------
ter Braak, Cajo J. F. "Permutation versus bootstrap significance tests in multiple regression and ANOVA."
In Handbook of Statistics, Vol. 9. Elsevier, Amsterdam. (1992).
Manly, Bryan F. J. Randomization, Bootstrap, and Monte Carlo Methods in Biology, 2nd ed. Texts in Statistical
Science Series. Chapman & Hall, London. (1997).
Hardin, Johanna, Lauren Quesada, Julie Ye, and Nicholas J. Horton. "The Exchangeability Assumption for
Purmutation Tests of Multiple Regression Models: Implications for Statistics and Data Science Educators."
(2024) [Online]. Available: https://arxiv.org/pdf/2406.07756.
"""
permuted_coefs: List = []
model = LinearRegression().fit(X, y)
model_coefs = np.ravel(model.coef_).tolist()
model_preds = model.predict(X)
model_resids = y - model_preds
if method == 'manly':
for _ in range(n_permutations):
model.fit(X, np.random.permutation(y))
permuted_coefs.append(np.ravel(model.coef_).tolist())
elif method == 'terbraak':
for _ in range(n_permutations):
model.fit(X, np.random.permutation(model_preds) + np.random.permutation(model_resids))
permuted_coefs.append(np.ravel(model.coef_).tolist())
else:
raise ValueError("Invalid method. Please select 'manly' or 'terbraak'.")
permuted_p_values, classic_p_values = calculate_p_values(X, y, permuted_coefs, model_coefs, precision, p_value_threshold_one, p_value_threshold_two)
return model_coefs, permuted_p_values, classic_p_values, p_value_threshold_one, p_value_threshold_two
#%%
# Perform permutation test by the Manly (1997) method
(
coefs,
permuted_p_values,
classic_p_values,
p_value_threshold_one,
p_value_threshold_two
) = permutation_test_regression(X, y)
#%%
# Print the results obtained by the Manly (1997) method
print("Regression Model Coefficients and p-Values obtained by the Manly (1997) method\n")
print(f"Target: {y.columns.tolist()}")
print(f"Features: {X.columns.tolist()}\n")
print(f"Coefficients: {coefs}\n")
print(f"Empirical p-Values: {permuted_p_values}")
print(f"Asymptotic p-Values: {classic_p_values}")
print(f"\n(*) p-value < {p_value_threshold_one}")
print(f"(**) p-value < {p_value_threshold_two}")
print(f"(ns) p-value >= {p_value_threshold_one}\n")
#%%
# Perform permutation test by the ter Braak (1992) method
(
coefs,
permuted_p_values,
classic_p_values,
p_value_threshold_one,
p_value_threshold_two
) = permutation_test_regression(X, y, method='terbraak')
#%%
# Print the results obtained by the ter Braak (1992) method
print("Regression Model Coefficients and p-Values obtained by the ter Braak (1992) method\n")
print(f"Target: {y.columns.tolist()}")
print(f"Features: {X.columns.tolist()}\n")
print(f"Coefficients: {coefs}\n")
print(f"Empirical p-Values: {permuted_p_values}")
print(f"Asymptotic p-Values: {classic_p_values}")
print(f"\n(*) p-value < {p_value_threshold_one}")
print(f"(**) p-value < {p_value_threshold_two}")
print(f"(ns) p-value >= {p_value_threshold_one}\n")
#%%