This repository has been archived by the owner on Dec 22, 2024. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathClean code final.Rmd
327 lines (247 loc) · 9.86 KB
/
Clean code final.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
---
title: "Clean Final Code"
output: pdf_document
date: "2024-12-04"
---
```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE)
```
]
# loading required packages
```{r}
# used for general functions, plotting
library(tidyverse)
#used for diversity index calculations
library(vegan)
# used for data visualization
library(ggpubr)
# used for forecasting values of time series
library(forecast)
```
# reading in required data
```{r}
plankton_raw <- read.csv("plankton.csv")
pco_raw <- read.csv("atmo_data_medians_1991-2018.csv")
combined_data <- read.csv("combined_data_1991-2018.csv")
```
# cleaning and wrangling the plankton data
```{r}
head(plankton_raw)
# filter for year 1991 to 2018 to match pCO2 data and have full plankton data
new_plankton <- plankton_raw %>%
filter(Year >= 1991 & Year <= 2018) %>%
select_if(~ !any(is.na(.)))
# pivot the dataset longer so counts are in one row for a year and taxa id in
# a second column. Summed on recommendation of Mete and Zoe
new_plankton %>%
pivot_longer(cols = starts_with("id"),
names_to = "taxa_ID",
values_to = "count") %>%
group_by(Year, taxa_ID) %>%
summarise(total_count = sum(count)) -> clean_plankton
# Shannons diversity index calculated
clean_plankton %>%
group_by(Year) %>%
summarize(diversity_index =
diversity(total_count, index = "shannon")) -> clean_div_data
# plotting quickly to see general trend
clean_div_data %>%
ggplot(aes(x = Year, y = diversity_index)) +
geom_point() +
geom_smooth(method = 'lm')
# testing to see if removing taxa added after 1991 will affect diversity values
# removing taxa identified as being added later
plankton_raw %>%
subset(select = -c(id_963, id_357, id_347, id_354, id_355, id_10671,
id_130, id_133, id_195, id_198, id_962, id_985, id_134,
id_135, id_197, id_270, id_634, id_805, id_807, id_806,
id_818, id_997, id_999, id_148, id_813, id_1568, id_601,
id_1576, id_1577, id_1543, id_1570, id_988, id_1592,
id_1593, id_10595, id_1549, id_1551, id_1588, id_1579,
id_1585, id_1599, id_1608, id_1609, id_1601, id_1602,
id_1603, id_1604, id_1606, id_1607, id_1616, id_10375,
id_1622, id_1629, id_1644, id_1648, id_1654, id_1612,
id_1621, id_1623, id_1627, id_1632, id_1634, id_1635,
id_1639, id_1642, id_1574, id_1618, id_5500, id_1624,
id_1690, id_1677, id_1679, id_1680, id_1681, id_1682,
id_1628, id_626, id_10668, id_10532, id_10705, id_10695,
id_10069, id_10745, id_1678, id_10744, id_10737,
id_10749)) ->omit_plankton
# filtering to the same parameters and wrangling the same
new_omit_plankton <- omit_plankton %>%
filter(Year >= 1991 & Year <= 2018) %>%
select_if(~ !any(is.na(.)))
new_omit_plankton %>%
pivot_longer(cols = starts_with("id"),
names_to = "taxa_ID",
values_to = "count") %>%
group_by(Year, taxa_ID) %>%
summarise(total_count = sum(count)) -> count_omit_plankton
# calculating diversity values
count_omit_plankton %>%
group_by(Year) %>%
summarize(diversity_index = diversity(total_count,
index = "shannon")) -> div_omit_dat
# viewing data frames, they were not different
div_omit_dat
clean_div_data
# as they are not different, we did not remove taxa added after 1991
```
# cleaning and wrangling the pCO2 data
# time series analysis
```{r}
# turning data into time series for analysis
# the variable of interest (diversity and pCO2) are the data and the
# year of start is noted
ts_plankton <- ts(data = clean_div_data$diversity_index,
start = c(1991))
ts_pco <- ts(data = pco_raw$median_PCO2_TEQ,
start = c(1991))
# checking code worked
class(ts_plankton)
class(ts_pco)
# plotting time series across time to visualize patterns
plot(ts_plankton,
ylab = "Plankton Diversity")
plot(ts_pco,
ylab = "pCO2 (atm)")
plot(cbind(ts_plankton, ts_pco), yax.flip = TRUE,
main = "Time Series of Plankton Diversity and pCO2",
xlab = "Time"
)
# plotting using ggplot for final report to look cleaner
head(combined_data)
plankton_plot <- combined_data %>%
ggplot(aes(x = YEAR, y = diversity_index)) +
geom_line(colour = "lightgreen", size = 1) +
labs(x = "Year", y = "Shannon Diversity Index") +
ggtitle("Time Series of Plankton Diversity") +
theme_bw()
carbon_plot <- combined_data %>%
ggplot(aes(x = YEAR, y = median_PCO2_TEQ)) +
geom_line(colour = "lightblue", size = 1) +
labs(x = "Year", y = "pCO2 (atm)") +
ggtitle("Time Series of Ocean Acidity") +
theme_bw()
ggarrange(plankton_plot, carbon_plot,
ncol= 1,
nrow = 2) -> combo_plot
combo_plot
# cross correlation was performed - gives a plot and stores values
ccf(ts_pco, ts_plankton, ylab = "Cross-correlation",
main = "Results of Cross-Correlation") -> ccf_values
ccf_values
# ARIMA model was initially automatically fit
# turning pCO2 values into numeric values to be external regressor in model
as.numeric(ts_pco) -> pco_reg
model_auto <- auto.arima(ts_plankton, xreg = pco_reg)
model_auto
# also tried providing parameters to compare how they were to the autofitted
# model out of interest
model_arima1 <- arima(ts_plankton, xreg = pco_reg, order = c(0,1,0))
model_arima1
model_arima2 <- arima(ts_plankton, xreg = pco_reg, order = c(0,0,1))
model_arima2
model_arima3 <- arima(ts_plankton, xreg = pco_reg, order = c(1,0,0))
model_arima3
model_arima4 <- arima(ts_plankton, xreg = pco_reg, order = c(1,1,1))
model_arima4
# out of these models, the autofitted one did have the lowest AIC score
# therefore we proceeded with this model
# checking the model with Ljung-Box Test and plotting residuals
checkresiduals(model_auto)
# forecasting pCO2 values into the future from 2019 - 2035
forecast(ts_pco, h = 17) -> forecast_pco
# making the forecast values numerical so we can use them in the forecast
# as the future values of the external regressor
# calling the mean gives the forecast values
as.numeric(forecast_pco$mean) -> pco_forecast_values
pco_forecast_values
# using the autofitted model and the forecast values to predict diversity
forecast(model_auto, xreg = pco_forecast_values) -> diversity_forecast
print(diversity_forecast$mean)
# plotting our forecast
autoplot(diversity_forecast, ylab = "Forecast Diversity Index",
main = "Forecast Estimate From ARIMA Model")
```
---------Naveen's section-----------
```{r}
# reading data
atmo_data <- read_csv("LDEO_Database_V2019.csv")
```
```{r}
#filtering to convert date to just year and to fit plankton geographic range
atmo_data$`MONTH/DAY/YEAR` <- as.Date(atmo_data$`MONTH/DAY/YEAR`, format = "%m/%d/%Y")
atmo_data$`MONTH/DAY/YEAR`<-format(atmo_data$`MONTH/DAY/YEAR`, "%Y")
atmo_data <- atmo_data%>%
rename(YEAR = `MONTH/DAY/YEAR`)
atmo_data <- atmo_data %>%
filter(LAT >= 36.28 & LAT <= 64.907 & LON >= -74.743 & LON <= -23.092)
```
```{r}
#selecting medians from each year and filtering to 1991-2018
atmo_data_medians <- atmo_data %>%
group_by(YEAR) %>%
summarise(across(c(TEMP, PCO2_TEQ), median, .names = "median_{col}"))
```
```{r}
#reading in plankton data and merging it with pco2 data
plankton_diversity <- read_csv("div_clean_dat.csv")
plankton_diversity <- plankton_diversity %>%
rename(YEAR = Year)
combined_lm_data<- merge(atmo_data_medians, plankton_diversity) %>%
select(-...1, -median_TEMP)
combined_lm_data$YEAR <- as.numeric(combined_lm_data$YEAR)
combined_lm_data <- combined_lm_data %>%
filter(YEAR >=1991 & YEAR <= 2018)
```
```{r}
#creating linear models and checking output
model_pco2_diversity<- lm(formula = diversity_index ~ median_PCO2_TEQ, data = combined_lm_data)
summary(model_pco2_diversity)
model_pco2_year <- lm(median_PCO2_TEQ ~ YEAR, data = combined_lm_data)
summary(model_pco2_year)
```
```{r}
#checking the diagnostic plots
plot(model_pco2_diversity)
plot(model_pco2_year)
```
```{r}
#making pco2 forecasts
timeline <- data.frame(YEAR = 2019:2035)
timeline_prediction_pco2 <- predict(model_pco2_year, newdata = timeline)
pco2_predicted_vals <- data.frame(timeline_prediction_pco2) %>%
rename(median_PCO2_TEQ=timeline_prediction_pco2)
#making diversity forecasts with this data
timeline_prediction_diversity <- predict(model_pco2_diversity, newdata = pco2_predicted_vals)
predicted_diversity <- data.frame(timeline_prediction_diversity) %>%
mutate(YEAR = timeline$YEAR) %>%
mutate(median_PCO2_TEQ = pco2_predicted_vals$median_PCO2_TEQ) %>%
rename(diversity_index = timeline_prediction_diversity)
```
```{r}
#making plots
library(ggplot2)
#combining past and predicted data
prediction_past <- rbind(combined_lm_data, predicted_diversity)
prediction_past$color <- ifelse(prediction_past$YEAR > 2018, "slategrey", "black")
#making a plot for pco2
prediction_past %>%
ggplot(aes(x=YEAR, y=median_PCO2_TEQ)) +
geom_point(aes(color=color), size = 3) +
scale_color_identity() +
geom_smooth() +
theme_minimal() +
labs(title = "Change in pCO2 over time and predictions for the future", y = "Median pCO2", x = "Year")
#making a plot for diversity
prediction_past$color <- ifelse(prediction_past$YEAR > 2018, "lightgreen", "darkgreen")
prediction_past %>%
ggplot(aes(x=YEAR, y=diversity_index)) +
geom_point(aes(color=color), size=3) +
scale_color_identity() +
geom_smooth() +
theme_minimal() +
labs(title = "Change in Shannon Diversity over time and predictions for the future", y= "Shannon Diversity", x="Year")
```
```