LAST_Final Project Master Code.Rmd

---
title: "Final Project Master Code"
output: pdf_document
date: "2024-11-14"
---

# Loading packages
```{r, message=F, warning=F}
require(tidyverse)
require(deSolve)
require(vegan)
require(car)
require(rgl)
require(ggfortify)
require(fitdistrplus)

# Ask rgl to use its null device and register rgl's knitr hooks so that
# rgl scenes are printed into the knitted document automatically.
options(rgl.useNULL = TRUE) 
rgl::setupKnitr(autoprint = TRUE)
```

# Loading csv files
```{r}
# Input tables: the provided CSV files, one per sheet of the master Excel
# workbook (column names were edited for simplicity's sake).
Arthropod_List <- read_csv("Arthropod_List.csv")

Plant_Traits <- read_csv("Plant_Traits.csv")

Soil_Traits <- read_csv("Soil_Traits.csv")

```
## Rearranged Arthropod_List dataframe (each species is a column, each treatment is a row)
```{r}
# Species names plus columns 5 through 79 of the arthropod table,
# combined into one plain data frame.
species_col <- Arthropod_List["Species"]
teatment_cols <- Arthropod_List[, 5:79]

new_AL <- data.frame(species_col, teatment_cols)

# Stack every treatment column (names beginning with OP, RP, TP, IS or PS)
# into a "Treatment" label column and a "Count" value column.
AL_long <- pivot_longer(
  new_AL,
  cols = starts_with(c("OP", "RP", "TP", "IS", "PS")),
  names_to = "Treatment",
  values_to = "Count"
)

```

```{r}
# Spread the long table back out: one column per Species, filled with Count,
# leaving one row per Treatment.
AL_wide <- pivot_wider(AL_long, names_from = Species, values_from = Count)

fixed_arthro <- AL_wide
```


# Shannon Diversity Calculations
```{r}
# Keep columns 2 through 350 as the input to the index
# (the first and last columns do not contain data).
for_shannon <- fixed_arthro[, 2:350]

# Shannon diversity index of each row
shannon_indices <- diversity(x = for_shannon, index = "shannon")
```

```{r}
# Pair every treatment label with its Shannon index under explicit column names
final_shannon <- data.frame(
  Treatment = fixed_arthro$Treatment,
  Shannon = shannon_indices
)
```

## Histogram of Shannon Diversity 
Preliminary to determining distribution type.
```{r}
# Histogram of the Shannon index in bins of width 0.05
ggplot(data = final_shannon, mapping = aes(x = Shannon)) +
  geom_histogram(
    binwidth = 0.05,
    colour = "white",
    alpha = 0.7
  )
```

## Another way to draw the histogram... looking roughly normal
```{r}
# Density-scaled base histogram of the Shannon index
hist(
  x = final_shannon$Shannon,
  breaks = 25,
  probability = TRUE,
  col = "lightblue",
  border = "black"
)

# Overlay a kernel density estimate of the same values
shannon_density <- density(final_shannon$Shannon)
lines(shannon_density, col = "red", lwd = 2)
```
This is normal looking, however we need to determine distribution type in order to make the correct analysis and conclusions about our findings...

# Data wrangling
## Group Treatments into Communities
```{r}
# The data is split into 75 treatments -> five communities of 15 treatments each.
# To draw better conclusions we label each treatment with its community,
# which is the two-letter prefix of the treatment name.
community_codes <- c("OP", "RP", "TP", "IS", "PS")
treatment_prefix <- substr(final_shannon$Treatment, 1, 2)

# A recognised prefix is used as the label; anything else becomes "Other".
# A missing treatment name stays missing rather than being labelled "Other".
final_shannon$Community <- ifelse(
  is.na(treatment_prefix) | treatment_prefix %in% community_codes,
  treatment_prefix,
  "Other"
)
```

Rearranging columns so "Community" is beside "Treatment"
```{r}
# Column order: Treatment, Community, Shannon
column_order <- c("Treatment", "Community", "Shannon")
final_shannon <- final_shannon[column_order]
```

Visualization of Shannon diversity for each community 
```{r}
# One bar per community. Each treatment contributes its own segment and the
# segments are stacked, so bar height is the sum of the community's values.
ggplot(data = final_shannon, mapping = aes(x = Community, y = Shannon)) +
  geom_col(fill = "yellowgreen", color = "black") +
  theme_minimal() +
  labs(
    title = "Shannon Diversity for each Community",
    x = "Community",
    y = "Shannon Diversity"
  )
```


## Add all the predictor variable columns (soil and plant traits) onto final_shannon...
```{r}
# NOTE(review): predictor columns are attached by row position, which assumes
# Soil_Traits and Plant_Traits list treatments in the same order as
# final_shannon -- confirm against the CSV files.

# Soil predictors: new column name -> source column in Soil_Traits
soil_sources <- c(
  soil_C = "Soil_C",
  soil_N = "Soil_N",
  soil_P = "Soil_P",
  soil_pH = "Soil_pH",
  soil_sal = "Soil_Salinity",
  soil_wat = "Soil_Water"
)
for (predictor in names(soil_sources)) {
  final_shannon[[predictor]] <- Soil_Traits[[soil_sources[[predictor]]]]
}

# Plant predictors: new column name -> source column in Plant_Traits
plant_sources <- c(
  biomass_above = "Aboveground_Biomass",
  plant_dens = "Plant_Density",
  leaf_N = "Leaf_N",
  leaf_C = "Leaf_C",
  leaf_P = "Leaf_P"
)
for (predictor in names(plant_sources)) {
  final_shannon[[predictor]] <- Plant_Traits[[plant_sources[[predictor]]]]
}
```

## Making sure all the number columns are numeric!!
Note that this also converts 'n/a' cells into NA (with a coercion warning) -- we will have to replace these later with an estimate!
```{r}
# Columns 3 to 14 (Shannon and the predictor columns) are coerced to numeric;
# entries that cannot be parsed as numbers become NA.
numeric_cols <- 3:14
final_shannon[numeric_cols] <- lapply(final_shannon[numeric_cols], as.numeric)
```

### Saving the final_shannon dataframe as a csv file for sharing and future edits...
```{r}
# Export the assembled table; row names are written as the first column
# because write.csv() keeps them by default.
write.csv(x = final_shannon, file = "final_shannon.csv")
```

## Filtering data for our final dataset
OP14 and OP15 have missing (NA) leaf_N values (due to equipment failure) -> here, we replace them with the mean of their community instead...
```{r}
# Working copy of final_shannon for the leaf_N replacement; the same
# assignment is made again at the start of the replacement chunk further down.
replaced_FS <- final_shannon
```

```{r}
# Rows of the OP community that have a leaf_N value, i.e. OP without the
# rows whose leaf_N was recorded as "n/a".
OPonly_noNA_FS <- final_shannon %>%
  filter(Community == "OP", !is.na(leaf_N))
```


```{r}
# Fill leaf_N for treatments OP14 and OP15 with the mean leaf_N of the
# remaining OP rows.
replaced_FS <- final_shannon

op_leaf_N_mean <- mean(OPonly_noNA_FS$leaf_N)
needs_fill <- replaced_FS$Treatment == "OP14" | replaced_FS$Treatment == "OP15"

replaced_FS[needs_fill, "leaf_N"] <- op_leaf_N_mean
```

### Renaming master data frame to a shorter name because I will have to use it over and over again in log-likelihood evaluators!
```{r}
# Short alias for the cleaned data frame; the log-likelihood functions below
# read data$Shannon as their default input. This name shadows utils::data().
data <- replaced_FS
```

# Making ranges of parameter combinations for each distribution 
## MLE: We test for normal, gamma, and inverse gaussian because they are the only distribution types that can reasonably fit our data and are generally accepted in a generalized linear mixed model! 
```{r}
# Candidate parameter grids: every combination of the two parameters of each
# distribution, one combination per row.

# Normal: mean x sd
norm_combos <- expand.grid(
  mean = seq(from = 2.4, to = 4.2, by = 0.01),
  sd = seq(from = 0.15, to = 0.7, by = 0.01)
)

# Gamma: shape x scale
gamma_combos <- expand.grid(
  shape = seq(from = 100, to = 150, by = 0.1),
  scale = seq(from = 0.02, to = 0.04, by = 0.0005)
)

# Inverse Gaussian: mu x lambda
invgaus_combos <- expand.grid(
  mu = seq(from = 2, to = 6, by = 0.05),
  lambda = seq(from = 200, to = 600, by = 1)
)
```


# Log-likelihood evaluations
```{r}
# Log-likelihood of a normal distribution for a sample.
#
# x:    numeric vector of observations (defaults to data$Shannon)
# mean: mean of the candidate normal distribution
# sd:   standard deviation of the candidate normal distribution
#
# Returns the sum of the log densities of x.
norm_loglik <- function(x = data$Shannon, mean, sd) {
  # Ask dnorm() for log densities directly: taking log() of a density that
  # has underflowed to 0 would give -Inf for observations far from `mean`.
  sum(dnorm(x = x, mean = mean, sd = sd, log = TRUE))
}
```

```{r}
# Log-likelihood of a gamma distribution for a sample.
#
# x:     numeric vector of observations (defaults to data$Shannon)
# shape: shape parameter of the candidate gamma distribution
# scale: scale parameter of the candidate gamma distribution
#
# Returns the sum of the log densities of x.
gamma_loglik <- function(x = data$Shannon, shape, scale) {
  # Ask dgamma() for log densities directly: taking log() of a density that
  # has underflowed to 0 would give -Inf for poorly fitting parameters.
  sum(dgamma(x = x, shape = shape, scale = scale, log = TRUE))
}
```

```{r}
# Log-likelihood of an inverse Gaussian distribution for a sample.
#
# x:      numeric vector of observations (defaults to data$Shannon)
# mu:     mean parameter of the candidate distribution
# lambda: shape parameter of the candidate distribution
#
# Returns the sum of the log densities of x, where the density is
#   sqrt(lambda / (2 * pi * x^3)) * exp(-lambda * (x - mu)^2 / (2 * x * mu^2)).
invgaus_loglik <- function(x = data$Shannon, mu, lambda) {
  # The log of the density is written out term by term so that exp() is never
  # evaluated: for large lambda it underflows to 0 and log(0) would be -Inf.
  log_scale <- 0.5 * (log(lambda) - log(2 * pi) - 3 * log(x))
  log_kernel <- -lambda * (x - mu)^2 / (2 * x * mu^2)
  sum(log_scale + log_kernel)
}
```

```{r}

# Normal log-likelihood at every (mean, sd) row of the grid.
# vapply() allocates the result once instead of growing a vector inside a
# loop, and seq_len() gives an empty index if the grid has no rows.
LogLik_norm <- vapply(
  seq_len(nrow(norm_combos)),
  function(i) {
    norm_loglik(mean = norm_combos$mean[i], sd = norm_combos$sd[i])
  },
  numeric(1)
)

# Log-likelihoods alongside the parameter values that produced them
LLs_norm <- data.frame(
  LogLik_Norm = LogLik_norm,
  mean = norm_combos$mean,
  sd = norm_combos$sd
)

```

```{r}

# Gamma log-likelihood at every (shape, scale) row of the grid.
# vapply() allocates the result once instead of growing a vector inside a
# loop, and seq_len() gives an empty index if the grid has no rows.
LogLik_gamma <- vapply(
  seq_len(nrow(gamma_combos)),
  function(i) {
    gamma_loglik(shape = gamma_combos$shape[i], scale = gamma_combos$scale[i])
  },
  numeric(1)
)

# Log-likelihoods alongside the parameter values that produced them
LLs_gamma <- data.frame(
  LogLik_Gamma = LogLik_gamma,
  shape = gamma_combos$shape,
  scale = gamma_combos$scale
)

```

```{r}

# Inverse Gaussian log-likelihood at every (mu, lambda) row of the grid.
# vapply() allocates the result once instead of growing a vector inside a
# loop, and seq_len() gives an empty index if the grid has no rows.
LogLik_invgaus <- vapply(
  seq_len(nrow(invgaus_combos)),
  function(i) {
    invgaus_loglik(
      mu = invgaus_combos$mu[i],
      lambda = invgaus_combos$lambda[i]
    )
  },
  numeric(1)
)

# Log-likelihoods alongside the parameter values that produced them
LLs_invgaus <- data.frame(
  LogLik_InvGaus = LogLik_invgaus,
  mu = invgaus_combos$mu,
  lambda = invgaus_combos$lambda
)

```

```{r}
# Log-likelihood drawn against each normal parameter in turn
ggplot(data = LLs_norm, mapping = aes(x = mean, y = LogLik_Norm)) +
  geom_line()
ggplot(data = LLs_norm, mapping = aes(x = sd, y = LogLik_Norm)) +
  geom_line()

# Grid row(s) whose log-likelihood equals the largest value on the grid
best_norm_rows <- which(LLs_norm$LogLik_Norm == max(LLs_norm$LogLik_Norm))
MLE_norm <- LLs_norm[best_norm_rows, ]
MLE_norm
```

```{r}
# Log-likelihood drawn against each gamma parameter in turn
ggplot(data = LLs_gamma, mapping = aes(x = shape, y = LogLik_Gamma)) +
  geom_line()
ggplot(data = LLs_gamma, mapping = aes(x = scale, y = LogLik_Gamma)) +
  geom_line()

# Grid row(s) whose log-likelihood equals the largest value on the grid
best_gamma_rows <- which(LLs_gamma$LogLik_Gamma == max(LLs_gamma$LogLik_Gamma))
MLE_gamma <- LLs_gamma[best_gamma_rows, ]
MLE_gamma
```

```{r}
# Log-likelihood drawn against each inverse Gaussian parameter in turn
ggplot(data = LLs_invgaus, mapping = aes(x = mu, y = LogLik_InvGaus)) +
  geom_line()
ggplot(data = LLs_invgaus, mapping = aes(x = lambda, y = LogLik_InvGaus)) +
  geom_line()

# Grid row(s) whose log-likelihood equals the largest value on the grid
best_invgaus_rows <- which(
  LLs_invgaus$LogLik_InvGaus == max(LLs_invgaus$LogLik_InvGaus)
)
MLE_invgaus <- LLs_invgaus[best_invgaus_rows, ]
MLE_invgaus
```

# Find which variables are linearly and positively correlated
```{r}
# Single-panel plotting layout
par(mfrow = c(1, 1))

# Scatterplot matrix of the soil and plant predictors
pairs_columns <- c(
  "soil_C", "soil_N", "soil_P", "soil_pH", "soil_sal", "soil_wat",
  "biomass_above", "plant_dens", "leaf_N", "leaf_C", "leaf_P"
)
pairs(data[pairs_columns])
```

# Test for collinearity?
```{r}
# Correlations between the soil and plant predictors; each pair of columns
# uses the rows where both values are present.
cor_columns <- c(
  "soil_C", "soil_N", "soil_P", "soil_pH", "soil_sal", "soil_wat",
  "biomass_above", "plant_dens", "leaf_N", "leaf_C", "leaf_P"
)
cor_matrix <- cor(data[cor_columns], use = "pairwise.complete.obs")
print(cor_matrix)
```

# PCA
```{r}
# Principal component analysis of the soil and plant predictors, with each
# variable scaled to unit variance before the decomposition.
pca_columns <- c(
  "soil_C", "soil_N", "soil_P", "soil_pH", "soil_sal", "soil_wat",
  "biomass_above", "plant_dens", "leaf_N", "leaf_C", "leaf_P"
)
pca.out <- prcomp(data[pca_columns], scale. = TRUE)
```

```{r}
# Print the fitted PCA object
pca.out
```

```{r}
# Print the summary of the PCA fit
summary(pca.out)
```

```{r}
# Show the first rows of the rotation matrix stored on the PCA object
head(pca.out$rotation)
```
## Visualized PCA
```{r}
# Plot of the first two principal components, coloured by Community,
# with a frame drawn per colour group.
pca_plot <- autoplot(
  pca.out,
  x = 1,
  y = 2,
  data = data,
  colour = "Community",
  frame = TRUE
)
pca_plot + theme_classic()
```

#Things to do after PCA: regression on PC1, PC2, and interaction!! Discriminate!...