GROUP B - Master Code.Rmd

---
title: "Final Project Master Code"
output: pdf_document
date: "2024-12-10"
---

```{r, message=F, warning=F}
# Attach packages with library(): it stops with an error when a package is
# missing, whereas require() only warns and returns FALSE.
library(tidyverse)
library(lme4)
library(deSolve)
library(vegan)
library(car)
library(rgl)
library(ggfortify)
library(fitdistrplus)

# Configure rgl for knitr: null device, automatic printing of rgl scenes.
options(rgl.useNULL = TRUE)
rgl::setupKnitr(autoprint = TRUE)
```

```{r}
# Load the three input tables from the working directory.
Arthropod_List <- read_csv("Arthropod_List.csv")
Plant_Traits <- read_csv("Plant_Traits.csv")
Soil_Traits <- read_csv("Soil_Traits.csv")
```
## REARRANGE ARTHROPOD_LIST DATA FRAME SO THAT EACH SPECIES IS A COLUMN AND EACH SITE IS A ROW
```{r}
# Keep the Species column plus the site count columns (columns 5 to 79).
species_col <- Arthropod_List["Species"]
site_cols <- Arthropod_List[, 5:79]
new_AL <- data.frame(species_col, site_cols)

# Long format: one row per (Species, Site) pair with its Count.
AL_long <- pivot_longer(
  new_AL,
  cols = (starts_with("OP") | starts_with("RP") | starts_with("TP") |
            starts_with("IS") | starts_with("PS")),
  names_to = "Site",
  values_to = "Count"
)

# Wide format: one row per Site, one column per Species.
AL_wide <- pivot_wider(
  AL_long,
  names_from = Species,
  values_from = Count
)

Rearranged_Arthropod_List <- AL_wide
```


# MAKING SURE THE VEGAN PACKAGE DOES WHAT WE THINK IT DOES - DOING SHANNON DIVERSITY BY "HAND"...
```{r}
# Manual Shannon diversity, H = -sum(p_i * ln(p_i)), computed site by site
# as a cross-check of the vegan result.
Site_Manual <- Rearranged_Arthropod_List$Site

# One numeric slot per site, sized from the data rather than a hard-coded 75.
Shannon_Manual <- rep(NA_real_, length(Site_Manual))

Manual_Shannons <- data.frame(Site_Manual, Shannon_Manual)

for (i in seq_along(Site_Manual)) {

  # counts of each species at one site at a time
  # NOTE(review): assumes columns 2:350 are exactly the species columns --
  # confirm against the data.
  species_counts <- Rearranged_Arthropod_List[i, 2:350]

  # total number of individuals counted at that site
  total_individuals <- sum(species_counts)

  # proportions (p_i)
  proportions <- species_counts / total_individuals

  # p_i * ln(p_i) for each species; a zero count gives 0 * log(0) = NaN
  pi_ln_pi <- proportions * log(proportions)

  # sum all the p_i * ln(p_i); na.rm = TRUE drops the NaN terms
  shannon_diversity <- -sum(pi_ln_pi, na.rm = TRUE)

  Manual_Shannons[i, 2] <- shannon_diversity
}
```
# YAYYYY -- WE GOT THE EXACT SAME SHANNON DIVERSITIES!


#Distribution of Shannon Diversity against Site
```{r}
# One bar per site, bar height = manually computed Shannon index.
ggplot(
  data = Manual_Shannons,
  mapping = aes(x = Site_Manual, y = Shannon_Manual)
) +
  geom_col(fill = "skyblue", color = "black") +
  theme_minimal() +
  labs(
    title = "Shannon Diversity for each Site",
    x = "Site",
    y = "Shannon Diversity Index"
  )
```

## FINAL SHANNON DIVERSITY CALCULATION
```{r}
# Species count columns only (drops the Site column).
for_shannon <- Rearranged_Arthropod_List[, 2:350]

# Shannon index for every site (row), computed by vegan.
shannon_indices <- vegan::diversity(for_shannon, index = "shannon")
view(shannon_indices)
```

```{r}
# Pair every site label with its vegan Shannon index.
final_shannon <- data.frame(
  Site = Rearranged_Arthropod_List[["Site"]],
  Shannon = shannon_indices
)
```

#Histogram of Shannon Diversity -- kinda normal lookin' ?!
```{r}
# Histogram of the per-site Shannon indices, bins of width 0.05.
ggplot(data = final_shannon, mapping = aes(x = Shannon)) +
  geom_histogram(
    binwidth = 0.05,
    alpha = 0.7,
    colour = "white"
  )
```

#Another way to histogram... still kinda normal looking?!?!
```{r}
# Density-scaled histogram (freq = FALSE) so the density curve shares its axis.
hist(final_shannon$Shannon,
     freq = FALSE,
     breaks = 25,
     col = "lightblue",
     border = "black")
# Overlay a kernel density estimate of the same values.
lines(density(final_shannon[["Shannon"]]), col = "red", lwd = 2)
```

# Group Sites into Communities
```{r}
# Label each site with its community: the two-letter site prefix when it is
# one of the five known codes, otherwise "Other".
final_shannon$Community <- local({
  known_codes <- c("OP", "RP", "TP", "IS", "PS")
  site_prefix <- substr(final_shannon$Site, 1, 2)
  community <- ifelse(site_prefix %in% known_codes, site_prefix, "Other")
  # A missing site name stays missing, as in a nested ifelse() chain.
  community[is.na(site_prefix)] <- NA
  community
})
```

# Move Communities column to the left to be beside the Site column...
```{r}
# Column order: Site, Community, Shannon.
final_shannon <- final_shannon[c("Site", "Community", "Shannon")]
```

# Find the mean of the Shannon diversity indices for each of the five communities
# And also find the standard error for each community (used later to make error bars)
```{r}
# Per-community mean Shannon index and its standard error (sd / sqrt(n)).
bar_plot_shannon <- summarise(
  group_by(final_shannon, Community),
  Avg_Shannon = mean(Shannon),
  SE_Shannon = sd(Shannon) / sqrt(n())
)

# Order the communities along the "gradient" from fully native to fully
# invaded to restored back to native (also aligns with the original paper).
bar_plot_shannon[["Community"]] <- factor(
  bar_plot_shannon[["Community"]],
  levels = c("OP", "TP", "PS", "IS", "RP")
)
```

#Average SDI of each community with error bars!
```{r}
# Mean SDI per community with +/- one standard error bars.
ggplot(
  data = bar_plot_shannon,
  mapping = aes(x = Community, y = Avg_Shannon, fill = Community)
) +
  geom_col() +
  geom_errorbar(
    mapping = aes(
      ymin = Avg_Shannon - SE_Shannon,
      ymax = Avg_Shannon + SE_Shannon
    ),
    width = 0.2
  ) +
  # Print the rounded mean on each bar
  geom_text(
    mapping = aes(label = round(Avg_Shannon, 2)),
    vjust = 3,
    color = "black"
  ) +
  # Hard-coded "p < 0.001" label placed above the tallest bar
  annotate(
    "text",
    x = 5,
    y = max(bar_plot_shannon$Avg_Shannon) + 0.2,
    label = "p < 0.001",
    size = 3,
    hjust = -0.55,
    vjust = -0.55
  ) +
  labs(
    title = "Average Shannon Diversity Index (SDI) for each Community",
    x = "Community",
    y = "Average Shannon Diversity Index (SDI)"
  ) +
  # The fill legend repeats the x axis, so hide it
  theme(legend.position = "none")
```
# Find the significance using ANOVA (what's the p-value?)
```{r}
# One-way ANOVA of Shannon diversity across communities
anova_result <- aov(formula = Shannon ~ Community, data = final_shannon)

# ANOVA table (the p-value for Community is read from here)
summary(anova_result)

```
So the p-value for Community is 4.23e-08 or 0.0000000423.
This extremely small p-value indicates a highly significant effect of the Community factor on the response variable.

# Standard Error for Entire Dataset
```{r}
# Standard error of the Shannon index over all sites: sd / sqrt(n).
SE_total <- with(final_shannon, sd(Shannon) / sqrt(length(Shannon)))
SE_total
```
# Add all the predictor variable columns (soil and plant traits) onto final_shannon...
```{r}
# Append the soil and plant predictors as new columns.
# NOTE(review): columns are copied by position, so this assumes Soil_Traits and
# Plant_Traits rows are in the same site order as final_shannon -- confirm.
final_shannon <- mutate(
  final_shannon,
  soil_C = Soil_Traits$Soil_C,
  soil_N = Soil_Traits$Soil_N,
  soil_P = Soil_Traits$Soil_P,
  soil_pH = Soil_Traits$Soil_pH,
  soil_sal = Soil_Traits$Soil_Salinity,
  soil_wat = Soil_Traits$Soil_Water,
  biomass_above = Plant_Traits$Aboveground_Biomass,
  plant_dens = Plant_Traits$Plant_Density,
  leaf_N = Plant_Traits$Leaf_N,
  leaf_C = Plant_Traits$Leaf_C,
  leaf_P = Plant_Traits$Leaf_P
)
```

# Make sure all the number columns are actually numeric!!
*Note* that this also converts 'n/a' cells into NA -- we will have to replace these later with an estimate!
```{r}
# Columns 3 to 14 (Shannon plus the eleven predictors) are coerced to numeric.
final_shannon[3:14] <- lapply(final_shannon[3:14], as.numeric)
```
# First filter out the data to involve only the OP community and then exclude the 2 rows with "n/a" leaf_N values:
```{r}
# OP sites that have a recorded (non-NA) leaf_N value.
OPonly_noNA_FS <- filter(
  final_shannon,
  Community == "OP",
  !is.na(leaf_N)
)
```

# Then replace the 2 cells with the mean of the OP'S leaf_N column!...
```{r}
# Copy the table, then set leaf_N for sites OP14 and OP15 to the mean leaf_N
# of the OP sites that have a value.
replaced_FS <- final_shannon
replaced_FS$leaf_N[replaced_FS$Site %in% c("OP14", "OP15")] <-
  mean(OPonly_noNA_FS$leaf_N)

```

# Renaming master data frame to a shorter name bc I will have to use it over and over again in log-likelihood evaluators!...
```{r}
# Short alias for the master data frame; the log-likelihood evaluators further
# down read data$Shannon as their default input.
data <- replaced_FS
```

# OPTIONAL: Save the data dataframe as a csv file for sharing and future edits...
```{r}
# Save the master data frame. row.names = FALSE keeps the row index out of the
# file, so re-reading it does not produce an extra unnamed first column.
write.csv(data, "GROUP_B_MASTER_DATAFRAME.csv", row.names = FALSE)
```

### MAKING RANGES OF PARAMETER COMBINATIONS FOR EACH DISTRIBUTION

## dgamma parameters:
- shape: Controls the shape of the distribution. Low values -> Highly right-skewed. High values -> Symmetric or normal-like.
- scale: Scales the spread of the data (mean and variance). Larger values result in broader distributions.

## inverse gaussian parameters:
- mu (mean): Determines the central tendency of the distribution. It should align with the average value of your data.
- lambda (shape): Controls the variance and skewness. Larger values imply less skewness and lower variance.

```{r}
# MLE: LogLik = -19.28128, mean = 3.43, sd = 0.31
# Grid of every (mean, sd) pair; reassigned again in the "UPDATED" chunk below.
norm_combos <- expand.grid(
  mean = seq(from = 2.4, to = 4.2, by = 0.01),
  sd = seq(from = 0.15, to = 0.7, by = 0.01)
)
```

```{r}
# MLE: LogLik = -21.03414, shape = 98, scale = 0.035
# Grid of every (shape, scale) pair; reassigned again in the "UPDATED" chunk
# below.
gamma_combos <- expand.grid(
  shape = seq(from = 98, to = 100, by = 0.001),
  scale = seq(from = 0.03, to = 0.04, by = 0.001)
)

```

```{r}
# MLE: LogLik = -21.50412, mu = 3.43, lambda = 381.81
# Grid of every (mu, lambda) pair; reassigned again in the "UPDATED" chunk
# below.
invgaus_combos <- expand.grid(
  mu = seq(from = 2.4, to = 4.2, by = 0.01),
  lambda = seq(from = 375, to = 385, by = 0.01)
)
	
```

### UPDATED METE'S RANGES

```{r}
# Parameter grids searched by the log-likelihood evaluators. Each grid holds
# every combination of the two sequences given.

# MLE: -19.28128, mean = 3.43, sd = 0.31
norm_combos <- expand.grid(
  mean = seq(from = 2.4, to = 4.2, by = 0.01),
  sd = seq(from = 0.15, to = 0.7, by = 0.01)
)

# MLE: LogLik = -20.56567, shape = 116.1, scale = 0.0295
gamma_combos <- expand.grid(
  shape = seq(from = 100, to = 150, by = 0.1),
  scale = seq(from = 0.02, to = 0.04, by = 0.0005)
)

# MLE: LogLik = -21.72646, mu = 3.4, lambda = 379.6
invgaus_combos <- expand.grid(
  mu = seq(from = 2, to = 6, by = 0.1),
  lambda = seq(from = 200, to = 600, by = 0.1)
)
```


## LOG-LIKELIHOOD EVALUATORS
```{r}
# Log-likelihood of the observations `x` under Normal(mean, sd).
#   x    : numeric vector of observations (defaults to data$Shannon)
#   mean : mean of the normal distribution
#   sd   : standard deviation of the normal distribution
# Returns the summed log-density as a single number.
# dnorm(log = TRUE) is used rather than log(dnorm()) so that a very small
# density does not underflow to 0 and turn the sum into -Inf.
norm_loglik <- function(x = data$Shannon, mean, sd) {
  sum(dnorm(x = x, mean = mean, sd = sd, log = TRUE))
}
```

```{r}
# Log-likelihood of the observations `x` under Gamma(shape, scale).
#   x     : numeric vector of observations (defaults to data$Shannon)
#   shape : gamma shape parameter
#   scale : gamma scale parameter
# Returns the summed log-density as a single number.
# dgamma(log = TRUE) is used rather than log(dgamma()) so that a very small
# density does not underflow to 0 and turn the sum into -Inf.
gamma_loglik <- function(x = data$Shannon, shape, scale) {
  sum(dgamma(x = x, shape = shape, scale = scale, log = TRUE))
}
```

```{r}
# Log-likelihood of the observations `x` under an inverse Gaussian(mu, lambda).
#   x      : numeric vector of observations (defaults to data$Shannon)
#   mu     : mean parameter
#   lambda : shape parameter
# Returns the summed log-density as a single number.
# The density sqrt(lambda / (2*pi*x^3)) * exp(-lambda*(x-mu)^2 / (2*x*mu^2))
# is evaluated directly on the log scale, so a tiny exp() term cannot
# underflow to 0 and turn the sum into -Inf.
invgaus_loglik <- function(x = data$Shannon, mu, lambda) {
  log_density <- 0.5 * (log(lambda) - log(2 * pi) - 3 * log(x)) -
    (lambda * (x - mu)^2) / (2 * x * mu^2)
  sum(log_density)
}
```

```{r}

# Normal log-likelihood at every (mean, sd) row of norm_combos.
# vapply() over seq_len() allocates the result once and copes with an empty
# grid, unlike growing a vector inside `for (i in 1:nrow(...))`.
LogLik_norm <- vapply(
  seq_len(nrow(norm_combos)),
  function(i) {
    norm_loglik(mean = norm_combos$mean[i], sd = norm_combos$sd[i])
  },
  numeric(1)
)

# Log-likelihoods next to the parameters that produced them.
LLs_norm <- data.frame(LogLik_Norm = LogLik_norm,
                       mean = norm_combos$mean,
                       sd = norm_combos$sd)

```

```{r}

# Gamma log-likelihood at every (shape, scale) row of gamma_combos.
# vapply() over seq_len() allocates the result once and copes with an empty
# grid, unlike growing a vector inside `for (i in 1:nrow(...))`.
LogLik_gamma <- vapply(
  seq_len(nrow(gamma_combos)),
  function(i) {
    gamma_loglik(shape = gamma_combos$shape[i], scale = gamma_combos$scale[i])
  },
  numeric(1)
)

# Log-likelihoods next to the parameters that produced them.
LLs_gamma <- data.frame(LogLik_Gamma = LogLik_gamma,
                        shape = gamma_combos$shape,
                        scale = gamma_combos$scale)

```

```{r}

# Inverse Gaussian log-likelihood at every (mu, lambda) row of invgaus_combos.
# vapply() over seq_len() allocates the result once and copes with an empty
# grid, unlike growing a vector inside `for (i in 1:nrow(...))`.
LogLik_invgaus <- vapply(
  seq_len(nrow(invgaus_combos)),
  function(i) {
    invgaus_loglik(mu = invgaus_combos$mu[i],
                   lambda = invgaus_combos$lambda[i])
  },
  numeric(1)
)

# Log-likelihoods next to the parameters that produced them.
LLs_invgaus <- data.frame(LogLik_InvGaus = LogLik_invgaus,
                          mu = invgaus_combos$mu,
                          lambda = invgaus_combos$lambda)

```

```{r}
# Scatter of the normal log-likelihood against each grid parameter.
ggplot(data = LLs_norm, mapping = aes(x = mean, y = LogLik_Norm)) +
  geom_point()
ggplot(data = LLs_norm, mapping = aes(x = sd, y = LogLik_Norm)) +
  geom_point()

# Grid row(s) whose log-likelihood equals the largest value in the grid.
MLE_norm <- LLs_norm[
  which(LLs_norm$LogLik_Norm == max(LLs_norm$LogLik_Norm)),
]
MLE_norm
```

```{r}
# Scatter of the gamma log-likelihood against each grid parameter.
ggplot(data = LLs_gamma, mapping = aes(x = shape, y = LogLik_Gamma)) +
  geom_point()
ggplot(data = LLs_gamma, mapping = aes(x = scale, y = LogLik_Gamma)) +
  geom_point()

# Grid row(s) whose log-likelihood equals the largest value in the grid.
MLE_gamma <- LLs_gamma[
  which(LLs_gamma$LogLik_Gamma == max(LLs_gamma$LogLik_Gamma)),
]
MLE_gamma
```

```{r}
# Scatter of the inverse Gaussian log-likelihood against each grid parameter.
ggplot(data = LLs_invgaus, mapping = aes(x = mu, y = LogLik_InvGaus)) +
  geom_point()
ggplot(data = LLs_invgaus, mapping = aes(x = lambda, y = LogLik_InvGaus)) +
  geom_point()

# Grid row(s) whose log-likelihood equals the largest value in the grid.
MLE_invgaus <- LLs_invgaus[
  which(LLs_invgaus$LogLik_InvGaus == max(LLs_invgaus$LogLik_InvGaus)),
]
MLE_invgaus
```

# Find which variables are linearly and positively correlated...
```{r}
# Single plotting panel, then a scatterplot matrix of the eleven predictors.
par(mfrow = c(1, 1))

pairs(
  data[c(
    "soil_C", "soil_N", "soil_P", "soil_pH", "soil_sal", "soil_wat",
    "biomass_above", "plant_dens", "leaf_N", "leaf_C", "leaf_P"
  )]
)
```

# Test for collinearity?
```{r}
# Pairwise correlations between the eleven predictors; each pair uses the
# rows where both variables are present.
cor_matrix <- data[c(
  "soil_C", "soil_N", "soil_P", "soil_pH", "soil_sal", "soil_wat",
  "biomass_above", "plant_dens", "leaf_N", "leaf_C", "leaf_P"
)] |>
  cor(use = "pairwise.complete.obs")
print(cor_matrix)
```

```{r}
# Reload the saved master data frame from disk, replacing `data`.
data <- read_csv("GROUP_B_MASTER_DATAFRAME.csv")
```


### PCA TIME!
```{r}
# PCA on the eleven soil and plant predictors, each scaled to unit variance.
# scale. is given as TRUE rather than T, because T is an ordinary variable
# that can be reassigned.
pca.out <- prcomp(
  data[, c("soil_C", "soil_N", "soil_P", "soil_pH", "soil_sal",
           "soil_wat", "biomass_above", "plant_dens", "leaf_N",
           "leaf_C", "leaf_P")],
  scale. = TRUE
)
```

# PC1: (1) soil_N (-0.3543157) (2) soil_wat (-0.3536549) (3) soil_pH (0.3370312) (4) biomass_above (0.3269548) (5) soil_C (-0.3267151)
# PC2: (1) soil_C (0.44891398) (2) plant_dens (-0.40295140) (3) soil_sal (0.39164170) (4) soil_N (0.37709659)
```{r}
# Print the fitted PCA object (the loadings quoted above are read from here).
pca.out
```

```{r}
# Print the summary of the fitted PCA object.
summary(pca.out)
```

```{r}
# Scores of every row on the first two principal components.
PC1 <- pca.out$x[, "PC1"]
PC2 <- pca.out$x[, "PC2"]
```

```{r}
# Copy of the master data with the two principal-component scores appended.
data_with_PCs <- data
data_with_PCs[["PC1"]] <- PC1
data_with_PCs[["PC2"]] <- PC2
```

```{r}
# Candidate models of Shannon diversity on the principal components, with and
# without a random intercept for Community.
# The mixed models are fitted by maximum likelihood (REML = FALSE). lmer()
# defaults to REML, and a REML criterion is not comparable with the ML-based
# AIC of the lm() fits, nor between models with different fixed effects.
model1 <- lmer(Shannon ~ PC1 + (1 | Community),
               data = data_with_PCs, REML = FALSE)
model2 <- lmer(Shannon ~ PC1 * PC2 + (1 | Community),
               data = data_with_PCs, REML = FALSE)
model3 <- lm(Shannon ~ PC1 * PC2, data = data_with_PCs)
model4 <- lm(Shannon ~ PC1, data = data_with_PCs)
model5 <- lm(Shannon ~ PC2, data = data_with_PCs)
model6 <- lmer(Shannon ~ PC2 + (1 | Community),
               data = data_with_PCs, REML = FALSE)
```

```{r}
# Print the AIC of each candidate model, in the order model1 to model6.
invisible(lapply(
  list(model1, model2, model3, model4, model5, model6),
  function(fit) print(AIC(fit))
))
```

```{r}

# Add two sets of fitted values to the data, then plot them over the points.
# Uses left assignment and FALSE rather than `->` and the reassignable `F`.
predicted_values <- data_with_PCs |>
  mutate(
    # Fit without random effects.
    # NOTE(review): model4 is the plain lm() fit, so re.form presumably has no
    # effect here; if population-level predictions from the mixed model were
    # intended, this may need to be predict(model1, re.form = NA) -- confirm.
    fit.m = predict(model4, re.form = NA),
    # Fit including the Community random effects.
    fit.c = predict(model1, re.form = NULL)
  )

predicted_values |>
  ggplot(aes(x = PC1, y = Shannon, color = Community)) +
  geom_point(size = 3) +
  # Black line: fit.m. Coloured lines: fit.c, one per community.
  geom_line(inherit.aes = FALSE, aes(x = PC1, y = fit.m),
            color = "black", size = 2) +
  geom_line(inherit.aes = FALSE, aes(x = PC1, y = fit.c, color = Community),
            size = 1)


```