GroupProject_Segmented.Rmd

---
title: "EEB313_SegmentedVirus"
output: html_document
date: "2024-11-05"
---

```{r}
library(tidyverse)
```

# CLEANING UP MAMMAL-VIRUS DATA
```{r}
# Load the raw mammal-virus association records.
mv_associations <- read.csv("MammalViruses_Associations.csv")

# Keep only the two columns we need (host order and virus name) and
# normalise the virus names so they can be matched against the virus-trait
# table: spaces become underscores, then everything is lower-cased.
mv_sample_select <- mv_associations |>
  select(HostOrder, Virus) |>
  mutate(Virus = tolower(gsub(" ", "_", Virus)))

```

# CLEANING UP VIRUS DATA
```{r}
# Load the virus-trait table.
virus_data <- read.csv("HP3_viruses.csv")

# For each genome type (DNA / RNA), count how many viruses are segmented
# (`sum`) out of the total number of viruses (`n`).
virus_data |>
  group_by(vDNAoRNA) |>
  summarize(
    sum = sum(vSegmentedTF == TRUE),
    n = n()
  )
# There are no segmented DNA viruses in the dataset, so we decided to remove
# all DNA viruses: we would not end up examining them anyway.

# Keep RNA viruses only, select the predictor columns (virus traits), and
# normalise the virus names (lower case, spaces -> underscores) so they match
# the mammal-virus association data.
virus_select <- virus_data |>
  filter(vDNAoRNA == "RNA") |>
  select(
    vFamily,
    vVirusNameCorrected,
    vSegmentedTF,
    vCytoReplicTF,
    vGenomeAveLength,
    vEnvelope,
    vSSoDS,
    IsZoonotic
  ) |>
  mutate(vVirusNameCorrected = gsub(" ", "_", tolower(vVirusNameCorrected)))
```


```{r}
# Join the association records to the virus traits on the normalised virus
# name. `all = TRUE` makes this a full outer join: rows from either table
# that have no partner are kept, with NA in the columns of the other table.
df_merge <- merge(
  x = mv_sample_select,
  y = virus_select,
  by.x = "Virus",
  by.y = "vVirusNameCorrected",
  all = TRUE
)

# Drop every row that contains an NA in any column. This removes the
# unmatched rows from the outer join as well as rows with missing traits.
# (About 20k rows remained when this was originally run, far fewer than
# before the merge.)
df_noNA <- df_merge |>
  na.omit()
```


```{r}
# Check which host orders don't have enough data and remove them.
# The cut-off of 40 records per host order is arbitrary.
min_host_records <- 40

# Per-host-order summary: number of segmented-virus records (`sum`) and
# total number of records (`n`).
host_order_counts <- df_noNA |>
  group_by(HostOrder) |>
  summarize(sum = sum(vSegmentedTF == TRUE), n = n())

# Display the host orders that fall below the cut-off.
host_order_counts |>
  filter(n < min_host_records)

# Derive the orders to drop from the counts rather than hard-coding their
# names, so the exclusion list always agrees with the data and the cut-off.
# (When this analysis was first written the sparse orders were cingulata,
# dasyuromorphia, eulipotyphla, peramelemorphia, pilosa and proboscidea.)
sparse_host_orders <- host_order_counts |>
  filter(n < min_host_records) |>
  pull(HostOrder)

# Remove host orders with fewer than `min_host_records` data points.
df_hostremoved <- df_noNA |>
  filter(!HostOrder %in% sparse_host_orders)

# Our final dataset.
df_hostremoved
```


```{r}
# Recode the categorical and logical predictors as 0/1 indicators:
#   vEnvelope     : "enveloped" -> 1, "non-enveloped" -> 0
#   vSSoDS        : "SS"        -> 1, "DS"            -> 0
#   vSegmentedTF  : TRUE        -> 1, FALSE           -> 0
#   vCytoReplicTF : TRUE        -> 1, FALSE           -> 0
# Any value not listed above becomes NA (case_when()'s default).
# NOTE: this overwrites df_hostremoved in place, so re-running this chunk on
# the already-recoded data turns vEnvelope and vSSoDS into NA; re-run the
# chunk that builds df_hostremoved first.
df_hostremoved <- df_hostremoved |>
  mutate(
    vEnvelope = case_when(
      vEnvelope == "enveloped" ~ 1,
      vEnvelope == "non-enveloped" ~ 0
    ),
    vSSoDS = case_when(
      vSSoDS == "SS" ~ 1,
      vSSoDS == "DS" ~ 0
    ),
    vSegmentedTF = case_when(
      vSegmentedTF == TRUE ~ 1,
      vSegmentedTF == FALSE ~ 0
    ),
    vCytoReplicTF = case_when(
      vCytoReplicTF == TRUE ~ 1,
      vCytoReplicTF == FALSE ~ 0
    )
  )
```

# GLM - MODEL COMPARISON
```{r}
# Final model, without host order: logistic regression of segmentation on
# genome length x envelope, strandedness x zoonotic status, and cytoplasmic
# replication. (The trailing `+ vEnvelope` repeats a main effect that the
# `vGenomeAveLength * vEnvelope` term already includes.)
model <- glm(
  formula = vSegmentedTF ~ vGenomeAveLength * vEnvelope + vSSoDS * IsZoonotic +
    vCytoReplicTF + vEnvelope,
  family = "binomial",
  data = df_hostremoved
)
summary(model)

# Final model, with host order: the same predictors plus HostOrder.
model2 <- glm(
  formula = vSegmentedTF ~ HostOrder + vGenomeAveLength * vEnvelope +
    vSSoDS * IsZoonotic + vCytoReplicTF + vEnvelope,
  family = "binomial",
  data = df_hostremoved
)
summary(model2)

```


```{r}
# UNUSED MODEL: we originally planned to include virus family as a predictor,
# but decided against it because of heavy multicollinearity. Kept for
# reference only.
model_unused <- glm(
  formula = vSegmentedTF ~ vFamily + vGenomeAveLength * vEnvelope +
    vSSoDS * IsZoonotic + vCytoReplicTF + vEnvelope,
  family = "binomial",
  data = df_hostremoved
)

summary(model_unused)

```


```{r}
# VISUALIZATION OF FITTED MODEL, W/O HOST ORDER
#
# Inside aes(), `a * b + c` is ordinary arithmetic on the columns, not a model
# formula, so passing the model's right-hand side as `x` would plot an
# arbitrary sum of products and let geom_smooth() fit a *different*
# one-variable logistic regression to it. Instead, take the linear predictor
# (log-odds) of the fitted `model` for every row, plot the observed 0/1
# outcome against it, and overlay the model's own fitted probability
# (the inverse logit of the linear predictor).
df_hostremoved |>
  mutate(
    linear_predictor = predict(model, newdata = df_hostremoved, type = "link"),
    fitted_probability = plogis(linear_predictor)
  ) |>
  # Rows with a missing predictor or outcome have no fitted value to plot.
  filter(!is.na(linear_predictor), !is.na(vSegmentedTF)) |>
  ggplot(aes(x = linear_predictor, y = vSegmentedTF)) +
  geom_point() +
  geom_line(aes(y = fitted_probability), colour = "blue") +
  labs(
    x = "Linear predictor of model (log-odds of being segmented)",
    y = "Segmented (1) / not segmented (0)"
  )
```