This repository has been archived by the owner on Dec 22, 2024. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathGroupProject_Segmented.Rmd
146 lines (105 loc) · 4.05 KB
/
GroupProject_Segmented.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
---
title: "EEB313_SegmentedVirus"
output: html_document
date: "2024-11-05"
---
```{r}
library(tidyverse)
```
# CLEANING UP MAMMAL-VIRUS DATA
```{r}
# Load the mammal-virus association table.
mv_associations <- read.csv("MammalViruses_Associations.csv")
# Keep only the two columns needed downstream (host order and virus name),
# then normalise the virus name -- spaces become underscores and everything is
# lower-cased -- so it is formatted the same way as the virus-trait table.
mv_sample_select <- mv_associations |>
  select(HostOrder, Virus) |>
  mutate(Virus = tolower(gsub(" ", "_", Virus)))
```
# CLEANING UP VIRUS DATA
```{r}
# Load the virus-trait table.
virus_data <- read.csv("HP3_viruses.csv")
# For each nucleic-acid type (vDNAoRNA), show how many viruses are segmented
# (`sum`) out of how many viruses in total (`n`).
virus_data |>
  group_by(vDNAoRNA) |>
  summarize(sum = sum(vSegmentedTF == TRUE), n = n())
# The authors found no segmented DNA viruses in the dataset, so DNA viruses
# are dropped here: they would not be examined anyway.
# Keep RNA viruses only, keep the predictor columns (virus traits), and
# normalise the virus name (lower case, spaces -> underscores) so it is
# formatted the same way as the mammal-virus association data.
virus_select <- virus_data |>
  filter(vDNAoRNA == "RNA") |>
  select(
    vFamily, vVirusNameCorrected, vSegmentedTF, vCytoReplicTF,
    vGenomeAveLength, vEnvelope, vSSoDS, IsZoonotic
  ) |>
  mutate(vVirusNameCorrected = gsub(" ", "_", tolower(vVirusNameCorrected)))
```
```{r}
# Join the association table to the virus-trait table on the normalised virus
# name. `all = TRUE` makes this a full outer join: rows from either side that
# have no partner are kept, with NA in the columns of the other table.
df_merge <- merge(
  mv_sample_select, virus_select,
  by.x = "Virus", by.y = "vVirusNameCorrected",
  all = TRUE
)
# Drop every row that contains an NA in any column. This removes the unmatched
# rows produced by the outer join as well as rows with a missing trait value.
# (The authors noted this leaves roughly 20k rows, far fewer than before.)
df_noNA <- na.omit(df_merge)
```
```{r}
# Host orders with too few host-virus rows are excluded from the analysis.
# The cut-off is arbitrary; it is defined once here so that the table printed
# below and the filter that follows can never disagree.
min_rows_per_host <- 40
# Per host order: number of rows with a segmented virus (`sum`) and total
# number of rows (`n`).
host_order_counts <- df_noNA |>
  group_by(HostOrder) |>
  summarize(sum = sum(vSegmentedTF == TRUE), n = n())
# Show which host orders fall below the cut-off.
sparse_host_orders <- host_order_counts |>
  filter(n < min_rows_per_host)
sparse_host_orders
# Remove exactly those host orders. Deriving the names from the counts (rather
# than typing them out by hand) keeps the filter correct if the data, the
# cut-off, or the spelling/case of the HostOrder values changes.
df_hostremoved <- df_noNA |>
  filter(!HostOrder %in% sparse_host_orders$HostOrder)
# The final dataset used for modelling.
df_hostremoved
```
```{r}
# Recode the trait columns of df_hostremoved as numeric 0/1 indicators:
#   vEnvelope     : "enveloped" -> 1, "non-enveloped" -> 0
#   vSSoDS        : "SS" -> 1, "DS" -> 0
#   vSegmentedTF  : TRUE -> 1, FALSE -> 0
#   vCytoReplicTF : TRUE -> 1, FALSE -> 0
# case_when() has no fallback branch here, so any value matching neither
# condition becomes NA.
df_hostremoved <- df_hostremoved |>
  mutate(
    vEnvelope = case_when(
      vEnvelope == "enveloped" ~ 1,
      vEnvelope == "non-enveloped" ~ 0
    ),
    vSSoDS = case_when(
      vSSoDS == "SS" ~ 1,
      vSSoDS == "DS" ~ 0
    ),
    vSegmentedTF = case_when(
      vSegmentedTF == TRUE ~ 1,
      vSegmentedTF == FALSE ~ 0
    ),
    vCytoReplicTF = case_when(
      vCytoReplicTF == TRUE ~ 1,
      vCytoReplicTF == FALSE ~ 0
    )
  )
```
# GLM - MODEL COMPARISON
```{r}
# Final model WITHOUT host order: binomial GLM (logistic regression) of
# segmentation status on the virus traits.
# Note: `vGenomeAveLength * vEnvelope` already expands to both main effects
# plus their interaction, so the trailing `+ vEnvelope` adds no extra term; it
# is kept so the fitted call reads exactly as the authors wrote it.
model <- glm(
  vSegmentedTF ~ vGenomeAveLength * vEnvelope + vSSoDS * IsZoonotic +
    vCytoReplicTF + vEnvelope,
  family = "binomial",
  data = df_hostremoved
)
summary(model)
# Final model WITH host order: the same predictors plus HostOrder.
model2 <- glm(
  vSegmentedTF ~ HostOrder + vGenomeAveLength * vEnvelope +
    vSSoDS * IsZoonotic + vCytoReplicTF + vEnvelope,
  family = "binomial",
  data = df_hostremoved
)
summary(model2)
```
```{r}
# UNUSED MODEL: the authors originally planned to include virus family
# (vFamily) as a predictor but decided against it because of heavy
# multicollinearity. Kept for reference only.
model_unused <- glm(
  vSegmentedTF ~ vFamily + vGenomeAveLength * vEnvelope +
    vSSoDS * IsZoonotic + vCytoReplicTF + vEnvelope,
  family = "binomial",
  data = df_hostremoved
)
summary(model_unused)
```
```{r}
# VISUALIZATION OF FITTED MODEL, W/O HOST ORDER
# Inside aes(), `*` and `+` are ordinary arithmetic, not formula operators, so
# writing the model formula as the x aesthetic would plot a meaningless sum of
# products. Instead, the x axis is the linear predictor (log-odds) of the
# fitted `model`, i.e. the combination of predictors the GLM actually
# estimated. `newdata` is passed explicitly so the prediction vector always has
# one entry per row of df_hostremoved, even if glm() dropped rows with NA.
df_hostremoved |>
  mutate(
    linear_predictor = predict(model, newdata = df_hostremoved, type = "link")
  ) |>
  ggplot(aes(x = linear_predictor, y = vSegmentedTF)) +
  geom_point() +
  # Logistic curve of the observed 0/1 outcome against the linear predictor.
  geom_smooth(
    method = "glm",
    method.args = list(family = "binomial")
  )
```