.Rhistory

# EXPLORING VIRUS DATA ----
# For each value of vCytoReplicTF (cytoplasmic replication flag), report
# `sum` = sum of (vSegmentedTF == TRUE) and `n` = number of rows in the group.
summarize(
  group_by(virus_data, vCytoReplicTF),
  sum = sum(vSegmentedTF == TRUE),
  n = n()
)
library(tidyverse)
# Fixed seed so the 1000-row test sample below is reproducible.
set.seed(313)
mv_associations <- read.csv("MammalViruses_Associations.csv")
# Work on a 1000-row sample first so the pipeline can be tested quickly,
# then run on all data.
mv_sample <- mv_associations %>% sample_n(1000)
# Keep only the host order and the virus name.
mv_sample_select <- select(mv_sample, HostOrder, Virus)
# Normalise virus names so they can be matched against the virus table:
# replace EVERY space with an underscore, then lower-case. gsub() is used
# because sub() only replaces the first space, which leaves names containing
# more than one space only partially converted.
mv_sample_select$Virus <- tolower(gsub(" ", "_", mv_sample_select$Virus))
# CLEANING UP VIRUS DATA ----
virus_data <- read.csv("HP3_viruses.csv")
# Keep the columns of interest and lower-case the corrected virus name so it
# can be compared with the (lower-cased) names in the association data.
virus_select <- virus_data |>
  select(vVirusNameCorrected, vSegmentedTF, vCytoReplicTF, vDNAoRNA) |>
  mutate(vVirusNameCorrected = tolower(vVirusNameCorrected))
# EXPLORING VIRUS DATA ----
# Per vCytoReplicTF group: `sum` = sum of (vSegmentedTF == TRUE),
# `n` = number of rows.
summarize(
  group_by(virus_data, vCytoReplicTF),
  sum = sum(vSegmentedTF == TRUE),
  n = n()
)
# The same two figures, grouped by nucleic-acid type (vDNAoRNA).
summarize(
  group_by(virus_data, vDNAoRNA),
  sum = sum(vSegmentedTF == TRUE),
  n = n()
)
# See proportion of cytoplasm-replicating viruses that are segmented.
virus_data |>
  group_by(vCytoReplicTF) |>
  summarize(sum = sum(vSegmentedTF == TRUE), n = n())
# NOTE(review): the original note at this point read "kept it, cause" and was
# never finished -- the rationale is unknown.
# See proportion of SS or DS viruses segmented. The grouping column is spelled
# `vSSoDS` (capital D); earlier attempts used `vSSods` and a mistyped
# `irus_data`, both of which fail, so only the working call is kept.
virus_data |>
  group_by(vSSoDS) |>
  summarize(sum = sum(vSegmentedTF == TRUE), n = n())
# EXPLORING VIRUS DATA ----
# See proportion of DNA/RNA viruses segmented: per vDNAoRNA group,
# `sum` = sum of (vSegmentedTF == TRUE) and `n` = number of rows.
# (The identical summary was previously run twice; once is enough.)
virus_data |>
  group_by(vDNAoRNA) |>
  summarize(sum = sum(vSegmentedTF == TRUE), n = n())
# Since there are no segmented DNA viruses in the dataset, decided to remove
# all DNA viruses.
# View() opens the data viewer, so it is only called in interactive sessions.
if (interactive()) {
  View(virus_data)
}
# Keep RNA viruses only, with the trait columns needed downstream.
# The nucleic-acid type must be compared against the string "RNA": a bare
# `RNA` (or passing it as a second filter() argument) is looked up as an
# object/column and fails, so those earlier attempts are dropped.
virus_select <- virus_data |>
  filter(vDNAoRNA == "RNA") |>
  select(
    vVirusNameCorrected, vSegmentedTF, vCytoReplicTF, vFamily,
    vGenomeAveLength, vEnvelope, vSSoDS, IsZoonotic
  )
# View() opens the data viewer, so it is only called in interactive sessions.
if (interactive()) {
  View(virus_select)
}
# Full outer join of the host associations and the virus traits on the virus
# name: rows without a partner on either side are kept and padded with NA.
df_merge <- merge(
  x = mv_sample_select,
  y = virus_select,
  by.x = "Virus",
  by.y = "vVirusNameCorrected",
  all = TRUE
)
# Drop every row that contains an NA in any column.
df_noNA <- na.omit(df_merge)
View(df_noNA)
View(df_merge)
mv_associations <- read.csv("MammalViruses_Associations.csv")
# The 1000-row test sample is disabled; the full association table is used.
# mv_sample <- mv_associations %>% sample_n(1000)
mv_sample <- mv_associations
# Keep only the host order and the virus name.
mv_sample_select <- select(mv_sample, HostOrder, Virus)
# Normalise virus names so they can be matched against the virus table:
# replace EVERY space with an underscore, then lower-case. gsub() is used
# because sub() only replaces the first space, which leaves names containing
# more than one space only partially converted.
mv_sample_select$Virus <- tolower(gsub(" ", "_", mv_sample_select$Virus))
# Attempt to merge the data frames by virus name.
# Keeps all rows of both tables, even the ones without a match (filled with NA).
df_merge <- merge(
  mv_sample_select, virus_select,
  by.x = "Virus", by.y = "vVirusNameCorrected",
  all.x = TRUE, all.y = TRUE
)
# Remove every row that contains an NA.
df_noNA <- na.omit(df_merge)
# CLEANING UP VIRUS DATA ----
virus_data <- read.csv("HP3_viruses.csv")
# See proportion of DNA/RNA viruses segmented: per vDNAoRNA group,
# `sum` = sum of (vSegmentedTF == TRUE) and `n` = number of rows.
virus_data |>
  group_by(vDNAoRNA) |>
  summarize(sum = sum(vSegmentedTF == TRUE), n = n())
# Since there are no segmented DNA viruses in the dataset, decided to remove
# all DNA viruses.
# Select the needed columns and keep only RNA viruses.
virus_select <- virus_data |>
  filter(vDNAoRNA == "RNA") |>
  select(
    vVirusNameCorrected, vSegmentedTF, vCytoReplicTF, vFamily,
    vGenomeAveLength, vEnvelope, vSSoDS, IsZoonotic
  )
# Normalise the virus names the same way as the association data: lower-case,
# then replace EVERY space with an underscore (sub() would only replace the
# first space in each name).
virus_select$vVirusNameCorrected <-
  gsub(" ", "_", tolower(virus_select$vVirusNameCorrected))
# Attempt to merge the data frames by virus name.
# Keeps all rows of both tables, even the ones without a match (filled with NA).
# The merge is done once, after the names have been fully normalised.
df_merge <- merge(
  mv_sample_select, virus_select,
  by.x = "Virus", by.y = "vVirusNameCorrected",
  all.x = TRUE, all.y = TRUE
)
# Remove every row that contains an NA.
df_noNA <- na.omit(df_merge)
mv_associations <- read.csv("MammalViruses_Associations.csv")
# The 1000-row test sample is disabled; the full association table is used.
# mv_sample <- mv_associations %>% sample_n(1000)
mv_sample <- mv_associations
# Keep host order and virus name; rewrite the virus name with every space
# turned into an underscore and the result lower-cased, to match the virus
# table's naming.
mv_sample_select <- mv_sample |>
  select(HostOrder, Virus) |>
  mutate(Virus = tolower(gsub(" ", "_", Virus)))
# CLEANING UP VIRUS DATA ----
virus_data <- read.csv("HP3_viruses.csv")
# Per vDNAoRNA group: `sum` = sum of (vSegmentedTF == TRUE), `n` = row count.
summarize(
  group_by(virus_data, vDNAoRNA),
  sum = sum(vSegmentedTF == TRUE),
  n = n()
)
# Since there are no segmented DNA viruses in the dataset, all DNA viruses are
# removed: only rows with vDNAoRNA equal to "RNA" are kept, together with the
# trait columns needed downstream. The virus name is lower-cased and every
# space is replaced by an underscore to match the association data.
virus_select <- virus_data |>
  filter(vDNAoRNA == "RNA") |>
  select(
    vVirusNameCorrected, vSegmentedTF, vCytoReplicTF, vFamily,
    vGenomeAveLength, vEnvelope, vSSoDS, IsZoonotic
  ) |>
  mutate(vVirusNameCorrected = gsub(" ", "_", tolower(vVirusNameCorrected)))
# Full outer join on the virus name: unmatched rows from either table are
# kept and padded with NA.
df_merge <- merge(
  x = mv_sample_select,
  y = virus_select,
  by.x = "Virus",
  by.y = "vVirusNameCorrected",
  all = TRUE
)
# Drop every row that contains an NA in any column.
df_noNA <- na.omit(df_merge)
# Full outer join of host associations and virus traits on the virus name;
# unmatched rows from either table are kept and padded with NA.
df_merge <- merge(
  x = mv_sample_select,
  y = virus_select,
  by.x = "Virus",
  by.y = "vVirusNameCorrected",
  all = TRUE
)
# Drop every row that contains an NA in any column.
df_noNA <- na.omit(df_merge)
# Per host order: `sum` = sum of (vSegmentedTF == TRUE), `n` = row count,
# stored as a plain data frame and printed.
df_sum <- as.data.frame(
  summarize(
    group_by(df_noNA, HostOrder),
    sum = sum(vSegmentedTF == TRUE),
    n = n()
  )
)
df_sum
# The same summary once more, printed directly without converting it.
summarize(
  group_by(df_noNA, HostOrder),
  sum = sum(vSegmentedTF == TRUE),
  n = n()
)
# Logistic regressions (binomial family) of segmentation status
# (vSegmentedTF) on virus traits, fitted on the NA-free merged table df_noNA.
#
# vDNAoRNA is not used as a predictor: virus_select, from which df_noNA is
# merged, does not keep that column, so a formula naming it fails. The first
# attempt that included it has been dropped.

# Virus traits only.
model <- glm(
  vSegmentedTF ~ vFamily + vGenomeAveLength + vEnvelope + vSSoDS + IsZoonotic,
  family = "binomial",
  data = df_noNA
)
summary(model)

# Virus traits plus host order.
model2 <- glm(
  vSegmentedTF ~ HostOrder + vFamily + vGenomeAveLength + vEnvelope + vSSoDS +
    IsZoonotic,
  family = "binomial",
  data = df_noNA
)
summary(model2)

# Refit both models with the cytoplasmic-replication flag added. These
# overwrite `model` and `model2`, so the final objects include vCytoReplicTF.
model <- glm(
  vSegmentedTF ~ vFamily + vGenomeAveLength + vEnvelope + vSSoDS +
    IsZoonotic + vCytoReplicTF,
  family = "binomial",
  data = df_noNA
)
summary(model)

model2 <- glm(
  vSegmentedTF ~ HostOrder + vFamily + vGenomeAveLength + vEnvelope + vSSoDS +
    IsZoonotic + vCytoReplicTF,
  family = "binomial",
  data = df_noNA
)
summary(model2)