For interactive code and other delights, please view this file on Org mode 9.1.13 on Emacs 25.3.1.
The analyses were performed on macOS High Sierra 10.13.6 using Python version 3.6.3 and R version 3.4.3. R session information is provided at the end of this file.
- tidyverse
- data.table
- nlme
- vegan
- profileR
- stringr
- multtest
- broom
- readxl
- gplots
- multcomp
- pathview
- pandas
- beautifulsoup4
- epride (available at https://github.com/manutamminen/epride)
Available at https://phytozome.jgi.doe.gov/pz/portal.html#!info?alias=Org_Creinhardtii.
Requires registration. Be sure to download the files
- Creinhardtii_281_v5.0.fa
- Creinhardtii_281_v5.5.gene.gff3
- ChlamydomonasTranscriptNameConversionBetweenReleases.Mch12b.txt
Inputs | Outputs |
---|---|
Creinhardtii_281_v5.5.gene.gff3 | annot_genes.csv |
ChlamydomonasTranscriptNameConversionBetweenReleases.Mch12b.txt | conversion.csv |
Creinhardtii_281_v5.0.fa |
import pandas as pd
import epride as ep
## Convert the C. reinhardtii gff3 annotation file into a csv
annot = pd.read_csv("Creinhardtii_281_v5.5.gene.gff3", sep="\t", comment="#", header=None)
annot_genes = annot[annot[2] == "gene"]
annot_genes[9] = annot_genes[8].str.split(";").apply(lambda x: x[1]).str.split("=").apply(lambda x: x[1])
annot_genes.to_csv("annot_genes.csv")
## Convert the C. reinhardtii annotation transcript file into a csv
conversion = pd.read_csv("ChlamydomonasTranscriptNameConversionBetweenReleases.Mch12b.txt",
comment="#", sep=r"\s*", header=None)
conversion.to_csv("conversion.csv", index=None)
This is the starting point of the processing of the MaxQuant output files. proteinGroups.txt contains all peptide hist while proteinGroupsGoodColNames.csv excludes organelle hits.
Inputs | Outputs |
---|---|
proteinGroupsGoodColNames.csv | good_prot_hits_norm.csv |
proteinGroups.txt | good_prot_hits_norm_organelles.csv |
full_good_prot_hits_norm.csv | |
full_good_prot_hits_norm_organelles.csv | |
prot_table.csv | |
prot_table_organelles.csv | |
full_prot_table.csv | |
full_prot_table_organelles.csv |
## Define a function which extracts the Razor columns from the MaxQuant output and optionally
## truncates the gene IDs. Output in csv.
def process_raw(input_file, output_file, sep=",", cols=False, full=False):
protein_groups = pd.read_csv(input_file, sep=sep)
good_prots = protein_groups[~(protein_groups['Protein IDs'].str.contains("CON")) &
~(protein_groups['Protein IDs'].str.contains("REV"))]
razors = [i for i in good_prots.columns if "Razor" in i][1:]
razors = ['Protein IDs'] + razors
good_prot_hits = good_prots[razors]
good_prot_hits = good_prot_hits.set_index('Protein IDs')
if not full:
good_prot_hits.index = [".".join(i.split(".")[:2]) for i in good_prot_hits.index]
good_prot_hits_norm = good_prot_hits/good_prot_hits.sum(axis=0)
if cols:
good_prot_hits_norm.columns = cols
good_prot_hits_norm.to_csv(output_file)
else:
good_prot_hits_norm.to_csv(output_file)
cols = list(good_prot_hits_norm.columns)
return(cols)
## Define a function which transposes the peptide count tables and truncates the row
## names, leaving only the Strain, Treatment and Replicate IDs.
def table_convert(input_file, output_file):
gphn = pd.read_csv(input_file)
gphn.index = gphn.iloc[:,0]
gphn = gphn.iloc[:,1:gphn.shape[0]]
gphn = gphn.transpose()
gphn.index = [i.split()[4] for i in list(gphn.index)]
gphn.to_csv(output_file)
## Process the MaxQuant output
columns_names = process_raw("proteinGroupsGoodColNames.csv",
"good_prot_hits_norm.csv")
process_raw("proteinGroups.txt",
"good_prot_hits_norm_organelles.csv",
sep="\t", cols=columns_names)
process_raw("proteinGroupsGoodColNames.csv",
"full_good_prot_hits_norm.csv", full=True)
process_raw("proteinGroups.txt",
"full_good_prot_hits_norm_organelles.csv",
sep="\t", cols=columns_names, full=True)
## Transpose the tables and tidy up the row names.
table_convert("good_prot_hits_norm.csv", "prot_table.csv")
table_convert("good_prot_hits_norm_organelles.csv", "prot_table_organelles.csv")
table_convert("full_good_prot_hits_norm.csv", "full_prot_table.csv")
table_convert("full_good_prot_hits_norm_organelles.csv", "full_prot_table_organelles.csv")
Inputs | Outputs |
---|---|
prot_table.csv | mean_corrected.csv |
median_corrected.csv | |
full_prot_table.csv | full_corrected_mean.csv |
full_median_corrected.csv | |
prot_table_organelles.csv | mean_corrected_organelles.csv |
median_corrected_organelles.csv | |
full_prot_table_organelles.csv | full_mean_corrected_organelles.csv |
full_median_corrected_organelles.csv |
## Define a function which subtracts control means and medians from each Treatment in
## each Strain. Output as csvs.
def subtract_means(input_file, mean_output_file, median_output_file):
gphn = pd.read_csv(input_file, index_col=0)
gphn['Strain'] = [i.split("_")[0] for i in list(gphn.index)]
gphn['Treatment'] = [i.split("_")[1] for i in list(gphn.index)]
gphn['Replicate'] = [i.split("_")[2] for i in list(gphn.index)]
gphn.loc[gphn['Treatment'] == 'C', 'Replicate'] = 'r1'
gphn.loc[gphn['Replicate'] == 'r1.1', 'Replicate'] = 'r1'
control_means = gphn[gphn['Treatment'] == 'control'] \
.groupby(['Strain', 'Treatment']).mean() \
.reset_index().drop('Treatment', 1).set_index('Strain')
control_medians = gphn[gphn['Treatment'] == 'control'] \
.groupby(['Strain', 'Treatment']).median() \
.reset_index().drop('Treatment', 1).set_index('Strain')
mean_corrected = gphn.drop('Replicate', 1).set_index('Strain') \
.groupby('Treatment').apply(lambda x: x - control_means)
median_corrected = gphn.drop('Replicate', 1).set_index('Strain') \
.groupby('Treatment').apply(lambda x: x - control_medians)
mean_corrected.drop('Treatment', 1).drop_duplicates().to_csv(mean_output_file)
median_corrected.drop('Treatment', 1).drop_duplicates().to_csv(median_output_file)
## Prepare this for all proteome tables, organelle-encoded peptides included and excluded
## and truncated as well as non-truncated gene IDs.
subtract_means("prot_table.csv",
"mean_corrected.csv",
"median_corrected.csv")
subtract_means("full_prot_table.csv",
"full_mean_corrected.csv",
"full_median_corrected.csv")
subtract_means("prot_table_organelles.csv",
"mean_corrected_organelles.csv",
"median_corrected_organelles.csv")
subtract_means("full_prot_table_organelles.csv",
"full_mean_corrected_organelles.csv",
"full_median_corrected_organelles.csv")
Inputs | Outputs |
---|---|
conversion.csv | ids.txt |
id_conversion_raw.csv | annotation_table.csv |
annotation_table_chr.csv | |
id_conversion.csv |
awk -F, '{print $3}' conversion.csv | grep XM > ids.txt
Upload this to the pantherdb and download the resulting table as id_conversion_raw.txt. Convert into a proper csv:
awk '{print $1","$2}' id_conversion_raw.txt | awk -F'=' '{print $2","$3}' \
| awk '{gsub("\\|UniProtKB", ""); print $0}' | awk -F, 'NF == 3' > id_conversion.csv
Download the KEGG annotations for chromosomal genes…
os.chdir("KEGG_Chr")
pid = str(os.getpid())
## Extract all relevant gene IDs from the file id_conversion.csv
with open("../id_conversion.csv") as ids:
entries = [entr.split(",")[0] for entr in ids]
## Download the KEGG entries for the relevant gene IDs. Save these
## with suffix '.koe'.
with open("log_chr.txt", "a") as f:
acc = []
f.write(pid + "\n")
for entry in entries:
try:
page = pd.read_html("http://www.genome.jp/dbget-bin/www_bget?cre:" + entry)
page[0].to_csv(entry + ".koe")
f.write(entry + "passed\n")
f.flush()
except Exception as e:
print(e) # For debugging
f.write(entry + "failed\n")
f.flush()
…and chloroplast-encoded genes.
os.chdir("../KEGG_Cp")
pid = str(os.getpid())
## Download the KEGG entries for all the chloroplast-encoded gene IDs. Save these
## with suffix '.koe'.
with open("log_cp.txt", "a") as f:
acc = []
f.write(pid + "\n")
for i in range(1, 71):
entry_id = 'ChreCp{num:03d}'.format(num=i) # Construct the chloroplast gene IDs.
try:
page = pd.read_html("http://www.genome.jp/dbget-bin/www_bget?cre:" + entry_id)
page[0].to_csv(entry_id + ".koe")
f.write(entry_id + "passed\n")
f.flush()
except Exception as e:
print(e) # For debugging
f.write(entry_id + "failed\n")
f.flush()
And parse this annotation data into a single table for chromosomally encoded and organelle genomes
os.chdir("../KEGG_Cp")
## Define a parser function for the KEGG entries. Extract fields such as
## 'Entry', 'KO', 'Motif', 'Definition', 'Brite', 'Molecule', 'Other DBs'
## 'Pathway' and 'Module'.
def parse_entry(entry):
acc = {}
file_name = entry.split(".")[0]
entry = pd.read_csv(entry)
it = entry.iloc[0].items()
acc['Ensembl'] = file_name
for _, line in it:
if str(line) == "Entry":
_, acc['Entry'] = next(it)
if str(line) == "KO":
_, acc['KO'] = next(it)
if str(line) == "Motif":
_, acc['Motif'] = next(it)
if str(line) == "Definition":
_, acc['Definition'] = next(it)
if str(line) == "Brite":
_, acc['Brite'] = next(it)
if str(line) == "Molecule":
_, acc['Motif'] = next(it)
if str(line) == "Other DBs":
_, acc['Other DBs'] = next(it)
if str(line) == "Pathway":
_, acc['Pathway'] = next(it)
if str(line) == "Module":
_, acc['Module'] = next(it)
return acc
## Parse each file with the '.koe' suffix using the parse_entry function.
parsed = [parse_entry(i) for i in os.listdir() if ".koe" in i]
## Extract the Gene_ID, Protein_ID and UniProt columns and save as a csv.
acc2 = [{key: val.replace(u'\xa0', u' ') for key, val in i.items()} for i in parsed]
annot_table = pd.DataFrame(acc2)
annot_table['Gene_ID'] = annot_table['Other DBs'] \
.str \
.split("NCBI").apply(lambda x: x[1]).str \
.split(" ").apply(lambda x: x[1])
annot_table['Protein_ID'] = annot_table['Other DBs'] \
.str.split("NCBI").apply(lambda x: x[2]).str \
.split(" ").apply(lambda x: x[1]).str \
.split("UniProt").apply(lambda x: x[0])
annot_table['UniProt'] = annot_table['Other DBs'].str \
.split("NCBI").apply(lambda x: x[2]).str \
.split(" ").apply(lambda x: x[-1])
annot_table.to_csv("annotation_table.csv")
…and for organelle genomes
import pandas as pd
import os
os.chdir("../KEGG_Chr")
## Parse each file with the '.koe' suffix using the parse_entry function.
parsed = [parse_entry(i) for i in os.listdir() if ".koe" in i]
## Extract the Gene_ID, Protein_ID and UniProt columns and save as a csv.
acc2 = [{key: val.replace(u'\xa0', u' ') for key, val in i.items()} for i in parsed]
annot_table = pd.DataFrame(acc2)
annot_table['Gene_ID'] = annot_table['Other DBs'] \
.str.split("NCBI").apply(lambda x: x[1]) \
.str.split("?").apply(lambda x: x[1])
annot_table['Protein_ID'] = annot_table['Other DBs'].str \
.split("JGI").apply(lambda x: x[0]).str \
.split("?").apply(lambda x: x[2]).str \
.split("UniProt").apply(lambda x: x[0])
annot_table['UniProt'] = annot_table['Other DBs'].str \
.split("?").apply(lambda x: x[-1])
annot_table.to_csv("annotation_table_chr.csv")
Inputs | Outputs |
---|---|
full_prot_table.csv | |
annot_genes.csv | |
conversion.csv | |
full_mean_corrected.csv |
library(tidyverse)
library(nlme)
library(vegan)
library(profileR)
library(stringr)
library(multtest)
library(broom)
library(gplots)
library(multcomp)
library(readxl)
prot_table <- read_csv("full_prot_table.csv") %>%
separate(X1, into=c("Strain", "Treatment"), sep="_")
prot_table_replicates <- read_csv("full_prot_table.csv") %>%
separate(X1, into=c("Strain", "Treatment", "Replicate"), sep="_")
prot_table_replicates[prot_table_replicates$Treatment == "control",
"Treatment"] <- prot_table_replicates[prot_table_replicates$Treatment == "control",
c("Strain", "Treatment")] %>%
with(., paste(Strain, Treatment))
long_prot_table <- gather(prot_table, key=Gene, value=Expression, -Strain, -Treatment)
annot_genes <- read.csv("annot_genes.csv")
conversion <- read.csv("conversion.csv")
conversion$ID <- strsplit(as.character(conversion$X0), "\\.") %>%
sapply(function(x) paste(x[1], x[2], sep="."))
conversion <- merge(annot_genes, conversion, by.x="X9", by.y="ID") %>%
dplyr::select(X9, X0.x, X3.x, X2.y) %>% filter(X2.y != "--")
conversion$Locus <- with(conversion, paste(X0.x, X3.x, sep="X"))
conversion <- dplyr::select(conversion, X9, X2.y, Locus)
names(conversion) <- c("Genome_handle", "Panther_handle", "Locus")
mean_corrected <- read_csv("full_mean_corrected.csv")
mean_long_corrected <- gather(mean_corrected, key=Gene, value=Expr_level,
-Treatment, -Strain) %>%
filter(Treatment != "control")
mean_control_long_corrected <- gather(mean_corrected, key=Gene,
value=Expr_level, -Treatment, -Strain) %>%
filter(Treatment == "control")
Inputs | Outputs |
---|---|
rda_centroid_eb_treatment.pdf | |
rda_centroid_eb_strain.pdf | |
polar_plot_mean.pdf | |
rda1_tukey.pdf | |
rda2_tukey.pdf | |
angle_tukey.pdf | |
dist_tukey.pdf |
PDF name | Corresponding figure |
---|---|
rda_centroid_eb_treatment.pdf | Fig. 1A |
rda_centroid_eb_strain.pdf | Supplementary Fig. 2 |
polar_plot_mean.pdf | Fig. 1B |
rda1_tukey.pdf | Supplementary Fig. 3A |
rda2_tukey.pdf | Supplementary Fig. 3B |
angle_tukey.pdf | Supplementary Fig. 4A |
dist_tukey.pdf | Supplementary Fig. 4B |
cbbPalette <- c("#000000", "#E69F00", "#56B4E9", "#009E73", "#F0E442", "#0072B2", "#D55E00", "#CC79A7")
## Define a function that processes the RDA data and draws scatter plots with error bars
draw_rda <- function(FULL.cap, output_file, color_column)
{
CAP1 <- scores(FULL.cap, display="wa", scaling=3)[,1]
CAP2 <- scores(FULL.cap, display="wa", scaling=3)[,2]
Res.dim <- as.data.frame(scores(FULL.cap, display="wa", scaling=3)[,1:2])
Res.dim$Strain <- prot_table$Strain
Res.dim$Treatment <- prot_table$Treatment
group_by_(Res.dim, color_column) %>%
summarise(CAP1mean=mean(CAP1), CAP2mean=mean(CAP2), CAP1sd=sd(CAP1), CAP2sd=sd(CAP2)) %>%
ggplot(aes_string(x="CAP1mean", y="CAP2mean", color=color_column)) +
geom_point(data=Res.dim, aes_string(x="CAP1", y="CAP2", color=color_column)) +
geom_errorbarh(aes(xmin = CAP1mean - CAP1sd, xmax = CAP1mean + CAP1sd)) +
geom_errorbar(aes(ymin = CAP2mean - CAP2sd, ymax = CAP2mean + CAP2sd)) +
scale_colour_manual(values=cbbPalette) +
theme_bw() + theme(legend.position="none")
ggsave(filename=output_file, plot=last_plot())
}
## Prepare the RDA analyses and figures
spe <- dplyr::select(prot_table, -Treatment, -Strain)
FULL.cap <- capscale(spe ~ Treatment + Condition(Strain), data=prot_table)
FULL.cap_str <- capscale(spe ~ Strain + Condition(Treatment), data=prot_table)
draw_rda(FULL.cap, "rda_centroid_eb_treatment.pdf", "Treatment")
draw_rda(FULL.cap_str, "rda_centroid_eb_strain.pdf", "Strain")
# Test for significant differences between Strains and Treatments
anova(FULL.cap, data=prot_table) # Treatments: p < 0.001
anova(FULL.cap_str, data=prot_table) # Strains: p = 0.762
## Define a function to subtract means/medians from strain-aggregated tables
normalize_rda <- function(Res.dim, norm_func)
{
filter(Res.dim, Treatment=="control") %>%
group_by(Strain) %>%
summarise(CAP1mean=norm_func(CAP1), CAP2mean=norm_func(CAP2)) %>%
full_join(Res.dim, by="Strain") %>%
mutate(CAP1=CAP1-CAP1mean, CAP2=CAP2-CAP2mean) %>%
dplyr::select(-CAP1mean, -CAP2mean) %>%
filter(Treatment != "control")
}
## Define a function to draw the Tukey test results
draw_tukey <- function(dist_test, output_file)
{
tuk1 <- TukeyHSD(dist_test)
psig <- as.numeric(apply(tuk1$Treatment[,2:3],1,prod)>=0)+1
op <- par(mar=c(4.2,9,3.8,2))
pdf(output_file)
plot(tuk1,col=psig,yaxt="n")
for (j in 1:length(psig)){
axis(2,at=j,labels=rownames(tuk1$Treatment)[length(psig)-j+1],
las=1,cex.axis=.8,col.axis=psig[length(psig)-j+1])
}
par(op)
dev.off()
}
## Test whether there's a difference on RDA axis 1
Res.dim <- as.data.frame(scores(FULL.cap, display="wa", scaling=3)[,1:2])
Res.dim$Strain <- prot_table$Strain
Res.dim$Treatment <- prot_table$Treatment
rda_norm <- normalize_rda(Res.dim, mean)
## Test whether normality assumption holds for the data;
## set the significance at p < 0.05
group_by(Res.dim, Treatment) %>%
summarise(p_cap1 = tidy(shapiro.test(CAP1))$p.value,
p_cap2 = tidy(shapiro.test(CAP2))$p.value,
sig_cap1 = p_cap1 < 0.05,
sig_cap2 = p_cap2 < 0.05)
## Only violated by CAP2 of Low light treatment (p = 0.0326)
## Test the similarity of the variances
lapply(c("S", "B", "L", "N", "P", "BS", "C"),
function(x) filter(Res.dim, Treatment == "control" | Treatment == x) %>%
var.test(CAP1 ~ Treatment, data=.) %>%
tidy) %>%
bind_rows %>%
.$p.value
lapply(c("S", "B", "L", "N", "P", "BS", "C"),
function(x) filter(Res.dim, Treatment == "control" | Treatment == x) %>%
var.test(CAP2 ~ Treatment, data=.) %>%
tidy) %>%
bind_rows %>%
.$p.value
## Only violated by P on CAP2 (p = 0.035). Otherwise p > 0.5
dist_test <- aov(lm(CAP1~Treatment, data=rda_norm))
draw_tukey(dist_test, "rda1_tukey2.pdf")
## Test whether there's a difference on RDA axis 2
dist_test <- aov(lm(CAP2~Treatment, data=rda_norm))
draw_tukey(dist_test, "rda2_tukey2.pdf")
## Define a function to calculate the treatment angles and distances from the ancestral origin
calculate_angles <- function(rda_norm, norm_func)
{
rda_norm <- normalize_rda(rda_norm, norm_func)
treatment_angle <- as.factor(rda_norm$Treatment)
levels(treatment_angle) <- c(270, 270, 90, 270, 270, 270, 270)
treatment_angle <- as.numeric(as.character(treatment_angle))
treatment_angle[21] <- 270
rda_norm$Angle <- -atan(rda_norm$CAP2/rda_norm$CAP1) * 180 / pi + treatment_angle
rda_norm$Dist <- sqrt(rda_norm$CAP1^2 + rda_norm$CAP2^2)
rda_means <- group_by(rda_norm, Treatment) %>%
summarise(Mean_angle=mean(Angle), Mean_dist=mean(Dist))
rda_norm <- rbind(rda_norm, rda_norm[1,])
rda_norm[length(rda_norm$Angle),'Angle'] <- 360
rda_norm[length(rda_norm$Angle),'Dist'] <- 0
rda_norm <- rbind(rda_norm, rda_norm[1,])
rda_norm[length(rda_norm$Angle),'Angle'] <- 0
rda_norm[length(rda_norm$Angle),'Dist'] <- 0
rda_list <- list(norm=rda_norm, means=rda_means)
return(rda_list)
}
## Define a function to draw the polar plots
draw_angles <- function(rda_list, output_file)
{
rda_norm <- rda_list[['norm']]
rda_means <- rda_list[['means']]
ggplot() +
geom_point(data=rda_norm, aes(x=Angle, y=Dist, color=Treatment)) +
geom_point(data=rda_means,
aes(x=Mean_angle, y=Mean_dist, color=Treatment, size=5)) +
coord_polar(theta="x", start=0) +
scale_colour_manual(values=cbbPalette) + theme_bw()
ggsave(filename=output_file, plot=last_plot())
}
## Draw the polar plots
calculate_angles(Res.dim, mean) %>%
draw_angles("polar_plot_mean.pdf")
## Perform Tukey tests to check which treatments are significantly different in their direction or distances
angle_norm <- calculate_angles(Res.dim, mean)[['norm']]
dist_test <- aov(lm(Angle~Treatment, data=angle_norm))
draw_tukey(dist_test, "angle_tukey.pdf")
dist_test <- aov(lm(Dist~Treatment, data=angle_norm))
draw_tukey(dist_test, "dist_tukey.pdf")
Find those proteins which are significantly different from the Ancestors. Prepare the heatmaps for Figure 3 and Supplementary Fig. 6.
Inputs | Outputs |
---|---|
full_mean_corrected.csv | org_genes.txt |
full_mean_corrected_organelles.csv | panther_annotations.txt |
dunnett_table.csv | |
dunnett_table_organelles.csv | |
parallel.pdf | |
divergent.pdf | |
opposites.txt |
PDF name | Corresponding figure |
---|---|
parallel.pdf | Part of Fig. 3 |
divergent.pdf | Part of Fig. 3 |
opposite.pdf | Part of Supplementary Fig. 5 |
P_VAL <- 0.05
corrected <- read_csv("full_mean_corrected.csv")
long_corrected <- gather(corrected, key=Gene, value=Expr_level, -Treatment, -Strain) %>%
filter(Treatment != "control")
corrected_organelles <- read_csv("full_mean_corrected_organelles.csv")
long_corrected_organelles <- gather(corrected_organelles, key=Gene,
value=Expr_level, -Treatment, -Strain) %>%
filter(Treatment != "control")
# Test the normality assumption
group_by(long_corrected, Gene) %>%
summarise(p_val = shapiro.test(Expr_level)$p.value) %>%
ggplot(aes(x=p_val)) + geom_density() + scale_x_log10()
# Prepare t-tests to test which groups significantly differ from zero
non_zeros <- group_by(long_corrected, Gene) %>%
summarise(sig=t.test(Expr_level)$p.value) %>%
filter(sig < P_VAL)
sig_long_corrected <- long_corrected[long_corrected$Gene %in% non_zeros$Gene,]
non_zeros_organelles <- group_by(long_corrected_organelles, Gene) %>%
summarise(sig=t.test(Expr_level)$p.value) %>%
filter(sig < P_VAL)
sig_long_corrected_organelles <- long_corrected[long_corrected$Gene %in% non_zeros$Gene,]
## Prepare the gene id file for Panther to recover the annotations
unique(long_corrected_organelles$Gene) %>% .[grepl("sp", .)] %>%
sapply(function(x) strsplit(x, "\\|")) %>% sapply(function(x) x[2]) %>%
as.character %>% write("org_genes.txt")
# Feed the resulting file "org_genes.txt" into PantherDB
# Download the results as panther_annotations.txt
# Test the significance of the detected number of proteins by a permutation test
permute_matrix <- function() {
l_c <- mutate(long_corrected, Treatment=sample(Treatment),
Strain=sample(Strain), Gene=sample(Gene))
non_zeros <- group_by(l_c, Gene) %>% summarise(sig=t.test(Expr_level)$p.value) %>%
filter(sig < 0.01)
l_c[l_c$Gene %in% non_zeros$Gene,]
}
perms <- lapply(1:1000, function(x) permute_matrix()$Gene %>% unique %>% length)
perm_list <- do.call(rbind, perms)
data.frame(a=perm_list) %>% ggplot(aes(x=a)) + geom_density() + theme_bw()
# This identifies a distribution with a mean of 36.028 and standard deviation of 5.86.
# This is significantly different from the observed value of 1304
# Out of 1000 permutations, not a single one exceeds 1304. Therefore p < 0.001.
# Then use Dunnett tests to check where at least one of the expressed proteins is
## significantly different from the controls
test_dunnett <- function(corrected, protein) {
current <- filter(corrected, Gene == protein)
controls <- filter(current, Treatment == "C")
controls$Treatment <- "A"
rest <- filter(current, Treatment != "C")
test_data <- rbind(controls, rest)
test_data$Treatment <- as.factor(test_data$Treatment)
fit <- aov(Expr_level ~ Treatment, test_data)
test_summary <- summary(glht(fit, linfct=mcp(Treatment="Dunnett")))
tidy(test_summary)
}
write_dunnett_table <- function(zero_list, corrected, output_file)
{
proteins <- unique(zero_list$Gene)
dunnett_tests <- list()
for (protein in proteins) {
print(protein)
test_table <- test_dunnett(corrected, protein)
test_table$protein_id <- protein
dunnett_tests[[protein]] <- test_table
}
dunnett_table <- do.call(rbind, dunnett_tests)
write_csv(dunnett_table, output_file)
}
write_dunnett_table(non_zeros,
long_corrected,
"dunnett_table.csv")
write_dunnett_table(non_zeros_organelles,
long_corrected_organelles,
"dunnett_table_organelles.csv")
## Test for difference to the controls: Dunnett test
## Prepare this again with a new corrected table with untruncated protein names!!
dunnett_table <- read_csv("dunnett_table.csv")
sig_dunnett_table <- group_by(dunnett_table, protein_id) %>%
mutate(min_p_val=min(p.value)) %>%
filter(min_p_val < P_VAL)
sig_proteins <- unique(sig_dunnett_table$protein_id)
mean_corrected <- group_by(long_corrected, Treatment, Gene) %>%
summarise(Mean_expr=mean(Expr_level))
dunnett_mean_corrected <- filter(mean_corrected, Gene %in% sig_proteins)
dunnett_long_corrected <- filter(long_corrected, Gene %in% sig_proteins, Treatment != "C")
# Check those dunnett-positives where the control is significantly different from zero
dunnett_long_corrected_c <- filter(long_corrected, Gene %in% sig_proteins)
# Perform Friedman tests for each gene (excluding the control) to check whether the
## response is parallel
friedman_tests <- list()
for (gene in unique(dunnett_long_corrected$Gene)) {
grp <- filter(dunnett_long_corrected, Treatment != "C", Gene == gene)
fit <- friedman.test(Expr_level ~ Treatment | Strain, data=grp)
p_val <- tidy(fit)$p.value
grp$anova_p_val <- p_val
friedman_tests[[gene]] <- grp
}
friedman_tests <- do.call(rbind, friedman_tests)
my_palette <- colorRampPalette(c("blue", "white", "red"))(n = 299)
## Define a function to prepare the heatmap matrix
prepare_matrix <- function(friedman)
{
friedman_expr <- filter(dunnett_long_corrected_c, Gene %in% friedman)
friedman_ids <- unique(friedman_expr$Gene) %>%
strsplit("\\.") %>%
lapply(function(x) paste(x[1], x[2], sep=".")) %>%
do.call(rbind, .)
friedman_wide <- group_by(friedman_expr, Treatment, Gene) %>%
summarise(Mean_expr=mean(Expr_level)) %>%
spread(key=Gene, value=Mean_expr)
friedman_matrix <- as.matrix(friedman_wide[,-1])
rownames(friedman_matrix) <- friedman_wide$Treatment
friedman_matrix <- t(friedman_matrix)
sig_dif_ctr <- filter(friedman_expr, Treatment == "C") %>%
group_by(Gene) %>%
summarise(sig=t.test(Expr_level)$p.value) %>%
filter(sig < P_VAL)
sig_dif_loci <- match(sig_dif_ctr$Gene, rownames(friedman_matrix))
friedman_rows <- rownames(friedman_matrix)
row_frame <- data.frame(a=friedman_rows, b="", stringsAsFactors = FALSE)
row_frame[sig_dif_loci, 'b'] <- "*"
rownames(friedman_matrix) <- row_frame$b
return(friedman_matrix)
}
## Define a function to spit out the heatmap files
draw_heatmap <- function(test_matrix, file_name,
treatment_order=c('C', 'B', 'BS', 'L', 'N', 'P', 'S'))
{
pdf(file_name)
heatmap.2(test_matrix[,treatment_order],
main = "Category 2",
notecol="black",
density.info="none",
trace="none",
margins =c(3,25),
col=my_palette,
cexRow=0.5,
cexCol=0.5,
dendrogram="row",
Colv="NA")
dev.off()
}
## Plot the convergent and divergent responses
convergent <- friedman_tests[friedman_tests$anova_p_val >= 0.1,] %>% .$Gene %>% unique
divergent <- friedman_tests[friedman_tests$anova_p_val < 0.1,] %>% .$Gene %>% unique
prepare_matrix(convergent) %>%
draw_heatmap("convergent.pdf")
prepare_matrix(divergent) %>%
draw_heatmap("divergent.pdf")
## Plot also those responses where at least one Treatment has an opposite direction from the others.
all_prots <- c(parallel, divergent)
opposite_expr <- filter(dunnett_long_corrected_c, Gene %in% all_prots)
divergent_wide <- group_by(opposite_expr, Treatment, Gene) %>% summarise(Mean_expr=mean(Expr_level)) %>%
spread(key=Gene, value=Mean_expr) %>%
filter(Treatment != "C")
treatment_labels <- divergent_wide$Treatment
opposites <- colSums(divergent_wide < 0)
opposite_matrix <- as.matrix(divergent_wide[opposites != 6 & opposites != 0])
rownames(opposite_matrix) <- treatment_labels
opposite_matrix <- t(opposite_matrix)
opposite_prot_ids <- rownames(opposite_matrix)
rownames(opposite_matrix) <- panther_cre[match(rownames(opposite_matrix), panther_cre$X0), 15]
rownames(opposite_matrix)[rownames(opposite_matrix) == ""] <- NA
opposite_rows <- rownames(opposite_matrix)
draw_heatmap(opposite_matrix, "opposite.pdf", c('B', 'BS', 'L', 'N', 'P', 'S'))
## Check the opposite responses on David for enrichment
sapply(opposite_prot_ids, function(x) strsplit(x, "\\.")) %>%
sapply(function(x) paste(x[1], x[2], sep=".")) %>%
match(., conversion$Genome_handle) %>%
conversion[., 'Panther_handle'] %>%
as.character %>%
.[!is.na(.)] %>%
write("opposites.txt")
Inputs | Outputs |
---|---|
Creinhardtii_281_v5.5.gene.gff3 | parallels.highlight |
divergents.highlight | |
.hist files for Circos | |
chrom_distr.pdf |
PDF name | Corresponding figure |
---|---|
chrom_distr.pdf | Part of Fig. 2 |
### Prepare the histogram files for circos
### The configuration files reside in ~/Scratch/proteomics/Proteomics/circos/fig1
chr_positions <- read_tsv("~/Scratch/chlamy/Creinhardtii_281_v5.5.gene.gff3",
skip=2, col_names=FALSE) %>%
filter(X3 == "gene") %>% separate(col=X9, sep=";", into="X10") %>%
separate(col=X10, sep="=", into=c("X11", "X12")) %>%
dplyr::select(X1, X4, X5, X12) %>%
separate(col="X1", sep="_", into=c("X2", "X3")) %>%
dplyr::select(X3, X4, X5, X12) %>%
mutate(X3=paste("cr", X3, sep="")) %>%
separate(col=X12, into=c("Chr", "Id"), sep="\\.") %>%
mutate(Gene=paste(Chr, Id, sep=".")) %>% dplyr::select(-Chr, -Id)
slcp <- separate(sig_long_corrected, col=Gene, into=c("Chr", "Id"), sep="\\.") %>%
mutate(Gene=paste(Chr, Id, sep=".")) %>%
dplyr::select(-Chr, -Id) %>%
group_by(Treatment, Gene) %>%
summarise(Mean_expr=mean(Expr_level)) %>%
merge(chr_positions, by="Gene")
for (treatment in unique(slcp$Treatment)) {
tmp <- filter(slcp, Treatment==treatment) %>%
dplyr::select(X3, X4, X5, Mean_expr)
path <- "circos/"
name <- paste(path, treatment, ".hist", sep="")
write_tsv(tmp, name, col_names=FALSE)
}
# Compile a table of the chromosomal loci for different categories
slcp2 <- slcp
slcp2$Cat <- "cat0"
slcp2[match(parallel_ids, slcp2$Gene), 'Cat'] <- "Parallel"
slcp2[match(divergent_ids, slcp2$Gene), 'Cat'] <- "Divergent"
slcp2 <- slcp2[slcp2$Cat != 'cat0',]
slcp2 <- group_by(slcp2, X3, Cat) %>% summarise(n=n())
pdf("chrom_distr.pdf")
ggplot(slcp2, aes(x=X3, y=n)) +
geom_bar(stat="identity", position="dodge") +
facet_grid(Cat~X3, scales="free_x") +
theme_bw()
dev.off()
# Output positions for the parallel responses for Circos
tmp <- strsplit(parallel_ids, "\\.") %>%
sapply(function(x) paste(x[1], x[2], sep="."))
sig_zeros <- filter(chr_positions, Gene %in% tmp)
sig_zeros$Mean_expr <- 1
dplyr::select(sig_zeros, X3, X4, X5, Mean_expr) %>%
write_tsv("circos/parallels.highlight", col_names=FALSE)
# Output positions for the divergent responses for Circos
tmp <- strsplit(divergent_ids, "\\.") %>%
sapply(function(x) paste(x[1], x[2], sep="."))
sig_zeros <- filter(chr_positions, Gene %in% tmp)
sig_zeros$Mean_expr <- 1
dplyr::select(sig_zeros, X3, X4, X5, Mean_expr) %>%
write_tsv("circos/divergents.highlight", col_names=FALSE)
#Test the distribution of the changes across chromosomes; chi square test
cat_tot <- c(divergent_ids, parallel_ids) %>%
strsplit("\\.") %>%
lapply(function(x) paste(x[1], x[2], sep=".")) %>%
do.call(rbind, .) %>%
as.character
sig_slcp <- slcp[slcp$Gene %in% cat_tot,]
table(sig_slcp$X3, sig_slcp$Gene) %>% chisq.test
# p < 2.2e-16
Inputs | Outputs |
---|---|
panther_annotations.txt | parallel_pos_ids.txt |
conversion.csv | parallel_neg_ids.txt |
selected_clusters.csv | divergent_pos_ids.txt |
divergent_neg_ids.txt | |
enrichment.pdf | |
opposite_pie.pdf |
PDF name | Corresponding figure |
---|---|
enrichment.pdf | Part of Fig. 3 |
opposite_pie.pdf | Part of Supplementary Fig. 5 |
panther <- read.csv("panther_annotations.txt", sep="\t", header=F)
panther$KEGG <- panther$V1 %>%
as.character %>%
strsplit("\\=") %>%
sapply(function(x) x[2]) %>%
strsplit("\\|") %>%
sapply(function(x) x[1])
conv <- read.csv("conversion.csv")
panther_cre <- merge(panther, conv, by.x="V2", by.y="X2")
## Output the gene lists from parallel and divergent responses for enrichment analysis
## parallel
parallel <- friedman_tests[friedman_tests$anova_p_val >= 0.1,] %>%
.$Gene %>% unique
parallel_expr <- filter(dunnett_long_corrected_c, Gene %in% parallel)
parallel_ids <- unique(parallel_expr$Gene) %>%
strsplit("\\.") %>%
lapply(function(x) paste(x[1], x[2], sep=".")) %>%
do.call(rbind, .)
parallel_wide <- group_by(parallel_expr, Treatment, Gene) %>%
summarise(Mean_expr=mean(Expr_level)) %>%
spread(key=Gene, value=Mean_expr)
parallel_matrix <- as.matrix(parallel_wide[,-1])
rownames(parallel_matrix) <- parallel_wide$Treatment
parallel_matrix <- t(parallel_matrix)
rownames(parallel_matrix) <- panther_cre[match(rownames(parallel_matrix),
panther_cre$X0), 1]
parallel_means <- parallel_matrix[,c('B', 'BS', 'L', 'N', 'P', 'S')] %>%
apply(1, mean)
parallel_pos <- names(parallel_means[parallel_means > 0])
write(parallel_pos[!is.na(parallel_pos)], "parallel_pos_ids.txt")
parallel_pos <- names(parallel_means[parallel_means < 0])
write(parallel_pos[!is.na(parallel_pos)], "parallel_neg_ids.txt")
## divergent
divergent <- friedman_tests[friedman_tests$anova_p_val < 0.1,] %>%
.$Gene %>%
unique
divergent_expr <- filter(dunnett_long_corrected_c, Gene %in% divergent)
divergent_ids <- unique(divergent_expr$Gene) %>%
strsplit("\\.") %>%
lapply(function(x) paste(x[1], x[2], sep=".")) %>%
do.call(rbind, .)
divergent_wide <- group_by(divergent_expr, Treatment, Gene) %>%
summarise(Mean_expr=mean(Expr_level)) %>%
spread(key=Gene, value=Mean_expr)
divergent_matrix <- as.matrix(divergent_wide[,-1])
rownames(divergent_matrix) <- divergent_wide$Treatment
divergent_matrix <- t(divergent_matrix)
rownames(divergent_matrix) <- panther_cre[match(rownames(divergent_matrix),
panther_cre$X0), 1]
divergent_means <- divergent_matrix[,c('B', 'BS', 'L', 'N', 'P', 'S')] %>%
apply(1, mean)
divergent_pos <- names(divergent_means[divergent_means > 0])
write(divergent_pos[!is.na(divergent_pos)], "divergent_pos_ids.txt")
divergent_pos <- names(divergent_means[divergent_means < 0])
write(divergent_pos[!is.na(divergent_pos)], "divergent_neg_ids.txt")
## Prepare the enrichment analyses of these files on
## https://david.ncifcrf.gov/
## Then, after combining and curating the files, import into R:
enriched_clusters <- read_csv("selected_clusters.csv")
pdf("enrichment.pdf")
ggplot(enriched_clusters, aes(x="", y=Count, fill=Term)) +
geom_bar(width=1, stat="identity") +
coord_polar("y", start=0) +
facet_grid(.~Category)
dev.off()
opposite_enrichment <- read_excel("david_opposites.xlsx")
pdf("opposite_pie.pdf")
ggplot(opposite_enrichment, aes(x="", y=Count, fill=Term)) +
geom_bar(width=1, stat="identity") +
coord_polar("y", start=0)
dev.off()
Inputs | Outputs |
---|---|
annotation_table.csv | pathway PDF files |
annotation_table_chr.csv | |
id_conversion.csv | |
conversion.csv | |
mean_corrected.csv | |
dunnett_table_organelles.csv |
PDF name | Corresponding figure |
---|---|
Pathway PDF files | Parts of Fig. 4 and Supplementary Data 1 |
library(pathview)
kegg_res <- rbind(read_csv("KEGG_Cp/annotation_table.csv"),
read_csv("KEGG_Chr/annotation_table_chr.csv"))
kegg_conversion <- kegg_res[,c("Protein_ID", "Ensembl")]
id_conversion <- read_csv("id_conversion.csv",
col_names = c("KEGG", "UniProt", "Panther_handle"))
conversion <- read.csv("conversion.csv")
conversion$ID <- strsplit(as.character(conversion$X0), "\\.") %>%
sapply(function(x) paste(x[1], x[2], sep="."))
conversion <- dplyr::select(conversion, ID, X2) %>%
filter(X2 != "--")
names(conversion) <- c("Genome_handle", "Panther_handle")
#This is for cytoscape and KEGG visualization
mean_corr <- read_csv("mean_corrected.csv")
long_corrected <- gather(mean_corr, key=Gene, value=Expr_level, -Treatment, -Strain)
mean_corrected <- group_by(long_corrected, Treatment, Gene) %>%
summarise(Mean_expr=mean(Expr_level))
mean_corrected <- merge(mean_corrected, conversion, by.x='Gene', by.y='Genome_handle')
mean_corrected <- merge(mean_corrected, id_conversion, by="Panther_handle")
#Also include the organelle genomes
mean_corr_org <- read_csv("mean_corrected_organelles.csv")
org_genes <- mean_corr_org[,c(1:2, 3366:3417)]
long_corrected_org <- gather(org_genes, key=Gene, value=Expr_level, -Treatment, -Strain)
mean_corrected_org <- group_by(long_corrected_org, Treatment, Gene) %>%
summarise(Mean_expr=mean(Expr_level))
chloroplast_genes <- read_csv("KEGG_Cp/annotation_table.csv")
names(chloroplast_genes)[4] <- "KEGG"
mean_corrected_org$KEGG <- mean_corrected_org$Gene %>%
sapply(function(x) strsplit(x, "\\|")) %>%
sapply(function(x) x[2]) %>%
match(.,chloroplast_genes$UniProt) %>%
chloroplast_genes[.,] %>%
.$KEGG
mean_corrected_org <- dplyr::select(mean_corrected_org, Treatment, Mean_expr, KEGG)
mean_corrected_org <- dplyr::select(mean_corrected, Treatment, Mean_expr, KEGG) %>%
rbind(., data.frame(mean_corrected_org))
#And include Dunnett test corrections
dunnett_table_organelles <- read_csv("dunnett_table_organelles.csv")
sig_dunnett_table_organelles <- group_by(dunnett_table_organelles, protein_id) %>%
mutate(min_p_val=min(p.value)) %>%
filter(min_p_val < P_VAL)
sig_proteins_organelles <- unique(sig_dunnett_table_organelles$protein_id)
mean_corrected_organelles <- group_by(long_corrected_organelles, Treatment, Gene) %>%
summarise(Mean_expr=mean(Expr_level))
dunnett_mean_corrected_organelles <- filter(mean_corrected_organelles,
Gene %in% sig_proteins_organelles)
#And count how many instances differ from control
sig_count <- filter(sig_dunnett_table_organelles, p.value < P_VAL) %>%
group_by(protein_id) %>%
summarise(n=n())
sig_chr <- sig_count[1:404,]
sig_chr$protein_id <- strsplit(sig_chr$protein_id, "\\.") %>%
sapply(function(x) paste(x[[1]], x[[2]], sep="."))
sig_chr <- merge(sig_chr, conversion, by.x='protein_id', by.y='Genome_handle')
sig_chr <- merge(sig_chr, id_conversion, by="Panther_handle")
tmp <- sig_chr
sig_chr <- sig_chr[,c(4,3)]
sig_org <- sig_count[405:418,]
sig_org$protein_id <- sig_org$protein_id %>%
sapply(function(x) strsplit(x, "\\|")) %>%
sapply(function(x) x[2]) %>%
match(.,chloroplast_genes$UniProt) %>%
chloroplast_genes[.,] %>%
.$KEGG
sig_org <- sig_org[-8,]
names(sig_org)[1] <- "KEGG"
sig_count <- rbind(sig_chr, sig_org)
sig_matrix <- sig_count$n
names(sig_matrix) <- sig_count$KEGG
# Dunnett-tested proteins...
## dunnett_mean_corrected_org <- tail(dunnett_mean_corrected_organelles, 14)
dunnett_mean_corrected_org <- dunnett_mean_corrected_organelles
dunnett_mean_corrected_org$Gene <- dunnett_mean_corrected_org$Gene %>%
sapply(function(x) strsplit(x, "\\|")) %>%
sapply(function(x) x[2]) %>%
as.character
dunnett_mean_corrected_org <- merge(dunnett_mean_corrected_org, chloroplast_genes,
by.x="Gene", by.y="UniProt") %>%
dplyr::select(Treatment, Mean_expr, KEGG)
dun_mean_corrected <- merge(dunnett_mean_corrected, conversion, by.x='Gene', by.y='Genome_handle')
dun_mean_corrected <- merge(dun_mean_corrected, id_conversion, by="Panther_handle")
dun_mean_corrected <- dplyr::select(dun_mean_corrected, Treatment, Mean_expr, KEGG)
dunnett_corrected <- rbind(dunnett_mean_corrected_org, dun_mean_corrected)
#Treatment, Mean_expr, KEGG
# KEGG visualization by pathview
pw_df <- filter(mean_corrected, Treatment != "control") %>%
group_by(KEGG, Treatment) %>%
summarise(m_e=mean(Mean_expr)) %>%
spread(Treatment, value=m_e)
pw_matrix <- as.matrix(pw_df[,-1])
rownames(pw_matrix) <- pw_df$KEGG
pw_matrix <- pw_matrix[,c('C', 'B', 'BS', 'L', 'N', 'P', 'S')]
pw_df_organelles <- filter(mean_corrected_org, Treatment != "control") %>%
group_by(KEGG, Treatment) %>%
summarise(m_e=mean(Mean_expr)) %>%
spread(Treatment, value=m_e)
pw_matrix_organelles <- as.matrix(pw_df_organelles[,-1])
rownames(pw_matrix_organelles) <- pw_df_organelles$KEGG
pw_matrix_organelles <- pw_matrix_organelles[,c('C', 'B', 'BS', 'L', 'N', 'P', 'S')]
pw_dunnett_organelles <- filter(dunnett_corrected, Treatment != "control") %>%
group_by(KEGG, Treatment) %>%
summarise(m_e=mean(Mean_expr)) %>%
spread(Treatment, value=m_e)
pw_dunnett_matrix_organelles <- as.matrix(pw_dunnett_organelles[,-1])
rownames(pw_dunnett_matrix_organelles) <- pw_dunnett_organelles$KEGG
pw_dunnett_matrix_organelles <- pw_dunnett_matrix_organelles[,c('C', 'B', 'BS', 'L', 'N', 'P', 'S')]
plot_paths <- function(gene_data, pathway_id, out_suffix, coef=1000)
{
pathview(gene.data=gene_data*coef,
pathway.id=pathway_id,
species="cre",
out.suffix=out_suffix,
low=list(gene="blue"),
gene.idtype="KEGG")
}
## Carbon fixation in photosynthetic organisms
plot_paths(gene_data=pw_matrix,
pathway_id = "00710",
out_suffix = "Photos_fixation")
plot_paths(gene_data=pw_matrix_organelles,
pathway_id = "00710",
out_suffix = "Photos_fixation_org")
plot_paths(gene_data=pw_dunnett_matrix_organelles,
pathway_id = "00710",
out_suffix = "Photos_fixation_dunnett_org")
plot_paths(gene_data=sig_matrix,
pathway_id = "00710",
out_suffix = "Photos_sigs")
## Photosynthesis
plot_paths(gene_data=pw_matrix_organelles,
pathway_id = "00195",
out_suffix = "Photos_org")
plot_paths(gene_data=pw_dunnett_matrix_organelles,
pathway_id = "00195",
out_suffix = "Photos_dunnett_org")
plot_paths(gene_data=pw_matrix_organelles,
pathway_id = "00196",
out_suffix = "Photos2_org")
plot_paths(gene_data=pw_dunnett_matrix_organelles,
pathway_id = "00196",
out_suffix = "Photos2_dunnett_org")
plot_paths(gene_data=sig_matrix,
pathway_id = "00196",
out_suffix = "Photos2_sigs")
#Glycolysis/Gluconeogenesis
plot_paths(gene_data=pw_matrix,
pathway_id = "00010",
out_suffix = "Glycolysis")
plot_paths(gene_data=pw_matrix_organelles,
pathway_id = "00010",
out_suffix = "Glycolysis_org")
plot_paths(gene_data=pw_dunnett_matrix_organelles,
pathway_id = "00010",
out_suffix = "Glycolysis_dunnett_org")
plot_paths(gene_data=sig_matrix,
pathway_id = "00010",
out_suffix = "Glycolysis_sigs")
#TCA cycle
plot_paths(gene_data=pw_matrix,
pathway_id = "00020",
out_suffix = "TCA")
plot_paths(gene_data=pw_matrix_organelles,
pathway_id = "00020",
out_suffix = "TCA_org")
plot_paths(gene_data=pw_dunnett_matrix_organelles,
pathway_id = "00020",
out_suffix = "TCA_dunnett_org")
plot_paths(gene_data=sig_matrix,
pathway_id = "00020",
out_suffix = "TCA_sigs")
#Pentose phosphate pathway
plot_paths(gene_data=pw_matrix,
pathway_id = "00030",
out_suffix = "Pentose")
plot_paths(gene_data=pw_matrix_organelles,
pathway_id = "00030",
out_suffix = "Pentose_org")
plot_paths(gene_data=pw_dunnett_matrix_organelles,
pathway_id = "00030",
out_suffix = "Pentose_dunnett_org")
plot_paths(gene_data=sig_matrix,
pathway_id = "00030",
out_suffix = "Pentose_sigs")
#Starch and sucrose metabolism
plot_paths(gene_data=pw_matrix,
pathway_id = "00500",
out_suffix = "Starch")
plot_paths(gene_data=pw_matrix_organelles,
pathway_id = "00500",
out_suffix = "Starch_org")
plot_paths(gene_data=pw_dunnett_matrix_organelles,
pathway_id = "00500",
out_suffix = "Starch_dunnett_org")
plot_paths(gene_data=sig_matrix,
pathway_id = "00500",
out_suffix = "Starch_sigs")
#Nitrogen metabolism
plot_paths(gene_data=pw_matrix,
pathway_id = "00910",
out_suffix = "Nitrogen")
plot_paths(gene_data=pw_dunnett_matrix,
pathway_id = "00910",
out_suffix = "Nitrogen_dunnett")
Inputs | Outputs |
---|---|
ChlamEE_phenotypic_measurements.xlsx | ctop.pdf |
cton.pdf | |
cass_chl.pdf | |
ratio.pdf | |
respiration_chl.pdf | |
biomass.pdf | |
no3tobiomass.pdf | |
po4tobiomass.pdf |
PDF name | Corresponding figure |
---|---|
ctop.pdf | Supplementary Fig. 6A |
cton.pdf | Supplementary Fig. 6B |
respiration_chl.pdf | Supplementary Fig. 7A |
cass_chl.pdf | Supplementary Fig. 7B |
ratio.pdf | Supplementary Fig. 7C |
biomass.pdf | Supplementary Fig. 8A |
no3tobiomass.pdf | Supplementary Fig. 8B |
po4tobiomass.pdf | Supplementary Fig. 8C |
## Read in the phenotype data and prepare for plotting and Wilcoxon tests
xls <- read_excel("ChlamEE_phenotypic_measurements.xlsx") %>%
mutate(Ratio = (2*Cass) / Respiration,
NO3toBiomass = NO3 / Biomass,
PO4toBiomass = PO4 / Biomass,
Selection_Treatment = factor(Selection_Treatment,
levels=c("A", "C", "L", "N",
"P", "B", "S", "BS")))
xls_long <- gather(xls, key=variable, value=value, -Ancestor, -Selection_Treatment)
cols <- c("#CC5A9F", "#FF0000", "#2F8C55", "#E9EF28", "#26549C", "#333333", "#DB3C01", "#01A0C6")
## Define a plotting function to prepare boxplots for each variable
draw_bp <- function(Treatment, output_file)
{
filter(xls_long, variable == Treatment) %>%
ggplot(aes(x=Selection_Treatment, y=value, color=Selection_Treatment)) +
geom_boxplot() +
geom_point(aes(size=2)) +
theme_classic() +
theme(legend.position="none") +
scale_color_manual(values=cols)
ggsave(output_file, plot=last_plot(), useDingbats=FALSE)
}
## Draw the boxplots of the phenotypic measurements
draw_bp("CtoP_Molar", "ctop.pdf")
draw_bp("CtoN_Molar", "ctop.pdf")
draw_bp("Cass_Chl", "cass_chl.pdf")
draw_bp("Ratio", "ratio.pdf")
draw_bp("Respiration_Chl", "respiration_chl.pdf")
draw_bp("Biomass", "biomass.pdf")
draw_bp("NO3toBiomass", "no3tobiomass.pdf")
draw_bp("PO4toBiomass", "po4tobiomass.pdf")
## Define a function to test the signficance of the responses
## using two-sided Wilcoxon tests
wilcoxon_test <- function(treatment, stoich, alt)
{
control <- filter(xls, Selection_Treatment == "A")[[stoich]]
tmt <- filter(xls, Selection_Treatment == treatment)[[stoich]]
tidy(wilcox.test(x=control, y=tmt, alternative=alt)) %>%
mutate(Treatment = treatment, Stoich = stoich) %>%
dplyr::select(-method, -alternative, -statistic)
}
## And define some utulity functions to adjust the p-values
adj_p_vals <- function(p_table)
{
sapply(p.adjust.methods,
function(meth) p.adjust(p_table$p.value, meth))
}
add_p_vals <- function(p_table)
{
cbind(p_table, adj_p_vals(p_table))
}
adjust_wilcoxon <- function(column, hypothesis)
{
map_df(c("B", "BS", "C", "L", "N", "P", "S"),
~wilcoxon_test(., column, hypothesis)) %>%
add_p_vals
}
## Perform the Wilcoxon tests for each treatment against the ancestors
bind_rows(adjust_wilcoxon("CtoN_Molar", "less"),
adjust_wilcoxon("CtoP_Molar", "less"),
adjust_wilcoxon("Respiration", "greater"),
adjust_wilcoxon("Cass", "greater"),
adjust_wilcoxon("Ratio", "two.sided"),
adjust_wilcoxon("Biomass", "two.sided"),
adjust_wilcoxon("NO3toBiomass", "two.sided"),
adjust_wilcoxon("PO4toBiomass", "two.sided")) %>%
filter(fdr < 0.1) %>%
dplyr::select(Treatment, Stoich, fdr)
- Python version 3.6.3
- pandas version 0.21.0
- beautifulsoup4 version 4.6.3
- epride version 0.2dev (epride can be downloaded from https://github.com/manutamminen/epride)
- R version 3.4.3 (2017-11-30)
- Platform: x86_64-apple-darwin15.6.0 (64-bit)
- Running under: macOS High Sierra 10.13.6
- BLAS: /Library/Frameworks/R.framework/Versions/3.4/Resources/lib/libRblas.0.dylib
- LAPACK: /Library/Frameworks/R.framework/Versions/3.4/Resources/lib/libRlapack.dylib
[1] C
[1] stats4 parallel stats graphics grDevices utils datasets
[8] methods base
[1] pathview_1.18.2 org.Hs.eg.db_3.5.0 AnnotationDbi_1.40.0
[4] IRanges_2.12.0 S4Vectors_0.16.0 multcomp_1.4-8
[7] TH.data_1.0-9 MASS_7.3-50 survival_2.42-6
[10] mvtnorm_1.0-8 bindrcpp_0.2.2 gplots_3.0.1
[13] broom_0.5.0 multtest_2.34.0 Biobase_2.38.0
[16] BiocGenerics_0.24.0 profileR_0.3-5 lavaan_0.6-2
[19] reshape_0.8.7 RColorBrewer_1.1-2 vegan_2.5-2
[22] lattice_0.20-35 permute_0.9-4 nlme_3.1-137
[25] forcats_0.3.0 stringr_1.3.1 dplyr_0.7.6
[28] purrr_0.2.5 readr_1.1.1 tidyr_0.8.1
[31] tibble_1.4.2 ggplot2_3.0.0.9000 tidyverse_1.2.1
[1] bitops_1.0-6 bit64_0.9-7 lubridate_1.7.4 httr_1.3.1
[5] Rgraphviz_2.22.0 tools_3.4.3 backports_1.1.2 utf8_1.1.4
[9] R6_2.2.2 KernSmooth_2.23-15 DBI_1.0.0 lazyeval_0.2.1
[13] mgcv_1.8-24 colorspace_1.3-2 withr_2.1.2 tidyselect_0.2.4
[17] mnormt_1.5-5 bit_1.1-14 compiler_3.4.3 graph_1.56.0
[21] cli_1.0.0 rvest_0.3.2 xml2_1.2.0 sandwich_2.4-0
[25] labeling_0.3 KEGGgraph_1.38.0 caTools_1.17.1.1 scales_0.5.0
[29] digest_0.6.15 pbivnorm_0.6.0 XVector_0.18.0 pkgconfig_2.0.1
[33] rlang_0.2.1 readxl_1.1.0 RSQLite_2.1.1 rstudioapi_0.7
[37] bindr_0.1.1 zoo_1.8-3 jsonlite_1.5 gtools_3.8.1
[41] magrittr_1.5 Matrix_1.2-14 Rcpp_0.12.18 munsell_0.5.0
[45] fansi_0.2.3 stringi_1.2.4 zlibbioc_1.24.0 plyr_1.8.4
[49] blob_1.1.1 grid_3.4.3 gdata_2.18.0 crayon_1.3.4
[53] Biostrings_2.46.0 haven_1.1.2 splines_3.4.3 KEGGREST_1.18.1
[57] hms_0.4.2 pillar_1.3.0 tcltk_3.4.3 reshape2_1.4.3
[61] codetools_0.2-15 XML_3.98-1.12 glue_1.3.0 modelr_0.1.2
[65] png_0.1-7 cellranger_1.1.0 gtable_0.2.0 assertthat_0.2.0
[69] memoise_1.1.0 cluster_2.0.7-1