CalculateForwardReads
+ +CalculateForwardReads.Rd
We calculate the number of forward reads covering a variant.
+diff --git a/DESCRIPTION b/DESCRIPTION old mode 100755 new mode 100644 index 6a5edfa..26976cc --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 +1,7 @@ Package: sigurd Type: Package Title: Single cell Genotyping Using RNA Data -Version: 0.3.6 +Version: 0.3.8 Authors@R: c( person(given = "Martin", family = "Grasshoff", diff --git a/NAMESPACE b/NAMESPACE old mode 100755 new mode 100644 diff --git a/R/AllelFrequencyFoldChange.R b/R/AllelFrequencyFoldChange.R old mode 100755 new mode 100644 diff --git a/R/AmpliconSupplementing.R b/R/AmpliconSupplementing.R old mode 100755 new mode 100644 diff --git a/R/CalculateAlleleFrequency.R b/R/CalculateAlleleFrequency.R old mode 100755 new mode 100644 diff --git a/R/CalculateAltReads.R b/R/CalculateAltReads.R old mode 100755 new mode 100644 diff --git a/R/CalculateConsensus.R b/R/CalculateConsensus.R old mode 100755 new mode 100644 diff --git a/R/CalculateCorrelationPValue.R b/R/CalculateCorrelationPValue.R old mode 100755 new mode 100644 index 0a0e947..6b0b79c --- a/R/CalculateCorrelationPValue.R +++ b/R/CalculateCorrelationPValue.R @@ -3,12 +3,13 @@ #'@description #'We perform the correlation of SNVs and calculate the P values. #'@importFrom stats cor.test -#'@param variant_values The fraction values you are analysing. A vector. +#'@param variant_values The fraction values you are analyzing. A vector. #'@param other_mutation All other variants you have. A vector of variant names. #'@param all_variants_list List of fraction values for all the variants you want to compare your variant with. #'@param min_intersecting_cells Minimum number of intersecting cells. Correlations with less than this will not be performed. +#'@param value_type Are we using consensus or other information? #'@export -CalculateCorrelationPValue <- function(variant_values, other_mutation, all_variants_list, min_intersecting_cells = 5){ +CalculateCorrelationPValue <- function(variant_values, other_mutation, all_variants_list, value_type = "consensus", min_intersecting_cells = 5){ other_variant_values <- all_variants_list[[other_mutation]] if(sum(names(variant_values) %in% names(other_variant_values)) == 0){ #print("No intersection") @@ -28,10 +29,17 @@ CalculateCorrelationPValue <- function(variant_values, other_mutation, all_varia return(result) } else if(length(variant_values) > 2){ result <- stats::cor.test(variant_values, other_variant_values) - cells_som_alt <- sum(variant_values == 1) - cells_som_ref <- sum(variant_values == 0) - cells_MT_alt <- sum(other_variant_values == 1) - cells_MT_ref <- sum(other_variant_values == 0) + if(value_type == "consensus"){ + cells_som_alt <- sum(variant_values == 1) + cells_som_ref <- sum(variant_values == 0) + cells_MT_alt <- sum(other_variant_values == 1) + cells_MT_ref <- sum(other_variant_values == 0) + } else{ + cells_som_alt <- sum(variant_values > 0) + cells_som_ref <- sum(variant_values == 0) + cells_MT_alt <- sum(other_variant_values > 0) + cells_MT_ref <- sum(other_variant_values == 0) + } result <- c(result$p.value, result$estimate, cells_som_alt, cells_som_ref, cells_MT_alt, cells_MT_ref) } else{ #print("We do not have more than 2 cells for the somatic variant.") diff --git a/R/CalculateCoverage.R b/R/CalculateCoverage.R old mode 100755 new mode 100644 diff --git a/R/CalculateFisherTestPValue.R b/R/CalculateFisherTestPValue.R old mode 100755 new mode 100644 diff --git a/R/CalculateQuality.R b/R/CalculateQuality.R old mode 100755 new mode 100644 diff --git a/R/CalculateRefReads.R b/R/CalculateRefReads.R old mode 100755 new mode 100644 diff --git a/R/CalculateStrandCorrelation.R b/R/CalculateStrandCorrelation.R old mode 100755 new mode 100644 diff --git a/R/CallSupport.R b/R/CallSupport.R old mode 100755 new mode 100644 diff --git a/R/ClonalDefinition.R b/R/ClonalDefinition.R old mode 100755 new mode 100644 diff --git a/R/ClonalDiversity.R b/R/ClonalDiversity.R old mode 100755 new mode 100644 diff --git a/R/CombineSEobjects.R b/R/CombineSEobjects.R old mode 100755 new mode 100644 diff --git a/R/Enrichment_Fisher.R b/R/Enrichment_Fisher.R old mode 100755 new mode 100644 diff --git a/R/Filtering.R b/R/Filtering.R old mode 100755 new mode 100644 diff --git a/R/GetCellInfoPerVariant.R b/R/GetCellInfoPerVariant.R old mode 100755 new mode 100644 diff --git a/R/GetVariantInfo.R b/R/GetVariantInfo.R old mode 100755 new mode 100644 diff --git a/R/HeatmapVOI.R b/R/HeatmapVOI.R old mode 100755 new mode 100644 diff --git a/R/LoadingMAEGATK_typewise.R b/R/LoadingMAEGATK_typewise.R old mode 100755 new mode 100644 diff --git a/R/LoadingRawMatrix_typewise.R b/R/LoadingRawMatrix_typewise.R old mode 100755 new mode 100644 diff --git a/R/LoadingVCF_typewise.R b/R/LoadingVCF_typewise.R old mode 100755 new mode 100644 diff --git a/R/LoadingVarTrix_typewise.R b/R/LoadingVarTrix_typewise.R old mode 100755 new mode 100644 diff --git a/R/Merging_SE_list.R b/R/Merging_SE_list.R old mode 100755 new mode 100644 diff --git a/R/RowWiseSplit.R b/R/RowWiseSplit.R old mode 100755 new mode 100644 diff --git a/R/SeparatingMatrixToList.R b/R/SeparatingMatrixToList.R old mode 100755 new mode 100644 diff --git a/R/SetVariantInfo.R b/R/SetVariantInfo.R old mode 100755 new mode 100644 diff --git a/R/UMIQC.R b/R/UMIQC.R old mode 100755 new mode 100644 diff --git a/R/UMI_seq.R b/R/UMI_seq.R old mode 100755 new mode 100644 diff --git a/R/VariantBurden.R b/R/VariantBurden.R old mode 100755 new mode 100644 diff --git a/R/VariantCloneSizeThresholding.R b/R/VariantCloneSizeThresholding.R old mode 100755 new mode 100644 diff --git a/R/VariantCorrelationHeatmap.R b/R/VariantCorrelationHeatmap.R old mode 100755 new mode 100644 diff --git a/R/VariantFisherTestHeatmap.R b/R/VariantFisherTestHeatmap.R old mode 100755 new mode 100644 diff --git a/R/VariantQuantileThresholding_Combined.R b/R/VariantQuantileThresholding_Combined.R old mode 100755 new mode 100644 diff --git a/R/VariantSelection_Group.R b/R/VariantSelection_Group.R old mode 100755 new mode 100644 diff --git a/R/VariantSelection_Quantile.R b/R/VariantSelection_Quantile.R old mode 100755 new mode 100644 index 680bc30..442c638 --- a/R/VariantSelection_Quantile.R +++ b/R/VariantSelection_Quantile.R @@ -22,14 +22,14 @@ VariantSelection_Quantile <- function(SE, min_coverage = 2, quantiles = c(0.1, 0 SummarizedExperiment::assays(SE)[["fraction"]][nocall_check] <- NA SummarizedExperiment::assays(SE)[["coverage"]][nocall_check] <- NA } - + if(verbose) print("Get the mean allele frequency and coverage.") mean_af <- Matrix::rowMeans(SummarizedExperiment::assays(SE)[["fraction"]], na.rm = TRUE) mean_cov <- Matrix::rowMeans(SummarizedExperiment::assays(SE)[["coverage"]], na.rm = TRUE) - + if(verbose) print("Get the quantiles of the VAFs of each variant.") quantiles <- lapply(quantiles, function(x) apply(SummarizedExperiment::assays(SE)[["fraction"]], 1, quantile, x, na.rm = TRUE)) - + # Apply min_quality filtering if(!is.null(min_quality)){ vars <- data.frame(Mean_AF = mean_af, Mean_Cov = mean_cov, VariantQuality = SummarizedExperiment::rowData(SE)$VariantQuality, Quantile1 = quantiles[[1]], Quantile2 = quantiles[[2]]) @@ -37,9 +37,9 @@ VariantSelection_Quantile <- function(SE, min_coverage = 2, quantiles = c(0.1, 0 } else{ vars <- data.frame(Mean_AF = mean_af, Mean_Cov = mean_cov, Quantile1 = quantiles[[1]], Quantile2 = quantiles[[2]]) } - + if(verbose) print("Thresholding using the quantile approach.") - vois <- subset(vars, vars$Mean_AF > mean_allele_frequency & vars$Mean_Cov > min_coverage & vars$Quantile1 < thresholds[1] & vars$Quantile2 > thresholds[2]) - + vois <- subset(vars, vars$Mean_AF > mean_allele_frequency & vars$Mean_Cov > min_coverage & vars$Quantile1 <= thresholds[1] & vars$Quantile2 >= thresholds[2]) + return(rownames(vois)) } diff --git a/R/VariantSelection_TopCells.R b/R/VariantSelection_TopCells.R old mode 100755 new mode 100644 diff --git a/R/VariantSelection_VMR.R b/R/VariantSelection_VMR.R index bc397d8..f97334c 100644 --- a/R/VariantSelection_VMR.R +++ b/R/VariantSelection_VMR.R @@ -18,7 +18,20 @@ VariantSelection_VMR <- function(SE, stabilize_variance = TRUE, low_coverage_thr fraction[is.na(fraction)] <- 0 mean_af <- Matrix::rowSums(SummarizedExperiment::assays(SE)[["alts"]]) / Matrix::rowSums(SummarizedExperiment::assays(SE)[["coverage"]]) mean_af[is.na(mean_af)] <- 0 - + + if(verbose) print("We calculate the concordance.") + reads_forward <- SummarizedExperiment::assays(SE)[["forward"]] + reads_reverse <- SummarizedExperiment::assays(SE)[["reverse"]] + dt <- merge(data.table::data.table(summary(reads_forward)), + data.table::data.table(summary(reads_reverse)), + by.x = c("i", "j"), by.y = c("i", "j"), + all = TRUE) + dt <- dt[dt[,x.x] > 0 | dt[,x.y] > 0,] + dt <- data.table::data.table(variant = rownames(SE)[dt[[1]]], + cell_id = dt[[2]], + fw = dt[[3]], rev = dt[[4]]) + concordance <- dt[, .(cor = suppressWarnings(stats::cor(c(fw), c(rev), method = "pearson", use = "pairwise.complete"))), by = list(variant)] + if(stabilize_variance){ if(verbose) print("We stabilize the variance.") coverage <- SummarizedExperiment::assays(SE)[["coverage"]] @@ -42,7 +55,7 @@ VariantSelection_VMR <- function(SE, stabilize_variance = TRUE, low_coverage_thr if(verbose) print(paste0("We check if a cells has more than ", minimum_fw_rev_reads, " reads.")) detected <- (SummarizedExperiment::assays(SE)[["forward"]] >= minimum_fw_rev_reads) + (assays(SE)[["reverse"]] >= minimum_fw_rev_reads) - + voi <- data.frame( variant = rownames(detected), vmr = variants_variance / (mean_af + 0.00000000001), @@ -50,11 +63,9 @@ VariantSelection_VMR <- function(SE, stabilize_variance = TRUE, low_coverage_thr mean = round(mean_af, 7), variance = round(variants_variance, 7), cells_detected = Matrix::rowSums(detected == 2), # We only select cells that have enough reads for both directions. - strand_correlation = SummarizedExperiment::rowData(SE)$Concordance, + strand_correlation = concordance$cor, #SummarizedExperiment::rowData(SE)$Concordance, mean_coverage = Matrix::rowMeans(SummarizedExperiment::assays(SE)[["coverage"]]), stringsAsFactors = FALSE, row.names = rownames(detected) ) return(voi) } - - diff --git a/R/VariantWiseCorrelation.R b/R/VariantWiseCorrelation.R old mode 100755 new mode 100644 index a9f6742..6c2ef40 --- a/R/VariantWiseCorrelation.R +++ b/R/VariantWiseCorrelation.R @@ -8,8 +8,9 @@ #'@param n_cores Number of cores you want to use. Numeric. #'@param p_value_adjustment Method for P value adjustment. See p.adjust for details. #'@param verbose Should the function be verbose? Default = TRUE +#'@param value_type Are we using consensus or other information? #'@export -VariantWiseCorrelation <- function(variants_list, n_cores = 1, p_value_adjustment = "fdr", verbose = TRUE){ +VariantWiseCorrelation <- function(variants_list, n_cores = 1, p_value_adjustment = "fdr", value_type = "consensus", verbose = TRUE){ # We correlate the somatic variants with each other and the MT variants. # Since we have tens of thousands of MT variants, we do not correlate them with each other. variants <- names(variants_list) @@ -22,7 +23,7 @@ VariantWiseCorrelation <- function(variants_list, n_cores = 1, p_value_adjustmen variants_values_use <- variants_list[[variant_use]] variants_list_use <- variants_list[names(variants_list) != variant_use] all_variants <- names(variants_list_use) - results <- parallel::mclapply(X = all_variants, CalculateCorrelationPValue, variant_values = variants_values_use, all_variants_list = variants_list_use, mc.cores = n_cores) + results <- parallel::mclapply(X = all_variants, CalculateCorrelationPValue, variant_values = variants_values_use, all_variants_list = variants_list_use, mc.cores = n_cores, value_type = value_type) results <- do.call("rbind", results) results <- data.frame(Variant1 = variant_use, Variant2 = all_variants, P = results[,1], Corr = results[,2], Cells_1_Alt = results[,3], Cells_1_Ref = results[,4], Cells_2_Alt = results[,5], Cells_2_Ref = results[,6]) @@ -35,6 +36,10 @@ VariantWiseCorrelation <- function(variants_list, n_cores = 1, p_value_adjustmen if(verbose) print("We remove the negative corrlated SNPs.") results_total <- subset(results_total, Corr > 0) + if(verbose) print("Remove duplicative results.") + results_check <- apply(results_total[, c("Variant1", "Variant2")], 1, function(x) paste(sort(x), collapse = "_")) + results_total <- results_total[!duplicated(results_check), , drop = FALSE] + if(verbose) print(paste0("Adjusting P values using ", p_value_adjustment, ".")) results_total$P_adj <- stats::p.adjust(results_total$P, method = p_value_adjustment) rownames(results_total) <- NULL diff --git a/R/VariantWiseFisherTest.R b/R/VariantWiseFisherTest.R old mode 100755 new mode 100644 diff --git a/R/char_to_numeric.R b/R/char_to_numeric.R old mode 100755 new mode 100644 diff --git a/R/combine_NAMES.R b/R/combine_NAMES.R old mode 100755 new mode 100644 diff --git a/R/combine_SparseMatrix.R b/R/combine_SparseMatrix.R old mode 100755 new mode 100644 diff --git a/R/computeAFMutMatrix.R b/R/computeAFMutMatrix.R old mode 100755 new mode 100644 diff --git a/R/getAltMatrix.R b/R/getAltMatrix.R old mode 100755 new mode 100644 diff --git a/R/getMutMatrix.R b/R/getMutMatrix.R old mode 100755 new mode 100644 diff --git a/R/getReadMatrix.R b/R/getReadMatrix.R old mode 100755 new mode 100644 diff --git a/R/getRefMatrix.R b/R/getRefMatrix.R old mode 100755 new mode 100644 diff --git a/R/get_consensus.R b/R/get_consensus.R old mode 100755 new mode 100644 diff --git a/R/load_object.R b/R/load_object.R old mode 100755 new mode 100644 diff --git a/R/save_object.R b/R/save_object.R old mode 100755 new mode 100644 diff --git a/R/sdiv.R b/R/sdiv.R old mode 100755 new mode 100644 diff --git a/README.md b/README.md old mode 100755 new mode 100644 index d0d36ae..5f647af --- a/README.md +++ b/README.md @@ -33,7 +33,7 @@ The mutation data was obtained from the Sanger Institute Catalogue Of Somatic Mu ``` -# Current Features v0.3.6 +# Current Features v0.3.8 - Loading data from VarTrix and MAEGATK. - Transforming the data to be compatible for joint analysis. diff --git a/docs/404.html b/docs/404.html old mode 100755 new mode 100644 index cdf7f65..793d306 --- a/docs/404.html +++ b/docs/404.html @@ -24,7 +24,7 @@ sigurd - 0.3.2 + 0.3.8
# Loading the scRNA-seq data.
-scrna <- load_object("/data/MPN/exp/scRNA/MPN_mutations/SIGURD_paper/sigurd/data/scrna_MPN.rds.lz4")
+scrna <- load_object("/data/MPN/exp/scRNA/MPN_mutations/SIGURD_paper/Paper_Figures/ZenodoFiles/Zenodo_Seurat_Object.rds")
# Loading the genotyping data.
-genotyping <- LoadingVarTrix_typewise(patient = "MPN", samples_file = "/data/MPN/exp/scRNA/MPN_mutations/SIGURD_paper/sigurd/data/DesignMatrix_VarTrix.csv", min_reads = 0, vcf_path = "/data/MPN/exp/scRNA/MPN_mutations/SIGURD_paper/sigurd/data/CosmicSubset_JAK2V617F.vcf", type_use = "scRNAseq_Somatic", min_cells = 0, verbose = FALSE)
-amplicon <- LoadingVarTrix_typewise(patient = "MPN", samples_file = "/data/MPN/exp/scRNA/MPN_mutations/SIGURD_paper/sigurd/data/DesignMatrix_Amplicon.csv", min_reads = 0, vcf_path = "/data/MPN/exp/scRNA/MPN_mutations/SIGURD_paper/sigurd/data/CosmicSubset_UCSC_JAK2V617F.vcf", type_use = "scRNAseq_Amplicon", min_cells = 0, verbose = FALSE)
-genotyping <- AmpliconSupplementing(scRNAseq = genotyping, amplicon = amplicon, verbose = FALSE)
+# genotyping <- LoadingVarTrix_typewise(patient = "MPN", samples_file = "/data/MPN/exp/scRNA/MPN_mutations/SIGURD_paper/sigurd/data/DesignMatrix_VarTrix.csv", min_reads = 0, vcf_path = "/data/MPN/exp/scRNA/MPN_mutations/SIGURD_paper/sigurd/data/CosmicSubset_JAK2V617F.vcf", type_use = "scRNAseq_Somatic", min_cells = 0, verbose = FALSE)
+# amplicon <- LoadingVarTrix_typewise(patient = "MPN", samples_file = "/data/MPN/exp/scRNA/MPN_mutations/SIGURD_paper/sigurd/data/DesignMatrix_Amplicon.csv", min_reads = 0, vcf_path = "/data/MPN/exp/scRNA/MPN_mutations/SIGURD_paper/sigurd/data/CosmicSubset_UCSC_JAK2V617F.vcf", type_use = "scRNAseq_Amplicon", min_cells = 0, verbose = FALSE)
+# genotyping <- AmpliconSupplementing(scRNAseq = genotyping, amplicon = amplicon, verbose = FALSE)
# Filtering the genotyping object to remove false positives.
-genotyping <- Filtering(genotyping, min_cells_per_variant = 0, fraction_threshold = 0.21, cells_include = colnames(scrna), min_variants_per_cell = 1, reject_value = "NoCall", verbose = FALSE)
We visualise the genotyping information on the UMAP.
+We visualize the genotyping information on the UMAP.
# Plotting the mutated cells on the UMAP.
scrna$JAK2_p.V617F_c.1849G.T_consensus <- factor(scrna$JAK2_p.V617F_c.1849G.T_consensus, levels = c("Ref", "Alt"))
@@ -132,14 +135,14 @@ Visualisation## Adding another scale for colour, which will replace the existing scale.
print(p)
Now, we determine the DEGs between the JAK2V617F cells and the wild type cells for the Orthochromatic Erythroblasts.
-scrna_orthoe <- subset(scrna, CellType == "Orthochromatic Erythroblasts" & Condition != "HC")
+scrna_orthoe <- subset(scrna, celltype == "Orthochromatic Erythroblasts" & condition != "HC")
degs <- FindMarkers(scrna_orthoe, group.by = "JAK2_p.V617F_c.1849G.T_consensus", ident.1 = "Alt", ident.2 = "Ref", logfc.threshold = 0)
degs <- data.frame(degs, gene = rownames(degs))
degs_up <- nrow(subset(degs, avg_log2FC > 0.25 & p_val_adj < 0.05))
@@ -155,7 +158,15 @@ Combining the JAK2V617F
First, we load and filter the mitochondrial variant information.
# Loading the data.
-genotyping_mt <- LoadingMAEGATK_typewise(patient = "MPN1", samples_file = "/data/MPN/exp/scRNA/MPN_mutations/SIGURD_paper/sigurd/data/DesignMatrix_MT.csv", min_cells = 0, cells_include = colnames(scrna), type_use = "scRNAseq_MT", verbose = FALSE)
+# genotyping_mt <- LoadingMAEGATK_typewise(patient = "MPN1", samples_file = "/data/MPN/exp/scRNA/MPN_mutations/SIGURD_paper/sigurd/data/DesignMatrix_MT.csv", min_cells = 0, cells_include = colnames(scrna), type_use = "scRNAseq_MT", verbose = FALSE)
+# genotyping_mt <- Filtering(genotyping_mt, min_variants_per_cell = 1, min_cells_per_variant = 2, fraction_threshold = 0.05, cells_include = colnames(scrna), verbose = FALSE)
+# Adding the JAK2V617F information.
+# genotyping_MPN1 <- Filtering(genotyping, cells_include = grep("MPN1", colnames(genotyping), value = TRUE))
+# genotyping_mt <- CombineSEobjects(genotyping_MPN1, genotyping_mt)
+
+# We load the Zenodo File. This is equivalent to the way we do it above.
+genotyping_mt_all <- load_object("/data/MPN/exp/scRNA/MPN_mutations/SIGURD_paper/Paper_Figures/ZenodoFiles/scrna_genotyping.rds")
+genotyping_mt <- genotyping_mt_all[["MPN1"]]
genotyping_mt <- Filtering(genotyping_mt, min_variants_per_cell = 1, min_cells_per_variant = 2, fraction_threshold = 0.05, cells_include = colnames(scrna), verbose = FALSE)
# Subset to erythroid cells.
-erythroid_cells <- subset(scrna, annotation == "ErythroidCells")
-genotyping_mt <- Filtering(genotyping_mt, cells_include = colnames(erythroid_cells))
## [1] "We remove all cells not in the allow list."
## [1] "We remove all the variants that are always NoCall."
@@ -209,7 +220,7 @@Visualisation of mtVOIs and use_raster = TRUE, show_heatmap_legend = TRUE) draw(hm, annotation_legend_side = "bottom")
We now repeat the MT analysis for all samples. This will take considereable time.
-We speed the analsyis up by using parallel computing. We load the results generated by this code.
+We speed the analsyis up by using parallel computing. Since the Zenodo file was already loaded, we save time here.
# We get all patients present in the design matrix.
design_matrix <- read.table("/data/MPN/exp/scRNA/MPN_mutations/SIGURD_paper/sigurd/data/DesignMatrix_MT.csv", header = TRUE, sep = ",")
-patients <- design_matrix$patient[1]
-genotyping_mt_all <- parallel::mclapply(patients, LoadingMAEGATK_typewise, samples_file = "/data/MPN/exp/scRNA/MPN_mutations/SIGURD_paper/sigurd/data/DesignMatrix_MT.csv", min_cells = 0, cells_include = colnames(scrna), type_use = "scRNAseq_MT", verbose = FALSE, mc.cores = length(patients))
-names(genotyping_mt_all) <- patients
-genotyping_mt_all <- parallel::mclapply(genotyping_mt_all, Filtering, min_variants_per_cell = 1, min_cells_per_variant = 2, fraction_threshold = 0.05, cells_include = colnames(scrna), verbose = FALSE, mc.cores = length(patients))
# We load the previous results.
-genotyping_mt_all <- load_object("/data/MPN/exp/scRNA/MPN_mutations/SIGURD_paper/sigurd/data/genotyping_mt_all.rds.lz4")
+genotyping_mt_all <- load_object("/data/MPN/exp/scRNA/MPN_mutations/SIGURD_paper/Paper_Figures/ZenodoFiles/genotyping_mt_all.rds.lz4")
clonal_diversity_all <- unlist(lapply(genotyping_mt_all, ClonalDiversity, grouping = "Clones", diversity_measure = "EffectiveSpecies", verbose = FALSE))
clonal_diversity_all <- data.frame(Sample = names(genotyping_mt_all), EffectiveSpecies = clonal_diversity_all)
knitr::kable(clonal_diversity_all, format="html")
Grasshoff M, Costa I (2024). sigurd: Single cell Genotyping Using RNA Data. -R package version 0.3.2, https://costalab.github.io/sigurd/. +R package version 0.3.8, https://costalab.github.io/sigurd/.
@Manual{, title = {sigurd: Single cell Genotyping Using RNA Data}, author = {Martin Grasshoff and Ivan Costa}, year = {2024}, - note = {R package version 0.3.2}, + note = {R package version 0.3.8}, url = {https://costalab.github.io/sigurd/}, }
The fraction values you are analysing. A vector.
The fraction values you are analyzing. A vector.
CalculateForwardReads.Rd
We calculate the number of forward reads covering a variant.
+CalculateReverseReads.Rd
We calculate the number of reverse reads covering a variant.
+CombineSEobjects(se_1, se_2, suffixes = c("_1", "_2"))
CombineSEobjects(se_1, se_2, suffixes = c("_1", "_2"), patient_check = FALSE)
RowWiseSplit(se, n_cores = 1, minimum_reads, remove_nocalls = TRUE)
RowWiseSplit(
+ se,
+ n_cores = 1,
+ assay_to_split = "consensus",
+ remove_nocalls = TRUE
+)
SeparatingMatrixToList(row_use, total_matrix, remove_nocalls = TRUE)
SeparatingMatrixToList(
+ row_use,
+ total_matrix,
+ consensus,
+ assay_to_split = "consensus",
+ remove_nocalls = TRUE
+)
VariantSelection_VMR.Rd
This is an adaption of the VMR based selection of variants from Miller et al. +This selection process is designed for ATAC data and not scRNAseq data.
+VariantSelection_VMR(
+ SE,
+ stabilize_variance = TRUE,
+ low_coverage_threshold = 10,
+ minimum_fw_rev_reads = 2,
+ verbose = TRUE
+)
SummarizedExperiment object.
Should the variance be stabilized by using the mean allele frequency for cells with a low coverage? Coverage threshold is set by low_coverage_threshold.
Cells below this threshold are set to the mean.
How many forward and reverse reads should a cell have? Default = 2
Should the function be verbose?
CalculateQuality()
CalculateStrandCorrelation()
LoadingRawMatrix_typewise()
VariantWiseCorrelation()
BoneMarrow_SIGURD.Rmd
Here, we compare the analysis of cells from a patient with BPDCN.
-The original script is available here: https://github.com/petervangalen/MAESTER-2021/blob/main/4_CH_sample/4.2_Variant_Selection.R
-In this setup, the cells are all from a single donor and do not have to be separated into categories.
-Then small clones that might characterize the clonal lineages in the sample are selected.
-We load the data from MAESTER.
-
-print("Libraries for SIGURD.")
## [1] "Libraries for SIGURD."
-
-
-# The design matrix contains information for the genotyping data.
-genotyping <- LoadingMAEGATK_typewise(patient = "BPDCN712", samples_file = "/data/MPN/exp/scRNA/MPN_mutations/SIGURD_paper/sigurd/data/MAESTER_Reproduction.csv", type_use = "Amplicon_MT", verbose = FALSE)
-
-# Loading the scRNA-seq data.
-scrna <- readRDS("/data/MPN/exp/scRNA/MPN_mutations/SIGURD_paper/sigurd/data/BPDCN712_Seurat_with_TCR_Renamed.rds")
Variants that are also detected in the cell mixture data is treated as possible false positives. They are used as a blacklist and are removed from the results.
-
-# Loading the TenX Cell Mixture Genotyping.
-
-genotyping_tenx <- load_object("/data/MPN/exp/scRNA/MPN_mutations/SIGURD_paper/sigurd/data/TenX_CellMixture_Genotyping.rds.lz4")
-blocklist <- AllelFrequencyFoldChange(genotyping_tenx, group_of_interest = "CellType", group1 = "K562", group2 = "BT142", maximum_foldchange = 5, minimum_coverage = 5, minimum_allele_freq = 0.001, maximum_allele_freq = 0.999)$Variant
-
-# We add the variant chrM_1583_A_G by hand. It is identified as misleading based on downstream analysis.
-blocklist <- c(blocklist, "chrM_1583_A_G")
Now, we select the variants of interest from the loaded data.
-
-voi.ch.sigurd <- VariantSelection_TopCells(genotyping, min_coverage = 5, quantiles = 0.9, thresholds = 0, top_cells = 10, top_VAF = 0.5, min_quality = 30, remove_nocall = FALSE, verbose = FALSE)
-voi.ch.sigurd <- voi.ch.sigurd[!voi.ch.sigurd %in% blocklist]
-print(voi.ch.sigurd)
## [1] "chrM_683_G_A" "chrM_1323_G_A" "chrM_1415_G_A" "chrM_2593_G_A"
-## [5] "chrM_6205_G_A" "chrM_6243_G_A" "chrM_8697_G_A" "chrM_9753_G_A"
-## [9] "chrM_10158_T_A" "chrM_15140_G_A" "chrM_15812_G_A" "chrM_779_T_C"
-## [13] "chrM_3988_T_C" "chrM_6185_T_C" "chrM_6293_T_C" "chrM_9164_T_C"
-## [17] "chrM_9321_T_C" "chrM_11864_T_C" "chrM_15299_T_C" "chrM_1171_A_G"
-## [21] "chrM_1222_A_G" "chrM_2241_A_G" "chrM_2623_A_G" "chrM_3628_A_G"
-## [25] "chrM_4104_A_G" "chrM_6218_A_G"
-## [1] "We remove all cells not in the allow list."
-## [1] "We remove all the variants that are always NoCall."
-
-colData(genotyping)$CellType <- scrna$CellType
-HeatmapVoi(SE = genotyping, voi = voi.ch.sigurd, annotation_trait = "CellType", sort_cells = TRUE, remove_empty_cells = TRUE, minimum_allele_freq = 0.01)
MPN_SIGURD.Rmd
-suppressPackageStartupMessages(library(sigurd))
-suppressPackageStartupMessages(library(SummarizedExperiment))
-suppressPackageStartupMessages(library(ggplot2))
-suppressPackageStartupMessages(library(Seurat))
-suppressPackageStartupMessages(library(ComplexHeatmap))
-suppressPackageStartupMessages(library(circlize))
-suppressPackageStartupMessages(library(grid))
-suppressPackageStartupMessages(library(EnhancedVolcano))
Here, we illustrate how to analysis genotyping data from MPN samples. The JAK2V617F mutation was genotyped for 6 MPN samples and 3 healthy controls.
-This setup makes this analysis different from the previous analyses. Previously, only one sample was analysed at a time. Now, we load and analyse the samples simultaneously.
-Now, we load the JAK2V617F data and plot the mutated cells on the UMAP.
-Since we are loading multiple samples, this code is more complicated than the code for the MAESTER SW sample.
-
-# Loading the scRNA-seq data.
-scrna <- load_object("/data/MPN/exp/scRNA/MPN_mutations/SIGURD_paper/sigurd/data/scrna_MPN.rds.lz4")
-
-# Loading the genotyping data.
-genotyping <- LoadingVarTrix_typewise(patient = "MPN", samples_file = "/data/MPN/exp/scRNA/MPN_mutations/SIGURD_paper/sigurd/data/DesignMatrix_VarTrix.csv", min_reads = 0, vcf_path = "/data/MPN/exp/scRNA/MPN_mutations/SIGURD_paper/sigurd/data/CosmicSubset_JAK2V617F.vcf", type_use = "scRNAseq_Somatic", min_cells = 0, verbose = FALSE)
-amplicon <- LoadingVarTrix_typewise(patient = "MPN", samples_file = "/data/MPN/exp/scRNA/MPN_mutations/SIGURD_paper/sigurd/data/DesignMatrix_Amplicon.csv", min_reads = 0, vcf_path = "/data/MPN/exp/scRNA/MPN_mutations/SIGURD_paper/sigurd/data/CosmicSubset_UCSC_JAK2V617F.vcf", type_use = "scRNAseq_Amplicon", min_cells = 0, verbose = FALSE)
-genotyping <- AmpliconSupplementing(scRNAseq = genotyping, amplicon = amplicon, verbose = FALSE)
-
-# Filtering the genotyping object to remove false positives.
-genotyping <- Filtering(genotyping, min_cells_per_variant = 0, fraction_threshold = 0.21, cells_include = colnames(scrna), min_variants_per_cell = 1, reject_value = "NoCall", verbose = FALSE)
Now, we add the genotyping information to the Seurat object.
-
-scrna <- SetVariantInfo(SE = genotyping, seurat_object = scrna)
## [1] "You did not supply a vector of variants. All variants will be used."
-We visualise the genotyping information on the UMAP.
-
-# Plotting the mutated cells on the UMAP.
-scrna$JAK2_p.V617F_c.1849G.T_consensus <- factor(scrna$JAK2_p.V617F_c.1849G.T_consensus, levels = c("Ref", "Alt"))
-jak2_cells <- sum(scrna$JAK2_p.V617F_c.1849G.T_consensus == "Alt", na.rm = TRUE)
-ref_cells <- sum(scrna$JAK2_p.V617F_c.1849G.T_consensus == "Ref", na.rm = TRUE)
-all_cells <- ncol(scrna)
-p <- DimPlot(scrna, group.by = "JAK2_p.V617F_c.1849G.T_consensus", reduction = "INTE_UMAP", order = TRUE, pt.size = 1, raster = FALSE, na.value = "grey95", cols = c(Alt = "#aa0051", Ref = "#79b938")) +
- ggtitle(paste0("Alt: ", jak2_cells, ", Ref: ", ref_cells, ", All: ", all_cells)) +
- xlab("UMAP 1") + ylab("UMAP 2") + scale_color_manual(breaks = c("Alt", "Ref"), values = c("#aa0051", "#79b938"), na.value = "grey95") +
- theme(legend.position = "top")
## Scale for colour is already present.
-## Adding another scale for colour, which will replace the existing scale.
-
-print(p)
Now, we determine the DEGs between the JAK2V617F cells and the wild type cells for the Orthochromatic Erythroblasts.
-
-scrna_orthoe <- subset(scrna, CellType == "Orthochromatic Erythroblasts" & Condition != "HC")
-degs <- FindMarkers(scrna_orthoe, group.by = "JAK2_p.V617F_c.1849G.T_consensus", ident.1 = "Alt", ident.2 = "Ref", logfc.threshold = 0)
-degs <- data.frame(degs, gene = rownames(degs))
-degs_up <- nrow(subset(degs, avg_log2FC > 0.25 & p_val_adj < 0.05))
-degs_down <- nrow(subset(degs, avg_log2FC < -0.25 & p_val_adj < 0.05))
-EnhancedVolcano(degs, x = "avg_log2FC", y = "p_val_adj", lab = degs$gene, FCcutoff = 0.25, pCutoff = 0.05, title = "OrthoE JAK2V617F Pos. vs. Neg.",subtitle = NULL)
We combine the JAK2V617F SIGURD object with the MT object. Cells and variants in both objects are combined.
-To save time, we are only processing a single sample. All other samples are processed equally. Since the genotyping object is a list, it is easily processed using parallel computing.
-First, we load and filter the mitochondrial variant information.
-
-# Loading the data.
-genotyping_mt <- LoadingMAEGATK_typewise(patient = "MPN1", samples_file = "/data/MPN/exp/scRNA/MPN_mutations/SIGURD_paper/sigurd/data/DesignMatrix_MT.csv", min_cells = 0, cells_include = colnames(scrna), type_use = "scRNAseq_MT", verbose = FALSE)
-genotyping_mt <- Filtering(genotyping_mt, min_variants_per_cell = 1, min_cells_per_variant = 2, fraction_threshold = 0.05, cells_include = colnames(scrna), verbose = FALSE)
Now, we select the mtVOIs. We then determine the clonal lineages and plot the results on a heatmap.
-We subset the cells to only include the erythroid cells.
-
-# Subset to erythroid cells.
-erythroid_cells <- subset(scrna, annotation == "ErythroidCells")
-genotyping_mt <- Filtering(genotyping_mt, cells_include = colnames(erythroid_cells))
## [1] "We remove all cells not in the allow list."
-## [1] "We remove all the variants that are always NoCall."
-
-# Selecting the mtVOIS.
-mt_vois <- VariantSelection_Quantile(genotyping_mt, min_coverage = 1, verbose = FALSE)
-
-# Determining the clonal lineages.
-genotyping_mt <- ClonalDefinition(se = genotyping_mt, variants_ls = list(mt_vois), verbose = FALSE)
We now plot the variants of interest and the clonal lineages.
-
-# Generating a heatmap.
-colors_lineage <- c("C1" = "#563e00", "C2" = "#dec234", "C3" = "#e23856", "C4" = "#7f0073", "C5" = "#006639", "C6" = "#b1db8e", "C7" = "#00348f", "Negative" = "black")
-ha <- columnAnnotation(Clones = colData(genotyping_mt)[,"Clones"],
- col = list(Clones = colors_lineage),
- annotation_name_gp = gpar(fontsize = 10, fontface = "plain"),
- annotation_legend_param = list(Clones = list(nrow = 1, title = "")))
-fraction <- as.matrix(assays(genotyping_mt)[["fraction"]])[mt_vois, ]
-hm <- Heatmap(fraction,
- name = "VAF",
- row_title_gp = gpar(fontsize = 12),
- row_names_gp = gpar(fontsize = 10),
- row_names_max_width = unit(16, "cm"),
- column_title = "MPN1",
- column_title_gp = gpar(fontsize = 12),
- column_names_gp = gpar(fontsize = 10),
- col = colorRamp2(seq(0, round(max(fraction, na.rm = TRUE)), length.out = 9), c("#FCFCFC", "#FFEDB0", "#FFDF5F", "#FEC510", "#FA8E24", "#F14C2B", "#DA2828", "#BE2222", "#A31D1D")),
- show_column_dend = FALSE,
- show_row_names = TRUE,
- show_column_names = FALSE,
- cluster_columns = TRUE,
- cluster_rows = FALSE,
- heatmap_legend_param = list(border = "#000000", grid_height = unit(10, "mm"),
- direction = "horizontal"),
- bottom_annotation = ha,
- border = TRUE,
- use_raster = TRUE,
- show_heatmap_legend = TRUE)
-draw(hm, annotation_legend_side = "bottom")
We calculate the clonal diversity for the sample MPN1 as an example.
-
-clonal_diversity <- ClonalDiversity(genotyping_mt, grouping = "Clones", diversity_measure = "EffectiveSpecies", verbose = FALSE)
-print(paste0("Clonal Diversity in Effective Number of Species for MPN1: ", clonal_diversity))
## [1] "Clonal Diversity in Effective Number of Species for MPN1: 6.72545022391521"
-SW_CellMixture_SIGURD.Rmd
In this data set, chronic myelogenous leukemia cells (K562) and brain tumor cells (BT142) were mixed and then sequenced using Seq-Well S3. The cells can be separated using the expression of marker genes.
-The original script then identifies 6 variants that can distinguish between the two cell types.
-In this reproduction, we show that the analysis can be faithfully reproduced using a very streamlined set of functions. This greatly simplifies the analysis. Using a set of dedicated functions, the analysis can now also be easily adopted to new data sets.
-We load the data from MAESTER.
-
-print("Libraries for SIGURD.")
## [1] "Libraries for SIGURD."
-
-
-seu <- readRDS("/data/MPN/exp/scRNA/MPN_mutations/SIGURD_paper/sigurd/data/SW_CellLineMix_Seurat_Keep_Renamed.rds")
SIGURD loads the data and automatically reformats the data. This code uses dedicated functions to load the and process the data, which streamlines the analysis quite substantially.
-
-# The design matrix contains information for the genotyping data.
-genotyping <- LoadingMAEGATK_typewise(patient = "SW", samples_file = "/data/MPN/exp/scRNA/MPN_mutations/SIGURD_paper/sigurd/data/MAESTER_Reproduction.csv", min_cells = 0, type_use = "Amplicon_MT", verbose = FALSE)
-
-# Removing cells that are not in the Seurat object.
-genotyping <- Filtering(genotyping, cells_include = colnames(seu))
## [1] "We remove all cells not in the allow list."
-## [1] "We remove all the variants that are always NoCall."
-
-# Adding meta data do the SIGURD object.
-colData(genotyping)$CellType <- seu$CellType
Now, we select the variants of interest.
-
-# Selecting informative variants.
-voi.ch.sigurd <- VariantSelection_Quantile(genotyping, min_coverage = 20, min_quality = 30, quantiles = c(0.1, 0.9), thresholds = c(0.1, 0.9), verbose = FALSE)
-print(voi.ch.sigurd)
## [1] "chrM_709_G_A" "chrM_1888_G_A" "chrM_1420_T_C" "chrM_2141_T_C"
-## [5] "chrM_9117_T_C" "chrM_7990_C_T"
-
-# Determining if a cell is supporting one of the two cell types.
-cell_support <- CallSupport(SE = genotyping, VOI_group1 = voi.ch.sigurd[voi.ch.sigurd != "chrM_7990_C_T"], VOI_group2 = "chrM_7990_C_T", group1_name = "K562", group2_name = "BT142", min_mutated_reads = 3, min_reads = 30, group_factor = 10, verbose = FALSE, return_nonsupport = TRUE)
-colData(genotyping)$CellType_MT <- cell_support$Support
-cell_support <- subset(cell_support, Support %in% c("K562", "BT142")) # Selecting only cells with sufficient support for a set of variants.
-
-# Plotting the results on a heatmap.
-HeatmapVoi(SE = genotyping[,cell_support$Cell], voi = voi.ch.sigurd, annotation_trait = "CellType", minimum_coverage = 3, remove_empty_cells = FALSE)
We now select variants with a higher VAF in the K562 cells than in the BT142 cells.
-
-voi.ch.lineages <- sigurd::VariantSelection_Group(SE = genotyping, min_coverage = 20, quantiles = c(0.01, 0.99), thresholds = c(0.01, 0.02), min_quality = 30,
- group_of_interest = "CellType_MT", group1 = "K562", group2 = "BT142", group_factor = 5, remove_nocall = FALSE, verbose = FALSE)
-voi.ch.lineages <- gsub("chrM_9117_T_C", "chrM_2141_T_C", voi.ch.lineages) # Replace chrM_9117_T_C, which comes up because it has no coverage in some cells, with another homoplasmic variant (chrM_2141_T_C), that has better coverage)
-voi.ch.lineages <- voi.ch.lineages[!voi.ch.lineages %in% c("chrM_5378_A_G", "chrM_6384_G_A")] # These variants are not detected in the bulk data and were removed by hand.
-print(voi.ch.lineages)
## [1] "chrM_1197_G_A" "chrM_1227_G_A" "chrM_1327_G_A" "chrM_1982_G_A"
-## [5] "chrM_2008_G_A" "chrM_2147_G_A" "chrM_2170_G_A" "chrM_2478_G_A"
-## [9] "chrM_2571_G_A" "chrM_2573_G_A" "chrM_2810_G_A" "chrM_2943_G_A"
-## [13] "chrM_5274_G_A" "chrM_8155_G_A" "chrM_8213_G_A" "chrM_8278_C_A"
-## [17] "chrM_9778_G_A" "chrM_2141_T_C" "chrM_1310_C_T" "chrM_1352_C_T"
-## [21] "chrM_1533_C_T" "chrM_9757_C_T"
-
-# Plotting the results on a heatmap.
-cell_support <- subset(cell_support, Support == "K562")
-genotyping <- Filtering(genotyping, cells_include = cell_support$Cell, verbose = FALSE)
-HeatmapVoi(SE = genotyping, voi = voi.ch.lineages, minimum_coverage = 3, sort_cells = TRUE, cluster_variants = TRUE)
TenX_CellMixture_SIGURD.Rmd
Here, we reproduce the analysis of a cell mixture experiment.
-We load the data from MAESTER.
-
-print("Libraries for SIGURD.")
## [1] "Libraries for SIGURD."
-
-
-seu <- readRDS("/data/MPN/exp/scRNA/MPN_mutations/SIGURD_paper/sigurd/data/TenX_CellLineMix_Seurat_Keep_Renamed.rds")
-# The design matrix contains information for the genotyping data.
-genotyping <- LoadingMAEGATK_typewise(patient = "TenX", samples_file = "/data/MPN/exp/scRNA/MPN_mutations/SIGURD_paper/sigurd/data/MAESTER_Reproduction.csv", min_cells = 0, type_use = "Amplicon_MT", verbose = FALSE)
-
-# Removing cells that are not in the Seurat object.
-genotyping <- Filtering(genotyping, cells_include = colnames(seu))
## [1] "We remove all cells not in the allow list."
-## [1] "We remove all the variants that are always NoCall."
-
-# Adding meta data do the SIGURD object.
-colData(genotyping)$CellType <- seu$CellType
-# Selecting informative variants.
-voi.ch.sigurd <- VariantSelection_Quantile(genotyping, min_coverage = 20, min_quality = 30, quantiles = c(0.1, 0.9), thresholds = c(0.1, 0.9), verbose = FALSE)
-print(voi.ch.sigurd)
## [1] "chrM_709_G_A" "chrM_1888_G_A" "chrM_6249_G_A" "chrM_8697_G_A"
-## [5] "chrM_11719_G_A" "chrM_15452_C_A" "chrM_1420_T_C" "chrM_2141_T_C"
-## [9] "chrM_4216_T_C" "chrM_6524_T_C" "chrM_9117_T_C" "chrM_11251_A_G"
-## [13] "chrM_11764_A_G" "chrM_11812_A_G" "chrM_15607_A_G" "chrM_7990_C_T"
-## [17] "chrM_12741_C_T"
-
-# Determining if a cell is supporting one of the two cell types.
-cell_support <- CallSupport(SE = genotyping, VOI_group1 = voi.ch.sigurd[voi.ch.sigurd != "chrM_7990_C_T"], VOI_group2 = "chrM_7990_C_T", group1_name = "K562", group2_name = "BT142", min_mutated_reads = 3, min_reads = 30, group_factor = c(10,2), verbose = FALSE)
-genotyping <- Filtering(genotyping, cells_include = cell_support$Cell)
## [1] "We remove all cells not in the allow list."
-## [1] "We remove all the variants that are always NoCall."
-
-colData(genotyping)$CellType_MT <- cell_support$Support
-cell_support <- subset(cell_support, Support %in% c("K562", "BT142")) # Selecting only cells with sufficient support for a set of variants.
-
-# Plotting the results on a heatmap.
-HeatmapVoi(SE = genotyping, voi = voi.ch.sigurd, annotation_trait = "CellType", minimum_coverage = 3)
We now select variants with a higher VAF in the K562 cells than in the BT142 cells.
-
-voi.ch.lineages <- sigurd::VariantSelection_Group(SE = genotyping, min_coverage = 100, quantiles = c(0.01, 0.99), thresholds = c(0.01, 0.02), min_quality = 30,
- group_of_interest = "CellType_MT", group1 = "K562", group2 = "BT142", group_factor = 5, remove_nocall = FALSE, verbose = FALSE)
-voi.ch.lineages <- voi.ch.lineages[!voi.ch.lineages %in% c("chrM_8251_G_A", "chrM_7693_C_T")] # These variants are not detected in the bulk data and were removed by hand.
-voi.ch.lineages <- c(voi.ch.lineages, "chrM_2141_T_C") # Adding a homoplasmic variant as positive control.
-print(voi.ch.lineages)
## [1] "chrM_1916_G_A" "chrM_2008_G_A" "chrM_2147_G_A" "chrM_2551_G_A"
-## [5] "chrM_2571_G_A" "chrM_2573_G_A" "chrM_7269_G_A" "chrM_7337_G_A"
-## [9] "chrM_7362_G_A" "chrM_7706_G_A" "chrM_8155_G_A" "chrM_9247_G_A"
-## [13] "chrM_9778_G_A" "chrM_10223_C_A" "chrM_11531_G_A" "chrM_11914_G_A"
-## [17] "chrM_15374_G_A" "chrM_8211_T_C" "chrM_9613_T_C" "chrM_1733_C_T"
-## [21] "chrM_2357_C_T" "chrM_2574_G_T" "chrM_7825_C_T" "chrM_9757_C_T"
-## [25] "chrM_9822_C_T" "chrM_11965_C_T" "chrM_12102_C_T" "chrM_2141_T_C"
-
-# Plotting the results on a heatmap.
-cell_support <- subset(cell_support, Support == "K562") # Only plotting K562 cells.
-genotyping <- Filtering(genotyping, cells_include = cell_support$Cell, verbose = FALSE)
-HeatmapVoi(SE = genotyping, voi = voi.ch.lineages, minimum_coverage = 3, sort_cells = TRUE, cluster_variants = TRUE)