We start by loading the libraries and functions we will need:
library("tidyverse") #library("knitr") source("ASE_functions.R") source("PerformDiffAIAnalysisFor2Conditions.R")
To demonstrate how the pipeline works, we are going to use data from RNA-seq experiment aimed to study effect of 5aza treatment on expression and allele-specific expression. Let's look at the data first:
5 samples of 4.11 Abelson clones:
name | description | number of repicates ------------- | ------------- | ------------- control | Untreated Day 0 | 2 DMSO | DMSO control Day 2 | 2 low | low 5aza treatment of 0.2 µM | 2 medium | medium 5aza treatment of 2 µM | 5 high | high 5aza treatment of 10 µM | 2
The first step is to create design matrix describing the experiment:
experimentNames <- c("control","DMSO","low","medium","high") techReps <- c(2,2,2,5,2) designMatrix <- BuildDesign(experimentNames, techReps) #kable(designMatrix)
Now we can look at the data. Here is the geneCountTab dataframe with counts for all genes, all replicates and conditions:
inTabs <- "../../../data/5aza/pr_20180714_ISEKI_processed_gene_extended2.txt" geneCountTab <- GetGatkPipelineTabs(inTabs, designMatrix$techReps, multiple = F) head(geneCountTab)
We can calculate allelic imbalances for all genes for all experimental conditions pooling technical replicates: AI = \frac{maternal counts}{maternal counts + paternal counts}
thr <- 10 geneCountTab_10 <- geneCountTab nameColumns <- function(exp_n, rep_n) { paste0("exp", rep(exp_n, 2*rep_n), "_rep", rep(1:rep_n, each = 2), "_", rep(c("ref", "alt"), rep_n)) } colnames(geneCountTab_10)[2:27] <- c(nameColumns(1,2), nameColumns(2,2), nameColumns(3,2), nameColumns(4,5), nameColumns(5,2)) geneCountTab_10 <- geneCountTab_10 %>% dplyr::filter(exp1_rep1_ref + exp1_rep1_alt >=thr & exp2_rep1_ref + exp2_rep1_alt >=thr & exp3_rep1_ref + exp3_rep1_alt >=thr & exp4_rep1_ref + exp4_rep1_alt >=thr & exp5_rep1_ref + exp5_rep1_alt >=thr & exp1_rep2_ref + exp1_rep2_alt >=thr & exp2_rep2_ref + exp2_rep2_alt >=thr & exp3_rep2_ref + exp3_rep2_alt >=thr & exp4_rep2_ref + exp4_rep2_alt >=thr & exp5_rep2_ref + exp5_rep2_alt >=thr) # TODO: use thr in CountsToAI (thr=10) and take care of downstream cbind (change to df and then merge) aiTable <- do.call(cbind, lapply(1:length(designMatrix$techReps), function(x){ round(CountsToAI(geneCountTab_10, reps = unlist(designMatrix$replicateNums[x])),3) })) aiTable <- cbind(geneCountTab_10$ensembl_gene_id, aiTable) colnames(aiTable) <- c("ensembl_gene_id", designMatrix$experimentNames) aiTable <- data.frame(aiTable) aiTable$control <- as.numeric(aiTable$control) aiTable$DMSO <- as.numeric(aiTable$DMSO) aiTable$low <- as.numeric(aiTable$low) aiTable$medium <- as.numeric(aiTable$medium) aiTable$high <- as.numeric(aiTable$high) #kable(head(aiTable)) write.table(aiTable, file = "../../../data/5aza/ai_full_thr_10.txt", row.names = F, quote = F, sep="\t")
Then get row-concatenated table of pairwise delta AI for all the 3 experiments:
dfDeltaAIPairwise <- do.call(rbind, lapply(1:length(designMatrix$techReps), function(x){ CreateMergedDeltaAIPairwiseDF(geneCountTab, mlns=F, repnums = unlist(designMatrix$replicateNums[x]), what=designMatrix$experimentNames[x], thr=10) })) head(dfDeltaAIPairwise)
P = pnorm(seq(0.4,2,0.2)) P = P - (1-P) DF = dfDeltaAIPairwise DF$geneORsnp = "Gene" DF$group = paste(DF$what, DF$geneORsnp, DF$ij) head(DF) DFQ = do.call(rbind, lapply(unique(DF$group), function(r){ CreateObservedQuantilesDF(DF[DF$group == r, ], P, ep=1.3, logbase=T, coverageLimit=2000, group=r) }) ) head(DFQ) DFQ$ij = sapply(as.character(DFQ$group), function(x){paste(unlist(strsplit(x, ' '))[3:5], collapse = ' ')})
groupsIntercepts = lapply(unique(DFQ$Q), function(q){ df = DFQ[DFQ$Q == q, ] res = sapply(unique(df$ij), function(x){ # проход по всем парам FitLmIntercept(df[df$ij == x, ], 40, morethan=10, logoutput=T) }) names(res) = unique(df$ij) 2**res # вернулись от логарифма обратно }) coeffDF <- data.frame(t(do.call(rbind, groupsIntercepts))) colnames(coeffDF) <- paste0("q_", round(as.numeric(as.character(unique(DFQ$Q))),3))
Add names column:
coeffDF$exp <- rep(designMatrix$experimentNames, choose(designMatrix$techReps, 2)) coeffDF <- coeffDF[,c(dim(coeffDF)[2],1:(dim(coeffDF)[2]-1))]
Let's compare conditions DMSO (experiment 2) and high 5aza treatment (experiment 5). We will construct 95%-CIs around AIs and get the resulting classification into genes demonstrating difference in AI between conditions (based on non-overlapping CIs):
DiffAI_DMSO_high <- PerformDiffAIAnalysisFor2Conditions(geneCountTab, vect1CondReps = unlist(designMatrix$replicateNums[2]), vect2CondReps = unlist(designMatrix$replicateNums[5]), Q=0.95, thr=40, EPS=1.3, BF=T, fullOUT=F) DiffAI_DMSO_high$deltaAI <- abs(DiffAI_DMSO_high$meanAI1 - DiffAI_DMSO_high$meanAI2) head(DiffAI_DMSO_high[, c(1,5,6,10,11)])
Let's look at top DE genes:
minDifference <- 0.2 GenesAIDiff <- DiffAI_DMSO_high[!is.na(DiffAI_DMSO_high$diffAI), ] GenesAIDiff <- GenesAIDiff[GenesAIDiff$diffAI==T & GenesAIDiff$deltaAI>=minDifference, ] head(GenesAIDiff[, c(1,5,6,10,11)])
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.