We start by loading the libraries and functions we will need:

library("tidyverse")
#library("knitr")
source("ASE_functions.R")
source("PerformDiffAIAnalysisFor2Conditions.R")

5aza data

To demonstrate how the pipeline works, we are going to use data from RNA-seq experiment aimed to study effect of 5aza treatment on expression and allele-specific expression. Let's look at the data first:

Experiment design

5 samples of 4.11 Abelson clones:

name | description | number of repicates ------------- | ------------- | ------------- control | Untreated Day 0 | 2 DMSO | DMSO control Day 2 | 2 low | low 5aza treatment of 0.2 µM | 2 medium | medium 5aza treatment of 2 µM | 5 high | high 5aza treatment of 10 µM | 2

The first step is to create design matrix describing the experiment:

experimentNames <- c("control","DMSO","low","medium","high")
techReps <- c(2,2,2,5,2)
designMatrix <- BuildDesign(experimentNames, techReps)
#kable(designMatrix)

Now we can look at the data. Here is the geneCountTab dataframe with counts for all genes, all replicates and conditions:

inTabs <- "../../../data/5aza/pr_20180714_ISEKI_processed_gene_extended2.txt"
geneCountTab <- GetGatkPipelineTabs(inTabs, designMatrix$techReps, multiple = F)
head(geneCountTab)

We can calculate allelic imbalances for all genes for all experimental conditions pooling technical replicates: AI = \frac{maternal counts}{maternal counts + paternal counts}

thr <- 10
geneCountTab_10 <- geneCountTab
nameColumns <- function(exp_n, rep_n)  {
  paste0("exp", rep(exp_n, 2*rep_n), "_rep", rep(1:rep_n, each = 2), "_", rep(c("ref", "alt"), rep_n))
}
colnames(geneCountTab_10)[2:27] <- c(nameColumns(1,2), nameColumns(2,2), nameColumns(3,2), nameColumns(4,5), nameColumns(5,2))
geneCountTab_10 <- geneCountTab_10 %>% dplyr::filter(exp1_rep1_ref + exp1_rep1_alt >=thr & exp2_rep1_ref + exp2_rep1_alt >=thr & 
                             exp3_rep1_ref + exp3_rep1_alt >=thr & exp4_rep1_ref + exp4_rep1_alt >=thr & 
                             exp5_rep1_ref + exp5_rep1_alt >=thr & exp1_rep2_ref + exp1_rep2_alt >=thr & 
                             exp2_rep2_ref + exp2_rep2_alt >=thr & exp3_rep2_ref + exp3_rep2_alt >=thr & 
                             exp4_rep2_ref + exp4_rep2_alt >=thr & exp5_rep2_ref + exp5_rep2_alt >=thr)
# TODO: use thr in CountsToAI (thr=10) and take care of downstream cbind (change to df and then merge)
aiTable <- do.call(cbind, lapply(1:length(designMatrix$techReps), function(x){
      round(CountsToAI(geneCountTab_10, reps = unlist(designMatrix$replicateNums[x])),3)
    }))
aiTable <- cbind(geneCountTab_10$ensembl_gene_id, aiTable)
colnames(aiTable) <- c("ensembl_gene_id", designMatrix$experimentNames)
aiTable <- data.frame(aiTable)
aiTable$control <- as.numeric(aiTable$control)
aiTable$DMSO <- as.numeric(aiTable$DMSO)
aiTable$low <- as.numeric(aiTable$low)
aiTable$medium <- as.numeric(aiTable$medium)
aiTable$high <- as.numeric(aiTable$high)
#kable(head(aiTable))
write.table(aiTable, file = "../../../data/5aza/ai_full_thr_10.txt", row.names = F, quote = F, sep="\t")

Then get row-concatenated table of pairwise delta AI for all the 3 experiments:

dfDeltaAIPairwise <- do.call(rbind, lapply(1:length(designMatrix$techReps), function(x){
      CreateMergedDeltaAIPairwiseDF(geneCountTab, mlns=F, repnums = unlist(designMatrix$replicateNums[x]), what=designMatrix$experimentNames[x], thr=10)
    }))
head(dfDeltaAIPairwise)
P = pnorm(seq(0.4,2,0.2))
P = P - (1-P) 
DF = dfDeltaAIPairwise
DF$geneORsnp = "Gene" 
DF$group = paste(DF$what, DF$geneORsnp, DF$ij)
head(DF)

DFQ = do.call(rbind, lapply(unique(DF$group), 
                            function(r){
                              CreateObservedQuantilesDF(DF[DF$group == r, ], 
                                                        P, ep=1.3, logbase=T, 
                                                        coverageLimit=2000, group=r)
                            })
) 
head(DFQ)
DFQ$ij = sapply(as.character(DFQ$group), function(x){paste(unlist(strsplit(x, ' '))[3:5], collapse = ' ')})
groupsIntercepts = lapply(unique(DFQ$Q), function(q){
  df = DFQ[DFQ$Q == q, ]
  res = sapply(unique(df$ij), function(x){ # проход по всем парам
      FitLmIntercept(df[df$ij == x, ], 40, morethan=10, logoutput=T)
    })
  names(res) = unique(df$ij)
  2**res # вернулись от логарифма обратно
})

coeffDF <- data.frame(t(do.call(rbind, groupsIntercepts)))
colnames(coeffDF) <- paste0("q_", round(as.numeric(as.character(unique(DFQ$Q))),3))

Add names column:

coeffDF$exp <- rep(designMatrix$experimentNames, choose(designMatrix$techReps, 2))
coeffDF <- coeffDF[,c(dim(coeffDF)[2],1:(dim(coeffDF)[2]-1))]

Let's compare conditions DMSO (experiment 2) and high 5aza treatment (experiment 5). We will construct 95%-CIs around AIs and get the resulting classification into genes demonstrating difference in AI between conditions (based on non-overlapping CIs):

DiffAI_DMSO_high <- PerformDiffAIAnalysisFor2Conditions(geneCountTab, 
                                                          vect1CondReps = unlist(designMatrix$replicateNums[2]), 
                                                          vect2CondReps = unlist(designMatrix$replicateNums[5]), 
                                                          Q=0.95,
                                                          thr=40, 
                                                        EPS=1.3, 
                                                        BF=T, 
                                                        fullOUT=F)
DiffAI_DMSO_high$deltaAI <- abs(DiffAI_DMSO_high$meanAI1 - DiffAI_DMSO_high$meanAI2)
head(DiffAI_DMSO_high[, c(1,5,6,10,11)])

Let's look at top DE genes:

minDifference <- 0.2
GenesAIDiff <- DiffAI_DMSO_high[!is.na(DiffAI_DMSO_high$diffAI), ]
GenesAIDiff <- GenesAIDiff[GenesAIDiff$diffAI==T & GenesAIDiff$deltaAI>=minDifference, ]
head(GenesAIDiff[, c(1,5,6,10,11)])


gimelbrantlab/ASE documentation built on March 1, 2020, 5:41 a.m.