R/testDECount.R

Defines functions testDECount

testDECount <- function(sampleInfo, exDat, cnt = cnt, info = info){
    # Pull info from exDat
    erccInfo <- exDat$erccInfo
    plotInfo <- exDat$plotInfo
    filenameRoot = sampleInfo$filenameRoot
    legendLabels = sampleInfo$legendLabels
    FCcode = erccInfo$FCcode
    
    idCols = exDat$idColsAdj
    r_m.mn = exDat$Results$r_m.res$r_m.mn 
    repNormFactor = sampleInfo$repNormFactor 
    normFactor <- exDat$normFactor
    sample1 = exDat$sampleNames[1]
    sample2 = exDat$sampleNames[2]
    
    colScale <- plotInfo$colScale
    fillScale <- plotInfo$fillScale
    
    ## Organize the count table
    cnt = unique(cnt)
    Features = make.names(cnt$Feature,unique=TRUE)
    Features = gsub(".","-", Features, fixed = TRUE)
    rownames(cnt)<-Features
    cnt<-as.matrix(cnt[,-1])
    
    if(odd(ncol(cnt))) stop(paste("Uneven number of replicates for",
                                  "the two sample types"))
    
    colnames(cnt)<-paste(rep(c(sample1,sample2),
                             each=ncol(cnt)/2),
                         c(1:(ncol(cnt)/2),1:(ncol(cnt)/2)),sep="")
    
    ## Get ERCC names
    ERCC<-rownames(cnt[substr(rownames(cnt),1,5)=="ERCC-",])
    
    ## Specify Sample (A or B)
    trt<-rep(1:2,each=ncol(cnt)/2)
    
    ## Get design.list for edgeR
    design.list<-list(trt,rep(1,ncol(cnt)))
    
    ## Compute offset (e.g. total counts, 75% quantile, TMM, etc)
    if(is.null(normFactor)){
        log.offset <- c(rep(0,ncol(cnt)))
    }else{
        log.offset<- log(normFactor)
    }
    
    cat("\nShow log.offset\n")
    cat(log.offset,"\n")
    
    ERCC.FC = idCols[c(1,4)];rownames(ERCC.FC)<-ERCC.FC[,1]
    
    ERCC.FC$NumRatio <- NA
    FCcode$Ratio <- as.factor(FCcode$Ratio)
    for (i in 1:nlevels(FCcode$Ratio)){
      print(i)
        ERCC.FC$NumRatio[which(ERCC.FC$Ratio == 
                                   FCcode$Ratio[i])]=FCcode$FC[i]  
    }
    ERCC.Ratio = ERCC.FC[c(1,2)]
    ERCC.FC = ERCC.FC[-c(2)]
    
    group <- as.factor(trt)
    d <- DGEList(counts=cnt,group=group)
    # use log.offset for the library size
    d$samples$lib.size <- exp(log.offset) 
    
    #Dispersion trend
    design <- model.matrix(~group)
    d1 <- estimateGLMCommonDisp(d,design,verbose=TRUE)
    d1 <- estimateGLMTrendedDisp(d1,design)
    d1 <- estimateGLMTagwiseDisp(d1,design)
    
    #######################################################
    ###  Simulate ERCC data from negative binomial fit ####
    ###  (to be used for sim-based LODR)               ####
    #######################################################
    # Define function simcnt.lodr
    simcnt.lodr<-function(cnt,disp,trt,fold,log.offset){
        #### 'cnt' is the matrix of counts for endogenous genes
        #### 'disp' is the central trend fitted to the estimated dispersions 
        ####  (vs. expression)
        #### 'trt' is a vector specifying treatments for each column in 'cnt'
        #### 'fold' is the desired fold change (trt 2/trt 1)                  
        #### 'log.offset' is used to account for differences in library size
        
        ### mimick every 797th gene (roughly), when sorted by total count
        sim.ind<-round(nrow(cnt)*c(1:49)/50)  
        norm.cnt<-t(t(cnt)/exp(log.offset))
        sim.mn<-matrix(sort(rowMeans(norm.cnt))[sim.ind],
                       nrow=length(sim.ind), ncol=ncol(cnt))
        sim.mn<-sim.mn*2/(1+fold)
        sim.mn[,trt==2]<-sim.mn[,trt==2]*fold
        sim.mn<-t(t(sim.mn)*exp(log.offset))
        
        sim.disp<-disp[order(rowMeans(norm.cnt))][sim.ind]
        size<-matrix(1/sim.disp,length(sim.disp),ncol(cnt))
        simcnt<-matrix(rnbinom(length(sim.disp)*ncol(cnt),
                               mu=sim.mn,size=size),length(sim.disp),ncol(cnt))
        rownames(simcnt)<-paste("Sim",fold,"Fold",1:length(sim.ind),sep="")
        return(simcnt)
    }
    
    #### Simulate data for each of the fold changes used in the ERCCs
    simcnt<-NULL
    for(fold in unique(ERCC.FC[!is.na(ERCC.FC[,2]),2])){
        simcnt<-rbind(simcnt,simcnt.lodr(cnt,disp=d1$trended.dispersion,
                                         trt=trt,fold=fold,
                                         log.offset=log.offset))
    }
    
    ##### Analyze combination of observed and simulated data with edgeR
    group <- as.factor(trt)
    d2 <- DGEList(counts=rbind(cnt,simcnt),group=group)
    d2$samples$lib.size <- exp(log.offset)
    
    design <- model.matrix(~group)
    d2 <- estimateGLMCommonDisp(d2,design,verbose=TRUE)
    d2 <- estimateGLMTrendedDisp(d2,design)
    d2 <- estimateGLMTagwiseDisp(d2,design)
    
    NBdisp<-d2$tagwise.dispersion
    names(NBdisp)<-rownames(rbind(cnt,simcnt))
    NBdisptrend<-d2$trended.dispersion
    names(NBdisptrend)<-rownames(rbind(cnt,simcnt))
  
    use.fit <- glmQLFit(d2, design)
    
    qlf.res <- glmQLFTest(use.fit)
    
    use.res <- qlf.res
    
    ### Collect results for simulated data, to be passed along to LODR function
    Feature <- row.names(simcnt)
    MnSignal <- as.numeric(rowMeans(simcnt))
    Pval <- use.res$table$PValue[-(1:nrow(cnt))]
    LogPval <- log10(use.res$table$PValue)[-(1:nrow(cnt))]
    F.stat <- use.res$table$F[-(1:nrow(cnt))]
    Fold <- rep(unique(ERCC.FC[!is.na(ERCC.FC[,2]),2]),each=49)
    sim.pval.res <- data.frame(Feature, MnSignal, Pval, LogPval, F.stat, Fold)
    colnames(sim.pval.res)<-c("Feature","MnSignal","Pval","LogPval","F.stat",
                               "Fold")
    rownames(sim.pval.res)<-rownames(simcnt)
    
    write.csv(sim.pval.res[-c(4,5)],file=paste(filenameRoot,"Sim Pvals.csv"),
    row.names = FALSE)
    
    ## remove results for simulated data (using indexing with 1:nrow(cnt))
    use.fit2<-use.fit
    use.fit2$coefficients <-use.fit2$coefficients[1:nrow(cnt),]
    use.fit2$fitted.values <- use.fit2$fitted.values[1:nrow(cnt),]
    use.fit2$deviance <- use.fit2$deviance[1:nrow(cnt)]
    use.fit2$counts <- use.fit2$counts[1:nrow(cnt),]
    use.fit2$unshrunk.coefficients <- use.fit2$unshrunk.coefficients[1:nrow(cnt),]
    use.fit2$df.residual <- use.fit2$df.residual[1:nrow(cnt)]
    use.fit2$offset <- use.fit2$offset[1:nrow(cnt),]
    use.fit2$dispersion <- use.fit2$dispersion[1:nrow(cnt)]
    use.fit2$AveLogCPM <- use.fit2$AveLogCPM[1:nrow(cnt)]
    use.fit2$df.residual.zeros <- use.fit2$df.residual.zeros[1:nrow(cnt)]
    use.fit2$var.post <- use.fit2$var.post[1:nrow(cnt)]
    use.fit2$var.prior <- use.fit2$var.prior[1:nrow(cnt)]
    
    use.res2 <- glmQLFTest(use.fit2)
    
    ###################################
    #### Examine results for ERCCs ####
    ###################################
    pvals<-use.res2$table$PValue
    names(pvals)<-rownames(cnt)
    ERCC.pvals<-pvals[ERCC]
    
    ## Reanalyze ERCC transcripts using adjusted offsets to center fold 
    ## change estimates
    adj <- r_m.mn #### Use r_m estimated from NegBin GLM
    use.fit.adj <- glmQLFit(cnt[ERCC,], design, dispersion = NBdisptrend[ERCC],
                            offset = log.offset - rep(c(adj,0),each=ncol(cnt)/2))
    est.FC.adj<-use.fit.adj$coefficients[ERCC,2]
    
    use.fit3<-use.fit2
    # Substitute ERCC centered data into full use.fit3 structure
    use.fit3$coefficients[ERCC,] <- use.fit.adj$coefficients[ERCC,]
    use.fit3$unshrunk.coefficients[ERCC,] <- use.fit.adj$unshrunk.coefficients[ERCC,]
    use.fit3$var.prior[ERCC] <- use.fit.adj$var.prior[ERCC]
    use.fit3$var.post[ERCC] <- use.fit.adj$var.post[ERCC]
    
    # deal with CompressedMatrix format to add the new ercc offsets...
    expandedfit3 <- expandAsMatrix(use.fit3$offset)
    expandedfit3[1:length(ERCC),] <- expandAsMatrix(use.fit.adj$offset)
    recompress <- makeCompressedMatrix(expandedfit3)
    use.fit3$offset <- recompress
    
    use.res.adj<-glmQLFTest(use.fit3)
     
    # collect the results for plotting and writing a table
    Feature <- row.names(use.res.adj$table)
    MnSignal <- as.numeric(rowMeans(cnt))
    #replace with edgeR results
    Pval <- use.res.adj$table$PValue
    LogPval <- log10(use.res.adj$table$PValue)
    Qval <- qvalue(Pval)$qvalues
    F.stat <- use.res.adj$table$F
    
    names(Qval)<-names(F.stat)<-names(LogPval)<-names(Pval)<-rownames(cnt)
    
    Log2FC <- use.res.adj$table$logFC
    allDE.res <- data.frame(Feature = names(pvals),
                               MnSignal = rowMeans(cnt), 
                               Fold = 
                                   c(ERCC.FC[ERCC,2], rep(x=NA, length.out=
                                                              (length(pvals) -
                                                                   (length
                                                                    (ERCC.FC[
                                                                        ERCC,2]
                                                                    ))))), 
                               Log2Rat=Log2FC, Pval=Pval,
                               qvals=Qval, log.pvals=LogPval, F.stat=F.stat)
    
    write.csv(allDE.res[c(1,2,5,3)],
              file=paste0(filenameRoot, ".All.Pvals.csv"), row.names = FALSE)
    
    ERCC.Pval.adj<-Pval[ERCC]
    ERCC.F.stat.adj<-F.stat[ERCC]
    ERCC.LogPval.adj<-LogPval[ERCC]
    
    ### Collect results for ERCCs; to be passed along to LODR function
    ERCC.pval.res<-data.frame(row.names(cnt[ERCC,]),rowMeans(cnt[ERCC,]),
                              ERCC.Pval.adj,ERCC.LogPval.adj,ERCC.F.stat.adj,
                              ERCC.FC[ERCC,2])
    colnames(ERCC.pval.res)<-c("Feature","MnSignal","Pval","LogPval","F.stat","Fold")
    #print(str(pval.res))
    row.names(ERCC.pval.res) <- NULL
    write.csv(ERCC.pval.res[-c(4,5)],file=paste(filenameRoot,"ERCC Pvals.csv"),
              row.names = FALSE)
    cat("Finished DE testing")
    
    exDat$Results$allDE.res <- allDE.res
    exDat$Results$ERCCpvals <- ERCC.pval.res
    
    ### Using QLDisp code from edgeR to create similar ggplot with use.fit
      glmfit <- use.fit
      xlab = "Average Log2 CPM (Counts per million)"
      ylab = "Squeezed QL Dispersion Estimates (Quarter-Root Mean Deviance)"
      
      A <- glmfit$AveLogCPM
    
      if (is.null(A)) 
        A <- aveLogCPM(glmfit)
      s2 <- glmfit$deviance/glmfit$df.residual.zeros
      if (is.null(glmfit$var.post)) {
        stop("need to run glmQLFit before getting QL dispersion estimates")
      }
      
      squeezedDisp <- data.frame(A = A, Dispersion = sqrt(sqrt(glmfit$var.post)))
      dispERCC <- squeezedDisp[ERCC,]
      dispERCC$Ratio <- ERCC.Ratio[ERCC,2]
      
      if (length(glmfit$var.prior) == 1L) {
        trendPlot <- geom_abline(yintercept = sqrt(sqrt(glmfit$var.prior)))
      } else {
        o <- order(A)
        dispTrend <- data.frame(Atrend = A[o], Dispersion = sqrt(sqrt(glmfit$var.prior[o])) )
        trendPlot <-  geom_line(data = dispTrend, aes(Atrend, Dispersion))
      }
      quasiDispPlot <- ggplot() + geom_point(data = squeezedDisp, aes(x = A, y = Dispersion),
                                             colour = "grey80", size = 5,
                                             alpha = 0.6) +
        geom_point(data = dispERCC, aes(x = A, y = Dispersion,colour = Ratio), 
                   size = 5, alpha = 0.6) + xlab(xlab) + ylab(ylab) +
        trendPlot +
        #scale_x_log10() + 
        colScale + theme_bw() +
        theme(legend.justification=c(1,1), legend.position=c(0.9,0.9)) 
  
    exDat$Figures$dispPlot <- quasiDispPlot
    exDat$Results$simcnt <- simcnt 
    cat("\nFinished examining dispersions\n")
    
    return(exDat)
    ### end Edit Sarah Munro 20140216
}
munrosa/erccdashboard documentation built on April 4, 2020, 9:32 p.m.