R/AdaptedCodes_juncbias.R

# codes for simulation and plot PWF
#  simulation code from xiaobei
#  PWF plot code from goseq
getDataset <- function(counts, drop.extreme.dispersion = 0.1, drop.low.lambda = TRUE) {  
  ## this function generates NB parameters from real dataset ##
  ## it is low-level function of NBsim ##
  d <- DGEList(counts)
  d <- calcNormFactors(d)
  cp <- round(cpm(d,normalized.lib.sizes=TRUE),1)
  if(drop.low.lambda) d <- d[rowSums(cp>1) >= 2, ]
  d$AveLogCPM <- log2(rowMeans(cpm(d, prior.count = 1e-5)))
  d <- estimateGLMCommonDisp(d)
  d <- estimateGLMTrendedDisp(d)
  d <- estimateGLMTagwiseDisp(d)
  dispersion <- d$tagwise.dispersion
  AveLogCPM <- d$AveLogCPM
  if(is.numeric(drop.extreme.dispersion))
  {   
    bad <- quantile(dispersion, 1-drop.extreme.dispersion, names = FALSE)
    ids <- dispersion <= bad
    AveLogCPM <- AveLogCPM[ids]
    dispersion <- dispersion[ids]
  }
  dataset.AveLogCPM <- AveLogCPM
  dataset.dispersion <- dispersion
  dataset.lib.size <- d$samples$lib.size
  dataset.nTags <- nrow(d)
  list(dataset.AveLogCPM = dataset.AveLogCPM, dataset.dispersion = dataset.dispersion, dataset.lib.size = dataset.lib.size, dataset.nTags = dataset.nTags)
}


NBsim <-
  function(dataset, group, nTags = 10000, nlibs = length(group), fix.dispersion = NA, lib.size = NULL, drop.low.lambda = TRUE, drop.extreme.dispersion = 0.1,  add.outlier = FALSE, outlierMech = c("S", "R", "M"), pOutlier = 0.1, min.factor = 1.5, max.factor = 10, pDiff=.1, pUp=.5, foldDiff=3, name = NULL, save.file = FALSE, file = NULL, only.add.outlier = FALSE, verbose=TRUE)
    
  {   
    ## NBsim generate simulated count from the real dataset followed by the NB model ##		
    require(edgeR)
    group = as.factor(group)
    
    sample.fun <- function(object)
    {
      ## it is low-level function of NBsim ##
      ## it samples from the real dataset ## 
      nlibs <- object$nlibs
      nTags <- object$nTags
      AveLogCPM <-object$dataset$dataset.AveLogCPM
      dispersion <- object$dataset$dataset.dispersion
      
      id_r <- sample(length(AveLogCPM), nTags, replace = TRUE)
      Lambda <- 2^(AveLogCPM[id_r])
      Lambda <- Lambda/sum(Lambda)
      Dispersion <- dispersion[id_r]
      id_0<- Lambda == 0
      Lambda <- Lambda[!id_0]
      Dispersion <- Dispersion[!id_0]
      Lambda <- expandAsMatrix(Lambda, dim = c(nTags, nlibs))
      object$Lambda <- Lambda
      if(!is.na(fix.dispersion))
        Dispersion <- expandAsMatrix(fix.dispersion, dim = c(nTags, nlibs))
      else Dispersion <- expandAsMatrix(Dispersion, dim = c(nTags, nlibs))
      object$Dispersion <- Dispersion
      object
      
    }
    diff.fun <- function(object)
    { 
      
      ## it is low-level function of NBsim ##
      ## it creates diff genes according to foldDiff ## 
      group <- object$group
      pDiff <- object$pDiff
      pUp <-  object$pUp 
      foldDiff <- object$foldDiff
      Lambda <- object$Lambda
      nTags <- object$nTags
      g <- group == levels(group)[1]
      ind <- sample(nTags, floor(pDiff*nTags))
      if(length(ind)>0 & !foldDiff == 1 ) {
        fcDir <- sample(c(-1,1), length(ind), prob=c(1-pUp,pUp), replace=TRUE)
        Lambda[ind,g] <- Lambda[ind,g]*exp(log(foldDiff)/2*fcDir)
        Lambda[ind,!g] <- Lambda[ind,!g]*exp(log(foldDiff)/2*(-fcDir)) 
        #Lambda <- t(t(Lambda)/colSums(Lambda))
        object$Lambda <- Lambda
        object$indDE <- ind
        object$indNonDE <- (1:nTags)[-ind]
        object$mask_DEup <- object$mask_DEdown <- object$mask_nonDE <- expandAsMatrix(FALSE, dim = dim(Lambda))
        object$mask_DEup[ind[fcDir == 1], g] <- TRUE
        object$mask_DEup[ind[fcDir == -1], !g] <- TRUE
        object$mask_DEdown[ind[fcDir == -1], g] <- TRUE
        object$mask_DEdown[ind[fcDir == 1], !g] <- TRUE
        object$mask_nonDE[-ind,] <- TRUE
        object$mask_DE <- object$mask_DEup | object$mask_DEdown}
      if(foldDiff == 1| pDiff == 0)
        object$indDE <- NA
      object
    }
    sim.fun <- function(object)
    {   
      ## it is low-level function of NBsim ##
      ## it simulate counts using rnbinom ## 
      Lambda <- object$Lambda
      Dispersion <- object$Dispersion
      nTags <- object$nTags
      nlibs <- object$nlibs
      lib.size <- object$lib.size
      counts <- matrix(rnbinom(nTags*nlibs, mu = t(t(Lambda)*lib.size), size = 1/Dispersion), nrow = nTags, ncol = nlibs) 
      rownames(counts) <- paste("ids", 1:nTags, sep = "")
      object$counts <- counts
      object
    }
    
    outlier.fun <- function(object, outlierMech, pOutlier, min.factor = 2, max.factor = 5)
    {   
      ## it is low-level function of NBsim ##
      ## it makes outlier ## 
      outlierMech <- match.arg(outlierMech, c("S", "M", "R"))
      dim <- dim(object$counts)
      outlier.factor <- function() runif(1, min.factor, max.factor)
      countAddOut <- object$counts
      LambdaAddOut <- object$Lambda
      DispersionAddOut <- object$Dispersion	
      switch(outlierMech, 
             S = {
               mask_outlier <- expandAsMatrix(FALSE, dim = dim)
               id_r <- which(runif(dim[1]) < pOutlier)
               id_c <- sample(dim[2], length(id_r), replace = TRUE)
               for(i in seq(id_r)) 
                 mask_outlier[id_r[i], id_c[i]] <- TRUE
               countAddOut[mask_outlier] <- sapply(countAddOut[mask_outlier], function(z) round(z*outlier.factor()))
             }, 
             R = {				   
               mask_outlier <- matrix(runif(dim[1]*dim[2]) < pOutlier, dim[1], dim[2])
               countAddOut[mask_outlier] <- sapply(countAddOut[mask_outlier], function(z) round(z*outlier.factor()))
             },
             
             M = {
               mask_outlier <- matrix(runif(dim[1]*dim[2]) < pOutlier, dim[1], dim[2])
               LambdaAddOut[mask_outlier] <- sapply(LambdaAddOut[mask_outlier], function(z) z*outlier.factor())
               countAddOut[mask_outlier] <- rnbinom(sum(mask_outlier), mu = t(t(LambdaAddOut)*object$lib.size)[mask_outlier], size = 1/DispersionAddOut[mask_outlier])
             }
      )
      if(!object$foldDiff == 1 & !pDiff == 0)
      {
        indDEupOutlier <- which(apply(object$mask_DEup & mask_outlier, 1, any))
        indDEdownOutlier <- which(apply(object$mask_DEdown & mask_outlier, 1, any))
        indDEnoOutlier <- which(apply((object$mask_DE & !mask_outlier) , 1, all))
        indNonDEOutlier <- which(apply(object$mask_nonDE & mask_outlier, 1, any))
        indNonDEnoOutlier <- which(apply((object$mask_nonDE & !mask_outlier) , 1, all))
        indDEbothOutlier <- NA
        o <- indDEupOutlier %in% indDEdownOutlier
        q <-  indDEdownOutlier %in% indDEupOutlier
        if(any(o))
        {
          indDEupOutlier <- indDEupOutlier[!o]
          indDEbothOutlier <- indDEupOutlier[o]	
          indDEdownOutlier <- indDEdownOutlier[!q]	
        }	
      }
      else
      {
        indDEupOutlier <- indDEdownOutlier <- indDEnoOutlier <- indNonDEOutlier <- indNonDEnoOutlier <- indDEbothOutlier <- NA
      }
      out <- list(countAddOut = countAddOut, outlierMech = outlierMech, pOutlier = pOutlier, mask_outlier = mask_outlier, indDEupOutlier = indDEupOutlier, 
                  indDEdownOutlier = indDEdownOutlier, indDEbothOutlier = indDEbothOutlier, indDEnoOutlier = indDEnoOutlier, indNonDEOutlier = indNonDEOutlier, 
                  indNonDEnoOutlier = indNonDEnoOutlier, LambdaAddOut = LambdaAddOut, DispersionAddOut = DispersionAddOut) 
      
    }
    
    calProb <- function(x, l) round(1 -(1 - x)^(1/l), 2) ## calculate probability to make sure all the outlierMech produce the same amount of outliers ##
    
    
    if(verbose) message("Preparing dataset.\n")	
    if(class(dataset) == "DGEList")
    {   
      dat <- dataset
      dat[["R"]] <- dat[["S"]] <- dat[["M"]] <- dat[["pOutlier"]] <- dat[["outlierMech"]]<- NULL
    }
    else if(is.character(dataset)) 
    {
      load(dataset)
      dat <- get(gsub("(\\.)(\\w+)", "", basename(dataset)))
      dat[["R"]] <- dat[["S"]] <- dat[["M"]] <- dat[["pOutlier"]] <- dat[["outlierMech"]]<- NULL
    }
    else if(is.matrix(dataset)) 
    { 
      if(is.null(name)) name <- deparse(substitute(dataset))	
      dataset <- getDataset(counts =dataset, drop.extreme.dispersion = drop.extreme.dispersion, drop.low.lambda = drop.low.lambda)
      dat <- new("DGEList", list(dataset = dataset, nTags = nTags, lib.size = lib.size, nlibs = nlibs, group = group, design = model.matrix(~group), pDiff= pDiff, pUp = pUp, foldDiff = foldDiff, outlierMech = outlierMech, min.factor = min.factor, max.factor = max.factor, name = name))		
    }
    else
      dat <- new("DGEList", list(dataset = dataset, nTags = nTags, lib.size = lib.size, nlibs = nlibs, group = group, design = model.matrix(~group), pDiff= pDiff, pUp = pUp, foldDiff = foldDiff, outlierMech = outlierMech, min.factor = min.factor, max.factor = max.factor, name = name))
    
    if(!only.add.outlier)
    {
      if(is.null(lib.size))
        dat$lib.size <- runif(nlibs, min = 0.7*median(dat$dataset$dataset.lib.size), max = 1.3*median(dat$dataset$dataset.lib.size))
      
      if(is.null(nTags))
        dat$nTags <- dat$dataset$dataset.nTags 
      if(verbose) message("Sampling.\n")	
      dat <- sample.fun(dat)
      if(verbose) message("Calculating differential expression.\n")	
      dat <- diff.fun(dat)
      if(verbose) message("Simulating data.\n")	
      dat <- sim.fun(dat)
    }
    if(add.outlier){
      outlierMech <- match.arg(outlierMech,  c("S", "R", "M"), several.ok = TRUE)
      if(length(pOutlier)== 1 & length(outlierMech) > 1 & any(outlierMech == "S"))
      { 
        prob <- calProb(pOutlier, length(group))	
        pOutlier <- rep(pOutlier, length = length(outlierMech))
        pOutlier[!outlierMech == "S"] <- prob	
      }
      else if(!length(pOutlier) == length(outlierMech))
        stop("pOutlier is not equal to outlierMech")
      if(verbose) message("Adding outliers.\n")	
      dat[outlierMech] <- mapply(function(x, y) outlier.fun(dat, outlierMech = x, pOutlier = y, min.factor = min.factor, max.factor = max.factor), x = outlierMech, y = pOutlier, SIMPLIFY = FALSE)
      dat$pOutlier <- pOutlier
    }
    
    if(save.file)
    { 
      
      ## save file for shiny app ##
      if(verbose) message("Saving file.\n")
      if(is.null(file)) 
      { g <- paste0("g", sum(levels(group)[1] == group), "v", sum(levels(group)[2] == group))
      f <- paste0("f", foldDiff)
      if(add.outlier) o <- paste0("o", sprintf( "%02d",100*pOutlier[1L])) 
      else o <- paste0("o", sprintf( "%02d", 0 ))  
      file <- paste0(dat$name, "/", g, f, o, ".Rdata")
      dir.create(dat$name, showWarnings = FALSE)  
      }
      filenm <- eval(gsub("(\\.)(\\w+)", "", basename(file)))
      assign(filenm, dat)
      save(list = filenm, file = file)		
    }
    dat 	
  }

plotPWF2<-function (pwf, binsize = "auto", pwf_col = 3, pwf_lwd = 2, xlab = "Biased Data in <binsize> gene bins.", 
          ylab = "Proportion DE", ...) 
{
  w = !is.na(pwf$bias.data)
  print(w)
  o = order(pwf$bias.data[w])
  print(o)
  
  rang = max(pwf$pwf, na.rm = TRUE) - min(pwf$pwf, na.rm = TRUE)
  if (rang == 0 & binsize == "auto") 
    binsize = 1000
  if (binsize == "auto") {
    binsize = max(1, min(100, floor(sum(w) * 0.08)))
    resid = rang
    oldwarn = options()$warn
    options(warn = -1)
    while (binsize <= floor(sum(w) * 0.1) & resid/rang > 
           0.001) {
      binsize = binsize + 100
      splitter = ceiling(1:length(pwf$DEgenes[w][o])/binsize)
      de = sapply(split(pwf$DEgenes[w][o], splitter), mean)
      binlen = sapply(split(as.numeric(pwf$bias.data[w][o]), 
                            splitter), mean)
      resid = sum((de - approx(pwf$bias.data[w][o], pwf$pwf[w][o], 
                               binlen)$y)^2)/length(binlen)
    }
    options(warn = oldwarn)
  }
  else {
    splitter = ceiling(1:length(pwf$DEgenes[w][o])/binsize)
    print(splitter)
    de = sapply(split(pwf$DEgenes[w][o], splitter), mean)
    print(de)
    binlen = sapply(split(as.numeric(pwf$bias.data[w][o]), 
                          splitter), median)
    print(binlen)
  }
  xlab = gsub("<binsize>", as.character(binsize), xlab)
  if ("xlab" %in% names(list(...))) {
    if ("ylab" %in% names(list(...))) {
      plot(binlen, de, ...)
    }
    else {
      plot(binlen, de, ylab = ylab, ...)
    }
  }
  else if ("ylab" %in% names(list(...))) {
    plot(binlen, de, xlab = xlab, ...)
  }
  else {
    plot(binlen, de, xlab = xlab, ylab = ylab, ...)
  }
  lines(pwf$bias.data[w][o], pwf$pwf[w][o], col = pwf_col, 
        lwd = pwf_lwd)

 return(de)
  
}
SCCC-BBC/GOSJ documentation built on May 9, 2019, 11:07 a.m.