R/noiseqbio.R
In NOISeq: Exploratory analysis and differential expression for RNA-seq data

################################################################################################
################################################################################################


####### NOISeqBIO
#################


noiseqbio = function (input, k = 0.5, norm = c("rpkm","uqua","tmm","n"), nclust = 15, plot = FALSE,
                      factor=NULL, conditions = NULL, lc = 0, r = 50, adj = 1.5,
                      a0per = 0.9, random.seed = 12345, filter = 1, depth = NULL,
                      cv.cutoff = 500, cpm = 1) 
  
  # input: Object containing gene counts and as many columns as samples.
  
  # k:      When counts = 0, 0 will be changed to k. By default, k = 0.5.
  
  # norm:   Normalization method. It can be one of "rpkm" (default), "uqua"
  #         (upper quartile), "tmm" (trimmed mean of M) or "n" (no normalization).
  
  # factor: String with the factor to choose which conditions you are going
  #         to compare.
  
# conditions: String with the conditions to compare in case one factor contains more
#         than 2 conditions.

# lc:     Length correction in done by dividing expression by length^lc.
#         By default, lc = 0. 


# r:      Number of permutations to compute null distribution (r=10).
# a0per:  Percentile of S to compute a0. If NULL, a0 = 0. (a0per = 0.9)




{
  
  if (inherits(input,"eSet") == FALSE)
    stop("ERROR: You must give an object generated by the readData function\n")
  
  if (is.null(factor))
    stop("ERROR: You must specify the factor to know which conditions you wish to compare.
         Please, give the argument 'factor'.\n")
  
  if (min(table(input@phenoData@data[,factor])) < 2)
    stop("ERROR: To run NOISeqBIO at least two replicates per condition are needed.
         Please, run NOISeq if there are not enough replicates in your experiment.\n")  
  
  norm <- match.arg(norm)
  
  
  # Random seed
  if (!is.null(random.seed)) set.seed(random.seed)  


  
  # Z-scores for signal and noise
  cat("Computing Z values...\n")
  miMD <- allMDbio(input, factor, k = k, norm = norm, conditions = conditions, lc = lc, 
                   r = r, a0per = a0per, nclust = nclust, filter = filter, depth = depth,
                   cv.cutoff = cv.cutoff, cpm = cpm)
  
  
  #------------------------------------------------------------------#
  
  
  
  ##### Probability of differential expression
  
  cat("Computing probability of differential expression...\n")
  
  
  ## KDE estimators of f0 and f  
  desde <- min(c(miMD$Zs,miMD$Zn), na.rm = TRUE)
  hasta <- max(c(miMD$Zs,miMD$Zn), na.rm = TRUE)
  
  fdens <- density(miMD$Zs, adjust = adj, n = 5000, from = desde, to = hasta, na.rm = TRUE)
  f <- approxfun(fdens)
  
  f0dens <- density(miMD$Zn, adjust = adj, n = 5000, from = desde, to = hasta, na.rm = TRUE)
  f0 <- approxfun(f0dens)
  
  if (f0(0)/f(0) < 1) print("WARNING: f0(0)/f(0) < 1 => FP with Z~0 will be detected.")
  
  f0.f <- f0(miMD$Zs) / f(miMD$Zs)
  
  
  ## f0, f plot
  if (plot) {
    plot(f0dens, lwd = 2, col = 4, main = paste("r=", r, "; a0per=", a0per, sep = ""), xlab = "theta")
    lines(fdens, lwd = 2, col = 1)
    legend("topleft", c("f","f0"), col = c(1,4), lwd = 2)
  }
    
  
  ## ESTIMATION of p0
  p0 <- min(1/f0.f, na.rm = TRUE)
  p0 = min(p0,1)
  
  cat(paste("p0 =", p0)); cat("\n")
    
  
  ## PROBABILITY of DIFFERENTIAL EXPRESSION
  myprob <- 1 - p0*f0.f
  if(min(myprob, na.rm = TRUE) < 0) {
    print(summary(f0.f))
  } 
  names(myprob) <- names(miMD$Zs)
  
  cat("Probability\n")
  print(summary(myprob))
       
  
  if (!is.null(assayData(input)$exprs))
    todos <- rownames(as.matrix(assayData(input)$exprs))
  else
    todos <- rownames(as.matrix(assayData(input)$counts))
  
  myprob <- myprob[todos]
  names(myprob) <- todos
  
  ## Results
  resultat <- data.frame("level_1" = miMD$Level1, "level_2" = miMD$Level2, "theta" = miMD$Zs, "prob" = myprob,
                         "log2FC" = log2(miMD$Level1/miMD$Level2))
  rownames(resultat) <- todos
  
  colnames(resultat)[1] <- paste(unlist(strsplit(miMD$comp," "))[1],"mean",sep="_")
  colnames(resultat)[2] <- paste(unlist(strsplit(miMD$comp," "))[3],"mean",sep="_")
  
  #   resultat <- data.frame(resultat, "ranking" = ranking(resultat)$statistic)
  if (!is.null(featureData(input)@data$Length))
    resultat <- data.frame(resultat, "length" = as.numeric(as.character(featureData(input)@data[todos,"Length"])),
                           stringsAsFactors = FALSE)
  
  if (!is.null(featureData(input)@data$GC))
    resultat <- data.frame(resultat, "GC" = as.numeric(as.character(featureData(input)@data[todos,"GC"])),
                           stringsAsFactors = FALSE)
  
  if (!is.null(featureData(input)@data$Chromosome))
    resultat <- data.frame(resultat, "Chrom" = (as.character(featureData(input)@data$Chromosome)),
                           "GeneStart" = as.numeric(as.character(featureData(input)@data$GeneStart)),
                           "GeneEnd" = as.numeric(as.character(featureData(input)@data$GeneEnd)),
                           stringsAsFactors = FALSE)
  
  if (!is.null(featureData(input)@data$Biotype))
    resultat <- data.frame(resultat, "Biotype" = as.character(featureData(input)@data[todos,"Biotype"]),
                           stringsAsFactors = FALSE)
      
  Output(data = list(resultat), method=norm, k=miMD$k, lc=lc, factor=factor, comparison=miMD$comp,
         replicates="biological", v = 0, nss = 0, pnr = 0)
}