R/geneLengthEffect.r

Defines functions geneLengthEffect

Documented in geneLengthEffect

#' Generate some plots to investigate the gene length effect
#'
#' Generate some plots to investigate the gene length effect
#'
#' @param counts matrix of counts
#' @param complete list of results from \code{exportComplete.DESeq2()} or \code{exportComplete.edgeR()}
#' @param glength \code{data.frame} of 2 columns: features's id and length
#' @param out \code{TRUE} to export the figure
#' @param versionName versionName of the project
#' @author Hugo Varet

# created January 18th, 2017
# modified February 10th, 2017
# modified August 26th, 2019 (ggplot2)

geneLengthEffect <- function(counts, complete, glength, out=TRUE, versionName="."){
  if (nrow(counts) != nrow(glength) || any(sort(rownames(counts)) != sort(glength[,1]))){
    stop("Different features in the counts matrix and in the glength table, please check them.")
  }
  names(glength) <- c("Id", "length")
  
  # effect on the statistical tests
  if (out) pdf(paste0("figures/", versionName, "-geneLengthEffectPvalue.pdf"), width=7, height=7)
  for (name in names(complete)){
    complete.name <- complete[[name]][,c("Id", "pvalue")]
    complete.name <- merge(complete.name, glength, by="Id")
    complete.name$length_class <- cut(complete.name$length,
                                      breaks=quantile(complete.name$length, probs=seq(0, 1, 0.10), na.rm=TRUE),
                                      include.lowest=TRUE, labels=1:10)
    complete.name <- complete.name[which(!is.na(complete.name$pvalue)),]
    reverselog_trans <- function(base = exp(1)) {
      trans <- function(x) -log(x, base)
      inv <- function(x) base^(-x)
      trans_new(paste0("reverselog-", format(base)), trans, inv,
                log_breaks(base = base),
                domain = c(.Machine$double.xmin, Inf))
    }
    print(ggplot(complete.name, aes(x=.data$length_class, y=.data$pvalue, fill=.data$length_class)) + 
            geom_boxplot(show.legend=FALSE) +
            xlab("Gene length (by decile)") +
            ylab("P-value") +
            ggtitle(paste0(versionName, " - gene length effect on raw P values ", name)))
  }
  if (out) dev.off()
  
  # effect on the number of reads per gene
  counts <- data.frame(Id=rownames(counts), counts+1)
  counts <- merge(glength, counts, by="Id", sort=FALSE, all=FALSE)
  # plot gene length vs counts
  if (out) pdf(paste0("figures/", versionName, "-geneLengthEffectCounts.pdf"), width=7, height=7)
  for (i in 3:ncol(counts)){
    print(ggplot(data=counts, aes_string(x="length", y=names(counts)[i])) + 
            geom_point(show.legend = FALSE, size=0.2, alpha=0.2) +
            scale_x_continuous(trans = log10_trans(),
                               breaks = trans_breaks("log10", function(x) 10^x),
                               labels = trans_format("log10", math_format(~10^.x))) +
            scale_y_continuous(trans = log10_trans(),
                               breaks = trans_breaks("log10", function(x) 10^x),
                               labels = trans_format("log10", math_format(~10^.x))) +
            xlab("Gene length") +
            ylab("Raw counts") +
            ggtitle(paste0(versionName, " - sample ", names(counts)[i])))
  }
  if (out) dev.off()
  
  # export table
  write.table(counts[order(counts$length),], file=paste0("tables/", versionName, ".geneLengthEffect.xls"), sep="\t", col.names=TRUE, row.names=FALSE, quote=FALSE)
}
biomics-pasteur-fr/RNADiff documentation built on Aug. 27, 2020, 12:44 a.m.