R/majSequences.R

Defines functions majSequences

Documented in majSequences

#' Proportion of reads associated with the three most expressed sequences per sample
#'
#' Proportion of reads associated with the three most expressed sequences per sample
#'
#' @param counts matrix of counts
#' @param n number of most expressed sequences to return
#' @param group vector of the condition from which each sample belongs
#' @param out \code{TRUE} to export the figure
#' @param col colors of the bars
#' @param versionName versionName of the project
#' @return A \code{data.frame} with the percentage of reads of the three most expressed sequences
#' @author Marie-Agnes Dillies and Hugo Varet

# created Oct 29th, 2013
# modified Feb 17th, 2014 (added versionName and tabDir to export the table)
# modified July 30th, 2014 (added the barplot from majSequence())
# modified Aug 5th, 2014 (removed graphDir and tabDir arguments)
# modified Oct 19th, 2017 (percentage instead of proportion)
# modified August 26th, 2019 (ggplot2)

majSequences <- function(counts, n=3, group, out=TRUE, col=c("lightblue","orange","MediumVioletRed","SpringGreen"), versionName="."){
  
  seqnames <- apply(counts, 2, function(x){x <- sort(x, decreasing=TRUE); names(x)[1:n]})
  seqnames <- unique(unlist(as.character(seqnames)))
  sum <- apply(counts,2,sum)
  counts <- counts[seqnames,]
  sum <- matrix(sum, nrow(counts), ncol(counts), byrow=TRUE)
  p <- round(100*counts/sum,digits=3)
  write.table(p, file=paste0("tables/", versionName, ".majSequences.xls"), 
              row.names=TRUE, col.names=NA, sep="\t", quote=FALSE)
  
  maj <- apply(p, 2, max)
  seqname <- rownames(p)[apply(p, 2, which.max)]
  
  if (out) pdf(file=paste0("figures/", versionName, "-majSeq.pdf"), width=min(14,7+3*ncol(counts)/10), height=7)
  group <- data.frame(group=apply(group, 1, paste, collapse="-"))
  group$group <- factor(group$group, levels=unique(group$group))
  d <- data.frame(maj=maj, sample=factor(names(maj), levels=names(maj)), group, seqname=seqname)
  print(ggplot(d, aes(x=.data$sample, y=.data$maj, fill=.data$group)) +
          geom_bar(stat="identity", show.legend=TRUE) +
          labs(fill="") +
          scale_fill_manual(values=col) +
          xlab("Samples") + 
          ylab("Percentage of reads") +
          scale_y_continuous(expand=expansion(mult=c(0.01, 0.05))) +
          ggtitle(paste(versionName, "Percentage of reads from\nmost expressed sequence", sep=" - ")) +
          theme(axis.text.x=element_text(angle=90, hjust=1, vjust=0.5)) +
          geom_text(aes(y=0.8*maj, label=seqname), color="black", size=2.5, angle=90, fontface="bold"))
  if (out) dev.off()
  
  return(p)
}
biomics-pasteur-fr/RNADiff documentation built on Aug. 27, 2020, 12:44 a.m.