R/GEfilt.R

Defines functions GEfilt

Documented in GEfilt

#' @title GEfilt
#'
#' @description
#' \itemize{
#'    \item Filter genes according to cpm threshold value and replicated cpm option value.
#'    \item Plot different graphes to explore data before and after filtering.
#' }
#'
#' @param data_list, list contain all data and metadata (DGEList, samples descritions, contrast, design and annotations)
#' @param parameters, list that contains all arguments charged in Asko_start
#' @return filtered_counts, large DGEList with filtered counts and data descriptions.
#'
#' @examples
#' \dontrun{
#'     filtered_counts<-GEfilt(data, parameters)
#' }
#'
#' @export
GEfilt <- function(data_list, parameters){
  study_dir = paste0(parameters$dir_path,"/", parameters$analysis_name, "/")
  image_dir = paste0(study_dir, "images/")

  # plot density before filtering
  #---------------------------------
  cpm<-edgeR::cpm(data_list$dge)
  logcpm<-edgeR::cpm(data_list$dge, log=TRUE)
  colnames(logcpm)<-rownames(data_list$dge$samples)
  nsamples <- ncol(data_list$dge$counts)

  maxi<-c()
  for (i in seq(nsamples)){
    m=max(stats::density(logcpm[,i])$y)
    maxi<-c(maxi,m)
  }
  ymax<-round(max(maxi),1) + 0.02

  sizeImg=15*nsamples
  if(sizeImg < 480){ sizeImg=480 }
  btm=round((nsamples/6),0)+0.5

  grDevices::png(paste0(image_dir, parameters$analysis_name, "_raw_data.png"), width=1024, height=1024)
  graphics::par(oma=c(2,2,2,0), mar=c(parameters$densbotmar,5,5,5))
  plot(stats::density(logcpm[,1]),
       col=as.character(data_list$dge$samples$color[1]),
       lwd=1,
       las=2,
       ylim=c(0,ymax),
       main="A. Raw data",
       xlab="Log-cpm")
  graphics::abline(v=0, lty=3)
  for (i in 2:nsamples){
    den<-stats::density(logcpm[,i])
    graphics::lines(den$x, col=as.character(data_list$dge$samples$color[i]), den$y, lwd=1)
  }

  graphics::legend("bottom", fill=data_list$dge$samples$color, bty="n", ncol=parameters$legendcol,
         legend=rownames(data_list$dge$samples), xpd=TRUE, inset=-parameters$densinset)
  grDevices::dev.off()

  # plot density after filtering
  #---------------------------------
  keep.exprs <- rowSums(cpm>parameters$threshold_cpm)>=parameters$replicate_cpm
  filtered_counts <- data_list$dge[keep.exprs,,keep.lib.sizes=FALSE]
  filtered_cpm<-edgeR::cpm(filtered_counts$counts, log=TRUE)

  maxi<-c()
  for (i in seq(nsamples)){
    m=max(stats::density(filtered_cpm[,i])$y)
    maxi<-c(maxi,m)
  }
  ymax<-round(max(maxi),1) + 0.02

  grDevices::png(paste0(image_dir,parameters$analysis_name,"_filtered_data.png"), width=1024, height=1024)
  graphics::par(oma=c(2,2,2,0), mar=c(parameters$densbotmar,5,5,5))
  plot(stats::density(filtered_cpm[,1]),
       col=as.character(data_list$dge$samples$color[1]),
       lwd=1,
       ylim=c(0,ymax),
       las=2,
       main="B. Filtered data",
       xlab="Log-cpm")
  graphics::abline(v=0, lty=3)
  for (i in 2:nsamples){
    den <- stats::density(filtered_cpm[,i])
    graphics::lines(den$x,col=as.character(data_list$dge$samples$color[i]), den$y, lwd=1)
  }
  graphics::legend("bottom", fill=data_list$dge$samples$color, bty="n", ncol=parameters$legendcol,
         legend=rownames(data_list$dge$samples), xpd=TRUE, inset=-parameters$densinset)
  grDevices::dev.off()

  # histogram cpm values distribution before filtering
  #------------------------------------------------------
  grDevices::png(paste0(image_dir,parameters$analysis_name,"_barplot_logcpm_before_filtering.png"), width=sizeImg, height=sizeImg)
  graphics::hist(logcpm,
       main= "A. Log2(cpm) distribution before filtering",
       xlab = "log2(cpm)",
       col = "grey")
  grDevices::dev.off()

  # histogram cpm values distribution after filtering
  #------------------------------------------------------
  grDevices::png(paste0(image_dir,parameters$analysis_name,"_barplot_logcpm_after_filtering.png"), width=sizeImg, height=sizeImg)
  graphics::hist(filtered_cpm,
       main= "B. Log2(cpm) distribution after filtering",
       xlab = "log2(cpm)",
       col = "grey")
  grDevices::dev.off()

  # boxplot cpm values distribution before filtering
  #------------------------------------------------------
  grDevices::png(paste0(image_dir,parameters$analysis_name,"_boxplot_logcpm_before_filtering.png"), width=sizeImg, height=sizeImg)
  graphics::par(oma=c(1,1,1,1))
  graphics::boxplot(logcpm,
          col=data_list$dge$samples$color,
          main="A. Log2(cpm) distribution before filtering",
          cex.axis=0.8,
          las=2,
          ylab="log2(cpm)")
  grDevices::dev.off()

  # boxplot cpm values distribution after filtering
  #------------------------------------------------------
  grDevices::png(paste0(image_dir,parameters$analysis_name,"_boxplot_logcpm_after_filtering.png"), width=sizeImg, height=sizeImg)
  graphics::par(oma=c(1,1,1,1))
  graphics::boxplot(filtered_cpm,
          col=data_list$dge$samples$color,
          main="B. Log2(cpm) distribution after filtering",
          cex.axis=0.8,
          las=2,
          ylab="log2(cpm)")
  grDevices::dev.off()

  return(filtered_counts)
}
askomics/askoR documentation built on Feb. 4, 2023, 5 a.m.