R/PAC_filtsep.R

Defines functions PAC_filtsep

Documented in PAC_filtsep

#' Generates filtered sequence name list seperated on group  
#'
#' \code{PAC_filtsep} Group seperated filtered name list
#'
#' Given a PAC object the function will extract sequences within the
#' given filter within a given group.
#' 
#' @family PAC analysis
#'
#' @seealso \url{https://github.com/Danis102} for updates on the current
#'   package.
#'
#' @param PAC PAC-list object containing an Pheno data.frame with samples as row
#'   names and a Counts table with raw counts or normalized counts table
#'   containing for example counts per million (cpm; generated by
#'   \code{PAC_norm}).
#'   
#' @param threshold Integer giving the threshold in counts PAC$Counts or
#'   normalized counts (table in PAC$norm) that needs to be reached for a
#'   sequence to be included (default=10).
#'   
#' @param coverage Integer giving the percent of independent samples of each
#'   group that needs to reach the threshold for a sequence to be included
#'   (default=100).
#'   
#' @param norm Character specifying if filtering should be done using "counts",
#'   "cpm" or another normalized data table in PAC$norm (default="counts").
#'   
#' @param pheno_target (optional) List with: 
#'          1st object being a character vector of target column in Pheno, 
#'          2nd object being a character vector of the target group(s) 
#'          in the target Pheno column (1st object).
#'          (default=NULL)
#'          
#' @param output Specifies the output format. If output="sequence" (default),
#'   then a data.frame is returned  where each column contains the sequences
#'   names that passed the filter for a specific group specified in
#'   pheno_target. If output="binary", then the resulting data.frame will be
#'   converted into a binary (hit=1, no hit=0) data.frame. See
#'   \code{\link{filtsep_bin}}.
#'   
#' @return A data.frame (see output for details).
#'
#'   
#' @examples
#' 
#' load(system.file("extdata", "drosophila_sRNA_pac_filt_anno.Rdata", 
#'                   package = "seqpac", mustWork = TRUE))
#' 
#' ## Keep sequences with 5 counts (threshold) in 100% (coverage) of 
#' ## samples in a group:
#'  # Use PAC_filtsep to find sequences 
#'  filtsep <- PAC_filtsep(pac, norm="counts", threshold=5, 
#'                         coverage=100, pheno_target= list("stage"))
#'                         
#'  # Filter by unique sequences passing filtsep  
#'  filtsep <- unique(do.call("c", as.list(filtsep)))
#'  pac_filt <- PAC_filter(pac, subset_only = TRUE, anno_target= filtsep)
#'  
#'  # Find overlap
#'  olap <- reshape2::melt(filtsep, 
#'                         measure.vars = c("Stage1", "Stage3", "Stage5"), 
#'                         na.rm=TRUE)
#'                         
#' ## Upset plot using the UpSetR package
#'  # (when output="binary" PAC_filtsep uses filtsep_bin for binary conversion
#'  # Use PAC_filtsep with binary output
#'  filtsep_bin <- PAC_filtsep(pac, norm="counts", threshold=5, 
#'                             coverage=100, pheno_target= list("stage"), 
#'                             output="binary")
#'  
#' # Plot Wenn diagram or UpSetR
#' #
#' # plot(venneuler::venneuler(data.frame(olap[,2], olap[,1]))) 
#' #
#' # UpSetR::upset(filtsep_bin, sets = colnames(filtsep_bin), 
#' #              mb.ratio = c(0.55, 0.45), order.by = "freq", keep.order=TRUE)
#'            
#' 
#'  
#' @export

PAC_filtsep <- function(PAC, norm="counts", threshold=10, coverage=100, 
                        pheno_target=NULL, output="sequence"){
  
  ## Check S4
  if(isS4(PAC)){
    tp <- "S4"
    PAC <- as(PAC, "list")
  }else{
    tp <- "S3"
  }
  ## Extract data
  if(norm=="counts"){
    data <- PAC$Counts
  }else{  
    if(is.null(PAC$norm[[norm]])){
      stop("\nThere is no object called '", 
            norm, "' in the norm list.",
            "\n  (Hint: Did you forget to normalize the data using for",
            "\n   example PAC_norm, or would you rather run the function",
            "\n   on raw counts using norm='counts'?)")
   }  
    data <- PAC$norm[[norm]]
  }
  
  ## Subset dataset
  pheno <- PAC$Pheno
  pheno$All <- "All"
  if(is.null(pheno_target)){
    warning(
    "No grouping factor was specified with pheno_target.",
    "\nCalculations are based on all samples.")
    pheno_target <- list("All","All")
  }else{
    if(length(pheno_target)==1){
      pheno_target[[2]] <- unique(pheno[, pheno_target[[1]]])
    }else{
      if(is.null(pheno_target[[2]])){
        pheno_target[[2]] <- unique(pheno[, pheno_target[[1]]])
      }}}
  indx <- pheno[, pheno_target[[1]]] %in% pheno_target[[2]]
  pheno <- pheno[indx,]
  data <- data[,indx]
  stopifnot(identical(rownames(pheno), colnames(data)))
  ## Extract filtered anno sequence names    
  start_lst <- as.list(pheno_target[[2]])
  names(start_lst) <- pheno_target[[2]]
  sub_data_lst <- lapply(start_lst, function(x){ 
    y <- data[, pheno[, pheno_target[[1]]] == x]
  logi <- data.frame(rowSums(y >= threshold)) >= round(ncol(y)*(coverage*0.01))
  return(rownames(y)[logi])
  })
  nmax <- max(unlist(lapply(sub_data_lst, length)))
  df <- do.call("cbind", lapply(sub_data_lst, function(x){
    fix <- as.character(c(x, rep(NA, times=nmax-length(x))))
    return(fix)}
  ))
  df <- as.data.frame(df, stringsAsFactors=FALSE)
  if(output=="binary"){
    return(filtsep_bin(df))
  }else{
    return(df)
  }
}
Danis102/seqpac documentation built on Aug. 26, 2023, 10:15 a.m.