R/scrna_pipeline.R
In pipeComp: pipeComp pipeline benchmarking framework

Documented in scrna_pipeline

#' scrna_pipeline
#' 
#' The `PipelineDefinition` for the default scRNAseq clustering pipeline, with 
#' steps for doublet identification, filtering, normalization, feature 
#' selection, dimensionality reduction, and clustering.
#' Alternative arguments should be character, numeric or logical vectors of
#' length 1 (e.g. the function name for a method, the number of dimensions, 
#' etc). The default pipeline has the following steps and arguments:
#' \itemize{
#' \item doublet: `doubletmethod` (name of the doublet removal function)
#' \item filtering: `filt` (name of the filtering function, or filter string)
#' \item normalization: `norm` (name of the normalization function)
#' \item selection: `sel` (name of the selection function, or variable of 
#' rowData on which to select) and `selnb` (number of features to select)
#' \item dimreduction: `dr` (name of the dimensionality reduction function) and
#'  `maxdim` (maximum number of components to compute)
#' \item clustering: `clustmethod` (name of the clustering function), 
#' `dims` (number of dimensions to use), `k` (number of nearest neighbors to 
#' use, if applicable), `steps` (number of steps in the random walk, if 
#' applicable), `resolution` (resolution, if applicable), `min.size` (minimum 
#' cluster size, if applicable). If using the `scrna_alternatives.R` wrappers, 
#' the dimensionality can be automatically estimated by specifying 
#' `dims = "method_name"`. 
#' }
#' 
#' @param saveDimRed Logical; whether to save the dimensionality reduction for 
#' each analysis (default FALSE)
#' 
#' @param pipeClass `sce` or `seurat`; which object class to use throughout the
#' pipeline. Note that the `alternatives` functions have to be built around the
#' chosen class. Given that, if running the `scrna_alternatives`, the class of 
#' whole pipeline is determined by the output of the filtering, only this step 
#' is affected by this option. 
#' 
#' @return A `PipelineDefinition` object to be used with `runPipeline`.
#' 
#' @export
#' @examples
#' pip <- scrna_pipeline()
#' pip
scrna_pipeline <- function(saveDimRed=FALSE, pipeClass=c("seurat","sce")){
  pipeClass <- match.arg(pipeClass)
  
  # description for each step
  desc <- list( 
    doublet=
"Takes a SCE object with the `phenoid` colData column, passes it through the 
function `doubletmethod`, and outputs a filtered SCE.",
    filtering=
"Takes a SCE object, passes it through the function `filt`, and outputs a 
filtered Seurat object.",
    normalization=
paste("Passes the object through function `norm` and returns it with the",
ifelse(pipeClass=="sce","logcounts assay","normalized and scale data slots"),
"filled."),
    selection=
"Returns a Seurat object with the VariableFeatures filled with `selnb` features
 using the function `sel`.",
    dimreduction=
"Returns a Seurat object with the PCA reduction with up to `maxdim` components
using the `dr` function.",
    clustering=
"Uses function `clustmethod` to return a named vector of cell clusters." )
  
  
  if (pipeClass == "sce") desc <- lapply(desc, function(x){
    x <- gsub("Seurat", "SCE", x)
  })
  
  # we prepare the functions for each step
  # dimred intermediate return
  if(saveDimRed){
    if(pipeClass == "seurat") {
      DRfun <- function(x, dr, maxdim){ 
        x <- get(dr)(x, dims=maxdim)
        list( x=x, 
              intermediate_return=list(
                cell.embeddings=Embeddings(x[["pca"]]),
                evaluation=evaluateDimRed(x)
              )
            )
      }
    } else {
      DRfun <- function(x, dr, maxdim){ 
        x <- get(dr)(x, dims=maxdim)
        list( x=x, 
              intermediate_return=list(cell.embeddings=reducedDim(x, "PCA"),
                                       evaluation=evaluateDimRed(x)) )
      }
    }
    
  }else{
    DRfun <- function(x, dr, maxdim){ 
      get(dr)(x, dims=maxdim)
    }
  }
  # selection intermediate return
  if(pipeClass == "seurat") {
    selfun <- function(x, sel, selnb){                          
      x <- pipeComp:::.runf(sel, x, n=selnb, alt="applySelString")
      list( x=x, intermediate_return=Seurat::VariableFeatures(x) )
    }
    filtfun <- function(x, filt){
      x2 <- pipeComp:::.runf(filt, x, alt="applyFilterString")
      list( x=seWrap(x2), 
            intermediate_return=pipeComp:::.compileExcludedCells(x,x2) )
    }
  }else{
    selfun <- function(x, sel, selnb){                          
      x <- pipeComp:::.runf(sel, x, n=selnb, alt="applySelString")
      list( x=x, intermediate_return=metadata(x)$VariableFeats )
    }
    filtfun <- function(x, filt){
      x2 <- pipeComp:::.runf(filt, x, alt="applyFilterString")
      list(x=x2, intermediate_return=pipeComp:::.compileExcludedCells(x,x2))
    }
  }
  # functions list
  f <- list(
    doublet=function(x, doubletmethod){ 
      x2 <- pipeComp:::.runf(doubletmethod, x)
      list(x=x2, intermediate_return=pipeComp:::.compileExcludedCells(x,x2))
    },
    filtering=filtfun,
    normalization=function(x, norm){ 
      x <- get(norm)(x)
    },
    selection=selfun,
    dimreduction=DRfun,
    # # to try different dimensionality methods without having to recompute the
    # # reduction:
    # dimensionality=function(x, dims){
    #   if(!is.na(suppressWarnings(as.numeric(dims)))){
    #     dims <- as.integer(dims)
    #   }else{
    #     dims <- getDimensionality(x, dims)
    #   }
    #   x[["pca"]]@cell.embeddings<- x[["pca"]]@cell.embeddings[,seq_len(dims)]
    # },
    clustering=function(x, clustmethod, dims, k, steps, resolution, min.size){
      tl <- x$phenoid
      if(!is.na(suppressWarnings(as.numeric(dims)))){
        dims <- as.integer(dims)
      }else{
        dims <- getDimensionality(x, dims)
      }
      dims <- max(dims,2,na.rm=TRUE)
      x <- get(clustmethod)(x, dims=dims, resolution=resolution, k=k, 
                            steps=steps, min.size=min.size)
      attr(x, "true.labels") <- tl
      x
    }
  )
  
  eva <- list( doublet=NULL, filtering=NULL, normalization=evaluateNorm,
               selection=NULL, dimreduction=evaluateDimRed, 
               clustering=evaluateClustering )
  # functions to aggregate the intermediate_return of the different steps
  agg <- list( doublet=.aggregateExcludedCells,
               filtering=.aggregateExcludedCells,
               normalization=NULL,
               selection=NULL,
               dimreduction=.aggregateDR,
               clustering=.aggregateClusterEvaluation )
  # default arguments
  def <- list( selnb=2000, maxdim=50, dims=20, k=20, steps=8, min.size=50,
               resolution=c(0.01, 0.1, 0.5, 0.8, 1) )
  # initiation function
  initf <- function(x){
    if(is.character(x) && length(x)==1) return(readRDS(x))
    x
  }
  PipelineDefinition(functions=f, descriptions=desc, evaluation=eva,
                     aggregation=agg, initiation=initf, 
                     defaultArguments=def, verbose=FALSE)
}