R/methods-CellTypeHierarchy.R

Defines functions cth_classifier_cds classifyCellsHelperCds cth_classifier_cell classifyCellsHelperCell newCellTypeHierarchy addCellType classifyCells calculateMarkerSpecificity selectTopMarkers

Documented in addCellType calculateMarkerSpecificity classifyCells newCellTypeHierarchy selectTopMarkers

#' @import methods
#' @importFrom Biobase exprs pData
#' @importFrom igraph V
cth_classifier_cds <- function(cds_subset, cth, curr_node, frequency_thresh) {
  #curr_cell_vertex <-  V(cth@classificationTree)[curr_node]
  next_nodes <- c()
  #print (unique(pData(cds_subset)$Cluster))
  for (child in V(cth@classificationTree) [ suppressWarnings(nei(curr_node, mode="out")) ]){
    
    child_cell_class_func <- V(cth@classificationTree) [ child ]$classify_func[[1]]
    #type_res <- sparseApply(exprs(cds_subset), 2, child_cell_class_func, convert_to_dense=FALSE)
    type_res <- child_cell_class_func(exprs(cds_subset))
    #print(type_res)
    type_res <- unlist(type_res)
    
    names(type_res) <- row.names(pData(cds_subset))
    cell_type_name <- V(cth@classificationTree) [ child ]$name
    if (length(frequency_thresh) > 1)
      required_thresh <- frequency_thresh[cell_type_name]
    else
      required_thresh <- frequency_thresh
    if ((sum(type_res) / length(type_res)) > frequency_thresh){
      next_nodes <- c(next_nodes, cell_type_name)
    }
    #print (paste(V(cth@classificationTree) [ child ]$name, ":", sum(type_res),  " of ", length(type_res) ))
  }
  
  if (length(next_nodes) == 1){
    CellType <- cth_classifier_cds(cds_subset, cth, next_nodes[1], frequency_thresh)
  }else if(length(next_nodes) == 0){
    if (curr_node == "root")
      CellType = "Unknown"
    else
      CellType = curr_node
  }else if(length(next_nodes) > 1){
    CellType = "Ambiguous"
  }else{
    CellType = "Unknown"
  }
  return (CellType)
}

classifyCellsHelperCds <- function(cds_subset, cth, frequency_thresh){
  CellType <- cth_classifier_cds(cds_subset, cth, "root", frequency_thresh)
}

#' @importFrom igraph V
cth_classifier_cell <- function(cell_name, cth, curr_node, gate_res) {
  next_nodes <- c()
  for (child in V(cth@classificationTree) [ suppressWarnings(nei(curr_node, mode="out")) ]){
    type_res <- gate_res[[V(cth@classificationTree) [ child ]$name]]
    #print (class(type_res[cell_name]))
    #print (cell_name)
    if (type_res[cell_name] == TRUE)
      next_nodes <- c(next_nodes, V(cth@classificationTree) [ child ]$name)
  }
  
  if (length(next_nodes) == 1){
    CellType <- cth_classifier_cell(cell_name, cth, next_nodes[1], gate_res)
  }else if(length(next_nodes) == 0){
    if (curr_node == "root")
      CellType = "Unknown"
    else
      CellType = curr_node
  }else if(length(next_nodes) > 1){
    CellType = "Ambiguous"
  }else{
    CellType = "Unknown"
  }
  return (CellType)
}

#' @importFrom Biobase exprs pData
#' @importFrom igraph V
#' @importFrom dplyr %>%
classifyCellsHelperCell <- function(cds, cth){
  #next_node_list <- rep(list(), ncol(cds)) 
  
  gate_res <- list()
  for (v in V(cth@classificationTree)){
    cell_class_func <- V(cth@classificationTree) [ v ]$classify_func[[1]]
    type_res <- cell_class_func(exprs(cds))
    gate_res[[ V(cth@classificationTree) [ v ]$name]] <- type_res
  }
  cds_pdata <- dplyr::group_by_(dplyr::select_(rownames_to_column(pData(cds)), "rowname"), "rowname") 
  class_df <- as.data.frame(cds_pdata %>% do(CellType = cth_classifier_cell(.$rowname, cth, "root", gate_res)))
  CellType <- factor(unlist(class_df$CellType))
  names(CellType) <- class_df$rowname
  #CellType <- cth_classifier_cell(cds_subset, cth, "root", gate_res)
  return(CellType)
}

#' @title Classify cells according to a set of markers
#' 
#' @description Creates a CellTypeHierarchy object which can store
#' cell types with the addCellType() function. When classifyCells
#' is used with a CellDataSet and a CellTypeHierarchy cells in the 
#' CellDataSet can be classified as cell types found in the CellTypeHierarchy
#' 
#' @details CellTypeHierarchy objects are Monocle's mechanism for
#'   classifying cells into types based on known markers. To classify the cells
#'   in a CellDataSet object according to known markers, first construct a
#'   CellTypeHierachy with \code{newCellTypeHierarchy()} and 
#'   \code{addCellType()} and then provide both the \code{CellDataSet}
#'   and the \code{CellTypeHierachy} to \code{classifyCells()}. Each
#'   call to \code{addCellType()} registers a classification function
#'   that accepts the expression data from a CellDataSet object as input, and
#'   returns a boolean vector indicating whether each cell is of the given type.
#'   When you call \code{classifyCells()}, each cell will be checked against the classification functions in the
#'   \code{CellTypeHierachy}.  If you wish to make a cell type a subtype of
#'   another that's already been registered with a CellTypeHierarchy object,
#'   make that one the "parent" type with the \code{cell_type_name} argument. If
#'   you want two types to be mutually exclusive, make them "siblings" by giving
#'   them the same parent. The classifcation functions in a CellTypeHierarchy must take a single argument, a matrix of
#'   expression values, as input. Note that this matrix could either be a 
#'   \code{\link[Matrix]{sparseMatrix}} or a dense matrix. Explicitly casting the input to a dense
#'   matrix inside a classification function is likely to drastically slow down 
#'   classifyCells and other routines that use CellTypeHierarhcy objects.
#'   Successive calls to \code{addCellType} build up a tree of classification
#'   functions inside a CellTypeHierarchy. When two functions are siblings in 
#'   the tree, classifyCells expects that a cell will meet the classification
#'   criteria for at most one of them. For example, you might place 
#'   classification functions for T cells and B cells as siblings, because
#'   a cell cannot be both of these at the same time. When a cell meets the 
#'   criteria for more than one function, it will be tagged as "Ambiguous". If
#'   \code{classifyCells} reports a large number of ambiguous cells, consider
#'   adjusting your classification functions. For example, some cells are 
#'   defined by very high expression of a key gene that is expressed at lower
#'   levels in other cell types. Raising the threshold for this gene in a 
#'   classification could resolve the ambiguities. A classification function
#'   can also have child functions. You can use this to specify subtypes of 
#'   cells. For example, T cells express the gene CD3, and there are many
#'   subtypes. You can encode each subset by first adding a general T cell
#'   classification function that recognizes CD3, and then adding an additional
#'   function that recognizes CD4 (for CD4+ helper T cells), one for CD8 (to
#'   identify CD8+ cytotoxic T cells), and so on. \code{classifyCells} will
#'   aim to assign each cell to its most specific subtype in the "CellType" 
#'   column. By default, \code{classifyCells} applies the classification functions to
#'   individual cells, but you can also apply it to cells in a "grouped" mode to 
#'   impute the type of cells that are missing expression of your known markers.
#'   You can specify additional (quoted) grouping variables to \code{classifyCells}.
#'   The function will group the cells according to these factors, and then 
#'   classify the cells. It will compute the frequency of each cell type in each
#'   group, and if a cell type is present at the frquency specified in 
#'   \code{frequency_thresh}, all the cells in the group are classified as that 
#'   type. If group contains more one cell type at this frequency, all the cells
#'   are marked "Ambiguous". This allows you to impute cell type based on 
#'   unsupervised clustering results (e.g. with \code{\link{clusterCells}()}) or
#'   some other grouping criteria.
#' 
#' 
#' @return \code{newCellTypeHierarchy} and \code{addCellType} both return an 
#'   updated CellTypeHierarchy object. \code{classifyCells} returns an updated 
#'   \code{CellDataSet} with a new column, "CellType", in the pData table.
#'   
#' @importFrom igraph vertex graph.empty
#'   
#' @export
newCellTypeHierarchy <- function()
{
  cth <- new( "CellTypeHierarchy",
              classificationTree = graph.empty())
  
  root_node_id <- "root"
  
  cth@classificationTree <- cth@classificationTree + vertex(root_node_id, classify_func=list(function(x) TRUE))
  #cth@classificationTree %>% add_vertices(1, name = root_node_id, "classify_func"=list(function(x) TRUE))
  return(cth)
}

#' Add a new cell type
#' @description adds a cell type to a pre-existing CellTypeHierarchy and produces a function that accepts
#' expression data from a CellDataSet. When the function is called on a CellDataSet a boolean vector is returned
#' that indicates whether each cell is or is not the cell type that was added by addCellType.
#' @param cth The CellTypeHierarchy object 
#' @param cell_type_name The name of the new cell type. Can't already exist in
#'   cth
#' @param classify_func A function that returns true when a cell is of the new
#'   type
#' @param parent_cell_type_name If this cell type is a subtype of another,
#'   provide its name here
#'   
#' @importFrom igraph V edge
#'   
#' @export
addCellType <- function(cth, cell_type_name, classify_func, parent_cell_type_name="root") 
{
  if (cell_type_name %in% V(cth@classificationTree)$name){
    stop(paste("Error: cell type",cell_type_name, "already exists."))
  }
  
  # TODO: verify that classify_func has the right signature/call semantics?
  cth@classificationTree <- cth@classificationTree + vertex(cell_type_name, classify_func=list(classify_func))
  
  cth@classificationTree <- cth@classificationTree + edge(parent_cell_type_name, cell_type_name)
  return (cth)
}

#' @title Classify cells according to a set of markers
#' 
#' @description classifyCells accepts a cellDataSet and and a cellTypeHierarchy.
#' Each cell in the cellDataSet is checked against the functions in the cellTypeHierarchy
#' to determine each cell's type
#' 
#' @describeIn newCellTypeHierarchy Add a cell type to a CellTypeHierarchy
#' @param cds The CelllDataSet you want to classify
#' @param ... character strings that you wish to pass to dplyr's group_by_ routine
#' @param enrichment_thresh fraction to be multipled by each cell type percentage. Only used if frequency_thresh is NULL, both cannot be NULL
#' @param frequency_thresh If at least this fraction of group of cells meet a cell types marker criteria, impute them all to be of that type.  
#' @importFrom dplyr select_ do group_by_ inner_join %>%
#' @importFrom tibble rownames_to_column
#' @importFrom Biobase pData pData<-
#' @export 
#' @examples
#' \dontrun{
#' # Initialize a new CellTypeHierachy
#' 
#' # Register a set of classification functions. There are multiple types of T cells
#' # A cell cannot be both a B cell and a T cell, a T cell and a Monocyte, or
#' # a B cell and a Monocyte.
#' cth <- newCellTypeHierarchy()
#' 
#' cth <- addCellType(cth, "T cell", 
#'                    classify_func=function(x) {x["CD3D",] > 0})
#'                    
#' cth <- addCellType(cth, "CD4+ T cell", 
#'                    classify_func=function(x) {x["CD4",] > 0}, 
#'                    parent_cell_type_name = "T cell")
#'                    
#' cth <- addCellType(cth, "CD8+ T cell", 
#'                    classify_func=function(x) {
#'                      x["CD8A",] > 0 | x["CD8B",] > 0
#'                    }, 
#'                    parent_cell_type_name = "T cell")
#'                    
#' cth <- addCellType(cth, "B cell", 
#'                    classify_func=function(x) {x["MS4A1",] > 0})
#'                    
#' cth <- addCellType(cth, "Monocyte", 
#'                    classify_func=function(x) {x["CD14",] > 0})
#' 
#' # Classify each cell in the CellDataSet "mix" according to these types
#' mix <- classifyCells(mix, cth)
#'
#' # Group the cells by the pData table column "Cluster". Apply the classification
#' functions to the cells groupwise. If a group is at least 5% of a type, make
#' them all that type. If the group is 5% one type, and 5% a different, mutually
#' exclusive type, mark the whole cluster "Ambiguous"
#' mix <- classifyCells(mix, Cluster, 0.05)
#' }
#' 
classifyCells <- function(cds, cth, frequency_thresh=NULL, enrichment_thresh=NULL, ...) {
  progress_opts <- options()$dplyr.show_progress
  options(dplyr.show_progress = F)
  if (length(list(...)) > 0){
    if (is.null(enrichment_thresh) && is.null(frequency_thresh))
      stop("Error: to use classifyCells in grouped mode, you must also set frequency_thresh")
    
    cds <- classifyCells(cds, cth)
    if (is.null(frequency_thresh)){
      frequency_thresholds <- prop.table(table(pData(cds)$CellType))
      frequency_thresholds <- frequency_thresholds * enrichment_thresh
      frequency_thresholds <- unlist(lapply(frequency_thresholds, min, 1.0))
    }else
      frequency_thresholds <- frequency_thresh

    cds_pdata <- dplyr::group_by_(dplyr::select_(rownames_to_column(pData(cds)), "rowname", ...), ...) 
    class_df <- as.data.frame(cds_pdata %>% dplyr::do(CellType = classifyCellsHelperCds(cds[,.$rowname], cth, frequency_thresh)))
    class_df$CellType <-  as.character(unlist(class_df$CellType))
    #class_df$rowname <- as.character(class_df$rowname)
  }else{
    type_res <- classifyCellsHelperCell(cds, cth)
    class_df <- data.frame(rowname = names(type_res), CellType = type_res)
    class_df$CellType <- as.character(class_df$CellType)
    class_df$rowname <- as.character(class_df$rowname)
  }
  
  options(dplyr.show_progress = progress_opts)
  
  pData(cds) <- pData(cds)[!(names(pData(cds)) %in% "CellType")]
  
  #pData(cds)$cell_type <- cds_types
  
  
  pData(cds) <- as.data.frame(suppressMessages(inner_join(rownames_to_column(pData(cds)), class_df)))
  
  pData(cds)$CellType <- factor(pData(cds)$CellType)
  
  row.names(pData(cds)) <- pData(cds)$rowname
  pData(cds) <- pData(cds)[,-1]
  cds
}   

#' @describeIn newCellTypeHierarchy Calculate each gene's specificity for each cell type 
#' 
#' Computes the Jensen-Shannon distance between the distribution of a gene's 
#' expression across cells and a hypothetical gene that is perfectly restricted
#' to each cell type. The Jensen-Shannon distance is an information theoretic
#' metric between two probability distributions. It is a widely accepted measure
#' of cell-type specificity. For a complete description see Cabili \emph{et. al},
#' Genes & Development (2011). 
#' 
#' @param cth CellTypeHierarchy
#' @param remove_ambig a boolean that determines if ambiguous cells should be removed
#' @param remove_unknown a boolean that determines whether unknown cells should be removed
#' @return For a CellDataset with N genes, and a CellTypeHierarchy with k types,
#' returns a dataframe with N x k rows. Each row contains a gene and a specifity
#' score for one of the types.
#' @importFrom reshape2 dcast
#' @importFrom dplyr %>%
#' @importFrom Biobase exprs fData pData
#' @export
calculateMarkerSpecificity <- function(cds, cth, remove_ambig=TRUE, remove_unknown=TRUE){
  
  if(class(cds)[1] != "CellDataSet") {
    stop("Error cds is not of type 'CellDataSet'")
  }
  
  if(class(cth)[1] != "CellTypeHierarchy") {
    stop("Error cth is not of type 'CellTypeHierarchy'")
  }
  
  CellType <- NA
  markerSpecificityHelper <- function(cds, cth){
    averageExpression <- Matrix::rowMeans(exprs(cds))
    averageExpression <- unlist(averageExpression)
    averageExpression[is.na(averageExpression)] <- 0
    #names(averageExpression) <- row.names(fData(cds))
    return (data.frame(gene_id = row.names(fData(cds)), expr_val=averageExpression))
  }
  progress_opts <- options()$dplyr.show_progress
  options(dplyr.show_progress = T)

  cds <- cds[,row.names(subset(pData(cds), CellType %in% c("Unknown", "Ambiguous") == FALSE))]
  cds_pdata <- dplyr::group_by_(dplyr::select_(rownames_to_column(pData(cds)), "rowname", "CellType"), "CellType") 
  class_df <- as.data.frame(cds_pdata %>% do(markerSpecificityHelper(cds[,.$rowname], cth)))
  class_df <- dcast(class_df, CellType ~ gene_id, value.var = "expr_val")
  row.names(class_df) <- class_df$CellType
  class_df <- class_df[,-1]
  class_df <- t(as.matrix(class_df))
  
  marker_specificities <- lapply(1:ncol(class_df), function(cell_type_i){
    perfect_specificity <- rep(0.0, ncol(class_df))
    perfect_specificity[cell_type_i] <- 1.0
    apply(class_df, 1, function(x) { 
      if (sum(x) > 0) 1 - JSdistVec(makeprobsvec(x), perfect_specificity)
      else 0
    })
  })
  marker_specificities <- t(do.call(rbind, marker_specificities))
  colnames(marker_specificities) <- colnames(class_df)
  marker_specificities <- melt(marker_specificities)
  colnames(marker_specificities) <- c("gene_id", "CellType", "specificity")
  marker_specificities$gene_id <- as.character(marker_specificities$gene_id)
  return (marker_specificities)
}

#' Select the most cell type specific markers
#' 
#' This is a handy wrapper function around dplyr's top_n function to extract
#' the most specific genes for each cell type. Convenient, for example, for 
#' selecting a balanced set of genes to be used in semi-supervised clustering 
#' or ordering.
#' 
#' @param marker_specificities The dataframe of specificity results produced by \code{\link{calculateMarkerSpecificity}()}
#' @param num_markers The number of markers that will be shown for each cell type
#' @return A data frame of specificity results
#' @importFrom dplyr top_n %>%
#' @export
selectTopMarkers <- function(marker_specificities, num_markers = 10){
  specificity <- NA
  as.data.frame(marker_specificities %>%
    group_by_("CellType") %>%
    top_n(n = num_markers, wt = specificity))
}

#' Test genes for cell type-dependent expression
#' 
#' @description takes a CellDataSet and a CellTypeHierarchy and classifies all cells into types passed
#' functions passed into the CellTypeHierarchy. The function will remove all "Unknown" and "Ambiguous" types
#' before identifying genes that are differentially expressed between types.
#' 
#' @param cds A CellDataSet object containing cells to classify
#' @param cth The CellTypeHierarchy object to use for classification
#' @param residualModelFormulaStr A model formula string specify effects you
#' want to exclude when testing for cell type dependent expression
#' @param balanced Whether to downsample the cells so that there's an equal number of each type prior to performing the test
#' @param verbose Whether to emit verbose output during the the search for cell-type dependent genes
#' @param cores The number of cores to use when testing
#' @param reclassify_cells a boolean that indicates whether or not the cds and cth should be run through classifyCells again
#' @param remove_ambig a boolean that indicates whether or not ambiguous cells should be removed the cds
#' @param remove_unknown a boolean that indicates whether or not unknown cells should be removed from the cds
#' @return A table of differential expression test results
#' @importFrom stringr str_replace_all
#' @importFrom dplyr sample_n
#' @importFrom Biobase pData pData<-
#' @export 
markerDiffTable <- function (cds, cth, residualModelFormulaStr="~1", balanced=FALSE, reclassify_cells=TRUE, remove_ambig=TRUE, remove_unknown=TRUE, verbose=FALSE, cores=1) {
  if(class(cds)[1] != "CellDataSet") {
    stop("Error cds is not of type 'CellDataSet'")
  }
  
  if(class(cth)[1] != "CellTypeHierarchy") {
    stop("Error cth is not of type 'CellTypeHierarchy'")
  }
  
  CellType <- NULL
  if (verbose)
    message("Classifying cells according to markers")
  if (reclassify_cells)
    cds <- classifyCells(cds, cth, 0.05)
  if (remove_ambig)
    cds <- cds[,pData(cds)$CellType %in% c("Ambiguous") == FALSE]
  if (remove_unknown)
    cds <- cds[,pData(cds)$CellType %in% c("Unknown") == FALSE]
  pData(cds)$CellType <- droplevels(pData(cds)$CellType)
  
  if (balanced){
    cell_type_counts <- table(pData(cds)$CellType)
    cell_type_counts <- cell_type_counts[cell_type_counts > 0]
    least_frequent_type <- which(cell_type_counts == min(cell_type_counts))
    least_frequent_type <- names(cell_type_counts)[least_frequent_type]
    n_cells <- cell_type_counts[least_frequent_type]
    
    message(paste("Least frequent cell type is '", least_frequent_type, "', randomly selecting ", n_cells, " cells for marker identification test", sep=""))
    selected_cells <- c()

    for (cell_type in names(cell_type_counts)){
      cell_type_sample <- sample_n(rownames_to_column(subset(pData(cds), CellType == cell_type)), n_cells)$rowname
      selected_cells <- c(selected_cells, cell_type_sample)
    }
    
    cds <- cds[,selected_cells]
    # if(is.null(max_cells) == FALSE && max_cells < ncol(cds)) 
    #   selected_cells <- sample(ncol(cds), max_cells)
    # else
    #   selected_cells <- colnames(cds)
  }
  
  fullModelFormulaStr <- paste("CellType")
  fullModelFormulaStr <- paste("~", fullModelFormulaStr,sep = "")
  if (residualModelFormulaStr != "~1"){
    residual_terms <- str_replace_all(residualModelFormulaStr, "~", "")
    fullModelFormulaStr <- paste(fullModelFormulaStr, residual_terms, sep = " + ")
  }

  if (verbose)
    message("Testing for marker-dependent expression")
  
  marker_diff <- differentialGeneTest(cds, 
                                      fullModelFormulaStr=fullModelFormulaStr,
                                      reducedModelFormulaStr=residualModelFormulaStr,
                                      verbose=verbose,
                                      cores=cores)
  
  return(marker_diff)
}
cole-trapnell-lab/monocle-release documentation built on May 13, 2019, 8:50 p.m.