R/Signac.R

Defines functions Signac

Documented in Signac

#' Classification of cellular phenotypes in single cell data
#' 
#' \code{Signac} trains and then uses an ensemble of neural networks to classify cellular phenotypes using an expression matrix or Seurat object.
#' The neural networks are trained with the HPCA training data using only features that are present
#' in both the single cell and HPCA training data set. \code{Signac} returns annotations at each level of the classification
#' hierarchy, which are then converted into cell type labels using \code{\link{GenerateLabels}}. For a faster alternative,
#' try \code{\link{SignacFast}}, which uses pre-computed neural network models.
#'
#' @param E a sparse gene (rows) by cell (column) matrix, or a Seurat object. Rows are HUGO symbols.
#' @param R Reference data. If 'default', R is set to GetTrainingData_HPCA().
#' @param spring.dir If using SPRING, directory to categorical_coloring_data.json. Default is NULL.
#' @param N Number of machine learning models to train (for nn and svm). Default is 100.
#' @param num.cores Number of cores to use. Default is 1.
#' @param threshold Probability threshold for assigning cells to "Unclassified." Default is 0.
#' @param smooth if TRUE, smooths the cell type classifications. Default is TRUE.
#' @param impute if TRUE, gene expression values are imputed prior to cell type classification (see \code{\link{KSoftImpute}}). Default is TRUE.
#' @param verbose if TRUE, code will report outputs. Default is TRUE.
#' @param do.normalize if TRUE, cells are normalized to the mean library size. Default is TRUE.
#' @param return.probability if TRUE, returns the probability associated with each cell type label. Default is TRUE.
#' @param hidden Number of hidden layers in the neural network. Default is 1.
#' @param set.seed If true, seed is set to ensure reproducibility of these results. Default is TRUE.
#' @param seed if set.seed is TRUE, seed is set to 42.
#' @param graph.used If using Seurat object by default, Signac uses the nearest neighbor graph in the graphs field of the Seurat object. Other options are "wnn" to use weighted nearest neighbors, as well as "snn" to use shared nearest neighbors.
#' @return A list of character vectors: cell type annotations (L1, L2, ...) at each level of the hierarchy
#' as well as 'clusters' for the Louvain clustering results.
#' @seealso \code{\link{SignacFast}}, a faster alternative that only differs from \code{\link{Signac}} in nuanced T cell phenotypes.
#' @export
#' See \url{https://doi.org/10.1101/2021.02.01.429207} for more details.
#' @examples
#' \dontrun{
#' # download single cell data for classification
#' file.dir = "https://cf.10xgenomics.com/samples/cell-exp/3.0.0/pbmc_1k_v3/"
#' file = "pbmc_1k_v3_filtered_feature_bc_matrix.h5"
#' download.file(paste0(file.dir, file), "Ex.h5")
#' 
#' # load data, process with Seurat
#' library(Seurat)
#' E = Read10X_h5(filename = "Ex.h5")
#' pbmc <- CreateSeuratObject(counts = E, project = "pbmc")
#' 
#' # run Seurat pipeline
#' pbmc <- SCTransform(pbmc, verbose = FALSE)
#' pbmc <- RunPCA(pbmc, verbose = FALSE)
#' pbmc <- RunUMAP(pbmc, dims = 1:30, verbose = FALSE)
#' pbmc <- FindNeighbors(pbmc, dims = 1:30, verbose = FALSE)
#' 
#' # classify cells
#' labels = Signac(E = pbmc)
#' celltypes = GenerateLabels(labels, E = pbmc)
#' 
#' # add labels to Seurat object, visualize
#' pbmc <- Seurat::AddMetaData(pbmc, metadata=celltypes$CellTypes_novel, col.name = "immmune")
#' pbmc <- Seurat::SetIdent(pbmc, value='immmune')
#' DimPlot(pbmc)
#' 
#' # save results
#' saveRDS(pbmc, "example_pbmcs.rds")
#' }
Signac <- function(E, R = 'default', spring.dir = NULL, N = 100, num.cores = 1, threshold = 0, smooth = TRUE, impute = TRUE, verbose = TRUE, do.normalize = TRUE, return.probability = FALSE, hidden = 1, set.seed = TRUE, seed = '42', graph.used = "nn")
{
  if (!is.null(spring.dir))
    spring.dir = gsub("\\/$", "", spring.dir, perl = TRUE)
  
  if (class(R) == 'character')
    R = GetTrainingData_HPCA()
  
  flag = class(E) == "Seurat"
  
  if (flag){
    default.assay <- Seurat::DefaultAssay(E)
    edges = E@graphs[[which(grepl(paste0(default.assay, "_", graph.used), names(E@graphs)))]]
  }
  
  if (verbose)
  {
    cat(" ..........  Entry in Signac \n");
    ta = proc.time()[3];
    
    # main function
    if (!flag)
    {
      cat(" ..........  Running Signac on input data matrix :\n");
    } else {
      cat(" ..........  Running Signac on Seurat object :\n");
    }
    cat("             nrow = ", nrow(E), "\n", sep = "");
    cat("             ncol = ", ncol(E), "\n", sep = "");
  }
  
  # keep only unique row names
  logik = CID.IsUnique(rownames(E))
  if (sum(logik) != length(logik))
    E = E[logik,]
  
  # intersect genes with reference set
  gns = intersect(rownames(E), R$genes)
  V = E[rownames(E) %in% gns, ]
  
  if (class(V) %in% "data.frame")
    V = Matrix::Matrix(as.matrix(V), sparse = TRUE)
  
  # normalize to the mean library size
  if (do.normalize)
  {
    if (!flag)
    {
      V = CID.Normalize(V)
    } else {
      V = CID.Normalize(V@assays[[default.assay]]@counts)
    }
  }
  
  # normalization function for gene expression scaling
  normalize <- function(x) {
    return ((x - min(x)) / (max(x) - min(x)))
  }
  
  # normalize V
  V = t(apply(V, 1, function(x){
    normalize(x)
  }))
  logik = apply(V, 1, function(x) {any(is.na(x))})
  V = V[!logik,]
  
  # set up imputation matrices
  # set up imputation matrices
  if (flag) {
    louvain = CID.Louvain(edges = edges)
    if (nrow(edges) != ncol(edges)) {
      edges = CID.GetDistMat(edges, n = 1)
    } else {
      edges = list(edges)
    }
  } else {
    edges = CID.LoadEdges(data.dir = spring.dir)
    louvain = CID.Louvain(edges = edges)
    edges = CID.GetDistMat(edges, n = 1)
  }
  res = pbmcapply::pbmclapply(R$Reference, FUN = function(x){
    # keep same gene names
    gns = sort(intersect(rownames(V), colnames(x)))
    Z = V[rownames(V) %in% gns, ]
    dat = x[,colnames(x) %in% gns]
    Z = Z[order(rownames(Z)), ]
    dat = dat[, order(colnames(dat))]
    
    # remove any low variance genes
    kmu = apply(Z, 1, function(x){sum(x != 0)})
    logik = kmu > 0;
    Z = Z[logik,]
    dat = dat[,logik]
    
    # run imputation (if desired)
    if (impute){
      Z = KSoftImpute(E = Z, dM = edges, verbose = FALSE)
      Z = t(apply(Z, 1, function(x){
        normalize(x)
      }))
    }
    
    # build training set
    df = data.frame(dat, celltypes = x$celltypes)
    
    # train a neural network (N times)
    if (set.seed)
    {
      RNGkind("L'Ecuyer-CMRG")
      set.seed(seed = seed)
    }
    res = suppressWarnings(lapply(1:N, function(x) {
      nn=neuralnet::neuralnet(celltypes~.,hidden=hidden,data=df, act.fct = 'logistic', linear.output = FALSE)
      Predict = stats::predict(nn, Matrix::t(Z))
      colnames(Predict) <- sort(nn$model.list$response)
      return(Predict)
    }))
    res = res[sapply(res, function(x) !is.null(x))]
    res.squared.mean <- Reduce("+", lapply(res, "^", 2)) / length(res)
    res = Reduce(res, f = '+') / length(res)
    res.variance <- res.squared.mean - res^2
    res.sd <- sqrt(res.variance)
    xx = apply(res, 1, which.max)
    celltypes = colnames(res)[xx]
    kmax = apply(res, 1, max)
    celltypes[kmax < threshold] = "Unclassified"
    errors = round(sapply(1:length(xx), function(x){res.sd[x, xx[x]]}), digits = 4)
    df = data.frame(celltypes = celltypes, probability = round(kmax, digits = 3), sd = errors, percent_features_detected = round(Matrix::colSums(Z != 0) / nrow(Z), digits = 3) * 100)
    
    # smooth the output classifications
    if (smooth & any(as.character(unique(x$celltypes)) %in% c("Immune", "Myeloid", "NonImmune", "Lymphocytes", "Monocytes.Neutrophils", "Monocytes", "Fibroblasts", "Epithelial", "T", "NK", "T.CD8", "T.CD4")))
      df$celltypes = CID.smooth(df$celltypes, edges[[1]])
    
    # return probabilities and cell type classifications
    if (return.probability){
      return(df)
    } else {
      return(df$celltypes)
    }
  }, mc.cores = num.cores)
  
  res$louvain = louvain
  
  if (length(R$Reference) == 1)
    return(as.character(res[[1]]))
  
  if (verbose) {
    tb = proc.time()[3] - ta;
    cat("\n ..........  Exit Signac.\n");
    cat("             Execution time = ", tb, " s.\n", sep = "");
  }
  return(res)
}

Try the SignacX package in your browser

Any scripts or data that you put into this service are public.

SignacX documentation built on Nov. 18, 2021, 5:07 p.m.