FastProjectR: Functional interpretation of single cell RNA-seq latent manifolds

Documented in addLatentSpace addSignatures addTSNE addUMAP analyzeLocalCorrelations analyzeTrajectoryCorrelations annotateLatentComponents calcSignatureScores clusterCells clusterSigScores computeLatentSpace computeProjectionGenes evalSigGeneImportance evalSigGeneImportanceSparse generateProjections poolCells

#' Creates clustering of the cells
#'
#' Uses the Louvain clustering algorithm to cluster the cells in the
#' transcriptional latent space (default is the top PCs, unless a user
#' specifies otherwise). If a tree exists, a tree-based clustering is also
#' performed.
#' 
#' Results of this are stored as a new variable in the object's metaData
#'
#' @param object the VISION object for which to cluster the cells
#' @return the VISION object modifed as described above
clusterCells <- function(object, tree=FALSE) {

    message("Clustering cells...\n", appendLF = FALSE)

    res <- object@LatentSpace
    metaData <- object@metaData

    K <- min(object@params$numNeighbors, 30)
    
    message("Using latent space to cluster cells...\n", appendLF = FALSE)
    kn <- find_knn_parallel(res, K)
    cl <- louvainCluster(kn, res)
    names(cl) <- paste('Cluster', seq(length(cl)))
    
    metaData["VISION_Clusters"] <- factor(levels = names(cl))

    for (cluster in names(cl)) {
        metaData[cl[[cluster]], 'VISION_Clusters'] <- cluster
    }
    
    if (tree) {
        # If tree exists, then we'll cluster using the tree as well.

        message("Using Tree to compute clusters...\n")
        # Get the MRCA matrix and convert the node indexes to depths
        cl <- maxSizeCladewiseTreeCluster(object@tree)

        names(cl) <- paste('Cluster', seq(length(cl)))
    
        metaData["VISION_Clusters_Tree"] <- factor(levels = names(cl))

        for (cluster in names(cl)) {
            metaData[cl[[cluster]], 'VISION_Clusters_Tree'] <- cluster
        }
    }

    object@metaData <- metaData

    message("completed\n")

    return(object)

}


#' create micro-clusters that reduce noise and complexity while maintaining
#' the overall signal in the data
#' @param object the VISION object for which to cluster the cells
#' @param cellsPerPartition the minimum number of cells to put into a cluster
#' @return the VISION with pooled cells
poolCells <- function(object, cellsPerPartition = NULL) {

    if (!is.null(cellsPerPartition)){
        object@params$micropooling$cellsPerPartition <- cellsPerPartition
    }

    if (length(object@Pools) == 0) {
        message(paste(
          "Performing micro-pooling on",
          ncol(object@exprData),
          "cells with a target pool size of",
          object@params$micropooling$cellsPerPartition
        ))
    } else {
      message("Performing micro-pooling using precomputed pools")
    }

    preserve_clusters <- NULL

    if (is.null(object@params$latentSpace[["projectionGenes"]])){
        filterInput <- object@params$latentSpace$projectionGenesMethod
    } else {
        filterInput <- object@params$latentSpace$projectionGenes
    }

    if (length(object@Pools) == 0) {
        pools <- applyMicroClustering(
            object@exprData,
            cellsPerPartition = object@params$micropooling$cellsPerPartition,
            filterInput = filterInput,
            filterThreshold = object@params$latentSpace$threshold,
            latentSpace = object@LatentSpace,
            K = object@params$numNeighbors)

        object@Pools <- pools
    }

    message("    Aggregating data using assigned pools...", appendLF = FALSE)
    pooled_cells <- poolMatrixCols(object@exprData, object@Pools)
    object@exprData <- pooled_cells

    if (hasUnnormalizedData(object)) {
        pooled_unnorm <- poolMatrixCols(object@unnormalizedData, object@Pools)
        object@unnormalizedData <- pooled_unnorm
    }

    if (!all(dim(object@LatentSpace) == c(1, 1))) {
        pooled_latent <- poolMatrixRows(object@LatentSpace, object@Pools)
        object@LatentSpace <- pooled_latent
    }

    if (hasProteinData(object)) {
        pooled_fbc <- poolMatrixRows(object@proteinData, object@Pools)
        object@proteinData <- pooled_fbc
    }

    poolMeta <- poolMetaData(object@metaData, object@Pools)
    object@metaData <- poolMeta

    if (length(object@Projections) > 0){

        newProjections <- lapply(
            object@Projections,
            function(proj) {
                new_coords <- poolMatrixRows(proj, object@Pools)
                return(new_coords)
            })

        names(newProjections) <- names(object@Projections)

        object@Projections <- newProjections

    }

    message("completed\n")

    return(object)
}

#' filter data accourding to the provided filters
#' @param object the VISION object
#' @param threshold Threshold to apply when using the 'threshold' or 'fano' projection genes filter.
#' If greater than 1, this specifies the number of cells in which a gene must be detected
#' for it to be used when computing PCA. If less than 1, this instead specifies the proportion of cells needed
#' @param num_mad Number of median absolute deviations to use when selecting highly-variable
#' genes in each mean-sorted bin of genes
#' @param projection_genes_method a method to select genes either 'Threshold' or 'Fano'
#' @param projection_genes a list of genes to use specifically.
#' If this is supplied then `projection_genes_method` is ignored
#' @return the VISION object, populated with filtered data
computeProjectionGenes <- function(object,
                       threshold = NULL,
                       num_mad = NULL,
                       projection_genes_method = NULL,
                       projection_genes = NULL) {

    if (!is.null(threshold)){
        if (threshold < 1) {
            num_samples <- ncol(object@exprData)
            threshold <- round(threshold * num_samples)
        }
        object@params$latentSpace$threshold <- threshold
    }

    if (!is.null(num_mad)){
        object@params$latentSpace$num_mad <- num_mad
    } else {
        object@params$latentSpace$num_mad <- 2
    }

    if (!is.null(projection_genes)){
        object@params$latentSpace$projectionGenes <- projection_genes
    } else {
        object@params$latentSpace$projectionGenes <- NA
    }

    if (!is.null(projection_genes_method)){
        object@params$latentSpace$projectionGenesMethod <- projection_genes_method
    }

    message("Determining projection genes...")

    if (is.na(object@params$latentSpace[["projectionGenes"]])){

        exprData <- matLog2(object@exprData)
        projection_genes <- applyFilters(
                    exprData,
                    object@params$latentSpace$projectionGenesMethod,
                    object@params$latentSpace$threshold,
                    object@params$latentSpace$num_mad)

        if (length(projection_genes) == 0){
            stop(
                sprintf("Filtering with (projection_genes=\"%s\", threshold=%i) results in 0 genes\n  Set a lower threshold and re-run",
                    object@params$latentSpace$projectionGenesMethod, object@params$latentSpace$threshold)
                )
        }
    } else {
        projection_genes <- intersect(
            object@params$latentSpace$projectionGenes, rownames(object@exprData))

        if (length(projection_genes) == 0){
            stop("Supplied list of genes in `projection_genes` does not match any rows of expression data")
        } else {
            message(
                sprintf("    Using supplied list of genes: Found %i/%i matches",
                    length(projection_genes), length(object@params$latentSpace$projectionGenes)
                    )
                )
        }
    }

    message()

    object@params$latentSpace$projectionGenes <- projection_genes


    return(object)
}


#' Add signatures to VISION object
#'
#'
#' @param object the VISION object
#' @param signatures list of file paths to signature files (.gmt or .txt) or
#' Signature objects.  See the createGeneSignature(...) method for information
#' on creating Signature objects.
#' @param min_signature_genes Signature that match less than this number of genes in the
#' supplied expression matrix are removed.
#' @param sig_gene_threshold Proportion of cells that a gene must be detected in (nonzero)
#' to be used in signature score calculations.
#' @return the VISION object, with the @sigData slot updated
#' @export
addSignatures <- function(object, signatures, min_signature_genes=5, sig_gene_threshold=.01) {

    if (is.list(signatures)) {
        sigs <- lapply(signatures, function(sig){
                   if (is(sig, "Signature")){
                       return(sig)
                   } else {
                       return(readSignaturesInput(sig))
                   }
        })

        if (length(sigs) > 0){
            sigs <- do.call(c, sigs)
        }

        names(sigs) <- vapply(sigs, function(x){x@name}, "")

    } else if (is.character(signatures)) {
        sigs <- readSignaturesInput(signatures)
    } else {
        stop("signatures must be paths to signature files or list of
            Signature objects")
    }

    sigs <- processSignatures(sigs, object@exprData, min_signature_genes, sig_gene_threshold)

    object@sigData <- c(object@sigData, sigs)

    return(object)
}


#' calculate signature scores
#'
#' For each signature-cell pair, compute a score that captures the level of
#' correspondence between the cell and the signature.
#'
#' @param object the VISION object
#' @param sig_norm_method Method to apply to normalize the expression matrix
#' before calculating signature scores. Valid options are:
#' "znorm_columns" (default), "none", "znorm_rows", "znorm_rows_then_columns",
#' or "rank_norm_columns"
#' @param sig_gene_importance whether or not to rank each gene's contribution to
#' the overall signature score.  Default = TRUE.  This is used for inspecting
#' genes in a signature in the output report
#' @return the VISION object, with the @SigScores and @SigGeneImportance slots populated
#' @export
calcSignatureScores <- function(
    object, sig_norm_method = NULL, sig_gene_importance = TRUE) {

    message("Evaluating signature scores on cells...\n")

    ## override object parameters
    if (!is.null(sig_norm_method)) object@params$signatures$sigNormMethod <- sig_norm_method

    if (length(object@sigData) == 0) {
        sigScores <- matrix(nrow = ncol(object@exprData), ncol = 0,
                            dimnames = list(colnames(object@exprData), NULL)
                            )
        object@SigScores <- sigScores
        object@SigGeneImportance <- list()
        return(object)
    }

    normExpr <- getNormalizedCopySparse(
        object@exprData,
        object@params$signatures$sigNormMethod
    )
    
    sigData <- object@sigData
    
    for (sig_name in names(object@sigData)){
        signature <- object@sigData[[sig_name]]
        directional <- all(c(1, -1) %in% signature@sigDict)
        if (directional) {
            up <- names(which(signature@sigDict == 1))
            down <- names(which(signature@sigDict == -1))
            
            up_name <- paste(signature@name, "_UP", sep = "")
            down_name <- paste(signature@name, "_DOWN", sep = "")
            
            up_sig <- Signature(sigDict = signature@sigDict[up], name = up_name, source = signature@source, metaData = signature@metaData)
            down_sig <- Signature(sigDict = signature@sigDict[down], name = down_name, source = signature@source, metaData = signature@metaData)
            sigData[[up_name]] <- up_sig
            sigData[[down_name]] <- down_sig
        }
    }
    
    object@sigData <- sigData # persist back to the object because of issues with up/down
    
    sigScores <- batchSigEvalNorm(sigData, normExpr)
    
    if (sig_gene_importance) {

        if (is(object@exprData, "sparseMatrix")) {
            sigGeneImportance <- evalSigGeneImportanceSparse(
                sigScores, sigData, normExpr
            )
        } else {
            normExprDense <- getNormalizedCopy(
                object@exprData,
                object@params$signatures$sigNormMethod
            )
            sigGeneImportance <- evalSigGeneImportance(
                sigScores, sigData, normExprDense
            )
        }

    } else {
        sigGeneImportance <- list()
    }

    object@SigScores <- sigScores
    object@SigGeneImportance <- sigGeneImportance

    return(object)
}


#' Calculate gene-signature importance
#'
#' For each signature, the contribution of each gene to the signature score
#' is evaluated by calculating the covariance between signature scores and expression
#' The correlation of genes with a negative sign in the signature are inverted.
#'
#' @importFrom pbmcapply pbmclapply
#' @importFrom matrixStats colSds
#' @importFrom matrixStats rowSds
#' @importFrom stats setNames
#'
#' @param sigScores matrix of signature scores (Cells x Signatures)
#' @param sigData list of Signature objects
#' @param normExpr output from calling getNormalizedCopySparse
#' @return named list with one item per signature.  Values are named vectors
#' of each gene's association (covariance) with the signature.
evalSigGeneImportance <- function(sigScores, sigData, normExpr){

    message("Evaluating signature-gene importance...\n")

    if (length(sigData) == 0) {
        return(list())
    }

    if (length(sigScores) <= 1){
        stop(
            sprintf("Signature scores have not yet been computed.  `calcSignatureScores` must be run before running `evalSigGeneImportance`")
            )
    }

    # Center each column of sigScores first

    mu <- colMeans(sigScores)

    sigScores <- t(sigScores)
    sigScores <- (sigScores - mu)
    sigScores <- t(sigScores)

    # Center each row of normExpr
    mu <- rowMeans(normExpr)

    normExpr <- (normExpr - mu)

    # Compute Covariances
    sigGene <- function(signame) {
        sigdata <- sigData[[signame]]

        genes <- sigdata@sigDict

        sigvals <- sigScores[, signame]

        geneIndices <- match(names(genes), rownames(normExpr))

        corr <- sigGeneInner(sigvals, normExpr, geneIndices)

        names(corr) <- names(genes)

        corr <- corr * genes

        return(corr)
    }

    sigs <- colnames(sigScores)
    res <- pbmclapply(setNames(sigs, sigs), sigGene)

    return(res)
}


#' Calculate Gene-Signature Importance
#'
#' For each signature, the contribution of each gene to the signature score
#' is evaluated by calculating the covariance between signature scores and expression
#' The correlation of genes with a negative sign in the signature are inverted.
#'
#' This version is made to avoid inflating sparse matrices
#'
#' @importFrom pbmcapply pbmclapply
#' @importFrom matrixStats colSds
#' @importFrom matrixStats rowSds
#' @importFrom Matrix Matrix
#' @importFrom Matrix Diagonal
#' @importFrom stats setNames
#'
#' @param object the VISION object
#' @return the VISION object, with SigGeneImportance slot populated
#' @param sigScores matrix of signature scores (Cells x Signatures)
#' @param sigData list of Signature objects
#' @param normExpr output from calling getNormalizedCopySparse
#' @return named list with one item per signature.  Values are named vectors
#' of each gene's association (covariance) with the signature.
evalSigGeneImportanceSparse <- function(sigScores, sigData, normExpr){

    message("Evaluating signature-gene importance...\n")

    if (length(sigData) == 0) {
        return(list())
    }

    if (length(sigScores) <= 1){
        stop(
            sprintf("Signature scores have not yet been computed.  `calcSignatureScores` must be run before running `evalSigGeneImportance`")
            )
    }

    # Center each column of sigScores first
    mu <- colMeans(sigScores)

    sigScores <- t(sigScores)
    sigScores <- (sigScores - mu)
    sigScores <- t(sigScores)

    # Precompute some matrices we'll need later
    NGenes <- nrow(normExpr@data)
    NCells <- ncol(normExpr@data)
    Cog <- Matrix(1, ncol = 1, nrow = NGenes)
    Coc <- Matrix(normExpr@colOffsets, nrow = 1)
    Cs <- Diagonal(x = normExpr@colScaleFactors)
    Roc <- Matrix(1, nrow = 1, ncol = NCells)
    C1 <- t(Roc)

    RM <- normExpr@data %*% (Cs %*% C1) + Cog %*% (Coc %*% (Cs %*% C1))
    RM <- RM / NCells
    RM <- RM[, 1]

    # Compute Covariances
    sigGene <- function(signame) {
        sigdata <- sigData[[signame]]

        genes <- sigdata@sigDict

        S <- sigScores[, signame, drop = F]
        geneIndices <- rownames(normExpr@data) %in% names(genes)

        E <- normExpr@data[geneIndices, , drop = FALSE]

        Rog <- Matrix(RM[geneIndices], ncol = 1)
        Cog <- Matrix(1, ncol = 1, nrow = length(genes))

        geneCov <- E %*% Cs %*% S - Rog %*% (Roc %*% S) + Cog %*% (Coc %*% (Cs %*% S))
        geneCov <- geneCov / (NCells - 1)
        geneCov <- geneCov[, 1]
        geneCov <- geneCov[names(genes)]

        geneCov <- geneCov * unlist(genes) # invert sign for negative genes

        return(geneCov)
    }

    sigs <- colnames(sigScores)
    res <- pbmclapply(setNames(sigs, sigs), sigGene)

    return(res)
}

#' Calculate single-cell plasticity scores
#' 
#' For each categorical meta data item, calculate a single-cell parsimony score,
#' which can be interpreted as a plasticity score as in Yang et al, bioRxiv 2021.
#' 
#' @param object A PhyloVision object
#' @return an updated object with plasticities as numeric meta data
#' @export
computePlasticityScores <- function(object) {

    message("Computing single cell plasticity scores on tree...\n")

    if (class(object) != 'PhyloVision') {
        stop('Object must be a PhyloVision object')
    }
    metaData <- object@metaData
    tree <- object@tree

    categoricalIndices <- vapply(seq_len(ncol(metaData)),
                          function(i) ( (is.factor(metaData[[i]]) | (is.character(metaData[[i]])))),
                          FUN.VALUE = TRUE)
    categoricalMetaLabels <- colnames(metaData)[categoricalIndices]

    out <- pbmclapply(categoricalMetaLabels, function(variable){
        metaSub <- as.character(metaData[,variable])
        names(metaSub) <- rownames(metaData)

        node.scores <- computeFitchHartiganParsimonyPerNode(tree, metaSub)
        leaf.scores <- computeSingleCellFitchScores(tree, node.scores)

        df <- t(data.frame(leaf.scores))
        colnames(df) <- c(paste0(variable, '_plasticity'))
        rownames(df) <- tree$tip.label
        return(df)
    }) 

    # remove existing plasticities if they are there
    for (entry in 1:length(out)) {
        df <- out[[entry]]
        if (!any(colnames(df) %in% colnames(metaData))) {
            metaData[,colnames(df)] <- df[rownames(metaData),]
        } 
    }

    object@metaData <- metaData

    return(object)
}

#' Computes the latent space of the expression matrix using PCA
#'
#' @param object the VISION object for which compute the latent space
#' @param projection_genes character vector of gene names to use for PCA
#' @param projection_genes_method name of filtering method. Either 'threshold' or 'fano'(default)
#' @param filterThreshold Threshold to apply when using the 'threshold' or 'fano' projection genes filter.
#' If greater than 1, this specifies the number of cells in which a gene must be detected
#' for it to be used when computing PCA. If less than 1, this instead specifies the proportion of cells needed
#' @param filterNumMad Number of median absolute deviations to use when selecting highly-variable
#' genes in each mean-sorted bin of genes
#' @param num_PCs the number of principal components to retain
#' @param perm_wPCA If TRUE, apply permutation wPCA to determine significant
#' number of components. Default is FALSE.
#' @return the VISION with @latentSpace slot populated
#' @export
computeLatentSpace <- function(
    object, projection_genes = NULL,
    filterThreshold = .05, filterNumMad = 2,
    projection_genes_method = NULL,
    num_PCs = 30, perm_wPCA = NULL) {

    message("Computing a latent space for expression data...\n")

    if (!is.null(projection_genes)) {
        object@params$latentSpace$projectionGenes <- projection_genes
    }
    if (!is.null(projection_genes_method)) {
        object@params$latentSpace$projectionGenesMethod <- projection_genes_method
    }

    if (is.null(object@params$latentSpace[["projectionGenes"]])) {
        object <- computeProjectionGenes(
            object,
            threshold = filterThreshold,
            num_mad = filterNumMad,
            projection_genes_method =
                object@params$latentSpace$projectionGenesMethod
        )
    } else {
        object <- computeProjectionGenes(
            object,
            projection_genes = object@params$latentSpace$projectionGenes
        )
    }

    if (!is.null(perm_wPCA)) object@params$latentSpace$permPCA <- perm_wPCA
    object@params$latentSpace$numPCs <- num_PCs

    expr <- object@exprData
    projection_genes <- object@params$latentSpace[["projectionGenes"]]
    perm_wPCA <- object@params$latentSpace$permPCA

    if (!is.null(projection_genes)) {
        exprData <- expr[projection_genes, , drop = FALSE]
    } else {
        exprData <- expr
    }

    exprData <- matLog2(exprData)

    if (perm_wPCA) {
        res <- applyPermutationWPCA(exprData, components = num_PCs)
        pca_res <- res[[1]]
    } else {
        res <- applyPCA(exprData, maxComponents = num_PCs)
        pca_res <- res[[1]]
    }

    object@LatentSpace <- t(pca_res)
    colnames(object@LatentSpace) <- paste0("PC ", seq_len(ncol(object@LatentSpace)))
    object@params$latentSpace$name <- "PCA"
    return(object)
}


#' Add a latent space computed using an external method
#'
#' @param object the VISION object for which compute the latent space
#' @param coordinates matrix with latent space coordinates (cells x dimensions)
#' @param name a label for the latent space (e.g. "PCA" or "scVI")
#' @return the VISION with @latentSpace slot populated
addLatentSpace <- function(object, coordinates, name) {

    if (is.data.frame(coordinates)){
        coordinates <- data.matrix(coordinates)
    }

    if (is.null(rownames(coordinates))) {
        if (nrow(coordinates) != ncol(object@exprData)) {
            stop("Supplied coordinates must of number of rows equal to number of cells in expression matrix")
        }

        rownames(coordinates) <- colnames(object@exprData)
    } else {
        sample_names <- colnames(object@exprData)
        common <- intersect(sample_names, rownames(coordinates))

        if (length(common) != nrow(coordinates)){
            stop("Supplied coordinates for coordinates must have rowlabels that match sample/cell names")
        }
        coordinates <- coordinates[colnames(object@exprData), , drop = FALSE]
    }

    if (is.null(colnames(coordinates))) {
        colnames(coordinates) <- paste0(name, "-", seq_len(ncol(coordinates)))
    }

    object@LatentSpace <- coordinates
    object@params$latentSpace$name <- name
    return(object)
}


#' generate projections
#'
#' Generates 2-dimensional representations of the expression matrix
#' Populates the 'Projections' slot on the VISION object
#'
#' @importFrom stats setNames
#'
#' @param object the VISION object
#' @return the VISION object with values set for the analysis results
generateProjections <- function(object) {
  message("Projecting data into 2 dimensions...")

  # Some projection methods operate on the full expression matrix
  # If using one of these, we need to compute 'projection_genes'
  projection_methods <- object@params$projectionMethods
  if ("ICA" %in% projection_methods || "RBFPCA" %in% projection_methods) {
      object <- computeProjectionGenes(object)
  }

  projections <- generateProjectionsInner(object@exprData,
                                     object@LatentSpace,
                                     projection_genes = object@params$latentSpace[["projectionGenes"]],
                                     projection_methods = object@params$projectionMethods,
                                     K = object@params$numNeighbors)

  # Add already-input projections
  for (proj in names(object@Projections)){
      projections[[proj]] <- object@Projections[[proj]]
  }

  # Make sure all projections have column names
  n <- names(projections)
  projections <- lapply(setNames(n, n), function(pname){
      proj <- projections[[pname]]
      if (is.null(colnames(proj))){
          colnames(proj) <- paste0(pname, "-", seq_len(ncol(proj)))
      }
      return(proj)
  })

  object@Projections <- projections

  message("")

  return(object)
}


#' Adds a UMAP projection
#'
#' @param object the VISION object
#' @param K Number of neighbors to use in UMAP projection.
#' @param name label to use for this projection
#' @param source coordinates to use to compute tSNE
#'   Choices are either "LatentSpace" (default) or "Proteins"
#'
#' @return VISION object with the projection added to @Projections slot
#' @export
addUMAP <- function(object, K = object@params$numNeighbors,
                    name = "UMAP", source = "LatentSpace") {

    if (!requireNamespace("uwot", quietly = TRUE)){
        stop("Package \"uwot\" needed to run UMAP.  Please install it using:\n\n   devtools::install_github(\"jlmelville/uwot\")\n\n",
            call. = FALSE)
    }

    if (source == "LatentSpace") {
        data <- object@LatentSpace
    } else if (source == "Proteins") {
        data <- object@proteinData
    } else {
        stop("Invalid 'source' parameter.  Can be either 'LatentSpace' or 'Proteins'")
    }

    n_workers <- getOption("mc.cores")
    n_workers <- if (is.null(n_workers)) 2 else n_workers
	res <- uwot::umap(
        data, n_neighbors = K,
        n_threads = n_workers, ret_nn = T
    )
	res <- res$embedding

	rownames(res) <- rownames(data)
    colnames(res) <- paste0(name, "-", seq_len(ncol(res)))
    object@Projections[[name]] <- res

    return(object)
}


#' Adds a tSNE projection
#'
#' @importFrom Rtsne Rtsne
#'
#' @param object the VISION object
#' @param perplexity parameter for tSNE
#' @param name label to use for this projection
#' @param source coordinates to use to compute tSNE
#'   Choices are either "LatentSpace" (default) or "Proteins"
#' @return VISION object with the projection added to @Projections slot
#' @export
addTSNE <- function(object, perplexity = 30, name = "tSNE", source = "LatentSpace") {

    if (source == "LatentSpace") {
        data <- object@LatentSpace
    } else if (source == "Proteins") {
        data <- object@proteinData
    } else {
        stop("Invalid 'source' parameter.  Can be either 'LatentSpace' or 'Proteins'")
    }

    res <- Rtsne(
        data, dims = 2, max_iter = 800, perplexity = perplexity,
        check_duplicates = FALSE, pca = FALSE)
    res <- res$Y
    rownames(res) <- rownames(data)
    colnames(res) <- paste0(name, "-", seq_len(ncol(res)))

    object@Projections[[name]] <- res

    return(object)

}

#' Compute local correlations for all signatures
#'
#' This is the main analysis function. For each filtered dataset, a set of
#' different projection onto low-dimensional space are computed, and the
#' consistency of the resulting space with the signature scores is computed
#' to find signals that are captured succesfully by the projections.
#' @param object the VISION object
#' @return the VISION object with values set for the analysis results
#' @export
analyzeLocalCorrelations <- function(object, tree=FALSE) {

  signatureBackground <- generatePermutationNull(
      object@exprData, object@sigData, num = 3000
  )

  normExpr <- getNormalizedCopySparse(
      object@exprData,
      object@params$signatures$sigNormMethod)

  message("Computing KNN Cell Graph in the Latent Space...\n")
  if (!tree) {
    weights <- computeKNNWeights(object@LatentSpace, object@params$numNeighbors)
  } else {
    message("Using Tree to compute neighbors...\n")
    weights <- computeKNNWeights(object@tree, object@params$numNeighbors)
  }
 
  message("Evaluating local consistency of signatures in latent space...\n")

  sigConsistencyScores <- sigConsistencyScores(
                                weights,
                                object@SigScores,
                                object@metaData,
                                signatureBackground,
                                normExpr)

  if (hasProteinData(object)) {
      fbcs <- fbConsistencyScores(weights, object@proteinData)
  } else {
      fbcs <- NULL
  }

  message("Clustering signatures...\n")
  sigClusters <- clusterSignatures(object@SigScores,
                                   object@metaData,
                                   sigConsistencyScores,
                                   clusterMeta = object@params$micropooling$pool)

  metaConsistencyScores <- sigConsistencyScores[
      colnames(object@metaData), , drop = FALSE
  ]

  sigConsistencyScores <- sigConsistencyScores[
      colnames(object@SigScores), , drop = FALSE
  ]


  LocalAutocorrelation <- list(
      "Signatures" = sigConsistencyScores,
      "Meta" = metaConsistencyScores,
      "Proteins" = fbcs,
      "Clusters" = sigClusters
  )

  object@LocalAutocorrelation <- LocalAutocorrelation

  return(object)
}


#' Compute trajectory correlations for all signatures
#'
#' This is the main analysis function. For each filtered dataset, a set of
#' different projection onto low-dimensional space are computed, and the
#' consistency of the resulting space with the signature scores is computed
#' to find signals that are captured succesfully by the projections.
#' @param object the VISION object
#' @return the VISION object with values set for the analysis results
analyzeTrajectoryCorrelations <- function(object) {

  signatureBackground <- generatePermutationNull(
      object@exprData, object@sigData, num = 3000
  )

  normExpr <- getNormalizedCopySparse(
      object@exprData,
      object@params$signatures$sigNormMethod)

  message("Computing KNN Cell Graph in the Trajectory Model...\n")

  weights <- computeKNNWeights(object@LatentTrajectory, object@params$numNeighbors)

  message("Evaluating local consistency of signatures within trajectory model...\n")

  sigVTreeProj <- sigConsistencyScores(weights,
                                       object@SigScores,
                                       object@metaData,
                                       signatureBackground,
                                       normExpr)

  if (hasProteinData(object)) {
      fbcs <- fbConsistencyScores(weights, object@proteinData)
  } else {
      fbcs <- NULL
  }

  message("Clustering signatures...\n")
  sigTreeClusters <- clusterSignatures(object@SigScores,
                                       object@metaData,
                                       sigVTreeProj,
                                       clusterMeta = object@params$micropooling$pool)

  metaConsistencyScores <- sigVTreeProj[
      colnames(object@metaData), , drop = FALSE
  ]

  sigConsistencyScores <- sigVTreeProj[
      colnames(object@SigScores), , drop = FALSE
  ]

  TrajectoryAutocorrelation <- list(
      "Signatures" = sigConsistencyScores,
      "Meta" = metaConsistencyScores,
      "Proteins" = fbcs,
      "Clusters" = sigTreeClusters
  )

  object@TrajectoryAutocorrelation <- TrajectoryAutocorrelation

  return(object)
}


#' Compute Ranksums Test, for all factor meta data.  One level vs all others
#'
#' @importFrom pbmcapply pbmclapply
#' @importFrom matrixStats colRanks
#' @importFrom stats setNames
#' @param object the VISION object
#' @param variables which columns of the meta-data to use for comparisons
#' @return the VISION object with the @ClusterComparisons slot populated
#' @export
clusterSigScores <- function(object, variables = "All") {

    message("Computing differential signature tests...\n")

    sigScores <- object@SigScores
    metaData <- object@metaData

    metaData <- metaData[rownames(sigScores), , drop = FALSE]

    if (variables == "All") {
        # Determine which metaData we can run on
        # Must be a factor with at least 20 levels
        clusterMeta <- vapply(colnames(metaData), function(x) {
                scores <- metaData[[x]]
                if (!is.factor(scores)){
                    return("")
                }
                if (length(levels(scores)) > 50){
                    return("")
                }
                if (length(unique(scores)) == 1){
                    return("")
                }
                return(x)
            }, FUN.VALUE = "")
        clusterMeta <- clusterMeta[clusterMeta != ""]
    } else {
        if (!all(variables %in% colnames(metaData))) {
            stop("Supplied variable names must be column names of object@metaData")
        }
        clusterMeta <- setNames(variables, variables)
    }

    ClusterComparisons <- list()

    # Comparisons for Signatures
    if (ncol(sigScores) > 0){
        sigScoreRanks <- colRanks(sigScores,
                                  preserveShape = TRUE,
                                  ties.method = "average")
        dimnames(sigScoreRanks) <- dimnames(sigScores)
    } else {
        sigScoreRanks <- sigScores
    }

    out <- pbmclapply(clusterMeta, function(variable){
        values <- metaData[[variable]]
        var_levels <- levels(values)

        result <- lapply(var_levels, function(var_level){
            cluster_ii <- which(values == var_level)

            r1 <- matrix_wilcox(sigScoreRanks, cluster_ii,
                                check_na = FALSE, check_ties = FALSE)

            pval <- r1$pval
            stat <- r1$stat
            fdr <- p.adjust(pval, method = "BH")
            out <- data.frame(
                stat = stat, pValue = pval, FDR = fdr
            )
            return(out)
        })

        names(result) <- var_levels
        result <- result[order(var_levels)]

        return(result)
    }, mc.cores = 1)

    ClusterComparisons[["Signatures"]] <- out

    # Comparisons for Meta data
    # Split meta into numeric and factor
    numericMeta <- vapply(seq_len(ncol(metaData)),
                          function(i) is.numeric(metaData[[i]]),
                          FUN.VALUE = TRUE)
    numericMeta <- metaData[, numericMeta, drop = F]
    numericMeta <- as.matrix(numericMeta)

    factorMeta <- vapply(seq_len(ncol(metaData)),
                          function(i) is.factor(metaData[[i]]),
                          FUN.VALUE = TRUE)
    factorMeta <- metaData[, factorMeta, drop = F]

    if (ncol(numericMeta) > 0){
        numericMetaRanks <- colRanks(numericMeta,
                                     preserveShape = TRUE,
                                     ties.method = "average")
        dimnames(numericMetaRanks) <- dimnames(numericMeta)
    } else {
        numericMetaRanks <- numericMeta
    }

    out <- pbmclapply(clusterMeta, function(variable){
        values <- metaData[[variable]]
        var_levels <- levels(values)

        result <- lapply(var_levels, function(var_level){
            cluster_ii <- which(values == var_level)

            r2 <- matrix_wilcox(numericMetaRanks, cluster_ii,
                                check_na = TRUE, check_ties = TRUE)

            r3 <- matrix_chisq(factorMeta, cluster_ii)

            pval <- c(r2$pval, r3$pval)
            stat <- c(r2$stat, r3$stat)
            fdr <- p.adjust(pval, method = "BH")
            out <- data.frame(
                stat = stat, pValue = pval, FDR = fdr
            )
            return(out)
        })

        names(result) <- var_levels
        result <- result[order(var_levels)]

        return(result)
    }, mc.cores = 1)

    ClusterComparisons[["Meta"]] <- out

    # Comparisons for Proteins
    if (hasProteinData(object)){
        fbData <- t(object@proteinData)

        # gather jobs
        jobs <- lapply(unname(clusterMeta), function(variable) {
            lapply(levels(metaData[[variable]]), function(level) {
                return(c(variable, level))
            })
        })
        jobs <- do.call(c, jobs)

        results <- pbmclapply(jobs, function(job){
            variable <- job[1]
            var_level <- job[2]
            values <- metaData[[variable]]
            cluster_ii <- which(values == var_level)
            not_cluster_ii <- which(values != var_level)

            rr <- matrix_wilcox_cpp(
                fbData, cluster_ii, not_cluster_ii, jobs = 1)

            pval <- rr$pval
            stat <- rr$AUC
            fdr <- p.adjust(pval, method = "BH")
            out <- data.frame(
                stat = stat, pValue = pval, FDR = fdr,
                row.names = rownames(rr)
            )
            return(list(job, out))
        })

        out_fb <- list()
        for (res in results){
            variable <- res[[1]][1]
            var_level <- res[[1]][2]
            df <- res[[2]]
            if (is.null(out_fb[[variable]])){
                out_fb[[variable]] <- list()
            }
            out_fb[[variable]][[var_level]] <- df
        }

        ClusterComparisons[["Proteins"]] <- out_fb
    } else {
        ClusterComparisons[["Proteins"]] <- NULL
    }

    object@ClusterComparisons <- ClusterComparisons

    return(object)

}

#' Compute single-cell effective plasticity scores
#' 
#' Computes a single-cell effective plasticity score, capturing the stability
#' of a particular categorical trait with respect to a tree (as introduced
#' with the metastatic score in Quinn, Jones et al Science 2021).
#' 
#' @importFrom phangorn parsimony
#' @param object the PhyloVision object
#' @return the object with new continuous covariates populated corresponding to the plasticities
#' @export 
computeSingleCellPlasticities <- function(object) {

    if (!is(object, "PhyloVision")) {
        stop("This object is not a PhyloVision object.")
    }

    tree <- object@tree
    metaData <- object@metaData
    categoricalMetaVars <- vapply(colnames(metaData),
                            function(x) is.factor(metaData[[x]]) | is.character(metaData[[x]]),
                            FUN.VALUE = TRUE)
    categoricalMeta <- metaData[, categoricalMetaVars, drop = FALSE]
    categoricalMeta <- categoricalMeta[tree$tip.label, , drop = FALSE]

    plasticity_scores <- pbmclapply(seq_len(ncol(categoricalMeta)), function(i) {
      labels <- categoricalMeta[,i,drop=F]

      parsimony_score <- computeFitchHartiganParsimony(tree, labels)

      return(parsimony_score / nrow(tree$edge))
    })
    names(plasticity_scores) <- unlist(lapply(colnames(categoricalMeta), function(x) paste0(x,'_plasticity')))
    
}

#' Compute pearson correlation between signature scores and components of the Latent Space
#'
#' Enables the LCAnnotator portion of the output report
#'
#' @importFrom pbmcapply pbmclapply
#' @importFrom stats cor.test
#' @param object the VISION object
#' @return object with the @LCAnnotatorData slot populated
#' @export
annotateLatentComponents <- function(object){

  message("Computing correlations between signatures and latent space components...\n")
  sigMatrix <- object@SigScores
  metaData <- object@metaData
  latentSpace <- object@LatentSpace

  ## combined gene signs and numeric meta variables

  numericMetaVars <- vapply(colnames(metaData),
                            function(x) is.numeric(metaData[[x]]),
                            FUN.VALUE = TRUE)

  numericMeta <- metaData[, numericMetaVars, drop = FALSE]
  numericMeta <- numericMeta[rownames(sigMatrix), , drop = FALSE]


  computedSigMatrix <- cbind(sigMatrix, numericMeta)

  pearsonCorr <- pbmclapply(seq_len(ncol(computedSigMatrix)), function(i) {
      ss <- computedSigMatrix[, i];

      ls_col_cor <- apply(latentSpace, 2, function(pc){
           suppressWarnings({
               pc_result <- cor.test(ss, pc)
           })
           if (is.na(pc_result$estimate)) {  # happens if std dev is 0 for a sig
               return(0)
           } else {
               return(pc_result$estimate)
           }
      })

      return(ls_col_cor)
  })

  if (length(pearsonCorr) > 0) {
      pearsonCorr <- do.call(rbind, pearsonCorr)
  } else {
      pearsonCorr <- matrix(
          nrow = ncol(computedSigMatrix),
          ncol = ncol(latentSpace)
      )
  }
  rownames(pearsonCorr) <- colnames(computedSigMatrix)
  colnames(pearsonCorr) <- colnames(latentSpace)

  if (hasProteinData(object)) {
      proteinData <- object@proteinData
      pearsonCorrProteins <- pbmclapply(
          seq_len(ncol(proteinData)), function(i) {
          ss <- proteinData[, i];

          ls_col_cor <- apply(latentSpace, 2, function(pc){
               suppressWarnings({
                   pc_result <- cor.test(ss, pc)
               })
               if (is.na(pc_result$estimate)) {  # happens if std dev is 0 for a protein
                   return(0)
               } else {
                   return(pc_result$estimate)
               }
          })

          return(ls_col_cor)
      })

      pearsonCorrProteins <- do.call(rbind, pearsonCorrProteins)
      rownames(pearsonCorrProteins) <- colnames(proteinData)
      colnames(pearsonCorrProteins) <- colnames(latentSpace)
  } else {
      pearsonCorrProteins <- NULL
  }


  pcaAnnotData <- LCAnnotatorData(
      pearsonCorr = pearsonCorr, pearsonCorrProteins = pearsonCorrProteins
  )
  object@LCAnnotatorData <- pcaAnnotData

  return(object)
}
YosefLab/FastProjectR documentation built on Feb. 15, 2023, 6:21 a.m.
rdrr.io home R language documentation Run R code online
CRAN packages Bioconductor packages R-Forge packages GitHub packages
Note that we can't provide technical support on individual packages. You should contact the package authors for that.
YosefLab/FastProjectR
Functional interpretation of single cell RNA-seq latent manifolds

R/AnalysisFunctions.R
In YosefLab/FastProjectR: Functional interpretation of single cell RNA-seq latent manifolds

R Package Documentation

Browse R Packages

We want your feedback!

YosefLab/FastProjectR Functional interpretation of single cell RNA-seq latent manifolds

R/AnalysisFunctions.R In YosefLab/FastProjectR: Functional interpretation of single cell RNA-seq latent manifolds

R Package Documentation

Browse R Packages

We want your feedback!

YosefLab/FastProjectR
Functional interpretation of single cell RNA-seq latent manifolds

R/AnalysisFunctions.R
In YosefLab/FastProjectR: Functional interpretation of single cell RNA-seq latent manifolds