# R/findTopCorrelations.R
#
# Defines functions: .create_output_dataframe, .compute_exact_neighbor_rho,
# .create_blocked_rank_matrix, .find_block_availability, .find_top_correlations.

#' Find top correlations between features
#'
#' For each feature, find the subset of other features in the same or another modality that have strongest positive/negative Spearman's rank correlations in a pair of normalized expression matrices.
#'
#' @param x,y Normalized expression matrices containing features in the rows and cells in the columns.
#' Each matrix should have the same set of columns but a different set of features, usually corresponding to different modes for the same cells.
#' 
#' Alternatively, \linkS4class{SummarizedExperiment} objects containing such a matrix.
#'
#' Finally, \code{y} may be \code{NULL}, in which case correlations are computed between pairs of features in \code{x}.
#' @param number Integer scalar specifying the number of top correlated features to report for each feature in \code{x}.
#' @param block A vector or factor of length equal to the number of cells, specifying the block of origin for each cell.
#' @param d Integer scalar specifying the number of dimensions to use for the approximate search via PCA.
#' If \code{NA}, no approximation of the rank values is performed prior to the search.
#' @param use.names Logical scalar specifying whether row names of \code{x} and/or \code{y} should be reported in the output, if available.
#'
#' For the SummarizedExperiment method, this may also be a string specifying the \code{\link{rowData}} column containing the names to use;
#' or a character vector of length 2, where the first and second entries specify the \code{\link{rowData}} columns containing the names in \code{x} and \code{y} respectively.
#' If either entry is \code{NA}, the existing row names for the corresponding object are used.
#' Note that this only has an effect on \code{y} if it is a SummarizedExperiment.
#' @param BSPARAM A \linkS4class{BiocSingularParam} object specifying the algorithm to use for the PCA.
#' @param BNPARAM A \linkS4class{BiocNeighborParam} object specifying the algorithm to use for the neighbor search.
#' @param BPPARAM A \linkS4class{BiocParallelParam} object specifying the parallelization scheme to use.
#' @param ... For the generic, further arguments to pass to specific methods.
#'
#' For the SummarizedExperiment method, further arguments to pass to the ANY method.
#' @param assay.type String or integer scalar specifying the assay containing the matrix of interest in \code{x} (and \code{y}, if a SummarizedExperiment).
#' @param direction String specifying the sign of the correlations to search for.
#' @param equiweight Logical scalar indicating whether each block should be given equal weight, if \code{block} is specified.
#' If \code{FALSE}, each block is weighted by the number of cells.
#' @param deferred Logical scalar indicating whether a fast deferred calculation should be used for the rank-based PCA.
#' @param subset.cols Vector indicating the columns of \code{x} (and \code{y}) to retain for computing correlations.
#'
#' @return A \linkS4class{List} containing one or two \linkS4class{DataFrame}s for results in each direction.
#' These are named \code{"positive"} and \code{"negative"}, and are generated according to \code{direction};
#' if \code{direction="both"}, both DataFrames will be present.
#'
#' Each DataFrame has up to \code{nrow(x) * number} rows, containing the top \code{number} correlated features for each feature in \code{x}.
#' This contains the following fields:
#' \itemize{
#' \item \code{feature1}, the name (character) or row index (integer) of each feature in \code{x}.
#' Not all features may be reported here, see Details.
#' \item \code{feature2}, the name (character) or row index (integer) of one of the top correlated features to \code{feature1}.
#' This is another feature in \code{x} if \code{y=NULL}, otherwise it is a feature in \code{y}.
#' \item \code{rho}, the Spearman rank correlation for the current pair of \code{feature1} and \code{feature2}.
#' \item \code{p.value}, the approximate p-value associated with \code{rho} under the null hypothesis that the correlation is zero.
#' \item \code{FDR}, the adjusted p-value.
#' }
#' The rows are sorted by \code{feature1} and then \code{p.value}.
#' 
#' @details
#' In most cases, we only care about the top-correlated features, allowing us to skip a lot of unnecessary computation.
#' This is achieved by transforming the problem of finding the largest Spearman correlation into a nearest-neighbor search in rank space.
#' For the sake of speed, we approximate the search by performing PCA to compress the rank values for all features.
#'
#' For each direction, we compute the one-sided p-value for each feature using the approximate method implemented in \code{\link{cor.test}}.
#' The FDR correction is performed by considering all possible pairs of features, as these are implicitly tested in the neighbor search.
#' Note that this is somewhat conservative as it does not consider strong correlations outside the reported features.
#'
#' If \code{block} is specified, correlations are computed separately for each block of cells.
#' For each feature pair, the reported \code{rho} is set to the average of the correlations across all blocks.
#' Similarly, the p-value corresponding to each correlation is computed separately for each block and then combined across blocks with Stouffer's method.
#' If \code{equiweight=FALSE}, the average correlation and the per-block p-values are instead weighted by the number of cells in the corresponding block.
#'
#' We only consider pairs of features that have computable correlations in at least one block.
#' Blocks are ignored if one or the other feature has tied values (typically zeros) for all cells in that block.
#' This means that a feature may not have any entries in \code{feature1} if it forms no valid pairs, e.g., because it is not expressed.
#' Similarly, the total number of rows may be less than the maximum if insufficient valid pairs are available.
#'
#' @author Aaron Lun
#'
#' @examples
#' library(scuttle)
#' sce1 <- mockSCE()
#' sce1 <- logNormCounts(sce1)
#'
#' sce2 <- mockSCE(ngenes=20) # pretend this is CITE-seq data, or something.
#' sce2 <- logNormCounts(sce2)
#'
#' # Top 20 correlated features in 'sce2' for each feature in 'sce1':
#' df <- findTopCorrelations(sce1, sce2, number=20) 
#' df
#' 
#' @seealso
#' \code{\link{computeCorrelations}}, to compute correlations for all pairs of features.
#' 
#' @name findTopCorrelations
NULL

#' @importFrom S4Vectors DataFrame splitAsList List
#' @importFrom IRanges heads
#' @importFrom stats p.adjust
#' @importFrom BiocSingular IrlbaParam
#' @importFrom BiocNeighbors findKNN queryKNN KmknnParam buildIndex
#' @importFrom BiocParallel SerialParam bpstart bpstop
#' @importFrom scuttle .bpNotSharedOrUp .subset2index
#' @importFrom DelayedArray DelayedArray
#' @importFrom DelayedMatrixStats rowAnys
.find_top_correlations <- function(x, number=10, y=NULL, d=50, 
    direction=c("both", "positive", "negative"), 
    subset.cols=NULL, block=NULL, equiweight=TRUE, use.names=TRUE, deferred=TRUE, 
    BSPARAM=IrlbaParam(), BNPARAM=KmknnParam(), BPPARAM=SerialParam()) 
{
    # Start the parallel back-end once here so that all downstream calls can
    # share it, and guarantee that it is torn down on exit.
    if (.bpNotSharedOrUp(BPPARAM)) {
        bpstart(BPPARAM)
        on.exit(bpstop(BPPARAM))
    }

    if (!is.null(subset.cols)) {
        subset.cols <- .subset2index(subset.cols, x, byrow=FALSE)
        x <- x[,subset.cols,drop=FALSE]
        if (!is.null(y)) {
            # Guard against y=NULL (the self-search case); subsetting NULL as
            # 'NULL[,i,drop=FALSE]' fails with "incorrect number of dimensions".
            y <- y[,subset.cols,drop=FALSE]
        }
        block <- block[subset.cols]
    }

    direction <- match.arg(direction)
    self.search <- is.null(y)

    # Wrapping in DA's to avoid actually subsetting the matrices unnecessarily.
    # Waiting for Bioconductor/DelayedArray#80.
    # x <- DelayedArray(x)
    if (self.search) {
        y <- x
    } else {
    #    y <- DelayedArray(y)
    }

    # Group features by their pattern of per-block "availability" (non-trivial
    # variance within each block), so each group of feature pairs is processed
    # using only the blocks in which their correlations are computable.
    xcat <- .find_block_availability(x, block=block, BPPARAM=BPPARAM)
    if (self.search) {
        ycat <- xcat
    } else {
        ycat <- .find_block_availability(y, block=block, BPPARAM=BPPARAM)
    }

    total.ntests <- 0L
    output <- List()
    empty <- DataFrame(feature1=integer(0), feature2=integer(0), rho=numeric(0), p.value=numeric(0))
    if (direction %in% c("positive", "both")) {
        output$positive <- list(empty)
    }
    if (direction %in% c("negative", "both")) {
        output$negative <- list(empty)
    }
    ocounter <- 1L

    for (x.chosen in xcat$by) {
        # All members of a group share the same availability pattern by
        # construction, so the first member's row is representative.
        x.valid <- unlist(xcat$available[x.chosen[1],,drop=FALSE])
        for (y.chosen in ycat$by) {
            y.valid <- unlist(ycat$available[y.chosen[1],,drop=FALSE])
            both.valid <- x.valid & y.valid
            if (!any(both.valid)) {
                next
            }

            same <- self.search && identical(x.chosen, y.chosen)
            if (same) {
                ntests <- length(x.chosen) * (length(x.chosen) - 1L) # don't divide by 2, as the same pair may be reported twice.
            } else {
                ntests <- length(x.chosen) * length(y.chosen)
            }
            total.ntests <- total.ntests + ntests

            # Restrict to cells from blocks where both feature groups are available.
            block.info <- xcat$block[both.valid]
            cells <- unlist(block.info)
            x0 <- x[x.chosen,cells,drop=FALSE]
            if (!same) {
                y0 <- y[y.chosen,cells,drop=FALSE]
            }

            # This bit bears some explaining. We want to combine x and y into a single matrix
            # to do the PCA, so that all features are projected into the same space. 'in.first'
            # and 'in.second' just allow us to remember where 'x' and 'y' ends and starts. In 
            # addition, we also want to project the flipped features (to get the negative 
            # correlations), hence the 'in.first.neg'.
            in.first <- seq_len(nrow(x0))
            if (same) {
                in.second <- in.first
                combined <- t(x0)
            } else {
                in.second <- nrow(x0) + seq_len(nrow(y0))
                combined <- cbind(t(x0), t(y0))
            }
            flip <- direction!="positive"
            in.first.neg <- ncol(combined) + in.first

            nblocks <- lengths(block.info)
            stash <- .create_blocked_rank_matrix(combined, deferred=deferred, nblocks=nblocks,
                d=d, equiweight=equiweight, BSPARAM=BSPARAM, BPPARAM=BPPARAM, flip=flip)
            rank.out <- stash$rank.out
            search.out <- stash$search.out

            # Build the index once over the 'y' features; both directions query it.
            precomputed <- buildIndex(search.out[in.second,,drop=FALSE], BNPARAM=BNPARAM)

            if (!is.null(output$positive)) {
                if (same) {
                    # findKNN automatically excludes each point as its own neighbor.
                    nn.out <- findKNN(BNINDEX=precomputed, k=number, BPPARAM=BPPARAM, get.distance=FALSE)
                } else {
                    nn.out <- queryKNN(query=search.out[in.first,,drop=FALSE], k=number, 
                        get.distance=FALSE, BNINDEX=precomputed, BPPARAM=BPPARAM)
                }

                # NOTE(review): rowBlockApply() is from DelayedArray but is not in
                # this file's @importFrom directives; presumably imported in
                # another file of the package -- verify.
                rho <- rowBlockApply(rank.out[in.first,,drop=FALSE], FUN=.compute_exact_neighbor_rho, grid=TRUE,
                    nblocks=nblocks, other=rank.out[in.second,,drop=FALSE], indices=nn.out$index, BPPARAM=BPPARAM)
                output$positive[[ocounter]] <- .create_output_dataframe(nn.out$index, rho, positive=TRUE, 
                    nblocks=nblocks, equiweight=equiweight, chosen1=x.chosen, chosen2=y.chosen)
            }

            if (!is.null(output$negative)) {
                # Query with the flipped features: their nearest neighbors are the
                # most negatively correlated unflipped features.
                nn.out <- queryKNN(query=search.out[in.first.neg,,drop=FALSE], k=number + as.integer(same), 
                    get.distance=FALSE, BNINDEX=precomputed, BPPARAM=BPPARAM)

                # Searching for number + 1 and then stripping out any self-matches.
                if (same) {
                    discard <- nn.out$index==seq_len(nrow(nn.out$index))
                    discard[!rowAnys(discard),ncol(discard)] <- TRUE # removing the last.
                    stripped <- matrix(t(nn.out$index)[t(!discard)], ncol(nn.out$index) - 1L, nrow(nn.out$index)) # transposing so we refill by column-major.
                    nn.out$index <- t(stripped)
                }

                rho <- rowBlockApply(rank.out[in.first,,drop=FALSE], FUN=.compute_exact_neighbor_rho, grid=TRUE,
                    nblocks=nblocks, other=rank.out[in.second,,drop=FALSE], indices=nn.out$index, BPPARAM=BPPARAM)
                output$negative[[ocounter]] <- .create_output_dataframe(nn.out$index, rho, positive=FALSE, 
                    nblocks=nblocks, equiweight=equiweight, chosen1=x.chosen, chosen2=y.chosen)
            }

            ocounter <- ocounter + 1L
        }
    }

    # Collate the per-group results: sort by feature and p-value, keep the top
    # 'number' hits per feature, attach names and compute the FDR over all
    # implicitly tested pairs (hence n=total.ntests rather than nrow(final)).
    for (dir in names(output)) {
        combined <- do.call(rbind, output[[dir]])
        combined <- combined[order(combined$feature1, combined$p.value),,drop=FALSE]
        fragged <- splitAsList(combined, combined$feature1)
        fragged <- heads(fragged, number)
        final <- unlist(fragged, use.names=FALSE)
        final <- .fill_names(final, use.names, rownames(x), rownames(y))
        final$FDR <- p.adjust(final$p.value, method="BH", n=max(1, total.ntests)) # hack for bug.
        output[[dir]] <- final
    }

    output
}

######################
##### Internals ######
######################

#' @importFrom S4Vectors DataFrame splitAsList selfmatch
#' @importFrom BiocParallel SerialParam
#' @importFrom DelayedArray DelayedArray setAutoBPPARAM getAutoBPPARAM
#' @importFrom DelayedMatrixStats rowVars
# Categorizes features by the blocks of cells in which they are "available",
# i.e., have variance above 'tol'. (A feature with all-tied values in a block
# cannot yield a rank correlation there.) Returns a list of:
# - by: list of integer vectors grouping row indices of 'x' that share the
#   same availability pattern across blocks.
# - block: list of column (cell) indices belonging to each block.
# - available: DataFrame with one logical availability vector per block.
.find_block_availability <- function(x, block, tol=1e-8, BPPARAM=SerialParam()) {
    # Temporarily route DelayedArray's internal parallelization through BPPARAM.
    previous <- getAutoBPPARAM()
    setAutoBPPARAM(BPPARAM)
    on.exit(setAutoBPPARAM(previous))

    mat <- DelayedArray(x)
    every.col <- seq_len(ncol(mat))

    if (is.null(block)) {
        # No blocking: treat the entire matrix as a single block called "all".
        by.block <- list(every.col)
        usable <- list(all=rowVars(mat) > tol)
    } else {
        by.block <- splitAsList(every.col, block)
        usable <- lapply(by.block, function(cols) rowVars(mat, cols=cols) > tol)
    }

    # Rows of 'availabilities' with identical patterns are grouped together.
    availabilities <- do.call(DataFrame, c(usable, list(check.names=FALSE)))
    pattern.id <- selfmatch(availabilities)
    list(
        by=split(seq_along(pattern.id), pattern.id),
        block=by.block,
        available=availabilities
    )
}

#' @importFrom BiocGenerics cbind
#' @importFrom DelayedMatrixStats colVars
#' @importFrom Matrix colMeans t
#' @importFrom ScaledMatrix ScaledMatrix
#' @importFrom DelayedArray getAutoBPPARAM setAutoBPPARAM
#' @importFrom scran scaledColRanks
#' @importFrom BiocSingular runPCA
# Converts the combined (cell x feature) matrix 'x' into scaled rank values,
# computed separately for each block of cells, plus a compressed representation
# for the nearest-neighbor search. Returns a list of:
# - rank.out: per-feature scaled ranks (features in rows after the final cbind),
#   used downstream for exact rho computation.
# - search.out: PCA coordinates of the ranks (or the raw ranks if d=NA),
#   with per-block coordinates concatenated along the columns.
# If flip=TRUE, negated copies of all features are appended so that negative
# correlations can be found as nearest neighbors of the flipped features;
# this relies on scaledColRanks(-x) == -scaledColRanks(x).
.create_blocked_rank_matrix <- function(x, deferred, nblocks, ..., d, equiweight, flip, BSPARAM, BPPARAM) {
    # Route DelayedArray's internal parallelization through BPPARAM, restoring
    # the previous setting on exit.
    old <- getAutoBPPARAM()
    setAutoBPPARAM(BPPARAM)
    on.exit(setAutoBPPARAM(old))

    # Remember, features are columns coming into this function!
    # So we have to split by rows, as cells are in different batches.
    rank.out <- search.out <- vector("list", length(nblocks))
    last <- 0L

    for (i in seq_along(nblocks)) {
        # Rows (cells) belonging to the i-th block; 'last' tracks the running offset.
        chosen <- last + seq_len(nblocks[i])
        x0 <- x[chosen,,drop=FALSE]

        if (!deferred) {
            # Dense path: compute the scaled ranks directly, transposed so that
            # features end up in the rows.
            y <- scaledColRanks(x0, BPPARAM=BPPARAM, transposed=TRUE)
            if (flip) {
                # Flipping to compute the negative correlations.
                # This takes advantage of the fact that scaledColRanks(-x0) == -y.
                y <- rbind(y, -y)
            }
        } else {
            # Deferred path: keep the ranks sparse and defer the centering via
            # ScaledMatrix, to avoid materializing a dense centered matrix.
            y <- scaledColRanks(x0, transposed=FALSE, as.sparse=TRUE, BPPARAM=BPPARAM)
            center <- colMeans(y)
            if (flip) {
                # Flipped features have negated ranks and negated centers.
                y <- cbind(y, -y)
                center <- c(center, -center)
            }
            y <- ScaledMatrix(y, center=center)
            y <- t(y)
        }
        rank.out[[i]] <- y

        if (!is.na(d)) {
            BSPARAM@deferred <- deferred # TODO: add easier setters.
            search.out[[i]] <- runPCA(rank.out[[i]], rank=d, get.rotation=FALSE, BSPARAM=BSPARAM, BPPARAM=BPPARAM)$x
        } else {
            # No approximation requested: search directly on the rank values.
            search.out[[i]] <- as.matrix(rank.out[[i]])
        }

        # Equalizing the distance contribution based on the total variance.
        # This is not necessary for d=NA because the rank scaling does this for us,
        # but the PCA may change the relative contributions, so we have to do something.
        if (!is.na(d) && equiweight) {
            total.var <- sum(colVars(search.out[[i]]))
            search.out[[i]] <- search.out[[i]] / sqrt(total.var)
        }

        last <- last + nblocks[i]
    }

    # Concatenate the per-block results along the columns (cells/PCs).
    rank.out <- do.call(cbind, rank.out)
    search.out <- do.call(cbind, search.out)
    list(rank.out=rank.out, search.out=search.out)
}

#' @importFrom Matrix rowSums
#' @importFrom DelayedArray currentViewport makeNindexFromArrayViewport
# Computes the exact Spearman correlation between each feature (row) of 'block'
# and its nearest neighbors (rows of 'other'), separately for each block of
# cells. Both matrices hold scaled rank values, so for a pair of rank vectors
# r1 and r2, rho = 1 - 2 * sum((r1 - r2)^2) -- the same identity that turns the
# correlation search into a nearest-neighbor search.
# Returns a list with one (feature x neighbor) matrix of rho values per block.
.compute_exact_neighbor_rho <- function(block, other, indices, nblocks) {
    # When dispatched via rowBlockApply, restrict 'indices' to the rows that
    # correspond to the current grid viewport.
    vp <- currentViewport()
    keep <- makeNindexFromArrayViewport(vp, expand.RangeNSBS=TRUE)[[1]]
    if (!is.null(keep)) {
        indices <- indices[keep,,drop=FALSE]
    }

    template <- matrix(0, nrow(block), ncol(indices))
    collected <- rep(list(template), length(nblocks))

    for (k in seq_len(ncol(indices))) {
        # Difference between each feature's ranks and those of its k-th neighbor.
        delta <- block - as.matrix(other[indices[,k],,drop=FALSE])
        squared <- delta^2

        # Accumulate the sum of squared differences within each cell block.
        # With a single block, the span covers every column, reproducing the
        # unblocked calculation.
        offset <- 0L
        for (b in seq_along(nblocks)) {
            span <- offset + seq_len(nblocks[b])
            collected[[b]][,k] <- 1 - 2*rowSums(squared[,span,drop=FALSE])
            offset <- offset + nblocks[b]
        }
    }

    collected
}

#' @importFrom S4Vectors DataFrame
#' @importFrom metapod parallelStouffer
#' @importFrom scran rhoToPValue
# Assembles the neighbor-search results for one availability group into a
# DataFrame of (feature1, feature2, rho, p.value) rows.
# 'indices' is a (feature x number) matrix of neighbor indices into 'chosen2';
# 'rho' is the still-fragmented output of rowBlockApply, i.e., a list with one
# element per grid block, each itself a list of per-cell-block matrices.
# 'chosen1'/'chosen2' map local indices back to row indices of the inputs.
.create_output_dataframe <- function(indices, rho, nblocks, equiweight, positive, chosen1, chosen2) {
    # Stitch the grid fragments back together: after this, 'rho' is a list with
    # one full (feature x neighbor) matrix per cell block.
    rho <- do.call(mapply, c(list(FUN=rbind, SIMPLIFY=FALSE), rho)) # still fragmented from the blockApply.
    # .compute_mean_rho is defined elsewhere in the package; presumably it
    # averages rho across blocks (weighting by cell number when
    # equiweight=FALSE) -- TODO confirm against its definition.
    mean.rho <- .compute_mean_rho(rho, nblocks, equiweight)

    # 'indices' is flattened column-major, so feature1 must repeat the full
    # 'chosen1' vector once per neighbor column to stay aligned with feature2.
    self <- rep(chosen1, ncol(indices))
    df <- DataFrame(feature1=self, feature2=chosen2[as.vector(indices)], rho=as.vector(mean.rho))
    # One-sided p-values per block, against the null of zero correlation.
    p.values <- mapply(FUN=rhoToPValue, rho=rho, n=nblocks, MoreArgs=list(positive=positive), SIMPLIFY=FALSE)

    if (length(nblocks)==1L) {
        p.value <- p.values[[1]]
    } else {
        # Combining all the one-sided p-values together.
        if (equiweight) {
            weights <- NULL
        } else {
            weights <- nblocks
        }
        p.value <- parallelStouffer(p.values, weights=weights)$p.value
    }

    df$p.value <- as.vector(p.value)
    df
}

#######################
##### S4 methods ######
#######################

#' @export
#' @rdname findTopCorrelations
setGeneric("findTopCorrelations", function(x, number, ...) standardGeneric("findTopCorrelations"))

#' @export
#' @rdname findTopCorrelations
setMethod("findTopCorrelations", "ANY", .find_top_correlations)

#' @export
#' @rdname findTopCorrelations
#' @importFrom SummarizedExperiment assay
# SummarizedExperiment method: extracts the requested assay from 'x' (and from
# 'y', if it is also a SummarizedExperiment) before delegating to the ANY method.
# .format_xy_for_correlations is defined elsewhere in the package; presumably it
# also resolves 'use.names' against rowData columns as documented above -- TODO confirm.
setMethod("findTopCorrelations", "SummarizedExperiment", function(x, number, y=NULL, use.names=TRUE, ..., assay.type="logcounts") {
    formatted <- .format_xy_for_correlations(x, y, use.names=use.names, assay.type=assay.type)
    .find_top_correlations(formatted$x, number=number, y=formatted$y, use.names=formatted$use.names, ...)
})
# LTLA/mumosa documentation built on March 10, 2024, 1:20 a.m.