R/makeSNNGraph.R
In bluster: Clustering Algorithms for Bioconductor

Documented in makeKNNGraph makeSNNGraph neighborsToKNNGraph neighborsToSNNGraph

#' Build a nearest-neighbor graph
#'
#' Build a shared or k-nearest-neighbors graph of observations for downstream community detection.
#'
#' @param x A matrix-like object containing expression values for each observation (row) and dimension (column).
#' @param k An integer scalar specifying the number of nearest neighbors to consider during graph construction.
#' @param type A string specifying the type of weighting scheme to use for shared neighbors.
#' @param directed A logical scalar indicating whether the output of \code{buildKNNGraph} should be a directed graph.
#' @param BNPARAM A \linkS4class{BiocNeighborParam} object specifying the nearest neighbor algorithm.
#' @param BPPARAM A \linkS4class{BiocParallelParam} object to use for parallel processing.
#' @param indices An integer matrix where each row corresponds to an observation
#' and contains the indices of the \code{k} nearest neighbors (by increasing distance and excluding self) from that observation.
#' 
#' @details
#' The \code{makeSNNGraph} method builds a shared nearest-neighbour graph using observations as nodes.
#' For each observation, its \code{k} nearest neighbours are identified using the \code{\link{findKNN}} function,
#' based on distances between their expression profiles (Euclidean by default).
#' An edge is drawn between all pairs of observations that share at least one neighbour,
#' weighted by the characteristics of the shared nearest neighbors:
#' \itemize{
#' \item If \code{type="rank"}, the weighting scheme defined by Xu and Su (2015) is used.
#' The weight between two nodes is \eqn{k - r/2} where \eqn{r} is the smallest sum of ranks for any shared neighboring node.
#' For example, if one node was the closest neighbor of each of two nodes, the weight between the two latter nodes would be \eqn{k - 1}.
#' For the purposes of this ranking, each node has a rank of zero in its own nearest-neighbor set. 
#' More shared neighbors, or shared neighbors that are close to both observations, will generally yield larger weights.
#' \item If \code{type="number"}, the weight between two nodes is simply the number of shared nearest neighbors between them.
#' The weight can range from zero to \eqn{k + 1}, as the node itself is included in its own nearest-neighbor set.
#' This is a simpler scheme that is also slightly faster but does not account for the ranking of neighbors within each set.
#' \item If \code{type="jaccard"}, the weight between two nodes is the Jaccard similarity between the two sets of neighbors for those nodes.
#' This weight can range from zero to 1, and is a monotonic transformation of the weight used by \code{type="number"}.
#' It is provided for consistency with other clustering algorithms such as those in \pkg{seurat}.
#' }
#' 
#' The aim is to use the SNN graph to perform clustering of observations via community detection algorithms in the \pkg{igraph} package.
#' This is faster and more memory efficient than hierarchical clustering for large numbers of observations.
#' In particular, it avoids the need to construct a distance matrix for all pairs of observations.
#' Only the identities of nearest neighbours are required, which can be obtained quickly with methods in the \pkg{BiocNeighbors} package.
#' 
#' The choice of \code{k} controls the connectivity of the graph and the resolution of community detection algorithms.
#' Smaller values of \code{k} will generally yield smaller, finer clusters, while increasing \code{k} will increase the connectivity of the graph and make it more difficult to resolve different communities.
#' The value of \code{k} can be roughly interpreted as the anticipated size of the smallest subpopulation.
#' If a subpopulation in the data has fewer than \code{k+1} observations, \code{buildSNNGraph} and \code{buildKNNGraph} will forcibly construct edges between observations in that subpopulation and observations in other subpopulations. 
#' This increases the risk that the subpopulation will not form its own cluster as it is more interconnected with the rest of the observations in the dataset.
#' 
#' Note that the setting of \code{k} here is slightly different from that used in SNN-Cliq.
#' The original implementation considers each observation to be its first nearest neighbor that contributes to \code{k}.
#' In \code{buildSNNGraph}, the \code{k} nearest neighbours refers to the number of \emph{other} observations.
#' 
#' The \code{makeKNNGraph} method builds a simpler k-nearest neighbour graph.
#' Observations are again nodes, and edges are drawn between each observation and its k-nearest neighbours.
#' No weighting of the edges is performed.
#' In theory, these graphs are directed as nearest neighour relationships may not be reciprocal.
#' However, by default, \code{directed=FALSE} such that an undirected graph is returned.
#'
#' The \code{neighborsToSNNGraph} and \code{neighborsToKNNGraph} functions operate directly on a matrix of nearest neighbor indices,
#' obtained using functions like \code{\link{findKNN}}.
#' This may be useful for constructing a graph from precomputed nearest-neighbor search results.
#' Note that the user is responsible for ensuring that the indices are valid (i.e., \code{range(indices)} is positive and no greater than \code{max(indices)}).
#' 
#' @return
#' A \link{graph} where nodes are cells and edges represent connections between nearest neighbors.
#' For \code{buildSNNGraph}, these edges are weighted by the number of shared nearest neighbors.
#' For \code{buildKNNGraph}, edges are not weighted but may be directed if \code{directed=TRUE}.
#' 
#' @author 
#' Aaron Lun, with KNN code contributed by Jonathan Griffiths.
#' 
#' @seealso
#' See \code{\link{make_graph}} for details on the graph output object.
#' 
#' See \code{\link{cluster_walktrap}}, \code{\link{cluster_louvain}} and related functions in \pkg{igraph} for clustering based on the produced graph.
#' 
#' Also see \code{\link{findKNN}} for specifics of the nearest-neighbor search.
#' 
#' @references
#' Xu C and Su Z (2015).
#' Identification of cell types from single-cell transcriptomes using a novel clustering method.
#' \emph{Bioinformatics} 31:1974-80
#' 
#' @examples
#' m <- matrix(rnorm(10000), ncol=10)
#'
#' g <- makeSNNGraph(m)
#' clusters <- igraph::cluster_fast_greedy(g)$membership
#' table(clusters)
#'
#' # Any clustering method from igraph can be used:
#' clusters <- igraph::cluster_walktrap(g)$membership
#' table(clusters)
#'
#' # Smaller 'k' usually yields finer clusters:
#' g <- makeSNNGraph(m, k=5)
#' clusters <- igraph::cluster_walktrap(g)$membership
#' table(clusters)
#'
#' @name makeSNNGraph
NULL

#' @export
#' @rdname makeSNNGraph
#' @importFrom BiocNeighbors KmknnParam findKNN
#' @importFrom BiocParallel SerialParam 
makeSNNGraph <- function(x, k=10, type=c("rank", "number", "jaccard"), BNPARAM=KmknnParam(), BPPARAM=SerialParam()) { 
    nn.out <- findKNN(x, k=k, BNPARAM=BNPARAM, BPPARAM=BPPARAM, get.distance=FALSE)
    neighborsToSNNGraph(nn.out$index, type=match.arg(type))
}

#' @export
#' @rdname makeSNNGraph
#' @importFrom BiocParallel SerialParam
#' @importFrom BiocNeighbors KmknnParam findKNN
makeKNNGraph <- function(x, k=10, directed=FALSE, BNPARAM=KmknnParam(), BPPARAM=SerialParam()) { 
    nn.out <- findKNN(x, k=k, BNPARAM=BNPARAM, BPPARAM=BPPARAM, get.distance=FALSE)
    neighborsToKNNGraph(nn.out$index, directed=directed)
}

#' @export
#' @rdname makeSNNGraph
#' @importFrom igraph make_graph simplify E "E<-"
neighborsToSNNGraph <- function(indices, type=c("rank", "number", "jaccard")) {
    type <- match.arg(type)
    if (type=="rank") {
        g.out <- build_snn_rank(indices)
    } else {
        g.out <- build_snn_number(indices)
    }

    edges <- g.out[[1]] 
    weights <- g.out[[2]]
    if (type=="jaccard") {
        weights <- weights / (2 * (ncol(indices) + 1) - weights)
    }

    g <- make_graph(edges, directed=FALSE)
    E(g)$weight <- weights
    simplify(g, edge.attr.comb="first") # symmetric, so doesn't really matter.
}


#' @export
#' @rdname makeSNNGraph
#' @importFrom igraph make_graph simplify
neighborsToKNNGraph <- function(indices, directed=FALSE) {
    start <- as.vector(row(indices))
    end <- as.vector(indices)
    interleaved <- as.vector(rbind(start, end))
    
    if (directed) { 
        g <- make_graph(interleaved, directed=TRUE)
    } else {
        g <- make_graph(interleaved, directed=FALSE)
        g <- simplify(g, edge.attr.comb = "first")
    }
    g
}