bluster: Clustering Algorithms for Bioconductor

Documented in DbscanParam

#' Density-based clustering with DBSCAN
#'
#' Perform density-based clustering with a fast re-implementation of the DBSCAN algorithm.
#'
#' @inheritParams clusterRows
#' @param eps Numeric scalar specifying the distance to use to define neighborhoods.
#' If \code{NULL}, this is determined from \code{min.pts} and \code{core.prop}.
#' @param min.pts Integer scalar specifying the minimum number of neighboring observations required for an observation to be a core point.
#' @param core.prop Numeric scalar specifying the proportion of observations to treat as core points.
#' This is only used when \code{eps=NULL}, see Details.
#' @param chunk.size Integer scalar specifying the number of points to process per chunk.
#' @param BNPARAM A \linkS4class{BiocNeighborParam} object specifying the algorithm to use for the neighbor searches.
#' This should be able to support both nearest-neighbor and range queries.
#' @param num.threads Integer scalar specifying the number of threads to use.
#' @param BPPARAM Deprecated and ignored, use \code{num.threads} instead.
#' @param full Logical scalar indicating whether additional statistics should be returned.
#'
#' @details
#' DBSCAN operates by identifying core points, i.e., observations with at least \code{min.pts} neighbors within a distance of \code{eps}.
#' It identifies which core points are neighbors of each other, one \code{chunk.size} at a time, forming components of connected core points.
#' All non-core points are then connected to the closest core point within \code{eps}.
#' All groups of points that are connected in this manner are considered to be part of the same cluster.
#' Any unconnected non-core points are treated as noise and reported as \code{NA}.
#'
#' As a suitable value of \code{eps} may not be known beforehand, we can automatically determine it from the data.
#' For all observations, we compute the distance to the \eqn{k}th neighbor where \eqn{k} is defined as \code{round(min.pts * core.prop)}.
#' We then define \code{eps} as the \code{core.prop} quantile of the distances across all observations.
#' The default of \code{core.prop=0.5} means that around half of the observations will be treated as core points.
#'
#' Larger values of \code{eps} will generally result in fewer observations classified as noise, as they are more likely to connect to a core point.
#' It may also promote agglomeration of existing clusters into larger entities if they are connected by regions of (relatively) low density.
#' Conversely, larger values of \code{min.pts} will generally increase the number of noise points and may fragment larger clusters into subclusters.
#' 
#' To modify an existing DbscanParam object \code{x},
#' users can simply call \code{x[[i]]} or \code{x[[i]] <- value} where \code{i} is any argument used in the constructor.
#' 
#' @return
#' The \code{DbscanParam} constructor will return a \linkS4class{DbscanParam} object with the specified parameters.
#'
#' The \code{clusterRows} method will return a factor of length equal to \code{nrow(x)} containing the cluster assignments.
#' Note that this may contain \code{NA} values corresponding to noise points.
#' If \code{full=TRUE}, a list is returned with \code{clusters} (the factor, as above) and \code{objects} 
#' (a list containing the \code{eps} and \code{min.pts} used in the analysis).
#' 
#' @examples
#' clusterRows(iris[,1:4], DbscanParam())
#' clusterRows(iris[,1:4], DbscanParam(core.prop=0.8))
#'
#' @references
#' Ester M et al. (1996).
#' A density-based algorithm for discovering clusters in large spatial databases with noise.
#' \emph{Proceedings of the Second International Conference on Knowledge Discovery and Data Mining}, 226-231.
#' 
#' @author Aaron Lun
#' @name DbscanParam-class
#' @docType class
#' @aliases
#' show,DbscanParam-method
NULL

#' @export
setClass(
    "DbscanParam", 
    contains="BlusterParam",
    slots=c(
        eps="numeric_OR_NULL",
        min.pts="integer",
        core.prop="numeric", 
        chunk.size="integer",
        BNPARAM="BiocNeighborParam",
        num.threads="integer"
    )
)

#' @export
#' @rdname DbscanParam-class
DbscanParam <- function(eps=NULL, min.pts=5, core.prop=0.5, chunk.size=1000, BNPARAM=KmknnParam(), num.threads=1, BPPARAM=NULL) {
    if (!is.null(BPPARAM)) {
        num.threads <- BiocParallel::bpnworkers(BPPARAM)
    }
    new("DbscanParam",
        eps=eps,
        min.pts=as.integer(min.pts), 
        core.prop=as.double(core.prop), 
        chunk.size=as.integer(chunk.size),
        num.threads=as.integer(num.threads),
        BNPARAM=BNPARAM
    )
}

#' @export
setMethod("show", "DbscanParam", function(object) {
    callNextMethod()
    cat(sprintf("eps: %s\n", if (is.null(object[["eps"]])) "default" else object[["eps"]]))
    for (i in c("min.pts", "core.prop", "num.threads")) {
        cat(sprintf("%s: %s\n", i, object[[i]]))
    }
    for (i in c("BNPARAM")) {
        cat(sprintf("%s: %s\n", i, class(object[[i]])[1]))
    }
})

setValidity2("DbscanParam", function(object) {
    msg <- character(0)

    if (!is.null(object[["eps"]])) {
        msg <- c(msg, .check_positive_slots(object, "eps"))
    }

    msg <- c(msg, .check_positive_slots(object, c("min.pts", "core.prop", "num.threads")))

    if (object[["core.prop"]] > 1) {
        msg <- c(msg, "'core.prop' should not be greater than 1")
    }

    if (length(msg)) {
        return(msg)
    }
    TRUE
})

#' @export
#' @rdname DbscanParam-class
setMethod("clusterRows", c("ANY", "DbscanParam"), function(x, BLUSPARAM, full=FALSE) {
    out <- .DBSCAN(x, 
        eps=BLUSPARAM[["eps"]],
        min.pts=BLUSPARAM[["min.pts"]],
        core.prop=BLUSPARAM[["core.prop"]],
        chunk.size=BLUSPARAM[["chunk.size"]],
        BNPARAM=BLUSPARAM[["BNPARAM"]],
        num.threads=BLUSPARAM[["num.threads"]]
    )

    clusters <- out$clusters
    clusters[clusters==0] <- NA_integer_
    clusters <- factor(clusters)

    if (full) {
        out$clusters <- NULL
        list(clusters=clusters, objects=out)
    } else {
        clusters
    }
})

#' @importFrom BiocNeighbors buildIndex findNeighbors queryKNN findDistance
#' @importFrom stats quantile
#' @importFrom igraph make_graph components
#' @importFrom utils head tail
.DBSCAN <- function(x, eps=NULL, min.pts=5, core.prop=0.5, chunk.size=1000, BNPARAM=KmknnParam(), num.threads=1) {
    x <- as.matrix(x)

    # Finding all core points, using a quick NN algorithm.
    all.dist <- findDistance(x, k=min.pts, BNPARAM=BNPARAM, num.threads=num.threads)
    if (is.null(eps)) {
        eps <- quantile(all.dist, core.prop)
    } 
    is.core <- all.dist <= eps

    # Identifying core-core components.
    n.core <- sum(is.core)
    core.idx <- buildIndex(x[is.core,,drop=FALSE], BNPARAM=BNPARAM)
    remaining <- seq_len(n.core)
    to <- vector("list", n.core)

    while (length(remaining)) {
        chosen <- head(remaining, chunk.size)
        neighbors <- findNeighbors(BNINDEX=core.idx, subset=chosen, threshold=eps, get.distance=FALSE, num.threads=num.threads)$index
        to[chosen] <- neighbors
        remaining <- setdiff(tail(remaining, -chunk.size), unlist(neighbors)) 
    }

    from <- rep(seq_along(to), lengths(to))
    to <- unlist(to)
    g <- make_graph(rbind(from, to), n=n.core, directed=FALSE)
    core.clusters <- components(g)$membership

    # Finding closest core point (within range) for all non-core points.
    clusters <- integer(nrow(x))
    clusters[is.core] <- core.clusters

    if (n.core) {
        closest.core <- queryKNN(core.idx, query=x[!is.core,,drop=FALSE], num.threads=num.threads, k=1)
        in.range <- closest.core$distance <= eps
        clusters[which(!is.core)[in.range]] <- core.clusters[closest.core$index[in.range]]
    }

    list(clusters=clusters, eps=eps, min.pts=min.pts)
}