R/1003-clusterJarvis-Patrick.R

######################################
#' Nearest Neighbors
#'
#' Nearest Neighbors
#'
#' Computes the nearest neighbors of descriptors in an FPset or APset object for use with the 
#' \code{\link{jarvisPatrick}} clustering function. Only one of \code{numNbrs} or \code{cutoff} 
#' should be given, \code{cutoff} will take precedence if both are given. If \code{numNbrs} is 
#' given, then that many neighbors will be returned for each item in the set. If \code{cutoff} 
#' is given, then, for each item X, every neighbor that has a similarity value greater than or 
#' equal to  the cutoff will be returned in the neighbor list for X.
#' 
#' @param x Either an \code{FPset} or an \code{APset}.
#' @param numNbrs Number of neighbors to find for each item. If not enough neighbors can be found the 
#'                matrix will be padded with NA.
#' @param cutoff The minimum similarity value an item must have to another item in order to be included 
#'               in that items neighbor list. This parameter takes precedence over \code{numNbrs}. This 
#'               parameter allows to obtain tighter clustering results.               
#' @param ... These parameters will be passed into the distance function used
#' 
#' @return The return value is a list with the following components:
#' \itemize{
#' \item \code{indexes} - index values of nearest neighbors, for each item. If \code{cutoff} is
#' used, this will be a list of lists, otherwise it will be a matrix
#' \item \code{names} - The names of each item in the set, as returned by cid
#' \item \code{similarities} - The similarity values of each neighbor to the item for that row. 
#' This will also be either a list of lists or a matrix, depending on whether or not
#' \code{cutoff} was used. Each similarity values corresponds to the id number in the 
#' same position in the indexes entry
#' }
#' 
#' @keywords nearest Neighbors NNeighbors
#'
#' @aliases NNeighbors
#' 
#' @author Min-feng Zhu <\email{wind2zhu@@163.com}>
#' 
#' @seealso See \code{\link{clusterJP}} for Jarvis-Patrick Clustering
#' 
#' @export NNeighbors
#' 
#' @examples
#' data(sdfbcl)
#' apbcl = convSDFtoAP(sdfbcl)
#' nnm = NNeighbors(apbcl, cutoff = 0.5)
#' 

NNeighbors <- function (x, numNbrs = NULL, cutoff = NULL, ...) {

    nnm = ChemmineR::nearestNeighbors(x, numNbrs = numNbrs, cutoff = cutoff, ...)
    nnm
  
}



#' Jarvis-Patrick Clustering
#' 
#' Jarvis-Patrick Clustering
#' 
#' Function to perform Jarvis-Patrick clustering. The algorithm requires a nearest 
#' neighbor table, which consists of neighbors for each item in the dataset. This 
#' information is then used to join items into clusters with the following 
#' requirements: (a) they are contained in each other's neighbor list (b) they share
#' at least k nearest neighbors The nearest neighbor table can be computed with 
#' \code{\link{NNeighbors}}. For standard Jarvis-Patrick clustering, this function 
#' takes the number of neighbors to keep for each item. 
#' 
#' @param nnm  A nearest neighbor table, as produced by \code{\link{nearestNeighbors}}.
#'
#' @param k  Minimum number of nearest neighbors two rows (items) in the nearest neighbor 
#'           table need to have in common to join them into the same cluster. 
#' 
#' @param mode If \code{mode = "a1a2b"} (default), the clustering is run with both 
#'             requirements (a) and (b); if \code{mode = "a1b"} then (a) is relaxed 
#'             to a unidirectional requirement; and if \code{mode = "b"} then only
#'             requirement (b) is used. The size of the clusters generated by the different 
#'             methods increases in this order: "a1a2b" < "a1b" < "b". The run time of 
#'             method "a1a2b" follows a close to linear relationship, while it is nearly
#'             quadratic for the much more exhaustive method "b". Only methods "a1a2b" and 
#'             "a1b" are suitable for clustering very large data sets (e.g. >50,000 items) 
#'             in a reasonable amount of time.  
#'             
#' @param linkage Can be one of "single", "average", or "complete", for single linkage, 
#'                average linkage and complete linkage merge requirements, respectively. 
#'                In the context of Jarvis-Patrick, average linkage means that at least half 
#'                of the pairs between the clusters under consideration must pass the merge 
#'                requirement. Similarly, for complete linkage, all pairs must pass the merge 
#'                requirement. Single linkage is the normal case for Jarvis-Patrick and just 
#'                means that at least one pair must meet the requirement.         
#' 
#' @return Depending on the setting under the type argument, the function returns the clustering 
#'         result in a named vector or a nearest neighbor table as matrix.
#' 
#' @keywords clusterJP 
#'
#' @aliases clusterJP
#' 
#' @author Min-feng Zhu <\email{wind2zhu@@163.com}>
#' 
#' @seealso see \code{\link{NNeighbors}} for nearest neighbors 
#' 
#' @export clusterJP
#' 
#' @references 
#' Jarvis RA, Patrick EA (1973) 
#' Clustering Using a Similarity Measure Based on Shared Near Neighbors. IEEE
#' Transactions on Computers, C22, 1025-1034. URLs: http://davide.eynard.it/teaching/2012_PAMI/JP.pdf,
#' http://www.btluke.com/jpclust.html, 
#' http://www.daylight.com/dayhtml/doc/cluster/index.pdf
#' 
#' @examples
#' data(sdfbcl)
#' apbcl <- convSDFtoAP(sdfbcl)
#' fpbcl <- convAPtoFP(apbcl)
#' clusterJP(NNeighbors(apbcl, numNbrs = 6), k = 5, mode = "a1a2b")
#' clusterJP(NNeighbors(fpbcl, numNbrs = 6), k = 5, mode = "a1b")
#' clusterJP(NNeighbors(apbcl, cutoff = 0.6), k = 2, mode = 'b')
#' 
clusterJP <- function (nnm,  k, mode = "a1a2b", linkage = "single") {      
  ## Run Jarvis-Patrick clustering on nearest neighbor matrix (nnm)
    clusters = ChemmineR::jarvisPatrick(nnm, k = k, mode = mode, linkage = linkage)
    return(clusters)
}

Try the BioMedR package in your browser

Any scripts or data that you put into this service are public.

BioMedR documentation built on July 5, 2019, 9:03 a.m.