#' Cluster or group key - value(s) pairs
#'
#' \code{cluster_kv} is a function that will cluster or group keys based on
#' values provided for the keys. Multiple formats for `return` are available.
#'
#' @usage cluster_kv(key, val)
#' @usage cluster_kv(key, val, return = "standard")
#'
#' @description Function for clustering keys based on value content. Clustering
#' or grouping is solely based on presence / absence of values across keys. For
#' example, if we have the key - values of A = c(1, 2, 3), B = c(3, 4, 5), and
#' C = c(6, 7, 8), then keys A and B will be clustered together because they
#' share the value 3, but C will not be clustered with either A or B because it
#' does not share any values with the respective keys. This type of clustering
#' can be helpful for grouping keys based on unique IDs, such as readnames or
#' character strings representing unique alignment locations.
#'
#' @param key,val vector coercible into a factor vector. Both key and val
#' vectors need to be equal length. Output will be of equal length and in same
#' input order.
#' @param return options for output returned. "standard" will return a numeric
#' vector of grouping IDs and is the default. "data.frame" will return a
#' data.frame with key, val, and clus columns. "simple" will return a numeric
#' vector of grouping IDs with the names associated with unique keys. Lastly,
#' "graph" will return a simplified graph with the keys as nodes and edges
#' indicating which keys share values.
#'
#' @examples
#' key <- c("A", "A", "A", "B", "B", "B", "C", "C", "C")
#' val <- c(1, 2, 3, 3, 4, 5, 6, 7, 8)
#' cluster_kv(key, val)
#' cluster_kv(key, val, return = "graph")
#' cluster_kv(key, val, return = "data.frame")
#' cluster_kv(key, val, return = "simple")
#'
#' @author Christopher Nobles, Ph.D.
#'
#' @importFrom magrittr %>%
#' @export
#'
cluster_kv <- function(key, val, return = "standard") {
  # Group keys that share at least one value, directly or through a chain of
  # other keys, and report the grouping in the format selected by `return`:
  #   "standard"   - cluster ID for every input element, in input order
  #   "data.frame" - data.frame with key, val and clus columns
  #   "simple"     - cluster ID per unique key, named by key
  #   "graph"      - simplified igraph graph with keys as nodes
  # Errors when `return` is not one of the formats above or when `key` and
  # `val` differ in length.

  # Check inputs
  return <- match.arg(return, c("standard", "data.frame", "simple", "graph"))
  stopifnot(length(key) == length(val))

  # Factorize keys and values for consistency of indexing: the integer codes
  # of key_fac are used as node IDs and those of val_fac as mock positions.
  key_fac <- factor(key)
  val_fac <- factor(val)

  # Construct mock GRangesList where positions represent value indices.
  # Split on key_fac rather than the raw key so that list element i always
  # corresponds to level i of key_fac, even if `key` is a factor that carries
  # unused levels (factor() has already dropped those from key_fac).
  mock_ranges <- GenomicRanges::GRanges(
    seqnames = "mock",
    ranges = IRanges::IRanges(start = as.integer(val_fac), width = 1),
    strand = "*"
  )
  grl <- GenomicRanges::reduce(GenomicRanges::split(mock_ranges, key_fac))

  # Determine which keys overlap based on associated values. The list is
  # compared against itself, so the hit matrix holds index pairs of keys;
  # simplify() removes the loop and duplicate edges that result.
  hits <- as.matrix(GenomicRanges::findOverlaps(grl))
  g <- igraph::simplify(igraph::graph_from_edgelist(hits, directed = FALSE))

  # Cluster ID per key level, then expanded back to one ID per input element.
  key_clus <- igraph::membership(igraph::components(g))
  elem_clus <- key_clus[as.integer(key_fac)]

  # Return data in the requested format
  switch(
    return,
    standard = elem_clus,
    data.frame = data.frame(key = key, val = val, clus = elem_clus),
    simple = structure(key_clus, names = levels(key_fac)),
    graph = g
  )
}
# Add the following code to your website.
# For more information on customizing the embed code, read Embedding Snippets.