gintools: Genomic DNA Integration Analysis Tools

Documented in cluster_kv

#' Cluster or group key - value(s) pairs
#' 
#' \code{cluster_kv} is a function that will cluster or group keys based on 
#' values provided for the keys. Multiple formats for `return` are available.
#' 
#' @usage cluster_kv(key, val)
#' @usage cluster_kv(key, val, return = "standard")
#' 
#' @description Function for clustering keys based on value content. Clustering
#' or grouping is solely based on presence / absence of values across keys. For
#' example, if we have the key - values of A = c(1, 2, 3), B = c(3, 4, 5), and
#' C = c(6, 7, 8), then keys A and B will be clustered together because they 
#' share the value 3, but C will not be clustered with either A or B because it
#' does not share any values with the respective keys. This type of clustering
#' can be helpful for grouping keys based on unique IDs, such as readnames or
#' character strings representing unique alignment locations.
#' 
#' @param key,val vector coercible into a factor vector. Both key and val 
#' vectors need to be equal length. Output will be of equal length and in same 
#' input order.
#' @param return options for output returned. "standard" will return a numeric 
#' vector of grouping IDs and is the default. "data.frame" will return a 
#' data.frame with key, val, and clus columns. "simple" will return a numeric
#' vector of grouping IDs with the names associated with unique keys. Lastly,
#' "graph" will return a simplified graph with the keys as nodes and edges 
#' indicating which keys share values.
#' 
#' @examples
#' key <- c("A", "A", "A", "B", "B", "B", "C", "C", "C")
#' val <- c(1, 2, 3, 3, 4, 5, 6, 7, 8)
#' cluster_kv(key, val)
#' cluster_kv(key, val, return = "graph")
#' cluster_kv(key, val, return = "data.frame")
#' cluster_kv(key, val, return = "simple")
#' 
#' @author Christopher Nobles, Ph.D.
#' 
#' @importFrom magrittr %>%
#' @export
#' 

cluster_kv <- function(key, val, return = "standard") {
  # Validate inputs. `return` must be a single string: a vector of otherwise
  # valid options would pass the %in% check but cannot select one output format.
  stopifnot(
    length(return) == 1L,
    return %in% c("standard", "data.frame", "simple", "graph")
  )
  stopifnot(length(key) == length(val))

  # Factorize keys and values so both can be addressed by integer index.
  # factor() also drops unused levels when `key` is already a factor.
  key_fac <- factor(key)
  val_fac <- factor(val)

  # Build mock ranges in which each value occupies the width-1 position given
  # by its factor index.
  val_ranges <- GenomicRanges::GRanges(
    seqnames = "mock",
    ranges = IRanges::IRanges(start = as.integer(val_fac), width = 1L),
    strand = "*"
  )

  # Group the ranges by key. Splitting by `key_fac` (not the raw `key`) keeps
  # list element i aligned with level i of `key_fac`; splitting by a factor
  # that carries unused levels would add empty elements and shift the vertex
  # indices used for the lookups below.
  grl <- GenomicRanges::reduce(GenomicRanges::split(val_ranges, key_fac))

  # Keys whose ranges overlap share at least one value. Every key overlaps
  # itself, so each key appears as a vertex; simplify() then removes the
  # self-loops and duplicate edges.
  hits <- as.matrix(GenomicRanges::findOverlaps(grl))
  # graph_from_edgelist() / components() are the current names for the
  # deprecated graph.edgelist() / clusters().
  g <- igraph::simplify(igraph::graph_from_edgelist(hits, directed = FALSE))
  membership <- igraph::membership(igraph::components(g))

  # Cluster ID for every input element, in input order.
  key_clus <- membership[as.integer(key_fac)]

  # Return data in the requested format.
  switch(
    return,
    standard = key_clus,
    data.frame = data.frame(key = key, val = val, clus = key_clus),
    simple = structure(membership, names = levels(key_fac)),
    graph = g
  )
}