R/bdiv_table.r

#' Distance / dissimilarity between samples.
#' 
#' @inherit documentation_cmp
#' @inherit documentation_default
#' 
#' @family beta_diversity
#' 
#' @return
#' \itemize{
#'   \item{`bdiv_matrix()` - }{ An R matrix of samples x samples. }
#'   \item{`bdiv_distmat()` - }{ A dist-class distance matrix. }
#'   \item{`bdiv_table()` - }{
#'     A tibble data.frame with columns names .sample1, .sample2, .weighted, 
#'     .bdiv, .distance, and any fields requested by `md`. Numeric metadata 
#'     fields will be returned as `abs(x - y)`; categorical metadata fields as 
#'     `"x"`, `"y"`, or `"x vs y"`. }
#' }
#' 
#' 
#' @export
#' @examples
#'     library(rbiom)
#'     
#'     # Subset to four samples
#'     biom <- hmp50$clone()
#'     biom$counts <- biom$counts[,c("HMP18", "HMP19", "HMP20", "HMP21")]
#'     
#'     # Return in long format with metadata
#'     bdiv_table(biom, 'unifrac', md = ".all")
#'     
#'     # Only look at distances among the stool samples
#'     bdiv_table(biom, 'unifrac', md = c("==Body Site", "Sex"))
#'     
#'     # Or between males and females
#'     bdiv_table(biom, 'unifrac', md = c("Body Site", "!=Sex"))
#'     
#'     # All-vs-all matrix
#'     bdiv_matrix(biom, 'unifrac')
#'     
#'     # All-vs-all distance matrix
#'     dm <- bdiv_distmat(biom, 'unifrac')
#'     dm
#'     plot(hclust(dm))
#'

bdiv_table <- function (
    biom, bdiv = "Bray-Curtis", weighted = TRUE, tree = NULL, 
    md = ".all", within = NULL, between = NULL, delta = '.all', 
    trans = "none" ) {
  
  biom   <- as_rbiom(biom)
  params <- eval_envir(environment())
  cmd    <- sprintf("bdiv_table(%s)", as.args(params, fun = bdiv_table))
  
  
  #________________________________________________________
  # See if this result is already in the cache.
  #________________________________________________________
  cache_file <- get_cache_file('bdiv_table', params)
  if (isTRUE(attr(cache_file, 'exists', exact = TRUE)))
    return (readRDS(cache_file))
  
  
  #________________________________________________________
  # Strips '==' and '!='; appends to within and between.
  #________________________________________________________
  validate_var_cmp(c('md', 'delta'))
  
  
  #________________________________________________________
  # Validate and restructure user's arguments.
  #________________________________________________________
  validate_bdiv(max = Inf)
  validate_bool('weighted', max = Inf)
  
  validate_biom_field('md',      null_ok = TRUE, max = Inf)
  validate_biom_field('delta',   null_ok = TRUE, max = Inf)
  validate_biom_field('within',  null_ok = TRUE, max = Inf, col_type = "cat")
  validate_biom_field('between', null_ok = TRUE, max = Inf, col_type = "cat")
  
  
  
  
  #________________________________________________________
  # Multiple combinations of bdiv/weighted into one table.
  #________________________________________________________
  tbl <- NULL
  
  for (w in weighted)
    for (b in bdiv)
      tbl %<>% dplyr::bind_rows(local({
        
        #________________________________________________________
        # Compute the distance matrix
        #________________________________________________________
        mtx <- bdiv_matrix(
          biom     = biom, 
          bdiv     = b, 
          weighted = w, 
          tree     = tree, 
          within   = within, 
          between  = between, 
          trans    = trans )
        
        
        #________________________________________________________
        # Convert to long form
        #________________________________________________________
        tibble(
          '.sample1'  = rownames(mtx)[row(mtx)],
          '.sample2'  = colnames(mtx)[col(mtx)],
          '.weighted' = w,
          '.bdiv'     = b,
          '.distance' = as.numeric(mtx) ) %>%
          dplyr::filter(.sample1 < .sample2) %>%
          dplyr::filter(!is.na(.distance))
        
      }))
  
  tbl[['.bdiv']] %<>% factor(levels = bdiv)
  
  
  #________________________________________________________
  # Add metadata columns
  #________________________________________________________
  for (col in unique(c(md, within, between))) {
    
    map <- pull(biom, col)
    v1  <- map[tbl[['.sample1']]]
    v2  <- map[tbl[['.sample2']]]
    
    
    # Compute abs(v1 - v2)
    #________________________________________________________
    if (is.numeric(map) && col %in% delta) {
      tbl[[col]] <- abs(v1 - v2)
    }
    
    
    # Transform to "v1 vs v2"
    #________________________________________________________
    else {
      
      
      # Change "Male vs Male" --> "Male", and sort by factor
      # level, e.g. "Male vs Female" --> "Female vs Male".
      #________________________________________________________
      tbl[[col]] <- ifelse(
        test = (v1 == v2), 
        yes  = as.character(v1), 
        no   = ifelse(
          test = as.numeric(v1) < as.numeric(v2), 
          yes  = paste(v1, "vs", v2), 
          no   = paste(v2, "vs", v1) ))
      
      
      # Keep factors as factors.
      #________________________________________________________
      if (is.factor(map)) {
        
        lvls <- levels(map)
        
        if (length(lvls) > 1)
          lvls %<>% {c(., apply(combn(., 2), 2L, paste, collapse=" vs "))}
        
        tbl[[col]] %<>% {factor(., levels = intersect(lvls, .))}
      }
      
    }
    
    
  }
  
  
  
  #________________________________________________________
  # Descriptive label for y-axis.
  #________________________________________________________
  if (length(bdiv) == 1 && length(weighted) == 1) {
    
    resp_label <- paste(
      ifelse(weighted, "Weighted", "Unweighted"), bdiv, "Distance" )
    
  } else {
    
    resp_label <- "\u03B2 Dissimilarity" %>% 
      aa(display = '"\\u03B2 Dissimilarity"')
  }
  
  
  
  tbl %<>% as_rbiom_tbl()
  attr(tbl, 'cmd')      <- cmd
  attr(tbl, 'response') <- ".distance"
  attr(tbl, 'resp_label') <- resp_label
  
  set_cache_value(cache_file, tbl)
  
  return (tbl)
}





#' @rdname bdiv_table
#' @export
bdiv_matrix <- function (
    biom, bdiv = "Bray-Curtis", weighted = TRUE, tree = NULL, 
    within = NULL, between = NULL, trans = "none" ) {
  
  #________________________________________________________
  # Take care not to cache filepath to tree.
  #________________________________________________________
  biom <- as_rbiom(biom)
  validate_tree(null_ok = TRUE)
  
  params <- eval_envir(environment())
  cmd    <- sprintf("bdiv_matrix(%s)", as.args(params, fun = bdiv_matrix))
  
  
  #________________________________________________________
  # See if this result is already in the cache.
  #________________________________________________________
  cache_file <- get_cache_file('bdiv_matrix', params)
  if (isTRUE(attr(cache_file, 'exists', exact = TRUE)))
    return (readRDS(cache_file))
  remove("params")
  
  
  #________________________________________________________
  # Ensure `within` and `between` are non-overlapping.
  #________________________________________________________
  validate_biom_field('within',  null_ok = TRUE, max = Inf, col_type = "cat")
  validate_biom_field('between', null_ok = TRUE, max = Inf, col_type = "cat")
  
  if (length(x <- intersect(within, between)) > 0)
    cli_abort("Metadata field name{?s} {.val {x}} cannot be set as both a within (==) and between (!=) grouping.")
  
  
  
  #________________________________________________________
  # Check for valid arguments.
  #________________________________________________________
  validate_bdiv()
  validate_bool("weighted")
  validate_var_choices('trans', c("none", "rank", "log", "log1p", "sqrt"))
  
  if (!is.null(tree)) {
    biom <- biom$clone()
    biom$tree <- tree
  }
  
  
  
  #________________________________________________________
  # Order the sparse matrix's values by sample, then by taxa
  #________________________________________________________
  if (bdiv == "UniFrac") {
    
    if (is.null(biom$tree))
      cli_abort(c(x = "Phylogenetic tree required for UniFrac."))
    
    counts <- biom$counts
    counts <- counts[as.character(biom$tree$tip.label),]
    
  } else {
    counts <- biom$counts
  }
  
  ord      <- order(counts$j, counts$i)
  counts$i <- counts$i[ord]
  counts$j <- counts$j[ord]
  counts$v <- counts$v[ord]
  
  
  
  
  #________________________________________________________
  # Only calculate distances between relevant samples.
  #________________________________________________________
  sids  <- colnames(counts)
  pairs <- local({
    
    # Limit to sample1 < sample2.
    x <- combn(factor(sids, levels = sids), 2)
    
    # Limit to only within or between comparisons.
    for (col in c(within, between)) {
      map <- pull(biom, col)
      if (col %in% within)  x <- x[,map[x[1,]] == map[x[2,]]]
      if (col %in% between) x <- x[,map[x[1,]] != map[x[2,]]]
    }
    
    x <- apply(x, 1L, as.integer)
    
    return (x)
  })
  
  
  
  #________________________________________________________
  # Run C++ implemented dissimilarity algorithms.
  #________________________________________________________
  if (bdiv == "UniFrac") {
    
    tree      <- biom$tree
    distances <- par_unifrac(counts, pairs - 1L, tree, ifelse(weighted, 1L, 0L))
    
  } else {
    counts    <- t(as.matrix(counts))
    distances <- par_beta_div(counts, pairs - 1L, tolower(bdiv), ifelse(weighted, 1L, 0L))
  }
  
  
  
  #________________________________________________________
  # Optionally transform the computed distance values.
  #________________________________________________________
  if (trans != "none") # "rank", "log", "log1p", "sqrt"
    distances <- do.call(`::`, list('base', trans))(distances)
  
  
  
  #________________________________________________________
  # Assemble a complete all-vs-all matrix.
  #________________________________________________________
  n   <- length(sids)
  mtx <- matrix(data = rep(as.numeric(NA), n * n), nrow = n)
  mtx[pairs]          <- distances
  mtx[pairs[,c(2,1)]] <- distances
  mtx[cbind(1:n,1:n)] <- rep(0, n)
  dimnames(mtx)       <- list(sids, sids)
  
  
  
  #________________________________________________________
  # Return the matrix
  #________________________________________________________
  attr(mtx, 'cmd') <- cmd
  set_cache_value(cache_file, mtx)
  
  return (mtx)
}




#' @rdname bdiv_table
#' @export
bdiv_distmat <- function (
    biom, bdiv = "Bray-Curtis", weighted = TRUE, tree = NULL, 
    within = NULL, between = NULL, trans = "none" ) {
  
  as.dist(bdiv_matrix(
    biom     = biom, 
    bdiv     = bdiv, 
    weighted = weighted, 
    tree     = tree, 
    within   = within, 
    between  = between, 
    trans    = trans ))
}
cmmr/rbiom documentation built on April 28, 2024, 6:38 a.m.