R/bed_jaccard.r

Defines functions bed_jaccard

Documented in bed_jaccard

#' Calculate the Jaccard statistic for two sets of intervals.
#'
#' Quantifies the extent of overlap between to sets of intervals in terms of
#' base-pairs. Groups that are shared between input are used to calculate the statistic
#' for subsets of data.
#'
#' @details The Jaccard statistic takes values of `[0,1]` and is measured as:
#'
#' \deqn{ J(x,y) = \frac{\mid x \bigcap y \mid}
#'                      {\mid x \bigcup y \mid} =
#'                 \frac{\mid x \bigcap y \mid}
#'                      {\mid x \mid + \mid y \mid -
#'                       \mid x \bigcap y \mid} }
#'
#' @param x [ivl_df]
#' @param y [ivl_df]
#'
#' @template stats
#'
#' @family interval statistics
#'
#' @return
#' tibble with the following columns:
#'
#'   - `len_i` length of the intersection in base-pairs
#'   - `len_u` length of the union in base-pairs
#'   - `jaccard` value of jaccard statistic
#'   - `n_int` number of intersecting intervals between `x` and `y`
#'
#' If inputs are grouped, the return value will contain one set of values per group.
#'
#' @seealso
#'   \url{https://bedtools.readthedocs.io/en/latest/content/tools/jaccard.html}
#'
#' @examples
#' genome <- read_genome(valr_example("hg19.chrom.sizes.gz"))
#'
#' x <- bed_random(genome, seed = 1010486)
#' y <- bed_random(genome, seed = 9203911)
#'
#' bed_jaccard(x, y)
#'
#' # calculate jaccard per chromosome
#' bed_jaccard(
#'   dplyr::group_by(x, chrom),
#'   dplyr::group_by(y, chrom)
#' )
#'
#' @export
bed_jaccard <- function(x, y) {
  check_required(x)
  check_required(y)

  x <- check_interval(x)
  y <- check_interval(y)

  groups_shared <- shared_groups(x, y)

  x <- bed_merge(x)
  y <- bed_merge(y)

  res_intersect <- bed_intersect(x, y)

  if (!is.null(groups_shared)) {
    x <- group_by(x, !!!syms(groups_shared))
    y <- group_by(y, !!!syms(groups_shared))

    res_intersect <- group_by(res_intersect, !!!syms(groups_shared))
  }

  res_intersect <- summarize(
    res_intersect,
    sum_overlap = sum(as.numeric(.overlap)),
    n_int = as.numeric(n())
  )

  res_x <- mutate(x, .size = end - start)
  res_x <- summarize(res_x, sum_x = sum(as.numeric(.size)))

  res_y <- mutate(y, .size = end - start)
  res_y <- summarize(res_y, sum_y = sum(as.numeric(.size)))

  if (!is.null(groups_shared)) {
    res <- left_join(res_intersect, res_x, by = as.character(groups_shared))
    res <- left_join(res, res_y, by = as.character(groups_shared))

    res <- mutate(res, sum_xy = sum_x + sum_y)
    group_cols <- select(res, !!!syms(groups_shared))

    res <- transmute(
      res,
      len_i = sum_overlap,
      len_u = sum_xy,
      jaccard = sum_overlap / (sum_xy - sum_overlap),
      n = n_int
    )

    res <- bind_cols(group_cols, res)
  } else {
    n_i <- res_intersect$sum_overlap
    n_u <- res_x$sum_x + res_y$sum_y

    jaccard <- n_i / (n_u - n_i)

    res <- tibble(
      len_i = n_i,
      len_u = n_u,
      jaccard = jaccard,
      n = res_intersect$n_int
    )
  }

  res
}
jayhesselberth/valr documentation built on April 24, 2024, 7:15 a.m.