R/bin_depths.R
In whoa: Evaluation of Genotyping Error in Genotype-by-Sequencing Data

Documented in bin_depths

#' bin read depths of SNPs into categories having at least S observations
#'
#' @param D a matrix of read depths.  Rows are individuals, columns are SNPs.  Cells where data are missing
#' in the genotype matrix must be denoted as NA
#' @param S the min number of observations to have in each bin
#' @return This returns a list with two components.  \code{dp_bins} is a matrix of the same
#' shape as D with the bin categories (as 1, 2, ...) and -1 for this cells
#' corresponding to missing genotypes.  \code{num_cats} is the number of depth bins.
#' \code{tidy_bins} is a long format description of the bins.
#' \code{bin_stats} is a tibble giving summary information about the read depth bins which
#' is useful for plotting things, etc.
#' @export
#' @keywords internal
#' @examples
#'
#' # get a matrix of read depths and make it an integer matrix
#' depths <- vcfR::extract.gt(lobster_buz_2000, element = "DP")
#' storage.mode(depths) <- "integer"
#'
#' # get a character matrix of genotypes, so we can figure out which
#' # are missing and mask those from depths
#' genos <- vcfR::extract.gt(lobster_buz_2000, element = "GT")
#'
#' # make missing in depths if missing in genos
#' depths[is.na(genos)] <- NA
#'
#' # bin the read depths into bins with at least 1000 observations in each bin
#' bins <- bin_depths(depths, 1000)

bin_depths <- function(D, S) {

  # first, count things up and get them into a reasonable format
  dcnts <- as_tibble(as.data.frame(table(D))) %>%
    mutate(D = as.integer(as.character(D))) %>%
    setNames(c("D", "Freq")) %>%
    rename(dp = D,
           n = Freq) %>%
    arrange(dp) # just to make sure it is in increasing order

  # now we lump them up.  We just use a for loop for this
  idx <- 1
  sum <- 0
  n <- dcnts$n
  res <- rep(NA, length(n))
  for(i in seq_along(n)) {
    res[i] <- idx
    sum <- sum + n[i]
    if(sum > S) {
      idx <- idx + 1
      sum <- 0
    }
  }

  # at the end of that, we take the very last bin and just merge is with the
  # preceding one, so that it doesn't have just a small number in it.
  # but we only do this if there is more than one read depth bin!
  if(max(res) > 1) {
    res[res == max(res)] <- max(res) - 1
  }

  # and we add it on there with a mutate which will bark an error is something has gone awry
  tidy_bins <- dcnts %>%
    mutate(bin = as.integer(res))

  # now we have to cut the original matrix up into these categories
  left_endpoints <- tidy_bins %>%
    group_by(bin) %>%
    summarise(ends = max(dp) + 0.2)

  cut_vec <- c(-0.01, left_endpoints$ends)


  dp_bins <- D
  dp_bins[] <- cut(D, breaks = cut_vec)

  dp_bins[is.na(dp_bins)] <- -1L

  # finally summarize the bin stats
  bin_stats <- tidy_bins %>%
    group_by(bin) %>%
    summarise(total_n = sum(n),
              mean_dp = sum(dp * n) / sum(n))

  list(dp_bins = dp_bins,
       num_cats = max(tidy_bins$bin),
       tidy_bins = tidy_bins,
       bin_stats = bin_stats
  )
}