R/reconstruct_hap.R

Defines functions reconstruct_hap

Documented in reconstruct_hap

#' A function to reconstruct the donor haplotypes using data from gametes
#' 
#' This function takes gamete data and computes and clusters a distance matrix with a binary method. It 
#' anticipates sparse data and replaces NAs with 0.5. It clusters the tree into two groups, i.e., haplotypes. 
#' It then categorizes the gamete cells falling into each of the two groups. It reconstructs the original 
#' haplotypes by majority vote after inverting the opposite haplotype. 
#' 
#' @param input_dt A dataframe of sparse gamete data coded with reference and alt alleles (i.e., 0, 1, or NA)
#' @param input_positions vector of SNP positions
#' @param window_indices A window of the gamete data (using segments generated from `split_with_overlap`) 
#' 
#' @return inferred_output A tibble with the inferred haplotype sorted by position and window
#' 
#' @importFrom stats dist hclust cutree 
#' @importFrom tibble tibble
#' 
reconstruct_hap <- function(input_dt, input_positions, window_indices) {
  window_start <- min(window_indices)
  window_end <- max(window_indices)
  positions_for_window <- input_positions[window_start:window_end]
  # compute a distance matrix
  d <- dist(t(as.matrix(input_dt)[window_start:window_end,]), method = "binary")
  # plug in 0.5 for any NA entries of the distance matrix
  d[is.na(d)] <- 0.5
  # cluster the distance matrix
  tree <- hclust(d, method = "ward.D2")
  # cut the tree generated by clustering into two groups (haplotypes)
  haplotypes <- cutree(tree, k=2)
  # get the names of the gamete cells falling into the two groups
  h1_gamete <- names(haplotypes[haplotypes == 1])
  h2_gamete <- names(haplotypes[haplotypes == 2])
  # reconstruct the original haplotypes by majority vote after inverting the opposite haplotype
  h1_inferred <- unname(apply(cbind(input_dt[window_start:window_end, h1_gamete],
                                    invert_bits(input_dt[window_start:window_end, h2_gamete])),
                              1, function(x) get_mode(x)))
  inferred_output <- tibble(index = window_indices, pos = positions_for_window, h1 = h1_inferred)
  return(inferred_output)
}
mccoy-lab/rhapsodi documentation built on July 27, 2022, 3:56 a.m.