R/bin_test.r

Defines functions BIN_test

#' Split data into bins and carry out a two-sample goodness-of-fit test
#' Calculate a p-value for positional differences of rare missense-variant residue positions between cases and controls.
#'
#' @author Adam Waring - adam.waring@@msdtc.ox.ac.uk
#' @return Returns an object of class htest - use $p.value for p-value
#'
#' @param case_residues vector of case-variant amino acid residue positions
#' @param control_residues vector of control-variant amino acid residue positions
#' @param method method to bin data either mann-wald or nbins
#' @param nbins number of bins to use if method == "nbins"
#' @param nresidues length of the protein under investigation in amino acid residues
#' @param plot_resids should chi-squared residuals be plotted? Defaults to False
#'
#' @keywords RVAT, distribution, cluster, genetics, case-control, gene
#'
#' @details The function takes a vector of case and control missense-variant residue positions (aggregated over a protein-coding-region) as input and returns a p-value representing the significance of variant clustering within the gene.
#' The linear sequence of the protein is split into 'bins' and the the counts for variant within each bin for each cohort are used to construct a kx2 contigency table where k is the number of bins and 2 is for the two cohorts: cases and controls.
#' The binning method is either:
#' - "mann_wald" where the number of bins k is determined by the total number of observed variants n by the equation k ~ n^(2/5)
#' - "nbins" where the user selects a specific number of bins (reasonable values here would be ~10-20 bins)
#' When "nresidues" is kept at its default NULL value - the protein length is assigned to the largest variant residue position in the observed data
#' Setting "plot_resids" to true allows the residuals for each cell in the kx2 contigency table to be plotted - this allows the user to determine which cells (protein regions) contribute towards the significance of the test.
#'
#' @export
#' @examples
#' # The essential inputs are case_residues and control_residues
#'
#' # Example 1: Simulated NULL data
#' # Bin by the Mann-Wald heuristic; n ^ (2/5) where n = length(case_residues) + length(control_residues)
#' # simulate case-control residue positions from the same distribution
#'
#' nresidues = 1000 # length of the protein
#' probs = rexp(nresidues)^2 # probability of a missense variant at each residue
#'
#' case_residues = sample(1:nresidues, 100, rep=T, probs)
#' control_residues = sample(1:nresidues, 100, rep=T, probs)
#'
#' BIN_test(case_residues, control_residues)
#'
#' # Example 2: Simulated DISEASE data
#' # simulate case-control residue positions from different distributions
#'
#' nresidues = 1000 # length of the protein
#' probs = rexp(nresidues)^2
#' case_probs = probs * rep(c(1, 3, 1), c(200, 200, 600))
#' control_probs = probs * rep(c(2, 1, 2), c(200, 200, 600))
#'
#' case_residues = sample(1:nresidues, 100, rep=T, case_probs)
#' control_residues = sample(1:nresidues, 100, rep=T, control_probs)
#'
#' plot_distributions(case_residues, control_residues)
#'
#' BIN_test(case_residues, control_residues, plot_resids = T)




BIN_test = function(case_residues, control_residues, method="mann-wald", nbins=12, nresidues=NULL, plot_resids=F){

  # check input and restructure
  if(!method %in% c("mann-wald", "nbins")){
    stop(paste0("'method' arguement must equal'mann-wald' or 'nbins' but is ", method))
  }
  inspect_residues = function(residues, name){

    if(!is.vector(residues) & !is.numeric(residues)){
      stop(paste0(name, "_residues must be a numeric vector but is a ", class(residues)))
    }

    if(any(is.na(residues))){
      warning(paste0("NA values present in ", name, "_residues vector. Removing ", sum(is.na(residues)), " NAs from analysis."))
      residues = residues[!is.na(residues)]
    }

    counts = table(residues)

    most_freq_ac = names(which.max(table(counts)))
    if(most_freq_ac != "1") warning("Expected most variants to be singletons but the most frequent allele count is ", most_freq_ac)

    outlier_boundary = mean(counts) + 3*sd(counts) + 2
    outliers = counts[counts > outlier_boundary]

    if(length(outliers) > 0) warning("There are some outliers in the frequency some ", name, "_residues are observed. (residues = ", paste(names(outliers), collapse=", "), "; ac = ", paste(outliers, collapse=", "), "). Be careful as these variants may be founders, from related samples or common ancestry-specific variants (that have evaded frequency filtering). Such variants may cause artefactual clusters.")

    return(residues)

  }
  case_residues = inspect_residues(case_residues, "cases")
  control_residues = inspect_residues(control_residues, "controls")

  residues = c(case_residues, control_residues)
  aff = c(rep(1, length(case_residues)), rep(0, length(control_residues)))
  n = length(aff)
  if(is.null(nresidues)) nresidues = max(residues)


  # calculate the number of bins to split the region into
  if(method == "nbins"){
    if(nbins > nresidues){
      warning("nbins > nresidues: nbins = nresidues")
      nbins = nresidues - 1
    }
    bins = nbins + 1
  } else bins = round(n ^ (2/5) + 1)


  # create a contingency table of case and control bin counts
  breaks = seq(1, nresidues, length.out=bins)
  case_tab = table(cut(residues[aff == 1], breaks))
  control_tab = table(cut(residues[aff == 0], breaks))

  contig = rbind(case_tab, control_tab)


  # remove bins with zero counts in both cases and controls
  index = which(apply(contig, 2, function(x) sum(x) == 0))
  if(length(index) > 0) contig = contig[, -index]


  # calculate chi-squared two-sample test on contingency table
  chisq = chisq.test(contig)


  # plot the resulting residuals per bin to identify bins that contributed the most to test statistic
  if(plot_resids == T) print(plot_residuals(chisq))

  return(chisq)

}
adamwaring/POSBURDEN documentation built on Feb. 21, 2020, 11:21 a.m.