#' Split data into bins and carry out a two-sample goodness-of-fit test
#' Calculate a p-value for positional differences of rare missense-variant residue positions between cases and controls.
#'
#' @author Adam Waring - adam.waring@@msdtc.ox.ac.uk
#' @return Returns an object of class htest - use $p.value for p-value
#'
#' @param case_residues vector of case-variant amino acid residue positions
#' @param control_residues vector of control-variant amino acid residue positions
#' @param method method to bin data either mann-wald or nbins
#' @param nbins number of bins to use if method == "nbins"
#' @param nresidues length of the protein under investigation in amino acid residues
#' @param plot_resids should chi-squared residuals be plotted? Defaults to False
#'
#' @keywords RVAT, distribution, cluster, genetics, case-control, gene
#'
#' @details The function takes a vector of case and control missense-variant residue positions (aggregated over a protein-coding-region) as input and returns a p-value representing the significance of variant clustering within the gene.
#' The linear sequence of the protein is split into 'bins' and the the counts for variant within each bin for each cohort are used to construct a kx2 contigency table where k is the number of bins and 2 is for the two cohorts: cases and controls.
#' The binning method is either:
#' - "mann_wald" where the number of bins k is determined by the total number of observed variants n by the equation k ~ n^(2/5)
#' - "nbins" where the user selects a specific number of bins (reasonable values here would be ~10-20 bins)
#' When "nresidues" is kept at its default NULL value - the protein length is assigned to the largest variant residue position in the observed data
#' Setting "plot_resids" to true allows the residuals for each cell in the kx2 contigency table to be plotted - this allows the user to determine which cells (protein regions) contribute towards the significance of the test.
#'
#' @export
#' @examples
#' # The essential inputs are case_residues and control_residues
#'
#' # Example 1: Simulated NULL data
#' # Bin by the Mann-Wald heuristic; n ^ (2/5) where n = length(case_residues) + length(control_residues)
#' # simulate case-control residue positions from the same distribution
#'
#' nresidues = 1000 # length of the protein
#' probs = rexp(nresidues)^2 # probability of a missense variant at each residue
#'
#' case_residues = sample(1:nresidues, 100, rep=T, probs)
#' control_residues = sample(1:nresidues, 100, rep=T, probs)
#'
#' BIN_test(case_residues, control_residues)
#'
#' # Example 2: Simulated DISEASE data
#' # simulate case-control residue positions from different distributions
#'
#' nresidues = 1000 # length of the protein
#' probs = rexp(nresidues)^2
#' case_probs = probs * rep(c(1, 3, 1), c(200, 200, 600))
#' control_probs = probs * rep(c(2, 1, 2), c(200, 200, 600))
#'
#' case_residues = sample(1:nresidues, 100, rep=T, case_probs)
#' control_residues = sample(1:nresidues, 100, rep=T, control_probs)
#'
#' plot_distributions(case_residues, control_residues)
#'
#' BIN_test(case_residues, control_residues, plot_resids = T)
BIN_test = function(case_residues, control_residues, method="mann-wald", nbins=12, nresidues=NULL, plot_resids=F){
# check input and restructure
if(!method %in% c("mann-wald", "nbins")){
stop(paste0("'method' arguement must equal'mann-wald' or 'nbins' but is ", method))
}
inspect_residues = function(residues, name){
if(!is.vector(residues) & !is.numeric(residues)){
stop(paste0(name, "_residues must be a numeric vector but is a ", class(residues)))
}
if(any(is.na(residues))){
warning(paste0("NA values present in ", name, "_residues vector. Removing ", sum(is.na(residues)), " NAs from analysis."))
residues = residues[!is.na(residues)]
}
counts = table(residues)
most_freq_ac = names(which.max(table(counts)))
if(most_freq_ac != "1") warning("Expected most variants to be singletons but the most frequent allele count is ", most_freq_ac)
outlier_boundary = mean(counts) + 3*sd(counts) + 2
outliers = counts[counts > outlier_boundary]
if(length(outliers) > 0) warning("There are some outliers in the frequency some ", name, "_residues are observed. (residues = ", paste(names(outliers), collapse=", "), "; ac = ", paste(outliers, collapse=", "), "). Be careful as these variants may be founders, from related samples or common ancestry-specific variants (that have evaded frequency filtering). Such variants may cause artefactual clusters.")
return(residues)
}
case_residues = inspect_residues(case_residues, "cases")
control_residues = inspect_residues(control_residues, "controls")
residues = c(case_residues, control_residues)
aff = c(rep(1, length(case_residues)), rep(0, length(control_residues)))
n = length(aff)
if(is.null(nresidues)) nresidues = max(residues)
# calculate the number of bins to split the region into
if(method == "nbins"){
if(nbins > nresidues){
warning("nbins > nresidues: nbins = nresidues")
nbins = nresidues - 1
}
bins = nbins + 1
} else bins = round(n ^ (2/5) + 1)
# create a contingency table of case and control bin counts
breaks = seq(1, nresidues, length.out=bins)
case_tab = table(cut(residues[aff == 1], breaks))
control_tab = table(cut(residues[aff == 0], breaks))
contig = rbind(case_tab, control_tab)
# remove bins with zero counts in both cases and controls
index = which(apply(contig, 2, function(x) sum(x) == 0))
if(length(index) > 0) contig = contig[, -index]
# calculate chi-squared two-sample test on contingency table
chisq = chisq.test(contig)
# plot the resulting residuals per bin to identify bins that contributed the most to test statistic
if(plot_resids == T) print(plot_residuals(chisq))
return(chisq)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.