#' Compute locus allele frequencies
#'
#' On a regular matrix, this is essentially a wrapper for [colMeans()] or [rowMeans()] depending on `loci_on_cols`.
#' On a BEDMatrix object, the locus allele frequencies are computed keeping memory usage low.
#'
#' @param X The genotype matrix (regular R matrix or BEDMatrix object).
#' Missing values are ignored in averages.
#' @param loci_on_cols If `TRUE`, `X` has loci on columns and individuals on rows; if false (the default), loci are on rows and individuals on columns.
#' If `X` is a BEDMatrix object, code assumes loci on columns (`loci_on_cols` is ignored).
#' @param fold If `TRUE`, allele frequencies are converted to minor allele frequencies.
#' Default is to return frequencies for the given allele counts in `X` (regardless of whether it is the minor or major allele).
#' @param m_chunk_max BEDMatrix-specific, sets the maximum number of loci to process at the time.
#' If memory usage is excessive, set to a lower value than default (expected only for extremely large numbers of individuals).
#' @param subset_ind Optionally subset individuals by providing their indexes (negative indexes to exclude) or a boolean vector (in other words, the usual ways to subset matrices).
#' Most useful for BEDMatrix inputs, to subset chunks and retain low memory usage.
#' @param want_counts If `TRUE` (default `FALSE`), raw allele counts are also returned.
#' Note `fold` option has no effect on these counts.
#'
#' @return If `want_counts = FALSE`, the vector of estimated ancestral allele frequencies, one per locus.
#' Names are set to the locus names, if present.
#' If `want_counts = TRUE`, a named list containing both the estimated ancestral allele frequencies `p_anc_est` and the allele `counts` matrix, with loci along the rows (also with names if present), and alleles along the columns.
#'
#' @examples
#' # Construct toy data
#' X <- matrix(
#' c(0, 1, 2,
#' 1, 0, 1,
#' 1, NA, 2),
#' nrow = 3,
#' byrow = TRUE
#' )
#'
#' # row means
#' allele_freqs(X)
#' c(1/2, 1/3, 3/4)
#'
#' # row means, in minor allele frequencies
#' allele_freqs(X, fold = TRUE)
#' c(1/2, 1/3, 1/4)
#'
#' # col means
#' allele_freqs(X, loci_on_cols = TRUE)
#' c(1/3, 1/4, 5/6)
#'
#' @export
allele_freqs <- function(
X,
loci_on_cols = FALSE,
fold = FALSE,
m_chunk_max = 1000, # optimal value not tested directly
subset_ind = NULL,
want_counts = FALSE
) {
# behavior depends on class
if ( 'BEDMatrix' %in% class( X ) ) {
# extract data dimensions
m_loci <- ncol( X )
n_ind <- nrow( X )
# allocate desired vector of allele frequencies
p_anc_est <- vector( 'numeric', m_loci )
# and these others but only if strictly required (otherwise it's memory wasteful)
if ( want_counts ) {
counts1 <- vector( 'numeric', m_loci )
counts2 <- vector( 'numeric', m_loci )
}
# navigate chunks
i_chunk <- 1 # start of first chunk (needed for matrix inputs only; as opposed to function inputs)
while ( TRUE ) { # start an infinite loop, break inside as needed
# this means all SNPs have been covered!
if ( i_chunk > m_loci )
break
# indexes to extract loci, and also so save to FstTs and FstBs vectors
indexes_loci_chunk <- i_chunk : min( i_chunk + m_chunk_max - 1, m_loci ) # range of SNPs to extract in this chunk
# Only loci_on_cols==TRUE cases is supported here, this is only for BEDMatrix
# DO NOT transpose for our usual setup (this is faster)
Xi <- X[, indexes_loci_chunk, drop = FALSE]
# apply subsetting to this chunk of matrix
if ( !is.null( subset_ind ) )
Xi <- Xi[ subset_ind, , drop = FALSE ]
# compute and store the values we want!
# because we didn't transpose, use colMeans here istead of rowMeans!
# get the more basic counts first
counts1_chunk <- colSums( Xi, na.rm = TRUE )
n_obs_chunk <- 2 * colSums( !is.na( Xi ) )
# save only what we need, nothing else
p_anc_est[ indexes_loci_chunk ] <- counts1_chunk / n_obs_chunk
if ( want_counts ) {
counts1[ indexes_loci_chunk ] <- counts1_chunk
counts2[ indexes_loci_chunk ] <- n_obs_chunk - counts1_chunk
}
# fold allele frequencies if requested
# NOTE: perform on each chunk, further preserving memory
if ( fold )
p_anc_est[ indexes_loci_chunk ] <- fold_allele_freqs( p_anc_est[ indexes_loci_chunk ] )
# update starting point for next chunk! (overshoots at the end, that's ok)
i_chunk <- i_chunk + m_chunk_max
}
# set names to be SNP names
# this is done by col/rowMeans when X is not BEDMatrix below, so let's match that here
names( p_anc_est ) <- colnames( X )
if ( want_counts )
names( counts1 ) <- colnames( X )
} else if ( is.matrix( X ) ) {
# apply subsetting to whole matrix in this case
if ( !is.null( subset_ind ) )
X <- if ( loci_on_cols ) X[ subset_ind, , drop = FALSE ] else X[ , subset_ind, drop = FALSE ]
# compute allele frequencies directly, all at once
if ( loci_on_cols ) {
counts1 <- colSums( X, na.rm = TRUE )
n_obs <- 2 * colSums( !is.na( X ) )
} else {
counts1 <- rowSums( X, na.rm = TRUE )
n_obs <- 2 * rowSums( !is.na( X ) )
}
p_anc_est <- counts1 / n_obs
if ( want_counts )
counts2 <- n_obs - counts1
# either way, fold allele frequencies if requested
if ( fold )
p_anc_est <- fold_allele_freqs( p_anc_est )
} else
stop( 'Only matrix and BEDMatrix supported! Instead, class was: ', toString( class( X ) ) )
# return the vector whichever way it was computed, and optionally the counts
if ( want_counts ) {
# `deparse.level = 0` ensures there's no labels on this matrix (otherwise the column names are "counts1" and "counts2")
counts <- cbind( counts1, counts2, deparse.level = 0 )
return( list( p_anc_est = p_anc_est, counts = counts ) )
} else
return( p_anc_est )
}
# internal function that folds allele frequencies that have already been calculated from genotypes or otherwise
fold_allele_freqs <- function( ps ) {
if ( missing( ps ) )
stop( 'Allele frequency vector `ps` is required!' )
# identify loci to flip
indexes <- ps > 0.5
# actually flip
ps[ indexes ] <- 1 - ps[ indexes ]
# return modified vector
return( ps )
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.