#' 2011 GCTA inbreeding estimator III
#'
#' This function calculates the biased GCTA inbreeding estimator III described in Yang et al. (2011).
#' Although these estimates (in their MOR version) were the basis of the GRM diagonal according to that paper, the GCTA software history shows that this exact estimator was abandoned in version 0.93.0 (8 Jul 2011) in favor of [kinship_std()] (also in its MOR version), which remains in use as of this writing (2022).
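#'
#' As implemented here, given genotypes \eqn{x_{ij}} (counted-allele dosages 0, 1, or 2 at locus \eqn{i} in individual \eqn{j}) and estimated allele frequencies \eqn{\hat{p}_i}, the MOR (mean-of-ratios) version computes for each individual
#' \deqn{ \hat{f}_j = \frac{1}{m_j} \sum_{i} \frac{ ( x_{ij} - 1 ) ( x_{ij} - 2 \hat{p}_i ) }{ 2 \hat{p}_i ( 1 - \hat{p}_i ) } - 1, }
#' where the average is over the \eqn{m_j} loci that are non-missing in individual \eqn{j} and not fixed in the sample; the ROM (ratio-of-means) version instead sums the numerators and the denominators separately across loci before taking their ratio and subtracting 1.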
#'
#' @inheritParams kinship_std
#'
#' @return A length-`n` vector of inbreeding coefficient estimates, with names set to the individual IDs of `X` when available.
#'
#' @examples
#' # dimensions of simulated data
#' n_ind <- 100
#' m_loci <- 1000
#' n_data <- n_ind * m_loci
#'
#' # missingness rate
#' miss <- 0.1
#'
#' # simulate ancestral allele frequencies
#' # uniform (0,1)
#' # it's ok if some loci turn out (nearly) fixed in the sample; they are handled internally
#' p_anc <- runif(m_loci)
#'
#' # simulate some binomial data
#' X <- rbinom(n_data, 2, p_anc)
#'
#' # sprinkle random missingness
#' X[ sample( n_data, n_data * miss ) ] <- NA
#'
#' # turn into a matrix
#' X <- matrix(X, nrow = m_loci, ncol = n_ind)
#'
#' # estimate inbreeding
#' # ... ROM version (see Ochoa and Storey (2021)).
#' inbr_gcta_rom <- inbr_gcta(X)
#' # ... MOR version (from Yang et al. (2011)).
#' inbr_gcta_mor <- inbr_gcta(X, mean_of_ratios = TRUE)
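#'
#' # optional: visually compare the two versions on this simulated data
#' # (illustration only, not part of the estimator)
#' plot( inbr_gcta_rom, inbr_gcta_mor )
#' abline( 0, 1 )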
#'
#' @seealso
#' GCTA 2011 GRM estimator ROM limit [kinship_gcta_limit()].
#' The limit of `inbr_gcta` with `mean_of_ratios = FALSE` is given by `popkin::inbr(kinship_gcta_limit(true_kinship))`.
#'
#' Standard kinship estimator [kinship_std()] and the limit of the ROM version [kinship_std_limit()].
#'
#' GCTA software, including history/update log.
#' https://yanglab.westlake.edu.cn/software/gcta/#Download
#'
#' @export
inbr_gcta <- function(
X,
n = NA,
mean_of_ratios = FALSE,
loci_on_cols = FALSE,
mem_factor = 0.7,
mem_lim = NA,
m_chunk_max = 1000 # gave good performance in tests
) {
if ( missing( X ) )
stop( 'Genotype matrix `X` is required!' )
# determine some behaviors depending on data type
# first validate class and set key booleans
isFn <- FALSE
if ( is.function(X) ) {
isFn <- TRUE
if (is.na(n))
stop('Number of individuals `n` is required when X is a function!')
} else if ('BEDMatrix' %in% class(X)) {
# same as general matrix but transposed
# this is always imposed for this particular format!
loci_on_cols <- TRUE
} else if (!is.matrix(X)) {
stop('X has unsupported class: ', toString( class(X) ) )
}
# extract dimensions from data (not possible for function version)
# also get individual names (IDs)
names_X <- NULL # default
if (isFn) {
# have to define as NA to pass to get_mem_lim_m below
m <- NA
} else {
if (loci_on_cols) {
if (!is.na(n) && n != nrow(X))
warning('User set number of samples that does not match X dimensions (will go with the latter): ', n, ' != ', nrow(X))
n <- nrow(X)
m <- ncol(X)
names_X <- rownames(X)
} else {
if (!is.na(n) && n != ncol(X))
warning('User set number of samples that does not match X dimensions (will go with the latter): ', n, ' != ', ncol(X))
n <- ncol(X)
m <- nrow(X)
names_X <- colnames(X)
}
}
# initialize vectors of interest
# calculate as running sum over chunks
if ( mean_of_ratios ) {
f_hat <- vector( 'numeric', n )
m_c <- vector( 'numeric', n ) # complete observations
} else {
f_top <- vector( 'numeric', n )
f_bot <- vector( 'numeric', n )
}
# estimating total memory usage in bytes
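# (solve_m_mem_lim is an internal popkin helper; roughly, it picks a chunk size so that the stated counts of m-by-n matrices, n-by-n matrices, and length-m vectors fit within the memory budget)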
data <- popkin:::solve_m_mem_lim(
n = n,
m = m,
mat_m_n = 2.5,
mat_n_n = 4,
vec_m = 3,
mem = mem_lim,
mem_factor = mem_factor
)
m_chunk <- data$m_chunk
# cap chunk size at a value that performs well in practice (very good speed, minimal memory)
if ( m_chunk > m_chunk_max )
m_chunk <- m_chunk_max
# navigate chunks
i_chunk <- 1 # start of first chunk (needed for matrix inputs only; as opposed to function inputs)
while(TRUE) { # start an infinite loop, break inside as needed
if (isFn) {
# get next "m_chunk" SNPs
Xi <- X( m_chunk )
# stop when SNPs run out (only happens for functions X, not matrices)
if (is.null(Xi))
break
} else {
# here m is known...
# this means all SNPs have been covered!
if (i_chunk > m)
break
# range of SNPs to extract in this chunk
indexes_loci_chunk <- i_chunk : min(i_chunk + m_chunk - 1, m)
if (loci_on_cols) {
Xi <- t(X[, indexes_loci_chunk, drop = FALSE]) # transpose for our usual setup
} else {
Xi <- X[indexes_loci_chunk, , drop = FALSE]
}
# update starting point for next chunk! (overshoots at the end, that's ok)
i_chunk <- i_chunk + m_chunk
}
# estimate allele frequencies (of the counted allele; not necessarily the minor allele)
p_est <- rowMeans( Xi, na.rm = TRUE ) / 2
# identify fixed loci, exclude
# (this is true for loci to keep)
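# (fixed loci have p(1-p) = 0, so they carry no information for this estimator and would yield 0/0 terms in the MOR version)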
indexes_not_fixed <- 0 < p_est & p_est < 1
if ( any( !indexes_not_fixed ) ) {
p_est <- p_est[ indexes_not_fixed ]
Xi <- Xi[ indexes_not_fixed, , drop = FALSE ]
}
# this is an efficient way to compute terms of interest, formulated as self-kinship
# note this is entrywise product!
# - to save memory, overwrite Xi (original Xi no longer needed)
# - NA values in original Xi are NAs in this new Xi
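# - note ( x - 1 ) * ( x - 2 * p ) = x^2 - ( 1 + 2 * p ) * x + 2 * p, which exceeds the
#   Yang et al. (2011) numerator, x^2 - ( 1 + 2 * p ) * x + 2 * p^2, by exactly 2 * p * ( 1 - p );
#   after normalization that excess becomes the +1 removed at the very end of this function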
Xi <- ( Xi - 1 ) * ( Xi - 2 * p_est )
# accumulate the running sums for the estimates
# note that estimates are actually 2x self-kinship at this stage
if ( mean_of_ratios ) {
# normalize each locus, then sum excluding missing loci
f_hat <- f_hat + colSums( Xi / ( p_est * ( 1 - p_est ) ), na.rm = TRUE )
# add to counts of non-missing observations
m_c <- m_c + colSums( !is.na( Xi ) )
} else {
# sum excluding missing loci
f_top <- f_top + colSums( Xi, na.rm = TRUE )
# sum these too, but include for each individual only if its observation was not missing
f_bot <- f_bot + colSums( ( !is.na( Xi ) ) * ( p_est * ( 1 - p_est ) ) )
}
}
# when done, complete normalizations
if ( mean_of_ratios ) {
# get average from sum by dividing by number of complete observations
f_hat <- f_hat / m_c / 2
} else {
# get final ratio by dividing both parts
f_hat <- f_top / f_bot / 2
}
# transfer names from X to inbr if present
if (!is.null(names_X))
names( f_hat ) <- names_X
# and lastly, turn 2x self-kinship into inbreeding by subtracting 1
return ( f_hat - 1 )
}
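
# Illustration only: a minimal, non-chunked sketch of the MOR calculation that
# `inbr_gcta( X, mean_of_ratios = TRUE )` performs when `X` is an ordinary
# genotype matrix with loci along the rows.  The helper name
# `inbr_gcta_naive_mor` is made up for this sketch and is not part of the
# package interface; the chunked version above should be used in practice.
inbr_gcta_naive_mor <- function( X ) {
    # per-locus allele frequency estimates
    p_est <- rowMeans( X, na.rm = TRUE ) / 2
    # exclude fixed loci, as the main function does
    keep <- p_est > 0 & p_est < 1
    X <- X[ keep, , drop = FALSE ]
    p_est <- p_est[ keep ]
    # per-locus, per-individual terms on the 2x-self-kinship scale
    ratios <- ( X - 1 ) * ( X - 2 * p_est ) / ( 2 * p_est * ( 1 - p_est ) )
    # average over non-missing loci per individual, then shift down to inbreeding
    colMeans( ratios, na.rm = TRUE ) - 1
}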