R/ligera_basic.R

Defines functions ligera_basic

# the original version without BEDMatrix or missingness support
# (for tests; do not export!)
# unlike main function, this one requires explicit kinship inverse to be provided (for extra sanity checks)
ligera_basic <- function(X, trait, kinship, kinship_inv, covar = NULL, loci_on_cols = FALSE) {
    # some internal constants (preserving old tests)
    hetz <- TRUE
    hetz_indiv_inbr <- TRUE
    
    # override this for BEDMatrix
    if ( 'BEDMatrix' %in% class( X ) ) {
        loci_on_cols <- TRUE
    } else if ( !is.matrix( X ) )
        stop('X has unsupported class: ', toString( class( X ) ) )
    
    # need these dimensions
    if (loci_on_cols) {
        m_loci <- ncol(X)
        n_ind <- nrow(X)
    } else {
        m_loci <- nrow(X)
        n_ind <- ncol(X)
    }

    ##############################
    ### EFFECT SIZE ESTIMATION ###
    ##############################

    # gather matrix of trait, intercept, and optional covariates
    Y <- cbind( trait, 1 )
    # add covariates, if present
    if ( !is.null( covar ) ) {
        # handle NAs now, so final Y has no missingness whatsoever
        covar <- covar_fix_na( covar )
        Y <- cbind( Y, covar )
    }
    # use kinship inverse if given
    Z <- kinship_inv %*% Y
    # new way to abstract the rest of these
    obj <- get_proj_denom_multi( Z, Y )
    proj <- obj$proj
    beta_var_fac <- obj$var

    # the coefficients are simply the genotypes projected!
    beta <- drop( X %*% proj )
    
    ###########################
    ### VARIANCE ESTIMATION ###
    ###########################

    if (hetz) {
        # instead of estimating p_anc_hat first, here we estimate p*q per individual (that's what counting heterozygotes does), then average
        if ( hetz_indiv_inbr ) {
            # correct for inbreeding bias on a per-individual basis!
            # formulate as using weights (but these don't sum to one)
            weights_inbr <- 1 / (1 - popkin::inbr(kinship)) / 2 / n_ind

            # the desired "average"
            p_q <- drop( ( X == 1 ) %*% weights_inbr )
            
        } else {
            # bias in this case is given by FST (we correct), not mean_kinship (as for p_anc_hat version below):
            p_q <- rowMeans( X == 1 ) / 2 / ( 1 - popkin::fst(kinship) )
        }

        # in all hetz cases we need to regularize, as zero estimates are possible (likely even on the genome-wide level) and ruin inference completely
        # this is the crudest "laplace prior"-like version that prevents zeroes and is more conservative at rarer loci (which is good)
        # un-average the previous p_q by multiplying it by the sample size n_ind, then apply the correction and renormalize again.
        p_q <- ( 1 + n_ind * p_q ) / ( 2 + n_ind )
        
    } else {
        
        # to correct for a variance bias
        # will assume uniform weights!
        mean_kinship <- popkin::mean_kinship(kinship)
        
        # compute all p_anc_hat, vectorizing
        p_anc_hat <- rowMeans( X, na.rm = TRUE ) / 2
        
        # construct final estimate of p*q
        # includes (1 - mean_kinship) bias correction for this particular estimation approach
        p_q <- p_anc_hat * ( 1 - p_anc_hat ) / (1 - mean_kinship)
    }

    # construct final variance estimate of beta
    beta_std_dev <- sqrt( 4 * p_q * beta_var_fac )
    
    ####################
    ### T-STATISTICS ###
    ####################
    
    # the test t-statistics
    t_stat <- beta / beta_std_dev
    
    # replace infinities with NAs
    t_stat[ is.infinite(t_stat) ] <- NA
    
    ################
    ### P-VALUES ###
    ################
    
    # Get naive p-values assuming a null t-distribution with the obvious degrees of freedom
    # so far I know this is wrong (the true tails are longer), may need to adjust the degrees of freedom (not yet known how exactly)
    # NOTE: two-sided test!
    pval <- 2 * stats::pt(-abs(t_stat), n_ind-1)

    # done, return quantities of interest (nice table!)
    return(
        tibble::tibble(
            pval = pval,
            beta = beta,
            beta_std_dev = beta_std_dev,
            p_q = p_q,
            t_stat = t_stat
        )
    )
}
OchoaLab/ligera documentation built on Jan. 5, 2023, 8:29 p.m.