Nothing
#' Conditional analysis for genetic variance effects
#'
#'
#' @description
#' We estimate the variance effect of a primary variant conditioned on one or more secondary variants.
#' We apply a likelyhood ratio test with one degree of freedom
#'
#' H0: All secondary variants have a variance effect, but not the primary one.
#' H1: All variants have a variance effect, including the primary one
#'
#' Under both models we assume a full mean effect model. That is, the number of mean-value parameters is 3^(n+1), where n is the number of covariates
#' Thus the null model has 3^(n+1)+n degrees of freedom whereas the alternative model has 3^(n+1)+n+1
#' Due to the exponential growth of free parameters, the test might have low statistical power if many covariates are used
#'
#' IMPORTANT NOTE: We use the Gauss-Newton algorithm to estimate many parameters.
#' We do not check if the algorithm converges to the true minimum values, or if it converges at all.
#' Thus, we advise against blindly believing all results
#'
#' @param qt A numeric vector.
#' @param g An integer vector.
#' @param g_covar An integer matrix where each column corresponds to a genetic covariate
#' @param iter_num An integer. Represents the number of iterations performed in the Gauss-Newton algorithm
#'
#' @returns
#' A list with the values:
#' * alpha, the estimated variance effect, conditioned on the covariates
#' * pval, the p-value corresponding to alpha
#' @examples
#' n <- 10000
#' qt <- rnorm(n)
#' g <- rbinom(n, 2, 0.3)
#' qt <- qt * 1.2^g
#' g_covar <- as.data.frame(matrix(0, nrow = n, ncol = 3))
#' for(i in 1:ncol(g_covar)){
#' freq <- runif(1, min = 0, max = 1)
#' g_covar[, i] <- rbinom(n, 2, freq)
#' }
#' res <- alpha.cond(qt, g, g_covar)
#' @export
alpha.cond <- function(qt, g, g_covar, iter_num=50) {
k <- length(qt) #Number of measurements
n <- ncol(g_covar) + 1 #Number of variants (counting the main variant)
A <- cbind(g, g_covar)
flips <- rep(1, ncol(A)) #We flip to the minor allele of each variant for techincal reasons. We keep track of which ones were flipped and flip back in the end.
for(i in 1:ncol(A)){
A[, i] <- round(A[, i])
if(mean(A[, i]) > 1){
flips[i] <- -1
A[, i] <- 2 - A[, i]
}
}
freqs <- rep(0, n) #A vector with the frequencies of the variants
for(i in 1:ncol(A)){
freqs[i] <- sum(A[, i]) / (2 * k)
}
vecs <- list(c(0, 1, 2)) #possible genotypes for a single variant
vecs <- rep(vecs, n) #possible genotypes for all variants
vecs <- as.matrix(do.call(expand.grid, vecs)) #All possible genotype combinations
# The followign if-loop is here to save running time. The fastest code depends on how many covariates there are.
# If there are only a few covariates, it makes more sense to loop over all possible genotype groops
# If there are more covariates, it makes more sense to loop over all the subjects and check what genotype groop each one belongs to
# We tested the code with n=300.000 and with that datasize, the cutoff seems to be around 5.
covar_cutoff <- 5
if(n > covar_cutoff){
m <- array(0, rep(3, n)) #An array storing the mean value for each possible genotype-group
number_counter <- array(0, rep(3, n)) #An array storing the number of subjects in each genotype-group
rss <- array(0, rep(3, n)) #An array storing the residual sum of squares
A_temp <- as.matrix(A)
for(i in 1:k){
m[t(1 + A_temp[i, ])] <- m[t(1 + A_temp[i, ])]+qt[i]
number_counter[t(1 + A_temp[i, ])] <- number_counter[t(1 + A_temp[i,])] + 1
}
m <- m / number_counter
for(i in 1:k){
rss[t(1 + A_temp[i, ])] <- rss[t(1 + A_temp[i, ])] + (qt[i] - m[t(1 + A_temp[i, ])])^2
}
}else{
m <- array(0, rep(3, n)) #An array storing the mean value for each possible genotype-group
rss <- array(0, rep(3, n)) #An array storing the residual sum of squares
for(i in 1:nrow(vecs)){
subject_subset <- rowSums(t(vecs[i, ] == t((A[, 1:ncol(A)])))) == n #The list of subjects with genotype group equal to vecs[i,]
if(sum(subject_subset) == 0){
m[t(1 + vecs[i,])] <- 0
rss[t(1 + vecs[i,])] <- 0
}else{
m[t(1 + vecs[i,])] <- mean(qt[subject_subset])
rss[t(1 + vecs[i,])] <- sum((qt[subject_subset] - m[t(1 + vecs[i, ])])^2)
}
}
}
#Null model.
#The problem can be solved analytically if there is only one covariate. Hence the if-statement
if(n <= 2){
rss_null <- c(0, 0, 0)
rss_null[1] <- rss[1, 1]+rss[2, 1]+rss[3, 1]
rss_null[2] <- rss[1, 2]+rss[2, 2]+rss[3, 2]
rss_null[3] <- rss[1, 3]+rss[2, 3]+rss[3, 3]
aa <- 2 * freqs[2] * rss_null[1]
bb <- (2 * freqs[2] - 1) * rss_null[2]
cc <- (2 * freqs[2] - 2) * rss_null[3]
alpha_null <- (- bb + sqrt(bb^2 - 4 * aa * cc)) / (2 * aa) #This is the maximum likelyhood estimator for alpha
sigma_null <- (rss_null[3] / (alpha_null^2) + rss_null[2] / alpha_null + rss_null[1]) / k #This is the maximum likelyhood estimator for sigma
l_null <- - k * log(sigma_null) / 2 - log(alpha_null) * k * freqs[2] #This is the likelyhood function evaluated at those values
}else{
#if there is more than one covariate we use Newtons method to solve the problem numerically
vecs_null <- list(c(0, 1, 2))
vecs_null <- rep(vecs_null, n - 1)
vecs_null <- as.matrix(do.call(expand.grid, vecs_null))
rss_null <- array(0, rep(3, n - 1))
for(i in 1:nrow(vecs_null)){
rss_null[t(1 + vecs_null[i, ])] <- rss[t(1 + c(0, vecs_null[i, ]))] + rss[t(1 + c(1, vecs_null[i, ]))] + rss[t(1 + c(2, vecs_null[i, ]))]
}
freqs_null <- freqs[2:n]
alpha_null <- rep(1, n - 1) # This is the orignial guess of the alpha parameters. The subsequent for-loop is Newtons method to obtain the correct maximum likelyhood estimators
for(i in 1:iter_num){
alpha_null <- tryCatch(alpha_null - solve(Hess_likely(alpha_null, rss_null, freqs_null)) %*% G_likely(alpha_null, rss_null, freqs_null), error = function(err) NA)
}
sigma_null <-0
for(i in 1:nrow(vecs_null)){
sigma_null <- sigma_null + rss_null[t(1 + vecs_null[i, ])] * prod(alpha_null^(2 - vecs_null[i, ]))
}
sigma_null <- sigma_null / (k * prod(alpha_null^2)) # This is the maximum likelyhood estimator for sigma
l_null <- - k * log(sigma_null) / 2 - k * sum(log(alpha_null) * freqs_null) ### This is the likelyhood function evaluated at those values
}
#Alt model
#We always need newtons method to solve for the parameters in the alternative model.
alpha_alt <- rep(1, n)
for(i in 1:iter_num){
alpha_alt <- tryCatch(alpha_alt - solve(Hess_likely(alpha_alt, rss, freqs)) %*% G_likely(alpha_alt, rss, freqs), error = function(err) NA)
}
if(is.na(alpha_alt[1])){
P <- alpha_alt
}else{
sigma_alt <- 0
for(i in 1:nrow(vecs)){
sigma_alt <- sigma_alt + rss[t(1 + vecs[i, ])] * prod(alpha_alt^(2 - vecs[i, ]))
}
sigma_alt <- sigma_alt / (k * prod(alpha_alt^2)) # This is the maximum likelyhood estimator for sigma
l_alt <- - k * log(sigma_alt) / 2 - k * sum(log(alpha_alt)*freqs) ### This is the likelyhood function evaluated at those values
#Finally we compare the maximized likelyhood functions to obtain an approximated X2-statistic
X2 <- 2 * (l_alt - l_null)
alpha_alt <- alpha_alt^(flips)
P <- stats::pchisq(X2, 1, lower.tail = FALSE)
}
return(list(alpha = alpha_alt[1], pval = P))
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.