# R/vbdmR.R In vbdm: Variational Bayes Discrete Mixture Model

```
#R implementation of the discrete mixture regression model (vanilla) for rare variant association analysis
#equations referred to below are in the companion discrete_mixture.pdf file

#################################vbdmR####################################
#INPUTS:
#y: continuous outcome to be tested
#G: a matrix of genotypes
#X: a matrix of confounding covariates
#tol: optionally specified tolerance for convergence
#thres: minor allele frequency threshold for variants within G
#scaling: whether or not the columns of G should be scaled to have mean 0 and variance 1
#hyper: hyperparameters alpha and beta for the prior over the mixing probability parameter.  Default is c(2,2).
##########################################################################
#OUTPUTS:
#keep: the columns of G that passed the MAF threshold and are used in the analysis
#y: outcome
#gssq: the sum of squares of the columns of G
#Gq: the unscaled G matrix
#X: the covariate matrix used in the analysis
#n: sample size
#m: number of variants
#p: number of covariates
#pvec: posterior probability of association of each variant
#prob: the mixing probability (i.e. the proportion of associated variants in a gene)
#theta: the estimate of the effect of the burden of associated variants
#gamma: the fixed effects
#sigma: the estimated residual variance
#Gp: G%*%pvec, i.e. the matrix-vector product between genotypes and posterior probabilities
#resid: the residuals from the model
#lb: the lower bound of the alternative model
#lb_null: the lower bound of the null model
#tol: the tolerance used
#lrt: the likelihood ratio test statistic
#p.value: the p-value of the association (assuming lrt ~ chi^2_1)
##########################################################################
#EXAMPLE:
#set.seed(1)
#n <- 2000;
#m <- 20;
#p <- 2;
#G <- matrix(rbinom(n*m,2,0.01),n,m);
#y <- G%*%rbinom(m,1,0.2);
#y <- y + rnorm(n,0,sqrt(10));
#X <- matrix(rnorm(n*p),n,p);
#res_scaled <- vbdmR(y=y,G=G,X=X,scaling=TRUE);
#res_unscaled <- vbdmR(y=y,G=G,X=X,scaling=FALSE);

vbdmR <- function(y,G,X=NULL,tol=1e-4,thres=0.05,scaling=TRUE,hyper=c(2,2)){

  update_p <- function(model){
    #This function produces the estimates of the posterior probabilities for
    #each variant and stores them in the list element model$pvec.
    #It also updates the overall residuals and the probability-weighted
    #burden scores after accounting for each variant in the model,
    #weighted by its posterior probability.
    for(j in 1:model$m){
      #extract the current value of the posterior probability
      pold <- model$pvec[j];

      #compute the dot product between the residual *without* variant j
      #and the genotype vector for variant j
      vec1 <- t(model$resid)%*%model$G[,j]+model$gssq[j]*pold*model$theta;

      #compute the first term in the expression for the posterior probability
      #of variant j (related to the standard error of the estimate)
      #equation 10
      a1 <- as.numeric(model$theta^2*model$gssq[j]);
      #cat(a1,'a1\n')

      #compute the second term in the expression for the posterior probability
      #of variant j (related to the estimate of the effect of variant j)
      #equation 10
      a2 <- -as.numeric(2*model$theta*(vec1));
      #cat(a2,'a2\n')

      #compute the third term in the expression for the posterior probability
      #of variant j (related to the overall mixture probability of being non-zero).
      #The higher this is, the weaker the evidence of association needs to be
      #for a high posterior probability.
      #a3 <- -log(model$prob);
      #equation 10
      a3 <- -(digamma(model$prob*(model$m+sum(model$hyper)))-digamma(model$m+sum(model$hyper)));
      #cat(a3,'a3\n')

      #compute the fourth term in the expression for the posterior probability
      #of variant j (related to the overall mixture probability of being zero).
      #The higher this is, the stronger the evidence of association needs to be
      #for a high posterior probability.
      #a4 <- log(1-model$prob);
      #equation 10
      a4 <- (digamma((1-model$prob)*(model$m+sum(model$hyper)))-digamma(model$m+sum(model$hyper)));
      #cat(a4,'a4\n')

      #combine all the terms (essentially rescaling a1 and a2 by the relative
      #scale of the residuals so that all variants are treated on the same scale)
      #equation 10
      a5 <- (1/(2*model$sigma))*(a1+a2)+a3+a4;
      #cat(a5,'a5\n')

      #compute the logistic function of a5 to transform it from a real number
      #into the posterior probability
      #equation 10
      model$pvec[j] <- 1/(1+exp(a5));
      #cat(model$pvec[j],'pj\n');

      #update the model residuals by adding the old effect and subtracting the new effect
      model$resid <- model$resid + model$theta*model$G[,j]*(pold-model$pvec[j]);

      #update the reweighted burden score, model$Gp, by adding the new
      #probability-reweighted score and subtracting the old one
      model$Gp <- model$Gp + model$G[,j]*(model$pvec[j]-pold);
    }

    #update the estimate of the overall probability of being non-zero, model$prob:
    #equation 17/18
    model$prob <- (sum(model$pvec)+model$hyper[1])/(model$m+sum(model$hyper));
    #cat(model$prob,'prob\n');

    return(model);
  }
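
  #A worked illustration of the update_p step on hypothetical numbers (an
  #assumption for exposition, not values produced by the model): with
  #sigma = 2, theta = 0.5, gssq[j] = 30, and a residual/genotype dot product
  #vec1 = 4, equation 10 gives
  #  a5 = (1/(2*2))*(0.5^2*30 - 2*0.5*4) + a3 + a4 = 0.875 + a3 + a4
  #and, since a3 and a4 cancel exactly when prob = 0.5,
  #  pvec[j] = 1/(1+exp(0.875)) ~= 0.29.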

  update_theta_gamma <- function(model){

    #store the old value of theta
    theta_old <- model$theta;

    #compute the numerator of the new theta estimate
    #equation 12
    theta_new <- t(model$resid)%*%model$Gp + t(model$Gp)%*%model$Gp*model$theta;
    #cat(theta_new,'theta_new\n')

    #compute the denominator of the new theta estimate
    #equation 12
    const2 <- sum(model$Gp^2)+t(model$gssq)%*%(model$pvec-model$pvec^2);
    #cat(const2,sum(model$Gp^2),model$gssq,model$pvec,'denom\n')

    #update theta
    #equation 12
    theta_new <- as.numeric(theta_new/as.numeric(const2));
    #cat(theta_new,'tn2\n')
    if(!is.null(model$theta2)){
      #under the null model, theta is fixed at the supplied value
      theta_new <- model$theta2;
    }
    model$theta <- theta_new;

    #update the residuals
    model$resid <- model$resid + model$Gp*(theta_old-theta_new);

    #extract the covariates
    Z <- model$X;

    #store the old values of the covariate effects
    #equation 14
    alpha_old <- c(model$gamma);

    #solve for the new covariate effects by least squares
    #equation 14
    alpha <- solve(t(Z)%*%Z)%*%t(Z)%*%(model$y-model$Gp*theta_new);
    #cat(alpha,'alpha\n')

    #update the covariate effects
    #equation 14
    model$gamma <- alpha;

    #update the residuals
    model$resid <- model$resid + Z%*%(alpha_old-alpha);

    return(model);
  }
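
  #A reading of equation 12 as implemented above (interpretation, not extra
  #computation): theta is the ratio of the covariance between the full signal
  #resid + Gp*theta and the expected burden Gp to the expected sum of squares
  #of the burden, sum(Gp^2) + sum(gssq*(pvec - pvec^2)); the second term
  #corrects for the Bernoulli variance pvec*(1-pvec) of each variant's
  #latent association indicator.
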
  update_sigma <- function(model){

    #compute the correction to the sum of squared errors from the lower bound
    #equation 15
    const2 <- model$gssq%*%(model$pvec-model$pvec^2);

    #compute the error variance sigma including the correction
    #equation 15
    model$sigma <- (t(model$resid)%*%model$resid) + const2*model$theta^2;
    #cat(model$sigma,'sigma\n')

    #normalize by the sample size
    #equation 15
    model$sigma <- model$sigma/model$n;
    #cat(model$sigma,'sigma2\n')

    return(model);
  }
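
  #Equation 15 in closed form as computed above (a restatement, assuming the
  #same notation): sigma = (||resid||^2 + theta^2*sum(gssq*pvec*(1-pvec)))/n,
  #i.e. the naive residual variance plus the variance contributed by the
  #uncertainty about which variants are truly associated.
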
  update_lb <- function(model){

    #compute the likelihood portion of the lower bound
    #equation 16
    lb <- -0.5*(model$n*(log(2*pi*model$sigma)+1));
    ##cat(lb,"ll\n")

    alpha1 <- sum(model$pvec)+model$hyper[1];
    beta1  <- model$m-sum(model$pvec)+model$hyper[2];

    #the prior portion of the lower bound
    #equation 19
    lb <- lb + (digamma(alpha1)-digamma(alpha1+beta1))*(sum(model$pvec)+1);
    lb <- lb + (digamma(beta1)-digamma(alpha1+beta1))*(model$m-sum(model$pvec)+1);
    ##cat(lb,'elp\n')

    #entropy portion of the lower bound
    #equation 20
    ent_1 <- model$pvec*log(model$pvec);
    ent_0 <- (1-model$pvec)*log(1-model$pvec);
    ent_1[is.na(ent_1)] <- 0;
    ent_0[is.na(ent_0)] <- 0;
    lb <- lb - sum(ent_1);
    lb <- lb - sum(ent_0);
    ##cat(lb,'hz\n')

    #entropy of the Beta posterior over the mixing probability
    lb <- lb + lbeta(alpha1,beta1) - (alpha1-1)*digamma(alpha1);
    lb <- lb - (beta1-1)*digamma(beta1) + (alpha1+beta1-2)*digamma(alpha1+beta1);
    ##cat(lb,'hp\n')

    model$lb <- lb;
    return(model);
  }
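
  #The quantity assembled by update_lb is the usual variational lower bound
  #(a sketch of the decomposition, matching the terms above):
  #  lb = expected log likelihood of y                       (equation 16)
  #     + expected log prior over z and pi, up to a constant (equation 19)
  #     + entropy of the Bernoulli posteriors over z         (equation 20)
  #     + entropy of the Beta(alpha1,beta1) posterior over pi.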

  run_discrete_mixture_vanilla <- function(y,G,X=NULL,tol=1e-4,theta=NULL,thres=0.05,scaling=TRUE,hyper=c(2,2)){
    model <- list();

    #preprocess the covariates: always include an intercept column
    if(is.null(X)){
      X <- as.matrix(rep(1,nrow(G)));
    }else{
      X <- cbind(rep(1,nrow(X)),X);
    }

    #preprocess G: keep only variants with minor allele frequency below thres,
    #flip variants coded on the major allele so genotypes count minor alleles,
    #and set missing genotypes to 0
    cs <- colMeans(G,na.rm=TRUE)/2;
    cvec <- apply(cbind(cs,1-cs),1,min);
    keep <- which((cvec<thres)*(cvec>0)==1);
    if(length(keep)==0){
      stop("No rare variants left.\n");
    }
    cvec2 <- apply(cbind(cs,1-cs),1,which.min);
    G <- as.matrix(G[,keep]);
    flip <- which(cvec2[keep]==2);
    if(length(flip)>0){
      G[,flip] <- 2-G[,flip];
    }
    nav <- which(is.na(G));
    if(length(nav)>0){
      G[nav] <- 0;
    }
    Gq <- G;

    #optionally scale all variants to have mean 0 and variance 1
    if(scaling){
      G <- as.matrix(scale(G));
    }else{
      G <- as.matrix(G);
    }

    #initialize the model
    model$keep <- keep;
    model$y <- y;
    model$G <- G;
    model$Gq <- Gq;
    model$gssq <- colSums(G^2);
    model$X <- X;
    model$n <- length(y);
    model$m <- ncol(G);
    model$p <- ncol(X);
    model$hyper <- hyper;

    model$pvec <- rep(0.5,model$m);
    model$prob <- 0.5;
    model$theta <- 0;
    model$gamma <- rep(0,model$p);
    model$sigma <- var(model$y);
    model$Gp <- model$G%*%model$pvec;
    model$resid <- model$y;
    model <- update_lb(model);
    model$tol <- tol;
    model$theta2 <- theta;
    lb_old <- -Inf;
    lb_new <- 0;

    #iterate the variational updates until the lower bound converges
    while(abs(lb_old-lb_new)>model$tol){
      lb_old <- lb_new;
      model <- update_p(model);
      #print(model$pvec)
      model <- update_theta_gamma(model);
      model <- update_sigma(model);
      model <- update_lb(model);
      lb_new <- model$lb;
      #print(lb_new);
    }
    #print(model$pvec);
    #print(model$lb);
    return(model);
  }

  #fit the alternative model (theta free) and the null model (theta fixed at 0),
  #then form a likelihood ratio test statistic from the two lower bounds
  model_a <- run_discrete_mixture_vanilla(y=y,G=G,X=X,tol=tol,thres=thres,scaling=scaling,hyper=hyper);
  model_b <- run_discrete_mixture_vanilla(y=y,G=G,X=X,tol=tol,theta=0,thres=thres,scaling=scaling,hyper=hyper);
  model_a$lrt <- -2*model_b$lb + 2*model_a$lb;
  model_a$p.value <- pchisq(model_a$lrt,1,lower.tail=FALSE);
  model_a$lb_null <- model_b$lb;
  return(model_a);
}
```
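
The example from the header comments, run end to end with the exported `vbdmR` name (a minimal sketch; the `p.value` and `pvec` fields are those described in the OUTPUTS list above):

```
set.seed(1)
n <- 2000; m <- 20; p <- 2
G <- matrix(rbinom(n*m, 2, 0.01), n, m)  # rare genotype matrix
y <- G %*% rbinom(m, 1, 0.2)             # roughly 20% of variants carry an effect
y <- y + rnorm(n, 0, sqrt(10))           # add residual noise
X <- matrix(rnorm(n*p), n, p)            # confounding covariates

res <- vbdmR(y = y, G = G, X = X, scaling = TRUE)
res$p.value     # gene-level p-value from the likelihood ratio test
head(res$pvec)  # per-variant posterior probabilities of association
```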
