Gene-Environment Interaction in Longitudinal Studies

Documented in GEI.prelim GEI.test

#########################
#   Hypothesis Testing
#########################

#p.s: the current code is only for a single environmental exposure
GEI.prelim<-function(Y,time,E,X=NULL,E.method='ns',E.df=floor(sqrt(length(unique(time[,1])))),corstr="exchangeable"){
  ##Preliminary
  Y<-as.matrix(Y);E<-as.matrix(E);N<-nrow(Y)
  cluster.id<-unique(time[,1]);m<-length(cluster.id)
  #sort the data by cluster.id
  order.index<-order(match(time[,1],cluster.id))
  if (sum(time[,1]!=time[order.index,1])>0){
    msg<-sprintf("time was not sorted, now data is grouped by subject id")
    warning(msg,call.=F)
    Y<-as.matrix(Y[order.index,]);E<-as.matrix(E[order.index,])
    if(length(X)!=0){X<-as.matrix(X[order.index])}
    time<-time[order.index,]
  }

  ##Generate polynomial/cubic basis functions
  #use quantiles for knots:
  knots<-quantile(E,probs=seq(0,1,length=E.df+1));knots<-unique(knots[!knots%in%knots[c(1,E.df+1)]])
  if(E.df!=1){B.E<-NULL;
    if(E.method=='ns'){M.E<-ns(E,knots=knots)}
    if(E.method=='bs'){M.E<-bs(E,knots=knots)}
    if(E.method=='ps'){M.E<-ps(E,df=E.df)}
  }else{M.E<-E}
  M.E<-as.matrix(M.E)
  #standrdize enviromental exposure
  if(E.df!=1){M.E<-M.E[,which(apply(M.E,2,sd)>0)]}
  #fit the gee adjusting for X and E, the residuals will be used for selecting the G components
  #Z.A0<-svd(cbind(rep(1,N),X, M.E))$u
  #nullgee<-geeglm(Y~0+Z.A0,family=gaussian,corstr=corstr,id=time[,1])
  Z.A0<-svd(cbind(X, M.E))$u
  nullgee<-geem(Y~Z.A0,family=gaussian,corstr=corstr,id=time[,1])
  mu<-colSums(nullgee$beta*t(cbind(rep(1,N),Z.A0)));Y.res0<-Y-mu

  #prepare the intermediate results
  result.prelim<-list(Y=Y,time=time,X=X,E=E,M.E=M.E,cluster.id=cluster.id,m=m,corstr=corstr,Y.res0=Y.res0,Z.A0=Z.A0)
  return(result.prelim)
}

#G is an m*q matrix, each row coresponds to one subject.
GEI.test<-function(result.prelim,G,Gsub.id=NULL,G.method='wPCA',G.df=floor(sqrt(nrow(G))),bootstrap=NULL,MinP.adjust=NULL,impute.method='fixed'){
  ## Load preliminary data
  Y<-result.prelim$Y;corstr<-result.prelim$corstr
  m<-result.prelim$m;X<-result.prelim$X
  time<-result.prelim$time;cluster.id<-result.prelim$cluster.id
  E<-result.prelim$E;M.E<-result.prelim$M.E
  Y.res0<-result.prelim$Y.res0 #residuals by regressing Y on cbind(rep(1,N),X, M.E)
  Z.A0<-result.prelim$Z.A0 #Z.A0=svd(cbind(rep(1,N),X, M.E))$u

  ## Deal with the genotype
  SNP.list<-colnames(G)
  # match the phenotype and genotype subject ID
  if(length(Gsub.id)==0){Gsub.id<-cluster.id}
  G<-as.matrix(G[match(cluster.id,Gsub.id),])

  # missing genotype imputation
  G[G==9]<-NA
  N_MISS<-sum(is.na(G))
  if(N_MISS>0){
    msg<-sprintf("The missing genotype rate is %f. Imputation is applied.", N_MISS/nrow(G)/ncol(G))
    warning(msg,call.=F)
    G<-Impute(G,impute.method)
  }
  # centered genotype
  G<-as.matrix(G);center.G<-t(t(G)-colMeans(G))
  MAF<-colMeans(as.matrix(G[,colMeans(center.G^2)!=0]))/2;MAF[MAF>0.5]<-1-MAF[MAF>0.5]
  G<-as.matrix(center.G[,colMeans(center.G^2)!=0]);n.G<-ncol(G)
  SNP.name<-SNP.list[colMeans(center.G^2)!=0]
  if(ncol(G) == 0){
    msg<-sprintf("G does not include any heterozygous variant")
    stop(msg)
  }

  # generate long form genotype
  Z<-as.matrix(G[match(time[,1],cluster.id),])

  # find adjusted G components
  G.df<-min(G.df,ncol(G))
  if (G.method=='PCA'){G.adj<-GEI.PCA(time,G,G.df)}
  if (G.method=='wPCA'){G.adj<-GEI.wPCA(Y.res0,time,G,G.df)}
  if (G.method=='R2'){G.adj<-GEI.R2(Y.res0,time,G,G.df)}

  ## Fit null model using GEE
  #Z.A<-cbind(Z.A0,G.adj)
  #nullgee<-geeglm(Y~0+Z.A,family=gaussian,corstr=corstr,id=time[,1])
  #rho<-nullgee$geese$alpha
  Z.A<-cbind(Z.A0,G.adj) #Z.A does not include the intercept for geeM
  nullgee<-try(geem(Y~Z.A,family=gaussian,corstr=corstr,id=time[,1]),silent = TRUE)
  if(class(nullgee) == "try-error"){
    msg<-sprintf("function geem() in package geeM cannot be fitted due to a numerical error")
    stop(msg)
  }

  rho<-as.numeric(nullgee$alpha)
  ## Generate interaction variables
   if (corstr=="ar1"){
    n.total<-1;n.rep<-as.numeric(table(time[,1]))
    c.E<-NULL;for (i in m){
      ni<-n.rep[i]
      index<-n.total:(n.total+ni-1)
      n.total<-n.total + ni
      Vi<-rho^abs(outer(time[index,2], time[index,2],"-"));Vi.inv<-solve(Vi)
      c.E<-c(c.E,rowSums(Vi.inv))
    }
    Z.I<-as.vector((E-sum(E*c.E)/sum(c.E)))*Z
  }else{Z.I<-as.vector(Standardize(E))*Z}

  if(length(bootstrap)==0){
    # calculate score
    GEI.fit<-GEI.Score(nullgee,Y,time,Z.A,Z.I,corstr) #Z.A does not include the intercept
    # calculate score statistics and their covariance matrix
    score<-GEI.fit$score;M<-GEI.fit$M
    # calculate p-value
    TestStat<-t(score)%*%score
    EigenM<-eigen(M,symmetric=TRUE,only.values=TRUE)$values
    p.value<-davies(TestStat,EigenM)$Qq

    # implement a single SNP based analysis by GEE score tests
    p.single<-cbind(MAF,as.vector(pchisq(as.vector(score)^2/diag(M),df=1,lower.tail=F)))
    colnames(p.single)<-c('MAF','p.value')
    rownames(p.single)<-SNP.name
  }else{
    # calculate score
    GEI.fit<-GEI.bootstrap(nullgee,Y,time,Z.A,Z.I,corstr,bootstrap) #Z.A does not include the intercept
    score<-GEI.fit$score;perturb.score<-GEI.fit$perturb.score
    Q<-t(score)%*%score
    # calculate bootstrap based mean, variance and kurtosis
    perturb.Q<-apply(perturb.score^2,2,sum)
    perturb.mean<-mean(perturb.Q)
    perturb.variance<-var(perturb.Q)
    perturb.kurtosis<-mean((perturb.Q-perturb.mean)^4)/perturb.variance^2-3
    # calculate p.value
    if (perturb.kurtosis>0){df<-12/perturb.kurtosis}else{
      df<-100000
    }
    p.value<-1-pchisq((Q-perturb.mean)*sqrt(2*df)/sqrt(perturb.variance)+df,df)
    # implement a single SNP based analysis by GEE score tests
    perturb.variance.single<-apply(perturb.score,1,var)
    p.single<-cbind(MAF,as.vector(pchisq(as.vector(score)^2/perturb.variance.single,df=1,lower.tail=F)))
    colnames(p.single)<-c('MAF','p.value')
    rownames(p.single)<-SNP.name
  }

  if(length(MinP.adjust)!=0){
    eigen.G<-eigen(cor(G),only.values=TRUE)$values
    me.G<-effect.n(eigen.G,MinP.adjust)
    p.MinP<-min(min(p.single[,2])*me.G,1)
  }else{p.MinP<-NA}

  return(list(n.marker=n.G,E.df=ncol(M.E),G.df=G.df,p.single=p.single,p.MinP=p.MinP,p.value=p.value))
}

#################################################
#   Calculate Score Statistics and their Variance
#################################################


GEI.bootstrap<-function(nullgee,Y,time,Z.A,Z.I,corstr,bootstrap) #Z.A does not include the intercept
{
  rho<-as.numeric(nullgee$alpha);N<-nrow(time);Z.A<-cbind(rep(1,N),Z.A) #add the intercept
  mu<-colSums(nullgee$beta*t(Z.A));Y.res<-Y-mu
  #rho<-nullgee$geese$alpha;mu<-nullgee$fitted.values;Y.res<-Y-mu
  cluster.id<-unique(time[,1]);m<-length(cluster.id);dimI<-ncol(Z.I)
  ##Generalized score test
  score.array<-matrix(0,dimI,m)
  n.total<-1;n.rep<-as.numeric(table(time[,1]))
  for (i in 1:m)
  {
    ni<-n.rep[i]
    index<-n.total:(n.total+ni-1)
    n.total<-n.total + ni
    #working covariance matrix
    if (corstr=="exchangeable"){Vi<-diag(1-rho,ni)+ matrix(rho, ni, ni);Vi.inv<-solve(Vi)}
    if (corstr=="ar1"){Vi<-rho^abs(outer(time[index,2], time[index,2],"-"));Vi.inv<-solve(Vi)}
    if (corstr=="independence"){Vi<-diag(1,ni);Vi.inv<-Vi}
    #if (link=="logit"){Ci<-mu[index]*(1-mu[index]);Vi.inv<-outer(sqrt(Ci),sqrt(Ci)^-1,"*")*Vi.inv}
    #score matrices
    Z.Ii<-Z.I[index,];if (ni==1) {Z.Ii<-t(Z.Ii)}
    score.array[,i]<-t(Z.Ii)%*%(Vi.inv%*%Y.res[index])/sqrt(m)
  }
  score<-as.matrix(apply(score.array,1,sum))
  # null distribution
  perturb.coef<-matrix(rbinom(m*bootstrap,1,0.5)*2-1,m,bootstrap)
  perturb.score<-score.array%*%perturb.coef

  return(list(score=score,perturb.score=perturb.score))#Q: test statistic; M: covariance matrix
}


GEI.Score<-function(nullgee,Y,time,Z.A,Z.I,corstr) #Z.A does not include the intercept
{
  rho<-as.numeric(nullgee$alpha);N<-nrow(time);Z.A<-cbind(rep(1,N),Z.A) #add the intercept
  mu<-colSums(nullgee$beta*t(Z.A));Y.res<-Y-mu
  #rho<-nullgee$geese$alpha;mu<-nullgee$fitted.values;Y.res<-Y-mu
  cluster.id<-unique(time[,1]);m<-length(cluster.id)
  ##Generalized score test
  dimY<-nrow(Y.res);dimA<-ncol(Z.A);dimI<-ncol(Z.I);score<-matrix(0,dimI,1)
  An<-matrix(0,dimA+dimI,dimA+dimI);Bn<-matrix(0,dimA+dimI,dimA+dimI)
  n.total<-1;n.rep<-as.numeric(table(time[,1]))
  for (i in 1:m)
  {
    ni<-n.rep[i]
    index<-n.total:(n.total+ni-1)
    n.total<-n.total + ni
    #working covariance matrix
    if (corstr=="exchangeable"){Vi<-diag(1-rho,ni)+ matrix(rho, ni, ni);Vi.inv<-solve(Vi)}
    if (corstr=="ar1"){Vi<-rho^abs(outer(time[index,2], time[index,2],"-"));Vi.inv<-solve(Vi)}
    if (corstr=="independence"){Vi<-diag(1,ni);Vi.inv<-Vi}
    #if (link=="logit"){Ci<-mu[index]*(1-mu[index]);Vi.inv<-outer(sqrt(Ci),sqrt(Ci)^-1,"*")*Vi.inv}
    #score matrices
    Z.Ai<-Z.A[index,];Z.Ii<-Z.I[index,]
    if (ni==1) { Z.Ai<-t(Z.Ai); Z.Ii<-t(Z.Ii)}
    Zi<-cbind(Z.Ii,Z.Ai)
    #An<-An+t(Zi)%*%Vi.inv%*%Zi
    An[(dimI+1):(dimI+dimA),(dimI+1):(dimI+dimA)]<-An[(dimI+1):(dimI+dimA),(dimI+1):(dimI+dimA)]+t(Z.Ai)%*%Vi.inv%*%Z.Ai
    An[1:dimI, (dimI+1):(dimI+dimA)]<-An[1:dimI, (dimI+1):(dimI+dimA)]+t(Z.Ii)%*%(Vi.inv%*%Z.Ai)
    Bntemp<-t(Zi)%*%(Vi.inv%*%Y.res[index])
    Bn<-Bn+Bntemp%*%t(Bntemp)
    score<-score+Bntemp[1:dimI,]#t(Z.Ii)%*%Vi.inv%*%Y.res[index]
  }
  An<-An/m; Bn<-Bn/m
  #Calculate p-value
  A00<-An[(dimI+1):(dimI+dimA),(dimI+1):(dimI+dimA)];A00.inv<-solve(A00);A10<-An[1:dimI, (dimI+1):(dimI+dimA)]
  B11<-Bn[1:dimI,1:dimI];B10<-Bn[1:dimI,(dimI+1):(dimI+dimA)];B00<-Bn[(dimI+1):(dimI+dimA),(dimI+1):(dimI+dimA)]
  P<-A10%*%A00.inv;M<-B11-t(B10%*%t(P))-B10%*%t(P)+P%*%B00%*%t(P)
  #Revise the covariance matrix using m-q
  M<-M*m/(m-dimA)
  #Revise the score using standard representation
  score<-score/sqrt(m)
  return(list(score=score,M=M))#Q: test statistic; M: covariance matrix
}

############################################
#   Type-I Error Control: adjustment of G
############################################

GEI.wPCA<-function(outcome,time,G,ncomp=5)
{
  cluster.id<-unique(time[,1]);n.G<-ncol(G)
  svd.G<-svd(G);svd.Z<-as.matrix(svd.G$u[match(time[,1],cluster.id),])
  #Calculate weighted eigen-values
  adj.score<-(svd.G$d)^2*(t(svd.Z)%*%outcome)^2;# weighted
  Iter<-TRUE;index<-rep(0,n.G)
  if (ncomp==ncol(svd.Z)){index<-rep(1,n.G);G.adj<-svd.Z;Iter=FALSE}
  if (ncomp==0){G.adj<-NULL;Iter=FALSE}
  if (Iter==TRUE){G.adj<-svd.Z[,order(adj.score,decreasing=T)][,1:ncomp]}
  return(G.adj)
}

GEI.R2<-function(outcome,time,G,ncomp=5)
{
  cluster.id<-unique(time[,1]);n.G<-ncol(G)
  svd.G<-svd(G);svd.Z<-as.matrix(svd.G$u[match(time[,1],cluster.id),])
  #Calculate R2s
  adj.score<-(t(svd.Z)%*%outcome)^2;# weighted
  Iter<-TRUE;index<-rep(0,n.G)
  if (ncomp==ncol(svd.Z)){index<-rep(1,n.G);G.adj<-svd.Z;Iter=FALSE}
  if (ncomp==0){G.adj<-NULL;Iter=FALSE}
  if (Iter==TRUE){G.adj<-svd.Z[,order(adj.score,decreasing=T)[1:ncomp]]}
  return(G.adj)
}

GEI.PCA<-function(time,G,ncomp=5)
{
  cluster.id<-unique(time[,1]);n.G<-ncol(G)
  svd.G<-svd(G);svd.Z<-as.matrix(svd.G$u[match(time[,1],cluster.id),])
  Iter<-TRUE
  if (ncomp==ncol(svd.Z)){index<-rep(1,n.G);G.adj<-svd.Z;Iter=FALSE}
  if (ncomp==0){G.adj<-NULL;Iter=FALSE}
  if (Iter==TRUE){G.adj<-svd.Z[,1:ncomp]}
  return(G.adj)
}

# calculate number of independent tests
effect.n<-function(x,MinP.adjust){
  temp<-0;sum.EV<-sum(x) #summation of eigen values
  for (i in 1:length(x)){
    temp<-temp+x[i];if (temp>sum.EV*MinP.adjust){break}
  }
  return(i)
}

#Data management
Minor.allele<-function(x){if(mean(x)>1){x<-2-x};return(x)}
Standardize<-function(x){return((x-mean(x))/sd(x))}
Center<-function(x){return(x-mean(x))}
Variation<-function(x){x<-apply(x,2,Minor.allele);x<-x[,which(apply(x,2,sd)>0)];return(x)}
Common<-function(x){x<-apply(x,2,Minor.allele);freq<-apply(x,2,mean)/2;x<-x[,which(freq>0.05)];return(x)}
##Matrix calculation
#Get sqrt.root of a matrix
Get.sqrt<-function(A){
  a.eig <- eigen(A,symmetric=TRUE)
  ID1<-which(a.eig$values > 0)
  if(length(ID1)== 0){stop("Error to obtain matrix square!")}
  a.sqrt <- a.eig$vectors[,ID1] %*% diag(sqrt(a.eig$values[ID1])) %*% t(a.eig$vectors[,ID1])
  return(a.sqrt)
}
#Get inverse of a matrix; numerically robust
Get.inverse<-function(A){
  a.eig <- eigen(A,symmetric=TRUE)
  ID1<-which(a.eig$values > 0)
  if(length(ID1)== 0){stop("Error to obtain matrix inverse!")}
  a.inverse <- a.eig$vectors[,ID1] %*% diag(a.eig$values[ID1]^-1) %*% t(a.eig$vectors[,ID1])
  return(a.inverse)
}
#Get -1/2 of a matrix; numerically robust
Get.inv.sqrt<-function(A){
  a.eig <- eigen(A,symmetric=TRUE)
  ID1<-which(a.eig$values > 0)
  if(length(ID1)== 0){stop("Error to obtain matrix square!")}
  a.sqrt <- a.eig$vectors[,ID1] %*% diag(sqrt(a.eig$values[ID1])^-1,length(ID1)) %*% t(a.eig$vectors[,ID1])
  return(a.sqrt)
}
#Time similarity
Check.same<-function(x,y){return(as.numeric(x==y))}
Check.near<-function(x,y){return(as.numeric(abs(x-y)==1))}
D.matrix<-function(X){
  n<-length(X[,1]);temp<-matrix(0,n,n)
  #checking time
  temp<-outer(X[,1],X[,1],Check.same)*outer(X[,2],X[,2],Check.near)
  diag(temp)<-0
  return(temp)
}
ps<-function(x,df) #generate polynomial sieves
{
  temp<-NULL
  for (i in 1:df){temp<-cbind(temp,x^i)}
  return(temp)
}
# Simple Imputation (from SKAT package)
# Z : an m x p genotype matrix with m samples and p SNPs

Impute<-function(Z, impute.method){
  p<-dim(Z)[2]
  if(impute.method =="random"){
    for(i in 1:p){
      IDX<-which(is.na(Z[,i]))
      if(length(IDX) > 0){
        maf1<-mean(Z[-IDX,i])/2
        Z[IDX,i]<-rbinom(length(IDX),2,maf1)
      }
    }
  } else if(impute.method =="fixed"){
    for(i in 1:p){
      IDX<-which(is.na(Z[,i]))
      if(length(IDX) > 0){
        maf1<-mean(Z[-IDX,i])/2
        Z[IDX,i]<-2 * maf1
      }
    }
  } else if(impute.method =="bestguess") {
    for(i in 1:p){
      IDX<-which(is.na(Z[,i]))
      if(length(IDX) > 0){
        maf1<-mean(Z[-IDX,i])/2
        Z[IDX,i]<-round(2 * maf1)
      }
    }
  } else {
    stop("Error: Imputation method shoud be \"fixed\", \"random\" or \"bestguess\" ")
  }
  return(as.matrix(Z))
}

Any scripts or data that you put into this service are public.

LGEWIS documentation built on May 2, 2019, 3:48 p.m.

rdrr.io home R language documentation Run R code online

CRAN packages Bioconductor packages R-Forge packages GitHub packages

Note that we can't provide technical support on individual packages. You should contact the package authors for that.

LGEWIS
Tests for Genetic Association/Gene-Environment Interaction in Longitudinal Studies

R/GEI.R
In LGEWIS: Tests for Genetic Association/Gene-Environment Interaction in Longitudinal Studies

Defines functions GEI.prelim GEI.test GEI.bootstrap GEI.Score GEI.wPCA GEI.R2 GEI.PCA effect.n Minor.allele Standardize Center Variation Common Get.sqrt Get.inverse Get.inv.sqrt Check.same Check.near D.matrix ps Impute

Documented in GEI.prelim GEI.test

Try the LGEWIS package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

LGEWIS Tests for Genetic Association/Gene-Environment Interaction in Longitudinal Studies

R/GEI.R In LGEWIS: Tests for Genetic Association/Gene-Environment Interaction in Longitudinal Studies

Defines functions GEI.prelim GEI.test GEI.bootstrap GEI.Score GEI.wPCA GEI.R2 GEI.PCA effect.n Minor.allele Standardize Center Variation Common Get.sqrt Get.inverse Get.inv.sqrt Check.same Check.near D.matrix ps Impute

Documented in GEI.prelim GEI.test

Try the LGEWIS package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

LGEWIS
Tests for Genetic Association/Gene-Environment Interaction in Longitudinal Studies

R/GEI.R
In LGEWIS: Tests for Genetic Association/Gene-Environment Interaction in Longitudinal Studies