
# Filename: hapassoc.R
# Version : $Id: hapassoc.R,v 1.60 2012/04/05 04:09:37 mcneney Exp $

# hapassoc- Inference of trait-haplotype associations in the presence of uncertain phase
# Copyright (C) 2003  K.Burkett, B.McNeney, J.Graham

# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.

# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.

# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA

########################################################################

hapassoc<-function(form, haplos.list, baseline="missing", family=binomial(), 
                   design="cohort", disease.prob=NULL, freq=NULL, maxit=50, 
                   tol=0.001, start=NULL, verbose=FALSE){
environment(form)<-environment()#set envir of formula to envir w/i hapassoc function
call <-  match.call()

if (is.character(family))
    family <- get(family, mode = "function", envir = parent.frame())
if (is.function(family))
    family <- family()
if(design=="cc" && family$family!="binomial") {
  warning(paste("design is case-control, but family is ",family$family,"; using binomial instead",sep=""))
  family<-binomial()
}
#if(family$family=="binomial") family<-binomialNoWarn() 
   # so we don't issue warnings in M step

haplos.names<-names(haplos.list$initFreq)

 # Initial freq values, if no freq specified use initFreq
 if (!is.null(freq)) {
   names(freq)<-haplos.names
 } else {
   freq<-haplos.list$initFreq
 }

 if(baseline=="missing") {
   #no baseline specified
   baseline<-haplos.names[which.max(freq)] 
 }

 if ( length( setdiff( colnames(haplos.list$haploDM), all.vars(form)[2:length(all.vars(form))]))==0){
   #The formula lists all possible haplotypes, so one must be dropped as
   #the baseline category and a new model formula created.
   column.subsethaplo <- colnames(haplos.list$haploDM) != baseline
   resp <- as.character(all.vars(form)[1])
   column.subsetnonhaplo <- intersect(colnames(haplos.list$nonHaploDM),all.vars(form)[2:length(all.vars(form))])
   form.rhs <- paste(c(names(haplos.list$nonHaploDM[, column.subsetnonhaplo, 
            drop = FALSE]), names(haplos.list$haploDM)[column.subsethaplo]), 
            collapse = "+")
   form <- formula(paste(resp, "~", form.rhs))
   cat(paste("Formula contained all possible haplotypes; using ",baseline," as reference 
category\n\n"))
 }

 if(!is.na(all.vars(form)[2]) && all.vars(form)[2]==".") {
   #Then formula is of form "y ~ ." 
   column.subsethaplo <- colnames(haplos.list$haploDM)!=baseline
   resp<-as.character(all.vars(form)[1])
   column.subsetnonhaplo <- colnames(haplos.list$nonHaploDM)!=resp
   form.rhs<-paste(c(names(haplos.list$nonHaploDM[,column.subsetnonhaplo,drop=FALSE]),
				   names(haplos.list$haploDM)[column.subsethaplo]),collapse="+")
   form<-formula(paste(resp,"~",form.rhs))
 }
 hdat <- cbind(haplos.list$nonHaploDM, haplos.list$haploDM)
 colnames(hdat)<- c(colnames(haplos.list$nonHaploDM),
                    colnames(haplos.list$haploDM))
 ID <- haplos.list$ID
 N<-round(sum(haplos.list$wt))
 wts<-haplos.list$wt
 
 # Get the haplotype columns
 haplos<-haplos.list$haploDM
 haploMat<-haplos.list$haploMat
 allHaps<-c(haploMat[,1],haploMat[,2]) #Needed later in hapassoc loop for wt calcs
 # Initial beta values calculated from augmented data set
 if (family$family=="binomial"){
   # Suppress only the spurious "non-integer #successes" warning that the
   # fractional EM weights provoke; re-issue any other warning.
   regr <- tryCatch(glm(form, family=family, data=hdat, weights=wts,
                        start=start),
                    warning = function(w) {
                      teststring <- "non-integer #successes in a binomial glm!"
                      if (teststring == conditionMessage(w)) {
                        suppressWarnings(glm(form, family=family, data=hdat,
                                             weights=wts, start=start))
                      } else {
                        glm(form, family=family, data=hdat, weights=wts,
                            start=start)
                      }
                    })
 } else {
   regr <- glm(form, family=family, data=hdat, weights=wts, start=start)
 }
 
 response<-regr$y #Change added Nov.2/04 to extract response from fitted model
 beta<-regr$coef
 fits<-regr$fitted.values
 betadiff<-1
 it<-1
 num.prob<-vector(length=nrow(hdat))
 
 # The hapassoc loop for case-control data

 if (design=="cc")    ## use the hybrid method for case-control study
 {  		
     n1 <- sum(response[!duplicated(ID)])
     n0 <- N - n1

     nhaps <- length(freq) 	
     nhaps.DM <- length(haplos.list$haploDM) 	

   # Construct the design matrix "Des.Mat" for the pseudo-sample.
     mf <- model.frame(form, hdat)
     Des.Mat <- model.matrix(form, mf, contrasts)  

   # Construct the original non-haplotype data "nonHap" for the subjects.
   # The non-haplo data may include factors, but we require the numeric
   # columns of the design matrix. We will extract these from Des.Mat by
   # trimming off haplotype columns. Haplotype columns appear last in Des.Mat
     nonHap <- haplos.list$nonHaploDM[!duplicated(ID),,drop=FALSE]
     #nonHap<- as.matrix(nonHap)     

   # Construct the design matrices (array) "DesMat.Arr" for numerator of the 
   # effective sample-sizes (see formula (13) of Spinka et al. 2005).
     nonHap.Mat <- matrix(nrow=nrow(nonHap)*length(freq), ncol=ncol(nonHap))
     nonHap.Mat<-data.frame(nonHap.Mat)
     for (j in 1:ncol(nonHap)) {
        if(is.numeric(nonHap[,j])) {
          nonHap.Mat[,j] <- c(outer(rep.int(1,length(freq)), nonHap[,j]))
        } else { #assume factor
          if(!is.factor(nonHap[,j])) stop("non-numeric, non-factor covariate")
          n.nonHapj<-as.numeric(nonHap[,j])
          nonHap.Matj <- c(outer(rep.int(1,length(freq)), n.nonHapj))
          nonHap.Mat[,j]<-factor(nonHap.Matj,labels=levels(nonHap[,j]))
        }
     }
     DesMat.Arr <- array(data=NA, dim=c(nrow(nonHap.Mat), length(beta), nhaps))
     for (j in 1:nhaps)
     {
#       Hap.Mat <- matrix(rep.int(diag(1, nhaps), N), ncol=nhaps, byrow=TRUE)
       Hap.Mat <- matrix(rep.int(diag(1, length(freq)), N), ncol=length(freq), byrow=TRUE)
       Hap.Mat[,j] <- Hap.Mat[,j] + 1
       indPooled <-  match(haplos.list$pooled,names(freq))  
       if(!any(is.na(indPooled))) { #then there is pooling to do
         pooled <- vector("numeric",nrow(Hap.Mat))
         for (k in 1:length(indPooled))
           pooled <- pooled + Hap.Mat[,indPooled[k]]
         dm <- data.frame(nonHap.Mat, Hap.Mat[,-indPooled], pooled)
       }
       else  dm <- data.frame(nonHap.Mat, Hap.Mat)
       dimnames(dm)[[2]] <-c(dimnames(haplos.list$nonHaploDM)[[2]], 
                             colnames(haplos.list$haploDM) )
       mf <- model.frame(form, dm)
       DesMat.Arr[,,j] <- model.matrix(form, mf, contrasts)
     }
     DMA.ID <- c(outer(rep.int(1,nhaps),unique(ID)))

   # Construct the design matrix "Denom.Mat" for the denominator of the 
   # effective sample-sizes  

     DiploMat <- matrix(nrow=length(freq)*(length(freq)+1)/2, ncol=length(freq))
     myInd <- 1
     for (j in 1:length(freq))  {
       diag.mat <- diag(1,length(freq))
       diag.mat[,j] <- diag.mat[,j] + 1
       DiploMat[myInd:(myInd + length(freq)-j),] <- diag.mat[j:length(freq),]
       myInd <- myInd + length(freq)-j+1
     }
     if(!any(is.na(indPooled))) {
       pooled <- vector("numeric",nrow(DiploMat))
       for (k in 1:length(indPooled))
         pooled <- pooled + DiploMat[,indPooled[k]]
       DiploMat <- data.frame(DiploMat[,-indPooled], pooled)
     }

     Denom.Mat<-matrix(NA,nrow=nrow(DiploMat)*nrow(nonHap),ncol=ncol(nonHap))
     Denom.Mat<-as.data.frame(Denom.Mat)
     for (j in 1:ncol(nonHap)) {
       if(is.numeric(nonHap[,j])) {
         Denom.Mat[,j] <- c(outer(rep.int(1,nrow(DiploMat)), nonHap[,j]))  
       } else {
         n.nonHapj<-as.numeric(nonHap[,j])
         Denom.Matj<- c(outer(rep.int(1,nrow(DiploMat)), n.nonHapj))  
         Denom.Mat[,j]<-factor(Denom.Matj,labels=levels(nonHap[,j]))
       }
     }
     for (j in 1:nhaps.DM)
       Denom.Mat <- data.frame(Denom.Mat, rep.int(DiploMat[,j], N))
       
     dimnames(Denom.Mat)[[2]] <- dimnames(dm)[[2]]    
     mf <- model.frame(form, Denom.Mat)
     Denom.Mat <- model.matrix(form, mf, contrasts)
     DenomMat.ID <- c(outer(rep.int(1, nrow(DiploMat)), unique(ID)))
     
 		
     while ( (it<maxit) && (betadiff>tol) ){
 	 	
    # Multiplicative const for haplo probs: 1 for homo, 2 for het

      haplo.probs<-rep.int(1,nrow(haplos))+isMultiHetero(haplos.list)
      haplo.probs <- haplo.probs*freq[haploMat[,1]]*freq[haploMat[,2]]
      phi<-mlPhi(regr) #Compute ML estimate of dispersion param
      if(is.null(phi)) { #no converergence in ml estimate of phi
           break() #hapassoc will throw a warning of non-convergence
      }

      num.prob <- exp(response*(Des.Mat%*%beta))
      if (!is.null(disease.prob))  # if Pr(D=1) is known, beta0 can be calulated 
      {   
         beta0 <- beta[1] - log(n1/n0) + log(disease.prob/(1-disease.prob))
         num.prob <- num.prob/(1+exp(1+Des.Mat%*%c(beta0,beta[-1]))) 
      }
      num.prob <- as.vector(num.prob)*haplo.probs   

    # E step: Calculate the weights for everyone
    # Use the ID to determine the number of pseudo-individuals in the 
    # denominator probability
      wts<-vector("double", length(wts))
      wts<-.C("getWts", as.integer(nrow(hdat)), as.integer(ID),
              wts = as.double(wts), as.double(as.vector(num.prob)),
              PACKAGE="hapassoc")
      wts <- wts$wts

     
     if (min(freq)<1.0e-8 | min(wts)<1.0e-8)  
     {  # if some haplotype frequency is estimated to be 0, 
     	 # throw a non-convergence warning
       break()
     }
    
    # M step: Find new estimates using GLM and weighted haplotype counts

    # Find the new betas using old betas as starting value
    
      # As in the initial fit, suppress only the spurious warning about
      # non-integer successes caused by the fractional weights
      if (family$family=="binomial"){
        regr <- tryCatch(glm(form, family=family, data=hdat, weights=wts,
                             control=glm.control(epsilon=1e-08), start=beta),
                         warning = function(w) {
                           teststring <- "non-integer #successes in a binomial glm!"
                           if (teststring == conditionMessage(w)) {
                             suppressWarnings(glm(form, family=family, data=hdat,
                                                  weights=wts,
                                                  control=glm.control(epsilon=1e-08),
                                                  start=beta))
                           } else {
                             glm(form, family=family, data=hdat, weights=wts,
                                 control=glm.control(epsilon=1e-08), start=beta)
                           }
                         })
      } else {
        regr <- glm(form, family=family, data=hdat, weights=wts,
                    control=glm.control(epsilon=1e-08), start=beta)
      }

      betaNew<-regr$coef
      fits<-regr$fitted.values
      betadiff<-max( max(abs(beta-betaNew), na.rm=TRUE),
                     max(abs(beta-betaNew)/(0.1+abs(betaNew)), na.rm=TRUE) )
      beta<-betaNew

    # Calculate the expected haplotype counts "Numer.Freq"
    # in formula (13) of Spinka et al. (2005)
      Numer.Freq <- tapply(c(wts,wts),allHaps,sum)

    # Calculate the effective sample-sizes "Denom.Freq"
    # in formula (13) of Spinka et al. (2005)
      DenomFreq.Nu <- NULL
      for (j in 1:nhaps)
      {
        rr <- r.Omega(DesMat.Arr[,,j], beta, ncases=n1, ncontrols=n0, disease.prob) 
        rr <- 2*rep.int(freq,N)*rr 

        factors <- unique(DMA.ID)
        sums <- vector("double",length(factors))
        rr <- .C("tapply_sum", as.integer(length(rr)), as.integer(factors), 
           as.double(rr), as.integer(DMA.ID), sums = as.double(sums),
           PACKAGE="hapassoc")
        rr <- rr$sums
        names(rr) <- factors

        DenomFreq.Nu <- cbind(DenomFreq.Nu, rr)
      }
      DenomFreq.Nu <- data.frame(DenomFreq.Nu)
      dimnames(DenomFreq.Nu)[[2]] <- haplos.names
      DenomFreq.De <- rep.int(get.diplofreq(freq),N)*r.Omega(Denom.Mat,beta, 
                      ncases=n1,ncontrols=n0,disease.prob)


      factors <- unique(DenomMat.ID)
      sums <- vector("double",length(factors))
      DenomFreq.De <- .C("tapply_sum", as.integer(length(DenomFreq.De)), 
                       as.integer(factors), as.double(DenomFreq.De), 
                       as.integer(DenomMat.ID), sums = as.double(sums),
                       PACKAGE="hapassoc")
      DenomFreq.De <- DenomFreq.De$sums      

      Denom.Freq <- apply(DenomFreq.Nu/DenomFreq.De, 2, sum)  
    
    # Update the haplotype frequencies  
      freq <- Numer.Freq/Denom.Freq
      freq <- freq/sum(freq)
    
      if (verbose) 
	      cat("iteration",it,": value of convergence criterion =",betadiff,"\n")

      it<-it+1
   } # end of while
 } # end of if (design=="cc")

 else   # If design is not "cc", use original hapassoc code.
 { 
   while ( (it<maxit) && (betadiff>tol) ){
   
        # Vector of P(Y|X)*P(X) probability 
	# If the person is unaffected, P(Y=0) is 1-fitted value
	# otherwise it is the fitted value 
   
        # Multiplicative const for haplo probs: 1 for homo, 2 for het
        haplo.probs<-rep.int(1,nrow(haplos))+isMultiHetero(haplos.list)
        haplo.probs <- haplo.probs*freq[haploMat[,1]]*freq[haploMat[,2]]

	phi<-mlPhi(regr) #Compute ML estimate of dispersion param
        if(is.null(phi)) { #no converergence in ml estimate of phi
           break() #hapassoc will throw a warning of non-convergence
        }
        num.prob<-pYgivenX(response,fits,phi,family)*haplo.probs

	# E step: Calculate the weights for everyone
	# Use the ID to determine the number of pseudo-individuals in the 
	# denominator probability

        wts<-vector("double", length(wts))
        wts<-.C("getWts", as.integer(nrow(hdat)), as.integer(ID),
                 wts = as.double(wts), as.double(as.vector(num.prob)),
                 PACKAGE="hapassoc")
        wts <- wts$wts

	# M step: Find new estimates using GLM and weighted haplotype counts

	# Find the new betas using old betas as starting value
        
	if (family$family=="binomial"){
		regr=tryCatch( glm(form, family=family, data=hdat, weights=wts, 
						control=glm.control(epsilon=1e-08),start=beta),
						
						warning = function(w) {
							
							teststring="non-integer #successes in a binomial glm!"
							if (teststring==conditionMessage(w)){
								suppressWarnings(glm(form, family=family, data=hdat, weights=wts, 
												control=glm.control(epsilon=1e-08),start=beta))
							} else {
								glm(form, family=family, data=hdat, weights=wts, 
										control=glm.control(epsilon=1e-08),start=beta)
							}
						}
				
				)
	} else { 
		regr <- glm(form, family=family, data=hdat, weights=wts,
				control=glm.control(epsilon=1e-08),start=beta)
	}	
			
    betaNew<-regr$coef
   	fits<-regr$fitted.values
	betadiff<-max( max(abs(beta-betaNew), na.rm=TRUE),
	max(abs(beta-betaNew)/(0.1+abs(betaNew)), na.rm=TRUE) )   	
	beta<-betaNew

	# Find the new freqs, weighted sum of haplotypes
    freq <- tapply(c(wts,wts),allHaps,sum)/(2*N)

	if(verbose) 
	  cat("iteration",it,": value of convergence criterion =",betadiff,"\n")

    it<-it+1
   } # end of while
 } # end of else: design != "cc" 
 
 	 
 if(betadiff>tol) { #did not converge
    warning(paste("No convergence in hapassoc in",it,"iterations\n")) 
    ans<-list(converged=FALSE)
    class(ans)<-"hapassoc"
    return(ans)
 }

 #freq is currently an array which causes problems in the calculations below
 freq <- as.matrix(freq) 

 # Added more elements to EMresults list for later call to
 # log.likelihood and renamed dispersion (KB 18/02/2006) and
 # freq (KB 28/07/2008) 
 EMresults <- list(beta=beta, freq=freq, fits=fits, wts=wts, ID=ID,
                   glm.final.fit=regr,dispersion=phi, family=family,
                   response=response)


 # Compute the log-likelihood so that results can be returned
 # from function (KB 18/02/2006)
 if(design!="cc") {
   loglik <- loglikelihood(haplos.list,EMresults)
 } else {
   loglik<-NA
 }

 names(haplos.list)[names(haplos.list) == "freq"] <- "gamma"
 names(EMresults)[names(EMresults) == "freq"] <- "gamma"
 var.est <- EMvar(haplos.list, EMresults, family)

 # Added the model equation and log-likelihood as elements
 # returned from function (KB 18/02/2006)
 ans<-list(it=it, beta=beta, freq=freq, fits=fits, wts=wts, ID=ID,
           var=var.est, dispersion=phi, family=family, response=response,
           converged=TRUE, model=form,loglik=loglik, call=call)

 class(ans)<-"hapassoc"
 return(ans)

}
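
#############################################################################
## Illustrative usage sketch (assumes the package's hypoDat example data
## and its pre.hapassoc() helper; column names follow that data set):
##   example.pre <- pre.hapassoc(hypoDat, numSNPs=3)
##   fit <- hapassoc(affected ~ ., example.pre, family=binomial())
##   summary(fit)  # coefficients with EM-based variances, haplotype freqs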

## Other functions called in hapassoc:

#############################################################################
## Function to calculate r.Omega
r.Omega <- function(des.mat, beta, ncases, ncontrols, disease.prob=NULL)
{
 # Assuming a rare disease 	
   rr <- 1 + exp(des.mat%*%beta)  
 # If Pr(D=1) is known, beta_0 can be calculated
   if (!is.null(disease.prob))    
   {
 	  beta0 <- beta[1] - log(ncases/ncontrols) + log(disease.prob/(1-disease.prob))
 	  rr <- rr / (1 + exp(des.mat%*%c(beta0,beta[-1])))
   }	
   return(as.vector(rr))
}
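
## Quick numeric sketch (illustrative): with an intercept-only model,
##   r.Omega(cbind(1), beta=0.5, ncases=50, ncontrols=50)
## returns 1 + exp(0.5) ~ 2.6487 under the rare-disease approximation.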
 
#############################################################################
# Function to calculate the diplotype frequencies, given haplotype frequencies
get.diplofreq <- function(freq)
{
  nhaplos <- length(freq)
  dipprob <- 2*kronecker(freq,t(freq))
  diag(dipprob) <- diag(dipprob)/2
  dipprob.vec<-NULL
  for(i in 1:nhaplos) 
    dipprob.vec<-c(dipprob.vec,dipprob[i:nhaplos,i])
  return (dipprob.vec)
}
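
## Worked sketch (illustrative): for freq = (f1, f2, f3) the returned
## vector runs down the columns of the lower triangle of the diplotype
## matrix:  f1^2, 2*f1*f2, 2*f1*f3, f2^2, 2*f2*f3, f3^2.  For example,
##   get.diplofreq(c(0.5, 0.3, 0.2))
## gives 0.25 0.30 0.20 0.09 0.12 0.04, which sums to 1.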


########################################################################

mlPhi<-function(myfit,maxit=30,tol=1e-5) {
  if(myfit$family$family=="binomial" || myfit$family$family=="poisson") {
    return(1) #dispersion set to one
  }
  if(myfit$family$family=="gaussian") {
    return(summary(myfit)$deviance/sum(myfit$prior.weights))
  }
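  ## (This is the ML estimate deviance/n, with n the sum of the prior
  ##  weights; the dispersion reported by summary.glm is instead a
  ##  moment estimate with a degrees-of-freedom correction.)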
  if(myfit$family$family=="Gamma") { #Need to do Newton-Raphson
    return(mlPhiGamma(myfit,maxit,tol))
  }
  #Else, we don't support the family passed in
  stop(paste("ML estimation of dispersion not supported for family",
                myfit$family$family))
}

########################################################################

mlPhiGamma<-function(myfit,maxit,tol) {
  
  # Need to do Newton-Raphson, use moment estimate of phi to get
  # starting value for N-R 

  dev<-myfit$deviance
  n<-sum(myfit$prior.weights)
  phiMoment<-summary(myfit)$dispersion #moment estimate of phi
  ## Looking for root of score equation as a function of x=1/phi
  ## Function f is the score equation and fp is its derivative
  f<-function(x,n,dev) { return(dev+2*n*(digamma(x)-log(x))) }
  fp<-function(x,n) { return(2*n*(trigamma(x)-1/x)) }
  diff<-1; i<-1  # initialization
  xold<-1/phiMoment #starting value for N-R 
  while(i<=maxit && diff>tol) {
    xnew<-xold - f(xold,n,dev)/fp(xold,n)
    if(is.na(xnew)) {
       warning("No convergence for ML estimate of Gamma scale parameter")
       return(NULL)
    }
    diff<-abs(xnew-xold); xold<-xnew; i<-i+1
  }
  if(i>maxit) 
    warning("No convergence for ML estimate of Gamma scale parameter")
  return(1/xnew)
}
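
#############################################################################
## Cross-check sketch (illustrative; assumes MASS is installed -- it is
## not a dependency of this package):
##   set.seed(1)
##   y <- rgamma(200, shape=5, rate=5)    # true dispersion phi = 1/5
##   fit <- glm(y ~ 1, family=Gamma(link="log"))
##   mlPhiGamma(fit, maxit=30, tol=1e-5)  # ~ MASS::gamma.dispersion(fit)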

########################################################################

pYgivenX<-function(y,mu,phi,family){

  #Calculate P(Y|X) to be used in the weights. We use P(Y|X) in
  #expressions like P(Y_i|X_i^j)P(X_i^j)/sum_k{ P(Y_i|X_i^k)P(X_i^k) },
  #so factors that are the same for all X_i^j can be ignored.
  #The deviance resids can be used to get P(Y|X) up to constants:
  #assuming weights of 1,
  #dev.resid = 2*phi*(l(y,phi) - l(mu,phi)), where l(mu,phi) is the log-lik
  #using mean mu and l(y,phi) is the log-likelihood in the saturated
  #model using y as the mean.
  #So   exp(-dev.resid/(2*phi)) = P(y|x)*exp(-l(y,phi))
  #Call the family's dev.resids function with a weight of 1

  return(exp(-1*family$dev.resids(y,mu,wt=1)/(2*phi)))
} 
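
#############################################################################
## Sanity-check sketch (illustrative): for the Gaussian family the unit
## deviance is (y-mu)^2, so pYgivenX reduces to exp(-(y-mu)^2/(2*phi)),
## the N(mu, phi) density up to the constant factor sqrt(2*pi*phi):
##   pYgivenX(1.2, 1.0, phi=0.5, family=gaussian())  # 0.9607894
##   dnorm(1.2, 1.0, sqrt(0.5))*sqrt(2*pi*0.5)       # 0.9607894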


## EMvar Functions
 
########################################################################

EMvar<-function(haplos.list, results, family)  {
 # Get the results of the last iteration
 beta<-results$beta[!is.na(results$beta)]
 betanames<-names(beta)
 num.beta<-length(beta)

 gammanames<-rownames(results$gamma)
 gamma<-as.vector(results$gamma)
 num.gamma<-length(gamma)

 weights<-results$wt
 ID <- haplos.list$ID

 final.regr <- results$glm.final.fit

 # Set up the Y vectors and P vector of fitted values
 # one for the augmented data set, one for just the missing
 
 missing<-(weights<1)

 yi.full<-results$response
 fits.full<-results$fits
 
 yi.mis<-yi.full[missing]
 fits.mis<-fits.full[missing]

 # Set up the matrices needed to find the variance:

   ## Design matrices

   Xreg <- model.matrix(final.regr)
   colnames(Xreg)<-betanames
   Xreg.mis<-Xreg[missing,]

   ## Haplotype matrix
   #Instead of assuming the glm model is additive in the haplotypes, 
   #make the Xgen matrix from scratch here. Used to be
   #   Xgen<-as.matrix(haplos.list$haploDM)
   haploMat<-haplos.list$haploMat
   #each application of outer in next line returns a matrix of T/F with
   # (i,j)th element true if haploMat[i,k]==gammanames[j]; k=1,2
   #Adding these T/F matrices coerces to 1/0 s first before adding
   Xgen<-outer(haploMat[,1],gammanames,"==")+outer(haploMat[,2],gammanames,"==")

   Xgen.mis <- as.matrix(Xgen[missing,])
 
   ## Weight matrices

   W.full<-diag(weights)
   W.mis<-diag(weights[missing])

   ## likelihood derivative matrices

   H.mis<-diag(yi.mis-fits.mis)/results$dispersion

   ## Haplotype frequency and derivative matrices

   ones.mat<-matrix(1,nrow=(num.gamma-1), ncol=(num.gamma-1))
   num.haplo<-vector(length=num.gamma)
   for (i in 1:num.gamma){
	num.haplo[i]<-sum(weights*Xgen[,i])}
   der.gamma<-1/gamma
   der2.gamma<-1/gamma^2

   G1<-diag(der.gamma)
   G<-G1[,1:(num.gamma-1),drop=FALSE]
   G[num.gamma,]<-G1[num.gamma, num.gamma]

   ## Block diagonal matrix

   ID.mis <- ID[missing]
   i <- 1
   B<-matrix(0,nrow=sum(missing), ncol=sum(missing) )

   while (i<length(ID.mis)){
	pseudo.index<-ID.mis==ID.mis[i]
	ones.vec<-rep.int(1, sum(pseudo.index)*sum(pseudo.index))
	block<-matrix(ones.vec, nrow=sum(pseudo.index), ncol=sum(pseudo.index))
	block.size<-sum(pseudo.index)
	B[i:(i+block.size-1), i:(i+block.size-1)]<-block
	i<-i+sum(pseudo.index)
   }

 # Calculate the complete data expected information (Ic)

   ## Top block

   Ic.reg<-solve(summary(final.regr)$cov.unscaled)/results$dispersion

   ## Middle block if dispersion was estimated -- Ic.phi requires Sphi

   Sphi<-SPhi(final.regr,results$dispersion) #will be NULL if phi not est'd
   if(!is.null(Sphi)) { 
     Ic.phi<-IPhi(final.regr,Sphi,results$dispersion)
   } else { 
     Ic.phi<-NULL 
   }

   ## Bottom block

   if(num.gamma>2) {
     Ic.cov<- diag(num.haplo[1:(num.gamma-1)]*der2.gamma[1:(num.gamma-1)])+der2.gamma[num.gamma]*num.haplo[num.gamma]*ones.mat
   } else { # first term in above sum is a scalar s, 
            # and diag(s) returns an s-by-s matrix -- not what we want
     Ic.cov<- num.haplo[1:(num.gamma-1)]*der2.gamma[1:(num.gamma-1)]+der2.gamma[num.gamma]*num.haplo[num.gamma]*ones.mat
   }

   ## Full block diagonal matrix (combine top, middle and bottom blocks)

   if(!is.null(Ic.phi)) {
     Ic<-matrix(0,nrow=(nrow(Ic.reg)+1+nrow(Ic.cov)),
                ncol=(nrow(Ic.reg)+1+nrow(Ic.cov))) #+1 is for phi
     Ic[1:num.beta,1:num.beta]<-Ic.reg
     Ic[(num.beta+1),(num.beta+1)]<-Ic.phi
     Ic[(num.beta+2):(num.beta+num.gamma),
        (num.beta+2):(num.beta+num.gamma)]<-Ic.cov
   } else {
     Ic<-matrix(0,nrow=(nrow(Ic.reg)+nrow(Ic.cov)),
                ncol=(nrow(Ic.reg)+nrow(Ic.cov)))
     Ic[1:num.beta,1:num.beta]<-Ic.reg
     Ic[(num.beta+1):(num.beta+num.gamma-1),
        (num.beta+1):(num.beta+num.gamma-1)]<-Ic.cov
   }
     

 # Calculate the missing data information (Imis), only over missing

   ## S_beta
 
   Sbeta<-H.mis%*%Xreg.mis

   ## S_phi

   if(!is.null(Sphi)) {
     Sphi<-Sphi[missing]
   }
 
   ## S_gamma
 
   Sgamma<-Xgen.mis%*%G

   ## Score matrix

   if(!is.null(Sphi)) { # a fix to bug submitted by Kelly on Feb 8, 2006 -s.b.
     S<-cbind(Sbeta,Sphi,Sgamma)
   } else {
     S<-cbind(Sbeta,Sgamma)
   }

   ## Calculate Imis from the other matrices

   Imis<-t(S)%*%(W.mis-W.mis%*%B%*%W.mis)%*%S  

 # Calculate the information and var/cov matrix

 Info<-Ic-Imis
 varEM<-solve(Info)

 gammanames <- paste("f", gammanames, sep="")
 if(!is.null(Sphi)) {
   colnames(varEM)<-c(betanames, "phi", gammanames[1:(num.gamma-1)])
 }else {
   colnames(varEM)<-c(betanames, gammanames[1:(num.gamma-1)])
 }
 rownames(varEM)<-colnames(varEM)


 return(varEM)

}


## Other functions called in EMvar:

########################################################################

SPhi<-function(myfit,phi) {
  if(myfit$family$family=="binomial" || myfit$family$family=="poisson") {
    return(NULL) #dispersion set to one so score not defined
  }
  if(myfit$family$family=="gaussian") {
    return(SPhiGaussian(myfit,phi))
  }
  if(myfit$family$family=="Gamma") { 
    return(SPhiGamma(myfit,phi))
  }
}

########################################################################

SPhiGaussian<-function(myfit,phi) {
  return( (myfit$y-myfit$fitted)^2/(2*phi^2) - 1/(2*phi) )
}
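
## Derivation check: for Y ~ N(mu, phi) the log-likelihood contribution is
## -log(2*pi*phi)/2 - (y-mu)^2/(2*phi), whose derivative in phi is
## (y-mu)^2/(2*phi^2) - 1/(2*phi), as returned above.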

########################################################################

SPhiGamma<-function(myfit,phi) {
  x<-1/phi
  y<-myfit$y
  mu<-myfit$fitted.values
  return( (digamma(x)-log(x)+(y-mu)/mu-log(y/mu))*x^2 )
}
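
## Derivation check: with shape x = 1/phi and mean mu, the Gamma
## log-likelihood contribution is
##   x*log(x) - lgamma(x) + (x-1)*log(y) - x*y/mu - x*log(mu),
## so d/dx = log(x) + 1 - digamma(x) + log(y/mu) - y/mu, and the chain
## rule with dx/dphi = -x^2 gives
##   d/dphi = (digamma(x) - log(x) + (y-mu)/mu - log(y/mu)) * x^2.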

########################################################################

IPhi<-function(myfit,score,phi) {
  if(myfit$family$family=="binomial" || myfit$family$family=="poisson") {
    return(NULL) #dispersion set to one so phi not estimated
  }
  if(myfit$family$family=="gaussian") {
    return(IPhiGaussian(myfit,score,phi))
  }
  if(myfit$family$family=="Gamma") {
    return(IPhiGamma(myfit,score,phi))
  }
}

########################################################################

IPhiGaussian<-function(myfit,score,phi) {
  wts<-myfit$prior.weights
  n<-sum(wts)
  return( 2*(t(score)%*%wts)/phi + n/(2*phi^2) )
}
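
## Derivation check: differentiating SPhiGaussian gives, per observation,
## -dS/dphi = (y-mu)^2/phi^3 - 1/(2*phi^2) = 2*S/phi + 1/(2*phi^2),
## which summed with the prior weights is 2*(t(score)%*%wts)/phi + n/(2*phi^2).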

########################################################################

IPhiGamma<-function(myfit,score,phi) {
  x<-1/phi
  wts<-myfit$prior.weights
  n<-sum(wts)
  return( 2*x*(t(score)%*%wts) + n*x^4*(trigamma(x) - 1/x) )
}
