R/PreHap.R
In hapassoc: Inference of Trait Associations with SNP Haplotypes and Other Attributes using the EM Algorithm

Documented in pre.hapassoc

# Filename: PreHap.R
# Version : $Id: PreHap.R,v 1.14 2006/04/06 01:17:00 mcneney Exp $

# HapAssoc- Inference of trait-haplotype associations in the presence of uncertain phase
# Copyright (C) 2003  K.Burkett, B.McNeney, J.Graham

# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.

# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.

# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA

########################################################################

pre.hapassoc <- function(dat,numSNPs,maxMissingGenos=1, pooling.tol=0.05, 
                 zero.tol=1/(2*nrow(dat)*10), allelic=TRUE, verbose=TRUE){

  haplos.list<-RecodeHaplos(dat,numSNPs,allelic,maxMissingGenos,verbose)
  haplotest<-FALSE; ID.check<-rep(FALSE,length(haplos.list$ID))

  # Starting matrices, some rows/columns will be deleted if there
  # are missing haplotypes

  newhaploDM <- haplos.list$haploDM
  newnonHaploDM <- haplos.list$nonHaploDM
  nonHaploDMnames<-names(haplos.list$nonHaploDM)
  newhaploMat <- haplos.list$haploMat
  newID <- haplos.list$ID

  #Run the usual EM algorithm using just the haplo information to
  #get estimates of haplotype frequencies and initial weights
  emres<-EMnull(haplos.list)
  newwt<-emres$wts

  initFreq<-emres$gamma[!emres$gamma<zero.tol] #haplos with non-zero frequency
  haplos.names<-names(initFreq)
  zeroFreqHaplos<-names(emres$gamma[emres$gamma<zero.tol])

  if(sum(emres$gamma<zero.tol)>0) { #then non-existent haplos need to be removed
    haplotest<-TRUE
    newhaploDM<-newhaploDM[,haplos.names]

    #We only want rows that sum to two - others must have involved
    #haplotypes with estimated frequency of zero
    finalMatInd<- (rowSums(newhaploDM) == 2)

    newhaploDM<-newhaploDM[finalMatInd,]
    newhaploMat<-newhaploMat[finalMatInd,]
    newnonHaploDM<-newnonHaploDM[finalMatInd,,drop=FALSE]
    names(newnonHaploDM)<-nonHaploDMnames
    newwt<-newwt[finalMatInd]
    newID<-newID[finalMatInd]
      
    # Re-calculate weights -- 2 loops!! there must be a better way
    uniqueIDs<-unique(newID)
    IDsum<-rep(0,length(uniqueIDs))
    for(i in 1:length(uniqueIDs)) {
      IDsum[i]<-sum(newwt[newID==uniqueIDs[i]])
    }
    for(i in 1:length(newwt)) {
      newwt[i] <- newwt[i]/IDsum[uniqueIDs==newID[i]]
    }
  }

  pooling.ind<-initFreq<pooling.tol #flag rare haplos 
  pooled.haplos<-"no pooled haplos"
  if(sum(pooling.ind)>1) { #then pooling to be done *in design matrix only*
    pooled.haplos<-haplos.names[pooling.ind]
    pooledDMcol<-rowSums(newhaploDM[,pooled.haplos])
newhaploDM<-data.frame(newhaploDM[,haplos.names[!pooling.ind]],pooled=pooledDMcol)

  }

# The following code converts the columns of newnonHaploDM 
# to the right types to fix the effect of the previous 
# matrix conversions:                         -Sigal
j<-NULL
for(i in 1:length(dat)) j<-c(j, class(dat[[i]]))
for(i in 1:length(newnonHaploDM)) {
 if (is.numeric(dat[[i]])) 
 newnonHaploDM[[i]] <- as.numeric(as.character(newnonHaploDM[[i]]))
 else if(is.character(dat[[i]]))
 newnonHaploDM[[i]] <- as.character(newnonHaploDM[[i]])
}


  return(list(nonHaploDM=newnonHaploDM, haploDM=newhaploDM,
              haploMat=newhaploMat, wt=newwt, ID=newID, 
              haplotest=haplotest, initFreq=initFreq,
              zeroFreqHaplos=zeroFreqHaplos,pooledHaplos=pooled.haplos))
}


## Other functions called in CheckHaplos

########################################################################


isMissing <- function(test.vec){
  
  flag <- vector(length=max(test.vec))
  names(flag) <- c(1:length(flag))

  for (i in 1:length(flag)){if (sum(test.vec==i)==0){flag[i] <-TRUE} }
  return(flag)
}
  
isIn <- function(vec,vecElements) {
  #Return a vector of same length as vec. Element i of returned vector is
  #FALSE if vec[i] is not in the vector vecElements and TRUE if it is.
  flag<-rep(FALSE,length(vec))
  for(i in 1:length(vecElements)) {
    flag<-(flag | vec==vecElements[i])
  }
  return(flag)
}
  

EMnull<-function(haplos.list, gamma=FALSE, maxit=100, tol=1/(2*sum(haplos.list$wt)*100)){

 haploMat <- haplos.list$haploMat

 ID <- haplos.list$ID
 N<-sum(haplos.list$wt)
 wts<-haplos.list$wt
 
 # Initial gamma values, if no gamma specified, calculate gamma values based
 # on augmented dataset.

 # We should avoid using the design matrix from an additive risk model to
 # help with haplotype frequency calculations. The following used to use
 # gamma <- (t(haplos)%*%wts)/(2*N)   where haplos is the additive model
 # design matrix as a computational trick to get haplotype frequencies.
 # Now use the tapply function and the haplotypes in haploMat to sum wts .

 if (gamma==FALSE){
    allHaps<-c(haploMat[,1],haploMat[,2])
    allWts<-c(wts,wts)
    gamma<-tapply(allWts,allHaps,sum)/(2*N)
 }

 gammadiff<-1
 it<-1
 num.prob<-vector(length=nrow(haploMat))
 
 # The EM loop

 while ( (it<maxit) && (gammadiff>tol) ){
   
        # multiplicative constant = 2 for heterozyg 1 for homozyg
        haplo.probs<-rep(1,nrow(haploMat))+isMultiHetero(haplos.list)
        haplo.probs <- haplo.probs*gamma[haploMat[,1]]*gamma[haploMat[,2]]
        num.prob<-haplo.probs

	# E step: Calculate the weights for everyone
	# Use the ID to determine the number of pseudo-individuals in the
	# denominator probability

        wts<-vector("double", length(wts))
        wts<-.C("getWts", as.integer(nrow(haploMat)), as.integer(ID),
                 wts = as.double(wts), as.double(as.vector(num.prob)),
                 PACKAGE="hapassoc")
        wts <- wts$wts

	# M step: Find new estimates using weighted haplotype counts
        allWts<-c(wts,wts)
	gammaNew <- tapply(allWts,allHaps,sum)/(2*N)
   	gammadiff<-max(abs(gamma-gammaNew), na.rm=TRUE)#maximum diff
   	gamma<-gammaNew

        it<-it+1
 }
 if(gammadiff>tol) 
   warning(paste("no convergence in EMnull after ",maxit,"iterations\n"))
 results <- list(gamma=gamma, wts=wts)
 return(results)
}

############################################################
isMultiHetero<-function(haplos.list) {
  return(as.numeric(haplos.list$haploMat[,1] != haplos.list$haploMat[,2]))
}

############################################################
EMnullPHASE<-function()
{
	f=file("temp.out_freqs")
	rawdat=scan(file=f,list(x=""))

	#A little bit of subtlety.  One would expect that the length required for these
	#vectors would be 2^numSNPs.  If we did not screen out some rows in
	#handleMissingsPHASE, this would be true, however, because we do typically
	#screen out some individuals, this changes.  Why?  Because some haplotypes
	#with really low frequency (ie essentially 0) would only have appeared in
	#the data from pseudo-individuals reconstructed from rows with missing data.  As
	#we typically eliminate these rows with a lot of missing data, these pseudo-
	#individuals are not recreated, so PHASE may have less than 2^numSNPs haplotypes.
	#(divide by 4 since 4 columns and -1 since first row is header)
	gammanames=rep("h",length(rawdat$x)/4-1)
	gammavals=rep(0,length(rawdat$x)/4-1)

	i=5 #the first 4 entries are just the column headers in the PHASE *.out_freqs file
	while(!is.na(rawdat$x[i]))
	{
		idx=as.numeric(rawdat$x[i])
		i=i+1
		gammanames[idx]=paste(gammanames[idx],as.character(rawdat$x[i]),sep="")
		i=i+1
		gammavals[idx]=as.numeric(rawdat$x[i])
		i=i+1
		i=i+1 #skip over the 4th entry in the file, which is the std.err.
	}
	names(gammavals)=gammanames
	return(list(gamma=gammavals))
}

Any scripts or data that you put into this service are public.

hapassoc documentation built on May 28, 2022, 1:13 a.m.

rdrr.io home R language documentation Run R code online

CRAN packages Bioconductor packages R-Forge packages GitHub packages

Note that we can't provide technical support on individual packages. You should contact the package authors for that.

hapassoc
Inference of Trait Associations with SNP Haplotypes and Other Attributes using the EM Algorithm

R/PreHap.R
In hapassoc: Inference of Trait Associations with SNP Haplotypes and Other Attributes using the EM Algorithm

Defines functions EMnullPHASE isMultiHetero EMnull isIn isMissing pre.hapassoc

Documented in pre.hapassoc

Try the hapassoc package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

hapassoc Inference of Trait Associations with SNP Haplotypes and Other Attributes using the EM Algorithm

R/PreHap.R In hapassoc: Inference of Trait Associations with SNP Haplotypes and Other Attributes using the EM Algorithm

Defines functions EMnullPHASE isMultiHetero EMnull isIn isMissing pre.hapassoc

Documented in pre.hapassoc

Try the hapassoc package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

hapassoc
Inference of Trait Associations with SNP Haplotypes and Other Attributes using the EM Algorithm

R/PreHap.R
In hapassoc: Inference of Trait Associations with SNP Haplotypes and Other Attributes using the EM Algorithm