R/hap.score.R

Defines functions print.hap.score plot.hap.score hap.score.podds hap.score.glm hap.score

Documented in hap.score plot.hap.score print.hap.score

#' Score statistics for association of traits with haplotypes
#'
#' @param y Vector of trait values. For  trait.type  =  "binomial",  y  must have values of 1 for event, 0 for no event.
#' @param geno Matrix of alleles, such that each locus has a pair of adjacent columns of alleles, and the order of columns corresponds to the order of loci on a chromosome. If there are K loci, then ncol(geno) = 2*K. Rows represent alleles for each subject.
#' @param trait.type Character string  defining  type  of  trait, with values of "gaussian", "binomial", "poisson", "ordinal".
#' @param offset Vector of offset when trait.type = "poisson".
#' @param x.adj Matrix of non-genetic covariates used to adjust the score statistics. Note that intercept should not be included, as it will be added in this function.
#' @param skip.haplo Skip score statistics for haplotypes with frequencies < skip.haplo.
#' @param locus.label Vector of labels for loci, of length K (see definition of geno matrix).
#' @param miss.val Vector of codes for missing values of alleles.
#' @param n.sim Number of simulations for empirical p-values.  If n.sim=0, no empirical p-values are computed.
#' @param method method of haplotype frequency estimation, "gc" or "hap".
#' @param id an added option which contains the individual IDs.
#' @param handle.miss flag to handle missing genotype data, 0=no, 1=yes.
#' @param mloci maximum number of loci/sites with missing data to be allowed in the analysis.
#' @param sexid flag to indicator sex for data from X chromosome, i=male, 2=female.
#'
#' @details
#' Compute score statistics to evaluate the association of a trait with haplotypes, when linkage phase is unknown and diploid marker 
#' phenotypes are observed among unrelated subjects. For now, only autosomal loci are considered. This package haplo.score
#' which this function is based is greatly acknowledged.
#'
#' @export
#' @return
#' List with the following components:
#' - score.global Global statistic to test association of trait with haplotypes that have frequencies >= skip.haplo.
#' - df Degrees of freedom for score.global.
#' - score.global.p P-value of score.global based on chi-square distribution, with degrees of freedom equal to df.
#' - score.global.p.sim P-value of score.global based on simulations (set equal to NA when n.sim=0).
#' - score.haplo Vector of score statistics for individual haplotypes that have frequencies >= skip.haplo.
#' - score.haplo.p Vector of p-values for score.haplo, based on a chi-square distribution with 1 df.
#' - score.haplo.p.sim Vector of p-values for score.haplo, based on  simulations (set equal to NA when n.sim=0).
#' - score.max.p.sim P-value  of  maximum  score.haplo, based on simulations (set equal to NA when n.sim=0).
#' - haplotype Matrix of hapoltypes  analyzed.  The ith row of haplotype corresponds to the ith item of score.haplo, score.haplo.p, and score.haplo.p.sim.
#' - hap.prob Vector of haplotype probabilies, corresponding to the haplotypes in the matrix haplotype.
#' - locus.label Vector of labels for loci, of length K (same as input argument).
#' - n.sim Number of simulations.
#' - n.val.global Number of valid simulated global statistics.
#' - n.val.haplo Number of valid simulated score statistics (score.haplo) for individual haplotypes.
#'
#' @details This is a version which substitutes haplo.em.
#'
#' @references
#' \insertRef{schaid02}{gap}
#'
#' @examples
#' \dontrun{
#' data(hla)
#' y<-hla[,2]
#' geno<-hla[,3:8]
#' # complete data
#' hap.score(y,geno,locus.label=c("DRB","DQA","DQB"))
#' # incomplete genotype data
#' hap.score(y,geno,locus.label=c("DRB","DQA","DQB"),handle.miss=1,mloci=1)
#' unlink("assign.dat")
#'
#' ### note the differences in p values in the following runs
#' data(aldh2)
#' # to subset the data since hap doesn't handle one allele missing
#' deleted<-c(40,239,256)
#' aldh2[deleted,]
#' aldh2<-aldh2[-deleted,]
#' y<-aldh2[,2]
#' geno<-aldh2[,3:18]
#' # only one missing locus
#' hap.score(y,geno,handle.miss=1,mloci=1,method="hap")
#' # up to seven missing loci and with 10,000 permutations
#' hap.score(y,geno,handle.miss=1,mloci=7,method="hap",n.sim=10000)
#'
#' # hap.score takes considerably longer time and does not handle missing data
#' hap.score(y,geno,n.sim=10000)
#' }
#'
#' @keywords models regression

hap.score <- function(y, geno, trait.type="gaussian",
                      offset = NA, x.adj = NA, skip.haplo=.005,
                      locus.label=NA, miss.val=0, n.sim=0, method="gc", id=NA,
                      handle.miss=0, mloci=NA, sexid=NA)
{
  trait.int <- charmatch(trait.type, c("gaussian", "binomial", "poisson", "ordinal"))
  if(is.na(trait.int)) stop("Invalid trait type")
  if(trait.int == 0)   stop("Ambiguous trait type")
  if(length(y)!=nrow(geno)) stop("Dims of y and geno are not compatible")
  n.loci <- ncol(geno)/2
  if(n.loci != (floor(ncol(geno)/2))) stop("Odd number of cols of geno")
  if(handle.miss==0)
  {
    miss <- apply(is.na(geno),1,any)
    if(!all(is.na(miss.val))) {
       for(mval in miss.val){
          miss <- miss | apply(geno==mval, 1, any)
       }
    }
  }
  else
  {
    if(is.na(mloci)) stop("Maximum number of missing loci (mloci) not specified")
    nmiss <- apply(is.na(geno),1,sum)
    if(!all(is.na(miss.val))) {
       for(mval in miss.val) {
          nmiss <- nmiss + apply(geno==mval, 1, sum)
       }
    }
    if(mloci<0 | mloci >= n.loci) stop("Invalid control for number of missing loci")
    miss <- rep(F, length(y))
    for(i in 1:length(y)) if(nmiss[i] > mloci*2) miss[i] <- T
  }
  adjusted <- T
  if( all(is.na(x.adj))) adjusted <- F
  if(adjusted){
    x.adj <- as.matrix(x.adj)
    if(nrow(x.adj)!=length(y)) stop("Dims of y and x.adj are not compatible")
  }
  miss <- miss | is.na(y) 
  if(adjusted) miss <- miss| apply(is.na(x.adj),1,any)
  if(trait.int==3) {
     if(all(is.na(offset))) stop("Missing offset")
     miss <- miss | is.na(offset)
     offset <- offset[!miss]
  }
  y <- as.numeric(y[!miss])
  geno <- geno[!miss,]
  if(adjusted) x.adj <- x.adj[!miss,,drop=F]
  if(trait.int==2) {
    if(!all(y==1|y==0)) stop("Invalid y values")
    if(all(y==1) | all(y==0)) stop("No variation in y values")
  }
  if(trait.int==4){
     y <- factor(y)
     y.lev <- levels(y)
     y <- as.numeric(y)
     if(max(y) < 3) stop("Less than 3 levels for y values")
  }
  n.subj <- length(y)
  if(all(is.na(id))) id <- 1:n.subj
  method.id<-charmatch(method, c("gc", "hap", "phase"))
  if(is.na(method.id)) stop("Invalid selection of method")
  if(method.id == 0)   stop("Ambiguous method")
  else if(method.id==1) haplo <- gc.em(data=geno, locus.label, converge.eps=0.00001, maxiter=5000, handle.miss=handle.miss, miss.val=miss.val)
  else if(method.id==2) haplo <- hap.em(id, data=geno, locus.label, converge.eps=0.00001, maxiter=5000, miss.val=miss.val)
  if(method.id<3 & !haplo$converge) stop("EM for haplo failed to converge")
  hap1 <- haplo$hap1code
  hap2 <- haplo$hap2code
  indx <- haplo$indx.subj
  post <- haplo$post
  nreps <- as.vector(haplo$nreps)
  uhap<-haplo$uhap
  which.haplo<-haplo$hap.prob>=skip.haplo
  uhap<-uhap[which.haplo]
  x <- outer(hap1,uhap,"==") + outer(hap2,uhap,"==")
  n.x <- ncol(x)
  x.post<-matrix(rep(NA, n.subj * n.x), ncol=n.x)
  for(j in 1:n.x){
     x.post[,j] <- tapply(x[,j]*post, indx, sum)
  }
  if(trait.int <= 3){ 
    if(!adjusted){
       mu <- switch(trait.int, mean(y), mean(y), sum(y)/sum(offset) )
       a  <- switch(trait.int, var(y), 1, 1)
       x.adj <- matrix(rep(1,n.subj),ncol=1)
     }
     if(adjusted){
        reg.out <- glm(y ~ x.adj, family=trait.type)
        x.adj <- cbind(rep(1,n.subj),x.adj)
        mu <- reg.out$fitted.values
        a  <- switch(trait.int,sum(reg.out$residuals^2)/reg.out$df.residual,1, 1)
      }
     v <- switch(trait.int, 1/a, mu*(1-mu), mu )
     tmp <- hap.score.glm(y, mu, a, v, x.adj, nreps, x.post, post, x)
     u.score <- tmp$u.score
     v.score <- tmp$v.score
   }
   if(trait.int ==4) {
      if(adjusted){
         for(p in c("rms")) {
            if (length(grep(paste("^package:", p, "$", sep=""), search())) == 0) {
               if (!requireNamespace(p, quietly = TRUE))
               warning(paste("This function needs package `", p, "' to be fully functional; please install", sep=""))
            }
         }
         reg.out <- rms::lrm(y ~ x.adj)
         K <- max(y)
         n.xadj <- ncol(x.adj)
         alpha <- reg.out$coef[1:(K-1)]
         beta <- reg.out$coeff[K:(K-1 + n.xadj)]

         tmp <- hap.score.podds(y, alpha, beta, x.adj, nreps, x.post, post, x)
       }
      if(!adjusted){
         tbl <- table(y)
         s <- 1- (cumsum(tbl)-tbl)/n.subj
         alpha <-  - log((1-s[-1])/s[-1])
         tmp <- hap.score.podds(y, alpha, beta=NA, x.adj=NA, nreps, x.post, post, x)
       }
      u.score <- tmp$u.score
      v.score <- tmp$v.score
    }
   tmp <- haplo.stats::Ginv(v.score)
   df <- tmp$rank
   g.inv <- tmp$Ginv
   score.global <- u.score%*% g.inv %*%u.score
   score.haplo <- u.score / sqrt(diag(v.score))
   score.max <-  max(score.haplo^2)
   if(n.sim==0){
      score.global.p.sim <- NA
      score.haplo.p.sim <- rep(NA,length(score.haplo))
      score.max.p.sim <- NA
      n.val.global <- NA
      n.val.haplo <- NA
   }
   if(n.sim > 0){
      score.global.rej <- 0
      score.haplo.rej  <- rep(0,length(score.haplo))
      score.max.rej    <- 0
      n.val.global <- 0
      n.val.haplo <- 0
      if(trait.int<=3){
         mu.rand <- mu
         v.rand <- v  
       }
      for(i in 1:n.sim){
         rand.ord <- order(runif(n.subj))
         if(trait.int <=3){ 
           if(adjusted){
              mu.rand <- mu[rand.ord]
              v.rand <- switch(trait.int, v, v[rand.ord], v[rand.ord])
            }
           tmp <- hap.score.glm(y[rand.ord], mu.rand, a, v.rand, 
                                x.adj[rand.ord,], nreps, x.post, post, x)
         }
         if(trait.int ==4){
            if(adjusted){             
               tmp <- hap.score.podds(y[rand.ord], alpha, beta, 
                               x.adj[rand.ord,,drop=F],nreps, x.post, post, x)
            }
            if(!adjusted) {
               tmp <- hap.score.podds(y[rand.ord], alpha, beta=NA, 
                               x.adj=NA,nreps, x.post, post, x)
             }

          }
         u.score <- tmp$u.score
         v.score <- tmp$v.score
         tmp <- haplo.stats::Ginv(v.score)
         g.inv <- tmp$Ginv  
         score.global.sim <- u.score %*% g.inv %*% u.score
         score.haplo.sim  <- (u.score / sqrt(diag(v.score)))^2
         score.max.sim <- max(score.haplo.sim)
         if(!is.na(score.global.sim)) {
            n.val.global <- n.val.global +1
            if(score.global.sim >= score.global) score.global.rej <- score.global.rej +1
          }
         if(!any(is.na(score.haplo.sim))){
            n.val.haplo <- n.val.haplo + 1
            score.haplo.rej <- score.haplo.rej +
                               ifelse(score.haplo.sim >= score.haplo^2, 1, 0)
            if(score.max.sim >= score.max) score.max.rej <- score.max.rej +1
          }
      }
      score.global.p.sim <- score.global.rej /  n.val.global
      score.haplo.p.sim <- score.haplo.rej / n.val.haplo
      score.max.p.sim <- score.max.rej / n.val.haplo
    }
   score.global.p <- 1 - pchisq(score.global,df)
   score.haplo.p <- 1-pchisq(score.haplo^2,1)
   if(all(is.na(locus.label))) {
      locus.label<- paste("loc-",1:n.loci,sep="")
    }
   obj <- (list(score.global=score.global, df=df,score.global.p=score.global.p,
       score.global.p.sim=score.global.p.sim,
       score.haplo=score.haplo,score.haplo.p=score.haplo.p,
       score.haplo.p.sim=score.haplo.p.sim,
       score.max.p.sim=score.max.p.sim,
       haplotype=haplo$haplotype[which.haplo,],
       hap.prob=haplo$hap.prob[which.haplo],
       locus.label=locus.label,
       n.sim=n.sim, n.val.global=n.val.global, n.val.haplo=n.val.haplo))
   class(obj) <- "hap.score"
   return(obj)
}
 
hap.score.glm <- function(y,mu,a,v,x.adj,nreps,x.post, post, x)
{
   u.mtx  <- (y-mu)*x.post / a
   u.score <- apply(u.mtx,2,sum)

   # Var matrix for x.adj covariates
   v.11 <- t(x.adj * v) %*% x.adj

   # Var matrix for covar(x.adj, x.post)
   v.21 <- t(x.post) %*% (x.adj * v)

   # Var matrix for haplo scores
   res <- ( (y - mu)/a ) ^2
   t1 <- rep( (v-res) ,nreps) * post
   v.22 <- t(x*t1) %*% x + t(u.mtx) %*% u.mtx

   # Var matrix for haplo scores, adjusted for x.adj
   v.score <- v.22 - v.21 %*% solve(v.11) %*% t(v.21) 
   return(list(u.score=u.score, v.score=v.score))
}

hap.score.podds <- function(y, alpha, beta=NA, x.adj=NA, nreps, x.post,  post, x)
{
###################################################################
#
# If U=c(u.a, u.e, u.g), where 
#   u.a = score for alpha's
#   u.e = score for unambiguous (x.adj) covariates
#   u.g = score for ambiguous haplotypes
#
# Then the upper triangle of Var(U) can be partitioned as
#
#          | v.aa   v.ae   v.ag |   |           |
#   V(U) = |        v.ee   v.eg | = | v.11 v.12 |
#          |               v.gg |   |      v.gg |
#
# where v.12 is composed of v.aa, v.ae, v.ee
#       v.12 is composed of v.ag, v.eg
#
# and Var(u.g) = v.gg - v.12 * v.12(inv) * t(v.12)
#
# The following computes each of the submatrices as needed
# to determine u.g and Var(u.g)
#
##################################################################
adjusted <- T
if(any(is.na(x.adj))) adjusted <- F
if(adjusted) n.xadj <- ncol(x.adj)
n.x <- ncol(x)
K <- max(y)

# to make suscripting easier, append Inf to front of alpha,
# as place-holder for alpha[1] = Inf
alpha <- c(Inf, alpha)
if(adjusted){
   s   <- ifelse(y==1, 1, 1/(1 + exp(-(alpha[y  ] + x.adj %*% beta ))) )
   s.p <- ifelse(y==K, 0, 1/(1 + exp(-(alpha[y+1] + x.adj %*% beta ))) )
 }
if(!adjusted){
   s   <- ifelse(y==1, 1, 1/(1 + exp(-(alpha[y  ]  ))) )
   s.p <- ifelse(y==K, 0, 1/(1 + exp(-(alpha[y+1]  ))) )
 }
w1 <- (s*(1-s) - s.p*(1-s.p))/(s - s.p)
u.mtx <- w1 * x.post
u.score <- apply(u.mtx,2,sum)

#  compute information matrix for alpha-beta (v.ab) and alpha-alpha (v.aa)
tmp1 <- (s   + s.p^2 - 2*s*s.p)*s.p*(1-s.p)/(s-s.p)^2
tmp2 <- (s.p +   s^2 - 2*s*s.p)*s*(1-s)/(s-s.p)^2
tmp3 <- s.p*(1-s.p)*s*(1-s)/(s-s.p)^2

v.ag <- matrix(rep(0, (K-1)*n.x), ncol=n.x)
if(adjusted) v.ae <- matrix(rep(0, (K-1)*n.xadj), ncol=n.xadj)
v.aa <- matrix(rep(0,(K-1)^2),ncol=(K-1))

n.subj <- length(y)
for(j in 2:K){
   wt <- rep(0,n.subj)
   wt <- ifelse(y==(j-1), (tmp1 - tmp3), wt)
   wt <- ifelse(y==j, (tmp2 - tmp3), wt)
   v.ag[(j-1),] <- apply(wt * x.post, 2,sum)
   if(adjusted) v.ae[(j-1),] <-  apply(wt * x.adj, 2,sum)
   v.aa[(j-1),(j-1)] <- sum(tmp1[y==(j-1)]) + sum(tmp2[y==j])   
   if(j < K) v.aa[(j-1), j] <- -sum(tmp3[y==j])
 }

# fill in lower tri of v.aa to make it symmetric
v.aa <- v.aa + t( (col(v.aa) > row(v.aa))*v.aa )

# Louis' method for v.gg
w2 <- s*(1-s) + s.p*(1-s.p)
t1 <- rep( (w2 - w1^2), nreps) * post
v.gg <- t(x*t1) %*% x + t(u.mtx) %*% u.mtx

if(adjusted){
   v.ee <- t(w2*x.adj) %*% x.adj
   v.eg <- t(w2*x.adj) %*% x.post
   v.11 <- rbind( cbind(v.aa, v.ae), cbind(t(v.ae),v.ee) )
   v.12 <- rbind(v.ag,v.eg)
   v.score <- v.gg - t(v.12) %*% solve(v.11) %*% v.12
 }
if(!adjusted){
   v.score <- v.gg - t(v.ag) %*% solve(v.aa) %*% v.ag
 }

return(list(u.score=u.score, v.score=v.score))
}

#' Plot haplotype frequencies versus haplotype score statistics
#'
#' Method function to plot a class of type hap.score
#'
#' @param x The object returned from hap.score (which has class hap.score).
#' @param ... Optional arguments.
#'
#' @export
#' @return
#' Nothing is returned.
#'
#' This is a plot method function used to plot haplotype frequencies on
#' the x-axis and haplotype-specific scores on the y-axis. Because
#' hap.score is a class, the generic plot function 
#' can be used, which in turn calls this plot.hap.score function.
#'
#' @references
#' Schaid DJ, Rowland CM, Tines DE, Jacobson RM,  Poland  GA (2002)
#' Score tests for association of traits with haplotypes when
#' linkage phase is ambiguous. Amer J Hum Genet 70:425-34
#'
#' @seealso [`hap.score`]
#'
#' @examples
#' \dontrun{
#' save <- hap.score(y, geno, trait.type = "gaussian")
#' 
#' # Example illustrating generic plot function:
#' plot(save)
#'
#' # Example illustrating specific method plot function:
#' plot.hap.score(save)
#' }
#'
#' @keywords hplot

plot.hap.score <- function(x, ...){
   plot(x$hap.prob, x$score.haplo, xlab="Haplotype Frequency", 
        ylab="Haploltype Score Statistic", ...)
   invisible()
}

#' Print a hap.score object
#'
#' Method function to print a class of type hap.score
#'
#' @param x The object returned from hap.score (which has class hap.score).
#' @param ... Optional argunents.
#'
#' @export
#' @return Nothing is returned.
#' 
#' This is a print method function used to print information from
#' hap.score class, with haplotype-specific information given in a
#' table. Because hap.score is a class, the generic print function 
#' can be used, which in turn calls this print.hap.score function.
#'
#' @references
#' Schaid DJ, Rowland CM, Tines DE, Jacobson RM, Poland  GA (2002)
#' Score tests for association of traits with haplotypes when
#' linkage phase is ambiguous. Amer J Hum Genet 70:425-34
#'
#' @seealso [`hap.score`]
#'
#' @examples
#' \dontrun{
#' save <- hap.score(y, geno, trait.type = "gaussian")
#'
#' # Example illustrating generic print function:
#' print(save)
#'
#' # Example illustrating specific method print function:
#' print.hap.score(save)
#' }
#'
#' @keywords print

print.hap.score <- function(x, ...){

# print of global score stats:
   cat("\nGlobal Score Statistics\n\n")
   cat(paste("global-stat = ",round(x$score.global,5),", df = ",x$df,
         ", p-val = ",round(x$score.global.p,5),sep=""))
   if(x$n.sim>0) cat(", sim. p-val = ",x$score.global.p.sim,"\n\n")
   if(x$n.sim>0) cat("max-stat sim. p-val = ",x$score.max.p.sim)
   cat("\n\n")

# create table for haplotype specific stats:
   tbl <- cbind(x$haplotype,round(x$hap.prob,5),round(x$score.haplo,5),
          round(x$score.haplo.p,5))
   if(x$n.sim>0) tbl <- cbind(tbl,x$score.haplo.p.sim)
   ord <- order(x$score.haplo)
   tbl <- tbl[ord,]
   if(x$n.sim == 0) dimnames(tbl) <- list(NULL,c(x$locus.label,"Hap-Freq",
                  "Hap-Score","p-val"))
   if(x$n.sim > 0) dimnames(tbl) <- list(NULL,c(x$locus.label,"Hap-Freq",
                  "Hap-Score","p-val","sim p-val"))
   cat("Haplotype-specific Scores\n\n")
   print(tbl,quote=F)
   cat("\n\n")
   invisible()
}

# 13-9-2003 start to implement
# 14-9-2003 in shape
# 21-9-2003 start extensive checking
# 23-9-2003 rewrite interface to genecounting
# 26-9-2003 done with successful use of by and order
# 17-10-2003 start to implement missing genotype code
# 18-9-2004 to fix S3 class hap.score

Try the gap package in your browser

Any scripts or data that you put into this service are public.

gap documentation built on Aug. 26, 2023, 5:07 p.m.