R/fitqtl.R

######################################################################
#
# fitqtl.R
#
# copyright (c) 2002-2013, Hao Wu and Karl W. Broman
# last modified Jan, 2013
# first written Apr, 2002
#
#     This program is free software; you can redistribute it and/or
#     modify it under the terms of the GNU General Public License,
#     version 3, as published by the Free Software Foundation.
#
#     This program is distributed in the hope that it will be useful,
#     but without any warranty; without even the implied warranty of
#     merchantability or fitness for a particular purpose.  See the GNU
#     General Public License, version 3, for more details.
#
#     A copy of the GNU General Public License, version 3, is available
#     at http://www.r-project.org/Licenses/GPL-3
#
# Part of the R/qtl package
# Contains: fitqtl, fitqtlengine, parseformula, summary.fitqtl,
#           print.summary.fitqtl, mybinaryrep, deparseQTLformula
#           printQTLformulanicely
#
######################################################################

######################################################################
#
# This is the function to fit a model and generate some tables
#
# Only Haley-Knott regression and multiple imputation are implemented.
#
######################################################################

fitqtl <-
function(cross, pheno.col=1, qtl, covar=NULL, formula, method=c("imp", "hk"),
         model=c("normal", "binary"), dropone=TRUE, get.ests=FALSE,
         run.checks=TRUE, tol=1e-4, maxit=1000, forceXcovar=FALSE)
{
  # some input checking stuff in here
  if( !("cross" %in% class(cross)) )
    stop("The cross argument must be an object of class \"cross\".")

  if( !("qtl" %in% class(qtl)) )
    stop("The qtl argument must be an object of class \"qtl\".")

  if(!is.null(covar) && !is.data.frame(covar)) {
    if(is.matrix(covar) && is.numeric(covar))
      covar <- as.data.frame(covar, stringsAsFactors=TRUE)
    else stop("covar should be a data.frame")
  }

  if(LikePheVector(pheno.col, nind(cross), nphe(cross))) {
    cross$pheno <- cbind(pheno.col, cross$pheno)
    pheno.col <- 1
  }

  if(length(pheno.col) > 1) {
    pheno.col <- pheno.col[1]
    warning("fitqtl can take just one phenotype; only the first will be used")
  }

  if(is.character(pheno.col)) {
    num <- find.pheno(cross, pheno.col)
    if(is.na(num))
      stop("Couldn't identify phenotype \"", pheno.col, "\"")
    pheno.col <- num
  }

  if(pheno.col < 1 | pheno.col > nphe(cross))
    stop("pheno.col values should be between 1 and the no. phenotypes")

  pheno <- cross$pheno[,pheno.col]
  if(!is.null(covar) && nrow(covar) != length(pheno))
    stop("nrow(covar) != no. individuals in cross.")

  method <- match.arg(method)
  model <- match.arg(model)

  if(model=="binary" && any(!is.na(pheno) & pheno != 0 & pheno != 1))
      stop("For model=\"binary\", phenotypes must by 0 or 1.")

  # allow formula to be a character string
  if(!missing(formula) && is.character(formula))
    formula <- as.formula(formula)

  if(method=="imp") {
    if(!("geno" %in% names(qtl))) {
      if("prob" %in% names(qtl)) {
        warning("The qtl object doesn't contain imputations; using method=\"hk\".")
        method <- "hk"
      }
      else
        stop("The qtl object needs to be created with makeqtl with what=\"draws\".")
    }
  }
  else {
    if(!("prob" %in% names(qtl))) {
      if("geno" %in% names(qtl)) {
        warning("The qtl object doesn't contain QTL genotype probabilities; using method=\"imp\".")
        method <- "imp"
      }
      else
        stop("The qtl object needs to be created with makeqtl with what=\"prob\".")
    }
  }

  if(qtl$n.ind != nind(cross)) {
    warning("No. individuals in qtl object doesn't match that in the input cross; re-creating qtl object.")
    if(method=="imp")
      qtl <- makeqtl(cross, qtl$chr, qtl$pos, qtl$name, what="draws")
    else
      qtl <- makeqtl(cross, qtl$chr, qtl$pos, qtl$name, what="prob")
  }
  if(method=="imp" && dim(qtl$geno)[3] != dim(cross$geno[[1]]$draws)[3])  {
    warning("No. imputations in qtl object doesn't match that in the input cross; re-creating qtl object.")
    qtl <- makeqtl(cross, qtl$chr, qtl$pos, qtl$name, what="draws")
  }

  fitqtlengine(pheno=pheno, qtl=qtl, covar=covar, formula=formula,
               method=method, model=model, dropone=dropone, get.ests=get.ests,
               run.checks=run.checks, cross.attr=attributes(cross),
               sexpgm=getsex(cross), tol=tol, maxit=maxit, forceXcovar=forceXcovar)
}


fitqtlengine <-
function(pheno, qtl, covar=NULL, formula, method=c("imp", "hk"),
         model=c("normal", "binary"),
         dropone=TRUE, get.ests=FALSE, run.checks=TRUE, cross.attr,
         sexpgm, tol, maxit, forceXcovar=FALSE)
{
  model <- match.arg(model)
  method <- match.arg(method)

  # local variables
  n.ind <- qtl$n.ind # number of individuals
  n.qtl <- qtl$n.qtl # number of selected markers
  n.gen <- qtl$n.gen # number of genotypes
  if(method=="imp")
    n.draws <- dim(qtl$geno)[3] # number of draws

  if( is.null(covar) )  # number of covariates
    n.covar <- 0
  else
    n.covar <- ncol(covar)

  # if formula is missing, build one
  # all QTLs and covariates will be additive by default
  if(missing(formula)) {
    tmp.Q <- paste("Q", 1:n.qtl, sep="") # QTL term names
    formula <- "y~Q1"
    if(n.qtl > 1)
      for (i in 2:n.qtl)
        formula <- paste(formula, tmp.Q[i], sep="+")
    if (n.covar) { # if covariate is not empty
      tmp.C <- colnames(covar) # covariate term names
      for(i in 1:n.covar)
        formula <- paste(formula, tmp.C[i], sep="+")
    }
    formula <- as.formula(formula)
  }

  # check input formula
  if(run.checks) {
    formula <- checkformula(formula, qtl$altname, colnames(covar))

    if(qtl$n.ind != length(pheno))
      stop("No. individuals in qtl object doesn't match length of input phenotype.")

    # drop covariates that are not in the formula
    if(!is.null(covar)) {
      theterms <- rownames(attr(terms(formula), "factors"))
      m <- match(colnames(covar), theterms)
      if(all(is.na(m))) covar <- NULL
      else covar <- covar[,!is.na(m),drop=FALSE]
    }

    # check phenotypes and covariates; drop ind'ls with missing values
    if(!is.null(covar)) phcovar <- cbind(pheno, covar)
    else phcovar <- as.data.frame(pheno, stringsAsFactors=TRUE)
    if(any(is.na(phcovar))) {
      if(ncol(phcovar)==1) hasmissing <- is.na(phcovar)
      else hasmissing <- apply(phcovar, 1, function(a) any(is.na(a)))
      if(all(hasmissing))
        stop("All individuals are missing phenotypes or covariates.")
      if(any(hasmissing)) {
        warning("Dropping ", sum(hasmissing), " individuals with missing phenotypes.\n")
        pheno <- pheno[!hasmissing]
        n.ind <- qtl$n.ind <- sum(!hasmissing)
        if(method=="imp")
          qtl$geno <- qtl$geno[!hasmissing,,,drop=FALSE]
        else
          qtl$prob <- lapply(qtl$prob, function(a) a[!hasmissing,,drop=FALSE])

        if(!is.null(covar)) covar <- covar[!hasmissing,,drop=FALSE]

        # subset sexpgm
        for(i in seq(along=sexpgm))
          if(!is.null(sexpgm[[i]]))
            sexpgm[[i]] <- sexpgm[[i]][!hasmissing]
      }
    }
  }

  # parse the input formula
  p <- parseformula(formula, qtl$altname, colnames(covar))
  # make an array n.gen.QC to represent the genotype numbers
  # for all input QTLs and covariates. For covariates the
  # number of genotyps is 1. This makes programming easier
  n.gen.QC <- c(n.gen[p$idx.qtl]-1, rep(1, p$n.covar))

  # covariates to be passed to C function
  # This is done in case of that user input covar but has no covar in formula
  covar.C <- NULL
  if(!is.null(p$idx.covar))
    covar.C <- as.matrix(covar[,p$idx.covar,drop=FALSE])

  sizefull <- 1+sum(n.gen.QC)
  if(p$n.int > 0) {
    form <- p$formula.intmtx*n.gen.QC
    if(!is.matrix(form)) {
      sizefull <- sizefull + prod(form[form!=0])
    }
    else {
      form <- apply(form,2,function(a) prod(a[a != 0]))
      sizefull <- sizefull + sum(form)
    }
  }

  if(method != "imp") {
    # form genotype probabilities as a matrix
    prob <- matrix(ncol=sum(qtl$n.gen[p$idx.qtl]), nrow=n.ind)

    curcol <- 0
    for(i in p$idx.qtl) {
      prob[,curcol+1:n.gen[i]] <- qtl$prob[[i]]
      curcol <- curcol + n.gen[i]
    }
  }

  Xadjustment <- scanoneXnull(cross.attr$class[1], sexpgm, cross.attr)
  n.origcovar <- p$n.covar
  if((sum(qtl$chrtype[p$idx.qtl]=="X") >= 1 || forceXcovar) && Xadjustment$adjustX) { # need to include X chromosome covariates
    adjustX <- TRUE

    n.newcovar <- ncol(Xadjustment$sexpgmcovar)
    n.gen.QC <- c(n.gen.QC, rep(1, n.newcovar))
    p$n.covar <- p$n.covar + n.newcovar
    covar.C <- cbind(covar.C, Xadjustment$sexpgmcovar)
    sizefull <- sizefull + n.newcovar

    if(p$n.int==1)
      p$formula.intmtx <- c(p$formula.intmtx, rep(0,n.newcovar))
    if(p$n.int>1) {
      for(i in 1:n.newcovar)
        p$formula.intmtx <- rbind(p$formula.intmtx, rep(0,p$n.int))
    }
  }
  else adjustX <- FALSE

  # call C function to do the genome scan
  if(model=="normal") {
    if(method == "imp") {
      z <- .C("R_fitqtl_imp",
              as.integer(n.ind), # number of individuals
              as.integer(p$n.qtl), # number of qtls
              as.integer(n.gen.QC), # number of genotypes QTLs and covariates
              as.integer(n.draws), # number of draws
              as.integer(qtl$geno[,p$idx.qtl,]), # genotypes for selected marker
              as.integer(p$n.covar), # number of covariate
              as.double(covar.C), # covariate
              as.integer(p$formula.intmtx),  # formula matrix for interactive terms
              as.integer(p$n.int), # number of interactions in the formula
              as.double(pheno), # phenotype
              as.integer(get.ests), # get estimates?
              # return variables
              lod=as.double(0), # LOD score
              df=as.integer(0), # degree of freedom
              ests=as.double(rep(0,sizefull)),
              ests.cov=as.double(rep(0,sizefull*sizefull)),
              design.mat=as.double(rep(0,sizefull*n.ind)),
              PACKAGE="qtl")
    }
    else {
      z <- .C("R_fitqtl_hk",
              as.integer(n.ind), # number of individuals
              as.integer(p$n.qtl), # number of qtls
              as.integer(n.gen.QC), # number of genotypes QTLs and covariates
              as.double(prob),      # QTL genotype probabilities
              as.integer(p$n.covar), # number of covariate
              as.double(covar.C), # covariates
              as.integer(p$formula.intmtx),  # formula matrix for interactive terms
              as.integer(p$n.int), # number of interactions in the formula
              as.double(pheno), # phenotype
              as.integer(get.ests), # get estimates?
              # return variables
              lod=as.double(0), # LOD score
              df=as.integer(0), # degree of freedom
              ests=as.double(rep(0,sizefull)),
              ests.cov=as.double(rep(0,sizefull*sizefull)),
              design.mat=as.double(rep(0,sizefull*n.ind)),
              PACKAGE="qtl")
    }
  }
  else {
    if(method=="imp") {
      z <- .C("R_fitqtl_imp_binary",
              as.integer(n.ind), # number of individuals
              as.integer(p$n.qtl), # number of qtls
              as.integer(n.gen.QC), # number of genotypes QTLs and covariates
              as.integer(n.draws), # number of draws
              as.integer(qtl$geno[,p$idx.qtl,]), # genotypes for selected marker
              as.integer(p$n.covar), # number of covariate
              as.double(covar.C), # covariate
              as.integer(p$formula.intmtx),  # formula matrix for interactive terms
              as.integer(p$n.int), # number of interactions in the formula
              as.double(pheno), # phenotype
              as.integer(get.ests), # get estimates?
              # return variables
              lod=as.double(0), # LOD score
              df=as.integer(0), # degree of freedom
              ests=as.double(rep(0,sizefull)),
              ests.cov=as.double(rep(0,sizefull*sizefull)),
              design.mat=as.double(rep(0,sizefull*n.ind)),
              as.double(tol),
              as.integer(maxit),
              PACKAGE="qtl")
    }
    else {
      z <- .C("R_fitqtl_hk_binary",
              as.integer(n.ind), # number of individuals
              as.integer(p$n.qtl), # number of qtls
              as.integer(n.gen.QC), # number of genotypes QTLs and covariates
              as.double(prob),      # QTL genotype probabilities
              as.integer(p$n.covar), # number of covariate
              as.double(covar.C), # covariates
              as.integer(p$formula.intmtx),  # formula matrix for interactive terms
              as.integer(p$n.int), # number of interactions in the formula
              as.double(pheno), # phenotype
              as.integer(get.ests), # get estimates?
              # return variables
              lod=as.double(0), # LOD score
              df=as.integer(0), # degree of freedom
              ests=as.double(rep(0,sizefull)),
              ests.cov=as.double(rep(0,sizefull*sizefull)),
              design.mat=as.double(rep(0,sizefull*n.ind)),
              # convergence
              as.double(tol),
              as.integer(maxit),
              PACKAGE="qtl")
    }
  }

  if(get.ests) {
    # first, construct the new design matrix
    #  X = the matrix used in the C coe
    #  Z = the matrix we want
    thenames <- qtl$name[p$idx.qtl]
    if(n.covar > 0) thenames <- c(thenames,names(covar)[p$idx.covar])

    ests <- z$ests
    ests.cov <- matrix(z$ests.cov,ncol=sizefull)

    if(adjustX) {
      keep <- 1:(length(ests)-n.newcovar)
      ests <- ests[keep]
      ests.cov <- ests.cov[keep,keep]
    }

    if(any(qtl$n.gen[p$idx.qtl]>=4)) {
      type <- cross.attr$class[1]
      if(type == "4way")
        genotypes <- c("AC","BC","AD","BD")
      else {
        genotypes <- cross.attr$genotypes
        if(is.null(genotypes))
          genotypes <- as.character(1:max(qtl$n.gen))
      }

      # just attach rownames for this case
      thenames <- "Intercept"

      if(length(p$idx.qtl) > 0) { # qtl names
        qtlnames <- vector("list", length(p$idx.qtl))
        names(qtlnames) <- paste("Q", 1:length(p$idx.qtl), sep="")
        for(i in seq(along=p$idx.qtl)) {
          qtlnames[[i]] <- paste(qtl$name[p$idx.qtl[i]], genotypes[2:qtl$n.gen[p$idx.qtl[i]]], sep=".")
          thenames <- c(thenames, qtlnames[[i]])
        }
      }

      if(p$n.covar > 0) {
        covnames <- colnames(covar)[p$idx.covar]
        thenames <- c(thenames, covnames)
      }

      if(p$n.int > 0) { # interactions
        if(!is.matrix(p$formula.intmtx)) {
          nam <- names(p$formula.intmtx)
          p$formula.intmtx <- matrix(p$formula.intmtx, nrow=p$n.int)
          colnames(p$formula.intmtx) <- nam
        }

        for(i in 1:p$n.int) {
          wh <- which(p$formula.intmtx[i,]==1)
          nam <- colnames(p$formula.intmtx)[wh]

          if(length(grep("^Q[0-9]+$", nam[1])) > 0)
            curnam <- qtlnames[[nam[1]]]
          else
            curnam <- nam[1]

          for(j in 2:length(nam)) {
            if(length(grep("Q[0-9]+", nam[j])) > 0)
              curnam <- paste(curnam, qtlnames[[nam[j]]], sep=".")
            else
              curnam <- paste(curnam, nam[j], sep=".")
          }
          thenames <- c(thenames, curnam)
        }
      }

      if(length(thenames) ==length(ests)) {
        names(ests) <- thenames
        dimnames(ests.cov) <- list(thenames, thenames)
      }
      else
        warning("Estimated QTL effects not yet made meaningful for this case.\n   ")

    }
    else {
      X <- matrix(z$design.mat,ncol=sizefull)
      Z <- matrix(0,nrow=n.ind,ncol=sizefull)
      colnames(Z) <- rep("",sizefull)

      # mean column
      Z[,1] <- 1
      colnames(Z)[1] <- "Intercept"

      # ZZ stores the main effects matrices, for creating the interactions
      ZZ <- vector("list",p$n.qtl+p$n.covar)
      curcol <- 1
      # covariates
      if(p$n.covar > 0) {
        for(j in 1:p$n.covar) {
          Z[,curcol+j] <- ZZ[[p$n.qtl+j]] <- as.matrix(covar[,p$idx.covar[j],drop=FALSE])
          colnames(Z)[curcol+j] <- colnames(ZZ[[p$n.qtl+j]]) <- names(covar)[p$idx.covar[j]]
        }
        curcol <- curcol + p$n.covar
      }
      # QTL main effects
      for(i in seq(along=p$idx.qtl)) {
        if(n.gen[i]==2) {
          if(method=="imp") {
            if(cross.attr$class[1] == "bc") {
              Z[qtl$geno[,p$idx.qtl[i],1]==1,curcol+1] <- -0.5
              Z[qtl$geno[,p$idx.qtl[i],1]==2,curcol+1] <- 0.5
            } else {
              Z[qtl$geno[,p$idx.qtl[i],1]==1,curcol+1] <- -1
              Z[qtl$geno[,p$idx.qtl[i],1]==2,curcol+1] <- 1
            }
          }
          else
            if(cross.attr$class[1] == "bc") {
              Z[,curcol+1] <- (qtl$prob[[p$idx.qtl[i]]][,2] - qtl$prob[[p$idx.qtl[i]]][,1])/2
            } else {
              Z[,curcol+1] <- (qtl$prob[[p$idx.qtl[i]]][,2] - qtl$prob[[p$idx.qtl[i]]][,1])
            }
          colnames(Z)[curcol+1] <- thenames[i]
        }
        else { # 3 genotypes
          if(method=="imp") {
            Z[qtl$geno[,p$idx.qtl[i],1]==1,curcol+1] <- -1
            Z[qtl$geno[,p$idx.qtl[i],1]==3,curcol+1] <- 1
            Z[qtl$geno[,p$idx.qtl[i],1]==2,curcol+2] <- 0.5
            Z[qtl$geno[,p$idx.qtl[i],1]!=2,curcol+2] <- -0.5
          }
          else {
            Z[,curcol+1] <- qtl$prob[[p$idx.qtl[i]]][,3] - qtl$prob[[p$idx.qtl[i]]][,1]
            Z[,curcol+2] <- (qtl$prob[[p$idx.qtl[i]]][,2] - qtl$prob[[p$idx.qtl[i]]][,1] -
                             qtl$prob[[p$idx.qtl[i]]][,3])/2
          }
          colnames(Z)[curcol+1:2] <- paste(thenames[i],c("a","d"),sep="")
        }
        ZZ[[i]] <- Z[,curcol+1:(n.gen[i]-1),drop=FALSE]
        curcol <- curcol + n.gen[i]-1
      }

      if(p$n.int>0) {
        for(i in 1:p$n.int) {
          if(p$n.int==1)
            intform <- p$formula.intmtx
          else
            intform <- p$formula.intmtx[,i]
          tempZ <- matrix(1,ncol=1,nrow=nrow(Z))
          colnames(tempZ) <- ""
          for(j in seq(along=intform)) {
            if(intform[j]==1) {
              tZ <- NULL
              for(k in 1:ncol(ZZ[[j]])) {
                tZZ <- tempZ * ZZ[[j]][,k]
                if(all(colnames(tempZ) == ""))
                  colnames(tZZ) <- colnames(ZZ[[j]])[k]
                else
                  colnames(tZZ) <- paste(colnames(tempZ),colnames(ZZ[[j]])[k],sep=":")
                tZ <- cbind(tZ,tZZ)
              }
              tempZ <- tZ
            }
          }
          Z[,curcol+1:ncol(tempZ)] <- tempZ
          colnames(Z)[curcol+1:ncol(tempZ)] <- colnames(tempZ)
          curcol <- curcol + ncol(tempZ)
        }
      }

      b <- solve(t(Z) %*% Z, t(Z) %*% X)
      ests <- as.numeric(b %*% ests)
      ests.cov <- b %*% ests.cov %*% t(b)
      names(ests) <- colnames(Z)
      dimnames(ests.cov) <- list(colnames(Z),colnames(Z))
    }
  }

  ##### output ANOVA table for full model #####
  result.full <- matrix(NA, 3, 7)
  colnames(result.full) <- c("df", "SS", "MS", "LOD", "%var", "Pvalue(Chi2)",
                             "Pvalue(F)")
  rownames(result.full) <- c("Model", "Error", "Total")
  result.full[1,1] <- z$df # model degree of freedom

  if(model=="normal") {
    # compute the SS for total

    if(adjustX) {
      mpheno <- mean(pheno)
      Rss0 <- sum( (pheno-mpheno)^2 )

      Rss0adj <- Rss0x <- sum( lm(pheno ~ Xadjustment$sexpgmcovar)$resid^2 )
      OrigModellod <- z$lod
      Modellod <- z$lod + length(pheno)/2 * (log10(Rss0x) - log10(Rss0))
    } else {
      mpheno <- mean(pheno)
      Rss0adj <- Rss0 <- sum( (pheno-mpheno)^2 )
      OrigModellod <- Modellod <- z$lod
    }

    # third row, for Total
    result.full[3,1] <- length(pheno) - 1 # total degree of freedom
    result.full[3,2] <- Rss0adj # total sum of squares

    # first row, for Model
    result.full[1,1] <- z$df # df for Model
    # Variance explained by model
    result.full[1,5] <- 100 * (1 - exp(-2*Modellod*log(10)/n.ind))
    result.full[1,2] <- Rss0adj * result.full[1,5]/100  # SS for model
    result.full[1,3] <- result.full[1,2]/z$df # MS for model
    result.full[1,4] <- Modellod # Model LOD score

    # Second row, for Error
    # df
    result.full[2,1] <- result.full[3,1] - result.full[1,1]
    # SS
    result.full[2,2] <- result.full[3,2] - result.full[1,2]
    # MS
    result.full[2,3] <- result.full[2,2] / result.full[2,1]

    # first row, P values
    # P value (chi2) for model
    result.full[1,6] <- 1 - pchisq(2*log(10)*Modellod, z$df)
    # P value (F statistics) for model
    df0 <- result.full[3,1]; df1 <- result.full[2,1];
    Rss1 <- result.full[2,2]
    Fstat <- ((Rss0adj-Rss1)/(df0-df1)) / (Rss1/df1)
    result.full[1,7] <- 1 - pf(Fstat, df0-df1, df1)
  }
  else {
    # third row, for Total
    result.full[3,1] <- length(pheno) - 1 # total degree of freedom
    result.full[3,2] <- NA # total sum of squares

    # first row, for Model
    result.full[1,1] <- z$df # df for Model
    # Variance explained by model
    result.full[1,5] <- 100 * (1 - exp(-2*z$lod*log(10)/n.ind))
    result.full[1,2] <- NA  # SS for model
    result.full[1,3] <- NA # MS for model
    result.full[1,4] <- z$lod # Model LOD score
    OrigModellod <- Modellod <- z$lod

    # Second row, for Error
    # df
    result.full[2,1] <- result.full[3,1] - result.full[1,1]
    # SS
    result.full[2,2] <- NA
    # MS
    result.full[2,3] <- NA

    # first row, P values
    # P value (chi2) for model
    result.full[1,6] <- 1 - pchisq(2*log(10)*z$lod, z$df)
    # P value (F statistics) for model
    result.full[1,7] <- NA
  }
  ############# Finish ANOVA table for full model


  # initialize output object
  output <- NULL
  output$result.full <- result.full

  # drop one at a time?
  if(dropone && (p$n.qtl+n.origcovar)>1) {
    # user wants to do drop one term at a time and output anova table

    # get the terms etc. for input formula
    f.terms <- terms(formula)
    f.order <- attr(f.terms, "order")
    f.label <- attr(f.terms, "term.labels")

    # initialize output matrix
    # ANOVA table will have five columns, e.g., df,Type III SS,
    # LOD, %var, Pvalue for each dropping term
    # Full model result will not be in this table
    result <- matrix(0, length(f.order), 7)
    colnames(result) <- c("df", "Type III SS", "LOD", "%var", "F value",
                          "Pvalue(Chi2)", "Pvalue(F)")
    rownames(result) <- rep("",length(f.order))

    drop.term.name <- NULL
    formulas <- rep("", length(f.order))
    lods <- rep(NA, length(f.order))

    for( i in (1:length(f.order)) ) {
      # loop thru all terms in formula, from the highest order
      # the label of the term to be droped
      label.term.drop <- f.label[i]

      ### find the corresponding QTL name for this term ###
      # This is used for output ANOVA table
      if(f.order[i] == 1) {
        # this is a first order term
        # if the term label is like Q(q)1, Q(q)2, etc., then it's a QTL
        if( length(grep("Q[0-9]", label.term.drop, ignore.case=TRUE)) != 0) {
          idx.qtlname <- as.integer(substr(label.term.drop, 2, nchar(label.term.drop)))
          drop.term.name[i] <- qtl$name[idx.qtlname]
        }
        else { # this is a covariate
          drop.term.name[i] <- label.term.drop
        }
      }
      else {
        # this is a 2nd (or higher)order and the term is a string like "Q2:Q3:C1"
        # I use strsplit to split it to a string vector "Q2" "Q3" "C1".
        # then take out 2 and 3 as integer. Then find out the
        # QTL name from the input QTL object and concatenate them
        tmp.str <- strsplit(label.term.drop,":")[[1]]
        for(j in 1:length(tmp.str)) {
          if( length(grep("Q[0-9]", tmp.str[j], ignore.case=TRUE)) != 0 ) {
            # this is a QTL
            idx.qtlname <- as.integer(substr(tmp.str[j], 2, nchar(tmp.str[j])))
            tmp.str[j] <- qtl$name[idx.qtlname]
          }
          if(j == 1) # first term
            drop.term.name[i] <- tmp.str[j]
          else # not the first term
            drop.term.name[i] <- paste(drop.term.name[i], tmp.str[j], sep=":")
        }
      }
      ### Finish QTL name ###

      # find the indices of the term(s) to be dropped
      # All terms contain label.term.drop will be dropped
      idx.term.drop <- NULL
      tmp.str.drop <- tolower(strsplit(label.term.drop,":")[[1]])
      for(j in 1:length(f.label)) {
        tmp.str.label <- tolower(strsplit(f.label[j], ":")[[1]])
        if(all(tmp.str.drop %in% tmp.str.label))
          idx.term.drop <- c(idx.term.drop, j)
      }

      # the indices of term(s) to be kept
      idx.term.kept <- setdiff(1:length(f.order), idx.term.drop)

      #### regenerate a formula with the kept terms additive ###
      if(length(idx.term.kept) == 0) # nothing left after drop label.term.drop
        stop("There will be nothing left if drop ", drop.term.name[i])
      else {
        # All terms for idx.term.kept will be additive
        formula.new <- as.formula(paste("y~", paste(f.label[idx.term.kept], collapse="+"), sep=""))
      }

      ### Start fitting model again
      # parse the input formula
      p.new <- parseformula(formula.new, qtl$altname, colnames(covar))
      n.gen.QC <- c(n.gen[p.new$idx.qtl]-1, rep(1, p.new$n.covar))

      formulas[i] <- deparseQTLformula(formula.new)

      # covariate to be passed to C function
      covar.C <- NULL
      if(!is.null(p.new$idx.covar))
        covar.C <- as.matrix(covar[,p.new$idx.covar,drop=FALSE])

      if(method != "imp") {
        # form genotype probabilities as a matrix
        prob <- matrix(ncol=sum(qtl$n.gen[p.new$idx.qtl]), nrow=n.ind)

        curcol <- 0
        for(z in p.new$idx.qtl) {
          prob[,curcol+1:n.gen[z]] <- qtl$prob[[z]]
          curcol <- curcol + n.gen[z]
        }
      }

      if(adjustX)  { # need to include X chromosome covariates
        n.newcovar <- ncol(Xadjustment$sexpgmcovar)
        n.gen.QC <- c(n.gen.QC, rep(1, n.newcovar))
        p.new$n.covar <- p.new$n.covar + n.newcovar
        covar.C <- cbind(covar.C, Xadjustment$sexpgmcovar)
        sizefull <- sizefull + n.newcovar

        if(p.new$n.int==1)
          p.new$formula.intmtx <- c(p.new$formula.intmtx, rep(0,n.newcovar))
        if(p.new$n.int>1) {
          for(i2 in 1:n.newcovar)
            p.new$formula.intmtx <- rbind(p.new$formula.intmtx, rep(0,p.new$n.int))
        }
      }

      # call C function fit model
      if(model=="normal") {
        if(method == "imp") {
          z <- .C("R_fitqtl_imp",
                  as.integer(n.ind), # number of individuals
                  as.integer(p.new$n.qtl), # number of qtls
                  as.integer(n.gen.QC), # number of genotypes QTLs and covariates
                  as.integer(n.draws), # number of draws
                  as.integer(qtl$geno[,p.new$idx.qtl,]), # genotypes for selected marker
                  as.integer(p.new$n.covar), # number of covariate
                  as.double(covar.C), # covariate
                  as.integer(p.new$formula.intmtx),  # formula matrix for interactive terms
                  as.integer(p.new$n.int), # number of interactions in the formula
                  as.double(pheno), # phenotype
                  as.integer(0),
                  # return variables
                  lod=as.double(0), # LOD score
                  df=as.integer(0), # degree of freedom
                  as.double(rep(0,sizefull)),
                  as.double(rep(0,sizefull*sizefull)),
                  as.double(rep(0,n.ind*sizefull)),
                  PACKAGE="qtl")
        }

        else {
          z <- .C("R_fitqtl_hk",
                  as.integer(n.ind), # number of individuals
                  as.integer(p.new$n.qtl), # number of qtls
                  as.integer(n.gen.QC), # number of genotypes QTLs and covariates
                  as.double(prob),
                  as.integer(p.new$n.covar), # number of covariate
                  as.double(covar.C), # covariate
                  as.integer(p.new$formula.intmtx),  # formula matrix for interactive terms
                  as.integer(p.new$n.int), # number of interactions in the formula
                  as.double(pheno), # phenotype
                  as.integer(0),
                  # return variables
                  lod=as.double(0), # LOD score
                  df=as.integer(0), # degree of freedom
                  as.double(rep(0,sizefull)),
                  as.double(rep(0,sizefull*sizefull)),
                  as.double(rep(0,n.ind*sizefull)),
                  PACKAGE="qtl")
        }
      }
      else { # binary trait
        if(method=="imp") {
          z <- .C("R_fitqtl_imp_binary",
                  as.integer(n.ind), # number of individuals
                  as.integer(p.new$n.qtl), # number of qtls
                  as.integer(n.gen.QC), # number of genotypes QTLs and covariates
                  as.integer(n.draws), # number of draws
                  as.integer(qtl$geno[,p.new$idx.qtl,]), # genotypes for selected marker
                  as.integer(p.new$n.covar), # number of covariate
                  as.double(covar.C), # covariate
                  as.integer(p.new$formula.intmtx),  # formula matrix for interactive terms
                  as.integer(p.new$n.int), # number of interactions in the formula
                  as.double(pheno), # phenotype
                  as.integer(0),
                  # return variables
                  lod=as.double(0), # LOD score
                  df=as.integer(0), # degree of freedom
                  as.double(rep(0,sizefull)),
                  as.double(rep(0,sizefull*sizefull)),
                  as.double(rep(0,n.ind*sizefull)),
                  as.double(tol),
                  as.integer(maxit),
                  PACKAGE="qtl")
        }
        else {
          z <- .C("R_fitqtl_hk_binary",
                  as.integer(n.ind), # number of individuals
                  as.integer(p.new$n.qtl), # number of qtls
                  as.integer(n.gen.QC), # number of genotypes QTLs and covariates
                  as.double(prob),
                  as.integer(p.new$n.covar), # number of covariate
                  as.double(covar.C), # covariate
                  as.integer(p.new$formula.intmtx),  # formula matrix for interactive terms
                  as.integer(p.new$n.int), # number of interactions in the formula
                  as.double(pheno), # phenotype
                  as.integer(0),
                  # return variables
                  lod=as.double(0), # LOD score
                  df=as.integer(0), # degree of freedom
                  as.double(rep(0,sizefull)),
                  as.double(rep(0,sizefull*sizefull)),
                  as.double(rep(0,n.ind*sizefull)),
                  # convergence
                  as.double(tol),
                  as.integer(maxit),
                  PACKAGE="qtl")
        }
      }

      if(model=="normal" && adjustX) # adjust for X chromosome covariates
        z$lod <- z$lod + length(pheno)/2 * (log10(Rss0x) - log10(Rss0))

      # record the result for dropping this term
      # df
      result[i,1] <- result.full[1,1] - z$df
      # LOD score
      result[i,3] <- Modellod - z$lod
      # % variance explained
      result[i,4] <- result.full[1,5] - 100*(1 - 10^(-2*z$lod/n.ind))

      # lod score for reduced model
      lods[i] <- z$lod

      # Type III SS for this term - computed from %var
      if(model=="normal")
        result[i,2] <- result.full[3,2] * result[i,4] / 100
      else
        result[i,2] <- NA
      # F value
      if(model=="normal") {
        df0 <- length(pheno) - z$df - 1; df1 <- result.full[2,1];
        Rss0p <- result.full[2,2] + result[i,2];
        Rss1p <- result.full[2,2]
        Fstat <- ((Rss0p-Rss1p)/(df0-df1)) / (Rss1/df1)
        result[i,5] <- Fstat
        # P value (F)
        result[i,7] <- 1 - pf(Fstat, df0-df1, df1)
      }
      else # ignore F stat for binary trait
        result[i,c(5,7)] <- NA
      # P value (chi2)
      result[i,6] <- 1 - pchisq(2*log(10)*result[i,3], result[i,1])
      # assign row name
      rownames(result)[i] <- drop.term.name[i]
    } # finish dropping terms loop

    attr(result, "formulas") <- formulas
    attr(result, "lods") <- lods

    # assign output object
    output$result.drop <- result

  }  ## if(dropone)

  if(get.ests)
    output$ests <- list(ests=ests, covar=ests.cov)

  output$lod <- output$result.full[1,4]

  class(output) <- "fitqtl"
  attr(output, "method") <- method
  attr(output, "model") <- model
  attr(output, "formula") <- deparseQTLformula(formula)
  attr(output, "type") <- qtl$type
  attr(output, "nind") <- length(pheno)
#  attr(output, "design.mat") <- matrix(z$design.mat, nrow=n.ind)
  output

}


######################################################################
# checkformula
#
# check that input formula satisfies our imposed hiearchy: that
# main effects for terms in any interactions are also included
######################################################################
checkformula <-
function(formula, qtl.name, covar.name)
{
  factors <- attr(terms(formula), "factors")
  altform <- deparseQTLformula(formula)

  if(sum(factors[1,])==0) factors <- factors[-1,,drop=FALSE]

  rn <- rownames(factors)
  # if mentions of "q1" or such, convert to "Q1" and such
  g <- grep("^[Qq][0-9]+$", rn)
  todrop <- NULL
  if(length(g) >= 1) {
    rownames(factors)[g] <- rn[g] <- toupper(rn[g])
    if(any(table(rn) > 1)) { # now there are some duplicates
      urn <- unique(rn)
      for(i in urn) {
        wh <- which(rn == i)
        if(length(wh) > 1) {
          factors[wh[1],] <- apply(factors[wh,], 2, sum)
          todrop <- c(todrop, wh[-1])
          rownames(factors)[wh[1]] <- rn[wh[1]]
        }
      }
    }
  }
  if(length(todrop) > 0) factors <- factors[-todrop,]
  rn <- rownames(factors)

  if(!missing(qtl.name) || !missing(covar.name)) {
    m <- match(rn, c(qtl.name, covar.name))
    if(any(is.na(m)))
      warning("Terms ", paste(rn[is.na(m)], collapse=" "), " not understood.")
  }

  # paste rows together
  zo <- factors
  zo[zo>1] <- 1
  pzo <- apply(zo, 2, paste, collapse="")
  nt <- apply(zo, 2, sum)

  # form binary representations
  maxnt <- max(nt)
  v <- vector("list", maxnt)
  for(i in 2:maxnt) {
    v[[i]] <- mybinaryrep(i)
    v[[i]] <- v[[i]][,-ncol(v[[i]])+c(0,1)]
  }

  # for each higher-order column, form all lower-order terms
  for(i in which(nt > 1)) {
    cur <- zo[,i]
    wh <- which(cur==1)
    z <- v[[nt[i]]]
    zz <- matrix(0, ncol=ncol(z), nrow=length(cur))
    for(j in seq(along=wh))
      zz[wh[j],] <- z[j,]
    pzo <- unique(c(pzo, apply(zz, 2, paste, collapse="")))
  }

  zo <- matrix(as.numeric(unlist(strsplit(pzo, ""))), ncol=length(pzo))
  nt <- apply(zo, 2, sum)
  zo <- zo[,order(nt, apply(1-zo, 2, paste, collapse="")), drop=FALSE]
  rownames(zo) <- rn

  # form column names
  theterms <- apply(zo, 2, function(a, b) paste(b[as.logical(a)], collapse=":"),
                    rownames(zo))

  as.formula(paste("y ~ ", paste(theterms, collapse=" + ")))
}

#####################################################################
#
# parseformula
#
# Function to be called by fitqtl. It's used to
# parse the input formula
#
# This is the internal function and not supposed to be used by user
#
#####################################################################

parseformula <- function(formula, qtl.dimname, covar.dimname)
{
  # The terms for input formula
  f.formula <- terms(formula)
  order.term <- attr(f.formula, "order") # get the order of the terms
  idx.term <- which(order.term==1) # get the first order terms
  label.term <- attr(f.formula, "term.labels")[idx.term]
  formula.mtx <- attr(f.formula, "factors") # formula matrix

  idx.qtl <- NULL
  idx.covar <- NULL

  # loop thru all terms and find out how many QTLs and covariates
  # are there in the formula. Construct idx.qtl and idx.covar at the same time
  termisqtl <- rep(0, length(idx.term))
  for (i in 1:length(idx.term)) {
    # find out if there term is a QTL or a covariate
    # ignore the case for QTLs, e.g., Q1 is equivalent to q1
    idx.tmp <- grep(paste(label.term[i],"$", sep=""),
                    qtl.dimname, ignore.case=TRUE)
    if( length(idx.tmp) ) { # it's a QTL
      idx.qtl <- c(idx.qtl, idx.tmp)
      termisqtl[i] <- 1
    }
    else if(label.term[i] %in% covar.dimname) # it's a covariate
      idx.covar <- c(idx.covar, which(label.term[i]==covar.dimname))
    else
      stop("Unrecognized term ", label.term[i], " in formula")
  }
  n.qtl <- length(idx.qtl) # number of QTLs in formula
  n.covar <- length(idx.covar) # number of covariates in formula
  # now idx.qtl and idx.covar are the indices for genotype
  # and covariate matrices according to input formula

  # loop thru all terms again and reorganize formula.mtx
  formula.idx <- NULL
  ii <- 1
  jj <- 1
  for (i in 1:length(idx.term)) {
#    if(label.term[i] %in% qtl.dimname) {  # it's a QTL
    if(termisqtl[i]) {
      formula.idx <- c(formula.idx, ii)
      ii <- ii+1
    }
    else { # it's a covariate
      formula.idx <- c(formula.idx, jj+n.qtl)
      jj <- jj+1
    }
  }

  # reorganize formula.mtx according to formula.idx
  # remove the first row (for y)
  formula.mtx <- formula.mtx[2:nrow(formula.mtx),]
  # rearrange the rows according to formula.idx if there's more than one row
  if(length(formula.idx) > 1)
    formula.mtx <- formula.mtx[order(formula.idx),]
  # take out only part of the matrix for interactions and pass to C function
  # all the input QTLs and covariates for C function will be additive
  n.int <- length(order.term) - length(idx.term) # number of interactions
  if(n.int != 0)
    formula.intmtx <- formula.mtx[,(length(idx.term)+1):length(order.term)]
  else # no interaction terms
    formula.intmtx <- NULL

  # return object
  result <- NULL
  result$idx.qtl <- idx.qtl
  result$n.qtl <- n.qtl
  result$idx.covar <- idx.covar
  result$n.covar <- n.covar
  result$formula.intmtx <- formula.intmtx
  result$n.int <- n.int

  result

}


#####################################################################
#
# summary.fitqtl
#
#####################################################################
summary.fitqtl <-
function(object, pvalues=TRUE, simple=FALSE, ...)
{
  if(!any(class(object) == "fitqtl"))
    stop("Input should have class \"fitqtl\".")

  # this is just an interface.
  if("ests" %in% names(object)) {
    ests <- object$ests$ests
    se <- sqrt(diag(object$ests$covar))
    object$ests <- cbind(est=ests, SE=se, t=ests/se)
  }
  class(object) <- "summary.fitqtl"
  if(simple) pvalues <- FALSE
  attr(object, "pvalues") <- pvalues
  attr(object, "simple") <- simple
  object
}


#####################################################################
#
# print.summary.fitqtl
#
#####################################################################
print.summary.fitqtl <- function(x, ...)
{
  cat("\n")
  cat("\t\tfitqtl summary\n\n")
  meth <- attr(x, "method")
  mod <- attr(x, "model")
  if(is.null(mod)) mod <- "normal"
  if(meth=="imp") meth <- "multiple imputation"
  else if(meth=="hk") meth <- "Haley-Knott regression"
  cat("Method:", meth, "\n")
  cat("Model: ", mod, "phenotype\n")
  cat("Number of observations :", attr(x, "nind"), "\n\n")

  # print ANOVA table for full model
  cat("Full model result\n")
  cat("----------------------------------  \n")
  cat("Model formula:")
  w <- options("width")[[1]]
  printQTLformulanicely(attr(x, "formula"), "                   ", w+5, w)
  cat("\n")

  pval <- attr(x, "pvalues")
  simple <- attr(x, "simple")
  if(!is.null(pval) && !pval)
    x$result.full <- x$result.full[,-ncol(x$result.full)+(0:1)]
  if(mod=="binary" || (!is.null(simple) && simple))
    x$result.full <- x$result.full[1,-c(2:3,7),drop=FALSE]

  print(x$result.full, quote=FALSE, na.print="")
  cat("\n")

  # print ANOVA table for dropping one at a time analysis (if any)
  if("result.drop" %in% names(x)) {
    cat("\n")
    cat("Drop one QTL at a time ANOVA table: \n")
    cat("----------------------------------  \n")
    # use printCoefmat instead of print.data.frame
    # make sure the last column is P value
    if(!is.null(pval) && !pval)
      x$result.drop <- x$result.drop[,-ncol(x$result.drop)+(0:1)]
    if(mod=="binary" || (!is.null(simple) && simple))
      x$result.drop <- x$result.drop[,-c(2,5,7),drop=FALSE]

    printCoefmat(x$result.drop, digits=4, cs.ind=1, P.values=TRUE, has.Pvalue=TRUE)
    cat("\n")
  }

  if("ests" %in% names(x)) {
    cat("\n")
    cat("Estimated effects:\n")
    cat("-----------------\n")
    printCoefmat(x$ests,digits=4)
    cat("\n")
  }

}

######################################################################
# binary repreentation of the numbers 1...2^n;
# used in checkformula
######################################################################
mybinaryrep <-
function(n)
{
  lx <- 2^n
  x <- 1:lx
  ans <- 0:(n-1)
  x <- matrix(rep(x,rep(n, lx)), ncol=lx)
  (x %/% 2^ans) %% 2
}

######################################################################
# deparseQTLformula: turn QTL formula into a string
######################################################################
deparseQTLformula <-
function(formula, reorderterms=FALSE)
{
  if(is.null(formula)) return(NULL)
  if(reorderterms) {
    if(is.character(formula)) formula <- as.formula(formula)
    factors <- colnames(attr(terms(formula), "factors"))
    wh <- grep("^[Qq][0-9]+$", factors)
    if(length(wh)>0)
      factors[wh] <- paste("Q", sort(as.numeric(substr(factors[wh], 2, nchar(factors[wh])))), sep="")
    wh <- grep(":", factors)
    if(length(wh)>0) {
      temp <- strsplit(factors[wh], ":")
      temp <- sapply(temp, function(a) {
        wh <- grep("^[Qq][0-9]+$", a)
        if(any(wh)) a[wh] <- paste("Q", sort(as.numeric(substr(a[wh], 2, nchar(a[wh])))), sep="")
        paste(a[order(is.na(match(seq(along=a),wh)))], collapse=":")
      })
      factors[wh] <- temp
    }
    return(paste("y ~ ", paste(factors, collapse=" + "), sep=""))
  }


  if(is.character(formula)) return(formula)
  paste(as.character(formula)[c(2,1,3)], collapse=" ")
}

printQTLformulanicely <-
function(formula, header, width, width2, sep=" ")
{
  if(!is.character(formula)) formula <- deparseQTLformula(formula)
  thetext <- unlist(strsplit(formula, " "))

  if(missing(width2)) width2 <- width
  nleft <- width - nchar(header)
  nsep <- nchar(sep)
  if(length(thetext) < 2) cat("", thetext, "\n", sep=sep)
  else {
    z <- paste("", thetext[1], sep=sep, collapse=sep)
    for(j in 2:length(thetext)) {
      if(nchar(z) + nsep + nchar(thetext[j]) > nleft) {
        cat(z, "\n")
        nleft <- width2
        z <- paste(header, thetext[j], sep=sep)
      }
      else {
        z <- paste(z, thetext[j], sep=sep)
      }
    }
    cat(z, "\n")
  }
}

# end of fitqtl.R
byandell/qtl documentation built on May 13, 2019, 9:28 a.m.