# R/support_functions.R
#
# Defines functions: get.allele.effects.from.ranef, get.allele.effects.from.fixef,
# get.p.value, get.f.stat.p.val, replicates.eigen, process_eigen_decomposition,
# run.imputation, straineff.mapping.matrix, calc.mi.LRT, calc.LRT.mean.coef,
# predict.lmmbygls, ci.mean, ci.median, get.gev.padjust, get.gev.thresholds,
# grab.locus.from.scan, rankZ, rint
#
# Documented in: get.gev.thresholds, grab.locus.from.scan, rint

#' Returns the rank-based inverse normal transformation
#'
#' This function takes a phenotype vector and returns the rank-based inverse normal transformation.
#'
#' @param phenotype A vector of phenotype values for which the rank-based inverse normal transformation is output.
#' @param prop DEFAULT: 0.5. This allows Inf to not be returned for the maximum of phenotype.
#' @export
#' @examples rint()
rint <- function(phenotype, 
                 prop=0.5){
  # Normal scores from ranks: subtracting prop keeps the extreme
  # quantiles strictly inside (0, 1), so qnorm() never returns +/-Inf
  n.obs <- sum(!is.na(phenotype))
  shifted.ranks <- rank(phenotype, na.last="keep") - prop
  return(qnorm(shifted.ranks/n.obs))
}

#' Rank-based inverse normal transform (Dan Gatti's version of rint())
#'
#' Maps values to normal scores via rank / (n + 1) followed by qnorm(),
#' where n is the number of non-missing values. NAs are kept in place.
#'
#' @param x A numeric vector; ties receive the average rank.
#' @return A numeric vector of normal scores, same length as x.
#' @export
rankZ <- function(x) {
  # `<-` assignment per R convention (was `=`); logic unchanged
  p <- rank(x, na.last = "keep", ties.method = "average") / (sum(!is.na(x)) + 1)
  return(qnorm(p))
}

#' Pulls loci from scan objects based on specified criteria
#' 
#' This function parses a scan object and returns loci, for instance, the locus with the minimum
#' p-value.
#' 
#' @param scan.object A scan.h2lmm() object (ROP or multiple imputations). If multiple imputations, median and confidence interval 
#' on median are plotted.
#' @param use.lod DEFAULT: FALSE. Specifies whether loci should be selected based on LOD scores or p-values.
#' @param chr DEFAULT: "all". The portion of the scan that loci are being pulled from.
#' @param criterion DEFAULT: "min". The criterion by which loci are selected. "min" selects the locus with the minimum statistical score.
#' "max" selects the locus with the maximum statistical score.
#' @param return.value DEFAULT: "marker". If "marker", returns the marker name. If "position", returns the position in both cM and Mb.
#' If "chr", returns the chr of the peak. If "index", returns the index of the vector.
#' @export
#' @examples grab.locus.from.scan()
grab.locus.from.scan <- function(scan.object, 
                                 use.lod=FALSE, 
                                 chr="all", 
                                 criterion=c("min", "max"), 
                                 return.value=c("marker", "position", "chr", "index")){
  return.value <- return.value[1]
  criterion <- criterion[1]
  # Statistical outcome used to rank loci
  outcome <- if(use.lod){ scan.object$LOD } else{ scan.object$p.value }
  # Restrict to the requested chromosome(s); identical() avoids a
  # length-warning when chr is a vector of chromosome names
  keep <- if(identical(chr, "all")){ rep(TRUE, length(outcome)) } else{ scan.object$chr %in% chr }
  # Locate the extreme locus once instead of per return type; unmatched
  # criterion/return.value now produce informative errors rather than the
  # opaque "object 'result' not found"
  index <- switch(criterion,
                  min = which.min(outcome[keep]),
                  max = which.max(outcome[keep]),
                  stop("criterion must be 'min' or 'max'"))
  result <- switch(return.value,
                   marker = scan.object$loci[keep][index],
                   position = c(cM = scan.object$pos$cM[keep][index],
                                Mb = scan.object$pos$Mb[keep][index]),
                   chr = scan.object$chr[keep][index],
                   index = index,
                   stop("return.value must be one of 'marker', 'position', 'chr', 'index'"))
  return(result)
}

#' Returns a significance threshold based on fitting max LODs or max -log10p to a generalized extreme value (GEV) distribution
#'
#' This function takes an scan.h2lmm() object, and returns a specified number of outcome samples, either permutations or
#' from the null model of no locus effect.
#'
#' @param threshold.scans Output object from run.threshold.scans().
#' @param use.lod DEFAULT: FALSE. "TRUE" specifies LOD scores. "FALSE" specifies p-values.
#' @param percentile DEFAULT: 0.95. The desired alpha level (false positive probability) from the GEV distribution.
#' @param type DEFAULT: "min". If "min", the minimum statistics are expected, which is usually desired with p-values - though they
#' are transformed to maxima using -log10(). If "max", the maximum statistics are expected. This is mainly expected to be used
#' with mediation scans for direct mediations (p-value drop).
#' @export
#' @examples get.gev.thresholds()
get.gev.thresholds <- function(threshold.scans, 
                               use.lod=FALSE, 
                               percentile=0.95, 
                               type=c("min", "max")){
  type <- type[1]
  # Extreme statistics from the null scans: LODs directly, or -log10
  # transformed p-values (which may be stored per-type in a list)
  if(use.lod){
    extreme.values <- threshold.scans$max.statistics$LOD
  }
  else{
    null.p <- threshold.scans$max.statistics$p.value
    if(is.list(null.p)){ null.p <- null.p[[type]] }
    extreme.values <- -log10(null.p)
  }
  # Fit a generalized extreme value distribution and take its quantile;
  # "max"-type statistics use the lower tail of the fitted GEV
  gev.par <- as.numeric(evir::gev(extreme.values)$par.est)
  target.p <- ifelse(type == "min", percentile, 1 - percentile)
  return(evir::qgev(p=target.p, xi=gev.par[1], sigma=gev.par[2], mu=gev.par[3]))
}

#' @export
get.gev.padjust <- function(p.value, 
                            threshold.scans, 
                            use.lod = FALSE,
                            type=c("min", "max"),
                            right.side=TRUE){
  type <- type[1]
  # Extreme statistics from the null scans: LODs directly, or -log10
  # transformed p-values (which may be stored per-type in a list)
  if(use.lod){
    extreme.values <- threshold.scans$max.statistics$LOD
  }
  else{
    null.p <- threshold.scans$max.statistics$p.value
    if(is.list(null.p)){ null.p <- null.p[[type]] }
    extreme.values <- -log10(null.p)
  }
  # Evaluate the observed -log10 p-value against the fitted GEV
  gev.par <- as.numeric(evir::gev(extreme.values)$par.est)
  adj.p <- evir::pgev(q=-log10(p.value), xi=gev.par[1], sigma=gev.par[2], mu=gev.par[3])
  ## Rough handling of extreme adjusted p-values: clamp away from 0 and 1
  if(is.nan(adj.p) | adj.p == 1){
    adj.p <- 1 - .Machine$double.eps
  }
  else if(adj.p == 0){
    adj.p <- .Machine$double.eps
  }
  # pgev gives the left-tail probability; flip for a right-sided p-value
  if(right.side){
    adj.p <- 1 - adj.p
  }
  return(adj.p)
}


#' @export
ci.median <- function(x, 
                      conf=0.95){ # from R/asbio
  # Distribution-free CI for the median from binomial order statistics
  n <- nrow(as.matrix(x))
  lower.idx <- qbinom((1 - conf)/2, n, 0.5)
  if(lower.idx == 0){ stop("CI not calculable") }
  upper.idx <- n - lower.idx + 1
  if(lower.idx >= upper.idx){ stop("CI not calculable") }
  x.sorted <- sort(x)
  return(c(lower = x.sorted[lower.idx], upper = x.sorted[upper.idx]))
}

#' Confidence interval for the mean of a vector
#'
#' Student-t confidence interval for the mean, based on the non-missing
#' observations.
#'
#' @param x A numeric vector.
#' @param alpha DEFAULT: 0.05. Significance level; 0.05 gives a 95% CI.
#' @param na.rm DEFAULT: TRUE. Remove NAs when computing the standard deviation.
#' @return c(lower, upper), or c(NA, NA) when fewer than two observations.
#' @export
ci.mean <- function(x, 
                    alpha=0.05, 
                    na.rm=TRUE){
  n <- sum(!is.na(x))
  if(n > 1){ # need more than one observation for confint
    se <- sd(x, na.rm=na.rm)/sqrt(n)
    ## Bug fix: the previous qt(1 - alpha/2, ..., lower.tail=FALSE) returned
    ## a *negative* critical value, so the interval bounds came out
    ## swapped/negated. The upper-tail critical value is qt(1 - alpha/2).
    er <- qt(1 - alpha/2, df=n-1)*se
    m <- mean(x, na.rm=TRUE)
    ci <- c(m - er, m + er)
  }
  else{
    ci <- rep(NA, 2)
  }
  return(ci)
}

## Simulate phenotype values for augmented (pseudo) observations under the
## null model of a previously fitted lmmbygls object.
##
## NOTE(review): relies on names not defined in this function's scope:
## `K` (covariance/kinship matrix), `data` (covariate data frame), and
## `rmvnorm` (presumably mvtnorm::rmvnorm) -- all must exist in the calling
## environment; confirm against callers.
##
## Args:
##   fit0.no.augment: null-model fit; sigma2.mle, tau2.mle, x, and
##     coefficients are read from it.
##   original.n: number of real observations.
##   augment.n: number of augmented observations to simulate.
##   covariates: optional character vector of covariate names looked up in `data`.
##   weights: optional observation weights; augmented residual noise is
##     scaled by sqrt of the augmented entries' weights.
##
## Returns: numeric vector of length augment.n of simulated null phenotypes.
predict.lmmbygls <- function(fit0.no.augment,
                             original.n, 
                             augment.n, 
                             covariates,
                             weights){
  # Residual noise for the augmented observations
  e <- rnorm(augment.n, 0, sd=sqrt(fit0.no.augment$sigma2.mle))
  if(!is.null(weights)){
    e <- sqrt(weights[-(1:original.n)]) * e
  }
  # Random (polygenic) effect drawn jointly over all observations, keeping
  # only the augmented portion; K comes from the calling environment
  u <- rmvnorm(n=1, mean=rep(0, original.n + augment.n), sigma=fit0.no.augment$tau2.mle*K, method="chol")[-(1:original.n)]
  # Design rows for the augmented observations: intercept plus, per
  # covariate, zeros (factor contrasts at baseline) or the covariate mean
  covariate.matrix <- rep(1, augment.n)
  if(!is.null(covariates)){
    for(i in 1:length(covariates)){
      if(is.factor(data[,covariates[i]])){
        covariate.matrix <- cbind(covariate.matrix, matrix(0, nrow=augment.n, ncol=nlevels(data[,covariates[i]])-1))
      }
      if(is.numeric(data[,covariates[i]])){
        covariate.matrix <- cbind(covariate.matrix, rep(mean(data[,covariates[i]]), augment.n))
      }
    }
  }
  # Fixed-effect mean, evaluated for the augmented observations only
  null.mean <- (rbind(fit0.no.augment$x, covariate.matrix) %*% fit0.no.augment$coefficients)[-(1:original.n),]
  null.y.hat <- null.mean + u + e
  return(null.y.hat)
}

#### Not in use currently #####
## Likelihood-ratio statistics across multiple imputations, evaluated at the
## across-imputation mean coefficients and variance components.
##
## NOTE(review): takes no arguments -- imp.coef, imp.varcomps, imp.X, fit1,
## fit0, weights, and X.list are all resolved from the enclosing/calling
## environment; confirm the intended scoping before reuse. Also note the
## weighted loop iterates over X.list but indexes imp.X -- verify intended.
calc.LRT.mean.coef <- function(){
  # Pool fixed effects and variance components across imputations
  mean.coef <- colMeans(imp.coef, na.rm=TRUE)
  mean.varcomps <- colMeans(imp.varcomps)
  lambda <- sum(mean.varcomps)
  sample.size <- nrow(imp.X[[1]])
  H.inv <- t(fit1$M)%*%fit1$M
  imp.constr.logLik <- rep(0, length(imp.X))
  y <- fit1$y
  if(is.null(weights)){
    for(i in 1:length(imp.X)){
      #this.X <- imp.X[[i]][,-ncol(imp.X[[1]])]
      this.X <- imp.X[[i]]
      # Gaussian log-likelihood at the pooled coefficients; NaN coefficients
      # (from rank-deficient imputation fits) are dropped from the fit
      imp.constr.logLik[i] <- -0.5*sample.size*(log(2*pi) + log(lambda)) - 0.5*(1/lambda)*t(y - this.X[, !is.nan(mean.coef)]%*%mean.coef[!is.nan(mean.coef)]) %*% H.inv %*% (y - this.X[, !is.nan(mean.coef)]%*%mean.coef[!is.nan(mean.coef)]) - 0.5*fit1$logDetV
    }
  }
  if(!is.null(weights)){
    for(i in 1:length(X.list)){
      this.X <- imp.X[[i]][,-ncol(imp.X[[i]])]
      imp.constr.logLik[i] <- -0.5*sample.size*log(2*pi) - 0.5*t(y - this.X[, !is.nan(mean.coef)]%*%mean.coef[!is.nan(mean.coef)]) %*% H.inv %*% (y - this.X[, !is.nan(mean.coef)]%*%mean.coef[!is.nan(mean.coef)]) - 0.5*fit1$logDetV
    }
  }
  # LRT of each constrained imputation likelihood against the null model
  imp.constr.LRT <- 2*(imp.constr.logLik - fit0$logLik)
  return(imp.constr.LRT)
}

## Combine per-imputation likelihood-ratio statistics into a single
## multiple-imputation p-value (Meng & Rubin-style LRT pooling).
##
## NOTE(review): takes no arguments -- imp.constr.LRT, imp.LRT, df1, and df0
## come from the enclosing/calling environment; confirm scoping before reuse.
calc.mi.LRT <- function(){
  num.imp <- length(imp.constr.LRT)
  # k: difference in model dimension between alternative and null
  k <- df1 - df0
  
  # Average relative increase in variance due to the imputation uncertainty
  rL <- (num.imp + 1)*(mean(imp.LRT) - mean(imp.constr.LRT))/(k*(num.imp - 1))
  
  # Pooled test statistic, referenced to an F distribution
  DL <- mean(imp.constr.LRT)/k*(1 + rL)
  
  # Denominator degrees of freedom, with a small-v correction branch
  v <- k*(num.imp - 1)
  w.rL <- ifelse(v > 4, 4 + (v-4)*(1 + (1 - 2/v)*(1/rL))^2, (1/2)*v*(1 + 1/k)*(1 + (1/rL))^2)
  p.val <- pf(DL, df1=k, df2=w.rL, lower.tail=FALSE)
  return(p.val)
}
###########################

#' Diplotype-to-additive-dosage mapping matrix
#'
#' Builds the matrix mapping the M*(M+1)/2 diplotype states of an M-founder
#' population to additive founder-allele dosages: homozygote columns first
#' (dosage 2 for one founder), then heterozygote columns in (i, j) order with
#' dosage 1 for each of the two founders.
#'
#' @param M DEFAULT: 8. Number of founder strains; M = 1 is now handled.
#' @return A numeric M*(M+1)/2 by M matrix of allele dosages (rows are
#' diplotypes, columns are founders); every row sums to 2.
#' @export straineff.mapping.matrix
straineff.mapping.matrix <- function(M=8){
  # Was `T <- ...`: renamed so the TRUE shortcut is not shadowed
  n.diplotypes <- M*(M+1)/2
  mapping <- matrix(0, nrow=M, ncol=n.diplotypes)
  idx <- 1
  # Homozygote states: dosage 2 for a single founder
  for (i in seq_len(M)){
    mapping[i, idx] <- 2
    idx <- idx + 1
  }
  # Heterozygote states: dosage 1 for each of the two founders.
  # Guarded so M = 1 no longer indexes out of bounds via 2:1.
  if (M > 1){
    for (i in 2:M){
      for (j in seq_len(i - 1)){
        mapping[i, idx] <- mapping[i, idx] + 1
        mapping[j, idx] <- mapping[j, idx] + 1
        idx <- idx + 1
      }
    }
  }
  return(t(mapping))
}

## Draw a single diplotype imputation (one multinomial draw per genome) from
## diplotype probabilities, mapped onto phenotyped individuals.
##
## Args:
##   diplotype.probs: matrix of diplotype probabilities, one row per genome
##     (rownames are genome IDs), one column per diplotype state.
##   impute.map: data.frame whose first column is the phenotype ID and whose
##     second column is the genotype/genome ID, linking phenotyped
##     individuals to genomes (possibly many-to-one, e.g. replicates).
##
## Returns: 0/1 incidence matrix with one row per row of impute.map
## (rownames set to phenotype IDs) and one column per diplotype.
run.imputation <- function(diplotype.probs, 
                           impute.map){
  # If phenotype and genotype IDs differ anywhere, replicate structure is
  # present: draw once per unique genome, then expand to all individuals
  if(!all(as.character(impute.map[,1]) == as.character(impute.map[,2]))){
    pheno.id <- names(impute.map)[1]
    geno.id <- names(impute.map)[2]
    # Avoid a merge-name collision when both columns share the same name
    if(pheno.id == geno.id){ geno.id <- paste0(geno.id, "_2"); names(impute.map)[2] <- geno.id }
    # Keep the original row order so it can be restored after merge()
    diplotype.probs <- data.frame(1:nrow(diplotype.probs), rownames(diplotype.probs), diplotype.probs, row.names=NULL, stringsAsFactors=FALSE)
    names(diplotype.probs)[1:2] <- c("original.order", geno.id)
    diplotype.probs <- merge(x=diplotype.probs, y=impute.map, by=geno.id)
    diplotype.probs <- diplotype.probs[order(diplotype.probs$original.order),]
    # One probability row per unique genome, ID columns stripped
    imputable.diplotype.probs <- as.matrix(diplotype.probs[!duplicated(diplotype.probs[,geno.id]),][,!(names(diplotype.probs) %in% c("original.order", names(impute.map)))])
    rownames(imputable.diplotype.probs) <- diplotype.probs[,geno.id][!duplicated(diplotype.probs[,geno.id])]
    # Single multinomial draw per genome
    imputation <- t(apply(imputable.diplotype.probs, 1, function(x) rmultinom(1, 1, x)))
    # Expand the per-genome draw to every phenotyped individual
    full.imputation <- imputation[as.character(impute.map[, geno.id]),]
    rownames(full.imputation) <- impute.map[, pheno.id]
  }
  else{
    # IDs match one-to-one: draw directly per individual
    full.imputation <- t(apply(diplotype.probs, 1, function(x) rmultinom(1, 1, x)))
    rownames(full.imputation) <- impute.map[, 1]
  }
  return(full.imputation)
}
  
process_eigen_decomposition <- function(eigen.decomp, 
                                        tol=1e-6){
  # from Robert, who took it from MASS::mvrnorm()
  # Positive semi-definiteness check, scaled to the leading eigenvalue
  ev <- eigen.decomp$values
  if(!all(ev >= -tol * abs(ev[1L]))){
    stop("K is not positive definite")
  }
  # Clamp small negative eigenvalues (numerical noise) to zero, and report
  # when any of them exceeded the absolute tolerance
  if(any(ev < 0)){
    if(any(ev < -tol)){
      message("Zeroing negative eigenvalues: smallest eigenvalue was ", min(ev), "\n")
    }
    eigen.decomp$values <- pmax(ev, 0)
  }
  return(eigen.decomp)
}

replicates.eigen <- function(Z, K) {
  # Eigen-decompose K Z'Z, then orthonormalize the transformed eigenvectors
  decomp <- eigen(K %*% crossprod(Z), symmetric=FALSE)
  orthonormal.vectors <- qr.Q(qr(Z %*% decomp$vectors))
  return(list(values=decomp$values,
              vectors=orthonormal.vectors))
}

#' @export
get.f.stat.p.val <- function(qr.alt, 
                             qr.null, 
                             y){
  # Partial F-test of the alternative design against the nested null design,
  # computed from QR decompositions of the two design matrices
  rss.null <- sum(qr.resid(qr.null, y)^2)
  rss.alt <- sum(qr.resid(qr.alt, y)^2)
  df.numerator <- qr.alt$rank - qr.null$rank
  df.denominator <- length(y) - qr.alt$rank
  f.stat <- ((rss.null - rss.alt)/df.numerator)/(rss.alt/df.denominator)
  return(pf(q=f.stat, df1=df.numerator, df2=df.denominator, lower.tail=FALSE))
}

#' @export
get.p.value <- function(fit0, 
                        fit1, 
                        method=c("LRT", "ANOVA", "LRT.random.locus"),
                        round.tol=10){
  method <- method[1]
  if(method == "LRT"){
    # Chi-squared LRT of fit1 (alternative) against nested fit0 (null)
    lrt.stat <- -2*(fit0$logLik - fit1$logLik)
    p.value <- pchisq(q=lrt.stat, df=fit1$rank-fit0$rank, lower.tail=FALSE)
  }
  else if(method == "ANOVA"){
    # Partial F-test from the stored QR decompositions
    p.value <- get.f.stat.p.val(qr.alt=fit1$qr, qr.null=fit0$qr, y=fit0$y)
  }
  else if(method == "LRT.random.locus"){
    # REML LRT for a single variance component: 50:50 mixture of a point
    # mass at zero and chi-squared(1). Rounding guards against tiny
    # negative statistics from numerical noise.
    chi.sq <- -2*(round(fit0$REML.logLik, round.tol) - round(fit1$REML.logLik, round.tol))
    p.value <- ifelse(chi.sq == 0, 1, 0.5*pchisq(q=chi.sq, df=1, lower.tail=FALSE))
  }
  return(p.value)
}

#' @export get.allele.effects.from.fixef
get.allele.effects.from.fixef <- function(fit, 
                                          founders, 
                                          allele.in.intercept, 
                                          center=TRUE, 
                                          scale=FALSE){
  # Founder effects on the phenotype scale: each contrast coefficient plus
  # the intercept. setNames() also repairs the NA name produced when the
  # intercept founder has no coefficient of its own.
  intercept <- fit$coefficients["(Intercept)"]
  effects <- setNames(fit$coefficients[founders] + intercept, founders)
  # The founder absorbed into the intercept has no contrast; its effect is
  # the intercept itself
  effects[allele.in.intercept] <- intercept
  return(as.vector(scale(effects, center=center, scale=scale)))
}

#' Founder allele effect predictions from a random-effect locus fit
#'
#' Computes BLUP-style predictions of the founder allele effects from a
#' fitted model with a random locus term. When the locus explains no
#' variation (locus.h2 == 0), the effects are all zero.
#'
#' @param fit Model fit; reads locus.h2, h2, sigma2.reml, x, z, y, weights.
#' @param founders DEFAULT: NULL. Founder names; taken from colnames(fit$z)
#' when NULL.
#' @param center DEFAULT: TRUE. Center the returned effects.
#' @param scale DEFAULT: FALSE. Scale the returned effects.
#' @export get.allele.effects.from.ranef
get.allele.effects.from.ranef <- function(fit, 
                                          founders=NULL, 
                                          center=TRUE, 
                                          scale=FALSE){
  ## Big time savings potentially: no locus variance => all effects are zero
  if(fit$locus.h2 == 0){
    ## Generalized from a hard-coded rep(0, 8): size the zero vector from
    ## founders so non-8-founder populations get a correctly sized result;
    ## 8 is kept as a legacy fallback when founders cannot be determined
    if(is.null(founders)){ founders <- colnames(fit$z) }
    n.alleles <- if(is.null(founders)){ 8 } else{ length(founders) }
    effects <- rep(0, n.alleles)
  }
  else{
    X <- fit$x
    Z <- fit$z
    if(is.null(founders)){ founders <- colnames(Z) }
    ZZt <- tcrossprod(Z)
    weights <- fit$weights
    if(is.null(weights)){ weights <- rep(1, nrow(Z)) }
    sigma2 <- fit$sigma2.reml
    ## Locus variance component implied by the estimated h2 partition
    tau2 <- fit$locus.h2*(sigma2/((1 - fit$locus.h2)*fit$h2 + fit$locus.h2))
    Sigma <- ZZt*tau2 + diag(1/weights)*sigma2
    inv.Sigma <- chol2inv(chol(Sigma))
    ## BLUP: tau2 * Z' Sigma^-1 (y - X beta.gls), with beta.gls the GLS
    ## estimate of the fixed effects
    effects <- as.vector(crossprod(Z*tau2, inv.Sigma) %*% (fit$y - X %*% chol2inv(chol(crossprod(X, crossprod(inv.Sigma, X)))) %*% crossprod(X, crossprod(inv.Sigma, fit$y))))
  }
  names(effects) <- founders

  return(as.vector(scale(effects, center=center, scale=scale)))
}

# get.blup.from.ranef.fit <- function(fit, center=TRUE, scale=FALSE){
#   
#   ## Big time savings potentially
#   if(fit$h2 == 0 | is.null(fit$K)){
#     blups <- rep(0, length(fit$y))
#   }
#   else{
#     X <- fit$x
#     K <- fit$K
#     weights <- fit$weights
#     if(is.null(weights)){ weights <- rep(1, nrow(K)) }
#     sigma2 <- fit$sigma2.reml
#     kinship.h2 <- (1 - fit$locus.h2)*fit$h2
#     total.variance <- sigma2/(1 - fit$locus.h2 - kinship.h2)
#     tau2 <- kinship.h2 * total.variance
#     Sigma <- K*tau2 + diag(1/weights)*sigma2
#     inv.Sigma <- chol2inv(chol(Sigma))
#     blups <- as.vector(crossprod(K*tau2, inv.Sigma) %*% (fit$y - X %*% chol2inv(chol(crossprod(X, cross.prod(inv.Sigma, X)))) %*% crossprod(X, crossprod(inv.Sigma, fit$y))))
#   }
#   names(blups) <- names(y)
#   
#   return(as.vector(scale(blups, center=center, scale=scale)))
# }
# gkeele/miqtl documentation built on June 13, 2022, 4:20 p.m.