R/q.opt.R

Defines functions q.opt

Documented in q.opt

##' Estimate the optimal number of genes to construct QuEST model
##'
##' @title Estimate the optimal number of genes to construct QuEST model
##' @param x A data matrix (row: samples, col: genes).
##' @param y A vector of an environment in which the samples were collected.
##' @param newx (optional) A data matrix for validation. Columns of newx and x must be
##' identical (default: NULL).
##' @param newy (optional) A vector for validation (default: NULL).
##' @param range A sequence of numbers of genes to be tested for MAE calculation (default: 5:50).
##' @param method A string to specify the method of regression for calculating R-squared values.
##' "linear" (default), "quadratic" or "cubic" regression model can be specified.
##' @param rep The number of replications for each case set by range (default: 1).
##' @param signif If TRUE, 95% statistical threshold of the MAE value under the null hypothesis
##' will be illustrated on the resulting plot (default: FALSE).
##' @importFrom ggplot2 ggplot aes geom_point stat_function geom_hline geom_text theme element_rect element_blank
##' @importFrom stats rnorm nls
##' @return A sample-MAE curve
##' @examples
##' data(Pinus)
##' train <- q.clean(Pinus$train)
##' target <- Pinus$target
##' q.opt(train, target)
##' @author Takahiko Koizumi
##' @export
q.opt <- function(x, y, newx = NULL, newy = NULL, range = 5:50, method = "linear", rep = 1, signif = FALSE) {
  ngene <- MAE <- NULL

  degree <- switch(method,
                   "linear" = 1,
                   "quadratic" = 2,
                   "cubic" = 3,
                   stop("Select the methods from linear, quadratic, or cubic")
  )

  if(min(range) < 0){
    stop("<range> must not include a negative value")
  }else if(max(range) > ncol(x)){
    stop(paste("<range> must not exceed", ncol(x), sep = " "))
  }
  if(rep <= 0){
    stop("<rep> must be a natural number")
  }

  unit <- rep(NA, length(range) * rep)
  comp <- data.frame(
    ngene = unit,
    MAE = unit
  )

  ## when <signif> = TRUE
  if(is.null(newy)){
    y.null <- y
  }else{
    y.null <- newy
  }
  mae.null <- rep(NA, 1000)
  for(i in 1:1000){
    y.perm <- sample(y.null, length(y.null))
    mae.null[i] <- mean(abs(y.null - y.perm))
  }
  mae.null <- sort(mae.null)[25]

  ## sort genes in descending order of R2
  x.qs <- q.sort(x, y, method = method)
  x.qs <- x.qs[, 1:max(range)]
  message("Data prepared")

  ## process
  proc <- seq(20, 100, by = 20)
  perc <- (1:(length(range) * rep)) / (length(range) * rep) * 100
  p <- rep(NA, length(proc))
  for(i in 1:length(proc)){
    p[i] <- which.min(abs(perc - proc[i]))
  }

  ## calculate MAE using variable number of genes
  counter <- 1
  for(i in 1:length(range)){
    x.q.test <- x.q <- x.qs[, 1:range[i]]

    for(j in 1:rep){
      ## run QuEST
      if(is.null(newx)){

        ## add noise
        for(k in 1:range[i]){
          ext <- x.q[, k]
          x.q.test[, k] <- rnorm(length(ext), ext, ext/2)
        }

        q <- quest(x.q, y, newx = x.q.test, method = method)
        comp[counter, ] <- c(range[i], mean(abs(y - q)))
      }else{
        newx.q <- newx[, colnames(x.q)]

        ## add noise
        for(k in 1:range[i]){
          ext <- newx.q[, k]
          newx.q[, k] <- rnorm(length(ext), ext, ext/2)
        }

        q <- quest(x.q, y, newx = newx.q, method = method)
        comp[counter, ] <- c(range[i], mean(abs(newy - q)))
      }

      if(any(p == counter)){
        prog <- 100 * counter / length(unit)
        message(paste(proc[which.min(abs(proc - prog))], "% completed", sep = ""))
      }

      counter <- counter + 1
    }
  }

  ## estimate the minimum MAE
  fit <- nls(MAE ~ a + b * exp(- c * ngene), start = c (a = 1, b = 1, c = 0.1), data = comp)
  a <- fit$m$getPars()[[1]]
  b <- fit$m$getPars()[[2]]
  c <- fit$m$getPars()[[3]]

  upper.lim <- max(comp$MAE, mae.null, a + b, a)

  if(signif){
    ggplot(comp, aes(x = ngene, y = MAE)) +
      geom_point(aes(colour = "MAE", fill = "MAE"), shape = 21) +
      stat_function(fun = function(x) a + b * exp(- c * x)) +
      geom_hline(aes(yintercept = a), colour = "red", linetype = "dotted") +
      geom_text(aes(0, a, label = format(round(a, 3), nsmall = 3), hjust = 0.2, vjust = - 1)) +
      geom_hline(aes(yintercept = mae.null), colour = "gray", linetype = "dashed") +
      geom_text(aes(0, mae.null, label = format(round(mae.null, 3), nsmall = 3), hjust = 0.2, vjust = - 1)) +
      ylim(0, upper.lim * 1.1) +
      theme(panel.background = element_rect(fill = "transparent", colour = "black"),
            panel.grid = element_blank(),
            strip.background = element_blank(),
            legend.position = "none")
  }else{
    ggplot(comp, aes(x = ngene, y = MAE)) +
      geom_point(aes(colour = "MAE", fill = "MAE"), shape = 21) +
      stat_function(fun = function(x) a + b * exp(- c * x)) +
      geom_hline(aes(yintercept = a), colour = "red", linetype = "dotted") +
      geom_text(aes(0, a, label = format(round(a, 3), nsmall = 3), hjust = 0.2, vjust = - 1)) +
      theme(panel.background = element_rect(fill = "transparent", colour = "black"),
            panel.grid = element_blank(),
            strip.background = element_blank(),
            legend.position = "none")
  }
}
takakoizumi/QuEST documentation built on Dec. 31, 2021, 12:06 a.m.