##' Estimate the optimal number of genes to construct QuEST model
##'
##' @title Estimate the optimal number of genes to construct QuEST model
##' @param x A data matrix (row: samples, col: genes).
##' @param y A vector of an environment in which the samples were collected.
##' @param newx (optional) A data matrix for validation. Columns of newx and x must be
##' identical (default: NULL).
##' @param newy (optional) A vector for validation (default: NULL).
##' @param range A sequence of numbers of genes to be tested for MAE calculation (default: 5:50).
##' @param method A string to specify the method of regression for calculating R-squared values.
##' "linear" (default), "quadratic" or "cubic" regression model can be specified.
##' @param rep The number of replications for each case set by range (default: 1).
##' @param signif If TRUE, 95% statistical threshold of the MAE value under the null hypothesis
##' will be illustrated on the resulting plot (default: FALSE).
##' @importFrom ggplot2 ggplot aes geom_point stat_function geom_hline geom_text theme element_rect element_blank
##' @importFrom stats rnorm nls
##' @return A sample-MAE curve
##' @examples
##' data(Pinus)
##' train <- q.clean(Pinus$train)
##' target <- Pinus$target
##' q.opt(train, target)
##' @author Takahiko Koizumi
##' @export
q.opt <- function(x, y, newx = NULL, newy = NULL, range = 5:50, method = "linear", rep = 1, signif = FALSE) {
ngene <- MAE <- NULL
degree <- switch(method,
"linear" = 1,
"quadratic" = 2,
"cubic" = 3,
stop("Select the methods from linear, quadratic, or cubic")
)
if(min(range) < 0){
stop("<range> must not include a negative value")
}else if(max(range) > ncol(x)){
stop(paste("<range> must not exceed", ncol(x), sep = " "))
}
if(rep <= 0){
stop("<rep> must be a natural number")
}
unit <- rep(NA, length(range) * rep)
comp <- data.frame(
ngene = unit,
MAE = unit
)
## when <signif> = TRUE
if(is.null(newy)){
y.null <- y
}else{
y.null <- newy
}
mae.null <- rep(NA, 1000)
for(i in 1:1000){
y.perm <- sample(y.null, length(y.null))
mae.null[i] <- mean(abs(y.null - y.perm))
}
mae.null <- sort(mae.null)[25]
## sort genes in descending order of R2
x.qs <- q.sort(x, y, method = method)
x.qs <- x.qs[, 1:max(range)]
message("Data prepared")
## process
proc <- seq(20, 100, by = 20)
perc <- (1:(length(range) * rep)) / (length(range) * rep) * 100
p <- rep(NA, length(proc))
for(i in 1:length(proc)){
p[i] <- which.min(abs(perc - proc[i]))
}
## calculate MAE using variable number of genes
counter <- 1
for(i in 1:length(range)){
x.q.test <- x.q <- x.qs[, 1:range[i]]
for(j in 1:rep){
## run QuEST
if(is.null(newx)){
## add noise
for(k in 1:range[i]){
ext <- x.q[, k]
x.q.test[, k] <- rnorm(length(ext), ext, ext/2)
}
q <- quest(x.q, y, newx = x.q.test, method = method)
comp[counter, ] <- c(range[i], mean(abs(y - q)))
}else{
newx.q <- newx[, colnames(x.q)]
## add noise
for(k in 1:range[i]){
ext <- newx.q[, k]
newx.q[, k] <- rnorm(length(ext), ext, ext/2)
}
q <- quest(x.q, y, newx = newx.q, method = method)
comp[counter, ] <- c(range[i], mean(abs(newy - q)))
}
if(any(p == counter)){
prog <- 100 * counter / length(unit)
message(paste(proc[which.min(abs(proc - prog))], "% completed", sep = ""))
}
counter <- counter + 1
}
}
## estimate the minimum MAE
fit <- nls(MAE ~ a + b * exp(- c * ngene), start = c (a = 1, b = 1, c = 0.1), data = comp)
a <- fit$m$getPars()[[1]]
b <- fit$m$getPars()[[2]]
c <- fit$m$getPars()[[3]]
upper.lim <- max(comp$MAE, mae.null, a + b, a)
if(signif){
ggplot(comp, aes(x = ngene, y = MAE)) +
geom_point(aes(colour = "MAE", fill = "MAE"), shape = 21) +
stat_function(fun = function(x) a + b * exp(- c * x)) +
geom_hline(aes(yintercept = a), colour = "red", linetype = "dotted") +
geom_text(aes(0, a, label = format(round(a, 3), nsmall = 3), hjust = 0.2, vjust = - 1)) +
geom_hline(aes(yintercept = mae.null), colour = "gray", linetype = "dashed") +
geom_text(aes(0, mae.null, label = format(round(mae.null, 3), nsmall = 3), hjust = 0.2, vjust = - 1)) +
ylim(0, upper.lim * 1.1) +
theme(panel.background = element_rect(fill = "transparent", colour = "black"),
panel.grid = element_blank(),
strip.background = element_blank(),
legend.position = "none")
}else{
ggplot(comp, aes(x = ngene, y = MAE)) +
geom_point(aes(colour = "MAE", fill = "MAE"), shape = 21) +
stat_function(fun = function(x) a + b * exp(- c * x)) +
geom_hline(aes(yintercept = a), colour = "red", linetype = "dotted") +
geom_text(aes(0, a, label = format(round(a, 3), nsmall = 3), hjust = 0.2, vjust = - 1)) +
theme(panel.background = element_rect(fill = "transparent", colour = "black"),
panel.grid = element_blank(),
strip.background = element_blank(),
legend.position = "none")
}
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.