Nothing
nblda <- function(x, y, xte = NULL, rhos = 0, beta = 1, type = c("mle", "deseq", "quantile", "tmm"),
prior = NULL, transform = FALSE, alpha = NULL, truephi = NULL, phi.epsilon = 0.05,
target = NULL, normalize.target = TRUE, delta = NULL, return.selected.features = FALSE, ...){
# xte = NULL; rhos = best.rho; beta = tc$beta; type = type; prior = tc$prior;
# transform = FALSE; alpha = alpha; return.selected.features = tc$return.selected.features;
# truephi = tc$truephi; target = tc$target; normalize.target = tc$normalize.target;
# delta = tc$delta; phi.epsilon = tc$phi.epsilon
type <- match.arg(type)
if (!is.null(target)){
if (target < 0){
warning("'target' should be non-negative. It is set to 0.")
target <- 0
}
}
if (is.null(phi.epsilon)){
phi.epsilon <- 0
}
if (is.null(xte)){
xte <- x
warning("Since no xte was provided, testing was performed on training data set.")
}
if (is.null(rhos)){
warning("'rhos' cannot be NULL. Setting rho = 0.")
}
# if (length(rhos) > 1){
# warning("Multiple values of rho is given. Only the first element of rho is used.")
# }
if (transform){
if (is.null(alpha)){
### Train ve test seti buradaki "alpha" değerine göre transform ediliyor.
### Alpha değeri MLE altında size factor hesabı kullanılarak yapılıyor.
### TO DO ####
### Further Research: Farklı size factor kestirimleri altında power transformations'lar kullanılabilir.
alpha <- FindBestTransform(x)
}
if (alpha <= 0 || alpha > 1) stop("alpha must be between 0 and 1")
x <- x^alpha
xte <- xte^alpha
}
if (length(unique(y)) < 2){
stop("Response variable must have at least two classes.")
}
if (is.null(prior)){
prior <- rep(1/length(unique(y)), length(unique(y)))
}
null.out <- NullModel(x, type = type)
ns <- null.out$n
nste <- NullModelTest(null.out, xte)$nste
uniq <- sort(unique(y))
## Dispersion estimates.
if (!is.null(truephi)){
if (length(truephi) >= 2 & (length(truephi) != ncol(ns))){
truephi <- truephi[1]
disperhat <- rep(truephi, ncol(ns))
warning("The length of \"truephi\" should be the same as number of features. Only the first element is used and replicated for all features.")
} else if (length(truephi) == 1){
disperhat <- rep(truephi, ncol(ns))
} else if (length(truephi) >= 2 & (length(truephi) == ncol(ns))){
disperhat <- truephi
}
disperhat.list = list(initial = disperhat, adj = disperhat, cmp = NULL, delta = NULL, target = NULL)
} else {
####### TO DO #######
# tt değerinin 0'a eşit olması durumu ile analiz yapılıyor ama normalde üstteki değere shrink ediliyor. Yu et al çalışmasındaki gibi.
# Bunun nedeni araştırılacak. 0 Değerini almak problem yaratır mı diye incelenecek.
if (is.null(target)){
if (normalize.target){
tt <- getT(t(x), sizeFactors = null.out$rawsizestr, verbose = FALSE, propForSigma = c(0, 1))$target
} else {
tt <- getT(t(x), sizeFactors = rep(1, nrow(x)), verbose = FALSE, propForSigma = c(0, 1))$target
}
} else {
tt <- target
}
### Moment estimation of gene-wise dispersions.
rM = rowMeans(t(x))
rV = rowVars(t(x))
disp = (rV - rM)/rM^2 ## Dispersion estimates using method-of-moments.
# disp0 = numeric()
## Negative dispersions are set to 0.
disp0 <- sapply(disp, function(x){
max(0, x)
})
# disperhat = getAdjustDisp(disp0, shrinkTarget = tt, verbose = FALSE)$adj
disperhat.list = getShrinkedDispersions(obs = disp0, shrinkTarget = tt, delta = delta)
disperhat <- disperhat.list$adj
disperhat <- pmax(rep(0, length(disperhat)), disperhat) ## Negative dispersions are set to 0.
}
phihat <- as.numeric(disperhat)
### Shrink dispersion estimates to 0 using phi.epsilon.
if (any(phihat <= phi.epsilon)){
phihat[phihat <= phi.epsilon] <- 0
disperhat <- disperhat.list$adj <- phihat
}
# Replace Infinity with zero dispersions.
inv.phihat <- (1/phihat)
if (any(inv.phihat == Inf | inv.phihat == -Inf)){
id.inf <- which(inv.phihat == Inf | inv.phihat == -Inf)
inv.phihat[id.inf] <- 0
} else {
id.inf <- NULL
}
save <- lapply(rhos, function(u){
ds <- GetD(ns = ns, x = x, y = y, rho = u, beta = beta)
discriminant <- matrix(NA, nrow = nrow(xte), ncol = length(uniq))
# NBLDA
for (k in 1:length(uniq)){
dstar = ds[k, ]
part2 = 1 + t(nste) * dstar * phihat
part1 = dstar/part2
part3 <- inv.phihat * log(part2)
if (!is.null(id.inf)){
part3.limits <- t(nste[ ,id.inf])*dstar[id.inf]
part3[id.inf, ] <- part3.limits # Replace zero dispersed genes with limit values.
}
discriminant[ ,k] <- rowSums(xte * t(log(part1))) - colSums(part3) + log(prior[k])
}
# Feature selection
selectedFeaturesIdx.ds <- apply(ds, 2, function(x){
all(x != 1)
})
selectedFeaturesIdx.disperhat <- disperhat != 0
selectedFeaturesIdx <- which(selectedFeaturesIdx.ds | selectedFeaturesIdx.disperhat)
selectedFeaturesNames <- colnames(x)[selectedFeaturesIdx]
## If all features are selected, return NULL.
if (length(selectedFeaturesNames) == ncol(x)){
selectedFeaturesIdx <- selectedFeaturesNames <- NULL
}
save.i <- list(ns = ns, nste = nste, ds = ds, discriminant = discriminant,
ytehat = uniq[apply(discriminant, 1, which.max)],
alpha = alpha, rho = u, x = x, y = y, xte = xte, type = type,
prior = prior, dispersions = disperhat.list,
transform = transform, trainParameters = null.out,
selectedFeatures = list(idx = selectedFeaturesIdx, names = selectedFeaturesNames))
save.i
})
# class(save) <- "poicla"
return(save)
}
#' @title Train Model over Different Tuning Parameters
#'
#' @description This function fits the Negative Binomial classifier using various model parameters and finds the best model parameter using the resampling based performance measures.
#'
#' @usage
#' trainNBLDA(x, y, type = c("mle", "deseq", "quantile", "tmm"),
#' tuneLength = 10, metric = c("accuracy", "error"), train.control = nbldaControl(), ...)
#'
#' @param x a n-by-p data frame or matrix. Samples should be in the rows and variables in the columns. Used to train the classifier.
#' @param y a vector of length n. Each element corresponds to a class label of a sample. Integer and/or factor types are allowed.
#' @param type a character string indicating the type of normalization method within the NBLDA model. See details.
#' @param tuneLength a positive integer. This is the total number of levels to be used while tuning the model parameter(s) in grid search.
#' @param metric which criteria should be used while determining the best model parameter? overall accuracy or average number of misclassified samples?
#' @param train.control a list with control parameters to be used in the NBLDA model. See \link{nbldaControl} for details.
#' @param ... further arguments. Deprecated.
#'
#' @return an \code{nblda} object with following slots:
#' \item{input}{an \code{nblda_input} object including the raw count data and response variable. See \code{\linkS4class{nblda_input}} for details.}
#' \item{result}{an \code{nblda_trained} object including the results from cross-validated and final models. See \code{\linkS4class{nblda_trained}} for details.}
#' \item{call}{a call expression.}
#'
#' @details NBLDA is proposed to classify count data from any field, e.g., economics, social sciences, genomics, etc. In RNA-Seq studies, for example, normalization is used to adjust between-sample differences for downstream analysis. \code{type} is used to define normalization method. Available options are "mle", "deseq", "quantile", and "tmm". Since "deseq", "quantile", and "tmm" methods are originally proposed as robust methods to be used in RNA-Sequencing studies, one should carefully define normalization types. In greater detail, "deseq" estimates the size factors by dividing each sample by the geometric means of the transcript counts (Anders and Huber, 2010). "tmm" trims the lower and upper side of the data by log-fold changes to minimize the log-fold changes between the samples and by absolute intensity (Robinson and Oshlack, 2010). "quantile" is quantile normalization approach of Bullard et al. (2010). "mle" (less robust) divides total counts of each sample to the total counts (Witten, 2010). See related papers for mathematical backgrounds.
#'
#' @author Dincer Goksuluk
#'
#' @references
#' Witten, DM (2011). Classification and clustering of sequencing data using a Poisson model. Ann. Appl. Stat. 5(4), 2493--2518. doi:10.1214/11-AOAS493.
#'
#' Dong, K., Zhao, H., Tong, T., & Wan, X. (2016). NBLDA: negative binomial linear discriminant analysis for RNA-Seq data. BMC Bioinformatics, 17(1), 369. http://doi.org/10.1186/s12859-016-1208-1.
#'
#' Anders S. Huber W. (2010). Differential expression analysis for sequence count data. Genome Biology, 11:R106
#'
#' Witten D. et al. (2010) Ultra-high throughput sequencing-based small RNA discovery and discrete statistical biomarker analysis in a collection of cervical tumours and matched controls. BMC Biology, 8:58
#'
#' Robinson MD, Oshlack A (2010). A scaling normalization method for differential expression analysis of RNA-Seq data. Genome Biology, 11:R25, doi:10.1186/gb-2010-11-3-r25
#'
#' @examples
#' set.seed(2128)
#' counts <- generateCountData(n = 20, p = 10, K = 2, param = 1, sdsignal = 0.5, DE = 0.8,
#' allZero.rm = FALSE, tag.samples = TRUE)
#' x <- t(counts$x + 1)
#' y <- counts$y
#' xte <- t(counts$xte + 1)
#' ctrl <- nbldaControl(folds = 2, repeats = 2)
#'
#' fit <- trainNBLDA(x = x, y = y, type = "mle", tuneLength = 10,
#' metric = "accuracy", train.control = ctrl)
#'
#' fit
#' nbldaTrained(fit) # Cross-validated model summary.
#'
#' @name trainNBLDA
#' @rdname trainNBLDA
#'
#' @importFrom methods new
#'
#' @export
trainNBLDA <- function(x, y, type = c("mle", "deseq", "quantile", "tmm"), tuneLength = 10, metric = c("accuracy", "error"),
train.control = nbldaControl(), ...){
# type = "deseq"
# tuneLength = 10
# metric = "accuracy"
# train.control = ctrl2
call <- match.call()
type <- match.arg(type)
metric <- match.arg(metric)
y <- as.factor(y)
tc <- train.control
if (tc$transform){
if (is.null(tc$alpha)){
alpha <- FindBestTransform(x)
}
if (alpha <= 0 || alpha > 1) stop("alpha must be between 0 and 1")
x <- x^alpha
tc$alpha <- alpha
} else {
alpha <- NULL
}
# 1. Define sets of model parameter values to evaluate
if (is.null(tc$rhos)){
ns <- NullModel(x, type = type)$n
uniq <- sort(unique(y))
maxrho <- rep(NA, length(uniq))
for (k in 1:length(uniq)){
a <- colSums(x[y == uniq[k],]) + tc$beta
b <- colSums(ns[y == uniq[k],]) + tc$beta
maxrho[k] <- max(abs(a/b - 1) * sqrt(b), na.rm = TRUE)
}
tc$rhos <- rhos <- seq(0, max(maxrho, na.rm = TRUE) * (2/3), len = tuneLength)
} else {
rhos <- tc$rhos
}
# 2. For each parameter set do
if (is.null(tc$foldIdx)){
# Each element of the returned list is the indices of test samples in this fold.
allFolds <- lapply(1:tc$repeats, function(x){
tmp <- balanced.folds(y, nfolds = tc$folds)
names(tmp) <- paste("Fold.", 1:tc$folds, sep = "")
tmp
})
names(allFolds) <- paste("Repeat.", 1:tc$repeats, sep = "")
tc$foldIdx <- allFolds
repeats <- tc$repeats
folds <- tc$folds
} else {
allFolds <- tc$foldIdx
repeats <- length(allFolds)
folds <- length(allFolds[[1]])
tc$repeats <- repeats
tc$folds <- folds
}
tuningRes <- tuningResults <- NULL
tuneGrid <- expand.grid(rho = rhos, epsilon = tc$phi.epsilon)
# for each fold within each repat, fit a plda model.
# tuningResults <- foreach(iii = 1:nrow(tuneGrid), .combine = "c", .inorder = TRUE, .verbose = FALSE,
# .export = c("tuneGrid", "allFolds")) %dopar% {
#
#
# }
foldRes <- lapply(allFolds, function(ii){
errors <- acc <- nonzeros <- matrix(NA, nrow = length(rhos), ncol = length(ii))
for (jj in 1:length(ii)){
idx <- ii[[jj]]
# out <- plda(x = x[-idx, ], y = y[-idx], xte = x[idx, ], rhos = rhos, beta = beta,
# type = "quantile", prior = prior, transform = FALSE)
plist <- tc
plist$transform <- FALSE
plist <- c(list(x = x[-idx, ], y = y[-idx], xte = x[idx, ], type = type), plist)
out <- do.call("nblda", plist)
# out <- nblda(x = x[-idx, ], y = y[-idx], xte = x[idx, ], rhos = rhos, beta = tc$beta, type = type,
# prior = tc$prior, transform = FALSE, alpha = alpha, truephi = tc$true.dispersions,
# target = tc$target, normalize.target = tc$normalize.target, delta = tc$delta,
# return.selected.features = tc$return.selected.features)
err.i <- nonzero.i <- acc.i <- NULL
for (i in 1:length(out)){
acc.i <- c(acc.i, sum(out[[i]][["ytehat"]] == y[idx]) / length(y[idx]))
err.i <- c(err.i, sum(out[[i]][["ytehat"]] != y[idx]))
nonzero.i <- c(nonzero.i, sum(colSums(out[[i]][["ds"]] != 1) != 0 | out[[i]]$dispersions$adj != 0))
}
acc[ ,jj] <- acc.i
errors[ ,jj] <- err.i
nonzeros[ ,jj] <- nonzero.i
}
list(errors = rowMeans(errors), accuracies = rowMeans(acc), nonzeros = rowMeans(nonzeros))
# list(errors = errors, nonzeros = nonzeros)
})
err.sum <- nonzero.sum <- acc.sum <- 0
for (r in 1:length(foldRes)){
acc.sum <- acc.sum + foldRes[[r]]$accuracies
err.sum <- err.sum + foldRes[[r]]$errors
nonzero.sum <- nonzero.sum + foldRes[[r]]$nonzeros
}
tuningRes <- cbind(rhos, round(acc.sum/repeats, 10), round(err.sum/repeats, 10), round(nonzero.sum/repeats, 10))
colnames(tuningRes) <- c("rho", "accuracy", "errors", "nonzero")
if (metric == "accuracy"){
best.idx <- max(which(tuningRes[, "accuracy"] == max(tuningRes[ ,"accuracy"])))
best.rho <- tuningRes[, "rho"][best.idx]
best.metric <- tuningRes[, "accuracy"][best.idx]
best.nonzero <- tuningRes[, "nonzero"][best.idx]
} else if (metric == "error"){
best.idx <- max(which(tuningRes[, "errors"] == min(tuningRes[ ,"errors"])))
best.rho <- tuningRes[, "rho"][best.idx]
best.metric <- tuningRes[, "errors"][best.idx]
best.nonzero <- tuningRes[, "nonzero"][best.idx]
}
# Set transform to FALSE since data is already transformed above.
final.model <- nblda(x = x, y = y, xte = NULL, rhos = best.rho, beta = tc$beta, type = type, prior = tc$prior,
transform = FALSE, alpha = alpha, return.selected.features = tc$return.selected.features,
truephi = tc$truephi, target = tc$target, normalize.target = tc$normalize.target,
delta = tc$delta, phi.epsilon = tc$phi.epsilon)
final.model[[1]]$transform <- tc$transform
res <- list(tuning.results = tuningRes, best.rho = best.rho, final.model = final.model[[1]])
## Input object
input.list <- new("nblda_input", x = x, y = y) # Input data.
# Trained model object
result.list <- new("nblda_trained",
crossValidated = list(tuning.results = tuningRes, best.rho = best.rho,
best.metric = best.metric, best.nonzero = best.nonzero, metric = metric),
finalModel = final.model[[1]],
control = tc)
## nblda object.
new("nblda", input = input.list,
result = result.list,
call = call)
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.