Nothing
#' @title Generate Count Data
#'
#' @description This function can be used to generate counts, e.g., RNA-Sequencing data, for both the classification and clustering purposes.
#'
#' @usage
#' generateCountData(n, p, K, param, sdsignal = 1, DE = 0.3, allZero.rm = TRUE,
#' tag.samples = FALSE)
#'
#' @param n number of samples.
#' @param p number of variables/features.
#' @param K number of classes.
#' @param param overdispersion parameter. This parameter is matched with the argument \code{size} in the \code{\link[stats]{rnbinom}} function. Hence, the Negative Binomial distribution approximates to the Poisson distribution as \code{param} increases.
#' @param sdsignal a nonzero numeric value. As \code{sdsignal} increases, the observed counts greatly differs among K classes.
#' @param DE a numeric value within the interval [0, 1]. This is the proportion of total number of variables that is significantly different among K classes. The remaining part is assumed to be having no contribution to the discrimination function.
#' @param allZero.rm a logical. If TRUE, the columns having all zero cells are dropped.
#' @param tag.samples a logical. If TRUE, the row names are automatically generated using a tag for each sample such as "S1", "S2", etc.
#'
#' @return
#' \item{\code{x, xte}}{count data matrix for training and test set.}
#' \item{\code{y, yte}}{class labels for training and test set.}
#' \item{\code{truesf, truesfte}}{true size factors for training and test set. See Witten (2011) for more information on estimating size factors.}
#'
#' @author Dincer Goksuluk
#'
#' @examples
#' set.seed(2128)
#' counts <- generateCountData(n = 20, p = 10, K = 2, param = 1, sdsignal = 0.5, DE = 0.8,
#' allZero.rm = FALSE, tag.samples = TRUE)
#' head(counts$x)
#'
#' @name generateCountData
#' @rdname generateCountData
#'
#' @importFrom stats rexp rnorm rnbinom runif sd var
#'
#' @export
generateCountData <- function(n, p, K, param, sdsignal = 1, DE = 0.3, allZero.rm = TRUE, tag.samples = FALSE) {
if (n < 4 * K){
stop("We require n to be at least 4*K.")
}
if (DE < 0 || DE > 1){
stop("DE should be a value within [0, 1].")
}
q0 <- rexp(p, rate = 1/25) # Gene Total
isDE <- runif(p) < DE
classk <- matrix(NA, nrow = K, ncol = p)
for (k in 1:K) {
lfc <- rnorm(p, sd = sdsignal) # dkj parametreleri.
classk[k, ] <- ifelse(isDE, q0 * exp(lfc), q0) ## Hangi genlerin DE olduğu bilgisi. Bunları da generate edilen veri ile döndürebiliriz.
##
}
truesf <- runif(n) * 2 + 0.2
truesfte <- runif(n) * 2 + 0.2
conds <- sample(c(rep(1:K, 4), sample(1:K, n - 4 * K, replace = TRUE)))
condste <- sample(c(rep(1:K, 4), sample(1:K, n - 4 * K, replace = TRUE)))
x <- xte <- matrix(NA, nrow = n, ncol = p)
for (i in 1:n) {
for (k in 1:K) {
if (conds[i] == k)
x[i, ] <- rnbinom(p, mu = truesf[i] * classk[k, ], size = param)
if (condste[i] == k)
xte[i, ] <- rnbinom(p, mu = truesfte[i] * classk[k, ], size = param)
}
}
if (allZero.rm){
rm <- apply(x, 2, sum) == 0 # Removes all zero columns.
} else {
rm <- logical(ncol(x))
}
# x <- as.data.frame(x)
# xte <- as.data.frame(xte)
colnames(x) <- colnames(xte) <- paste("G", 1:p, sep = "")
if (tag.samples){
rownames(x) <- rownames(xte) <- paste("S", 1:n, sep = "")
}
return(
structure(
list(
x = t(x[, !rm]),
xte = t(xte[, !rm]),
y = conds,
yte = condste,
truesf = truesf,
truesfte = truesfte
),
class = "count.data")
)
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.