Nothing
## ==================================================================
##
## Internal Fuctions
##
## ==================================================================
#' @title Some functions for a given distribution
#'
#' @description Provides the instrumental or candidate probability density function of the Accept-Reject method and a random number generator.
#'
#' @param dname character, instrumental or candidate distribution name.
#' @return list of the density function and a random number generator for the given distribution.
#'
#' @noRd
get_inst_distr <- function (dname) {
switch (
dname,
"mvnorm" = {
density <- function (x, params) mvtnorm::dmvnorm(x, mean = params$mean, sigma = params$sigma)
random <- function (n, params) mvtnorm::rmvnorm(n, mean = params$mean, sigma = params$sigma)
},
"norm" = {
density <- function (x, params) stats::dnorm(x, mean = params$mean, sd = params$sd)
random <- function (n, params) stats::rnorm(n, mean = params$mean, sd = params$sd)
},
"exp" = {
density <- function (x, params) stats::dexp(x, rate = params$rate)
random <- function (n, params) stats::rexp(n, rate = params$rate)
},
"unif" = {
density <- function (x, params) stats::dunif(x, min = params$min, max = params$max)
random <- function (n, params) stats::runif(n, min = params$min, max = params$max)
},
"gamma" = {
density <- function (x, params) stats::dgamma(x, shape = params$shape, rate = params$rate)
random <- function (n, params) stats::rgamma(n, shape = params$shape, rate = params$rate)
},
"cauchy" = {
density <- function (x, params) stats::dcauchy(x, location = params$location, scale = params$scale)
random <- function (n, params) stats::rcauchy(n, location = params$location, scale = params$scale)
},
"lnorm" = {
density <- function (x, params) stats::dlnorm(x, meanlog = params$meanlog, sdlog = params$sdlog)
random <- function (n, params) stats::rlnorm(n, meanlog = params$meanlog, sdlog = params$sdlog)
},
"weibull" = {
density <- function (x, params) stats::dweibull(x, shape = params$shape, scale = params$scale)
random <- function (n, params) stats::rweibull(n, shape = params$shape, scale = params$scale)
}
)
list("density" = density, "random" = random)
}
#' @title The constant of the Accept-Reject method
#'
#' @description Returns the constant of the Accept-Reject method for a given kernel density estimation, an instrumental or candidate density and its parameters.
#'
#' @param estimated.values kernel density estimation.
#' @param eval.points evaluated points by using KDE.
#' @param ddistr function, an instrumental or candidate distribution density function from \code{\link{get_inst_distr}}.
#' @param params instrumental or candidate distribution parameters.
#' @return numeric, the constant of the Accept-Reject method.
#'
#' @noRd
est_const <- function (estimated.values, eval.points, ddistr, params) {
## instrumental or candidate distribution density
density <- ddistr(x = eval.points, params)
## constant of accept reject method
max(estimated.values / density)
}
#' @title Data Validation
#'
#' @description It validates data for a given instrumental distribution.
#'
#' @param x a numeric vector; data.
#' @param distr character; instrumental or candidate distribution name.
#'
#' @return logical value which indicates data is valid for a given instrumental distribution.
#'
#' @noRd
data_validation <- function (x, distr) {
if (is.vector(x) && distr %in% c("exp", "gamma", "lnorm", "weibull") && any(x < 0)) {
stop("Data x is invalid for the given distribution by the argument distr.")
}
}
## ==================================================================
##
## Public Fuctions
##
## ==================================================================
#' @title Simulation with Kernel Density Estimation
#'
#' @description Generates random values from a univariate and multivariate continuous distribution by using kernel density estimation based on a sample. The function uses the Accept-Reject method.
#'
#' @param x a numeric vector, matrix or data frame; data.
#' @param n integer; the number of random values will be generated.
#' @param distr character; instrumental or candidate distribution name. See details.
#' @param const.only logical; if \code{TRUE}, the constant of the Accept-Reject method will be returned.
#' @param seed a single value, interpreted as an integer, or \code{NULL} (default).
#' @param parallel logical; if \code{TRUE} parallel generator will be worked. \code{FALSE} is default.
#' @param ... other parameters for functions \code{\link[ks]{kde}}.
#'
#' @return list of given data, simulated values, kernel density estimation and the constant of the Accept-Reject method when \code{const.only} is \code{FALSE} (default).
#'
#' @details Such function uses the function \code{\link[ks]{kde}} as kernel density estimator.
#'
#' The Accept-Reject method is used to simulate random variables.
#' Following code named distributions can be used as a value of the argument \code{distr} and an instrumental or candidate distribution of the simulation method.
#' For univariate distributions:
#' \describe{
#' \item{norm}{normal distribution (default), \eqn{(-\infty,+\infty)}}
#' \item{cauchy}{Cauchy distribution, \eqn{(-\infty,+\infty)}}
#' \item{lnorm}{log-normal distribution, \eqn{(0,+\infty)}}
#' \item{exp}{exponential distribution, \eqn{(0,+\infty)}}
#' \item{gamma}{gamma distribution, \eqn{(0,+\infty)}}
#' \item{weibull}{Weibull distribution, \eqn{(0,+\infty)}}
#' \item{unif}{uniform distribution, \eqn{(a,b)}}
#' }
#' And you can choose the best fitting instrumental distribution to simulate random variables more effectively by using \code{\link{find_best_fit}}. See examples.
#'
#' For multivariate distributions, "norm" (multivariate normal distribution) is used.
#'
#' @seealso \code{\link{find_best_fit}}, \code{\link[ks]{kde}}
#'
#' @references \itemize{
#' \item Tarn Duong (2018). ks: Kernel Smoothing. R package version 1.11.2. \url{https://CRAN.R-project.org/package=ks}
#' \item Christian P. Robert and George Casella (2010) Introducing Monte Carlo Methods with R. Springer. Pages 51-57.
#' }
#'
#' @export
#'
#' @examples
#' ## 1-dimensional data
#' data(faithful)
#' hist(faithful$eruptions)
#' res <- simukde::simulate_kde(x = faithful$eruptions, n = 100, parallel = FALSE)
#' hist(res$random.values)
#'
#' ## Simulation with the best fitting instrumental distribution
#' data(faithful)
#' par(mfrow = c(1, 3))
#' hist(faithful$eruptions)
#' fit <- simukde::find_best_fit(x = faithful$eruptions, positive = TRUE)
#' res <- simukde::simulate_kde(
#' x = faithful$eruptions, n = 100,
#' distr = fit$distribution, parallel = FALSE
#' )
#' hist(res$random.values)
#' par(mfrow = c(1, 1))
#' \donttest{
#' ## 2-dimensional data
#' data(faithful)
#' res <- simukde::simulate_kde(x = faithful, n = 100)
#' plot(res$kde, display = "filled.contour")
#' points(x = res$random.values, cex = 0.25, pch = 16, col = "green")
#' points(x = faithful, cex = 0.25, pch = 16, col = "black")}
#' \dontshow{
#' ## 2-dimensional data
#' data(faithful)
#' res <- simukde::simulate_kde(x = faithful, n = 1, parallel = FALSE)
#' plot(res$kde, display = "filled.contour")
#' points(x = res$random.values, cex = 0.5, pch = 16, col = "blue")
#' points(x = faithful, cex = 0.25, pch = 16, col = "gray")}
simulate_kde <- function (x, n = 100, distr = "norm", const.only = FALSE, seed = NULL, parallel = FALSE, ...) {
## check input for distr
data_validation(x = x, distr = distr)
## check sample size
if (n != as.integer(n) || n <= 0) {
stop("n is invalid.")
}
## kernel density estimation (KDE)
kd <- ks::kde(x = x, ...)
## instrumental or candidate distribution
if (is.matrix(x) || is.data.frame(x)) {
if (distr == "norm") { # multivariate normal distribution
params <- list("mean" = colMeans(kd$x), "sigma" = stats::cov(kd$x))
inst_distr <- get_inst_distr(dname = "mvnorm")
eval.points <- as.matrix(expand.grid(kd$eval.points))
estimated.values <- kd$estimate
} else {
stop("distr is invalid for multivariate data.")
}
} else if (is.vector(x)) {
if (distr == "norm") { # normal distribution
params <- list("mean" = mean(kd$x), "sd" = stats::sd(kd$x))
inst_distr <- get_inst_distr(dname = "norm")
valid.points <- rep(x = TRUE, times = length(x = kd$eval.points))
} else if (distr == "exp") { # exponential distribution
params <- list("rate" = 1 / mean(kd$x))
inst_distr <- get_inst_distr(dname = "exp")
valid.points <- which(kd$eval.points > 0)
} else if (distr == "unif") { # uniform distribution
params <- list("min" = min(kd$x), "max" = max(kd$x))
inst_distr <- get_inst_distr(dname = "unif")
valid.points <- which(kd$eval.points > params$min & kd$eval.points < params$max)
} else if (distr == "gamma") { # gamma distribution
suppressWarnings(param.est <- MASS::fitdistr(x = kd$x, densfun = "gamma")$estimate)
params <- list("shape" = param.est[["shape"]], "rate" = param.est[["rate"]])
inst_distr <- get_inst_distr(dname = "gamma")
valid.points <- which(kd$eval.points > 0)
} else if (distr == "cauchy") { # Cauchy distribution
suppressWarnings(param.est <- MASS::fitdistr(x = kd$x, densfun = "cauchy")$estimate)
params <- list("location" = param.est[["location"]], "scale" = param.est[["scale"]])
inst_distr <- get_inst_distr(dname = "cauchy")
valid.points <- rep(x = TRUE, times = length(x = kd$eval.points))
} else if (distr == "lnorm") { # log normal distribution
param.est <- MASS::fitdistr(x = kd$x, densfun = "lognormal")$estimate
params <- list("meanlog" = param.est[["meanlog"]], "sdlog" = param.est[["sdlog"]])
inst_distr <- get_inst_distr(dname = "lnorm")
valid.points <- which(kd$eval.points > 0)
} else if (distr == "weibull") { # Weibull distribution
suppressWarnings(param.est <- MASS::fitdistr(x = kd$x, densfun = "weibull")$estimate)
params <- list("shape" = param.est[["shape"]], "scale" = param.est[["scale"]])
inst_distr <- get_inst_distr(dname = "weibull")
valid.points <- which(kd$eval.points > 0)
} else {
stop("The value of distr is invalid for univariate data.")
}
eval.points <- kd$eval.points[valid.points]
estimated.values <- kd$estimate[valid.points]
} else {
stop("x must be a vector, matrix or data frame.")
}
if (!exists("inst_distr")) {
stop("distr is invalid.")
}
## constant of the Accept-Reject method
const <- est_const(estimated.values = estimated.values, eval.points = eval.points, ddistr = inst_distr$density, params)
if (const.only) {
return(const)
}
## simulate multivariate random variable with KDE
if (parallel) {
cl <- parallel::makeCluster(parallel::detectCores() - 1)
if (!is.null(seed)) {
parallel::clusterSetRNGStream(cl = cl, iseed = seed)
}
y <- parallel::parSapply(cl = cl, X = seq_len(n), FUN = function (i) {
while (TRUE) {
u <- stats::runif(n = 1)
y <- inst_distr$random(n = 1, params = params)
f <- ks::kde(x = kd$x, eval.points = y)$estimate
g <- inst_distr$density(x = y, params = params)
if (u * const < f / g) {
break
}
}
y
})
parallel::stopCluster(cl = cl)
} else {
if (!is.null(seed)) {
set.seed(seed)
}
y <- replicate(n = abs(as.integer(n)), expr = {
while (TRUE) {
u <- stats::runif(n = 1)
y <- inst_distr$random(n = 1, params = params)
f <- ks::kde(x = kd$x, eval.points = y)$estimate
g <- inst_distr$density(x = y, params = params)
if (u * const < f / g) {
break
}
}
y
}, simplify = TRUE)
}
## result
if (is.matrix(y)) {
y <- t(y)
}
list(
"x" = x,
"random.values" = y,
"kde" = kd,
"distr" = distr,
"const" = const,
"sample.size" = n,
"seed" = seed
)
}
#' @title Find The Best Fitting Distribution
#'
#' @description It finds the best fitting distribution from supported univariate continuous distributions for given data.
#'
#' @param x a numeric vector; data.
#' @param positive a logical constant; distribution type.
#' @param plot a logical constant. If \code{TRUE} (default), a histogram and density lines are drawn.
#' @param legend.pos a character string. Indicates the legend position and must be one of "bottomright", "bottom", "bottomleft", "left", "topleft", "top", "topright" (default), "right" and "center".
#' @param dlc a vector; probability density line colors for supported (up to 7) distributions. If unspecified, the rainbow color palette will be used.
#' @param dlw a numerical constant; probability density line width.
#' @param ... Further arguments and parameters for the function \code{\link[graphics]{hist}}, particularly, main title and axis labels. However, the parameter \code{freq} is not able to override.
#'
#' @return A list containing the following items:
#' \describe{
#' \item{distribution}{the name of the best fitting distribution.}
#' \item{ks.statistic}{the Kolmogorov-Smirnov test statistic for the distribution.}
#' \item{p.value}{the p-value of the test.}
#' \item{summary}{results similar to above for other distributions.}
#' \item{x}{given data.}
#' \item{n}{the sample size.}
#' }
#'
#' @details This function is supported following univariate distributions:
#' \itemize{
#' \item for positive random variables: Log normal, Exponential, Gamma and Weibull.
#' \item for all random variables: Normal, Cauchy, Log normal, Exponential, Gamma, Weibull and Uniform.
#' }
#'
#' Legends of the plot are ordered by p-values of the test.
#'
#' @seealso \code{\link[stats]{ks.test}}, \code{\link[MASS]{fitdistr}}, \code{\link[graphics]{hist}}
#'
#' @references
#' \enumerate{
#' \item William J. Conover (1971). Practical Nonparametric Statistics. New York: John Wiley & Sons. Pages 295–301.
#' \item Venables, W. N. and Ripley, B. D. (2002) Modern Applied Statistics with S. Fourth edition. Springer.
#' }
#'
#' @export
#'
#' @examples
#' petal.length <- datasets::iris$Petal.Length[datasets::iris$Species == "setosa"]
#' simukde::find_best_fit(x = petal.length, positive = TRUE)
find_best_fit <- function (x, positive = FALSE, plot = TRUE, legend.pos = "topright", dlc = NULL, dlw = 1, ...) {
# Input validation
if (!is.vector(x)) {
stop("x must be a vector.")
}
# Sample range
r <- range(x)
# Prepare for plot
if (isTRUE(plot)) {
if (!legend.pos %in% c("bottomright", "bottom", "bottomleft", "left", "topleft", "top", "topright", "right", "center")) {
stop("legend.pos is invalid.")
}
p.d.f <- list()
p.d.f.args <- seq(from = r[1], to = r[2], length.out = 100)
}
# Sample size
n <- length(x)
# Avoid from ties
if (any(table(x) > 1)) {
x.ks <- x + stats::rnorm(n = n, sd = min(1e-6, stats::sd(x) / 1000000))
} else {
x.ks <- x
}
# Result
distribution <- list()
statistic <- list()
p.value <- list()
# Common expressions
expr <- expression({
statistic[""] <- test$statistic
p.value[""] <- test$p.value
})
# Normal Distribution
distribution[""] <- "norm"
param <- MASS::fitdistr(x = x, densfun = "normal")$estimate
test <- stats::ks.test(x = x.ks, y = stats::pnorm, mean = param[1], sd = param[2])
eval(expr)
if (isTRUE(plot)) {
p.d.f$norm <- stats::dnorm(x = p.d.f.args, mean = param[1], sd = param[2])
}
# Cauchy Distribution
distribution[""] <- "cauchy"
suppressWarnings(param <- MASS::fitdistr(x = x, densfun = "cauchy")$estimate)
test <- stats::ks.test(x = x.ks, y = stats::pcauchy, location = param[1], scale = param[2])
eval(expr)
if (isTRUE(plot)) {
p.d.f$cauchy <- stats::dcauchy(x = p.d.f.args, location = param[1], scale = param[2])
}
# Uniform Distribution
distribution[""] <- "unif"
param <- r
test <- stats::ks.test(x = x.ks, y = stats::punif, min = param[1], max = param[2])
eval(expr)
if (isTRUE(plot)) {
p.d.f$unif <- stats::dunif(x = p.d.f.args, min = param[1], max = param[2])
}
# Positive Distributions
if (isTRUE(positive)) {
if (any(x < 0)) {
warning("Positive distributions were skipped due to data contains a negative value.")
} else {
# Log normal Distribution
distribution[""] <- "lnorm"
param <- MASS::fitdistr(x = x, densfun = "lognormal")$estimate
test <- stats::ks.test(x = x.ks, y = stats::plnorm, meanlog = param[1], sdlog = param[2])
eval(expr)
if (isTRUE(plot)) {
p.d.f$lnorm <- stats::dlnorm(x = p.d.f.args, meanlog = param[1], sdlog = param[2])
}
# Exponential Distribution
distribution[""] <- "exp"
param <- MASS::fitdistr(x = x, densfun = "exponential")$estimate
test <- stats::ks.test(x = x.ks, y = stats::pexp, rate = param[1])
eval(expr)
if (isTRUE(plot)) {
p.d.f$exp <- stats::dexp(x = p.d.f.args, rate = param[1])
}
# Gamma Distribution
distribution[""] <- "gamma"
suppressWarnings(param <- MASS::fitdistr(x = x, densfun = "gamma")$estimate)
test <- stats::ks.test(x = x.ks, y = stats::pgamma, shape = param[1], rate = param[2])
eval(expr)
if (isTRUE(plot)) {
p.d.f$gamma <- stats::dgamma(x = p.d.f.args, shape = param[1], rate = param[2])
}
# Weibull Distribution
distribution[""] <- "weibull"
suppressWarnings(param <- MASS::fitdistr(x = x, densfun = "weibull")$estimate)
test <- stats::ks.test(x = x.ks, y = stats::pweibull, shape = param[1], scale = param[2])
eval(expr)
if (isTRUE(plot)) {
p.d.f$weibull <- stats::dweibull(x = p.d.f.args, shape = param[1], scale = param[2])
}
}
}
S <- data.frame(
distribution = simplify2array(distribution),
ks.statistic = simplify2array(statistic),
p.value = simplify2array(p.value),
stringsAsFactors = FALSE
)
distr.order <- order(S$p.value, decreasing = TRUE)
S <- S[distr.order,]
if (isTRUE(plot)) {
# Histogram
ellipsis <- list(...)
ellipsis$freq <- FALSE
if (!methods::hasArg("ylim")) {
ellipsis$ylim <- c(0, max(graphics::hist(x = x, plot = FALSE)$density, simplify2array(p.d.f)))
}
if (!methods::hasArg("xlab")) {
ellipsis$xlab <- deparse(substitute(x))
}
xname <- ellipsis$xlab
if (!methods::hasArg("main")) {
ellipsis$main <- paste("Histogram of" , xname)
}
do.call(what = graphics::hist, args = c(list(x), ellipsis))
# Density lines
k <- length(p.d.f)
if (is.null(dlc) || length(dlc) < k) {
dlc <- grDevices::rainbow(n = 7)
}
p.d.f <- p.d.f[distr.order]
for (i in 1:k) {
graphics::lines(x = p.d.f.args, y = p.d.f[[i]], type = "l", col = dlc[i], lty = i, lwd = dlw)
}
graphics::legend(x = legend.pos, legend = names(p.d.f), col = dlc[1:k], lty = 1:k)
}
list(
"distribution" = S$distribution[1],
"ks.statistic" = S$ks.statistic[1],
"p.value" = S$p.value[1],
"summary" = S,
"x" = x,
"n" = n
)
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.