#' User-friendly univariate Maximum Likelihood Estimation
#'
#' Finds the MLE from a univariate sample using a parametric family of
#' distributions built into R (or others similar to them).
#'
#' @param x Vector of univariate sample drawn from the distribution.
#' @param ddist A family of densities or mass functions,
#' for example \code{\link{dnorm}} or \code{\link{dnbinom}}. Each parameter
#' should have its own named argument.
#' @param init.val The initial guess at parameter values for the numerical
#' optimizer. Could be (1) a vector, in which the order provided will correspond
#' to the order given in the R documentation for that distribution, or (2) a list
#' with names being the parameter names, or (3) \code{NULL} in which
#' case default parameter values are used (if there even are defaults).
#' @param in.range The same as the argument in \code{\link{rnlm}} -- a function
#' that takes the parameter values and returns \code{TRUE} if that value of
#' the parameter is "proper"; \code{FALSE} otherwise.
#' @note If you want to make your own distribution (density function), just
#' be sure to follow R's order of arguments (\code{x} first,
#' then named parameters).
#' @return A list of the parameter estimates, named according to their
#' corresponding parameter names
#' @examples
#' ## MLE of a Gaussian sample
#' unimle(rnorm(1000))
#' x <- rnorm(1000, 5, 100)
#' unimle(x)
#' unimle(x, init.val = list(sd = 80, mean = 10)) # Notice the order output order
#'
#' ## MLE of negative binomial
#' x <- rnbinom(1000, size = 2, mu = 10)
#' unimle(x, dnbinom) # No defaults, so must specify inits
#' unimle(x, dnbinom, c(1, 0.3))
#' unimle(x, dnbinom, list(size = 1, mu = 5))
#' @export
unimle <- function(x, ddist = dnorm, init.val = NULL, in.range = NULL) {
## Get names of parameters
if (is.null(init.val)) {
## init.val not specified. Take the defaults.
## Note: Assume (besides x) the only non-parameter option is possibly "log"
#### Get arguments
arg <- formals(ddist)
arg <- as.list(arg) # Change from "Dotted" list to list.
#### Get rid of log and "x"
arg <- arg[-1]
arg$log <- NULL
#### Find out what arguments remain that are specified.
keep <- sapply(arg, is.vector)
#### Use those defaults as the initial values
init.val <- arg[keep]
#### Capture parameter names
parnames <- names(init.val)
}
if (is.vector(init.val)) {
## Initial values are inputted as a vector. Take the first non-x
## arguments to be the parameters.
parnames <- formalArgs(ddist)[1+1:length(init.val)]
}
if (is.list(init.val)) {
## User entered list of parameters, supposedly named.
parnames <- names(init.val)
if (length(parnames) != length(init.val)){
stop("If you enter a list for init.val, it must be named by the parameter names.")
}
}
## Set up a mapping from a vector of parameters to arguments of ddist.
vec2arg <- function(vec){
res <- as.list(vec)
names(res) <- parnames
c(list(x = x), res)
}
## Finally, set up neg. log likelihood.
nllh <- function(theta){
arg <- vec2arg(theta)
-sum(log(do.call(ddist, arg)))
}
## Minimize:
fit <- rnlm(nllh, c(init.val, recursive = T), in.range = in.range)
vec2arg(fit$estimate)[-1]
}
#' Estimate Marginal distribution
#'
#' Estimates the marginal distribution of a univariate sample, either
#' by MLE (if a parametric family is specified) or empirical distribution (edf,
#' with kernel density pdf)
#' otherwise.
#'
#' @param x Vector representing the univariate sample drawn from the
#' distribution to be estimated.
#' @param dist.name String; name of the parametric family of distributions, if
#' you don't want the edf. Use R's naming conventions
#' for distributions ahead of the d/p/q/r. For example, the Normal
#' distribution would be \code{"norm"}.
#' @param ecdf.split Numeric; real number for which to fit the specified model
#' for data above such number, and empirical distribution to be fit below.
#' @param init.val If a parametric distribution is specified (through
#' \code{dist.name}), this should be the initial value specification for the
#' parameters. Could be (1) a vector, in which the order provided will correspond
#' to the order given in the R documentation for that distribution, or (2) a list
#' with names being the parameter names, or (3) remain \code{NULL} in which
#' case default parameter values are used (if there even are defaults).
#' @param soften.cdf Logical; should the cdf be modified so that it never returns
#' the boundary cases 0 or 1? (uses 0.5/n or 1-0.5/n instead,
#' where n is amount of data)
#' @param eqf.type If the empirical quantile function is being estimated, this
#' argument specifies the type of quantile algorithm to use, as in the
#' \code{type} argument of the \code{\link{quantile}} function. In particular,
#' an integer between 1 and 9.
#' @param in.range The same as the argument in \code{\link{rnlm}} -- a function
#' that takes the parameter values and returns \code{TRUE} if that value of
#' the parameter is "proper"; \code{FALSE} otherwise.
#' @details Some noteworthy options for \code{eqf.type} are: 1 = inverse
#' empirical cdf (default of \code{marginal}); 7 = default of
#' \code{\link{quantile}}, a continuous function.
#' @return If \code{return.param = FALSE} (default), output is a list of
#' length three:
#'
#' \code{$cdf}: The estimated distribution function
#'
#' \code{$qf}: The estimated quantile function (inverse of the cdf estimate)
#'
#' \code{$pdf}: The estimated density function or mass function. Gives a kernel
#' density estimate if using the empirical distribution.
#' @export
marginal <- function(x,
dist.name = NULL,
ecdf.split = NULL,
init.val = NULL,
in.range = NULL,
soften.cdf = TRUE,
eqf.type = 1){
numorig <- length(x)
x <- na.omit(x)
n <- length(x)
if (n != numorig)
warning(paste("Function 'marginal()' is ignoring the", numorig - n, "NA values."))
## Case 1: no model given. Use empirical distribution.
if (is.null(dist.name)){
## Get edf if no distribution family is given.
## edf
cdf <- ecdf(x)
attributes(cdf) <- NULL
## quantile function (it's vectorized over tau)
qf <- function(tau){
res <- quantile(x, tau, type = eqf.type)
attributes(res) <- NULL
res
}
# ## mass function... for completeness only.
# tab <- table(x)
# nams <- names(tab)
# nx <- length(x)
# pdf <- function(q){
# mass <- sapply(q, function(q_){
# res <- which(nams == as.character(q_))
# if (length(res) == 0){
# res <- 0
# } else {
# res <- tab[res] / nx
# }
# res
# })
# names(mass) <- NULL
# mass
# }
## Use kernel density instead of mass function, since we're probably working
## with continuous variables.
#### Density estimate. Use default (only return points, not a function)
dens <- density(x)
pdf <- approxfun(dens$x, dens$y)
} else { ## Case 2: Model is given
## Get distribution names
ddist <- get(paste0("d", dist.name))
qdist <- get(paste0("q", dist.name))
pdist <- get(paste0("p", dist.name))
## Just to be safe, get the name of the first argument of those functions:
d1 <- formalArgs(ddist)[1]
q1 <- formalArgs(qdist)[1]
p1 <- formalArgs(pdist)[1]
## Case 2a: Use the model for all the data:
if (is.null(ecdf.split)) {
## Get estimate
thetahat <- unimle(x, ddist, init.val = init.val, in.range = in.range)
## Get distribution-related functions:
cdf <- function(q){
arg <- c(list(q), thetahat)
names(arg)[1] <- p1
do.call(pdist, arg)
}
pdf <- function(x){
arg <- c(list(x), thetahat)
names(arg)[1] <- d1
do.call(ddist, arg)
}
qf <- function(tau){
arg <- c(list(tau), thetahat)
names(arg)[1] <- q1
do.call(qdist, arg)
}
} else { ## Case 2b: This model only applies above some threshold.
## Split the data
xlow <- subset(x, x < ecdf.split)
xhigh <- subset(x, x >= ecdf.split)
## Estimate probability of being low or high:
plow <- length(xlow) / length(x)
phigh <- 1 - plow
## Get lower and upper fits:
lowfit <- marginal(xlow,
eqf.type = eqf.type,
soften.cdf = soften.cdf,
in.range = in.range)
highfit <- marginal(xhigh,
dist.name = dist.name,
init.val = init.val,
soften.cdf = soften.cdf,
in.range = in.range)
## Combine.
cdf <- function(x) {
sapply(x, function(x_) {
if (x_ < ecdf.split) {
lowfit$cdf(x_) * plow
} else {
plow + highfit$cdf(x_) * phigh
}
})
}
pdf <- function(x) {
lowfit$pdf(x) * plow +
highfit$pdf(x) * phigh
}
qf <- function(u) {
sapply(u, function(u_) {
if (u_ < plow){
lowfit$qf(u_ / plow)
} else {
highfit$qf((u_ - plow) / phigh)
}
})
}
}
}
## Lastly, if softened edges are wanted, make them:
if (soften.cdf) {
hardcdf <- cdf
## Remake cdf:
cdf <- function(x) {
ans <- hardcdf(x)
sapply(ans, function(ans_) max(min(1-0.5/n, ans_), 0.5/n))
}
}
list(cdf = cdf, qf = qf, pdf = pdf)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.