# Nothing
#' @title add stratified/unstratified artificial missing observations
#' @description to examine the performance of imputation algorithms, artificial
#'              missing data are added to datasets and then imputed, to compare
#'              the original observations with the imputed values. this function
#'              can add stratified or unstratified artificial missing data.
#'              stratified missing data can be particularly useful if your
#'              categorical or ordinal variables are imbalanced, i.e., one
#'              category appears at a much higher rate than others.
#' @param x a data.frame or an atomic vector (for example a numeric vector or
#'          a factor). objects inheriting from 'tbl' or 'tbl_df' (tibbles) are
#'          rejected; convert them with \code{as.data.frame()} first.
#' @param p proportion of missingness to be added to the data. must be a
#'          single, non-missing number between 0 and 1 (default is 0.1).
#' @param stratify logical. if TRUE, stratified sampling is requested from the
#'                 NA-adding helper when adding NA values to 'factor' variables
#'                 (either ordered or unordered). this feature makes evaluation
#'                 of missing data imputation algorithms more fair, especially
#'                 when the factor levels are imbalanced. the default is FALSE.
#' @param classes character vector, specifying the variable classes that should
#'                be selected for adding NA values. the default value is NULL,
#'                meaning all variables will receive NA values with probability
#'                of 'p'. however, if you wish to add NA values only to specific
#'                classes, e.g. 'numeric' variables or 'ordered' factors,
#'                specify them in this argument, e.g. write
#'                "classes = c('numeric', 'ordered')" if you wish to add NAs
#'                only to numeric and ordered factors. a column is selected
#'                when the FIRST element of its class vector is listed in
#'                'classes' (so 'integer' columns are not matched by 'numeric',
#'                and ordered factors are not matched by 'factor').
#' @param seed integer. a random seed number for reproducing the result
#'             (recommended). when supplied, \code{set.seed(seed)} is called,
#'             which changes the session's random number generator state.
#' @author E. F. Haghish
#' @examples
#'
#' \dontrun{
#' # adding stratified NA to an atomic vector
#' x <- as.factor(c(rep("M", 100), rep("F", 900)))
#' table(mlim.na(x, p=.5, stratify = TRUE))
#'
#' # adding unstratified NAs to all variables of a data.frame
#' data(iris)
#' mlim.na(iris, p=0.5, stratify = FALSE, seed = 1)
#'
#' # or add stratified NAs only to factor variables, ignoring other variables
#' mlim.na(iris, p=0.5, stratify = TRUE, classes = "factor", seed = 1)
#'
#' # or add NAs to numeric variables
#' mlim.na(iris, p=0.5, classes = "numeric", seed = 1)
#' }
#' @return if 'x' is a data.frame, the same data.frame with every selected
#'         column replaced by the output of the NA-adding helper; if 'x' is an
#'         atomic vector, the helper's output for that vector.
#' @export
mlim.na <- function(x, p = 0.1, stratify = FALSE, classes = NULL, seed = NULL) {

  # Syntax processing
  # ------------------------------------------------------------
  stopifnot(
    # 'p' must be one usable number; vectors, NA and strings are rejected
    # up front instead of failing (or silently recycling) further down
    "'p' should be between 0 and 1" =
      is.numeric(p) && length(p) == 1L && !is.na(p) && p >= 0 && p <= 1,
    "'x' type is not recognized" = is.atomic(x) || is.data.frame(x),
    "class of 'x' must be strictly data.frame" =
      !inherits(x, c("tbl", "tbl_df"))
  )

  # set the seed for reproducibility
  if (!is.null(seed)) set.seed(seed)

  # NOTE(review): 'addNA' below is called as addNA(x, p, stratify = ...), which
  # does not match base::addNA(x, ifany). it is presumably a helper defined
  # elsewhere in this package that masks the base function -- confirm.

  # if 'x' is not a data.frame, treat it as a single variable
  if (is.atomic(x)) {
    return(addNA(x, p, stratify = stratify))
  }

  # walk the columns by position rather than by name, so that columns with
  # duplicated names are each processed exactly once. '[[' always extracts
  # the column itself, regardless of how the object's '[' method drops
  for (j in seq_along(x)) {
    column <- x[[j]]
    if (is.null(classes) || class(column)[1] %in% classes) {
      x[[j]] <- addNA(column, p, stratify = stratify)
    }
  }

  x
}
# Any scripts or data that you put into this service are public.
# Add the following code to your website.
# For more information on customizing the embed code, read Embedding Snippets.