#' @title Impute missing observations for a given dataset
#'
#' @description Impute missing observations for a given dataset.
#' Missing observations can be imputed using the column mean (numeric
#' columns only), the column mode (factor columns only) or one of the
#' imputation methods of the mice package, e.g. multiple linear regression
#' or binomial logistic regression.
#' When using the mice package, the NA values must only be in one column.
#'
#' @param dataset The dataset to be imputed. It is converted to a data frame.
#'
#' @param percentage Exclusive upper bound, in percent, on the share of
#' missing observations a column may have in order to be imputed.
#' Columns whose share of missing observations is equal to or above this
#' value are left unchanged.
#' The default is NULL, in which case every eligible column is imputed.
#' Only used when type is "mean" or "mode".
#'
#' @param y_index A natural number specifying the column in the data frame
#' to be imputed. A vector of column indices is also accepted.
#' The default is NULL, in which case all columns are considered.
#' Only used when type is "mean" or "mode".
#'
#' @param type The type of imputation to be used.
#' Either "mean", "mode" or "mice".
#'
#' @param method The method of imputation to be used in conjunction with
#' type equal to "mice". It is passed to the method argument of mice().
#' When not supplied, "cart" is used.
#'
#' @param file_name A character object indicating the file name when saving
#' the data frame.
#' The default is NULL.
#' The name must include the .csv suffix.
#' Required when directory is supplied.
#'
#' @param directory A character object specifying the directory where the
#' data frame is to be saved as a .csv file.
#' The default is NULL, in which case nothing is written.
#'
#' @return Outputs the imputed data as a data frame
#'
#' @import mice
#'
#' @export
#'
#' @seealso \code{\link{remove_variables}}, \code{\link{derive_variables}}, \code{\link{extract_variables}}, \code{\link{standardise_variables}}, \code{\link{transform_variables}}
#'
#' @examples
#' df <- data.frame(a = c(1, NA, 3), b = factor(c("x", NA, "x")))
#' impute_variables(df, type = "mean")
#' impute_variables(df, type = "mode", y_index = 2)
#'
impute_variables <- function(dataset,
                             percentage = NULL,
                             y_index = NULL,
                             type = c("mean", "mode", "mice"),
                             method = c("cart", "rf", "norm.predict",
                                        "norm.boot", "logreg", "logreg.boot"),
                             file_name = NULL,
                             directory = NULL) {
  # Convert the dataset to a data frame
  dataset <- as.data.frame(dataset)
  # Confirm correct choice for type
  type <- match.arg(type)

  # Validate the threshold up front so a bad value fails with a clear message
  if (!is.null(percentage) &&
      !(is.numeric(percentage) && length(percentage) == 1L &&
        !is.na(percentage))) {
    stop("`percentage` must be NULL or a single non-missing number.",
         call. = FALSE)
  }
  # Fail before doing any work if the output path cannot be built
  if (!is.null(directory) && is.null(file_name)) {
    stop("`file_name` must be supplied when `directory` is given.",
         call. = FALSE)
  }

  if (type == "mice") {
    # The default `method` is the list of suggested choices, not a per-column
    # specification; handing all of them to mice() fails unless the dataset
    # happens to have that many columns. Use the first one when the caller
    # did not choose. An explicitly supplied value is passed through as is.
    if (missing(method)) {
      method <- method[1L]
    }
    imputed <- mice::mice(data = dataset, method = method)
    dataset <- mice::complete(imputed)
  } else {
    # Either every column or only the requested one(s)
    columns <- if (is.null(y_index)) seq_len(ncol(dataset)) else y_index
    for (column in columns) {
      dataset[[column]] <- .impute_column(x = dataset[[column]],
                                          type = type,
                                          percentage = percentage)
    }
  }

  # Optionally save the imputed dataset as a .csv file
  if (!is.null(directory)) {
    utils::write.csv(x = dataset,
                     file = file.path(directory, file_name),
                     row.names = FALSE)
  }

  # return the imputed dataset
  dataset
}

# Internal helper: fill the missing values of a single column.
# `type = "mean"` only touches numeric columns and `type = "mode"` only touches
# factor columns; any other column is returned unchanged. A non-NULL
# `percentage` restricts imputation to columns whose share of missing values
# (in percent) is strictly below it.
.impute_column <- function(x, type, percentage = NULL) {
  eligible <- switch(type,
                     mean = is.numeric(x),
                     mode = is.factor(x),
                     FALSE)
  if (!eligible) {
    return(x)
  }

  is_missing <- is.na(x)
  n_missing <- sum(is_missing)
  # Nothing to fill, or no observed value to derive a mean / mode from
  if (n_missing == 0L || n_missing == length(x)) {
    return(x)
  }

  if (!is.null(percentage)) {
    missing_prc <- (n_missing * 100) / length(x)
    if (missing_prc >= percentage) {
      return(x)
    }
  }

  if (type == "mean") {
    fill <- mean(x, na.rm = TRUE)
  } else {
    # table() drops NA and never lumps levels, unlike summary() on a factor,
    # so the result is always a real level of x (first level wins ties)
    counts <- table(x)
    fill <- names(counts)[which.max(counts)]
  }

  x[is_missing] <- fill
  x
}
# Add the following code to your website.
# For more information on customizing the embed code, read Embedding Snippets.