#' @title Remove Variables / Observations from a Dataset
#'
#' @description This function removes variables / observations from a dataset
#' based on a percentage of missing data.
#' The results are output as a data frame.
#' Furthermore, the resulting data frame can be saved as a .csv file in a
#' specified directory.
#'
#' @param dataset The dataset from which the variables / observations are
#' removed. It is converted with \code{as.data.frame()} before processing.
#'
#' @param percentage The cut-off percentage (a single numeric value, e.g. 15
#' for 15\%) of missing observations for removing variables, the default is
#' NULL.
#' If the percentage of missing observations of a variable is greater than or
#' equal to the specified percentage then the variable is removed from the
#' dataset. If no variable reaches the cut-off, all variables are kept.
#' With the default set to NULL, all observations (rows) containing at least
#' one missing value are removed from the dataset instead.
#'
#' @param file_name A character object indicating the file name when saving
#' the data frame.
#' The name must include the .csv suffix.
#' The default is NULL. It must be supplied whenever \code{directory} is
#' supplied.
#'
#' @param directory A character object specifying the directory where the data
#' frame is to be saved as a .csv file.
#' The default is NULL, in which case nothing is written to disk.
#'
#' @return Outputs the reduced data as a data frame. The result is always a
#' data frame, even when only a single variable remains.
#'
#' @export
#'
#' @seealso \code{\link{derive_variables}}, \code{\link{extract_variables}}, \code{\link{impute_variables}}, \code{\link{standardise_variables}}, \code{\link{transform_variables}}
#'
#' @examples
#'
#' # Example 1 - Titanic
#' descriptive_statistics(dataset = titanic, type = "numeric")
#' # remove all observations with missing values from the data frame
#' remove_variables(dataset = titanic)
#' # remove all variables that are missing 15% or more of their observations
#' remove_variables(dataset = titanic, percentage = 15)
#'
remove_variables <- function(dataset,
                             percentage = NULL,
                             file_name = NULL,
                             directory = NULL) {
  # Fail early with a clear message instead of an obscure I/O error later on.
  if (!is.null(directory) && is.null(file_name)) {
    stop("`file_name` must be supplied when `directory` is specified.",
         call. = FALSE)
  }
  # Make sure the dataset is converted to a data frame
  dataset <- as.data.frame(dataset)
  if (!is.null(percentage)) {
    #----------------------------------------------------------------#
    # If percentage != NULL                                          #
    #----------------------------------------------------------------#
    if (!is.numeric(percentage) ||
        length(percentage) != 1L ||
        is.na(percentage)) {
      stop("`percentage` must be a single non-missing numeric value.",
           call. = FALSE)
    }
    # Percentage of missing observations per variable (rounded to 4 digits).
    missing_pct <- vapply(
      dataset,
      function(column) {
        round(sum(is.na(column)) * 100 / length(column), digits = 4)
      },
      numeric(1)
    )
    # A zero-row dataset yields NaN percentages; such variables are kept.
    to_remove <- unname(!is.na(missing_pct) & missing_pct >= percentage)
    # Select by a logical "keep" mask rather than a negative index: a negative
    # index built from an empty vector would select zero columns.
    # drop = FALSE keeps a data frame when a single variable remains.
    reduced_data <- dataset[, !to_remove, drop = FALSE]
  } else if (ncol(dataset) > 0L) {
    #---------------------------------------------------------------#
    # If percentage = NULL                                          #
    #---------------------------------------------------------------#
    # Remove all observations (rows) that contain a missing value.
    # drop = FALSE keeps a data frame for single-variable datasets.
    reduced_data <- dataset[complete.cases(dataset), , drop = FALSE]
  } else {
    # A dataset without variables has nothing to filter on.
    reduced_data <- dataset
  }
  # write the resulting data frame as .csv in the specified directory
  if (!is.null(directory)) {
    write.csv(x = reduced_data,
              file = file.path(directory, file_name),
              row.names = FALSE)
  }
  # return the created reduced dataset as a data frame
  reduced_data
}
# Add the following code to your website.
# For more information on customizing the embed code, read Embedding Snippets.