# R/remove_variables.R

#' @title Remove Variables / Observations from a Dataset
#'
#' @description This function removes variables / observations from a dataset
#' based on missing data. The result is returned as a data frame.
#' Furthermore, the resulting data frame can be saved as a .csv file in a
#' specified directory.
#'
#' @param dataset The dataset from which the variables / observations are
#' removed. It is converted with \code{as.data.frame} before processing.
#'
#' @param percentage The cut-off percentage (on a 0-100 scale) of missing
#' observations for removing variables, the default is NULL.
#' If the percentage of missing observations in a variable is greater than or
#' equal to the specified percentage then the variable is removed from the
#' dataset. If no variable reaches the cut-off, the dataset is returned with
#' all of its variables.
#' With the default set to NULL, no variables are removed; instead every
#' observation (row) containing at least one missing value is removed.
#'
#' @param file_name A character object indicating the file name when saving
#' the data frame. The name must include the .csv suffix.
#' The default is NULL. It is required whenever \code{directory} is supplied.
#'
#' @param directory A character object specifying the directory where the data
#' frame is to be saved as a .csv file. The default is NULL, in which case
#' nothing is written to disk.
#'
#' @return Outputs the reduced data as a data frame (also when only a single
#' variable remains).
#'
#' @export
#'
#' @seealso \code{\link{derive_variables}}, \code{\link{extract_variables}}, \code{\link{impute_variables}}, \code{\link{standardise_variables}}, \code{\link{transform_variables}}
#'
#' @examples
#'
#' # Example 1 - Titanic
#' descriptive_statistics(dataset = titanic, type = "numeric")
#' # remove all missing observations from the data frame
#' remove_variables(dataset = titanic)
#' # remove all variables that are missing 15% or more of their observations
#' remove_variables(dataset = titanic, percentage = 15)
#'
remove_variables <- function(dataset,
                             percentage = NULL,
                             file_name = NULL,
                             directory = NULL) {

  # Make sure the dataset is converted to a data frame
  dataset <- as.data.frame(dataset)

  # Fail early with a clear message instead of an obscure write error later on
  if (!is.null(directory) && is.null(file_name)) {
    stop("'file_name' must be supplied when 'directory' is specified.",
         call. = FALSE)
  }

  if (!is.null(percentage)) {

    #--------------------------------------------------------------#
    # percentage supplied: drop variables with too much missing    #
    #--------------------------------------------------------------#

    if (!is.numeric(percentage) || length(percentage) != 1L ||
        is.na(percentage)) {
      stop("'percentage' must be a single non-missing numeric value.",
           call. = FALSE)
    }

    # Percentage of missing observations per variable, rounded to 4 decimals
    # so that floating point noise does not affect the threshold comparison.
    missing_pct <- vapply(
      dataset,
      function(column) {
        round(sum(is.na(column)) * 100 / length(column), digits = 4)
      },
      numeric(1),
      USE.NAMES = FALSE
    )

    # A zero-row dataset yields NaN percentages; such variables are kept.
    drop_column <- !is.na(missing_pct) & missing_pct >= percentage

    # Logical indexing keeps every variable when nothing is flagged (negative
    # indexing with an empty index would select no columns at all), and
    # drop = FALSE keeps a data frame even if a single variable remains.
    reduced_data <- dataset[, !drop_column, drop = FALSE]

  } else if (ncol(dataset) == 0L) {

    # Nothing to inspect: complete.cases() cannot handle a zero-column input
    reduced_data <- dataset

  } else {

    #--------------------------------------------------------------#
    # percentage = NULL: drop observations with any missing value  #
    #--------------------------------------------------------------#

    # drop = FALSE keeps a data frame for single-variable datasets
    reduced_data <- dataset[complete.cases(dataset), , drop = FALSE]

  }

  # write the resulting data frame as .csv in the specified directory
  if (!is.null(directory)) {

    write.csv(x = reduced_data,
              file = file.path(directory, file_name),
              row.names = FALSE)

  }

  # return the reduced dataset as a data frame
  reduced_data

}
# oislen/BuenaVista documentation built on May 16, 2019, 8:12 p.m.