# R/standardise_varaibles.R

#' @title Standardise the Numeric Variables of a given Dataset
#' 
#' @description This function standardises the numeric variables of a given dataset.
#'     There are three methods: range standardisation, normalisation standardisation, and median absolute deviation standardisation.
#'     Range standardisation standardises the numeric variables to a specified range; the default is [0, 1].
#'     Normalisation standardisation standardises the numeric variables to have mean 0 and standard deviation 1.
#'     Median Absolute Deviation standardisation standardises the numeric variables to have median 0 and median absolute deviation 1.
#' 
#' @param dataset A dataset to be standardised, the dataset can have mixed types.
#' 
#' @param method A character object denoting the method of standardisation used.
#'    One of three possible options: "range", "norm", "MAD".
#' 
#' @param lower_bound The lower bound of the range standardisation, default is 0.
#' 
#' @param upper_bound The upper bound of the range standardisation, default is 1.
#' 
#' @param file_name A character object indicating the file name when saving the data frame.
#'                  The default is NULL.
#'                  The name must include the .csv suffix.
#'
#' @param directory A character object specifying the directory where the data frame is to be saved as a .csv file.
#'                  The default is NULL, in which case nothing is saved.
#'
#' @return Outputs the standardised dataset as data frame.
#' 
#' @export
#' 
#' @seealso \code{\link{remove_variables}}, \code{\link{derive_variables}}, \code{\link{extract_variables}}, \code{\link{impute_variables}}, \code{\link{transform_variables}}
#' 
#' @examples 
#' # Example Data
#' x1 <- rnorm(n = 60, mean = 50, sd = 10)
#' x2 <- rpois(n = 60, lambda = 50)
#' x3 <- sample(x = 1:10, size = 60, replace = TRUE)
#' # Standardise the Numeric Variables
#' standardise_variables(dataset = x1, method = "range")
#' standardise_variables(dataset = iris, method = "range", lower_bound = 10, upper_bound = 100)
#' standardise_variables(dataset = x2, method = "norm")
#' standardise_variables(dataset = x3, method = "MAD")
#' 
standardise_variables <- function(dataset, 
                                  method = c("range", "norm", "MAD"), 
                                  lower_bound = 0, 
                                  upper_bound = 1, 
                                  file_name = NULL, 
                                  directory = NULL) {
  
  # Standardise the numeric content of `dataset`.
  #
  # dataset     : a data frame (only its numeric columns are standardised,
  #               every other column is returned untouched) or a numeric /
  #               logical vector.
  # method      : "range" rescales to [lower_bound, upper_bound];
  #               "norm"  centres on the mean and divides by the standard
  #                       deviation;
  #               "MAD"   centres on the median and divides by the median
  #                       absolute deviation.
  # lower_bound,
  # upper_bound : target interval, only used by method = "range".
  # file_name,
  # directory   : when `directory` is not NULL the result is also written to
  #               file.path(directory, file_name) as a .csv file.
  #
  # Returns the standardised data frame or vector.
  # Raises an error when `dataset` is of an unsupported type, or when
  # `directory` is supplied without `file_name`.
  #
  # Missing values are ignored when the location and spread statistics are
  # computed and are kept as NA in the result. A variable with zero spread
  # divides by zero, so its result contains NaN / Inf values; this is left
  # as-is rather than silently replaced.
  
  #-------------------------------------------------------------------------------#
  # Validate the arguments                                                        #
  #-------------------------------------------------------------------------------#
  
  # Match the specified method argument with the possible options
  method <- match.arg(method)
  
  # Fail before doing any work if the output file cannot be named
  if (!is.null(directory) && is.null(file_name)) {
    
    stop("`file_name` must be supplied when `directory` is given.", 
         call. = FALSE)
    
  }
  
  #-------------------------------------------------------------------------------#
  # Define the standardising function once, shared by both input types           #
  #-------------------------------------------------------------------------------#
  
  standardise_vector <- function(vector) {
    
    # Nothing to standardise; also avoids the warnings min() / max() raise
    # when no non-missing values are left
    if (all(is.na(vector))) {
      
      return(vector)
      
    }
    
    switch(method,
           
           range = {
             
             minimum <- min(vector, na.rm = TRUE)
             maximum <- max(vector, na.rm = TRUE)
             
             (((vector - minimum) / (maximum - minimum)) * 
                (upper_bound - lower_bound)) + lower_bound
             
           },
           
           norm = {
             
             (vector - mean(vector, na.rm = TRUE)) / 
               stats::sd(vector, na.rm = TRUE)
             
           },
           
           MAD = {
             
             centre <- stats::median(vector, na.rm = TRUE)
             
             (vector - centre) / 
               stats::median(abs(vector - centre), na.rm = TRUE)
             
           })
    
  }
  
  #-------------------------------------------------------------------------------#
  # If dataset is a data frame                                                    #
  #-------------------------------------------------------------------------------#
  
  if (is.data.frame(dataset)) {
    
    # Make sure the dataset is a plain data frame (e.g. not a tibble)
    standardised_dataset <- as.data.frame(x = dataset)
    
    # seq_len() copes with a zero-column data frame, unlike 1:ncol()
    for (i in seq_len(ncol(standardised_dataset))) {
      
      if (is.numeric(standardised_dataset[[i]])) {
        
        standardised_dataset[[i]] <- standardise_vector(
          vector = standardised_dataset[[i]]
        )
        
      }
      
    }
    
  #-------------------------------------------------------------------------------#
  # If dataset is a vector                                                        #
  #-------------------------------------------------------------------------------#
    
  } else if (is.vector(dataset) && 
             (is.numeric(dataset) || is.logical(dataset))) {
    
    standardised_dataset <- standardise_vector(vector = dataset)
    
  #-------------------------------------------------------------------------------#
  # Anything else cannot be standardised                                          #
  #-------------------------------------------------------------------------------#
    
  } else {
    
    stop("`dataset` must be a data frame or a numeric vector.", 
         call. = FALSE)
    
  }
  
  #-------------------------------------------------------------------------------#
  # Optionally save the standardised dataset                                      #
  #-------------------------------------------------------------------------------#
  
  if (!is.null(directory)) {
    
    utils::write.csv(x = standardised_dataset, 
                     file = file.path(directory, file_name), 
                     row.names = FALSE)
    
  }
  
  # return the standardised dataset
  return(standardised_dataset)
  
}
# oislen/BuenaVista documentation built on May 16, 2019, 8:12 p.m.