# R/transform_variables.R

#' @title Transforms the Numeric Variables of a given Dataset based on a skewness criterion
#'
#' @description This function transforms the numeric variables of a given dataset,
#'     optionally restricted to those whose sample skewness lies within given bounds.
#'     There are two transformation methods; log and power.
#'     Log transformation replaces a numeric variable x with log(x + 1).
#'     Power transformation replaces a numeric variable x with (x + 1)^p,
#'     e.g. p = 0.5 gives a square root and p = -1 an inverse of the shifted variable.
#'     Without the skew bounds every numeric variable is transformed.
#'     Non-numeric variables are always returned unchanged.
#'
#' @param dataset A dataset to be transformed, the dataset can have mixed types.
#'
#' @param skew_bound A numeric vector of length two representing the lower and upper
#'     skewness bounds (inclusive). Only numeric variables whose skewness lies within
#'     the bounds are transformed; variables whose skewness cannot be computed
#'     (e.g. constant or entirely missing variables) are left unchanged.
#'     Default is NULL, in which case every numeric variable is transformed.
#'
#' @param method A character object denoting the method of transformation used.
#'    One of two possible options; "log" (the default) or "power".
#'
#' @param p The power in association with the power transformation, a single number.
#'     Required when method = "power", ignored otherwise. Default is NULL.
#'
#' @return Outputs the transformed dataset, with the same class and dimensions as the input.
#'
#' @import moments
#'
#' @export
#'
#' @seealso \code{\link{remove_variables}}, \code{\link{derive_variables}}, \code{\link{extract_variables}}, \code{\link{impute_variables}}, \code{\link{standardise_variables}}
#'
#' @examples
#' # Example Data
#' x1 <- rnorm(n = 60, mean = 50, sd = 10)
#' x2 <- rpois(n = 60, lambda = 50)
#' x3 <- sample(x = 1:10, size = 60, replace = TRUE)
#' x4 <- rep(x = c("yes", "no"), times = 30)
#' x5 <- rep(x = c("high", "medium", "low"), times = 20)
#' x6 <- sample(x = c("yes", "no"), size = 60, replace = TRUE)
#' # Save as a data frame (data.frame() keeps the numeric columns numeric)
#' data <- data.frame(x1, x2, x3, x4, x5, x6)
#' # Transform all the Numeric Variables
#' transform_variables(dataset = data, method = "log")
#' # Transform only the Numeric Variables with skewness between 0 and 2
#' transform_variables(dataset = data, skew_bound = c(0, 2), method = "power", p = 0.5)
#'
transform_variables <- function(dataset,
                                skew_bound = NULL,
                                method = c("log", "power"),
                                p = NULL) {

  # Match the specified method argument with the possible options
  method <- match.arg(method)

  # Fail early with a clear message: (x + 1)^NULL yields a zero-length vector,
  # which would otherwise surface as an obscure column-assignment error.
  if (method == "power" &&
        !(is.numeric(p) && length(p) == 1L && !is.na(p))) {
    stop("`p` must be a single non-missing number when method = \"power\".",
         call. = FALSE)
  }

  if (!is.null(skew_bound) &&
        !(is.numeric(skew_bound) && length(skew_bound) > 0L &&
            !anyNA(skew_bound))) {
    stop("`skew_bound` must be NULL or a numeric vector without missing values.",
         call. = FALSE)
  }

  # Shift by one before transforming so that zeros stay finite under log;
  # guaranteed to work for non-negative data.
  shift_and_transform <- function(x) {
    switch(method,
      log = log1p(x),
      power = (x + 1)^p
    )
  }

  # TRUE when the column qualifies for transformation under the skew bounds.
  within_bounds <- function(x) {
    if (is.null(skew_bound)) {
      return(TRUE)
    }
    skew <- moments::skewness(x, na.rm = TRUE)
    # Skewness is NA/NaN for constant or all-missing columns; skip those
    # instead of letting `if` fail on a missing condition.
    !is.na(skew) && skew >= min(skew_bound) && skew <= max(skew_bound)
  }

  # seq_len() keeps the loop empty for a dataset without columns
  for (i in seq_len(ncol(dataset))) {

    # drop = TRUE extracts a plain vector from data frames and matrices alike
    # (and is expected to do the same for tibbles, where `[` otherwise keeps
    # a one-column table that is.numeric() rejects).
    column <- dataset[, i, drop = TRUE]

    if (is.numeric(column) && within_bounds(column)) {
      dataset[, i] <- shift_and_transform(column)
    }

  }

  dataset

}
# oislen/BuenaVista documentation built on May 16, 2019, 8:12 p.m.