# R/derive_variables.R

#' @title Derive Variables from a given Dataset
#' 
#' @description This function derives power, interaction or dummy variables for a given dataset.
#' The power terms are derived by raising each numeric variable in the specified dataset to a power.
#' The interaction terms are derived by multiplying the numeric variables with one another, pair by pair.
#' The dummy terms are derived by generating binary terms for each level of the factor variables.
#' The resulting data frame can be saved to a specified dataset.
#' 
#' @param dataset The dataset that the variables are derived from.
#' 
#' @param y_index A natural number giving the column index of the response variable of the dataset that will be used in the derivation of new variables.
#' Default is NULL.
#' 
#' @param type The type of variables to be derived; either dummy, interaction or power.
#' Default is interaction.
#' 
#' @param power A numeric value indicating the desired power, used in conjunction with deriving power terms.
#' Default is NULL.
#' 
#' @param integer A logical object indicating whether the dummy variables should be stored as integers, used in conjunction with deriving dummy terms.
#' Alternatively the dummy variables are stored as factors.
#' Default is TRUE.
#' 
#' @param return_dataset A logical object indicating whether the newly derived terms and the original terms should be returned together.
#' Alternatively, only the newly derived terms are returned.
#' Default is FALSE.
#' 
#' @param file_name A character object indicating the file name when saving the data frame.
#' The default is NULL.
#' The name must include the .csv suffix.
#'
#' @param directory A character object specifying the directory where the data frame is to be saved as a .csv file.
#' Default is NULL.
#'
#' @return Outputs the newly derived terms as a data frame
#' 
#' @import dummy
#'
#' @export
#' 
#' @seealso \code{\link{remove_variables}}, \code{\link{extract_variables}}, \code{\link{impute_variables}}, \code{\link{standardise_variables}}, \code{\link{transform_variables}}
#' 
#' @keywords derive variables, interaction terms, polynomial terms, dummy variables
#' 
#' @examples 
#' # Example - Lung Capacity Data
#' 
#' # Save the current working directory
#' dir <- getwd()
#' 
#' # Initial Data Profiling
#' descriptive_statistics(dataset = lungcap, type = "numeric")
#' 
#' # Derive Interaction Variables 
#' derive_variables(dataset = lungcap, type = "interaction")
#' derive_variables(dataset = lungcap, type = "interaction", y_index = 1)
#' derive_variables(dataset = lungcap, type = "interaction", y_index = 1, return_dataset = TRUE)
#' 
#' # Derive Power Variables
#' derive_variables(dataset = lungcap, type = "power", power = 2)
#' derive_variables(dataset = lungcap, type = "power", power = 3, y_index = 1)
#' derive_variables(dataset = lungcap, type = "power", power = 2, y_index = 1, return_dataset = TRUE)
#' 
#' # Derive Dummy Variables
#' derive_variables(dataset = lungcap, type = "dummy")
#' derive_variables(dataset = lungcap, type = "dummy", integer = FALSE)
#' derive_variables(dataset = lungcap, type = "dummy", y_index = 5, return_dataset = TRUE)
#' 
derive_variables <- function(dataset,
                             y_index = NULL,
                             type = c("interaction", "power", "dummy"),
                             power = NULL,
                             integer = TRUE,
                             return_dataset = FALSE,
                             file_name = NULL,
                             directory = NULL) {

  # Derive interaction, power or dummy variables from `dataset`.
  #
  # * interaction, no y_index : product of every pair of numeric columns.
  # * interaction, y_index    : product of every other numeric column with
  #                             the (numeric) column at `y_index`.
  # * power, no y_index       : every numeric column raised to `power`.
  # * power, y_index          : only the (numeric) column at `y_index`.
  # * dummy, no y_index       : dummy-encode the columns returned by
  #                             extract_variables(type = "factor").
  # * dummy, y_index          : dummy-encode only the column(s) at `y_index`.
  #
  # Returns a data frame of the derived terms (preceded by the original
  # columns when `return_dataset` is TRUE) and, when `directory` is given,
  # also writes that data frame to `directory/file_name` as a .csv file.
  # If nothing can be derived, the derived part has zero columns.

  # The text of the expression supplied for `power` is used as the suffix of
  # the derived column names (e.g. "x^2"), so capture it up front.
  power_label <- paste(deparse(substitute(power)), collapse = "")

  # Convert the dataset to a data frame and confirm the choice of type
  dataset <- as.data.frame(dataset)
  type <- match.arg(type)

  n_rows <- nrow(dataset)
  n_cols <- ncol(dataset)

  #--------------------------------------------------------------------------#
  # Argument validation                                                      #
  #--------------------------------------------------------------------------#

  if (!is.null(y_index)) {

    index_ok <- is.numeric(y_index) &&
      length(y_index) >= 1L &&
      !anyNA(y_index) &&
      all(y_index >= 1 & y_index <= n_cols)

    if (!index_ok) {
      stop("`y_index` must be a column index between 1 and ", n_cols, ".",
           call. = FALSE)
    }

    # Interaction and power terms are built from exactly one reference column
    if (type != "dummy" && length(y_index) != 1L) {
      stop("`y_index` must be a single column index when `type = \"", type,
           "\"`.", call. = FALSE)
    }

    y_index <- as.integer(y_index)

  }

  if (type == "power" && (!is.numeric(power) || length(power) == 0L)) {
    stop("`power` must be a numeric value when `type = \"power\"`.",
         call. = FALSE)
  }

  if (!is.null(directory) && is.null(file_name)) {
    stop("`file_name` must be supplied when `directory` is given.",
         call. = FALSE)
  }

  #--------------------------------------------------------------------------#
  # Derive the new terms                                                     #
  #--------------------------------------------------------------------------#

  if (type == "dummy") {

    if (is.null(y_index)) {

      # separate out the categorical variables from the dataset
      factor_data <- extract_variables(dataset = dataset,
                                       type = "factor")
      remaining <- dataset

    } else {

      # drop = FALSE keeps a data frame (and the column name) even when a
      # single column is selected or only one column is left over
      factor_data <- dataset[, y_index, drop = FALSE]
      remaining <- dataset[, -y_index, drop = FALSE]

    }

    # dummy encode the categorical variables
    derived_data <- dummy(x = factor_data,
                          int = integer)

    # keep the non-factor data as dataset for the optional combination below
    dataset <- extract_variables(dataset = remaining,
                                 type = "factor",
                                 extract_not = TRUE)

    # Future Notes: remove unary variables and linear combinations

  } else {

    col_names <- colnames(dataset)
    is_num <- vapply(dataset, is.numeric, logical(1))
    num_idx <- unname(which(is_num))

    # derived columns and their names, collected in derivation order
    term_values <- list()
    term_names <- character(0)

    if (type == "interaction") {

      if (is.null(y_index)) {

        # every unordered pair (i, j), i < j, of numeric columns
        for (a in seq_along(num_idx)) {
          for (b in seq_along(num_idx)[-seq_len(a)]) {
            i <- num_idx[a]
            j <- num_idx[b]
            term_values[[length(term_values) + 1L]] <-
              dataset[[i]] * dataset[[j]]
            term_names <- c(term_names,
                            paste0(col_names[i], "*", col_names[j]))
          }
        }

      } else if (is_num[y_index]) {

        # every other numeric column multiplied by the column at y_index
        for (i in num_idx[num_idx != y_index]) {
          term_values[[length(term_values) + 1L]] <-
            dataset[[i]] * dataset[[y_index]]
          term_names <- c(term_names,
                          paste0(col_names[i], "*", col_names[y_index]))
        }

      }

    } else {

      # type == "power": all numeric columns, or only the column at y_index
      if (is.null(y_index)) {
        targets <- num_idx
      } else if (is_num[y_index]) {
        targets <- y_index
      } else {
        targets <- integer(0)
      }

      for (i in targets) {
        term_values[[length(term_values) + 1L]] <- dataset[[i]]^power
        term_names <- c(term_names,
                        paste(col_names[i], power_label, sep = "^"))
      }

    }

    # Assemble the derived terms; start from a zero-column frame so that no
    # placeholder column is returned when nothing could be derived. Names are
    # assigned afterwards so that "*" and "^" are kept verbatim.
    derived_data <- data.frame(row.names = seq_len(n_rows))
    for (k in seq_along(term_values)) {
      derived_data[[k]] <- term_values[[k]]
    }
    names(derived_data) <- term_names

  }

  #--------------------------------------------------------------------------#
  # Optional combination with the original data and optional export          #
  #--------------------------------------------------------------------------#

  # Return the original dataset too
  if (isTRUE(return_dataset)) {
    if (ncol(derived_data) > 0L) {
      derived_data <- as.data.frame(cbind(dataset, derived_data))
    } else {
      derived_data <- dataset
    }
  }

  # write the results to the specified directory
  if (!is.null(directory)) {
    utils::write.csv(x = derived_data,
                     file = file.path(directory, file_name),
                     row.names = FALSE)
  }

  # return the derived data
  derived_data

}
# oislen/BuenaVista documentation built on May 16, 2019, 8:12 p.m.