#' @title Derive Variables from a given Dataset
#'
#' @description This function derives power, interaction or dummy variables for a given dataset.
#' The power terms are derived by raising each numeric variable in the specified dataset to a power.
#' The interaction terms are derived by multiplying the numeric variables with one another, pairwise.
#' The dummy terms are derived by generating binary terms for each level of the factor variables.
#' The resulting data frame can be saved to a specified .csv file.
#'
#' @param dataset The dataset that the variables are derived from.
#' It is converted with \code{as.data.frame} before use.
#'
#' @param y_index A single natural number giving the column position of the response variable.
#' When supplied, interaction terms are the products of every other numeric column with that column,
#' the power term is derived from that column only, and dummy terms are derived from that column only.
#' Default is NULL, in which case all columns of the dataset are considered.
#'
#' @param type The type of variables to be derived; either interaction, power or dummy.
#' Default is interaction.
#'
#' @param power A single numeric value indicating the desired power, used in conjunction with deriving power terms.
#' Default is NULL. It must be supplied when \code{type = "power"}.
#'
#' @param integer A logical object indicating whether the dummy variables should be stored as integers, used in conjunction with deriving dummy terms.
#' Alternatively the dummy variables are stored as factors.
#' Default is TRUE.
#'
#' @param return_dataset A logical object indicating whether the newly derived terms and the original terms should be returned together.
#' Alternatively, only the newly derived terms are returned.
#' For \code{type = "dummy"} the original terms are the columns returned by
#' \code{extract_variables(type = "factor", extract_not = TRUE)}.
#' Default is FALSE.
#'
#' @param file_name A character object indicating the file name when saving the data frame.
#' The default is NULL.
#' The name must include the .csv suffix. It is required whenever \code{directory} is supplied.
#'
#' @param directory A character object specifying the directory where the data frame is to be saved as a .csv file.
#' Default is NULL, in which case nothing is written.
#'
#' @return Outputs the newly derived terms as a data frame.
#' Interaction columns are named \code{"<var1>*<var2>"} and power columns \code{"<var>^<power>"}.
#' If no term can be derived (for example, no numeric columns), a data frame with zero
#' columns and the same number of rows as \code{dataset} is returned.
#'
#' @import dummy
#'
#' @export
#'
#' @seealso \code{\link{remove_variables}}, \code{\link{extract_variables}}, \code{\link{impute_variables}}, \code{\link{standardise_variables}}, \code{\link{transform_variables}}
#'
#' @keywords derive variables, interaction terms, polynomial terms, dummy variables
#'
#' @examples
#' # Example - Lung Capacity Data
#'
#' # Save the current working directory
#' dir <- getwd()
#'
#' # Initial Data Profiling
#' descriptive_statistics(dataset = lungcap, type = "numeric")
#'
#' # Derive Interaction Variables
#' derive_variables(dataset = lungcap, type = "interaction")
#' derive_variables(dataset = lungcap, type = "interaction", y_index = 1)
#' derive_variables(dataset = lungcap, type = "interaction", y_index = 1, return_dataset = TRUE)
#'
#' # Derive Power Variables
#' derive_variables(dataset = lungcap, type = "power", power = 2)
#' derive_variables(dataset = lungcap, type = "power", power = 3, y_index = 1)
#' derive_variables(dataset = lungcap, type = "power", power = 2, y_index = 1, return_dataset = TRUE)
#'
#' # Derive Dummy Variables
#' derive_variables(dataset = lungcap, type = "dummy")
#' derive_variables(dataset = lungcap, type = "dummy", integer = FALSE)
#' derive_variables(dataset = lungcap, type = "dummy", y_index = 5, return_dataset = TRUE)
#'
derive_variables <- function(dataset,
                             y_index = NULL,
                             type = c("interaction", "power", "dummy"),
                             power = NULL,
                             integer = TRUE,
                             return_dataset = FALSE,
                             file_name = NULL,
                             directory = NULL) {
  # Work on a plain data frame so that column extraction behaves uniformly
  dataset <- as.data.frame(dataset)
  # Confirm correct choice for type
  type <- match.arg(type)

  # Argument validation ----
  if (type == "power" &&
      !(is.numeric(power) && length(power) == 1L && !is.na(power))) {
    stop("`power` must be a single numeric value when type = \"power\".",
         call. = FALSE)
  }
  if (!is.null(y_index)) {
    index_ok <- is.numeric(y_index) && length(y_index) == 1L &&
      !is.na(y_index) && y_index %% 1 == 0 &&
      y_index >= 1 && y_index <= ncol(dataset)
    if (!index_ok) {
      stop("`y_index` must be a single column position between 1 and ",
           "ncol(dataset).", call. = FALSE)
    }
  }
  if (!is.null(directory) && is.null(file_name)) {
    stop("`file_name` must be supplied when `directory` is given.",
         call. = FALSE)
  }

  col_names <- colnames(dataset)
  # Derived columns are collected here and assembled into a data frame once,
  # so that nothing is returned when there is nothing to derive
  terms <- list()
  labels <- character(0)
  derived_data <- NULL

  if (is.null(y_index)) {
    # No response variable: use every column ----
    numeric_idx <- unname(which(vapply(dataset, is.numeric, logical(1))))
    if (type == "interaction") {
      # every unordered pair of numeric columns, in column order
      for (pos in seq_along(numeric_idx)) {
        i <- numeric_idx[pos]
        for (j in numeric_idx[-seq_len(pos)]) {
          k <- length(terms) + 1L
          terms[[k]] <- dataset[[i]] * dataset[[j]]
          labels[k] <- paste0(col_names[i], "*", col_names[j])
        }
      }
    } else if (type == "power") {
      for (i in numeric_idx) {
        k <- length(terms) + 1L
        terms[[k]] <- dataset[[i]]^power
        # label with the value of power, not the caller's expression
        labels[k] <- paste(col_names[i], as.character(power), sep = "^")
      }
    } else {
      # first separate out the categorical variables from the dataset
      factor_data <- extract_variables(dataset = dataset,
                                       type = "factor")
      # dummy encode the categorical variables
      derived_data <- dummy(x = factor_data,
                            int = integer)
      # keep the remaining columns as dataset for the optional combination
      dataset <- extract_variables(dataset = dataset,
                                   type = "factor",
                                   extract_not = TRUE)
      # Future Notes: remove unary variables and linear combinations
    }
  } else {
    # Response variable given: derive relative to that column ----
    y_values <- dataset[[y_index]]
    y_name <- col_names[y_index]
    # drop = FALSE keeps a data frame even when a single column remains
    test_data <- dataset[, -y_index, drop = FALSE]
    if (type == "interaction") {
      if (is.numeric(y_values)) {
        test_names <- colnames(test_data)
        for (i in unname(which(vapply(test_data, is.numeric, logical(1))))) {
          k <- length(terms) + 1L
          terms[[k]] <- test_data[[i]] * y_values
          labels[k] <- paste0(test_names[i], "*", y_name)
        }
      }
    } else if (type == "power") {
      if (is.numeric(y_values)) {
        terms[[1L]] <- y_values^power
        labels[1L] <- paste(y_name, as.character(power), sep = "^")
      }
    } else {
      # keep the column name so the dummy terms are labelled after it
      factor_data <- dataset[, y_index, drop = FALSE]
      # dummy encode the categorical variable
      derived_data <- dummy(x = factor_data,
                            int = integer)
      # keep the remaining columns as dataset for the optional combination
      dataset <- extract_variables(dataset = test_data,
                                   type = "factor",
                                   extract_not = TRUE)
      # Future Notes: remove unary variables and linear combinations
    }
  }

  # Assemble the interaction / power terms into a data frame
  if (type != "dummy") {
    if (length(terms) > 0L) {
      names(terms) <- labels
      derived_data <- data.frame(terms,
                                 check.names = FALSE,
                                 stringsAsFactors = FALSE)
      # set explicitly so labels such as "a*b" are never altered
      names(derived_data) <- labels
    } else {
      derived_data <- as.data.frame(matrix(nrow = nrow(dataset), ncol = 0L))
    }
  }

  # Return the original dataset too
  if (isTRUE(return_dataset)) {
    derived_data <- as.data.frame(cbind(dataset, derived_data))
  }

  # write the results to the specified directory
  if (!is.null(directory)) {
    utils::write.csv(x = derived_data,
                     file = file.path(directory, file_name),
                     row.names = FALSE)
  }

  # return the derived data
  derived_data
}
# Add the following code to your website.
# For more information on customizing the embed code, read Embedding Snippets.