#' @title Recategorization of Factor Variables by Decision Tree Leaves
#' @description The function takes in a data set that contains categorical variable(s) and a response variable.
#' It creates a decision tree by using one of the categorical variables (class factor) and the response variable.
#' The decision tree is created from the rpart() function from the 'rpart' package
#' The rules from the leaves of the decision tree are extracted, and used to recategorize the appropriate categorical variable (predictor).
#' This step is performed for each of the categorical (class factor) variables that is fed into the data component of the function.
#' Only variables containing more than 2 factors will be considered in the function.
#' The final output generates a data set containing the recategorized variables or a list containing a mapping table
#' for each of the candidate variables.
#' @param data A data.frame.
#' @param y The response variables to be used in the rpart() function.
#' @param bin.nm The string that will be used to categorize the variables. The default "Group." will be assigned.
#' E.g. If a variable of 6 factors is recategorized into 3 factors, then setting bin.name equal to "Group." will
#' name the three new factors to "Group.1", "Group.2", and "Group.3"
#' @param method This is the method that will be used in the rpart() function.
#' If null, the default method will be used. See rpart() for further detail.
#' @param control This is the control that will be used in the rpart() function. The user has 3 options,
#' one of which is the default selected control by the rpart() function. The remaining two option are:
#' 1) Specity a cp value which will prune each decision tree by the specified value or
#' 2) Specity a two-dimensional data.frame() that contains the variable name(s) as identified in the data component for the first column
#' and the respective cp of each variable in the second column.
#' Variable(s) not included in this data.frame() will use the cp generated by the rpart() function.
#' See rpart() and rpart.control() for further detail.
#' @param return This is what the function will return. There are three options:
#' 1) new.fctrs - will provide a data.frame with the recategorized categorical variables.
#' 2) lkup.list - will provide a list of lookup tables. Each element will contain the original to new mapping for each recategorized variable.
#' 3) both - it will return both: the new.fctrs and lkup.list objects.
#' @export
#' @import dplyr
#' @importFrom rpart rpart
#' @importFrom rpart rpart.control
#' @importFrom rpart.utils rpart.lists
#' @importFrom data.table as.data.table
#' @importFrom data.table setnames
#' @keywords rpart, factor, relevel
#' @seealso \code{\link[tree.bins]{bin.oth}}, \code{\link[rpart]{rpart}}, \code{\link[rpart]{rpart.control}},\code{\link[rpart.utils]{rpart.lists}}
#' @return NULL
#' @examples
#' #Returns a data.frame of recategorized variables
#' library(rpart)
#' sample.df <- AmesImpFctrs[, c("Neighborhood", "MS.Zoning", "SalePrice")]
#' tree.bins(data = sample.df, y = SalePrice)
#'
#' #Returns a list of mapping tables generated from tree.bins()
#' tree.bins(data = sample.df, y = SalePrice, return = "lkup.list")
#'
#' #Allows the user to choose the naming convention for the attribute naming convention
#' tree.bins(data = sample.df, y = SalePrice, bin.nm = "bin#")
#'
#' #Allows user to manually assign a cp to each decision tree evaluated in rpart()
#' tree.bins(data = sample.df, y = SalePrice, control = rpart.control(cp = .01))
#'
#' #Allows user to manually assign a cp to specified variables
#' demo.df <- data.frame(Variables = c("Neighborhood", "MS.Zoning"), CP = c(.001, .2))
#' tree.bins(data = sample.df, y = SalePrice, control = demo.df)
## quiets concerns of R CMD check re: the .'s that appear in pipelines
#if(getRversion() >= "2.15.1") utils::globalVariables(c("."))
tree.bins <- function(data, y, bin.nm = "Group.",
method = NULL, control = NULL, return = "new.fctrs") {
#Updating Code to remove "no visible binding for global variable" Notes in R check
#if(getRversion() >= "2.15.1") utils::globalVariables(names = c("tree.df$Names", "tree.df$Number"), package = "tree.bin")
#Allowing y to pass in to the rpart function.
colnm <- eval(substitute(y),data, parent.frame())
#Placing the y factor to the end of the table so it will not be valuated.
only.y <- data[ ,deparse(substitute(y)), drop = FALSE]
if(dim(data)[2] == 2) {
only.fctrs <- data[ , !(names(data) %in% names(only.y)), drop = FALSE] %>% as.data.frame()
}else{
only.fctrs <- data[ , !(names(data) %in% names(only.y))] %>% as.data.frame()
}
#Removing non-factor variables.
only.fctrs.final = select_if(only.fctrs, is.factor)
#Joining only factor variables and the response.
#The response will be the last variable of the data.frame.
df = cbind(only.fctrs.final, only.y)
if (exists("new.fctrs", inherits = TRUE)) {
rm(new.fctrs, envir = environment(), inherits = TRUE)
#rm(new.fctrs)
}
#Initiating a list variable for a loop to pass through and store lookup tables.
lkup.list <- list()
#Creating for loop to create a decision tree for each variable.
#Subtracting 1 so it will not evaluate the last variable => y the predictor.
for (i in 1:(dim(df)[2]-1)) {
#Making sure to remove any object that is final.groups
#Final.groups is generated running the for loop, so need to remove this
#at the beginning of each for loop. Final.groups is stored into a list
#at the end of each loops cycle.
if (exists("final.groups", inherits = TRUE)) {
rm(final.groups, envir = environment(), inherits = TRUE)
#rm(final.groups, envir = .GlobalEnv)
}
#Assessing which type of rpart part to run based off of user inputs.
if (is.null(method) & is.null(control)) {
tree = rpart(formula = colnm ~ df[,i], data = df)
tree.rules = unlist(rpart.lists(tree))
} else if (!is.null(method) & is.null(control)){
tree = rpart(formula = colnm ~ df[,i], data = df, method = method)
tree.rules = unlist(rpart.lists(tree))
} else if (is.null(method) & !is.data.frame(control) & !is.null(control)){
tree = rpart(formula = colnm ~ df[,i], data = df, control = control)
tree.rules = unlist(rpart.lists(tree))
} else if (!is.null(method) & !is.data.frame(control) & !is.null(control)){
tree = rpart(formula = colnm ~ df[,i], data = df, method = method, control = control)
tree.rules = unlist(rpart.lists(tree))
} else if (is.null(method) & is.data.frame(control) & (names(df[,i, drop = F]) %in% as.character(control[,1]))){
tree = rpart(formula = colnm ~ df[,i], data = df,
control = rpart.control(cp = control[control == as.character(names(df[, i, drop = F])), 2]))
tree.rules = unlist(rpart.lists(tree))
} else if (!is.null(method) & is.data.frame(control) & (names(df[,i, drop = F]) %in% as.character(control[,1]))){
tree = rpart(formula = colnm ~ df[,i], data = df, method = method,
control = rpart.control(cp = control[control == as.character(names(df[, i, drop = F])), 2]))
tree.rules = unlist(rpart.lists(tree))
} else if (is.null(method) & is.data.frame(control) & !(names(df[,i, drop = F]) %in% as.character(control[,1]))){
tree = rpart(formula = colnm ~ df[,i], data = df)
tree.rules = unlist(rpart.lists(tree))
} else if (!is.null(method) & is.data.frame(control) & !(names(df[,i, drop = F]) %in% as.character(control[,1]))){
tree = rpart(formula = colnm ~ df[,i], data = df, method = method)
tree.rules = unlist(rpart.lists(tree))
}
#Assessing criteria on whether or not a particular variable meets the criteria for recategorization.
#The variable must be a factor, cannot have only 2 levels, and must contain leaves. Variables with
#only 1 level will be caught by the "must contain leaves" rule.
if (is.factor(df[,i]) & length(levels(df[, i])) != 2 & !is.null(c(names(tree.rules)))) {
#In order to remove the "No visible binding for global variable" Note
Names <- Values <- Number <- Cuts <- Categories <- NULL
#Creating a df of the tree.rules
tree.df <- data_frame(Names = c(names(tree.rules)),
Values = unlist(tree.rules),
#LR = ifelse(grepl("R.", Names), "R.", "L."), # Not needed for code to run
#VarName = paste(names(df[,i, drop = FALSE])), # Not needed for code ro run
Number = ifelse(sub(".*]", "", Names) != "", sub(".*]", "", Names), 0) %>% as.integer())
#Creating an identifier for each split. "End.Group" is where the group will end.
tree.df.v2 <- tree.df %>%
mutate(Cuts = ifelse(is.na(lag(Number)) | lag(Number) < Number, "Continue", "End.Group"),
Cuts = lead(Cuts),
Cuts = ifelse(Number == 0 | is.na(Cuts), "End.Group", Cuts),
Index = seq(1:nrow(tree.df)))
#Extracting the Index number for each End.Group to start creating character arrays.
tree.df.v3 <- tree.df.v2 %>%
filter(Cuts == "End.Group")
#Creating subsets of tree.df.v2 through the Index column and creating
#combined character string vectors for each group to store in a list
#(Group defined by beginning after an "End.Group" in the "Cuts" column).
#Initiating a list variable for a loop to pass through and string vectors.
string.list <- list()
for (j in 1:length(tree.df.v3$Index)) {
if(j ==1) {
strings = data.frame(Values = c(tree.df.v2[j:tree.df.v3$Index[j],]$Values),
Strings = c(tree.df.v2[j:tree.df.v3$Index[j],]$Cuts),
Index = tree.df.v3$Index[j], stringsAsFactors = FALSE)
} else{
strings = data.frame(Values = c(tree.df.v2[(tree.df.v3$Index[j - 1] + 1):tree.df.v3$Index[j],]$Values),
Strings = c(tree.df.v2[(tree.df.v3$Index[j - 1] + 1):tree.df.v3$Index[j],]$Cuts),
Index = tree.df.v3$Index[j], stringsAsFactors = FALSE)
}
string.list[[j]] <- strings
}
#Looping for each element of the list through each of the other element
#in the list to identify final groupings
for (k in 1:length(string.list)) {
#Create list to store checks
check.list <- list()
for (m in 1:length(string.list)) {
#Checks to make sure the "Value" column of the kth element of the list is within each
#element of the list, IF that element contains at least one value in its "Value" column
#that matches any one of the kth element's values from its "Value" column
if (all((string.list[[k]][, 1] %in% string.list[[m]][, 1] ) == TRUE) &
sum((string.list[[k]][, 1] %in% string.list[[m]][, 1]) == TRUE) >= 1) {
check.list[[m]] <- "True"
#Evaluates whether or not kth element vector is within the mth element. If no strings are present,
#then it should not be considered into the True/False evaluation and returns NULL instead.
} else if (sum((string.list[[k]][, 1] %in% string.list[[m]][, 1]) == TRUE) == 0) {
check.list[[m]] <- "NULL"
}else{
check.list[[m]] <- "FALSE"
}
}
#Remove NULL values from the list so it will not be considered for True/False evaluation
list.no.null = check.list[!check.list == "NULL"]
#Creates a data.frame that indicates whether the string is a group or not
if (all(list.no.null == "True")) {
string.list[[k]] <- cbind(string.list[[k]],
data.frame(GroupOrNot = rep("Yes", times = nrow(string.list[[k]])), stringsAsFactors = FALSE))
} else{
string.list[[k]] <- cbind(string.list[[k]],
data.frame(GroupOrNot = rep("No", times = nrow(string.list[[k]])), stringsAsFactors = FALSE))
}
}
#Checking for strings that should be grouped and overwritting those that shouldn't with NULL
for (n in 1:length(string.list)) {
if(all(string.list[[n]]$GroupOrNot == "No")){
string.list[[n]] <- "NULL"
}
}
#Remove NULL values from the list so it will not be considered in the final groupings
final.string.list = string.list[!string.list == "NULL"]
#Creating category names
for (p in 1:length(final.string.list)) {
final.string.list[[p]] <- final.string.list[[p]] %>%
mutate(Categories = paste0(bin.nm, p))
}
#Converting list into a data.frame
for (q in 1:length(final.string.list)) {
if (exists("final.groups")){
final.groups <- rbind(final.groups, final.string.list[[q]])
}
if (!exists("final.groups")){
final.groups <- rbind(final.string.list[[q]])
}
}
#Creating a lookup data.frames to save as a list to reference lookup tables.
lkup.df <- final.groups %>% select(Values, Categories)
#Create variables to assign and filter data.frames by.
nm = names(df[i])
#Correctly names the lookup data.frames
colnames(lkup.df) = c(nm, "Categories")
#Store the values of each data.frame into a list.
lkup.list[[i]] <- lkup.df
#Create a holder for data to avoid manipulation of the data variable.
#Will be used to convert factor level variables that are considered for
#recategorizing into character level variables.
if(!exists("chr.fctrs")){
chr.fctrs <- data
}
#Converts the considered variable into a character.
#Left joins by matching the considered variable's manipulated
#data.frame with the lookup table. The manipulated data.frame is created
#and adjusted by each variable that is considered. The manipulated data.frame
#is the final data.frame that produces the recategorized variables. The "Categories"
#column is added onto the manipulated data.frame. The original variable
#is removed and "Categories" is renamed into the original variable's name.
if(exists("new.fctrs")) {
new.fctrs <- as.data.frame(new.fctrs)
new.fctrs[, nm] <- as.character(new.fctrs[, nm])
join.fctrs <- new.fctrs %>% left_join(lkup.df, by = paste(nm))
new.fctrs <- join.fctrs[ , !(names(join.fctrs) %in% nm)] %>% as.data.table() %>% setnames("Categories", paste0(nm))
}
#Converts the considered variable into a character.
#Left joins by matching the considered variable's original
#data.frame with the lookup table. The "Categories" column is
#added onto it the original data.frame. The original variable
#is removed and "Categories" is renamed into the original variable's name.
if(!exists("new.fctrs")){
chr.fctrs[,nm] <- as.character(chr.fctrs[, nm])
join.fctrs <- chr.fctrs %>% left_join(lkup.df, by = paste(nm))
new.fctrs <- join.fctrs[ , !(names(join.fctrs) %in% nm)] %>% as.data.table() %>% setnames("Categories", paste0(nm))
}
}
}
#Allows the user to return either the recategorized data.frame by selecting "new.fctrs"
#or the list containing all of the lookup tables by selecting "lkup.list"
#or both by selecting "both."
output = list()
output$new.fctrs = new.fctrs
output$lkup.list = lkup.list
if(return == "both"){
return(output)
}else if(return == "new.fctrs"){
return(new.fctrs)
} else{
return(lkup.list)
}
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.