#' @title Discretize a data frame
#' @description Converts all numerical variables into factor or character, depending on 'stringsAsFactors' parameter,
#' based on equal frequency criteria. The thresholds for each segment in each variable are generated based on the
#' output of \code{\link{discretize_get_bins}} function, which returns a data frame
#' containing the threshold for each variable. This result is must be the 'data_bins' parameter input.
#' Important to note that the returned data frame contains the non-transformed variables plus the transformed ones.
#' More info about converting numerical into categorical variables
#' can be found at: \url{https://livebook.datascienceheroes.com/data-preparation.html#data_types}
#' @param data Input data frame
#' @param data_bins data frame generated by 'discretize_get_bins' function. It contains the variable name and the
#' thresholds for each bin, or segment.
#' @param stringsAsFactors Boolean variable which indicates if the discretization result is character or factor.
#' When TRUE, the segments are ordered. TRUE by default.
#' @examples
#' \dontrun{
#' # Getting the bins thresholds for each. If input is missing, will run for all numerical variables.
#' d_bins=discretize_get_bins(data=heart_disease,
#' input=c("resting_blood_pressure", "oldpeak"), n_bins=5)
#'
#' # Now it can be applied on the same data frame,
#' # or in a new one (for example in a predictive model that change data over time)
#' heart_disease_discretized=discretize_df(data=heart_disease, data_bins=d_bins, stringsAsFactors=T)
#'
#'}
#' @return Data frame with the transformed variables
#' @export
discretize_df <- function(data, data_bins, stringsAsFactors=T)
{
# Recover all the variables to discretize
vars_num=data_bins$variable
for(i in vars_num)
{
v_orig=data[[i]]
v_bins=filter(data_bins, variable==i) %>% .$cuts
v_res=dis_recover(x = v_orig, cuts = v_bins, stringsAsFactors = stringsAsFactors)
data[[i]]=v_res
}
if(stringsAsFactors)
{
data_2b=data %>% mutate_at(vars(vars_num), conv_factor)
data_3=data_2b %>% mutate_at(vars(vars_num), funs(factor(replace(., is.na(.), "NA."))))
} else {
data_3=data %>% mutate_at(vars(vars_num), funs(ifelse(is.na(.), "NA.", .)))
}
message(sprintf("Variables processed: %s", paste(vars_num, collapse = ", ")))
return(data_3)
}
#' @title Get the data frame thresholds for discretization
#' @description It takes a data frame and returns another data frame indicating the threshold for each bin (or segment)
#' in order to discretize the variable.
#' @param data Data frame source
#' @param n_bins The number of desired bins (or segments) that each variable will have.
#' @param input Vector of string containing all the variables that will be processed.
#' If empty it will run for all numerical variables that match the following condition, the number of unique values
#' must be higher than the ones defined at 'n_bins' parameter. NAs values are automatically handled by converting
#' them into another category (more info about it at
#' \url{https://livebook.datascienceheroes.com/data-preparation.html#treating-missing-values-in-numerical-variables}).
#' This function must be used with \link{discretize_df}.
#' If it is needed a different number of bins per variable, then the function must be called more than once.
#' @examples
#' \dontrun{
#' # Getting the bins thresholds for each. If input is missing, will run for all numerical variables.
#' d_bins=discretize_get_bins(data=heart_disease,
#' input=c("resting_blood_pressure", "oldpeak"),
#' n_bins=5)
#'
#' # Now it can be applied on the same data frame,
#' # or in a new one (for example in a predictive model that change data over time)
#' heart_disease_discretized=discretize_df(data=heart_disease, data_bins=d_bins, stringsAsFactors=T)
#'
#' # Checking results
#' df_status(heart_disease_discretized)
#'}
#' @return Data frame containing the thresholds or cuts to bin every variable
#' @export
discretize_get_bins <- function(data, n_bins=5, input=NULL)
{
#vars_num=df_status(data, print_results = F) %>% filter(type %in% c("integer","numeric"), unique>n_bins) %>% .$variable
stat=status(data)
vars_num = stat %>% filter(type %in% c("integer","numeric"), unique>n_bins) %>% pull(variable)
## If input then runs for all variables
if(!missing(input))
{
vars_num=vars_num[vars_num %in% input]
# Check number of unique values
vars_not_to_process = stat %>% filter(variable %in% input, unique<=n_bins) %>% pull(variable)
if(length(vars_not_to_process) > 0)
{
message(sprintf("Skipping variables to discretize (unique values <= than n_bins): '%s'",
paste(vars_not_to_process,collapse = ', ')))
}
}
if(length(vars_num)==0)
{
message("No variables to discretize")
return()
}
## Begin
d_bins=sapply(select(data, one_of(vars_num)), function(x) dis_bins(x, n_bins)) %>% as.data.frame(.)
d_bins$variable=as.character(rownames(d_bins))
d_bins=rename(d_bins, cuts='.') %>% select(variable, cuts)
d_bins$cuts=as.character(d_bins$cuts)
rownames(d_bins)=NULL
message(sprintf("Variables processed: %s", paste(vars_num, collapse = ", ")))
return(d_bins)
}
get_bins_processed <- function(x, n_bins)
{
cuts=cut2(x, g = n_bins, onlycuts = T)
# when saving the 1st element, the min, is not necesary
cuts=cuts[-1]
# the last is the max: Inf
cuts[length(cuts)]=Inf
return(cuts)
}
dis_bins <- function(x, n_bins=5)
{
cuts=get_bins_processed(x, n_bins)
res=paste(cuts, collapse = "|")
return(res)
}
dis_recover <- function(x, cuts, stringsAsFactors)
{
number_of_digits=4
oldopt = options("digits")
options(digits = number_of_digits) # number of significant decimal points
on.exit(options(oldopt))
cuts_v=as.numeric(unlist(strsplit(cuts, '[|]')))
#x[x<min(cuts_v)]=min(cuts_v)
#x[x>max(cuts_v)]=max(cuts_v)
# forcing -Inf as the min value
cuts_v=c(-Inf, cuts_v)
res=Hmisc::cut2(x = x, cuts = cuts_v, digits=number_of_digits, oneval=FALSE)
# Hack correction on "-Inf" label: [-Inf, min_value)
if("-Inf" %in% trimws(levels(res)))
{
cuts_v_aux=cuts_v[cuts_v!=-Inf]
min_value=format(min(cuts_v_aux))
levels(res)[trimws(levels(res))=="-Inf"]=sprintf("[-Inf, %s)", min_value)
}
########################
if(!stringsAsFactors)
{
res=as.character(res)
}
return(res)
}
conv_factor <- function(x)
{
levels(x)=c(levels(x), "NA.")
new_x=factor(x, levels = levels(x))
return(new_x)
}
#' @title Convert every column in a data frame to character
#' @description It converts all the variables present in 'data' to character. Criteria conversion is based on
#' two functions, \code{\link{discretize_get_bins}} plus \code{\link{discretize_df}}, which will discretize
#' all the numerical variables based on equal frequency criteria, with the number of bins equal to 'n_bins'.
#' This only applies for numerical variables which unique valuesare more than 'n_bins' parameter.
#' After this step, it may happen that variables remain non-character, so these variables will be converting
#' directly into character.
#'
#' @param data input data frame to discretize
#' @param n_bins number of bins/segments for each variable
#' @examples
#' \dontrun{
#' # before
#' df_status(heart_disease)
#'
#' # after
#' new_df=convert_df_to_categoric(data=heart_disease, n_bins=5)
#' df_status(new_df)
#' }
#' @return data frame containing all variables as character
#' @export
convert_df_to_categoric <- function(data, n_bins)
{
# Discretizing numerical variables
d_cuts=suppressMessages(discretize_get_bins(data = data, n_bins = n_bins))
data_cat=discretize_df(data = data, data_bins = d_cuts, stringsAsFactors = F)
# Converting remaining variables
data_cat_2=data_cat %>% mutate_all(as.character)
return(data_cat_2)
}
#' @title Concatenate 'N' variables
#' @description Concatenate 'N' variables using the char pipe: <|>.
#' This function is used when there is the need of measuring the mutual information and/or the information
#' gain between 'N' input variables an against a target variable. This function makes sense when it is used based on
#' categorical data.
#' @param data data frame containing the two variables to concatenate
#' @param vars character vector containing all variables to concatenate
#' @examples
#' \dontrun{
#' new_variable=concatenate_n_vars(mtcars, c("cyl", "disp"))
#' # Checking new variable
#' head(new_variable)
#' }
#' @return vector containing the concatenated values for the given variables
#' @export
concatenate_n_vars <- function(data, vars)
{
df=data %>% select(one_of(vars))
new_col=apply(df, 1, function(x) paste(x, collapse = " | ") )
return(new_col)
}
binary_gain_ratio <- function(input, target, test_points)
{
input_bin=cut2(input, cuts = test_points)
if(length(levels(input_bin))==1) return(0)
gr=gain_ratio(input_bin, target)
return(gr)
}
recursive_gr_cuts_aux <- function(input, target, fpoints, max_depth, min_n)
{
# seq(0.01, 0.99, by=0.01)
points=unique(quantile(input, probs = seq(0.2, 0.8, by=0.2)))
if(length(points)==1 | length(fpoints)>=max_depth) return(fpoints)
r=c()
for(p in points)
{
gr=binary_gain_ratio(input, target, test_points = p)
# check sample size against the total
total_left=sum(input<p)
total_right=sum(input>=p)
gr=if(total_left < min_n | total_right < min_n) 0 else gr
#
r=c(r, gr)
}
# Quality stopping criteria
#if(max(r) < 0.0099) return(fpoints)
position_max=which(max(r)==r)[1]
max_point=points[position_max];
input_left=input[input<max_point]
input_right=input[input>=max_point]
target_left=target[input<max_point]
target_right=target[input>=max_point]
if(length(input_left)>min_n & length(input_right)>min_n & length(fpoints)<=max_depth)
{
fpoints=c(fpoints, max_point);
fpoints_left= recursive_gr_cuts_aux(input=input_left, target=target_left, fpoints,max_depth, min_n)
fpoints_right=recursive_gr_cuts_aux(input=input_right, target=target_right, fpoints,max_depth, min_n)
fpoints=unique(c(fpoints_right,fpoints_left))
#message(paste(fpoints,collapse = ", "))
}
return(fpoints)
}
#' @title Variable discretization by gain ratio maximization
#' @description Discretize numeric variable by maximizing the gain ratio
#' between each bucket and the target variable.
#'
#' @param input numeric input vector to discretize
#' @param target character or factor multi-calss target variable
#' @param min_perc_bins minimum percetange of rows for each split or segment (controls the sample size), 0,1 (or 10 percent) as default
#' @param max_n_bins maximum number of bins or segments to split the input variable, 5 bins as default
#' @examples
#' \dontrun{
#' library(funModeling)
#' data=heart_disease
#' input=data$oldpeak
#' target=as.character(data$has_heart_disease)
#'
#' input2=discretize_rgr(input, target)
#'
#' # checking:
#' summary(input2)
#' }
#' @return discretized variable (factor)
#' @export
discretize_rgr <- function(input, target, min_perc_bins=0.1, max_n_bins=5)
{
fpoints=c()
max_depth=20
target=as.character(target)
min_n=round(min_perc_bins*length(input))
all_cuts=recursive_gr_cuts_aux(input, target, fpoints, max_depth, min_n)
#
max_n_bins=max_n_bins-1
fpoints_top=all_cuts[1:min(max_n_bins,length(all_cuts))]
fpoints_top_ord=fpoints_top[order(fpoints_top)]
input_bin=Hmisc::cut2(input, cuts = c(fpoints_top_ord, max(input)))
return(input_bin)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.