funModeling: Exploratory Data Analysis and Data Preparation Tool-Box

Documented in discretize_df

#' @title Discretize a data frame
#' @description Converts all numerical variables into factor or character, depending on 'stringsAsFactors' parameter,
#'  based on equal frequency criteria. The thresholds for each segment in each variable are generated based on the
#'  output of \code{\link{discretize_get_bins}} function, which returns a data frame
#' containing the threshold for each variable. This result is must be the 'data_bins' parameter input.
#' Important to note that the returned data frame contains the non-transformed variables plus the transformed ones.
#' More info about converting numerical into categorical variables
#' can be found at: \url{https://livebook.datascienceheroes.com/data-preparation.html#data_types}
#' @param data Input data frame
#' @param data_bins data frame generated by 'discretize_get_bins' function. It contains the variable name and the
#' thresholds for each bin, or segment.
#' @param stringsAsFactors Boolean variable which indicates if the discretization result is character or factor.
#' When TRUE, the segments are ordered. TRUE by default.
#' @examples
#' \dontrun{
#' # Getting the bins thresholds for each. If input is missing, will run for all numerical variables.
#' d_bins=discretize_get_bins(data=heart_disease,
#' input=c("resting_blood_pressure", "oldpeak"), n_bins=5)
#'
#' # Now it can be applied on the same data frame,
#' # or in a new one (for example in a predictive model that change data over time)
#' heart_disease_discretized=discretize_df(data=heart_disease, data_bins=d_bins, stringsAsFactors=T)
#'
#'}
#' @return Data frame with the transformed variables
#' @export
discretize_df <- function(data, data_bins, stringsAsFactors=T)
{
	# Recover all the variables to discretize
	vars_num=data_bins$variable

	for(i in vars_num)
	{
		v_orig=data[[i]]
		v_bins=filter(data_bins, variable==i)  %>% .$cuts
		v_res=dis_recover(x = v_orig, cuts = v_bins, stringsAsFactors = stringsAsFactors)

		data[[i]]=v_res
	}

	if(stringsAsFactors)
	{
		data_2b=data %>% mutate_at(vars(vars_num), conv_factor)
		data_3=data_2b %>% mutate_at(vars(vars_num), funs(factor(replace(., is.na(.), "NA."))))
	} else {
		data_3=data %>% mutate_at(vars(vars_num), funs(ifelse(is.na(.), "NA.", .)))
	}

	message(sprintf("Variables processed: %s", paste(vars_num, collapse = ", ")))

	return(data_3)
}

#' @title Get the data frame thresholds for discretization
#' @description It takes a data frame and returns another data frame indicating the threshold for each bin (or segment)
#' in order to discretize the variable.
#' @param data Data frame source
#' @param n_bins The number of desired bins (or segments) that each variable will have.
#' @param input Vector of string containing all the variables that will be processed.
#'  If empty it will run for all numerical variables that match the following condition, the number of unique values
#'  must be higher than the ones defined at 'n_bins' parameter. NAs values are automatically handled by converting
#'  them into another category (more info about it at
#'  \url{https://livebook.datascienceheroes.com/data-preparation.html#treating-missing-values-in-numerical-variables}).
#'  This function must be used with \link{discretize_df}.
#'  If it is needed a different number of bins per variable, then the function must be called more than once.
#' @examples
#' \dontrun{
#' # Getting the bins thresholds for each. If input is missing, will run for all numerical variables.
#' d_bins=discretize_get_bins(data=heart_disease,
#'                            input=c("resting_blood_pressure", "oldpeak"),
#'                            n_bins=5)
#'
#' # Now it can be applied on the same data frame,
#' # or in a new one (for example in a predictive model that change data over time)
#'  heart_disease_discretized=discretize_df(data=heart_disease, data_bins=d_bins, stringsAsFactors=T)
#'
#' # Checking results
#' df_status(heart_disease_discretized)
#'}
#' @return Data frame containing the thresholds or cuts to bin every variable
#' @export
discretize_get_bins <- function(data, n_bins=5, input=NULL)
{
	#vars_num=df_status(data, print_results = F) %>% filter(type %in% c("integer","numeric"), unique>n_bins) %>% .$variable
	stat=status(data)
	vars_num = stat %>% filter(type %in% c("integer","numeric"), unique>n_bins) %>% pull(variable)

	## If input then runs for all variables
	if(!missing(input))
	{
		vars_num=vars_num[vars_num %in% input]

		# Check number of unique values
		vars_not_to_process = stat %>% filter(variable %in% input, unique<=n_bins) %>% pull(variable)
		if(length(vars_not_to_process) > 0)
		{
			message(sprintf("Skipping variables to discretize (unique values <= than n_bins): '%s'",
											paste(vars_not_to_process,collapse = ', ')))
		}
	}

	if(length(vars_num)==0)
	{
		message("No variables to discretize")
		return()
	}

	## Begin
	d_bins=sapply(select(data, one_of(vars_num)), function(x) dis_bins(x, n_bins)) %>% as.data.frame(.)
	d_bins$variable=as.character(rownames(d_bins))
	d_bins=rename(d_bins, cuts='.') %>% select(variable, cuts)
	d_bins$cuts=as.character(d_bins$cuts)
	rownames(d_bins)=NULL

	message(sprintf("Variables processed: %s", paste(vars_num, collapse = ", ")))

	return(d_bins)
}


get_bins_processed <- function(x, n_bins)
{
	cuts=cut2(x, g = n_bins, onlycuts = T)

	# when saving the 1st element, the min, is not necesary
	cuts=cuts[-1]

	# the last is the max: Inf
	cuts[length(cuts)]=Inf

	return(cuts)
}


dis_bins <- function(x, n_bins=5)
{
	cuts=get_bins_processed(x, n_bins)
	res=paste(cuts, collapse = "|")

	return(res)
}


dis_recover <- function(x, cuts, stringsAsFactors)
{
	number_of_digits=4
	oldopt = options("digits")
	options(digits = number_of_digits) # number of significant decimal points
	on.exit(options(oldopt))

	cuts_v=as.numeric(unlist(strsplit(cuts, '[|]')))

	#x[x<min(cuts_v)]=min(cuts_v)
	#x[x>max(cuts_v)]=max(cuts_v)

	# forcing -Inf as the min value
	cuts_v=c(-Inf, cuts_v)

	res=Hmisc::cut2(x = x, cuts = cuts_v, digits=number_of_digits, oneval=FALSE)

	# Hack correction on "-Inf" label: [-Inf, min_value)
	if("-Inf" %in% trimws(levels(res)))
	{
		cuts_v_aux=cuts_v[cuts_v!=-Inf]
		min_value=format(min(cuts_v_aux))
		levels(res)[trimws(levels(res))=="-Inf"]=sprintf("[-Inf, %s)", min_value)
	}
	########################


	if(!stringsAsFactors)
	{
		res=as.character(res)
	}

	return(res)
}

conv_factor <- function(x)
{
	levels(x)=c(levels(x), "NA.")
	new_x=factor(x, levels = levels(x))

	return(new_x)
}


#' @title Convert every column in a data frame to character
#' @description It converts all the variables present in 'data' to character. Criteria conversion is based on
#' two functions, \code{\link{discretize_get_bins}} plus \code{\link{discretize_df}}, which will discretize
#' all the numerical variables based on equal frequency criteria, with the number of bins equal to 'n_bins'.
#' This only applies for numerical variables which unique valuesare more than 'n_bins' parameter.
#' After this step, it may happen that variables remain non-character, so these variables will be converting
#' directly into character.
#'
#' @param data input data frame to discretize
#' @param n_bins number of bins/segments for each variable
#' @examples
#' \dontrun{
#' # before
#' df_status(heart_disease)
#'
#' # after
#' new_df=convert_df_to_categoric(data=heart_disease, n_bins=5)
#' df_status(new_df)
#' }
#' @return data frame containing all variables as character
#' @export
convert_df_to_categoric <- function(data, n_bins)
{
	# Discretizing numerical variables
	d_cuts=suppressMessages(discretize_get_bins(data = data, n_bins = n_bins))
	data_cat=discretize_df(data = data, data_bins = d_cuts, stringsAsFactors = F)

	# Converting remaining variables
	data_cat_2=data_cat %>% mutate_all(as.character)

	return(data_cat_2)
}

#' @title Concatenate 'N' variables
#' @description Concatenate 'N' variables using the char pipe: <|>.
#' This function is used when there is the need of measuring the mutual information and/or the information
#' gain between 'N' input variables an against a target variable. This function makes sense when it is used based on
#' categorical data.
#' @param data data frame containing the two variables to concatenate
#' @param vars character vector containing all variables to concatenate
#' @examples
#' \dontrun{
#' new_variable=concatenate_n_vars(mtcars, c("cyl", "disp"))
#' # Checking new variable
#' head(new_variable)
#' }
#' @return vector containing the concatenated values for the given variables
#' @export
concatenate_n_vars <- function(data, vars)
{
	df=data %>% select(one_of(vars))

	new_col=apply(df, 1, function(x) paste(x, collapse = " | ") )

	return(new_col)
}

binary_gain_ratio <- function(input, target, test_points)
{
	input_bin=cut2(input, cuts = test_points)
	if(length(levels(input_bin))==1) return(0)

	gr=gain_ratio(input_bin,  target)

	return(gr)
}



recursive_gr_cuts_aux <- function(input, target, fpoints, max_depth, min_n)
{
	# seq(0.01, 0.99, by=0.01)
	points=unique(quantile(input, probs = seq(0.2, 0.8, by=0.2)))
	if(length(points)==1 | length(fpoints)>=max_depth) return(fpoints)

	r=c()
	for(p in points)
	{
		gr=binary_gain_ratio(input, target, test_points = p)

		# check sample size against the total
		total_left=sum(input<p)
		total_right=sum(input>=p)
		gr=if(total_left < min_n | total_right < min_n) 0 else gr

		#
		r=c(r, gr)
	}

	# Quality stopping criteria
	#if(max(r) < 0.0099) return(fpoints)

	position_max=which(max(r)==r)[1]
	max_point=points[position_max];

	input_left=input[input<max_point]
	input_right=input[input>=max_point]
	target_left=target[input<max_point]
	target_right=target[input>=max_point]

	if(length(input_left)>min_n & length(input_right)>min_n & length(fpoints)<=max_depth)
	{
		fpoints=c(fpoints, max_point);
		fpoints_left= recursive_gr_cuts_aux(input=input_left,  target=target_left, fpoints,max_depth, min_n)
		fpoints_right=recursive_gr_cuts_aux(input=input_right, target=target_right, fpoints,max_depth, min_n)
		fpoints=unique(c(fpoints_right,fpoints_left))

		#message(paste(fpoints,collapse = ", "))
	}

	return(fpoints)
}




#' @title Variable discretization by gain ratio maximization
#' @description Discretize numeric variable by maximizing the gain ratio
#' between each bucket and the target variable.
#'
#' @param input numeric input vector to discretize
#' @param target character or factor multi-calss target variable
#' @param min_perc_bins minimum percetange of rows for each split or segment (controls the sample size), 0,1 (or 10 percent) as default
#' @param max_n_bins maximum number of bins or segments to split the input variable, 5 bins as default
#' @examples
#' \dontrun{
#' library(funModeling)
#' data=heart_disease
#' input=data$oldpeak
#' target=as.character(data$has_heart_disease)
#'
#' input2=discretize_rgr(input, target)
#'
#' # checking:
#' summary(input2)
#' }
#' @return discretized variable (factor)
#' @export
discretize_rgr <- function(input, target, min_perc_bins=0.1, max_n_bins=5)
{
	fpoints=c()
	max_depth=20
	target=as.character(target)
	min_n=round(min_perc_bins*length(input))

	all_cuts=recursive_gr_cuts_aux(input, target, fpoints, max_depth, min_n)

	#
	max_n_bins=max_n_bins-1
	fpoints_top=all_cuts[1:min(max_n_bins,length(all_cuts))]
	fpoints_top_ord=fpoints_top[order(fpoints_top)]

	input_bin=Hmisc::cut2(input, cuts = c(fpoints_top_ord, max(input)))

	return(input_bin)
}