Nothing
#' Encode a given factor variable automatically
#'
#' @description Transforms the original design matrix automatically, using the appropriate encoding.
#'
#' @param X The data.frame/data.table to transform.
#' @param Y Optional: The dependent variable to ignore in the transformation.
#' @param fact Optional: The factor variable(s) to encode by -
#' either positive integer(s) specifying the
#' column number, or the name(s) of the column.
#' If left empty a heuristic is used to determine the factor
#' variable(s), and a warning is written with the names of
#' the variables converted.
#' @param method Optional: A character string indicating which encoding method to use,
#' either of the following:
#' * "mean"
#' * "median"
#' * "deviation"
#' * "lowrank"
#' * "SPCA"
#' * "mnl"
#' * "dummy"
#' * "difference"
#' * "helmert"
#' * "simple_effect"
#' * "repeated_effect"
#' If only a single method is specified, it is taken to encode either all of the variables
#' supplied through *fact*, or variables which have been flagged as factors automatically.
#' If multiple methods are specified, the number of methods must match the number of
#' factor variables in *fact* - and these are applied to correspond in the order in
#' which they were supplied. In case a missmatch occurs, an error is raised.
#' If left empty, the appriopriate method is selected on a case by case basis
#' (and the selected methods are written out to console).
#' @param keep Whether to keep the original factor column(s), defaults to **FALSE**.
#' @return A new data.table X which contains the new columns and optionally the old factor(s).
#' @details Automatically selects the appropriate method given the number of anticipated
#' newly created variables, based on the results in Johannemann et al.(2019)
#' 'Sufficient Representations for Categorical Variables', and
#' a simple heuristic - where
#'
#' @importFrom data.table data.table
#' @importFrom data.table .SD
#' @importFrom data.table ':='
#' @export
#'
#' @examples
#'
#' design_mat <- cbind( data.frame( matrix(rnorm(5*100),ncol = 5) ),
#' sample( sample(letters, 10), 100, replace = TRUE)
#' )
#' colnames(design_mat)[6] <- "factor_var"
#'
#' encode_categories( design_mat, method = "mean" )
#'
encode_categories <- function( X,
Y = NULL,
fact = NULL,
method = NULL,
keep = FALSE )
{
X <- data.table::data.table(X)
if(!is.null(Y)){
if(is.numeric(Y)){
Y <- colnames(X)[Y]
}
X_curr <- X[, .SD, .SDcols = !Y]
}
else{
X_curr <- X
}
is_likely_factor <- function(datafm)
{
res <- lapply(datafm, FUN = function(i){
total_levels <- levels(as.factor(as.character(i)))
total_length <- length(unlist(i))
ratio_of_unique_values <- (total_length - length(total_levels))/(total_length)
if(ratio_of_unique_values > 0.2){
return(TRUE)
}
else{
return(FALSE)
}
})
return(unlist(res))
}
if(is.null(fact)){
all_factor <- names(which(is_likely_factor(X) == TRUE))
if(length(all_factor) == ncol(X)){
stop("\n Factor inference impossible: inferred total number of factors: ",
length(all_factor), ", equal to total number of columns: ", ncol(X))
}
warning("\n Inferring factors: \n ", paste(all_factor, "\n"))
}
else{
if(is.numeric(fact)){
fact <- colnames(X)[fact]
}
all_factor <- fact
}
method_table <- list( encode_mean, encode_median, encode_deviation,
encode_lowrank, encode_SPCA, encode_mnl,
encode_dummy, encode_difference, encode_helmert,
encode_simple_effect, encode_repeated_effect )
method_names <- c( "mean", "median", "deviation",
"lowrank", "SPCA", "mnl",
"dummy", "difference", "helmert",
"simple_effect", "repeated_effect" )
if(!is.null(method))
{
methods_used <- lapply( method, FUN = match, table = method_names)
if(sum(is.na(unlist(methods_used))) == TRUE ){
stop("Failed to match the supplied method(s).")
}
}
else
{
methods_used <- lapply(1:length(all_factor), FUN = function(i){
total_levels <- length(levels(unlist(X[,(all_factor[i])])))
# more subgroups than there are other columns
# if( total_levels > ncol(X[,!..all_factor]) ){
if( total_levels > (ncol(X) - ncol(X[, .SD, .SDcols = all_factor])) ){
# default to mean encoding
return(1)
}
else{
return(5) }})
}
if(length(methods_used) < length(all_factor)){
stop(paste("The number of supplied methods(", length(methods_used),
") is not equal to the number of factors(", length(all_factor),").",
"Please specify the correct number of methods, and/or factor variables.",
sep = ""))
}
if(length(methods_used) > length(all_factor)){
warning(paste("More methods(", length(methods_used),") than factors
(", length(all_factor),") detected, using only the first few,
until there is enough methods for the number of factor variables,
and dropping the rest.", sep = ""))
methods_used <- methods_used[1:length(all_factor)]
}
final <- lapply(1:length(all_factor), FUN = function(i){
current_factor <- all_factor[i]
X_curr <- cbind( X_curr[,.SD, .SDcols = !all_factor],
X_curr[,.SD, .SDcols = current_factor])
res <- do.call( what = method_table[[methods_used[[i]]]],
args = list( X_curr,
fact = current_factor,
keep_factor = TRUE,
encoding_only = TRUE))
})
res <- X[final[[1]], on = all_factor[1]]
if(length(all_factor) > 1){
for(i in 2:length(all_factor)){
res <- res[final[[i]], on = all_factor[i]]
}
}
if(keep == FALSE){
res[,(all_factor) := NULL]
}
return(res)
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.