#' @title converts factor to numeric preserving numeric levels and order in
#' character levels.
#' @description before converting we check whether every value consists of
#' digits only, if so the numbers are preserved. Otherwise the integer codes of
#' the factor levels are returned. Missing values are ignored by the check and
#' stay missing in the result.
#' @param vec vector
#' @return vector
#' @examples
#' fac_num = factor( c(1,3,8) )
#' fac_chr = factor( c('foo','bar') )
#' fac_chr_ordered = factor( c('a','b','c'), ordered = TRUE )
#' manip_factor_2_numeric( fac_num )
#' manip_factor_2_numeric( fac_chr )
#' manip_factor_2_numeric( fac_chr_ordered )
#' # values that are not made of digits only (decimals, scientific notation,
#' # text) are replaced by the index of their level
#' manip_factor_2_numeric(factor(c("A12", "B55", "10e4")))
#' manip_factor_2_numeric(factor(c("1.56", "4.56", "8.4")))
#' @seealso \code{\link[base]{grepl}}
#' @rdname manip_factor_2_numeric
#' @export
#' @importFrom stringr str_detect str_replace
#' @import dplyr
#' @importFrom grDevices boxplot.stats col2rgb rgb
#' @importFrom stats var
#' @importFrom utils head
#' @importFrom stats median
manip_factor_2_numeric = function(vec){

  chr = as.character(vec)

  # NA entries must not decide which branch is taken, otherwise the
  # condition of the if statement below would itself become NA
  only_digits = all( grepl('^[0-9]+$', chr[ ! is.na(chr) ]) )

  if( only_digits ){
    # the labels are numbers, convert the labels and not the level codes
    vec = as.numeric(chr)
  } else{
    # for a factor this returns the integer codes of the levels
    vec = as.numeric(vec)
  }

  return( vec )
}
#' @title bin numerical columns
#' @description centers, scales and Yeo Johnson transforms numeric variables in
#' a dataframe before binning into n bins of equal range. Outliers based on
#' boxplot stats are capped (set to min or max of boxplot stats).
#' @param x dataframe with numeric variables, or numeric vector
#' @param bins number of bins for numerical variables, passed to cut as breaks
#' parameter, Default: 5
#' @param bin_labels labels for the bins from low to high, Default: c("LL",
#' "ML", "M", "MH", "HH"). Can also be one of c('mean', 'median', 'min_max',
#' 'cuts'), the corresponding summary function will supply the labels.
#' @param scale logical, Default: T
#' @param center logical, Default: T
#' @param transform logical, apply Yeo Johnson Transformation, Default: T
#' @param round_numeric logical, rounds numeric results if bin_labels is
#' supplied with a supported summary function name, Default: T
#' @param digits integer, number of digits to round to, Default: 2
#' @param NA_label character vector, define label for missing data, Default:
#' 'NA'
#' @examples
#' summary( mtcars2 )
#' summary( manip_bin_numerics(mtcars2) )
#' summary( manip_bin_numerics(mtcars2, bin_labels = 'mean'))
#' summary( manip_bin_numerics(mtcars2, bin_labels = 'cuts'
#' , scale = FALSE, center = FALSE, transform = FALSE))
#' @return dataframe
#' @rdname manip_bin_numerics
#' @import recipes
#' @importFrom purrr is_bare_numeric walk
#' @importFrom tibble is_tibble
#' @export
manip_bin_numerics = function(x
                              , bins = 5
                              , bin_labels = c('LL', 'ML', 'M', 'MH', 'HH')
                              , center = TRUE
                              , scale = TRUE
                              , transform = TRUE
                              , round_numeric = TRUE
                              , digits = 2
                              , NA_label = 'NA'){

  # check if input is vector or dataframe, conv vec to df
  if( is_bare_numeric(x) ){
    df = tibble( x = x )
    input_vector = TRUE
  } else if( is.data.frame(x) || is_tibble(x) ){
    df = x
    input_vector = FALSE
  } else{
    # NOTE(review): reconstructed branch, anything that is neither a bare
    # numeric vector nor a dataframe is handed back untouched - confirm
    return( x )
  }

  numerics = df %>%
    select_if( is.numeric ) %>%
    select_if( function(x) var(x, na.rm = TRUE) > 0 ) %>% ##boxplotstats produces NA if var == 0
    names()

  characters = df %>%
    select_if( is.character ) %>%
    names()

  columns = names(df)

  # nothing to bin
  if( is_empty(numerics) ){
    return( df )
  }

  summary_modes = c('median', 'cuts', 'mean', 'min_max')

  # isTRUE() also turns an empty or NULL bin_labels into the error below
  # instead of a zero-length condition
  if( length(bin_labels) != bins[1] && ! isTRUE( bin_labels[1] %in% summary_modes ) ){
    stop( "bin_labels length must be equal to bins or one of c('median', 'cuts', 'mean', 'min_max')")
  }

  # we need to assign an ID to restore the correct order at the end
  df = mutate(df, easyalluvialid = row_number() )

  # recipes 0.1.6 uses old unnest()/nest() API which throws warnings
  rec = recipe(df) %>%
    update_role( easyalluvialid, new_role = 'id variable')

  if( center ) rec = rec %>%
    step_center( one_of(numerics) )

  if( scale ) rec = rec %>%
    step_scale( one_of(numerics) )

  if( transform ) rec = rec %>%
    step_YeoJohnson( one_of(numerics) )

  # the recipe has to be trained before it can be applied with bake()
  rec = rec %>%
    prep()

  # replaces the interval labels created by cut() with bin_labels
  rename_levels = function(x){
    levels(x) = bin_labels
    return(x)
  }

  data_new <- bake(rec, df ) %>%
    # cap outliers at the upper and lower end of the boxplot stats
    mutate_at( vars(numerics), function(x) ifelse( x > max(boxplot.stats(x)$stats)
                                                   , max(boxplot.stats(x)$stats)
                                                   , x)
    ) %>%
    mutate_at( vars(numerics), function(x) ifelse( x < min(boxplot.stats(x)$stats)
                                                   , min(boxplot.stats(x)$stats)
                                                   , x)
    ) %>%
    mutate_at( vars(numerics), function(x) cut(x, breaks = bins) ) %>%
    #bake() is converting character variables to factor which we need to revert
    mutate_at( vars(characters), as.character )

  # warn about bins without observations
  purrr::walk(numerics, check_empty_lvl, data_new)

  summary_as_label = function(df, df_old, fun){

    # joins df with original dataframe. Groups by segments and calculates
    # summary stat using the given function. Summary stat replaces segment
    # labels

    df = df %>%
      mutate_if(is.factor, fct_explicit_na, na_level = NA_label ) %>%
      left_join( select(df_old, one_of( c(numerics, 'easyalluvialid') ) ), by = 'easyalluvialid')

    # after the join <num>.x holds the bin and <num>.y the original value
    for(num in numerics){
      bin_col = as.name( paste0(num, '.x') )
      val_col = as.name( paste0(num, '.y') )

      df = df %>%
        group_by( !! bin_col ) %>%
        mutate( !! val_col := fun( !! val_col ) )
    }

    df = df %>%
      ungroup() %>%
      select( - ends_with('.x') )

    # NOTE(review): guard reconstructed from the documentation of the
    # round_numeric parameter
    if( round_numeric ){
      df = df %>%
        mutate_if( is.numeric, round, digits = digits )
    }

    df = df %>%
      mutate_if(is.numeric, as.factor ) %>%
      mutate_if(is.factor, fct_explicit_na, na_level = NA_label ) %>%
      rename_at( vars( ends_with('.y') ) , .funs = function(x) str_replace(x, '\\.y$', '') )

    return( df )
  }

  # only the first element selects the summary mode, this keeps the
  # conditions below scalar
  label_mode = bin_labels[1]

  if( length(bin_labels) == bins[1] ){

    data_new = data_new %>%
      mutate_at( vars(numerics), rename_levels)

  }else if( label_mode == 'median'){

    data_new = data_new %>%
      summary_as_label(df_old = df, fun = median)

  }else if( label_mode == 'mean'){

    data_new = summary_as_label(data_new, df_old = df, fun = mean)

  }else if( label_mode == 'min_max'){

    df_min = summary_as_label(data_new, df_old = df, fun = min)
    df_max = summary_as_label(data_new, df_old = df, fun = max)

    join_by = names(df_min)[! names(df_min) %in% numerics ]
    join_by = c(join_by, 'easyalluvialid')
    join_by = unique(join_by)

    # <num>.x now holds the minimum and <num>.y the maximum of each bin
    data_new = df_min %>%
      left_join(df_max, by = join_by )

    for(num in numerics){

      sym_num = as.name( num )
      sym_min = as.name( paste0(num, '.x') )
      sym_max = as.name( paste0(num, '.y') )

      data_new = data_new %>%
        arrange( !! sym_min ) %>%
        mutate( !! sym_num := map2_chr( !! sym_max, !! sym_min, function(x,y) paste(x,'-\n',y) ) ) %>%
        mutate( !! sym_num := as_factor( !! sym_num ) )

      # bins of missing values are labelled with NA_label only
      NA_min_max = paste(NA_label,'-\n',NA_label)

      if( NA_min_max %in% levels(data_new[[num]]) ){
        data_new = data_new %>%
          mutate( !! sym_num := fct_recode( !! sym_num
                                            , !! NA_label := NA_min_max ))
      }
    }

    data_new = data_new %>%
      select( -ends_with('.x'), -ends_with('.y') ) %>%
      arrange( easyalluvialid )
  }
  # for label_mode 'cuts' the interval labels created by cut() are kept

  #remove easyalluvialid
  data_new = select(data_new, columns) %>%
    mutate_if( is.factor, fct_explicit_na, na_level = NA_label )

  if( input_vector ){
    return( data_new$x )
  } else{
    return( data_new )
  }
}
#' @title get most frequent level of factor
#' @param x factor
#' @return factor
#' @examples
#' x <- LETTERS[c(7,6,7,7,3,4,20)]
#' x <- as.factor(x)
#' get_most_frequent_lvl(x)
#' @rdname get_most_frequent_lvl
#' @noRd
get_most_frequent_lvl <- function(x) {
  # drop element names so only the values are compared and returned
  x <- unname(x)

  # counts per level, most frequent level first
  counts <- sort(table(x), decreasing = TRUE)
  lvl <- names(counts)[1]

  # first element of x that carries this level, subsetting keeps the class
  # and the levels of x
  return(x[which(x == lvl)][1])
}
#' @title check for empty lvl
#' @param col character, column name
#' @param df dataframe
#' @return called for its side effect, emits a warning if levels of \code{col}
#' are empty
#' @examples
#' df <- data.frame(x = cut(c(1,1,2,2,9,9,10), 5))
#' check_empty_lvl("x", df)
#' @rdname check_empty_lvl
#' @noRd
check_empty_lvl <- function(col, df){
  # table() also reports factor levels that do not occur in the data, they
  # get a count of zero
  lvl_counts <- table(df[[col]])
  empty_lvls <- names(lvl_counts)[as.vector(lvl_counts) == 0]

  if(length(empty_lvls) > 0){
    warning(paste("bins ", paste(empty_lvls, collapse = ","), "of", col, "are empty" ))
  }
}
