#'@import tibble
#'@import tidyr
#' @title converts matrices to tibble, preserving row.names
#' @description row.names are added as row_names column as the first column of
#' the tibble. Function does not fail when object cannot be converted to
#' tibble thus can be used to map over lists with various variable types such
#' as modells and objects.
#' @param x any variable
#' @return a tibble or if the input variable is neither matrix dataframe or
#' tibble the original input object.
#' @examples
#'
#' mat = as.matrix(mtcars)
#' head( mat, 10)
#' f_manip_matrix_2_tibble( mat )
#'
#' # convert all matrices from a list
#' pca = prcomp( mtcars ) %>%
#' map( f_manip_matrix_2_tibble )
#' pca
#'
#' @rdname f_manip_matrix_2_tibble
#' @export
f_manip_matrix_2_tibble = function(x){
if( ! ( is.matrix(x) | is.data.frame(x) ) ){
warning('argument is not a matrix, returning input object')
return(x)
}
tib = as_tibble(x)
if( is.character( row.names(x) ) & ! is.tibble(x) ){
row_names = row.names(x)
tib = tib %>%
mutate( row_names = row_names) %>%
select( row_names, everything() )
}
return(tib)
}
#'@title transpose a tibble
#'@description transpose a tibble, values in first column will become column
#' titles. Row names will be converted to first columns
#'@param tib tibble
#'@return tibble
#' @examples
#'
#' tib = mtcars %>%
#' as_tibble() %>%
#' f_manip_transpose_tibble()
#'tib
#'@rdname f_manip_transpose_tibble
#'@export
f_manip_transpose_tibble = function(tib){
if( ! ( is.matrix(tib) | is.data.frame(tib) | is.tibble(tib) ) ){
stop('input not transposable')
}
tib = tib %>%
f_manip_matrix_2_tibble() %>%
gather( key = 'key', value = 'value', 2:ncol(.) ) %>%
spread( key = 1, value = value ) %>%
rename( row_names = key ) %>%
arrange( row_names )
}
#' @title takes a data_ls list created by f_clean_data() and returns a list with
#' all medians for numerical and most common level for categorical variables.
#' @param data_ls data_ls object generated by f_clean_data(), or a named list
#' list( data = <dataframe>, numericals = < vector with column names of
#' numerical columns>)
#' @return list
#' \describe{
#' \item{data}{summarized data as dataframe}
#' \item{data_boxcox}{summarized boxcox data as dataframe}
#' }
#' @examples
#' summarized_ls = f_clean_data(mtcars) %>%
#' f_boxcox() %>%
#' f_manip_summarize_2_median_and_most_common_factor()
#'
#' summarized_ls$data
#' summarized_ls$boxcox_data
#' @rdname f_manip_summarize_2_median_and_most_common_factor
#' @export
f_manip_summarize_2_median_and_most_common_factor = function(data_ls){
df_numericals = data_ls$data %>%
select( one_of(data_ls$numericals) ) %>%
select_if( is.numeric ) %>%
summarize_all( median )
df_categoricals = data_ls$data %>%
select( one_of(data_ls$categoricals) ) %>%
select_if( is.factor ) %>%
summarize_all( f_manip_get_most_common_level )
data = df_numericals %>%
bind_cols(df_categoricals) %>%
select( one_of( names(data_ls$data) ) )
if( 'boxcox_data' %in% names(data_ls) ){
df_boxcox = data_ls$boxcox_data %>%
select( one_of(data_ls$boxcox_names) ) %>%
select_if( is.numeric ) %>%
summarize_all( median )
return( list( data = data
, boxcox_data = df_boxcox) )
}else{
return( list( data = data ) )
}
}
#' @title get most common level from vector
#' @param x factor vector
#' @return character vector
#' @examples
#' data_ls = f_clean_data(mtcars)
#' f_manip_get_most_common_level( data_ls$data$cyl)
#' @seealso
#' \code{\link[broom]{tidy}}
#' @rdname f_manip_get_most_common_level
#' @export
f_manip_get_most_common_level = function(x){
if( ! is.factor(x) ){
stop( 'f_manip_get_most_common_level called on none factor vector' )
}
level = summary(x) %>%
sort( decreasing = T ) %>%
names() %>%
head(1)
x = x[x == level] %>%
head(1)
return(x)
}
#' @title get variables from formula
#' @param formula formula
#' @return character vector
#' @examples
#' f = foo~bar1 + bar2
#'
#' vars = f_manip_get_variables_from_formula(f)
#' response_var = f_manip_get_response_variable_from_formula(f)
#'
#' @seealso
#' \code{\link{f_manip_get_response_variable_from_formula}}
#' @rdname f_manip_get_variables_from_formula
#' @export
#' @importFrom stringr str_split
f_manip_get_variables_from_formula = function( formula ) {
if( ! inherits(formula, 'formula') ){
stop('f_manip_get_variables_from_formula called with non formula object')
}
vars = formula %>%
as.character() %>%
.[[3]] %>%
stringr::str_split( ' \\+ ') %>%
unlist() %>%
stringr::str_trim() %>% ## for long formulas as.character
unlist() ## will add some whitespace to some variables
if( vars[1] == '.'){
stop('cannot extract variables from formula if formula was constructed with "~." ')
}
return(vars)
}
#' @title get response variable from formula
#' @param formula formula
#' @return character vector
#' @seealso
#' \code{\link{f_manip_get_variables_from_formula}}
#' @rdname f_manip_get_response_variable_from_formula
#' @export
#' @importFrom stringr str_split
f_manip_get_response_variable_from_formula = function( formula ) {
if( ! inherits(formula, 'formula') ){
stop('f_manip_get_response_variable_from_formula called with non formula object')
}
var = formula %>%
as.character() %>%
.[[2]]
return(var)
}
#' @title converts factor to numeric preserving numeric levels and order in character levels
#' @param vec vector
#' @return vector
#' @examples
#' fac_num = factor( c(1,3,8) )
#' fac_chr = factor( c('foo','bar') )
#' fac_chr_ordered = factor( c('a','b','c'), ordered = T )
#'
#' f_manip_factor_2_numeric( fac_num )
#' f_manip_factor_2_numeric( fac_chr )
#' f_manip_factor_2_numeric( fac_chr_ordered )
#' @seealso
#' \code{\link[stringr]{str_detect}}
#' @rdname f_manip_factor_2_numeric
#' @export
#' @importFrom stringr str_detect
f_manip_factor_2_numeric = function(vec){
bool = as.character(vec) %>%
stringr::str_detect('^\\d+$' ) %>%
all()
if( bool ){
vec = vec %>%
as.character %>%
as.numeric()
} else{
vec = as.numeric(vec)
}
return(vec)
}
#' @title bring vector to positice range
#' @description if min < 0, add abs(min) to all values
#' @param vec numeric vector
#' @return vector
#' @examples
#' vec = c( -2,0,2,4,6)
#' vec = f_manip_bring_to_pos_range( vec )
#' vec
#' @rdname f_manip_bring_to_pos_range
#' @export
f_manip_bring_to_pos_range = function(vec){
if( min(vec)< 0) vec = vec + abs(min(vec))
return(vec)
}
#' @title append object to list
#' @description convenience function to replace l[[length(l)+1]] = x
#' @param l list
#' @param x object
#' @return list
#' @examples
#' l = list('a', 'b')
#' l = f_manip_append_2_list(l, 'c')
#' str(l)
#' @rdname f_manip_append_2_list
#' @export
f_manip_append_2_list = function(l, x){
len = length(l)
l[[len+1]] = x
return(l)
}
#' @title brings data to model.matrix format
#' @description model.matrix() creates dummy variables for factors. The names of
#' these dummy variables however are not compatible with the formula syntax.
#' This wrapper cleans up the names of the new variables.
#' @param data a dataframe
#' @param formula formula
#' @param scale_data boolean
#' @param center_data boolean
#' @param exclude_na_columns boolean
#' @return list with new dataframe and new formula
#' @examples
#'
#' data_ls = f_clean_data(mtcars)
#' data = data_ls$data
#' formula = hp ~ disp + am + gear
#' data_trans = f_manip_data_2_model_matrix_format( data, formula )
#' response_var =f_manip_get_response_variable_from_formula(data_trans$formula)
#' vars = f_manip_get_variables_from_formula(data_trans$formula)
#' x = as.matrix( select( data_trans$data, one_of(vars) ) )
#' y = data_trans$data[[response_var]]
#' glmnet::glmnet( x , y )
#'
#' @seealso \code{\link[stringr]{str_replace_all}}
#' @rdname f_manip_data_2_model_matrix_format
#' @export
#' @importFrom stringr str_replace_all
f_manip_data_2_model_matrix_format = function(data
, formula
, scale_data = T
, center_data = T
, exclude_na_columns = T){
data = as_tibble(data)
vars = oetteR::f_manip_get_variables_from_formula(formula)
response_var = oetteR::f_manip_get_response_variable_from_formula(formula)
data_trans = select(data, one_of( c(response_var, vars ) ) )
if( scale_data == T){
data_pred = data_trans %>%
select( one_of(vars) ) %>%
mutate_if( is.numeric, scale, center = center_data)
data_trans = data_trans %>%
select( one_of(response_var) ) %>%
bind_cols( data_pred )
}
if( exclude_na_columns == T){
data_trans = data_trans %>%
select_if( function(x) ! any(is.na(x)) )
vars = names( select(data_trans, - one_of(response_var) ) )
formula = paste( response_var, '~', paste(vars, collapse = ' + ') ) %>%
as.formula
}
data_keep = select(data, - one_of( c(response_var, vars ) ) )
matrix_trans = model.matrix(formula, data_trans) %>%
as_tibble %>%
select( - one_of( '(Intercept)' ) )
#correct names
new_names = names(matrix_trans) %>%
gsub("([^A-Za-z0-9])+", "", x = .)
# stringi::stri_enc_toascii() %>%
# stringr::str_replace_all( '\\.', '_' ) %>%
# stringr::str_replace_all( ' ', '_' )
# stringr::str_replace_all( ',', '_' ) %>%
# stringr::str_replace_all( '\\-', '_' ) %>%
# stringr::str_replace_all( '\\+', '_' ) %>%
# stringr::str_replace_all( '\\^', '_' ) %>%
# stringr::str_replace_all( '\\/', '_' ) %>%
# stringr::str_replace_all( '\\', '_' )
names(matrix_trans) = new_names
new_formula = paste( response_var, '~', paste(new_names, collapse = ' + ') ) %>%
as.formula
data_new = select(data, one_of(response_var) ) %>%
bind_cols( matrix_trans ) %>%
bind_cols( data_keep )
return( list( data = data_new, formula = new_formula ) )
}
#' @title bin numerical columns
#' @description centers, scales and Yeo Johnson transforms numeric variables in
#' a dataframe before binning into n bins of eqal range. Outliers based on
#' boxplot stats are capped (set to min or max of boxplot stats).
#' @param df dataframe with numeric variables
#' @param bins number of bins for numerical variables, Default: 5
#' @param bin_labels labels for the bins from low to high, Default: c("LL",
#' "ML", "M", "MH", "HH")#' @param center boolean, Default: T
#' @param scale boolean, Default: T
#' @param center boolean, Default: T
#' @param transform boolean, Default: T
#' @return dataframe
#' @examples
#' \dontrun{
#' if(interactive()){
#' #EXAMPLE1
#' }
#' }
#' @rdname f_manip_bin_numerics
#' @export
f_manip_bin_numerics = function(df
, bins = 5
, bin_labels = c('LL', 'ML', 'M', 'MH', 'HH')
, center = T
, scale = T
, transform = T){
require( recipes )
if( length(bin_labels) != bins ){
stop( 'bin_labes must be equal to bins')
}
numerics = df %>%
select_if( is.numeric ) %>%
select_if( function(x) var(x) > 0 ) %>% ##boxplotstats produces NA if var == 0
names()
if( is_empty(numerics) ){
return( df )
}
rec = recipe(df)
if( center ) rec = rec %>%
step_center( one_of(numerics) )
if( scale ) rec = rec %>%
step_scale( one_of(numerics) )
if( transform ) rec = rec %>%
step_YeoJohnson( one_of(numerics) )
rec = rec %>%
prep()
rename_levels = function(x){
levels(x) = bin_labels
return(x)
}
data_new <- bake(rec, df ) %>%
mutate_at( vars(numerics), function(x) ifelse( x > max(boxplot.stats(x)$stats)
, max(boxplot.stats(x)$stats)
, x)
) %>%
mutate_at( vars(numerics), function(x) ifelse( x < min(boxplot.stats(x)$stats)
, min(boxplot.stats(x)$stats)
, x)
) %>%
mutate_at( vars(numerics), function(x) cut(x, breaks = bins) ) %>%
mutate_at( vars(numerics), rename_levels)
return(data_new)
}
#' @title converts columns of type double to integer if maximum number of
#' decimal digits is zero
#' @param df dataframe
#' @return tibble
#' @examples
#' as_tibble(mtcars)
#' f_manip_double_2_int(mtcars)
#' @rdname f_manip_double_2_int
#' @export
f_manip_double_2_int = function( df ){
get_no_digits = function(x){
if( ! is.numeric(x) ){
return(NULL)
}
x = x %% 1
x = as.character(x)
no_digits = nchar(x) - 2
no_digits = ifelse( no_digits == -1, 0, no_digits )
return(no_digits)
}
suppressWarnings({
new_df = df %>%
as_tibble() %>%
mutate_if( function(x) max( get_no_digits(x), na.rm = T ) == 0, as.integer )
})
return(new_df)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.