R/f_manip.R

#'@import tibble
#'@import tidyr

#' @title converts matrices to tibble, preserving row.names
#' @description row.names are added as row_names column as the first column of
#'   the tibble. Function does not fail when object cannot be converted to
#'   tibble thus can be used to map over lists with various variable types such
#'   as modells and objects.
#' @param x any variable
#' @return a tibble or if the input variable is neither matrix dataframe or
#'   tibble the original input object.
#' @examples
#'
#' mat = as.matrix(mtcars)
#' head( mat, 10)
#' f_manip_matrix_2_tibble( mat )
#'
#' # convert all matrices from a list
#' pca = prcomp( mtcars ) %>%
#'  map( f_manip_matrix_2_tibble )
#' pca
#'
#' @rdname f_manip_matrix_2_tibble
#' @export
f_manip_matrix_2_tibble = function(x){

  if( ! ( is.matrix(x) | is.data.frame(x) ) ){
    warning('argument is not a matrix, returning input object')
    return(x)
  }

  tib = as_tibble(x)

  if( is.character( row.names(x) ) & ! is.tibble(x) ){

    row_names =  row.names(x)

    tib = tib %>%
      mutate( row_names = row_names) %>%
      select( row_names, everything() )
  }

  return(tib)
}

#'@title transpose a tibble
#'@description transpose a tibble, values in first column will become column
#'  titles. Row names will be converted to first columns
#'@param tib tibble
#'@return tibble
#' @examples
#'
#' tib = mtcars %>%
#'   as_tibble() %>%
#'   f_manip_transpose_tibble()
#'tib
#'@rdname f_manip_transpose_tibble
#'@export
f_manip_transpose_tibble = function(tib){

  if(  ! ( is.matrix(tib) | is.data.frame(tib) | is.tibble(tib) ) ){
    stop('input not transposable')
  }

  tib = tib %>%
    f_manip_matrix_2_tibble() %>%
    gather( key = 'key', value = 'value', 2:ncol(.) ) %>%
    spread( key = 1, value = value ) %>%
    rename( row_names = key ) %>%
    arrange( row_names )
}

#' @title takes a data_ls list created by f_clean_data() and returns a list with
#'   all medians for numerical and most common level for categorical variables.
#' @param data_ls data_ls object generated by f_clean_data(), or a named list
#'   list( data = <dataframe>, numericals = < vector with column names of
#'   numerical columns>)
#' @return list
#' \describe{
#'   \item{data}{summarized data as dataframe}
#'   \item{data_boxcox}{summarized boxcox data as dataframe}
#' }
#' @examples
#' summarized_ls = f_clean_data(mtcars) %>%
#'   f_boxcox() %>%
#'   f_manip_summarize_2_median_and_most_common_factor()
#'
#' summarized_ls$data
#' summarized_ls$boxcox_data
#' @rdname f_manip_summarize_2_median_and_most_common_factor
#' @export
f_manip_summarize_2_median_and_most_common_factor = function(data_ls){

  df_numericals = data_ls$data %>%
    select( one_of(data_ls$numericals) ) %>%
    select_if( is.numeric ) %>%
    summarize_all( median )

  df_categoricals = data_ls$data %>%
    select( one_of(data_ls$categoricals) ) %>%
    select_if( is.factor ) %>%
    summarize_all( f_manip_get_most_common_level )

  data = df_numericals %>%
    bind_cols(df_categoricals) %>%
    select( one_of( names(data_ls$data) ) )

  if( 'boxcox_data' %in% names(data_ls) ){

  df_boxcox = data_ls$boxcox_data %>%
    select( one_of(data_ls$boxcox_names) ) %>%
    select_if( is.numeric ) %>%
    summarize_all( median )

  return( list( data = data
                , boxcox_data = df_boxcox) )

  }else{
    return( list( data = data ) )
  }


}

#' @title get most common level from vector
#' @param x factor vector
#' @return character vector
#' @examples
#' data_ls = f_clean_data(mtcars)
#' f_manip_get_most_common_level( data_ls$data$cyl)
#' @seealso
#'  \code{\link[broom]{tidy}}
#' @rdname f_manip_get_most_common_level
#' @export
f_manip_get_most_common_level = function(x){

  if( ! is.factor(x) ){
    stop( 'f_manip_get_most_common_level called on none factor vector' )
  }

  level = summary(x)  %>%
    sort( decreasing = T ) %>%
    names() %>%
    head(1)

  x = x[x == level] %>%
    head(1)

  return(x)

}

#' @title get variables from formula
#' @param formula formula
#' @return character vector
#' @examples
#' f = foo~bar1 + bar2
#'
#' vars = f_manip_get_variables_from_formula(f)
#' response_var = f_manip_get_response_variable_from_formula(f)
#'
#' @seealso
#'  \code{\link{f_manip_get_response_variable_from_formula}}
#' @rdname f_manip_get_variables_from_formula
#' @export
#' @importFrom stringr str_split
f_manip_get_variables_from_formula = function( formula ) {

  if( ! inherits(formula, 'formula') ){
    stop('f_manip_get_variables_from_formula called with non formula object')
  }

  vars = formula %>%
    as.character() %>%
    .[[3]] %>%
    stringr::str_split( ' \\+ ') %>%
    unlist() %>%
    stringr::str_trim() %>% ## for long formulas as.character
    unlist()                ## will add some whitespace to some variables

  if( vars[1] == '.'){
    stop('cannot extract variables from formula if formula was constructed with "~." ')
  }

  return(vars)

}

#' @title get response variable from formula
#' @param formula formula
#' @return character vector
#' @seealso
#'  \code{\link{f_manip_get_variables_from_formula}}
#' @rdname f_manip_get_response_variable_from_formula
#' @export
#' @importFrom stringr str_split
f_manip_get_response_variable_from_formula = function( formula ) {

  if( ! inherits(formula, 'formula') ){
    stop('f_manip_get_response_variable_from_formula called with non formula object')
  }

  var = formula %>%
    as.character() %>%
    .[[2]]

  return(var)
}

#' @title converts factor to numeric preserving numeric levels and order in character levels
#' @param vec vector
#' @return vector
#' @examples
#' fac_num = factor( c(1,3,8) )
#' fac_chr = factor( c('foo','bar') )
#' fac_chr_ordered = factor( c('a','b','c'), ordered = T )
#'
#' f_manip_factor_2_numeric( fac_num )
#' f_manip_factor_2_numeric( fac_chr )
#' f_manip_factor_2_numeric( fac_chr_ordered )
#' @seealso
#'  \code{\link[stringr]{str_detect}}
#' @rdname f_manip_factor_2_numeric
#' @export
#' @importFrom stringr str_detect
f_manip_factor_2_numeric = function(vec){

  bool = as.character(vec) %>%
    stringr::str_detect('^\\d+$' ) %>%
    all()

  if( bool ){

    vec = vec %>%
      as.character %>%
      as.numeric()

  } else{
    vec = as.numeric(vec)
  }

  return(vec)
}

#' @title bring vector to positice range
#' @description if min < 0, add abs(min) to all values
#' @param vec numeric vector
#' @return vector
#' @examples
#' vec = c( -2,0,2,4,6)
#' vec = f_manip_bring_to_pos_range( vec )
#' vec
#' @rdname f_manip_bring_to_pos_range
#' @export
f_manip_bring_to_pos_range = function(vec){

  if( min(vec)< 0) vec = vec + abs(min(vec))
  return(vec)
}

#' @title append object to list
#' @description convenience function to replace  l[[length(l)+1]] = x
#' @param l list
#' @param x object
#' @return list
#' @examples
#' l = list('a', 'b')
#' l = f_manip_append_2_list(l, 'c')
#' str(l)
#' @rdname f_manip_append_2_list
#' @export
f_manip_append_2_list = function(l, x){

  len = length(l)
  l[[len+1]] = x

  return(l)

}

#' @title brings data to model.matrix format
#' @description model.matrix() creates dummy variables for factors. The names of
#'   these dummy variables however are not compatible with the formula syntax.
#'   This wrapper cleans up the names of the new variables.
#' @param data a dataframe
#' @param formula formula
#' @param scale_data boolean
#' @param center_data boolean
#' @param exclude_na_columns boolean
#' @return list with new dataframe and new formula
#' @examples
#'
#' data_ls = f_clean_data(mtcars)
#' data = data_ls$data
#' formula = hp ~ disp + am + gear
#' data_trans = f_manip_data_2_model_matrix_format( data, formula )
#' response_var =f_manip_get_response_variable_from_formula(data_trans$formula)
#' vars = f_manip_get_variables_from_formula(data_trans$formula)
#' x = as.matrix( select( data_trans$data, one_of(vars) ) )
#' y = data_trans$data[[response_var]]
#' glmnet::glmnet( x , y )
#'
#' @seealso \code{\link[stringr]{str_replace_all}}
#' @rdname f_manip_data_2_model_matrix_format
#' @export
#' @importFrom stringr str_replace_all
f_manip_data_2_model_matrix_format = function(data
                                              , formula
                                              , scale_data = T
                                              , center_data = T
                                              , exclude_na_columns = T){

  data = as_tibble(data)

  vars = oetteR::f_manip_get_variables_from_formula(formula)
  response_var = oetteR::f_manip_get_response_variable_from_formula(formula)

  data_trans = select(data, one_of( c(response_var, vars ) ) )

  if( scale_data == T){
    data_pred = data_trans %>%
      select( one_of(vars) ) %>%
      mutate_if( is.numeric, scale, center = center_data)

    data_trans = data_trans %>%
      select( one_of(response_var) ) %>%
      bind_cols( data_pred )
  }

  if( exclude_na_columns == T){

    data_trans = data_trans %>%
      select_if( function(x) ! any(is.na(x)) )

    vars = names( select(data_trans, - one_of(response_var) ) )

    formula = paste( response_var, '~', paste(vars, collapse = ' + ') ) %>%
      as.formula
  }

  data_keep  = select(data, - one_of( c(response_var, vars ) ) )

  matrix_trans = model.matrix(formula, data_trans) %>%
    as_tibble %>%
    select( - one_of( '(Intercept)' ) )

  #correct names

  new_names = names(matrix_trans) %>%
    gsub("([^A-Za-z0-9])+", "", x = .)

    # stringi::stri_enc_toascii() %>%
    # stringr::str_replace_all( '\\.', '_' ) %>%
    # stringr::str_replace_all( ' ', '_' )
    # stringr::str_replace_all( ',', '_' ) %>%
    # stringr::str_replace_all( '\\-', '_' ) %>%
    # stringr::str_replace_all( '\\+', '_' ) %>%
    # stringr::str_replace_all( '\\^', '_' ) %>%
    # stringr::str_replace_all( '\\/', '_' ) %>%
    # stringr::str_replace_all( '\\', '_' )

  names(matrix_trans) = new_names

  new_formula = paste( response_var, '~', paste(new_names, collapse = ' + ') ) %>%
    as.formula

  data_new = select(data, one_of(response_var) ) %>%
    bind_cols( matrix_trans ) %>%
    bind_cols( data_keep )

  return( list( data = data_new, formula = new_formula ) )

}

#' @title bin numerical columns
#' @description centers, scales and Yeo Johnson transforms numeric variables in
#'   a dataframe before binning into n bins of eqal range. Outliers based on
#'   boxplot stats are capped (set to min or max of boxplot stats).
#' @param df dataframe with numeric variables
#' @param bins number of bins for numerical variables, Default: 5
#' @param bin_labels labels for the bins from low to high, Default: c("LL",
#'   "ML", "M", "MH", "HH")#' @param center boolean, Default: T
#' @param scale boolean, Default: T
#' @param center boolean, Default: T
#' @param transform boolean, Default: T
#' @return dataframe
#' @examples
#' \dontrun{
#' if(interactive()){
#'  #EXAMPLE1
#'  }
#' }
#' @rdname f_manip_bin_numerics
#' @export
f_manip_bin_numerics = function(df
                                , bins = 5
                                , bin_labels = c('LL', 'ML', 'M', 'MH', 'HH')
                                , center = T
                                , scale = T
                                , transform = T){
  require( recipes )

  if( length(bin_labels) != bins ){
    stop( 'bin_labes must be equal to bins')
  }

  numerics = df %>%
    select_if( is.numeric ) %>%
    select_if( function(x) var(x) > 0 ) %>%  ##boxplotstats produces NA if var == 0
    names()

  if( is_empty(numerics) ){
    return( df )
  }

  rec = recipe(df)

  if( center ) rec = rec %>%
    step_center( one_of(numerics) )

  if( scale ) rec = rec  %>%
    step_scale( one_of(numerics) )

  if( transform ) rec = rec %>%
    step_YeoJohnson( one_of(numerics) )

  rec = rec %>%
    prep()

  rename_levels = function(x){
    levels(x) = bin_labels
    return(x)
  }

  data_new <- bake(rec, df ) %>%
    mutate_at( vars(numerics), function(x) ifelse( x > max(boxplot.stats(x)$stats)
                                                   , max(boxplot.stats(x)$stats)
                                                   , x)
               ) %>%
  mutate_at( vars(numerics), function(x) ifelse( x < min(boxplot.stats(x)$stats)
                                                 , min(boxplot.stats(x)$stats)
                                                 , x)
             ) %>%
  mutate_at( vars(numerics), function(x) cut(x, breaks = bins) ) %>%
    mutate_at( vars(numerics),  rename_levels)

  return(data_new)

}


#' @title converts columns of type double to integer if maximum number of
#'   decimal digits is zero
#' @param df dataframe
#' @return tibble
#' @examples
#' as_tibble(mtcars)
#' f_manip_double_2_int(mtcars)
#' @rdname f_manip_double_2_int
#' @export

f_manip_double_2_int = function( df ){

  get_no_digits = function(x){

    if( ! is.numeric(x) ){
      return(NULL)
    }

    x = x %% 1
    x = as.character(x)

    no_digits = nchar(x) - 2

    no_digits = ifelse( no_digits == -1, 0, no_digits )

    return(no_digits)

  }

  suppressWarnings({

    new_df = df %>%
      as_tibble() %>%
      mutate_if( function(x) max( get_no_digits(x), na.rm = T ) == 0, as.integer )

  })

  return(new_df)

}
erblast/oetteR documentation built on May 27, 2019, 12:11 p.m.