R/f_model_var_dep.R

#' @title generates sequence of variable spanning from min to max
#' @description similar to modelr::seq_range but can handle categorical variables
#' @param data_ls data_ls object generated by f_clean_data(), or a named list
#'   list( data = <dataframe>, numericals = < vector with column names of
#'   numerical columns>)
#' @param col_var character vector denoting variable
#' @param n integer number of intermediate data points, Default: 500
#' @return vector
#' @examples
#' data_ls = f_clean_data(mtcars)
#' col_var = 'disp'
#' f_model_seq_range( data_ls, col_var, 10)
#' @rdname f_model_seq_range
#' @export
f_model_seq_range = function( data_ls, col_var, n = 500 ){

  vec = data_ls$data[[col_var]]

  if(col_var %in% data_ls$numericals){
    sequence = seq( min(vec), max(vec), length.out = n)
  }else{
    # we need to maintain the variable type (unordered vs ordered)
    # this approach is messy but a bit safer then extracting levels
    # as character and then converting back to the original data_type
    sequence = tibble( levels = vec ) %>%
      group_by(levels) %>%
      mutate( rwn = 1
              , rwn = cumsum(rwn) ) %>%
      filter( rwn == 1 ) %>%
      arrange(levels) %>%
      .$levels
  }

  return(sequence)
}

#' @title generates a data grid based on a formula
#' @description the range of one specified variable is expanded, while all other
#'   variables are set to the most common values. Similar to modelr::data_grid
#'   but it can deal with factors.
#' @param col_var character vector, denoting variable that should be expanded
#' @param data_ls data_ls object generated by f_clean_data(), or a named list
#'   list( data = <dataframe>, numericals = < vector with column names of
#'   numerical columns>)
#' @param formula formula
#' @param n integer, length of grid, datapoints in between range of col_var
#' @param set_manual named list, set some variables manually instead of
#'   defaulting to median or most common factor. !! Values need to be of the same
#'   variable type as in the original data.
#' @return dataframe
#' @examples
#' data_ls = f_clean_data(mtcars)
#' formula = disp~cyl+mpg+hp
#' f_model_data_grid( 'mpg', data_ls, formula,  10 )
#' f_model_data_grid( 'mpg', data_ls, formula,  10 , set_manual = list( cyl = min(data_ls$data$cyl) ) )
#' @rdname f_model_data_grid
#' @export
f_model_data_grid = function( col_var, data_ls, formula, n = 500, set_manual = list() ){

  vars         = f_manip_get_variables_from_formula(formula)

  response_var_sym = as.name(col_var)

  if( ! col_var %in% vars ){
    stop('f_model_data_grid supplied variable is not a response variable in formula')
  }

  summarized_ls = f_manip_summarize_2_median_and_most_common_factor(data_ls)

  range = f_model_seq_range( data_ls, col_var, n )

  grid = summarized_ls$data %>%
    select( one_of(vars) )%>%
    mutate( !!response_var_sym := list( tibble( !! response_var_sym := range ) ) ) %>%
    unnest( !!response_var_sym , .drop = F) %>%
    select( one_of(col_var), everything() )


  #implement manually set values

  if( ! all(names(set_manual) %in% data_ls$all_variables ) ){
    stop('variables to be set manually for grid have not been found')
  }

  if( col_var %in% names(set_manual) ){
    set_manual = set_manual[ names(set_manual) != col_var ]
  }

  if( ! is_empty( set_manual ) ){

    tib = as_tibble( set_manual ) %>%
      sample_n( nrow(grid), T )

    grid = grid[, ! names(grid) %in% names(tib) ] %>%
      bind_cols(tib)
  }

  return(grid)

}

#' @title add predictions to grid (regression models)
#' @description wrapper for f_predict_regression_add_predictions
#' @param grid grid containing all variables used for the model
#' @param m model
#' @param var character vector denoting response variable
#' @return grid
#' @examples
#' data_ls = f_clean_data(mtcars)
#' formula = disp~hp+mpg
#' m = lm(formula, data_ls$data)
#' grid = f_model_data_grid(data_ls, formula, 'hp', 10) %>%
#'   f_model_add_predictions_2_grid_regression( m, 'disp')
#' @seealso
#'  \code{\link[modelr]{add_predictions}}
#' @rdname f_model_add_predictions_2_grid_regression
#' @export
#' @importFrom modelr add_predictions
f_model_add_predictions_2_grid_regression = function( grid, m, var){

  grid = f_predict_regression_add_predictions( grid, m ) %>%
    rename( !! as.name(var) := pred )

  return(grid)

}



#' @title plot model dependency on most important variables
#' @description response variable will be plotted against the entire range of
#'   each variable staring with the most important ones. All other variables
#'   will be set to median or most common factor. This function requires a ranked list
#'   of the most important variables as returned by f_model_importance()
#' @param m a regression model
#' @param ranked_variables datafram as returned by f_model_importance()
#' @param title character vector as plot title, Default:
#'   unlist(stringr::str_split(class(m)[1], "\\."))[1]
#' @param data a dataframe, only necessary if it differs from data_ls$data, Default: NULL
#' @param formula the formula used to train the model
#' @param data_ls data_ls object generated by f_clean_data(), or a named list
#'   list( data = <dataframe>, numericals = < vector with column names of
#'   numerical columns>)
#' @param variable_color_code dataframe created by f_plot_color_code_variables()
#' @param limit integer limit the number of variables to be plotted, Default: 12
#' @param set_manual named list, set some variables manually instead of
#'   defaulting to median or most common factor. !! Values need to be of the same
#'   variable type as in the original data.
#' @param log_y boolean log_scale for y axis
#' @param ... arguments passed to facet_wrap e.g. usefull for nrow, ncol
#' @return plot
#' @examples
#'
#' # regular version--------------------------------------
#' data_ls             = f_clean_data(mtcars)
#' data                = data_ls$data
#' formula             = disp~hp+mpg+cyl
#' m                   = randomForest::randomForest(formula, data)
#' ranked_variables    = f_model_importance( m, data)
#' variable_color_code = f_plot_color_code_variables(data_ls)
#' limit               = 12
#' f_model_plot_variable_dependency_regression( m
#'                                              , ranked_variables
#'                                              , title = unlist( stringr::str_split( class(m)[1], '\\.') )[1]
#'                                              , formula = formula
#'                                              , data_ls = data_ls
#'                                              , variable_color_code = variable_color_code
#'                                              , limit = limit
#'                                              )
#'
#' #pipe version ------------------------------------------
#'
#' data_ls = f_clean_data(mtcars)
#' form = as.formula('disp~hp+cyl+wt')
#' variable_color_code = f_plot_color_code_variables(data_ls)
#' limit            = 10
#'
#'  pl = pipelearner::pipelearner( data_ls$data ) %>%
#'   pipelearner::learn_models( rpart::rpart, form ) %>%
#'   pipelearner::learn_models( randomForest::randomForest, form ) %>%
#'   pipelearner::learn_models( e1071::svm, form ) %>%
#'   pipelearner::learn() %>%
#'   mutate( imp   = map2(fit, train, f_model_importance)
#'           ,plot = pmap( list( m = fit, ranked_variables = imp, title = model, data = train)
#'                         , .f = f_model_plot_variable_dependency_regression
#'                         , formula = form
#'                         , data_ls = data_ls
#'                         , variable_color_code = variable_color_code
#'                        , limit = limit
#'          )
#'   )
#'
#' @seealso \code{\link[stringr]{str_split}}
#' @rdname f_model_plot_variable_dependency_regression
#' @export
#' @importFrom stringr str_split
f_model_plot_variable_dependency_regression = function( m
                                                       , ranked_variables
                                                       , title = unlist( stringr::str_split( class(m)[1], '\\.') )[1]
                                                       , data = NULL
                                                       , formula
                                                       , data_ls
                                                       , variable_color_code = f_plot_color_code_variables(data_ls)
                                                       , limit = 12
                                                       , log_y = F
                                                       , set_manual = list()
                                                       , ...
                                                      ){

  # modelling data might have been filtered therefore we will replace data
  # data_ls

  if( !is.null(data) ){
    data_ls$data = as.data.frame(data)
  }

  response_var = f_manip_get_response_variable_from_formula( formula )
  vars         = f_manip_get_variables_from_formula( formula )

  #plot function

  vars_and_col = ranked_variables %>%
    rename( variables = row_names ) %>%
    arrange_( 'rank' ) %>%
    filter( variables %in% vars ) %>%
    head( limit ) %>%
    left_join( variable_color_code ) %>%
    arrange( variables )

  col_vector = vars_and_col[['color']]

  grid = vars_and_col %>%
    mutate( grid = map( variables
                        , f_model_data_grid
                        , data_ls
                        , formula
                        , set_manual = set_manual )
            , grid = map( grid, f_model_add_predictions_2_grid_regression, m, response_var)
            ) %>%
    unnest(grid, .drop = F) %>%
    mutate_if( is.factor, f_manip_factor_2_numeric ) %>%
    mutate( rwn = 1:nrow(.)
            , x = map2(variables, rwn, function(var,rwn,data) data[[var]][rwn], .)
            , x = unlist(x)
            , variables = forcats::fct_reorder(variables, desc(value) )
            )

  # bring colors into same order as the factor variable
  vars_and_col = vars_and_col %>%
    mutate( variables = forcats::fct_reorder(variables, desc(value) ) ) %>%
    arrange(variables)


  if( log_y ){

    grid[, response_var][ grid[, response_var] <= 0] = 0.01
    grid[, response_var] = log( grid[, response_var] )

  }

  p = ggplot( grid, aes_string( 'x', response_var, color = 'variables') ) +
      geom_line( size = 2) +
      facet_wrap(~variables
                 , scales = 'free_x'
                 , ... ) +
      theme( legend.position = 'None') +
      labs( x = ''
            , y = response_var
            , title = title
            ) +
      scale_color_manual( name = 'variables', values = vars_and_col$color , limits = levels(grid$variables) )

    return(p)

}


#' @title plot vmodel varaible dependency over the range of a specified variable
#' @description Some models are able to capture relative dependencies. In order
#'   to visualise them the dataset is split into three parts. 0-25,25-75,75-100
#'   percentile or the three most common factors.Then variable dependencies for
#'   each of the three splits are plotted. In the mtcars example below we can
#'   see that the model predicts an increase in disp if drat increases for cars
#'   with 8 cylinders, while the opposite is true for cars with only 6
#'   cylinders.
#' @param m a model
#' @param title model title
#' @param variables character vector with variable names, or ranked variables as
#'   returned by f_model_importance()
#' @param range_variable character vector denoting range variable
#' @param data dataset
#' @param formula formula
#' @param data_ls data_ls object generated by f_clean_data(), or a named list
#'   list( data = <dataframe>, numericals = < vector with column names of
#'   numerical columns>) - The data_ls object provides the entire dataset
#' @param variable_color_code dataframe created by f_plot_color_code_variables()
#' @param limit integer limit the number of variables to be plotted, Default: 12
#' @param log_y boolean log_scale for y axis
#' @return grid can be printed with gridExtra::grid.arrange()
#' @examples
#' \dontrun{
#'
#'  # single output example ---------------------------------------
#' .f                  = randomForest::randomForest
#' data_ls             = f_clean_data(mtcars)
#' data                = data_ls$data
#' formula             = disp~mpg+cyl+am+hp+drat+qsec+vs+gear+carb
#' m                   = .f(formula, data)
#' variables           = f_model_importance( m, data)
#' title               = unlist( stringr::str_split( class(m)[1], '\\.') )[1]
#' variable_color_code = f_plot_color_code_variables(data_ls)
#' limit               = 10
#' log_y               = F
#'
#' range_variable_num  = data_ls$numericals[1]
#' range_variable_cat  = data_ls$categoricals[1]
#'
#' grid_num = f_model_plot_var_dep_over_spec_var_range(m
#'                                                     , title
#'                                                     , variables
#'                                                     , range_variable_num
#'                                                     , data
#'                                                     , formula
#'                                                     , data_ls
#'                                                     , variable_color_code
#'                                                     , log_y
#'                                                     , limit  )
#' gridExtra::grid.arrange(grid_num)
#'
#' # pipe example ------------------------------------------------
#'
#' data_ls = f_clean_data(mtcars)
#' form = as.formula('disp~cyl+mpg+hp+am+gear+drat+wt+vs+carb')
#' variable_color_code = f_plot_color_code_variables(data_ls)
#'
#' grids = pipelearner::pipelearner(data_ls$data) %>%
#'   pipelearner::learn_models( rpart::rpart, form ) %>%
#'   pipelearner::learn_models( randomForest::randomForest, form ) %>%
#'   pipelearner::learn_models( e1071::svm, form ) %>%
#'   pipelearner::learn() %>%
#'   dplyr::mutate( imp = map2(fit, train, f_model_importance)
#'                  , range_var = map_chr(imp, function(x) head(x,1)$row_names )
#'                  , grid = pmap( list( m = fit
#'                                       , title = model
#'                                       , variables = imp
#'                                       , range_variable = range_var
#'                                       , data = test
#'                  )
#'                  , f_model_plot_var_dep_over_spec_var_range
#'                  , formula = form
#'                  , data_ls = data_ls
#'                  , variable_color_code = variable_color_code
#'                  , log_y = F
#'                  , limit = 12
#'                  )
#'   )  %>%
#'   .$grid
#'
#' f_plot_obj_2_html( grids,  type = "grids", output_file =  'test_me', title = 'Grids', height = 30 )
#'
#' file.remove('test_me.html')
#' }
#' @seealso \code{\link[gridExtra]{arrangeGrob}}
#' @rdname f_model_plot_var_dep_over_spec_var_range
#' @export
#' @importFrom gridExtra arrangeGrob grid.arrange
f_model_plot_var_dep_over_spec_var_range = function( m
                                                    , title
                                                    , variables
                                                    , range_variable
                                                    , data
                                                    , formula
                                                    , data_ls
                                                    , variable_color_code
                                                    , log_y = F
                                                    , limit = 12) {

  if( ! range_variable %in% data_ls$all_variables ){
    stop( 'range_var not found in data_ls' )
  }

  if( ! is.data.frame(variables) | ! is_tibble(variables) ){

    variables = tibble( row_names = variables
                        ,rank = 1:length(variables) )

  }

  if( ! all(variables$row_names %in% data_ls$all_variables) ){
    stop('not all variables can be found in data')
  }

  # symbol

  sym_range_variable = as.name(range_variable)

  # resample object to dataframe

  data = data %>%
    as.data.frame() %>%
    as_tibble()

  #in case data is only a subset of data_ls
  data_ls$data = data


  # ranges -------------------------------------------------------

  if( range_variable %in% data_ls$numericals ){
    stats = boxplot.stats( data[[range_variable]] )$stats
  }else{
    stats = data_ls$data %>%
      group_by( !! sym_range_variable ) %>%
      count() %>%
      arrange( desc(n) ) %>%
      .[[1]]
      head(3)
  }



  # plot function -------------------------------------------------

  caption = 'Graphs show model response to changes in selected input variables,
           while all other input variables are kept constant at median values.'

  f_plot = function( filtered_data, title_seg, subtitle_seg, caption = '', ... ){

    p = f_model_plot_variable_dependency_regression( m = m
                                                 , title = title_seg
                                                 , ranked_variables = variables
                                                 , data = filtered_data
                                                 , formula = formula
                                                 , data_ls = data_ls
                                                 , variable_color_code = variable_color_code
                                                 , limit = limit
                                                 , log_y = log_y
                                                 , ncol = 3
                                                 , ...
                                                ) +
      labs(title = title_seg
           , subtitle = subtitle_seg
           , caption =  caption)
  }

  #All  ---------------------------------------------------------

  title_all    = paste('Variable Dependency', title)
  subtitle_all = paste( 'Unfiltered')

  data_all = data_ls$data

  p_all = f_plot( filtered_data    = data_all
                    , title_seg    = title_all
                    , subtitle_seg = subtitle_all
                    , caption = caption )


  #Histogram ------------------------------------------------------

  if(range_variable %in% data_ls$numericals){

    data_ls_hist = data_ls
    data_ls_hist$data = data_ls_hist$data %>%
      mutate( hist_color = ifelse(    between( !! sym_range_variable, stats[1], stats[2] ), '0-25 perc.'
                           , ifelse(  between( !! sym_range_variable, stats[2], stats[4] ), '25-75 perc.'
                           , ifelse(  between( !! sym_range_variable, stats[4], stats[5] ), '75-100 perc.'
                           , 'outlier') ) )
      )

    data_ls_hist$categoricals = c( data_ls_hist$categoricals, 'hist_color' )

    p_hist = f_plot_hist( range_variable, data_ls_hist, graph_type = 'bar', group = 'hist_color') +
      labs( title = paste('Histogram', range_variable )
            , fill = '') +
      theme( legend.position = 'bottom' ) +
      scale_fill_brewer( palette = 'Dark2')

  } else{

    data_ls_hist = data_ls
    data_ls_hist$data = data_ls_hist$data %>%
      mutate( hist_color = ifelse( UQ(sym_range_variable) %in% stats, 'three most common levels', 'minor level'  )
              , hist_color = as.factor( hist_color )
      )

    data_ls_hist$categoricals = c( data_ls_hist$categoricals, 'hist_color' )

    p_hist = f_plot_hist( range_variable, data_ls_hist, graph_type = 'bar', group = 'hist_color', p_val = F ) +
      labs( title = paste('Histogram', range_variable )
            , fill = '') +
      theme( legend.position = 'bottom' ) +
      scale_fill_brewer( palette = 'Dark2')

  }

  #plot1 ---------------------------------------------------------

  title1    = paste('Variable Dependency', title)

  if( range_variable %in% data_ls$numericals ){
    subtitle1 = paste( range_variable, 'in 0-25 percentile of', range_variable)

    data1 = data_ls$data %>%
      filter( between( !! sym_range_variable, stats[1], stats[2] ) )

  }else{
    subtitle1 = paste( range_variable, '=', stats[1] )

    data1 = data_ls$data %>%
      filter( UQ(sym_range_variable) == stats[1] ) ## for TRUE FaLSE expressions we have to use  UQ() instead of !!

  }

  p1 = f_plot( filtered_data  = data1
               , title_seg    = title1
               , subtitle_seg = subtitle1)
  #plot2 ---------------------------------------------------------

  title2    = ''

  if( range_variable %in% data_ls$numericals ){
    subtitle2 = paste( range_variable, 'in 25-75 percentile of', range_variable)

    data2 = data_ls$data %>%
      filter( between( !! sym_range_variable, stats[2], stats[4] ) )

  }else{
    subtitle2 = paste( range_variable, '=', stats[2] )

    data2 = data_ls$data %>%
      filter( UQ(sym_range_variable) == stats[2] ) # for TRUE FaLSE expressions we have to use  UQ() instead of !!

  }

  p2 = f_plot( filtered_data  = data2
               , title_seg    = title2
               , subtitle_seg = subtitle2)

  #plot3 ---------------------------------------------------------


  title3    = ''

  if( range_variable %in% data_ls$numericals ){
    subtitle3 = paste( range_variable, 'in 75-100 percentile of', range_variable)

    data3 = data_ls$data %>%
      filter( between( !! sym_range_variable, stats[4], stats[5] ) )

  }else{
    subtitle3 = paste( range_variable, '=', stats[3] )

    data3 = data_ls$data %>%
      filter( UQ(sym_range_variable) == stats[2] ) # for TRUE FaLSE expressions we have to use  UQ() instead of !!

  }

  p3 = f_plot( filtered_data  = data3
               , title_seg    = title3
               , subtitle_seg = subtitle3
               , caption      = caption )
  # arrange ---------------------------------------------------------

  nrow = ifelse( nrow(variables) > limit, limit, nrow(variables) )
  nrow = ceiling( nrow/3 )

  g = gridExtra::arrangeGrob( p_all, p_hist, p1, p2, p3, ncol = 1 )

}
erblast/oetteR documentation built on May 27, 2019, 12:11 p.m.