R/f_stat.R

#' @title calculates maximum difference in group means and medians
#' @description used as a helper function for f_stat_anova
#' @param df dataframe
#' @param col_group character vector denoting grouping variable
#' @param col_variable character vector denoting variable
#' @return dataframe
#' @examples
#' set.seed(1)
#' df = tibble( fct = sample(LETTERS[1:5], 100, replace = T)
#'              , v1  = 1
#'              , v2  = rnorm(100, 4)
#'              , v3  = c( rep(3, 50), rep(8,50) )
#' )
#'
#' col_group = 'fct'
#'
#' f_stat_diff_of_means_medians(df, col_group, 'v1') %>%
#'   bind_rows( f_stat_diff_of_means_medians(df, col_group, 'v2') ) %>%
#'   bind_rows( f_stat_diff_of_means_medians(df, col_group, 'v3') )
#' @rdname f_stat_diff_of_means_medians
#' @export
f_stat_diff_of_means_medians = function(df, col_group, col_variable){


  data = df %>%
    select( group = one_of(col_group), variable = one_of(col_variable) )%>%
    mutate ( variable = f_manip_bring_to_pos_range(variable) ) %>%
    group_by( group ) %>%
    summarise( means     = mean(variable,   na.rm = T)
               , medians = median(variable, na.rm = T) ) %>%
    ungroup() %>%
    summarise( variable               = col_variable
               , diff_of_means        = max(means) - min(means)
               , diff_of_means_perc   = ( ( max(means) - min(means) ) /max(means) ) *100
               , diff_of_medians      = max(medians) - min(medians)
               , diff_of_medians_perc = ( ( max(medians) - min(medians) ) /max(means) ) *100    )


  return(data)
}

#' @title generate a datatframe with anova results from a data_ls list
#' @description returns a dataframe with shapiro, anova und kruskal p values
#'   supplemented with maximum difference of means and medians between groups
#'@param data_ls data_ls object generated by f_clean_data(), or a named list
#'  list( data = <dataframe>, numericals = < vector with column names of
#'  numerical columns>)
#' @param col_group character vector denoting grouping variable
#' @param boxcox perform analysis on boxcox-transformed or numerical variables
#' @return dataframe
#' @examples
#' df_anova = data_ls = f_clean_data(mtcars) %>%
#'   f_stat_anova('cyl')
#'
#' df_anova
#' @seealso \code{\link[stringr]{str_c}}
#' \code{\link[purrr]{map}},\code{\link[purrr]{map_dbl}}
#' \code{\link{f_stat_anova}}
#' @rdname f_anova_stats
#' @export
#' @importFrom stringr str_c
#' @importFrom purrr map map_dbl
f_stat_anova = function(data_ls, col_group, boxcox = F) {

  if( ! col_group %in% data_ls$categoricals ){
    stop('grouping variable not in categoricals')
  }

  if( is.null(data_ls$numericals) | is_empty(data_ls$numericals) ){
    return()
  }

  if( boxcox == T & ! is.null(data_ls$boxcox) ){

    variables = data_ls$boxcox_names
    data = data_ls$boxcox_data

  }else{
    variables = data_ls$numericals
    data = data_ls$data

  }

  formula = stringr::str_c('value~',col_group) %>%
    as.formula()

  df_anova = data %>%
    as_tibble() %>%
    select( one_of( c(col_group, variables) ) ) %>%
    gather(key = 'variable', value = 'value', one_of( variables ) ) %>%
    group_by( variable ) %>%
    filter( sd(value) != 0 ) %>%
    nest( one_of(col_group), value) %>%
    mutate( model_anova     = purrr::map( data, ~aov( formula, data = .))
            , summary_anova = purrr::map( model_anova, summary)
            , summary_anova = purrr::map( summary_anova
                                          , function(x) x[[1]])
            , anova_pval    = purrr::map(summary_anova, 'Pr(>F)')
            , anova_pval    = purrr::map_dbl(anova_pval
                                             , function(x) x[1])
            , model_kruskal = purrr::map( data, ~kruskal.test(formula, data = .) )
            , kruskal_pval  = purrr::map_dbl(model_kruskal, 'p.value')
            , model_shapiro = purrr::map(data, function(x) f_stat_shapiro( x$value ) )
            , shapiro_stat  = purrr::map_dbl(model_shapiro,'statistic')
            , shapiro_pval  = purrr::map_dbl(model_shapiro,'p.value')
            , diff_df       = purrr::map( data, f_stat_diff_of_means_medians, col_group = col_group, col_variable = 'value') ) %>%
    unnest(diff_df) %>%
    select(variable
           , shapiro_stat
           , shapiro_pval
           , anova_pval
           , kruskal_pval
           , diff_of_means
           , diff_of_means_perc
           , diff_of_medians
           , diff_of_medians_perc
    )

  return(df_anova)
}


#' @title calculate the maximal difference in frequencies between to categorical variables
#' @description used as a helper function for f_stat_chi_square
#' @param df dataframe containing both avariables
#' @param col_var1 character vector denoting variable column 1
#' @param col_var2 character vector denoting variable column 2
#' @return dataframe
#' @examples
#' data_ls = f_clean_data(mtcars)
#' df_chi_squ = f_stat_max_diff_of_freq(data_ls$data, 'cyl', 'gear')
#' @rdname f_stat_max_diff_of_freq
#' @export
f_stat_max_diff_of_freq = function(df, col_var1, col_var2){

  t = table( df[[col_var1]], df[[col_var2]] ) %>%
    as.data.frame() %>% # as_tibble cannot convert the table input anymore
    as_tibble() %>%
    group_by( Var1 ) %>% #Var1, Var2 and Freq are the automatically assigneed column names
    mutate ( diff_var1          = ( max(Freq)-min(Freq) )
             , diff_var1_perc = ( ( max(Freq)-min(Freq))/ max(Freq) *100)
    ) %>%
    group_by( Var2 ) %>%
    mutate ( diff_var2          = ( max(Freq)-min(Freq) )
             , diff_var2_perc = ( ( max(Freq)-min(Freq))/ max(Freq) *100)
    ) %>%
    ungroup()%>%
    summarise( max_diff_freq          = max( c(diff_var1, diff_var2) )
               , max_diff_freq_perc  = max( c(diff_var1_perc, diff_var2_perc) )
    )
}


#' @title generate a datatframe with chi square results from a data_ls list
#' @description FUNCTION_DESCRIPTION
#'@param data_ls data_ls object generated by f_clean_data(), or a named list
#'  list( data = <dataframe>, numericals = < vector with column names of
#'  numerical columns>)
#' @param col_group character vector denoting grouping variable
#' @return dataframe
#' @examples
#' data_ls = f_clean_data(mtcars)
#' df_chi_squ = f_stat_chi_square(data_ls, 'cyl')
#' df_chi_squ
#' @seealso
#'  \code{\link[purrr]{is_empty}},\code{\link[purrr]{map}},\code{\link[purrr]{map_dbl}}
#' @rdname f_stat_chi_square
#' @export
#' @importFrom purrr is_empty map map_dbl
f_stat_chi_square = function(data_ls, col_group) {

  if( ! col_group %in% data_ls$categoricals ){
    stop('grouping variable not in categoricals')
  }

  data = data_ls$data

  variables = data_ls$categoricals[!data_ls$categoricals == col_group]

  if(purrr::is_empty(variables)) return()

  suppressWarnings({

    df_chi = data %>%
      as_tibble() %>%
      select( one_of( c(col_group, variables) ) ) %>%
      gather(key = 'variable', value = 'value', one_of( variables  ) ) %>%
      group_by( variable ) %>%
      nest( one_of( col_group ), value) %>%
      mutate(  model_chi = purrr::map( data, ~chisq.test(x = .[[col_group]]
                                                         , y = .[['value']]
                                                         )
                                       )
                ,chi_pval  = purrr::map_dbl(model_chi, 'p.value')
                ,diff_df   = purrr::map(data, f_stat_max_diff_of_freq, col_group, 'value')  ) %>%
      unnest(diff_df) %>%
      select(variable, chi_pval, max_diff_freq, max_diff_freq_perc)

  })

  return(df_chi)
}

#' @title combines anova with chi square results into single dataframe
#' @description keeps wither the anova or the kruskal p value depending on the
#'   results of the shapiro test (shapiro_stat > 0.9 p_val > 0.05) and keeps
#'   the difference of percent of the mean. Still works if one of the input
#'   dataframes is NULL
#' @param df_anova dataframe created with f_stat_anova, Default: NULL
#' @param df_chi_square dataframe created with f_stat_chi_square(), Default: NULL
#' @return OUTPUT_DESCRIPTION
#' @details DETAILS
#' @examples
#'  data_ls = f_clean_data(mtcars)
#'  df_chi_squ = f_stat_chi_square(data_ls, 'cyl')
#'  df_anova = f_stat_anova(data_ls, 'cyl')
#'  df_comb = f_stat_combine_anova_with_chi_square(df_anova, df_chi_squ)
#'  df_comb
#'  df_comb = f_stat_combine_anova_with_chi_square(df_anova)
#'  df_comb
#'  df_comb = f_stat_combine_anova_with_chi_square(df_chi_square = df_chi_squ)
#'  df_comb
#' @rdname f_stat_combine_anova_with_chi_square
#' @export

f_stat_combine_anova_with_chi_square = function(df_anova = NULL, df_chi_square = NULL){

  if( ! is_empty(df_anova) | ! is.null(df_anova) ) {
    df_anova = df_anova %>%
      mutate( final_p_value = ifelse(shapiro_stat > 0.9 & shapiro_pval > 0.05
                                     , anova_pval, kruskal_pval) ) %>%
      select(variable, p_value = final_p_value , diff_perc = diff_of_means_perc)
  }

  if( ! is_empty(df_chi_square) | ! is.null(df_chi_square) ) {
    df_chi_square = df_chi_square %>%
      select(variable, p_value = chi_pval, diff_perc = max_diff_freq_perc)
  }

  # bind_rows takes NULL without error
  df_comb = df_anova %>%
  bind_rows( df_chi_square )%>%
  arrange(p_value)

}

#'@title analyse group difference of dataset
#'@description creates a html document with a group analysis including:
#'  \itemize{ \item P value table \item Dynamic Plots of all significant
#'  features \item static plots with brackets indicating statistical differences
#'  \item Tabplot \item Alluvial Plot \item table containing means and medians
#'  for numerical variables \item table containing counts and percentages for
#'  categorical variables } The function automatically renders three html pages
#'  one for the additional static plots, one for the tableplot and one for the
#'  alluvial plots. In the same direcotry that can be determined by the
#'  outputfile parameter. Default behaviour will also render the entire html
#'  document returning the filepath of the new html file. The other three html
#'  files will be linked to in the document. You can modify the function to
#'  return a htmltools taglist instead. The above mentioned 3 additional html
#'  files for the other types of plots will still be rendered though with
#'  default settings. These extra plots can be switched off though.
#'@param data_ls data_ls object generated by f_clean_data(), or a named list
#'  list( data = <dataframe>, numericals = < vector with column names of
#'  numerical columns>)
#'@param col_group character vector denoting grouping columns
#'@param thresh_p_val p value threshold for plots, Default: 0.05
#'@param thresh_diff_perc minimum percent difference threshold for plots,
#'  Default: 3
#'@param static_plots boolean, render static plots indicating statistical
#'  differences with brackets, Default = TRUE
#'@param alluvial boolean, render alluvial plot, Default: TRUE
#'@param alluvial_thresh_p_val double, threshold for feature to be inlcuded in
#'  alluvial plot. Features that are not highly significant and convey a large
#'  percental difference will result in a high number of flows thus cluttering
#'  the plot. It is not recommended to set these thresholds lower than the
#'  default. Default: 0.05
#'@param alluvial_thresh_diff_perc double, threshold for feature to be inlcuded
#'  in alluvial plot. Features that are not highly significant and convey a
#'  large percental difference will result in a high number of flows thus
#'  cluttering the plot. It is not recommended to set these thresholds lower
#'  than the default. Default: 7.5
#'@param tabplot boolean, render tabplot threshold for features are the same as
#'  for the dynamic plots, Default: TRUE , static_plots = T
#'@param return_taglist boolean, return taglist instead of rendereing the final
#'  html document and returning the link to the html file. Usefull if analysis
#'  should be directly included into the current markdown document.
#'@param output_file character vector containing output file name
#'@param max_alluvial_flows integer, maximum number of alluvial flows. Alluvial
#'  Plots can take a long time to render. Rendering an alluvial plot with the
#'  default setting of 1500 should take at least 10 min. Default 1500
#'@param fig.width integer Width of Alluvial and Tabplot in inches. Default
#'  values can be comfortably viewed on a 1920 x 1080 screen resolution. Default: 16
#'@param fig.height integer height of Alluvial and Tabplot in inches. Default
#'  values can be comfortably viewed on a 1920 x 1080 screen resolution. Default: 10
#'@param quiet booloean, suppress render markdown output to console, Default: FALSE
#'@return file path to html file / or taglist
#' @examples
#' \dontrun{
#'  data_ls = f_clean_data(mtcars)
#'  f_stat_group_ana(data_ls, 'cyl', output_file = 'test_me')
#'  file.remove('test_me.html')
#'  file.remove('test_me_stat_plots.html')
#'  file.remove('test_me_alluvial.html')
#'  file.remove('test_me_tabplots.html')
#' }
#'@seealso \code{\link[plotly]{ggplotly}}
#'  \code{\link[htmltools]{tagList}},\code{\link[htmltools]{h1}},\code{\link[htmltools]{h2}}
#'
#'
#'
#'
#'@rdname f_stat_group_ana
#'@export
#'@importFrom plotly ggplotly
#'@importFrom htmltools tagList h1 h2 h3 h4 h5 h6
f_stat_group_ana = function(data_ls
                              , col_group
                              , thresh_p_val = 0.05
                              , thresh_diff_perc = 3
                              , output_file = 'group_ana'
                              , static_plots = T
                              , alluvial = T
                              , alluvial_thresh_p_val = 0.05
                              , alluvial_thresh_diff_perc = 7.5
                              , max_alluvial_flows = 1500
                              , tabplot = T
                              , return_taglist = F
                              , fig.width = 16
                              , fig.height = 10
                              , quiet = FALSE
                              ){

  df_anova = f_stat_anova( data_ls, col_group )
  df_chi   = f_stat_chi_square( data_ls, col_group )
  df_comb  = f_stat_combine_anova_with_chi_square( df_anova, df_chi ) %>%
    mutate( stars = f_stat_stars(p_value) ) %>%
    select( variable, stars, p_value, diff_perc )

  df_means = f_stat_group_mean_medians(data_ls, col_group)
  df_perc  = f_stat_group_counts_percentages(data_ls, col_group)



  f_plot_plotly = function( var, title, col_group, data_ls ){


    caption = '* P:0.05, ** P:0,005, *** P:0.001'


    if(var %in% data_ls$numericals){

      # ggplotly throws a warning when trying to convert geom_sign..
      suppressWarnings({

        p = f_plot_hist( var, data_ls, col_group, graph_type = 'violin') %>%
          plotly::ggplotly( tooltip = c('y','fill') )

      })

      taglist = f_html_padding(p, 4, title, caption = caption )

    }else{

      # ggplotly throws a warning when trying to convert geom_sign..
      suppressWarnings({

        p = f_plot_hist( var, data_ls, col_group, graph_type = 'bar' , y_axis = 'count' ) %>%
          plotly::ggplotly( tooltip = c('y','fill') )


        l1 = f_html_padding(p, 4, title, subtitle = 'Counts' )

        p = f_plot_hist( var, data_ls, col_group, graph_type = 'bar' , y_axis = 'density' ) %>%
          plotly::ggplotly( tooltip = c('y','fill') )

      })


      l2 = f_html_padding(p, subtitle = 'Probabilities', caption = caption )

      taglist = htmltools::tagList( l1, l2)

    }

    return( taglist )

  }

  f_plot_ggplot = function( var, title, col_group, data_ls ){


    caption = '* P:0.05, ** P:0,005, *** P:0.001'

    if(var %in% data_ls$numericals){

      p = f_plot_hist( var, data_ls, col_group, graph_type = 'violin', title = title, caption = caption)
      return( p )
    }else{
      return( NULL )
    }

  }

  # Plot group count -----------------------------------------------------------------

  p_group_count = f_plot_hist( col_group, data_ls, y_axis = 'count')
  p_group_perc  = f_plot_hist( col_group, data_ls, y_axis = 'density')

  ## ggplotly throws a warning when trying to convert geom_sign
  suppressWarnings({

    taglist_group = htmltools::tagList( list(plotly::ggplotly(p_group_count)
                                       , plotly::ggplotly(p_group_perc) ) )

  })

  # Make Plots -----------------------------------------------------------------------

  plots = df_comb %>%
    filter( p_value <= thresh_p_val & diff_perc >= thresh_diff_perc) %>%
    arrange( desc(diff_perc) ) %>%
    mutate( stars = f_stat_stars( p_value )
            , no = row_number()
            , title = paste0( 'Plot', no,': ' ,variable, ' ' ,stars)
            , plot_plotly = map2( variable, title, f_plot_plotly, col_group, data_ls )
            , plot_ggplot = map2( variable, title, f_plot_ggplot, col_group, data_ls )
            )

  # no plotly implementation for geom_GeomSignif() throws warning
  suppressWarnings({
      plots_plotly = plots$plot_plotly
  })

  # f_plot_ggplot returns NULL if variable is not numeric
  plots_ggplot = plots %>%
    mutate( is_null = map_lgl( plot_ggplot, is.null ) ) %>%
    filter( is_null == F ) %>%
    .$plot_ggplot

  # Render ggplots -------------------------------------------------------------------

  if(static_plots){

    link_ggplot = f_plot_obj_2_html( plots_ggplot, 'plots'
                                     , output_file = paste0(output_file, '_stat_plots')
                                     , title = paste('Differences Between ', col_group, ' Groups - Static Plots')
                                     , quiet = quiet
                                     )

    html_link = f_html_filename_2_link(file_path = link_ggplot
                                     , link_text =  'Static plots of numerical variables with more statistical features')
  }else{
    html_link = 'Satic plot option is disabled'
  }

  # Make and render alluvial plot------------------------------------------------------

  if(alluvial){

    var_alluvial = df_comb %>%
      arrange( desc(diff_perc) ) %>%
      filter( p_value <= alluvial_thresh_p_val, diff_perc >= alluvial_thresh_diff_perc ) %>%
      .$variable

    var_alluvial = c( col_group, var_alluvial )

    p_alluv = f_plot_alluvial( data = data_ls$data
                               , variables = var_alluvial
                               , max_variables = 20
                               , fill_by = 'first_variable'
                               )

    df_alluf = p_alluv$data_key

    n_flows    = max( f_manip_factor_2_numeric( df_alluf$alluvial_id) )
    reduced_to = round( n_flows/nrow(df_alluf) * 100, 1 )
    max_weight = max( df_alluf$n )
    max_weight_perc = round( max_weight/nrow(df_alluf) * 100, 1 )

    line1 = paste('Number of flows:', n_flows)
    line2 = paste('Original Dataframe reduced to', reduced_to, '%' )
    line3 = paste('Maximum weight of a singfle flow', max_weight_perc, '%')

    out_alluv = paste( 'Number of flows greater than threshold.'
                       ,line1, line2, line3, sep = '\n' )

    if(n_flows < max_alluvial_flows ){

      link_alluvial = f_plot_obj_2_html( list(p_alluv)
                                         , 'plots'
                                         , output_file = paste0(output_file, '_alluvial')
                                         , title = paste('Alluvial Plot ( P <'
                                                         , alluvial_thresh_p_val
                                                         , ', difference % >'
                                                         , alluvial_thresh_diff_perc
                                                         ,')')
                                         , fig.width = fig.width
                                         , fig.height = fig.height
                                         , quiet = quiet
                                         )

      html_link_alluvial = f_html_filename_2_link( file_path = link_alluvial
                                                   , link_text = 'Alluvial Plot')
    }else{
      html_link_alluvial = paste( out_alluv )
    }

  }else{
    html_link_alluvial = 'Alluvial plot option is disabled.'
  }

  # Make and render tableplot --------------------------------------------------------

  if(tabplot){

    n_vars_tabplot = 12

    var_tabplot = df_comb %>%
      filter( p_value <= thresh_p_val & diff_perc >= thresh_diff_perc) %>%
      arrange( desc(diff_perc) ) %>%
      .$variable

    data_tab = select(data_ls$data, one_of(col_group, var_tabplot)  )

    to = c(1:ceiling( ncol(data_tab)/n_vars_tabplot)) * n_vars_tabplot
    to = to + 1
    to[length(to)] = ncol(data_tab)

    to = unique(to)

    if( length(to) > 1){
      from = to - (n_vars_tabplot-1)
    }else{
      from = 2
    }

    suppressWarnings({
      df_tab = tibble( from = from, to = to ) %>%
        mutate( data = map2(from, to, function(x,y) data_tab[,c(1,x:y)] )
                , title = paste( col_group, ' and variables',from, 'to', to, 'sorted by % difference')
                , tabplot = map( data, tabplot::tableplot, plot = F )
                )
    })

    #passing titles to the plot function does not work tweaked the rmd template to
    # accept the titles as params

    tabplots = df_tab$tabplot
    titles = df_tab$title

    link_tabplot = f_plot_obj_2_html(tabplots
                                    , type = 'tabplots'
                                    , output_file = paste0(output_file, '_tabplots')
                                    , title = paste('Tabplot Plot  ( P <'
                                                    , thresh_p_val
                                                    , ', difference % >'
                                                    , thresh_diff_perc
                                                    ,')')
                                    , titles = titles
                                    , fig.width = fig.width
                                    , fig.height = fig.height
                                    , quiet = quiet
    )

    html_link_tabplot = f_html_filename_2_link( file_path = link_tabplot
                                               , link_text = 'Tabplot')
  }else{
    html_link_tabplot = 'Tabplot option is disabled'
  }

  # Table with links -----------------------------------------------------------------

  tab_link = DT::datatable( tibble(Link = c( html_link
                                             , html_link_alluvial
                                             , html_link_tabplot)
                                   )
                            , escape = F
                            , options = list( dom = '')
                            , rownames = F )

  # Render html ----------------------------------------------------------------------


  tab_all = f_datatable_universal( df_comb, round_other_nums = 2 ) %>%
    f_html_padding( 3, title ='Table: All features'
                    , subtitle = paste('grouped by:', col_group), pad_after = 3 )

  tab_mean = f_datatable_universal( df_means, round_other_nums = 2 ) %>%
    f_html_padding( 3, title ='Table: Means and Medians of numerical features'
                    , subtitle = paste('grouped by:', col_group))

  tab_perc = f_datatable_universal( df_perc, round_other_nums = 2 ) %>%
    f_html_padding(  title ='Table: Counts and percentages of categorical features'
                    , subtitle = paste('grouped by:', col_group), pad_after = 2 )


  taglist = htmltools::tagList()
  taglist[[1]] = htmltools::h1( paste('Differences Between "', col_group, '" Groups') )
  taglist[[2]] = htmltools::h2( 'Number and percentage of observations per group' )
  taglist[[3]] = taglist_group
  taglist[[4]] = tab_all
  taglist[[5]] = f_html_padding(tab_link, title = 'Links to static plots')
  taglist[[6]] = htmltools::h1( 'Dynamic Plots' )
  taglist[[7]] = htmltools::h4( paste( 'P <', thresh_p_val, ', % difference >'
                                       , thresh_diff_perc, ', sorted by % difference' ) )
  taglist[[8]] = plots_plotly
  taglist[[9]] = f_html_breaks(5)
  taglist[[10]] = htmltools::h1( 'Summary Tables' )
  taglist[[11]] = tab_mean
  taglist[[12]] = tab_perc

  if(return_taglist){

    return( taglist )

  }else{

    link_taglist = f_plot_obj_2_html( taglist
                                      , 'taglist'
                                      , output_file = output_file
                                      , title = 'Group Analysis'
                                      , quiet = quiet )

    return( link_taglist )
  }


}

#' @title create a aggregated data frame with means and medians for numerical variables
#'@param data_ls data_ls object generated by f_clean_data(), or a named list
#'  list( data = <dataframe>, numericals = < vector with column names of
#'  numerical columns>)
#' @param col_group character vector denoting grouping columns
#' @return dataframe
#' @examples
#' f_clean_data( mtcars) %>%
#'  f_stat_group_mean_medians('cyl')
#' @rdname f_stat_group_mean_medians
#' @export
f_stat_group_mean_medians = function(data_ls, col_group){

  if( ! col_group %in% data_ls$categoricals ){
    stop('grouping variable not in categoricals')
  }

  if( is.null(data_ls$numericals) | is_empty(data_ls$numericals) ){
    return()
  }

  sym_group = as.name( col_group )

  df = data_ls$data %>%
    select( one_of(data_ls$numericals, col_group) ) %>%
    group_by_(col_group) %>%
    gather( key = 'variable', value = 'value', - one_of(col_group) ) %>%
    mutate( value = as.numeric(value) )

  df_mean = df %>%
    group_by_( col_group, 'variable' ) %>%
    summarize( mean = mean(value) ) %>%
    rename( !! as.name( paste0('mean_', col_group) )  := !!sym_group ) %>%
    spread_( key = paste0('mean_', col_group), value = 'mean', sep = '_' )

  df_median = df %>%
    group_by_( col_group, 'variable' ) %>%
    summarize( median = median(value) ) %>%
    rename( !! as.name( paste0('median_', col_group) )  := !!sym_group ) %>%
    spread_( key = paste0('median_', col_group), value = 'median', sep = '_' )

  df_p_val = f_stat_anova(data_ls, col_group) %>%
    f_stat_combine_anova_with_chi_square( ) %>%
    mutate( stars = f_stat_stars(p_value) ) %>%
    select( stars, everything() )

  df_comb = df_mean %>%
    left_join( df_median ) %>%
    left_join( df_p_val ) %>%
    arrange( p_value )

  return(df_comb)
}

#' @title create a aggregated data frame with percentagers and counts for categorical variables
#'@param data_ls data_ls object generated by f_clean_data(), or a named list
#'  list( data = <dataframe>, numericals = < vector with column names of
#'  numerical columns>)
#' @param col_group character vector denoting grouping columns
#' @return dataframe
#' @examples
#' f_clean_data( mtcars) %>%
#'  f_stat_group_counts_percentages('cyl')
#' @rdname f_stat_group_counts_percentages
#' @export
f_stat_group_counts_percentages = function(data_ls, col_group){

  if( ! col_group %in% data_ls$categoricals ){
    stop('grouping variable not in categoricals')
  }

  if( is.null(data_ls$categoricals) | is_empty(data_ls$categoricals) ){
    return()
  }

  sym_group = as.name(col_group)

  # gather() with variables of different types produces warning
  suppressWarnings({

  df = data_ls$data %>%
    select( one_of(data_ls$categoricals, col_group) ) %>%
    gather( key = 'variable', value = 'value' , -one_of(col_group) ) %>%
    mutate( value =  paste0(variable, '_', value) ) %>%
    group_by_( col_group, 'variable', 'value' ) %>%
    count() %>%
    group_by_( col_group )

  })

  df_n = df %>%
    ungroup() %>%
    mutate( !! sym_group := paste0( col_group, '_', !! sym_group, '_count') ) %>%
    spread( key = !! sym_group, value = n, fill = 0 )

  df_perc = df %>%
    group_by(variable) %>%
    mutate( !! sym_group := paste0( col_group, '_', !! sym_group, '_perc')
            , perc = n / sum(n) *100 ) %>%
    select( -n ) %>%
    spread( key = !! sym_group, value = perc, fill = 0 )

  df_p_val = f_stat_chi_square(data_ls, col_group) %>%
    f_stat_combine_anova_with_chi_square( df_chi_square = .) %>%
    mutate( stars = f_stat_stars(p_value) ) %>%
    select( stars, everything() )

  df_comb = df_n %>%
    left_join( df_perc ) %>%
    left_join( df_p_val ) %>%
    arrange( p_value )

  return(df_comb)
}

#' @title calculate significant level from p value
#' @description * P:0.05, ** P:0,005, *** P:0.001
#' @param p_value numeric
#' @return character vector
#' @examples
#' f_stat_stars(0.06)
#' f_stat_stars(0.05)
#' f_stat_stars(0.005)
#' f_stat_stars(0.001)
#' @rdname f_stat_stars
#' @export
f_stat_stars = function(p_value){

  ifelse( p_value <= 0.001, '***'
          , ifelse(p_value <= 0.005, '**'
          , ifelse(p_value <= 0.05, '*', 'ns') ) )

}


#' @title wrapper for shapiro.test()
#' @description shapiro.test i slimited to <5000 sample size and raises an error
#'   if sd(x) == 0. Wrapper samples from input vector and returns a list object
#'   with NA parameters if sd(x) == 0.
#' @param vec numeric vector
#' @return shapiro.test object or list( statistic = NA, p.value = NA )
#' @examples
#' f_stat_shapiro( rnorm(1000, 10, 1) )
#' f_stat_shapiro( runif(1000, 1, 10) )
#' @rdname f_stat_shapiro
#' @export
f_stat_shapiro = function(vec){

  if( length(vec) > 5000 ){
    vec = sample( vec, 5000, replace = F)
  }

  if( sd(vec) == 0 ){

    m = list( statistic = NA, p.value = NA )

  }else{

    m = shapiro.test(vec)
  }

  return(m)
}
erblast/oetteR documentation built on May 27, 2019, 12:11 p.m.