#' @title calculates maximum difference in group means and medians
#' @description used as a helper function for f_stat_anova
#' @param df dataframe
#' @param col_group character vector denoting grouping variable
#' @param col_variable character vector denoting variable
#' @return dataframe
#' @examples
#' set.seed(1)
#' df = tibble( fct = sample(LETTERS[1:5], 100, replace = T)
#' , v1 = 1
#' , v2 = rnorm(100, 4)
#' , v3 = c( rep(3, 50), rep(8,50) )
#' )
#'
#' col_group = 'fct'
#'
#' f_stat_diff_of_means_medians(df, col_group, 'v1') %>%
#' bind_rows( f_stat_diff_of_means_medians(df, col_group, 'v2') ) %>%
#' bind_rows( f_stat_diff_of_means_medians(df, col_group, 'v3') )
#' @rdname f_stat_diff_of_means_medians
#' @export
f_stat_diff_of_means_medians = function(df, col_group, col_variable){
data = df %>%
select( group = one_of(col_group), variable = one_of(col_variable) )%>%
mutate ( variable = f_manip_bring_to_pos_range(variable) ) %>%
group_by( group ) %>%
summarise( means = mean(variable, na.rm = T)
, medians = median(variable, na.rm = T) ) %>%
ungroup() %>%
summarise( variable = col_variable
, diff_of_means = max(means) - min(means)
, diff_of_means_perc = ( ( max(means) - min(means) ) /max(means) ) *100
, diff_of_medians = max(medians) - min(medians)
, diff_of_medians_perc = ( ( max(medians) - min(medians) ) /max(means) ) *100 )
return(data)
}
#' @title generate a datatframe with anova results from a data_ls list
#' @description returns a dataframe with shapiro, anova und kruskal p values
#' supplemented with maximum difference of means and medians between groups
#'@param data_ls data_ls object generated by f_clean_data(), or a named list
#' list( data = <dataframe>, numericals = < vector with column names of
#' numerical columns>)
#' @param col_group character vector denoting grouping variable
#' @param boxcox perform analysis on boxcox-transformed or numerical variables
#' @return dataframe
#' @examples
#' df_anova = data_ls = f_clean_data(mtcars) %>%
#' f_stat_anova('cyl')
#'
#' df_anova
#' @seealso \code{\link[stringr]{str_c}}
#' \code{\link[purrr]{map}},\code{\link[purrr]{map_dbl}}
#' \code{\link{f_stat_anova}}
#' @rdname f_anova_stats
#' @export
#' @importFrom stringr str_c
#' @importFrom purrr map map_dbl
f_stat_anova = function(data_ls, col_group, boxcox = F) {
if( ! col_group %in% data_ls$categoricals ){
stop('grouping variable not in categoricals')
}
if( is.null(data_ls$numericals) | is_empty(data_ls$numericals) ){
return()
}
if( boxcox == T & ! is.null(data_ls$boxcox) ){
variables = data_ls$boxcox_names
data = data_ls$boxcox_data
}else{
variables = data_ls$numericals
data = data_ls$data
}
formula = stringr::str_c('value~',col_group) %>%
as.formula()
df_anova = data %>%
as_tibble() %>%
select( one_of( c(col_group, variables) ) ) %>%
gather(key = 'variable', value = 'value', one_of( variables ) ) %>%
group_by( variable ) %>%
filter( sd(value) != 0 ) %>%
nest( one_of(col_group), value) %>%
mutate( model_anova = purrr::map( data, ~aov( formula, data = .))
, summary_anova = purrr::map( model_anova, summary)
, summary_anova = purrr::map( summary_anova
, function(x) x[[1]])
, anova_pval = purrr::map(summary_anova, 'Pr(>F)')
, anova_pval = purrr::map_dbl(anova_pval
, function(x) x[1])
, model_kruskal = purrr::map( data, ~kruskal.test(formula, data = .) )
, kruskal_pval = purrr::map_dbl(model_kruskal, 'p.value')
, model_shapiro = purrr::map(data, function(x) f_stat_shapiro( x$value ) )
, shapiro_stat = purrr::map_dbl(model_shapiro,'statistic')
, shapiro_pval = purrr::map_dbl(model_shapiro,'p.value')
, diff_df = purrr::map( data, f_stat_diff_of_means_medians, col_group = col_group, col_variable = 'value') ) %>%
unnest(diff_df) %>%
select(variable
, shapiro_stat
, shapiro_pval
, anova_pval
, kruskal_pval
, diff_of_means
, diff_of_means_perc
, diff_of_medians
, diff_of_medians_perc
)
return(df_anova)
}
#' @title calculate the maximal difference in frequencies between to categorical variables
#' @description used as a helper function for f_stat_chi_square
#' @param df dataframe containing both avariables
#' @param col_var1 character vector denoting variable column 1
#' @param col_var2 character vector denoting variable column 2
#' @return dataframe
#' @examples
#' data_ls = f_clean_data(mtcars)
#' df_chi_squ = f_stat_max_diff_of_freq(data_ls$data, 'cyl', 'gear')
#' @rdname f_stat_max_diff_of_freq
#' @export
f_stat_max_diff_of_freq = function(df, col_var1, col_var2){
t = table( df[[col_var1]], df[[col_var2]] ) %>%
as.data.frame() %>% # as_tibble cannot convert the table input anymore
as_tibble() %>%
group_by( Var1 ) %>% #Var1, Var2 and Freq are the automatically assigneed column names
mutate ( diff_var1 = ( max(Freq)-min(Freq) )
, diff_var1_perc = ( ( max(Freq)-min(Freq))/ max(Freq) *100)
) %>%
group_by( Var2 ) %>%
mutate ( diff_var2 = ( max(Freq)-min(Freq) )
, diff_var2_perc = ( ( max(Freq)-min(Freq))/ max(Freq) *100)
) %>%
ungroup()%>%
summarise( max_diff_freq = max( c(diff_var1, diff_var2) )
, max_diff_freq_perc = max( c(diff_var1_perc, diff_var2_perc) )
)
}
#' @title generate a datatframe with chi square results from a data_ls list
#' @description FUNCTION_DESCRIPTION
#'@param data_ls data_ls object generated by f_clean_data(), or a named list
#' list( data = <dataframe>, numericals = < vector with column names of
#' numerical columns>)
#' @param col_group character vector denoting grouping variable
#' @return dataframe
#' @examples
#' data_ls = f_clean_data(mtcars)
#' df_chi_squ = f_stat_chi_square(data_ls, 'cyl')
#' df_chi_squ
#' @seealso
#' \code{\link[purrr]{is_empty}},\code{\link[purrr]{map}},\code{\link[purrr]{map_dbl}}
#' @rdname f_stat_chi_square
#' @export
#' @importFrom purrr is_empty map map_dbl
f_stat_chi_square = function(data_ls, col_group) {
if( ! col_group %in% data_ls$categoricals ){
stop('grouping variable not in categoricals')
}
data = data_ls$data
variables = data_ls$categoricals[!data_ls$categoricals == col_group]
if(purrr::is_empty(variables)) return()
suppressWarnings({
df_chi = data %>%
as_tibble() %>%
select( one_of( c(col_group, variables) ) ) %>%
gather(key = 'variable', value = 'value', one_of( variables ) ) %>%
group_by( variable ) %>%
nest( one_of( col_group ), value) %>%
mutate( model_chi = purrr::map( data, ~chisq.test(x = .[[col_group]]
, y = .[['value']]
)
)
,chi_pval = purrr::map_dbl(model_chi, 'p.value')
,diff_df = purrr::map(data, f_stat_max_diff_of_freq, col_group, 'value') ) %>%
unnest(diff_df) %>%
select(variable, chi_pval, max_diff_freq, max_diff_freq_perc)
})
return(df_chi)
}
#' @title combines anova with chi square results into single dataframe
#' @description keeps wither the anova or the kruskal p value depending on the
#' results of the shapiro test (shapiro_stat > 0.9 p_val > 0.05) and keeps
#' the difference of percent of the mean. Still works if one of the input
#' dataframes is NULL
#' @param df_anova dataframe created with f_stat_anova, Default: NULL
#' @param df_chi_square dataframe created with f_stat_chi_square(), Default: NULL
#' @return OUTPUT_DESCRIPTION
#' @details DETAILS
#' @examples
#' data_ls = f_clean_data(mtcars)
#' df_chi_squ = f_stat_chi_square(data_ls, 'cyl')
#' df_anova = f_stat_anova(data_ls, 'cyl')
#' df_comb = f_stat_combine_anova_with_chi_square(df_anova, df_chi_squ)
#' df_comb
#' df_comb = f_stat_combine_anova_with_chi_square(df_anova)
#' df_comb
#' df_comb = f_stat_combine_anova_with_chi_square(df_chi_square = df_chi_squ)
#' df_comb
#' @rdname f_stat_combine_anova_with_chi_square
#' @export
f_stat_combine_anova_with_chi_square = function(df_anova = NULL, df_chi_square = NULL){
if( ! is_empty(df_anova) | ! is.null(df_anova) ) {
df_anova = df_anova %>%
mutate( final_p_value = ifelse(shapiro_stat > 0.9 & shapiro_pval > 0.05
, anova_pval, kruskal_pval) ) %>%
select(variable, p_value = final_p_value , diff_perc = diff_of_means_perc)
}
if( ! is_empty(df_chi_square) | ! is.null(df_chi_square) ) {
df_chi_square = df_chi_square %>%
select(variable, p_value = chi_pval, diff_perc = max_diff_freq_perc)
}
# bind_rows takes NULL without error
df_comb = df_anova %>%
bind_rows( df_chi_square )%>%
arrange(p_value)
}
#'@title analyse group difference of dataset
#'@description creates a html document with a group analysis including:
#' \itemize{ \item P value table \item Dynamic Plots of all significant
#' features \item static plots with brackets indicating statistical differences
#' \item Tabplot \item Alluvial Plot \item table containing means and medians
#' for numerical variables \item table containing counts and percentages for
#' categorical variables } The function automatically renders three html pages
#' one for the additional static plots, one for the tableplot and one for the
#' alluvial plots. In the same direcotry that can be determined by the
#' outputfile parameter. Default behaviour will also render the entire html
#' document returning the filepath of the new html file. The other three html
#' files will be linked to in the document. You can modify the function to
#' return a htmltools taglist instead. The above mentioned 3 additional html
#' files for the other types of plots will still be rendered though with
#' default settings. These extra plots can be switched off though.
#'@param data_ls data_ls object generated by f_clean_data(), or a named list
#' list( data = <dataframe>, numericals = < vector with column names of
#' numerical columns>)
#'@param col_group character vector denoting grouping columns
#'@param thresh_p_val p value threshold for plots, Default: 0.05
#'@param thresh_diff_perc minimum percent difference threshold for plots,
#' Default: 3
#'@param static_plots boolean, render static plots indicating statistical
#' differences with brackets, Default = TRUE
#'@param alluvial boolean, render alluvial plot, Default: TRUE
#'@param alluvial_thresh_p_val double, threshold for feature to be inlcuded in
#' alluvial plot. Features that are not highly significant and convey a large
#' percental difference will result in a high number of flows thus cluttering
#' the plot. It is not recommended to set these thresholds lower than the
#' default. Default: 0.05
#'@param alluvial_thresh_diff_perc double, threshold for feature to be inlcuded
#' in alluvial plot. Features that are not highly significant and convey a
#' large percental difference will result in a high number of flows thus
#' cluttering the plot. It is not recommended to set these thresholds lower
#' than the default. Default: 7.5
#'@param tabplot boolean, render tabplot threshold for features are the same as
#' for the dynamic plots, Default: TRUE , static_plots = T
#'@param return_taglist boolean, return taglist instead of rendereing the final
#' html document and returning the link to the html file. Usefull if analysis
#' should be directly included into the current markdown document.
#'@param output_file character vector containing output file name
#'@param max_alluvial_flows integer, maximum number of alluvial flows. Alluvial
#' Plots can take a long time to render. Rendering an alluvial plot with the
#' default setting of 1500 should take at least 10 min. Default 1500
#'@param fig.width integer Width of Alluvial and Tabplot in inches. Default
#' values can be comfortably viewed on a 1920 x 1080 screen resolution. Default: 16
#'@param fig.height integer height of Alluvial and Tabplot in inches. Default
#' values can be comfortably viewed on a 1920 x 1080 screen resolution. Default: 10
#'@param quiet booloean, suppress render markdown output to console, Default: FALSE
#'@return file path to html file / or taglist
#' @examples
#' \dontrun{
#' data_ls = f_clean_data(mtcars)
#' f_stat_group_ana(data_ls, 'cyl', output_file = 'test_me')
#' file.remove('test_me.html')
#' file.remove('test_me_stat_plots.html')
#' file.remove('test_me_alluvial.html')
#' file.remove('test_me_tabplots.html')
#' }
#'@seealso \code{\link[plotly]{ggplotly}}
#' \code{\link[htmltools]{tagList}},\code{\link[htmltools]{h1}},\code{\link[htmltools]{h2}}
#'
#'
#'
#'
#'@rdname f_stat_group_ana
#'@export
#'@importFrom plotly ggplotly
#'@importFrom htmltools tagList h1 h2 h3 h4 h5 h6
f_stat_group_ana = function(data_ls
, col_group
, thresh_p_val = 0.05
, thresh_diff_perc = 3
, output_file = 'group_ana'
, static_plots = T
, alluvial = T
, alluvial_thresh_p_val = 0.05
, alluvial_thresh_diff_perc = 7.5
, max_alluvial_flows = 1500
, tabplot = T
, return_taglist = F
, fig.width = 16
, fig.height = 10
, quiet = FALSE
){
df_anova = f_stat_anova( data_ls, col_group )
df_chi = f_stat_chi_square( data_ls, col_group )
df_comb = f_stat_combine_anova_with_chi_square( df_anova, df_chi ) %>%
mutate( stars = f_stat_stars(p_value) ) %>%
select( variable, stars, p_value, diff_perc )
df_means = f_stat_group_mean_medians(data_ls, col_group)
df_perc = f_stat_group_counts_percentages(data_ls, col_group)
f_plot_plotly = function( var, title, col_group, data_ls ){
caption = '* P:0.05, ** P:0,005, *** P:0.001'
if(var %in% data_ls$numericals){
# ggplotly throws a warning when trying to convert geom_sign..
suppressWarnings({
p = f_plot_hist( var, data_ls, col_group, graph_type = 'violin') %>%
plotly::ggplotly( tooltip = c('y','fill') )
})
taglist = f_html_padding(p, 4, title, caption = caption )
}else{
# ggplotly throws a warning when trying to convert geom_sign..
suppressWarnings({
p = f_plot_hist( var, data_ls, col_group, graph_type = 'bar' , y_axis = 'count' ) %>%
plotly::ggplotly( tooltip = c('y','fill') )
l1 = f_html_padding(p, 4, title, subtitle = 'Counts' )
p = f_plot_hist( var, data_ls, col_group, graph_type = 'bar' , y_axis = 'density' ) %>%
plotly::ggplotly( tooltip = c('y','fill') )
})
l2 = f_html_padding(p, subtitle = 'Probabilities', caption = caption )
taglist = htmltools::tagList( l1, l2)
}
return( taglist )
}
f_plot_ggplot = function( var, title, col_group, data_ls ){
caption = '* P:0.05, ** P:0,005, *** P:0.001'
if(var %in% data_ls$numericals){
p = f_plot_hist( var, data_ls, col_group, graph_type = 'violin', title = title, caption = caption)
return( p )
}else{
return( NULL )
}
}
# Plot group count -----------------------------------------------------------------
p_group_count = f_plot_hist( col_group, data_ls, y_axis = 'count')
p_group_perc = f_plot_hist( col_group, data_ls, y_axis = 'density')
## ggplotly throws a warning when trying to convert geom_sign
suppressWarnings({
taglist_group = htmltools::tagList( list(plotly::ggplotly(p_group_count)
, plotly::ggplotly(p_group_perc) ) )
})
# Make Plots -----------------------------------------------------------------------
plots = df_comb %>%
filter( p_value <= thresh_p_val & diff_perc >= thresh_diff_perc) %>%
arrange( desc(diff_perc) ) %>%
mutate( stars = f_stat_stars( p_value )
, no = row_number()
, title = paste0( 'Plot', no,': ' ,variable, ' ' ,stars)
, plot_plotly = map2( variable, title, f_plot_plotly, col_group, data_ls )
, plot_ggplot = map2( variable, title, f_plot_ggplot, col_group, data_ls )
)
# no plotly implementation for geom_GeomSignif() throws warning
suppressWarnings({
plots_plotly = plots$plot_plotly
})
# f_plot_ggplot returns NULL if variable is not numeric
plots_ggplot = plots %>%
mutate( is_null = map_lgl( plot_ggplot, is.null ) ) %>%
filter( is_null == F ) %>%
.$plot_ggplot
# Render ggplots -------------------------------------------------------------------
if(static_plots){
link_ggplot = f_plot_obj_2_html( plots_ggplot, 'plots'
, output_file = paste0(output_file, '_stat_plots')
, title = paste('Differences Between ', col_group, ' Groups - Static Plots')
, quiet = quiet
)
html_link = f_html_filename_2_link(file_path = link_ggplot
, link_text = 'Static plots of numerical variables with more statistical features')
}else{
html_link = 'Satic plot option is disabled'
}
# Make and render alluvial plot------------------------------------------------------
if(alluvial){
var_alluvial = df_comb %>%
arrange( desc(diff_perc) ) %>%
filter( p_value <= alluvial_thresh_p_val, diff_perc >= alluvial_thresh_diff_perc ) %>%
.$variable
var_alluvial = c( col_group, var_alluvial )
p_alluv = f_plot_alluvial( data = data_ls$data
, variables = var_alluvial
, max_variables = 20
, fill_by = 'first_variable'
)
df_alluf = p_alluv$data_key
n_flows = max( f_manip_factor_2_numeric( df_alluf$alluvial_id) )
reduced_to = round( n_flows/nrow(df_alluf) * 100, 1 )
max_weight = max( df_alluf$n )
max_weight_perc = round( max_weight/nrow(df_alluf) * 100, 1 )
line1 = paste('Number of flows:', n_flows)
line2 = paste('Original Dataframe reduced to', reduced_to, '%' )
line3 = paste('Maximum weight of a singfle flow', max_weight_perc, '%')
out_alluv = paste( 'Number of flows greater than threshold.'
,line1, line2, line3, sep = '\n' )
if(n_flows < max_alluvial_flows ){
link_alluvial = f_plot_obj_2_html( list(p_alluv)
, 'plots'
, output_file = paste0(output_file, '_alluvial')
, title = paste('Alluvial Plot ( P <'
, alluvial_thresh_p_val
, ', difference % >'
, alluvial_thresh_diff_perc
,')')
, fig.width = fig.width
, fig.height = fig.height
, quiet = quiet
)
html_link_alluvial = f_html_filename_2_link( file_path = link_alluvial
, link_text = 'Alluvial Plot')
}else{
html_link_alluvial = paste( out_alluv )
}
}else{
html_link_alluvial = 'Alluvial plot option is disabled.'
}
# Make and render tableplot --------------------------------------------------------
if(tabplot){
n_vars_tabplot = 12
var_tabplot = df_comb %>%
filter( p_value <= thresh_p_val & diff_perc >= thresh_diff_perc) %>%
arrange( desc(diff_perc) ) %>%
.$variable
data_tab = select(data_ls$data, one_of(col_group, var_tabplot) )
to = c(1:ceiling( ncol(data_tab)/n_vars_tabplot)) * n_vars_tabplot
to = to + 1
to[length(to)] = ncol(data_tab)
to = unique(to)
if( length(to) > 1){
from = to - (n_vars_tabplot-1)
}else{
from = 2
}
suppressWarnings({
df_tab = tibble( from = from, to = to ) %>%
mutate( data = map2(from, to, function(x,y) data_tab[,c(1,x:y)] )
, title = paste( col_group, ' and variables',from, 'to', to, 'sorted by % difference')
, tabplot = map( data, tabplot::tableplot, plot = F )
)
})
#passing titles to the plot function does not work tweaked the rmd template to
# accept the titles as params
tabplots = df_tab$tabplot
titles = df_tab$title
link_tabplot = f_plot_obj_2_html(tabplots
, type = 'tabplots'
, output_file = paste0(output_file, '_tabplots')
, title = paste('Tabplot Plot ( P <'
, thresh_p_val
, ', difference % >'
, thresh_diff_perc
,')')
, titles = titles
, fig.width = fig.width
, fig.height = fig.height
, quiet = quiet
)
html_link_tabplot = f_html_filename_2_link( file_path = link_tabplot
, link_text = 'Tabplot')
}else{
html_link_tabplot = 'Tabplot option is disabled'
}
# Table with links -----------------------------------------------------------------
tab_link = DT::datatable( tibble(Link = c( html_link
, html_link_alluvial
, html_link_tabplot)
)
, escape = F
, options = list( dom = '')
, rownames = F )
# Render html ----------------------------------------------------------------------
tab_all = f_datatable_universal( df_comb, round_other_nums = 2 ) %>%
f_html_padding( 3, title ='Table: All features'
, subtitle = paste('grouped by:', col_group), pad_after = 3 )
tab_mean = f_datatable_universal( df_means, round_other_nums = 2 ) %>%
f_html_padding( 3, title ='Table: Means and Medians of numerical features'
, subtitle = paste('grouped by:', col_group))
tab_perc = f_datatable_universal( df_perc, round_other_nums = 2 ) %>%
f_html_padding( title ='Table: Counts and percentages of categorical features'
, subtitle = paste('grouped by:', col_group), pad_after = 2 )
taglist = htmltools::tagList()
taglist[[1]] = htmltools::h1( paste('Differences Between "', col_group, '" Groups') )
taglist[[2]] = htmltools::h2( 'Number and percentage of observations per group' )
taglist[[3]] = taglist_group
taglist[[4]] = tab_all
taglist[[5]] = f_html_padding(tab_link, title = 'Links to static plots')
taglist[[6]] = htmltools::h1( 'Dynamic Plots' )
taglist[[7]] = htmltools::h4( paste( 'P <', thresh_p_val, ', % difference >'
, thresh_diff_perc, ', sorted by % difference' ) )
taglist[[8]] = plots_plotly
taglist[[9]] = f_html_breaks(5)
taglist[[10]] = htmltools::h1( 'Summary Tables' )
taglist[[11]] = tab_mean
taglist[[12]] = tab_perc
if(return_taglist){
return( taglist )
}else{
link_taglist = f_plot_obj_2_html( taglist
, 'taglist'
, output_file = output_file
, title = 'Group Analysis'
, quiet = quiet )
return( link_taglist )
}
}
#' @title create a aggregated data frame with means and medians for numerical variables
#'@param data_ls data_ls object generated by f_clean_data(), or a named list
#' list( data = <dataframe>, numericals = < vector with column names of
#' numerical columns>)
#' @param col_group character vector denoting grouping columns
#' @return dataframe
#' @examples
#' f_clean_data( mtcars) %>%
#' f_stat_group_mean_medians('cyl')
#' @rdname f_stat_group_mean_medians
#' @export
f_stat_group_mean_medians = function(data_ls, col_group){
if( ! col_group %in% data_ls$categoricals ){
stop('grouping variable not in categoricals')
}
if( is.null(data_ls$numericals) | is_empty(data_ls$numericals) ){
return()
}
sym_group = as.name( col_group )
df = data_ls$data %>%
select( one_of(data_ls$numericals, col_group) ) %>%
group_by_(col_group) %>%
gather( key = 'variable', value = 'value', - one_of(col_group) ) %>%
mutate( value = as.numeric(value) )
df_mean = df %>%
group_by_( col_group, 'variable' ) %>%
summarize( mean = mean(value) ) %>%
rename( !! as.name( paste0('mean_', col_group) ) := !!sym_group ) %>%
spread_( key = paste0('mean_', col_group), value = 'mean', sep = '_' )
df_median = df %>%
group_by_( col_group, 'variable' ) %>%
summarize( median = median(value) ) %>%
rename( !! as.name( paste0('median_', col_group) ) := !!sym_group ) %>%
spread_( key = paste0('median_', col_group), value = 'median', sep = '_' )
df_p_val = f_stat_anova(data_ls, col_group) %>%
f_stat_combine_anova_with_chi_square( ) %>%
mutate( stars = f_stat_stars(p_value) ) %>%
select( stars, everything() )
df_comb = df_mean %>%
left_join( df_median ) %>%
left_join( df_p_val ) %>%
arrange( p_value )
return(df_comb)
}
#' @title create a aggregated data frame with percentagers and counts for categorical variables
#'@param data_ls data_ls object generated by f_clean_data(), or a named list
#' list( data = <dataframe>, numericals = < vector with column names of
#' numerical columns>)
#' @param col_group character vector denoting grouping columns
#' @return dataframe
#' @examples
#' f_clean_data( mtcars) %>%
#' f_stat_group_counts_percentages('cyl')
#' @rdname f_stat_group_counts_percentages
#' @export
f_stat_group_counts_percentages = function(data_ls, col_group){
if( ! col_group %in% data_ls$categoricals ){
stop('grouping variable not in categoricals')
}
if( is.null(data_ls$categoricals) | is_empty(data_ls$categoricals) ){
return()
}
sym_group = as.name(col_group)
# gather() with variables of different types produces warning
suppressWarnings({
df = data_ls$data %>%
select( one_of(data_ls$categoricals, col_group) ) %>%
gather( key = 'variable', value = 'value' , -one_of(col_group) ) %>%
mutate( value = paste0(variable, '_', value) ) %>%
group_by_( col_group, 'variable', 'value' ) %>%
count() %>%
group_by_( col_group )
})
df_n = df %>%
ungroup() %>%
mutate( !! sym_group := paste0( col_group, '_', !! sym_group, '_count') ) %>%
spread( key = !! sym_group, value = n, fill = 0 )
df_perc = df %>%
group_by(variable) %>%
mutate( !! sym_group := paste0( col_group, '_', !! sym_group, '_perc')
, perc = n / sum(n) *100 ) %>%
select( -n ) %>%
spread( key = !! sym_group, value = perc, fill = 0 )
df_p_val = f_stat_chi_square(data_ls, col_group) %>%
f_stat_combine_anova_with_chi_square( df_chi_square = .) %>%
mutate( stars = f_stat_stars(p_value) ) %>%
select( stars, everything() )
df_comb = df_n %>%
left_join( df_perc ) %>%
left_join( df_p_val ) %>%
arrange( p_value )
return(df_comb)
}
#' @title calculate significant level from p value
#' @description * P:0.05, ** P:0,005, *** P:0.001
#' @param p_value numeric
#' @return character vector
#' @examples
#' f_stat_stars(0.06)
#' f_stat_stars(0.05)
#' f_stat_stars(0.005)
#' f_stat_stars(0.001)
#' @rdname f_stat_stars
#' @export
f_stat_stars = function(p_value){
ifelse( p_value <= 0.001, '***'
, ifelse(p_value <= 0.005, '**'
, ifelse(p_value <= 0.05, '*', 'ns') ) )
}
#' @title wrapper for shapiro.test()
#' @description shapiro.test i slimited to <5000 sample size and raises an error
#' if sd(x) == 0. Wrapper samples from input vector and returns a list object
#' with NA parameters if sd(x) == 0.
#' @param vec numeric vector
#' @return shapiro.test object or list( statistic = NA, p.value = NA )
#' @examples
#' f_stat_shapiro( rnorm(1000, 10, 1) )
#' f_stat_shapiro( runif(1000, 1, 10) )
#' @rdname f_stat_shapiro
#' @export
f_stat_shapiro = function(vec){
if( length(vec) > 5000 ){
vec = sample( vec, 5000, replace = F)
}
if( sd(vec) == 0 ){
m = list( statistic = NA, p.value = NA )
}else{
m = shapiro.test(vec)
}
return(m)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.