# Generates data for t distribution probability density with critical section and statistic
# to depict a result of a t-test.
generate_ttest_density_data <- function(t, p.value, df, sig_level = 0.05, alternative = "two.sided") {
l <- max(5, abs(t)*1.1) # limit of x for the data we generate here.
x <- seq(from=-l,to=l,by=l/500 )
ret <- tibble::tibble(x=x, y=dt(x, df=df))
ret2 <- tibble::tibble(x=t, y=dt(x, df=df), statistic=TRUE, p.value=p.value[1])
ret <- bind_rows(ret, ret2)
if (alternative == "two.sided") {
tt <- qt(1-sig_level/2, df=df) # Threshold t for critical section.
ret <- ret %>% mutate(critical=(x>=tt|x<=-tt))
}
else if (alternative == "greater") {
tt <- qt(1-sig_level, df=df) # Threshold t for critical section.
ret <- ret %>% mutate(critical=(x>=tt))
}
else { # alternative == "less"
tt <- qt(sig_level, df=df) # Threshold t for critical section.
ret <- ret %>% mutate(critical=(x<=tt))
}
ret <- ret %>% mutate(df=df)
ret
}
# Generates data for t distribution probability density to depict the power analysis.
generate_ttest_density_data_for_power <- function(d, n1, n2, t, df, sig_level = 0.05, alternative = "two.sided", paired = TRUE) {
if (!paired) {
ncp <- d * (1/sqrt(1/n1 + 1/n2))
}
else {
ncp <- d * sqrt(n1) # Paired case. Assuming n1 == n2.
}
# Cover 3.5 sd of the both distributions.
if (alternative == "less") {
start <- -ncp-3.5
end <- 3.5
}
else {
start <- -3.5
end <- ncp+3.5
}
x <- seq(from=start,to=end,by=(end-start)/500 )
ret <- tibble::tibble(x=x, y=dt(x, df=df), ncp=0, type="Null")
if (alternative == "two.sided") {
tt <- qt(1-sig_level/2, df=df) # Threshold t for critical section.
ret <- ret %>% mutate(critical=(x>=tt|x<=-tt))
ret2 <- tibble::tibble(x=x, y=dt(x, df=df, ncp=ncp), ncp=ncp, type="Alternative")
ret2 <- ret2 %>% mutate(critical=(x<=tt))
}
else if (alternative == "greater") {
tt <- qt(1-sig_level, df=df) # Threshold t for critical section.
ret <- ret %>% mutate(critical=(x>=tt))
ret2 <- tibble::tibble(x=x, y=dt(x, df=df, ncp=ncp), ncp=ncp,type="Alternative")
ret2 <- ret2 %>% mutate(critical=(x<=tt))
}
else { # alternative == "less"
tt <- qt(sig_level, df=df) # Threshold t for critical section.
ret <- ret %>% mutate(critical=(x<=tt))
ret2 <- tibble::tibble(x=x, y=dt(x, df=df, ncp=-ncp), ncp=-ncp, type="Alternative")
ret2 <- ret2 %>% mutate(critical=(x>=tt))
}
ret <- ret %>% mutate(df=df)
ret3 <- tibble::tibble(x=t, y=dt(x, df=df), type="Null", statistic=TRUE)
ret <- bind_rows(ret, ret2, ret3)
ret <- ret %>% dplyr::mutate(type=factor(type, levels=c("Null", "Alternative")))
ret
}
# Generates data for chi-square distribution probability density with critical section and statistic
# to depict a result of a chi-square test.
generate_chisq_density_data <- function(stat, p.value, df, sig_level = 0.05) {
tx <- qchisq(1-sig_level, df=df) # The chisq value that corresponds to the significance level.
l <- max(df*3, stat*1.1, tx*1.1) # Making sure stat and tx are in the displayed range.
x <- seq(from=0, to=l, by=l/1000 )
ret <- tibble::tibble(x=x, y=dchisq(x, df=df))
ret2 <- tibble::tibble(x=stat, y=dchisq(x, df=df), statistic=TRUE, p.value=p.value[1])
ret <- bind_rows(ret, ret2)
ret <- ret %>% mutate(critical=x>=tx)
ret <- ret %>% mutate(df=df)
ret
}
# Generates data for probability density chart for chi-square test.
# df - Degree of freedom
# w - Cohen's w
# N - Sample size
# crit - Critical chi-square value, for the vertical reference line.
generate_chisq_density_data_for_power <- function(df, w, N, crit) {
# Plot up to 95 percentile.
ncp <- N*w^2
# In the chart data, cover 0 to 95 percentile of the non-centeral chi-square distribution.
l <- qchisq(0.95, df=df, ncp=ncp)
x <- seq(from=0, to=l, by=l/1000 )
ret <- tibble::tibble(x=x,
y=dchisq(x, df=df),
type="Null"
)
# Prepare and bind one-row data for the vertical reference line for the critical value.
ret0 <- tibble::tibble(x=crit, y=dchisq(crit, df=df), type="Null", statistic=TRUE)
ret <- ret %>% dplyr::bind_rows(ret0)
ret <- ret %>% dplyr::mutate(critical=(x>=crit), df=1, ncp=0)
ret2 <- tibble::tibble(x=x,
y=dchisq(x, df=df, ncp=N*w^2),
type="Alternative"
)
ret2 <- ret2 %>% dplyr::mutate(critical=(x<=crit), df=1, ncp=ncp)
ret <- ret %>% dplyr::bind_rows(ret2)
ret <- ret %>% dplyr::mutate(type=factor(type, levels=c("Null", "Alternative")))
ret
}
# Generates data for F distribution probability density with critical section and statistic
# to depict a result of a F test like one-way ANOVA.
generate_ftest_density_data <- function(stat, p.value, df1, df2, sig_level = 0.05) {
tx <- qf(1-sig_level, df1=df1, df2=df2) # The chisq value that corresponds to the significance level.
l <- max(df1/df2*3, stat*1.1, tx*1.1) # Making sure stat and tx are in the displayed range.
x <- seq(from=0,to=l,by=l/1000 )
ret <- tibble::tibble(x=x, y=df(x, df1=df1, df2=df2))
ret2 <- tibble::tibble(x=stat, y=df(x, df1=df1, df2=df2), statistic=TRUE, p.value=p.value[1])
ret <- bind_rows(ret, ret2)
ret <- ret %>% mutate(critical=x>=tx)
ret <- ret %>% mutate(df1=df1, df2=df2)
ret
}
# Generated data of a normal distribution with critical section and statistic for visualization.
# We currently use it for non-exact Wilcoxon test.
generate_norm_density_data <- function(z, p.value, mu, sigma, sig_level = 0.05, alternative = "two.sided") {
r <- max(5*sigma, abs(z-mu)*1.1) # radius of x for the data we generate here.
x <- seq(from=mu-r,to=mu+r,by=r/250)
ret <- tibble::tibble(x=x, y=dnorm(x, mean=mu, sd=sigma))
ret2 <- tibble::tibble(x=z, y=dnorm(z, mean=mu, sd=sigma), statistic=TRUE, p.value=p.value[1])
ret <- bind_rows(ret, ret2)
if (alternative == "two.sided") {
tz_greater <- qnorm(1-sig_level/2, mean=mu, sd=sigma) # Threshold z for critical section.
tz_less <- qnorm(sig_level/2, mean=mu, sd=sigma) # Threshold z for critical section.
ret <- ret %>% mutate(critical=(x>=tz_greater|x<=tz_less))
}
else if (alternative == "greater") {
tz <- qnorm(1-sig_level, mean=mu, sd=sigma) # Threshold z for critical section.
ret <- ret %>% mutate(critical=(x>=tz))
}
else { # alternative == "less"
tz <- qnorm(sig_level, mean=mu, sd=sigma) # Threshold z for critical section.
ret <- ret %>% mutate(critical=(x<=tz))
}
ret <- ret %>% mutate(mean=mu, sd=sigma)
ret
}
# Generates data for probability density chart for exact Wilcoxon test.
generate_wilcox_density_data <- function(stat, p.value, n1, n2, sig_level = 0.05, alternative = "two.sided") {
from <- min(stat, qwilcox(sig_level/4, m=n1, n=n2)) # Start of the x axis range.
to <- max(stat, qwilcox(1-sig_level/4, m=n1, n=n2)) # End of the x axis range.
# Give some space around the data range.
l <- to - from
from <- from - l/10
to <- to + l/10
x <- seq(from=floor(from),to=ceiling(to)) # x has to be integer.
ret <- tibble::tibble(x=x, y=dwilcox(x, m=n1, n=n2))
# We need to take the mean of the density at the two closest integer values since non-integer density values are zero.
ret2 <- tibble::tibble(x=stat, y=mean(dwilcox(c(floor(stat), ceiling(stat)), m=n1, n=n2)), statistic=TRUE, p.value=p.value[1])
ret <- bind_rows(ret, ret2)
if (alternative == "two.sided") {
tx_greater <- qwilcox(1-sig_level/2, m=n1, n=n2) # Threshold x for critical section.
tx_less <- qwilcox(sig_level/2, m=n1, n=n2) # Threshold x for critical section.
ret <- ret %>% mutate(critical=(x>=tx_greater|x<=tx_less))
}
else if (alternative == "greater") {
tx <- qwilcox(1-sig_level, m=n1, n=n2) # Threshold x for critical section.
ret <- ret %>% mutate(critical=(x>=tx))
}
else { # alternative == "less"
tx <- qwilcox(sig_level, m=n1, n=n2) # Threshold x for critical section.
ret <- ret %>% mutate(critical=(x<=tx))
}
ret <- ret %>% mutate(n1=n1, n2=n2)
ret
}
# Generates data for probability density chart for exact sign rank test.
generate_signrank_density_data <- function(stat, p.value, n, sig_level = 0.05, alternative = "two.sided") {
from <- min(stat, qsignrank(sig_level/4, n=n)) # Start of the x axis range.
to <- max(stat, qsignrank(1-sig_level/4, n=n)) # End of the x axis range.
# Give some space around the data range.
l <- to - from
from <- from - l/10
to <- to + l/10
x <- seq(from=floor(from),to=ceiling(to)) # x has to be integer.
ret <- tibble::tibble(x=x, y=dsignrank(x, n=n))
# We need to take the mean of the density at the two closest integer values since non-integer density values are zero.
ret2 <- tibble::tibble(x=stat, y=mean(dsignrank(c(floor(stat), ceiling(stat)), n=n)), statistic=TRUE, p.value=p.value[1])
ret <- bind_rows(ret, ret2)
if (alternative == "two.sided") {
tx_greater <- qsignrank(1-sig_level/2, n=n) # Threshold x for critical section.
tx_less <- qsignrank(sig_level/2, n=n) # Threshold x for critical section.
ret <- ret %>% mutate(critical=(x>=tx_greater|x<=tx_less))
}
else if (alternative == "greater") {
tx <- qsignrank(1-sig_level, n=n) # Threshold x for critical section.
ret <- ret %>% mutate(critical=(x>=tx))
}
else { # alternative == "less"
tx <- qsignrank(sig_level, n=n) # Threshold x for critical section.
ret <- ret %>% mutate(critical=(x<=tx))
}
ret <- ret %>% mutate(n=n)
# Smooth it out for visualization with LOESS. (dsignrank is not smooth unlike dwilcox.)
span_value <- 0.5 # Larger span values result in more smoothing
loess_model <- loess(y ~ x, data = ret, span = span_value)
ret$y <- predict(loess_model)
ret
}
#' wrapper for t.test, which compares means
#' @export
do_t.test <- function(df, value, key=NULL, ...){
value_col <- col_name(substitute(value))
with_key <- !is.null(substitute(key))
if(with_key){
# make a formula for two sample t-test
# value_col is values and key_col is labels in which grouop the values are in
key_col <- col_name(substitute(key))
# this creates a formula like `val`~`group key`
fml <- as.formula(paste(paste("`", value_col, "`", sep=""), paste("`", key_col, "`", sep=""), sep="~"))
}
# otherwise, one sample t-test from values in value_col is executed
grouped_col <- grouped_by(df)
model_col <- avoid_conflict(grouped_col, "model")
# this is executed on each group
do_t.test_each <- function(df, ...){
if(with_key){
# make key column factor
# so that which key
# the result columns (estimate1 and estimate2)
# are from can be clear
df[[key_col]] <- as.factor(df[[key_col]])
mean_col_names <- paste("mean_", levels(df[[key_col]]), sep = "")
# use formula (`value_col`~`key_col`) for two sample t-test
model <- tryCatch({
t.test(data=df, fml, ...)
}, error = function(e){
if(e$message == "grouping factor must have exactly 2 levels"){
stop("Group Column has to have 2 unique values")
}
stop(e$message)
})
} else {
# use df[[value_col]] for one sample t-test (comparison with indicated normal distribution)
model <- t.test(df[[value_col]], ...)
}
ret <- broom::tidy(model)
# convert from factor to character
ret[["method"]] <- as.character(ret[["method"]])
ret[["alternative"]] <- as.character(ret[["alternative"]])
# "two.sided" should be "two sided" because "two.sided" is confused as a url by Exploratory Desktop
# this can be "greater" or "lower" but they are okay.
ret[["alternative"]] <- ifelse(ret[["alternative"]] == "two.sided", "two sided", ret[["alternative"]])
# change column names
col_names <- avoid_conflict(grouped_col, vapply(colnames(ret), function(name){
switch (name,
estimate = "effect_size",
estimate1 = mean_col_names[[1]],
estimate2 = mean_col_names[[2]],
statistic = "t.value",
parameter = "degrees_of_freedom",
name
)
}, FUN.VALUE = ""))
colnames(ret) <- col_names
ret
}
df %>%
dplyr::do_(.dots=setNames(list(~do_t.test_each(df = ., ...)), model_col)) %>%
dplyr::ungroup() %>%
unnest_with_drop(!!rlang::sym(model_col))
}
#' wrapper for var.test, which compares variances
#' @export
do_var.test <- function(df, value, key, ...){
value_col <- col_name(substitute(value))
key_col <- col_name(substitute(key))
fml <- as.formula(paste(paste("`", value_col, "`", sep=""), paste("`", key_col, "`", sep=""), sep="~"))
grouped_col <- grouped_by(df)
model_col <- avoid_conflict(grouped_col, "model")
# this is executed on each group
do_var.test_each <- function(df, ...){
model <- tryCatch({
var.test(data=df, fml, ...)
}, error = function(e){
if(e$message == "grouping factor must have exactly 2 levels"){
stop("Group Column has to have 2 unique values")
}
stop(e$message)
})
ret <- broom::tidy(model)
ret[["method"]] <- as.character(ret[["method"]])
ret[["alternative"]] <- as.character(ret[["alternative"]])
# "two.sided" should be "two sided" because "two.sided" is confused as a url by Exploratory Desktop
# this can be "greater" or "lower" but they are okay.
ret[["alternative"]] <- ifelse(ret[["alternative"]] == "two.sided", "two sided", ret[["alternative"]])
# change column names
col_names <- avoid_conflict(grouped_col, vapply(colnames(ret), function(name){
switch (name,
estimate = "variance_ratio",
num.df = "numerator_degrees_of_freedom",
den.df = "denominator_degrees_of_freedom",
statistic = "f.value",
name
)
}, FUN.VALUE = ""))
colnames(ret) <- col_names
ret
}
df %>%
dplyr::do_(.dots=setNames(list(~do_var.test_each(df = ., ...)), model_col)) %>%
dplyr::ungroup() %>%
unnest_with_drop(!!rlang::sym(model_col))
}
#' Non standard evaluation version of do_chisq.test_
#' @export
do_chisq.test <- function(df, ...,
correct = TRUE,
p = NULL,
rescale.p = TRUE,
simulate.p.value = FALSE,
B = 2000){
select_dots <- lazyeval::lazy_dots(...)
cols <- evaluate_select(df, select_dots, excluded = grouped_by(df))
# p should be able to be NSE column name or numeric vector
# , so evaluated lazily
lazy_p <- lazyeval::lazy(p)
p <- lazyeval::lazy_eval(lazy_p, data = df)
do_chisq.test_(df,
selected_cols = cols,
correct = correct,
p = p,
rescale.p = rescale.p,
simulate.p.value = simulate.p.value,
B = B)
}
#' chisq.test wrapper
#' @param df Data frame to be tested.
#' @param selected_cols Names of columns of categories.
#' @param correct Whether continuity correction will be applied for 2 by 2 tables.
#' @param p This works when one column is selected. A column to be considered as probability to be tested.
#' If NULL, it's considered as uniform distribution.
#' @param rescale.p If TRUE, p is rescaled to sum to 1. If FALSE and p doesn't sum to 1, it causes an error.
#' @param simulate.p.value Whether p value should be calculated by Monte Carlo simulation.
#' @param B This works only when simulate.p.value is TRUE. The number of replicates for Monte Carlo test.
#' @export
do_chisq.test_ <- function(df,
selected_cols = c(),
correct = TRUE,
p = NULL,
rescale.p = TRUE,
simulate.p.value = FALSE,
B = 2000){
chisq.test_each <- function(df, ...) {
x <- df[, selected_cols] %>% as.matrix()
if (is.null(p)){
# default of p from chisq.test
p <- rep(1/length(x), length(x))
}
chisq.test(x = x,
correct = correct,
p = p,
rescale.p = rescale.p,
simulate.p.value = simulate.p.value,
B = B) %>%
broom::glance()
}
# Calculation is executed in each group.
# Storing the result in this tmp_col and
# unnesting the result.
# If the original data frame is grouped by "tmp",
# overwriting it should be avoided,
# so avoid_conflict is used here.
tmp_col <- avoid_conflict(colnames(df), "tmp")
df %>%
dplyr::do_(.dots = setNames(list(~chisq.test_each(.)), tmp_col)) %>%
dplyr::ungroup() %>%
unnest_with_drop(!!rlang::sym(tmp_col))
}
#' Chi-Square test wrapper for Analytics View
#' @param test_sig_level - Significance level for the t-test ifself.
#' @param sig.level - Significance level for power analysis.
#' @export
exp_chisq <- function(df, var1, var2, value = NULL, func1 = NULL, func2 = NULL, fun.aggregate = sum, correct = FALSE,
test_sig_level = 0.05, sig.level = 0.05, w = NULL, power = NULL, beta = NULL, ...) {
if (!is.null(power) && !is.null(beta) && (power + beta != 1.0)) {
stop("Specify only one of Power or Type 2 Error, or they must add up to 1.0.")
}
if (is.null(power) && !is.null(beta)) {
power <- 1.0 - beta
}
# We are turning off Yates's correction by default because...
# 1. It seems that it is commonly discussed that it is overly conservative and not necessary.
# https://en.wikipedia.org/wiki/Yates%27s_correction_for_continuity
# https://aue.repo.nii.ac.jp/?action=repository_uri&item_id=785&file_id=15&file_no=1
# 2. With Yates's correction residuals do not add up to chi-square value, which makes contributions not adding up to 100%.
var1_col <- tidyselect::vars_select(names(df), !! rlang::enquo(var1))
var2_col <- tidyselect::vars_select(names(df), !! rlang::enquo(var2))
value_col <- tidyselect::vars_select(names(df), !! rlang::enquo(value))
grouped_cols <- grouped_by(df)
if (!is.null(func1)) {
if (lubridate::is.Date(df[[var1_col]]) || lubridate::is.POSIXct(df[[var1_col]])) {
df <- df %>% dplyr::mutate(!!rlang::sym(var1_col) := extract_from_date(!!rlang::sym(var1_col), type=!!func1))
}
else if (is.numeric(df[[var1_col]])) {
df <- df %>% dplyr::mutate(!!rlang::sym(var1_col) := extract_from_numeric(!!rlang::sym(var1_col), type=!!func1))
}
}
if (!is.null(func2)) {
if (lubridate::is.Date(df[[var2_col]]) || lubridate::is.POSIXct(df[[var2_col]])) {
df <- df %>% dplyr::mutate(!!rlang::sym(var2_col) := extract_from_date(!!rlang::sym(var2_col), type=!!func2))
}
else if (is.numeric(df[[var2_col]])) {
df <- df %>% dplyr::mutate(!!rlang::sym(var2_col) := extract_from_numeric(!!rlang::sym(var2_col), type=!!func2))
}
}
# Check each category column has multiple classes.
# Currently, we filter out NA categories later. So, counting non-NA categories only.
if (n_distinct(df[[var1_col]], na.rm=TRUE) < 2) {
stop(paste0("The explanatory variable needs to have 2 or more unique values."))
}
if (n_distinct(df[[var2_col]], na.rm=TRUE) < 2) {
stop(paste0("The target variable needs to have 2 or more unique values."))
}
var1_levels <- NULL
var2_levels <- NULL
# preserve class of var1 and var2 so that we can cast them back to original class later.
var1_class <- class(df[[var1_col]])
var2_class <- class(df[[var2_col]])
if (is.factor(df[[var1_col]])) {
var1_levels <- levels(df[[var1_col]])
}
if (is.factor(df[[var2_col]])) {
var2_levels <- levels(df[[var2_col]])
}
formula = as.formula(paste0('`', var1_col, '`~`', var2_col, '`'))
chisq.test_each <- function(df) {
tryCatch({
# If there is only one class of category in this class, skip it.
# This is effectively for multiple group case, since for single group case, it is already checked before this loop.
if (n_distinct(df[[var1_col]], na.rm=TRUE) < 2) {
stop(paste0("The explanatory variable needs to have 2 or more unique values."))
}
if (n_distinct(df[[var2_col]], na.rm=TRUE) < 2) {
stop(paste0("The target variable needs to have 2 or more unique values."))
}
# TODO: For now, we are filtering out NA categories, but we should include them and display them cleanly.
df <- df %>% dplyr::filter(!is.na(!!rlang::sym(var1_col)) & !is.na(!!rlang::sym(var2_col)))
df <- df %>% dplyr::group_by(!!rlang::sym(var1_col), !!rlang::sym(var2_col))
if (is.null(value_col) || length(value_col) == 0) { # It seems that if value_col is not specified value_col can be named character(0), which can be detected by length(value_col)=0.
df <- df %>% dplyr::summarize(.temp_value_col=n())
}
else {
#TODO: handle name conflict with .temp_value_col and group cols.
if (identical(sum, fun.aggregate)) {
df <- df %>% dplyr::summarize(.temp_value_col=fun.aggregate(!!rlang::sym(value_col), na.rm=TRUE))
}
else {
# Possible fun.aggregate are, length, n_distinct, false_count (count for TRUE is done by sum),
# na_count, non_na_count. They can/should be run without na.rm=TRUE.
df <- df %>% dplyr::summarize(.temp_value_col=fun.aggregate(!!rlang::sym(value_col)))
}
}
# TODO: spread creates column named "<NA>". For consistency on UI, we want "(NA)".
# Note that this issue is currently avoided by filtering out rows with NA categories in the first place.
df <- df %>% dplyr::ungroup() %>% tidyr::spread(key = !!rlang::sym(var2_col), value = .temp_value_col, fill=0)
# na_leves is set to "(NA)" for consistency on UI.
# Note that this issue is currently avoided by filtering out rows with NA categories in the first place.
df <- df %>% dplyr::mutate(!!rlang::sym(var1_col):=forcats::fct_explicit_na(as.factor(!!rlang::sym(var1_col)), na_level = "(NA)"))
df <- df %>% tibble::column_to_rownames(var=var1_col)
x <- df %>% as.matrix()
model <- chisq.test(x = x, correct = correct, ...)
# Calculate Cohen's w from actual data.
cohens_w <- calculate_cohens_w(model$statistic, sum(x))
if (is.null(w)) {
# If w is not specified, use the one calculated from data.
cohens_w_to_detect <- cohens_w
}
else {
cohens_w_to_detect <- w
}
# add variable name info to the model
model$var1 <- var1_col
model$var2 <- var2_col
model$var1_class <- var1_class
model$var2_class <- var2_class
model$var1_levels <- var1_levels
model$var2_levels <- var2_levels
model$test_sig_level <- test_sig_level
model$sig.level <- sig.level
model$cohens_w <- cohens_w
model$cohens_w_to_detect <- cohens_w_to_detect
model$power <- power
class(model) <- c("chisq_exploratory", class(model))
model
}, error = function(e){
if (length(grouped_cols) > 0) {
# In repeat-by case, we report group-specific error in the Summary table,
# so that analysis on other groups can go on.
class(e) <- c("chisq_exploratory", class(e))
e
} else {
stop(e)
}
})
}
do_on_each_group(df, chisq.test_each, name = "model", with_unnest = FALSE)
}
#' @export
exp_chisq_ab_aggregated <- function(df, a_b_identifier, conversion_rate, count, correct = FALSE, sig.level = 0.05) {
a_b_identifier_col <- col_name(substitute(a_b_identifier))
conversion_rate_col <- col_name(substitute(conversion_rate))
count_col <- col_name(substitute(count))
# Keep only necessary columns before pivot_longer to avoid unnecessary copies.
df <- df %>% dplyr::select(!!rlang::sym(a_b_identifier_col), !!rlang::sym(conversion_rate_col), !!rlang::sym(count_col))
df <- df %>% dplyr::mutate(`TRUE`=(!!rlang::sym(count_col))*(!!rlang::sym(conversion_rate_col)), `FALSE`=(!!rlang::sym(count_col))*((1-!!rlang::sym(conversion_rate_col)))) %>% dplyr::select(-!!rlang::sym(count_col), -!!rlang::sym(conversion_rate_col))
df <- df %>% tidyr::pivot_longer(c(`TRUE`,`FALSE`), names_to="converted", values_to="n")
res <- exp_chisq(df, !!rlang::sym(a_b_identifier_col), converted, value = n, func1 = NULL, func2 = NULL, fun.aggregate = sum, correct = FALSE,
test_sig_level = sig.level, sig.level = sig.level, w = NULL, power = NULL, beta = NULL)
res
}
#' @export
tidy.chisq_exploratory <- function(x, type = "observed") {
if ("error" %in% class(x)) {
ret <- tibble::tibble()
return(ret)
}
if (type == "observed") {
ret <- as.data.frame(x$observed)
ret <- ret %>% tibble::rownames_to_column(var = x$var1)
}
else if (type == "residuals") {
obs_df <- as.data.frame(x$observed)
obs_df <- obs_df %>% tibble::rownames_to_column(var = x$var1)
obs_df <- obs_df %>% tidyr::gather(!!rlang::sym(x$var2), "observed", -!!rlang::sym(x$var1))
expected_df <- as.data.frame(x$expected)
expected_df <- expected_df %>% tibble::rownames_to_column(var = x$var1)
expected_df <- expected_df %>% tidyr::gather(!!rlang::sym(x$var2), "expected", -!!rlang::sym(x$var1))
resid_df <- as.data.frame(x$residuals)
resid_df <- resid_df %>% tibble::rownames_to_column(var = x$var1)
resid_df <- resid_df %>% tidyr::gather(!!rlang::sym(x$var2), "residual", -!!rlang::sym(x$var1))
resid_raw_df <- as.data.frame(x$observed - x$expected) # x$residual is standardized, but here, take raw difference between observed and expected.
resid_raw_df <- resid_raw_df %>% tibble::rownames_to_column(var = x$var1)
resid_raw_df <- resid_raw_df %>% tidyr::gather(!!rlang::sym(x$var2), "residual_raw", -!!rlang::sym(x$var1))
resid_ratio_df <- as.data.frame((x$observed - x$expected)/x$expected) # x$residual is standardized, but here, take raw difference between observed and expected.
resid_ratio_df <- resid_ratio_df %>% tibble::rownames_to_column(var = x$var1)
resid_ratio_df <- resid_ratio_df %>% tidyr::gather(!!rlang::sym(x$var2), "residual_ratio", -!!rlang::sym(x$var1))
ret <- obs_df %>% left_join(expected_df, by=c(x$var1, x$var2)) # join expected column
ret <- ret %>% left_join(resid_df, by=c(x$var1, x$var2)) # join expected column
ret <- ret %>% left_join(resid_raw_df, by=c(x$var1, x$var2)) # join residual_raw column
ret <- ret %>% left_join(resid_ratio_df, by=c(x$var1, x$var2)) # join residual_ratio column
if (is.nan(x$statistic) || x$statistic <= 0) {
ret <- ret %>% mutate(contrib = 0) # avoid division by 0
}
else {
ret <- ret %>% mutate(contrib = 100*residual^2/(!!(x$statistic))) # add percent contribution too.
}
if (!is.null(x$var1_levels)) {
ret[[x$var1]] <- factor(ret[[x$var1]], levels=x$var1_levels)
}
if (!is.null(x$var2_levels)) {
ret[[x$var2]] <- factor(ret[[x$var2]], levels=x$var2_levels)
}
# if original class of var1, var2 was logical, return them as logical.
if ("logical" %in% x$var1_class) {
ret[[x$var1]] <- as.logical(ret[[x$var1]])
}
else if ("integer" %in% x$var1_class) {
ret[[x$var1]] <- as.integer(ret[[x$var1]])
}
else if ("numeric" %in% x$var1_class) {
ret[[x$var1]] <- as.numeric(ret[[x$var1]])
}
if ("logical" %in% x$var2_class) {
ret[[x$var2]] <- as.logical(ret[[x$var2]])
}
else if ("integer" %in% x$var2_class) {
ret[[x$var2]] <- as.integer(ret[[x$var2]])
}
else if ("numeric" %in% x$var2_class) {
ret[[x$var2]] <- as.numeric(ret[[x$var2]])
}
}
else { # type == "prob_dist"
ret <- generate_chisq_density_data(x$statistic, p.value=x$p.value, x$parameter, sig_level=x$test_sig_level)
ret
}
ret
}
#' @export
glance.chisq_exploratory <- function(x) {
if ("error" %in% class(x)) {
ret <- tibble::tibble(Note = x$message)
return(ret)
}
# ret <- x %>% broom:::glance.htest() # for some reason this does not work. just do it like following.
ret <- data.frame(statistic=x$statistic, parameter=x$parameter, p.value=x$p.value)
N <- sum(x$observed) # Total number of observations (rows).
k <- ncol(x$observed)
r <- nrow(x$observed)
phi <- sqrt(x$statistic/N)
V <- sqrt(x$statistic/N/min(k-1, r-1)) # Cramer's V - https://en.wikipedia.org/wiki/Cram%C3%A9r%27s_V
note <- NULL
if (is.null(x$power)) {
# If power is not specified in the arguments, estimate current power.
# x$parameter is degree of freedom.
if (!is.na(x$cohens_w_to_detect)) { # It is possible that x$cohens_w_to_detect is NA. Avoid calling pwr.chisq.test not to hit an error.
# pwr functions returns value for the missing arg, which in this case is power.
tryCatch({ # pwr function can return error from equation resolver. Catch it rather than stopping the whole thing.
power_res <- pwr::pwr.chisq.test(df = x$parameter, N = N, w = x$cohens_w_to_detect, sig.level = x$sig.level)
power_val <- power_res$power
}, error = function(e) {
note <<- e$message
power_val <<- NA_real_
})
}
else {
note <- "Could not calculate Cohhen's w."
power_val <- NA_real_
}
ret <- ret %>% dplyr::mutate(phi=!!phi, v=!!V, w=!!(x$cohens_w), power=!!power_val, beta=1.0-!!power_val, n=!!N)
ret <- ret %>% dplyr::select(statistic, p.value, parameter, everything()) # Reorder to unify order with t-test.
ret <- ret %>% dplyr::rename(`Chi-Square`=statistic,
`P Value`=p.value,
`DF`=parameter,
`Phi`=phi,
`Cramer's V`=v,
`Cohen's W`=w,
`Power`=power,
`Type 2 Error`=beta,
`Rows`=n)
}
else {
# If required power is specified in the arguments, estimate required sample size.
# pwr functions returns value for the missing arg, which in this case is sample size (N).
tryCatch({ # pwr function can return error from equation resolver. Catch it rather than stopping the whole thing.
power_res <- pwr::pwr.chisq.test(df = x$parameter, w = x$cohens_w_to_detect, sig.level = x$sig.level, power = x$power)
required_sample_size <- power_res$N
}, error = function(e) {
note <<- e$message
required_sample_size <<- NA_real_
})
ret <- ret %>% dplyr::mutate(phi=!!phi, v=!!V, w=!!(x$cohens_w), power=!!(x$power), beta=1.0-!!(x$power), n=!!N, required_n=!!required_sample_size)
ret <- ret %>% dplyr::select(statistic, p.value, parameter, everything()) # Reorder to unify order with t-test.
ret <- ret %>% dplyr::rename(`Chi-Square`=statistic,
`P Value`=p.value,
`DF`=parameter,
`Phi`=phi,
`Cramer's V`=v,
`Cohen's W`=w,
`Target Power`=power,
`Target Type 2 Error`=beta,
`Current Sample Size`=n,
`Required Sample Size`=required_n)
}
if (!is.null(note)) { # Add Note column, if there was an error from pwr function.
ret <- ret %>% dplyr::mutate(Note=!!note)
}
ret
}
# Calculate standard error of difference between means for Welch's t-test.
calculate_welch_stderr <- function(N1, N2, s1, s2) {
ret <- sqrt(s1^2/N1 + s2^2/N2)
ret
}
# Calculate confidence interval of difference between means for Welch's t-test.
calculate_welch_confint <- function(N1, N2, X1, X2, s1, s2, conf.level, alternative = "two.sided") {
alpha <- 1-conf.level
dof <- calculate_welch_dof(N1, N2, s1, s2)
if (alternative == "two.sided") {
q <- qt(1-alpha/2, dof)
ci <- c(-q, q)
} else if (alternative == "greater") {
q <- qt(alpha, dof)
ci <- c(q, Inf)
} else {
q <- qt(1-alpha, dof)
ci <- c(-Inf, q)
}
stderr <- calculate_welch_stderr(N1, N2, s1, s2)
res <- X1 - X2 + stderr*ci
res
}
# Calculate t statistic for Welch's t-test.
calculate_welch_t <- function(N1, N2, X1, X2, s1, s2) {
ret <- (X1 - X2)/sqrt(s1^2/N1 + s2^2/N2)
ret
}
# Calculate degree of freedom for Welch's t-test.
calculate_welch_dof <- function(N1, N2, s1, s2) {
ret <- (s1^2/N1 + s2^2/N2)^2/((s1^4/((N1^2)*(N1-1))) + (s2^4/((N2^2)*(N2-1))))
ret
}
# Calculate p value for Welch's t-test.
calculate_welch_p <- function(N1, N2, X1, X2, s1, s2, alternative = "two.sided") {
t <- calculate_welch_t(N1, N2, X1, X2, s1, s2)
dof <- calculate_welch_dof(N1, N2, s1, s2)
p <- pt(t,dof)
if (alternative == "two.sided") {
if (p < 0.5) {
res <- 2*p
} else {
res <- 2*(1-p)
}
} else if (alternative == "greater") {
res <- 1-p
} else {
res <- p
}
res
}
# Calculate pooled standard error for Student's t-test.
calculate_pooled_stderr <- function(N1, N2, s1, s2) {
res <- sqrt(((N1-1)*s1^2 + (N2-1)*s2^2)/(N1 + N2 - 2))
res
}
# Calculate confidence interval of difference between means for Student's t-test.
calculate_student_confint <- function(N1, N2, X1, X2, s1, s2, conf.level, alternative = "two.sided") {
alpha <- 1-conf.level
dof <- calculate_student_dof(N1, N2)
if (alternative == "two.sided") {
q <- qt(1-alpha/2, dof)
ci <- c(-q, q)
} else if (alternative == "greater") {
q <- qt(alpha, dof)
ci <- c(q, Inf)
} else {
q <- qt(1-alpha, dof)
ci <- c(-Inf, q)
}
stderr <- calculate_pooled_stderr(N1, N2, s1, s2)*sqrt(1/N1 + 1/N2)
res <- X1 - X2 + stderr*ci
res
}
# Calculate t statistic for Student's t-test.
calculate_student_t <- function(N1, N2, X1, X2, s1, s2) {
ret <- (X1 - X2)/(calculate_pooled_stderr(N1, N2, s1, s2)*sqrt(1/N1 + 1/N2))
ret
}
# Calculate degree of freedom for Student's t-test.
calculate_student_dof <- function(N1, N2) {
ret <- N1 + N2 - 2
ret
}
# Calculate p value for Student's t-test.
calculate_student_p <- function(N1, N2, X1, X2, s1, s2, alternative = "two.sided") {
t <- calculate_student_t(N1, N2, X1, X2, s1, s2)
dof <- calculate_student_dof(N1, N2)
p <- pt(t,dof)
if (alternative == "two.sided") {
if (p < 0.5) {
res <- 2*p
} else {
res <- 2*(1-p)
}
} else if (alternative == "greater") {
res <- 1-p
} else {
res <- p
}
res
}
# A function that gives the same output as stats::t.test, but takes aggregated data as the input.
# N1, N2 - Sample sizes
# X1, X2 - Means
# s1, s2 - Standard Deviations
t.test.aggregated <- function(N1, N2, X1, X2, s1, s2, conf.level=0.95, mu=0, alternative = "two.sided", paired = FALSE, var.equal = FALSE) {
if (!var.equal) {
method="Welch Two Sample t-test"
statistic <- calculate_welch_t(N1, N2, X1, X2, s1, s2)
parameter <- calculate_welch_dof(N1, N2, s1, s2)
p.value <- calculate_welch_p(N1, N2, X1, X2, s1, s2, alternative = alternative)
conf.int <- calculate_welch_confint(N1, N2, X1, X2, s1, s2, conf.level, alternative = alternative)
estimate <- c(X1, X2)
stderr <- calculate_welch_stderr(N1, N2, s1, s2)
}
else {
method="Two Sample t-test"
statistic <- calculate_student_t(N1, N2, X1, X2, s1, s2)
parameter <- calculate_student_dof(N1, N2)
p.value <- calculate_student_p(N1, N2, X1, X2, s1, s2, alternative = alternative)
conf.int <- calculate_student_confint(N1, N2, X1, X2, s1, s2, conf.level, alternative = alternative)
estimate <- c(X1, X2)
stderr <- calculate_pooled_stderr(N1, N2, s1, s2)*sqrt(1/N1 + 1/N2)
}
null.value <- mu
names(statistic) <- "t"
names(parameter) <- "df"
names(mu) <- "difference in means"
attr(conf.int, "conf.level") <- conf.level
res <- list(
statistic=statistic,
parameter=parameter,
p.value=p.value,
conf.int=conf.int,
estimate=estimate,
stderr=stderr,
null.value=null.value,
method=method,
alternative=alternative
)
class(res) <- c('ttest_exploratory', 'htest')
res
}
#' t-test wrapper for Analytics View. Almost the same as exp_ttest, but takes already aggregated data.
#' @export
#' @param category - Column of the categories.
#' @param n - Column of the sample sizes.
#' @param category_mean - Column of the means of the caterories.
#' @param category_sd - Column of the standard deviations of the caterories.
#' @param conf.level - Level of confidence for confidence interval. Passed to t.test as part of ...
#' @param test_sig_level - Significance level for the t-test ifself.
#' @param sig.level - Significance level for power analysis.
#' @param d - Cohen's d to detect in power analysis.
#' @param common_sd - Used for calculation of Cohen's d.
#' @param diff_to_detect - Used for calculation of Cohen's d.
exp_ttest_aggregated <- function(df, category, n, category_mean, category_sd, test_sig_level = 0.05,
sig.level = 0.05, d = NULL, common_sd = NULL, diff_to_detect = NULL, power = NULL, beta = NULL,
...) {
if (!is.null(power) && !is.null(beta) && (power + beta != 1.0)) {
stop("Specify only one of Power or Type 2 Error, or they must add up to 1.0.")
}
if (is.null(power) && !is.null(beta)) {
power <- 1.0 - beta
}
var2_col <- col_name(substitute(category))
n_col <- col_name(substitute(n))
mean_col <- col_name(substitute(category_mean))
sd_col <- col_name(substitute(category_sd))
grouped_cols <- grouped_by(df)
# For logical explanatory variable, make it a factor and adjust label order so that
# the calculated difference is TRUE case - FALSE case, which intuitively makes better sense.
if (is.logical(df[[var2_col]])) {
df <- df %>% dplyr::mutate(!!rlang::sym(var2_col) := factor(!!rlang::sym(var2_col), levels=c("TRUE", "FALSE")))
df <- df %>% dplyr::arrange(!!rlang::sym(var2_col))
}
n_distinct_res <- n_distinct(df[[var2_col]]) # save n_distinct result to avoid repeating the relatively expensive call.
if (n_distinct_res != 2) {
if (n_distinct_res == 3 && any(is.na(df[[var2_col]]))) { # automatically filter NA to make number of category 2, if it is the 3rd category.
df <- df %>% dplyr::filter(!is.na(!!rlang::sym(var2_col)))
}
else {
stop("The explanatory variable needs to have 2 unique values.")
}
}
ttest_each <- function(df) {
tryCatch({
if (nrow(df) != 2) {
stop("Number of rows of the aggregated input data must be 2.")
}
df <- df %>% dplyr::filter(!is.na(!!rlang::sym(n_col)) & !is.na(!!rlang::sym(mean_col)) & !is.na(!!rlang::sym(sd_col))) # Remove NA from the target column.
if (nrow(df) != 2) {
stop("There is NA in the aggregated input data.")
}
if(length(grouped_cols) > 0) {
n_distinct_res_each <- n_distinct(df[[var2_col]]) # check n_distinct again within group after handling outlier.
if (n_distinct_res_each != 2) {
stop("The explanatory variable needs to have 2 unique values.")
}
}
# It seems that each group has to have at least 2 rows to avoid "not enough 'x' observations" error.
# Check it here, rather than handling it later.
min_n <- min(df[[n_col]], na.rm=TRUE)
if (min_n <= 1) {
e <- simpleError("Not enough data.")
class(e) <- c("ttest_exploratory", class(e))
e$v1 <- df[[var2_col]][1]
e$n1 <- df[[n_col]][1]
e$v2 <- df[[var2_col]][2]
e$n2 <- df[[n_col]][2]
return(e)
}
# Calculate Cohen's d from data.
cohens_d <- calculate_cohens_d_aggregated(df[[n_col]][1], df[[n_col]][2], df[[mean_col]][1], df[[mean_col]][2], df[[sd_col]][1], df[[sd_col]][2])
# Get size of Cohen's d to detect for power analysis.
# If neither d nor diff_to_detect is specified, use the one calculated from data.
if (is.null(d)) {
if (is.null(diff_to_detect)) {
# If neither d nor diff_to_detect is specified, calculate Cohen's d from data.
cohens_d_to_detect <- cohens_d
}
else { # diff_to_detect is specified.
if (is.null(common_sd)) {
# If common SD is not specified, estimate from data, and use it to calculate Cohen's d
cohens_d_to_detect <- diff_to_detect/calculate_common_sd_aggregated(df[[n_col]][1], df[[n_col]][2], df[[sd_col]][1], df[[sd_col]][2])
}
else {
cohens_d_to_detect <- diff_to_detect/common_sd
}
}
}
else {
cohens_d_to_detect <- d
}
model <- t.test.aggregated(df[[n_col]][1], df[[n_col]][2], df[[mean_col]][1], df[[mean_col]][2], df[[sd_col]][1], df[[sd_col]][2], ...)
class(model) <- c("ttest_exploratory", class(model))
model$var2 <- var2_col
model$data <- df
model$test_sig_level <- test_sig_level
model$sig.level <- sig.level
model$cohens_d <- cohens_d # model$d seems to be already used for something.
model$cohens_d_to_detect <- cohens_d_to_detect
model$power <- power
model$v1 <- df[[var2_col]][1]
model$v2 <- df[[var2_col]][2]
model$n1 <- df[[n_col]][1]
model$n2 <- df[[n_col]][2]
model$base.level <- df[[var2_col]][2] # The 2nd row is always the base in case of aggregated.
model$s1 <- df[[sd_col]][1]
model$s2 <- df[[sd_col]][2]
model$m1 <- df[[mean_col]][1]
model$m2 <- df[[mean_col]][2]
model$data_type <- "aggregated"
model
}, error = function(e){
if(length(grouped_cols) > 0) {
# In repeat-by case, we report group-specific error in the Summary table,
# so that analysis on other groups can go on.
class(e) <- c("ttest_exploratory", class(e))
e
} else {
stop(e)
}
})
}
do_on_each_group(df, ttest_each, name = "model", with_unnest = FALSE)
}
#' t-test wrapper for Analytics View
#' @export
#' @param conf.level - Level of confidence for confidence interval. Passed to t.test as part of ...
#' @param test_sig_level - Significance level for the t-test ifself.
#' @param sig.level - Significance level for power analysis.
#' @param d - Cohen's d to detect in power analysis.
#' @param common_sd - Used for calculation of Cohen's d.
#' @param diff_to_detect - Used for calculation of Cohen's d.
exp_ttest <- function(df, var1, var2, func2 = NULL, test_sig_level = 0.05,
sig.level = 0.05, d = NULL, common_sd = NULL, diff_to_detect = NULL, power = NULL, beta = NULL,
outlier_filter_type = NULL, outlier_filter_threshold = NULL,
...) {
if (!is.null(power) && !is.null(beta) && (power + beta != 1.0)) {
stop("Specify only one of Power or Type 2 Error, or they must add up to 1.0.")
}
if (is.null(power) && !is.null(beta)) {
power <- 1.0 - beta
}
var1_col <- col_name(substitute(var1))
var2_col <- col_name(substitute(var2))
grouped_cols <- grouped_by(df)
# Apply func2 to var2.
if (!is.null(func2)) {
if (lubridate::is.Date(df[[var2_col]]) || lubridate::is.POSIXct(df[[var2_col]])) {
df <- df %>% dplyr::mutate(!!rlang::sym(var2_col) := extract_from_date(!!rlang::sym(var2_col), type=!!func2))
}
else if (is.numeric(df[[var2_col]])) {
df <- df %>% dplyr::mutate(!!rlang::sym(var2_col) := extract_from_numeric(!!rlang::sym(var2_col), type=!!func2))
}
}
# For logical explanatory variable, make it a factor and adjust label order so that
# the calculated difference is TRUE case - FALSE case, which intuitively makes better sense.
var2_logical <- is.logical(df[[var2_col]])
if (var2_logical) {
df <- df %>% dplyr::mutate(!!rlang::sym(var2_col) := factor(!!rlang::sym(var2_col), levels=c("TRUE", "FALSE")))
}
n_distinct_res <- n_distinct(df[[var2_col]]) # save n_distinct result to avoid repeating the relatively expensive call.
if (n_distinct_res != 2) {
if (n_distinct_res == 3 && any(is.na(df[[var2_col]]))) { # automatically filter NA to make number of category 2, if it is the 3rd category.
df <- df %>% dplyr::filter(!is.na(!!rlang::sym(var2_col)))
}
else {
stop("The explanatory variable needs to have 2 unique values.")
}
}
formula = as.formula(paste0('`', var1_col, '`~`', var2_col, '`'))
ttest_each <- function(df) {
tryCatch({
df <- df %>% dplyr::filter(!is.na(!!rlang::sym(var1_col))) # Remove NA from the target column.
if (nrow(df) == 0) {
stop("There is no data left after removing NA.")
}
if (!is.null(outlier_filter_type)) {
is_outlier <- function(x) {
res <- detect_outlier(x, type=outlier_filter_type, threshold=outlier_filter_threshold) %in% c("Lower", "Upper")
res
}
df$.is.outlier <- FALSE #TODO: handle possibility of name conflict.
df$.is.outlier <- df$.is.outlier | is_outlier(df[[var1_col]])
df <- df %>% dplyr::filter(!.is.outlier)
df$.is.outlier <- NULL
}
if(length(grouped_cols) > 0) {
n_distinct_res_each <- n_distinct(df[[var2_col]]) # check n_distinct again within group after handling outlier.
if (n_distinct_res_each != 2) {
stop("The explanatory variable needs to have 2 unique values.")
}
}
# It seems that each group has to have at least 2 rows to avoid "not enough 'x' observations" error.
# Check it here, rather than handling it later.
count_df <- df %>% group_by(!!rlang::sym(var2_col)) %>% summarize(n=n())
min_n <- min(count_df$n, na.rm=TRUE)
if (min_n <= 1) {
e <- simpleError("Not enough data.")
class(e) <- c("ttest_exploratory", class(e))
e$v1 <- count_df[[1]][1]
e$n1 <- count_df$n[1]
e$v2 <- count_df[[1]][2]
e$n2 <- count_df$n[2]
return(e)
}
# Calculate Cohen's d from data.
cohens_d <- calculate_cohens_d(df[[var1_col]], df[[var2_col]])
# Get size of Cohen's d to detect for power analysis.
# If neither d nor diff_to_detect is specified, use the one calculated from data.
if (is.null(d)) {
if (is.null(diff_to_detect)) {
# If neither d nor diff_to_detect is specified, calculate Cohen's d from data.
cohens_d_to_detect <- cohens_d
}
else { # diff_to_detect is specified.
if (is.null(common_sd)) {
# If common SD is not specified, estimate from data, and use it to calculate Cohen's d
cohens_d_to_detect <- diff_to_detect/calculate_common_sd(df[[var1_col]], df[[var2_col]])
}
else {
cohens_d_to_detect <- diff_to_detect/common_sd
}
}
}
else {
cohens_d_to_detect <- d
}
# Adjust the factor levels since t.test consideres the 2nd level to be the base, which is not what we want.
# We keep the original df as is, since we want to keep the original factor order for display purpose.
if (var2_logical) {
df_test <- df
}
else if (is.numeric(df[[var2_col]])) {
# If numeric, we want the smaller number to be the base, e.g. in case of 0, 1, 0 should be the base.
df_test <- df %>% dplyr::mutate(!!rlang::sym(var2_col) := forcats::fct_rev(as.factor(!!rlang::sym(var2_col))))
}
else if (is.factor(df[[var2_col]])) {
# For factor, drop unused levels and revert the factor levels since we want the first actually used level to be the base, just like linear regression.
df_test <- df %>% dplyr::mutate(!!rlang::sym(var2_col) := forcats::fct_rev(forcats::fct_drop(!!rlang::sym(var2_col))))
}
else {
# For character and others, majority should become the base.
df_test <- df %>% dplyr::mutate(!!rlang::sym(var2_col) := forcats::fct_rev(forcats::fct_infreq(as.factor(!!rlang::sym(var2_col)))))
}
base.level <- levels(df_test[[var2_col]])[2]
model <- t.test(formula, data = df_test, ...)
class(model) <- c("ttest_exploratory", class(model))
model$var1 <- var1_col
model$var2 <- var2_col
model$data <- df
model$test_sig_level <- test_sig_level
model$sig.level <- sig.level
model$cohens_d <- cohens_d # model$d seems to be already used for something.
model$cohens_d_to_detect <- cohens_d_to_detect
model$power <- power
model$v1 <- count_df[[1]][1]
model$n1 <- count_df$n[1]
model$v2 <- count_df[[1]][2]
model$n2 <- count_df$n[2]
model$base.level <- base.level
model$data_type <- "raw"
model
}, error = function(e){
if(length(grouped_cols) > 0) {
# In repeat-by case, we report group-specific error in the Summary table,
# so that analysis on other groups can go on.
class(e) <- c("ttest_exploratory", class(e))
e
} else {
stop(e)
}
})
}
do_on_each_group(df, ttest_each, name = "model", with_unnest = FALSE)
}
#' @export
glance.ttest_exploratory <- function(x) {
if ("error" %in% class(x)) {
ret <- tibble::tibble(Note = x$message)
return(ret)
}
ret <- broom:::glance.htest(x) # for t-test. the returned content is same as tidy.
ret
}
#' @export
tidy.ttest_exploratory <- function(x, type="model", conf_level=0.95) {
if (type == "model") {
if ("error" %in% class(x)) {
if (!is.null(x$v1) && !is.null(x$v2) && !is.null(x$n1) && !is.null(x$n2)) {
ret <- tibble::tibble(`Rows`=x$n1+x$n2, n1=x$n1, n2=x$n2, Note = x$message)
ret <- ret %>% dplyr::rename(!!rlang::sym(paste0("Rows (", x$v1, ")")):=n1)
ret <- ret %>% dplyr::rename(!!rlang::sym(paste0("Rows (", x$v2, ")")):=n2)
}
else {
ret <- tibble::tibble(Note = x$message)
}
return(ret)
}
note <- NULL
ret <- broom:::tidy.htest(x)
if (is.null(ret$estimate)) { # estimate is empty when var.equal = TRUE.
# Looks like an issue from broom. Working it around.
ret <- ret %>% dplyr::mutate(estimate = estimate1 - estimate2)
}
# Get sample sizes for the 2 groups (n1, n2).
n1 <- x$n1 # number of 1st class
n2 <- x$n2 # number of 2nd class
v1 <- x$v1 # value for 1st class
v2 <- x$v2 # value for 2nd class
# t.test seems to consider the 2nd category based on alphabetical/numerical/factor sort as the base category.
# Since group_by/summarize also sorts the group based on alphabetical/numerical/factor order, we can assume that the v2 is the base category.
ret <- ret %>% dplyr::mutate(base.level = !!x$base.level)
if (is.null(x$power)) {
# If power is not specified in the arguments, estimate current power.
# TODO: pwr functions does not seem to have argument for equal variance. Is it ok?
tryCatch({ # pwr function can return error from equation resolver. Catch it rather than stopping the whole thing.
if (x$method == "Paired t-test") {
# If paired, we should be able to assume n1 == n2.
power_res <- pwr::pwr.t.test(n = n1, d = x$cohens_d_to_detect, sig.level = x$sig.level, type = "two.sample", alternative = x$alternative)
}
else {
power_res <- pwr::pwr.t2n.test(n1 = n1, n2= n2, d = x$cohens_d_to_detect, sig.level = x$sig.level, alternative = x$alternative)
}
power_val <- power_res$power
}, error = function(e) {
note <- e$message
power_val <<- NA_real_
})
ret <- ret %>% dplyr::select(statistic, p.value, parameter, estimate, conf.low, conf.high, base.level) %>%
dplyr::mutate(d=!!(x$cohens_d), power=!!power_val, beta=1.0-!!power_val) %>%
dplyr::rename(`t Value`=statistic,
`P Value`=p.value,
`DF`=parameter,
Difference=estimate,
`Conf High`=conf.high,
`Conf Low`=conf.low,
`Base Level`=base.level,
`Cohen's D`=d,
`Power`=power,
`Type 2 Error`=beta)
}
else {
# If required power is specified in the arguments, estimate required sample size.
# TODO: pwr functions does not seem to have argument for equal variance. Is it ok?
tryCatch({ # pwr function can return error from equation resolver. Catch it rather than stopping the whole thing.
power_res <- pwr::pwr.t.test(d = x$cohens_d_to_detect, sig.level = x$sig.level, power = x$power, alternative = x$alternative)
required_sample_size <- power_res$n
}, error = function(e) {
note <<- e$message
required_sample_size <<- NA_real_
})
ret <- ret %>% dplyr::select(statistic, p.value, parameter, estimate, conf.low, conf.high, base.level) %>%
dplyr::mutate(d=!!(x$cohens_d), power=!!(x$power), beta=1.0-!!(x$power)) %>%
dplyr::mutate(current_sample_size=min(!!n1,!!n2), required_sample_size=required_sample_size) %>%
dplyr::rename(`t Value`=statistic,
`P Value`=p.value,
`DF`=parameter,
Difference=estimate,
`Conf High`=conf.high,
`Conf Low`=conf.low,
`Base Level`=base.level,
`Cohen's D`=d,
`Target Power`=power,
`Target Type 2 Error`=beta,
`Current Sample Size (Each Group)`=current_sample_size,
`Required Sample Size (Each Group)`=required_sample_size)
}
ret <- ret %>% dplyr::mutate(`Rows`=!!(n1+n2))
ret <- ret %>% dplyr::mutate(!!rlang::sym(paste0("Rows (", v1, ")")):=!!n1)
ret <- ret %>% dplyr::mutate(!!rlang::sym(paste0("Rows (", v2, ")")):=!!n2)
if (!is.null(note)) { # Add Note column, if there was an error from pwr function.
ret <- ret %>% dplyr::mutate(Note=!!note)
}
}
else if (type == "data_summary") { #TODO consolidate with code in tidy.anova_exploratory
if ("error" %in% class(x)) {
ret <- tibble::tibble()
return(ret)
}
conf_threshold = 1 - (1 - conf_level)/2
if (x$data_type == "raw") {
ret <- x$data %>% dplyr::group_by(!!rlang::sym(x$var2)) %>%
dplyr::summarize(`Rows`=length(!!rlang::sym(x$var1)),
Mean=mean(!!rlang::sym(x$var1), na.rm=TRUE),
`Std Deviation`=sd(!!rlang::sym(x$var1), na.rm=TRUE),
# std error definition: https://www.rdocumentation.org/packages/plotrix/versions/3.7/topics/std.error
`Std Error of Mean`=sd(!!rlang::sym(x$var1), na.rm=TRUE)/sqrt(sum(!is.na(!!rlang::sym(x$var1)))),
# Note: Use qt (t distribution) instead of qnorm (normal distribution) here.
# For more detail take a look at 10.5.1 A slight mistake in the formula of "Learning Statistics with R"
`Conf High` = Mean + `Std Error of Mean` * qt(p=!!conf_threshold, df=`Rows`-1),
`Conf Low` = Mean - `Std Error of Mean` * qt(p=!!conf_threshold, df=`Rows`-1),
`Minimum`=min(!!rlang::sym(x$var1), na.rm=TRUE),
`Maximum`=max(!!rlang::sym(x$var1), na.rm=TRUE)) %>%
dplyr::select(!!rlang::sym(x$var2),
`Rows`,
Mean,
`Conf Low`,
`Conf High`,
`Std Error of Mean`,
`Std Deviation`,
`Minimum`,
`Maximum`)
}
else { # x$data_type == "aggregated"
stderr <- c(x$s1/sqrt(x$n1), x$s2/sqrt(x$n2))
ci_radius <- stderr * qt(p=conf_threshold, df=c(x$n1, x$n2)-1)
ret <- tibble::tibble(
group_ = c(x$v1, x$v2),
`Rows` = c(x$n1, x$n2),
Mean = c(x$m1, x$m2),
`Conf Low` = c(x$m1, x$m2) - ci_radius,
`Conf High` = c(x$m1, x$m2) + ci_radius,
`Std Error of Mean` = stderr,
`Std Deviation` = c(x$s1, x$s2)
)
ret <- ret %>% dplyr::rename(!!rlang::sym(x$var2):=group_)
}
}
else if (type == "prob_dist") {
if ("error" %in% class(x)) {
ret <- tibble::tibble()
return(ret)
}
ret <- generate_ttest_density_data(x$statistic, p.value=x$p.value, x$parameter, sig_level=x$test_sig_level, alternative=x$alternative)
ret
}
else { # type == "data"
if ("error" %in% class(x)) {
ret <- tibble::tibble()
return(ret)
}
ret <- x$data
}
ret
}
# Mean of the reference normal distribution (non-exact cases) for Wilcoxon test.
wilcox_norm_dist_mean <- function(alternative, paired, statistic, n1, n2) {
if (!paired) {
if (alternative == "two.sided") {
if (statistic - n1*n2/2 >= 0) {
correction <- 0.5
}
else {
correction <- -0.5
}
}
else if (alternative == "greater") {
correction <- 0.5
}
else if (alternative == "less") {
correction <- -0.5
}
else {
stop("Invalid alternative value.")
}
return(correction + n1*n2/2)
}
else {
if (alternative == "two.sided") {
if (statistic - n1*(n1 + 1)/4 >= 0) {
correction <- 0.5
}
else {
correction <- -0.5
}
}
else if (alternative == "greater") {
correction <- 0.5
}
else if (alternative == "less") {
correction <- -0.5
}
else {
stop("Invalid alternative value.")
}
return(correction + n1*(n1 + 1)/4)
}
}
# Standard deviation of the reference normal distribution (non-exact cases) for Wilcoxon test.
wilcox_norm_dist_sd <- function(alternative, paired, statistic, n1, n2, tie_counts) {
if (!paired) {
sqrt(n1*n2/12*(n1 + n2 + 1 - sum(tie_counts^3 - tie_counts)/((n1 + n2)*(n1 + n2 -1))))
}
else {
sqrt(n1*(n1 + 1)*(2*n1 + 1)/6 - sum(tie_counts^3 - tie_counts)/12)/2
}
}
#' Wrapper for Wilcoxon rank sum test and signed-rank test for Analytics View
#' @export
#' @param conf.int - Whether to calculate estimate and confidence interval. Default FALSE. Passed to wilcox.test as part of ...
#' @param conf.level - Level of confidence for confidence interval. Passed to wilcox.test as part of ...
exp_wilcox <- function(df, var1, var2, func2 = NULL, test_sig_level = 0.05, paired = FALSE, ...) {
var1_col <- col_name(substitute(var1))
var2_col <- col_name(substitute(var2))
grouped_cols <- grouped_by(df)
if (!is.null(func2)) {
if (lubridate::is.Date(df[[var2_col]]) || lubridate::is.POSIXct(df[[var2_col]])) {
df <- df %>% dplyr::mutate(!!rlang::sym(var2_col) := extract_from_date(!!rlang::sym(var2_col), type=!!func2))
}
else if (is.numeric(df[[var2_col]])) {
df <- df %>% dplyr::mutate(!!rlang::sym(var2_col) := extract_from_numeric(!!rlang::sym(var2_col), type=!!func2))
}
}
# For logical explanatory variable, make it a factor and adjust label order so that
# the calculated difference is TRUE case - FALSE case, which intuitively makes better sense.
var2_logical <- is.logical(df[[var2_col]])
if (var2_logical) {
df <- df %>% dplyr::mutate(!!rlang::sym(var2_col) := factor(!!rlang::sym(var2_col), levels=c("TRUE", "FALSE")))
}
n_distinct_res <- n_distinct(df[[var2_col]]) # save n_distinct result to avoid repeating the relatively expensive call.
if (n_distinct_res != 2) {
if (n_distinct_res == 3 && any(is.na(df[[var2_col]]))) { # automatically filter NA to make number of category 2, if it is the 3rd category.
df <- df %>% dplyr::filter(!is.na(!!rlang::sym(var2_col)))
}
else {
stop("The explanatory variable needs to have 2 unique values.")
}
}
formula = as.formula(paste0('`', var1_col, '`~`', var2_col, '`'))
each_func <- function(df) {
tryCatch({
df <- df %>% dplyr::filter(!is.na(!!rlang::sym(var1_col))) # Remove NA from the target column.
if (nrow(df) == 0) {
stop("There is no data left after removing NA.")
}
if(length(grouped_cols) > 0) {
n_distinct_res_each <- n_distinct(df[[var2_col]]) # check n_distinct again within group after handling outlier.
if (n_distinct_res_each != 2) {
stop("The explanatory variable needs to have 2 unique values.")
}
}
# Adjust the factor levels since t.test consideres the 2nd level to be the base, which is not what we want.
# We keep the original df as is, since we want to keep the original factor order for display purpose.
if (var2_logical) {
df_test <- df
}
else if (is.numeric(df[[var2_col]])) {
# If numeric, we want the smaller number to be the base, e.g. in case of 0, 1, 0 should be the base.
df_test <- df %>% dplyr::mutate(!!rlang::sym(var2_col) := forcats::fct_rev(as.factor(!!rlang::sym(var2_col))))
}
else if (is.factor(df[[var2_col]])) {
# For factor, drop unused levels and revert the factor levels since we want the first actually used level to be the base, just like linear regression.
df_test <- df %>% dplyr::mutate(!!rlang::sym(var2_col) := forcats::fct_rev(forcats::fct_drop(!!rlang::sym(var2_col))))
}
else {
# For character and others, majority should become the base.
df_test <- df %>% dplyr::mutate(!!rlang::sym(var2_col) := forcats::fct_rev(forcats::fct_infreq(as.factor(!!rlang::sym(var2_col)))))
}
base.level <- levels(df_test[[var2_col]])[2]
model <- wilcox.test(formula, data = df_test, paired = paired, ...)
count_df <- df %>% group_by(!!rlang::sym(var2_col)) %>% dplyr::summarize(n=n())
class(model) <- c("wilcox_exploratory", class(model))
model$var1 <- var1_col
model$var2 <- var2_col
model$paired <- paired
model$n1 <- count_df$n[1]
model$n2 <- count_df$n[2]
model$test_sig_level <- test_sig_level
model$base.level <- base.level
model$data <- df
model
}, error = function(e){
if(length(grouped_cols) > 0) {
# In repeat-by case, we report group-specific error in the Summary table,
# so that analysis on other groups can go on.
class(e) <- c("wilcox_exploratory", class(e))
e
} else {
stop(e)
}
})
}
do_on_each_group(df, each_func, name = "model", with_unnest = FALSE)
}
#' @export
tidy.wilcox_exploratory <- function(x, type="model", conf_level=0.95) {
if (type == "model") {
if ("error" %in% class(x)) {
ret <- tibble::tibble(Note = x$message)
return(ret)
}
note <- NULL
ret <- broom:::tidy.htest(x)
if (!is.null(x$estimate)) { # Result is with estimate and confidence interval
ret <- ret %>% dplyr::select(statistic, p.value, estimate, conf.low, conf.high, method)
}
else {
ret <- ret %>% dplyr::select(statistic, p.value, method)
}
# Get sample sizes for the 2 groups (n1, n2).
data_summary <- x$data %>% dplyr::group_by(!!rlang::sym(x$var2)) %>%
dplyr::summarize(n_rows=length(!!rlang::sym(x$var1)))
n1 <- data_summary$n_rows[[1]] # number of 1st class
n2 <- data_summary$n_rows[[2]] # number of 2nd class
v1 <- data_summary[[x$var2]][[1]] # value for 1st class
v2 <- data_summary[[x$var2]][[2]] # value for 2nd class
# Switch the name of statistic based on the type of performed test.
if (stringr::str_detect(ret$method[[1]], "signed rank")) {
ret <- ret %>% dplyr::rename(`W Value`=statistic)
}
else if (stringr::str_detect(ret$method[[1]], "rank sum")) { # Intentionally matching with just "rank sum" to match "Wilcoxon rank sum exact test" too.
ret <- ret %>% dplyr::rename(`U Value`=statistic)
}
if (!is.null(x$estimate)) { # Result is with estimate and confidence interval
# wilcox.test, just like t.test, seems to consider the 2nd category based on alphabetical/numerical/factor sort as the base category.
# Since group_by/summarize also sorts the group based on alphabetical/numerical/factor order, we can assume that the v2 is the base category.
ret <- ret %>% dplyr::mutate(base.level = !!x$base.level)
ret <- ret %>% dplyr::relocate(base.level, .after = conf.high)
ret <- ret %>% dplyr::rename(`P Value`=p.value,
Difference=estimate,
`Conf High`=conf.high,
`Conf Low`=conf.low,
`Base Level`=base.level,
`Method`=method)
}
else {
ret <- ret %>% dplyr::mutate(base.level = !!x$base.level)
ret <- ret %>% dplyr::relocate(base.level, .after = p.value)
ret <- ret %>% dplyr::rename(`P Value`=p.value,
`Base Level`=base.level,
`Method`=method)
}
ret <- ret %>% dplyr::mutate(`Rows`=!!(n1+n2))
ret <- ret %>% dplyr::mutate(!!rlang::sym(paste0("Rows (", v1, ")")):=!!n1)
ret <- ret %>% dplyr::mutate(!!rlang::sym(paste0("Rows (", v2, ")")):=!!n2)
if (!is.null(note)) { # Code to add Note column if there was an error. Not used for this particular function yet.
ret <- ret %>% dplyr::mutate(Note=!!note)
}
}
else if (type == "data_summary") { #TODO consolidate with code in tidy.anova_exploratory
if ("error" %in% class(x)) {
ret <- tibble::tibble()
return(ret)
}
conf_threshold = 1 - (1 - conf_level)/2
ret <- x$data %>% dplyr::group_by(!!rlang::sym(x$var2)) %>%
dplyr::summarize(`Rows`=length(!!rlang::sym(x$var1)),
Mean=mean(!!rlang::sym(x$var1), na.rm=TRUE),
`Std Deviation`=sd(!!rlang::sym(x$var1), na.rm=TRUE),
# std error definition: https://www.rdocumentation.org/packages/plotrix/versions/3.7/topics/std.error
`Std Error of Mean`=sd(!!rlang::sym(x$var1), na.rm=TRUE)/sqrt(sum(!is.na(!!rlang::sym(x$var1)))),
# Note: Use qt (t distribution) instead of qnorm (normal distribution) here.
# For more detail take a look at 10.5.1 A slight mistake in the formula of "Learning Statistics with R"
`Conf High` = Mean + `Std Error of Mean` * qt(p=conf_threshold, df=`Rows`-1),
`Conf Low` = Mean - `Std Error of Mean` * qt(p=conf_threshold, df=`Rows`-1),
`Minimum`=min(!!rlang::sym(x$var1), na.rm=TRUE),
`Maximum`=max(!!rlang::sym(x$var1), na.rm=TRUE)) %>%
dplyr::select(!!rlang::sym(x$var2),
`Rows`,
Mean,
`Conf Low`,
`Conf High`,
`Std Error of Mean`,
`Std Deviation`,
`Minimum`,
`Maximum`)
}
else if (type == "prob_dist") {
if ("error" %in% class(x)) {
ret <- tibble::tibble()
return(ret)
}
tie_counts <- table(x$data[[x$var1]])
if (x$n1 < 50 && x$n2 < 50 && max(tie_counts) <= 1) { # Use exact method as wilcox.test does.
if (x$paired) {
ret <- generate_signrank_density_data(x$statistic, p.value=x$p.value, x$n1, sig_level=x$test_sig_level, alternative=x$alternative)
}
else {
ret <- generate_wilcox_density_data(x$statistic, p.value=x$p.value, x$n1, x$n2, sig_level=x$test_sig_level, alternative=x$alternative)
}
}
else {
mu <- wilcox_norm_dist_mean(x$alternative, x$paired, x$statistic, x$n1, x$n2)
sigma <- wilcox_norm_dist_sd(x$alternative, x$paired, x$statistic, x$n1, x$n2, tie_counts)
ret <- generate_norm_density_data(x$statistic, p.value=x$p.value, mu, sigma, sig_level=x$test_sig_level, alternative=x$alternative)
}
ret
}
else { # type == "data"
if ("error" %in% class(x)) {
ret <- tibble::tibble()
return(ret)
}
ret <- x$data
}
ret
}
# Function to perform the transformation
# column_list: e.g. list("col1", new_col=c("col2", "col3", "col4"))
gather_repeated_measures <- function(df, column_list, value_col_name) {
new_col_name <- names(column_list)[1]
cols_to_gather <- column_list[[1]]
if (length(column_list) > 1) {
cols_to_keep <- column_list[[1]]
cols_to_gather <- column_list[[2]]
new_col_name <- names(column_list[2])
}
# filter out rows that have NA in the columns to gather
df <- df %>% filter(across(all_of(cols_to_gather), ~ !is.na(.)))
df_transformed <- df %>%
dplyr::select(if (length(column_list) > 1) all_of(cols_to_keep), all_of(cols_to_gather)) %>%
tidyr::pivot_longer(
cols = all_of(cols_to_gather),
names_to = new_col_name,
values_to = value_col_name
)
return(df_transformed)
}
# Returns the column names in the long-format data returned by gather_repeated_measures is applied.
get_gather_repeated_measures_colnames <- function(column_list) {
if (length(column_list) > 1) { # This means 2-way repeated-measures ANOVA.
cols_to_keep <- column_list[[1]]
new_col_name <- names(column_list[2])
} else { # This means 1-way repeated-measures ANOVA.
cols_to_keep <- NULL
new_col_name <- names(column_list[1])
}
resulting_colnames <- c(cols_to_keep, new_col_name)
return(resulting_colnames)
}
#' ANOVA wrapper for Analytics View
#' @export
#' @param var1 - The column for dependent variable
#' @param var2 - The column for the categorical independent variable(s).
#' If it is a single column, it can be specified with unquoted column name.
#' If there are 2 columns (2-way ANOVA case), it is a character vector with 2 elements of character.
#' @param test_sig_level - Significance level for the t-test ifself.
#' @param sig.level - Significance level for power analysis.
exp_anova <- function(df, var1, var2, covariates = NULL, func2 = NULL, covariate_funs = NULL, test_sig_level = 0.05,
sig.level = 0.05, f = NULL, power = NULL, beta = NULL,
outlier_filter_type = NULL, outlier_filter_threshold = NULL,
with_interaction = FALSE, with_repeated_measures = FALSE,
...) {
if (!is.null(power) && !is.null(beta) && (power + beta != 1.0)) {
stop("Specify only one of Power or Type 2 Error, or they must add up to 1.0.")
}
if (is.null(power) && !is.null(beta)) {
power <- 1.0 - beta
}
var1_ <- substitute(var1)
if (class(var1_) == "name") { # To be able to handle wide-data too, we need to handle both unquoted existing column name, and quoted new column name.
var1_col <- col_name(var1_)
}
else {
var1_col <- var1
}
var2_ <- substitute(var2)
if (class(var2_) == "name") { # For backward compatibility with pre-6.13 when we only had one-way ANOVA.
var2_col <- col_name(var2_)
}
else {
var2_col <- var2
}
# If var2_col is a list, convert it to a character vector.
if (is.list(var2_col)) {
df <- gather_repeated_measures(df, var2_col, var1_col)
var2_col <- get_gather_repeated_measures_colnames(var2_col)
}
grouped_cols <- grouped_by(df)
# Note that applying preprocessing function is taken care of by the analytics view framework now, and this code applying func2
# is not currently used.
if (!is.null(func2)) {
for (i in 1:length(func2)) {
if (lubridate::is.Date(df[[var2_col[i]]]) || lubridate::is.POSIXct(df[[var2_col[i]]])) {
df <- df %>% dplyr::mutate(!!rlang::sym(var2_col[i]) := extract_from_date(!!rlang::sym(var2_col[i]), type=!!func2[i]))
}
else if (is.numeric(df[[var2_col[i]]])) {
df <- df %>% dplyr::mutate(!!rlang::sym(var2_col[i]) := extract_from_numeric(!!rlang::sym(var2_col[i]), type=!!func2[i]))
}
}
}
for (i in 1:length(var2_col)) {
if (n_distinct(df[[var2_col[i]]]) < 2) {
stop(paste0("The explanatory variable needs to have 2 or more unique values."))
}
}
# If explanatory variable(s) are numeric, convet them into factor.
for (i in 1:length(var2_col)) {
if (is.numeric(df[[var2_col[[i]]]])) {
df[[var2_col[[i]]]] <- factor(df[[var2_col[[i]]]])
}
}
# Apply preprocessing functions to the covariates.
if (!is.null(covariates) && !is.null(covariate_funs)) {
df <- df %>% mutate_predictors(covariates, covariate_funs)
# Expand the set of modified column names after the mutate_predictors call.
covariates <- names(unlist(covariate_funs))
}
# For ANCOVA or one-way ANOVA, parepare common var2 order to display un-adjusted/adjusted means everywhere.
if (!is.null(covariates) || length(var2_col) == 1) {
common_var2_order <- (df %>% ungroup() %>% group_by(!!rlang::sym(var2_col)) %>% summarize(mean=mean(!!rlang::sym(var1_col), na.rm=TRUE)) %>% arrange(desc(mean)))[[var2_col]]
}
anova_each <- function(df) {
tryCatch({
# Keep only the relevant columns.
df <- df %>% dplyr::select(c(var1_col, var2_col, covariates))
# Replace column names with names like c1_, c2_...
clean_df <- df
names(clean_df) <- paste0("c",1:length(colnames(clean_df)), "_")
# Forward mapping of column names.
name_map <- colnames(clean_df)
names(name_map) <- colnames(df)
# Reverse mapping of variable names.
terms_mapping <- names(name_map)
names(terms_mapping) <- name_map
var1_col <- name_map[var1_col]
var2_col <- name_map[var2_col]
names(var2_col) <- NULL
if (!is.null(covariates)) {
covariates <- name_map[covariates]
}
df <- clean_df
if (with_repeated_measures) {
# For repeated measures ANOVA, add a column for the subject ID.
# If 2-way, this is a mixed-design repeated-measures model. Treat only the 2nd variable as a repeated measure.
# This have to be done after the column name mapping so that subject_id is not mapped.
df <- df %>% group_by(!!rlang::sym(var2_col[length(var2_col)])) %>% mutate(subject_id = row_number()) %>% ungroup()
}
df <- df %>% dplyr::filter(!is.na(!!rlang::sym(var1_col))) # Remove NA from the target column.
if (nrow(df) == 0) {
stop("There is no data left after removing NA.")
}
if (is.null(covariates)) { # 2-way/1-way ANOVA case
if (with_interaction) {
collapse_str <- "`*`"
}
else {
collapse_str <- "`+`"
}
if (with_repeated_measures) {
# If 2-way, mixed-design repeated-measures model. Treat only the 2nd variable as a repeated measure.
formula <- as.formula(paste0('`', var1_col, '`~`', paste(var2_col, collapse=collapse_str), '` + Error(subject_id / (`', var2_col[length(var2_col)], '`))'))
}
else {
formula <- as.formula(paste0('`', var1_col, '`~`', paste(var2_col, collapse=collapse_str), '`'))
}
}
else { # ANCOVA case
if (!with_interaction) {
formula <- as.formula(paste0('`', var1_col, '`~`', var2_col, '`+`', paste(covariates, collapse="`+`"), '`'))
}
else { # Calculating interaction only with the first covariate for simplicity for now.
formula <- as.formula(paste0('`', var1_col, '`~`', var2_col, '`*`', covariates[1], '`'))
}
}
# For ANCOVA/2-way ANOVA case. Prepare contrasts setting to match SPSS.
# http://www.statscanbefun.com/rblog/2015/8/27/ensuring-r-generates-the-same-anova-f-values-as-spss
if (!is.null(covariates) || length(var2_col) > 1) {
contrasts_list <- as.list(rep("contr.helmert", length(var2_col)))
names(contrasts_list) <- var2_col
}
else {
contrasts_list <- list()
}
if (!is.null(outlier_filter_type)) { #TODO: duplicated code with exp_ttest.
is_outlier <- function(x) {
res <- detect_outlier(x, type=outlier_filter_type, threshold=outlier_filter_threshold) %in% c("Lower", "Upper")
res
}
df$.is.outlier <- FALSE #TODO: handle possibility of name conflict.
df$.is.outlier <- df$.is.outlier | is_outlier(df[[var1_col]])
df <- df %>% dplyr::filter(!.is.outlier)
df$.is.outlier <- NULL
}
if(length(grouped_cols) > 0) {
# Check n_distinct again within group after handling outliers.
# Group with NA and another category does not seem to work well with aov. Eliminating such case too. TODO: We could replace NA with an explicit level.
for (i in 1:length(var2_col)) {
n_distinct_res_each <- n_distinct(df[[var2_col[i]]], na.rm=TRUE)
if (n_distinct_res_each < 2) {
stop(paste0("The explanatory variable needs to have 2 or more unique values."))
}
}
}
# It seems that the 2nd row of broom:::tidy.aov(x) is missed, if no group has more than 1 row. Check it here, rather than handling it later.
count_df <- df %>% group_by(!!!rlang::syms(as.character(var2_col))) %>% summarize(n=n()) %>% ungroup() %>% summarize(max_n=max(n),tot_n=sum(n))
if (count_df$max_n <= 1) {
e <- simpleError("At least one group needs to have 2 or more rows.")
class(e) <- c("anova_exploratory", class(e))
e$n <- count_df$tot_n
return(e)
}
if (with_repeated_measures) {
model <- afex::aov_car(formula, data = df, type = "III")
}
else if (!is.null(covariates) || length(var2_col) > 1) {
# For ANCOVA/2-way ANOVA, use lm() rather than aov(), since we need F statistic and P value as a lm in our summary later.
model <- lm(formula, data = df, contrasts = contrasts_list, ...)
# Calculate type 3 some of square and attach to the model.
tryCatch({
ss3 <- broom::tidy(car::Anova(model, type="III"))
}, error = function(e) { # This can fail depending on the data.
# With 2-way ANOVA with interaction, car::Anova(x, type="III") fails with "there are aliased coefficients in the model" when there are empty cells.
if (with_interaction && stringr::str_detect(e$message, "there are aliased coefficients in the model")) {
stop('EXP-ANA-9 :: [] :: Most likely there is a combination of categories with no rows. Try exclusing interaction.')
}
else {
stop(e)
}
})
model$ss3 <- ss3
}
else {
model <- aov(formula, data = df)
}
# calculate Cohen's f from actual data #TODO: Support 2-way case. Also, is this valid for ANCOVA?
if (length(var2_col) == 1) {
model$cohens_f <- calculate_cohens_f(df[[var1_col]], df[[var2_col]])
}
if (is.null(f)) {
model$cohens_f_to_detect <- model$cohens_f
}
else {
model$cohens_f_to_detect <- f
}
class(model) <- c("anova_exploratory", class(model))
model$var1 <- var1_col
model$var2 <- var2_col
if (!is.null(covariates)) {
model$covariates <- covariates
}
if (!is.null(covariates) || length(var2_col) == 1) {
model$common_var2_order <- as.character(common_var2_order) # as.character is to strip names.
}
model$terms_mapping <- terms_mapping
model$dataframe <- df # model$data is used in afex_aov class. Using model$dataframe to avoid conflict.
model$test_sig_level <- test_sig_level
model$sig.level <- sig.level
model$power <- power
model$with_interaction <- with_interaction
model$with_repeated_measures <- with_repeated_measures
model
}, error = function(e){
if(length(grouped_cols) > 0 && !str_detect(e$message, 'EXP-ANA')) {
# In repeat-by case, we report group-specific error in the Summary table,
# so that analysis on other groups can go on.
# Also, since translation mechanism for EXP-ANA-xxx message is not there on the route with Note column,
# fail over to just throwing error for those errors.
class(e) <- c("anova_exploratory", class(e))
e
} else {
stop(e)
}
})
}
do_on_each_group(df, anova_each, name = "model", with_unnest = FALSE)
}
#' @export
glance.anova_exploratory <- function(x) {
if ("error" %in% class(x)) {
ret <- tibble::tibble(Note = x$message)
return(ret)
}
ret <- broom:::tidy.aov(x) %>% slice(1:1) # there is no glance.aov. take first row of tidy.aov.
# Term value from tidy.aov() can be garbled on Windows with multibyte column name. Overwrite with not-garled value.
if (!is.null(ret$term) && length(ret$term) > 0 && !is.null(x$xlevels) && length(x$xlevels) > 0) {
ret$term[[1]] <- names(x$xlevels)[[1]]
}
ret
}
# Returns a data frame for pairwise contrast. This is a common utility function for "pairs" and "pairs_per_variable" type of the tidier.
get_pairwise_contrast_df <- function(x, formula, pairs_adjust) {
emm_fit <- emmeans::emmeans(x, formula)
if (length(levels(emm_fit)) >=2 && length(levels(emm_fit)$c3_) >= 2) { # 2-way ANOVA (repeated measures or regular)
c2_levels <- levels(emm_fit)$c2_
c3_levels <- levels(emm_fit)$c3_
# Map the levels to numbers, so that we can stably parse the output contrast column.
levels(emm_fit)$c2_ <- 1:length(levels(emm_fit)$c2_)
levels(emm_fit)$c3_ <- 1:length(levels(emm_fit)$c3_)
pw_comp <- emmeans::contrast(emm_fit, "pairwise", adjust=pairs_adjust, enhance.levels=FALSE)
ret <- tibble::as.tibble(pw_comp)
# Get confidence interval.
emm_ci <- confint(pw_comp, level=0.95)
# Bind confint to the base table immediately before arrange().
ret <- ret %>% dplyr::mutate(conf.low=!!emm_ci$lower.CL, conf.high=!!emm_ci$upper.CL)
ret <- ret %>% dplyr::relocate(conf.low, conf.high, .after=estimate)
ret <- ret %>% tidyr::separate(contrast, into = c("pair1", "pair2"), sep = " - ", extra = "merge")
ret <- ret %>% tidyr::separate(pair1, into = c("pair1_1", "pair1_2"), sep = " ", extra = "merge")
ret <- ret %>% tidyr::separate(pair2, into = c("pair2_1", "pair2_2"), sep = " ", extra = "merge")
ret <- ret %>% mutate(pair1_1=factor(c2_levels[as.integer(pair1_1)], levels=c2_levels))
ret <- ret %>% mutate(pair1_2=factor(c3_levels[as.integer(pair1_2)], levels=c3_levels))
ret <- ret %>% mutate(pair2_1=factor(c2_levels[as.integer(pair2_1)], levels=c2_levels))
ret <- ret %>% mutate(pair2_2=factor(c3_levels[as.integer(pair2_2)], levels=c3_levels))
ret <- ret %>% arrange(pair1_1, pair1_2, pair2_1, pair2_2)
# Concat pair1_1 and pair1_2 with ":" and name it as "Group 1".
# Concat pair2_1 and pair2_2 with ":" and name it as "Group 2".
ret <- ret %>% dplyr::mutate(`Group 1`=stringr::str_c(pair1_1, " : ", pair1_2))
ret <- ret %>% dplyr::mutate(`Group 2`=stringr::str_c(pair2_1, " : ", pair2_2))
ret <- ret %>% dplyr::select(-pair1_1, -pair1_2, -pair2_1, -pair2_2)
ret <- ret %>% dplyr::select(`Group 1`, `Group 2`, everything())
}
else { # Only 1 variable. Either c2_ or c3_.
if (!is.null(levels(emm_fit)$c2_)) {# c2_ case
var_levels <- levels(emm_fit)$c2_
levels(emm_fit)$c2_ <- 1:length(levels(emm_fit)$c2_)
}
else { # c3_ case
var_levels <- levels(emm_fit)$c3_
levels(emm_fit)$c3_ <- 1:length(levels(emm_fit)$c3_)
}
pw_comp <- emmeans::contrast(emm_fit, "pairwise", adjust=pairs_adjust, enhance.levels=FALSE)
ret <- tibble::as.tibble(pw_comp)
# Get confidence interval.
emm_ci <- confint(pw_comp, level=0.95)
# Bind confint to the base table immediately before arrange().
ret <- ret %>% dplyr::mutate(conf.low=!!emm_ci$lower.CL, conf.high=!!emm_ci$upper.CL)
ret <- ret %>% dplyr::relocate(conf.low, conf.high, .after=estimate)
ret <- ret %>% tidyr::separate(contrast, into = c("var1", "var2"), sep = " - ", extra = "merge")
if (length(levels(emm_fit)) >=2) { # This means ANCOVA case. Strip the covariate value after the independent variable. e.g. "2 1.97101449275362"
ret <- ret %>% mutate(var1=stringr::str_remove(var1, " .*"))
ret <- ret %>% mutate(var2=stringr::str_remove(var2, " .*"))
}
ret <- ret %>% mutate(var1=var_levels[as.integer(var1)])
ret <- ret %>% mutate(var2=var_levels[as.integer(var2)])
ret <- ret %>% arrange(var1, var2)
}
# Map the column names back to the original.
orig_terms <- x$terms_mapping[colnames(ret)]
orig_terms[is.na(orig_terms)] <- colnames(ret)[is.na(orig_terms)] # Fill the column names that did not have a matching mapping.
colnames(ret) <- orig_terms
# Example output:
# A tibble: 1 × 7
# contrast `w t` estimate SE df t.ratio p.value
# <fct> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
# c2_0 - c2_1 1.12 1.38 1.38 28 1.00 0.325
ret <- ret %>% dplyr::rename(any_of(c(
`Pair 1 Var 1`="pair1_1",
`Pair 1 Var 2`="pair1_2",
`Pair 2 Var 1`="pair2_1",
`Pair 2 Var 2`="pair2_2",
`Group 1`="var1",
`Group 2`="var2",
`Conf High`="conf.high",
`Conf Low`="conf.low",
`Standard Error`="SE",
`DF`="df",
`t Value`="t.ratio",
`P Value`="p.value")))
if (!is.null(x$covariates)) { # ANCOVA case
ret <- ret %>% dplyr::rename(any_of(c(`Adjusted Difference`="estimate")))
} else { # 2-way ANOVA case. For 2-way ANOVA, there is no adjustment here.
ret <- ret %>% dplyr::rename(any_of(c(`Difference`="estimate")))
}
# The version that uses multcomp. It had an issue with column names with spaces.
# ret <- eval(parse(text=paste0('multcomp::glht(x, linfct = multcomp::mcp(`', x$var2, '`="Tukey"))')))
# ret <- broom::tidy(ret)
# Output example:
# A tibble: 1 × 7
# term contrast null.value estimate std.error statistic adj.p.value
# <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
# am 1 - 0 0 -0.0236 1.55 -0.0153 0.988
method <- switch(pairs_adjust,
"none" = "Pairwise T-Test with No Adjustment",
"tukey" = "Tukey's HSD Test",
"bonferroni" = "Pairwise T-Test with Bonferroni Correction",
"sheffe" = "Sheffe's Method",
"sidak" = "Pairwise T-Test with Sidak Correction",
"dunnett" = "Dunnett's Test",
"holm" = "Pairwise T-Test with Holm Correction",
"hochberg" = "Pairwise T-Test with Hochberg Correction"
)
ret <- ret %>% dplyr::mutate(`Method`=!!method)
ret
}
#' @export
tidy.anova_exploratory <- function(x, type="model", conf_level=0.95, pairs_adjust="none", levene_test_center="median", shapiro_seed=1, sort_factor_levels=FALSE) {
if (type %in% c("model", "between", "within")) {
if ("error" %in% class(x)) {
if (is.null(x$message) || x$message == "") {
# It seems there are some cases where x$message is an empty string. Get the error message with as.character().
message <- as.character(x)
}
else {
message <- x$message
}
# x$n can match something like x$nxyz. Check if it is numeric to make sure it is the number of rows we set.
if (!is.null(x$n) && is.numeric(x$n)) {
ret <- tibble::tibble(`Rows`=x$n, Note = message)
}
else {
ret <- tibble::tibble(Note = message)
}
return(ret)
}
note <- NULL
# Power analysis is for one-way ANOVA case only.
one_way_anova_without_repeated_measures <- is.null(x$covariates) && (length(x$var2) == 1) && !x$with_repeated_measures
if (one_way_anova_without_repeated_measures) { # one-way ANOVA case
ret <- broom:::tidy.aov(x)
} else if (x$with_repeated_measures) { # For repeated measures ANOVA, we need to extract results from Anova.mlm object from car package.
x_summary <- summary(x$Anova)
x_matrix <- matrix(as.numeric(x_summary$univariate.tests), ncol=ncol(x_summary$univariate.tests))
colnames(x_matrix) <- colnames(x_summary$univariate.tests)
ret <- as.data.frame(x_matrix)
ret$term <- rownames(x_summary$univariate.tests)
# Add info on p-value adjustment for departure from sphericity.
ret2 <- as.data.frame(x_summary$pval.adjustments)
ret2$term <- rownames(ret2)
ret <- ret %>% dplyr::rename(p.value='Pr(>F)')
total <- sum((x$dataframe[[x$var1]]-mean(x$dataframe[[x$var1]]))^2, na.rm=TRUE) # SS with subtracting mean.
ret <- ret %>% dplyr::mutate(`Eta Squared`=`Sum Sq`/!!total)
ret <- ret %>% dplyr::mutate(`Partial Eta Squared`=`Sum Sq`/(`Sum Sq`+`Error SS`))
# Remove pointless negative eta squared.
ret <- ret %>% dplyr::mutate(`Cohen's F`=ifelse(`Eta Squared`<1, sqrt(`Eta Squared`/(1-`Eta Squared`)), NA_real_))
ret <- ret %>% dplyr::mutate(`Omega Squared`=(`Sum Sq`-`num Df`*(`Error SS`/`den Df`))/(total+(`Error SS`/`den Df`)))
ret_err <- ret %>% dplyr::select(term, `Sum Sq`="Error SS", df="den Df")
ret <- ret %>% select(-`Error SS`, -`den Df`, df="num Df")
ret_gg <- ret2 %>% dplyr::select(term, eps="GG eps", p.value="Pr(>F[GG])") %>% dplyr::mutate(correction="Greenhouse-Geisser")
ret_hf <- ret2 %>% dplyr::select(term, eps="HF eps", p.value="Pr(>F[HF])") %>% dplyr::mutate(correction="Huynh-Feldt")
ret_gg <- ret_gg %>% dplyr::mutate(`Type of Variance`="Between Subjects")
ret_hf <- ret_hf %>% dplyr::mutate(`Type of Variance`="Between Subjects")
ret <- ret %>% dplyr::mutate(`Type of Variance`="Between Subjects")
ret_err <- ret_err %>% dplyr::mutate(`Type of Variance`="Within Subjects")
ret <- dplyr::bind_rows(ret, ret_err)
# Rename to make Total/Corrected Total row addition code consistent with other cases.
ret <- ret %>% dplyr::rename(sumsq=`Sum Sq`)
if (any(!is.na(ret_gg$eps))) ret <- dplyr::bind_rows(ret, ret_gg)
if (any(!is.na(ret_hf$eps))) ret <- dplyr::bind_rows(ret, ret_hf)
} else { # ANCOVA/2-way ANOVA case
ret <- x$ss3
}
if (one_way_anova_without_repeated_measures) {
# Get number of groups (k) , and the minimum sample size among those groups (min_n_rows).
data_summary <- x$dataframe %>% dplyr::group_by(!!rlang::sym(x$var2)) %>%
dplyr::summarize(n_rows=length(!!rlang::sym(x$var1))) %>%
dplyr::summarize(min_n_rows=min(n_rows), tot_n_rows=sum(n_rows), k=n())
k <- data_summary$k
# Using minimum group sample size as the sample size for power calculation.
# Reference: https://www.theanalysisfactor.com/when-unequal-sample-sizes-are-and-are-not-a-problem-in-anova/
min_n_rows <- data_summary$min_n_rows
tot_n_rows <- data_summary$tot_n_rows
}
if (x$with_repeated_measures) {
# Map ANOVA table rows to SPSS style with the following rules.
if (length(x$var2) == 1) { # one-way case
# Within Subjects,c2_ => Within Subjects,Error(c2_)
ret <- ret %>% mutate(term=ifelse(`Type of Variance`=="Within Subjects"&term=="c2_","Error(c2_)",term))
# Within Subjects,(Intercept) => Between Subjects,(Error)
ret <- ret %>% mutate(term = ifelse(`Type of Variance` == "Within Subjects" & term == "(Intercept)", "(Error)", term), `Type of Variance` = ifelse(`Type of Variance` == "Within Subjects" & term == "(Error)", "Between Subjects", `Type of Variance`))
# Between Subjects,c2_ => Within Subjects,c2_
ret <- ret %>% mutate(`Type of Variance` = ifelse(`Type of Variance` == "Between Subjects" & term == "c2_", "Within Subjects", `Type of Variance`))
}
else { # 2-way case
# Within Subjects,c2_:c3_ => remove
# Within Subjects,c3_ => remove
ret <- ret %>% filter(!(`Type of Variance`=="Within Subjects" & term=="c2_:c3_" | `Type of Variance`=="Within Subjects" & term=="c2_"))
# Within Subjects,c3_ => Within Subjects,Error(c3_)
ret <- ret %>% mutate(term=ifelse(`Type of Variance`=="Within Subjects"&term=="c3_","Error(c3_)",term))
# Within Subjects,(Intercept) => Between Subjects,(Error)
ret <- ret %>% mutate(term = ifelse(`Type of Variance` == "Within Subjects" & term == "(Intercept)", "(Error)", term), `Type of Variance` = ifelse(`Type of Variance` == "Within Subjects" & term == "(Error)", "Between Subjects", `Type of Variance`))
# Between Subjects,c3_ => Within Subjects,c3_
ret <- ret %>% mutate(`Type of Variance` = ifelse(`Type of Variance` == "Between Subjects" & term == "c3_", "Within Subjects", `Type of Variance`))
# Between Subjects,c2_:c3_ => Within Subjects,c2_:c3_
ret <- ret %>% mutate(`Type of Variance` = ifelse(`Type of Variance` == "Between Subjects" & term == "c2_:c3_", "Within Subjects", `Type of Variance`))
}
# Sort by the type of variance and the order of appearance of the terms.
ret <- ret %>% arrange(`Type of Variance`, match(term, unique(term)))
# Map the variable names in the term column back to the original.
terms_mapping <- x$terms_mapping
# Add mapping for interaction term and error term.
terms_mapping <- c(terms_mapping,c(`c2_:c3_`=paste0(terms_mapping["c2_"], " * ", terms_mapping["c3_"])))
terms_mapping <- c(terms_mapping,c(`Error(c3_)`=paste0("Error(", terms_mapping["c3_"], ")")))
terms_mapping <- c(terms_mapping,c(`Error(c2_)`=paste0("Error(", terms_mapping["c2_"], ")")))
orig_term <- terms_mapping[ret$term]
orig_term[is.na(orig_term)] <- ret$term[is.na(orig_term)] # Fill the element that did not have a matching mapping. (Should be "Residual")
ret$term <- orig_term
ret <- ret %>% dplyr::mutate(`Mean Square`=sumsq/df)
ret <- ret %>% dplyr::mutate(ssr=sumsq/!!total)
# Relocate term column to the first column.
ret <- ret %>% dplyr::relocate(`Type of Variance`, term, .before = 1)
ret <- ret %>% dplyr::relocate(`Mean Square`, .after=df)
ret <- ret %>% dplyr::relocate(ssr, .after=sumsq)
if (!is.null(ret$correction)) {
ret <- ret %>% dplyr::relocate(eps, .before = p.value)
ret <- ret %>% dplyr::relocate(correction, .after = term)
ret <- ret %>% dplyr::rename(`Correction`="correction",
`Epsilon`="eps"
)
}
ret <- ret %>% dplyr::rename(`Variable`="term", `P Value`="p.value",
`Sum of Squares`="sumsq",
`SS Ratio`="ssr",
`DF` = "df",
`F Value` = "F value"
)
if (type == "between") {
ret <- ret %>% dplyr::filter(`Type of Variance` == "Between Subjects") %>% dplyr::select(-`Type of Variance`)
}
else if (type == "within") {
ret <- ret %>% dplyr::filter(`Type of Variance` == "Within Subjects") %>% dplyr::select(-`Type of Variance`)
}
# Because of the filtering above, it is possible that the Correction/Epsilon column is all NA. If so, remove it.
if (!is.null(ret$Correction) && all(is.na(ret$Correction))) {
ret <- ret %>% dplyr::select(-`Correction`, -`Epsilon`)
}
ret
}
else if (is.null(x$power)) {
ret <- ret %>% dplyr::select(any_of(c("term", "sumsq", "df", "meansq", "statistic", "p.value")))
if (one_way_anova_without_repeated_measures) { # Power analysis is only for ANOVA case
# If power is not specified in the arguments, estimate current power.
tryCatch({ # pwr function can return error from equation resolver. Catch it rather than stopping the whole thing.
power_res <- pwr::pwr.anova.test(k = k, n= min_n_rows, f = x$cohens_f_to_detect, sig.level = x$sig.level)
power_val <- power_res$power
}, error = function(e) {
note <<- e$message
power_val <<- NA_real_
})
ret <- ret %>% dplyr::mutate(power=c(!!power_val, rep(NA, n()-1)), beta=c(1.0-!!power_val, rep(NA, n()-1)), n=c(!!tot_n_rows, rep(NA, n()-1)))
}
# Map the variable names in the term column back to the original.
terms_mapping <- x$terms_mapping
# Add mapping for interaction term
terms_mapping <- c(terms_mapping,c(`c2_:c3_`=paste0(terms_mapping["c2_"], " * ", terms_mapping["c3_"])))
orig_term <- terms_mapping[ret$term]
orig_term[is.na(orig_term)] <- ret$term[is.na(orig_term)] # Fill the element that did not have a matching mapping. (Should be "Residual")
ret$term <- orig_term
if (one_way_anova_without_repeated_measures) { # One-way ANOVA case
total <- sum(ret$sumsq)
total_df <- sum(ret$df)
error_sumsq <- (ret %>% filter(term=="Residuals"))$sumsq
error_meansq <- (ret %>% filter(term=="Residuals"))$meansq
ret <- ret %>% dplyr::mutate(`Eta Squared`=sumsq/!!total)
ret <- ret %>% dplyr::mutate(`Partial Eta Squared`=sumsq/(sumsq+error_sumsq))
ret <- ret %>% dplyr::mutate(`Cohen's F`=ifelse(`Eta Squared`<1, sqrt(`Eta Squared`/(1-`Eta Squared`)), NA_real_))
ret <- ret %>% dplyr::mutate(`Omega Squared`=(sumsq-df*error_meansq)/(total+error_meansq))
# Set NA to the above effect sizes if the term is "Residuals" or "(Corrected Model)".
ret <- ret %>% dplyr::mutate(`Eta Squared`=ifelse(term %in% c("Residuals"), NA_real_, `Eta Squared`))
ret <- ret %>% dplyr::mutate(`Partial Eta Squared`=ifelse(term %in% c("Residuals"), NA_real_, `Partial Eta Squared`))
ret <- ret %>% dplyr::mutate(`Cohen's F`=ifelse(term %in% c("Residuals"), NA_real_, `Cohen's F`))
ret <- ret %>% dplyr::mutate(`Omega Squared`=ifelse(term %in% c("Residuals"), NA_real_, `Omega Squared`))
ret <- ret %>% dplyr::add_row(sumsq = total, df = total_df)
ret <- ret %>% dplyr::mutate(ssr = sumsq/total)
ret <- ret %>% dplyr::relocate(ssr, .after = sumsq)
# x$terms_mapping[[2]] is the column name of the explanatory variable.
ret <- ret %>% dplyr::mutate(term = c(x$terms_mapping[[2]], "(Residuals)", "(Total)"))
ret <- ret %>% dplyr::rename(`Type of Variance`="term")
}
else { # ANCOVA/2-way ANOVA case
total <- sum((x$dataframe[[x$var1]]-mean(x$dataframe[[x$var1]]))^2, na.rm=TRUE) # SS with subtracting mean.
# total <- sum((broom:::tidy.aov(x))$sumsq) # Total SS could be calculated from summing up the type 1 SS, but tidy.aov does not work on x which is generated with lm() rather than aov().
total0 <- sum(x$dataframe[[x$var1]]^2, na.rm=TRUE) # SS without subtracting mean.
total_df <- sum(ret$df)
lm_summary <- broom:::glance.lm(x)
error_sumsq <- (ret %>% filter(term=="Residuals"))$sumsq
model_sumsq <- total - error_sumsq
# Exclude "(Corrected Model)" row for now.
# ret <- ret %>% dplyr::add_row(term="(Corrected Model)", sumsq = model_sumsq,
# statistic = lm_summary$statistic,
# p.value = lm_summary$p.value,
# df = lm_summary$df, .before = 1)
ret <- ret %>% dplyr::mutate(meansq = sumsq/df)
error_meansq <- (ret %>% filter(term=="Residuals"))$meansq
ret <- ret %>% dplyr::relocate(meansq, .after = df)
ret <- ret %>% dplyr::mutate(`Eta Squared`=sumsq/!!total)
ret <- ret %>% dplyr::mutate(`Partial Eta Squared`=sumsq/(sumsq+error_sumsq))
# Remove pointless negative eta squared.
ret <- ret %>% dplyr::mutate(`Cohen's F`=ifelse(`Eta Squared`<1, sqrt(`Eta Squared`/(1-`Eta Squared`)), NA_real_))
ret <- ret %>% dplyr::mutate(`Omega Squared`=(sumsq-df*error_meansq)/(total+error_meansq))
# Set NA to the above effect sizes if the term is "Residuals" or "(Corrected Model)".
ret <- ret %>% dplyr::mutate(`Eta Squared`=ifelse(term %in% c("Residuals", "(Corrected Model)"), NA_real_, `Eta Squared`))
ret <- ret %>% dplyr::mutate(`Partial Eta Squared`=ifelse(term %in% c("Residuals", "(Corrected Model)"), NA_real_, `Partial Eta Squared`))
ret <- ret %>% dplyr::mutate(`Cohen's F`=ifelse(term %in% c("Residuals", "(Corrected Model)"), NA_real_, `Cohen's F`))
ret <- ret %>% dplyr::mutate(`Omega Squared`=ifelse(term %in% c("Residuals", "(Corrected Model)"), NA_real_, `Omega Squared`))
ret <- ret %>% dplyr::add_row(term="(Total)", sumsq = total, df = total_df-1)
ret <- ret %>% dplyr::mutate(term = if_else(term=="Residuals", "(Residuals)", term))
ret <- ret %>% dplyr::mutate(ssr = sumsq/!!total)
ret <- ret %>% dplyr::relocate(ssr, .after = sumsq)
# Remove "(Intercept)" from the result.
ret <- ret %>% dplyr::filter(term!="(Intercept)")
ret <- ret %>% dplyr::rename(`Variable`="term")
}
ret <- ret %>% dplyr::rename(any_of(c(`F Value`="statistic",
`P Value`="p.value",
`Sum of Squares`="sumsq",
`DF`="df",
`Mean Square`="meansq",
`SS Ratio`="ssr",
`Effect Size (Cohen's f)`="f",
`Power`="power",
`Type 2 Error`="beta",
`Rows`="n")))
}
else { # Since we do not support power analysis for ANCOVA or 2-way ANOVA, this is only for one-way ANOVA case.
# If required power is specified in the arguments, estimate required sample size.
tryCatch({ # pwr function can return error from equation resolver. Catch it rather than stopping the whole thing.
power_res <- pwr::pwr.anova.test(k = k, f = x$cohens_f_to_detect, sig.level = x$sig.level, power = x$power)
required_sample_size <- power_res$n
}, error = function(e) {
note <<- e$message
required_sample_size <<- NA_real_
})
ret <- ret %>% dplyr::select(term, sumsq, df, meansq, statistic, p.value) %>%
dplyr::mutate(f=c(!!(x$cohens_f), NA), power=c(!!(x$power), NA), beta=c(1.0-!!(x$power), NA)) %>%
dplyr::mutate(current_sample_size=c(!!min_n_rows, NA), required_sample_size=c(!!required_sample_size, NA), n=c(!!tot_n_rows, NA))
ret <- ret %>% dplyr::add_row(sumsq = sum(ret$sumsq), df = sum(ret$df))
ret <- ret %>% dplyr::mutate(ssr = sumsq/sumsq[3])
ret <- ret %>% dplyr::relocate(ssr, .after = sumsq)
# x$terms_mapping[[2]] is the column name of the explanatory variable.
ret <- ret %>% dplyr::mutate(term = c(x$terms_mapping[[2]], "(Residuals)", "(Total)"))
ret <- ret %>% dplyr::rename(`Type of Variance`=term,
`F Value`=statistic,
`P Value`=p.value,
`DF`=df,
`Sum of Squares`=sumsq,
`SS Ratio`=ssr,
`Mean Square`=meansq,
`Effect Size (Cohen's f)`=f,
`Target Power`=power,
`Target Type 2 Error`=beta,
`Current Sample Size (Each Group)`=current_sample_size,
`Required Sample Size (Each Group)`=required_sample_size,
`Rows`=n)
}
if (!is.null(note)) { # Add Note column, if there was an error from pwr function.
ret <- ret %>% dplyr::mutate(Note=!!note)
}
}
else if (type == "multivariate") {
mvt <- summary(x$Anova)$multivariate.tests
ret <- NULL
for (var_name in names(mvt)) {
var_mvt <- mvt[[var_name]]
eigs <- Re(eigen(qr.coef(qr(var_mvt$SSPE), var_mvt$SSPH), symmetric = FALSE)$values)
car_ns <- getNamespace("car")
# Iterate over results from different test methods.
for (func_name in c("Pillai", "Wilks", "HL", "Roy")) {
var_mvt_res <- get(func_name, envir=car_ns)(eigs, var_mvt$df, var_mvt$df.residual)
names(var_mvt_res) <- c("test stat", "approx F", "num Df", "den Df")
var_mvt_res_df <- as.data.frame(as.list(var_mvt_res))
var_mvt_res_df <- var_mvt_res_df %>% dplyr::mutate(term=var_name, method=func_name)
if (is.null(ret)) {
ret <- var_mvt_res_df
}
else {
ret <- dplyr::bind_rows(ret, var_mvt_res_df)
}
}
}
ret <- ret %>% mutate(p.value=pf(approx.F, num.Df, den.Df, lower.tail = FALSE))
# Map the variable names in the term column back to the original.
terms_mapping <- x$terms_mapping
# Add mapping for interaction term
terms_mapping <- c(terms_mapping,c(`c2_:c3_`=paste0(terms_mapping["c2_"], " * ", terms_mapping["c3_"])))
orig_term <- terms_mapping[ret$term]
orig_term[is.na(orig_term)] <- ret$term[is.na(orig_term)] # Fill the element that did not have a matching mapping. (Should be "Residual")
ret$term <- orig_term
ret <- ret %>% dplyr::relocate(term, method, .before = 1)
ret <- ret %>% dplyr::rename(`Variable`="term", `Method`="method", `Test Statistic`="test.stat", `Approximate F Value`="approx.F", `Hypothesis DF`="num.Df", `Error DF`="den.Df", `P Value`="p.value")
}
else if (type == "sphericity") {
summary_x <- summary(x$Anova)
s_matrix <- as.matrix(summary_x$sphericity.tests)
class(s_matrix) <- "matrix" # Necessary to make it work with as.data.frame.
ret <- as.data.frame(s_matrix)
ret ["term"] <- rownames(ret)
# Map the variable names in the term column back to the original.
terms_mapping <- x$terms_mapping
# Add mapping for interaction term
terms_mapping <- c(terms_mapping,c(`c2_:c3_`=paste0(terms_mapping["c2_"], " * ", terms_mapping["c3_"])))
orig_term <- terms_mapping[ret$term]
orig_term[is.na(orig_term)] <- ret$term[is.na(orig_term)] # Fill the element that did not have a matching mapping. (Should be "Residual")
ret$term <- orig_term
# Relocate term column to the first column.
ret <- ret %>% dplyr::relocate(term, .before = 1)
ret <- ret %>% dplyr::rename(`Variable`="term", `W Value`="Test statistic", `P Value`="p-value")
ret <- ret %>% dplyr::mutate(`Method`="Mauchly's Sphericity Test")
ret <- ret %>% dplyr::mutate(`Result`=ifelse(`P Value` < x$test_sig_level, "Assumption is not valid.", "Assumption is valid."))
}
else if (type == "emmeans") {
if ("error" %in% class(x)) {
ret <- tibble::tibble()
return(ret)
}
if (!is.null(x$covariates)) { # ANCOVA case
if (!x$with_interaction) {
formula <- as.formula(paste0('~`', x$var2, '`|`', paste(x$covariates, collapse='`+`'), '`'))
} else {
formula <- as.formula(paste0('~`', x$var2, '`|`', x$covariates[1], '`+', x$var2, ':', x$covariates[1]))
}
} else { # 2-way ANOVA case. The separator (*, :, or +) should not matter.
formula <- as.formula(paste0('~`', paste(x$var2, collapse='`*`'), '`'))
}
# emmeans seems to work even with afex_aov objects, as well as car::Anova objects or aov objects.
ret <- emmeans::emmeans(x, formula)
ret <- tibble::as.tibble(ret)
if (!is.null(x$covariates)) { # ANCOVA case
conf_threshold = 1 - (1 - conf_level)/2
# For ANCOVA, join regular mean. [[1]] is necessary to remove name from x$var2.
mean_df <- x$dataframe %>%
dplyr::group_by(!!rlang::sym(x$var2)) %>%
dplyr::summarize(`Rows`=length(!!rlang::sym(x$var1)),
Mean=mean(!!rlang::sym(x$var1), na.rm=TRUE),
`Std Deviation`=sd(!!rlang::sym(x$var1), na.rm=TRUE),
# std error definition: https://www.rdocumentation.org/packages/plotrix/versions/3.7/topics/std.error
`Std Error`=sd(!!rlang::sym(x$var1), na.rm=TRUE)/sqrt(sum(!is.na(!!rlang::sym(x$var1)))),
# Note: Use qt (t distribution) instead of qnorm (normal distribution) here.
# For more detail take a look at 10.5.1 A slight mistake in the formula of "Learning Statistics with R"
`Conf Low` = Mean - `Std Error` * qt(p=!!conf_threshold, df=`Rows`-1),
`Conf High` = Mean + `Std Error` * qt(p=!!conf_threshold, df=`Rows`-1),
`Minimum`=min(!!rlang::sym(x$var1), na.rm=TRUE),
`Maximum`=max(!!rlang::sym(x$var1), na.rm=TRUE))
ret <- ret %>% dplyr::rename(any_of(c(`Mean (Adj)`="emmean",
`Std Error (Adj)`="SE",
`DF`="df",
`Conf Low (Adj)`="lower.CL",
`Conf High (Adj)`="upper.CL",
`Mean`="mean")))
ret <- ret %>%
dplyr::left_join(mean_df, by = x$var2[[1]]) %>%
dplyr::relocate(any_of(c("Mean (Adj)",
"Std Error (Adj)",
"Conf Low (Adj)",
"Conf High (Adj)",
"DF")), .after=`Conf High`)
}
# Set the common order to display means and emmeans.
if (sort_factor_levels && !is.null(x$common_var2_order)) {
# as.character() is needed to avoid error when the var2 column is logical.
ret <- ret %>% dplyr::mutate(!!rlang::sym(x$var2[[1]]):=forcats::fct_relevel(as.character(!!rlang::sym(x$var2[[1]])), x$common_var2_order))
}
# Map the column names back to the original.
orig_terms <- x$terms_mapping[colnames(ret)]
orig_terms[is.na(orig_terms)] <- colnames(ret)[is.na(orig_terms)] # Fill the column names that did not have a matching mapping.
colnames(ret) <- orig_terms
# Output example:
# A tibble: 2 × 7
# am wt emmean SE df lower.CL upper.CL
# <fct> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
# 0 3.22 20.1 0.833 29 18.4 21.8
# 1 3.22 20.1 1.07 29 17.9 22.3
ret <- ret %>% dplyr::rename(any_of(c(`Adjusted Mean`="emmean",
`Standard Error`="SE",
`DF`="df",
`Conf Low`="lower.CL",
`Conf High`="upper.CL",
`Mean`="mean")))
}
else if (type == "pairs") {
if ("error" %in% class(x)) {
ret <- tibble::tibble()
return(ret)
}
if (!is.null(x$covariates)) { # ANCOVA case
if (!x$with_interaction) {
formula <- as.formula(paste0('~`', x$var2, '`|`', paste(x$covariates, collapse='`+`'), '`'))
} else {
formula <- as.formula(paste0('~`', x$var2, '`|`', x$covariates[1], '`+', x$var2, ':', x$covariates[1]))
}
} else { # 1-way/2-way ANOVA case. The separator (*, :, or +) should not matter.
formula <- as.formula(paste0('~`', paste(x$var2, collapse='`*`'), '`'))
}
ret <- get_pairwise_contrast_df(x, formula, pairs_adjust)
}
else if (type == "pairs_per_variable") { # For only 2-way ANOVA
if ("error" %in% class(x)) {
ret <- tibble::tibble()
return(ret)
}
formula1 <- as.formula(paste0('~`', x$var2[1], '`'))
formula2 <- as.formula(paste0('~`', x$var2[2], '`'))
# Cast "Group 1" and "Group 2" columns to character to match the data type for bind_rows.
ret <- get_pairwise_contrast_df(x, formula1, pairs_adjust) %>%
dplyr::mutate(`Variable`=(x$terms_mapping[x$var2[1]]), `Group 1`=as.character(`Group 1`), `Group 2`=as.character(`Group 2`)) %>%
dplyr::select(`Variable`, everything())
ret2 <- get_pairwise_contrast_df(x, formula2, pairs_adjust) %>%
dplyr::mutate(`Variable`=(x$terms_mapping[x$var2[2]]), `Group 1`=as.character(`Group 1`), `Group 2`=as.character(`Group 2`)) %>%
dplyr::select(`Variable`, everything())
ret <- ret %>% dplyr::bind_rows(ret2)
}
else if (type == "levene") {
if ("error" %in% class(x)) {
ret <- tibble::tibble()
return(ret)
}
# Levene's test of equality of error variances
if (levene_test_center == "mean") {
levene_test_center_fun <- mean
}
else {
levene_test_center_fun <- median
}
if (!is.null(x$covariates)) { # ANCOVA case
# x$model[[x$var2]] is the data used to build the model.
ret <- broom::tidy(car::leveneTest(x$residuals, x$model[[x$var2]], center=levene_test_center_fun))
}
else { # 2-way or 1-way ANOVA case
formula <- as.formula(paste0('`', x$var1, '`~`', paste(x$var2, collapse='`*`'), '`'))
ret <- broom::tidy(car::leveneTest(formula, data=x$dataframe, center=levene_test_center_fun))
}
# Example output:
# A tibble: 1 × 4
# statistic p.value df df.residual
# <dbl> <dbl> <int> <int>
# 0.0607 0.807 1 30
ret <- ret %>% dplyr::rename(any_of(c(`F Value`="statistic",
`P Value`="p.value",
`DF`="df",
`Residual DF`="df.residual")))
if (levene_test_center == "mean") {
ret <- ret %>% dplyr::mutate(`Method`="Levene's Test")
}
else { # Levene's test with median as the center is called Brown-Forsythe test. https://search.r-project.org/CRAN/refmans/misty/html/test.levene.html
ret <- ret %>% dplyr::mutate(`Method`="Brown-Forsythe Test")
}
ret <- ret %>% dplyr::mutate(`Result`=ifelse(`P Value` < x$test_sig_level, "Homogeneity assumption is not valid.", "Homogeneity assumption is valid."))
}
else if (type == "shapiro") {
if ("error" %in% class(x)) {
ret <- tibble::tibble()
return(ret)
}
# Shapiro-Wilk test for residual normality
if (x$with_repeated_measures) { # For afex::aov_car return, x$residuals is not available.
resid <- residuals(x)
}
else {
resid <- x$residuals
}
if (length(resid) > 5000) {
if (!is.null(shapiro_seed)) {
set.seed(shapiro_seed)
}
resid <- sample(resid, 5000)
}
ret <- broom::tidy(shapiro.test(resid))
ret$n = length(resid) # Add sample size info
# Example output:
# A tibble: 1 × 4
# statistic p.value method n
# <dbl> <dbl> <chr> <int>
# 0.933 0.0483 Shapiro-Wilk normality test 32
ret <- ret %>% dplyr::rename(any_of(c(`W Value`="statistic",
`P Value`="p.value",
`Method`="method",
`Rows`="n")))
ret <- ret %>% dplyr::mutate(`Method`="Shapiro-Wilk Normality Test") # Just making it in Title Case.
ret <- ret %>% dplyr::mutate(`Result`=ifelse(`P Value` < x$test_sig_level, "Normality assumption is not valid.", "Normality assumption is valid."))
}
else if (type == "data_summary") { #TODO consolidate with code in tidy.ttest_exploratory
if ("error" %in% class(x)) {
ret <- tibble::tibble()
return(ret)
}
conf_threshold = 1 - (1 - conf_level)/2
ret <- x$dataframe %>%
dplyr::group_by(!!!rlang::syms(as.character(x$var2))) %>%
dplyr::summarize(`Rows`=length(!!rlang::sym(x$var1)),
Mean=mean(!!rlang::sym(x$var1), na.rm=TRUE),
`Std Deviation`=sd(!!rlang::sym(x$var1), na.rm=TRUE),
# std error definition: https://www.rdocumentation.org/packages/plotrix/versions/3.7/topics/std.error
`Std Error`=sd(!!rlang::sym(x$var1), na.rm=TRUE)/sqrt(sum(!is.na(!!rlang::sym(x$var1)))),
# Note: Use qt (t distribution) instead of qnorm (normal distribution) here.
# For more detail take a look at 10.5.1 A slight mistake in the formula of "Learning Statistics with R"
`Conf High` = Mean + `Std Error` * qt(p=!!conf_threshold, df=`Rows`-1),
`Conf Low` = Mean - `Std Error` * qt(p=!!conf_threshold, df=`Rows`-1),
`Minimum`=min(!!rlang::sym(x$var1), na.rm=TRUE),
`Maximum`=max(!!rlang::sym(x$var1), na.rm=TRUE)) %>%
dplyr::select(!!!rlang::syms(as.character(x$var2)),
`Rows`,
Mean,
`Std Error`,
`Conf Low`,
`Conf High`,
`Std Deviation`,
`Minimum`,
`Maximum`)
# Map the column names back to the original.
orig_terms <- x$terms_mapping[colnames(ret)]
orig_terms[is.na(orig_terms)] <- colnames(ret)[is.na(orig_terms)] # Fill the column names that did not have a matching mapping.
colnames(ret) <- orig_terms
}
else if (type == "prob_dist") {
if ("error" %in% class(x)) {
ret <- tibble::tibble()
return(ret)
}
if (x$with_repeated_measures) { # Repeated measures ANOVA case - x is afex_aov class.
# Get summary for P-Value.
x_summary <- summary(x$Anova)
x_matrix <- matrix(as.numeric(x_summary$univariate.tests), ncol=ncol(x_summary$univariate.tests))
colnames(x_matrix) <- colnames(x_summary$univariate.tests)
ret0 <- as.data.frame(x_matrix)
ret <- generate_ftest_density_data(x$anova_table$F[1], p.value=ret0$p.value, df1=x$anova_table$`num Df`[1], df2=x$anova_table$`den Df`[1], sig_level=x$test_sig_level)
} else if (!is.null(x$covariates) || length(x$var2) > 1) { # ANCOVA or 2-way ANOVA case
ret0 <- x$ss3
# filter rows to extract the degree of freedoms (df1, df2) for the F-test.
# df1 is from the categorical independent variable row, and df2 is from the residuals row.
ret0 <- ret0 %>% filter(term %in% c(x$var2[1],"Residuals"))
ret <- generate_ftest_density_data(ret0$statistic[[1]], p.value=ret0$p.value, df1=ret0$df[[1]], df2=ret0$df[[2]], sig_level=x$test_sig_level)
} else { # one-way ANOVA case
ret0 <- broom:::tidy.aov(x)
ret <- generate_ftest_density_data(ret0$statistic[[1]], p.value=ret0$p.value, df1=ret0$df[[1]], df2=ret0$df[[2]], sig_level=x$test_sig_level)
}
ret
}
else { # type == "data"
if ("error" %in% class(x)) {
ret <- tibble::tibble()
return(ret)
}
ret <- x$dataframe
if (x$with_repeated_measures) {
# Remove the subject_id column.
ret <- ret %>% dplyr::select(-subject_id)
}
if (sort_factor_levels && !is.null(x$common_var2_order)) { # ANCOVA/one-way ANOVA and for the Means error bar.
# Set the common order to display means and emmeans.
# as.character() is needed to avoid error when the var2 column is logical.
ret <- ret %>% dplyr::mutate(!!rlang::sym(x$var2[[1]]):=forcats::fct_relevel(as.character(!!rlang::sym(x$var2[[1]])), x$common_var2_order))
}
# Map the column names back to the original.
orig_terms <- x$terms_mapping[colnames(ret)]
orig_terms[is.na(orig_terms)] <- colnames(ret)[is.na(orig_terms)] # Fill the column names that did not have a matching mapping.
colnames(ret) <- orig_terms
}
ret
}
#' Kruskal-Wallis wrapper for Analytics View
#' @param test_sig_level - Significance level for the t-test ifself.
#' @export
exp_kruskal <- function(df, var1, var2, func2 = NULL, test_sig_level = 0.05, pairs_adjust = "none", ...) {
var1_col <- col_name(substitute(var1))
var2_col <- col_name(substitute(var2))
grouped_cols <- grouped_by(df)
if (!is.null(func2)) {
if (lubridate::is.Date(df[[var2_col]]) || lubridate::is.POSIXct(df[[var2_col]])) {
df <- df %>% dplyr::mutate(!!rlang::sym(var2_col) := extract_from_date(!!rlang::sym(var2_col), type=!!func2))
}
else if (is.numeric(df[[var2_col]])) {
df <- df %>% dplyr::mutate(!!rlang::sym(var2_col) := extract_from_numeric(!!rlang::sym(var2_col), type=!!func2))
}
}
if (n_distinct(df[[var2_col]]) < 2) {
stop(paste0("The explanatory variable needs to have 2 or more unique values."))
}
formula = as.formula(paste0('`', var1_col, '`~`', var2_col, '`'))
each_func <- function(df) {
tryCatch({
df <- df %>% dplyr::filter(!is.na(!!rlang::sym(var1_col))) # Remove NA from the target column.
if (nrow(df) == 0) {
stop("There is no data left after removing NA.")
}
if(length(grouped_cols) > 0) {
# Check n_distinct again within group after handling outliers.
if (n_distinct(df[[var2_col]]) < 2) {
stop(paste0("The explanatory variable needs to have 2 or more unique values."))
}
}
model <- kruskal.test(formula, data = df, ...)
df.dunn <- df
# dunn.test pre-processing.
# Escape " - " string in the variable since dunn.test concatenates
# the variable names with " - ".
preprocessed <- is.character(df.dunn[[var2_col]]) || is.factor(df.dunn[[var2_col]])
if (preprocessed) {
df.dunn[[var2_col]] <- stringr::str_replace_all(df.dunn[[var2_col]], " - ", "__SeP__")
}
# Run dunn.test.
res.dunn <- dunn.test::dunn.test(df.dunn[[var1_col]],df.dunn[[var2_col]], method=pairs_adjust, alpha=test_sig_level)
# Separate comparisons values like "Cat 1 - Cat 2" into "Cat 1" and "Cat 2".
res.dunn <- tibble::as.tibble(res.dunn) %>%
tidyr::separate(comparisons, into = c("Group 1", "Group 2"), sep = " - ", extra = "merge")
# dunn.test post-processing.
# Restore " - " string in the variable.
if (preprocessed) {
res.dunn <- res.dunn %>%
dplyr::mutate(`Group 1` = stringr::str_replace_all(`Group 1`, "__SeP__", " - "),
`Group 2` = stringr::str_replace_all(`Group 2`, "__SeP__", " - "))
}
model$dunn.test <- res.dunn
model$pairs_adjust <- pairs_adjust
N <- nrow(df)
epsilon_squared <- calculate_epsilon_squared(model, N)
class(model) <- c("kruskal_exploratory", class(model))
model$var1 <- var1_col
model$var2 <- var2_col
model$data <- df
model$epsilon_squared <- epsilon_squared
model$test_sig_level <- test_sig_level
# Calculate effect size: eta2[H] = (H - k + 1)/(n - k)
# H: Value obtained in the Kruskal-Wallis test
# k: Number of groups
# n: Total number of observation
K <- n_distinct(df[[var2_col]])
H <- model$statistic
model$effsize <- (H - K + 1)/(N - K)
model
}, error = function(e){
if(length(grouped_cols) > 0) {
# In repeat-by case, we report group-specific error in the Summary table,
# so that analysis on other groups can go on.
class(e) <- c("kruskal_exploratory", class(e))
e
} else {
stop(e)
}
})
}
do_on_each_group(df, each_func, name = "model", with_unnest = FALSE)
}
tidy.kruskal_exploratory <- function(x, type="model", conf_level=0.95) {
if (type == "model") {
if ("error" %in% class(x)) {
ret <- tibble::tibble(Note = x$message)
return(ret)
}
tot_n_rows <- nrow(x$data)
note <- NULL
ret <- broom:::tidy.htest(x)
ret <- ret %>% dplyr::select(statistic, p.value) # Removed method since it is always "Kruskal-Wallis rank sum test" here.
ret <- ret %>% dplyr::mutate(df=!!x$parameter, effsize=!!x$effsize, epsilon_squared=!!x$epsilon_squared, n=!!tot_n_rows)
ret <- ret %>% dplyr::rename(`H Value` = statistic,
`P Value`=p.value,
`DF`=df,
`Epsilon Squared`=epsilon_squared,
`Eta Squared`=effsize,
`Rows`=n)
if (!is.null(note)) { # Add Note column, if there was an error from pwr function.
ret <- ret %>% dplyr::mutate(Note=!!note)
}
}
else if (type == "pairs") {
ret <- tibble::as_tibble(x$dunn.test)
ret <- ret %>% dplyr::select(-chi2, -P)
ret <- ret %>% dplyr::select(`Group 1`, `Group 2`, everything())
ret <- ret %>% dplyr::rename(`Z Value`=Z,
`P Value`=P.adjusted)
method <- switch(x$pairs_adjust,
"none" = "Dunn's Test with No Adjustment",
"bonferroni" = "Dunn's Test with Bonferroni Correction",
"sidak" = "Dunn's Test with Sidak Correction",
"holm" = "Dunn's Test with Holm Correction",
"hochberg" = "Dunn's Test with Hochberg Correction"
)
ret <- ret %>% dplyr::mutate(`Method`=!!method)
}
else if (type == "data_summary") { #TODO consolidate with code in tidy.ttest_exploratory
if ("error" %in% class(x)) {
ret <- tibble::tibble()
return(ret)
}
conf_threshold = 1 - (1 - conf_level)/2
ret <- x$data %>% dplyr::group_by(!!rlang::sym(x$var2)) %>%
dplyr::summarize(`Rows`=length(!!rlang::sym(x$var1)),
Mean=mean(!!rlang::sym(x$var1), na.rm=TRUE),
`Std Deviation`=sd(!!rlang::sym(x$var1), na.rm=TRUE),
# std error definition: https://www.rdocumentation.org/packages/plotrix/versions/3.7/topics/std.error
`Std Error of Mean`=sd(!!rlang::sym(x$var1), na.rm=TRUE)/sqrt(sum(!is.na(!!rlang::sym(x$var1)))),
# Note: Use qt (t distribution) instead of qnorm (normal distribution) here.
# For more detail take a look at 10.5.1 A slight mistake in the formula of "Learning Statistics with R"
`Conf High` = Mean + `Std Error of Mean` * qt(p=!!conf_threshold, df=`Rows`-1),
`Conf Low` = Mean - `Std Error of Mean` * qt(p=!!conf_threshold, df=`Rows`-1),
`Minimum`=min(!!rlang::sym(x$var1), na.rm=TRUE),
`Maximum`=max(!!rlang::sym(x$var1), na.rm=TRUE)) %>%
dplyr::select(!!rlang::sym(x$var2),
`Rows`,
Mean,
`Conf Low`,
`Conf High`,
`Std Error of Mean`,
`Std Deviation`,
`Minimum`,
`Maximum`)
}
else if (type == "prob_dist") {
if ("error" %in% class(x)) {
ret <- tibble::tibble()
return(ret)
}
ret <- generate_chisq_density_data(x$statistic, p.value=x$p.value, x$parameter, sig_level=x$test_sig_level)
ret
}
else { # type == "data"
if ("error" %in% class(x)) {
ret <- tibble::tibble()
return(ret)
}
ret <- x$data
}
ret
}
# qqline function that does not draw line and instead return intercept and slope
qqline_data <- function (y, datax = FALSE, distribution = qnorm, probs = c(0.25, 0.75), qtype = 7, ...)
{
stopifnot(length(probs) == 2, is.function(distribution))
y <- quantile(y, probs, names = FALSE, type = qtype, na.rm = TRUE)
x <- distribution(probs)
if (datax) {
slope <- diff(x)/diff(y)
int <- x[1L] - slope * y[1L]
}
else {
slope <- diff(y)/diff(x)
int <- y[1L] - slope * x[1L]
}
list(int, slope) # intercept and slope
}
#' @param n_sample - Downsample to this size before shapiro test. Note that this is not applied for qq data.
#' @param n_sample_qq - Downsample qq-plot data down to this number.
#' This is to make sure qq-line part of the data would not be sampled out in qq scatter plot.
#' Default 4500 is to make room for qqline rows. (default sample size by scatter plot data query is 5000)
#' @export
exp_normality<- function(df, ...,
n_sample = 50,
n_sample_qq = 4500,
seed = 1
) {
if (!is.null(seed)) {
set.seed(seed)
}
selected_cols <- tidyselect::vars_select(names(df), !!! rlang::quos(...))
shapiro_each <- function(df) {
df.qq <- data.frame()
df.qqline <- data.frame()
df.model <- data.frame()
for (col in selected_cols) {
if (n_distinct(df[[col]], na.rm=TRUE) <= 1) {
# skip if the column has only 1 unique value or only NAs, to avoid error.
# TODO: show what happened in the summary table.
}
else {
# set plot.it to FALSE to disable plotting (avoid launching another window)
res <- stats::qqnorm(df[[col]], plot.it=FALSE)
df.qq <- dplyr::bind_rows(df.qq, data.frame(x=res$x, y=res$y, col=col))
# bind reference line data too.
ref_res <- qqline_data(df[[col]])
min_x <- min(res$x, na.rm=TRUE)
max_x <- max(res$x, na.rm=TRUE)
ref_min_y <- ref_res[[1]] + ref_res[[2]] * min_x
ref_max_y <- ref_res[[1]] + ref_res[[2]] * max_x
df.qqline <- dplyr::bind_rows(df.qqline, data.frame(x=c(min_x, max_x), refline_y=c(ref_min_y,ref_max_y), col=col))
if (n_sample > 5000) {
n_sample <- 5000 # shapiro.test takes only up to max of 5000 samples.
}
if (length(df[[col]]) > n_sample) {
col_to_test <- sample(df[[col]], n_sample)
# If sampled, check if the column has only 1 unique value or only NAs again, to avoid error.
if (n_distinct(col_to_test, na.rm=TRUE) <= 1) {
next
}
sample_size <- n_sample
}
else {
col_to_test <- df[[col]]
sample_size <- length(col_to_test)
}
res <- shapiro.test(col_to_test) %>% tidy() %>%
dplyr::mutate(col=!!col, sample_size=!!sample_size) %>%
dplyr::select(col, everything())
df.model <- dplyr::bind_rows(df.model, res)
}
}
model <- list()
model$qq <- df.qq
model$sampled_qq <- sample_rows(df.qq, n_sample_qq)
model$qqline <- df.qqline
model$model_summary <- df.model
class(model) <- c("shapiro_exploratory", class(model))
model
}
do_on_each_group(df, shapiro_each, name = "model", with_unnest = FALSE)
}
#' @export
tidy.shapiro_exploratory <- function(x, type = "model", signif_level=0.05) {
if (type == "qq" || type == "histogram") {
if (type == "qq") {
sampled_qq_df <- x$sampled_qq
}
else {
sampled_qq_df <- x$qq
}
# table with TRUE/FALSE result on normality of each column.
normal_df <- x$model_summary %>%
dplyr::mutate(normal = p.value > !!signif_level) %>%
dplyr::select(col, normal)
ret <- dplyr::bind_rows(sampled_qq_df, x$qqline)
# join normality result so that we can show histogram with colors based on it.
ret <- ret %>% dplyr::left_join(normal_df, by="col")
ret
}
else {
ret <- x$model_summary
ret <- ret %>% dplyr::mutate(normal=ifelse(p.value > !!signif_level, "Normality assumption is valid.", "Normality assumption is not valid."))
ret <- ret %>% dplyr::select(-method)
ret <- ret %>% dplyr::rename(`Column`=col, `W Value`=statistic, `P Value`=p.value, `Result`=normal, `Sample Size`=sample_size)
ret
}
}
#' dummy - Data frame. Since it is just ignored, it is named dummy here.
#' @export
exp_chisq_power <- function(dummy, rows=2, cols=2, w=0.1, sig.level=0.05, beta=0.2, n_start=10, n_end=1000, n_step=10) {
power <- 1.0 - beta
n = seq(n_start, n_end, by=n_step)
chisq_power_each <- function(dummy) {
df = (rows-1)*(cols-1) # Degree of freedom
# Sample size vs power calculation
n_to_power_res <- pwr::pwr.chisq.test(df=df, N=n, w=w, sig.level=sig.level)
n_to_power <- tibble::tibble(n=n, power = n_to_power_res$power)
# Required sample size calculation
required_n <- (pwr::pwr.chisq.test(df=df, N=NULL, w=w, sig.level=sig.level, power=power))$N
crit <- qchisq(1-sig.level, df=df) # The chisq value that corresponds to the significance level.
density <- generate_chisq_density_data_for_power(df=df, w=w, N=required_n, crit)
model <- list(n_to_power=n_to_power,
df=df,
w=w,
sig.level=sig.level,
beta=beta,
power=power,
required_n=required_n,
density=density)
class(model) <- c("chisq_power_exploratory")
model
}
do_on_each_group(dummy, chisq_power_each, name = "model", with_unnest = FALSE)
}
#' Chi-Square test power analysis function specialized for AB test. Rows and cols are fixed to 2, and Cohen's w is calculated from the conversion rate difference to detect, etc.
#' @export
exp_chisq_power_for_ab_test <- function(dummy, a_ratio=0.5, conversion_rate=0.1, diff=0.01, sig.level=0.05, beta=0.2, n_start=10, n_end=50000, n_step=10) {
w <- calculate_cohens_w_for_ab_test(a_ratio, conversion_rate, diff)
res <- exp_chisq_power(dummy, rows=2, cols=2, w=w, sig.level=sig.level, beta=beta, n_start=n_start, n_end=n_end, n_step=n_step)
res
}
#' @export
tidy.chisq_power_exploratory <- function(x, type="summary") {
if (type == "summary") {
ret <- tibble::tibble(sig.level=x$sig.level, beta=x$beta, power=x$power, w=x$w, df=x$df, n=x$required_n)
ret <- ret %>% dplyr::rename(any_of(c(`Type 1 Error`="sig.level",
`Type 2 Error`="beta",
`Power`="power",
`Cohen's W`="w",
`DF`="df",
`Required Sample Size`="n")))
}
else if (type == "n_to_power") {
ret <- x$n_to_power
}
else if (type == "density") {
ret <- x$density
}
ret
}
#' dummy - Data frame. Since it is just ignored, it is named dummy here.
#' @export
exp_ttest_power <- function(dummy, a_ratio=0.5, d=0.2, sig.level=0.05, beta=0.2, alternative="two.sided", paired=FALSE, n_start=10, n_end=1000, n_step=10) {
power <- 1.0 - beta
# Adjust n_start and n_end so that they are not too small. The smaller of s_start*a_ratio and n_start*(1-a_ratio) should be greater than 2.
n_start <- max(n_start, 2/min(a_ratio,1-a_ratio))
n_end <- max(n_end, n_start)
n = seq(n_start, n_end, by=n_step)
n1 = a_ratio*n
n2 = (1-a_ratio)*n
if (alternative == "less") {
d_signed <- -d
}
else {
d_signed <- d
}
ttest_power_each <- function(dummy) {
# Sample size vs power calculation
if (!paired) {
n_to_power_res <- pwr::pwr.t2n.test(n1=n1, n2=n2, d=d_signed, sig.level=sig.level, alternative = alternative)
}
else {
n_to_power_res <- pwr::pwr.t.test(n=n, d=d_signed, sig.level=sig.level, type="paired", alternative = alternative)
}
n_to_power <- tibble::tibble(n=n, power = n_to_power_res$power)
# Required sample size calculation
if (!paired) {
required_n <- (pwr::pwr.t2nr.test(r=a_ratio, d=d_signed, sig.level=sig.level, power=power, alternative = alternative))$n
}
else {
required_n <- (pwr::pwr.t.test(d=d_signed, sig.level=sig.level, power=power, type="paired", alternative = alternative))$n
}
if (!paired) {
df <- required_n - 2 # Assuming Student's independent samples t-test sinde Welch's requires standard deviations as extra inputs.
}
else {
df <- required_n - 1
}
if (alternative == "greater") {
crit <- qt(1-sig.level, df=df) # The t statistic that corresponds to the significance level.
}
else if (alternative == "less") {
crit <- qt(sig.level, df=df)
}
else { # "two.sided" case
crit <- qt(1-sig.level/2, df=df)
}
if (!paired) {
n1 = a_ratio*required_n
n2 = (1-a_ratio)*required_n
}
else {
n1 = required_n
n2 = required_n
}
density <- generate_ttest_density_data_for_power(d=d, n1=n1, n2=n2, t=crit, df=df, sig_level = sig.level, alternative = alternative, paired = paired)
model <- list(n_to_power=n_to_power,
df=df,
d=d,
sig.level=sig.level,
beta=beta,
power=power,
required_n=required_n,
density=density)
class(model) <- c("ttest_power_exploratory")
model
}
do_on_each_group(dummy, ttest_power_each, name = "model", with_unnest = FALSE)
}
#' @export
tidy.ttest_power_exploratory <- function(x, type="summary") {
if (type == "summary") {
ret <- tibble::tibble(sig.level=x$sig.level, beta=x$beta, power=x$power, d=x$d, df=x$df, n=x$required_n)
ret <- ret %>% dplyr::rename(any_of(c(`Type 1 Error`="sig.level",
`Type 2 Error`="beta",
`Power`="power",
`Cohen's D`="d",
`DF`="df",
`Required Sample Size`="n")))
}
else if (type == "n_to_power") {
ret <- x$n_to_power
}
else if (type == "density") {
ret <- x$density
}
ret
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.