#-------------------------------------
# Statistic functions to 'CHMI' study
#-------------------------------------
### crosstab----------------------------------------------------------------------------------------
# '@export
chmi.stat.crosstab_tab <- function(
phen,
vars_x,
vars_y,
multi_tab = FALSE,
sum_out = TRUE)
{
### list of formulas
l_formulas <- list()
l_vars <- c(vars_x, vars_y)
### loop for 'vars_x' & 'vars_y'
if(multi_tab == FALSE) {
for(j in vars_x) {
for(i in vars_y) {
forms <- paste0(' ~ ', j, ' + ', i)
l_formulas[[length(l_formulas) + 1]] = forms
}
}
} else {
for(j in vars_x) {
for(i in vars_y) {
forms <- paste0(' ~ ', j, ' + ', paste0(unlist(vars_y, i), collapse = ' + '))
}
l_formulas[[length(l_formulas) + 1]] = forms
}
}
### extract contingency table
l_tab <- lapply(l_formulas, function(x) {
# create list of elements
tab_1 <- list(
tidy(addmargins(xtabs(
formula = x, data = phen, addNA = TRUE, drop.unused.levels = TRUE))), # frequencies info
tidy(prop.table(addmargins(xtabs(
formula = x, data = phen, addNA = TRUE, drop.unused.levels = TRUE)), margin = 1)), # rows percentatges
tidy(prop.table(addmargins(xtabs(
formula = x, data = phen, addNA = TRUE, drop.unused.levels = TRUE)), margin = 2))) # cols_percentatges
})
### list
l_dat <- list()
### loop to save 'as_tibble'
for(i in seq_along(vars_x)) {
# 'bind_cols()' all contingency info
dat <- bind_cols(l_tab[[i]]) %>%
select(vars_x[[i]], all_of(vars_y), freq = n, row_perc = n1, col_perc = n2) %>%
as_tibble()
#
l_dat[[length(l_dat) + 1]] = dat
}
### names()
names(l_dat) <- vars_x
### argument 'sum'
if(sum_out == TRUE) {
for(i in seq_along(vars_x)){
# filter 'l_dat'
l_dat[[i]] <- l_dat[[i]] %>%
filter_at(vars(vars_x[[i]], vars_y), all_vars(. != 'Sum'))
}
}
### return
return(l_dat)
}
### descriptive-------------------------------------------------------------------------------------
#' @export
chmi.stat.descriptive_tab <- function(
phen,
vars_x,
vars_y,
shapiro_test = FALSE,
stat = c('dplyr', 'e1071'))
{
### arg
stat <- match.arg(stat, c('dplyr', 'e1071'))
### loop to select 'vars_x'
var <- lapply(vars_x, function(i) i)
### list of 'tab'
l_tabs <- list()
### loop to run into 'var'
for(i in seq_along(var)) {
# summary
tab_1 <- phen %>%
select(vars_y, var[[i]]) %>%
group_by_at(vars_y) %>%
group_nest() %>%
mutate(
summary_data = map(data,
~ mutate(.x,
n = sum(!is.na(.x[[var[[i]]]])),
na = sum(is.na(.x[[var[[i]]]])),
min = min(.x[[var[[i]]]], na.rm = TRUE),
mean = mean(.x[[var[[i]]]], na.rm = TRUE),
max = max(.x[[var[[i]]]], na.rm = TRUE),
median = median(.x[[var[[i]]]], na.rm = TRUE),
kurt = e1071::kurtosis(.x[[var[[i]]]], na.rm = TRUE),
skew = e1071::skewness(.x[[var[[i]]]], na.rm = TRUE),
raw_shapiro = shapiro.test(.x[[var[[i]]]])$p.value))) %>%
select(-data) %>%
unnest(cols = c(summary_data)) %>%
distinct_at(vars(vars_y, n, na, min, mean, max, median, kurt, skew, raw_shapiro)) %>%
arrange_at(vars(vars_y))
### argument for 'shapiro.test()'
if(shapiro_test == TRUE) {
# update test
tab_1 <- tab_1 %>%
mutate(
pval_shapiro = factor(ifelse(raw_shapiro < .05, 'non_gaussian', 'gaussian')))
} else {
tab_1 <- tab_1 %>%
select(-kurt, -skew, -ends_with('_shapiro'))
}
# list
l_tabs[[length(l_tabs) + 1]] = tab_1
}
### names()
names(l_tabs) <- vars_x
### return
return(l_tabs)
}
### 'mfi' differences in 'parametric' or 'non_parametric' variables---------------------------------
# '@export
chmi.stat.statistical_test <- function(
phen,
gr_nest,
l_group,
vars_x = c('log10_mfi'),
vars_y = c('status', 'gr_hbs', 't2_point'),
padj_method = c('holm', 'hochberg', 'hommel', 'bonferroni', 'BH', 'BY', 'fdr', 'none'),
fold_change = FALSE)
{
# list of select variables
l_y <- l_group %>% setdiff(., 't2_point')
# update 'data'
phen_1 <- dat %>%
select(all_of(l_group, vars_x, vars_y), -ends_with('_point')) %>%
group_by_at(l_group) %>%
arrange_at(l_group) %>%
group_nest()
# perform formula
l_formula <- paste0(paste0(t_var_1), ' ~ ', paste0(t_var_2))
# one.oway test
phen_1 <- phen_1 %>%
mutate(oneway_test = map(data,
~ try(oneway.test(formula = as.formula(l_formula), .x), silent = TRUE))) %>%
mutate(raw_pval = map_dbl(oneway_test, ~ .x[['p.value']]))
# unnest & adjust_pval
phen_1 <- phen_1 %>%
select(-c(data, oneway_test)) %>%
unnest(cols = c(l_group, raw_pval)) %>%
mutate(l_forms = l_formula)
# `p.adjust()` function
phen_1 <- chmi.stat.p_adjust(phen = phen_1, method = padj_method) %>%
mutate_at(vars(c('adjust_text', 'adjust_signif')), as.factor)
### argument to `fold_change`
if(fold_change == TRUE) {
# obtain `y_min` & `y_max`
phen_2 <- phen %>%
select(l_oneway) %>%
group_by_at(l_y) %>%
group_nest() %>%
mutate(y_max = map_dbl(data, ~ max(.x[, quo_name(t_var_1)], na.rm = TRUE))) %>%
mutate(y_min = map_dbl(data, ~ min(.x[, quo_name(t_var_1)], na.rm = TRUE))) %>%
select(-data) %>%
unnest(cols = c(l_y, y_max, y_min))
# join
phen_f <- left_join(phen_1, phen_2, by = l_y)
# return
return(phen_f)
} else {
# return
return(phen_1)
}
}
### differences in 'mfi'----------------------------------------------------------------------------
# '@export
chmi.stat.oneway_test_old <- function(phen, group_by,
test_var = c('status', 'gr_hbs', 't2_point'))
{
# condition
if(test_var == 'status') {
phen_1 <- phen %>%
group_by_at(group_by) %>%
arrange_at(group_by) %>%
nest() %>%
mutate(raw_pval = map(data,
~ try(oneway.test(log10_mfi ~ status, .x)$p.value, silent = T)))
} else if (test_var == 'gr_hbs'){
phen_1 <- phen %>%
group_by_at(group_by) %>%
arrange_at(group_by) %>%
nest() %>%
mutate(raw_pval = map(data,
~ try(oneway.test(log10_mfi ~ gr_hbs, .x)$p.value, silent = T)))
} else {
phen_1 <- phen %>%
group_by_at(group_by) %>%
arrange_at(group_by) %>%
nest() %>%
mutate(raw_pval = map(data,
~ try(oneway.test(log10_mfi ~ t2_point, .x)$p.value, silent = T)))
}
# unnest & adjust_pval
tab <- phen_1 %>%
select(group_by, raw_pval) %>%
unnest() %>%
mutate(adjust_pval = p.adjust(raw_pval, method = 'BH')) %>%
mutate(adjust_text = factor(
ifelse(is.nan(adjust_pval), 'nc',
ifelse(adjust_pval <= 0.0001, 'p < 0.0001',
ifelse(adjust_pval <= 0.001, 'p < 0.001',
ifelse(adjust_pval <= 0.05, paste0('p = ', round(adjust_pval, 3)),
'ns'))))),
adjust_signif = factor(
ifelse(is.nan(adjust_pval), 'nc',
ifelse(adjust_pval <= 0.0001, '***',
ifelse(adjust_pval <= 0.001, '**',
ifelse(adjust_pval <= 0.01, '*',
ifelse(adjust_pval <= 0.05, '.',
'ns'))))))) %>%
arrange(adjust_pval)
# return
return(tab)
}
### 'mfi' differences in 'fold change'--------------------------------------------------------------
# '@export
chmi.stat.p_adjust <- function(
phen,
method = c('holm', 'hochberg', 'hommel', 'bonferroni', 'BH', 'BY', 'fdr', 'none'))
{
## nc == no converge
## np == no perform test
## ns == no signifincant
### update dataset with 'raw_pval' and 'adjust_pval'
phen <- phen %>%
mutate(
adjust_pval = as.numeric(
ifelse(is.na(raw_pval), 'NA',
p.adjust(raw_pval, method = method))),
adjust_text = factor(
ifelse(is.nan(adjust_pval), 'nc',
ifelse(is.na(adjust_pval), 'np',
ifelse(adjust_pval <= 0.0001, 'p < 0.0001',
ifelse(adjust_pval <= 0.001, 'p < 0.001',
ifelse(adjust_pval <= 0.05, paste0('p = ', round(adjust_pval, 3)), 'ns')))))),
adjust_signif = factor(
ifelse(is.nan(adjust_pval), 'nc',
ifelse(is.na(adjust_pval), 'np',
ifelse(adjust_pval <= 0.0001, '***',
ifelse(adjust_pval <= 0.001, '**',
ifelse(adjust_pval <= 0.01, '*',
ifelse(adjust_pval <= 0.05, '.', 'ns')))))))) %>%
arrange(adjust_pval)
return(phen)
}
### Bayesian Information Criterion (BIC)------------------------------------------------------------
## https://www.methodology.psu.edu/resources/AIC-vs-BIC/
## Kruschke K., J. Doing Bayesian Data Analysis: A tutorial with R and BUGS.
## Metric Predicted Variable with Multiple Metric Predictors (pp. 371-400).
## https://www.users.csbsju.edu/~mgass/robert.pdf
# '@export
chmi.stat.bic_selection <- function(
phen,
l_varsy,
base_covlist, tested_covlist,
n_comb, testing = FALSE,
stat = c('plyr'))
{
### variables
covlist <- c(base_covlist, tested_covlist)
# check
stopifnot(all(covlist %in% names(phen)))
### testing
if(missing(n_comb) & testing == TRUE) {
n_comb <- round(length(covlist) / 2)
# tested_covlist <- tested_covlist[1:11]
}
### create a list of 'tested_covlist' combinations
covlist_comb <- llply(1:n_comb, function(x) {
combn(tested_covlist, x, simplify = FALSE)
})
covlist_comb <- do.call('c', covlist_comb)
# add based_covlist to combinations
covlist_comb <- llply(covlist_comb, function(x) {
c(base_covlist, x)
})
### compute base_models
mod_0 <- glm(malaria_ag ~ gender_ori, phen, family = binomial, model = FALSE)
l_mods <-length(covlist_comb)
mod <- llply(1:l_mods, function(i) {
cat(' * ', i, ' / ', l_mods, '\n')
x <- covlist_comb[[i]]
f <- as.formula(paste('malaria_ag ~ ', paste(x, collapse = ' + ')))
summary(glm(f, phen, family = binomial, model = FALSE))
})
### table
l_mods <-length(covlist_comb)
mod <- llply(1:l_mods, function(i) {
cat(' * ' , i, ' / ', l_mods, '\n')
x <- covlist_comb[[i]]
f <- as.formula(paste('malaria_ag ~', paste(x, collapse = ' + ')))
glm(f, phen, family = binomial, model = FALSE)
})
### table
mod_0 <- glm(malaria_ag ~ gender_ori, phen, family = binomial, model = FALSE)
tab <- ldply(mod, function(x) {
rhs_formul <- as.character(x$formula)[3] # formula
test <- anova(mod_0, x, test = 'Rao') # anova
stat_rao <- as.data.frame(test)[2, 5] # extract 'rao' columns
ll_0 <- logLik(mod_0)[1]
ll <- logLik(x)[1]
phen_2 <- ll - ll_0 # difference between LogLink
chisq <- 2 * (as.numeric(ll) - as.numeric(ll_0)) # numeric formal
p_val <- pchisq(chisq, phen_2, lower.tail = FALSE) # p_value from 'chisq'
log_pval <- -log10(p_val) # log10 of the 'chisq'
data.frame(rhs_formul = rhs_formul,
BIC = BIC(x),
deviance = deviance(x),
log_pval = log_pval,
stringsAsFactors = FALSE)
})
# reorder tabs
tab <- tab %>%
arrange(BIC) %>%
as_tibble()
# return
return(tab)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.