R/chmi.stat_summary.R

Defines functions chmi.stat.bic_selection chmi.stat.p_adjust chmi.stat.oneway_test_old chmi.stat.statistical_test chmi.stat.descriptive_tab chmi.stat.crosstab_tab

#-------------------------------------
# Statistic functions to 'CHMI' study
#-------------------------------------


### crosstab----------------------------------------------------------------------------------------

# '@export
chmi.stat.crosstab_tab <- function(
	phen,
	vars_x,
	vars_y,
	multi_tab = FALSE,
  sum_out = TRUE)
{
### list of formulas
  l_formulas <- list()
  l_vars <- c(vars_x, vars_y)


### loop for 'vars_x' & 'vars_y'
	if(multi_tab == FALSE) {
 		for(j in vars_x) {
  		for(i in vars_y) {
  	    forms <- paste0(' ~ ', j, ' + ', i)

    	 l_formulas[[length(l_formulas) + 1]] = forms
   		}
   	}

	} else {
    for(j in vars_x) {
    	for(i in vars_y) {
    	   forms <- paste0(' ~ ', j, ' + ', paste0(unlist(vars_y, i), collapse = ' + '))
      }
      l_formulas[[length(l_formulas) + 1]] = forms
    }
  }



### extract contingency table
	l_tab <- lapply(l_formulas, function(x) {
		# create list of elements
		tab_1 <- list(
			tidy(addmargins(xtabs(
				formula = x, data = phen, addNA = TRUE, drop.unused.levels = TRUE))), # frequencies info
			tidy(prop.table(addmargins(xtabs(
				formula = x, data = phen, addNA = TRUE, drop.unused.levels = TRUE)), margin = 1)), # rows percentatges
			tidy(prop.table(addmargins(xtabs(
				formula = x, data = phen, addNA = TRUE, drop.unused.levels = TRUE)), margin = 2))) # cols_percentatges

	})


### list
	l_dat <- list()

### loop to save 'as_tibble'
	for(i in seq_along(vars_x)) {
   	# 'bind_cols()' all contingency info
   	dat <- bind_cols(l_tab[[i]]) %>%
   		select(vars_x[[i]], all_of(vars_y), freq = n, row_perc = n1, col_perc = n2) %>%
   		as_tibble()

    #
  	l_dat[[length(l_dat) + 1]] = dat
	}

### names()
	names(l_dat) <- vars_x


### argument 'sum'
  if(sum_out == TRUE) {
    for(i in seq_along(vars_x)){
      # filter 'l_dat'
      l_dat[[i]] <- l_dat[[i]] %>%
        filter_at(vars(vars_x[[i]], vars_y), all_vars(. != 'Sum'))
      }
  }

### return
	return(l_dat)
}



### descriptive-------------------------------------------------------------------------------------

#' @export
chmi.stat.descriptive_tab <- function(
	phen,
	vars_x,
	vars_y,
  shapiro_test = FALSE,
	stat = c('dplyr', 'e1071'))
{
### arg
	stat <- match.arg(stat, c('dplyr', 'e1071'))


### loop to select 'vars_x'
	var <- lapply(vars_x, function(i) i)


### list of 'tab'
	l_tabs <- list()


### loop to run into 'var'
	for(i in seq_along(var)) {
	# summary
	tab_1 <- phen %>%
		select(vars_y, var[[i]]) %>%
		group_by_at(vars_y) %>%
		group_nest() %>%
		mutate(
			summary_data = map(data,
				~ mutate(.x,
					n = sum(!is.na(.x[[var[[i]]]])),
					na = sum(is.na(.x[[var[[i]]]])),
					min = min(.x[[var[[i]]]], na.rm = TRUE),
					mean = mean(.x[[var[[i]]]], na.rm = TRUE),
					max = max(.x[[var[[i]]]], na.rm = TRUE),
					median = median(.x[[var[[i]]]], na.rm = TRUE),
					kurt = e1071::kurtosis(.x[[var[[i]]]], na.rm = TRUE),
					skew = e1071::skewness(.x[[var[[i]]]], na.rm = TRUE),
          raw_shapiro = shapiro.test(.x[[var[[i]]]])$p.value))) %>%
    select(-data) %>%
    unnest(cols = c(summary_data)) %>%
    distinct_at(vars(vars_y, n, na, min, mean, max, median, kurt, skew, raw_shapiro)) %>%
    arrange_at(vars(vars_y))


### argument for 'shapiro.test()'
    if(shapiro_test == TRUE) {
    # update test
    tab_1 <- tab_1 %>%
      mutate(
        pval_shapiro = factor(ifelse(raw_shapiro < .05, 'non_gaussian', 'gaussian')))
    } else {
    tab_1 <- tab_1 %>%
      select(-kurt, -skew, -ends_with('_shapiro'))
    }

	# list
		l_tabs[[length(l_tabs) + 1]] = tab_1
	}


 ### names()
 	names(l_tabs) <- vars_x


 ### return
	 return(l_tabs)
}


### 'mfi' differences in 'parametric' or 'non_parametric' variables---------------------------------

# '@export
chmi.stat.statistical_test <- function(
  phen,
  gr_nest,
  l_group,
  vars_x = c('log10_mfi'),
  vars_y = c('status', 'gr_hbs', 't2_point'),
  padj_method = c('holm', 'hochberg', 'hommel', 'bonferroni', 'BH', 'BY', 'fdr', 'none'),
  fold_change = FALSE)
{
# list of select variables
	l_y <- l_group %>% setdiff(., 't2_point')

# update 'data'
  phen_1 <- dat %>%
    select(all_of(l_group, vars_x, vars_y), -ends_with('_point')) %>%
    group_by_at(l_group) %>%
    arrange_at(l_group) %>%
    group_nest()

# perform formula
  l_formula <- paste0(paste0(t_var_1), ' ~ ', paste0(t_var_2))

# one.oway test
  phen_1 <- phen_1 %>%
    mutate(oneway_test = map(data,
    ~ try(oneway.test(formula = as.formula(l_formula), .x), silent = TRUE))) %>%
    mutate(raw_pval = map_dbl(oneway_test, ~ .x[['p.value']]))


# unnest & adjust_pval
  phen_1 <- phen_1 %>%
   	select(-c(data, oneway_test)) %>%
   	unnest(cols = c(l_group, raw_pval)) %>%
   	mutate(l_forms = l_formula)

 # `p.adjust()` function
  phen_1 <- chmi.stat.p_adjust(phen = phen_1, method = padj_method) %>%
    mutate_at(vars(c('adjust_text', 'adjust_signif')), as.factor)


		### argument to `fold_change`
		if(fold_change == TRUE) {
		# obtain `y_min` & `y_max`
			phen_2 <- phen %>%
				select(l_oneway) %>%
				group_by_at(l_y) %>%
				group_nest() %>%
				mutate(y_max = map_dbl(data, ~ max(.x[, quo_name(t_var_1)], na.rm = TRUE))) %>%
				mutate(y_min = map_dbl(data, ~ min(.x[, quo_name(t_var_1)], na.rm = TRUE))) %>%
				select(-data) %>%
				unnest(cols = c(l_y, y_max, y_min))

		# join
		phen_f <- left_join(phen_1, phen_2, by = l_y)

	# return
		return(phen_f)

	} else {
	# return
  	return(phen_1)
	}
}



### differences in 'mfi'----------------------------------------------------------------------------

# '@export
chmi.stat.oneway_test_old <- function(phen, group_by,
  test_var = c('status', 'gr_hbs', 't2_point'))
{
# condition
  if(test_var == 'status') {
    phen_1 <- phen %>%
      group_by_at(group_by) %>%
      arrange_at(group_by) %>%
      nest() %>%
      mutate(raw_pval = map(data,
      ~ try(oneway.test(log10_mfi ~ status, .x)$p.value, silent = T)))

  } else if (test_var == 'gr_hbs'){
    phen_1 <- phen %>%
      group_by_at(group_by) %>%
      arrange_at(group_by) %>%
      nest() %>%
      mutate(raw_pval = map(data,
      ~ try(oneway.test(log10_mfi ~ gr_hbs, .x)$p.value, silent = T)))

  } else {
    phen_1 <- phen %>%
      group_by_at(group_by) %>%
      arrange_at(group_by) %>%
      nest() %>%
      mutate(raw_pval = map(data,
      ~ try(oneway.test(log10_mfi ~ t2_point, .x)$p.value, silent = T)))
  }

# unnest & adjust_pval
  tab <- phen_1 %>%
   select(group_by, raw_pval) %>%
   unnest() %>%
   mutate(adjust_pval = p.adjust(raw_pval, method = 'BH')) %>%
   mutate(adjust_text = factor(
     ifelse(is.nan(adjust_pval), 'nc',
     ifelse(adjust_pval <= 0.0001, 'p < 0.0001',
     ifelse(adjust_pval <= 0.001, 'p < 0.001',
     ifelse(adjust_pval <= 0.05, paste0('p = ', round(adjust_pval, 3)),
     'ns'))))),
     adjust_signif = factor(
     ifelse(is.nan(adjust_pval), 'nc',
     ifelse(adjust_pval <= 0.0001, '***',
     ifelse(adjust_pval <= 0.001, '**',
     ifelse(adjust_pval <= 0.01, '*',
     ifelse(adjust_pval <= 0.05, '.',
     'ns'))))))) %>%
    arrange(adjust_pval)

  # return
    return(tab)
}



### 'mfi' differences in 'fold change'--------------------------------------------------------------

# '@export
chmi.stat.p_adjust <- function(
	phen,
	method = c('holm', 'hochberg', 'hommel', 'bonferroni', 'BH', 'BY',  'fdr', 'none'))
{
## nc == no converge
## np == no perform test
## ns == no signifincant
### update dataset with 'raw_pval' and 'adjust_pval'
phen <- phen %>%
  mutate(
   	adjust_pval = as.numeric(
   		ifelse(is.na(raw_pval), 'NA',
   		p.adjust(raw_pval, method = method))),
  	adjust_text = factor(
      ifelse(is.nan(adjust_pval), 'nc',
      ifelse(is.na(adjust_pval), 'np',
      ifelse(adjust_pval <= 0.0001, 'p < 0.0001',
      ifelse(adjust_pval <= 0.001, 'p < 0.001',
      ifelse(adjust_pval <= 0.05, paste0('p = ', round(adjust_pval, 3)), 'ns')))))),
    adjust_signif = factor(
      ifelse(is.nan(adjust_pval), 'nc',
      ifelse(is.na(adjust_pval), 'np',
      ifelse(adjust_pval <= 0.0001, '***',
      ifelse(adjust_pval <= 0.001, '**',
      ifelse(adjust_pval <= 0.01, '*',
      ifelse(adjust_pval <= 0.05, '.', 'ns')))))))) %>%
    arrange(adjust_pval)

return(phen)
}



### Bayesian Information Criterion (BIC)------------------------------------------------------------

## https://www.methodology.psu.edu/resources/AIC-vs-BIC/
## Kruschke K., J. Doing Bayesian Data Analysis: A tutorial with R and BUGS.
## Metric Predicted Variable with Multiple Metric Predictors (pp. 371-400).
## https://www.users.csbsju.edu/~mgass/robert.pdf

# '@export
chmi.stat.bic_selection <- function(
  phen,
  l_varsy,
  base_covlist, tested_covlist,
  n_comb, testing = FALSE,
  stat = c('plyr'))
{
### variables
covlist <- c(base_covlist, tested_covlist)

  # check
  stopifnot(all(covlist %in% names(phen)))

### testing
if(missing(n_comb) & testing == TRUE) {
  n_comb <- round(length(covlist) / 2)
  # tested_covlist <- tested_covlist[1:11]
}

### create a list of 'tested_covlist' combinations
covlist_comb <- llply(1:n_comb, function(x) {
  combn(tested_covlist, x, simplify = FALSE)
})

covlist_comb <- do.call('c', covlist_comb)

  # add based_covlist to combinations
  covlist_comb <- llply(covlist_comb, function(x) {
    c(base_covlist, x)
  })


### compute base_models
mod_0 <- glm(malaria_ag ~ gender_ori, phen, family = binomial, model = FALSE)


l_mods <-length(covlist_comb)
mod <- llply(1:l_mods, function(i) {
  cat(' * ', i, ' / ', l_mods, '\n')

  x <- covlist_comb[[i]]
  f <- as.formula(paste('malaria_ag ~ ', paste(x, collapse = ' + ')))

  summary(glm(f, phen, family = binomial, model = FALSE))
})

### table
l_mods <-length(covlist_comb)
mod <- llply(1:l_mods, function(i) {
  cat(' * ' , i, ' / ', l_mods, '\n')

  x <- covlist_comb[[i]]
  f <- as.formula(paste('malaria_ag ~', paste(x, collapse = ' + ')))

  glm(f, phen, family = binomial, model = FALSE)
})

### table
mod_0 <- glm(malaria_ag ~ gender_ori, phen, family = binomial, model = FALSE)

tab <- ldply(mod, function(x) {
  rhs_formul <- as.character(x$formula)[3] # formula
  test <- anova(mod_0, x, test = 'Rao') # anova
  stat_rao <- as.data.frame(test)[2, 5] # extract 'rao' columns
  ll_0 <- logLik(mod_0)[1]
  ll <- logLik(x)[1]

  phen_2 <- ll - ll_0 # difference between LogLink
  chisq <- 2 * (as.numeric(ll) - as.numeric(ll_0)) # numeric formal
  p_val <- pchisq(chisq, phen_2, lower.tail = FALSE) # p_value from 'chisq'
  log_pval <- -log10(p_val) # log10 of the 'chisq'

  data.frame(rhs_formul = rhs_formul,
    BIC = BIC(x),
    deviance = deviance(x),
    log_pval = log_pval,
    stringsAsFactors = FALSE)
})

# reorder tabs
tab <- tab %>%
  arrange(BIC) %>%
  as_tibble()

# return
  return(tab)
}
mvazquezs/chmitools documentation built on May 1, 2020, 2:06 a.m.