#' split demographics into two datasets (continuous and categorical variables)
#'
#' @param demo demographics dataset
#' @param group string stateting the column name of the grouping variable
#' @param trim remove variables with 20% missing data
#' @export
splitData = function(demo, group, trim = 0.8){
if(nrow(demo) < 10)
stop("too few samples")
## Class balance
print("Number of samples in each group")
print(table(demo[, group]))
imbal <- scale(table(demo[, group]), center = FALSE, scale = nrow(demo)) %>% drop
if(sum(imbal < 0.4) > 0)
print("Class imbalance exists")
## Remove missing demo
missing <- colSums(!is.na(demo))/nrow(demo)
varKeep <- missing[missing >= trim]
## List missing variables
if(length(setdiff(names(missing), names(varKeep))) > 0){
print("Missing variables and percentage of avaiable data")
print(missing[missing < trim][order(missing[missing < trim])])
}
## remove missing variables
data <- demo[, names(varKeep)]
## Split data into 2 datasets (categorical and continuous variables)
dataList <- lapply(1 : ncol(data), function(i){
if(nlevels(factor(as.character(data[, i]))) < 10){
x <- factor(as.character(data[, i]))
} else {
x <- as.numeric(data[, i])
}
x
})
names(dataList) <- colnames(data)
data.new <- dataList %>% do.call(cbind.data.frame, .)
## data with categorical variables
data.cat <- data.new[, !sapply(data.new, is.numeric)]
## how many variables with one level?
oneLevel <- which(dataList[colnames(data.cat)] %>% sapply(., function(i) nlevels(i)) == 1)
if(length(oneLevel) > 1){
print(paste0("The categorical variable ",
paste(names(which(dataList[colnames(data.cat)] %>% sapply(., function(i) nlevels(i)) == 1)), collapse = "/"),
" only has 1 level; User should remove this variable"))
} else {
print("All categorial variables have more than 1 level: Good!")
}
## data with continuous variables
data.cont <- data.new[, sapply(data.new, is.numeric), drop = FALSE]
## Summariize
summary <- data.frame(Total_nVar = ncol(demo),
missing_nVar = (ncol(demo) - length(varKeep)),
nonMissing_nVar = length(varKeep),
nonMissing_cat_nVar = ncol(data.cat),
nonMissing_cont_nVar = ncol(data.cont))
return(list(summary=summary, data.cat=data.cat, data.cont=data.cont))
}
#' perform hypothesis test based on variable type
#'
#' @param data input dataset (all variables of a given dataset must be of the same type)
#' @param group string stating the column name of the grouping variable
#' @details
#' This function performs either a parametric (linear model) or non-parametric test (Wilcoxon or Kruskal-Wallis) for continuous data, or the Chi-Square test for categorical variables. The diagnostic assumptions of linear models are assessed using global statistic that assessing the four assumptions of linear models (skewness, kurtosis, link function, homoscadasticity (based on a test for heteroscadasticity)) based on the gvlma R-library, \url{https://www.ncbi.nlm.nih.gov/pmc/articles/PMC2820257/}).
#' @export
hypothesisTests = function (data, group, details = FALSE){
library("lmtest")
if (!is.factor(data[, group]))
stop("group variable must be a factor!")
classes <- droplevels(data[, group])
if (nlevels(classes) < 2)
stop("at least 2 levels in the group variable are required")
data <- data[, setdiff(colnames(data), group), drop = FALSE]
if (all(sapply(data, is.numeric))) {
isCont = TRUE
} else {
isCont = FALSE
}
if (isTRUE(isCont)) {
## Check linear model assumptions
diagnostics = data %>%
dplyr::mutate(group = classes) %>%
tidyr::gather(Var, Value, -group) %>%
dplyr::group_by(Var) %>%
nest() %>%
dplyr::mutate(assumptions = purrr::map(data, ~{
fit <- lm(Value ~ group, data = .)
assumptions <- gvlma::gvlma(fit)
assumptions <- assumptions$GlobalTest[2:6] %>%
do.call(rbind, .) %>%
as.data.frame %>%
mutate(Test = c("Global Stat", "Skewness", "Kurtosis", "Link Function", "Heteroscedasticity"),
lmDiagnostics = ifelse(pvalue < 0.05, "Assumptions NOT satisfied!", "Assumptions acceptable."))
assumptions$Parametric <- paste("lm", signif(pf(summary(fit)$fstatistic[1],
summary(fit)$fstatistic[2],
summary(fit)$fstatistic[3],lower.tail=FALSE), 2), sep="_")
if(nlevels(classes) == 2){
assumptions$NonParametric <- paste("wilcoxon", signif(wilcox.test(Value ~ group, data = .)$p.value, 2), sep="_")
} else {
assumptions$NonParametric <- paste("Kruskal.Wallis", signif(kruskal.test(Value ~ group, data = .)$p.value, 2), sep="_")
}
assumptions
})) %>%
dplyr::mutate(summary = purrr::map(data, ~{
group_by(., group) %>% dplyr::summarize(Mean.SD = paste(c(round(mean(Value,
na.rm = TRUE), 1), round(sd(Value, na.rm = TRUE),
1)), collapse = "+/-")) %>%
tidyr::spread(group, Mean.SD)
})) %>%
unnest(summary) %>% unnest(assumptions) %>% unnest() %>%
dplyr::select(-c(Value, pvalue, Decision)) %>%
gather(TestType, Pval, Parametric:NonParametric) %>%
separate(Pval, c("Method", "p.value"), "_") %>%
mutate(Decision = ifelse(as.numeric(p.value) < 0.05, "Significant", "Non-Significant"))
## based on diagnostic run parameteric or non-parametric test
if(details){
split(diagnostics, diagnostics$Decision)
} else {
diagnostics <- diagnostics %>% filter(Test == "Global Stat") %>%
group_by(Var) %>%
filter(lmDiagnostics == "Assumptions acceptable." & TestType == "Parametric" | lmDiagnostics == "Assumptions NOT satisfied!" & TestType == "NonParametric")
split(diagnostics, diagnostics$Decision)
}
} else {
result <- apply(data, 2, function(i) {
sigTest <- data.frame(chisq = chisq.test(table(i,
classes))$p.value)
if (sigTest$chisq < 0.05) {
sigTest$Decision <- "Significant"
} else {
sigTest$Decision <- "Not_Significant"
}
sigTest
}) %>% do.call(rbind, .)
summary <- data %>% dplyr::mutate(group = classes) %>%
tidyr::gather(Var, lvl, -group) %>% dplyr::group_by(Var,
group) %>% dplyr::summarise(lvl = paste(paste(names(table(lvl)),
round(100 * table(lvl)/sum(table(lvl)), 1), sep = "_"),
collapse = "/")) %>% tidyr::spread(group, lvl) %>%
as.data.frame
diagnostics <- cbind(summary, result[summary$Var, ])
split(diagnostics, diagnostics$Decision)
}
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.