#' multGroup
#'
#' multGroup provides summary statistics for continous and categorical variables either overall or stratified by a grouping variable. It also provides parametric or nonparametric tests for each variable.
#'
#' @param data Input data.frame
#' @param PcontinVars Vector of continous variables names from data that should be tested parametrically
#' @param PcatVars Vector of categorical variables names from data that should be tested parametrically
#' @param NPcontinVars Vector of continous variables names from data that should be tested non-parametrically
#' @param NPcatVars Vector of categorical variables names from data that should be tested non-parametrically
#' @param SortVars Vector of variable names indicating in what order the variables should be presented in the output table
#' @param varLabelTable A varLabelTable object. Converts variable names to variable labels in output table.
#' @param grouping Grouping variable of any number of levels that will be used to stratify each variable
#' @param pdec Number of decimals for p-values
#' @param dec Number of decimals for descriptive statistics
#' @param mu For a one sample t-test (no grouping), the reference value to compare population to. Default of 0.
#' @param ChiProbabilities For a one sample chi-square (no grouping), the underlying probability distribution to compare the population to. If NULL, will compare to an even distribution across all levels.
#' @param labels Labels for the grouping variable
#' @param verbose If TRUE, additional information is printed to the console
#' @param percent Indicates if the output table should present 'row', 'column', or 'overall' percents for categorical variables.
#' @param NPdescriptives Vector of continuous variable names that should present median and range in the output table.
#' @param padjust If TRUE and grouping has 3 or more levels, only pairwise comparisons that are significant at 0.05 after using a multiple comparisons adjustment are presented
#' @param provideP If TRUE, in the case of no grouping variable, the p-values for the 1-sample tests will be included in the output, otherwise only the descriptive statistics are returned
#' @param include If 'none', no additional output is included. If 'range', range is included. If '95ci' then 95% Confidence intervals are included.
#'
#' @return A data.frame including all univariate results.
#' @export
#'
#' @examples
#'
#' #df <- data.frame(time = runif(100, 1, 100),
#' # event = sample(c(0,1), 100, replace = T, prob = c(.1, .9)),
#' # x1 = rnorm(100),
#' # x2 = sample(c(1,2), 100, replace = T))
#' #multGroup(data = df,
#' # NPcontinVars = "time",
#' # grouping = "event",
#' # PcontinVars = "x1",
#' # NPcatVars = "x2")
multGroup <- function(data,
PcontinVars = NULL,
PcatVars = NULL,
NPcontinVars = NULL,
NPcatVars = NULL,
SortVars = NULL,
varLabelTable = NULL,
grouping = NULL,
pdec = 3,
dec = 2,
mu = 0,
ChiProbabilities = NULL,
labels = NULL,
percent = "column",
NPdescriptives = NULL,
NPvariance = 'range',
padjust = F,
provideP = T,
include = 'none',
verbose = T) {
deleteExtraneous <- F
data <- as.data.frame(data)
if(!is.null(grouping)) {
data <- subset(data, !is.na(data[[grouping]]))
}
if (is.null(grouping)) {
keepNames <- names(data)
data1 <- data.frame(data, fakeVar = "All Patients")
data2 <- data.frame(data, fakeVar = "zzzzzzz")
data <- rbind(data1, data2)
names(data) <- c(keepNames, "fakeVar")
data$fakeVar <- factor(data$fakeVar)
grouping <- "fakeVar"
deleteExtraneous <- T
}
if (!is.null(grouping) & !is.null(labels)) {
if (length(labels) != length(levels(factor(data[[grouping]])))) {
return(paste0(
"Number of labels does not match number of levels of ",
"grouping variable"
))
}
}
ciProp <- function(x, conf.int = .95) {
x <- x[!is.na(x)]
n <- length(x)
pbar = unlist(table(x))/n
SE = sqrt(pbar*(1-pbar)/n)
E = qnorm(conf.int +(1-conf.int)/2)*SE
out = do.call(rbind, lapply(1:length(pbar), function(i) pbar[i] + c(-E[i], E[i])))
row.names(out) <- names(pbar)
colnames(out) <- c("ci.lower", "ci.upper")
return(out)
}
#For parametric continuous variables
if (!is.null(PcontinVars) | !is.null(NPcontinVars)) {
if (verbose == T) {
nnv <- function(x) {
is.na1 <- is.na(x)
x2 <- suppressWarnings(as.numeric(as.character(x)))
is.na2 <- is.na(x2)
x[is.na2 & !is.na1]
}
}
}
if (!is.null(PcontinVars)) {
#Get a list of the groups being compared
flist <- sort(unique(data[[grouping]]))
#Function to get descriptive statistics per group
descriptive <- function(x) {
if (verbose == T)
print(paste0(x, " is now being analyzed"))
if (verbose == T) {
toNA <- nnv(data[, x])
if (length(toNA) > 0) {
print(paste0(
x,
": The following entries will be converted to NA prior to analysis"
))
print(toNA)
}
}
#data for a specified variable
xlist <- suppressWarnings(as.numeric(as.character(data[, x])))
#Number of groups being compared
numlevs <- length(flist)
#making a list of 1 to number of groups
numdos <- 1:numlevs
#finally the meat and potatoes of getting the desc. stats.
overallxs <- xlist[!is.na(xlist)]
overallN <- length(overallxs)
overallmean <-
sprintf(paste0("%.", dec, "f"), round(mean(overallxs), digits = dec))
overallsd <-
sprintf(paste0("%.", dec, "f"), round(sd(overallxs), digits = dec))
overallmed <- round(median(overallxs), digits = dec)
overallrange <-
paste0(round(range(overallxs)[1], digits = dec),
", ",
round(range(overallxs)[2], digits = dec))
overallIQR <- round(IQR(overallxs), digits = dec)
overallQuartiles <- paste0(round(quantile(overallxs)[2], digits = dec),
", ",
round(quantile(overallxs)[4], digits = dec))
if (include == '95ci') {
overallSE <- sd(overallxs) / sqrt(overallN)
E <- qt(.975, overallN - 1) * overallSE
overall95ci <- paste0(" [",
sprintf(paste0("%.", dec, "f"), round(mean(overallxs) - E, digits = dec)),
", ",
sprintf(paste0("%.", dec, "f"), round(mean(overallxs) + E, digits = dec)),
"]")
}
if (include == 'none') {
overallstats <-
paste0(
overallN,
" ",
if(!is.null(NPdescriptives) &
(x %in% NPdescriptives)) {
overallmed
} else {overallmean} ,
" ",
"(",
if(!is.null(NPdescriptives) &
(x %in% NPdescriptives)) {
if(NPvariance=='range'){
overallrange } else if(NPvariance=='IQR') {
overallIQR } else if(NPvariance=='innerQuartiles') {
overallQuartiles
}
} else {
overallsd
},
")"
)
}
if (include == 'range') {
overallstats <- paste0(overallN,
" ",
overallmean,
" ",
"(",
overallsd,
") ",
overallrange)
}
if (include == '95ci') {
overallstats <- paste0(overallN,
" ",
overallmean,
" ",
"(",
overallsd,
") ",
overall95ci)
}
getPstats <- function(level) {
#what group level are we looking at?
grouplevel <- level
#get only the data for individuals in that group
xs <- as.numeric(xlist[data[[grouping]] == flist[level]])
#get rid of all NAs
xclean <- xs[!is.na(xs)]
#get the mean, rounded
mean <-
sprintf(paste0("%.", dec, "f"), round(mean(xclean), digits = dec))
#get the std. dev, rounded
sd1 <-
sprintf(paste0("%.", dec, "f"), round(sd(xclean), digits = dec))
#get the n
n <- length(xclean)
if (include == '95ci') {
oneSE <- sd(xclean) / sqrt(n)
E <- qt(.975, n - 1) * oneSE
one95ci <- paste0(" [",
sprintf(paste0("%.", dec, "f"), round(mean(xclean) - E, digits = dec)),
", ",
sprintf(paste0("%.", dec, "f"), round(mean(xclean) + E, digits = dec)),
"]")
}
#bind it all together
if (include == 'none') {
stats <- paste0(n, " ", mean, " ", "(", sd1, ")")
}
if (include == 'range') {
stats <- paste0(n,
" ",
mean,
" ",
"(",
sd1,
") ",
paste0(
round(range(xclean)[1], digits = dec),
", ",
round(range(xclean)[2], digits =
dec)
))
}
if (include == '95ci') {
stats <- paste0(n,
" ",
mean,
" ",
"(",
sd1,
") ",
one95ci)
}
#return the result 'up' a level to be used in the next function
return(stats)
}
getNPstats <- function(level) {
#what group level are we looking at?
grouplevel <- level
#get only the data for individuals in that group
xs <- as.numeric(xlist[data[[grouping]] == flist[level]])
#get rid of all NAs
xclean <- xs[!is.na(xs)]
med <-
sprintf(paste0("%.", dec, "f"), round(median(xclean), digits = dec))
range <-
paste0(round(range(xclean)[1], digits = dec),
", ",
round(range(xclean)[2], digits =
dec))
IQR <- round(IQR(xclean), digits = dec)
Quartiles <- paste0(round(quantile(xclean)[2], digits = dec),
", ",
round(quantile(xclean)[4], digits = dec))
#get the n
n <- length(xclean)
#bind it all together
if(NPvariance=='range') stats <- paste0(n, " ", med, " ", "(", range, ")")
if(NPvariance=='IQR') stats <- paste0(n, " ", med, " ", "(", IQR, ")")
if(NPvariance=='innerQuartiles') stats <- paste0(n, " ", med, " ", "(", Quartiles, ")")
#return the result 'up' a level to be used in the next function
return(stats)
}
#we column bind the descriptive results so that we have them for each
#level of group
firstpart <-
do.call(cbind, lapply(numdos, if (x %in% NPdescriptives) {
getNPstats
} else {
getPstats
}))
#now we decide what test to run, based off the number of groups and we
#do that test
#also decides based off parametric or non-parametric
if (numlevs == 2)
Test <-
"t.test"
else if (numlevs == 1)
Test <- "Single Sample"
else
Test <- "ANOVA"
if (Test == "Single Sample") {
pvalue <-
sprintf(paste0("%.", pdec, "f"),
round(
t.test(data[, x],
alternative = "two.sided",
mu = mu)$p.value,
digits = pdec
))
if (pvalue == 0)
pvalue <- "<.0001"
testname <- "1 sample ttest"
sigpcol = "None"
}
if (Test == "t.test") {
x1 <-
suppressWarnings(as.numeric(as.character(data[, x][data[[grouping]] == flist[1]])))
x1clean <- x1[!is.na(x1)]
x2 <-
suppressWarnings(as.numeric(as.character(data[, x][data[[grouping]] == flist[2]])))
x2clean <- x2[!is.na(x2)]
tc <- tryCatch(t.test(x1clean, x2clean),
warning = function(w)
w,
error = function (e)
e)
if (is(tc, "error")) {
testname <- "None"
sigpcol = "None"
pvalue <- 'None'
sigpcol <- "None"
} else {
t <- t.test(x1clean, x2clean)
testname <- "2-sample t-test"
sigpcol = "None"
pvalue <-
sprintf(paste0("%.", pdec, "f"),
round(t$p.value, digits = pdec))
pvalue <-
gsub("(?:^1\\.|\\G)\\K0(?=0*$)", "9", pvalue, perl = T)
pvalue <- gsub("^1\\.", "> 0.", pvalue)
pvalue <- gsub("^(0\\.0*)0$", "< \\11", pvalue)
sigpcol <- "None"
}
}
if (Test == "ANOVA") {
anovasum <-
summary(aov(data[, x] ~ factor(data[[grouping]])))[[1]][["Pr(>F)"]]
testname <- "ANOVA"
modelfortuk <- aov(data[, x] ~ factor(data[[grouping]]))
#extra bit for ANOVA is doing a TukeyHSD multiple comparisons
if (anovasum[1] <= .05) {
pairps <- TukeyHSD(modelfortuk)[[1]][, 4]
sigpsTF <- pairps <= .05
print(pairps)
print(sigpsTF)
sigps <- which(sigpsTF %in% TRUE)
print(sigps)
if (length(sigps) == 0) {
sigpcol = "None"
} else {
sigpcol = paste(unlist(sigps), collapse = " ")
}
} else {
sigpcol = "None"
}
pvalue <-
sprintf(paste0("%.", pdec, "f"),
round(anovasum[1], digits = pdec))
pvalue <-
gsub("(?:^1\\.|\\G)\\K0(?=0*$)", "9", pvalue, perl = T)
pvalue <- gsub("^1\\.", "> 0.", pvalue)
pvalue <- gsub("^(0\\.0*)0$", "< \\11", pvalue)
}
if (x %in% NPdescriptives == 1) {
resp <-
paste0("N Median (", NPvariance, ")")
} else if (include == 'none') {
resp <- "N Mean (Std.Dev.)"
} else if (include == 'range') {
resp <- "N Mean (Std.Dev.) Range"
} else {
resp <- "N Mean (Std.Dev.) [95% CI]"
}
#pull together the descriptives, and the test results for a single DV
Poneline <-
data.frame(resp,
overallstats,
firstpart,
as.character(pvalue),
testname,
sigpcol)
}
#then pull toegether the results for all DVs, and label the columns
#and rows and print it
Pmorepre <- do.call(rbind, lapply(PcontinVars, descriptive))
Pmore <- data.frame(PcontinVars, PcontinVars, Pmorepre)
colnames(Pmore) <- c(
"ForSortVars",
"Variable",
"Response",
"Overall",
if (!is.null(labels)) {
labels
} else {
flist
},
"p value",
"Test",
"Pairwise"
)
}
#For nonparametric continuous variables
if (!is.null(NPcontinVars)) {
#Get a list of the groups being compared
flist <- sort(unique(data[[grouping]]))
#Function to get descriptive statistics per group
descriptive <- function(x) {
if (verbose == T) print(paste0(x, " is now being analyzed"))
if (verbose == T) {
toNA <- nnv(data[, x])
if (length(toNA) > 0) {
print(paste0(
x,
": The following entries will be converted to NA prior to analysis"
))
print(toNA)
}
}
#data for a specified variable
xlist <- suppressWarnings(as.numeric(as.character(data[, x])))
#Number of groups being compared
numlevs <- length(flist)
#making a list of 1 to number of groups
numdos <- 1:numlevs
#finally the meat and potatoes of getting the desc. stats.
overallxs <- xlist[!is.na(xlist)]
overallN <- length(overallxs)
overallmean <-
sprintf(paste0("%.", dec, "f"), round(mean(overallxs), digits = dec))
overallsd <-
sprintf(paste0("%.", dec, "f"), round(sd(overallxs), digits = dec))
overallmed <- round(median(overallxs), digits = dec)
overallrange <-
paste0(round(range(overallxs)[1], digits = dec),
", ",
round(range(overallxs)[2], digits = dec))
if (include == '95ci') {
overallSE <- sd(overallxs) / sqrt(overallN)
E <- qt(.975, overallN - 1) * overallSE
overall95ci <- paste0(" [",
sprintf(paste0("%.", dec, "f"), round(mean(overallxs) - E, digits = dec)),
", ",
sprintf(paste0("%.", dec, "f"), round(mean(overallxs) + E, digits = dec)),
"]")
}
if (include == 'none') {
overallstats <- paste0(
overallN,
" ",
ifelse(
!is.null(NPdescriptives) & (x %in% NPdescriptives),
overallmed,
overallmean
),
" ",
"(",
if(!is.null(NPdescriptives) & (x %in% NPdescriptives)) {
overallrange
} else {overallsd}
,
")"
)
}
if (include == 'range') {
overallstats <- paste0(overallN,
" ",
overallmean,
" ",
"(",
overallsd,
") ",
overallrange)
}
if (include == '95ci') {
overallstats <- paste0(overallN,
" ",
overallmean,
" ",
"(",
overallsd,
") ",
overall95ci)
}
getPstats <- function(level) {
#what group level are we looking at?
grouplevel <- level
#get only the data for individuals in that group
xs <-
suppressWarnings(as.numeric(as.character(xlist[data[[grouping]] == flist[level]])))
#get rid of all NAs
xclean <- xs[!is.na(xs)]
#get the mean, rounded
mean <-
sprintf(paste0("%.", dec, "f"), round(mean(xclean), digits = dec))
#get the std. dev, rounded
sd1 <-
sprintf(paste0("%.", dec, "f"), round(sd(xclean), digits = dec))
#get the n
n <- length(xclean)
if (include == '95ci') {
oneSE <- sd(xclean) / sqrt(n)
E <- qt(.975, n - 1) * oneSE
one95ci <- paste0(" [",
sprintf(paste0("%.", dec, "f"), round(mean(xclean) - E, digits = dec)),
", ",
sprintf(paste0("%.", dec, "f"), round(mean(xclean) + E, digits = dec)),
"]")
}
#bind it all together
if (include == 'none') {
stats <- paste0(n, " ", mean, " ", "(", sd1, ")")
}
if (include == 'range') {
stats <- paste0(n,
" ",
mean,
" ",
"( ",
sd1,
") ",
paste0(
round(range(xclean)[1], digits = dec),
", ",
round(range(xclean)[2], digits = dec)
))
}
if (include == '95ci') {
stats <- paste0(n,
" ",
mean,
" ",
"(",
sd1,
") ",
one95ci)
}
#return the result 'up' a level to be used in the next function
return(stats)
}
getNPstats <- function(level) {
#what group level are we looking at?
grouplevel <- level
#get only the data for individuals in that group
xs <-
suppressWarnings(as.numeric(as.character(xlist[data[[grouping]] == flist[level]])))
#get rid of all NAs
xclean <- xs[!is.na(xs)]
med <-
sprintf(paste0("%.", dec, "f"), round(median(xclean), digits = dec))
IQRstat <- paste0(round(range(xclean)[1], digits = dec),
", ",
round(range(xclean)[2], digits = dec))
#get the n
n <- length(xclean)
#bind it all together
stats <- paste0(n, " ", med, " ", "(", IQRstat, ")")
#return the result 'up' a level to be used in the next function
return(stats)
}
#we column bind the descriptive results so that we have them for each
#level of group
firstpart <- do.call(cbind,
lapply(numdos,
if (x %in% NPdescriptives) {
getNPstats
} else {
getPstats
}))
#now we decide what test to run, based off the number of groups and
#we do that test also decides based off parametric or non-parametric
if (numlevs == 2)
Test <-
"t.test"
else if (numlevs == 1)
Test <- "Single Sample"
else
Test <- "ANOVA"
if (Test == "Single Sample") {
pvalue <- sprintf(paste0("%.",
pdec,
"f"),
round(wilcox.exact(data[, x])$p.value, digits =
pdec))
pvalue <-
gsub("(?:^1\\.|\\G)\\K0(?=0*$)", "9", pvalue, perl = T)
pvalue <- gsub("^1\\.", "> 0.", pvalue)
pvalue <- gsub("^(0\\.0*)0$", "< \\11", pvalue)
testname <- "Rank Sum"
sigpcol = "None"
}
if (Test == "t.test") {
x1 <-
suppressWarnings(as.numeric(as.character(data[, x][data[[grouping]] == flist[1]])))
x1clean <- x1[!is.na(x1)]
x2 <-
suppressWarnings(as.numeric(as.character(data[, x][data[[grouping]] == flist[2]])))
x2clean <- x2[!is.na(x2)]
tC <- tryCatch(
wilcox.test(x1clean, x2clean, exact = T),
warning = function(w)
w,
error = function(e)
e
)
if (is(tC, "warning")) {
ddd <-
data.frame(grp = factor(c(
rep(0, length(x1clean)), rep(1, length(x2clean))
)),
val = c(x1clean, x2clean))
t <- wilcox_test(val ~ grp, ddd)
pvalue <- sprintf(paste0("%.", pdec, "f"),
round(pvalue(t, distribution = exact),
digits = pdec))
if (verbose == T) {
print(paste0(
x,
": Ties found, exact p-value is computed using mid-ranks method"
))
}
} else {
t <- wilcox.test(x1clean, x2clean, exact = T)
pvalue <-
sprintf(paste0("%.", pdec, "f"),
round(t$p.value, digits = pdec))
}
testname <- "Mann-Whitney U"
sigpcol = "None"
pvalue <-
gsub("(?:^1\\.|\\G)\\K0(?=0*$)", "9", pvalue, perl = T)
pvalue <- gsub("^1\\.", "> 0.", pvalue)
pvalue <- gsub("^(0\\.0*)0$", "< \\11", pvalue)
sigpcol <- "None"
}
if (Test == "ANOVA") {
anovasum <-
kruskal.test(data[, x] ~ data[[grouping]])$p.value
testname <- "Kruskal-Wallis"
#Again we have to do multiple comparisons since more than 2 groups
sigpsTF <-
kruskalmc(data[, x] ~ data[[grouping]])$dif.com[, 3]
sigps <- which(sigpsTF %in% TRUE)
if (length(sigps) == 0) {
sigpcol = "None"
} else {
sigpcol = paste(unlist(sigps), collapse = " ")
}
pvalue <-
sprintf(paste0("%.", pdec, "f"),
round(anovasum[1], digits = pdec))
if (pvalue == 0)
pvalue <- "<.0001"
if (pvalue == 1)
pvalue <- ">.999"
}
if (x %in% NPdescriptives == 1) {
resp <-
"N Median (Range)"
} else if (include == 'none') {
resp <- "N Mean (Std.Dev.)"
} else if (include == 'range') {
resp <- "N Mean (Std.Dev.) Range"
} else {
resp <- "N Mean (Std.Dev.) [95% CI]"
}
#pull together the descriptives, and the test results for a single DV
NPoneline <-
data.frame(resp,
overallstats,
firstpart,
as.character(pvalue),
testname,
sigpcol)
}
#then pull toegether the results for all DVs, and label the columns and
#rows and print it
NPmorepre <- do.call(rbind, lapply(NPcontinVars, descriptive))
NPmore <- data.frame(NPcontinVars, NPcontinVars, NPmorepre)
colnames(NPmore) <-
c(
"ForSortVars",
"Variable",
"Response",
"Overall",
if (!is.null(labels)) {
labels
} else {
flist
},
"p value",
"Test",
"Pairwise"
)
}
if (!is.null(PcontinVars) & !is.null(NPcontinVars)) {
more <- rbind(Pmore, NPmore)
}
if (!is.null(PcontinVars) & is.null(NPcontinVars)) {
more <- Pmore
}
if (is.null(PcontinVars) & !is.null(NPcontinVars)) {
more <- NPmore
}
if (!is.null(PcatVars)) {
posthoc.CHI <-
function (chi,
popsInRows = FALSE,
control = c("fdr",
"BH",
"BY",
"bonferroni",
"holm",
"hochberg",
"hommel"),
digits = 4)
{
control <- match.arg(control)
tbl <- chi$observed
if (!popsInRows)
tbl <- t(tbl)
popsNames <- rownames(tbl)
prs <- combn(1:nrow(tbl), 2)
tests <- ncol(prs)
pvals <- numeric(tests)
lbls <- character(tests)
for (i in 1:tests) {
pvals[i] <- suppressWarnings(chisq.test(tbl[prs[, i],])$p.value)
lbls[i] <- paste(popsNames[prs[, i]], collapse = " vs. ")
}
adj.pvals <- p.adjust(pvals, method = control)
# cat("Adjusted p-values used the", control, "method.\n\n")
data.frame(
comparison = lbls,
raw.p = round(pvals, digits),
adj.p = round(adj.pvals, digits)
)
}
#Location on catVars for which variable name to pull
indexedListLocation = 0
get.cat.stats <-
function(catVars,
group = factor(data[, grouping]),
dec = dec,
...) {
get.chi.stuff <- function(var) {
#Don't use <<, unless you have to...
indexedListLocation <<- indexedListLocation + 1
if (verbose == T)
print(paste0(catVars[indexedListLocation], " is now being analyzed"))
#Unique levels of group
flist <- sort(unique(group))
#is our table a contingency or not?
if (length(flist) == 1) {
long <- table(var)
}
else {
long <- table(var, group)
}
overalllong <- table(var)
overallprop <-
t(t(sprintf(
paste0("%.", dec, "f"),
round(100 * prop.table(overalllong), digits = dec)
)))
if(include == "95ci") {
ciPropTable <- round(100*ciProp(var), 1)
ciPropTable <- paste0("[", ciPropTable[,1], "%, ", ciPropTable[,2], "%]")
}
#get proportions for the table
longprop <- sprintf(paste0("%.", dec, "f"),
round(
100 * prop.table(long, if (percent == "column" &
length(flist) != 1) {
2
}
else if (percent == "column") {
NULL
}
else if (percent == "row") {
1
}
else if (percent == "overall") {
NULL
}),
digits = dec
))
#pick a test to use
if (provideP == TRUE) {
if (length(flist) == 1) {
if (mean(ChiProbabilities == FALSE) == 1 |
mean(c(
j <- mean(ChiProbabilities != FALSE),
k <-
(length(levels(
factor(var)
)) != length(ChiProbabilities))
)) == 1) {
if (ChiProbabilities != FALSE) {
print(
paste0(
"Number of Probabilites listed in ChiProbabilities",
"\ndoes not equal the number of levels of ",
catVars[indexlocation]
)
)
}
test <-
chisq.test(table(as.character(var)))
} else {
test <- chisq.test(table(as.character(var)), p = ChiProbabilities)
}
pvalue <-
sprintf(paste0("%.", pdec, "f"),
round(test$p.value, digits = pdec))
pvalue <-
gsub("(?:^1\\.|\\G)\\K0(?=0*$)", "9", pvalue, perl = T)
pvalue <- gsub("^1\\.", "> 0.", pvalue)
pvalue <- gsub("^(0\\.0*)0$", "< \\11", pvalue)
sigpcol <- "N/A"
method <- "Chi-Square Test"
}
}
if (provideP == FALSE)
pvalue = "NA"
sigpcol = "NA"
method = "NA"
if (length(flist) > 1) {
if (length(levels(factor(var))) > 1) {
if (length(levels(factor(group))) > 1) {
tryC <- tryCatch(
test <- chisq.test(var, group),
warning = function(w)
w,
error = function (e)
e
)
if (grepl("'x' and 'y' must have at least", tryC[1]) |
grepl("'x' and 'y' must have at least", tryC[2])) {
pvalue <- "N/A"
sigps <- NULL
sigpcol <- "N/A"
} else {
test <- chisq.test(as.character(as.factor(var)), group)
pvalue <-
sprintf(paste0("%.", pdec, "f"),
round(test$p.value, digits = pdec))
ps <-
posthoc.CHI(test)$raw.p <= (ifelse(padjust == TRUE, .05 /
sum(1:(
length(flist)
) - 1), .05))
sigps <- which(ps %in% TRUE)
}
} else {
pvalue <- "NA"
sigps <- NULL
}
if (length(sigps) == 0) {
sigpcol = "None"
} else {
sigpcol = paste(unlist(sigps), collapse = " ")
}
}
if (length(levels(factor(var))) == 1) {
pvalue <- "N/A"
sigpcol <- "N/A"
}
pvalue <-
gsub("(?:^1\\.|\\G)\\K0(?=0*$)", "9", pvalue, perl = T)
pvalue <- gsub("^1\\.", "> 0.", pvalue)
pvalue <- gsub("^(0\\.0*)0$", "< \\11", pvalue)
method <- "Chi-Square Test"
}
#get test stats
teststats <- c(pvalue, method)
#make a matrix to put all of the results into
res <-
data.frame(matrix(NA, nrow(long), 7 + if (length(flist) == 1) {
1
} else {
ncol(long)
}))
#make the column names for the matrix
names(res) <-
c(
"ForSortVars",
"Variable",
"Response",
"Overall",
if (!is.null(labels)) {
labels
} else {
flist
},
"p value",
"Test",
"Pairwise"
)
#make the first column, first row, the name of the variable,
#from the list you provided
res[, 1] <-
rep(catVars[indexedListLocation], length(res[, 1]))
res[1, 2] <- catVars[indexedListLocation]
#here the row.names are the possible responses to the categorical
#variable
res[, 3] <- row.names(long)
#now we are putting in our table of raw counts (and percentages)
#overall
if (percent == "row") {
res[, 4] <-
paste(overalllong, paste("(", "100", "%", ")", sep = ''))
}
if (percent != "row") {
if(include != "95ci") {
res[, 4] <-
paste(overalllong, paste("(", overallprop, "%", ")", sep = ''))
}
if(include == "95ci") {
res[, 4] <-
paste(overalllong, paste("(", overallprop, "%", ") ", ciPropTable, sep = ''))
}
}
if(include == "95ci" & percent == 'row') {
getoneRow <- function(rowLevel) {
group <- factor(group)
var <- factor(var)
x = group[var == rowLevel]
ciRow <- round(100*ciProp(x), 1)
ciRow <- paste0("[", ciRow[,1], "%, ", ciRow[,2], "%]")
return(ciRow)
}
CIs <- do.call(rbind, lapply(levels(factor(var)), getoneRow))
}
if(include == "95ci" & percent == 'column') {
getoneCol <- function(colLevel) {
group <- factor(group)
var <- factor(var)
x = var[group == colLevel]
ciCol <- round(100*ciProp(x), 1)
ciCol <- paste0("[", ciCol[,1], "%, ", ciCol[,2], "%]")
# print(x)
return(ciCol)
}
CIs <- do.call(cbind, lapply(levels(factor(group)), getoneCol))
}
# number of columns being used is contengent on the size of the
# grouping variable
if(include != '95ci'){
res[, 5:(4 + length(flist))] <-
paste(long, paste("(", longprop, "%", ")", sep = ''))
} else {
res[, 5:(4 + length(flist))] <-
paste(long, paste("(", longprop, "%", ")", sep = ''), CIs)
}
#add in test stats to the appropriate columns
res[1, (5 + length(flist)):(6 + length(flist))] <-
teststats
res[1, (7 + length(flist))] <- sigpcol
#make all NA cells into blank cells
res[is.na(res)] <- " "
#and return it for later use in the higher up function
return(res)
}
#do get.chi.stuff for all categorical variables in the list
if (length(PcatVars) > 1) {
chitable <- do.call(rbind, lapply(data[, PcatVars], get.chi.stuff))
}
if (length(PcatVars) == 1) {
chitable <- get.chi.stuff(data[, PcatVars])
}
#and return that up a level
return(chitable)
}
Pcattable <-
get.cat.stats(
catVars = PcatVars,
group = data[[grouping]],
data = data,
dec = dec,
pdec =
pdec,
testtype = testtype
)
}
if (!is.null(NPcatVars)) {
need <- c("0", "1")
posthoc.CHI <-
function (chi,
popsInRows = FALSE,
control = c("fdr",
"BH",
"BY",
"bonferroni",
"holm",
"hochberg",
"hommel"),
digits = 4)
{
control <- match.arg(control)
tbl <- chi$observed
if (!popsInRows)
tbl <- t(tbl)
popsNames <- rownames(tbl)
prs <- combn(1:nrow(tbl), 2)
tests <- ncol(prs)
pvals <- numeric(tests)
lbls <- character(tests)
for (i in 1:tests) {
pvals[i] <- suppressWarnings(chisq.test(tbl[prs[, i],])$p.value)
lbls[i] <- paste(popsNames[prs[, i]], collapse = " vs. ")
}
adj.pvals <- p.adjust(pvals, method = control)
# cat("Adjusted p-values used the", control, "method.\n\n")
data.frame(
comparison = lbls,
raw.p = round(pvals, digits),
adj.p = round(adj.pvals, digits)
)
}
#Location on catVars for which variable name to pull
indexedListLocation = 0
get.cat.stats <-
function(catVars,
group = data[[grouping]],
dec = dec,
...) {
get.chi.stuff <- function(var) {
#Don't use <<, unless you have to...
indexedListLocation <<- indexedListLocation + 1
if (verbose == T)
print(paste0(catVars[indexedListLocation], " is now being analyzed"))
#Unique levels of group
flist <- sort(unique(group))
#is our table a contingency or not?
if (length(flist) == 1) {
long <- table(var)
} else {
long <- table(var, group)
}
#get proportions for the table
overalllong <- table(var)
overallprop <-
t(t(sprintf(
paste0("%.", dec, "f"),
round(100 * prop.table(overalllong), digits = dec)
)))
longprop <-
sprintf(paste0("%.", dec, "f"),
round(100 * prop.table(long, if (percent == "column") {
2
}
else if (percent == "row") {
1
}
else if (percent == "overall") {
NULL
}), digits = dec))
#pick a test to use
if (length(flist) == 1) {
if (mean(ChiProbabilities == FALSE) == 1 |
mean(c(
j <-
mean(ChiProbabilities != FALSE),
k <-
(length(levels(
factor(var)
)) != length(ChiProbabilities))
)) == 1) {
if (ChiProbabilities != FALSE) {
print(
paste0(
"Number of Probabilites listed in ChiProbabilities",
"\ndoes not equal the number of levels of ",
catVars[indexlocation]
)
)
}
test <-
chisq.test(table(as.factor(as.character(var))))
} else {
test <- chisq.test(table(as.factor(as.character(var))), p = ChiProbabilities)
}
if (pvalue != "N/A")
pvalue <-
sprintf(paste0("%.", pdec, "f"),
round(test$p.value, digits = pdec))
if (pvalue != "N/A")
pvalue <-
gsub("(?:^1\\.|\\G)\\K0(?=0*$)", "9", pvalue, perl = T)
if (pvalue != "N/A")
pvalue <- gsub("^1\\.", "> 0.", pvalue)
if (pvalue != "N/A")
pvalue <- gsub("^(0\\.0*)0$", "< \\11", pvalue)
sigpcol <- "N/A"
method <- "Fisher's Exact Test"
}
if (length(flist) > 1) {
if (length(levels(factor(var))) > 1) {
tryC <- tryCatch(
fisher.test(var, group),
warning = function(w)
w,
error = function (e)
e
)
if (grepl("'x' and 'y' must have at least", tryC[1])) {
pvalue <- "N/A"
sigpcol <- "N/A"
sigps <- NULL
} else {
test <- fisher.test(as.factor(as.character(var)), group)
test2 <- suppressWarnings(chisq.test(as.factor(as.character(var)), group))
pvalue <-
sprintf(paste0("%.", pdec, "f"),
round(test$p.value, digits = pdec))
if (pvalue != "N/A")
pvalue <-
gsub("(?:^1\\.|\\G)\\K0(?=0*$)",
"9",
pvalue,
perl = T)
if (pvalue != "N/A")
pvalue <- gsub("^1\\.", "> 0.", pvalue)
if (pvalue != "N/A")
pvalue <- gsub("^(0\\.0*)0$", "< \\11", pvalue)
ps <-
posthoc.CHI(test2)$raw.p <= (ifelse(padjust == TRUE, .05 /
sum(1:(
length(flist)
) - 1), .05))
sigps <- which(ps %in% TRUE)
if (length(sigps) == 0) {
sigpcol = "None"
} else {
sigpcol = paste(unlist(sigps), collapse = " ")
}
}
} else if (length(levels(factor(var))) == 1) {
pvalue <- "N/A"
sigpcol <- "N/A"
}
if (pvalue != "N/A")
pvalue <-
gsub("(?:^1\\.|\\G)\\K0(?=0*$)", "9", pvalue, perl = T)
if (pvalue != "N/A")
pvalue <- gsub("^1\\.", "> 0.", pvalue)
if (pvalue != "N/A")
pvalue <- gsub("^(0\\.0*)0$", "< \\11", pvalue)
method <- "Fisher's Exact"
}
# }
#get test stats
teststats <- c(pvalue, method)
#make a matrix to put all of the results into
res <-
data.frame(matrix(NA, nrow(long), 7 + if (length(flist) == 1) {
1
} else {
ncol(long)
}))
#make the column names for the matrix
names(res) <-
c(
"ForSortVars",
"Variable",
"Response",
"Overall",
if (!is.null(labels)) {
labels
} else {
flist
},
"p value",
"Test",
"Pairwise"
)
#make the first column, first row, the name of the variable,
#from the list you provided
res[, 1] <-
rep(catVars[indexedListLocation], length(res[, 1]))
res[1, 2] <- catVars[indexedListLocation]
#here the row.names are the possible responses to the categorical
#variable
res[, 3] <- row.names(long)
#now we are putting in our table of raw counts (and percentages)
#overall
if (percent == "row")
res[, 4] <-
paste(overalllong, paste("(", "100", "%", ")", sep = ''))
if (percent != "row")
res[, 4] <-
paste(overalllong, paste("(", overallprop, "%", ")", sep = ''))
#number of columns being used is contengent on the size of the
#grouping variable
res[, 5:(4 + length(flist))] <-
paste(long, paste("(", longprop, "%", ")", sep = ''))
#add in test stats to the appropriate columns
res[1, (5 + length(flist)):(6 + length(flist))] <-
teststats
res[1, (7 + length(flist))] <- sigpcol
#make all NA cells into blank cells
res[is.na(res)] <- " "
#and return it for later use in the higher up function
return(res)
}
#do get.chi.stuff for all categorical variables in the lsit
if (length(NPcatVars) > 1) {
chitable <- do.call(rbind, lapply(data[, NPcatVars], get.chi.stuff))
}
if (length(NPcatVars) == 1) {
chitable <- get.chi.stuff(data[, NPcatVars])
}
#and return that up a level
return(chitable)
}
NPcattable <-
get.cat.stats(
catVars = NPcatVars,
group = data[[grouping]],
data = data,
dec = dec,
pdec =
pdec,
testtype = testtype
)
}
if (!is.null(PcatVars) & !is.null(NPcatVars)) {
cattable <- rbind(Pcattable, NPcattable)
}
if (!is.null(PcatVars) & is.null(NPcatVars)) {
cattable <- Pcattable
}
if (is.null(PcatVars) & !is.null(NPcatVars)) {
cattable <- NPcattable
}
catVars <- NULL
continVars <- NULL
if (!is.null(PcatVars) &
!is.null(NPcatVars))
catVars <- c(PcatVars, NPcatVars)
if (!is.null(PcatVars) & is.null(NPcatVars))
catVars <- PcatVars
if (is.null(PcatVars) & !is.null(NPcatVars))
catVars <- NPcatVars
if (!is.null(PcontinVars) &
!is.null(NPcontinVars))
continVars <- c(PcontinVars, NPcontinVars)
if (!is.null(PcontinVars) &
is.null(NPcontinVars))
continVars <- PcontinVars
if (is.null(PcontinVars) &
!is.null(NPcontinVars))
continVars <- NPcontinVars
if (!is.null(catVars) & !is.null(continVars)) {
fullresults <- rbind(more, cattable)
newdat <-
data.frame(x = 1:length(fullresults[, 1]), fullresults)
}
else if (!is.null(catVars)) {
fullresults <- cattable
newdat <-
data.frame(x = 1:length(fullresults[, 1]), fullresults)
}
else if (!is.null(continVars)) {
fullresults <- more
newdat <-
data.frame(x = 1:length(fullresults[, 1]), fullresults)
}
if (!is.null(SortVars)) {
list <-
data.frame(sortnums = 1:(length(SortVars)), ForSortVars = SortVars)
total <- merge(list, newdat, by = "ForSortVars")
finaldat <- total[with(total, order(sortnums, x)),]
final <- finaldat[,-(1:3)]
}
else if (is.null(SortVars)) {
final <- newdat[,-(1:2)]
}
if (!is.null(varLabelTable)) {
final[, 1] <- as.character(final[, 1])
for (i in 1:length(final[, 1])) {
for (j in 1:length(varLabelTable[, 1])) {
if (as.character(final[i, 1]) == as.character(varLabelTable[j, 1])) {
final[i, 1] <- as.character(varLabelTable[j, 2])
}
}
}
}
names(final) <- c("Variable",
"Response",
"Overall",
if (!is.null(labels)) {
labels
} else {
as.character(sort(unique(data[[grouping]])))
},
"p value",
"Test",
"Pairwise")
if (deleteExtraneous == T) {
final <- final[,-c(3, 5:8)]
}
if (deleteExtraneous == F &
length(levels(factor(data[[grouping]]))) < 3) {
final <- final[,-c(length(final[1,]))]
}
return(final)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.