#' @title Create a descriptive statistics table
#'
#' @description
#' Generate a table of descriptive statistics with p-values obtained in tests for difference between the groups.
#'
#' @usage
#' descr(dat, group, var.names, percent.vertical = T, data.names = T, nonparametric = c(), landscape = F,
#' pos.pagebr = NULL, paired = F, var.equal = T, correct.cat = F, correct.wilcox = T, silent = T,
#' p.values = T, groupsize = F, n.or.miss = "n", group.miss = F, t.log = c(), index = T,
#' create = "tex", digits.m = 1, digits.sd = 2, digits.qu = c(), digits.minmax = 1, digits.p = 1)
#'
#' @param dat
#' Data frame. The data set to be analyzed. Can contain continuous or factor (also ordered) variables.
#' @param group
#' Vector of the grouping variable.
#' @param var.names
#' Optional. Vector of names to be used in the table for the analyzed variables.
#' @param percent.vertical
#' Logical. Should "vertical" percentages for categorical variables be provided?
#' @param data.names
#' Logical. If \code{var.names} are specified, should names as saved in \code{dat} be added in brackets?
#' @param nonparametric
#' Logical or vector of indices. If logical / vector of indices then all / only these continuous variables will be tested using non-parametric methods.
#' @param landscape
#' Logical. Should the table be in landscape? Only useful if you want create a "pdf"- or "knitr"-document in the following. (see \code{pos.pagebr})
#' @param pos.pagebr
#' Vector of positions of the pagebreak in tex (or pdf). This is a bit fuzzy. It is the number of lines after a pagebreak should be done.\cr
#' If it is not specified, 45 will be used for "\code{landscape=FALSE}" and 30 will be used for "\code{landscape=TURE}".\cr
#' Only useful if you want know the number for a pagebreak when you create a "pdf"- or "knitr"-document in the following.
#' @param paired
#' Logical. Should paired tests be applied? The groups must have the same length.
#' @param var.equal
#' Logical. Should variances be assumed to be equal when applying t-tests?
#' @param correct.cat
#' Logical. Should correction be used in chi-sqared tests (see \code{\link{chisq.test}})
#' @param correct.wilcox
#' Logical. Should correction be used in wilcoxon tests (see \code{\link{wilcox.test}})
#' @param silent
#' Logical. Should intermediate stages be shown (more for technical reasons)?
#' @param p.values
#' Logical. Should calculate p-values? If you won't p-values \code{index} were set to \code{FALSE}.
#' @param groupsize
#' Logical. Should be checked for each variable whether the groups contain at least two cases.
#' Number.Instead of two any other number.
#' @param n.or.miss
#' Should the number of observations, missings for continuous variables, and/or missings for categorical variables be provided ("n", "miss", "miss.cat")? Combinations are allowed.
#' @param group.miss
#' Logical. Schould add a column for the Missings in group?
#' @param t.log
#' Vector of indices: The variables for which the log of the original data should be used when testing for a difference between the groups.
#' @param index
#' Logical. Should the tests used be labeled by footnotes? Only usefull if "p-values" in \code{which.col}.
#' @param create
#' Which output document should be produced in the following step (one of "pdf", "tex", "knitr", "word" or "R").
#' @param digits.m
#' Number of digits for presentation in the table: For mean.
#' @param digits.sd
#' Number of digits for presentation in the table: For standard deviation.
#' @param digits.qu
#' Vector of numbers of digits for presentation in the table: For quantiles (if no value is specified it will be tried to provide a reasonable presentation).
#' @param digits.minmax
#' Number of digits for presentation in the table: For minimum and maximum.
#' @param digits.p
#' Number of digits for presentation in the table: For percentages.
#'
#' @return
#' Depending on the value of the create parameter either pdf, word, tex, R or an file optimized for use in connection with knitr will be created containing the descriptive statistics table with the speak for the document to create in the following.
#' For example you choose \code{create="pdf"} then the table is written in \code{TeX}-Code.
#' Attention: the table has no caption and numbers of observations per group.
#'
#' @author Lorenz Uhlmann, Csilla van Lunteren
#'
#' @seealso
#' \code{\link{med.new}}\cr
#' \code{\link{inqur}}\cr
#' \code{\link{minmax}}\cr
#' \code{\link{f.r}}\cr
#' \code{\link{formatr}}\cr
#' \code{\link{m.cat}}\cr
#' \code{\link{m.cont}}\cr
#' \code{\link{p.cat}}\cr
#' \code{\link{p.cont}}\cr
#'
#' @examples
#' \dontrun{
#' ##Dataset with two groups
#' infert
#' attach(infert)
#'
#' #is.factor(education)
#' #is.factor(age)
#' #is.factor(parity)
#' #is.factor(induced)
#' #is.factor(case)
#' #infert$case<-as.factor(case)
#' #case<-as.factor(case)
#'
#' #is.factor(spontaneous)
#' #is.factor(stratum)
#' #is.factor(pooled.stratum)
#'
#' #we use case as Grouping variable
#'
#' #Version 1
#' descr(dat = infert, group = case, var.names = c("education", "age", "parity", "induced", "spontaneous", "stratum", "pooled.stratum"),create = "word")
#' #Version 2
#' group <- case
#' dat <- infert[,-5]
#' descr(dat = dat, group = group, var.names = c("education", "age", "parity", "induced", "spontaneous", "stratum", "pooled.stratum"),create = "word")
#'
#'
#' ##Dataset with more then two groups
#'
#' ChickWeight
#' attach(ChickWeight)
#' #is.factor(weight)
#' #is.factor(Time)
#' #is.factor(Chick)
#' #is.factor(Diet)
#'
#' #we use Diet as Grouping variable
#'
#' #Version 1
#' descr(dat = ChickWeight, group = Diet, var.names = c("weight", "Time", "Chick"), create = "word")
#'
#' #Version 4
#' group <- Diet
#' dat <- ChickWeight[,-4]
#' descr(dat = dat, group = group, var.names = c("weight", "Time", "Chick"), create = "word")
#' }
#'
descr <- function(dat, group, var.names, percent.vertical = T, data.names = T, nonparametric = c(), landscape = F,
pos.pagebr = NULL, paired = F, var.equal = T, correct.cat = F, correct.wilcox = T, silent = T,
p.values = T, groupsize = F, n.or.miss = "n", group.miss = F, t.log = c(), index = T, create = "tex", digits.m = 1,
digits.sd = 2, digits.qu = c(), digits.minmax = 1, digits.p = 1) {
if (is.null(nonparametric))
nonparametric <- rep(F, ncol(dat))
if (is.null(t.log))
t.log <- rep(F, ncol(dat))
l.i <- 0
testings <- c()
pos <- c()
pos.i <- 0
pos.mult <- 1
index_var <- c()
datmiss <- matrix(NA, nrow = nrow(dat), ncol = ncol(dat))
group.na.index <- which(is.na(group))
if (group.miss) {
groupmiss <- as.numeric(group)
groupmiss[group.na.index] <- "NA"
datmiss <- dat
}
if (length(group.na.index) != 0) {
warning(paste("Missing values in the group variable ( Observations: ", list(group.na.index), " )! Observations will be removed!"))
dat <- dat[-group.na.index, ]
group <- group[-group.na.index]
}
lgr <- length(levels(group))
for (i in 1:ncol(dat)) {
gr.miss <- c()
if (is.factor(dat[ ,i])) {
a <- table(dat[ ,i], group)
if (group.miss & length(group.na.index) != 0)
gr.miss <- c(gr.miss, table(datmiss[ ,i], groupmiss)[ ,which(colnames(table(datmiss[ ,i], groupmiss)) == "NA")])
if (group.miss & length(group.na.index) == 0)
gr.miss <- rep(0,length(levels(dat[,i])))
d <- table(dat[ ,i])
if (percent.vertical == T) {
b <- prop.table(as.matrix(table(dat[ ,i], group)), 2)
e <- prop.table(as.matrix(table(dat[ ,i])), 2)
ab <- data.frame(cbind(a[ ,1], b[ ,1]*100))
if (ncol(a) >= 2) {
for (k in 2:ncol(a))
ab <- data.frame(cbind(ab, a[ ,k], b[ ,k]*100))
}
if (group.miss) {
ab <- data.frame(cbind(ab, as.vector(d) + gr.miss, as.vector(prop.table((as.matrix(table(dat[ ,i])) + gr.miss), 2))*100))
} else {
ab <- data.frame(cbind(ab, as.vector(d), as.vector(e) * 100))
}
if (create == "word" | create == "R") {
for (j in seq(1, 2 * (length(levels(group))) + 1, by = 2)) {
for (k in 1:nrow(ab)) {
if (ab[k, j] != 0 ) {
ab[k, j] <- paste(ab[k, j], " (", formatr(ab[k, (j + 1)], digits.p), "%)", sep = "")
} else {
ab[k, j] <- paste(ab[k, j], " ( - ) ")
}
}
}
} else {
for (j in seq(1, 2 * (length(levels(group))) + 1, by = 2)) {
for (k in 1:nrow(ab))
ab[k, j] <- paste(ab[k, j], " (", formatr(ab[k, (j + 1)], digits.p), "\\%)", sep = "")
}
}
} else {
b <- prop.table(as.matrix(table(dat[, i], group)), 1)
ab <- data.frame(cbind(a[, 1], b[, 1] * 100))
if (ncol(a) >= 2) {
for (k in 2:ncol(a)) {
ab <- data.frame(cbind(ab, a[, k], b[, k] * 100))
}
}
if (group.miss) {
ab <- data.frame(cbind(ab, as.vector(d) + gr.miss))
} else {
ab <- data.frame(cbind(ab, as.vector(d)))
}
if (create == "word" | create == "R") {
for (j in seq(1, 2 * (length(levels(group))), by = 2)) {
for (k in 1:nrow(ab)) {
ab[k, j] <- paste(ab[k, j], " (", formatr(ab[k, (j + 1)], digits.p), "%)", sep = "")
}
}
} else {
for (j in seq(1, 2 * (length(levels(group))), by = 2)) {
for (k in 1:nrow(ab)) {
ab[k, j] <- paste(ab[k, j], " (", formatr(ab[k, (j + 1)], digits.p), "\\%)", sep = "")
}
}
}
}
index.delete <- (1:(length(levels(group)) + 1)) * 2
ab <- ab[, -index.delete]
if ("miss.cat" %in% n.or.miss) {
miss.end <- c()
for (j in 1:lgr) {
miss.j <- length(which(is.na(dat[which(group == levels(group)[j]), i])))
miss.end <- c(miss.end, miss.j)
}
miss.all <- length(which(is.na(dat[ ,i])))
ab <- rbind(ab, c(miss.end, miss.all))
if (group.miss) {
gr.miss <- c(gr.miss, length(which(is.na(datmiss[which(groupmiss == "NA"), i]))))
}
}
ab <- rbind(rep("", lgr + 1), ab, rep("", lgr + 1))
if (group.miss)
gr.miss <- c("", gr.miss, "")
if (!(missing(var.names))) {
if (data.names == T) {
var.n1 <- var.names[i]
var.n2 <- paste(paste("(", names(dat)[i], sep = ""), ")", sep = "")
var.n <- paste(var.n1, var.n2, sep = " ")
} else {
var.n <- var.names[i]
}
} else {
var.n <- names(dat)[i]
}
ab <- cbind(NA, ab)
ab <- as.data.frame(ab)
levels(dat[,i]) <- paste(" ", levels(dat[,i]))
if (create == "R")
levels(dat[,i]) <- paste("- ", levels(dat[,i]))
if ("miss.cat" %in% n.or.miss) {
if (create == "R"){
ab[ ,1] <- c(var.n, levels(dat[ ,i]), "- Missing", "")
} else {
ab[ ,1] <- c(var.n, levels(dat[ ,i]), " Missing", "")
}
} else {
ab[ ,1] <- c(var.n, levels(dat[ ,i]), "")
}
if (create != "word" & create != "R") {
for (j in 1:(length(levels(dat[, i])) + ("miss.cat" %in% n.or.miss))) {
ab[j + 1, 1] <- paste("\\hspace{2ex}", ab[j + 1, 1])
}
} else {
for (j in 1:(length(levels(dat[ ,i])) + ("miss.cat" %in% n.or.miss))) {
ab[j + 1, 1] <- paste(" ", ab[j + 1, 1])
}
}
if (group.miss) {
ab <- cbind(ab, gr.miss)
}
pvalues_var <- matrix(F, ncol = ncol(dat))
pvalues_var[i] <- p.values
if (pvalues_var[i]) {
a <- dat[ ,i]
a.list <- list()
a.list[[(length(levels(group)) + 1)]] <- na.omit(a)
n.vector <- c()
for (k in 1:length(levels(group))) {
a.list[[k]] <- na.omit(a[which(group == levels(group)[k])])
n.vector <- c(n.vector, length(a.list[[k]]))
}
for (l in 1:length(levels(group))) {
if (n.vector[l] < groupsize) {
pvalues_var[i] <- F
}
}
if (length(table(dat[,i])) == 1) {
pvalues_var[i] = F
} else {
for (l in 1:length(table(dat[,i]))) {
if(table(dat[,i])[l] == 0) {
pvalues_var[i] = F
}
}
}
if (pvalues_var[i]) {
if (index) {
m <- m.cat(group, paired = paired, is.ordered = is.ordered(dat[, i]))
if (!(m %in% testings)) {
testings <- c(testings, m)
l.i <- l.i + 1
index.i <- letters[l.i]
}
} else {
index.i <- c()
}
pv <- p.cat(dat[ ,i], group, paired = paired, is.ordered = is.ordered(dat[ ,i]),
correct.cat = correct.cat, correct.wilcox = correct.wilcox, index = index.i,
create = create)
index_var[i] <- T
} else {
index_var[i] <- F
pv <- "--"
}
ab <- cbind(ab, c("", pv, rep("", length(levels(dat[, i])) + ("miss.cat" %in% n.or.miss))))
}
pos.i.alt <- pos.i
pos.i <- pos.i + length(levels(dat[ ,i])) + 2 + length(n.or.miss)
} else {
a <- dat[, i]
a.list <- list()
a.list[[(length(levels(group)) + 1)]] <- na.omit(a)
n.vector <- c()
for (k in 1:length(levels(group))) {
a.list[[k]] <- na.omit(a[which(group == levels(group)[k])])
n.vector <- c(n.vector, length(a.list[[k]]))
}
n.vector <- c(n.vector, length(na.omit(a)))
n.miss <- c()
for (k in 1:length(levels(group))) {
miss.k <- which(is.na(a[which(group == levels(group)[k])]))
n.miss <- c(n.miss, length(miss.k))
}
n.miss <- c(n.miss, length(which(is.na(a))))
ab <- matrix(NA, nrow = 6, ncol = (length(levels(group)) + 1))
if(group.miss) {
a.miss <- datmiss[ ,i]
a.miss <- na.omit(a.miss)
a.list.miss <- na.omit(a.miss[which(groupmiss=="NA")])
n.vector.miss <- length(a.list.miss)
miss.k.miss <- which(is.na(a.miss[which(groupmiss == "NA")]))
n.miss.miss <- c(length(miss.k.miss))
n.vector <- c(n.vector, n.vector.miss)
n.miss <- c(n.miss, n.miss.miss)
a.list[[length(levels(group)) + 2]] <- a.list.miss
ab <- matrix(NA, nrow = 6, ncol = (length(levels(group)) + 2))
}
if ("n" %in% n.or.miss)
ab[1, ] <- n.vector
if (!("n" %in% n.or.miss) & "miss" %in% n.or.miss)
ab[1, ] <- n.miss
for (d in 1:length(a.list)) {
if (length(a.list[[d]]) != 0) {
ab[2, d] <- formatr(mean(a.list[[d]]), digits.m)
ab[3, d] <- formatr(sd(a.list[[d]]), digits.sd)
}else{
ab[2,d] <- "-"
ab[3,d] <- "-"
}
}
ab[4, ] <- sapply(a.list, med.new, simplify = T, k = digits.qu)
ab[5, ] <- sapply(a.list, inqur, simplify = T, k = digits.qu)
ab[6, ] <- sapply(a.list, minmax, simplify = T, k = digits.minmax)
if ("n" %in% n.or.miss & "miss" %in% n.or.miss)
ab <- rbind(ab[1, ], n.miss, ab[2:6, ])
row.names(ab) <- NULL
if (group.miss) {
ab <- rbind(rep("", (length(levels(group)) + 2)), ab, rep("", (length(levels(group)) + 2)))
} else {
ab <- rbind(rep("", (length(levels(group)) + 1)), ab, rep("", (length(levels(group)) + 1)))
}
if (!(missing(var.names))) {
if (data.names == T) {
var.n1 <- var.names[i]
var.n2 <- paste(paste("(", names(dat)[i], sep = ""), ")", sep = "")
var.n <- paste(var.n1, var.n2, sep = " ")
} else {
var.n <- var.names[i]
}
} else {
var.n <- names(dat)[i]
}
if (create == "word") {
row.ab <- c()
if ("n" %in% n.or.miss)
row.ab <- c(row.ab, " N")
if ("miss" %in% n.or.miss)
row.ab <- c(row.ab, " Missing")
if (!("n" %in% n.or.miss) & !("miss" %in% n.or.miss))
row.ab <- c(row.ab, " ")
row.ab <- c(row.ab, " Mean", " SD", " Median", " Q1 -- Q3", " Min. -- Max.")
} else if (create == "R"){
row.ab <- c()
if ("n" %in% n.or.miss)
row.ab <- c(row.ab, " - N")
if ("miss" %in% n.or.miss)
row.ab <- c(row.ab, " - Missing")
if (!("n" %in% n.or.miss) & !("miss" %in% n.or.miss))
row.ab <- c(row.ab, " ")
row.ab <- c(row.ab, " - Mean", " - SD", " - Median", " - Q1 -- Q3", " - Min. -- Max.")
} else {
row.ab <- c()
if ("n" %in% n.or.miss)
row.ab <- c(row.ab, " \\hspace{2ex} N ")
if ("miss" %in% n.or.miss)
row.ab <- c(row.ab, "\\hspace{2ex} Missing")
if (!("n" %in% n.or.miss) & ! ("miss" %in% n.or.miss))
row.ab <- c(row.ab, " ")
row.ab <- c(row.ab, "\\hspace{2ex} Mean", "\\hspace{2ex} SD", "\\hspace{2ex} Median",
"\\hspace{2ex} Q1 -- Q3", "\\hspace{2ex} Min. -- Max.")
}
ab <- as.data.frame(ab)
ab <- cbind(NA, ab)
ab[, 1] <- c(var.n, row.ab, "")
pvalues_var <- matrix(F, ncol = ncol(dat))
pvalues_var[i] <- p.values
if (pvalues_var[i]) {
for (l in 1:length(levels(group))) {
if (n.vector[ -length(n.vector)][l] < groupsize)
pvalues_var[i] <- F
}
if (length(table(dat[,i])) == 1) {
pvalues_var[i] = F
} else {
for (l in 1:length(table(dat[,i]))) {
if (table(dat[,i])[l] == 0)
pvalues_var[i] = F
}
}
if (pvalues_var[i]) {
if (index) {
m <- m.cont(group, paired = paired, is.ordered = is.ordered(dat[, i]),
nonparametric = nonparametric[i], t.log = t.log[i])
if (!(m %in% testings)) {
testings <- c(testings, m)
l.i <- l.i + 1
index.i <- letters[l.i]
}
} else {
index.i <- c()
}
pv <- p.cont(dat[ ,i], group, paired = paired, is.ordered = is.ordered(dat[ ,i]),
nonparametric = nonparametric[i], t.log = t.log[i], var.equal = var.equal,
index = index.i, create = create)
index_var[i] <- T
} else {
index_var[i] <- F
pv <- "--"
}
if (!("miss" %in% n.or.miss) & !("n" %in% n.or.miss)) {
ab <- cbind(ab, c("","", pv, rep("", 5 + ("n" %in% n.or.miss & "miss" %in% n.or.miss))))
ab <- ab[-2, ]
} else {
ab <- cbind(ab, c("", pv, rep("", 6 + ("n" %in% n.or.miss & "miss" %in% n.or.miss))))
}
}
pos.i.alt <- pos.i
pos.i <- pos.i + 8 + ("n" %in% n.or.miss & "miss" %in% n.or.miss)
}
names(ab) <- 1:length(ab)
if (i == 1) {
ab1 <- ab
} else {
ab1 <- rbind(ab1, ab)
}
if (!silent)
print(list("i" = i, "pos.i" = pos.i, "ab1" = ab1))
if (landscape == F) {
if (is.null(pos.pagebr))
pos.pagebr <- 45
} else {
if (is.null(pos.pagebr)) {
pos.pagebr <- 30
}
}
if (pos.i > pos.pagebr * pos.mult & i <= ncol(dat)) {
pos <- c(pos, pos.i.alt)
pos.mult <- pos.mult + 1
}
}
return(list("descr" = ab1, "pos" = pos, "pos.pagebr" = pos.pagebr, "testings" = testings, "pvalues_var" = pvalues_var))
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.