#' Table 1
#'
#' @description Perform a table 1 like descriptive table
#'
#' @param DF dataframe : data
#' @param explicatives : (optionnal) columns of the dataframe to be analysed
#' all the columns if let empty
#' @param y character (optionnal) : column in DF to separate the table
#' @param round (optionnal) : number of digits to be display in the results. If a vector is provided, digits stands for p-value then variables.
#' @param overall booleen (optionnal) : TRUE if an "overall" column is wanted
#' @param mutation numeric : number of modalities to display for one variable. If there is more than "mutation" categories, the modalities after this threeshold are wrapped into an "others" categorie.
#' @param tests character / vector (optionnal) : classical (student, khi2...) or logistic regression. Specify if paired.
#' @param y_test character (optionnal) : name of the column that will be used for statistical test
#' @param exit character : 'html', 'console' or 'excel'
#' @param strata : columns with the number of the paires
#'
#' @return dataframe, excel file or html table depending on the exit parameter
#' @export
#' @import stringr
#' @import stats
#' @import flextable
#' @import MASS
#' @import pbapply
#' @import stats
table1 <- function(DF,
explicatives = columns(y),
y = NULL,
overall = TRUE,
tests = "classic",
strata = NULL,
adjusted = NULL,
mutation = 10,
round = c(3,2),
exit = 'console',
num_function = "auto",
available_data = TRUE,
by_line = FALSE,
complete = F)
{
##################################################
# Arguments verification / transformation #
##################################################
## EXPLICATIVES ## v1.0
#####
if (is.matrix(explicatives) | is.data.frame(explicatives)){
explicatives_matrix <- explicatives
} else {
explicatives_matrix <- get_explicatives_matrix(explicatives,DF)
}
explicatives <- rownames(explicatives_matrix)
if (! all(explicatives %in% colnames(DF)))
stop(paste0(explicatives[!(explicatives %in% colnames(DF))], " not in DF"))
#####
## DF
#####
if (is.data.frame(DF) || is.matrix(DF)) {
DF <- as.data.frame(DF, row.names = NULL)
format_data(DF) -> DF
if (any(make.names(colnames(DF)) != colnames(DF))) {
message('Names are not fitted, names are generated')
colnames(DF) <- colnames_prep(colnames(DF))
explicatives <- explicatives_matrix[, 1] <- colnames_prep(explicatives)
}
} else
stop("No dataframe DF has been provided")
#autostats::format_data(DF) -> DF #plain and no plural factors
#####
## y
#####
if (!is.null(y)) {
if (!is.matrix(y)) {
y <- colnames_prep(y)
y_vector <- get_y_vector(y, DF)
} else {
y_vector <- y
y <- y["y"] <- colnames_prep(y["y"])
}
y_test <- y_vector["y_test"]
DF[, y] <- as.factor(DF[, y])
levels_y <- length(levels(DF[, y]))
DF_without_y_and_all <- DF_without_y <- DF[, -match(y, colnames(DF))]
ynames <- y_vector["names"]
if (is.na(ynames))
ynames <- levels(as.factor(DF[,y]))
if (!is.null(y_test) & (y != y_test))
DF_without_y_and_all <- DF_without_y[, -match(y_test, colnames(DF_without_y))]
if (!is.null(strata)) {
paired = TRUE
for (colstrata in strata) {
DF_without_y_and_all <- DF_without_y_and_all[,-match(colstrata, colnames(DF_without_y_and_all))]
}
} else{
paired = FALSE
}
if (!is.null(adjusted))
DF_without_y_and_all <- DF_without_y_and_all[, -match(adjusted, colnames(DF_without_y_and_all))]
if (!is.null(y_test)) {
if(y == y_test){
DF <- DF[,c(y, explicatives, strata)]
} else {
DF <- DF[,c(y, explicatives, strata, y_test)]
}
} else { DF <- DF[, c(explicatives)]}
} else{ #y is null
tests = ynames = NULL
overall = TRUE
levels_y = 0
DF_without_y = DF_without_y_and_all = DF
}
# Options verification # 1.5
## Options
#####
if (!is.logical(overall))
stop("'overall' should be a booleen")
if (!is.numeric(mutation))
stop("'mutation' should be numeric")
if (any(c("IQR","quartiles","[Q1-Q3]","Q1Q3","IQQ","QQ") %in% num_function)){
num_function <- c("median","quartile")
}
if (!is.null(tests)){
method <- tests
paired = "paired" %in% tests
tests <- TRUE
} else {
tests <- paired <- method <- FALSE
}
#####
# Round transformation # 1.0
#####
if (is.vector(round)) {
round_p <- round[1]
round <- round[2]
} else if (!is.numeric(round)) {
stop("'round' should be numeric or a vector of numerics")
} else {
round_p <- round
}
#####
##################################################
##################################################
# HEADER #
##################################################
tabf <- c("Characteristics")
if (available_data)
tabf <- c(tabf, "Available data")
if (overall)
tabf <- c(tabf, "Overall")
tabf <- c(tabf, ynames)
if (tests & (method == 'logistic'))
tabf <- c(tabf, 'OR')
if (tests)
tabf <- c(tabf, 'p-value')
first_column <- match(ynames[1], tabf)
ligne2 <- rep("", length(tabf))
if (!is.null(y)) {
for (k in 1:levels_y) {
ligne2[k + first_column - 1] <- paste0("N = ", table(DF[, y])[k])
}
numbers_observation <- ligne2
if (overall) {
overall_observations <- sum(table(DF[, y]))
ligne2[first_column - 1] <- paste0("N = ", overall_observations)
}
} else {
overall_observations <- sum(nrow(DF))
ligne2[first_column - 1] <- paste0("N = ", overall_observations)
}
tabf <- rbind(tabf, ligne2)
##################################################
########################################################
### Loop for each characteristics (var) ###
########################################################
table1_one_var <- function(X) {
varname <- as.character(X[1])
var <- DF_without_y_and_all[,varname]
n_available <- sum(!is.na(var))
ligne1 <- as.character(X[2])
sign <- NULL # will store a note depending on test performed (fischer, Wilcoxon...)
if (by_line) {
expression = c(varname, "y_b", adjusted)
} else {
expression = c("y_b", varname, adjusted)
}
#####################################
#--------------------------------------------------------------------------------------------------------------------#
#-- if numeric ----------------------------------------------------------------------------------------------------#
#--------------------------------------------------------------------------------------------------------------------#
if (! as.logical(X[5])) {
var <- as.numeric(var)
if (available_data)
ligne1 <- c(ligne1, n_available)
### column Legend
if (length(num_function) < 3) {
ligne2 <- ligne3 <- NULL
} else {
median_bis <- ifelse("quartile" %in% num_function,
"[Q1-Q3]","[min - max]")
ligne2 <- ifelse(exit == "html",
"\t \t Mean (SD)",
" Mean (SD)")
ligne3 <- ifelse(exit == "html",
paste0("\t \t Median ", median_bis),
paste0(" Median ", median_bis))
}
#### Overall calculation v1.5
shapiro_overall <- !((shapiro.test(var)$p.value < 0.05) & length(var) < 30)
ligne_mediane <- paste0(round(median(var, na.rm = TRUE), round),
" [",
ifelse(
!any("quartile" %in% num_function) & !any("auto" %in% num_function),
round(min(var, na.rm = TRUE), round),
round(quantile(var, na.rm = TRUE)[2], round)),
" - ",
ifelse(
!any("quartile" %in% num_function) & !any("auto" %in% num_function),
round(max(var, na.rm = TRUE), round),
round(quantile(var, na.rm = TRUE)[4], round)
),
"] ")
ligne_moyenne <- paste0(round(mean(var, na.rm = TRUE), round),
" (", round(sd(var, na.rm = TRUE), round),")")
if (overall) {
if (length(num_function) < 3 & !any("auto" %in% num_function)) {
ligne1 <- c(ligne1,
ifelse(any("mean" %in% num_function),ligne_moyenne,ligne_mediane))
} else if ("auto" %in% num_function) {
ligne1 <- c(ligne1,
ifelse(shapiro_overall,ligne_moyenne,ligne_mediane))
} else {
ligne1 <- c(ligne1, " ")
ligne2 <- c(ligne2, ligne_moyenne)
ligne3 <- c(ligne3, ligne_mediane)
}
}
####
#### Calculation for variable ~ y v1.5
if (!is.null(y)) {
mean_vars <- aggregate(var ~ DF[, y], FUN = "mean")
median_vars <- aggregate(var ~ DF[, y], FUN = "median")
sd_vars <- aggregate(var ~ DF[, y], FUN = "sd")
min_vars <- aggregate(var ~ DF[, y], FUN = "min")
max_vars <- aggregate(var ~ DF[, y], FUN = "max")
quartiles_vars <- aggregate(var ~ DF[, y], FUN = "quantile")
length_vars <- aggregate(var ~ DF[, y], FUN = "length")
if (any(length_vars$var < 6) | any(sd_vars$var == 0)) {
normal <- FALSE
} else {
shapiro_vars <- aggregate(var ~ DF[, y], FUN = "shapiro.test")
normal <- ! (any(shapiro_vars[, 2][, 2] < 0.05)) # si un test est positif => pas de normalite
}
# round and save all results
for (j in 1:levels_y) {
mean_vars_level <- round(mean_vars[j, 2], round)
sd_vars_level <- round(sd_vars[j, 2], round)
median_vars_level <- round(median_vars[j, 2], round)
min_vars_level <- round(min_vars[j, 2], round)
max_vars_level <- round(max_vars[j, 2], round)
quartiles_vars_level <- round(quartiles_vars[j, 2],round)
ligne_moyenne <- paste0(mean_vars_level, " ? ", sd_vars_level)
ligne_mediane <- paste0(
median_vars_level,
" [",
ifelse("quartile" %in% num_function,
quartiles_vars_level[2],
min_vars_level),
" - ",
ifelse(
"quartile" %in% num_function,
quartiles_vars_level[4],
max_vars_level),
"] ")
if (length(num_function) < 3 & !("auto" %in% num_function)) {
ligne1 <- c(ligne1,
ifelse("mean" %in% num_function,ligne_moyenne,ligne_mediane))
} else if ("auto" %in% num_function) {
ligne1 <- c(ligne1,ifelse(normal,ligne_moyenne,ligne_mediane))
} else{
ligne1 <- c(ligne1, " ")
ligne2 <- c(ligne2,ligne_moyenne)
ligne3 <- c(ligne3,ligne_mediane)
}
}
}
####
##### STATISTICAL TESTS ##### v1.5
### if not paired ###
if (tests & !paired) {
p <- "-" ## Control
if (method == 'logistic') {
regression <- glm(DF[, expression], family = "binomial")
p <- anova(regression, test = "LRT")$Pr[2]
OR <- round(exp(summary(regression)$coefficients[2, 1]),round)
IC <- round(exp(confint(regression)), round)
OR <- paste0(OR, ' (', IC[2, 1], '-', IC[2, 2], ')')
p <- roundp(p,n = round_p)
ligne1 <- c(ligne1, OR, p)
} else{
if (normal & levels_y < 3) {
if (nrow(mean_vars) == levels_y) {
variance_equal <- var.test(var ~ DF[, y])$p.value > 0.05
} else {
variance_equal <- FALSE
p <- paste0(p, " (a)")
}
p <- roundp(t.test(var ~ DF[, y], var.equal = variance_equal)$p.value, round_p)
} else {
if (levels_y == 2){
withCallingHandlers(
warning = function(cnd) {
p <- roundp(kruskal.test(var ~ DF[, y])$p.value, round_p)
p <- paste0(p, " (d)")
},{
p <- roundp(wilcox.test(var ~ DF[, y])$p.value, round_p)
p <- paste0(p, " (c)")
}
)
}else{
p <- roundp(kruskal.test(var ~ DF[, y])$p.value, round_p)
p <- paste0(p, " (d)")
}
}
}
ligne1 <- c(ligne1, p)
} else if (tests & paired) { ### if paired ###
if (method == 'logistic') {
# Composition of the dataframe with each observations
Strate <- var_strat <- y_strat <- vector()
for (strate_i in strata) {
Strate <- c(Strate, DF[, strate_i])
var_strat <- c(var_strat, var)
y_strat <- c(y_strat, DF[, y])
}
DF_var <-
as.data.frame(cbind(as.numeric(var_strat), y_strat - 1, Strate)) #y strat-1 cause the process changes the y_strat from 0/1 to 1/2
DF_var <- DF_var[complete.cases(DF_var), ]
colnames(DF_var) <- c("variable", "y", "Strate")
p <- anova(
survival::clogit(
expression + strata(Strate),
data = DF_var,
family = "binomial"),test = "LRT")$Pr[2]
OR <- exp(summary(
survival::clogit(
expression + strata(Strate),
data = DF_var,
family = "binomial"))$coefficients[2, 1])
OR <- round(OR, round)
p <- roundp(p, round_p)
ligne1 <- c(ligne1, OR, p)
} else {
nrow = 0
DF_stratified <- as.data.frame(matrix(ncol = 2, nrow = 1))
colnames(DF_stratified) <- levels(DF$y_b)
for (strate_i in strata) {
for (a in 1:(length(DF[!is.na(DF[, strate_i]), strate_i]) / 2)) {
nrow <- 1 + nrow
n <- DF[a, strate_i] #num de la paire
DF[DF[, strate_i] == n &
!is.na(DF[, strate_i]), c("y_b", varname)] -> tab_1 # resultat de la paire
c(tab_1[tab_1[, "y_b"] == levels(DF$y_b)[1], varname], tab_1[tab_1[, "y_b"] == levels(DF$y_b)[2], varname]) -> DF_stratified[nrow, ]
}
}
p <- t.test(DF_stratified[, 1], DF_stratified[, 2], paired = TRUE)$p.value
p <- roundp(p, round_p)
ligne1 <- c(ligne1, p)
}
}
ligne <- ligne1
if (!is.null(ligne2))
ligne <- rbind(ligne, c(ligne2, " "))
if (!is.null(ligne3))
ligne <- rbind(ligne, c(ligne3, " "))
} # if numeric
#--------------------------------------------------------------------------------------------------------------------#
#-- if non numeric ------------------------------------------------------------------------------------------------#
#--------------------------------------------------------------------------------------------------------------------#
else {
clig <- NULL
var <- sapply(var, ## convert modality's lenght is > 40 characters to first 40 chr
function(x) {
if (stringr::str_length(x) > 40 & !is.na(x)) {
paste0(substr(x, 1, 40), "...")
} else{x}
})
#### Transformation into factor and mutation if needed
var[var == "NA"] <- NA
tryCatch(relevel(as.factor(as.character(var)),as.character(X[6])),
error = function(e) {
print(as.character(X[6]))
print(DF[,varname])
stop(paste0("You tried to assign < ", as.character(X[6]),
" > for the variable <",varname,
" > but it doesn't exists"))}) -> var
# if there is more than 'mutation' modalities, the last modalities are grouped in 'others' modality
if (length(levels(var)) >= mutation) {
nvar <- as.vector(var)
for (other_levels_i in mutation:length(levels(var))) {
other_levels <- levels(var)[other_levels_i]
nvar[nvar == other_levels] <- "others"
}
as.factor(nvar) -> var
}
#---#
if (length(levels(var)) >= 2) {
if (!is.null(y)) {
tb <- table(DF[, y],var, useNA = "no")
tbm <- margin.table(tb, 1)
# conditions verification
verif_level <- margin.table(table(var, DF[, y]), 2)
verif <- ! any(sapply(verif_level, function(lev) lev == 0)) # false if 0 elements in a group
# Chi2 conditions
if (verif) {
condition_chi2 <- 0
condition_chi2_B <- TRUE
for (m in chisq.test(var, DF[, y], simulate.p.value = TRUE)$expected) {
if (m < 5) { # counting EXPECTED modalities < 5. if one > 3 and only 2 column we can apply yate's correction otherwise we apply Fisher
condition_chi2_B <- FALSE
if (m < 3) {
condition_chi2_B <- FALSE
break
} else{
condition_chi2 <- condition_chi2 + 1
}
}
}
if (condition_chi2 > 1 | (condition_chi2 == 1 & length(levels(var)) > 2)) {
condition_chi2_B <- FALSE
}
## if paired ##
if (paired & tests) {
Strate <- var_strat <- y_strat <- vector()
for (strate_i in strata) {
Strate <- c(Strate, DF[, strate_i])
var_strat <- c(var_strat, var)
y_strat <- c(y_strat, DF[, y])
}
DF_var <-as.data.frame(cbind(
as.numeric(var_strat),
y_strat - 1,
Strate))
DF_var <- DF_var[complete.cases(DF_var), ]
colnames(DF_var) <- c("variable", "y", "Strate")
## if not paired ##
} else if (tests) {
if (method == 'logistic') {
p <- anova(glm(DF[, expression], family = "binomial"), test = "LRT")$Pr
OR <- exp(summary(glm(DF[, expression], family = "binomial"))$coefficients[, 1])
} else {
if (condition_chi2_B) { # if verif ok chi2, else Fisher
clig <- roundp(chisq.test(var, DF[, y])$p.value,
round_p)
} else {
clig <- roundp(fisher.test(var, DF[, y], simulate.p.value = T)$p.value,
round_p)
sign <- " (b)"
}
}
}
}
} # if y is not null
## Variable with 2 levels #############################
if ((length(levels(var)) == 2) & as.logical(X[4])) {
if (toupper(levels(var)[1]) == "NON") {
if (toupper(levels(var)[2]) == "OUI")
var <- relevel(var, levels(var)[2])
}
if (tests) {
if (paired) {
if (method == 'logistic') {
clig <- anova(survival::clogit(
expression + strata(Strate),
data = DF_var,
family = "binomial"),test = "LRT")$Pr[2]
OR <- exp(summary(
survival::clogit(
expression + strata(Strate),
data = DF_var,
family = "binomial"
))$coefficients[2, 1])
OR <- round(OR, round)
clig <- roundp(clig, round_p)
clig <- c(OR, clig)
} else{
nrow = 0
DF_stratified <- matrix(ncol = 2, nrow = 1)
DF_stratified <- as.data.frame(DF_stratified)
colnames(DF_stratified) <- levels(DF$y_b)
for (strate_i in strata) {
for (a in 1:(length(DF[!is.na(DF[, strate_i]), strate_i]) / 2)) {
nrow <- 1 + nrow
n <- DF[a, strate_i] #num de la paire
DF[DF[, strate_i] == n & !is.na(DF[, strate_i]),
c("y_b", varname)] -> tab_1
c(tab_1[tab_1[, "y_b"] == levels(DF$y_b)[1], varname], tab_1[tab_1[, "y_b"] == levels(DF$y_b)[2], varname]) -> DF_stratified[nrow, ] }
}
p <- mcnemar.test(DF_stratified[, 1], DF_stratified[, 2])$p.value
clig <- roundp(p[2], round_p)
}
} else if (method == "logistic") {
OR <- round(OR[2], round)
IC <- round(exp(confint(glm(DF[, expression], family = "binomial"))),
round)
OR <- paste0(OR, ' (', IC[2, 1], '-', IC[2, 2], ')')
clig <- roundp(p[2], round_p)
clig <- c(OR, clig)
}
}
ligne1 <- paste0(ligne1,
ifelse(as.logical(X[3]),
paste0(" (", levels(var)[1], ") - no. (%)"),
""))
if (available_data)
ligne1 <- c(ligne1, n_available)
if (overall) {
overall_count <- addmargins(table(DF[, varname], useNA = "no"))[levels(var)[1]]
percent_overall <- round(100 * overall_count / n_available, round)
ligne1 <- c(ligne1, paste0(overall_count, " (", percent_overall, "%)"))
}
if (!is.null(y)) {
ligne1 <- c(ligne1,
sapply(1:levels_y, function(j) {
paste0(tb[j, levels(var)[1]],
" (",
round(100 * tb[j, levels(var)[1]] / tbm[j], round),
"%)")
}))
}
if (tests)
ligne1 <- c(ligne1, paste0(clig, sign))
ligne1 -> ligne
} else { ## Variable with more than 2 levels #############################
ligne <- ifelse(as.logical(X[3]),
paste0(as.character(X[2]), " - no. (%)"),
as.character(X[2]))[[1]]
if (available_data)
ligne <- c(ligne, n_available)
ligne <- c(ligne,
rep("", ifelse(overall,levels_y+1,levels_y)))
if (tests){
ligne <- c(ligne,ifelse(method == 'logistic',
c(" ", " "),
paste0(clig, sign)))
}
# other lines
lignes_tot <- matrix(ligne, ncol = ncol(tabf))
for (n in 1:length(levels(var))) {
if (tests) {
if (paired) {
if (n == 1) {
clig <- '-'
OR <- 1
} else {
result <- survival::clogit(
expression + strata(Strate),
data = DF_var,family = "binomial")
clig <- summary(result)$coefficients[n, 4]
OR <- exp(summary(result)$coefficients[n, 1])
IC <- exp(confint(result))
OR <- round(OR, round)
OR <- paste0(OR, ' (', IC[2, 1], '-', IC[2, 2], ')')
clig <- roundp(clig, round_p)
}
clig <- c(OR, clig)
} else if (method == "logistic") {
if (n == 1) {
clig <- '-'
OR_clig <- 1
} else {
clig <- p[n]
OR_clig <- round(OR[n], round)
clig <- roundp(clig, round_p)
}
clig <- c(OR_clig, clig)
} else {
clig <- " "
}
} else {
clig <- " "
}
# For each modality
ligne <- ifelse(exit != "html",
paste0(" ", levels(var)[n]),
paste0(" \t \t ", levels(var)[n]))
if (available_data)
ligne <- c(ligne, " ")
if (overall) {
overall_count <- addmargins(table(DF[,varname], useNA = "no"))[levels(var)[n]]
percent_overall <- round(100 * overall_count / length(var), round)
ligne <- c(ligne, paste0(overall_count," (", percent_overall," %)"))
}
if (!is.null(y)) {
tb <- table(DF[, y], var, useNA = "no")
tbm <- margin.table(tb, 1)
for (j in 1:levels_y) {
no <- tb[j, n]
pn <- round(100 * no / tbm[j], round)
ligne <- c(ligne, paste0(no, " (", pn, "%)"))
}
}
if (tests){
lignes_tot <- rbind(lignes_tot,c(ligne,clig))
} else {
lignes_tot <- rbind(lignes_tot,ligne)
}
}
lignes_tot -> ligne
}
}
}
return(ligne)
}
#### LOOP ###
t_explicative_matix <- as.data.frame( t(explicatives_matrix))
lignes <- pblapply(t_explicative_matix, table1_one_var)
as.data.frame(do.call(rbind, lignes),stringsAsFactors = F) -> rslt
row.names(rslt) <- NULL
rslt <- rbind(tabf[2,],rslt)
colnames(rslt) <- tabf[1,]
if ("excel" %in% exit)
xlsx::write.xlsx(
rslt,
paste0("table1", ".xlsx"),
append = FALSE,
row.names = FALSE
)
text_footer <- paste0(
"Data shown are number (and percentage) for categorical data and ",
ifelse("mean" %in% num_function,
"mean and standard deviation ",
ifelse('auto' %in% num_function,
"mean and standard deviation for numerical data if they were
normaly distributed, median and interquartile range otherwise.",
ifelse('range' %in% num_function,
"median and range for numerical data.",
"median and interquartile range for numerical data."
)
)
), ifelse(!tests,
"",
ifelse(
any("paired" %in% method),
" p-values have been obtained from paired t test for continuous
variables and mac nemar test for categorical variables",
" p-values have been obtained from a two-sided student test
for continuous variables and from a khi-2 test for categorical variables,
unless specified otherwise (a: Student test with Welch approximation,
b: Fisher's exact test, c: Wilcoxon test, d: Kruskal-Wallis Rank sum test)"
)
)
)
if ("html" %in% exit) {
rslt %>%
flextable(col_keys = colnames_rslt) %>%
fontsize(i = 1, part = "header", size = 24) %>%
bold(i = 1, part = "header", bold = TRUE) %>%
theme_booktabs() %>%
autofit() %>%
align(j = 2:nb_colums, align = 'center') %>%
align(j = 2:nb_colums, align = 'center', part = "header") -> rslt
}
version <- paste0("Statistical analysis have been performed with R (",
version$version.string,
"), with the package AutostatR version ",
packageVersion('AutostatR'))
print(rslt)
return(invisible(list("version" = version,"methods" = text_footer,"table" = rslt)))
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.