#' Table 1
#'
#' @param DF dataframe : datas to be analysed
#' @param y character : column that can separate the dataframe
#' @param ynames vector of characters (optionnal) : names to be put in table one columns instead of y levels.
#' @param make.names (optionnal) : TRUE so that the colnames provided are formated
#' @param round (optionnal) : the number of digits to be display in the results
#' @param overall booleen (optionnal) : TRUE if an "overall" column is wanted
#' @param overall_name (optionnal) : name of the 'overall' column
#' @param mutation numeric : number of modalities to display for one variable. If there is more than "mutation" categories, the modalities after this threeshold are wrapped into an "others" categorie.
#' @param legend booleen (optionnal) : TRUE if a legend is wanted under the table
#' @param title booleen (optionnal) : TRUE if a title is wanted
#' @param tests booleen (optionnal) : TRUE if you want tests to be executed
#' @param norm_test booleen (optionnal) : TRUE if you want tests on normal distribution to be performed has condition to student test
#' @param Khi2 booleen (optionnal) : if TRUE, khi2 is prefered to fisher
#' @param exit character : 'html', 'console' or 'excel'
#' @param paired
#' @param strata
#' @param method
#' @param adjusted
#' @param significance
#' @param name_sheet
#'
#' @return dataframe, excel file or html table depending on the exit parameter
#' @export
#' @import stringr
#' @import stats
#' @import flextable
#' @import RVAideMemoire
table1 <- function(DF,
y,
paired=FALSE,
strata=NULL,
ynames = NULL,
make.names = TRUE,
overall = TRUE,
overall_name = "Overall",
tests = TRUE,
method = "classic",
adjusted = NULL,
norm_test = TRUE,
Khi2 = TRUE,
mutation = 40,
legend = TRUE,
title = TRUE,
round = 2,
significance = F,
exit = 'html',
name_sheet = 'results',
num_function = "auto",
available_data=TRUE
)
{
version_pkg = '0.0.7'
##################################################
# Arguments verification / transformation #
##################################################
## Dataframe
if (is.data.frame(DF) || is.matrix(DF)) {
DF <- as.data.frame(DF,row.names = NULL)
if ( (make.names(colnames(DF)) != colnames(DF))) {
colnames_prep(colnames(DF)) -> colnames(DF)
colnames_prep(y) -> y
}
} else stop("No dataframe has been provided. Make sure 'DF' is a dataframe, a tibble or a matrix")
## y
if (!is.character(y) || !(y %in% colnames(DF)))
stop("y must be a character variable, part of DF")
DF[, y] <- as.factor(DF[, y])
levels_y <- length(levels(DF[, y]))
DF_without_y <- DF[, -match(y, colnames(DF))]
if (is.null(ynames)) {
ynames <- levels(DF[, y])
} else if (!is.vector(ynames) | length(ynames) != levels_y) {
stop(paste0("ynames should be a vector of characters of length",levels_y))
}
if (!is.logical(overall))
stop("'overall' should be a booleen")
if (!is.logical(legend))
stop("'legend' should be a booleen")
if (!is.logical(title))
stop("'title' should be a booleen")
if (!is.numeric(mutation))
stop("'mutation' should be numeric")
if (!is.logical(paired))
stop("'paired' should be a logical")
if (is.logical(make.names)) {
if (make.names) {
colnames_definitive <- colnames_prep(colnames(DF)[colnames(DF) != y],type = "presentation")
} else {
colnames_definitive <- colnames(DF)[colnames(DF) != y]
}
} else {
warning('"make.name" should be logical ; "make.name" is considered as False')
colnames_definitive <- colnames(DF)
}
DF_without_y_and_all <- DF_without_y
if (paired & !is.null(strata)) {
for (colstrata in strata) {
DF_without_y_and_all <- DF_without_y_and_all[,-match(colstrata,colnames(DF_without_y_and_all))]
}
}
autostats::format_data(DF) -> DF #plain and no plural factors
##################################################
##################################################
# HEADER #
##################################################
tabf <- matrix(nrow = 1, ncol = levels_y + 2)
first_column <- 1 #first column in the table with an ynames
tabf <- c("Characteristics")
if (available_data)
tabf <- c(tabf,"Available data")
if (overall)
tabf <- c(tabf,"Overall")
tabf <- c(tabf,ynames)
if (tests)
tabf <- c(tabf,'p-value')
first_column <- match(ynames[1],tabf)
ligne2 <- rep("", length(tabf))
for (k in 1:levels_y) {
ligne2[k + first_column -1] <- paste0("N = ", table(DF[, y])[k]) #number of observations for each levels of y
}
numbers_observation <- ligne2
if (overall) {
overall_observations <- sum(table(DF[, y]))
ligne2[first_column -1] <- paste0("N = ", overall_observations)
}
tabf <- rbind(tabf, ligne2)
##################################################
# useful loop for flextable presentation
i <- 0
num_variables <- vector()
for (column in colnames(DF_without_y)) {
i + 1 -> i
if (is.numeric(DF_without_y[,column])) {
num_variables <- c(num_variables,i)
}
}
########################################################
### Loop for each characteristics (var) ###
########################################################
i <- 0
one_ligne <- FALSE
for (var in DF_without_y_and_all) {
i <- i + 1
varname <- colnames(DF_without_y_and_all)[i]
progressbar(total = length(DF_without_y_and_all),i,variable = varname)
ligne1 <- colnames_definitive[i]
sign <- NULL # store a note if a special test is done (fischer, Wilcoxon...)
if (available_data) {
n_available <- sum(!is.na(var))
}
#--------------------------------------------------------------------------------------------------------------------#
#-- if numeric ----------------------------------------------------------------------------------------------------#
#--------------------------------------------------------------------------------------------------------------------#
if (is.numeric(var)) {
ligne1 <- c(ligne1,n_available)
# Legend
if (length(num_function) < 3) {
ligne2 <- NULL
ligne3 <- NULL
one_ligne <- TRUE
} else {
tabulation <- ifelse(exit == "html","\t \t "," ")
tabulation <- ifelse(length(num_function) > 2,"",tabulation)
median_bis <- ifelse("quartile" %in% num_function,"[Q1-Q3]","[min - max]")
ligne2 <- ifelse(exit == "html","\t \t Mean (SD)"," Mean (SD)")
ligne3 <- ifelse(exit == "html",paste0("\t \t Median ",median_bis),paste0(" Median ",median_bis))
}
##### DESCRIPTIVE CALCULS #####
mean_vars <- aggregate(var ~ DF[, y], FUN = "mean")
mean_overall <- round(mean(var,na.rm = TRUE), round)
median_vars <- aggregate(var ~ DF[, y], FUN = "median")
median_overall <- round(median(var,na.rm = TRUE),round)
sd_vars <- aggregate(var ~ DF[, y], FUN = "sd")
sd_overall <- round(sd(var,na.rm = TRUE),round)
min_vars <- aggregate(var ~ DF[, y], FUN = "min")
min_overall <- round(min(var, na.rm = TRUE),round)
max_vars <- aggregate(var ~ DF[, y], FUN = "max")
max_overall <- round(max(var, na.rm = TRUE),round)
quartiles_vars <- aggregate(var ~ DF[, y], FUN = "quantile")
quartiles_overall <- quantile(var, na.rm = TRUE)
length_vars <- aggregate(var ~ DF[, y], FUN = "length")
shapiro_vars <- aggregate(var ~ DF[, y], FUN = "shapiro.test")
shapiro_overall <- shapiro.test(var)$p.value
normal <- TRUE
if ((shapiro_overall < 0.05) & overall)
normal <- FALSE
if (TRUE %in% (shapiro_vars[,2][,2] < 0.05))
normal <- FALSE
# Add overall results if overall
if (overall) {
if (length(num_function) < 3 & (num_function != "auto")) {
ligne1 <- c(ligne1,ifelse("mean" %in% num_function,paste0(mean_overall, " ± ",sd_overall), paste0(median_overall," [",ifelse("quartile" %in% num_function,round(quartiles_overall[2],round),min_overall)," - ",ifelse("quartile" %in% num_function,round(quartiles_overall[4],round),max_overall),"] ")))
} else if (num_function == "auto") {
ligne1 <- c(ligne1,ifelse(normal,paste0(mean_overall, " ± ",sd_overall), paste0(median_overall," [",round(quartiles_overall[2],round)," - ",round(quartiles_overall[4],round),"] ")))
} else {
ligne1 <- c(ligne1," ")
ligne2 <- c(ligne2,paste0(mean_overall, " ± ",sd_overall))
ligne3 <- c(ligne3,paste0(median_overall," [",ifelse("quartile" %in% num_function,round(quartiles_overall[2],round),min_overall)," - ",ifelse("quartile" %in% num_function,round(quartiles_overall[4],round),max_overall),"] "))
}
}
# round and save all results
for (j in 1:levels_y) {
mean_vars_level <- round(mean_vars[j, 2], round)
sd_vars_level <- round(sd_vars[j, 2], round)
median_vars_level <- round(median_vars[j, 2], round)
min_vars_level <- round(min_vars[j, 2], round)
max_vars_level <- round(max_vars[j, 2], round)
quartiles_vars_level <- quartiles_vars[j, 2]
min_vars_level <- ifelse("quartile" %in% num_function,round(quartiles_vars_level[2],round),min_vars_level)
max_vars_level <- ifelse("quartile" %in% num_function,round(quartiles_vars_level[4],round),max_vars_level)
if (length(num_function) < 3 & (num_function != "auto")) {
ligne1 <- c(ligne1,ifelse("mean" %in% num_function,paste0(mean_vars_level, " (", sd_vars_level,") "), paste0(median_vars_level," [",min_vars_level," - ",max_vars_level,"] ")))
} else if (num_function == "auto") {
ligne1 <- c(ligne1,ifelse(normal,paste0(mean_vars_level, " ± ",sd_vars_level), paste0(median_vars_level," [",round(quartiles_vars_level[2],round)," - ",round(quartiles_vars_level[4],round),"] ")))
} else{
ligne1 <- c(ligne1," ")
ligne2 <- c(ligne2,paste0(mean_vars_level, " (", sd_vars_level,") "))
ligne3 <- c(ligne3,paste0(median_vars_level," [",min_vars_level," - ",max_vars_level,"] "))
}
}
#####
##### STATISTICAL TESTS #####
### if not paired ###
if (tests & !paired) {
verif <- TRUE
ncst <- TRUE
p <- "-"
if (nrow(mean_vars) == levels_y) {
for (var_mod in 1:levels_y) {
if (length_vars[var_mod,2] < 8) {
ncst <- FALSE
}
}
if (ncst & !(NA %in% sd_vars[,2])) {
for (var_mod in 1:levels_y) {
if (length_vars[var_mod,2] < 31 & length_vars[var_mod,2] > 7 & sd_vars[var_mod,2] != 0) {
p_shapiro <- shapiro.test(var[DF[,y] == levels(DF[, y])[var_mod]])$p.value
if (p_shapiro < 0.05)
verif <- FALSE
}
}
variance_equal <- ifelse(var.test(var ~ DF[, y])$p.value > 0.05,TRUE,FALSE)
}
if (ncst & sd(var, na.rm = TRUE) != 0) {
# if normale or n > 30 : verif == TRUE
if (verif) {
if (variance_equal) {
p <- signif(t.test(var ~ DF[, y],var.equal = TRUE)$p.value, 3)
} else{
p <- signif(t.test(var ~ DF[, y],var.equal = FALSE)$p.value, 3)
p <- paste0(p," (a)")
}
} else {
p <- signif(wilcox.test(var ~ DF[, y])$p.value,3)
p <- paste0(p," (c)")
}
} else {
p <- "-"
}
}
ligne1 <- c(ligne1, p)
}else
### if paired ###
if (tests & paired) {
Strate <- vector()
var_strat <- vector()
y_strat <- vector()
for (strate_i in strata) {
Strate <- c(Strate,DF[,strate_i])
var_strat <- c(var_strat,var)
y_strat <- c(y_strat,DF[,y])
}
p <- survival::clogit(as.factor(y_strat-1)~as.numeric(var_strat)+strata(Strate))$p.value
}
##
else{
ligne1 <- c(ligne1, " ")
}
ligne2 <- c(ligne2, " ")
ligne3 <- c(ligne3, " ")
tabf <- rbind(tabf,ligne1)
tabf <- rbind(tabf,ligne2)
tabf <- rbind(tabf,ligne3)
}
#--------------------------------------------------------------------------------------------------------------------#
#-- if non numeric ------------------------------------------------------------------------------------------------#
#--------------------------------------------------------------------------------------------------------------------#
else{
#if a modality's lenght is > 40 characters, we shrink it to 40 characters to fit in the table
for (it in var) {
if (!is.na(it)) {
if (stringr::str_length(it) > 40)
var[var == it] <- paste0(substr(it, 1, 40), "...")
}
}
var <- as.factor(var)
#if there is more than 'mutation' modalities, the last modalities are grouped in 'others' modality
if (length(levels(var)) >= mutation) {
nvar <- as.vector(var)
for (other_levels_i in mutation:length(levels(var))) {
other_levels <- levels(var)[other_levels_i]
nvar[nvar == other_levels] <- "others"
}
nvar -> var
as.factor(var) -> var
}
if (length(levels(var)) >= 2) {
# Relevel to have the 'yes' level first
if (length(levels(var)) == 2) {
if (levels(var)[1] == "non" || levels(var)[1] == "NON" || levels(var)[1] == 0 || levels(var)[1] == "0") {
if (levels(var)[2] == "oui")
var <- relevel(var, "oui")
if (levels(var)[2] == "OUI")
var <- relevel(var, "OUI")
if (levels(var)[2] == "1") {
var <- as.character(var)
var[var == "0"] <- "non"
var[var == "1"] <- "oui"
var <- as.factor(var)
var <- relevel(var, "oui")
}
if (levels(var)[2] == 1) {
var <- as.character(var)
var[var == 0] <- "non"
var[var == 1] <- "oui"
var <- as.factor(var)
var <- relevel(var, "oui")
}
}
if (levels(var)[1] == "no" & levels(var)[2] == "yes") {
var <- relevel(var, "yes")
}
}
tb <- table(DF[, y], var,useNA = "no")
tbm <- margin.table(tb, 1)
# Application conditions verification
verif_level <- margin.table(table(var, DF[, y]), 2)
verif <- TRUE
for (lev in verif_level) {
if (lev == 0)
verif <- FALSE
}
if (verif) {
condition_chi2 <- 0
condition_chi2_B <- TRUE
for (m in chisq.test(var, DF[, y], simulate.p.value = TRUE)$expected) {
if (m < 5) {
#counting EXPECTED modalities under 5. if one > 3 and only 2 column we can apply yate's correction otherwise we apply Fisher
condition_chi2_B <- FALSE
if (m < 3) {
condition_chi2_B <- FALSE
break
} else{
condition_chi2 <- condition_chi2 + 1
}
}
}
if (condition_chi2 > 1 | (condition_chi2 == 1 & length(levels(var)) > 2) ) {
condition_chi2_B <- FALSE
}
## if paired ##
if (paired & tests) {
Strate = vector()
var_strat <- vector()
y_strat <- vector()
for (strate_i in strata) {
Strate <- c(Strate,DF[,strate_i])
var_strat <- c(var_strat,as.character(var))
y_strat <- c(y_strat,DF[,y])
}
clig <- survival::clogit(as.factor(y_strat-1)~as.factor(var_strat)+strata(Strate))$p.value
clig <- signif(clig, 3)
clig <- ifelse(significance, clig,ifelse(clig < 0.0001,"< 0.0001",clig))
## if not paired ##
}else {
# if verif ok chi2, else Fisher
if (condition_chi2_B) {
clig <- chisq.test(var, DF[, y])$p.value
clig <- signif(clig, 3)
} else{
clig <- fisher.test(var, DF[, y], simulate.p.value = TRUE)$p.value
clig <- signif(clig, 3)
sign <- " (b)"
}
}
}
} else{
clig <- NA
}
## Variable with 2 levels #############################
if (length(levels(var)) == 2) {
ligne <- c(paste0(ligne1, " (", levels(var)[1], ") - no. (%)"),n_available)
if (overall) {
overall_count <- addmargins(tb)[levels_y + 1,1]
percent_overall <- signif(100*overall_count / length(var),3)
ligne <- c(ligne, paste0(overall_count," (",percent_overall,"%)"))
}
for (j in 1:levels_y) {
no <- tb[j, 1]
pn <- signif(100 * no / tbm[j], 3)
ligne <- c(ligne, paste0(no, " (", pn, "%)"))
}
ligne <- c(ligne, paste0(clig, sign))
tabf <- rbind(tabf, ligne) #ajout de la ligne au tableau
} else {
## Variable with more than 2 levels #############################
# First line with only the test result
ligne <- paste0(colnames_definitive[i], " - no. (%)")
if (available_data)
ligne <- c(ligne,n_available)
ligne <- c(ligne, rep(" ", ifelse(overall,levels_y + 1,levels_y)), paste0(clig, sign))
tabf <- rbind(tabf, ligne)
# other lines
for (n in 1:length(levels(var))) {
# For each modality
if (exit != "html") {
ligne <- paste0(" ", levels(var)[n])
}else {
ligne <- paste0(" \t \t ", levels(var)[n])
}
if (available_data)
ligne <- c(ligne," ")
tb <- table(DF[, y], var,useNA = "no")
tbm <- margin.table(tb, 1)
if (overall) {
overall_count <- addmargins(tb)[levels_y+1,n]
percent_overall <- signif(100*overall_count / length(var),3)
ligne <- c(ligne,paste0(overall_count," (",percent_overall,"%)"))
}
for (j in 1:levels_y) {
no <- tb[j, n]
pn <- signif(100 * no / tbm[j], 3)
ligne <- c(ligne, paste0(no, " (", pn, "%)"))
}
ligne <- c(ligne, " ")
tabf <- rbind(tabf, ligne) #ajout de la ligne au tableau
}
}
}
}
row.names(tabf) <- NULL
########################################################
### dataframe transformation ###
as.data.frame(tabf) -> rslt
colnames(rslt) <- tabf[1, ]
rslt <- rslt[-1, ]
###
if ("excel" %in% exit)
xlsx::write.xlsx(
rslt,
paste0(name_sheet,".xlsx"),
sheetName = "name_sheet",
append = FALSE,
row.names = FALSE
)
if ("html" %in% exit) {
colnames_rslt <- colnames(rslt)
nb_colums <- length(rslt[1,])
rslt <- as.data.frame(rslt[-1,])
colsy <- vector()
for (n in seq(ynames)) {
start_nb_observation <- ifelse(overall,2,1)
colsy[n] <- paste(ynames[n],numbers_observation[n + start_nb_observation],sep = "\n")
}
rslt <- flextable(rslt, col_keys = colnames_rslt)
text_footer <- ""
text_footer <- "Data shown are number (%), and mean ± SD or median (25th – 75th percentiles) if not normally distributed \n"
text_footer <- paste0(text_footer,ifelse(test = paired,"p-values have been obtained from paired t test for continuous variables and mac nemar test for categorical variables","p-values have been obtained from a two-sided student test for continuous variables and from a khi-2 test for categorical variables, unless specified otherwise : \n
- a : Student test with Welch approximation
- b : Fisher's exact test
- c : Wilcoxon test"))
rslt <- add_footer(rslt, characteristics = text_footer)
nb_colums <- ifelse(overall,levels_y + 1,levels_y)
nb_colums <- ifelse(tests,nb_colums + 1,nb_colums)
#rslt <- merge_at(rslt, j = 1:(nb_colums), part = "footer")
rslt <- valign(rslt, valign = "bottom", part = "footer")
if (title) {
rslt <- add_header_lines(rslt,paste0("Table X. Patients baseline characteristics by study group ( ", colnames_prep(y,type = "presentation"),' )'))
}
rslt %>%
fontsize(i = 1, part = "header", size = 24) %>%
bold(i = 1, part = "header", bold = TRUE) -> rslt
rslt %>%
fontsize(i = 2, part = "header", size = 20) %>%
bold(i = 2, part = "header", bold = TRUE) -> rslt
rslt <- theme_booktabs(rslt)
rslt <- autofit(rslt)
rslt %>% align(j = 2:nb_colums,align = 'center') -> rslt
rslt %>% align(j = 2:nb_colums,align = 'center',part = "header") -> rslt
}
print(paste0('version : ',version_pkg))
return(rslt)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.