#' Descriptive
#'
#' This function generates a descriptive statistics of the data - similar to what proc tabulate procedure does in SAS
#' @import openxlsx tables dplyr
#' @param ADS The input data frame.
#' @param vars a vector consisting of all the variable names for which the descriptive has to be generated
#' @param type a string indicating the type of descriptive to be generated. The possible values are "categorical","binary","continuous" with default being categorical. For binary, all the variables must have only 0 and 1 as possible values
#' @param strata a list of vectors consisting of the names of the stratification variables. The variables in a vector will be stratified in the order given.
#' @param strata_label The label for the strata variables. The size should be as same as that of the strata
#' @param numeric_summary Only applicable to type="continuous". The summary functions can be mentioned as a string seperated by '+'
#' @param percent Applicable to type "categorical" and "binary". It helps in generating percentage frequency of the variable. The possible values are "col","row" and "null". If unspecified there will be no percentage stats in the output
#' @param vars_label The label for the "vars" The size should be as same as that of the strata
#' @param test a string denoting the test which has to be conducted for calculating the p value
#' @return A list with descriptive table and other input parameters
#' @export
#' @examples
#' descriptive(ADS,vars=c("country","race"),percent="col")
#' @references The function is built on the top of a package named "tables" [tables::tabular()]
#' @author Nivesh Elangovanraaj, \email{kenivesh@gmail.com}
descriptive <-
function(ADS,
vars=c(),
type = "categorical",
strata = list(),
strata_label = list(),
numeric_summary = "min+mean+median+sd+max",
percent = "col",
vars_label = c(),
test = NULL) {
############################## Checking the inputs #############################
#checking if the input is of type dataframe
if (!is.data.frame(ADS)) {
stop("The ADS should be of type data frame")
}
#checking the length of the label
if (!is.null(vars_label)) {
if (length(vars_label) != length(vars)) {
stop(
"The number of elements in the 'vars_label' vector is not equal to
number of elements in the 'vars' vector"
)
}
}
#checking if the type has only allowed values
if (!(type == "categorical" | type == "continuous" |
type == "binary")) {
stop("Unexpected value for type. Try 'categorical' or 'binary' or 'continuous'")
}
#checking if the variable entered exists in the dataset
if (!is.null(unlist(strata))) {
for (i in c(vars, unique(unlist(strata)))) {
if (!i %in% colnames(ADS)) {
message <-
paste("The variable named '",
i,
"' doesn't exist in the input data frame")
stop(message)
}
}
#converting the variables to factors
ADS[c(unique(unlist(strata)))] <-
lapply(ADS[c(unique(unlist(strata)))], as.character)
#Replacing strata NA values with "NA"
for (i in c(unique(unlist(strata)))) {
ADS[is.na(ADS[, i]), i] <- "NA"
}
}
#checking if the strata labels size is same as that of strata
if (!is.null(unlist(strata))) {
if (!is.null(unlist(strata_label))) {
if (length(strata) == length(strata_label)) {
for (i in 1:length(strata)) {
if (length(strata[i]) != length(strata_label[i])) {
stop("The length of strata_label is not as same as strata")
}
}
} else{
stop("The length of strata_label is not as same as strata")
}
}
} else{
if (!is.null(unlist(strata_label))) {
warning(
"Strata labels present without any strata variables, the labels will be omitted. Please check your input parameters"
)
}
}
#checking if strata has atleast two levels
if (!is.null(unlist(strata))) {
for (i in c(unique(unlist(strata)))) {
if (length(unique(ADS[, i])) < 2) {
message <-
paste(
"The stratification variable named '",
i,
"' has only one level. The stratification variables must have atleast two levels",
sep = ""
)
stop(message)
}
}
}
#checking if the test has only values that are allowed
if (!(is.null(test))) {
if (!(test == "chi.sq" | test == "fisher" | test == "t.test")) {
stop("test has values that are not expected, try 'chi.sq','fisher' or 't.test")
}else{
if(sum(is.na(ADS[,vars]))>0){
stop("Chi square test won't work with NA values, remove them and try again")
}
}
}
#checking if chi.sq is applicable
if (!is.null(test)) {
if (test == "chi.sq") {
if (type == "continuous") {
stop("Chi square test is not applicable for continuous variables")
}
if (length(unlist(strata)) > 1) {
stop("Chi square works with only one level of stratification as of now")
}
}
}
if (!is.null(test)) {
if (test == "t.test") {
if (type != "continuous") {
stop("t.test is only applicable for continuous variables")
}
}
}
#Checking if the percent value is invalid
if (!is.null(percent)) {
if (!(percent == "col" | percent == "row" | percent == "all")) {
stop("Invalid value for 'percent'")
}
}
#####################################################################################
if (type == "categorical") {
#adding an empty column
ADS$empty <- "emptyspace"
#converting the variables to factors
ADS[vars] <- lapply(ADS[vars], as.character)
#Replace NAs with a "NA" string
for (i in vars) {
ADS[is.na(ADS[, i]), i] <- "NA"
}
#checking if the levels are alright
vars_with_1_level <- c()
labels_to_remove <- c()
for (i in vars) {
if (length(unique(ADS[, i])) == 1) {
vars_with_1_level <- c(vars_with_1_level, i)
labels_to_remove <-
c(labels_to_remove, vars_label[match(i, vars)])
message <-
paste(
"The variable",
i,
"has only 1 level and it has been removed. All input variables must have atleast two levels"
)
warning(message)
}
}
vars <- vars[!vars %in% vars_with_1_level]
vars_label <- vars_label[!vars_label %in% labels_to_remove]
ADS[vars] <- lapply(ADS[vars], as.factor)
ADS[unique(unlist(strata))] <-
lapply(ADS[unique(unlist(strata))], as.factor)
#creating the percentage function
perc <- function(x) {
x / 100
}
column <- ""
if (is.null(percent)) {
strata_count <- 0
if (!is.null(unlist(strata))) {
column <- "(1+"
for (list in strata) {
all = '('
if (column != "" & column != "(1+") {
column <- paste(column, "+")
}
if (strata_count == 0) {
column <- paste(column, all)
}
level_count <- 1
strata_len <- length(strata)
level_len <- length(list)
for (level in list) {
if (level_count == 1 & level_count < level_len) {
column <- paste(column, level, " * ")
} else if (level_count == level_len & level_len > 1) {
column <- paste(column, ' (1+', level, " ) ")
}
else{
column <-
paste(column,
'(',
level,
' ) ')
}
level_count <- level_count + 1
}
start_bracket <-
sum(gregexpr("(", column, fixed = TRUE)[[1]] > 0)
end_bracket <-
sum(gregexpr(")", column, fixed = TRUE)[[1]] > 0)
if (end_bracket - start_bracket != 0) {
column <-
paste(column, paste(rep(
")", start_bracket - end_bracket - 1
), collapse = ""))
}
strata_count <- strata_count + 1
}
start_bracket <-
sum(gregexpr("(", column, fixed = TRUE)[[1]] > 0)
end_bracket <-
sum(gregexpr(")", column, fixed = TRUE)[[1]] > 0)
if (end_bracket - start_bracket != 0) {
column <-
paste(column, paste(rep(
")", start_bracket - end_bracket
), collapse = ""))
}
}
else{
column <- '1'
}
} else if (percent == "row") {
strata_count <- 0
if (!is.null(unlist(strata))) {
for (list in strata) {
all = '(1+Format(perc()) * Percent("row")+'
if (column != "") {
column <- paste(column, "+")
}
if (strata_count == 0) {
column <- paste(column, all)
}
level_count <- 0
for (level in list) {
if (level_count == 0) {
column <- paste(column, level, " * ")
} else{
column <-
paste(column,
'(1+Format(perc()) * Percent("row")+',
level,
' * ')
}
level_count <- level_count + 1
}
if (level_count == 1) {
column <- paste(column, '(1+Format(perc()) * Percent("row"))')
} else{
column <- paste(column, '(1+Format(perc()) * Percent("row"))')
}
start_bracket <-
sum(gregexpr("(", column, fixed = TRUE)[[1]] > 0)
end_bracket <-
sum(gregexpr(")", column, fixed = TRUE)[[1]] > 0)
if (end_bracket - start_bracket != 0) {
column <-
paste(column, paste(rep(
")", start_bracket - end_bracket - 1
), collapse = ""))
}
strata_count <- strata_count + 1
}
start_bracket <-
sum(gregexpr("(", column, fixed = TRUE)[[1]] > 0)
end_bracket <-
sum(gregexpr(")", column, fixed = TRUE)[[1]] > 0)
if (end_bracket - start_bracket != 0) {
column <-
paste(column, paste(rep(
")", start_bracket - end_bracket
), collapse = ""))
}
}
else{
column <- '(1+Format(perc()) * Percent("row"))'
}
} else if (percent == "all") {
strata_count <- 0
if (!is.null(unlist(strata))) {
for (list in strata) {
all = '(1+Format(perc()) * Percent("all")+'
if (column != "") {
column <- paste(column, "+")
}
if (strata_count == 0) {
column <- paste(column, all)
}
level_count <- 0
for (level in list) {
if (level_count == 0) {
column <- paste(column, level, " * ")
} else{
column <-
paste(column,
'(1+Format(perc()) * Percent("all")+',
level,
' * ')
}
level_count <- level_count + 1
}
if (level_count == 1) {
column <- paste(column, '(1+Format(perc()) * Percent("all"))')
} else{
column <- paste(column, '(1+Format(perc()) * Percent("all"))')
}
start_bracket <-
sum(gregexpr("(", column, fixed = TRUE)[[1]] > 0)
end_bracket <-
sum(gregexpr(")", column, fixed = TRUE)[[1]] > 0)
if (end_bracket - start_bracket != 0) {
column <-
paste(column, paste(rep(
")", start_bracket - end_bracket - 1
), collapse = ""))
}
strata_count <- strata_count + 1
}
start_bracket <-
sum(gregexpr("(", column, fixed = TRUE)[[1]] > 0)
end_bracket <-
sum(gregexpr(")", column, fixed = TRUE)[[1]] > 0)
if (end_bracket - start_bracket != 0) {
column <-
paste(column, paste(rep(
")", start_bracket - end_bracket
), collapse = ""))
}
}
else{
column <- '(1+Format(perc()) * Percent("all"))'
}
} else{
strata_count <- 0
if (!is.null(unlist(strata))) {
for (list in strata) {
all = '(1+Format(perc()) * Percent("col")+'
if (column != "") {
column <- paste(column, "+")
}
if (strata_count == 0) {
column <- paste(column, all)
}
level_count <- 0
for (level in list) {
if (level_count == 0) {
column <- paste(column, level, " * ")
} else{
column <-
paste(column,
'(1+Format(perc()) * Percent("col")+',
level,
' * ')
}
level_count <- level_count + 1
}
if (level_count == 1) {
column <- paste(column, '(1+Format(perc()) * Percent("col"))')
} else{
column <- paste(column, '(1+Format(perc()) * Percent("col"))')
}
start_bracket <-
sum(gregexpr("(", column, fixed = TRUE)[[1]] > 0)
end_bracket <-
sum(gregexpr(")", column, fixed = TRUE)[[1]] > 0)
if (end_bracket - start_bracket != 0) {
column <-
paste(column, paste(rep(
")", start_bracket - end_bracket - 1
), collapse = ""))
}
strata_count <- strata_count + 1
}
start_bracket <-
sum(gregexpr("(", column, fixed = TRUE)[[1]] > 0)
end_bracket <-
sum(gregexpr(")", column, fixed = TRUE)[[1]] > 0)
if (end_bracket - start_bracket != 0) {
column <-
paste(column, paste(rep(
")", start_bracket - end_bracket
), collapse = ""))
}
} else{
column <- '(1+Format(perc()) * Percent("col"))'
}
}
rows <- c(rbind(rep("empty", length(vars)), vars))
if(!is.null(vars)){
formula <- paste('1+',
paste(rows, collapse = "+"), '~',
column)
}else{
formula <- paste('1',
paste(rows, collapse = "+"), '~',
column)
}
#creating the tabular
desc <- tables::tabular(as.formula(formula), ADS)
# write.csv.tabular(desc,"desc.csv")
#converting the tabular to a data frame
desc_df <- as.data.frame.matrix(desc)
desc_df[] <- lapply(desc_df[], as.numeric)
#adding row labels
desc_df$variables <-
as.data.frame.matrix(tables::rowLabels(desc))[, 1]
if(!is.null(vars)){
desc_df$labels <-
as.data.frame.matrix(tables::rowLabels(desc))[, 2]
}else{
desc_df$labels <-
as.data.frame.matrix(tables::rowLabels(desc))[, 1]
}
#re-arranging the dataframe
desc_df <-
desc_df[, c(ncol(desc_df) - 1, ncol(desc_df), 1:(ncol(desc_df) - 2))]
#creating the empty row
desc_df[desc_df$labels == "empty", ] <- NA
#merging the variable and it's label in a single column
desc_df$variables <- dplyr::lead(desc_df$variables)
desc_df[, c(1, 2)] <-
lapply(desc_df[, c(1, 2)][], as.character)
desc_df$labels <-
ifelse(
desc_df$labels == "" | is.na(desc_df$labels),
as.character(desc_df$variables),
desc_df$labels
)
desc_df <- desc_df[, -1]
#adding the column labels
desc_heading <- as.data.frame.matrix(tables::colLabels(desc))
desc_heading[] <- lapply(desc_heading[], as.character)
if (!is.null(unlist(strata))) {
if (is.null(percent)) {
desc_heading[(max(lengths(strata)) * 2), c(1)] <- "Overall"
} else{
desc_heading[(max(lengths(strata)) * 2), c(1, 2)] <- "Overall"
}
}
if (is.null(percent)) {
desc_heading <-
as.data.frame(rbind(desc_heading, rep("All", length(desc_heading))))
}
desc_columns <-
sapply(desc_heading[], function(x)
paste(x, collapse = " | "))
V0 <- rep("Variables", nrow(desc_heading))
desc_heading <- cbind(V0 , desc_heading)
# if (!is.null(unlist(strata))) {
# desc_heading[c(1:(max(lengths(strata)) * 2)), c(2, 3)] <-
# "Overall"
# }
#filling the empty values
for (j in 1:ncol(desc_heading)) {
for (i in 1:nrow(desc_heading)) {
non_empty_row <- 0
if (desc_heading[i, j] == "") {
for (k in (i + 1):nrow(desc_heading)) {
if (desc_heading[k, j] != "") {
non_empty_row <- k
break()
}
}
if (desc_heading[k, j] == "All" |
desc_heading[k, j] == "Percent") {
if (!is.null(percent)) {
desc_heading[i, j] <- "Overall"
} else{
desc_heading[i, j] <- "All"
}
}
else{
desc_heading[i, j] <- desc_heading[k, j]
}
}
}
if (desc_heading[nrow(desc_heading), j] == "All") {
desc_heading[nrow(desc_heading), j] <- "N"
}
if (desc_heading[nrow(desc_heading), j] == "Percent") {
desc_heading[nrow(desc_heading), j] <- "%"
}
}
desc_columns[1] <- gsub(".*\\| O", "O", desc_columns[1])
desc_columns[2] <- gsub(".*\\| O", "O", desc_columns[2])
for(i in 1:length(desc_columns)){
desc_columns[i] <-
sub("\\| \\|", "\\|", desc_columns[i])
if(is.null(percent)){
desc_columns[i]<- sub("All$", "N", desc_columns[i])
}
}
if (!is.null(unlist(strata))) {
colnames(desc_df) <- c("Variables", desc_columns)
} else if (!is.null(percent)) {
colnames(desc_df) <- c("Variables", "All", "%")
} else{
colnames(desc_df) <- c("Variables", "All")
}
#dividing the percentage column by 100
desc_df[, which(grepl("Percent", colnames(desc_df)))] <-
desc_df[, which(grepl("Percent", colnames(desc_df)))] / 100
#dividing the percentage column by 100
if(is.null(unlist(strata))){
desc_df[, which(grepl("%", colnames(desc_df)))] <-
desc_df[, which(grepl("%", colnames(desc_df)))] / 100
}
if (!is.null(test)) {
#adding the chi.square value
if (test == "chi.sq") {
chisq_vec <- c()
for (i in vars) {
chisq_vec <- c(chisq_vec,
sprintf(chisq.test(c(ADS[, i]),
ADS[, unlist(strata[[1]])])$p.value,
fmt = '%#.2f'))
}
chisq_df <-
as.data.frame(cbind(Variables = vars, p_value = chisq_vec))
desc_df$Chi.sq <- NA
for (i in 1:nrow(desc_df)) {
for (j in 1:nrow(chisq_df)) {
if (desc_df[i, "Variables"] == chisq_df[j, "Variables"]) {
desc_df[i, "Chi.sq"] = as.numeric(as.character(chisq_df[j, "p_value"]))
}
}
}
#adding the chisquare column in the heading
desc_heading$Chi.sq <- "Chi.sq"
desc_heading[nrow(desc_heading), "Chi.sq"] <- "p-value"
}
}
#changing the strata name with their labels
if (!is.null(unlist(strata_label))) {
for (i in 1:nrow(desc_heading)) {
for (j in 1:length(unlist(strata))) {
if (!is.na(match(unlist(strata)[j], desc_heading[i,]))) {
desc_heading[i,] <-
replace(
desc_heading[i,],
which(desc_heading[i,] %in% unlist(strata)[j]),
unlist(strata_label)[j]
)
}
}
}
}
#changing the variable name with their labels
if (!is.null(vars_label)) {
desc_df[, 1] <-
replace(desc_df[, 1], match(vars, desc_df[, 1]), vars_label)
}
#returning the final data frame
return(list(
table = desc_df,
heading = desc_heading,
tabular = desc,
type = type
))
}
else if (type == "continuous") {
mean <- function(x)
base::mean(x, na.rm = TRUE)
median <- function(x)
stats::median(x, na.rm = TRUE)
sd <- function(x)
stats::sd(x, na.rm = TRUE)
sum <- function(x)
base::sum(x, na.rm = TRUE)
min <- function(x)
base::min(x, na.rm = TRUE)
max <- function(x)
base::max(x, na.rm = TRUE)
All <- function(x)
length(x[!is.na(x)])
#converting the variables to factors/numeric
for (i in vars) {
if (!is.numeric(ADS[, i])) {
ADS[, i] <- as.character(ADS[, i])
ADS[, i] <- as.numeric(ADS[, i])
}
}
ADS[unique(unlist(strata))] <-
lapply(ADS[unique(unlist(strata))], as.factor)
summary <- paste("(All+", numeric_summary, ")")
rows <- vars
row_formula <- "1+"
for (i in 1:length(rows)) {
row_formula <- paste(row_formula, rows[i], "*", summary, "+")
}
row_formula <- substr(row_formula, 1, nchar(row_formula) - 1)
column_formula <- "1"
for (i in strata) {
count <- 0
for (j in i) {
count <- count + 1
temp <- paste("(", j, ")")
if (count == 1) {
k <- temp
} else{
pos <- regexpr(pattern = ')', k)[1]
k <-
paste(
substr(k, 1, pos - 1),
"*",
substr(temp, 1, 1),
"1+",
substr(temp, 2, nchar(temp)),
substr(k, pos, nchar(k))
)
}
}
column_formula <- paste(column_formula, "+", k)
}
formula <- paste(row_formula, "~", column_formula)
options(scipen = 999)
#creating the tabular
desc <- tables::tabular(as.formula(formula), ADS)
#converting the tabular to a data frame
desc_df <- as.data.frame.matrix(desc)
desc_df[] <- lapply(desc_df[], as.numeric)
#adding row labels
desc_df$variables <-
as.data.frame.matrix(rowLabels(desc))[, 1]
desc_df$labels <- as.data.frame.matrix(rowLabels(desc))[, 2]
#re-arranging the dataframe
desc_df <-
desc_df[, c(ncol(desc_df) - 1, ncol(desc_df), 1:(ncol(desc_df) - 2))]
#converting the first two columns to characters
desc_df[, c(1, 2)] <-
lapply(desc_df[, c(1, 2)][], as.character)
#adding the column labels
desc_heading <- as.data.frame.matrix(colLabels(desc))
desc_heading[] <- lapply(desc_heading[], as.character)
desc_columns <-
sapply(desc_heading[], function(x)
paste(x, collapse = " | "))
V00 <- rep("Variables", nrow(desc_heading))
V01 <- rep("Labels", nrow(desc_heading))
desc_heading <- cbind(V00, V01, desc_heading)
if (!is.null(unlist(strata))) {
if(!is.null(percent)){
desc_heading[c(1:(max(lengths(strata)) * 2)), c(3)] <-
"Overall"
}
}
#filling the empty values
for (j in 1:ncol(desc_heading)) {
for (i in 1:nrow(desc_heading)) {
non_empty_row <- 0
if (desc_heading[i, j] == "") {
for (k in (i + 1):nrow(desc_heading)) {
if (desc_heading[k, j] != "") {
non_empty_row <- k
break()
}
}
if (desc_heading[k, j] == "All" |
desc_heading[k, j] == "Percent") {
desc_heading[i, j] <- "Overall"
}
else{
desc_heading[i, j] <- desc_heading[k, j]
}
}
}
if (desc_heading[nrow(desc_heading), j] == "All") {
desc_heading[nrow(desc_heading), j] <- "Overall"
}
}
desc_columns[1] <- gsub(".*\\| A", "A", desc_columns[1])
colnames(desc_df) <- c("Variables", "Labels", desc_columns)
desc_df[1, 1] <- "All"
if (!is.null(test)) {
#adding the t.test p-value
if (test == "t.test") {
t_test_vec <- c()
if(length(unique(unlist(ADS[,strata[[1]]])))>2){
stop("The stratification variable has more than 2 stratifications")
}
unique_strata_values<-unique(ADS[,strata[[1]]])
trt<-ADS[ADS[,unlist(strata[[1]])]==unique_strata_values[1],]
ctrl<-ADS[ADS[,unlist(strata[[1]])]==unique_strata_values[2],]
for (i in vars) {
t_test_vec <- c(t_test_vec,
sprintf(t.test(trt[, i],ctrl[, i])$p.value,
fmt = '%#.3f'))
}
ttest_df <-
as.data.frame(cbind(Variables = vars, p_value = t_test_vec))
desc_df$t.test <- NA
for (i in 1:nrow(desc_df)) {
for (j in 1:nrow(ttest_df)) {
if(!is.na(desc_df[i, "Variables"])){
if (desc_df[i, "Variables"] == ttest_df[j, "Variables"]) {
desc_df[i, "t.test"] = as.numeric(as.character(ttest_df[j, "p_value"]))
}
}
}
}
#adding the chisquare column in the heading
desc_heading$t.test <- "t.test"
desc_heading[nrow(desc_heading), "t.test"] <- "p-value"
}
}
#changing the strata name with their labels
if (!is.null(unlist(strata_label))) {
for (i in 1:nrow(desc_heading)) {
for (j in 1:length(unlist(strata))) {
if (!is.na(match(unlist(strata)[j], desc_heading[i,]))) {
desc_heading[i,] <-
replace(
desc_heading[i,],
which(desc_heading[i,] %in% unlist(strata)[j]),
unlist(strata_label)[j]
)
}
}
}
}
#changing the variable name with their labels
if (!is.null(vars_label)) {
desc_df[, 1] <-
replace(desc_df[, 1], match(vars, desc_df[, 1]), vars_label)
}
#returning the final data frame
return(list(
table = desc_df,
heading = desc_heading,
tabular = desc,
type = type
))
}
else{
#adding an empty column
ADS$empty <- "emptyspace"
#converting the variables to factors
ADS[vars] <- lapply(ADS[vars], as.character)
#Replace NAs with a "NA" string
for (i in vars) {
if (sum(is.na(ADS[, i])) > 0) {
message <- paste("The variable named '",
i,
"' has NAs and it will be replaced with 0")
warning(message)
}
ADS[is.na(ADS[, i]), i] <- 0
}
#checking if the levels are alright
vars_with_1_level <- c()
labels_to_remove <- c()
for (i in vars) {
if (length(unique(ADS[, i])) == 1) {
vars_with_1_level <- c(vars_with_1_level, i)
labels_to_remove <-
c(labels_to_remove, vars_label[match(i, vars)])
message <-
paste(
"The variable",
i,
"has only 1 level and it has been removed. All input variables must have atleast two levels"
)
warning(message)
}
}
vars <- vars[!vars %in% vars_with_1_level]
vars_label <- vars_label[!vars_label %in% labels_to_remove]
ADS[vars] <- lapply(ADS[vars], as.factor)
ADS[unique(unlist(strata))] <-
lapply(ADS[unique(unlist(strata))], as.factor)
#creating the percentage function
perc <- function(x) {
x / 100
}
column <- ""
if (is.null(percent)) {
strata_count <- 0
if (!is.null(unlist(strata))) {
column <- "(1+"
for (list in strata) {
all = '('
if (column != "" & column != "(1+") {
column <- paste(column, "+")
}
if (strata_count == 0) {
column <- paste(column, all)
}
level_count <- 1
strata_len <- length(strata)
level_len <- length(list)
for (level in list) {
if (level_count == 1 & level_count < level_len) {
column <- paste(column, level, " * ")
} else if (level_count == level_len & level_len > 1) {
column <- paste(column, ' (1+', level, " ) ")
}
else{
column <-
paste(column,
'(',
level,
' ) ')
}
level_count <- level_count + 1
}
start_bracket <-
sum(gregexpr("(", column, fixed = TRUE)[[1]] > 0)
end_bracket <-
sum(gregexpr(")", column, fixed = TRUE)[[1]] > 0)
if (end_bracket - start_bracket != 0) {
column <-
paste(column, paste(rep(
")", start_bracket - end_bracket - 1
), collapse = ""))
}
strata_count <- strata_count + 1
}
start_bracket <-
sum(gregexpr("(", column, fixed = TRUE)[[1]] > 0)
end_bracket <-
sum(gregexpr(")", column, fixed = TRUE)[[1]] > 0)
if (end_bracket - start_bracket != 0) {
column <-
paste(column, paste(rep(
")", start_bracket - end_bracket
), collapse = ""))
}
}
else{
column <- '1'
}
} else if (percent == "row") {
if (!is.null(unlist(strata))) {
for (list in strata) {
all = '(1+Format(perc()) * Percent("row")+'
if (column != "") {
column <- paste(column, "+")
}
column <- paste(column, all)
level_count <- 0
for (level in list) {
if (level_count == 0) {
column <- paste(column, level, " * ")
} else{
column <-
paste(column,
'(1+Format(perc()) * Percent("row")+',
level,
' * ')
}
level_count <- level_count + 1
}
if (level_count == 1) {
column <- paste(column, '(1+Format(perc()) * Percent("row")))')
} else{
column <- paste(column, '(1+Format(perc()) * Percent("row"))')
}
start_bracket <-
sum(gregexpr("(", column, fixed = TRUE)[[1]] > 0)
end_bracket <-
sum(gregexpr(")", column, fixed = TRUE)[[1]] > 0)
column <-
paste(column, paste(rep(
")", start_bracket - end_bracket - 1
), collapse = ""))
}
start_bracket <-
sum(gregexpr("(", column, fixed = TRUE)[[1]] > 0)
end_bracket <-
sum(gregexpr(")", column, fixed = TRUE)[[1]] > 0)
column <-
paste(column, paste(rep(")", start_bracket - end_bracket), collapse = ""))
} else{
column <- '(1+Format(perc()) * Percent("row"))'
}
} else if (percent == "all") {
strata_count <- 0
if (!is.null(unlist(strata))) {
for (list in strata) {
all = '(1+Format(perc()) * Percent("all")+'
if (column != "") {
column <- paste(column, "+")
}
if (strata_count == 0) {
column <- paste(column, all)
}
level_count <- 0
for (level in list) {
if (level_count == 0) {
column <- paste(column, level, " * ")
} else{
column <-
paste(column,
'(1+Format(perc()) * Percent("all")+',
level,
' * ')
}
level_count <- level_count + 1
}
if (level_count == 1) {
column <- paste(column, '(1+Format(perc()) * Percent("all"))')
} else{
column <- paste(column, '(1+Format(perc()) * Percent("all"))')
}
start_bracket <-
sum(gregexpr("(", column, fixed = TRUE)[[1]] > 0)
end_bracket <-
sum(gregexpr(")", column, fixed = TRUE)[[1]] > 0)
if (end_bracket - start_bracket != 0) {
column <-
paste(column, paste(rep(
")", start_bracket - end_bracket - 1
), collapse = ""))
}
strata_count <- strata_count + 1
}
start_bracket <-
sum(gregexpr("(", column, fixed = TRUE)[[1]] > 0)
end_bracket <-
sum(gregexpr(")", column, fixed = TRUE)[[1]] > 0)
if (end_bracket - start_bracket != 0) {
column <-
paste(column, paste(rep(
")", start_bracket - end_bracket
), collapse = ""))
} else{
column <- '(1+Format(perc()) * Percent("all"))'
}
}
} else{
strata_count <- 0
if (!is.null(unlist(strata))) {
for (list in strata) {
all = '(1+Format(perc()) * Percent("col")+'
if (column != "") {
column <- paste(column, "+")
}
if (strata_count == 0) {
column <- paste(column, all)
}
level_count <- 0
for (level in list) {
if (level_count == 0) {
column <- paste(column, level, " * ")
} else{
column <-
paste(column,
'(1+Format(perc()) * Percent("col")+',
level,
' * ')
}
level_count <- level_count + 1
}
if (level_count == 1) {
column <- paste(column, '(1+Format(perc()) * Percent("col"))')
} else{
column <- paste(column, '(1+Format(perc()) * Percent("col"))')
}
start_bracket <-
sum(gregexpr("(", column, fixed = TRUE)[[1]] > 0)
end_bracket <-
sum(gregexpr(")", column, fixed = TRUE)[[1]] > 0)
if (end_bracket - start_bracket != 0) {
column <-
paste(column, paste(rep(
")", start_bracket - end_bracket - 1
), collapse = ""))
}
strata_count <- strata_count + 1
}
start_bracket <-
sum(gregexpr("(", column, fixed = TRUE)[[1]] > 0)
end_bracket <-
sum(gregexpr(")", column, fixed = TRUE)[[1]] > 0)
if (end_bracket - start_bracket != 0) {
column <-
paste(column, paste(rep(
")", start_bracket - end_bracket
), collapse = ""))
} else{
column <- '(1+Format(perc()) * Percent("col"))'
}
}
}
rows <- c(rbind(rep("empty", length(vars)), vars))
if(!is.null(vars)){
formula <- paste('1+',
paste(rows, collapse = "+"), '~',
column)
}else{
formula <- paste('1',
paste(rows, collapse = "+"), '~',
column)
}
#creating the tabular
desc <- tables::tabular(as.formula(formula), ADS)
#converting the tabular to a data frame
desc_df <- as.data.frame.matrix(desc)
desc_df[] <- lapply(desc_df[], as.numeric)
#adding row labels
desc_df$variables <-
as.data.frame.matrix(rowLabels(desc))[, 1]
if(!is.null(vars)){
desc_df$labels <-
as.data.frame.matrix(tables::rowLabels(desc))[, 2]
}else{
desc_df$labels <-
as.data.frame.matrix(tables::rowLabels(desc))[, 1]
}
#re-arranging the dataframe
desc_df <-
desc_df[, c(ncol(desc_df) - 1, ncol(desc_df), 1:(ncol(desc_df) - 2))]
#creating the empty row
desc_df[desc_df$labels == "empty", ] <- NA
#merging the variable and it's label in a single column
desc_df$variables <- dplyr::lag(desc_df$variables)
desc_df[, c(1, 2)] <-
lapply(desc_df[, c(1, 2)][], as.character)
desc_df <-
desc_df[(
desc_df$labels == 1 | desc_df$labels == "1:Yes" |
desc_df$labels == "Yes" |
desc_df$labels == "yes" |
desc_df$labels == "YES" |
desc_df$labels == "All"
) &
!is.na(desc_df$labels), ]
desc_df$labels <-
ifelse(
desc_df$labels == "" | desc_df$labels == 1 |
is.na(desc_df$labels) | desc_df$labels == "1:Yes" |
desc_df$labels == "YES" |
desc_df$labels == "Yes" | desc_df$labels == "yes" ,
as.character(desc_df$variables),
desc_df$labels
)
#removing the variable column
desc_df <- desc_df[, -1]
#adding the column labels
desc_heading <- as.data.frame.matrix(colLabels(desc))
desc_heading[] <- lapply(desc_heading[], as.character)
if (!is.null(unlist(strata))) {
if (is.null(percent)) {
desc_heading[(max(lengths(strata)) * 2), c(1)] <- "Overall"
} else{
desc_heading[(max(lengths(strata)) * 2), c(1, 2)] <- "Overall"
}
}
if (is.null(percent)) {
desc_heading <-
as.data.frame(rbind(desc_heading, rep("All", length(desc_heading))))
}
desc_columns <-
sapply(desc_heading[], function(x)
paste(x, collapse = " | "))
V0 <- rep("Variables", nrow(desc_heading))
desc_heading <- cbind(V0 , desc_heading)
if (!is.null(unlist(strata))) {
if(!is.null(percent)){
desc_heading[c(1:(max(lengths(strata)) * 2)), c(2, 3)] <-
"Overall"
}
}
#filling the empty values
for (j in 1:ncol(desc_heading)) {
for (i in 1:nrow(desc_heading)) {
non_empty_row <- 0
if (desc_heading[i, j] == "") {
for (k in (i + 1):nrow(desc_heading)) {
if (desc_heading[k, j] != "") {
non_empty_row <- k
break()
}
}
if (desc_heading[k, j] == "All" |
desc_heading[k, j] == "Percent") {
desc_heading[i, j] <- "Overall"
}
else{
desc_heading[i, j] <- desc_heading[k, j]
}
}
}
if (desc_heading[nrow(desc_heading), j] == "All") {
desc_heading[nrow(desc_heading), j] <- "N"
}
if (desc_heading[nrow(desc_heading), j] == "Percent") {
desc_heading[nrow(desc_heading), j] <- "%"
}
}
desc_columns[1] <- gsub(".*\\| O", "O", desc_columns[1])
desc_columns[2] <- gsub(".*\\| O", "O", desc_columns[2])
for(i in 1:length(desc_columns)){
desc_columns[i] <-
sub("\\| \\|", "\\|", desc_columns[i])
if(is.null(percent)){
desc_columns[i]<- sub("All$", "N", desc_columns[i])
}
}
if (!is.null(unlist(strata))) {
colnames(desc_df) <- c("Variables", desc_columns)
} else if (!is.null(percent)) {
colnames(desc_df) <- c("Variables", "All", "%")
} else{
colnames(desc_df) <- c("Variables", "All")
}
#dividing the percentage column by 100
desc_df[, which(grepl("Percent", colnames(desc_df)))] <-
desc_df[, which(grepl("Percent", colnames(desc_df)))] / 100
#adding the chi.square value
if (!is.null(test)) {
if (test == "chi.sq") {
chisq_vec <- c()
for (i in vars) {
chisq_vec <- c(chisq_vec,
sprintf(chisq.test(c(ADS[, i]),
ADS[, unlist(strata[[1]])])$p.value,
fmt = '%#.3f'))
}
chisq_df <-
as.data.frame(cbind(Variables = vars, p_value = chisq_vec))
desc_df$Chi.sq <- NA
for (i in 1:nrow(desc_df)) {
for (j in 1:nrow(chisq_df)) {
if (desc_df[i, "Variables"] == chisq_df[j, "Variables"]) {
desc_df[i, "Chi.sq"] = as.numeric(as.character(chisq_df[j, "p_value"]))
}
}
}
#adding the chisquare column in the heading
desc_heading$Chi.sq <- "Chi.sq"
desc_heading[nrow(desc_heading), "Chi.sq"] <- "p-value"
}
}
#changing the strata name with their labels
if (!is.null(unlist(strata_label))) {
for (i in 1:nrow(desc_heading)) {
for (j in 1:length(unlist(strata))) {
if (!is.na(match(unlist(strata)[j], desc_heading[i,]))) {
desc_heading[i,] <-
replace(
desc_heading[i,],
which(desc_heading[i,] %in% unlist(strata)[j]),
unlist(strata_label)[j]
)
}
}
}
}
#changing the variable name with their labels
if (!is.null(vars_label)) {
desc_df[, 1] <-
replace(desc_df[, 1], match(vars, desc_df[, 1]), vars_label)
}
#returning the final data frame
return(list(
table = desc_df,
heading = desc_heading,
tabular = desc,
type = type
))
}
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.