R/descriptive.R

Defines functions descriptive

Documented in descriptive

#' Descriptive
#'
#' This function generates a descriptive statistics of the data - similar to what proc tabulate procedure does in SAS
#' @import openxlsx tables dplyr
#' @param ADS The input data frame.
#' @param vars a vector consisting of all the variable names for which the descriptive has to be generated
#' @param type a string indicating the type of descriptive to be generated. The possible values are "categorical","binary","continuous" with default being categorical. For binary, all the variables must have only 0 and 1 as possible values
#' @param strata a list of vectors consisting of the names of the stratification variables. The variables in a vector will be stratified in the order given.
#' @param strata_label The label for the strata variables. The size should be as same as that of the strata
#' @param numeric_summary Only applicable to type="continuous". The summary functions can be mentioned as a string seperated by '+'
#' @param percent Applicable to type "categorical" and "binary". It helps in generating percentage frequency of the variable. The possible values are "col","row" and "null". If unspecified there will be no percentage stats in the output
#' @param vars_label The label for the "vars" The size should be as same as that of the strata
#' @param test a string denoting the test which has to be conducted for calculating the p value
#' @return A list with descriptive table and other input parameters
#' @export
#' @examples
#' descriptive(ADS,vars=c("country","race"),percent="col")
#' @references The function is built on the top of a package named "tables" [tables::tabular()]
#' @author Nivesh Elangovanraaj, \email{kenivesh@gmail.com}



descriptive <-
  function(ADS,
           vars=c(),
           type = "categorical",
           strata = list(),
           strata_label = list(),
           numeric_summary = "min+mean+median+sd+max",
           percent = "col",
           vars_label = c(),
           test = NULL) {
    ##############################  Checking the inputs    #############################

    #checking if the input is of type dataframe
    if (!is.data.frame(ADS)) {
      stop("The ADS should be of type data frame")
    }


    #checking the length of the label
    if (!is.null(vars_label)) {
      if (length(vars_label) != length(vars)) {
        stop(
          "The number of elements in the 'vars_label' vector is not equal to
          number of elements in the 'vars' vector"
        )
      }
      }

    #checking if the type has only allowed values
    if (!(type == "categorical" | type == "continuous" |
          type == "binary")) {
      stop("Unexpected value for type. Try 'categorical' or 'binary' or 'continuous'")
    }


    #checking if the variable entered exists in the dataset
    if (!is.null(unlist(strata))) {
      for (i in c(vars, unique(unlist(strata)))) {
        if (!i %in% colnames(ADS)) {
          message <-
            paste("The variable named '",
                  i,
                  "' doesn't exist in the input data frame")
          stop(message)
        }
      }
      #converting the variables to factors
      ADS[c(unique(unlist(strata)))] <-
        lapply(ADS[c(unique(unlist(strata)))], as.character)

      #Replacing strata NA values with "NA"
      for (i in c(unique(unlist(strata)))) {
        ADS[is.na(ADS[, i]), i] <- "NA"
      }
    }



    #checking if the strata labels size is same as that of strata
    if (!is.null(unlist(strata))) {
      if (!is.null(unlist(strata_label))) {
        if (length(strata) == length(strata_label)) {
          for (i in 1:length(strata)) {
            if (length(strata[i]) != length(strata_label[i])) {
              stop("The length of strata_label is not as same as strata")
            }
          }
        } else{
          stop("The length of strata_label is not as same as strata")
        }
      }
    } else{
      if (!is.null(unlist(strata_label))) {
        warning(
          "Strata labels present without any strata variables, the labels will be omitted. Please check your input parameters"
        )
      }
    }
    #checking if strata has atleast two levels
    if (!is.null(unlist(strata))) {
      for (i in c(unique(unlist(strata)))) {
        if (length(unique(ADS[, i])) < 2) {
          message <-
            paste(
              "The stratification variable named '",
              i,
              "' has only one level. The stratification variables must have atleast two levels",
              sep = ""
            )
          stop(message)
        }
      }
    }

    #checking if the test has only values that are allowed
    if (!(is.null(test))) {
      if (!(test == "chi.sq" | test == "fisher" | test == "t.test")) {
        stop("test has values that are not expected, try 'chi.sq','fisher' or 't.test")
      }else{
        if(sum(is.na(ADS[,vars]))>0){
          stop("Chi square test won't work with NA values, remove them and try again")
        }
      }
    }

    #checking if chi.sq is applicable
    if (!is.null(test)) {
      if (test == "chi.sq") {
        if (type == "continuous") {
          stop("Chi square test is not applicable for continuous variables")
        }
        if (length(unlist(strata)) > 1) {
          stop("Chi square works with only one level of stratification as of now")
        }
      }
    }
    if (!is.null(test)) {
      if (test == "t.test") {
        if (type != "continuous") {
          stop("t.test is only applicable for continuous variables")
        }
      }


    }

    #Checking if the percent value is invalid
    if (!is.null(percent)) {
      if (!(percent == "col" | percent == "row" | percent == "all")) {
        stop("Invalid value for 'percent'")
      }
    }


    #####################################################################################

    if (type == "categorical") {
      #adding an empty column
      ADS$empty <- "emptyspace"

      #converting the variables to factors
      ADS[vars] <- lapply(ADS[vars], as.character)

      #Replace NAs with a "NA" string
      for (i in vars) {
        ADS[is.na(ADS[, i]), i] <- "NA"
      }


      #checking if the levels are alright
      vars_with_1_level <- c()
      labels_to_remove <- c()
      for (i in vars) {
        if (length(unique(ADS[, i])) == 1) {
          vars_with_1_level <- c(vars_with_1_level, i)
          labels_to_remove <-
            c(labels_to_remove, vars_label[match(i, vars)])
          message <-
            paste(
              "The variable",
              i,
              "has only 1 level and it has been removed. All input variables must have atleast two levels"
            )
          warning(message)
        }
      }
      vars <- vars[!vars %in% vars_with_1_level]
      vars_label <- vars_label[!vars_label %in% labels_to_remove]

      ADS[vars] <- lapply(ADS[vars], as.factor)
      ADS[unique(unlist(strata))] <-
        lapply(ADS[unique(unlist(strata))], as.factor)



      #creating the percentage function
      perc <- function(x) {
        x / 100
      }


      column <- ""
      if (is.null(percent)) {
        strata_count <- 0
        if (!is.null(unlist(strata))) {
          column <- "(1+"
          for (list in strata) {
            all = '('
            if (column != "" & column != "(1+") {
              column <- paste(column, "+")
            }
            if (strata_count == 0) {
              column <- paste(column, all)
            }

            level_count <- 1
            strata_len <- length(strata)
            level_len <- length(list)
            for (level in list) {
              if (level_count == 1 & level_count < level_len) {
                column <- paste(column, level, " * ")
              } else if (level_count == level_len & level_len > 1) {
                column <- paste(column, ' (1+', level, " ) ")
              }
              else{
                column <-
                  paste(column,
                        '(',
                        level,
                        ' ) ')
              }

              level_count <- level_count + 1
            }
            start_bracket <-
              sum(gregexpr("(", column, fixed = TRUE)[[1]] > 0)
            end_bracket <-
              sum(gregexpr(")", column, fixed = TRUE)[[1]] > 0)
            if (end_bracket - start_bracket != 0) {
              column <-
                paste(column, paste(rep(
                  ")", start_bracket - end_bracket - 1
                ), collapse = ""))
            }
            strata_count <- strata_count + 1
          }
          start_bracket <-
            sum(gregexpr("(", column, fixed = TRUE)[[1]] > 0)
          end_bracket <-
            sum(gregexpr(")", column, fixed = TRUE)[[1]] > 0)
          if (end_bracket - start_bracket != 0) {
            column <-
              paste(column, paste(rep(
                ")", start_bracket - end_bracket
              ), collapse = ""))
          }
        }
        else{
          column <- '1'
        }
      } else if (percent == "row") {
        strata_count <- 0
        if (!is.null(unlist(strata))) {
          for (list in strata) {
            all = '(1+Format(perc()) * Percent("row")+'
            if (column != "") {
              column <- paste(column, "+")
            }
            if (strata_count == 0) {
              column <- paste(column, all)
            }

            level_count <- 0
            for (level in list) {
              if (level_count == 0) {
                column <- paste(column, level, " * ")
              } else{
                column <-
                  paste(column,
                        '(1+Format(perc()) * Percent("row")+',
                        level,
                        ' * ')
              }

              level_count <- level_count + 1
            }
            if (level_count == 1) {
              column <- paste(column, '(1+Format(perc()) * Percent("row"))')
            } else{
              column <- paste(column, '(1+Format(perc()) * Percent("row"))')
            }
            start_bracket <-
              sum(gregexpr("(", column, fixed = TRUE)[[1]] > 0)
            end_bracket <-
              sum(gregexpr(")", column, fixed = TRUE)[[1]] > 0)
            if (end_bracket - start_bracket != 0) {
              column <-
                paste(column, paste(rep(
                  ")", start_bracket - end_bracket - 1
                ), collapse = ""))
            }
            strata_count <- strata_count + 1
          }
          start_bracket <-
            sum(gregexpr("(", column, fixed = TRUE)[[1]] > 0)
          end_bracket <-
            sum(gregexpr(")", column, fixed = TRUE)[[1]] > 0)
          if (end_bracket - start_bracket != 0) {
            column <-
              paste(column, paste(rep(
                ")", start_bracket - end_bracket
              ), collapse = ""))
          }
        }
        else{
          column <- '(1+Format(perc()) * Percent("row"))'

        }

      } else if (percent == "all") {
        strata_count <- 0
        if (!is.null(unlist(strata))) {
          for (list in strata) {
            all = '(1+Format(perc()) * Percent("all")+'
            if (column != "") {
              column <- paste(column, "+")
            }
            if (strata_count == 0) {
              column <- paste(column, all)
            }

            level_count <- 0
            for (level in list) {
              if (level_count == 0) {
                column <- paste(column, level, " * ")
              } else{
                column <-
                  paste(column,
                        '(1+Format(perc()) * Percent("all")+',
                        level,
                        ' * ')
              }

              level_count <- level_count + 1
            }
            if (level_count == 1) {
              column <- paste(column, '(1+Format(perc()) * Percent("all"))')
            } else{
              column <- paste(column, '(1+Format(perc()) * Percent("all"))')
            }
            start_bracket <-
              sum(gregexpr("(", column, fixed = TRUE)[[1]] > 0)
            end_bracket <-
              sum(gregexpr(")", column, fixed = TRUE)[[1]] > 0)
            if (end_bracket - start_bracket != 0) {
              column <-
                paste(column, paste(rep(
                  ")", start_bracket - end_bracket - 1
                ), collapse = ""))
            }
            strata_count <- strata_count + 1
          }
          start_bracket <-
            sum(gregexpr("(", column, fixed = TRUE)[[1]] > 0)
          end_bracket <-
            sum(gregexpr(")", column, fixed = TRUE)[[1]] > 0)
          if (end_bracket - start_bracket != 0) {
            column <-
              paste(column, paste(rep(
                ")", start_bracket - end_bracket
              ), collapse = ""))
          }
        }
        else{
          column <- '(1+Format(perc()) * Percent("all"))'
        }
      } else{
        strata_count <- 0
        if (!is.null(unlist(strata))) {
          for (list in strata) {
            all = '(1+Format(perc()) * Percent("col")+'
            if (column != "") {
              column <- paste(column, "+")
            }
            if (strata_count == 0) {
              column <- paste(column, all)
            }

            level_count <- 0
            for (level in list) {
              if (level_count == 0) {
                column <- paste(column, level, " * ")
              } else{
                column <-
                  paste(column,
                        '(1+Format(perc()) * Percent("col")+',
                        level,
                        ' * ')
              }

              level_count <- level_count + 1
            }
            if (level_count == 1) {
              column <- paste(column, '(1+Format(perc()) * Percent("col"))')
            } else{
              column <- paste(column, '(1+Format(perc()) * Percent("col"))')
            }
            start_bracket <-
              sum(gregexpr("(", column, fixed = TRUE)[[1]] > 0)
            end_bracket <-
              sum(gregexpr(")", column, fixed = TRUE)[[1]] > 0)
            if (end_bracket - start_bracket != 0) {
              column <-
                paste(column, paste(rep(
                  ")", start_bracket - end_bracket - 1
                ), collapse = ""))
            }
            strata_count <- strata_count + 1
          }


          start_bracket <-
            sum(gregexpr("(", column, fixed = TRUE)[[1]] > 0)
          end_bracket <-
            sum(gregexpr(")", column, fixed = TRUE)[[1]] > 0)
          if (end_bracket - start_bracket != 0) {
            column <-
              paste(column, paste(rep(
                ")", start_bracket - end_bracket
              ), collapse = ""))
          }
        } else{
          column <- '(1+Format(perc()) * Percent("col"))'
        }

      }
      rows <- c(rbind(rep("empty", length(vars)), vars))
      if(!is.null(vars)){
        formula <- paste('1+',
                         paste(rows, collapse = "+"), '~',
                         column)
      }else{
        formula <- paste('1',
                         paste(rows, collapse = "+"), '~',
                         column)
      }




      #creating the tabular
      desc <- tables::tabular(as.formula(formula), ADS)
      # write.csv.tabular(desc,"desc.csv")

      #converting the tabular to a data frame
      desc_df <- as.data.frame.matrix(desc)
      desc_df[] <- lapply(desc_df[], as.numeric)


      #adding row labels
      desc_df$variables <-
        as.data.frame.matrix(tables::rowLabels(desc))[, 1]
      if(!is.null(vars)){
        desc_df$labels <-
          as.data.frame.matrix(tables::rowLabels(desc))[, 2]
      }else{
        desc_df$labels <-
          as.data.frame.matrix(tables::rowLabels(desc))[, 1]
      }


      #re-arranging the dataframe
      desc_df <-
        desc_df[, c(ncol(desc_df) - 1, ncol(desc_df), 1:(ncol(desc_df) - 2))]

      #creating the empty row
      desc_df[desc_df$labels == "empty", ] <- NA

      #merging the variable and it's label in a single column
      desc_df$variables <- dplyr::lead(desc_df$variables)


      desc_df[, c(1, 2)] <-
        lapply(desc_df[, c(1, 2)][], as.character)
      desc_df$labels <-
        ifelse(
          desc_df$labels == "" | is.na(desc_df$labels),
          as.character(desc_df$variables),
          desc_df$labels
        )


      desc_df <- desc_df[, -1]

      #adding the column labels
      desc_heading <- as.data.frame.matrix(tables::colLabels(desc))
      desc_heading[] <- lapply(desc_heading[], as.character)
      if (!is.null(unlist(strata))) {
        if (is.null(percent)) {
          desc_heading[(max(lengths(strata)) * 2), c(1)] <- "Overall"

        } else{
          desc_heading[(max(lengths(strata)) * 2), c(1, 2)] <- "Overall"
        }
      }
      if (is.null(percent)) {
        desc_heading <-
          as.data.frame(rbind(desc_heading,  rep("All", length(desc_heading))))
      }
      desc_columns <-
        sapply(desc_heading[], function(x)
          paste(x, collapse = " | "))
      V0 <- rep("Variables", nrow(desc_heading))
      desc_heading <- cbind(V0 , desc_heading)
      # if (!is.null(unlist(strata))) {
      #   desc_heading[c(1:(max(lengths(strata)) * 2)), c(2, 3)] <-
      #     "Overall"
      # }

      #filling the empty values
      for (j in 1:ncol(desc_heading)) {
        for (i in 1:nrow(desc_heading)) {
          non_empty_row <- 0
          if (desc_heading[i, j] == "") {
            for (k in (i + 1):nrow(desc_heading)) {
              if (desc_heading[k, j] != "") {
                non_empty_row <- k
                break()
              }
            }
            if (desc_heading[k, j] == "All" |
                desc_heading[k, j] == "Percent") {
              if (!is.null(percent)) {
                desc_heading[i, j] <- "Overall"
              } else{
                desc_heading[i, j] <- "All"
              }

            }
            else{
              desc_heading[i, j] <- desc_heading[k, j]
            }

          }
        }
        if (desc_heading[nrow(desc_heading), j] == "All") {
          desc_heading[nrow(desc_heading), j] <- "N"


        }

        if (desc_heading[nrow(desc_heading), j] == "Percent") {
          desc_heading[nrow(desc_heading), j] <- "%"
        }
      }


      desc_columns[1] <- gsub(".*\\| O", "O", desc_columns[1])
      desc_columns[2] <- gsub(".*\\| O", "O", desc_columns[2])
      for(i in 1:length(desc_columns)){
        desc_columns[i] <-
          sub("\\|  \\|", "\\|", desc_columns[i])
        if(is.null(percent)){
          desc_columns[i]<- sub("All$", "N", desc_columns[i])
        }
      }

      if (!is.null(unlist(strata))) {
        colnames(desc_df) <- c("Variables", desc_columns)
      } else if (!is.null(percent)) {
        colnames(desc_df) <- c("Variables", "All", "%")
      } else{
        colnames(desc_df) <- c("Variables", "All")
      }


      #dividing the percentage column by 100
      desc_df[, which(grepl("Percent", colnames(desc_df)))] <-
        desc_df[, which(grepl("Percent", colnames(desc_df)))] / 100

      #dividing the percentage column by 100
      if(is.null(unlist(strata))){
        desc_df[, which(grepl("%", colnames(desc_df)))] <-
          desc_df[, which(grepl("%", colnames(desc_df)))] / 100
      }


      if (!is.null(test)) {
        #adding the chi.square value
        if (test == "chi.sq") {
          chisq_vec <- c()
          for (i in vars) {
            chisq_vec <- c(chisq_vec,
                           sprintf(chisq.test(c(ADS[, i]),
                                              ADS[, unlist(strata[[1]])])$p.value,
                                   fmt = '%#.2f'))
          }
          chisq_df <-
            as.data.frame(cbind(Variables = vars, p_value = chisq_vec))
          desc_df$Chi.sq <- NA
          for (i in 1:nrow(desc_df)) {
            for (j in 1:nrow(chisq_df)) {
              if (desc_df[i, "Variables"] == chisq_df[j, "Variables"]) {
                desc_df[i, "Chi.sq"] = as.numeric(as.character(chisq_df[j, "p_value"]))
              }
            }
          }
          #adding the chisquare column in the heading
          desc_heading$Chi.sq <- "Chi.sq"
          desc_heading[nrow(desc_heading), "Chi.sq"] <- "p-value"
        }
      }

      #changing the strata name with their labels
      if (!is.null(unlist(strata_label))) {
        for (i in 1:nrow(desc_heading)) {
          for (j in 1:length(unlist(strata))) {
            if (!is.na(match(unlist(strata)[j], desc_heading[i,]))) {
              desc_heading[i,] <-
                replace(
                  desc_heading[i,],
                  which(desc_heading[i,] %in% unlist(strata)[j]),
                  unlist(strata_label)[j]
                )
            }
          }
        }

      }

      #changing the variable name with their labels
      if (!is.null(vars_label)) {
        desc_df[, 1] <-
          replace(desc_df[, 1], match(vars, desc_df[, 1]), vars_label)
      }

      #returning the final data frame
      return(list(
        table = desc_df,
        heading = desc_heading,
        tabular = desc,
        type = type
      ))

    }
    else if (type == "continuous") {
      mean <- function(x)
        base::mean(x, na.rm = TRUE)
      median <- function(x)
        stats::median(x, na.rm = TRUE)
      sd <- function(x)
        stats::sd(x, na.rm = TRUE)
      sum <- function(x)
        base::sum(x, na.rm = TRUE)
      min <- function(x)
        base::min(x, na.rm = TRUE)
      max <- function(x)
        base::max(x, na.rm = TRUE)
      All <- function(x)
        length(x[!is.na(x)])
      #converting the variables to factors/numeric
      for (i in vars) {
        if (!is.numeric(ADS[, i])) {
          ADS[, i] <- as.character(ADS[, i])
          ADS[, i] <- as.numeric(ADS[, i])
        }
      }
      ADS[unique(unlist(strata))] <-
        lapply(ADS[unique(unlist(strata))], as.factor)

      summary <- paste("(All+", numeric_summary, ")")
      rows <-  vars
      row_formula <- "1+"
      for (i in 1:length(rows)) {
        row_formula <- paste(row_formula, rows[i], "*", summary, "+")
      }
      row_formula <- substr(row_formula, 1, nchar(row_formula) - 1)

      column_formula <- "1"
      for (i in strata) {
        count <- 0
        for (j in i) {
          count <- count + 1
          temp <- paste("(", j, ")")
          if (count == 1) {
            k <- temp
          } else{
            pos <- regexpr(pattern = ')', k)[1]
            k <-
              paste(
                substr(k, 1, pos - 1),
                "*",
                substr(temp, 1, 1),
                "1+",
                substr(temp, 2, nchar(temp)),
                substr(k, pos, nchar(k))
              )
          }
        }
        column_formula <- paste(column_formula, "+", k)
      }
      formula <- paste(row_formula, "~", column_formula)

      options(scipen = 999)
      #creating the tabular

      desc <- tables::tabular(as.formula(formula), ADS)

      #converting the tabular to a data frame
      desc_df <- as.data.frame.matrix(desc)
      desc_df[] <- lapply(desc_df[], as.numeric)

      #adding row labels
      desc_df$variables <-
        as.data.frame.matrix(rowLabels(desc))[, 1]
      desc_df$labels <- as.data.frame.matrix(rowLabels(desc))[, 2]

      #re-arranging the dataframe
      desc_df <-
        desc_df[, c(ncol(desc_df) - 1, ncol(desc_df), 1:(ncol(desc_df) - 2))]

      #converting the first two columns to characters
      desc_df[, c(1, 2)] <-
        lapply(desc_df[, c(1, 2)][], as.character)

      #adding the column labels
      desc_heading <- as.data.frame.matrix(colLabels(desc))
      desc_heading[] <- lapply(desc_heading[], as.character)
      desc_columns <-
        sapply(desc_heading[], function(x)
          paste(x, collapse = " | "))
      V00 <- rep("Variables", nrow(desc_heading))
      V01 <- rep("Labels", nrow(desc_heading))
      desc_heading <- cbind(V00, V01, desc_heading)
      if (!is.null(unlist(strata))) {
        if(!is.null(percent)){
          desc_heading[c(1:(max(lengths(strata)) * 2)), c(3)] <-
            "Overall"
        }

      }


      #filling the empty values
      for (j in 1:ncol(desc_heading)) {
        for (i in 1:nrow(desc_heading)) {
          non_empty_row <- 0
          if (desc_heading[i, j] == "") {
            for (k in (i + 1):nrow(desc_heading)) {
              if (desc_heading[k, j] != "") {
                non_empty_row <- k
                break()
              }
            }
            if (desc_heading[k, j] == "All" |
                desc_heading[k, j] == "Percent") {
              desc_heading[i, j] <- "Overall"
            }
            else{
              desc_heading[i, j] <- desc_heading[k, j]
            }

          }
        }
        if (desc_heading[nrow(desc_heading), j] == "All") {
          desc_heading[nrow(desc_heading), j] <- "Overall"
        }
      }


      desc_columns[1] <- gsub(".*\\| A", "A", desc_columns[1])
      colnames(desc_df) <- c("Variables", "Labels", desc_columns)
      desc_df[1, 1] <- "All"

      if (!is.null(test)) {
        #adding the t.test p-value
        if (test == "t.test") {
          t_test_vec <- c()
          if(length(unique(unlist(ADS[,strata[[1]]])))>2){
            stop("The stratification variable has more than 2 stratifications")
          }
          unique_strata_values<-unique(ADS[,strata[[1]]])
          trt<-ADS[ADS[,unlist(strata[[1]])]==unique_strata_values[1],]
          ctrl<-ADS[ADS[,unlist(strata[[1]])]==unique_strata_values[2],]
          for (i in vars) {
            t_test_vec <- c(t_test_vec,
                            sprintf(t.test(trt[, i],ctrl[, i])$p.value,
                                    fmt = '%#.3f'))
          }
          ttest_df <-
            as.data.frame(cbind(Variables = vars, p_value = t_test_vec))
          desc_df$t.test <- NA
          for (i in 1:nrow(desc_df)) {
            for (j in 1:nrow(ttest_df)) {
              if(!is.na(desc_df[i, "Variables"])){
                if (desc_df[i, "Variables"] == ttest_df[j, "Variables"]) {
                  desc_df[i, "t.test"] = as.numeric(as.character(ttest_df[j, "p_value"]))
                }
              }

            }
          }
          #adding the chisquare column in the heading
          desc_heading$t.test <- "t.test"
          desc_heading[nrow(desc_heading), "t.test"] <- "p-value"
        }
      }

      #changing the strata name with their labels
      if (!is.null(unlist(strata_label))) {
        for (i in 1:nrow(desc_heading)) {
          for (j in 1:length(unlist(strata))) {
            if (!is.na(match(unlist(strata)[j], desc_heading[i,]))) {
              desc_heading[i,] <-
                replace(
                  desc_heading[i,],
                  which(desc_heading[i,] %in% unlist(strata)[j]),
                  unlist(strata_label)[j]
                )
            }
          }
        }

      }
      #changing the variable name with their labels
      if (!is.null(vars_label)) {
        desc_df[, 1] <-
          replace(desc_df[, 1], match(vars, desc_df[, 1]), vars_label)
      }

      #returning the final data frame
      return(list(
        table = desc_df,
        heading = desc_heading,
        tabular = desc,
        type = type
      ))

    }
    else{
      #adding an empty column
      ADS$empty <- "emptyspace"

      #converting the variables to factors
      ADS[vars] <- lapply(ADS[vars], as.character)


      #Replace NAs with a "NA" string
      for (i in vars) {
        if (sum(is.na(ADS[, i])) > 0) {
          message <- paste("The variable named '",
                           i,
                           "' has NAs and it will be replaced with 0")
          warning(message)
        }
        ADS[is.na(ADS[, i]), i] <- 0
      }


      #checking if the levels are alright
      vars_with_1_level <- c()
      labels_to_remove <- c()
      for (i in vars) {
        if (length(unique(ADS[, i])) == 1) {
          vars_with_1_level <- c(vars_with_1_level, i)
          labels_to_remove <-
            c(labels_to_remove, vars_label[match(i, vars)])
          message <-
            paste(
              "The variable",
              i,
              "has only 1 level and it has been removed. All input variables must have atleast two levels"
            )
          warning(message)
        }
      }
      vars <- vars[!vars %in% vars_with_1_level]
      vars_label <- vars_label[!vars_label %in% labels_to_remove]

      ADS[vars] <- lapply(ADS[vars], as.factor)
      ADS[unique(unlist(strata))] <-
        lapply(ADS[unique(unlist(strata))], as.factor)

      #creating the percentage function
      perc <- function(x) {
        x / 100
      }
      column <- ""
      if (is.null(percent)) {
        strata_count <- 0
        if (!is.null(unlist(strata))) {
          column <- "(1+"
          for (list in strata) {
            all = '('
            if (column != "" & column != "(1+") {
              column <- paste(column, "+")
            }
            if (strata_count == 0) {
              column <- paste(column, all)
            }

            level_count <- 1
            strata_len <- length(strata)
            level_len <- length(list)
            for (level in list) {
              if (level_count == 1 & level_count < level_len) {
                column <- paste(column, level, " * ")
              } else if (level_count == level_len & level_len > 1) {
                column <- paste(column, ' (1+', level, " ) ")
              }
              else{
                column <-
                  paste(column,
                        '(',
                        level,
                        ' ) ')
              }

              level_count <- level_count + 1
            }
            start_bracket <-
              sum(gregexpr("(", column, fixed = TRUE)[[1]] > 0)
            end_bracket <-
              sum(gregexpr(")", column, fixed = TRUE)[[1]] > 0)
            if (end_bracket - start_bracket != 0) {
              column <-
                paste(column, paste(rep(
                  ")", start_bracket - end_bracket - 1
                ), collapse = ""))
            }
            strata_count <- strata_count + 1
          }
          start_bracket <-
            sum(gregexpr("(", column, fixed = TRUE)[[1]] > 0)
          end_bracket <-
            sum(gregexpr(")", column, fixed = TRUE)[[1]] > 0)
          if (end_bracket - start_bracket != 0) {
            column <-
              paste(column, paste(rep(
                ")", start_bracket - end_bracket
              ), collapse = ""))
          }
        }
        else{
          column <- '1'
        }
      } else if (percent == "row") {
        if (!is.null(unlist(strata))) {
          for (list in strata) {
            all = '(1+Format(perc()) * Percent("row")+'
            if (column != "") {
              column <- paste(column, "+")
            }
            column <- paste(column, all)
            level_count <- 0
            for (level in list) {
              if (level_count == 0) {
                column <- paste(column, level, " * ")
              } else{
                column <-
                  paste(column,
                        '(1+Format(perc()) * Percent("row")+',
                        level,
                        ' * ')
              }

              level_count <- level_count + 1
            }
            if (level_count == 1) {
              column <- paste(column, '(1+Format(perc()) * Percent("row")))')
            } else{
              column <- paste(column, '(1+Format(perc()) * Percent("row"))')
            }
            start_bracket <-
              sum(gregexpr("(", column, fixed = TRUE)[[1]] > 0)
            end_bracket <-
              sum(gregexpr(")", column, fixed = TRUE)[[1]] > 0)
            column <-
              paste(column, paste(rep(
                ")", start_bracket - end_bracket - 1
              ), collapse = ""))
          }
          start_bracket <-
            sum(gregexpr("(", column, fixed = TRUE)[[1]] > 0)
          end_bracket <-
            sum(gregexpr(")", column, fixed = TRUE)[[1]] > 0)
          column <-
            paste(column, paste(rep(")", start_bracket - end_bracket), collapse = ""))
        } else{
          column <- '(1+Format(perc()) * Percent("row"))'
        }

      } else if (percent == "all") {
        strata_count <- 0
        if (!is.null(unlist(strata))) {
          for (list in strata) {
            all = '(1+Format(perc()) * Percent("all")+'
            if (column != "") {
              column <- paste(column, "+")
            }
            if (strata_count == 0) {
              column <- paste(column, all)
            }

            level_count <- 0
            for (level in list) {
              if (level_count == 0) {
                column <- paste(column, level, " * ")
              } else{
                column <-
                  paste(column,
                        '(1+Format(perc()) * Percent("all")+',
                        level,
                        ' * ')
              }

              level_count <- level_count + 1
            }
            if (level_count == 1) {
              column <- paste(column, '(1+Format(perc()) * Percent("all"))')
            } else{
              column <- paste(column, '(1+Format(perc()) * Percent("all"))')
            }
            start_bracket <-
              sum(gregexpr("(", column, fixed = TRUE)[[1]] > 0)
            end_bracket <-
              sum(gregexpr(")", column, fixed = TRUE)[[1]] > 0)
            if (end_bracket - start_bracket != 0) {
              column <-
                paste(column, paste(rep(
                  ")", start_bracket - end_bracket - 1
                ), collapse = ""))
            }
            strata_count <- strata_count + 1
          }
          start_bracket <-
            sum(gregexpr("(", column, fixed = TRUE)[[1]] > 0)
          end_bracket <-
            sum(gregexpr(")", column, fixed = TRUE)[[1]] > 0)
          if (end_bracket - start_bracket != 0) {
            column <-
              paste(column, paste(rep(
                ")", start_bracket - end_bracket
              ), collapse = ""))
          } else{
            column <- '(1+Format(perc()) * Percent("all"))'
          }
        }
      } else{
        strata_count <- 0
        if (!is.null(unlist(strata))) {
          for (list in strata) {
            all = '(1+Format(perc()) * Percent("col")+'
            if (column != "") {
              column <- paste(column, "+")
            }
            if (strata_count == 0) {
              column <- paste(column, all)
            }

            level_count <- 0
            for (level in list) {
              if (level_count == 0) {
                column <- paste(column, level, " * ")
              } else{
                column <-
                  paste(column,
                        '(1+Format(perc()) * Percent("col")+',
                        level,
                        ' * ')
              }

              level_count <- level_count + 1
            }
            if (level_count == 1) {
              column <- paste(column, '(1+Format(perc()) * Percent("col"))')
            } else{
              column <- paste(column, '(1+Format(perc()) * Percent("col"))')
            }
            start_bracket <-
              sum(gregexpr("(", column, fixed = TRUE)[[1]] > 0)
            end_bracket <-
              sum(gregexpr(")", column, fixed = TRUE)[[1]] > 0)
            if (end_bracket - start_bracket != 0) {
              column <-
                paste(column, paste(rep(
                  ")", start_bracket - end_bracket - 1
                ), collapse = ""))
            }
            strata_count <- strata_count + 1
          }
          start_bracket <-
            sum(gregexpr("(", column, fixed = TRUE)[[1]] > 0)
          end_bracket <-
            sum(gregexpr(")", column, fixed = TRUE)[[1]] > 0)
          if (end_bracket - start_bracket != 0) {
            column <-
              paste(column, paste(rep(
                ")", start_bracket - end_bracket
              ), collapse = ""))
          } else{
            column <- '(1+Format(perc()) * Percent("col"))'
          }
        }


      }
      rows <- c(rbind(rep("empty", length(vars)), vars))
      if(!is.null(vars)){
        formula <- paste('1+',
                         paste(rows, collapse = "+"), '~',
                         column)
      }else{
        formula <- paste('1',
                         paste(rows, collapse = "+"), '~',
                         column)
      }



      #creating the tabular
      desc <- tables::tabular(as.formula(formula), ADS)

      #converting the tabular to a data frame
      desc_df <- as.data.frame.matrix(desc)
      desc_df[] <- lapply(desc_df[], as.numeric)


      #adding row labels
      desc_df$variables <-
        as.data.frame.matrix(rowLabels(desc))[, 1]
      if(!is.null(vars)){
        desc_df$labels <-
          as.data.frame.matrix(tables::rowLabels(desc))[, 2]
      }else{
        desc_df$labels <-
          as.data.frame.matrix(tables::rowLabels(desc))[, 1]
      }

      #re-arranging the dataframe
      desc_df <-
        desc_df[, c(ncol(desc_df) - 1, ncol(desc_df), 1:(ncol(desc_df) - 2))]

      #creating the empty row
      desc_df[desc_df$labels == "empty", ] <- NA

      #merging the variable and it's label in a single column
      desc_df$variables <- dplyr::lag(desc_df$variables)


      desc_df[, c(1, 2)] <-
        lapply(desc_df[, c(1, 2)][], as.character)
      desc_df <-
        desc_df[(
          desc_df$labels == 1 | desc_df$labels == "1:Yes" |
            desc_df$labels == "Yes" |
            desc_df$labels == "yes" |
            desc_df$labels == "YES" |
            desc_df$labels == "All"
        ) &
          !is.na(desc_df$labels), ]

      desc_df$labels <-
        ifelse(
          desc_df$labels == "" |  desc_df$labels == 1 |
            is.na(desc_df$labels) |  desc_df$labels == "1:Yes" |
            desc_df$labels == "YES" |
            desc_df$labels == "Yes" |  desc_df$labels == "yes" ,
          as.character(desc_df$variables),
          desc_df$labels
        )


      #removing the variable column
      desc_df <- desc_df[, -1]

      #adding the column labels
      desc_heading <- as.data.frame.matrix(colLabels(desc))
      desc_heading[] <- lapply(desc_heading[], as.character)
      if (!is.null(unlist(strata))) {
        if (is.null(percent)) {
          desc_heading[(max(lengths(strata)) * 2), c(1)] <- "Overall"
        } else{
          desc_heading[(max(lengths(strata)) * 2), c(1, 2)] <- "Overall"
        }
      }
      if (is.null(percent)) {
        desc_heading <-
          as.data.frame(rbind(desc_heading,  rep("All", length(desc_heading))))
      }
      desc_columns <-
        sapply(desc_heading[], function(x)
          paste(x, collapse = " | "))
      V0 <- rep("Variables", nrow(desc_heading))
      desc_heading <- cbind(V0 , desc_heading)
      if (!is.null(unlist(strata))) {
        if(!is.null(percent)){
          desc_heading[c(1:(max(lengths(strata)) * 2)), c(2, 3)] <-
            "Overall"
        }

      }
      #filling the empty values
      for (j in 1:ncol(desc_heading)) {
        for (i in 1:nrow(desc_heading)) {
          non_empty_row <- 0
          if (desc_heading[i, j] == "") {
            for (k in (i + 1):nrow(desc_heading)) {
              if (desc_heading[k, j] != "") {
                non_empty_row <- k
                break()
              }
            }
            if (desc_heading[k, j] == "All" |
                desc_heading[k, j] == "Percent") {
              desc_heading[i, j] <- "Overall"
            }
            else{
              desc_heading[i, j] <- desc_heading[k, j]
            }

          }
        }
        if (desc_heading[nrow(desc_heading), j] == "All") {
          desc_heading[nrow(desc_heading), j] <- "N"
        }
        if (desc_heading[nrow(desc_heading), j] == "Percent") {
          desc_heading[nrow(desc_heading), j] <- "%"
        }
      }
      desc_columns[1] <- gsub(".*\\| O", "O", desc_columns[1])
      desc_columns[2] <- gsub(".*\\| O", "O", desc_columns[2])
      for(i in 1:length(desc_columns)){
        desc_columns[i] <-
          sub("\\|  \\|", "\\|", desc_columns[i])
        if(is.null(percent)){
          desc_columns[i]<- sub("All$", "N", desc_columns[i])
        }
      }
      if (!is.null(unlist(strata))) {
        colnames(desc_df) <- c("Variables", desc_columns)
      } else if (!is.null(percent)) {
        colnames(desc_df) <- c("Variables", "All", "%")
      } else{
        colnames(desc_df) <- c("Variables", "All")
      }

      #dividing the percentage column by 100
      desc_df[, which(grepl("Percent", colnames(desc_df)))] <-
        desc_df[, which(grepl("Percent", colnames(desc_df)))] / 100

      #adding the chi.square value
      if (!is.null(test)) {
        if (test == "chi.sq") {
          chisq_vec <- c()
          for (i in vars) {
            chisq_vec <- c(chisq_vec,
                           sprintf(chisq.test(c(ADS[, i]),
                                              ADS[, unlist(strata[[1]])])$p.value,
                                   fmt = '%#.3f'))
          }
          chisq_df <-
            as.data.frame(cbind(Variables = vars, p_value = chisq_vec))
          desc_df$Chi.sq <- NA
          for (i in 1:nrow(desc_df)) {
            for (j in 1:nrow(chisq_df)) {
              if (desc_df[i, "Variables"] == chisq_df[j, "Variables"]) {
                desc_df[i, "Chi.sq"] = as.numeric(as.character(chisq_df[j, "p_value"]))
              }
            }
          }
          #adding the chisquare column in the heading
          desc_heading$Chi.sq <- "Chi.sq"
          desc_heading[nrow(desc_heading), "Chi.sq"] <- "p-value"
        }

      }
      #changing the strata name with their labels
      if (!is.null(unlist(strata_label))) {
        for (i in 1:nrow(desc_heading)) {
          for (j in 1:length(unlist(strata))) {
            if (!is.na(match(unlist(strata)[j], desc_heading[i,]))) {
              desc_heading[i,] <-
                replace(
                  desc_heading[i,],
                  which(desc_heading[i,] %in% unlist(strata)[j]),
                  unlist(strata_label)[j]
                )
            }
          }
        }

      }

      #changing the variable name with their labels
      if (!is.null(vars_label)) {
        desc_df[, 1] <-
          replace(desc_df[, 1], match(vars, desc_df[, 1]), vars_label)
      }

      #returning the final data frame
      return(list(
        table = desc_df,
        heading = desc_heading,
        tabular = desc,
        type = type
      ))
    }


    }
nivesh22/descriptive documentation built on Jan. 22, 2020, 8:03 p.m.