R/tab1.R

Defines functions tab1 render.varlabel render.missing.default render.categorical.default render.continuous.default parse.abbrev.render.code render.default stats.apply.rounding stats.default signif_pad

Documented in parse.abbrev.render.code render.categorical.default render.continuous.default render.default render.missing.default render.varlabel signif_pad stats.apply.rounding stats.default tab1

#' Round numbers with 0-padding.
#'
#' Utility functions to round numbers, similar the the base functions \code{signif}
#' and \code{round}, but resulting in character representations that keep zeros at
#' the right edge if they are significant.
#'
#' @param x A numeric vector.
#' @param digits An integer specifying the number of significant digits to keep
#' (for \code{signif_pad}) or the number of digits after the decimal point (for
#' \code{round_pad}).
#' @param round.integers Should rounding be limited to digits to the right of
#' the decimal point?
#' @param round5up Should numbers with 5 as the last digit always be rounded
#' up? The standard R approach is "go to the even digit" (IEC 60559 standard,
#' see \code{\link{round}}), while some other softwares (e.g. SAS, Excel)
#' always round up.
#' @param dec The character symbol to use as decimal mark (locale
#' specific). [Deprecated; use \code{decimal.mark} instead]
#' @param ... Further options, passed to \code{formatC} (which is used
#' internally). Not all options will work, but some might be useful (e.g.
#' \code{big.mark}, \code{decimal.mark}).
#'
#' @return A character vector containing the rounded numbers.
#'
#' @seealso
#' \code{\link{signif}}
#' \code{\link{round}}
#' \code{\link{formatC}}
#' \code{\link{prettyNum}}
#' \code{\link{format}}
#'
#' @examples
#' x <- c(0.9001, 12345, 1.2, 1., 0.1, 0.00001 , 1e5)
#' signif_pad(x, digits=3)
#' signif_pad(x, digits=3, round.integers=TRUE)
#'
#' # Compare:
#' as.character(signif(x, digits=3))
#' format(x, digits=3, nsmall=3)
#' prettyNum(x, digits=3, drop0trailing=TRUE)
#' prettyNum(x, digits=3, drop0trailing=FALSE)
#'
#' # This is very close.
#' formatC(x, format="fg", flag="#", digits=3)
#' formatC(signif(x, 3), format="fg", flag="#", digits=3)
#'
#' # Could always remove the trailing "."
#' sub("[.]$", "", formatC(x, format="fg", flag="#", digits=3))
#'
#' @keywords utilities
#' @export
signif_pad <-
    function(x,
             digits = 3,
             round.integers = TRUE,
             round5up = TRUE,
             dec,
             ...) {
        args <- list(...)
        if (!missing(dec)) {
            warning("argument dec is deprecated; please use decimal.mark instead.",
                    call. = FALSE)
            args$decimal.mark <- dec
        }
        eps <- if (round5up)
            x * (10 ^ (-(digits + 3)))
        else
            0
        
        rx <- ifelse(x >= 10 ^ digits & .isFALSE(round.integers),
                     round(x),
                     signif(x + eps, digits))
        
        cx <- do.call(formatC,
                      c(list(
                          x = rx,
                          digits = digits,
                          format = "fg",
                          flag = "#"
                      ),
                      args[names(args) %in% names(formals(formatC))]))
        
        cx[is.na(x)] <- "0"  # Put in a dummy value for missing x
        cx <- gsub("[^0-9]*$", "", cx) # Remove any trailing non-digit characters
        ifelse(is.na(x), NA, cx)
    }

#' @rdname signif_pad
#' @export
round_pad <- function (x,
                       digits = 2,
                       round5up = TRUE,
                       dec,
                       ...) {
    args <- list(...)
    if (!missing(dec)) {
        warning("argument dec is deprecated; please use decimal.mark instead.",
                call. = FALSE)
        args$decimal.mark <- dec
    }
    eps <- if (round5up)
        10 ^ (-(digits + 3))
    else
        0
    
    rx <- round(x + eps, digits)
    
    cx <- do.call(formatC,
                  c(list(
                      x = rx,
                      digits = digits,
                      format = "f",
                      flag = "0"
                  ),
                  args[names(args) %in% names(formals(formatC))]))
    ifelse(is.na(x), NA, cx)
}

#' Compute some basic descriptive statistics.
#'
#' Values of type \code{factor}, \code{character} and \code{logical} are
#' treated as categorical. For logicals, the two categories are given the
#' labels `Yes` for \code{TRUE}, and `No` for \code{FALSE}.  Factor levels with
#' zero counts are retained.
#'
#' @param x A vector or numeric, factor, character or logical values.
#' @param quantile.type An integer from 1 to 9, passed as the \code{type}
#' argument to function \code{\link[stats]{quantile}}.
#' @param ... Further arguments (ignored).
#'
#' @return A list. For numeric \code{x}, the list contains the numeric elements:
#' \itemize{
#'   \item \code{N}: the number of non-missing values
#'   \item \code{NMISS}: the number of missing values
#'   \item \code{SUM}: the sum of the non-missing values
#'   \item \code{MEAN}: the mean of the non-missing values
#'   \item \code{SD}: the standard deviation of the non-missing values
#'   \item \code{MIN}: the minimum of the non-missing values
#'   \item \code{MEDIAN}: the median of the non-missing values
#'   \item \code{CV}: the percent coefficient of variation of the non-missing values
#'   \item \code{GMEAN}: the geometric mean of the non-missing values if non-negative, or \code{NA}
#'   \item \code{GCV}: the percent geometric coefficient of variation of the
#'   non-missing values if non-negative, or \code{NA}
#'   \item \code{qXX}: various quantiles (percentiles) of the non-missing
#'   values (q01: 1\%, q02.5: 2.5\%, q05: 5\%, q10: 10\%, q25: 25\% (first
#'   quartile), q33.3: 33.33333\% (first tertile), q50: 50\% (median, or second
#'   quartile), q66.7: 66.66667\% (second tertile), q75: 75\% (third quartile),
#'   q90: 90\%, q95: 95\%, q97.5: 97.5\%, q99: 99\%)
#'   \item \code{Q1}: the first quartile of the non-missing values (alias \code{q25})
#'   \item \code{Q2}: the second quartile of the non-missing values (alias \code{q50} or \code{Median})
#'   \item \code{Q3}: the third quartile of the non-missing values (alias \code{q75})
#'   \item \code{IQR}: the inter-quartile range of the non-missing values (i.e., \code{Q3 - Q1})
#'   \item \code{T1}: the first tertile of the non-missing values (alias \code{q33.3})
#'   \item \code{T2}: the second tertile of the non-missing values (alias \code{q66.7})
#' }
#' If \code{x} is categorical (i.e. factor, character or logical), the list
#' contains a sublist for each category, where each sublist contains the
#' numeric elements:
#' \itemize{
#'   \item \code{FREQ}: the frequency count
#'   \item \code{PCT}: the percent relative frequency, including NA in the denominator
#'   \item \code{PCTnoNA}: the percent relative frequency, excluding NA from the denominator
#' }
#'
#' @examples
#' x <- exp(rnorm(100, 1, 1))
#' stats.default(x)
#'
#' y <- factor(sample(0:1, 99, replace=TRUE), labels=c("Female", "Male"))
#' y[1:10] <- NA
#' stats.default(y)
#' stats.default(is.na(y))
#'
#' @keywords utilities
#' @export
#' @importFrom stats sd median quantile IQR na.omit
stats.default <- function(x, quantile.type = 7, ...) {
    if (is.logical(x)) {
        x <- factor(1 - x,
                    levels = c(0, 1),
                    labels = c("Yes", "No"))
    }
    if (is.factor(x) || is.character(x)) {
        y <- table(x, useNA = "no")
        nn <- names(y)
        nn[is.na(nn)] <- "Missing"
        names(y) <- nn
        lapply(y, function(z)
            list(
                FREQ = z,
                PCT = 100 * z / length(x),
                PCTnoNA = 100 * z / sum(y),
                N = sum(y)
            ))
    } else if (is.numeric(x) && sum(!is.na(x)) == 0) {
        list(
            N = sum(!is.na(x)),
            NMISS = sum(is.na(x)),
            SUM = NA,
            MEAN = NA,
            SD = NA,
            CV = NA,
            GMEAN = NA,
            GCV = NA,
            MEDIAN = NA,
            MIN = NA,
            MAX = NA,
            q01 = NA,
            q025 = NA,
            q05 = NA,
            q10 = NA,
            q25 = NA,
            q50 = NA,
            q75 = NA,
            q90 = NA,
            q95 = NA,
            q975 = NA,
            q99 = NA,
            Q1 = NA,
            Q2 = NA,
            Q3 = NA,
            IQR = NA,
            T1 = NA,
            T2 = NA
        )
    } else if (is.numeric(x)) {
        q <-
            quantile(
                x,
                probs = c(
                    0.01,
                    0.025,
                    0.05,
                    0.1,
                    0.25,
                    1 / 3,
                    0.5,
                    2 / 3,
                    0.75,
                    0.9,
                    0.95,
                    0.975,
                    0.99
                ),
                na.rm = TRUE,
                type = quantile.type
            )
        list(
            N = sum(!is.na(x)),
            NMISS = sum(is.na(x)),
            SUM = sum(x, na.rm = TRUE),
            MEAN = mean(x, na.rm = TRUE),
            SD = sd(x, na.rm = TRUE),
            CV = 100 * sd(x, na.rm = TRUE) / abs(mean(x, na.rm = TRUE)),
            GMEAN = if (any(na.omit(x) <= 0))
                NA
            else
                exp(mean(log(x), na.rm = TRUE)),
            GCV = if (any(na.omit(x) <= 0))
                NA
            else
                100 * sqrt(exp(sd(
                    log(x), na.rm = TRUE
                ) ^ 2) - 1),
            MEDIAN = median(x, na.rm = TRUE),
            MIN = min(x, na.rm = TRUE),
            MAX = max(x, na.rm = TRUE),
            q01 = q["1%"],
            q02.5 = q["2.5%"],
            q05 = q["5%"],
            q10 = q["10%"],
            q25 = q["25%"],
            q50 = q["50%"],
            q75 = q["75%"],
            q90 = q["90%"],
            q95 = q["95%"],
            q97.5 = q["97.5%"],
            q99 = q["99%"],
            Q1 = q["25%"],
            Q2 = q["50%"],
            Q3 = q["75%"],
            IQR = q["75%"] - q["25%"],
            T1 = q["33.33333%"],
            T2 = q["66.66667%"]
        )
    } else {
        stop(paste("Unrecognized variable type:", class(x)))
    }
}

#' Apply rounding to basic descriptive statistics.
#'
#' Not all statistics should be rounded in the same way, or at all. This
#' function will apply rounding selectively to a list of statistics as returned
#' by \code{\link{stats.default}}. In particular we don't round counts (N and
#' FREQ), and for MIN, MAX and MEDIAN the \code{digits} is interpreted as the
#' \emph{minimum} number of significant digits, so that we don't loose any
#' precision. Percentages are rounded to a fixed number of decimal places
#' (default 1) rather than a specific number of significant digits.
#'
#' @param x A list, such as that returned by \code{\link{stats.default}}.
#' @param digits An integer specifying the number of significant digits to keep.
#' @param digits.pct An integer specifying the number of digits after the
#' decimal place for percentages.
#' @param round.median.min.max Should rounding applied to median, min and max?
#' @param round.integers Should rounding be limited to digits to the right of
#' the decimal point?
#' @param round5up Should numbers with 5 as the last digit always be rounded
#' up? The standard R approach is "go to the even digit" (IEC 60559 standard,
#' see \code{\link{round}}), while some other softwares (e.g. SAS, Excel)
#' always round up.
#' @param ... Further arguments.
#'
#' @return A list with the same number of elements as \code{x}. The rounded
#' values will be \code{character} (not \code{numeric}) and will have 0 padding
#' to ensure consistent number of significant digits.
#'
#' @seealso
#' \code{\link{signif_pad}}
#' \code{\link{stats.default}}
#' @examples
#' x <- round(exp(rnorm(100, 1, 1)), 6)
#' stats.default(x)
#' stats.apply.rounding(stats.default(x), digits=3)
#' stats.apply.rounding(stats.default(round(x, 1)), digits=3)
#'
#' @keywords utilities
#' @export
stats.apply.rounding <- function(x,
                                 digits = 3,
                                 digits.pct = 1,
                                 round.median.min.max = TRUE,
                                 round.integers = TRUE,
                                 round5up = TRUE,
                                 ...) {
    mindig <- function(x, digits) {
        cx <- format(x)
        ndig <- nchar(gsub("\\D", "", cx))
        ifelse(
            ndig > digits,
            cx,
            signif_pad(
                x,
                digits = digits,
                round.integers = round.integers,
                round5up = round5up,
                ...
            )
        )
    }
    format.percent <- function(x, digits) {
        if (x == 0)
            "0"
        else if (x == 100)
            "100"
        else
            formatC(x, digits = digits.pct, format = "f")
    }
    if (!is.list(x)) {
        stop("Expecting a list")
    }
    if (is.list(x[[1]])) {
        # Apply recursively
        lapply(
            x,
            stats.apply.rounding,
            digits = digits,
            digits.pct = digits.pct,
            round.integers = round.integers,
            round5up = round5up,
            ...
        )
    } else {
        cx <- lapply(x, format)
        r <- lapply(
            x,
            signif_pad,
            digits = digits,
            round.integers = round.integers,
            round5up = round5up,
            ...
        )
        nr <- c("N", "NMISS", "FREQ")       # No rounding
        nr <- nr[nr %in% names(x)]
        nr <- nr[!is.na(x[nr])]
        r[nr] <- cx[nr]
        if (!round.median.min.max) {
            sr <-
                c("MEDIAN", "MIN", "MAX")  # Only add significant digits, don't remove any
            sr <- sr[sr %in% names(x)]
            r[sr] <- lapply(x[sr], mindig, digits = digits)
        }
        pr <- c("PCT", "CV", "GCV")   # Percentages
        pr <- pr[pr %in% names(x)]
        pr <- pr[!is.na(x[pr])]
        r[pr] <-
            lapply(as.numeric(x[pr]), format.percent, digits = digits.pct)
        r
    }
}

#' Render values for table output.
#'
#' Called from \code{\link{tab1}} by default to render values for
#' displaying in the table. This function forwards the call to separate
#' functions for rendering continuous, categorical and missing values.
#' The idea is that each of these functions can be overridden to customize
#' the table output.
#'
#' @param x A vector or numeric, factor, character or logical values.
#' @param name Name of the variable to be rendered (ignored).
#' @param missing Should missing values be included?
#' @param render.empty A \code{character} to return when \code{x} is empty.
#' @param render.continuous A function to render continuous (i.e.
#' \code{numeric}) values. Can also be a \code{character} string, in which case
#' it is passed to \code{\link{parse.abbrev.render.code}}.
#' @param render.categorical A function to render categorical (i.e.
#' \code{factor}, \code{character} or \code{logical}) values. Can also be a
#' \code{character} string, in which case it is passed to
#' \code{\link{parse.abbrev.render.code}}.
#' @param render.missing A function to render missing (i.e. \code{NA}) values.
#' Can also be a \code{character} string, in which case it is passed to
#' \code{\link{parse.abbrev.render.code}}. Set to \code{NULL} to ignore missing
#' values.
#' @param ... Further arguments, passed to \code{\link{stats.apply.rounding}}.
#'
#' @return A \code{character} vector. Each element is to be displayed in a
#' separate cell in the table. The \code{\link{names}} of the vector are the
#' labels to use in the table. However, the first names should be empty as it
#' will be replaced by the name of the variable. Empty strings are allowed and
#' result in empty table cells.
#'
#' @examples
#' x <- exp(rnorm(100, 1, 1))
#' render.default(x)
#' render.default(x, TRUE)
#'
#' y <- factor(sample(0:1, 99, replace=TRUE), labels=c("Female", "Male"))
#' y[1:10] <- NA
#' render.default(y)
#'
#' @keywords utilities
#' @export
render.default <-
    function(x,
             name,
             missing = any(is.na(x)),
             render.empty = "NA",
             render.continuous = render.continuous.default,
             render.categorical = render.categorical.default,
             render.missing = render.missing.default,
             ...) {
        if (is.character(render.continuous)) {
            render.continuous <-
                parse.abbrev.render.code(code = render.continuous, ...)
        }
        if (is.character(render.categorical)) {
            render.categorical <-
                parse.abbrev.render.code(code = render.categorical, ...)
        }
        if (!is.null(render.missing) && is.character(render.missing)) {
            render.missing <- parse.abbrev.render.code(code = render.missing, ...)
        }
        if (length(x) == 0) {
            return(render.empty)
        }
        if (is.logical(x)) {
            x <- factor(x,
                        levels = c(T, F),
                        labels = c("Yes", "No"))
        }
        if (is.factor(x) || is.character(x)) {
            r <- do.call(render.categorical, c(list(x = x), list(...)))
        } else if (is.numeric(x)) {
            r <- do.call(render.continuous, c(list(x = x), list(...)))
        } else {
            stop(paste("Unrecognized variable type:", class(x)))
        }
        if (missing && !is.null(render.missing)) {
            r <- c(r, do.call(render.missing, c(list(x = x), list(...))))
        }
        
        # Remove empty levels from data
        r <- gsub("NA% \\(0/0\\)", "", r)
        r <- gsub("0% \\(0/\\d+\\)", "", r)
        gsub("0 \\(0%\\)", "", r)
        
    }

#' Parse abbreviated code for rendering table output.
#'
#' @param code A \code{character} vector specifying the statistics to display
#' in abbreviated code. See Details.
#' @param ... Further arguments, passed to \code{\link{stats.apply.rounding}}.
#'
#' @return A function that takes a single argument and returns a
#' \code{character} vector.
#'
#' @details In abbreviated code, the words N, NMISS, MEAN, SD, MIN, MEDIAN,
#' MAX, IQR, CV, GMEAN, GCV, FREQ and PCT are substituted for their respective
#' values (see \code{\link{stats.default}}). The substitution is case
#' insensitive, and the substituted values are rounded appropriately (see
#' \code{\link{stats.apply.rounding}}). Other text is left unchanged. The
#' \code{code} can be a vector, in which case each element is displayed in its
#' own row in the table. The \code{names} of \code{code} are used as row
#' labels; if no names are present, then the \code{code} itself is used unless
#' \code{code} is of length 1, in which case no label is used (for numeric
#' variables only, categorical variables are always labeled by the class
#' label). The special name '.' also indicates that \code{code} itself be is
#' used as the row label.
#'
#' @examples
#' \dontrun{
#' x <- round(exp(rnorm(100, log(20), 1)), 2)
#' stats.default(x)
#' f <- parse.abbrev.render.code(c("Mean (SD)", "Median [Min, Max]"), 3)
#' f(x)
#' f2 <- parse.abbrev.render.code(c("Geo. Mean (Geo. CV%)" = "GMean (GCV%)"), 3)
#' f2(x)
#' f3 <- parse.abbrev.render.code(c("Mean (SD)"), 3)
#' f3(x)
#'
#' x <- sample(c("Male", "Female"), 30, replace=T)
#' stats.default(x)
#' f <- parse.abbrev.render.code("Freq (Pct%)")
#' f(x)
#' }
#'
#' @keywords utilities
#' @export
parse.abbrev.render.code <-
    function(code, ...) {
        codestr <- code
        if (is.null(names(codestr)) && length(codestr) > 1) {
            names(codestr) <- codestr
        }
        names(codestr)[names(codestr) == "."] <-
            codestr[names(codestr) == "."]
        function(x, ...) {
            s <- stats.apply.rounding(stats.default(x, ...), ...)
            g <- function(ss) {
                res <- codestr
                for (nm in names(ss)) {
                    res <- gsub(paste0("\\b", nm, "\\b"),
                                ss[[nm]],
                                res,
                                ignore.case = T)
                }
                names(res) <- names(codestr)
                res
            }
            if (!is.list(s)) {
                stop("Expecting a list")
            }
            if (is.list(s[[1]])) {
                res <- lapply(s, g)
                nm <- ifelse(sapply(res, seq_along) == 1, "1", "")
                nm[nm == "1"] <- names(s)
                res <- unlist(res)
                names(res) <- nm
                c("", res)
            } else {
                if (length(codestr) == 1 && is.null(names(codestr))) {
                    g(s)
                } else {
                    c("", g(s))
                }
            }
        }
    }

#' Render continuous values for table output.
#'
#' Called from \code{\link{tab1}} by default to render continuous (i.e.
#' \code{numeric}) values for displaying in the table.
#'
#' @param x A numeric vector.
#' @param ... Further arguments, passed to \code{\link{stats.apply.rounding}}.
#'
#' @return A \code{character} vector. Each element is to be displayed in a
#' separate cell in the table. The \code{\link{names}} of the vector are the
#' labels to use in the table. However, the first names should be empty as it
#' will be replaced by the name of the variable. Empty strings are allowed and
#' result in empty table cells.
#'
#' @examples
#' x <- exp(rnorm(100, 1, 1))
#' render.continuous.default(x)
#'
#' @keywords utilities
#' @export
render.continuous.default <- function(x, ...) {
    with(
        stats.apply.rounding(stats.default(x, ...), ...),
        c(
            "",
            "Valid Obs."        = sprintf("%s", N),
            "Mean (SD)"         = sprintf("%s (%s)", MEAN, SD),
            "Median [Min, Max]" = sprintf("%s [%s, %s]", MEDIAN, MIN, MAX)
        )
    )
}

#' Render categorical values for table output.
#'
#' Called from \code{\link{tab1}} by default to render categorical (i.e.
#' \code{factor}, \code{character} or \code{logical}) values for displaying in the table.
#'
#' @param x A vector of type \code{factor}, \code{character} or \code{logical}.
#' @param ... Further arguments, passed to \code{\link{stats.apply.rounding}}.
#' @param na.is.category Include missing values in the denominator for
#' calculating percentages or omit them (the default). This option do not for
#' the moment.
#'
#' @return A \code{character} vector. Each element is to be displayed in a
#' separate cell in the table. The \code{\link{names}} of the vector are the
#' labels to use in the table. However, the first names should be empty as it
#' will be replaced by the name of the variable. Empty strings are allowed and
#' result in empty table cells.
#'
#' @examples
#' y <- factor(sample(0:1, 99, replace=TRUE), labels=c("Female", "Male"))
#' y[1:10] <- NA
#' render.categorical.default(y)
#' @keywords utilities
#' @export
render.categorical.default <- function(x, ..., na.is.category = FALSE) {
        c("", sapply(stats.apply.rounding(stats.default(x, ...), ...),
                     function(y)
                         with(y, ifelse(FREQ == 0, "",
                                        sprintf("%s/%s (%s%%)", FREQ, N, PCTnoNA)))))
    }

#' Render missing values for table output.
#'
#' Called from \code{\link{tab1}} by default to render missing (i.e.
#' \code{NA}) values for displaying in the table.
#'
#' @param x A vector.
#' @param ... Further arguments, passed to \code{\link{stats.apply.rounding}}.
#'
#' @return A \code{character} vector. Each element is to be displayed in a
#' separate cell in the table. The \code{\link{names}} of the vector are the
#' labels to use in the table. Empty strings are allowed and
#' result in empty table cells.
#'
#' @examples
#' y <- factor(sample(0:1, 99, replace=TRUE), labels=c("Female", "Male"))
#' y[1:10] <- NA
#' render.missing.default(y)
#' @keywords utilities
#' @export
render.missing.default <- function(x, ...) {
    with(stats.apply.rounding(stats.default(is.na(x), ...), ...)$Yes,
         c(Missing = sprintf("%s (%s%%)", FREQ, PCT)))
}


#' Render variable labels for table output.
#'
#' Called from \code{\link{tab1}} by default to render variable labels
#' for displaying in the table.
#'
#' @param x A vector, usually with the \code{\link{var_lab}} and (if appropriate)
#' \code{\link{unit}} attributes.
#' @param transpose Logical indicating whether on not the table is transposed.
#'
#' @return A \code{character}, which may contain HTML markup.
#'
#' @examples
#' x <- exp(rnorm(100, 1, 1))
#' var_lab(x) <- "Weight"
#' render.varlabel(x)
#'
#' y <- factor(sample(0:1, 99, replace=TRUE), labels=c("Female", "Male"))
#' y[1:10] <- NA
#' var_lab(y) <- "Sex"
#' render.varlabel(y)
#' @keywords utilities
#' @export
render.varlabel <- function(x, transpose = F) {
    sprintf("%s", var_lab(x))
}


#' Generate an flextable of descriptive statistics.
#'
#' There are two interfaces, the default, which typically takes a list of
#' \code{data.frame}s for \code{x}, and the formula interface. The formula
#' interface is less flexible, but simpler to use and designed to handle the
#' most common use cases. It is important to use factors appropriately for
#' categorical variables (i.e. have the levels labeled properly and in the
#' desired order). The contents of the table can be customized by providing
#' user-defined `renderer' functions. To facilitate this, some tags (such as
#'  row labels) are given specific classes for easy CSS selection.
#'
#' @details For the default version, is is expected that \code{x} is a named
#' list of \code{data.frame}s, one for each stratum, with names corresponding to
#' strata labels.
#'
#' @param vars Variables to be used for summary table.
#' @param data A \code{data.frame} from which the variables in \code{vars}
#' should be taken.
#' @param group Name of the grouping variable.
#' @param row_split Variable that used for splitting table rows, rows will be
#'  splited using this variable. Useful for repeated measures.
#' @param overall A label for the "Total" column. Specify \code{NULL} or
#' \code{FALSE} to omit the column altogether.
#' @param select a named vector with as many components as row-variables. Every 
#' element of `select` will be used to select the individuals to be analyzed
#'  for every row-variable. Name of the vector corresponds to the row variable,
#'  element is the selection.
#' @param render A function to render the table cells (see Details).
#' @param drop_lev Should empty factor levels be dropped?
#' @param indent Indent symbol of the table for statistic values.
#' @param ... Further arguments, passed to \code{render}.
#'
#' @return An object of class "tab1".
#'
#' @examples
#'
#' dat <- expand.grid(id=1:10, sex=c("Male", "Female"), treat=c("Treated", "Placebo"))
#' dat$age <- runif(nrow(dat), 10, 50)
#' dat$age[3] <- NA  # Add a missing value
#' dat$wt <- exp(rnorm(nrow(dat), log(70), 0.2))
#'
#' var_lab(dat$sex) <- "Sex"
#' var_lab(dat$age) <- "Age"
#' var_lab(dat$treat) <- "Treatment Group"
#' var_lab(dat$wt) <- "Weight"
#'
#'
#' # Something more complicated
#'
#' dat$dose <- ifelse(dat$treat=="Placebo", "Placebo",
#'                    sample(c("5 mg", "10 mg"), nrow(dat), replace=TRUE))
#' dat$dose <- factor(dat$dose, levels=c("Placebo", "5 mg", "10 mg"))
#'
#'
#'
#' my.render.cont <- function(x) {
#'     with(stats.default(x),
#'         sprintf("%0.2f (%0.1f)", MEAN, SD))
#' }
#'
#' tab1(c("age", "sex", "wt"),
#' data = dat,
#' group = "treat",
#'  render.continuous=my.render.cont)
#'
#'
#' @keywords utilities
#' @export
tab1 <- function(vars,
                   data,
                   group = NULL,
                   row_split = NULL,
                   overall = TRUE,
                   select = NULL,
                   render = render.default,
                   drop_lev = TRUE,
                   indent = "\t",
                   ...) {
    if (is.character(render)) {
        render <- parse.abbrev.render.code(code = render, ...)
    }
    
    vars_list <- c(vars, group, row_split)
    if (!all(vars_list %in% names(data))) {
        stop(
            "Variable ",
            paste(vars_list[!vars_list %in% names(data)], collapse = ", "),
            " not in the dataset, please check!"
        )
    }
    
    if(base::anyDuplicated(vars_list))
        stop("vars, group and row_splot duplicated.")

    # Select records with non-missing group and row split
    df <- data[stats::complete.cases(data[c(group, row_split)]), 
               vars_list, drop = FALSE]
    
    # Variable names to labels if no variable label
    with_varlab <- sapply(df, has.label)
    for (i in names(df)[!with_varlab]) {
        var_lab(df[[i]]) <- i
    }
    
    variables <- sapply(names(df), function(v)
        render.varlabel(df[[v]]), simplify = FALSE)
    
    # Group variable to factor
    if (!is.null(group)) {
        if (has.labels(df[[group]]) | !is.factor(df[[group]]))
            df[[group]] <- to_factor(df[[group]])
        group_label <- var_lab(df[[group]])
    }
    
    if (!is.null(row_split)) {
        if (has.labels(df[[row_split]]) | !is.factor(df[[row_split]]))
            df[[row_split]] <- to_factor(df[[row_split]])
    }

    # Check if missing
    any.missing <- sapply(vars, function(v)sum(is.na(df[[v]]))) > 0

    # Generate selection vector function
    gen_selec <- function(dat, var, select = NULL) {
        if (is.null(select) | !var %in% names(select)) {
            return(rep(TRUE, length(dat[[var]])))
        } else{
            r <- eval(str2expression(select[var]), envir = dat)
            r & !is.na(r)
        }
    }
    
    # Create value labels for characters variables to avoid missing levels between groups
    df[sapply(df, is.character)] <- lapply(df[sapply(df, is.character)],
                                           to_character)
    
    # Tabulation main function
    calc_tab <- function(dat) {
        # Transform data to list for loop
        if (overall & !is.null(group)) {
            x <- c(split(dat, dat[[group]]), list(Total = dat))
        } else if (!is.null(group)) {
            x <- split(dat, dat[[group]])
        } else{
            x <- list(Total = dat)
        }
        
        out <- lapply(vars, function(v) {
            # Apply stats
            y <- do.call(cbind, lapply(x, function(s) {
                z <- s[[v]][gen_selec(s, v, select)] # Apply subset
                # Convert character to factor
                if(has.labels(z) | is.character(z)) 
                    z <- to_factor(z)
                render(x = z, name = v, missing = any.missing[v], ...)
            }))
            rownames(y) <- paste(indent, rownames(y), sep = "")
            rownames(y)[1] <- variables[[v]]
            return(y)
        })
        
        out <- lapply(out, function(x) {
            cbind(x, c(1, rep(2, nrow(x) - 1)))
        })
        
        out <- do.call(rbind, out)
        rbind("Observation" = c(sapply(x, nrow), 1), out)
    }
    
    if (is.null(row_split)) {
        tbody <- calc_tab(df)
    } else{
        split_lab <- variables[[row_split]]
        dfm <- split(df, df[[row_split]])
        tbody <- lapply(names(dfm), function(x) {
            out <- calc_tab(dfm[[x]])
            out <- rbind(c(rep("", ncol(out) - 1), 0), out)
            row.names(out)[1] <- paste(split_lab, "=", x)
            return(out)
        })
        tbody <- do.call("rbind", tbody)
    }
    
    # Remove empty rows
    if (drop_lev)
        tbody <- tbody[!(rowSums(tbody != "") == 1 &
                             !tbody[, ncol(tbody)] %in% c(0, 1)), ]
    
    # Position of variable to bold and merge horizontal
    # pos <- tbody[, ncol(tbody)]
    res <- tbody[,-ncol(tbody), drop = FALSE]
    
    structure(res,
              position = unname(tbody[, ncol(tbody)]),
              class = c("tab1", class(res))
    )
    
}
adayim/cttab documentation built on Dec. 18, 2021, 10:27 p.m.