R/data_xtabulate.R

Defines functions .validate_table_weights .validate_by print_html.datawizard_crosstabs print.datawizard_crosstabs print_html.datawizard_crosstab print_md.datawizard_crosstab print.datawizard_crosstab format.datawizard_crosstab .crosstable

# helper to compute crosstables --------------

.crosstable <- function(x,
                        by,
                        weights = NULL,
                        remove_na = FALSE,
                        proportions = NULL,
                        obj_name = NULL,
                        group_variable = NULL) {
  if (!is.null(proportions)) {
    proportions <- match.arg(proportions, c("row", "column", "full"))
  }
  # frequency table
  if (is.null(weights)) {
    # we have a `.default` and a `.data.frame` method for `data_tabulate()`.
    # since this is the default, `x` can be an object which cannot be used
    # with `table()`, that's why we add `tryCatch()` here. Below we give an
    # informative error message for non-supported objects.
    if (remove_na) {
      x_table <- tryCatch(table(x, by), error = function(e) NULL)
    } else {
      x_table <- tryCatch(table(addNA(x), addNA(by)), error = function(e) NULL)
    }
  } else if (remove_na) {
    # weighted frequency table, excluding NA
    x_table <- tryCatch(
      stats::xtabs(
        weights ~ x + by,
        data = data.frame(weights = weights, x = x, by = by),
        na.action = stats::na.omit,
        addNA = FALSE
      ),
      error = function(e) NULL
    )
  } else {
    # weighted frequency table, including NA
    x_table <- tryCatch(
      stats::xtabs(
        weights ~ x + by,
        data = data.frame(weights = weights, x = addNA(x), by = addNA(by)),
        na.action = stats::na.pass,
        addNA = TRUE
      ),
      error = function(e) NULL
    )
  }

  if (is.null(x_table)) {
    insight::format_warning(paste0("Can't compute cross tables for objects of class `", class(x)[1], "`."))
    return(NULL)
  }

  out <- as.data.frame(stats::ftable(x_table))
  colnames(out) <- c("Value", "by", "N")
  total_n <- sum(out$N, na.rm = TRUE)

  # we want to round N for weighted frequencies
  if (!is.null(weights)) {
    out$N <- round(out$N)
    total_n <- round(total_n)
  }

  out <- data_to_wide(out, values_from = "N", names_from = "by")

  # use variable name as column name
  if (!is.null(obj_name)) {
    colnames(out)[1] <- obj_name
  }

  # for grouped data frames, add info about grouping variables
  if (!is.null(group_variable)) {
    var_info <- toString(lapply(colnames(group_variable), function(i) {
      sprintf("%s (%s)", i, group_variable[[i]])
    }))
    out <- cbind(out[1], data.frame(Group = var_info, stringsAsFactors = FALSE), out[-1])
  }

  attr(out, "total_n") <- total_n
  attr(out, "weights") <- weights
  attr(out, "proportions") <- proportions
  attr(out, "varname") <- obj_name

  class(out) <- c("datawizard_crosstab", "data.frame")

  out
}


# methods ---------------------


#' @export
format.datawizard_crosstab <- function(x, format = "text", digits = 1, big_mark = NULL, ...) {
  # convert to character manually, else, for large numbers,
  # format_table() returns scientific notation
  x <- as.data.frame(x)

  # remove group variable
  x$Group <- NULL

  # compute total N for rows and colummns
  total_n <- attributes(x)$total_n
  total_column <- rowSums(x[, -1], na.rm = TRUE)
  total_row <- c(colSums(x[, -1], na.rm = TRUE), total_n)

  # proportions?
  props <- attributes(x)$proportions

  if (!is.null(props)) {
    # we copy x to tmp, because when we create strings with "sprintf()", the
    # variable is coerced to character, and in subsequent iterations of the loop,
    # mathemathical operations are not possible anymore
    tmp <- x
    if (identical(props, "row")) {
      for (i in seq_len(nrow(x))) {
        row_sum <- sum(x[i, -1], na.rm = TRUE)
        if (row_sum == 0) {
          row_sum_string <- "(0%)"
        } else {
          row_sum_string <- sprintf("(%.*f%%)", digits, 100 * x[i, -1] / row_sum)
        }
        tmp[i, -1] <- paste(format(x[i, -1]), format(row_sum_string, justify = "right"))
      }
    } else if (identical(props, "column")) {
      for (i in seq_len(ncol(x))[-1]) {
        col_sum <- sum(x[, i], na.rm = TRUE)
        if (col_sum == 0) {
          col_sum_string <- "(0%)"
        } else {
          col_sum_string <- sprintf("(%.*f%%)", digits, 100 * x[, i] / col_sum)
        }
        tmp[, i] <- paste(format(x[, i]), format(col_sum_string, justify = "right"))
      }
    } else if (identical(props, "full")) {
      for (i in seq_len(ncol(x))[-1]) {
        tmp[, i] <- paste(
          format(x[, i]),
          format(sprintf("(%.*f%%)", digits, 100 * x[, i] / total_n), justify = "right")
        )
      }
    }
    # copy back final result
    x <- tmp
  }

  x[] <- lapply(x, as.character)

  # format data frame
  ftab <- insight::format_table(x, ...)
  # replace empty cells with NA
  ftab[] <- lapply(ftab, function(i) {
    i[i == ""] <- ifelse(identical(format, "text"), "<NA>", "(NA)") # nolint
    i
  })
  # Remove ".00" from numbers
  ftab$Total <- gsub("\\.00$", "", as.character(total_column))
  # for text format, insert "empty row" before last total row
  if (identical(format, "text") || identical(format, "markdown")) {
    empty_row <- as.data.frame(t(data.frame(
      rep("", ncol(ftab)),
      c("Total", as.character(total_row)),
      stringsAsFactors = FALSE
    )))
  } else {
    empty_row <- as.data.frame(t(data.frame(
      c("Total", as.character(total_row)),
      stringsAsFactors = FALSE
    )))
  }
  colnames(empty_row) <- colnames(ftab)
  ftab <- rbind(ftab, empty_row)
  ftab[nrow(ftab), ] <- gsub("\\.00$", "", ftab[nrow(ftab), ])

  # insert big marks?
  ftab$Total <- .add_commas_in_numbers(ftab$Total, big_mark)
  ftab[nrow(ftab), ] <- .add_commas_in_numbers(ftab[nrow(ftab), ], big_mark)

  # also format NA column name
  colnames(ftab)[colnames(ftab) == "NA"] <- ifelse(identical(format, "text"), "<NA>", "(NA)")

  ftab
}


#' @export
print.datawizard_crosstab <- function(x, big_mark = NULL, ...) {
  # grouped data? if yes, add information on grouping factor
  if (is.null(x[["Group"]])) {
    caption <- NULL
  } else {
    caption <- paste0("Grouped by ", x[["Group"]][1])
    x$Group <- NULL
  }

  # print table
  cat(insight::export_table(
    format(x, big_mark = big_mark, ...),
    cross = "+",
    missing = "<NA>",
    caption = caption,
    empty_line = "-"
  ))
  invisible(x)
}


#' @export
print_md.datawizard_crosstab <- function(x, big_mark = NULL, ...) {
  # grouped data? if yes, add information on grouping factor
  if (is.null(x[["Group"]])) {
    caption <- NULL
  } else {
    caption <- paste0("Grouped by ", x[["Group"]][1])
    x$Group <- NULL
  }

  # print table
  insight::export_table(
    format(x, format = "markdown", big_mark = big_mark, ...),
    cross = "+",
    missing = "<NA>",
    caption = caption,
    empty_line = "-",
    format = "markdown"
  )
}


#' @export
print_html.datawizard_crosstab <- function(x, big_mark = NULL, ...) {
  # grouped data? if yes, add information on grouping factor
  if (!is.null(x[["Group"]])) {
    x$groups <- paste0("Grouped by ", x[["Group"]][1])
    x$Group <- NULL
  }

  # print table
  insight::export_table(
    format(x, big_mark = big_mark, format = "html", ...),
    missing = "(NA)",
    format = "html",
    by = "groups"
  )
}


#' @export
print.datawizard_crosstabs <- function(x, big_mark = NULL, ...) {
  for (i in seq_along(x)) {
    print(x[[i]], big_mark = big_mark, ...)
    cat("\n")
  }
  invisible(x)
}


#' @export
print_html.datawizard_crosstabs <- function(x, big_mark = NULL, ...) {
  if (length(x) == 1) {
    print_html(x[[1]], big_mark = big_mark, ...)
  } else {
    x <- lapply(x, function(i) {
      # grouped data? if yes, add information on grouping factor
      if (!is.null(i[["Group"]])) {
        i$groups <- paste0("Grouped by ", i[["Group"]][1])
        i$Group <- NULL
      }
      format(i, format = "html", big_mark = big_mark, ...)
    })

    out <- do.call(rbind, x)

    # print table
    insight::export_table(
      out,
      missing = "(NA)",
      format = "html",
      by = "groups"
    )
  }
}


# helper ---------------------

.validate_by <- function(by, x) {
  if (!is.null(by)) {
    if (is.character(by)) {
      # If "by" is a character string, must be of length 1
      if (length(by) > 1) {
        insight::format_error(
          "If `by` is a string indicating a variable name, `by` must be of length 1.",
          "You may use `data_group()` to group by multiple variables, then call `data_tabulate()`."
        )
      }
      # if "by" is a character, "x" must be a data frame
      if (!is.data.frame(x)) {
        insight::format_error("If `by` is a string indicating a variable name, `x` must be a data frame.")
      }
      # is "by" a column in "x"?
      if (!by %in% colnames(x)) {
        insight::format_error(sprintf(
          "The variable specified in `by` was not found in `x`. %s",
          .misspelled_string(names(x), by, "Possibly misspelled?")
        ))
      }
      by <- x[[by]]
    }
    # is "by" of same length as "x"?
    if (is.data.frame(x) && length(by) != nrow(x)) {
      insight::format_error("Length of `by` must be equal to number of rows in `x`.") # nolint
    }
    if (!is.data.frame(x) && length(by) != length(x)) {
      insight::format_error("Length of `by` must be equal to length of `x`.") # nolint
    }
    if (!is.factor(by)) {
      # coerce "by" to factor, including labels
      by <- to_factor(by, labels_to_levels = TRUE, verbose = FALSE)
    }
  }

  by
}


.validate_table_weights <- function(weights, x, weights_expression = NULL) {
  # exception: for vectors, if weighting variable not found, "weights" is NULL.
  # to check this, we further need to check whether a weights expression was
  # provided, e.g. "weights = iris$not_found" - all this is only relevant when
  # weights is NULL
  if (is.null(weights)) {
    # possibly misspelled weights-variables for default-method ----------------
    # -------------------------------------------------------------------------

    # do we have any value for weights_expression?
    if (!is.null(weights_expression) &&
      # due to deparse() and substitute, NULL becomes "NULL" - we need to check for this
      !identical(weights_expression, "NULL") &&
      # we should only run into this problem, when a variable from a data frame
      # is used in the data_tabulate() method for vectors - thus, we need to check
      # whether the weights_expression contains a "$" - `iris$not_found` is "NULL"
      # we need this check, because the default-method of data_tabulate() is called
      # from the data.frame method, where `weights = weights`, and then,
      # deparse(substitute(weights)) is "weights" (not "NULL" or "iris$not_found"),
      # leading to an error when actually all is OK (if "weights" is NULL)
      # Example:
      #> efc$weights <- abs(rnorm(n = nrow(efc), mean = 1, sd = 0.5))
      # Here, efc$wweight is NULL
      #> data_tabulate(efc$c172code, weights = efc$wweight)
      # Here, wweight errors anyway, because object "wweight" is not found
      #> data_tabulate(efc$c172code, weights = wweight)
      grepl("$", weights_expression, fixed = TRUE)) {
      insight::format_error("The variable specified in `weights` was not found. Possibly misspelled?")
    }
  } else {
    # possibly misspecified weights-variables for data.frame-method -----------
    # -------------------------------------------------------------------------

    if (is.character(weights)) {
      # If "weights" is a character string, must be of length 1
      if (length(weights) > 1) {
        insight::format_error(
          "If `weights` is a string indicating a variable name, `weights` must be of length 1."
        )
      }
      # if "weights" is a character, "x" must be a data frame
      if (!is.data.frame(x)) {
        insight::format_error("If `weights` is a string indicating a variable name, `x` must be a data frame.") # nolint
      }
      # is "by" a column in "x"?
      if (!weights %in% colnames(x)) {
        insight::format_error(sprintf(
          "The variable specified in `weights` was not found in `x`. %s",
          .misspelled_string(names(x), weights, "Possibly misspelled?")
        ))
      }
      weights <- x[[weights]]
    }
    # is "by" of same length as "x"?
    if (is.data.frame(x) && length(weights) != nrow(x)) {
      insight::format_error("Length of `weights` must be equal to number of rows in `x`.") # nolint
    }
    if (!is.data.frame(x) && length(weights) != length(x)) {
      insight::format_error("Length of `weights` must be equal to length of `x`.") # nolint
    }
  }

  weights
}

Try the datawizard package in your browser

Any scripts or data that you put into this service are public.

datawizard documentation built on Oct. 6, 2024, 1:08 a.m.