sjmisc: Data and Variable Transformation Functions

Documented in frq

#' @title Frequency table of labelled variables
#' @name frq
#'
#' @description This function returns a frequency table of labelled vectors, as data frame.
#'
#' @param sort.frq Determines whether categories should be sorted
#'   according to their frequencies or not. Default is \code{"none"}, so
#'   categories are not sorted by frequency. Use \code{"asc"} or
#'   \code{"desc"} for sorting categories ascending or descending order.
#' @param weights Bare name, or name as string, of a variable in \code{x}
#'   that indicates the vector of weights, which will be applied to weight all
#'   observations. Default is \code{NULL}, so no weights are used.
#' @param auto.grp Numeric value, indicating the minimum amount of unique
#'   values in a variable, at which automatic grouping into smaller  units
#'   is done (see \code{\link{group_var}}). Default value for \code{auto.group}
#'   is \code{NULL}, i.e. auto-grouping is off.
#' @param show.strings Logical, if \code{TRUE}, frequency tables for character
#'   vectors will not be printed. This is useful when printing frequency tables
#'   of all variables from a data frame, and due to computational reasons
#'   character vectors should not be printed.
#' @param show.na Logical, or \code{"auto"}. If \code{TRUE}, the output always
#'   contains information on missing values, even if variables have no missing
#'   values. If \code{FALSE}, information on missing values are removed from
#'   the output. If \code{show.na = "auto"}, information on missing values
#'   is only shown when variables actually have missing values, else it's not
#'   shown.
#' @param grp.strings Numeric, if not \code{NULL}, groups string values in
#'   character vectors, based on their similarity. See \code{\link{group_str}}
#'   and \code{\link{str_find}} for details on grouping, and their
#'   \code{precision}-argument to get more details on the distance of strings
#'   to be treated as equal.
#' @param min.frq Numeric, indicating the minimum frequency for which a
#'   value will be shown in the output (except for the missing values, prevailing
#'   \code{show.na}). Default value for \code{min.frq} is \code{0}, so all value
#'   frequencies are shown. All values or categories that have less than
#'   \code{min.frq} occurences in the data will be summarized in a \code{"n < 100"}
#'   category.
#' @param title String, will be used as alternative title to the variable
#'   label. If \code{x} is a grouped data frame, \code{title} must be a
#'   vector of same length as groups.
#' @param file Destination file, if the output should be saved as file.
#'   Only used when \code{out} is not \code{"txt"}.
#' @param encoding Character vector, indicating the charset encoding used
#'   for variable and value labels. Default is \code{"UTF-8"}. Only used
#'   when \code{out} is not \code{"txt"}.
#'
#' @inheritParams descr
#' @inheritParams to_dummy
#'
#' @return A list of data frames with values, value labels, frequencies, raw, valid and
#'           cumulative percentages of \code{x}.
#'
#' @details The \dots-argument not only accepts variable names or expressions
#'   from select-helpers. You can also use logical
#'   conditions, math operations, or combining variables to produce "crosstables".
#'   See 'Examples' for more details.
#'
#' @note \code{x} may also be a grouped data frame (see \code{\link[dplyr]{group_by}})
#'       with up to two grouping variables. Frequency tables are created for each
#'       subgroup then.
#'       \cr \cr
#'       The \code{print()}-method adds a table header with information on the
#'       variable label, variable type, total and valid N, and mean and
#'       standard deviations. Mean and SD are \emph{always} printed, even for
#'       categorical variables (factors) or character vectors. In this case,
#'       values are coerced into numeric vector to calculate the summary
#'       statistics.
#'       \cr \cr
#'       To print tables in markdown or HTML format, use \code{print_md()} or
#'       \code{print_html()}.
#'
#' @seealso \code{\link{flat_table}} for labelled (proportional) tables.
#'
#' @examples
#' # simple vector
#' data(efc)
#' frq(efc$e42dep)
#'
#' # with grouped data frames, in a pipe
#' library(dplyr)
#' efc %>%
#'   group_by(e16sex, c172code) %>%
#'   frq(e42dep)
#'
#' # show only categories with a minimal amount of frequencies
#' frq(mtcars$gear)
#'
#' frq(mtcars$gear, min.frq = 10)
#'
#' frq(mtcars$gear, min.frq = 15)
#'
#' # with select-helpers: all variables from the COPE-Index
#' # (which all have a "cop" in their name)
#' frq(efc, contains("cop"))
#'
#' # all variables from column "c161sex" to column "c175empl"
#' frq(efc, c161sex:c175empl)
#'
#' # for non-labelled data, variable name is printed,
#' # and "label" column is removed from output
#' data(iris)
#' frq(iris, Species)
#'
#' # also works on grouped data frames
#' efc %>%
#'   group_by(c172code) %>%
#'   frq(is.na(nur_pst))
#'
#' # group variables with large range and with weights
#' efc$weights <- abs(rnorm(n = nrow(efc), mean = 1, sd = .5))
#' frq(efc, c160age, auto.grp = 5, weights = weights)
#'
#' # different weight options
#' frq(efc, c172code, weights = weights)
#' frq(efc, c172code, weights = "weights")
#' frq(efc, c172code, weights = efc$weights)
#' frq(efc$c172code, weights = efc$weights)
#'
#' # group string values
#' dummy <- efc[1:50, 3, drop = FALSE]
#' dummy$words <- sample(
#'   c("Hello", "Helo", "Hole", "Apple", "Ape",
#'     "New", "Old", "System", "Systemic"),
#'   size = nrow(dummy),
#'   replace = TRUE
#' )
#'
#' frq(dummy)
#' frq(dummy, grp.strings = 2)
#'
#' #### other expressions than variables
#'
#' # logical conditions
#' frq(mtcars, cyl ==6)
#'
#' frq(efc, is.na(nur_pst), contains("cop"))
#'
#' iris %>%
#'   frq(starts_with("Petal"), Sepal.Length > 5)
#'
#' # computation of variables "on the fly"
#' frq(mtcars, (gear + carb) / cyl)
#'
#' # crosstables
#' set.seed(123)
#' d <- data.frame(
#'   var_x = sample(letters[1:3], size = 30, replace = TRUE),
#'   var_y = sample(1:2, size = 30, replace = TRUE),
#'   var_z = sample(LETTERS[8:10], size = 30, replace = TRUE)
#' )
#' table(d$var_x, d$var_z)
#' frq(d, paste0(var_x, var_z))
#' frq(d, paste0(var_x, var_y, var_z))
#' @export
frq <- function(x,
                ...,
                sort.frq = c("none", "asc", "desc"),
                weights = NULL,
                auto.grp = NULL,
                show.strings = TRUE,
                show.na = TRUE,
                grp.strings = NULL,
                min.frq = 0,
                out = c("txt", "viewer", "browser"),
                title = NULL,
                encoding = "UTF-8",
                file = NULL) {

  out <- match.arg(out)

  if (out != "txt" && !requireNamespace("sjPlot", quietly = TRUE)) {
    message("Package `sjPlot` needs to be loaded to print HTML tables.")
    out <- "txt"
  }

  # check min.frq value
  if (!is.numeric(min.frq)) {
    message("min.frq value is not numeric. Returned output assumes default value 0.")
    min.frq <- 0
  }

  # get dot data
  xw <- get_dot_data(x, dplyr::enquos(...))

  if (missing(weights)) {
    w <- NULL
    x <- xw
  } else {
    w <- try(rlang::quo_name(rlang::enquo(weights)), silent = TRUE)
    if (inherits(w, "try-error")) w <- NULL

    w.string <- try(eval(weights), silent = TRUE)
    if (!inherits(w.string, "try-error") && is.character(w.string)) w <- w.string

    if (!sjmisc::is_empty(w) && w != "NULL" && !obj_has_name(xw, w) && obj_has_name(x, w)) {
      x <- dplyr::bind_cols(xw, data.frame(x[[w]]))
      colnames(x)[ncol(x)] <- w
    } else if (!sjmisc::is_empty(string_contains("$", w)) && length(w.string) > 1 && is.numeric(w.string)) {
      x <- cbind(xw, data.frame(w.string))
      w <- sub("(.*)\\$(.*)", "\\2", w)
      colnames(x)[ncol(x)] <- w
    } else {
      message(sprintf("Weights `%s` not found in data.", w))
      w <- NULL
      x <- xw
    }
  }


  if (!isTRUE(show.na)) {
    # remove empty columns

    rem.col <- empty_cols(x)

    if (!sjmisc::is_empty(rem.col)) {
      rem.vars <- colnames(x)[rem.col]
      x <- remove_empty_cols(x)

      message(sprintf("Following %i variables have only missing values and are not shown:", length(rem.vars)))
      cat(paste(sprintf("%s [%i]", rem.vars, rem.col), collapse = ", "))
      cat("\n")
    }
  }


  # match args
  sort.frq <- match.arg(sort.frq)

  # return values
  dataframes <- list()


  # remove strings from output, if requested
  # and check if there are any variables left to print

  if (!show.strings)
    x <- dplyr::select_if(x, no_character)

  if ((all(sjmisc::is_empty(stats::na.omit(x), first.only = FALSE)) && show.na == FALSE) || all(suppressMessages(replace_na(sjmisc::is_empty(x, first.only = FALSE, all.na.empty = FALSE), value = FALSE))))
    return(NULL)


  # group strings

  if (!is.null(grp.strings)) {
    a <- attributes(x)

    if (!is.data.frame(x)) {
      was.df <- FALSE
      x <- data.frame(x, stringsAsFactors = FALSE)
    } else
      was.df <- TRUE


    x <- x %>%
      purrr::map_if(is.character, ~ group_str(
        strings = .x, precision = grp.strings, remove.empty = FALSE)
      ) %>%
      as.data.frame(stringsAsFactors = FALSE)

    if (was.df)
      attributes(x) <- a
    else
      attributes(x[[1]]) <- a
  }


  # do we have a grouped data frame?
  if (inherits(x, "grouped_df")) {

    grkey <- colnames(dplyr::group_keys(x))
    for (i in grkey) {
      if (is.character(x[[i]])) x[[i]] <- as.factor(x[[i]])
    }

    # get grouped data
    grps <- get_grouped_data(x)

    # we may have more than two variables...
    for (j in seq_len(ncol(grps$data[[1]]))) {

      # now plot everything
      for (i in seq_len(nrow(grps))) {
        # copy back labels to grouped data frame
        tmp <- sjlabelled::copy_labels(grps$data[[i]][j], x)

        if (!is.null(w))
          wb <- grps$data[[i]][[w]]
        else
          wb <- NULL

        # user-defined title
        if (!is.null(title) && length(title) >= i)
          gr.title <- title[i]
        else
          gr.title <- NULL

        # iterate data frame, but don't select
        # weighting variable
        if (is.null(w) || colnames(tmp)[1] != w) {
          dummy <-
            frq_helper(
              x = tmp[[1]],
              sort.frq = sort.frq,
              weight.by = wb,
              cn = colnames(tmp)[1],
              auto.grp = auto.grp,
              title = gr.title,
              show.na = show.na,
              min.frq = min.frq
            )

          attr(dummy, "group") <- get_grouped_title(x, grps, i, sep = ", ", long = FALSE)

          # save data frame for return value
          dataframes[[length(dataframes) + 1]] <- dummy
        }
      }
    }

  } else {
    # if we don't have data frame, coerce
    if (!is.data.frame(x)) x <- data.frame(x, stringsAsFactors = FALSE)

    if (!is.null(w))
      wb <- x[[w]]
    else
      wb <- NULL

    for (i in seq_len(ncol(x))) {
      # iterate data frame, but don't select
      # weighting variable
      if (is.null(w) || colnames(x)[i] != w) {
        dummy <-
          frq_helper(
            x = x[[i]],
            sort.frq = sort.frq,
            weight.by = wb,
            cn = colnames(x)[i],
            auto.grp = auto.grp,
            title = title,
            show.na = show.na,
            min.frq = min.frq
          )

        # save data frame for return value
        dataframes[[length(dataframes) + 1]] <- dummy
      }
    }
  }

  # add class-attr for print-method()
  if (out == "txt")
    class(dataframes) <- c("sjmisc_frq", "list")
  else
    class(dataframes) <- c("sjt_frq", "sjmisc_frq", "list")

  # save how to print output
  attr(dataframes, "print") <- out
  attr(dataframes, "encoding") <- encoding
  attr(dataframes, "file") <- file

  dataframes
}


frq_helper <- function(x, sort.frq, weight.by, cn, auto.grp, title = NULL, show.na = TRUE, min.frq = 0) {
  # remember type
  vartype <- var_type(x)

  # convert NaN and Inf to missing
  x <- zap_inf(x)

  # variable with only missing?
  if (length(stats::na.omit(x)) == 0 && show.na == FALSE) {
    mydat <- data.frame(
      val = NA,
      label = NA,
      frq = NA,
      raw.prc = NA,
      valid.prc = NA,
      cum.perc = NA
    )
    return(structure(class = "sjmisc_frq", list(mydat = mydat)))
  }


  # save descriptive statistics

  xnum <- sjlabelled::as_numeric(x, keep.labels = FALSE)

  if (!is.null(weight.by)) {
    # make sure, vector and weights have same length, so remove missing from weights

    weight.by[is.na(xnum)] <- NA
    xnum[is.na(weight.by)] <- NA
    x[is.na(weight.by)] <- NA

    mean.value <- stats::weighted.mean(stats::na.omit(xnum), w = stats::na.omit(weight.by))

    if (requireNamespace("datawizard", quietly = TRUE))
      sd.value <- datawizard::weighted_sd(stats::na.omit(xnum), weights = stats::na.omit(weight.by))
    else
      sd.value <- NA

  } else {
    mean.value <- mean(xnum, na.rm = TRUE)
    sd.value <- stats::sd(xnum, na.rm = TRUE)
  }


  # get variable label (if any)
  varlab <- sjlabelled::get_label(x)


  # numeric variables with many distinct values may
  # be grouped for better overview

  if (!is.null(auto.grp) && dplyr::n_distinct(x, na.rm = TRUE) >= auto.grp) {
    gl <- group_labels(x, size = "auto", n = auto.grp)
    x <- group_var(x, size = "auto", n = auto.grp)
    gv <- sort(stats::na.omit(unique(x)))
    names(gv) <- gl
    attr(x, "labels") <- gv
  }


  # get value labels (if any)
  labels <-
    sjlabelled::get_labels(
      x,
      attr.only = TRUE,
      values = "n",
      non.labelled = TRUE
    )


  # if we don't have variable label, use column name
  if (sjmisc::is_empty(varlab) && !sjmisc::is_empty(cn))
    varlab <- cn
  else if (!sjmisc::is_empty(varlab) && !sjmisc::is_empty(cn))
    varlab <- sprintf("%s (%s)", varlab, cn)


  # do we have a labelled vector?
  if (!is.null(labels)) {
    # add rownames and values as columns
    dat <- data_frame(
      n = names(labels),
      v = as.character(labels)
    )

    colnames(dat) <- c("val", "label")

    # character vectors need to be converted with to_value
    # to avoid NAs, but only if character is non-numeric
    if (is.character(dat$val) && anyNA(suppressWarnings(as.numeric(dat$val))))
      dat$val <- sjlabelled::as_numeric(dat$val, keep.labels = FALSE)
    else
      dat$val <- as.numeric(dat$val)

    # weight data?
    if (!is.null(weight.by)) {
      dat2 <- data.frame(round(
        stats::xtabs(
          weights ~ x,
          data = data.frame(weights = stats::na.omit(weight.by), x = stats::na.omit(x)),
          na.action = stats::na.pass,
          exclude = NULL
        ),
        0
      ))
    } else {
      # create frequency table
      dat2 <- data.frame(table(x, useNA = "always"))
    }

    colnames(dat2) <- c("val", "frq")
    dat2$val <- sjlabelled::as_numeric(dat2$val, keep.labels = FALSE)

    # join frq table and label columns
    mydat <- suppressMessages(dplyr::full_join(dat, dat2))

    # replace NA with 0, for proper percentages, i.e.
    # missing values don't appear (zero counts)
    mydat$frq <- suppressMessages(sjmisc::replace_na(mydat$frq, value = 0))
  } else {
    # weight data?
    if (!is.null(weight.by)) {
      mydat <- data.frame(round(
        stats::xtabs(
          weights ~ x,
          data = data.frame(weights = stats::na.omit(weight.by), x = stats::na.omit(x)),
          na.action = stats::na.pass,
          exclude = NULL
        ),
        0
      ))
    } else {
      # if we have no labels, do simple frq table
      mydat <- data.frame(table(x, useNA = "always"))
    }

    colnames(mydat) <- c("val", "frq")

    if (!anyNA(suppressWarnings(as.numeric(attr(mydat$val, "levels"))))) {
      mydat$val <- sjlabelled::as_numeric(mydat$val, keep.labels = FALSE)
    }

    # add values as label
    mydat$label <- as.character("<none>")
    mydat <- mydat[c("val", "label", "frq")]
  }

  min.frq.string <- sprintf("n < %g", min.frq)

  if (any(mydat$frq[!is.na(mydat$val)] < min.frq)) {
	  mydatS1 <- mydat[which(mydat$frq >= min.frq | is.na(mydat$val)), ]
	  mydatS2 <- mydat[which(mydat$frq < min.frq & !is.na(mydat$val)), ]

	  mydatS3 <- data_frame(
      val = min.frq.string,
      label = "<none>",
      frq = sum(mydatS2$frq)
    )

	  if (mydatS3$frq == 0) {
	    mydat <- mydatS1
	  } else {
  	  mydat <- rbind(mydatS1, mydatS3)
  	  row.names(mydat) <- c(
  	    row.names(mydat)[-length(row.names(mydat))],
  	    as.character(as.integer(row.names(mydat)[length(row.names(mydat)) - 1]) + 1)
  	  )
	  }
  }

  # need numeric
  if (is.factor(x) || is.character(x)) {
    x <- sjlabelled::as_numeric(x, keep.labels = FALSE)
  }

  # check if we have any NA-values - if not, add row for NA's
  if (!anyNA(mydat$val)) {
    mydat <- dplyr::bind_rows(
      mydat,
      data.frame(
        val = NA,
        label = NA,
        frq = 0
      )
    )
  }

  # valid values are one row less, because last row is NA row
  valid.vals <- nrow(mydat) - 1

  if (!all(is.na(mydat$val))) {

    extra.vals <- 1
    # Momentarily, in order to sort categories, we consider lower frequencies subtotal as a non valid value
    if (is.na(mydat$val[valid.vals]) & mydat$val[valid.vals + 1] == min.frq.string) {
      valid.vals <- valid.vals - 1
      extra.vals <- 2
    }


    # sort categories ascending or descending
    if (!is.null(sort.frq) && (sort.frq == "asc" || sort.frq == "desc")) {
      ord <- order(mydat$frq[seq_len(valid.vals)], decreasing = (sort.frq == "desc"))
    } else {
      ord <- seq_len(valid.vals)
    }
    mydat <- mydat[c(ord, (valid.vals + extra.vals):(valid.vals + 1)), ]
  }

  valid.vals <- nrow(mydat) - 1

  # raw percentages
  mydat$raw.prc <- mydat$frq / sum(mydat$frq)

  # compute valid and cumulative percentages
  mydat$valid.prc <- c(mydat$frq[seq_len(valid.vals)] / sum(mydat$frq[seq_len(valid.vals)]), NA)
  mydat$cum.prc <- c(cumsum(mydat$valid.prc[seq_len(valid.vals)]), NA)

  # proper rounding
  mydat$raw.prc <- 100 * round(mydat$raw.prc, 4)
  mydat$cum.prc <- 100 * round(mydat$cum.prc, 4)
  mydat$valid.prc <- 100 * round(mydat$valid.prc, 4)

  # "rename" labels for NA values
  if (!is.null(mydat$label)) mydat$label[is.na(mydat$val)] <- NA_character_

  if (!all(is.na(mydat$val))) {
    if (extra.vals == 1) {
      # save original order
      reihe <- sjlabelled::as_numeric(mydat$val, start.at = 1, keep.labels = FALSE)
      # sort
      if (sort.frq == "none") mydat <- mydat[order(reihe), ]
    } else if (extra.vals == 2) {
      # save original order
      reihe <- suppressWarnings(sjlabelled::as_numeric(mydat$val[-c(valid.vals, valid.vals + 1)], start.at = 1, keep.labels = FALSE))
      # sort
      if (sort.frq == "none") mydat <- mydat[c(order(reihe), valid.vals, valid.vals + 1), ]
    }
  }

  # remove NA, if requested
  has.na <- mydat$frq[nrow(mydat)] > 0
  if ((!is.logical(show.na) && show.na == "auto" && !has.na) || identical(show.na, FALSE)) {
    mydat <- mydat[-nrow(mydat), ]
  }

  # compute relative confidence intervals
  total_n <- sum(mydat$frq)
  rel_frq <- as.numeric(mydat$frq / total_n)
  ci <- stats::qnorm(.975) * suppressWarnings(sqrt(rel_frq * (1 - rel_frq) / total_n))
  total_ci <- data.frame(lower = total_n * (rel_frq - ci), upper = total_n * (rel_frq + ci))
  relative_ci <- data.frame(lower = rel_frq - ci, upper = rel_frq + ci)


  # add variable label and type as attribute, for print-method

  if (!is.null(title)) {
    attr(mydat, "label") <- title
    attr(mydat, "vartype") <- ""

  } else {
    attr(mydat, "label") <- varlab
    attr(mydat, "vartype") <- vartype
  }

  attr(mydat, "mean") <- mean.value
  attr(mydat, "sd") <- sd.value

  attr(mydat, "ci") <- total_ci
  attr(mydat, "relative.ci") <- relative_ci

  row.names(mydat) <- NULL

  mydat
}


get_grouped_title <- function(x, grps, i, sep = ", ", long = FALSE) {
  # create title for first grouping level
  tp <- get_title_part(x, grps, 1, i)

  if (long)
    title <- sprintf("%s: %s", tp[1], tp[2])
  else
    title <- sprintf("%s", tp[2])

  # do we have another groupng variable?
  if (length(dplyr::group_vars(x)) > 1) {
    tp <- get_title_part(x, grps, 2, i)

    if (long)
      title <- sprintf("%s%s%s: %s", title, sep, tp[1], tp[2])
    else
      title <- sprintf("%s%s%s", title, sep, tp[2])
  }

  # return title
  title
}


get_title_part <- function(x, grps, level, i) {
  # prepare title for group
  var.name <- colnames(grps)[level]

  # get values from value labels
  vals <- sjlabelled::get_values(x[[var.name]])
  t2 <- NULL

  # if we have no value labels, get values directly
  if (is.null(vals)) {
    vals <- grps[[var.name]]
    if (is.factor(grps[[var.name]])) vals <- as.character(vals)
    lab.pos <- i
  } else {
    # find position of value labels for current group
    lab.pos <- which(vals == grps[[var.name]][i])
    t2 <- sjlabelled::get_labels(x[[var.name]])[lab.pos]
  }

  # get variable and value labels
  t1 <- sjlabelled::get_label(x[[var.name]], def.value = var.name)

  # if we have no value label, use value instead
  if (sjmisc::is_empty(t2)) t2 <- vals[lab.pos]

  # generate title
  c(t1, t2)
}


get_grouped_data <- function(x) {
  # nest data frame
  grps <- .nest(x)

  # remove NA category for grouped data
  cc <- grps %>%
    dplyr::select(-.data$data) %>%
    stats::complete.cases()

  # select only complete cases
  grps <- dplyr::filter(grps, !! cc)

  # arrange data

  if (length(dplyr::group_vars(x)) == 1)
    reihe <- order(grps[[1]])
  else
    reihe <- order(grps[[1]], grps[[2]])

  grps <- grps[reihe, , drop = FALSE]

  grps
}


no_character <- function(x) !is.character(x)


#' @export
as.data.frame.sjmisc_frq <- function(x, row.names = NULL, optional = FALSE, ...) {
  x <- lapply(x, function(i) {
    i$variable <- attr(i, "label")
    i$group <- attr(i, "group")
    cols <- c("variable", "group", "val", "label", "frq", "raw.prc", "vaid.prc", "cum.prc")
    i[, intersect(cols, colnames(i))]
  })
  do.call(rbind, x)
}