R/data_inventory_table.R

Defines functions pt_data_inventory_notes pt_data_inventory pt_data_study data_inventory_data data_inventory_data_split data_inventory_chunk find_bq_col

Documented in data_inventory_chunk data_inventory_data find_bq_col pt_data_inventory pt_data_inventory_notes pt_data_study

#' Scan data set columns for BQL / BLQ
#'
#' @param data a data frame
#'
#' @details
#' Will return the first match among `BQL` and `BLQ`.  If no match is found,
#' returns `NA_character_`.
#'
#' @export
find_bq_col <- function(data) {
  candidate <- intersect(c("BQL", "BLQ"), names(data))
  if(length(candidate)==0) return(NA_character_)
  candidate[1]
}

#' Create a data inventory summary for a data chunk
#'
#' @inheritParams pt_data_inventory
#' @param tot logical indicating if a summary row should be included
#' @param ... used to absorb other arguments; not used
data_inventory_chunk <- function(data, by, panel = by, stacked = FALSE,
                                 tot = FALSE, all_name = "all",
                                 dv_col = "DV",
                                 bq_col = "BQL",
                                 id_col = "ID",
                                 ...) {

  if(by==".total" | panel == ".total") {
    data <- data_total_col(data, all_name = all_name)
  }

  miss <- FALSE
  miss_required <- function(needed,pass) {
    x <- c(
      "couldn't find the {needed} column; ",
      "set the column name to '{needed}' or ",
      "pass the name as '{pass}'"
    )
    glue::glue(paste0(x,collapse=""))
  }

  if(!exists(dv_col,data)) {
    emessage(miss_required("DV", "dv_col"))
    miss <- TRUE
  }

  if(!exists(bq_col,data)) {
    emessage(miss_required("BQL", "bq_col"))
    miss <- TRUE
  }

  if(!exists(id_col,data)) {
    emessage(miss_required("ID", "id_col"))
    miss <- TRUE
  }

  if(miss) {
    stop(
      "there was a problem finding required columns ",
      "for the data inventory summary; ",
      "please see error messages as well as '?data_inventory_chunk' ",
      "help topic",
      call.=FALSE
    )
  }

  .groups <- unique(c(panel,by))

  data <- ungroup(data)

  # .N is the overall n
  # ..n is the panel n
  if(stacked) {
    data <- group_by(data, !!sym(panel))
    data <- mutate(data, .N = n_non_missing(!!sym(dv_col), !!sym(bq_col)))
  } else {
    data <- mutate(data, .N = n_non_missing(!!sym(dv_col), !!sym(bq_col)))
  }
  data <- ungroup(data)
  data <- group_by(data, !!sym(panel))
  data <- mutate(data, ..n = n_non_missing(!!sym(dv_col), !!sym(bq_col)))
  data <- ungroup(data)
  data <- group_by(data, !!!syms(.groups))
  body <- summarise(
    data,
    SUBJ = n_unique(!!sym(id_col)),
    NOBS = n_obs(!!sym(dv_col), !!sym(bq_col)),
    NMISS = n_missing(!!sym(dv_col), !!sym(bq_col)),
    POBS = digit1(100*.data[["NOBS"]]/first(.data[["..n"]])),
    OOBS = digit1(100*.data[["NOBS"]]/first(.data[[".N"]]))
  )
  bq <- summarise(
    data,
    NBQL = n_bql(!!sym(bq_col)),
    PBQL = digit1(100*.data[["NBQL"]]/first(.data[["..n"]])),
    OBQL = digit1(100*.data[["NBQL"]]/first(.data[[".N"]]))
  )
  summ <- left_join(body, bq, by = unique(c(by, panel)))
  summ <- select(
    summ,
    !!sym(by),
    !!sym(panel),
    "SUBJ",
    "NMISS",
    "NOBS",
    "NBQL",
    "POBS",
    "PBQL",
    "OOBS",
    "OBQL"
  )
  summ <- ungroup(summ)
  summ <- mutate(summ, !!sym(by) := as.character(!!sym(by)))
  summ <- mutate(summ, !!sym(panel) := as.character(!!sym(panel)))
  summ
}

data_inventory_data_split <- function(data,by,panel=by,stacked=FALSE,...) {
  data <- ungroup(data)
  data <- split(data,data[[panel]],drop=TRUE)
  data <- map_dfr(data,data_inventory_data,by=by,panel=panel,stacked=FALSE,...)
  data <- mutate(data, !!sym(by) := replace_na(!!sym(by),"all"))
  data <- fill(data,!!sym(panel),.direction = "down")
  data[[".total"]] <- NULL
  data

}

#' Create a summary of endpoint data
#'
#' @inheritParams pt_data_inventory
#'
#' @param ... passed to subsequent summary functions
#'
#' @export
data_inventory_data <- function(data, by, panel = by, all_name = "all",
                                summarize_all = TRUE, stacked = FALSE, ...) {
  by <- unname(by)
  panel <- unname(panel)

  if(stacked) {
    ans <- data_inventory_data_split(data, by, panel, stacked = FALSE, ...)
    return(ans)
  }

  data <- data_total_col(data, all_name = all_name)

  check_discrete(data,cols = unique(c(panel,by)))

  ans <- data_inventory_chunk(
    data = data,
    by = by,
    panel = panel,
    all_name = all_name,
    stacked = stacked,
    ...
  )

  if(by != ".total" && isTRUE(summarize_all)) {
    tot <- data_inventory_chunk(
      data,
      by = ".total",
      panel = ".total",
      stacked = FALSE,
      all_name = all_name, ...
    )
    tot <- mutate(tot, .total = all_name)
    if(!stacked) {
      tot <- mutate(
        tot,
        POBS = "---",
        PBQL = "---"
      )
    }
    ans <- bind_rows(ans,tot)
  }

  if(panel != by) {
    ans <- mutate(ans, !!sym(by) := replace_na(!!sym(by),".total"))
    ans <- fill(ans, !!sym(panel), .direction = "down")
  }

  if(panel == by) {
    ans <- mutate(ans, !!sym(by) := replace_na(!!sym(by),".total"))
  }

  ans
}

#' Data inventory by study
#'
#' This is a convenience wrapper around [pt_data_inventory]
#'
#' @param data the data frame to summarize; the user should filter or subset
#' so that `data` contains exactly the records to be summarized; pmtables will
#' not add or remove rows prior to summarizing `data`
#' @param study_col the name of the column containing the study identifier;
#' may be character or quosure (see [dplyr::vars])
#' @param panel another categorical data set column name to stratify the
#' data summary
#' @param ... other arguments passed to [pt_data_inventory] and
#' [data_inventory_chunk]
#'
#' @examples
#'
#' ans <- pt_data_study(pmt_pk, study_col = "STUDYf")
#'
#' \dontrun{
#' st2report(stable(ans))
#' }
#'
#' @export
pt_data_study <- function(data, study_col = "STUDY", panel = study_col, ...) {
  pt_data_inventory(data  = data, by = study_col, panel = panel, ...)
}

#' Create a data inventory table
#'
#' This function summarizes your data in a specific way and returns an object
#' that can be converted into a `latex` table.
#'
#' @inheritParams pt_cont_long
#'
#' @param by The outer grouping variable; may be character or quosure.
#' @param panel The panel grouping variable; may be character or quosure.
#' @param inner_summary If `TRUE`, then a summary of the inner variable will
#' be provided.
#' @param drop_miss If `TRUE`, then `MISS` will be dropped, but only when all
#' `MISS` values are equal to zero.
#' @param stacked If `TRUE`, then independent summaries are created by `outer`
#' and included in a single table (see examples).
#' @param summarize_all if `TRUE` then a complete data summary will be
#' appended to the bottom of the table when `stacked` is `FALSE`.
#' @param all_name_stacked a name to use for the complete data summary when
#' `stacked` is `TRUE`.
#' @param dv_col Character name of `DV` column.
#' @param bq_col Character name of `BQL` column; see [find_bq_col()].
#' @param id_col Character name of `ID` column.
#' @param ... Other arguments passed to [data_inventory_chunk()].
#'
#'
#' @details
#'
#' Output columns include counts for subjects (`SUBJ`), observations (`OBS)`,
#' BQL observations, missing observations (`MISS`) and percentage of
#' observations that are BQL. When panels are requested, then the percentages
#' for `OBS` and `BQL` are presented for the `Overall` data and for the panel
#' `Group`.
#'
#' Specifically, please note that:
#' - `MISS` is the number of data records where `DV` is missing (`NA`) and where
#'   the `BQL` (or `BLQ`) column is `0`
#' - `OBS` is the number of data records where `DV` is not missing (non-`NA`)
#'   and the `BQL` (or `BLQ`) column is `0`
#' - `BQL` are records where the `BQL` (or `BLQ`) column is not equal to `0`
#'
#' The sum of `MISS` + `OBS` + `BQL` should equal the number of rows in the
#' data frame passed to `pt_data_inventory()`.
#'
#' When calculating percent `OBS` and percent `BQL`, we use `OBS + BQL` as the
#' denominator such that the percent `BQL` and percent `OBS` sum to `100`
#' within a group or panel. When the `panel` argument is set, these percentages
#' are calculated for the group (or `panel`)  as well as overall. In other
#' words, records that are `MISS` are not factored into totals for `OBS` or
#' `BQL` and similarly are not factored into calculation of percent `OBS` or
#' percent `BQL`.
#
#' The summary function is expecting certain columns to be named in a certain
#' way. This can be modified to suit your need by passing the following
#' arguments: `dv_col` (for observations), `bq_col` (for BQL observations),
#' and `id_col` (for ID). For example, if BQL indicator is in a column called
#' `BELOW` you would pass `bq_col = BELOW`.
#'
#' See the [data_inventory_chunk()] help topic for a description of these
#' columns.
#'
#' The notes for this table are generated by [pt_data_inventory_notes()].
#'
#' @examples
#'
#' ans <- pt_data_inventory(pmt_pk, by = c("Renal function" = "RFf"))
#'
#' ans <- pt_data_inventory(
#'    pmt_pk,
#'    by = "STUDYf",
#'    panel = "RFf"
#' )
#'
#' ans <- pt_data_inventory(
#'    pmt_obs,
#'    by = "STUDYf",
#'    panel = "SEQf",
#'    stacked = TRUE
#' )
#'
#' \dontrun{
#' st2report(stable(ans))
#' }
#'
#' @return
#' An object with class `pmtable`; see [class-pmtable].
#'
#' @export
pt_data_inventory <- function(data, by = ".total", panel = by,
                              inner_summary = TRUE, drop_miss = FALSE,
                              stacked = FALSE, table = NULL,
                              summarize_all = TRUE,
                              all_name = "All data",
                              all_name_stacked  = "Group Total",
                              dv_col = "DV",
                              bq_col = find_bq_col(data),
                              id_col = "ID",
                              ...) {

  stacked <- isTRUE(stacked)
  if(stacked) all_name <- all_name_stacked
  summarize_all <- isTRUE(summarize_all)

  assert_that(is.data.frame(data))
  data <- as.data.frame(data)

  has_panel <- !missing(panel)
  panel_data <- as.panel(panel)
  panel <- panel_data$col
  has_by <- !missing(by)

  by <- new_names(by,table)

  panel <- new_names(panel,table)

  drop_bql <- FALSE

  if(is.na(bq_col)) {
    data[["BQL"]] <- 0
    bq_col <- "BQL"
    drop_bql <- TRUE
  }

  if(panel==by | stacked) {
    inner_summary <- FALSE
  }

  if(stacked) {
    total_name <- paste0("\\hline {\\it ", all_name, "}")
  } else {
    total_name <- paste0("\\hline \\hline {\\bf ", all_name, "}")
  }

  ans <- data_inventory_data(
    data,
    by = by,
    panel = panel,
    stacked = stacked,
    all_name = all_name,
    summarize_all = summarize_all,
    dv_col = dv_col,
    bq_col = bq_col,
    id_col = id_col,
    ...
  )

  if(exists(by, ans)) {
    ans <- mutate(
      ans,
      !!sym(by) := ifelse(!!sym(by)==".total", total_name, !!sym(by))
    )
  }

  if(inner_summary) {
    ans <- rename(
      ans,
      `Group percent.OBS` = "POBS",
      `Group percent.BQL` = "PBQL",
      `Overall percent.OBS` = "OOBS",
      `Overall percent.BQL` = "OBQL"
    )
  } else {
    ans <- rename(
      ans,
      `Percent.OBS` = "OOBS",
      `Percent.BQL` = "OBQL"
    )
    ans <- mutate(ans, POBS = NULL, PBQL = NULL)
  }

  ans <- rename(
    ans,
    Number.SUBJ = "SUBJ",
    Number.MISS = "NMISS",
    Number.OBS = "NOBS",
    Number.BQL = "NBQL"
  )

  if(bq_col == "BLQ") {
    names(ans) <- gsub(".BQL", ".BLQ", names(ans), fixed = TRUE)
  }

  if(isTRUE(drop_miss)) {
    ans <- mutate(ans, Number.MISS = NULL)
  }

  ans <- mutate(ans, .total = NULL)
  out <- ans

  notes <- pt_data_inventory_notes(bq = bq_col, drop_bql = drop_bql)

  if(isTRUE(drop_miss)) notes <- notes[!grepl("MISS", notes)]

  if(isTRUE(drop_bql)) {
    notes <- notes[!grepl("below", notes)]
    out <- select(out, !contains("BQL"))
  }

  .panel <- rowpanel(NULL)
  if(has_panel) {
    .panel <- panel_data
    .panel$prefix_skip <- "(Grand|Group) Total"
  }

  if(panel==by) panel <- NULL

  out <- list(
    data = out,
    panel = .panel,
    cols_rename = by,
    span_split = colsplit(sep = "."),
    align = cols_center(.outer = 'l'),
    notes = notes
  )

  out <- structure(out, class = c("pmtable", class(out)))

  return(out)
}

#' Return table notes for pt_data_inventory
#'
#' See [pt_data_inventory()]. The function generates standard table notes for
#' the table.
#'
#' @param bq Abbreviation for below limit of quantification.
#' @param drop_bql If `TRUE`, the `BQL`/`BLQ` summary is omitted.
#' @param note_add Additional notes to be include.

#'
#' @export
pt_data_inventory_notes <- function(bq = c("BQL", "BLQ"), drop_bql = FALSE, note_add = NULL) {
  l2 <- NULL
  l3 <- "MISS: missing observations"
  if(isFALSE(drop_bql)) {
    bq <- match.arg(bq)
    if(bq=="BQL") {
      l2 <- "BQL: below quantification limit"
    }
    if(bq=="BLQ") {
      l2 <- "BLQ: below limit of quantification"
    }
    l3 <- paste0(l3, " (non-", bq, ")")
  }
  ans <- note_add
  ans <- c(
    ans,
    "SUBJ: subjects",
    l2,
    l3,
    "OBS: observations"
  )
  ans
}
metrumresearchgroup/pmtables documentation built on Oct. 27, 2024, 5:16 p.m.