Nothing
#' Preprocess a raw data frame for use with ternG or ternD
#'
#' \code{ternP()} cleans a raw data frame loaded from a CSV or XLSX file,
#' applying a standardized set of transformations and performing validation
#' checks before the data is passed to \code{\link{ternG}} or
#' \code{\link{ternD}}.
#'
#' @section Cleaning pipeline (in order):
#' \enumerate{
#' \item String NA values (\code{"NA"}, \code{"na"}, \code{"Na"},
#' \code{"unk"}) are converted to \code{NA}.
#' \item Leading and trailing whitespace is trimmed from all character
#' columns.
#' \item Columns that are 100\% empty (all \code{NA}) are silently dropped.
#' \item Rows where every cell is \code{NA} are removed.
#' \item Character columns where values differ only by capitalization
#' (e.g. \code{"Male"} vs \code{"MAle"}) are standardized to title case.
#' }
#'
#' @section Validation hard stops:
#' \code{ternP()} stops with a descriptive error if:
#' \itemize{
#' \item Any column name matches a protected health information (PHI) pattern
#' (e.g. \code{MRN}, \code{DOB}, \code{FirstName}). De-identified research
#' identifiers such as \code{patient_id}, \code{subject_id}, and
#' \code{participant_id} are explicitly excluded, as are clinical-event
#' dates (admission date, discharge date, visit date, etc.). Only
#' personal-identity dates such as DOB and DOD are flagged.
#' \item Any column with a blank or whitespace-only header contains data.
#' Completely empty unnamed columns are silently dropped and do not trigger
#' this error.
#' }
#'
#' @param data A data frame or tibble as loaded from a CSV or XLSX file (e.g.
#' via \code{readr::read_csv()} or \code{readxl::read_excel()}). All
#' character columns are processed; numeric and logical columns are passed
#' through unchanged by the string-cleaning steps.
#'
#' @return A named list with three elements:
#' \describe{
#' \item{\code{clean_data}}{A tibble containing the fully cleaned dataset,
#' ready to pass to \code{ternG()} or \code{ternD()}.}
#' \item{\code{sparse_rows}}{A tibble of rows from \code{clean_data} where
#' more than 50\% of values are \code{NA}. These rows are \emph{retained}
#' in \code{clean_data} but extracted here for optional review or download.
#' An empty tibble if no sparse rows exist.}
#' \item{\code{feedback}}{A named list of feedback items. Each element is
#' \code{NULL} if the corresponding transformation was not triggered, or a
#' value describing what changed:
#' \describe{
#' \item{\code{string_na_converted}}{A named list with elements
#' \code{total} (integer count of values converted) and \code{cols}
#' (character vector of affected column names), or \code{NULL} if no
#' string NA values were found.}
#' \item{\code{blank_rows_removed}}{A named list with elements
#' \code{count} (integer) and \code{row_indices} (integer vector of
#' original row positions removed), or \code{NULL} if none.}
#' \item{\code{sparse_rows_flagged}}{A named list with elements
#' \code{count} (integer) and \code{row_indices} (integer vector of
#' row positions in \code{clean_data} with >50\% missingness),
#' or \code{NULL} if none.}
#' \item{\code{case_normalized_vars}}{A named list with elements
#' \code{cols} (character vector of affected column names) and
#' \code{detail} (a named list per column, each with
#' \code{changed_from} and \code{changed_to} character vectors
#' showing the exact value changes), or \code{NULL} if none.}
#' \item{\code{dropped_empty_cols}}{Character vector of column names
#' (or \code{""} for unnamed columns) that were dropped because they
#' were 100\% empty, or \code{NULL} if none.}
#' }}
#' }
#'
#' @seealso \code{\link{ternG}} for grouped comparisons, \code{\link{ternD}} for descriptive statistics.
#'
#' @examples
#' \donttest{
#' # Load a messy CSV and preprocess it
#' path <- system.file("extdata/csv", "tern_colon_messy.csv",
#' package = "TernTables")
#' raw <- read.csv(path, stringsAsFactors = FALSE)
#' result <- ternP(raw)
#'
#' # Access cleaned data
#' result$clean_data
#'
#' # Review preprocessing feedback
#' result$feedback
#'
#' # Sparse rows flagged (>50% missing), retained but not removed
#' result$sparse_rows
#' }
#'
#' @export
ternP <- function(data) {
# --- Input validation -------------------------------------------------------
if (!is.data.frame(data)) {
stop("`data` must be a data frame or tibble.", call. = FALSE)
}
if (nrow(data) == 0) {
stop("`data` has no rows.", call. = FALSE)
}
if (ncol(data) == 0) {
stop("`data` has no columns.", call. = FALSE)
}
# --- Hard stops (before any cleaning) ---------------------------------------
.check_phi(data)
.check_unnamed_cols(data)
# --- Initialise feedback trackers -------------------------------------------
feedback <- list(
string_na_converted = NULL,
blank_rows_removed = NULL,
sparse_rows_flagged = NULL,
case_normalized_vars = NULL,
dropped_empty_cols = NULL
)
# ---------------------------------------------------------------------------
# Step 1: Convert string NA values to NA
# "NA", "na", "Na", "unk" in any character column are treated as missing.
# ---------------------------------------------------------------------------
string_na_values <- c("NA", "na", "Na", "unk")
# Count total occurrences and record which columns are affected before cleaning.
sna_cols <- Filter(function(nm) {
col <- data[[nm]]
is.character(col) && any(col %in% string_na_values, na.rm = TRUE)
}, names(data))
sna_total <- sum(vapply(sna_cols, function(nm) {
sum(data[[nm]] %in% string_na_values, na.rm = TRUE)
}, integer(1)))
data <- dplyr::mutate(
data,
dplyr::across(
dplyr::where(is.character),
~ dplyr::if_else(. %in% string_na_values, NA_character_, .)
)
)
if (length(sna_cols) > 0) {
feedback$string_na_converted <- list(total = sna_total, cols = sna_cols)
}
# ---------------------------------------------------------------------------
# Step 2: Trim leading and trailing whitespace from all character columns
# ---------------------------------------------------------------------------
data <- dplyr::mutate(
data,
dplyr::across(dplyr::where(is.character), stringr::str_trim)
)
# ---------------------------------------------------------------------------
# Step 3: Drop 100% empty columns (including empty-named ones)
# Named empty columns are reported in feedback; they are soft-dropped.
# This is distinct from the hard stop in .check_unnamed_cols(), which
# fires only for unnamed columns that contain data.
# ---------------------------------------------------------------------------
empty_col_idx <- which(vapply(data, function(col) all(is.na(col)), logical(1)))
if (length(empty_col_idx) > 0) {
empty_col_names <- names(data)[empty_col_idx]
feedback$dropped_empty_cols <- empty_col_names
data <- data[, -empty_col_idx, drop = FALSE]
}
# ---------------------------------------------------------------------------
# Step 4: Remove completely blank rows (every cell is NA)
# ---------------------------------------------------------------------------
blank_row_idx <- which(apply(data, 1, function(r) all(is.na(r))))
data <- dplyr::filter(data, !dplyr::if_all(dplyr::everything(), is.na))
n_removed <- length(blank_row_idx)
if (n_removed > 0) {
feedback$blank_rows_removed <- list(count = n_removed, row_indices = blank_row_idx)
}
# ---------------------------------------------------------------------------
# Step 5: Case-inconsistency normalization
# For each character column, if any values are letter-for-letter identical
# when lowercased (i.e. only differ by capitalization), collapse them all
# to title case. Track which columns were affected.
# ---------------------------------------------------------------------------
collapsed_detail <- list()
# Replace dplyr::across lambda that used <<- with an explicit for loop
# (avoids modifying the global search path; CRAN policy)
for (col_nm in names(data)) {
if (!is.character(data[[col_nm]])) next
col_vals <- data[[col_nm]]
lowered <- stringr::str_to_lower(col_vals)
if (dplyr::n_distinct(lowered, na.rm = TRUE) <
dplyr::n_distinct(col_vals, na.rm = TRUE)) {
unique_vals <- unique(col_vals[!is.na(col_vals)])
title_vals <- stringr::str_to_title(unique_vals)
changed_idx <- which(unique_vals != title_vals)
collapsed_detail[[col_nm]] <- list(
changed_from = unique_vals[changed_idx],
changed_to = title_vals[changed_idx]
)
data[[col_nm]] <- stringr::str_to_title(col_vals)
}
}
if (length(collapsed_detail) > 0) {
feedback$case_normalized_vars <- list(
cols = names(collapsed_detail),
detail = collapsed_detail
)
}
# ---------------------------------------------------------------------------
# Step 6: Identify sparse rows (>50% missing values)
# Sparse rows are RETAINED in clean_data but extracted separately so the
# caller can surface them to the user (e.g. as a download button).
# ---------------------------------------------------------------------------
sparse_row_idx <- which(rowSums(is.na(data)) > (ncol(data) / 2))
sparse_rows <- data[sparse_row_idx, , drop = FALSE]
n_sparse <- length(sparse_row_idx)
if (n_sparse > 0) {
feedback$sparse_rows_flagged <- list(count = n_sparse, row_indices = sparse_row_idx)
}
# ---------------------------------------------------------------------------
# Return
# ---------------------------------------------------------------------------
result <- list(
clean_data = tibble::as_tibble(data),
sparse_rows = tibble::as_tibble(sparse_rows),
feedback = feedback
)
class(result) <- "ternP_result"
# Emit feedback immediately — fires whether or not the result is assigned.
.ternP_emit_feedback(result)
result
}
# ── Internal feedback emitter ────────────────────────────────────────────────
# Shared by ternP() (runs automatically on every call) and print.ternP_result
# (re-displays the summary on demand). Kept internal — not exported.
.ternP_emit_feedback <- function(x) {
fb <- x$feedback
n_clean <- nrow(x$clean_data)
n_cols <- ncol(x$clean_data)
cli::cli_rule("ternP Preprocessing Summary")
clean_flag <- all(vapply(fb, is.null, logical(1)))
if (clean_flag) {
cli::cli_alert_success(
"No transformations required. Data passed through unchanged."
)
} else {
if (!is.null(fb$string_na_converted)) {
sna <- fb$string_na_converted
cli::cli_alert_info(
"{sna$total} string NA value{?s} converted to {.code NA} across \
{length(sna$cols)} column{?s}: {.val {sna$cols}}."
)
}
if (!is.null(fb$dropped_empty_cols)) {
n_d <- length(fb$dropped_empty_cols)
cli::cli_alert_info(
"{n_d} empty column{?s} dropped: {.val {fb$dropped_empty_cols}}."
)
}
if (!is.null(fb$blank_rows_removed)) {
br <- fb$blank_rows_removed
cli::cli_alert_info(
"{br$count} completely blank row{?s} removed. Original row number{?s}: {br$row_indices}."
)
}
if (!is.null(fb$case_normalized_vars)) {
cn <- fb$case_normalized_vars
n_n <- length(cn$cols)
cli::cli_alert_info(
"Capitalization normalized in {n_n} column{?s}: {.val {cn$cols}}."
)
}
if (!is.null(fb$sparse_rows_flagged)) {
sp <- fb$sparse_rows_flagged
cli::cli_alert_warning(
"{sp$count} sparse row{?s} flagged (>50% missing, retained in clean_data). Row number{?s}: {sp$row_indices}."
)
}
}
cli::cli_rule()
cli::cli_alert_info(
"Cleaned data: {n_clean} row{?s} \u00d7 {n_cols} column{?s}."
)
invisible(x)
}
# ── S3 print method ─────────────────────────────────────────────────────────
#' Print method for ternP_result objects
#'
#' Re-displays the preprocessing summary for a \code{ternP_result} object.
#' Note that \code{\link{ternP}} already emits this summary automatically at
#' the time it is called, so this method is most useful for reviewing the
#' summary after the fact (e.g. typing \code{result} at the console later
#' in a session).
#'
#' @param x A \code{ternP_result} object returned by \code{\link{ternP}}.
#' @param ... Currently unused; included for S3-method compatibility.
#' @return Invisibly returns \code{x}.
#' @method print ternP_result
#' @export
print.ternP_result <- function(x, ...) {
.ternP_emit_feedback(x)
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.