R/codebook.R

Defines functions codebook.data.frame codebook.default codebook

Documented in codebook

#' Generate a codebook
#'
#' @description
#'
#' `r lifecycle::badge('stable')`
#'
#' The `codebook` function generates a codebook for the given dataset. It provides a summary
#' of the dataset's structure and characteristics, including variable names, types, missing
#' values, completeness percentages, unique value counts, and variable labels (if available).
#'
#' @param data The dataset for which the codebook is to be generated.
#'
#' @return The input dataset is returned invisibly,
#' allowing `codebook()` to be used within a data pipe line.
#'
#' @family Data Management
#' @examples
#' codebook(mtcars)
#'
#' codebook(iris)
#'
#' labelled::var_label(iris) <- c(
#' 	"sepal length", "sepal width", "petal length",
#' 	"petal width", "species"
#' )
#' codebook(iris)
#'
#' @export
codebook <- function(data) {
	UseMethod("codebook")
}

#' @export
codebook.default <- function(data) {
	pillar::glimpse(data)
	invisible(data)
}

#' @export
codebook.data.frame <- function(data)
{
	.data_name <- rlang::expr_text(substitute(data))
	.data_name <- ifelse(.data_name == ".", "<Piped Data>", .data_name)
	cli::cat_line(crayon::magenta("$ Codebook"))
	cli::cat_line(crayon::magenta("  dataset:", crayon::bold(.data_name)))

	## data label
	data_label <- attr(data, "label")
	if (!is.null(data_label))
		cli::cat_line(crayon::magenta("  label:", crayon::bold(data_label)))

	row_n <- nrow(data)
	col_n <- ncol(data)
	cli::cat_line(crayon::magenta("  Row:", row_n))
	cli::cat_line(crayon::magenta("  Col:", col_n))

	var_names <- pillar::new_pillar_title(names(data)) |> format()
	var_types <- purrr::map(data, pillar::type_sum) |>
		purrr::map_chr(\(x) paste0("<", format(x), ">")) |>
		pillar::new_pillar_title() |>
		format()
	miss <- purrr::map_int(data, \(x) sum(is.na(x)))
	complete <- 1 - (miss / row_n)
	unique <- purrr::map_int(data, \(x) length(unique(x)))

	df <- data.frame(
		name = var_names,
		type = var_types,
		miss = miss,
		complete = scales::label_comma(accuracy = .01)(complete),
		unique = unique,
		row.names = NULL
	)

	## TODO: to replace with label() function
	var_labels <- labelled::var_label(data, unlist = TRUE)
	# var_labels <- ifelse(var_labels == "", "<NULL>", var_labels)
	# compare df width with console width. if < 30, truncate labels
	width_diff <- getOption("width") - sum(pillar::get_extent(df[1, ]))
	if (width_diff < 30)
		var_labels <- ifelse(nchar(var_labels) > width_diff,
												 paste0(substr(var_labels, 1, 10), "..."), var_labels)

	# print tables
	cbind(df, label = var_labels) |>
		data.frame(row.names = NULL) |>
		print.data.frame(right = FALSE, row.names = TRUE)

	# return data invisibly
	invisible(data)
}
myominnoo/mStats documentation built on Nov. 29, 2023, 2:36 a.m.