Nothing
# =============================================================================
# autocodebook — generate_report() : entrypoint do modulo de relatorio
# =============================================================================
# Gera um relatorio HTML autocontido com:
# 1. Resumo do dataset
# 2. Fluxograma de elegibilidade (lendo .cb_env$tracking)
# 3. Codebook (lendo .cb_env$codebook)
# 4. Inspecao de variaveis (transversal ou longitudinal)
# =============================================================================
#' Generate a standardized report from the current session
#'
#' Produces an HTML report combining the eligibility flowchart, the codebook,
#' and a per-variable inspection panel. Supports two inspection modes:
#'
#' - `cross_sectional`: one plot per variable (histogram / bar / time).
#' - `longitudinal`: three plots per variable (global distribution, intra-ID
#' variation, missingness by time) plus a meta plot of observations per ID.
#'
#' All aggregations happen in Spark/dplyr; only small summaries are collected.
#'
#' @param data A Spark DataFrame (tbl_spark) or local data frame.
#' @param type One of `"cross_sectional"` or `"longitudinal"`.
#' @param id_var Character. Name of the ID column. For `longitudinal`,
#' mandatory. For `cross_sectional`, used to skip the ID column in inspection.
#' @param time_var Character or NULL. Name of the time/wave column.
#' Used in `longitudinal` to compute missingness-over-time. Default: NULL.
#' @param variables Optional character vector. If provided, inspects only
#' these variables. Default: NULL (all except id_var/time_var).
#' @param labels Optional named list (variable -> label). If NULL, uses
#' labels from the codebook when available.
#' @param treat_as_categorical Character vector of variable names to treat
#' as categorical even when their R class is numeric or integer. Useful
#' for coded variables (e.g. `cod_sexo` stored as 1L/2L, `cod_raca`
#' stored as integer). For these variables, the report uses bar charts
#' and proportion-by-time stacked plots instead of histograms / median+IQR.
#' Default: NULL.
#' @param output_html File path for the HTML output. There is no default:
#' the destination must be supplied explicitly (e.g. a file under
#' [tempdir()] or a directory chosen by the user).
#' @param output_dir Optional directory for ancillary files (codebook.xlsx,
#' codebook.docx, etc.). If NULL, derived from output_html.
#' @param export_codebook_editable Logical. Also export codebook as
#' .docx and .xlsx in `output_dir`. Default: TRUE.
#' @param cache_data Logical. If TRUE and `data` is a tbl_spark, persists
#' the dataset once before the report aggregations, then releases it on
#' exit. No-op for local data frames. Default: TRUE.
#' @param title Optional title for the report.
#' @param n_bins Number of bins for numeric histograms. Default: 30.
#' @param top_n_cat Max categories shown in categorical plots. Default: 20.
#'
#' @return Invisible list with paths to all generated files.
#' @export
#'
#' @examples
#' \donttest{
#' # Rendering the HTML report needs rmarkdown + pandoc and a few plotting
#' # packages (all in Suggests); it also takes more than 5 seconds, so the
#' # example is wrapped in \donttest and writes only to tempdir().
#' if (requireNamespace("rmarkdown", quietly = TRUE) &&
#' requireNamespace("knitr", quietly = TRUE) &&
#' requireNamespace("ggplot2", quietly = TRUE) &&
#' requireNamespace("patchwork", quietly = TRUE) &&
#' requireNamespace("scales", quietly = TRUE) &&
#' rmarkdown::pandoc_available()) {
#'
#' cb_init(id_col = "id_indiv")
#' df_baseline <- data.frame(
#' id_indiv = sprintf("ID%03d", 1:50),
#' cod_sexo = sample(c(1L, 2L), 50, replace = TRUE),
#' idade = sample(18:80, 50, replace = TRUE)
#' )
#'
#' # Write to a dedicated subdir of tempdir() and clean everything up after:
#' out_dir <- file.path(tempdir(), "autocodebook_report_demo")
#' generate_report(df_baseline, type = "cross_sectional",
#' id_var = "id_indiv",
#' treat_as_categorical = "cod_sexo",
#' output_html = file.path(out_dir, "report_baseline.html"))
#' unlink(out_dir, recursive = TRUE)
#' }
#' }
generate_report <- function(data,
type = c("cross_sectional", "longitudinal"),
id_var = NULL,
time_var = NULL,
variables = NULL,
labels = NULL,
treat_as_categorical = NULL,
output_html,
output_dir = NULL,
export_codebook_editable = TRUE,
cache_data = TRUE,
title = NULL,
n_bins = 30,
top_n_cat = 20) {
type <- match.arg(type)
# Deps obrigatorias para o relatorio HTML
.require_pkg("rmarkdown", "gerar o relatorio HTML")
.require_pkg("knitr", "gerar o relatorio HTML")
if (!.has_ggplot2()) {
stop("[autocodebook] Pacote 'ggplot2' necessario. ",
"Instale com: install.packages('ggplot2')",
call. = FALSE)
}
if (!.has_patchwork()) {
stop("[autocodebook] Pacote 'patchwork' necessario para os paineis ",
"compostos do relatorio. ",
"Instale com: install.packages('patchwork')",
call. = FALSE)
}
if (!.has_scales()) {
stop("[autocodebook] Pacote 'scales' necessario para formatacao ",
"dos eixos. Instale com: install.packages('scales')",
call. = FALSE)
}
# Resolve labels do codebook se nao fornecidos
if (is.null(labels)) {
cb <- .cb_env$codebook
if (nrow(cb) > 0) {
labels <- as.list(stats::setNames(cb$label, cb$variable))
} else {
labels <- list()
}
}
# Resolve output_dir
if (is.null(output_dir)) {
output_dir <- dirname(normalizePath(output_html, mustWork = FALSE))
}
dir.create(output_dir, showWarnings = FALSE, recursive = TRUE)
ttl <- if (!is.null(title)) title else "autocodebook report"
# =========================================================================
# 0. Cache do dataset (CRITICO em big data)
# =========================================================================
# O relatorio faz dezenas de agregacoes sobre `data`. Se for um tbl_spark
# vindo de uma cadeia lazy, cada agregacao re-executa o pipeline inteiro.
# Persistir UMA vez garante que as agregacoes leem de memoria/disco.
# Liberado (unpersist) no fim via on.exit.
cached_here <- FALSE
if (isTRUE(cache_data) && inherits(data, "tbl_spark") &&
requireNamespace("sparklyr", quietly = TRUE)) {
.cb_msg("[autocodebook] Persistindo dataset (sdf_persist MEMORY_AND_DISK)...")
data <- tryCatch({
d <- sparklyr::sdf_persist(data, storage.level = "MEMORY_AND_DISK")
# Forca materializacao com um count (dispara o cache de fato)
invisible(sparklyr::sdf_nrow(d))
cached_here <- TRUE
d
}, error = function(e) {
.cb_msg("[autocodebook] Aviso: nao foi possivel persistir (",
conditionMessage(e), "). Seguindo sem cache.")
data
})
if (cached_here) {
on.exit({
tryCatch({
if (exists("sdf_unpersist", where = asNamespace("sparklyr"),
inherits = FALSE)) {
unpersist_fn <- get("sdf_unpersist", envir = asNamespace("sparklyr"))
unpersist_fn(data)
}
}, error = function(e) NULL)
}, add = TRUE)
}
}
# =========================================================================
# 1. Coleta sumarios do dataset (n, ncol, etc.)
# =========================================================================
.cb_msg("[autocodebook] Coletando sumario do dataset...")
dataset_summary <- .summarize_dataset(data, id_var = id_var,
time_var = time_var)
# =========================================================================
# 2. Tabelas de fluxo: tracking linear + arvore (se houver split)
# =========================================================================
.cb_msg("[autocodebook] Building eligibility tables...")
flow_tree_table <- tryCatch(flow_table(), error = function(e) NULL)
has_tree <- !is.null(flow_tree_table) && nrow(flow_tree_table) > 0
# Diagrama CONSORT (ggplot) desenhado a partir do tracking + arvore de fluxo.
# So e gerado se ggplot2 estiver disponivel; dimensoes escalam com o numero
# de subgrupos (colunas) e de desfechos (linhas).
report_flow_diagram <- NULL
report_fd_w <- 9; report_fd_h <- 6
if (.has_ggplot2()) {
report_flow_diagram <- tryCatch(flow_diagram(), error = function(e) NULL)
if (!is.null(report_flow_diagram)) {
fl_now <- flow_get()
n_leaves <- if (length(fl_now$levels) > 0) {
nrow(fl_now$levels[[length(fl_now$levels)]]$counts)
} else 1L
n_oc <- length(fl_now$outcomes)
report_fd_w <- max(9, 4 + n_leaves * 3.6)
report_fd_h <- max(6, 5 + n_oc * 1.6)
}
}
# =========================================================================
# 3. Inspecao de variaveis
# =========================================================================
.cb_msg("[autocodebook] Inspecionando variaveis (", type, ")...")
# Se o usuario nao especificou `variables`, usa as variaveis registradas no
# codebook (= o dataset final do pipeline). So inclui as que existem no
# dataframe. Se o codebook estiver vazio, cai no comportamento antigo
# (todas as colunas, menos id/time).
if (is.null(variables) && nrow(.cb_env$codebook) > 0) {
cols_data <- if (inherits(data, "tbl_spark")) colnames(data) else names(data)
cb_vars <- intersect(.cb_env$codebook$variable, cols_data)
if (length(cb_vars) > 0) {
variables <- cb_vars
.cb_msg("[autocodebook] Inspecionando ", length(variables),
" variaveis registradas no codebook.")
}
}
inspection <- if (type == "cross_sectional") {
.inspect_variables_cross_sectional(
data, variables = variables, labels = labels,
id_var = id_var, n_bins = n_bins, top_n_cat = top_n_cat,
treat_as_categorical = treat_as_categorical
)
} else {
.inspect_variables_longitudinal(
data, variables = variables, labels = labels,
id_var = id_var, time_var = time_var,
n_bins = n_bins, top_n_cat = top_n_cat,
treat_as_categorical = treat_as_categorical
)
}
# =========================================================================
# 4. Exports auxiliares (codebook editavel + tracking)
# =========================================================================
ancillary <- list()
if (isTRUE(export_codebook_editable) && nrow(.cb_env$codebook) > 0) {
.cb_msg("[autocodebook] Exportando codebook editavel...")
try({
p_docx <- file.path(output_dir, "codebook.docx")
cb_export(p_docx)
ancillary$codebook_docx <- p_docx
}, silent = TRUE)
try({
p_xlsx <- file.path(output_dir, "codebook.xlsx")
cb_export(p_xlsx)
ancillary$codebook_xlsx <- p_xlsx
}, silent = TRUE)
}
# Fluxograma CONSORT editavel (DOCX) — embute o diagrama como vetor (EMF)
# quando possivel; copiavel/editavel no Word.
if (isTRUE(export_codebook_editable) && !is.null(report_flow_diagram) &&
requireNamespace("officer", quietly = TRUE)) {
.cb_msg("[autocodebook] Exportando fluxograma editavel (DOCX)...")
try({
p_flow_docx <- file.path(output_dir, "flowchart.docx")
flow_diagram_export(p_flow_docx, width = report_fd_w,
height = report_fd_h)
ancillary$flowchart_docx <- p_flow_docx
}, silent = TRUE)
}
# Tabelas de elegibilidade editaveis (XLSX) — pra copiar/colar resultados
if (requireNamespace("openxlsx", quietly = TRUE)) {
# Tracking linear (sempre que houver etapas)
if (nrow(.cb_env$tracking) > 0) {
try({
tr <- .cb_env$tracking
if ("elapsed_s" %in% names(tr)) tr$elapsed_s <- NULL
p_track_xlsx <- file.path(output_dir, "eligibility_table.xlsx")
openxlsx::write.xlsx(tr, p_track_xlsx, overwrite = TRUE)
ancillary$eligibility_table_xlsx <- p_track_xlsx
}, silent = TRUE)
}
# Tabela da arvore (so se houver split)
if (has_tree) {
try({
p_tree_xlsx <- file.path(output_dir, "groups_table.xlsx")
openxlsx::write.xlsx(flow_tree_table, p_tree_xlsx, overwrite = TRUE)
ancillary$groups_table_xlsx <- p_tree_xlsx
}, silent = TRUE)
}
}
# =========================================================================
# 5. Renderiza o RMarkdown
# =========================================================================
.cb_msg("[autocodebook] Renderizando HTML...")
rmd_path <- system.file("rmarkdown", "report-template.Rmd",
package = "autocodebook")
if (!nzchar(rmd_path) || !file.exists(rmd_path)) {
# Fallback: usa template embarcado em string
rmd_path <- .write_default_template(tempfile(fileext = ".Rmd"))
}
# Empacota tudo no env do render
render_env <- new.env(parent = globalenv())
render_env$report_title <- ttl
render_env$report_type <- type
render_env$report_id_var <- id_var
render_env$report_time_var <- time_var
render_env$report_dataset <- dataset_summary
render_env$report_codebook <- .cb_env$codebook
render_env$report_tracking <- .cb_env$tracking
render_env$report_flow_tree <- flow_tree_table
render_env$report_has_tree <- has_tree
render_env$report_flow_diagram <- report_flow_diagram
render_env$report_fd_w <- report_fd_w
render_env$report_fd_h <- report_fd_h
render_env$report_inspection <- inspection
render_env$report_ancillary <- ancillary
rmarkdown::render(
input = rmd_path,
output_file = basename(output_html),
output_dir = dirname(normalizePath(output_html, mustWork = FALSE)),
envir = render_env,
quiet = !isTRUE(.cb_env$verbose)
)
out <- c(list(report_html = output_html), ancillary)
message("[autocodebook] Report generated: ", output_html)
if (length(ancillary) > 0) {
for (nm in names(ancillary)) {
message("[autocodebook] + ", ancillary[[nm]])
}
}
invisible(out)
}
# =============================================================================
# Helpers internos
# =============================================================================
#' @keywords internal
#' @noRd
.summarize_dataset <- function(data, id_var = NULL, time_var = NULL) {
n_rows <- if (inherits(data, "tbl_spark")) {
as.integer(sparklyr::sdf_nrow(data))
} else {
nrow(data)
}
cols <- if (inherits(data, "tbl_spark")) colnames(data) else names(data)
n_cols <- length(cols)
n_ids <- NA_integer_
if (!is.null(id_var) && id_var %in% cols) {
n_ids <- tryCatch({
if (inherits(data, "tbl_spark")) {
data %>% dplyr::select(dplyr::all_of(id_var)) %>%
dplyr::distinct() %>% sparklyr::sdf_nrow() %>% as.integer()
} else {
length(unique(data[[id_var]]))
}
}, error = function(e) NA_integer_)
}
time_range <- NULL
if (!is.null(time_var) && time_var %in% cols) {
time_range <- tryCatch({
tv <- rlang::sym(time_var)
data %>%
dplyr::filter(!is.na(!!tv)) %>%
dplyr::summarise(min_t = min(!!tv, na.rm = TRUE),
max_t = max(!!tv, na.rm = TRUE)) %>%
dplyr::collect()
}, error = function(e) NULL)
}
list(
n_rows = n_rows,
n_cols = n_cols,
n_ids = n_ids,
cols = cols,
time_range = time_range
)
}
#' @keywords internal
#' @noRd
.write_default_template <- function(path) {
template <- '---
title: "`r report_title`"
output:
html_document:
toc: true
toc_float: true
theme: cosmo
self_contained: true
---
```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = FALSE, warning = FALSE, message = FALSE, dpi = 100)
.fig_w_panel <- if (report_type == "longitudinal") 11 else 8
.fig_h_panel <- if (report_type == "longitudinal") 3.4 else 2.8
```
# Resumo do dataset
```{r summary, results="asis"}
ds <- report_dataset
cat("- **Linhas:** ", format(ds$n_rows, big.mark = ","), "\\n")
cat("- **Colunas:** ", ds$n_cols, "\\n")
if (!is.na(ds$n_ids)) cat("- **Unique subjects:** ", format(ds$n_ids, big.mark = ","), "\\n")
if (!is.null(ds$time_range)) cat("- **Periodo:** ", as.character(ds$time_range$min_t), " a ", as.character(ds$time_range$max_t), "\\n")
```
# Elegibilidade
```{r tracking-table}
if (nrow(report_tracking) > 0) {
tr <- report_tracking
if ("elapsed_s" %in% names(tr)) tr$elapsed_s <- NULL
knitr::kable(tr, format.args = list(big.mark = "."))
}
```
```{r tree-table, eval=isTRUE(report_has_tree)}
if (isTRUE(report_has_tree)) knitr::kable(report_flow_tree)
```
# Codebook
```{r codebook}
if (nrow(report_codebook) > 0) cb_render(group_by_block = TRUE, show_code = TRUE) else cat("_Codebook is empty._")
```
# Inspecao de variaveis
```{r inspection-loop, results="asis"}
plots <- report_inspection$plots
src <- c()
for (var in names(plots)) {
chunk_id <- paste0("var_", gsub("[^A-Za-z0-9_]", "_", var))
src <- c(src,
sprintf("## %s\\n", var),
sprintf("```{r %s, fig.width=%s, fig.height=%s, echo=FALSE}", chunk_id, .fig_w_panel, .fig_h_panel),
sprintf("print(report_inspection$plots[[%s]])", deparse(var)),
"```\\n")
}
out <- knitr::knit_child(text = paste(src, collapse = "\\n"), quiet = TRUE, envir = environment())
cat(out)
```
---
*Gerado por `autocodebook::generate_report()` (template basico)*
'
writeLines(template, path)
path
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.