R/10_flow.R
In autocodebook: Automatic Codebook and Tracking for 'Spark' and 'dplyr' Pipelines

Documented in flow_diagram flow_diagram_export flow_get flow_reset flow_table track_outcomes track_split

# =============================================================================
# autocodebook - Arvore de fluxo (CONSORT composavel)
# =============================================================================
# Modelo de dados: uma arvore de nos guardada em .cb_env$flow.
#
# Tipos de no:
#   - "root"    : coorte inicial (1, criado automaticamente)
#   - "step"    : reducao linear (filtro). 1 filho.
#   - "split"   : ramificacao por coluna. N filhos (um por categoria).
#   - "outcome" : folha empilhada (contagem de desfecho). Sem filhos.
#
# A arvore e construida incrementalmente:
#   track_split(by=)     -> ramifica TODAS as folhas-corrente pela coluna
#   track_outcomes(vars=)-> anexa contagens de desfecho nas folhas-corrente
#
# "Folhas-corrente" = os nos no nivel mais profundo que ainda podem ramificar
# (root no inicio; depois os filhos do ultimo split).
#
# REGRA BIG DATA: cada split/outcome roda group_by no Spark; so os N
# agregados (poucas linhas) voltam pro R.
# =============================================================================


# Inicializa a arvore no environment (chamado por cb_init via flow_reset)
.flow_init <- function() {
  .cb_env$flow <- list(
    id_col   = .cb_env$id_col,
    n_root   = NA_integer_,     # N da coorte inicial
    levels   = list(),          # lista de niveis de split
    outcomes = list(),          # desfechos empilhados (por folha)
    built    = FALSE
  )
  invisible(NULL)
}

#' Reset the flow tree
#'
#' Clears the CONSORT flow tree. Called automatically by `cb_init()`.
#' @return Invisible NULL.
#' @export
flow_reset <- function() {
  .flow_init()
  invisible(NULL)
}

#' Get the current flow tree (raw structure)
#'
#' Returns the internal flow representation. Mostly for debugging /
#' programmatic access. For a tidy table, use `flow_table()`.
#' @return A list describing the flow tree.
#' @export
flow_get <- function() {
  if (is.null(.cb_env$flow)) .flow_init()
  .cb_env$flow
}

# -----------------------------------------------------------------------------
# Helper: conta N unico de individuos (ou linhas se assume_unique)
# -----------------------------------------------------------------------------

.flow_count_ids <- function(sdf, assume_unique = FALSE) {
  id_col <- .cb_env$id_col
  if (inherits(sdf, "tbl_spark")) {
    if (isTRUE(assume_unique)) {
      as.integer(sparklyr::sdf_nrow(sdf))
    } else {
      sdf %>% dplyr::select(dplyr::all_of(id_col)) %>%
        dplyr::distinct() %>% sparklyr::sdf_nrow() %>% as.integer()
    }
  } else {
    if (isTRUE(assume_unique)) nrow(sdf)
    else length(unique(sdf[[id_col]]))
  }
}

# -----------------------------------------------------------------------------
# Helper: conta N por categoria de uma coluna (group_by no Spark)
# Retorna tibble (categoria, n_ids) ja coletada.
# -----------------------------------------------------------------------------

.flow_count_by <- function(sdf, by_col) {
  id_col <- .cb_env$id_col
  b <- rlang::sym(by_col)

  # Conta individuos unicos por categoria.
  # Usa coalesce pra agrupar NA numa categoria "(NA)".
  res <- sdf %>%
    dplyr::mutate(.cat_v = dplyr::coalesce(as.character(!!b), "(NA)")) %>%
    dplyr::group_by(.cat_v) %>%
    dplyr::summarise(
      n_ids = dplyr::n_distinct(!!rlang::sym(id_col)),
      .groups = "drop"
    ) %>%
    dplyr::collect() %>%
    dplyr::rename(categoria = .cat_v) %>%
    dplyr::arrange(categoria)

  res
}

# -----------------------------------------------------------------------------
# Helper: conta desfecho (quantos tem outcome == 1 / TRUE) por categoria
# Para outcome binario: conta os "positivos". Retorna n e pct.
# -----------------------------------------------------------------------------

.flow_count_outcome <- function(sdf, outcome_col, split_cols = character()) {
  id_col <- rlang::sym(.cb_env$id_col)
  o <- rlang::sym(outcome_col)

  # Agrupa pelos splits acumulados + conta positivos do desfecho
  grp_syms <- lapply(split_cols, rlang::sym)

  base <- sdf %>%
    dplyr::mutate(
      .is_pos = dplyr::if_else(
        as.numeric(!!o) == 1 | as.logical(!!o) == TRUE, 1L, 0L
      )
    )

  if (length(split_cols) > 0) {
    base <- base %>%
      dplyr::mutate(dplyr::across(dplyr::all_of(split_cols),
                                  ~ dplyr::coalesce(as.character(.x), "(NA)")))
    base <- base %>% dplyr::group_by(!!!grp_syms)
  }

  out <- base %>%
    dplyr::summarise(
      n_total = dplyr::n_distinct(!!id_col),
      n_pos   = sum(.is_pos, na.rm = TRUE),
      .groups = "drop"
    ) %>%
    dplyr::collect()

  out
}

# =============================================================================
# track_split() - adiciona um nivel de ramificacao
# =============================================================================

#' Split the cohort into branches by a column (CONSORT flowchart)
#'
#' Adds one branching level to the flow tree. The cohort is divided by the
#' distinct values of `by`. Chain multiple `track_split()` calls to create
#' nested branches (e.g. exposure then mediator). Passes the data through
#' unchanged, so it fits in a `%>%` pipeline.
#'
#' @param sdf A Spark DataFrame or local data frame.
#' @param by Character. Column name to split by. Its distinct values become
#'   the branches. NA values are grouped as "(NA)".
#' @param label Optional character. A human-readable name for this split
#'   level (e.g. "Exposure: drought"). Defaults to `by`.
#' @param value_labels Optional named character vector mapping raw values to
#'   readable labels, e.g. `c("0" = "Sem seca", "1" = "Com seca")`. If not
#'   given, the function tries factor levels / labelled attributes on the
#'   column; failing that, uses the raw value.
#' @param max_levels Integer. Safety cap on nesting depth. Default 3.
#'
#' @return `sdf` unchanged (for piping).
#' @export
#'
#' @examples
#' cb_init(id_col = "id_indiv")
#' df <- data.frame(
#'   id_indiv     = sprintf("ID%03d", 1:100),
#'   exposto_seca = sample(c(0L, 1L), 100, replace = TRUE),
#'   migrou       = sample(c(0L, 1L), 100, replace = TRUE),
#'   obito_dcv    = sample(c(0L, 1L), 100, replace = TRUE)
#' )
#' df <- track_split(df, by = "exposto_seca", label = "Exposure: drought",
#'                   value_labels = c("0" = "No drought", "1" = "Drought"))
#' df <- track_split(df, by = "migrou", label = "Mediator: migration",
#'                   value_labels = c("0" = "Did not migrate", "1" = "Migrated"))
#' track_outcomes(df, vars = "obito_dcv", labels = list(obito_dcv = "CVD death"))
#' flow_table()
track_split <- function(sdf, by, label = NULL, value_labels = NULL,
                        max_levels = 3L) {
  if (is.null(.cb_env$flow)) .flow_init()

  cols <- if (inherits(sdf, "tbl_spark")) colnames(sdf) else names(sdf)
  if (!by %in% cols) {
    stop("[autocodebook] Coluna '", by, "' nao encontrada nos dados.",
         call. = FALSE)
  }

  n_levels <- length(.cb_env$flow$levels)
  if (n_levels >= max_levels) {
    warning("[autocodebook] Limite de ", max_levels,
            " niveis de split atingido. Split por '", by, "' ignorado.",
            call. = FALSE)
    return(invisible(sdf))
  }

  # Auto-deteccao de value_labels se nao fornecido (fatores em df local)
  if (is.null(value_labels) && !inherits(sdf, "tbl_spark")) {
    col_data <- sdf[[by]]
    if (is.factor(col_data)) {
      lv <- levels(col_data)
      value_labels <- stats::setNames(lv, lv)
    }
  }

  # N da raiz (so na primeira vez)
  if (is.na(.cb_env$flow$n_root)) {
    .cb_env$flow$n_root <- .flow_count_ids(sdf)
  }

  # Colunas de split acumuladas (para group_by aninhado)
  prev_cols <- vapply(.cb_env$flow$levels, function(l) l$by, character(1))
  all_cols  <- c(prev_cols, by)

  # Conta N por combinacao de todos os niveis ate aqui
  id_col <- rlang::sym(.cb_env$id_col)
  grp_syms <- lapply(all_cols, rlang::sym)

  counts <- sdf %>%
    dplyr::mutate(dplyr::across(dplyr::all_of(all_cols),
                                ~ dplyr::coalesce(as.character(.x), "(NA)"))) %>%
    dplyr::group_by(!!!grp_syms) %>%
    dplyr::summarise(n_ids = dplyr::n_distinct(!!id_col), .groups = "drop") %>%
    dplyr::collect()

  .cb_env$flow$levels[[length(.cb_env$flow$levels) + 1L]] <- list(
    by           = by,
    label        = if (!is.null(label)) label else by,
    value_labels = value_labels,
    counts       = counts,
    cols         = all_cols
  )

  .cb_msg("[autocodebook] split por '", by, "' -> ",
          nrow(counts), " grupos (nivel ", n_levels + 1L, ")")

  invisible(sdf)
}

# Helper: aplica value_labels a um valor cru
.flow_value_label <- function(raw, value_labels) {
  if (is.null(value_labels)) return(as.character(raw))
  key <- as.character(raw)
  if (key %in% names(value_labels)) value_labels[[key]] else key
}

# =============================================================================
# track_outcomes() - empilha desfechos nas folhas correntes
# =============================================================================

#' Attach outcome counts to the current leaves (CONSORT flowchart)
#'
#' Adds one or more outcome variables, counted within each current leaf
#' (combination of all splits so far). Outcomes are stacked, not branched.
#' Each outcome is treated as binary: counts how many individuals have
#' value == 1 (or TRUE), plus the percentage within the leaf.
#'
#' @param sdf A Spark DataFrame or local data frame.
#' @param vars Character vector of outcome column names (binary 0/1 or
#'   logical).
#' @param labels Optional named list (var -> label).
#'
#' @return `sdf` unchanged (for piping).
#' @export
track_outcomes <- function(sdf, vars, labels = NULL) {
  if (is.null(.cb_env$flow)) .flow_init()

  cols <- if (inherits(sdf, "tbl_spark")) colnames(sdf) else names(sdf)
  miss <- setdiff(vars, cols)
  if (length(miss) > 0) {
    stop("[autocodebook] Colunas de desfecho nao encontradas: ",
         paste(miss, collapse = ", "), call. = FALSE)
  }

  # Colunas de split acumuladas (define as folhas)
  split_cols <- if (length(.cb_env$flow$levels) > 0) {
    .cb_env$flow$levels[[length(.cb_env$flow$levels)]]$cols
  } else character()

  # N da raiz se ainda nao setado (caso outcomes sem split)
  if (is.na(.cb_env$flow$n_root)) {
    .cb_env$flow$n_root <- .flow_count_ids(sdf)
  }

  for (v in vars) {
    cnt <- .flow_count_outcome(sdf, v, split_cols = split_cols)
    lbl <- if (!is.null(labels) && !is.null(labels[[v]])) labels[[v]] else v
    .cb_env$flow$outcomes[[length(.cb_env$flow$outcomes) + 1L]] <- list(
      var    = v,
      label  = lbl,
      counts = cnt,           # tibble: split_cols + n_total + n_pos
      cols   = split_cols
    )
    .cb_msg("[autocodebook] desfecho '", v, "' contado em ",
            max(1L, nrow(cnt)), " folha(s)")
  }

  invisible(sdf)
}

# =============================================================================
# flow_table() - converte a arvore numa tabela tidy (para export editavel)
# =============================================================================

#' Flow tree as a tidy table
#'
#' Flattens the CONSORT flow tree into a publication-friendly data frame.
#' One row per leaf x outcome. Split levels become named columns (using their
#' labels), values are mapped through value_labels, and percentages are
#' formatted as readable strings.
#'
#' @return A tibble.
#' @export
flow_table <- function() {
  fl <- flow_get()
  if (is.na(fl$n_root)) {
    return(tibble::tibble())
  }

  n_levels <- length(fl$levels)

  # Funcao: dado um data frame com as colunas de split cruas, devolve
  # um data frame com colunas nomeadas pelos labels dos niveis e valores
  # mapeados pelos value_labels.
  .label_split_cols <- function(df) {
    if (n_levels == 0) return(df[, character(0), drop = FALSE])
    out <- list()
    for (lv in seq_len(n_levels)) {
      lvl  <- fl$levels[[lv]]
      raw  <- as.character(df[[lvl$by]])
      mapped <- vapply(raw, .flow_value_label, character(1),
                       value_labels = lvl$value_labels)
      out[[lvl$label]] <- mapped
    }
    as.data.frame(out, check.names = FALSE, stringsAsFactors = FALSE)
  }

  # Caso sem splits e sem outcomes: so a raiz
  if (n_levels == 0 && length(fl$outcomes) == 0) {
    return(tibble::tibble(Group = "Cohort", N = fl$n_root))
  }

  # Caso com splits mas sem outcomes: tabela de N por folha
  if (length(fl$outcomes) == 0) {
    last <- fl$levels[[n_levels]]
    labeled <- .label_split_cols(last$counts)
    labeled$N <- last$counts$n_ids
    return(tibble::as_tibble(labeled))
  }

  # Caso com outcomes: uma linha por folha x desfecho
  rows <- list()
  for (oc in fl$outcomes) {
    cnt <- oc$counts
    labeled <- if (length(oc$cols) > 0) {
      .label_split_cols(cnt)
    } else {
      data.frame(Group = rep("Cohort", nrow(cnt)),
                 check.names = FALSE, stringsAsFactors = FALSE)
    }
    labeled[["Outcome"]] <- oc$label
    labeled[["N total"]]  <- cnt$n_total
    labeled[["N events"]] <- cnt$n_pos
    pct <- ifelse(cnt$n_total > 0, cnt$n_pos / cnt$n_total, NA_real_)
    labeled[["%"]] <- vapply(pct, .pct_str, character(1))
    rows[[length(rows) + 1L]] <- labeled
  }
  out <- dplyr::bind_rows(rows)
  tibble::as_tibble(out)
}

# =============================================================================
# flow_diagram() - desenha a arvore de fluxo como um diagrama CONSORT
# =============================================================================

# Helper local: formata inteiro com separador de milhar.
.flow_fmt_int <- function(x) {
  format(as.numeric(x), big.mark = ",", scientific = FALSE, trim = TRUE)
}

# Helper local: quebra um rotulo em varias linhas para caber na caixa.
.flow_wrap <- function(s, width = 28L) {
  vapply(s, function(z) paste(strwrap(z, width = width), collapse = "\n"),
         character(1), USE.NAMES = FALSE)
}

#' Draw the eligibility flow as a CONSORT-style flowchart
#'
#' Renders a publication-ready CONSORT-style diagram (a 'ggplot' object) from
#' the information captured during the pipeline. The layout is computed
#' automatically, so no manual positioning is needed:
#'
#' - A vertical trunk built from the linear eligibility steps (recorded by
#'   [auto_filter()] / [track_step()]): the cohort baseline at the top, a
#'   single side box listing every exclusion and its count, and the eligible
#'   cohort below.
#' - One column of boxes per subgroup leaf of the flow tree (built with
#'   [track_split()]), shown side by side under the eligible cohort.
#' - Outcome boxes (added with [track_outcomes()]) stacked beneath each
#'   subgroup column, one row per outcome, with the event count and percent.
#'
#' Either part is optional: with only linear steps you get the trunk; with
#' only a split tree you get the cohort box and its subgroup columns. The
#' function reads the current session state and writes nothing to disk.
#'
#' Requires the 'ggplot2' package (a soft dependency).
#'
#' @param title Plot title. Default: "Eligibility flowchart".
#' @param show_exclusions Logical. Show the side box listing the linear
#'   exclusion steps? Default: TRUE.
#' @param box_fill Fill colour of the boxes. Default: "white".
#' @param border_col Border colour of the boxes. Default: "grey25".
#' @param text_size Base text size for the box labels. Default: 3.5.
#'
#' @return A 'ggplot' object, or invisible NULL if there is nothing to draw.
#' @export
#'
#' @examples
#' if (requireNamespace("ggplot2", quietly = TRUE)) {
#'   cb_init(id_col = "id_indiv")
#'   df <- data.frame(
#'     id_indiv  = sprintf("ID%04d", 1:400),
#'     sgm       = sample(c(0L, 1L), 400, replace = TRUE, prob = c(0.95, 0.05)),
#'     self_harm = sample(c(0L, 1L), 400, replace = TRUE, prob = c(0.97, 0.03))
#'   )
#'   df <- track_split(df, by = "sgm", label = "SGM status",
#'                     value_labels = c("0" = "Non-SGM", "1" = "SGM"))
#'   track_outcomes(df, vars = "self_harm",
#'                  labels = list(self_harm = "Self-harm"))
#'   flow_diagram()
#' }
flow_diagram <- function(title = "Eligibility flowchart",
                         show_exclusions = TRUE,
                         box_fill = "white",
                         border_col = "grey25",
                         text_size = 3.5) {
  if (!requireNamespace("ggplot2", quietly = TRUE)) {
    stop("[autocodebook] Package 'ggplot2' is required for flow_diagram(). ",
         "Install it with: install.packages('ggplot2')", call. = FALSE)
  }

  fl <- flow_get()
  tr <- .cb_env$tracking
  has_tracking <- !is.null(tr) && nrow(tr) > 0
  n_levels     <- length(fl$levels)
  has_tree     <- !is.na(fl$n_root) && n_levels > 0

  if (!has_tracking && is.na(fl$n_root)) {
    message("[autocodebook] Flow is empty - nothing to draw.")
    return(invisible(NULL))
  }

  # Geometria (unidades arbitrarias; coord_equal preserva a proporcao)
  W <- 3.4; H <- 1.25; COLP <- 4.0; ROWP <- 2.0

  boxes <- list(); segs <- list()
  add_box <- function(xc, yc, w, h, text, face = "plain", hj = 0.5) {
    tx <- if (hj == 0) xc - w / 2 + 0.18 else xc
    boxes[[length(boxes) + 1L]] <<- data.frame(
      xc = xc, yc = yc, xmin = xc - w / 2, xmax = xc + w / 2,
      ymin = yc - h / 2, ymax = yc + h / 2,
      face = face, hj = hj, tx = tx, text = text, stringsAsFactors = FALSE)
  }
  add_seg <- function(x, y, xend, yend, arr = FALSE) {
    segs[[length(segs) + 1L]] <<- data.frame(x = x, y = y, xend = xend,
                                             yend = yend, arr = arr)
  }

  # --- Folhas / colunas de subgrupo -----------------------------------------
  if (has_tree) {
    last <- fl$levels[[n_levels]]; cnt <- last$counts; cc <- last$cols
    ord <- do.call(order, lapply(cc, function(k) as.character(cnt[[k]])))
    cnt <- cnt[ord, , drop = FALSE]
    keymat    <- as.matrix(cnt[, cc, drop = FALSE])
    leaf_keys <- apply(keymat, 1, function(r) paste(as.character(r),
                                                    collapse = ""))
    leaf_lab  <- apply(keymat, 1, function(r) {
      labs <- vapply(seq_len(n_levels), function(k)
        .flow_value_label(as.character(r[[k]]), fl$levels[[k]]$value_labels),
        character(1))
      paste(labs, collapse = " / ")
    })
    leaf_lab <- .flow_wrap(leaf_lab, 30L)
    leaf_n   <- cnt$n_ids
    C        <- nrow(cnt)
    col_x    <- (seq_len(C) - (C + 1) / 2) * COLP
  } else {
    C <- 0L; col_x <- 0
  }
  trunk_x <- 0

  y_base <- 0; y_elig <- -ROWP * 1.25; y_head <- y_elig - ROWP * 1.25

  # --- Tronco linear (baseline -> exclusoes -> elegiveis) --------------------
  if (has_tracking) {
    elig_n <- tr$n_ids[nrow(tr)]
    tot_rm <- sum(tr$n_removed, na.rm = TRUE)
    base_n <- elig_n + tot_rm
    add_box(trunk_x, y_base, W * 1.7, H,
            paste0("Individuals in the cohort baseline\nn = ",
                   .flow_fmt_int(base_n)), face = "bold")
    add_box(trunk_x, y_elig, W * 1.7, H,
            paste0("Eligible individuals included\nin the analyses\nn = ",
                   .flow_fmt_int(elig_n)), face = "bold")
    add_seg(trunk_x, y_base - H / 2, trunk_x, y_elig + H / 2, arr = TRUE)

    if (isTRUE(show_exclusions) && tot_rm > 0) {
      b <- tr[tr$n_removed > 0, , drop = FALSE]
      blines <- paste0("\u2022 ", b$description, " (n = ",
                       vapply(b$n_removed, .flow_fmt_int, character(1)), ")")
      excl_w <- W * 1.65
      # Mantem a caixa de exclusoes a direita do tronco e das colunas, sem
      # sobrepor (importante quando nao ha split: C = 0).
      min_excl_x <- trunk_x + (W * 1.7) / 2 + 0.6 + excl_w / 2
      right_cols <- if (C > 0) max(col_x) + COLP * 1.15 else -Inf
      excl_x <- max(min_excl_x, right_cols)
      excl_h <- max(H, 0.55 + 0.40 * (length(blines) + 1))
      excl_y <- (y_base + y_elig) / 2
      add_box(excl_x, excl_y, excl_w, excl_h,
              paste0("Exclusions (n = ", .flow_fmt_int(tot_rm), "):\n",
                     paste(blines, collapse = "\n")), hj = 0)
      add_seg(trunk_x, excl_y, excl_x - excl_w / 2, excl_y, arr = TRUE)
    }
    split_from <- y_elig - H / 2
  } else {
    add_box(trunk_x, y_elig, W * 1.7, H,
            paste0("Cohort\nn = ", .flow_fmt_int(fl$n_root)), face = "bold")
    split_from <- y_elig - H / 2
  }

  # --- Colunas de subgrupo + desfechos empilhados ---------------------------
  if (C > 0) {
    bus_y <- (split_from + (y_head + H / 2)) / 2
    add_seg(trunk_x, split_from, trunk_x, bus_y)
    if (C > 1) add_seg(min(col_x), bus_y, max(col_x), bus_y)
    for (j in seq_len(C)) {
      add_seg(col_x[j], bus_y, col_x[j], y_head + (H * 1.25) / 2)
      add_box(col_x[j], y_head, W, H * 1.25,
              paste0(leaf_lab[j], "\nn = ", .flow_fmt_int(leaf_n[j])),
              face = "bold")
    }
    if (length(fl$outcomes) > 0) {
      header_half <- (H * 1.25) / 2; gap_oc <- 0.28
      for (oi in seq_along(fl$outcomes)) {
        oc <- fl$outcomes[[oi]]; ocnt <- oc$counts
        okeys <- if (length(oc$cols) > 0) {
          apply(as.matrix(ocnt[, oc$cols, drop = FALSE]), 1,
                function(r) paste(as.character(r), collapse = ""))
        } else rep("", nrow(ocnt))
        y_oc <- y_head - header_half - gap_oc - (oi - 1) * (H + gap_oc) - H / 2
        olab <- .flow_wrap(oc$label, 26L)
        for (j in seq_len(C)) {
          ridx <- which(okeys == leaf_keys[j])
          if (length(ridx) == 1L) {
            nt <- ocnt$n_total[ridx]; np <- ocnt$n_pos[ridx]
            pct <- if (nt > 0) .pct_str(np / nt) else "-"
            txt <- paste0(olab, "\nn = ", .flow_fmt_int(np), " (", pct, ")")
          } else {
            txt <- paste0(olab, "\n-")
          }
          add_box(col_x[j], y_oc, W, H, txt)
        }
      }
    }
  }

  # --- Monta o ggplot --------------------------------------------------------
  bx <- do.call(rbind, boxes)
  sg <- if (length(segs)) {
    do.call(rbind, segs)
  } else {
    data.frame(x = numeric(0), y = numeric(0), xend = numeric(0),
               yend = numeric(0), arr = logical(0))
  }

  ggplot2::ggplot() +
    ggplot2::geom_segment(
      data = sg[!sg$arr, , drop = FALSE],
      ggplot2::aes(x = x, y = y, xend = xend, yend = yend),
      colour = "grey45", linewidth = 0.4) +
    ggplot2::geom_segment(
      data = sg[sg$arr, , drop = FALSE],
      ggplot2::aes(x = x, y = y, xend = xend, yend = yend),
      colour = "grey25", linewidth = 0.45,
      arrow = grid::arrow(length = grid::unit(0.18, "cm"), type = "closed")) +
    ggplot2::geom_rect(
      data = bx,
      ggplot2::aes(xmin = xmin, xmax = xmax, ymin = ymin, ymax = ymax),
      fill = box_fill, colour = border_col, linewidth = 0.4) +
    ggplot2::geom_text(
      data = bx[bx$face == "bold", , drop = FALSE],
      ggplot2::aes(x = tx, y = yc, label = text, hjust = hj),
      fontface = "bold", size = text_size, lineheight = 0.95) +
    ggplot2::geom_text(
      data = bx[bx$face == "plain", , drop = FALSE],
      ggplot2::aes(x = tx, y = yc, label = text, hjust = hj),
      size = text_size, lineheight = 0.95) +
    ggplot2::coord_equal(clip = "off") +
    ggplot2::labs(title = title) +
    ggplot2::theme_void(base_size = 11) +
    ggplot2::theme(
      plot.title  = ggplot2::element_text(face = "bold", hjust = 0.5,
                                          size = 13),
      plot.margin = ggplot2::margin(12, 16, 12, 16))
}

# =============================================================================
# flow_diagram_export() — salva o diagrama (raster, vetorial ou editavel)
# =============================================================================

#' Save the eligibility flowchart to a file
#'
#' Renders [flow_diagram()] and writes it to disk. The output format is taken
#' from the file extension, so the same call can produce a raster image, a
#' vector image, or an editable office document:
#'
#' - `.png`, `.jpg`/`.jpeg`, `.tiff`, `.bmp` - raster image.
#' - `.pdf`, `.svg`, `.eps` - vector image (editable in Inkscape / Illustrator;
#'   `.svg` needs the 'svglite' package).
#' - `.emf` - Windows vector metafile, editable in Word once inserted
#'   (needs the 'devEMF' package).
#' - `.docx` - a Word document with the flowchart embedded as a vector
#'   image (editable after ungrouping; needs the 'officer' package, plus
#'   'devEMF' for the vector version, otherwise a raster image is used).
#' - `.pptx` - a PowerPoint slide where every box and label is a native,
#'   fully editable shape (needs the 'rvg' and 'officer' packages).
#'
#' There is no default path: the destination must be supplied explicitly
#' (e.g. a file under [tempdir()] or a directory chosen by the user).
#'
#' @param path File path. The extension determines the format.
#' @param width,height Plot size in inches. If NULL (default), sensible
#'   values are derived from the number of subgroups and outcomes.
#' @param dpi Resolution for raster formats. Default: 300.
#' @param ... Further arguments passed to [flow_diagram()] (e.g. `title`,
#'   `box_fill`, `text_size`).
#'
#' @return Invisible path, or invisible NULL if there is nothing to draw.
#' @export
#'
#' @examples
#' \donttest{
#' # Wrapped in \donttest because writing the image invokes a graphics
#' # device and may take more than 5 seconds; it writes only to tempdir().
#' if (requireNamespace("ggplot2", quietly = TRUE)) {
#'   cb_init(id_col = "id_indiv")
#'   df <- data.frame(
#'     id_indiv = sprintf("ID%03d", 1:100),
#'     g        = sample(c(0L, 1L), 100, replace = TRUE),
#'     y        = sample(c(0L, 1L), 100, replace = TRUE)
#'   )
#'   df <- track_split(df, by = "g", value_labels = c("0" = "A", "1" = "B"))
#'   track_outcomes(df, vars = "y", labels = list(y = "Outcome"))
#'   # Raster (no extra packages needed); written to tempdir() and cleaned up:
#'   out <- file.path(tempdir(), "flow.png")
#'   flow_diagram_export(out)
#'   unlink(out)
#' }
#' }
flow_diagram_export <- function(path, width = NULL, height = NULL,
                                dpi = 300, ...) {
  if (!requireNamespace("ggplot2", quietly = TRUE)) {
    stop("[autocodebook] Package 'ggplot2' is required for ",
         "flow_diagram_export(). Install it with: install.packages('ggplot2')",
         call. = FALSE)
  }

  g <- flow_diagram(...)
  if (is.null(g)) {
    message("[autocodebook] Flow is empty - nothing to export.")
    return(invisible(NULL))
  }

  # Dimensoes padrao escalam com o numero de subgrupos e desfechos.
  if (is.null(width) || is.null(height)) {
    fl    <- flow_get()
    nlv   <- length(fl$levels)
    nleaf <- if (nlv > 0) nrow(fl$levels[[nlv]]$counts) else 1L
    noc   <- length(fl$outcomes)
    if (is.null(width))  width  <- max(9, 4 + nleaf * 3.6)
    if (is.null(height)) height <- max(6, 5 + noc * 1.6)
  }

  ext <- tolower(tools::file_ext(path))

  if (ext == "pptx") {
    .require_pkg("rvg",     "export PPTX")
    .require_pkg("officer", "export PPTX")
    doc <- officer::read_pptx()
    doc <- officer::add_slide(doc, layout = "Blank", master = "Office Theme")
    doc <- officer::ph_with(doc, rvg::dml(ggobj = g),
                            location = officer::ph_location_fullsize())
    print(doc, target = path)
  } else if (ext == "docx") {
    .require_pkg("officer", "export DOCX")
    # Embute o diagrama como EMF (vetor editavel no Word) se devEMF estiver
    # disponivel; senao, como PNG. A figura e exibida ajustada a largura util
    # da pagina, preservando a proporcao.
    if (requireNamespace("devEMF", quietly = TRUE)) {
      img <- tempfile(fileext = ".emf")
      ggplot2::ggsave(img, g, device = devEMF::emf,
                      width = width, height = height)
    } else {
      img <- tempfile(fileext = ".png")
      ggplot2::ggsave(img, g, width = width, height = height, dpi = dpi)
    }
    disp_w <- min(width, 6.5)
    disp_h <- height * (disp_w / width)
    doc <- officer::read_docx()
    doc <- officer::body_add_img(doc, src = img,
                                 width = disp_w, height = disp_h)
    print(doc, target = path)
  } else if (ext == "emf") {
    .require_pkg("devEMF", "export EMF")
    ggplot2::ggsave(path, g, device = devEMF::emf,
                    width = width, height = height)
  } else {
    # png/jpg/jpeg/tiff/bmp/pdf/svg/eps: ggsave infere o device pela extensao.
    ggplot2::ggsave(path, g, width = width, height = height, dpi = dpi)
  }

  message("[autocodebook] Flowchart saved to: ", path)
  invisible(path)
}
Any scripts or data that you put into this service are public.
autocodebook documentation built on June 9, 2026, 1:09 a.m.
rdrr.io home R language documentation Run R code online
CRAN packages Bioconductor packages R-Forge packages GitHub packages
Note that we can't provide technical support on individual packages. You should contact the package authors for that.
autocodebook
Automatic Codebook and Tracking for 'Spark' and 'dplyr' Pipelines

R/10_flow.R
In autocodebook: Automatic Codebook and Tracking for 'Spark' and 'dplyr' Pipelines

Defines functions flow_diagram_export flow_diagram .flow_wrap .flow_fmt_int flow_table track_outcomes .flow_value_label track_split .flow_count_outcome .flow_count_by .flow_count_ids flow_get flow_reset .flow_init

Documented in flow_diagram flow_diagram_export flow_get flow_reset flow_table track_outcomes track_split

Try the autocodebook package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

autocodebook Automatic Codebook and Tracking for 'Spark' and 'dplyr' Pipelines

R/10_flow.R In autocodebook: Automatic Codebook and Tracking for 'Spark' and 'dplyr' Pipelines

Defines functions flow_diagram_export flow_diagram .flow_wrap .flow_fmt_int flow_table track_outcomes .flow_value_label track_split .flow_count_outcome .flow_count_by .flow_count_ids flow_get flow_reset .flow_init

Documented in flow_diagram flow_diagram_export flow_get flow_reset flow_table track_outcomes track_split

Try the autocodebook package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

autocodebook
Automatic Codebook and Tracking for 'Spark' and 'dplyr' Pipelines

R/10_flow.R
In autocodebook: Automatic Codebook and Tracking for 'Spark' and 'dplyr' Pipelines