R/10_flow.R

Defines functions flow_diagram_export flow_diagram .flow_wrap .flow_fmt_int flow_table track_outcomes .flow_value_label track_split .flow_count_outcome .flow_count_by .flow_count_ids flow_get flow_reset .flow_init

Documented in flow_diagram flow_diagram_export flow_get flow_reset flow_table track_outcomes track_split

# =============================================================================
# autocodebook - Arvore de fluxo (CONSORT composavel)
# =============================================================================
# Modelo de dados: uma arvore de nos guardada em .cb_env$flow.
#
# Tipos de no:
#   - "root"    : coorte inicial (1, criado automaticamente)
#   - "step"    : reducao linear (filtro). 1 filho.
#   - "split"   : ramificacao por coluna. N filhos (um por categoria).
#   - "outcome" : folha empilhada (contagem de desfecho). Sem filhos.
#
# A arvore e construida incrementalmente:
#   track_split(by=)     -> ramifica TODAS as folhas-corrente pela coluna
#   track_outcomes(vars=)-> anexa contagens de desfecho nas folhas-corrente
#
# "Folhas-corrente" = os nos no nivel mais profundo que ainda podem ramificar
# (root no inicio; depois os filhos do ultimo split).
#
# REGRA BIG DATA: cada split/outcome roda group_by no Spark; so os N
# agregados (poucas linhas) voltam pro R.
# =============================================================================


# Inicializa a arvore no environment (chamado por cb_init via flow_reset)
.flow_init <- function() {
  .cb_env$flow <- list(
    id_col   = .cb_env$id_col,
    n_root   = NA_integer_,     # N da coorte inicial
    levels   = list(),          # lista de niveis de split
    outcomes = list(),          # desfechos empilhados (por folha)
    built    = FALSE
  )
  invisible(NULL)
}

#' Reset the flow tree
#'
#' Clears the CONSORT flow tree. Called automatically by `cb_init()`.
#' @return Invisible NULL.
#' @export
flow_reset <- function() {
  .flow_init()
  invisible(NULL)
}

#' Get the current flow tree (raw structure)
#'
#' Returns the internal flow representation. Mostly for debugging /
#' programmatic access. For a tidy table, use `flow_table()`.
#' @return A list describing the flow tree.
#' @export
flow_get <- function() {
  if (is.null(.cb_env$flow)) .flow_init()
  .cb_env$flow
}

# -----------------------------------------------------------------------------
# Helper: conta N unico de individuos (ou linhas se assume_unique)
# -----------------------------------------------------------------------------

.flow_count_ids <- function(sdf, assume_unique = FALSE) {
  id_col <- .cb_env$id_col
  if (inherits(sdf, "tbl_spark")) {
    if (isTRUE(assume_unique)) {
      as.integer(sparklyr::sdf_nrow(sdf))
    } else {
      sdf %>% dplyr::select(dplyr::all_of(id_col)) %>%
        dplyr::distinct() %>% sparklyr::sdf_nrow() %>% as.integer()
    }
  } else {
    if (isTRUE(assume_unique)) nrow(sdf)
    else length(unique(sdf[[id_col]]))
  }
}

# -----------------------------------------------------------------------------
# Helper: conta N por categoria de uma coluna (group_by no Spark)
# Retorna tibble (categoria, n_ids) ja coletada.
# -----------------------------------------------------------------------------

.flow_count_by <- function(sdf, by_col) {
  id_col <- .cb_env$id_col
  b <- rlang::sym(by_col)

  # Conta individuos unicos por categoria.
  # Usa coalesce pra agrupar NA numa categoria "(NA)".
  res <- sdf %>%
    dplyr::mutate(.cat_v = dplyr::coalesce(as.character(!!b), "(NA)")) %>%
    dplyr::group_by(.cat_v) %>%
    dplyr::summarise(
      n_ids = dplyr::n_distinct(!!rlang::sym(id_col)),
      .groups = "drop"
    ) %>%
    dplyr::collect() %>%
    dplyr::rename(categoria = .cat_v) %>%
    dplyr::arrange(categoria)

  res
}

# -----------------------------------------------------------------------------
# Helper: conta desfecho (quantos tem outcome == 1 / TRUE) por categoria
# Para outcome binario: conta os "positivos". Retorna n e pct.
# -----------------------------------------------------------------------------

.flow_count_outcome <- function(sdf, outcome_col, split_cols = character()) {
  id_col <- rlang::sym(.cb_env$id_col)
  o <- rlang::sym(outcome_col)

  # Agrupa pelos splits acumulados + conta positivos do desfecho
  grp_syms <- lapply(split_cols, rlang::sym)

  base <- sdf %>%
    dplyr::mutate(
      .is_pos = dplyr::if_else(
        as.numeric(!!o) == 1 | as.logical(!!o) == TRUE, 1L, 0L
      )
    )

  if (length(split_cols) > 0) {
    base <- base %>%
      dplyr::mutate(dplyr::across(dplyr::all_of(split_cols),
                                  ~ dplyr::coalesce(as.character(.x), "(NA)")))
    base <- base %>% dplyr::group_by(!!!grp_syms)
  }

  out <- base %>%
    dplyr::summarise(
      n_total = dplyr::n_distinct(!!id_col),
      n_pos   = sum(.is_pos, na.rm = TRUE),
      .groups = "drop"
    ) %>%
    dplyr::collect()

  out
}

# =============================================================================
# track_split() - adiciona um nivel de ramificacao
# =============================================================================

#' Split the cohort into branches by a column (CONSORT flowchart)
#'
#' Adds one branching level to the flow tree. The cohort is divided by the
#' distinct values of `by`. Chain multiple `track_split()` calls to create
#' nested branches (e.g. exposure then mediator). Passes the data through
#' unchanged, so it fits in a `%>%` pipeline.
#'
#' @param sdf A Spark DataFrame or local data frame.
#' @param by Character. Column name to split by. Its distinct values become
#'   the branches. NA values are grouped as "(NA)".
#' @param label Optional character. A human-readable name for this split
#'   level (e.g. "Exposure: drought"). Defaults to `by`.
#' @param value_labels Optional named character vector mapping raw values to
#'   readable labels, e.g. `c("0" = "Sem seca", "1" = "Com seca")`. If not
#'   given, the function tries factor levels / labelled attributes on the
#'   column; failing that, uses the raw value.
#' @param max_levels Integer. Safety cap on nesting depth. Default 3.
#'
#' @return `sdf` unchanged (for piping).
#' @export
#'
#' @examples
#' cb_init(id_col = "id_indiv")
#' df <- data.frame(
#'   id_indiv     = sprintf("ID%03d", 1:100),
#'   exposto_seca = sample(c(0L, 1L), 100, replace = TRUE),
#'   migrou       = sample(c(0L, 1L), 100, replace = TRUE),
#'   obito_dcv    = sample(c(0L, 1L), 100, replace = TRUE)
#' )
#' df <- track_split(df, by = "exposto_seca", label = "Exposure: drought",
#'                   value_labels = c("0" = "No drought", "1" = "Drought"))
#' df <- track_split(df, by = "migrou", label = "Mediator: migration",
#'                   value_labels = c("0" = "Did not migrate", "1" = "Migrated"))
#' track_outcomes(df, vars = "obito_dcv", labels = list(obito_dcv = "CVD death"))
#' flow_table()
track_split <- function(sdf, by, label = NULL, value_labels = NULL,
                        max_levels = 3L) {
  if (is.null(.cb_env$flow)) .flow_init()

  cols <- if (inherits(sdf, "tbl_spark")) colnames(sdf) else names(sdf)
  if (!by %in% cols) {
    stop("[autocodebook] Coluna '", by, "' nao encontrada nos dados.",
         call. = FALSE)
  }

  n_levels <- length(.cb_env$flow$levels)
  if (n_levels >= max_levels) {
    warning("[autocodebook] Limite de ", max_levels,
            " niveis de split atingido. Split por '", by, "' ignorado.",
            call. = FALSE)
    return(invisible(sdf))
  }

  # Auto-deteccao de value_labels se nao fornecido (fatores em df local)
  if (is.null(value_labels) && !inherits(sdf, "tbl_spark")) {
    col_data <- sdf[[by]]
    if (is.factor(col_data)) {
      lv <- levels(col_data)
      value_labels <- stats::setNames(lv, lv)
    }
  }

  # N da raiz (so na primeira vez)
  if (is.na(.cb_env$flow$n_root)) {
    .cb_env$flow$n_root <- .flow_count_ids(sdf)
  }

  # Colunas de split acumuladas (para group_by aninhado)
  prev_cols <- vapply(.cb_env$flow$levels, function(l) l$by, character(1))
  all_cols  <- c(prev_cols, by)

  # Conta N por combinacao de todos os niveis ate aqui
  id_col <- rlang::sym(.cb_env$id_col)
  grp_syms <- lapply(all_cols, rlang::sym)

  counts <- sdf %>%
    dplyr::mutate(dplyr::across(dplyr::all_of(all_cols),
                                ~ dplyr::coalesce(as.character(.x), "(NA)"))) %>%
    dplyr::group_by(!!!grp_syms) %>%
    dplyr::summarise(n_ids = dplyr::n_distinct(!!id_col), .groups = "drop") %>%
    dplyr::collect()

  .cb_env$flow$levels[[length(.cb_env$flow$levels) + 1L]] <- list(
    by           = by,
    label        = if (!is.null(label)) label else by,
    value_labels = value_labels,
    counts       = counts,
    cols         = all_cols
  )

  .cb_msg("[autocodebook] split por '", by, "' -> ",
          nrow(counts), " grupos (nivel ", n_levels + 1L, ")")

  invisible(sdf)
}

# Helper: aplica value_labels a um valor cru
.flow_value_label <- function(raw, value_labels) {
  if (is.null(value_labels)) return(as.character(raw))
  key <- as.character(raw)
  if (key %in% names(value_labels)) value_labels[[key]] else key
}

# =============================================================================
# track_outcomes() - empilha desfechos nas folhas correntes
# =============================================================================

#' Attach outcome counts to the current leaves (CONSORT flowchart)
#'
#' Adds one or more outcome variables, counted within each current leaf
#' (combination of all splits so far). Outcomes are stacked, not branched.
#' Each outcome is treated as binary: counts how many individuals have
#' value == 1 (or TRUE), plus the percentage within the leaf.
#'
#' @param sdf A Spark DataFrame or local data frame.
#' @param vars Character vector of outcome column names (binary 0/1 or
#'   logical).
#' @param labels Optional named list (var -> label).
#'
#' @return `sdf` unchanged (for piping).
#' @export
track_outcomes <- function(sdf, vars, labels = NULL) {
  if (is.null(.cb_env$flow)) .flow_init()

  cols <- if (inherits(sdf, "tbl_spark")) colnames(sdf) else names(sdf)
  miss <- setdiff(vars, cols)
  if (length(miss) > 0) {
    stop("[autocodebook] Colunas de desfecho nao encontradas: ",
         paste(miss, collapse = ", "), call. = FALSE)
  }

  # Colunas de split acumuladas (define as folhas)
  split_cols <- if (length(.cb_env$flow$levels) > 0) {
    .cb_env$flow$levels[[length(.cb_env$flow$levels)]]$cols
  } else character()

  # N da raiz se ainda nao setado (caso outcomes sem split)
  if (is.na(.cb_env$flow$n_root)) {
    .cb_env$flow$n_root <- .flow_count_ids(sdf)
  }

  for (v in vars) {
    cnt <- .flow_count_outcome(sdf, v, split_cols = split_cols)
    lbl <- if (!is.null(labels) && !is.null(labels[[v]])) labels[[v]] else v
    .cb_env$flow$outcomes[[length(.cb_env$flow$outcomes) + 1L]] <- list(
      var    = v,
      label  = lbl,
      counts = cnt,           # tibble: split_cols + n_total + n_pos
      cols   = split_cols
    )
    .cb_msg("[autocodebook] desfecho '", v, "' contado em ",
            max(1L, nrow(cnt)), " folha(s)")
  }

  invisible(sdf)
}

# =============================================================================
# flow_table() - converte a arvore numa tabela tidy (para export editavel)
# =============================================================================

#' Flow tree as a tidy table
#'
#' Flattens the CONSORT flow tree into a publication-friendly data frame.
#' One row per leaf x outcome. Split levels become named columns (using their
#' labels), values are mapped through value_labels, and percentages are
#' formatted as readable strings.
#'
#' @return A tibble.
#' @export
flow_table <- function() {
  fl <- flow_get()
  if (is.na(fl$n_root)) {
    return(tibble::tibble())
  }

  n_levels <- length(fl$levels)

  # Funcao: dado um data frame com as colunas de split cruas, devolve
  # um data frame com colunas nomeadas pelos labels dos niveis e valores
  # mapeados pelos value_labels.
  .label_split_cols <- function(df) {
    if (n_levels == 0) return(df[, character(0), drop = FALSE])
    out <- list()
    for (lv in seq_len(n_levels)) {
      lvl  <- fl$levels[[lv]]
      raw  <- as.character(df[[lvl$by]])
      mapped <- vapply(raw, .flow_value_label, character(1),
                       value_labels = lvl$value_labels)
      out[[lvl$label]] <- mapped
    }
    as.data.frame(out, check.names = FALSE, stringsAsFactors = FALSE)
  }

  # Caso sem splits e sem outcomes: so a raiz
  if (n_levels == 0 && length(fl$outcomes) == 0) {
    return(tibble::tibble(Group = "Cohort", N = fl$n_root))
  }

  # Caso com splits mas sem outcomes: tabela de N por folha
  if (length(fl$outcomes) == 0) {
    last <- fl$levels[[n_levels]]
    labeled <- .label_split_cols(last$counts)
    labeled$N <- last$counts$n_ids
    return(tibble::as_tibble(labeled))
  }

  # Caso com outcomes: uma linha por folha x desfecho
  rows <- list()
  for (oc in fl$outcomes) {
    cnt <- oc$counts
    labeled <- if (length(oc$cols) > 0) {
      .label_split_cols(cnt)
    } else {
      data.frame(Group = rep("Cohort", nrow(cnt)),
                 check.names = FALSE, stringsAsFactors = FALSE)
    }
    labeled[["Outcome"]] <- oc$label
    labeled[["N total"]]  <- cnt$n_total
    labeled[["N events"]] <- cnt$n_pos
    pct <- ifelse(cnt$n_total > 0, cnt$n_pos / cnt$n_total, NA_real_)
    labeled[["%"]] <- vapply(pct, .pct_str, character(1))
    rows[[length(rows) + 1L]] <- labeled
  }
  out <- dplyr::bind_rows(rows)
  tibble::as_tibble(out)
}

# =============================================================================
# flow_diagram() - desenha a arvore de fluxo como um diagrama CONSORT
# =============================================================================

# Helper local: formata inteiro com separador de milhar.
.flow_fmt_int <- function(x) {
  format(as.numeric(x), big.mark = ",", scientific = FALSE, trim = TRUE)
}

# Helper local: quebra um rotulo em varias linhas para caber na caixa.
.flow_wrap <- function(s, width = 28L) {
  vapply(s, function(z) paste(strwrap(z, width = width), collapse = "\n"),
         character(1), USE.NAMES = FALSE)
}

#' Draw the eligibility flow as a CONSORT-style flowchart
#'
#' Renders a publication-ready CONSORT-style diagram (a 'ggplot' object) from
#' the information captured during the pipeline. The layout is computed
#' automatically, so no manual positioning is needed:
#'
#' - A vertical trunk built from the linear eligibility steps (recorded by
#'   [auto_filter()] / [track_step()]): the cohort baseline at the top, a
#'   single side box listing every exclusion and its count, and the eligible
#'   cohort below.
#' - One column of boxes per subgroup leaf of the flow tree (built with
#'   [track_split()]), shown side by side under the eligible cohort.
#' - Outcome boxes (added with [track_outcomes()]) stacked beneath each
#'   subgroup column, one row per outcome, with the event count and percent.
#'
#' Either part is optional: with only linear steps you get the trunk; with
#' only a split tree you get the cohort box and its subgroup columns. The
#' function reads the current session state and writes nothing to disk.
#'
#' Requires the 'ggplot2' package (a soft dependency).
#'
#' @param title Plot title. Default: "Eligibility flowchart".
#' @param show_exclusions Logical. Show the side box listing the linear
#'   exclusion steps? Default: TRUE.
#' @param box_fill Fill colour of the boxes. Default: "white".
#' @param border_col Border colour of the boxes. Default: "grey25".
#' @param text_size Base text size for the box labels. Default: 3.5.
#'
#' @return A 'ggplot' object, or invisible NULL if there is nothing to draw.
#' @export
#'
#' @examples
#' if (requireNamespace("ggplot2", quietly = TRUE)) {
#'   cb_init(id_col = "id_indiv")
#'   df <- data.frame(
#'     id_indiv  = sprintf("ID%04d", 1:400),
#'     sgm       = sample(c(0L, 1L), 400, replace = TRUE, prob = c(0.95, 0.05)),
#'     self_harm = sample(c(0L, 1L), 400, replace = TRUE, prob = c(0.97, 0.03))
#'   )
#'   df <- track_split(df, by = "sgm", label = "SGM status",
#'                     value_labels = c("0" = "Non-SGM", "1" = "SGM"))
#'   track_outcomes(df, vars = "self_harm",
#'                  labels = list(self_harm = "Self-harm"))
#'   flow_diagram()
#' }
flow_diagram <- function(title = "Eligibility flowchart",
                         show_exclusions = TRUE,
                         box_fill = "white",
                         border_col = "grey25",
                         text_size = 3.5) {
  if (!requireNamespace("ggplot2", quietly = TRUE)) {
    stop("[autocodebook] Package 'ggplot2' is required for flow_diagram(). ",
         "Install it with: install.packages('ggplot2')", call. = FALSE)
  }

  fl <- flow_get()
  tr <- .cb_env$tracking
  has_tracking <- !is.null(tr) && nrow(tr) > 0
  n_levels     <- length(fl$levels)
  has_tree     <- !is.na(fl$n_root) && n_levels > 0

  if (!has_tracking && is.na(fl$n_root)) {
    message("[autocodebook] Flow is empty - nothing to draw.")
    return(invisible(NULL))
  }

  # Geometria (unidades arbitrarias; coord_equal preserva a proporcao)
  W <- 3.4; H <- 1.25; COLP <- 4.0; ROWP <- 2.0

  boxes <- list(); segs <- list()
  add_box <- function(xc, yc, w, h, text, face = "plain", hj = 0.5) {
    tx <- if (hj == 0) xc - w / 2 + 0.18 else xc
    boxes[[length(boxes) + 1L]] <<- data.frame(
      xc = xc, yc = yc, xmin = xc - w / 2, xmax = xc + w / 2,
      ymin = yc - h / 2, ymax = yc + h / 2,
      face = face, hj = hj, tx = tx, text = text, stringsAsFactors = FALSE)
  }
  add_seg <- function(x, y, xend, yend, arr = FALSE) {
    segs[[length(segs) + 1L]] <<- data.frame(x = x, y = y, xend = xend,
                                             yend = yend, arr = arr)
  }

  # --- Folhas / colunas de subgrupo -----------------------------------------
  if (has_tree) {
    last <- fl$levels[[n_levels]]; cnt <- last$counts; cc <- last$cols
    ord <- do.call(order, lapply(cc, function(k) as.character(cnt[[k]])))
    cnt <- cnt[ord, , drop = FALSE]
    keymat    <- as.matrix(cnt[, cc, drop = FALSE])
    leaf_keys <- apply(keymat, 1, function(r) paste(as.character(r),
                                                    collapse = ""))
    leaf_lab  <- apply(keymat, 1, function(r) {
      labs <- vapply(seq_len(n_levels), function(k)
        .flow_value_label(as.character(r[[k]]), fl$levels[[k]]$value_labels),
        character(1))
      paste(labs, collapse = " / ")
    })
    leaf_lab <- .flow_wrap(leaf_lab, 30L)
    leaf_n   <- cnt$n_ids
    C        <- nrow(cnt)
    col_x    <- (seq_len(C) - (C + 1) / 2) * COLP
  } else {
    C <- 0L; col_x <- 0
  }
  trunk_x <- 0

  y_base <- 0; y_elig <- -ROWP * 1.25; y_head <- y_elig - ROWP * 1.25

  # --- Tronco linear (baseline -> exclusoes -> elegiveis) --------------------
  if (has_tracking) {
    elig_n <- tr$n_ids[nrow(tr)]
    tot_rm <- sum(tr$n_removed, na.rm = TRUE)
    base_n <- elig_n + tot_rm
    add_box(trunk_x, y_base, W * 1.7, H,
            paste0("Individuals in the cohort baseline\nn = ",
                   .flow_fmt_int(base_n)), face = "bold")
    add_box(trunk_x, y_elig, W * 1.7, H,
            paste0("Eligible individuals included\nin the analyses\nn = ",
                   .flow_fmt_int(elig_n)), face = "bold")
    add_seg(trunk_x, y_base - H / 2, trunk_x, y_elig + H / 2, arr = TRUE)

    if (isTRUE(show_exclusions) && tot_rm > 0) {
      b <- tr[tr$n_removed > 0, , drop = FALSE]
      blines <- paste0("\u2022 ", b$description, " (n = ",
                       vapply(b$n_removed, .flow_fmt_int, character(1)), ")")
      excl_w <- W * 1.65
      # Mantem a caixa de exclusoes a direita do tronco e das colunas, sem
      # sobrepor (importante quando nao ha split: C = 0).
      min_excl_x <- trunk_x + (W * 1.7) / 2 + 0.6 + excl_w / 2
      right_cols <- if (C > 0) max(col_x) + COLP * 1.15 else -Inf
      excl_x <- max(min_excl_x, right_cols)
      excl_h <- max(H, 0.55 + 0.40 * (length(blines) + 1))
      excl_y <- (y_base + y_elig) / 2
      add_box(excl_x, excl_y, excl_w, excl_h,
              paste0("Exclusions (n = ", .flow_fmt_int(tot_rm), "):\n",
                     paste(blines, collapse = "\n")), hj = 0)
      add_seg(trunk_x, excl_y, excl_x - excl_w / 2, excl_y, arr = TRUE)
    }
    split_from <- y_elig - H / 2
  } else {
    add_box(trunk_x, y_elig, W * 1.7, H,
            paste0("Cohort\nn = ", .flow_fmt_int(fl$n_root)), face = "bold")
    split_from <- y_elig - H / 2
  }

  # --- Colunas de subgrupo + desfechos empilhados ---------------------------
  if (C > 0) {
    bus_y <- (split_from + (y_head + H / 2)) / 2
    add_seg(trunk_x, split_from, trunk_x, bus_y)
    if (C > 1) add_seg(min(col_x), bus_y, max(col_x), bus_y)
    for (j in seq_len(C)) {
      add_seg(col_x[j], bus_y, col_x[j], y_head + (H * 1.25) / 2)
      add_box(col_x[j], y_head, W, H * 1.25,
              paste0(leaf_lab[j], "\nn = ", .flow_fmt_int(leaf_n[j])),
              face = "bold")
    }
    if (length(fl$outcomes) > 0) {
      header_half <- (H * 1.25) / 2; gap_oc <- 0.28
      for (oi in seq_along(fl$outcomes)) {
        oc <- fl$outcomes[[oi]]; ocnt <- oc$counts
        okeys <- if (length(oc$cols) > 0) {
          apply(as.matrix(ocnt[, oc$cols, drop = FALSE]), 1,
                function(r) paste(as.character(r), collapse = ""))
        } else rep("", nrow(ocnt))
        y_oc <- y_head - header_half - gap_oc - (oi - 1) * (H + gap_oc) - H / 2
        olab <- .flow_wrap(oc$label, 26L)
        for (j in seq_len(C)) {
          ridx <- which(okeys == leaf_keys[j])
          if (length(ridx) == 1L) {
            nt <- ocnt$n_total[ridx]; np <- ocnt$n_pos[ridx]
            pct <- if (nt > 0) .pct_str(np / nt) else "-"
            txt <- paste0(olab, "\nn = ", .flow_fmt_int(np), " (", pct, ")")
          } else {
            txt <- paste0(olab, "\n-")
          }
          add_box(col_x[j], y_oc, W, H, txt)
        }
      }
    }
  }

  # --- Monta o ggplot --------------------------------------------------------
  bx <- do.call(rbind, boxes)
  sg <- if (length(segs)) {
    do.call(rbind, segs)
  } else {
    data.frame(x = numeric(0), y = numeric(0), xend = numeric(0),
               yend = numeric(0), arr = logical(0))
  }

  ggplot2::ggplot() +
    ggplot2::geom_segment(
      data = sg[!sg$arr, , drop = FALSE],
      ggplot2::aes(x = x, y = y, xend = xend, yend = yend),
      colour = "grey45", linewidth = 0.4) +
    ggplot2::geom_segment(
      data = sg[sg$arr, , drop = FALSE],
      ggplot2::aes(x = x, y = y, xend = xend, yend = yend),
      colour = "grey25", linewidth = 0.45,
      arrow = grid::arrow(length = grid::unit(0.18, "cm"), type = "closed")) +
    ggplot2::geom_rect(
      data = bx,
      ggplot2::aes(xmin = xmin, xmax = xmax, ymin = ymin, ymax = ymax),
      fill = box_fill, colour = border_col, linewidth = 0.4) +
    ggplot2::geom_text(
      data = bx[bx$face == "bold", , drop = FALSE],
      ggplot2::aes(x = tx, y = yc, label = text, hjust = hj),
      fontface = "bold", size = text_size, lineheight = 0.95) +
    ggplot2::geom_text(
      data = bx[bx$face == "plain", , drop = FALSE],
      ggplot2::aes(x = tx, y = yc, label = text, hjust = hj),
      size = text_size, lineheight = 0.95) +
    ggplot2::coord_equal(clip = "off") +
    ggplot2::labs(title = title) +
    ggplot2::theme_void(base_size = 11) +
    ggplot2::theme(
      plot.title  = ggplot2::element_text(face = "bold", hjust = 0.5,
                                          size = 13),
      plot.margin = ggplot2::margin(12, 16, 12, 16))
}

# =============================================================================
# flow_diagram_export() — salva o diagrama (raster, vetorial ou editavel)
# =============================================================================

#' Save the eligibility flowchart to a file
#'
#' Renders [flow_diagram()] and writes it to disk. The output format is taken
#' from the file extension, so the same call can produce a raster image, a
#' vector image, or an editable office document:
#'
#' - `.png`, `.jpg`/`.jpeg`, `.tiff`, `.bmp` - raster image.
#' - `.pdf`, `.svg`, `.eps` - vector image (editable in Inkscape / Illustrator;
#'   `.svg` needs the 'svglite' package).
#' - `.emf` - Windows vector metafile, editable in Word once inserted
#'   (needs the 'devEMF' package).
#' - `.docx` - a Word document with the flowchart embedded as a vector
#'   image (editable after ungrouping; needs the 'officer' package, plus
#'   'devEMF' for the vector version, otherwise a raster image is used).
#' - `.pptx` - a PowerPoint slide where every box and label is a native,
#'   fully editable shape (needs the 'rvg' and 'officer' packages).
#'
#' There is no default path: the destination must be supplied explicitly
#' (e.g. a file under [tempdir()] or a directory chosen by the user).
#'
#' @param path File path. The extension determines the format.
#' @param width,height Plot size in inches. If NULL (default), sensible
#'   values are derived from the number of subgroups and outcomes.
#' @param dpi Resolution for raster formats. Default: 300.
#' @param ... Further arguments passed to [flow_diagram()] (e.g. `title`,
#'   `box_fill`, `text_size`).
#'
#' @return Invisible path, or invisible NULL if there is nothing to draw.
#' @export
#'
#' @examples
#' \donttest{
#' # Wrapped in \donttest because writing the image invokes a graphics
#' # device and may take more than 5 seconds; it writes only to tempdir().
#' if (requireNamespace("ggplot2", quietly = TRUE)) {
#'   cb_init(id_col = "id_indiv")
#'   df <- data.frame(
#'     id_indiv = sprintf("ID%03d", 1:100),
#'     g        = sample(c(0L, 1L), 100, replace = TRUE),
#'     y        = sample(c(0L, 1L), 100, replace = TRUE)
#'   )
#'   df <- track_split(df, by = "g", value_labels = c("0" = "A", "1" = "B"))
#'   track_outcomes(df, vars = "y", labels = list(y = "Outcome"))
#'   # Raster (no extra packages needed); written to tempdir() and cleaned up:
#'   out <- file.path(tempdir(), "flow.png")
#'   flow_diagram_export(out)
#'   unlink(out)
#' }
#' }
flow_diagram_export <- function(path, width = NULL, height = NULL,
                                dpi = 300, ...) {
  if (!requireNamespace("ggplot2", quietly = TRUE)) {
    stop("[autocodebook] Package 'ggplot2' is required for ",
         "flow_diagram_export(). Install it with: install.packages('ggplot2')",
         call. = FALSE)
  }

  g <- flow_diagram(...)
  if (is.null(g)) {
    message("[autocodebook] Flow is empty - nothing to export.")
    return(invisible(NULL))
  }

  # Dimensoes padrao escalam com o numero de subgrupos e desfechos.
  if (is.null(width) || is.null(height)) {
    fl    <- flow_get()
    nlv   <- length(fl$levels)
    nleaf <- if (nlv > 0) nrow(fl$levels[[nlv]]$counts) else 1L
    noc   <- length(fl$outcomes)
    if (is.null(width))  width  <- max(9, 4 + nleaf * 3.6)
    if (is.null(height)) height <- max(6, 5 + noc * 1.6)
  }

  ext <- tolower(tools::file_ext(path))

  if (ext == "pptx") {
    .require_pkg("rvg",     "export PPTX")
    .require_pkg("officer", "export PPTX")
    doc <- officer::read_pptx()
    doc <- officer::add_slide(doc, layout = "Blank", master = "Office Theme")
    doc <- officer::ph_with(doc, rvg::dml(ggobj = g),
                            location = officer::ph_location_fullsize())
    print(doc, target = path)
  } else if (ext == "docx") {
    .require_pkg("officer", "export DOCX")
    # Embute o diagrama como EMF (vetor editavel no Word) se devEMF estiver
    # disponivel; senao, como PNG. A figura e exibida ajustada a largura util
    # da pagina, preservando a proporcao.
    if (requireNamespace("devEMF", quietly = TRUE)) {
      img <- tempfile(fileext = ".emf")
      ggplot2::ggsave(img, g, device = devEMF::emf,
                      width = width, height = height)
    } else {
      img <- tempfile(fileext = ".png")
      ggplot2::ggsave(img, g, width = width, height = height, dpi = dpi)
    }
    disp_w <- min(width, 6.5)
    disp_h <- height * (disp_w / width)
    doc <- officer::read_docx()
    doc <- officer::body_add_img(doc, src = img,
                                 width = disp_w, height = disp_h)
    print(doc, target = path)
  } else if (ext == "emf") {
    .require_pkg("devEMF", "export EMF")
    ggplot2::ggsave(path, g, device = devEMF::emf,
                    width = width, height = height)
  } else {
    # png/jpg/jpeg/tiff/bmp/pdf/svg/eps: ggsave infere o device pela extensao.
    ggplot2::ggsave(path, g, width = width, height = height, dpi = dpi)
  }

  message("[autocodebook] Flowchart saved to: ", path)
  invisible(path)
}

Try the autocodebook package in your browser

Any scripts or data that you put into this service are public.

autocodebook documentation built on June 9, 2026, 1:09 a.m.