R/cto_form_dofile.R

Defines functions cto_form_dofile

Documented in cto_form_dofile

#' Generate a Stata Do-File with Variable and Value Labels from a SurveyCTO Form
#'
#' @description
#' Creates a Stata `.do` file that applies variable labels, value labels, and
#' notes to a dataset based on the XLSForm definition of a SurveyCTO form.
#' The function supports multi-language forms, repeat groups, and
#' `select_multiple` questions, and generates Stata-compatible regular
#' expressions so labels are applied to all indexed variables.
#'
#' @param form_id A character string specifying the SurveyCTO form ID.
#' @param path Optional character string giving the output file path for the
#'   generated `.do` file. Must end in `.do`. If `NULL`, the file is not written
#'   to disk and the generated commands are returned invisibly.
#'
#' @details
#' The function performs several processing steps:
#'
#' \itemize{
#'   \item \strong{Language selection:} Automatically chooses the default
#'   language defined in the XLSForm, or falls back to English when multiple
#'   label columns are present.
#'   \item \strong{Value labels:} Generates Stata `label define` commands for
#'   all `select_one` choice lists and a binary label set for
#'   `select_multiple` variables.
#'   \item \strong{Repeat handling:} For variables inside repeat groups,
#'   Stata loops and regex matching are created so labels apply to all indexed
#'   copies (for example, `child_age_1`, `child_age_2`).
#'   \item \strong{Select-multiple expansion:} Produces conditional labeling
#'   logic for binary indicator variables derived from
#'   `select_multiple` questions.
#'   \item \strong{Label cleaning:} Removes HTML markup, escapes Stata-special
#'   characters, normalizes whitespace, and preserves SurveyCTO interpolation
#'   strings such as `${var}`.
#' }
#'
#' @return
#' A character vector containing the lines of the generated Stata `.do` file.
#' The value is returned invisibly.
#'
#' @family Form Management Functions
#'
#' @export
#'
#' @examples
#' \dontrun{
#' # Generate a Stata do-file and write it to disk
#' cto_form_dofile("household_survey", path = "labels.do")
#'
#' # Generate without writing to a file
#' cmds <- cto_form_dofile("household_survey")
#' }

cto_form_dofile <- function(form_id, path = NULL) {
  verbose <- get_verbose()

  if (!is.null(path)) {
    checkmate::assert_path_for_output(path, TRUE, "do")
  }

  fp <- cto_form_definition(form_id, dir = tempdir(), overwrite = TRUE)
  form <- list(
    survey = readxl::read_excel(fp, sheet = "survey"),
    choices = readxl::read_excel(fp, sheet = "choices"),
    settings = readxl::read_excel(fp, sheet = "settings")
  )

  if (!is.null(path)) {
    cli_progress_step(
      "Writing {.val {form_id}} Stata do-file to {.file {path}}"
    )
  } else {
    cli_progress_step("Writing {.val {form_id}} Stata do-file")
  }

  # --- 1. Header Generation ---
  ts <- format(Sys.time(), format = '%b %d, %Y at %H:%M %Z')
  t1 <- center_text(str_glue("{toupper(form_id)} VARIABLE AND VALUE LABELS"))
  t2 <- center_text(str_glue("Generated on {ts} by 'scto' Package in R"))

  header_content <- str_glue(
    strrep("*", 80),
    paste0("*", t1, "*"),
    paste0("*", t2, "*"),
    strrep("*", 80),
    "",
    "",
    .sep = "\n"
  )

  # --- 2. Label Column Detection ---
  find_label_col <- function(cols, choices_cols = FALSE) {
    matches <- cols[grepl("^label", cols, TRUE)]
    if (length(matches) == 0) {
      cli_abort("No column with label name found.")
    }

    if (length(matches) > 1) {
      default_lang <- form$settings$default_language
      dl <- if (
        is.null(default_lang) || is.na(default_lang) || default_lang == ""
      ) {
        "english"
      } else {
        default_lang
      }
      matches_lang <- matches[grepl(dl, matches, TRUE)]
      if (length(matches_lang) > 0) {
        return(matches_lang[1])
      }
      return(matches[1])
    }
    return(matches)
  }

  var_label_col <- find_label_col(names(form$survey))

  val_label_col <- if (var_label_col %in% names(form$choices)) {
    var_label_col
  } else {
    find_label_col(names(form$choices))
  }

  # --- 3. Process Survey & Extract List Names ---
  survey <- form$survey
  if (any(grepl("^disabled$", names(survey)))) {
    survey <- dplyr::filter(survey, !grepl("yes", .data$disabled, TRUE))
  }

  # Extract list names and types efficiently
  select_types <- survey |>
    dplyr::filter(grepl("select_", .data$type, TRUE)) |>
    mutate(
      type_clean = str_squish(.data$type),
      list_name = str_extract(.data$type_clean, "(?<= )\\S+"),
      is_multi = grepl("select_multiple", .data$type_clean, TRUE),
      .keep = "none"
    )

  valid_choices_s1 <- unique(select_types$list_name[!select_types$is_multi])
  valid_choices_sm <- unique(select_types$list_name[select_types$is_multi])

  # --- 4. Process Choices ---
  choices_all <- form$choices |>
    mutate(
      value = suppressWarnings(as.numeric(.data$value)),
      list_name = str_squish(.data$list_name),
      label_clean = .data[[val_label_col]] |>
        str_remove_all("<[^<>]*>") |>
        str_replace_all('"', "'") |>
        str_squish()
    ) |>
    dplyr::filter(
      !is.na(.data$value),
      !grepl("^\\$\\{.*\\}$", .data$label_clean)
      )

  # Generate 'label define' commands for select_one
  choice_sets_s1 <- choices_all |>
    dplyr::filter(.data$list_name %in% valid_choices_s1) |>
    dplyr::summarise(
      stata_cmd = paste0(
        'label define ',
        dplyr::first(.data$list_name),
        ' ',
        paste0(.data$value, ' "', .data$label_clean, '"', collapse = " "),
        ', modify'
      ),
      .by = "list_name"
    )

  # Prepare lookup for select_multiple (using a list for fast access)
  multi_lookup <- choices_all |>
    dplyr::filter(.data$list_name %in% valid_choices_sm) |>
    select("list_name", "value", "label_clean") |>
    mutate(
      value = ifelse(
        .data$value < 0,
        paste0("_", abs(.data$value)),
        as.character(.data$value)
      )
    ) |>
    dplyr::group_split(.data$list_name)

  names(multi_lookup) <- sort(unique(choices_all$list_name[
    choices_all$list_name %in% valid_choices_sm
  ]))

  # --- 5. Process Variables ---

  # Pre-clean survey data
  var_labels <- survey |>
    mutate(
      orig_order = dplyr::row_number(),
      type = str_squish(str_replace_all(.data$type, "\\\n", " ")),
      name = str_squish(str_replace_all(.data$name, "\\\n", " ")),

      # Calculate repeat levels
      repeat_level = purrr::accumulate(
        .data$type,
        .init = 0,
        .f = function(i, x) {
          if (grepl("begin repeat", x, TRUE)) {
            i + 1
          } else if (grepl("end repeat", x, TRUE)) {
            i - 1
          } else {
            i
          }
        }
      )[-1],

      is_repeat = .data$repeat_level > 0,
      is_slt_multi = grepl("^select_multiple", .data$type, TRUE),
      list_name_raw = str_extract(.data$type, "(?<= )\\S+"),
      list_name = ifelse(
        .data$is_slt_multi,
        "slt_multi_binary",
        .data$list_name_raw
      ),
      list_multi = ifelse(
        .data$is_slt_multi,
        .data$list_name_raw,
        NA_character_
      ),
      is_null_fields = grepl(
        "^note|^begin group|^end group|^begin repeat|^end repeat",
        .data$type,
        TRUE
      )
    ) |>
    dplyr::filter(!.data$is_null_fields, !is.na(.data[[var_label_col]])) |>
    # Generate regex only for relevant rows later if possible, but structure implies we need it here
    mutate(
      regex_varname = purrr::pmap_chr(
        list(.data$name, .data$repeat_level, .data$is_slt_multi),
        gen_regex_varname
      ),

      # Efficient cleaning of the label column
      cleaned_label = .data[[var_label_col]] |>
        str_remove_all("<[^<>]*>") |>
        str_replace_all(stringr::fixed("${"), "\\${") |>
        str_replace_all('"', "'") |>
        str_replace_all("\\\n", " ") |>
        str_squish(),

      var_label = stringr::str_trunc(
        #stringr::str_remove(.data$cleaned_label, paste0(.data$name, "(\\W+)?")),
        .data$cleaned_label,
        80
      ),
      var_note = .data$cleaned_label,
      has_list = !is.na(.data$list_name) & !.data$is_slt_multi
    ) |>
    dplyr::filter(.data$var_label != "" & !is.na(.data$var_label))

  # --- SPLIT-APPLY-COMBINE STRATEGY ---

  # 1. Complex Select Multiple Logic
  vars_multi <- var_labels |>
    dplyr::filter(.data$is_slt_multi) |>
    mutate(
      stata_cmd = purrr::pmap_chr(
        list(
          .data$regex_varname,
          .data$list_multi,
          .data$var_label,
          .data$var_note,
          .data$name
        ),
        function(n, l, v, vn, ln) {
          base_cmd <- str_c(
            "cap {\n",
            "\tunab vars : ",
            ln,
            "*\n",
            "\tforeach var of local vars {\n",
            "\t\tif regexm(\"`var'\", \"",
            n,
            "\") {\n",
            "\t\t\tcap label variable `var' \"",
            v,
            "\"\n",
            "\t\t\tcap note `var': \"",
            vn,
            "\"\n",
            "\t\t\tcap label values `var' slt_multi_binary\n"
          )

          choice_cmds <- ""
          if (!is.na(l) && l %in% names(multi_lookup)) {
            choices <- multi_lookup[[l]]
            patterns <- stringr::str_replace(
              n,
              stringr::fixed("*[0-9]+"),
              choices$value
            )
            full_labels <- ifelse(
              !is.na(choices$label_clean) & choices$label_clean != "",
              paste0(choices$label_clean, " - ", v),
              v
            )
            #full_labels <- paste0(choices$label_clean, " - ", v)
            full_labels <- ifelse(
              nchar(full_labels) > 80,
              stringr::str_trunc(full_labels, 80),
              full_labels
            )

            choice_cmds <- paste0(
              "\t\tif regexm(\"`var'\", \"",
              patterns,
              "\") {\n",
              "\t\t\tcap label variable `var' \"",
              full_labels,
              "\"\n",
              "\t\t\t}\n",
              collapse = ""
            )
          } else {
            choice_cmds <- paste0("\t\tcap label variable `var' \"", v, "\"\n")
          }

          str_c(base_cmd, choice_cmds, "\t\t}\n", "\t}\n", "}")
        }
      )
    )

  # 2. Logic for Repeats (Non-Multi)
  vars_repeat <- var_labels |>
    dplyr::filter(!.data$is_slt_multi & .data$is_repeat) |>
    mutate(
      stata_cmd = str_c(
        "cap {\n",
        "\tunab vars : ",
        .data$name,
        "*\n",
        "\tforeach var of local vars {\n",
        "\t\tif regexm(\"`var'\", \"",
        .data$regex_varname,
        "\") {\n",
        "\t\t\tcap label variable `var' \"",
        .data$var_label,
        "\"\n",
        "\t\t\tcap note `var': \"",
        .data$var_note,
        "\"\n",
        ifelse(
          .data$has_list,
          paste0("\t\t\tcap label values `var' ", .data$list_name, "\n"),
          ""
        ),
        "\t\t}\n",
        "\t}\n",
        "}"
      )
    )

  # 3. Simple Variables (Non-Multi, Non-Repeat)
  vars_simple <- var_labels |>
    dplyr::filter(!.data$is_slt_multi & !.data$is_repeat) |>
    mutate(
      stata_cmd = str_c(
        "cap label variable ",
        .data$name,
        " \"",
        .data$var_label,
        "\"\n",
        "cap note variable ",
        .data$name,
        " \"",
        .data$var_note,
        "\"",
        ifelse(
          .data$has_list,
          paste0("\ncap label values ", .data$name, " ", .data$list_name),
          ""
        )
      )
    )

  # Combine and Restore Order
  labels_set <- dplyr::bind_rows(vars_multi, vars_repeat, vars_simple) |>
    dplyr::arrange(.data$orig_order)

  # --- 6. Final File Assembly ---

  do_file_content <- c(
    header_content,
    paste0("*", center_text(" VALUE LABELS ", "-"), "*"),
    "",
    "label define slt_multi_binary 1 \"Yes\" 0 \"No\", modify",
    choice_sets_s1[["stata_cmd"]],
    "",
    "",
    paste0("*", center_text(" VARIABLE LABELS ", "-"), "*"),
    "",
    labels_set[["stata_cmd"]],
    "",
    "",
    paste0("*", center_text(" DEFAULT FIELDS ", "-"), "*"),
    "",
    c(
      'cap replace KEY = instanceID if KEY==""',
      'cap drop instanceID',
      'cap label variable KEY "Unique submission ID"',
      'cap label variable SubmissionDate "Date/time submitted"',
      'cap label variable CompletionDate "Date/time review completed"',
      'cap label variable formdef_version "Form version used on device"',
      'cap label variable review_status "Review status"',
      'cap label variable review_comments "Comments made during review"',
      'cap label variable review_corrections "Corrections made during review"'
    ),
    "",
    "",
    paste0("*", center_text(" THE END! ", "-"), "*")
  )

  # Incase of missing value labels for binaries
  #do_file_content <- sub('(") - ', '\\1', do_file_content)

  if (!is.null(path)) {
    writeLines(do_file_content, path)
  }
  return(invisible(do_file_content))
}

Try the ctoclient package in your browser

Any scripts or data that you put into this service are public.

ctoclient documentation built on March 28, 2026, 5:07 p.m.