R/subset_surveys.R

Defines functions subset_survey_memory subset_survey_file subset_save_surveys subset_waves subset_surveys

Documented in subset_save_surveys subset_survey_file subset_survey_memory subset_surveys subset_waves

#' Subset and optionally harmonize surveys
#'
#' @description
#' Subset one or more surveys by retaining a specified set of variables.
#' Subsetting can be performed either on surveys already loaded in memory
#' or directly from survey files on disk.
#'
#' If a crosswalk table is supplied, variables are selected based on the
#' variables listed for each survey in the crosswalk, and variable names
#' can optionally be harmonized using `var_name_target`.
#'
#' This function replaces the deprecated helpers
#' [subset_waves()] and [subset_save_surveys()].
#'
#' @details
#' The function supports multiple workflows:
#'
#' * **In-memory subsetting** using `survey_list`
#' * **File-based subsetting** using `survey_paths` or `import_path`
#' * **Crosswalk-driven subsetting**, where variables are selected
#'   per survey using a crosswalk table created by
#'   [crosswalk_table_create()]
#'
#' If `export_path` is provided, subsetted surveys are written to disk
#' as `.rds` files. Otherwise, subsetted surveys are returned in memory.
#'
#' @param survey_list A list of survey objects created by
#'   [read_surveys()]. If `NULL`, surveys are read from disk.
#'
#' @param survey_paths A character vector of full file paths to survey files.
#'   Used when `survey_list` is `NULL`.
#'
#' @param rowid Name of the unique observation identifier column.
#'   Defaults to `"rowid"`.
#'
#' @param subset_name Character string appended to filenames of
#'   subsetted surveys. Defaults to `"subset"`.
#'
#' @param subset_vars Character vector of variable names to retain.
#'   If `NULL`, all variables are retained.
#'
#' @param crosswalk_table Optional crosswalk table created with
#'   [crosswalk_table_create()]. If supplied, variables are selected
#'   per survey based on `var_name_orig`, and variable names may be
#'   harmonized using `var_name_target`.
#'
#' @param import_path Optional directory containing survey files.
#'   Used to resolve filenames when subsetting from disk.
#'
#' @param export_path Optional directory where subsetted surveys
#'   are saved as `.rds` files. If `NULL`, surveys are returned in memory.
#'
#' @return
#' Either:
#' * a list of subsetted survey objects (if `export_path = NULL`), or
#' * a character vector of filenames written to `export_path`.
#'
#' @family subsetting functions
#' 
#' @importFrom dplyr select any_of
#'
#' @seealso
#' [crosswalk_table_create()],
#' [harmonize_survey_variables()],
#' [read_surveys()]
#'
#' @examples
#' examples_dir <- system.file("examples", package = "retroharmonize")
#' survey_files <- dir(examples_dir, pattern = "\\.rds$")
#'
#' surveys <- read_surveys(
#'   file.path(examples_dir, survey_files),
#'   export_path = NULL
#' )
#'
#' subset_surveys(
#'   survey_list = surveys,
#'   subset_vars = c("rowid", "isocntry", "qa10_1", "qa14_1"),
#'   subset_name = "example_subset"
#' )
#'
#' @export

subset_surveys <- function(survey_list,
                           survey_paths = NULL,
                           rowid = "rowid",
                           subset_name = "subset",
                           subset_vars = NULL,
                           crosswalk_table = NULL,
                           import_path = NULL,
                           export_path = NULL) {
  if (!is.null(survey_list)) {
    validate_survey_list(survey_list)
    subset_from_files <- FALSE
    files_to_subset <- NULL

    if (!"list" %in% class(survey_list) & is.survey(survey_list)) {
      survey_list <- list(i = survey_list)
      names(survey_list)[1] <- attr(survey_list[[1]], "id")
    }
  } else {
    subset_from_files <- TRUE
  }
  if (!is.null(survey_paths)) {
    validate_survey_files(survey_paths)
  }

  if (!is.null(crosswalk_table)) {
    is.crosswalk_table(crosswalk_table)
    if (!is.null(survey_paths)) {
      files_to_subset <- survey_paths[fs::path_file(survey_paths) %in% fs::path_file(crosswalk_table$filename)]
    } else if (!is.null(import_path)) {
      files_to_subset <- dir(import_path)[dir(import_path) %in% fs::path_file(crosswalk_table$filename)]
    } else {
      files_to_subset <- unique(crosswalk_table$filename)
    }
  }
  if (!is.null(import_path)) {
    assert_that(fs::dir_exists(import_path) == TRUE,
      msg = " in subset_surveys: the 'import_path' is not a valid path to a directory."
    )

    files_to_subset <- dir(import_path)[dir(import_path) %in% fs::path_file(crosswalk_table$filename)]
    files_to_subset <- file.path(import_path, files_to_subset)
  }

  if (!is.null(export_path)) {
    assert_that(fs::dir_exists(export_path) == TRUE,
      msg = " in subset_surveys: the 'export_path' is not a valid path to a directory."
    )
  }

  if (subset_from_files) {
    ## Subsetting from files ------------------------------------------------------------------
    if (length(files_to_subset) == 0) {
      message("subset_surveys(): No files to subset.")
      return(NULL)
    }

    ### Subsetting with crosswalk table ----------------------
    if (!is.null(crosswalk_table)) {
      get_survey <- function(x) {
        this_path <- files_to_subset[x]

        this_id <- crosswalk_table %>%
          filter(
            filename == fs::path_file(this_path)
          ) %>%
          dplyr::distinct(id) %>%
          unlist() %>%
          as.character()

        subset_vars <- crosswalk_table %>%
          filter(id == this_id) %>%
          select(var_name_orig) %>%
          unlist() %>%
          as.character() %>%
          unique()

        subset_survey_file(
          file_path = this_path,
          subset_vars = subset_vars,
          id = this_id,
          export_path = export_path
        )
      }

      if (!is.null(export_path)) {
        saved_file_names <- lapply(seq_along(files_to_subset), 
                                   function(x) get_survey(x))
        return(unlist(saved_file_names))
      }
      return_list <- lapply(seq_along(files_to_subset), 
                            function(x) get_survey(x))
    } else {
      ## Subsetting without crosswalk table -----------------

      get_survey_no_ctable <- function(x) {
        this_path <- files_to_subset[x]

        ### issue
        this_id <- NULL

        subset_survey_file(
          file_path = this_path,
          subset_vars = subset_vars,
          id = this_id,
          export_path = export_path
        )
      }

      if (!is.null(export_path)) {
        saved_file_names <- lapply(seq_along(files_to_subset), function(x) get_survey_no_ctable(x))
        return(unlist(saved_file_names))
      }
      return_list <- lapply(seq_along(files_to_subset), function(x) get_survey_no_ctable(x))
    }
  } else {
    ### Subsetting from memory, with cross_table ---------------------------------------------

    if (!is.null(crosswalk_table)) {
      available_surveys <- vapply(survey_list, 
                                  function(x) attr(x, "id"), 
                                  character(1)
                                  )
      
      surveys_in_ctable <- unique(
        crosswalk_table$id)[unique(crosswalk_table$id) %in%
                              available_surveys]

      s <- which(surveys_in_ctable == available_surveys)

      get_survey_memory <- function(x) {
        this_survey <- survey_list[[x]]

        subset_vars <- crosswalk_table %>%
          filter(id == attr(this_survey, "id")) %>%
          select(var_name_orig) %>%
          unlist() %>%
          as.character() %>%
          unique()

        subset_survey_memory(
          this_survey = this_survey,
          subset_vars = subset_vars,
          subset_name = subset_name,
          export_path = export_path
        )
      }

      if (!is.null(export_path)) {
        saved_file_names <- lapply(seq_along(s), 
                                   function(x) get_survey_memory(x)
                                   
                                   )
        return(unlist(saved_file_names))
      } else {
        return_list <- lapply(seq_along(s),
                              function(x) 
                                get_survey_memory(x)
                              )
      }
    } else {
      ### Subsetting from memory, without cross_table ------------------------------
      get_survey_no_ctable_memory <- function(x) {
        subset_survey_memory(
          this_survey = survey_list[[x]],
          subset_vars = subset_vars,
          subset_name = subset_name,
          export_path = export_path
        )
      }

      if (!is.null(export_path)) {
        saved_file_names <- lapply(
          seq_along(survey_list), 
          function(x) get_survey_no_ctable_memory(x))
        return(unlist(saved_file_names))
      } else {
        return_list <- lapply(seq_along(survey_list), 
                              function(x) get_survey_no_ctable_memory(x)
                              )
      }
    }
  }
  return_list
}

#' @rdname subset_surveys
#' @param waves A list of surveys imported with [read_surveys()].
#' @export
subset_waves <- function(waves, subset_vars = NULL) {
  .Deprecated(new = "subset_surveys", msg = "subset_waves is deprecated, use subset_surveys instead.")
  subset_surveys(survey_list = waves, subset_vars = subset_vars)
}


#' @rdname subset_surveys
#' @export
subset_save_surveys <- function(crosswalk_table,
                                subset_name = "subset",
                                survey_list = NULL,
                                subset_vars = NULL,
                                survey_paths = NULL,
                                import_path = NULL,
                                export_path = NULL) {
  .Deprecated(
    new = "subset_surveys",
    msg = "subset_save_surveys is deprecated, use subset_surveys instead."
  )
  subset_surveys(
    crosswalk_table = crosswalk_table,
    subset_name = subset_name,
    survey_list = survey_list,
    subset_vars = subset_vars,
    import_path = import_path,
    export_path = export_path
  )
}

#' @title Subset surveys from files
#' @inheritParams subset_surveys
#' @param file_path A single survey files.
#' @keywords internal
subset_survey_file <- function(file_path,
                               subset_vars,
                               subset_name = "subset",
                               id = NULL,
                               export_path = NULL) {
  subset_vars <- unique(c("rowid", subset_vars))

  survey_file_ext <- fs::path_ext(file_path)
  if (is.null(id)) id <- fs::path_ext_remove(fs::path_file(file_path))

  if (survey_file_ext %in% c("sav", "por")) {
    this_survey <- read_spss(file_path, id = id)
  } else if (survey_file_ext == "rds") {
    this_survey <- read_rds(file = file_path, id = id)
  } else if (survey_file_ext == "dta") {
    this_survey <- read_dta(file = file_path, id = id)
  } else if (survey_file_ext == "csv") {
    this_survey <- read_csv(file = file_path, id = id)
  } else {
    return(NULL)
  }
  subset_survey_memory(this_survey,
    subset_vars = subset_vars,
    subset_name = subset_name,
    export_path = export_path
  )
}

#' @title Subset surveys in memory
#' @inheritParams subset_surveys
#' @importFrom tibble as_tibble
#' @importFrom fs path_ext path_ext_remove
#' @keywords internal
subset_survey_memory <- function(this_survey,
                                 subset_vars,
                                 subset_name = "subset",
                                 export_path = NULL) {
  subset_vars <- unique(c("rowid", subset_vars))

  subset_survey <- this_survey %>%
    select(any_of(subset_vars))

  attr(subset_survey, "subset_size") <- as.numeric(object.size(as_tibble(subset_survey)))

  this_file_name <- paste0(
    fs::path_ext_remove(attr(subset_survey, "filename")),
    "_", subset_name, ".",
    fs::path_ext(attr(subset_survey, "filename"))
  )

  attr(subset_survey, "filename") <- this_file_name

  if (!is.null(export_path)) {
    save_file_name <- paste0(fs::path_ext_remove(fs::path_file(this_file_name)), ".rds")
    message("Saving ", paste0(fs::path_ext_remove(fs::path_file(this_file_name)), ".rds"))
    saveRDS(
      object = subset_survey_memory(this_survey, subset_vars),
      file = file.path(export_path, save_file_name),
      version = 2
    )
    save_file_name
  } else {
    subset_survey
  }
}


#' @rdname subset_surveys
#' @importfrom dplyr add_count
#' @examples
#' test_survey <- read_rds(
#'   file = system.file("examples", "ZA7576.rds",
#'     package = "retroharmonize"
#'   )
#' )
#'
#' test_metadata <- metadata_create(test_survey)
#' test_metadata <- test_metadata[c(1, 7, 18), ]
#' ctable_2 <- crosswalk_table_create(test_metadata)
#' ctable_2$var_name_target <- ifelse(ctable$var_name_orig == "qa14_3",
#'   "trust_ecb",
#'   ctable$var_name_orig
#' )
#'
#' subset_save_surveys(
#'   crosswalk_table = ctable_2,
#'   subset_name = "tested",
#'   survey_list = test_survey,
#'   import_path = NULL
#' )
#' @export

Try the retroharmonize package in your browser

Any scripts or data that you put into this service are public.

retroharmonize documentation built on Jan. 14, 2026, 9:08 a.m.