Nothing
#' Subset and optionally harmonize surveys
#'
#' @description
#' Subset one or more surveys by retaining a specified set of variables.
#' Subsetting can be performed either on surveys already loaded in memory
#' or directly from survey files on disk.
#'
#' If a crosswalk table is supplied, variables are selected based on the
#' variables listed for each survey in the crosswalk, and variable names
#' can optionally be harmonized using `var_name_target`.
#'
#' This function replaces the deprecated helpers
#' [subset_waves()] and [subset_save_surveys()].
#'
#' @details
#' The function supports multiple workflows:
#'
#' * **In-memory subsetting** using `survey_list`
#' * **File-based subsetting** using `survey_paths` or `import_path`
#' * **Crosswalk-driven subsetting**, where variables are selected
#' per survey using a crosswalk table created by
#' [crosswalk_table_create()]
#'
#' If `export_path` is provided, subsetted surveys are written to disk
#' as `.rds` files. Otherwise, subsetted surveys are returned in memory.
#'
#' @param survey_list A list of survey objects created by
#' [read_surveys()]. If `NULL`, surveys are read from disk.
#'
#' @param survey_paths A character vector of full file paths to survey files.
#' Used when `survey_list` is `NULL`.
#'
#' @param rowid Name of the unique observation identifier column.
#' Defaults to `"rowid"`.
#'
#' @param subset_name Character string appended to filenames of
#' subsetted surveys. Defaults to `"subset"`.
#'
#' @param subset_vars Character vector of variable names to retain.
#' If `NULL`, all variables are retained.
#'
#' @param crosswalk_table Optional crosswalk table created with
#' [crosswalk_table_create()]. If supplied, variables are selected
#' per survey based on `var_name_orig`, and variable names may be
#' harmonized using `var_name_target`.
#'
#' @param import_path Optional directory containing survey files.
#' Used to resolve filenames when subsetting from disk.
#'
#' @param export_path Optional directory where subsetted surveys
#' are saved as `.rds` files. If `NULL`, surveys are returned in memory.
#'
#' @return
#' Either:
#' * a list of subsetted survey objects (if `export_path = NULL`), or
#' * a character vector of filenames written to `export_path`.
#'
#' @family subsetting functions
#'
#' @importFrom dplyr select any_of
#'
#' @seealso
#' [crosswalk_table_create()],
#' [harmonize_survey_variables()],
#' [read_surveys()]
#'
#' @examples
#' examples_dir <- system.file("examples", package = "retroharmonize")
#' survey_files <- dir(examples_dir, pattern = "\\.rds$")
#'
#' surveys <- read_surveys(
#' file.path(examples_dir, survey_files),
#' export_path = NULL
#' )
#'
#' subset_surveys(
#' survey_list = surveys,
#' subset_vars = c("rowid", "isocntry", "qa10_1", "qa14_1"),
#' subset_name = "example_subset"
#' )
#'
#' @export
subset_surveys <- function(survey_list,
survey_paths = NULL,
rowid = "rowid",
subset_name = "subset",
subset_vars = NULL,
crosswalk_table = NULL,
import_path = NULL,
export_path = NULL) {
if (!is.null(survey_list)) {
validate_survey_list(survey_list)
subset_from_files <- FALSE
files_to_subset <- NULL
if (!"list" %in% class(survey_list) & is.survey(survey_list)) {
survey_list <- list(i = survey_list)
names(survey_list)[1] <- attr(survey_list[[1]], "id")
}
} else {
subset_from_files <- TRUE
}
if (!is.null(survey_paths)) {
validate_survey_files(survey_paths)
}
if (!is.null(crosswalk_table)) {
is.crosswalk_table(crosswalk_table)
if (!is.null(survey_paths)) {
files_to_subset <- survey_paths[fs::path_file(survey_paths) %in% fs::path_file(crosswalk_table$filename)]
} else if (!is.null(import_path)) {
files_to_subset <- dir(import_path)[dir(import_path) %in% fs::path_file(crosswalk_table$filename)]
} else {
files_to_subset <- unique(crosswalk_table$filename)
}
}
if (!is.null(import_path)) {
assert_that(fs::dir_exists(import_path) == TRUE,
msg = " in subset_surveys: the 'import_path' is not a valid path to a directory."
)
files_to_subset <- dir(import_path)[dir(import_path) %in% fs::path_file(crosswalk_table$filename)]
files_to_subset <- file.path(import_path, files_to_subset)
}
if (!is.null(export_path)) {
assert_that(fs::dir_exists(export_path) == TRUE,
msg = " in subset_surveys: the 'export_path' is not a valid path to a directory."
)
}
if (subset_from_files) {
## Subsetting from files ------------------------------------------------------------------
if (length(files_to_subset) == 0) {
message("subset_surveys(): No files to subset.")
return(NULL)
}
### Subsetting with crosswalk table ----------------------
if (!is.null(crosswalk_table)) {
get_survey <- function(x) {
this_path <- files_to_subset[x]
this_id <- crosswalk_table %>%
filter(
filename == fs::path_file(this_path)
) %>%
dplyr::distinct(id) %>%
unlist() %>%
as.character()
subset_vars <- crosswalk_table %>%
filter(id == this_id) %>%
select(var_name_orig) %>%
unlist() %>%
as.character() %>%
unique()
subset_survey_file(
file_path = this_path,
subset_vars = subset_vars,
id = this_id,
export_path = export_path
)
}
if (!is.null(export_path)) {
saved_file_names <- lapply(seq_along(files_to_subset),
function(x) get_survey(x))
return(unlist(saved_file_names))
}
return_list <- lapply(seq_along(files_to_subset),
function(x) get_survey(x))
} else {
## Subsetting without crosswalk table -----------------
get_survey_no_ctable <- function(x) {
this_path <- files_to_subset[x]
### issue
this_id <- NULL
subset_survey_file(
file_path = this_path,
subset_vars = subset_vars,
id = this_id,
export_path = export_path
)
}
if (!is.null(export_path)) {
saved_file_names <- lapply(seq_along(files_to_subset), function(x) get_survey_no_ctable(x))
return(unlist(saved_file_names))
}
return_list <- lapply(seq_along(files_to_subset), function(x) get_survey_no_ctable(x))
}
} else {
### Subsetting from memory, with cross_table ---------------------------------------------
if (!is.null(crosswalk_table)) {
available_surveys <- vapply(survey_list,
function(x) attr(x, "id"),
character(1)
)
surveys_in_ctable <- unique(
crosswalk_table$id)[unique(crosswalk_table$id) %in%
available_surveys]
s <- which(surveys_in_ctable == available_surveys)
get_survey_memory <- function(x) {
this_survey <- survey_list[[x]]
subset_vars <- crosswalk_table %>%
filter(id == attr(this_survey, "id")) %>%
select(var_name_orig) %>%
unlist() %>%
as.character() %>%
unique()
subset_survey_memory(
this_survey = this_survey,
subset_vars = subset_vars,
subset_name = subset_name,
export_path = export_path
)
}
if (!is.null(export_path)) {
saved_file_names <- lapply(seq_along(s),
function(x) get_survey_memory(x)
)
return(unlist(saved_file_names))
} else {
return_list <- lapply(seq_along(s),
function(x)
get_survey_memory(x)
)
}
} else {
### Subsetting from memory, without cross_table ------------------------------
get_survey_no_ctable_memory <- function(x) {
subset_survey_memory(
this_survey = survey_list[[x]],
subset_vars = subset_vars,
subset_name = subset_name,
export_path = export_path
)
}
if (!is.null(export_path)) {
saved_file_names <- lapply(
seq_along(survey_list),
function(x) get_survey_no_ctable_memory(x))
return(unlist(saved_file_names))
} else {
return_list <- lapply(seq_along(survey_list),
function(x) get_survey_no_ctable_memory(x)
)
}
}
}
return_list
}
#' @rdname subset_surveys
#' @param waves A list of surveys imported with [read_surveys()].
#' @export
subset_waves <- function(waves, subset_vars = NULL) {
.Deprecated(new = "subset_surveys", msg = "subset_waves is deprecated, use subset_surveys instead.")
subset_surveys(survey_list = waves, subset_vars = subset_vars)
}
#' @rdname subset_surveys
#' @export
subset_save_surveys <- function(crosswalk_table,
subset_name = "subset",
survey_list = NULL,
subset_vars = NULL,
survey_paths = NULL,
import_path = NULL,
export_path = NULL) {
.Deprecated(
new = "subset_surveys",
msg = "subset_save_surveys is deprecated, use subset_surveys instead."
)
subset_surveys(
crosswalk_table = crosswalk_table,
subset_name = subset_name,
survey_list = survey_list,
subset_vars = subset_vars,
import_path = import_path,
export_path = export_path
)
}
#' @title Subset surveys from files
#' @inheritParams subset_surveys
#' @param file_path A single survey files.
#' @keywords internal
subset_survey_file <- function(file_path,
subset_vars,
subset_name = "subset",
id = NULL,
export_path = NULL) {
subset_vars <- unique(c("rowid", subset_vars))
survey_file_ext <- fs::path_ext(file_path)
if (is.null(id)) id <- fs::path_ext_remove(fs::path_file(file_path))
if (survey_file_ext %in% c("sav", "por")) {
this_survey <- read_spss(file_path, id = id)
} else if (survey_file_ext == "rds") {
this_survey <- read_rds(file = file_path, id = id)
} else if (survey_file_ext == "dta") {
this_survey <- read_dta(file = file_path, id = id)
} else if (survey_file_ext == "csv") {
this_survey <- read_csv(file = file_path, id = id)
} else {
return(NULL)
}
subset_survey_memory(this_survey,
subset_vars = subset_vars,
subset_name = subset_name,
export_path = export_path
)
}
#' @title Subset surveys in memory
#' @inheritParams subset_surveys
#' @importFrom tibble as_tibble
#' @importFrom fs path_ext path_ext_remove
#' @keywords internal
subset_survey_memory <- function(this_survey,
subset_vars,
subset_name = "subset",
export_path = NULL) {
subset_vars <- unique(c("rowid", subset_vars))
subset_survey <- this_survey %>%
select(any_of(subset_vars))
attr(subset_survey, "subset_size") <- as.numeric(object.size(as_tibble(subset_survey)))
this_file_name <- paste0(
fs::path_ext_remove(attr(subset_survey, "filename")),
"_", subset_name, ".",
fs::path_ext(attr(subset_survey, "filename"))
)
attr(subset_survey, "filename") <- this_file_name
if (!is.null(export_path)) {
save_file_name <- paste0(fs::path_ext_remove(fs::path_file(this_file_name)), ".rds")
message("Saving ", paste0(fs::path_ext_remove(fs::path_file(this_file_name)), ".rds"))
saveRDS(
object = subset_survey_memory(this_survey, subset_vars),
file = file.path(export_path, save_file_name),
version = 2
)
save_file_name
} else {
subset_survey
}
}
#' @rdname subset_surveys
#' @importfrom dplyr add_count
#' @examples
#' test_survey <- read_rds(
#' file = system.file("examples", "ZA7576.rds",
#' package = "retroharmonize"
#' )
#' )
#'
#' test_metadata <- metadata_create(test_survey)
#' test_metadata <- test_metadata[c(1, 7, 18), ]
#' ctable_2 <- crosswalk_table_create(test_metadata)
#' ctable_2$var_name_target <- ifelse(ctable$var_name_orig == "qa14_3",
#' "trust_ecb",
#' ctable$var_name_orig
#' )
#'
#' subset_save_surveys(
#' crosswalk_table = ctable_2,
#' subset_name = "tested",
#' survey_list = test_survey,
#' import_path = NULL
#' )
#' @export
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.