#' @title Subset surveys
#'
#' @description This is a wrapper function for various procedures to reduce the size of surveys
#' by removing variables that are not harmonized.
#'
#' @details This function allows several workflows.
#' Subsetting can be based on a vector of variable names
#' given by \code{survey_path}, or on the basis of a \code{crosstable}.
#' The \code{\link{subset_save_surveys}} can be called directly.
#'
#' \code{subset_surveys} will also harmonize the variable names if the \code{var_name_target} is
#' optionally defined in the \code{crosswalk_table} input.
#' \code{harmonize_survey_variables} is a wrapper and will require that the new (target) variable names are
#' present in a valid \code{crosstable}.
#'
#' @param crosswalk_table A crosswalk table created by \code{\link{crosswalk_table_create}} or a manually created
#' crosstable including at least
#' \code{filename}, \code{var_name_orig}, \code{var_name_target} and optionally
#' \code{var_label_orig} and \code{var_label_target}. This parameter is optional and
#' defaults to \code{NULL}.
#' @param survey_list A list of surveys imported with \code{\link{read_surveys}}. If set to
#' \code{NULL}, the \code{survey_path} should give full path to the surveys.
#' @param survey_paths A vector of full file paths to the surveys to subset.
#' @param rowid The unique row (observation) identifier in the files. Defaults to
#' \code{"rowid"}, which is the default of the importing functions in this package.
#' @param subset_name An identifier for the survey subset.
#' @param subset_vars The names of the variables that should be kept from all surveys in the list that contains the
#' wave of surveys. Defaults to \code{NULL} in which case it returns all variables without subsetting.
#' @importFrom dplyr select any_of
#' @importFrom utils object.size
#' @family subsetting function
#' @return A list of surveys or save individual rds files on the \code{export_path}.
#' @examples
#' examples_dir <- system.file("examples", package = "retroharmonize")
#' survey_list <- dir(examples_dir)[grepl("\\.rds", dir(examples_dir))]
#'
#' example_surveys <- read_surveys(
#' file.path( examples_dir, survey_list)
#' )
#'
#' subset_surveys(survey_list = example_surveys,
#' subset_vars = c("rowid", "isocntry", "qa10_1", "qa14_1"),
#' subset_name = "subset_example")
#' @export
subset_surveys <- function ( survey_list,
survey_paths = NULL,
rowid = "rowid",
subset_name = "subset",
subset_vars = NULL,
crosswalk_table = NULL,
import_path = NULL,
export_path = NULL) {
if ( !is.null(survey_list)) {
validate_survey_list(survey_list)
subset_from_files <- FALSE
files_to_subset <- NULL
if ( !"list" %in% class(survey_list) & is.survey(survey_list) ) {
survey_list <- list ( i = survey_list )
names(survey_list)[1] <- attr(survey_list[[1]], "id")
}
} else {
subset_from_files <- TRUE
}
if ( !is.null(survey_paths) ) {
validate_survey_files(survey_paths)
}
if ( !is.null(crosswalk_table)) {
is.crosswalk_table(crosswalk_table)
if (!is.null(survey_paths)) {
files_to_subset <- survey_paths [fs::path_file(survey_paths) %in% fs::path_file(crosswalk_table$filename)]
} else if ( !is.null(import_path)) {
files_to_subset <- dir(import_path) [dir(import_path) %in% fs::path_file(crosswalk_table$filename)]
} else {
files_to_subset <- unique(crosswalk_table$filename)
}
}
if ( !is.null(import_path)) {
assert_that(fs::dir_exists(import_path) == TRUE,
msg = " in subset_surveys: the 'import_path' is not a valid path to a directory.")
files_to_subset <- dir(import_path)[dir(import_path) %in% fs::path_file(crosswalk_table$filename)]
files_to_subset <- file.path(import_path, files_to_subset)
}
if ( !is.null(export_path)) {
assert_that(fs::dir_exists(export_path) == TRUE,
msg = " in subset_surveys: the 'export_path' is not a valid path to a directory.")
}
if ( subset_from_files ) {
## Subsetting from files ------------------------------------------------------------------
if (length(files_to_subset) ==0) {
message ("subset_surveys(): No files to subset.")
return(NULL)}
### Subsetting with crosswalk table ----------------------
if ( !is.null(crosswalk_table)) {
get_survey <- function(x) {
this_path <- files_to_subset[x]
this_id <- crosswalk_table %>% filter (
filename == fs::path_file(this_path)
) %>% distinct(id) %>%
unlist() %>%
as.character()
subset_vars <- crosswalk_table %>%
filter ( id == this_id ) %>%
select ( var_name_orig ) %>%
unlist() %>%
as.character() %>% unique()
subset_survey_file(
file_path = this_path,
subset_vars = subset_vars,
id = this_id,
export_path = export_path
)
}
if (!is.null(export_path)) {
saved_file_names <- lapply (seq_along(files_to_subset), function(x) get_survey(x))
return(unlist(saved_file_names))
}
return_list <- lapply (seq_along(files_to_subset), function(x) get_survey(x))
} else {
## Subsetting without crosswalk table -----------------
get_survey_no_ctable <- function(x) {
this_path <- files_to_subset[x]
### issue
this_id <- NULL
subset_survey_file(
file_path = this_path,
subset_vars = subset_vars,
id = this_id,
export_path = export_path
)
}
if (!is.null(export_path)) {
saved_file_names <- lapply (seq_along(files_to_subset), function(x) get_survey_no_ctable(x))
return(unlist(saved_file_names))
}
return_list <- lapply (seq_along(files_to_subset), function(x) get_survey_no_ctable(x))
}
} else {
### Subsetting from memory, with cross_table ---------------------------------------------
if (!is.null(crosswalk_table)) {
available_surveys <- vapply(survey_list, function(x) attr(x, "id"), character(1))
surveys_in_ctable <- unique(crosswalk_table$id)[unique(crosswalk_table$id) %in% available_surveys]
s <- which(surveys_in_ctable == available_surveys)
get_survey_memory <- function(x) {
this_survey <- survey_list[[x]]
subset_vars <- crosswalk_table %>%
filter ( id == attr(this_survey, "id")) %>%
select ( var_name_orig ) %>%
unlist() %>%
as.character() %>% unique()
subset_survey_memory(
this_survey = this_survey,
subset_vars = subset_vars,
subset_name = subset_name,
export_path = export_path
)
}
if (!is.null(export_path)) {
saved_file_names <- lapply (seq_along(s), function(x) get_survey_memory(x))
return(unlist(saved_file_names))
} else {
return_list <- lapply (seq_along(s), function(x) get_survey_memory(x))
}
} else {
### Subsetting from memory, without cross_table ------------------------------
get_survey_no_ctable_memory <- function(x) {
subset_survey_memory(
this_survey = survey_list[[x]],
subset_vars = subset_vars,
subset_name = subset_name,
export_path = export_path
)
}
if (!is.null(export_path)) {
saved_file_names <- lapply (seq_along(survey_list), function(x) get_survey_no_ctable_memory(x))
return(unlist(saved_file_names))
} else {
return_list <- lapply (seq_along(survey_list), function(x) get_survey_no_ctable_memory(x))
}
}
}
return_list
}
#' @rdname subset_surveys
#' @param waves A list of surveys imported with \code{\link{read_surveys}}.
#' @export
subset_waves <- function( waves, subset_vars = NULL) {
.Deprecated(new = "subset_surveys", msg = "subset_waves is deprecated, use subset_surveys instead.")
subset_surveys ( survey_list = waves, subset_vars = subset_vars )
}
#' @rdname subset_surveys
#' @export
subset_save_surveys <- function ( crosswalk_table,
subset_name = "subset",
survey_list = NULL,
subset_vars = NULL,
survey_paths = NULL,
import_path = NULL,
export_path = NULL) {
.Deprecated(new = "subset_surveys",
msg = "subset_save_surveys is deprecated, use subset_surveys instead.")
subset_surveys (
crosswalk_table = crosswalk_table,
subset_name = subset_name,
survey_list = survey_list,
subset_vars = subset_vars,
import_path = import_path,
export_path = export_path)
}
#' @title Subset surveys from files
#' @inheritParams subset_surveys
#' @param file_path A single survey files.
#' @keywords internal
subset_survey_file <- function( file_path,
subset_vars,
subset_name = 'subset',
id = NULL,
export_path = NULL ) {
subset_vars <- unique(c("rowid",subset_vars))
survey_file_ext <- fs::path_ext(file_path)
if ( is.null(id)) id <- fs::path_ext_remove(fs::path_file(file_path))
if ( survey_file_ext %in% c("sav", "por")) {
this_survey <- read_spss(file_path, id = id)
} else if (survey_file_ext == "rds") {
this_survey <- read_rds(file = file_path, id = id)
} else if ( survey_file_ext == "dta") {
this_survey <- read_dta(file = file_path, id = id )
} else if ( survey_file_ext == "csv") {
this_survey <- read_csv(file = file_path, id = id )
} else {
return(NULL)
}
subset_survey_memory (this_survey,
subset_vars = subset_vars,
subset_name = subset_name,
export_path = export_path)
}
#' @title Subset surveys in memory
#' @inheritParams subset_surveys
#' @importFrom tibble as_tibble
#' @keywords internal
subset_survey_memory <- function(this_survey,
subset_vars,
subset_name = 'subset',
export_path = NULL ) {
subset_vars <- unique(c("rowid", subset_vars))
subset_survey <- this_survey %>%
select (any_of(subset_vars))
attr(subset_survey, "subset_size") <- as.numeric(object.size(as_tibble(subset_survey)))
this_file_name <- paste0(
fs::path_ext_remove(attr(subset_survey, "filename") ),
"_", subset_name, ".",
fs::path_ext(attr(subset_survey, "filename")))
attr(subset_survey, "filename") <- this_file_name
if(!is.null(export_path)) {
save_file_name <- paste0(fs::path_ext_remove(fs::path_file(this_file_name)), ".rds")
message("Saving ", paste0(fs::path_ext_remove(fs::path_file(this_file_name)), ".rds"))
saveRDS(object = subset_survey_memory (this_survey, subset_vars),
file = file.path(export_path, save_file_name),
version = 2)
save_file_name
} else {
subset_survey
}
}
#' @rdname subset_surveys
#' @importfrom dplyr add_count
#' @examples
#' test_survey <- read_rds (
#' file = system.file("examples", "ZA7576.rds",
#' package = "retroharmonize")
#' )
#'
#' test_metadata <- metadata_create ( test_survey )
#' test_metadata <- test_metadata[c(1,7,18),]
#' ctable_2 <- crosswalk_table_create(test_metadata)
#' ctable_2$var_name_target <- ifelse(ctable$var_name_orig == "qa14_3",
#' "trust_ecb",
#' ctable$var_name_orig)
#'
#' subset_save_surveys ( crosswalk_table = ctable_2,
#' subset_name = "tested",
#' survey_list = test_survey,
#' import_path = NULL
#' )
#' @export
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.