Nothing
#' Merge Sampled Data based on IDs
#'
#' In an object of class \code{"Mutiwave"}, \code{merge_samples} creates
#' a dataframe in the \code{"data"} slot of the specified wave by merging
#' the dataframe in the \code{"sampled data"} slot with the dataframe in
#' the \code{"data"} slot of the previous wave.
#'
#'
#' Columns in \code{"sampled_data"} that do not match names of the
#' \code{"data"} from the previous wave will be added as new columns in
#' the output dataframe. All ids that do not appear in
#' \code{"sampled_data"} will receive NA values for these new variables.
#'
#' @param x an object of class \code{"Multiwave"}.
#' @param phase A numeric value specifying the phase of the
#' Multiwave object that
#' the specified wave is in. Cannot be phase 1.
#' @param wave A numeric value specifying the wave of the Multiwave
#' object that the merge should be
#' performed in. This wave must have a valid dataframe in the
#' \code{"sampled data"} slot. The previous wave, taken as the final
#' wave of the previous phase if \code{wave} = 1, must have a valid
#' dataframe in the \code{"data"} slot.
#' @param id A character value specifying the name of the column holding unit
#' ids. Taken from wave, phase, or overall metadata (searched for in that
#' order) if \code{NULL}. Defaults to \code{NULL}.
#' @param phase_sample_ind a character value specifying the name of the column
#' that should hold the indicator of whether each unit has already been sampled
#' in the current \emph{phase}. The specified phase number will be appended
#' to the end of the given character name. Defaults to "sampled_phase".
#' @param wave_sample_ind a character value specifying the name of the column
#' that should hold the indicator of whether each unit has already been sampled
#' in the current \emph{wave}. The specified phase and wave numbers separated
#' by "." will be appended o the end of the given character name.
#' If FALSE, no such column is created. Defaults to "sampled_wave".
#' @param include_probs A logical value. If TRUE, looks for "probs" in
#' the \code{design_data} slot and includes the corresponding sampling
#' probability for each element sampled in the current wave in the merged data
#' in a column named "sampling_prob". If this column already exists, it keeps
#' the existing column and adds (or replaces) the values for units sampled in
#' the current wave. Returns an error if specified but
#' \code{wave_sample_wave} is FALSE.
#' Defaults to NULL, which looks for "probs" argument in
#' metadata and does not create (or add to existing) "sampling_prob" column if
#' none is found.
#' @details
#' If a column name in the \code{"sampled_data"} matches a column name in
#' the \code{"data"} slot of the previous wave, these columns will be
#' merged into one column with the same name in the output dataframe.
#' For ids that have non-missing values in both columns of the merge,
#' the value from \code{"sampled_data"} will overwrite the previous value
#' and a warning will be printed. All ids present in the \code{"data"} from the
#' previous wave but missing from \code{"sampled_data"} will be given NA values
#' for the newly merged variables.
#'
#' If columns with the name produced by \code{phase_sample_ind} or
#' \code{wave_sample_ind} already exist, they will be overwritten.
#'
#'
#' @return A Multiwave object with the merged dataframe in the
#' \code{"data"} slot of the specified wave.
#' @export
#' @include multiwave.R phase.R wave.R
#' @aliases merge_samples,Multiwave-method
#'
#' @examples
#' library(datasets)
#' iris <- data.frame(iris, id = 1:150)
#'
#' MySurvey <- multiwave(phases = 2, waves = c(1, 3))
#' set_mw(MySurvey, phase = 1, slot = "data") <-
#' data.frame(dplyr::select(iris, -Sepal.Width))
#' set_mw(MySurvey, phase = 2, wave = 1, slot = "sampled_data") <-
#' dplyr::select(iris, id, Sepal.Width)[1:40, ]
#' set_mw(MySurvey, phase = 2, wave = 1, slot = "samples") <-
#' list(ids = 1:40)
#' MySurvey <- merge_samples(MySurvey, phase = 2, wave = 1, id = "id")
#' @importFrom rlang :=
setGeneric("merge_samples", function(x, phase, wave,
id = NULL,
phase_sample_ind = "sampled_phase",
wave_sample_ind = "sampled_wave",
include_probs = NULL) {
standardGeneric("merge_samples")
})
setMethod(
"merge_samples", c(x = "Multiwave"),
function(x, phase, wave, id = NULL,
phase_sample_ind = NULL,
wave_sample_ind = NULL,
include_probs = NULL) {
if (!is.numeric(phase) |
!(phase %in% c(seq_len(length(x@phases))) & phase > 1)) {
stop("'phase' must be a numeric value specifying a valid phase
in 'x'")
}
if (!is.numeric(wave) |
!(wave %in% c(seq_len(length(x@phases[[phase]]@waves))))) {
stop("'wave' must be a numeric value specifying a valid wave in
'phase' in 'x'")
}
# Get previous wave data.
if ((phase == 2 | phase == "phase2") & (wave == 1 | wave == "wave1")) {
previous_wave_data <- x@phases$phase1$data
} else if (wave == 1 | wave == "wave1") {
previous_wave_data <- x@phases[[phase - 1]]@waves[[
length(x@phases[[phase - 1]]@waves)]]@data
} else if (wave != 1) {
previous_wave_data <- x@phases[[phase]]@waves[[
wave - 1]]@data
}
# Get id if given id is NULL
if (is.null(id)) {
if ("id" %in% names(x@phases[[phase]]@waves[[wave]]@metadata) # &
# inherits(x@phases[[phase]]@waves[[wave]]@metadata$id,
# "character")
) {
id <- x@phases[[phase]]@waves[[wave]]@metadata$id
} else if ("id" %in% names(x@phases[[phase]]@metadata) # &
# inherits(x@phases[[phase]]@waves[[wave]]@metadata$id,
# "character")
) {
id <- x@phases[[phase]]@metadata$id
} else if ("id" %in% names(x@metadata) # &
# inherits(x@phases[[phase]]@waves[[wave]]@metadata$id,
# "character")
) {
id <- x@metadata$id
} else {
stop("Could not find character element with name
'id' in metadata. Add
or specify in function call.")
}
}
if (!(id %in% names(x@phases[[phase]]@waves[[wave]]@sampled_data)) |
!(id %in% names(previous_wave_data))) {
stop("'id' must be a character value specifying the name of the
column that holds unit ids in both 'sampled_data' of this wave
and 'data' of the previous wave")
}
# Print warning about ids if necessary
if (!(all(
x@phases[[phase]]@waves[[wave]]@sampled_data[[id]] %in%
previous_wave_data[[id]]
))) {
warning("sampled_data is introducing new ids to the data")
}
# Get include_probs if given include_probs is NULL
if(is.null(include_probs)){
if ("include_probs" %in% names(x@phases[[phase]]@waves[[wave]]@metadata)){
include_probs <- x@phases[[phase]]@waves[[wave]]@metadata$include_probs
} else if ("include_probs" %in% names(x@phases[[phase]]@metadata)) {
include_probs <- x@phases[[phase]]@metadata$include_probs
} else if ("include_probs" %in% names(x@metadata)) {
include_probs <- x@metadata$include_probs
} else{
include_probs <- FALSE
}
}
# Error if include_probs is not logical
if(!is.logical(include_probs)){
stop("'include_probs' must be TRUE, FALSE, or NULL")
}
# Get phase_sample_ind if given phase_sample_ind is NULL
if (is.null(phase_sample_ind)) {
if ("phase_sample_ind" %in% names(x@phases[[phase]]@waves[[wave]]@metadata)
) {
phase_sample_ind <-
x@phases[[phase]]@waves[[wave]]@metadata$phase_sample_ind
} else if ("phase_sample_ind" %in% names(x@phases[[phase]]@metadata)) {
phase_sample_ind <- x@phases[[phase]]@metadata$phase_sample_ind
} else if ("phase_sample_ind" %in% names(x@metadata)) {
phase_sample_ind <- x@metadata$phase_sample_id
} else {
phase_sample_ind <- "sampled_phase"
}
}
# Get wave_sample_ind if given wave_sample_ind is NULL
if (is.null(wave_sample_ind)) {
if ("wave_sample_ind" %in% names(x@phases[[phase]]@waves[[wave]]@metadata)
) {
wave_sample_ind <-
x@phases[[phase]]@waves[[wave]]@metadata$wave_sample_ind
} else if ("wave_sample_ind" %in% names(x@phases[[phase]]@metadata)) {
wave_sample_ind <- x@phases[[phase]]@metadata$wave_sample_ind
} else if ("wave_sample_ind" %in% names(x@metadata)) {
wave_sample_ind <- x@metadata$wave_sample_id
} else {
wave_sample_ind <- "sampled_wave"
}
}
# Require wave_sample_ind to exist if include_probs is true, so wave is
# always attached to probability
if(include_probs == TRUE & wave_sample_ind == FALSE){
stop("'wave_sample_ind must be specified if 'include_probs' is TRUE.")
}
# Errors if given wave ind or phase ind are not characaters
if (!is.character(phase_sample_ind) | length(phase_sample_ind) != 1) {
stop("'phase_sampled_ind' must be a character value specifying the desired
name of the column in the newly merged data that should hold a
binary indicator for whether each unit has been sampled in the current
phase.")
}
if (!is.character(wave_sample_ind) & wave_sample_ind != FALSE |
length(wave_sample_ind) != 1) {
stop("'wave_sampled_ind' must be FALSE or a character value specifying the
desired name of the column in the newly merged data that should hold a
binary indicator for whether each unit has been sampled in the current
wave.")
}
# Perform merge
output_data <-
dplyr::full_join(previous_wave_data,
x@phases[[phase]]@waves[[wave]]@sampled_data,
by = id
)
# Fix duplicate columns
dup_cols <- names(previous_wave_data)[
names(previous_wave_data) %in% names(
x@phases[[phase]]@waves[[wave]]@sampled_data
)
]
dup_cols <- dup_cols[dup_cols != id]
if (length(dup_cols > 0)) {
for (i in seq_along(dup_cols)) {
# Warning if non-NA for both values for any row
if (any(!is.na(sort(previous_wave_data[
previous_wave_data[, id] %in%
x@phases[[phase]]@waves[[wave]]@sampled_data[, id],
dup_cols[i]
])) &
!is.na(sort(x@phases[[phase]]@waves[[wave]]@sampled_data[
, dup_cols[i]
])))) {
warning("Some units in 'sampled_data' have non-NA values already in
columns that are being merged. The old values will be
overwritten by the values in 'sampled_data' for these
units.")
}
new_col_name <- as.character(dup_cols[i])
new_col_namex <- paste0(new_col_name, ".x")
new_col_namey <- paste0(new_col_name, ".y")
new_col_name_sym <- enquo(new_col_name)
new_col_namex_sym <- enquo(new_col_namex)
new_col_namey_sym <- enquo(new_col_namey)
output_data <- output_data %>%
dplyr::mutate(!!new_col_name := dplyr::coalesce(
!!sym(new_col_namey),
!!sym(new_col_namex)
)) %>%
dplyr::select(
-paste0(new_col_name, ".x"),
-paste0(new_col_name, ".y")
)
}
}
# Add indicator for already sampled in current phase
already_sampled_phase_ids <- list()
warn_empty <- FALSE
for (i in seq_len(wave)) {
already_sampled_phase_ids[[i]] <- get_mw(x,
phase = phase,
wave = i,
slot = "samples"
)$ids
if (length(
get_mw(x,
phase = phase,
wave = i,
slot = "samples"
)$ids) == 0) {
warn_empty <- TRUE
}
}
if(warn_empty == TRUE){
warning("some 'samples' slots of waves in this phase are
empty. The `sampled_ind` column of the newly merged data may
be inaccurate.")
}
already_sampled_phase_ids <- unlist(already_sampled_phase_ids)
phase_col_name <- paste0(phase_sample_ind, phase)
output_data[, phase_col_name] <-
ifelse(output_data[,id] %in% already_sampled_phase_ids, 1, 0)
# Add indicator for sampled in current wave
if(wave_sample_ind != FALSE){
wave_col_name <- paste0(wave_sample_ind, phase, ".", wave)
output_data[, wave_col_name] <-
ifelse(output_data[,id] %in% get_mw(x,
phase = phase,
wave = wave,
slot = "samples")$ids, 1, 0)
if(all(sort(unique(get_mw(x,
phase = phase,
wave = wave,
slot = "samples")$ids)) !=
sort(unique(x@phases[[phase]]@waves[[wave]]@sampled_data[,id])))){
warning("ids in 'samples' slot were used to create sample indicator,
but they do not match ids in 'sampled_data'")
}
}
# Get probs from design_data if include_probs is TRUE.
if(include_probs == TRUE){
if(length(x@phases[[phase]]@waves[[wave]]@samples$probs) > 0){
if(length(x@phases[[phase]]@waves[[wave]]@samples$probs) !=
length(x@phases[[phase]]@waves[[wave]]@samples$ids)){
stop(" 'probs' and 'ids' in 'samples'
slot should be the same length.")
} else{
probs_df <-
data.frame(x@phases[[phase]]@waves[[wave]]@samples$ids,
x@phases[[phase]]@waves[[wave]]@samples$probs)
names(probs_df) <- c(id, "sampling_prob")
# First if 'sampling_probs' is already a column in data, write probs
# from 'design_data' only for new samples, leaving any old values
if("sampling_prob" %in% names(output_data)){
temp1 <- output_data[,c(id, "sampling_prob")]
temp2 <- dplyr::left_join(temp1, probs_df, by = id)
updated_probs <- dplyr::coalesce(temp2$sampling_prob.y,
temp2$sampling_prob.x)
output_data$sampling_prob <- updated_probs
} else{
output_data <- output_data %>%
dplyr::left_join(probs_df, by = id)
}
}
} else{
warning("include_probs is TRUE, but no 'probs' found in samples slot.")
}
}
# Add output_data to data slot of current wave
set_mw(x, phase = phase, wave = wave, slot ="data") <- output_data
return(x)
}
)
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.