R/mergedatasetsbyvariable.R

Defines functions recodeOutOfBoundsIntegersIfNecessary print.MergeDataSetByVariablePage print.MergeDataSetByVariable exampleIDValues doMergeByVariable parseInputVariableTextForDataSet orderVariablesUsingInputDataSet mergedDataSetVariableNames matchCasesWithoutIDVariables convertIDVariableType mergedIDVariableType parseIDVariables matchCasesWithIDVariables matchCases MergeDataSetsByVariable

Documented in MergeDataSetsByVariable

#' @title Merge Data Sets by Variable
#' @description Merges multiple data sets by combining variables, matching
#'  cases either using ID variables or by simply joining data sets side-by-side.
#' @param data.set.names A character vector of names of data sets from the
#'  Displayr cloud drive to merge (if run from Displayr) or file paths of local
#'  data sets.
#' @param merged.data.set.name A character scalar of the name of the merged
#'  data set in the Displayr cloud drive (if run from Displayr) or the local
#'  file path of the merged data set.
#' @param id.variables A character vector of ID variable names corresponding
#'  to each data set. ID variables should generally contain unique IDs, but
#'  otherwise an ID can only be duplicated in at most one data set. The ID
#'  variable in the merged data set will use the name and label from the ID
#'  variable from the first input data set.
#'  NULL if ID variables are not used, in which case the input data sets are
#'  simply combined side-by-side, and the input data sets are required to have
#'  the same number of cases.
#' @param include.or.omit.variables A character vector where each element
#'  corresponds to an input data set, and indicates whether variables from the
#'  input data set are to be specified in the merged data set by specifying
#'  the variables to include ("Only include manually specified variables") or the
#'  variables to omit ("Include all variables except those manually omitted").
#' @param variables.to.include.or.omit A list of character vectors corresponding
#'  to each data set. Each element in a character vector contains comma-separated
#'  names of variables to include or omit (depending on the option for the data
#'  set in \code{include.or.omit.variables}). Ranges of variables can be specified
#'  by separating variable names by '-'. Wildcard matching of names is supported
#'  using the asterisk character '*'.
#' @param only.keep.cases.matched.to.all.data.sets A logical scalar which
#'  controls whether to only keep cases if they are present in all data sets,
#'  and discard the rest.
#' @param include.merged.data.set.in.output A logical scalar which controls
#'  whether to include the merged data set in the output object, which can be
#'  used for diagnostic purposes in R.
#' @return A list of class MergeDataSetByVariable with the following elements:
#' \itemize{
#'   \item \code{merged.data.set} If \code{include.merged.data.set.in.output},
#'   is TRUE, this is a data frame of the merged data set.
#'   \item \code{input.data.sets.metadata} A list containing metadata on the
#'     the input data sets such as variable names, labels etc. See the function
#'     \code{metadataFromDataSets} for more information.
#'   \item \code{merged.data.set.metadata} A list containing metadata on the
#'     the merged data set such as variable names, labels etc. See the function
#'     \code{metadataFromDataSet} for more information.
#'   \item \code{source.data.set.indices} An integer vector corresponding to the
#'   variables in the merged data set. Each element contains the index of the
#'   input data set from which the variable originated. The data set index
#'   for the ID variable will be 1 even though ID variables are present in all
#'   data sets when ID variables are specified.
#'   \item \code{omitted.variable.names.list} A list whose elements correspond
#'   to the input data sets. Each element contains the names of variables from a
#'   data set that were omitted from the merged data set.
#'   \item \code{merged.id.variable.name} A character scalar of the name of the
#'   ID variable in the merged data set. It is NULL if there is no ID variable.
#'   \item \code{id.variable.names} A character vector corresponding to the
#'   input data sets. Each element is an ID variable name from an input data set.
#'   \item \code{example.id.values} A character vector corresponding to the
#'   input data sets. Each element is an example ID value from an ID variable
#'   from an input data set.
#'   \item \code{is.saved.to.cloud} A logical scalar indicating whether the
#'   merged data set was saved to the Displayr cloud drive.
#' }
#' @examples
#' path <- c(system.file("examples", "cola15.sav", package = "flipData"),
#'           system.file("examples", "cola16.sav", package = "flipData"))
#' print(MergeDataSetsByVariable(path, id.variables = c("Attr1","PartyID")))
#' @export
MergeDataSetsByVariable <- function(data.set.names,
                                    merged.data.set.name = NULL,
                                    id.variables = NULL,
                                    include.or.omit.variables = rep("Include all variables except those manually omitted", length(data.set.names)),
                                    variables.to.include.or.omit = NULL,
                                    only.keep.cases.matched.to.all.data.sets = FALSE,
                                    include.merged.data.set.in.output = FALSE)
{
    # === Data dictionary ===
    # data.sets: A list of data frames, with each representing an input data set.
    # input.data.sets.metadata: A list of containing metadata on the input data
    #                           sets. See the function metadataFromDataSets.
    # matched.cases.matrix: An integer matrix whose rows correspond to the cases
    #                       in the merged data set and whose columns correspond
    #                       to the input data sets. Each row contains the input
    #                       data set case indices that map to a case in the
    #                       merged data set. Has the attributes id.variable.names
    #                       and merged.id.variable (see below).
    # id.variable.names: Character vector of the names of the ID variables from
    #                    each input data set.
    # merged.id.variable: A vector containing the values of the ID variable in the merged
    #                     data set, which was created by merging the ID
    #                     variables from the input data sets.
    # merged.data.set.var.names: A character vector containing the names of the
    #                            variables in the merged data set. Has the
    #                            attributes included.variable.names.list,
    #                            omitted.variable.names, merged.id.variable.name,
    #                            source.data.set.indices.
    # included.variable.names.list: A list of character vectors corresponding to
    #                               the input data sets. Each element contains the
    #                               names of the variables from an input data set
    #                               that will be present in the merged data set.
    # omitted.variable.names.list: A list whose elements correspond to the input
    #                              data sets. Each element contains the names of
    #                              variables from a data set that were omitted
    #                              from the merged data set.
    # merged.id.variable.name: A character scalar of the name of the ID
    #                          variable in the merged data set. It is NULL if
    #                          there is no ID variable.
    # source.data.set.indices: An integer vector corresponding to the variables
    #                          in the merged data set. Each element contains
    #                          the index of the input data set from which the
    #                          variable originated. The data set index for the
    #                          ID variable will be 1 even though ID variables
    #                          are present in all data sets when ID variables
    #                          are specified.
    # merged.data.set: A data frame representing the merged data set.

    tryCatch({
        data.sets <- readDataSets(data.set.names, 2)
        input.data.sets.metadata <- metadataFromDataSets(data.sets)

        matched.cases.matrix <- matchCases(input.data.sets.metadata, id.variables,
                                           data.sets,
                                           only.keep.cases.matched.to.all.data.sets)
        merged.data.set.var.names <- mergedDataSetVariableNames(input.data.sets.metadata,
                                                                include.or.omit.variables,
                                                                variables.to.include.or.omit,
                                                                matched.cases.matrix)
    }, error = function(e) {
        if (grepl("cannot allocate vector of size ", e$message)) {
            throwInputDataSetsTooLargeError()
        } else
            stop(e)
    })


    tryCatch({
        merged.data.set <- doMergeByVariable(data.sets, matched.cases.matrix,
                                             merged.data.set.var.names,
                                             input.data.sets.metadata)
        merged.data.set.name <- correctDataSetName(merged.data.set.name,
                                                   "Combined data set.sav")
        is.saved.to.cloud <- IsDisplayrCloudDriveAvailable()
        writeDataSet(merged.data.set, merged.data.set.name,
                     is.saved.to.cloud = is.saved.to.cloud)

        result <- list()
        if (include.merged.data.set.in.output)
            result$merged.data.set <- merged.data.set

        result$input.data.sets.metadata <- input.data.sets.metadata
        result$merged.data.set.metadata <- metadataFromDataSet(merged.data.set,
                                                               merged.data.set.name)
        result$source.data.set.indices <- attr(merged.data.set.var.names,
                                               "source.data.set.indices",
                                               exact = TRUE)
        result$omitted.variable.names.list <- attr(merged.data.set.var.names,
                                                   "omitted.variable.names.list",
                                                   exact = TRUE)
        result$merged.id.variable.name <- attr(merged.data.set.var.names,
                                               "merged.id.variable.name",
                                               exact = TRUE)
        result$id.variable.names <- attr(matched.cases.matrix, "id.variable.names",
                                         exact = TRUE)
        result$example.id.values <- exampleIDValues(result$id.variable.names,
                                                    data.sets)
        result$is.saved.to.cloud <- is.saved.to.cloud
        class(result) <- "MergeDataSetByVariable"
        result
    }, error = function(e) {
        if (grepl("cannot allocate vector of size ", e$message)) {
            throwCombinedDataSetTooLargeError()
        } else
            stop(e)
    })
}

#' @param input.data.sets.metadata See data dictionary.
#' @param id.variables See documentation for id.variables in MergeDataSetsByVariable.
#' @param data.sets See data dictionary.
#' @param only.keep.cases.matched.to.all.data.sets See documentation for
#'  only.keep.cases.matched.to.all.data.sets in MergeDataSetsByVariable.
#' @return Returns matched.cases.matrix, see data dictionary.
#' @noRd
matchCases <- function(input.data.sets.metadata, id.variables,
                       data.sets, only.keep.cases.matched.to.all.data.sets)
{
    if (!is.null(id.variables))
        matchCasesWithIDVariables(input.data.sets.metadata, id.variables,
                                  data.sets,
                                  only.keep.cases.matched.to.all.data.sets)
    else
        matchCasesWithoutIDVariables(input.data.sets.metadata)
}

# See params for the function matchCases.
#' @return Returns the matched.cases matrix, see data dictionary.
#' @noRd
matchCasesWithIDVariables <- function(input.data.sets.metadata, id.variables,
                                      data.sets,
                                      only.keep.cases.matched.to.all.data.sets)
{
    id.var.names <- parseIDVariables(id.variables, input.data.sets.metadata)
    n.data.sets <- input.data.sets.metadata$n.data.sets
    n.cases <- input.data.sets.metadata$n.cases

    id.var.types <- vapply(seq_len(n.data.sets), function(i) {
        variableType(data.sets[[i]][[id.var.names[i]]])
    }, character(1))

    merged.id.var.type <- mergedIDVariableType(id.var.types)

    # List whose elements correspond to the input data sets.
    # Each element contains the ID variable from a data set, after converting
    # its type to merged.id.var.type
    ids.list <- lapply(seq_len(n.data.sets), function(i) {
        ids <- data.sets[[i]][[id.var.names[i]]]
        ids <- convertIDVariableType(ids, id.var.types[i],
                                     merged.id.var.type)

        if (all(is.na(ids)))
            stop("The id variable '", id.var.names[i], "' from data set ", i,
                 " does not contain any non-missing IDs.")
        ids
    })

    non.missing.ids.list <- lapply(ids.list, removeNA)

    # Warn if no overlap exists
    if (n.data.sets == 2)
    {
        if (all(!(non.missing.ids.list[[1]] %in% non.missing.ids.list[[2]])))
            warning("There are no common IDs between the two input data sets. ",
                    "Ensure that the ID variable names have been correctly specified.")
    }
    else # n.data.sets > 2
    {
        for (i in seq_len(n.data.sets))
        {
            if (all(!(non.missing.ids.list[[i]] %in% unlist(non.missing.ids.list[-i]))))
                warning("The IDs in data set ", i, " from variable '",
                        id.var.names[i],
                        "' are not present in the other data sets. ",
                        "Ensure that the ID variable names have been correctly specified.")
        }
    }

    # Merge ID values
    unique.ids <- unique(unlist(non.missing.ids.list))
    merged.id.variable <- NULL
    for (unique.id in unique.ids)
    {
        id.frequency <- vapply(non.missing.ids.list, function(ids) {
            sum(ids == unique.id)
        }, integer(1))

        # Check that there are no IDs that are duplicated in more than one data set
        if (sum(id.frequency > 1) > 1)
            stop("The data sets cannot be combined by the specified ID variables as the ID '",
                 unique.id, "' is duplicated in multiple data sets.")

        merged.id.variable <- c(merged.id.variable,
                                rep(unique.id, max(id.frequency)))
    }

    result <- matrix(NA_integer_, nrow = length(merged.id.variable),
                     ncol = n.data.sets)
    for (id in unique.ids)
    {
        merged.id.ind <- which(id == merged.id.variable)
        for (i in seq_len(n.data.sets))
        {
            id.ind <- which(id == ids.list[[i]])
            if (length(id.ind) == 0)
                next

            result[merged.id.ind, i] <- id.ind
        }
    }

    if (only.keep.cases.matched.to.all.data.sets)
    {
        is.incomplete.case <- rowSums(is.na(result)) > 0
        if (all(is.incomplete.case))
            stop("The combined data set has no cases as there are no IDs that appear in all data sets. ",
                 "Ensure that the ID variable names have been correctly specified.")
        result <- result[!is.incomplete.case, , drop = FALSE]
        merged.id.variable <- merged.id.variable[!is.incomplete.case]
    }

    attr(result, "id.variable.names") <- id.var.names
    attr(result, "merged.id.variable") <- merged.id.variable
    result
}

#' @param id.variables See documentation for id.variables in
#'  MergeDataSetsByVariable.
#' @param input.data.sets.metadata See data dictionary.
#' @return A character vector containing ID variable names corresponding to
#'  each data set.
#' @noRd
parseIDVariables <- function(id.variables, input.data.sets.metadata)
{
    n.data.sets <- input.data.sets.metadata$n.data.sets
    v.names.list <- input.data.sets.metadata$variable.names.list

    for (i in seq_len(n.data.sets))
    {
        t <- id.variables[i]
        if (!(t %in% v.names.list[[i]]))
            throwVariableNotFoundError(t, i)
    }
    id.variables
}

#' @param id.variable.types A character vector of variable types
#'  (see function variableType) of the ID variables to be merged.
#' @return A character scalar of the variable type of the merged ID variable.
#' @noRd
mergedIDVariableType <- function(id.variable.types)
{
    if (allIdentical(id.variable.types))
        id.variable.types[1]
    else if (all(isDateType(id.variable.types)))
        DATE.TIME.VARIABLE.TYPE
    else # Combination of 2 or more variable types (except date and date/time types)
        TEXT.VARIABLE.TYPE
}

#' @description Convert ID variable to have type merged.id.variable.type
#' @param ids A vector representing the ID variable to be converted.
#' @param id.variable.type The variable type (see function variableType) of the
#'  ID variable to be converted.
#' @param merged.id.variable.type The variable type that the ID variable will
#'  be converted to.
#' @return A vector representing the converted ID variable.
#' @noRd
convertIDVariableType <- function(ids, id.variable.type,
                                  merged.id.variable.type)
{
    if (id.variable.type == merged.id.variable.type) # no conversion necessary
    {
        ids
    }
    else if (merged.id.variable.type == DATE.TIME.VARIABLE.TYPE &&
             id.variable.type == DATE.VARIABLE.TYPE)
    {
             AsDateTime(as.character(ids))
    }
    else # merged.id.variable.type == TEXT.VARIABLE.TYPE (final possibility, see mergedIDVariableType)
    {
        if (id.variable.type == CATEGORICAL.VARIABLE.TYPE)
        {
            converted.ids <- rep(NA_character_, length(ids))
            val.attrs <- attr(ids, "labels", exact = TRUE)
            val.labels <- names(val.attrs)
            for (i in seq_along(val.attrs))
                converted.ids[ids == val.attrs[i]] <- val.labels[i]
            converted.ids
        }
        else
            as.character(ids)
    }
}

#' @description Match cases side-by-side (no matching of IDs).
#' @param input.data.sets.metadata See data dictionary.
#' @return Returns the matched.cases matrix, see data dictionary.
#' @noRd
matchCasesWithoutIDVariables <- function(input.data.sets.metadata)
{
    n.data.sets <- input.data.sets.metadata$n.data.sets
    n.cases <- input.data.sets.metadata$n.cases

    if (!allIdentical(n.cases))
        stop("The data sets could not be combined without ID variables (side-by-side, no matching) as they have differing numbers of cases. ",
             "To combine them, ID variables need to be specified.")

    matrix(rep(seq_len(n.cases[1]), n.data.sets), ncol = n.data.sets)
}

#' @param input.data.sets.metadata See data dictionary.
#' @param include.or.omit.variables See documentation for
#'  include.or.omit.variables in MergeDataSetsByVariable.
#' @param variables.to.include.or.omit See documentation for
#'  variables.to.include.or.omit in MergeDataSetsByVariable.
#' @param matched.cases See data dictionary.
#' @return Character vector of the names of the variables in the merged data set.
#' @noRd
mergedDataSetVariableNames <- function(input.data.sets.metadata,
                                       include.or.omit.variables,
                                       variables.to.include.or.omit,
                                       matched.cases)
{
    n.data.sets <- input.data.sets.metadata$n.data.sets
    v.names.list <- input.data.sets.metadata$variable.names.list
    id.var.names <- attr(matched.cases, "id.variable.names", exact = TRUE)

    v.names.to.include.or.omit.list <- lapply(seq_len(n.data.sets), function(i) {
        parseInputVariableTextForDataSet(variables.to.include.or.omit[[i]],
                                         v.names.list[[i]], i)
    })

    included.variable.names.list <- rep(list(character(0)), n.data.sets)
    omitted.var.names.list <- rep(list(character(0)), n.data.sets)
    merged.id.var.name <- NA_character_

    for (data.set.ind in seq_len(n.data.sets))
    {
        if (include.or.omit.variables[data.set.ind] == "Only include manually specified variables")
        {
            included.variable.names.list[[data.set.ind]] <- v.names.to.include.or.omit.list[[data.set.ind]]
            omitted.var.names.list[[data.set.ind]] <- setdiff(v.names.list[[data.set.ind]],
                                                              v.names.to.include.or.omit.list[[data.set.ind]])
        }
        else # include.or.omit.variables[data.set.ind] == "Include all variables except those manually omitted"
        {
            included.variable.names.list[[data.set.ind]] <- setdiff(v.names.list[[data.set.ind]],
                                                            v.names.to.include.or.omit.list[[data.set.ind]])
            omitted.var.names.list[[data.set.ind]] <- v.names.to.include.or.omit.list[[data.set.ind]]
        }

        # Keep ID variable from first data set,  because they must be included regardless of v.names.to.include.or.omit.list
        if (!is.null(id.var.names))
        {
            if (data.set.ind == 1)
            {
                included.variable.names.list[[data.set.ind]] <- union(included.variable.names.list[[data.set.ind]],
                                                                      id.var.names[data.set.ind])
                merged.id.var.name <- id.var.names[data.set.ind]
            }
            else if (data.set.ind > 1)
                included.variable.names.list[[data.set.ind]] <- setdiff(included.variable.names.list[[data.set.ind]],
                                                                        id.var.names[data.set.ind])
        }

        if (length(included.variable.names.list[[data.set.ind]]) == 0)
            stop("All variables in data set ", data.set.ind, "were specified to be omitted. ",
                 "Ensure that the variables to be omitted have been correctly specified.")

        included.variable.names.list[[data.set.ind]] <- orderVariablesUsingInputDataSet(included.variable.names.list[[data.set.ind]],
                                                                                v.names.list[[data.set.ind]])
        omitted.var.names.list[[data.set.ind]] <- orderVariablesUsingInputDataSet(omitted.var.names.list[[data.set.ind]],
                                                                                  v.names.list[[data.set.ind]])
    }

    merged.data.set.var.names <- character(0)
    for (i in seq_len(n.data.sets))
        for (nm in included.variable.names.list[[i]])
            merged.data.set.var.names <- c(merged.data.set.var.names,
                                           uniqueName(nm, merged.data.set.var.names, delimiter = "_"))
    merged.data.set.var.names <- sanitizeSPSSVariableNames(merged.data.set.var.names)
    attr(merged.data.set.var.names, "included.variable.names.list") <- included.variable.names.list
    attr(merged.data.set.var.names, "omitted.variable.names.list") <- omitted.var.names.list

    if (!is.null(id.var.names))
    {
        merged.id.var.name <- merged.data.set.var.names[match(merged.id.var.name,
                                                              unlist(included.variable.names.list))]
        attr(merged.data.set.var.names, "merged.id.variable.name") <- merged.id.var.name
    }

    source.data.set.indices <- unlist(lapply(seq_along(included.variable.names.list), function(i) {
        rep(i, length(included.variable.names.list[[i]]))
    }))
    attr(merged.data.set.var.names, "source.data.set.indices") <- source.data.set.indices

    merged.data.set.var.names
}

#' @param var.names.to.order Character vector of variable names to be reordered.
#' @param data.set.var.names Character vector of variable names from a data set
#'  that will be used to order the names in var.names.to.order.
#' @return Character vector of the names in var.names.to.order reordered
#'  according to their order in data.set.var.names.
#' @noRd
orderVariablesUsingInputDataSet <- function(var.names.to.order,
                                            data.set.var.names)
{
    var.names.to.order[order(match(var.names.to.order, data.set.var.names))]
}

#' @param input.text Character vector containing comma-separated variable names
#'  or variable ranges (see documentation for variables.to.include.or.omit in
#'  MergeDataSetsByVariable).
#' @param data.set.variable.names Character vector of variable names from a
#'  data set. The variables in input.text are expected to be from this data set.
#' @param data.set.index Integer scalar of the index of the data set among the
#'  input data sets.
#' @return A character vector of the names of variables parsed from input text.
#' @noRd
parseInputVariableTextForDataSet <- function(input.text,
                                             data.set.variable.names,
                                             data.set.index)
{
    split.text <- unlist(lapply(input.text, splitByComma),
                         use.names = FALSE)

    parsed.names <- character(0)
    for (j in seq_along(split.text))
    {
        t <- split.text[j]
        dash.ind <- match("-", strsplit(t, "")[[1]])
        if (is.na(dash.ind)) # not range
        {
            if (!grepl("\\*", t)) # single variable, not wildcard
            {
                if (!(t %in% data.set.variable.names))
                    throwVariableNotFoundError(t, data.set.index)
                parsed.names <- union(parsed.names, t)
            }
            else # wildcard
            {
                nms <-  parseVariableWildcardForMerging(t,
                                                        data.set.variable.names,
                                                        data.set.index,
                                                        error.if.not.found = TRUE)
                parsed.names <- union(parsed.names, nms)
            }
        }
        else # range of variables
        {
            if (grepl("\\*", t))
                stop("The input '", t,
                     "' is invalid as wildcard characters are not supported for variable ranges.")

            range.start <- trimws(substr(t, 1, dash.ind - 1))
            range.end <- trimws(substr(t, dash.ind + 1, nchar(t)))
            range.var.names <- variablesFromRange(data.set.variable.names,
                                                  range.start,
                                                  range.end,
                                                  data.set.index, t)
            parsed.names <- union(parsed.names, range.var.names)
        }
    }
    parsed.names
}

#' @param data.sets See data dictionary.
#' @param matched.cases.matrix See data dictionary.
#' @param merged.data.set.variable.names See data dictionary.
#' @param input.data.sets.metadata See data dictionary.
#' @return A data frame representing the merged data set.
#' @noRd
doMergeByVariable <- function(data.sets, matched.cases.matrix,
                              merged.data.set.variable.names,
                              input.data.sets.metadata)
{
    n.data.sets <- input.data.sets.metadata$n.data.sets
    n.merged.cases <- nrow(matched.cases.matrix)
    included.variable.names.list <- attr(merged.data.set.variable.names,
                                         "included.variable.names.list",
                                         exact = TRUE)
    id.variable.names <- attr(matched.cases.matrix, "id.variable.names",
                              exact = TRUE)
    merged.id.var.name <- attr(merged.data.set.variable.names,
                               "merged.id.variable.name", exact = TRUE)
    merged.id.variable <- attr(matched.cases.matrix, "merged.id.variable",
                               exact = TRUE)

    merged.data.set.variables <- vector(mode = "list",
                                        length = length(merged.data.set.variable.names))

    j <- 1
    for (data.set.ind in seq_len(n.data.sets))
    {
        for (nm in included.variable.names.list[[data.set.ind]])
        {
            if (!is.null(merged.id.var.name) && data.set.ind == 1 &&
                nm == merged.id.var.name) # ID variable
            {
                merged.var <- merged.id.variable
                attr(merged.var, "label") <- attr(data.sets[[data.set.ind]][[nm]],
                                                  "label", exact = TRUE)
            }
            else # Non-ID variable
            {
                input.var <- data.sets[[data.set.ind]][[nm]]
                non.missing.ind <- which(!is.na(matched.cases.matrix[, data.set.ind]))
                # Initialize with missing values of the same type as input.var
                merged.var <- rep(c(input.var[1], NA)[2], n.merged.cases)
                merged.var[non.missing.ind] <- input.var[matched.cases.matrix[non.missing.ind,
                                                                              data.set.ind]]

                if (isIntegerValued(merged.var)) {
                    recode.object <- recodeOutOfBoundsIntegersIfNecessary(merged.var, input.var = input.var)
                    merged.var <- recode.object[["merged.var"]]
                    input.var <- recode.object[["input.var"]]
                    merged.var <- as.integer(merged.var)
                }
                v.type <- variableType(input.var)
                if (v.type == CATEGORICAL.VARIABLE.TYPE)
                {
                    val.attr <- attr(input.var, "labels", exact = TRUE)
                    if (is.integer(merged.var))
                    {
                        val.lbls <- names(val.attr)
                        val.attr <- as.integer(val.attr)
                        names(val.attr) <- val.lbls
                    }
                    attr(merged.var, "labels") <- val.attr
                    class(merged.var) <- c(class(merged.var), "haven_labelled")
                }
                attr(merged.var, "label") <- attr(input.var, "label",
                                                  exact = TRUE)
            }
            merged.data.set.variables[[j]] <- merged.var
            j <- j + 1
        }
    }
    names(merged.data.set.variables) <- merged.data.set.variable.names
    data.frame(merged.data.set.variables, check.names = FALSE)
}

#' @param id.variable.names See data dictionary.
#' @param data.sets See data dictionary.
#' @return Character vector of the first ID values from the ID variables from
#'  each input data set.
#' @noRd
exampleIDValues <- function(id.variable.names, data.sets)
{
    if (is.null(id.variable.names))
        return(NULL)

    vapply(seq_along(data.sets), function(i) {
        v <- data.sets[[i]][[id.variable.names[i]]]
        val.attr <- attr(v, "labels", exact = TRUE)
        # ID variables will have non-missing values as we checked for this
        if (!is.null(val.attr))
            names(val.attr)[val.attr == removeNA(v)[1]]
        else
            as.character(removeNA(v)[1])
    }, character(1))
}

#' @description Produces a widget output when printing a MergeDataSetByVariable
#'  object.
#' @param x A list of class MergeDataSetByVariable
#' @noRd
#' @importFrom flipFormat DataSetMergingByVariableWidget
#' @export
print.MergeDataSetByVariable <- function(x, ...)
{
    DataSetMergingByVariableWidget(x$input.data.sets.metadata,
                                   x$merged.data.set.metadata,
                                   x$source.data.set.indices,
                                   x$omitted.variable.names.list,
                                   x$merged.id.variable.name,
                                   x$id.variable.names,
                                   x$example.id.values,
                                   x$is.saved.to.cloud)
}

#' @description Produces a widget output that supports pagination when printing
#'  a MergeDataSetByVariablePage object.
#' @param x A list of class MergeDataSetByVariablePage
#' @noRd
#' @export
print.MergeDataSetByVariablePage <- function(x, ...)
{
    args <- list(x$input.data.sets.metadata,
                 x$merged.data.set.metadata,
                 x$source.data.set.indices,
                 x$omitted.variable.names.list,
                 x$merged.id.variable.name,
                 x$id.variable.names,
                 x$example.id.values,
                 x$is.saved.to.cloud,
                 x$page)

    if (!is.null(x$variables.per.page))
        args <- c(args, x$variables.per.page)

    do.call(DataSetMergingByVariableWidget, args)
}

recodeOutOfBoundsIntegersIfNecessary <- function(merged.var, input.var) {
    merged.unique.vals <- unique(merged.var)
    merged.unique.vals <- removeNA(merged.unique.vals)
    merged.val.attr <- attr(merged.var, "labels", exact = TRUE)
    input.val.attr <- attr(input.var, "labels", exact = TRUE)
    all.unique.vals <- unique(c(unclass(merged.unique.vals), unclass(input.val.attr)))

    bad.vals <- abs(all.unique.vals) > 1e9
    n.bad.vals <- length(which(bad.vals))
    if (n.bad.vals == 0)
        return(list(merged.var = merged.var,
                    input.var = input.var))
    lab <- attr(input.var, "label", exact = TRUE)
    if (n.bad.vals > 1 ) {
        stop("Variable: '",
             lab,
             "' contains multiple values outside the allowable range. ",
             "Values larger than 1,000,000,000 or smaller than -1,000,000,000 ",
             "should be recoded before attempting to merge these files.")
    }

    offending.value <- all.unique.vals[bad.vals]
    remaining.values <- setdiff(all.unique.vals, offending.value)
    # -99 is an industry convention
    new.value <- if (-99 %in% remaining.values) min(remaining.values) - 1 else -99
    warning("Variable: '",
             lab,
             "' contains a value outside of the allowable range (",
             offending.value,
             "). This value has been recoded as ",
             new.value)
    merged.var[merged.var == offending.value] <- new.value
    input.val.attr[input.val.attr == offending.value] <- new.value
    merged.val.attr[merged.val.attr == offending.value] <- new.value
    attr(input.var, "labels") <- input.val.attr
    attr(merged.var, "labels") <- merged.val.attr
    list(merged.var = merged.var,
         input.var = input.var)
}
NumbersInternational/flipData documentation built on March 2, 2024, 10:52 a.m.