R/stacking.R

Defines functions print.StackedData metadataFromStackedDataSet cleanStackedDataSetName parseVariableName parseVariableWildcard parseVariableRange getCommonSuffix getCommonSuffixTwoNames getCommonPrefixTwoNames getCommonPrefix deduplicateValAttrNames stackedValueAttributes stackedVariableLabel stackedVariableName stackedDataSet parseVariablesToInclude mergeCommonLabelAndManualStackingGroups isValueAttributesMergable stackingSpecifiedByObservation stackingSpecifiedByVariable permittedNA stackManually matchIndicesBasedOnName stackingGroupFromCommonLabels stackWithCommonLabels tidyManualCommonLabels commonLabelsFromASetOfReferenceVars commonLabelsFromReferenceVars commonLabelsByRemovingPrefix automaticCommonLabels commonLabels StackData

Documented in StackData

#' @title Stack data set
#' @description Stacks variables in a SPSS .sav data set that may be located
#'   locally or on the Displayr cloud drive (if run in Displayr). Stacking may
#'   be specified manually and/or by identifying common labels that appear in
#'   variable labels.
#'
#'   Manual stacking can be specified by variable or by observation. With the
#'   former, each group of variables to be stacked together is specified. With
#'   the latter, the variables in each stacked observation are specified (in
#'   order of the stacked variables). Any stacking can be performed with either
#'   option but often one is more convenient than the other depending on the
#'   structure and variable names of the required stacking.
#'
#'   Common label stacking occurs by stacking together groups of variables
#'   whose labels match the common labels after removing common prefixes and
#'   suffixes. Common labels can be specified to be generated automatically,
#'   inferred from a set of input reference variables or specified manually.
#'
#'   The stacked data set is saved as an SPSS .sav data set either locally or
#'   to the Displayr cloud drive (if run in Displayr).
#' @param input.data.set.name Name of data file to stack, either as a path to a
#'   local file (when running locally in R) or file in the Displayr Cloud Drive
#'   (when running in Displayr).
#' @param stacked.data.set.name Name of the stacked data file to be saved in
#'   the Displayr Cloud Drive (if run from Displayr) or saved locally.
#' @param stack.with.common.labels \code{"Automatically"},
#'   \code{"Using a set of variables to stack as reference"},
#'   \code{"Using manually input common labels"},
#'   \code{"Disabled"}.
#' @param reference.variables.to.stack A character vector of sets of variables
#'   to stack to be used as a reference to generate the common labels used for
#'   stacking. This can be a combination of comma-separated names, wildcards
#'   and ranges. Variable ranges can be specified by supplying the start and
#'   end variables separated by a dash '-'. If the start or end variables are
#'   left out, then the range is assumed to start from the first variable or
#'   end at the last variable respectively. Wildcards in variable names can be
#'   specified with an asterisk '*'.
#' @param manual.common.labels A list of sets of common labels to be used to
#'   identify variables to stack. Only used when \code{stack.with.common.labels}
#'   is \code{"Using manually input common labels"}. To be identified, a set of
#'   variables to be stacked must contain these labels, and have the same prefix
#'   and suffix before and after these labels.
#' @param specify.by "Variable" or "Observation". See \code{manual.stacking}.
#' @param manual.stacking If \code{specify.by} is "Variable", this is a
#'   character vector where each string corresponds to the names of the
#'   variables to be stacked into a new variable. If \code{specify.by} is
#'   "Observation", this is a character vector where each string corresponds
#'   the names of the variables in an observation in the stacked variables.
#'   The strings in both cases can be a combination of comma-separated names,
#'   wildcards and ranges.
#'   See \code{reference.variables.to.stack} for more details on the format.
#' @param variables.to.include Character vector of comma-separated names of
#'   non-stacked variables to include. Each string can be a combination of
#'   comma-separated names, wildcards and ranges.
#'   See \code{reference.variables.to.stack} for more details on the format.
#' @param include.stacked.data.set.in.output Whether to include the stacked
#'   data set in the output object.
#' @param include.original.case.variable Whether to include the \code{original_case}
#'   variable in the stacked data set.
#' @param include.observation.variable Whether to include the \code{observation}
#'   variable in the stacked data set.
#' @return A list with the following elements:
#' \itemize{
#'   \item \code{stacked.data.set.metadata} A list containing metadata on the
#'     the stacked data set such as variable names, labels etc.
#'   \item \code{unstackable.names} A list of character vectors containing
#'     names of the variables that could not be stacked using common labels due
#'     to mismatching types or value attributes.
#'   \item \code{common.labels.list} A list of character vectors containing
#'     the common labels used in the stacking. The source of these common
#'     labels depends on the parameter \code{stack.with.common.labels} and
#'     are either automatically generated, extracted from reference variables
#'     or manually supplied.
#'   \item \code{is.saved.to.cloud} Whether the stacked data set was saved to
#'     the Displayr cloud drive.
#'  }
#' @examples
#' path <- system.file("examples", "Cola.sav", package = "flipData")
#'
#' # Automatic common label stacking and manual stacking by variable
#' print(StackData(path,
#'                 specify.by = "Variable",
#'                 manual.stacking = c("Q6_*, NA", "Q9_A, Q9_B, Q9_C-Q9_F")))
#'
#' # Common labels from reference variables and included non-stacked variables
#' print(StackData(path,
#'                 stack.with.common.labels = "Using a set of variables to stack as reference",
#'                 reference.variables.to.stack = c("Q5_5_*", "Q6_A-Q6_F"),
#'                 variables.to.include = c("Q2")))
#'
#' # Manually specified common labels and manual stacking by observation
#' common.labels <- list(c("Coke", "Diet Coke", "Coke Zero", "Pepsi",
#'                         "Diet Pepsi", "Pepsi Max", "None of these"))
#' print(StackData(path,
#'                 stack.with.common.labels = "Using manually input common labels",
#'                 manual.common.labels = common.labels,
#'                 specify.by = "Observation",
#'                 manual.stacking = c("Q6_A,Q9_A", "Q6_B,Q9_B", "Q6_C,Q9_C",
#'                                     "Q6_D,Q9_D", "Q6_E,Q9_E", "Q6_F,Q9_F")))
#' @export
StackData <- function(input.data.set.name,
                      stacked.data.set.name = NULL,
                      stack.with.common.labels = "Automatically",
                      reference.variables.to.stack = NULL,
                      manual.common.labels = NULL,
                      specify.by = "Variable",
                      manual.stacking = NULL,
                      variables.to.include = NULL,
                      include.stacked.data.set.in.output = FALSE,
                      include.original.case.variable = TRUE,
                      include.observation.variable = TRUE)
{
    tryCatch({
        # Load input data set as data frame
        input.data.set <- readDataSets(input.data.set.name, 1)[[1]]

        # Create an object containing metadata on the input data set such as
        # variable names and labels which can be easily passed into function calls
        input.data.set.metadata <- metadataFromDataSet(input.data.set,
                                                       input.data.set.name)

        # Get the list of common labels to be used in stacking, either
        # automatically, via reference variables, or manually
        common.labels.list <- commonLabels(manual.common.labels,
                                           stack.with.common.labels,
                                           input.data.set.metadata,
                                           reference.variables.to.stack)

        # Compute stacking groups from common labels and manual stacking groups,
        # and then merge them together. A stacking group is a matrix that specifies
        # variables to be stacked via their indices in the input data set. Each row
        # represents a group of variables to be stacked.
        common.label.stacking.groups <- stackWithCommonLabels(common.labels.list,
                                                              input.data.set.metadata)
        manual.stacking.groups <- stackManually(manual.stacking, specify.by,
                                                input.data.set.metadata)
        stacking.groups <- mergeCommonLabelAndManualStackingGroups(common.label.stacking.groups,
                                                                   manual.stacking.groups)

        included.variable.names <- parseVariablesToInclude(variables.to.include,
                                                           input.data.set.metadata,
                                                           stacking.groups)
    }, error = function(e) {
        if (grepl("cannot allocate vector of size ", e$message)) {
            throwInputDataSetsTooLargeError()
        } else
            stop(e)
    })

    tryCatch({
        # Create the stacked data set as a data frame from the stacking groups
        stacked.data.set <- stackedDataSet(input.data.set, input.data.set.metadata,
                                           stacking.groups,
                                           include.original.case.variable,
                                           include.observation.variable,
                                           common.labels.list,
                                           included.variable.names)

        stacked.data.set.name <- cleanStackedDataSetName(stacked.data.set.name,
                                                         input.data.set.name)

        write.stacked.data.set <- nrow(stacked.data.set) > 0
        is.saved.to.cloud <- write.stacked.data.set && IsDisplayrCloudDriveAvailable()
        if (write.stacked.data.set)
            writeDataSet(stacked.data.set, stacked.data.set.name, is.saved.to.cloud)

        # Create an object containing metadata on the stacked data set such as
        # variable names and labels
        stacked.data.set.metadata <- metadataFromStackedDataSet(stacked.data.set,
                                                                stacked.data.set.name)

        result <- list()
        result$input.data.set.metadata <- input.data.set.metadata
        result$stacked.data.set.metadata <- stacked.data.set.metadata
        result$unstackable.names <- attr(stacking.groups, "unstackable.names")
        result$common.labels.list <- common.labels.list
        result$is.saved.to.cloud <- is.saved.to.cloud

        if (include.stacked.data.set.in.output)
            result$stacked.data.set <- stacked.data.set

        class(result) <- "StackedData"
        result
    }, error = function(e) {
        if (grepl("cannot allocate vector of size ", e$message)) {
            msg <- paste0("The stacked data set is too large to create. ",
                          "Consider reducing the number of variables in the stacked data set.")
            if (!is.null(common.labels.list))
                msg <- paste0(msg, " Also ensure that the common labels are ",
                              "appropriate: ",
                              paste0(unlist(common.labels.list), collapse = ", "), ".")
            stop(msg)
        } else
            stop(e)
    })
}

# Get common labels ready to be used for stacking. The actions of this function
# depend on the value of stack.with.common.labels.
# Returns a list where each element is a set of common labels
commonLabels <- function(manual.common.labels, stack.with.common.labels,
                         input.data.set.metadata, reference.variables.to.stack)
{
    if (stack.with.common.labels == "Automatically")
    {
        if (!is.null(manual.common.labels) && length(manual.common.labels) > 0)
            warning("Input common labels have been ignored because common labels are to be generated automatically.")
        return(automaticCommonLabels(input.data.set.metadata))
    }
    else if (stack.with.common.labels == "Using a set of variables to stack as reference")
    {
        if (!is.null(manual.common.labels) && length(manual.common.labels) > 0)
            warning("Input common labels have been ignored because common labels are to be obtained from a set of variables.")
        return(commonLabelsFromReferenceVars(reference.variables.to.stack,
                                             input.data.set.metadata))
    }
    else if (stack.with.common.labels == "Using manually input common labels")
        tidyManualCommonLabels(manual.common.labels)
    else if (stack.with.common.labels == "Disabled")
    {
        if (!is.null(manual.common.labels) && length(manual.common.labels) > 0)
            warning("Input common labels have been ignored as stacking with common labels has been disabled.")
        return(NULL)
    }
    else
        stop("Input for stack.with.common.labels not recognised: ",
             stack.with.common.labels)
}

# Finds a set of common labels from the variable labels. This works by
# 1. Finding all the complete-word prefixes between every pair of labels that
#    appear at least 3 times.
# 2. For each prefix, extract candidate common labels after removing the prefix
#    from labels.
# 3. For each set of candidate common labels, form stacking groups
# 4. Score stacking groups based on size, removing ones with many missing values
# 5. Choose common labels with highest score
# See unit test for automaticCommonLabels in test-stacking.R
automaticCommonLabels <- function(input.data.set.metadata)
{
    v.names <- input.data.set.metadata$variable.names
    v.labels <- input.data.set.metadata$variable.labels

    words <- strsplit(v.labels," ")
    is.multiple.words <- vapply(words, length, integer(1)) > 1
    words <- words[is.multiple.words]
    first.words <- vapply(words, `[`, character(1), 1)

    prefixes <- character(0)
    prefix.count <- integer(0)

    # Step 1
    for (i in 1:(length(first.words) - 1))
    {
        words.i <- words[[i]]
        n.words.i <- length(words.i)
        ind <- (i + 1):length(first.words)
        match.ind <- ind[first.words[i] == first.words[ind]]
        for (j in match.ind)
        {
            words.j <- words[[j]]

            min.ind <- 1:min(n.words.i, length(words.j))
            last.match <- match(TRUE, words.i[min.ind] != words.j[min.ind]) - 1
            prefix <- if (!is.na(last.match))
                paste(words.i[seq_len(last.match)], collapse = " ")
            else
                next # not really a prefix since one is a subset of another

            prefix.ind <- match(prefix, prefixes)
            if (is.na(prefix.ind))
            {
                prefixes <- c(prefixes, prefix)
                prefix.count <- c(prefix.count, 1)
            }
            else
                prefix.count[prefix.ind] <- prefix.count[prefix.ind] + 1
        }
    }
    prefixes <- prefixes[prefix.count > 2]

    # Step 2
    candidate.common.labels <- lapply(prefixes, commonLabelsByRemovingPrefix,
                                      v.labels)
    candidate.common.labels <- candidate.common.labels[!duplicated(candidate.common.labels)]

    # Step 3 and 4
    score <- vapply(candidate.common.labels, function(common.labels)
    {
        if (length(common.labels) == 1)
            return(-Inf)

        stacking.groups <- stackingGroupFromCommonLabels(common.labels,
                                                         v.names, v.labels)
        if (nrow(stacking.groups) == 0)
            return(-Inf)

        whole.column.missing <- apply(stacking.groups, 2,
                                      function(column) all(is.na(column)))
        stacking.groups <- stacking.groups[, !whole.column.missing, drop = FALSE]

        if (sum(!is.na(stacking.groups)) / length(stacking.groups) < 0.75)
            return(-Inf)

        sum(!is.na(stacking.groups))
    }, numeric(1))

    if (length(score) == 0 || max(score) == -Inf)
    {
        warning("No stacking was performed with common labels as they could not be found automatically. ",
                "To stack with common labels, specify them with reference variables or manually. ",
                "Otherwise specify stacking manually.")
        return(NULL)
    }

    # Step 5
    common.labels <- candidate.common.labels[[which.max(score)]]
    stacking.groups <- stackingGroupFromCommonLabels(common.labels,
                                                     v.names, v.labels)
    whole.column.missing <- apply(stacking.groups, 2,
                                  function(column) all(is.na(column)))
    list(common.labels[!whole.column.missing])
}

commonLabelsByRemovingPrefix <- function(prefix, v.labels)
{
    prefix <- paste0(prefix, " ")
    nchar.prefix <- nchar(prefix)
    lbls.with.prefix <- v.labels[substr(v.labels, 1, nchar.prefix) == prefix]
    common.labels <- substr(lbls.with.prefix, nchar.prefix + 1,
                            max(nchar(lbls.with.prefix)))
    common.labels <- unique(common.labels)
}

# Given a user-input string of variable names, parse it and extract common
# labels from the variable labels by removing common prefixes and suffixes
# See unit tests for commonLabelsFromReferenceVars in test-stacking.R
commonLabelsFromReferenceVars <- function(reference.variables.to.stack,
                                          input.data.set.metadata)
{
    if (is.null(reference.variables.to.stack) ||
        length(reference.variables.to.stack) == 0)
    {
        warning("No reference variables to stack were supplied for common labels. ",
                "No stacking was conducted using common labels.")
        return(NULL)
    }
    common.labels.list <- list()
    for (i in seq_along(reference.variables.to.stack))
    {
        common.labels <- commonLabelsFromASetOfReferenceVars(reference.variables.to.stack[i],
                                                             input.data.set.metadata)
        if (!is.null(common.labels))
            common.labels.list <- c(common.labels.list, list(common.labels))
    }

    if (length(common.labels.list) > 0)
        common.labels.list
    else
    {
        warning("No common labels could be extracted from the input reference variables.")
        NULL
    }
}

# Extract common labels from a single set of reference variables.
# See unit tests for commonLabelsFromASetOfReferenceVars in test-stacking.R
commonLabelsFromASetOfReferenceVars <- function(ref.vars.to.stack.text,
                                                input.data.set.metadata)
{
    split.text <- splitByComma(ref.vars.to.stack.text)

    if (length(split.text) == 0)
        return(NULL)

    on.fail.msg <- paste0("Common labels could not be obtained from the input '",
                          ref.vars.to.stack.text, "'.")

    v.names <- input.data.set.metadata$variable.names
    v.labels <- input.data.set.metadata$variable.labels

    parsed.var.names <- character(0)
    for (t in split.text)
    {
        parsed <- if (grepl("-", t, fixed = TRUE)) # contains range
            parseVariableRange(t, v.names, "common labels", on.fail.msg)
        else if (grepl("*", t, fixed = TRUE)) # contains wildcard
            parseVariableWildcard(t, v.names, "common labels", on.fail.msg)
        else
            parseVariableName(t, v.names, "common labels",
                              on.fail.msg)

        if (length(parsed) == 0)
            return(NULL)

        parsed.var.names <- c(parsed.var.names, parsed)
    }

    if (length(parsed.var.names) == 1)
    {
        warning("Only one variable is present in the input '",
                ref.vars.to.stack.text,
                "' for extracting common labels. ",
                "It has been ignored as more than one variable is required.")
        return(NULL)
    }

    lbls.containing.common.lbls <- vapply(parsed.var.names, function(nm) {
        v.labels[match(nm, v.names)]
    }, character(1))

    nchar.prefix <- nchar(getCommonPrefix(lbls.containing.common.lbls, whole.words = TRUE))
    nchar.suffix <- nchar(getCommonSuffix(lbls.containing.common.lbls, whole.words = TRUE))

    vapply(lbls.containing.common.lbls, function(lbl) {
        substr(lbl, nchar.prefix + 1, nchar(lbl) - nchar.suffix)
    }, character(1), USE.NAMES = FALSE)
}

# Tidy up user-input common labels and check for issues
tidyManualCommonLabels <- function(manual.common.labels)
{
    if (is.null(manual.common.labels))
    {
        warning("No common labels were manually supplied. ",
                "No stacking was conducted using common labels.")
        return(NULL)
    }

    common.labels.list <- lapply(seq_along(manual.common.labels), function(i) {
        lbls <- manual.common.labels[[i]]
        lbls <- trimws(lbls)
        lbls <- lbls[lbls != ""]
        if (length(lbls) == 0)
        {
            warning("Set ", i,
                    " of the manually-entered common labels does not contain any labels. ",
                    "Ensure that the common labels are correctly entered.")
            return(NULL)
        }
        else if (length(lbls) == 1)
        {
            warning("Set ", i,
                    " of the manually-entered common labels contains only one label when more than one is required.")
            return(NULL)
        }
        lbls
    })

    common.labels.list <- common.labels.list[!vapply(common.labels.list,
                                                     is.null, logical(1))]
    if (length(common.labels.list) > 0)
        common.labels.list
    else
        NULL
}

# Perform stacking given a character vector of common labels.
# A matrix of variable indices (stacking.group) is returned where each row
# represents a single stacking of variables into one variable.
# See unit test for stackWithCommonLabels in test-stacking.R
stackWithCommonLabels <- function(common.labels.list, input.data.set.metadata)
{
    if (is.null(common.labels.list) || length(common.labels.list) == 0)
        return(NULL)

    v.names <- input.data.set.metadata$variable.names
    v.labels <- input.data.set.metadata$variable.labels
    v.types <- input.data.set.metadata$variable.types
    v.val.attr <- input.data.set.metadata$variable.value.attributes

    stacking.groups.list <- lapply(common.labels.list,
                                   stackingGroupFromCommonLabels,
                                   v.names, v.labels)

    # Combine stacking groups in stacking.groups.list
    n.stacking <- max(vapply(stacking.groups.list, ncol, integer(1)))
    stacking.groups <- matrix(nrow = 0, ncol = n.stacking)
    for (element in stacking.groups.list)
    {
        for (i in seq_len(nrow(element)))
        {
            rw <- element[i, ]
            rw.without.missing <- removeNA(rw)
            is.overlapping <- apply(stacking.groups, 1, function(group.ind) {
                any(rw.without.missing %in% removeNA(group.ind))
            })
            n.overlapping <- sum(is.overlapping)
            if (n.overlapping == 0)
            {
                new.row <- rep(NA_integer_, n.stacking)
                new.row[seq_along(rw)] <- rw
                stacking.groups <- rbind(stacking.groups, new.row,
                                         deparse.level = 0)
            }
            else if (n.overlapping == 1)
            {
                # Use the new row if it has less missing indices than the row
                # it overlaps with
                if (length(rw.without.missing) >
                    sum(!is.na(stacking.groups[is.overlapping, ])))
                {
                    stacking.groups <- stacking.groups[!is.overlapping, ,
                                                       drop = FALSE]
                    new.row <- rep(NA_integer_, n.stacking)
                    new.row[seq_along(rw)] <- rw
                    stacking.groups <- rbind(stacking.groups, new.row,
                                             deparse.level = 0)
                }
            }
            # else (n.overlapping > 1): don't add row to stacking.groups
        }
    }

    # Remove groups with mismatching variable types and value attributes
    unstackable.ind <- which(apply(stacking.groups, 1, function(ind) {
        ind <- removeNA(ind)
        !allIdentical(v.types[ind]) || !allValueAttributesIdentical(v.val.attr[ind])
    }))
    unstackable.names <- lapply(unstackable.ind, function(ind) {
        v.names[removeNA(stacking.groups[ind, ])]
    })
    if (length(unstackable.names) > 0)
        warning("Some variables could not be stacked due to mismatching variable types or value attributes. ",
                "See Notes section in output for more details.")

    if (length(unstackable.ind) > 0)
        stacking.groups <- stacking.groups[-unstackable.ind, , drop = FALSE]
    attr(stacking.groups, "unstackable.names") <- unstackable.names
    stacking.groups
}

# Creates a stacking group matrix from common labels. This does the
# heavy-lifting for stackWithCommonLabels, i.e., determining which variables
# to stack together.
# This works by finding all the prefixes and suffixes from labels containing
# common labels and grouping together variables with common labels and the same
# prefix and suffix, also considering common prefixes and suffixes in variable
# names if necessary.
# See unit tests for stackingGroupFromCommonLabels in test-stacking.R
stackingGroupFromCommonLabels <- function(common.labels, v.names, v.labels)
{
    v.labels.lowercase <- tolower(v.labels)
    n.common.labels <- length(common.labels)
    common.label.prefixes.suffixes <- list()
    match.ind <- list()

    for (i in seq_len(n.common.labels))
    {
        common.lbl <- common.labels[i]
        matches <- gregexpr(tolower(common.lbl), v.labels.lowercase, # case insensitive match
                            fixed = TRUE)
        ind <- which(vapply(matches, function(m) m[[1]][1], integer(1)) != -1)
        common.label.prefixes.suffixes[[i]] <- t(vapply(ind, function(j) {
            lbl <- v.labels[j]
            start.ind <- matches[[j]][1]
            c(trimws(substr(lbl, 1, start.ind - 1)),
              trimws(substr(lbl, start.ind + nchar(common.lbl), nchar(lbl))))
        }, character(2)))
        match.ind[[i]] <- ind
    }

    unique.prefixes.suffixes <- do.call("rbind", common.label.prefixes.suffixes)
    unique.prefixes.suffixes <- unique.prefixes.suffixes[!duplicated(unique.prefixes.suffixes,
                                                                     MARGIN = 1), , drop = FALSE]
    ord <- do.call("order", data.frame(nchar(unique.prefixes.suffixes)))
    unique.prefixes.suffixes <- unique.prefixes.suffixes[ord, , drop = FALSE]

    stacking.groups <- matrix(nrow = 0, ncol = n.common.labels)
    for (i in seq_len(nrow(unique.prefixes.suffixes)))
    {
        prefix.suffix <- unique.prefixes.suffixes[i, ]
        common.labels.ind <- lapply(seq_len(n.common.labels), function(j) {
            common.label.prefix.suffix <- common.label.prefixes.suffixes[[j]]
            ind <- which(colSums(t(common.label.prefix.suffix) == prefix.suffix) == 2)
            if (length(ind) > 0)
                match.ind[[j]][ind]
            else
                NULL
        })

        new.rows <- matchIndicesBasedOnName(common.labels.ind, v.names)
        for (j in seq_len(nrow(new.rows)))
        {
            if (!(any(removeNA(new.rows[j, ]) %in% removeNA(stacking.groups))))
                stacking.groups <- rbind(stacking.groups, new.rows[j, ])
        }
    }

    # Remove groups with only one element
    stacking.groups <- stacking.groups[rowSums(!is.na(stacking.groups)) > 1, ,
                                       drop = FALSE]

    stacking.groups
}

# This function is needed by stacking with common labels because sometimes
# multiple variables have the same common label, label prefix and label suffix.
# In such a situation, it is not clear how to group variables just from the
# labels. We therefore fall back on variable names to finalize the grouping.

# The elements of ind.list correspond to a set of common labels and each
# element contains indices of names that have the same common label, label
# prefix and label suffix. The indices refer to the variables whose names are
# in nms.

# This function returns a matrix where each row contains the indices of
# a group of matched variables to be stacked together. A group is created by
# selecting one or zero variable indices from each element of ind.list such
# that the resulting variable names have common prefixes, suffixes and the
# remaining text is either numbers or letters but not both (since enumerations
# occur with letters or numbers but usually not both).

# See unit tests for matchIndicesBasedOnName in test-stacking.R
matchIndicesBasedOnName <- function(ind.list, nms)
{
    # Trivial case: each element in the list is at most length 1
    if (all(vapply(ind.list, length, integer(1)) < 2))
        return(t(matrix(vapply(ind.list, function(ind) {
            if (length(ind) == 0)
                NA_integer_
            else
                ind[[1]]
        }, integer(1)))))

    n.list <- length(ind.list)
    result <- matrix(nrow = 0, ncol = n.list)
    repeat
    {
        first.name <- NA_character_
        new.row <- rep(NA_integer_, n.list)
        for (i in seq_len(n.list))
        {
            ind <- ind.list[[i]]
            if (length(ind) == 0)
                next

            if (is.na(first.name))
            {
                first.name <- nms[ind[1]]
                new.row[i] <- ind[1]
                ind.list[[i]] <- ind.list[[i]][-1]
                next
            }

            for (j in seq_along(ind))
            {
                nm <- nms[ind[j]]

                prefix <- getCommonPrefixTwoNames(c(first.name, nm))
                suffix <- getCommonSuffixTwoNames(c(first.name, nm))

                nchar.prefix <- nchar(prefix)
                nchar.suffix <- nchar(suffix)

                if (nchar.prefix == 0)
                    next

                # prefix and suffix removed
                middle <- substr(nm, nchar.prefix + 1,
                                 nchar(nm) - nchar.suffix)

                # The middle part of variable names to stack needs to be either
                # numbers or letters (or empty) and not both
                if (grepl("^[[:digit:]]*$", middle) ||
                    grepl("^[[:alpha:]]*$", middle))
                {
                    new.row[i] <- ind[j]
                    ind.list[[i]] <- ind.list[[i]][-j]
                    break
                }
            }
        }
        result <- rbind(result, new.row, deparse.level = 0)

        if (all(vapply(ind.list, length, integer(1)) == 0))
            break
    }
    result
}

# Parse user-input manual stacking strings and append the variables to manually
# stack to stacking.groups. Most of the work is done by either
# stackingSpecifiedByVariable or stackingSpecifiedByObservation.
# See unit tests for stackManually in test-stacking.R
stackManually <- function(manual.stacking, specify.by, input.data.set.metadata)
{
    if (is.null(manual.stacking) || length(manual.stacking) == 0 ||
        setequal(manual.stacking, ""))
        return(NULL)

    manual.stacking.groups <- if (specify.by == "Variable")
        stackingSpecifiedByVariable(manual.stacking, input.data.set.metadata)
    else
        stackingSpecifiedByObservation(manual.stacking,
                                       input.data.set.metadata)
}

# See unit tests for permittedNA in test-stacking.R
permittedNA <- function(variable.names)
{
    na.ind <- match("NA", variable.names)
    has.na.variable <- !is.na(na.ind)
    if (has.na.variable)
    {
        warning("There is an input variable named 'NA'. ",
                "To avoid confusion, missing stacking variables need to be specified with an extra slash for this data set, i.e., N/A")
        "N/A"
    }
    else
        c("NA", "N/A")
}

# Parse user-input manual stacking strings for stacking specified by variable,
# i.e., each string in manual.stacking describes the variables to be stacked
# into one variable.
# See unit tests for stackingSpecifiedByVariable in test-stacking.R
#' @return matrix with one row per group of variables to be stacked. Number of columns
#' is equal to the size of the largest group of variables, smaller groups will
#' have NA values in their last columns
stackingSpecifiedByVariable <- function(manual.stacking,
                                        input.data.set.metadata)
{
    v.names <- input.data.set.metadata$variable.names
    v.types <- input.data.set.metadata$variable.types
    v.val.attr <- input.data.set.metadata$variable.value.attributes

    permitted.na <- permittedNA(v.names)

    manual.stacking.groups.list <- list()
    manual.stacking.groups.text <- character(0)

    for (input.text in manual.stacking)
    {
        on.fail.msg <- paste0("The manual stacking input '", input.text,
                              "' has been ignored.")
        split.text <- splitByComma(input.text)

        group.names <- character(0)

        for (t in split.text)
        {
            parsed <- if (t %in% permitted.na)
                NA_character_
            else if (grepl("-", t, fixed = TRUE)) # contains range
                parseVariableRange(t, v.names, "manual stacking", on.fail.msg)
            else if (grepl("*", t, fixed = TRUE)) # contains wildcard
                parseVariableWildcard(t, v.names, "manual stacking", on.fail.msg)
            else
                parseVariableName(t, v.names, "manual stacking",
                                  on.fail.msg)
            if (length(parsed) == 0)
            {
                group.names <- NULL
                break
            }
            else
                group.names <- c(group.names, parsed)
        }
        if (is.null(group.names) || length(group.names) == 0)
            next

        if (all(is.na(group.names)))
        {
            warning("The manual stacking input '", input.text,
                    "' has been ignored as it does not contain any variables.")
            next
        }

        # Check for duplicate variables
        dup <- duplicated(removeNA(group.names))
        if (any(dup))
        {
            warning("The manual stacking input '", input.text,
                    "' has been ignored as it contains duplicate entries for ",
                    paste0("'", removeNA(group.names)[dup], "'",
                           collapse = ", "), ".")
            next
        }

        # Remove trailing NA
        ind <- which(!is.na(group.names))
        group.names <- group.names[seq_len(ind[length(ind)])]

        group.ind <- vapply(group.names, match, integer(1), v.names)

        # Check for mismatching variable types and value attributes
        if (!allIdentical(v.types[removeNA(group.ind)]) ||
            !isValueAttributesMergable(v.val.attr[removeNA(group.ind)], input.text))
        {
            warning("The manual stacking input '", input.text,
                    "' has been ignored as it contains variables with mismatching types or value attributes. ",
                   "Every value must have identical labels in each variable it appears in.")
            next
        }

        # Check for overlap with previous manual stacking inputs
        if (length(manual.stacking.groups.list) > 0)
        {
            group.ind.without.na <- removeNA(group.ind)
            overlap.ind <- which(vapply(manual.stacking.groups.list, function(manual.group) {
                any(group.ind.without.na %in% manual.group)
            }, logical(1)))
            if (length(overlap.ind) > 0)
            {
                warning("The manual stacking input '", input.text,
                        "' has been ignored as it contains variable(s) that overlap with another manual stacking input '",
                        manual.stacking.groups.text[overlap.ind[1]], "'.")
                next
            }
        }

        manual.stacking.groups.list <- c(manual.stacking.groups.list, list(group.ind))
        manual.stacking.groups.text <- c(manual.stacking.groups.text,
                                         input.text)
    }

    if (length(manual.stacking.groups.list) == 0)
        return(NULL)

    n.stacking <- max(vapply(manual.stacking.groups.list, length, integer(1)))
    do.call("rbind", lapply(manual.stacking.groups.list, function(group) {
        rw <- rep(NA_integer_, n.stacking)
        rw[seq_along(group)] <- group
        rw
    }))
}

# Parse user-input manual stacking strings for stacking specified by
# observation, i.e., each string in manual.stacking describes the variables in
# an observation to be used in stackings of variables.
# See unit tests for stackingSpecifiedByObservation in test-stacking.R
stackingSpecifiedByObservation <- function(manual.stacking,
                                           input.data.set.metadata)
{
    if (length(manual.stacking) < 2)
    {
        warning("No manual stacking was conducted as 2 or more manual stacking inputs (corresponding to obvservations) are required.")
        return(NULL)
    }

    v.names <- input.data.set.metadata$variable.names
    v.types <- input.data.set.metadata$variable.types
    v.val.attr <- input.data.set.metadata$variable.value.attributes

    permitted.na <- permittedNA(v.names)

    # obs.group = observation group, which is a group of variables be stacked
    # belonging to an observation, instead of a variable (in which case I just
    # use 'group').
    manual.stacking.obs.groups.list <- list()
    manual.stacking.obs.groups.text <- list()

    for (input.text in manual.stacking)
    {
        on.fail.msg <- paste0("No manual stacking was conducted.")
        split.text <- splitByComma(input.text)
        obs.group.names <- character(0)

        for (t in split.text)
        {
            parsed <- if (t %in% permitted.na)
                NA_character_
            else if (grepl("-", t, fixed = TRUE)) # contains range
                parseVariableRange(t, v.names, "manual stacking", on.fail.msg)
            else if (grepl("*", t, fixed = TRUE)) # contains wildcard
                parseVariableWildcard(t, v.names, "manual stacking", on.fail.msg)
            else
                parseVariableName(t, v.names, "manual stacking",
                                  on.fail.msg)
            if (length(parsed) == 0)
                return(NULL)
            else
                obs.group.names <- c(obs.group.names, parsed)
        }

        if (all(is.na(obs.group.names)))
        {
            warning("No manual stacking was conducted as the manual stacking input '",
                    input.text, "' does not contain any variables.")
            return(NULL)
        }

        # Check for duplicate variables
        dup <- duplicated(removeNA(obs.group.names))
        if (any(dup))
        {
            warning("No manual stacking was conducted as the manual stacking input '",
                    input.text, "' contains duplicate entries for ",
                    paste0("'", removeNA(obs.group.names)[dup], "'",
                           collapse = ", "), ".")
            return(NULL)
        }

        # Remove trailing NA
        ind <- which(!is.na(obs.group.names))
        obs.group.names <- obs.group.names[seq_len(ind[length(ind)])]

        obs.group.ind <- vapply(obs.group.names, match, integer(1),
                                v.names)

        # Check for overlap with previous manual stacking inputs
        if (length(manual.stacking.obs.groups.list) > 0)
        {
            obs.group.ind.without.na <- removeNA(obs.group.ind)
            overlap.ind <- which(vapply(manual.stacking.obs.groups.list, function(manual.obs.group) {
                any(obs.group.ind.without.na %in% manual.obs.group)
            }, logical(1)))
            if (length(overlap.ind) > 0)
            {
                warning("No manual stacking was conducted as the manual stacking input '",
                        input.text, "' contains variable(s) that overlap with another manual stacking input '",
                        manual.stacking.obs.groups.text[overlap.ind[1]], "'.")
                return(NULL)
            }
        }

        manual.stacking.obs.groups.list <- c(manual.stacking.obs.groups.list, list(obs.group.ind))
        manual.stacking.obs.groups.text <- c(manual.stacking.obs.groups.text,
                                             input.text)
    }

    if (length(manual.stacking.obs.groups.list) == 0)
        return(NULL)

    n.stacked.var <- max(vapply(manual.stacking.obs.groups.list, length, integer(1)))
    manual.stacking.groups <- do.call("cbind", lapply(manual.stacking.obs.groups.list, function(obs.group) {
        column <- rep(NA_integer_, n.stacked.var)
        column[seq_along(obs.group)] <- obs.group
        column
    }))

    # Check for mismatching variable types and value attributes
    for (i in seq_len(nrow(manual.stacking.groups)))
    {
        group.ind <- removeNA(manual.stacking.groups[i, ])
        variable.names <- paste0(v.names[group.ind], collapse = ", ")
        if (!allIdentical(v.types[group.ind]) ||
            !isValueAttributesMergable(v.val.attr[group.ind], variable.names))
        {
            warning("No manual stacking was conducted as the following variables to be stacked ",
                    "have mismatching types or value attributes: ",
                    variable.names, ".")
            return(NULL)
        }
    }
    manual.stacking.groups
}

# Value attributes are mergable if they are consistent with each other
# (i.e. the value labels of their common values match).
isValueAttributesMergable <- function(v.val.attrs, variable.names)
{
    combined.val.attr <- do.call('c', v.val.attrs)
    names(combined.val.attr) <- unlist(lapply(v.val.attrs,
                                               function(v.val.attr) names(v.val.attr)),
                                        use.names = FALSE)

    # Check that there aren't multiple names per value
    unique.vals <- unique(combined.val.attr)
    for (v in unique.vals) {
        nms <- unique(names(combined.val.attr)[v == combined.val.attr])
        if (length(nms) > 1) {
            return(FALSE)
        }
    }

    # Check that there aren't multiple values per name
    unique.names <- unique(names(combined.val.attr))
    for (nm in unique.names) {
        vals <- unique(combined.val.attr[nm == names(combined.val.attr)])
        if (length(vals) > 1) {
            warning("Identical labels are used for distinct values in the variables ",
                    variable.names, ". Please check that the values and labels are correct ",
                    " for each variable. The labels will be made unique in the ",
                    "output data file by appending the corresponding value to ",
                    "each label. They can be edited via the object inspector ",
                    "after importing the stacked data file.")

            break
        }
    }

    return(TRUE)
}

# See unit tests for mergeCommonLabelAndManualStackingGroups in test-stacking.R
mergeCommonLabelAndManualStackingGroups <- function(common.label.stacking.groups,
                                                    manual.stacking.groups)
{
    if (is.null(common.label.stacking.groups))
    {
        if (is.null(manual.stacking.groups))
            return(NULL)

        stacking.groups <- manual.stacking.groups
        attr(stacking.groups, "is.manually.stacked") <- rep(TRUE, nrow(stacking.groups))
        return(stacking.groups)
    }

    if (is.null(manual.stacking.groups))
    {
        stacking.groups <- common.label.stacking.groups
        attr(stacking.groups, "is.manually.stacked") <- rep(FALSE, nrow(stacking.groups))
        return(stacking.groups)
    }

    # Extract attribute now as it will be lost if we modify
    # common.label.stacking.groups
    unstackable.names <- attr(common.label.stacking.groups,
                              "unstackable.names")

    # Remove rows in common.label.stacking.groups that contain overlap with
    # manual.stacking.groups
    for (manual.group in manual.stacking.groups)
    {
        manual.group.without.na <- removeNA(manual.group)
        is.overlapping <- apply(common.label.stacking.groups, 1, function(group) {
            any(manual.group.without.na %in% group)
        })
        common.label.stacking.groups <- common.label.stacking.groups[!is.overlapping, , drop = FALSE]
    }

    n.row.c <- nrow(common.label.stacking.groups)
    n.col.c <- ncol(common.label.stacking.groups)
    n.row.m <- nrow(manual.stacking.groups)
    n.col.m <- ncol(manual.stacking.groups)

    # We still merge the two groups even if the number of variables to stack
    # does not match. If this is not what the user intended, the output should
    # make this clear enough, so a warning isn't shown.
    n.stacking <- max(n.col.c, n.col.m)
    n.stacked.variables <- n.row.c + n.row.m
    stacking.groups <- matrix(NA_integer_,  nrow = n.stacked.variables,
                              ncol = n.stacking)
    stacking.groups[seq_len(n.row.c), seq_len(n.col.c)] <- common.label.stacking.groups
    stacking.groups[n.row.c + seq_len(n.row.m), seq_len(n.col.m)] <- manual.stacking.groups

    is.manually.stacked <- c(rep(FALSE, n.row.c), rep(TRUE, n.row.m))
    attr(stacking.groups, "is.manually.stacked") <- is.manually.stacked
    attr(stacking.groups, "unstackable.names") <- unstackable.names
    stacking.groups
}

parseVariablesToInclude <- function(variables.to.include,
                                    input.data.set.metadata,
                                    stacking.groups)
{
    if (length(variables.to.include) == 0 || setequal(variables.to.include, ""))
        return(character(0))

    v.names <- input.data.set.metadata$variable.names
    purpose <- "included variable"

    result <- character(0)
    for (i in seq_along(variables.to.include))
    {
        split.text <- trimws(strsplit(variables.to.include[i], ",")[[1]])
        split.text <- split.text[split.text != ""]

        for (t in split.text)
        {
            parsed <- if (grepl("-", t, fixed = TRUE)) # contains range
                parseVariableRange(t, v.names, purpose,
                                   "The input range has been ignored.")
            else if (grepl("*", t, fixed = TRUE)) # contains wildcard
                parseVariableWildcard(t, v.names, purpose,
                                      "This input has been ignored.")
            else
                parseVariableName(t, v.names, purpose,
                                  "This input has been ignored.")

            result <- c(result, parsed)
        }
    }

    stacked.variable.names <- v.names[removeNA(c(stacking.groups))]
    result[!(result %in% stacked.variable.names)]
}

# Constructs the stacked data set as a data frame from the input data set and
# the stacking.groups matrix
stackedDataSet <- function(input.data.set, input.data.set.metadata,
                           stacking.groups, include.original.case.variable,
                           include.observation.variable,
                           common.labels.list, included.variable.names)
{
    input.v.names <- input.data.set.metadata$variable.names
    input.v.labels <- input.data.set.metadata$variable.labels

    has.stacking <- !is.null(stacking.groups) && nrow(stacking.groups) > 0

    if (!has.stacking)
        return(data.frame())

    n.stacked <- ncol(stacking.groups)
    first.ind <- apply(stacking.groups, 1, function(rw) removeNA(rw)[1])
    is.manually.stacked <- attr(stacking.groups, "is.manually.stacked")

    stacked.data.set <- list()
    for (i in seq_along(input.v.names))
    {
        ind <- match(i, first.ind)
        if (!is.na(ind))
        {
            group.ind <- stacking.groups[ind, ]
            v <- unlist(lapply(group.ind, function(j) {
                vals <- if (!is.na(j))
                    input.data.set[[j]]
                else
                    rep(NA, nrow(input.data.set))
                if (isIntegerValued(vals))
                    as.integer(vals)
                else
                    vals
            }))
            nm <- stackedVariableName(group.ind, input.v.names, names(stacked.data.set))
            v <- c(matrix(v, ncol = nrow(input.data.set), byrow = TRUE))
            attr(v, "is.stacked") <- TRUE
            attr(v, "is.manually.stacked") <- is.manually.stacked[ind]
            attr(v, "stacking.input.variable.names") <- input.v.names[group.ind]
            attr(v, "stacking.input.variable.labels") <- input.v.labels[group.ind]
            attr(v, "label") <- stackedVariableLabel(group.ind, input.v.labels, nm)
            val.attr <- stackedValueAttributes(group.ind,
                                               input.data.set.metadata$variable.value.attributes)
            if (!is.null(val.attr))
            {
                if (is.integer(v))
                {
                    nms <- names(val.attr)
                    val.attr <- as.integer(val.attr)
                    names(val.attr) <- nms
                }
                attr(v, "labels") <- val.attr
                class(v) <- c(class(v), "haven_labelled")
            }

            stacked.data.set[[nm]] <- v
        }
        else if (input.v.names[i] %in% included.variable.names)
        {
            input.var <- input.data.set[[i]]
            val.attr <- attr(input.var, "labels", exact = TRUE)
            if (isIntegerValued(input.var))
            {
                input.var <- as.integer(input.var)
            }
            
            v <- rep(input.var, each = n.stacked)
            attr(v, "is.stacked") <- FALSE
            attr(v, "is.manually.stacked") <- NA
            attr(v, "label") <- input.v.labels[i]
            if (!is.null(val.attr))
            {
                if (is.integer(v))
                {
                    nms <- names(val.attr)
                    val.attr <- as.integer(val.attr)
                    names(val.attr) <- nms
                }
                attr(v, "labels") <- val.attr
                class(v) <- c(class(v), "haven_labelled")
            }
            nm <- uniqueName(input.v.names[i], names(stacked.data.set))
            stacked.data.set[[nm]] <- v
        }
    }

    if (include.original.case.variable && has.stacking)
    {
        original.case <- rep(seq_len(nrow(input.data.set)), each = n.stacked)
        attr(original.case, "label") <- "Original case number (pre-stacking)"
        attr(original.case, "is.stacked") <- FALSE
        attr(original.case, "is.manually.stacked") <- NA
        attr(original.case, "is.original.case") <- TRUE
        stacked.data.set[[uniqueName("original_case",
                                     names(stacked.data.set))]] <- original.case
    }

    if (include.observation.variable && has.stacking)
    {
        observation <- rep(seq_len(n.stacked), nrow(input.data.set))
        attr(observation, "label") <- "Observation # (from stacking)"

        if (!is.null(common.labels.list))
        {
            common.labels <- common.labels.list[[1]]
            val.attr <- seq_len(n.stacked)
            names(val.attr) <- paste0("Observation ", seq_len(n.stacked))
            names(val.attr)[seq_along(common.labels)] <- common.labels
            attr(observation, "labels") <- val.attr
        }
        attr(observation, "is.stacked") <- FALSE
        attr(observation, "is.manually.stacked") <- NA
        attr(observation, "is.observation") <- TRUE

        stacked.data.set[[uniqueName("observation",
                                     names(stacked.data.set))]] <- observation
    }

    stacked.data.set <- data.frame(stacked.data.set, check.names = FALSE)
    colnames(stacked.data.set) <- sanitizeSPSSVariableNames(colnames(stacked.data.set))
    stacked.data.set
}

stackedVariableName <- function(group.ind, input.variable.names, taken.names)
{
    ind <- removeNA(group.ind)
    nm <- input.variable.names[ind]
    common.prefix <- trimws(getCommonPrefix(nm))
    common.suffix <- trimws(getCommonSuffix(nm))
    candidate <- trimws(paste0(common.prefix, common.suffix))
    if (candidate == "")
        candidate <- "stacked_var"

    uniqueName(candidate, taken.names, "_")
}

stackedVariableLabel <- function(group.ind, input.variable.labels, stacked.variable.name)
{
    ind <- removeNA(group.ind)
    lbl <- input.variable.labels[ind]
    common.prefix <- trimws(getCommonPrefix(lbl, whole.words = TRUE))
    common.suffix <- trimws(getCommonSuffix(lbl, whole.words = TRUE))
    if (common.prefix == "" && common.suffix == "")
        lbl[1]
    else if (common.prefix == common.suffix)
        common.prefix
    else if (nchar(common.prefix) <= 1 && nchar(common.suffix) <= 1)
        lbl[1]
    else
        trimws(paste(common.prefix, common.suffix))
}

# Assume that value attributes are mergeable (isValueAttributesMergable or
# allValueAttributesIdentical has been run and returned TRUE)
stackedValueAttributes <- function(group.ind, input.value.attributes)
{
    v.val.attrs <- input.value.attributes[removeNA(group.ind)]
    combined.val.attrs <- do.call('c', v.val.attrs)

    names(combined.val.attrs) <- unlist(lapply(v.val.attrs,
                                               function(v.val.attr)
                                                   names(deduplicateValAttrNames(v.val.attr))),
                                        use.names = FALSE)

    is.dup <- duplicated(combined.val.attrs)
    sort(combined.val.attrs[!is.dup])
}

#' Deduplicates the names of a list of value attributes
#' by appending the value in square brackets to the labels
#' @examples
#' stopifnot(deduplicateValAttrNames(list(Dislike = 0, Dislike = 1)) ==
#'   list("Dislike [Value: 0]" = 0, "Dislike [Value: 1]" = 1))
#' @noRd
deduplicateValAttrNames <- function(var.value.attrs)
{
    combined.names <- names(var.value.attrs)
    if (anyDuplicated(combined.names))
    {
        duplicated.idx <- duplicated(combined.names)
        duplicated.idx <- names(var.value.attrs) %in%
            names(var.value.attrs[duplicated.idx])
        duplicated.attr <- var.value.attrs[duplicated.idx]
        duplicated.attr.names <- paste0(names(duplicated.attr),
                                         " [Value: ",
                                         duplicated.attr,
                                         "]")
        names(var.value.attrs)[duplicated.idx] <- duplicated.attr.names
    }
    return(var.value.attrs)
}

# Common prefix from a character vector of names.
# If whole.words is TRUE, the prefix is truncated so that it does not end
# halfway into a word or number.
getCommonPrefix <- function(nms, whole.words = FALSE)
{
    if (allIdentical(nms))
        return(nms[1])

    common_prefix <- ""
    for (i in 1:min(nchar(nms)))
    {
        if (allIdentical(vapply(tolower(nms), substr, character(1), 1, i)))
            common_prefix <- substr(nms[1], 1, i)
        else
            break
    }

    if (whole.words)
    {
        ind <- gregexpr("[^[:alnum:]]", common_prefix)[[1]]
        if (setequal(ind, -1))
            ""
        else
            substr(common_prefix, 1, ind[length(ind)])
    }
    else
        common_prefix
}

# Faster than getCommonPrefix but only works on character vectors of length 2
getCommonPrefixTwoNames <- function(nms)
{
    n.chars <- nchar(nms)
    min.n.chars <- min(n.chars)
    split.nms <-  strsplit(nms, "")
    char.ind <- seq_len(min.n.chars)
    last.ind <- match(TRUE, split.nms[[1]][char.ind] != split.nms[[2]][char.ind]) - 1
    if (is.na(last.ind)) # one is a subset of the other
        nms[which.min(n.chars)]
    else
        paste0(split.nms[[1]][seq_len(last.ind)], collapse = "")
}

# Faster than getCommonSuffix but only works on character vectors of length 2
getCommonSuffixTwoNames <- function(nms)
{
    n.chars <- nchar(nms)
    min.n.chars <- min(n.chars)
    split.nms <- lapply(strsplit(nms, ""), rev)
    char.ind <- seq_len(min.n.chars)
    first.ind <- match(TRUE, split.nms[[1]][char.ind] != split.nms[[2]][char.ind]) - 1
    if (is.na(first.ind)) # one is a subset of the other
        nms[which.min(n.chars)]
    else
        paste0(rev(split.nms[[1]][seq_len(first.ind)]), collapse = "")
}

# Common suffix from a character vector of names.
# If whole.words is TRUE, the prefix is truncated so that it does not end
# halfway into a word or number.
getCommonSuffix <- function(nms, whole.words = FALSE)
{
    if (allIdentical(nms))
        return(nms[1])

    common_suffix <- ""
    for (i in 1:min(nchar(nms)))
    {
        suffixes <- vapply(tolower(nms), function(nm) {
            substr(nm, nchar(nm) - i + 1, nchar(nm))
        }, character(1))
        if (allIdentical(suffixes))
            common_suffix <- substr(nms[1], nchar(nms[1]) - i + 1, nchar(nms[1]))
        else
            break
    }

    if (whole.words)
    {
        ind <- gregexpr("[^[:alnum:]]", common_suffix)[[1]]
        if (setequal(ind, -1))
            ""
        else
            substr(common_suffix, ind[1], nchar(common_suffix))
    }
    else
        common_suffix
}

# Parses a user-input variable range
# See unit tests for parseVariableRange in test-stacking.R
parseVariableRange <- function(range.text, variable.names, purpose,
                               on.fail.msg)
{
    dash.ind <- match("-", strsplit(range.text, "")[[1]])
    start.var.text <- trimws(substr(range.text, 1, dash.ind - 1))
    end.var.text <- trimws(substr(range.text, dash.ind + 1, nchar(range.text)))
    if (grepl("*", start.var.text, fixed = TRUE))
    {
        warning("The start variable from the ", purpose, " input range '",
                range.text,
                "' contains the wildcard character '*' which is not permitted in a range. ",
                on.fail.msg)
        return(character(0))
    }
    if (grepl("*", end.var.text, fixed = TRUE))
    {
        warning("The end variable from the ", purpose, " input range '",
                range.text,
                "' contains the wildcard character '*' which is not permitted in a range. ",
                on.fail.msg)
        return(character(0))
    }

    start.ind <- if (start.var.text != "")
        match(start.var.text, variable.names)
    else
        1

    end.ind <- if (end.var.text != "")
        match(end.var.text, variable.names)
    else
        length(variable.names)

    if (is.na(start.ind))
    {
        warning("The start variable from the ", purpose, " input range '",
                range.text, "' ", "could not be identified. ", on.fail.msg,
                " Ensure that the variable name is correctly specified.")
        result <- character(0)
        attr(result, "is.not.found") <- TRUE
        return(result)
    }
    if (is.na(end.ind))
    {
        warning("The end variable from the ", purpose, " input range '",
                range.text, "' ", "could not be identified. ", on.fail.msg,
                " Ensure that the variable name is correctly specified.")
        result <- character(0)
        attr(result, "is.not.found") <- TRUE
        return(result)
    }
    if (start.ind > end.ind)
    {
        warning("The start variable from the ", purpose, "input range '",
                range.text,
                "' appears after the end variable in the data set. ",
                "Ensure that the range has been correctly specified. ",
                on.fail.msg)
        return(character(0))
    }
    variable.names[start.ind:end.ind]
}

# Parses a user-input variable wildcard
# See unit tests for parseVariableWildcard in test-stacking.R
#' @importFrom flipU EscapeRegexSymbols
parseVariableWildcard <- function(wildcard.text, variable.names, purpose,
                                  on.fail.msg)
{
    ind.asterisk <- match("*", strsplit(wildcard.text, "")[[1]])
    start.var.text <- trimws(substr(wildcard.text, 1, ind.asterisk - 1))
    end.var.text <- trimws(substr(wildcard.text, ind.asterisk + 1,
                                  nchar(wildcard.text)))
    pattern <- paste0("^", EscapeRegexSymbols(start.var.text), ".*",
                      EscapeRegexSymbols(end.var.text), "$")
    is.match <- grepl(pattern, variable.names)
    if (!any(is.match))
    {
        warning("No matches were found for the ", purpose,
                " input wildcard name '", wildcard.text,
                "'. Ensure that the wildcard variable name has been correctly specified. ",
                on.fail.msg)
        result <- character(0)
        attr(result, "is.not.found") <- TRUE
        return(result)
    }
    variable.names[is.match]
}

# Parses a user-input variable name
# See unit tests for parseVariableName in test-stacking.R
parseVariableName <- function(variable.name.text, variable.names, purpose,
                              on.fail.msg)
{
    if (variable.name.text %in% variable.names)
        variable.name.text
    else
    {
        warning("The ", purpose, " input variable name '", variable.name.text,
                "' could not be identified. ", on.fail.msg)
        result <- character(0)
        attr(result, "is.not.found") <- TRUE
        return(result)
    }
}

# Cleans a user-input data set name or creates one from the input data set name
cleanStackedDataSetName <- function(stacked.data.set.name, input.data.set.name)
{
    if (is.null(stacked.data.set.name) || stacked.data.set.name == "")
    {
        if (grepl("\\.sav$", input.data.set.name))
        {
            n <- nchar(input.data.set.name)
            name.without.sav <- substr(input.data.set.name, 1, n - 4)
            return(paste0(name.without.sav, " stacked.sav"))
        }
        else
            return(paste0(input.data.set.name, " stacked.sav"))
    }
    else if (!grepl("\\.sav$", stacked.data.set.name))
        return(paste0(stacked.data.set.name, ".sav"))
    else
        return(stacked.data.set.name)
}

# Creates a list of metadata from a stacked data set
metadataFromStackedDataSet <- function(stacked.data.set, stacked.data.set.name)
{
    result <- metadataFromDataSet(stacked.data.set, stacked.data.set.name)
    result$is.stacked.variable <- vapply(stacked.data.set, attr, logical(1),
                                         "is.stacked")
    result$is.manually.stacked.variable <- vapply(stacked.data.set, attr,
                                                  logical(1),
                                                  "is.manually.stacked")
    result$stacking.input.variable.names <- lapply(stacked.data.set, attr,
                                                   "stacking.input.variable.names")
    result$stacking.input.variable.labels <- lapply(stacked.data.set, attr,
                                                    "stacking.input.variable.labels")
    result
}

#' @importFrom flipFormat StackingWidget
#' @export
print.StackedData <- function(x, ...)
{
    StackingWidget(x$input.data.set.metadata,
                   x$stacked.data.set.metadata,
                   x$unstackable.names,
                   x$common.labels,
                   x$is.saved.to.cloud)
}
NumbersInternational/flipData documentation built on March 2, 2024, 10:52 a.m.