R/util.R

Defines functions load_spss parse_factors atomic_to_fac start_index parse_label parse_labels

Documented in load_spss

#' @title Read SPSS file and parse factors from labels.
#' @name load_spss
#'
#' @description Reads an SPSS file with \code{sjmisc} and then converts
#'              columns to factors.
#'
#' @param path (character), path of .sav file that should be loaded.
#' @param cleanLabel (function), pass through a function to clean the character
#'        string of the column label.
#' @param rm_notAnswered (logical), if \code{TRUE} the 'Not Anwered' label will
#'        not be included in factor levels if there are no instances.
#' @param parse_frame_labels (logical), if \code{TRUE}, will attempt to parse
#'        "label" attribute into separate "label", "topic", and "category"
#'        attributes based on string patterns in SPSS Labels exported by
#'        Verint survey data.
#'
#' @return \code{data.frame} with factors and column attributes \code{label},
#'         \code{topic}, and \code{category}.
#'
#' @note Labels and topics may be truncated if they are too long. This happens
#'       when the .sav file is generated by Verint. SPSS has a maximum length
#'       for variable labels, so those labels will be truncated in the SPSS file
#'       itself.
#'
#' @examples
#' file_path <- "dataset.sav"
#' df <- load_spss(file_path)
#'
#'
#' @export
load_spss <- function(path, cleanLabel = NULL, rm_notAnswered = TRUE,
                      parse_frame_labels = TRUE) {
    require(sjmisc)
    # Use sjmisc to load the data frame, then parse the factors
    # and parse the labels
    df <- sjmisc::read_spss(path, attach.var.labels = TRUE)
    df <- parse_factors(df, cleanLabel, rm_notAnswered)
    if(parse_frame_labels == TRUE) {
        df <- parse_labels(df)
    }
    df

}

parse_factors <- function(df, cleanLabel, rm_notAnswered) {
    # Loop through every column and convert it to a factor, removing the
    # 'Not Answered' level if specified
    df[] <- lapply(df, atomic_to_fac, cleanLabel, rm_notAnswered)
    df
}

atomic_to_fac <- function(vec, cleanLabel = NULL, rm_notAnswered = TRUE) {
    # Converts from a named atomic vector to a factor while preserving
    # variable attributes

    # Only do the conversion if it has a "labels" attribute, which indicates a
    # categorical variable in SPSS
    retVal <- vec

    if(length(attr(vec, "labels")) > 0) {
        # Pull value labels and question label
        labs <- attr(vec, "labels")
        label <- attr(vec, "label")

        # Clean label if function is passed through
        if(!is.null(cleanLabel)) {
            labs <- as.character(sapply(names(labs), cleanLabel))
            label <- cleanLabel(label)
        }
        # Skip "Not Answered" Label under these conditions
        start_index <- start_index(vec, labs, rm_notAnswered)
        # Convert to factor
        labelNames <- names(labs)
        levs <- labs[start_index:length(labelNames)]
        labs <- labelNames[start_index:length(labelNames)]
        retVal <- factor(vec, levels = levs, labels = str_trim(labs))

        # Add question text as attribute to vector
        attr(retVal, "label") <- label
    }

    retVal
}


start_index <- function(vec, labels, rm_notAnswered) {
    # This function will detect if the first or second element should be the
    # start index of the labels vector in converting to a factor
    ifelse(rm_notAnswered == TRUE &&
               labels[1] == "Not Answered" &&
               sum(vec == 0, na.rm=TRUE) == 0, 2, 1)
}

parse_label <- function(vec) {
    require(stringr)
    # Verint combines topics and question text in the SPSS variable labels.
    # This function attempts to separate the question text from the topics and
    # categories and give each its own attribute
    lab <- attr(vec, "label")
    questionAttr <- lab
    topicAttr <- NULL
    catAttr <- NULL

    split1 <- str_split(lab, " \\[Question: ")[[1]]
    split2 <- str_split(lab, " Question: ")[[1]]

    if(length(split1) > 1) {
        questionAttr <- split1[2]
        topicAttr <- split1[1]
    }
    else if(length(split2) > 1) {
        questionAttr <- split2[2]
        topicAttr <- split2[1]
    }

    if(!is.null(topicAttr)) {
        catAttr <- str_trim(str_split(topicAttr, "\\[Topic: ")[[1]][1])
        topicAttr <- str_split(topicAttr, "\\[Topic: ")[[1]][2]
    }

    attr(vec, "label") <- questionAttr
    attr(vec, "topic") <- topicAttr
    attr(vec, "category") <- catAttr
    vec
}

parse_labels <- function(df) {
    # Loop through each column and parse the label attributes
    df[] <- lapply(df, parse_label)
}
m-dinardo/fsquant documentation built on May 21, 2019, 9:17 a.m.