R/apply_dic.R

Defines functions apply_dic

Documented in apply_dic

#' Apply dictionary
#'
#' Joins a data frame with a dictionary file.
#'
#' @param data Data frame
#' @param dic A data frame comprising a dictionary or a character string with a
#'   filename (for now an Microsoft Excel file) containing a dictionary.
#' @param factors If set TRUE, factor variables will be turned into factors.
#' @param set_label_attr If TRUE, label attributes from the haven package are
#'   set. These labels are shown in the Rstudio View panel.
#' @param coerce_class If set TRUE mismatches between class and dic type are
#'   corrected.
#' @param replace_missing If TRUE, missing values from the dic are replaced with
#'   NA
#' @param check_values If TRUE, performs the check_values function on the
#'   variables of the data frame included in the dic file.
#' @param impute_values If TRUE and score_scales is TRUE, missing values are
#'   automatically imputed based on scale information provided in the dic file.
#' @param score_scales If TRUE and the dic files contains score scale
#'   definitions these are applied
#' @param rename_var When a character is provided, corresponding column from the
#'   dic file is used to rename variables from rename_var to item_name.
#' @return A data frame with dictionary information.
#' @examples
#' dat <- apply_dic(dat_itrf, dic_itrf)
#' descriptives(dat)
#' @export

apply_dic <- function(data,
                      dic,
                      factors = TRUE,
                      set_label_attr = TRUE,
                      coerce_class = TRUE,
                      replace_missing = TRUE,
                      score_scales = TRUE,
                      check_values = FALSE,
                      impute_values = FALSE,
                      rename_var = NULL) {

  if (inherits(dic, "character")) dic <- .read_by_suffix(dic)
  if (inherits(data, "character")) data <- .read_by_suffix(data)

  dic <- .clean_dic_file(dic)

# rename variables by rename_var ---------------------------------------------

  if (!is.null(rename_var)) {
    to_from <- setNames(dic[[rename_var]], dic[[.dic_file$item_name]])
    to_from <- to_from[!is.na(to_from)]
    .duplicates <- names(to_from) %in% names(data)
    if(any(.duplicates)){
      message("Skipped renaming column to an already existing name: ")
      message(paste0(names(to_from)[.duplicates], collapse = ", "))
      to_from <- to_from[!.duplicates]
    }
    for(i in 1:length(to_from)) {
      names(data)[which(names(data) == to_from[i])] <- names(to_from[i])
    }
  }

  .filter <- dic[[.dic_file$class]] == "scale"
  dic_scores <- dic[.filter, ]
  dic <- dic[!.filter, ]

  var_not_df <- NULL


# apply dic information to variables --------------------------------------

  for (i in 1:nrow(dic)) {

    # look up column in data frame corresponding to item_name
    id <- which(names(data) == dic[[.dic_file$item_name]][i])
    if (length(id) > 1) {
      message("Multiple ids found")
      message(paste0(names(data)[id], collapse = ", "))
    }

    if (length(id) == 0) {
      var_not_df <- c(var_not_df, dic[[.dic_file$item_name]][i])
      next
    }


    # extract values
    values <- paste0("c(", as.character(dic[i, .dic_file$values]), ")") %>%
      parse(text = .) %>%
      eval()

    # extract value labels
    value_labels <- dic[i, .dic_file$value_labels] %>%
      as.character() %>%
      strsplit(";") %>%
      unlist() %>%
      strsplit("=")

    .n_labels <- length(value_labels)
    .df <- data.frame(
      value = character(.n_labels),
      label = character(.n_labels)
    )
    for(j in 1:.n_labels) {
      .df[j, 1] <- trimws(value_labels[[j]][1])
      .df[j, 2] <- trimws(value_labels[[j]][2])
    }
    if (.dic_file$type %in% c("integer", "numeric", "float", "double"))
      .df[["value"]] <- as.numeric(.df[["value"]])

    dic_attr(data[[id]], .opt$value_labels) <- .df

    for (x in value_labels) {
      value <- as.numeric(x[1])
      names(values)[which(values == value)] <- trimws(x[2])
    }

    dic_attr(data[[id]], .opt$values) <- values

    # extract missing
    dic_attr(data[[id]], .opt$missing) <-
      paste0("c(", as.character(dic[[.dic_file$missing]][i]), ")") %>%
      parse(text = .) %>%
      eval()

    # check variable type (class)
    # numeric:
    if (dic[i, .dic_file$type] %in% c("integer","numeric","float","double")) {
      if (!inherits(data[[id]], c("integer","numeric","double"))) {

        if (coerce_class) {
          message(
            "Class should be numeric but is ", class(data[[id]]),
            ". Coreced to numeric: ", names(data)[id]
          )
          class(data[[id]]) <- "numeric"
        } else {
          message(
           "Class should be numeric but is ", class(data[[id]]),
           ": ",names(data)[id]
          )
        }
      }
    }

    ### assign attributes
    filter <- !(names(.dic_file) %in% c(
      "values", "value_labels", "missing", "variables")
    )
    set <- names(.dic_file)[filter]

    for (j in set) {
      target <- .opt[[j]]
      source <- .dic_file[[j]]
      #value_dic <- as.character(dic[[source]][i])
      value_dic <- dic[[source]][i]
      dic_attr(data[[id]], target) <- value_dic
    }

    # set dic attributes for further variables
    new_dic_attributes <- names(dic)[!names(dic) %in% unlist(.dic_file)]

    for(j in new_dic_attributes) {
      dic_attr(data[[id]], j) <- dic[[j]][i]
    }


    ### set factors
    if (factors && dic_attr(data[[id]], .opt$type) == "factor") {
      values <- dic_attr(data[[id]], .opt$values)
      .factor <- factor(
        data[[id]],
        levels = values,
        labels = names(values)
      )
      .factor <- factor(
        data[[id]],
        levels = dic_attr(data[[id]], .opt$value_labels)$value,
        labels = dic_attr(data[[id]], .opt$value_labels)$label
      )
      attr(.factor, .opt$dic) <- attr(data[[id]], .opt$dic)
      data[[id]] <- .factor
    }

    #dic_attr(data[[id]], .opt$class) <- "item"
    class(data[[id]]) <- c("dic", class(data[[id]]))
  }

  if (!is.null(var_not_df)) {
    message(
      length(var_not_df), " of ", nrow(dic),
      " variables from dic not found in data file:"
    )
    message(paste0(var_not_df, collapse = ", "))
  }

  if (replace_missing) {
    data <- replace_missing(data)
    message("Replaced missing values.")
  }

  if (check_values) {
    message("Values checked.")
    vars <- names(data) %in% dic[[.opt$item_name]]
    data[, vars] <- check_values(data[, vars], replace = NA)
  }

  if (score_scales && nrow(dic_scores > 0)) {
    message("Values checked.")
    vars <- names(data) %in% dic[[.opt$item_name]]
    data[, vars] <- check_values(data[, vars], replace = NA)
    if (impute_values) message("Scales imputed.")
    message("Scales scored.")
    data <- score_from_dic(data, dic_scores, impute_values = impute_values)
  }

  if (set_label_attr) {
    message("`label` attribute set.")
    data <- dic_haven(data)
  }
  data
}

.clean_dic_file <- function(dic) {

  #rename dic names
  names(dic) <- tolower(names(dic))
  names(dic)[which(names(dic) %in% c("label", "name"))] <- "item_name"
  names(dic)[which(names(dic) %in% "item")] <- "item_label"
  names(dic)[which(names(dic) %in% "sub_scale_2")] <- "subscale_2"
  names(dic)[which(names(dic) %in% "sub_scale")] <- "subscale"
  names(dic)[which(names(dic) %in% "sub_scale_label")] <- "subscale_label"
  names(dic)[which(names(dic) %in% "sub_scale_2_label")] <- "subscale_2_label"

  # delete empty rows
  dic <- dic[apply(dic, 1, function(x) !all(is.na(x))),]

  # delete rows with comments (first sign of the first column is a #)
  .filter <- which(apply(dic, 1, function(x) substr(x[1], 1, 1) == "#"))
  if (length(.filter) > 0) dic <- dic[-.filter, ]

  # filter if variable "active" is available --------------------------------

  if ("active" %in% names(dic)) {
    dic <- dic[which(dic$active == 1), ]
  }

  #remove white spaces
  .column <- .dic_file$value_labels
  dic[[.column]] <- trimws(dic[[.column]])
  dic[[.column]][which(dic[[.column]] == "")] <- NA

  dic[[.dic_file$item_name]] <- trimws(dic[[.dic_file$item_name]])

  # check for missing class variable
  if (is.null(dic[[.dic_file$class]])) dic[[.dic_file$class]] <- "item"

  if (.dic_file$score_filter %in% names(dic)) {
    .filter <- which(!is.na(dic[[.dic_file$score_filter]]))
    dic[.filter, .dic_file$class] <- "scale"
  }

  filter_items <- dic[[.dic_file$class]] == "item"

  # check for missing weight variable
  if (is.null(dic[[.dic_file$weight]])) {
    message("'Weight' variable missing in the dictionary file. ",
            "Variable inserted with default of 1.")
    dic[[.dic_file$weight]] <- NA
    dic[[.dic_file$weight]][filter_items] <- 1
  }

  if(!inherits(class(dic[[.dic_file$weight]]), "numeric")) {
    message("'weight' variable is not numeric: transformed variable to numeric.")
    dic[[.dic_file$weight]] <- as.numeric(dic[[.dic_file$weight]])
  }

  # check for missing type variable
  if (is.null(dic[[.dic_file$type]])) {
    message("'type' variable missing in the dictionary file. ",
            "Variable inserted with default of 'integer'.")
    dic[[.dic_file$type]] <- "integer"
  }

  # type NA to integer
  miss_type <- which(is.na(dic[[.dic_file$type]]))
  if (length(miss_type) > 0) {
    message(
      length(miss_type), " missing types found and replaced with 'integer'."
    )
    dic[miss_type, .dic_file$type] <- "integer"
  }

  # checking other missing variables in dictionary file
  miss <- unlist(.dic_file)[which(!(unlist(.dic_file) %in% names(dic)))]

  .filter <- !miss %in% c(.dic_file$score_function, .dic_file$score_filter)
  miss <- miss[.filter]
  if (length(miss) > 0) {
    miss %>%
      paste(collapse = ", ")  %>%
      message(
        "The following variables were missing in the dictionary file: ", .
      )
    dic[, miss] <- NA
  }

  # weight NA to integer
  miss_weight <- which(is.na(dic[[.dic_file$weight]]) & filter_items)
  if (length(miss_weight) > 0) {
    message(length(miss_weight), " missing weights found and replaced with 1.")
    dic[miss_weight, .dic_file$weight] <- 1
  }

  # set report missing label
  missing_label <- NULL
  for (i in 1:nrow(dic)){
    if (is.na(dic[[.dic_file$item_label]][i])) missing_label <- c(missing_label, i)
  }
  if (!is.null(missing_label)) {
    message("Missing item_label found at variabel no. ", paste0(missing_label, collapse = ", "), " in dic file.")
  }

  # check for duplicated item_names in dic
  .duplicates <- duplicated(dic[[.dic_file$item_name]])
  if (any(.duplicates)) {
    id <- dic[[.dic_file$item_name]][which(.duplicates)]
    stop(paste0("Item names duplicated in dic-file: ", paste0(id, collapse = ", ")))
  }

  # Check for \r\n in value labels
  if (any(grepl("\r\n", "; ", x = dic[[.dic_file$value_labels]]))) {
    message("Found linebreaks in value_labels and replaced them with ';'")
    dic[[.dic_file$value_labels]] <- gsub("\r\n", "; ", x =  dic[[.dic_file$value_labels]])
  }


  dic
}
jazznbass/scaledic documentation built on Feb. 27, 2023, 7:51 p.m.