R/preprocess.R

Defines functions .preprocess_hand .preprocess_asha .preprocess_navs .preprocess_nnberr .preprocess_navipre .preprocess_navi .preprocess_wab .preprocess

Documented in .preprocess

#' Preprocess data
#'
#' Our field names in Redcap have a lot of information in them, but some of them
#' don't need to be used by the researcher at all. This removes some redundant
#' information that's common to all instruments and also calls additional
#' instrument-specific functions as needed. For example, checkbox matrices
#' in redcap append ___X to the field name, but what X means will vary from
#' instrument to instrument.
#'
#' @param inst_data A single instrument's data
#' @param inst_prefix The prefix for this instrument
#' @param inst_name The name for this instrument
#'
#' @return A smaller dataframe with some columns renamed depending on the
#' instrument provided
#'
.preprocess <- function(inst_data, inst_prefix, inst_name) {
  # Compose a function call of the form .preprocess_[prefix]()
  fx_name <- paste0(".preprocess_", inst_prefix)

  if (!"_summary" %in% inst_name) {
    inline_sums <- "_subtotals|_totals|_score|_scores|_analysis"
    inst_data <- inst_data[grep(inline_sums, colnames(inst_data), invert = T)]
  }
  unneeded_cols <- "(^consent)|(_complete$)|_info_|_override"
  inst_data <- inst_data[grep(unneeded_cols, colnames(inst_data), invert = T)]

  if (methods::existsFunction(fx_name)) {
    do.call(fx_name, args = list(inst_data))
  } else {
    return(inst_data)
  }
}

# .preprocess raw WAB data, rename checkbox columns and remove scores
.preprocess_wab <- function(inst_data) {
  # wab_obn_x_cues___1/2/3 should be Tactile, phonemic, Semantic
  # wab_ynq_x_acc___1/2/3/4/5 should be acc, verbal, gestural, eye blink, NR
  new_cols <- colnames(inst_data)

  rename_checkboxes <- function(column) {
    is_ynq_item <- grepl("wab_ynq_\\d+", column)
    is_obncue <- grepl("wab_obn_\\d+_cues", column)
    col_item <- stringr::str_extract(column, "wab_..._\\d+")
    last_num <- stringr::str_extract(column, ".$")
    if (is_ynq_item) {
      dplyr::case_when(
        last_num == "1" ~ paste0(col_item, "_acc"),
        last_num == "2" ~ paste0(col_item, "_verbal"),
        last_num == "3" ~ paste0(col_item, "_gestural"),
        last_num == "4" ~ paste0(col_item, "_eyeblink"),
        last_num == "5" ~ paste0(col_item, "_noresponse"),
      )
    } else if (is_obncue) {
      dplyr::case_when(
        last_num == "1" ~ paste0(col_item, "_tactile"),
        last_num == "2" ~ paste0(col_item, "_phonemic"),
        last_num == "3" ~ paste0(col_item, "_semantic")
      )
    } else {
      column
    }
  }
  # Rename checkbox columns
  colnames(inst_data) <- vapply(new_cols, rename_checkboxes, "char")

  # Omit wf_resp and any scores (they're entered by the experimenter but
  # really only used in the wab summary scores instrument)
  item_cols <- grep("wf_resp", colnames(inst_data), invert = T)
  inst_data[item_cols]
}

# .preprocess raw NAVI data, rename checkbox columns
.preprocess_navi <- function(inst_data) {
  # navi_main_x_err___1/2/3 should be substitution, ommission, other
  new_cols <- colnames(inst_data)
  # Convert Inf (numeric) to 'Inf' (character) due to a mistaken readr guess
  inst_data <-
    dplyr::mutate(
      inst_data,
      dplyr::across(
        tidyselect::contains("_type") & where(is.numeric),
        as.character
      )
    )

  rename_checkboxes <- function(column) {
    if (grepl("_err", column)) {
      col_item <- stringr::str_extract(column, "navi_main_\\d+")
      last_num <- stringr::str_extract(column, ".$")
      dplyr::case_when(
        last_num == "1" ~ paste0(col_item, "_substitution"),
        last_num == "2" ~ paste0(col_item, "_ommission"),
        last_num == "3" ~ paste0(col_item, "_other")
      )
    } else {
      column
    }
  }
  # Rename checkbox columns
  colnames(inst_data) <- vapply(new_cols, rename_checkboxes, "char")
  inst_data
}

.preprocess_navipre <- function(inst_data) {
  # navipre_npcomp_x_acc___1/2 should be match, read
  new_cols <- colnames(inst_data)

  rename_checkboxes <- function(column) {
    if (grepl("___", column)) {
      col_item <- stringr::str_extract(column, "navipre_[[:alnum:]]+_\\d+")
      last_num <- stringr::str_extract(column, "\\d+$")
      dplyr::case_when(
        last_num == "1" ~ paste0(col_item, "_match"),
        last_num == "2" ~ paste0(col_item, "_read")
      )
    } else {
      column
    }
  }
  # Rename checkbox columns
  colnames(inst_data) <- vapply(new_cols, rename_checkboxes, "char")
  inst_data
}

# .preprocess nnb error analysis since it's entirely a checkbox matrix
# note that the type of the word will end up in the 'subtest' column
# this may change later if i figure out a good way to maintain a list
# of field regexes and output columns, but it's okay for now
.preprocess_nnberr <- function(inst_data) {
  # nnberr_s3_x_type___1/2/3/4/5/6/7/8/9/10/11/12 should be:
  # SR, SU, Pn, Pr, N, WC, ESR, Paragrammatic, EM, Pe, Un, DK
  new_cols <- colnames(inst_data)

  rename_checkboxes <- function(column) {
    if (grepl("___", column)) {
      col_num <- stringr::str_extract(column, "_(\\d+)")
      col_type <- stringr::str_extract(column, "_(noun|verb)")
      col_item <- paste0("nnberr", col_type, col_num)
      last_num <- stringr::str_extract(column, "\\d+$")
      dplyr::case_when(
        last_num == "1" ~ paste0(col_item, "_SR"),
        last_num == "2" ~ paste0(col_item, "_SU"),
        last_num == "3" ~ paste0(col_item, "_Pn"),
        last_num == "4" ~ paste0(col_item, "_Pr"),
        last_num == "5" ~ paste0(col_item, "_N"),
        last_num == "6" ~ paste0(col_item, "_WC"),
        last_num == "7" ~ paste0(col_item, "_ESR"),
        last_num == "8" ~ paste0(col_item, "_Paragrammatic"),
        last_num == "9" ~ paste0(col_item, "_EM"),
        last_num == "10" ~ paste0(col_item, "_Pe"),
        last_num == "11" ~ paste0(col_item, "_Un"),
        last_num == "12" ~ paste0(col_item, "_DK")
      )
    } else {
      column
    }
  }
  # Rename checkbox columns
  colnames(inst_data) <- vapply(new_cols, rename_checkboxes, "char")
  inst_data
}

.preprocess_navs <- function(inst_data) {
  # wab_obn_x_cues___1/2/3 should be Tactile, phonemic, Semantic
  # wab_ynq_x_acc___1/2/3/4/5 should be acc, verbal, gestural, eye blink, NR
  new_cols <- colnames(inst_data)

  rename_checkboxes <- function(column) {
    if (grepl("___", column)) {
      last_num <- stringr::str_extract(column, ".$")
      if (grepl("args_", column)) {
        dplyr::case_when(
          last_num == "1" ~ gsub("args___.$", "x", column),
          last_num == "2" ~ gsub("args___.$", "V", column),
          last_num == "3" ~ gsub("args___.$", "y", column),
          last_num == "4" ~ gsub("args___.$", "z", column),
        )
      } else {
        dplyr::case_when(
          last_num == "1" ~ gsub("aw___.$", "A", column),
          last_num == "2" ~ gsub("aw___.$", "W", column)
        )
      }
    }
    else {
      column
    }
  }

  # Rename checkbox columns
  colnames(inst_data) <- vapply(new_cols, rename_checkboxes, "char")

  # Remove dropdown menu responses (they're exported as the integer values and not the character labels)
  dplyr::select(inst_data, -tidyselect::matches("vct_\\d+_resp"))
}

.preprocess_asha <- function(inst_data) {
  # Procesing the checkbox at the end of the form
  inst_data[["asha_sources_1_choice"]] <- dplyr::case_when(
    inst_data[["asha_sources_1_choice___1"]] == 1 & inst_data[["asha_sources_1_choice___2"]] == 1 ~ 3, # Both
    inst_data[["asha_sources_1_choice___1"]] == 1 ~ 1, # Family only
    inst_data[["asha_sources_1_choice___2"]] == 1 ~ 2, # Staff only
    TRUE ~ 0 # Neither
  )

  inst_data <- dplyr::mutate(inst_data, dplyr::across(tidyselect::ends_with("_choice"), as.numeric))

  # inst_data[["asha_sources_1_choice___1"]] <- ifelse(inst_data[["asha_sources_1_choice___1"]] == 1, "Family", NA)
  # inst_data[["asha_sources_1_choice___2"]] <- ifelse(inst_data[["asha_sources_1_choice___2"]] == 1, "Staff", NA)
  # inst_data[["asha_sources_1_choice"]] <- paste0(inst_data[["asha_sources_1_choice___1"]], inst_data[["asha_sources_1_choice___2"]])
  # inst_data[["asha_sources_1_choice"]] <- ifelse(inst_data[["asha_sources_1_choice"]] == "FamilyStaff", "Both", inst_data[["asha_sources_1_choice"]])
  inst_data[["asha_sources_1_choice___1"]] <- NULL
  inst_data[["asha_sources_1_choice___2"]] <- NULL

  inst_data
}

.preprocess_hand <- function(inst_data) {
  # Processing the checkbox matrix at the end of the form
  # honestly i dont know why people would want this itemized..

  names(inst_data) <- gsub("___","",names(inst_data))

  inst_data
}
tsostarics/anrlab documentation built on Nov. 6, 2022, 3:54 a.m.