R/crp.R
In lrd: A Package for Processing Lexical Response Data

Documented in crp

#' Conditional Response Probability
#'
#' This function calculates the conditional response
#' probability of each lag position. Participants' lag
#' between subsequent named items is tallied and then
#' divided by the possible combination of subsequent lags
#' given their response pattern.
#'
#' This output can then be used to create a CRP visualizations,
#' and an example can be found in our manuscript/vignettes.
#'
#' Important: The code is written assuming the data provided are for
#' a single recall list. If repeated measures are used (i.e., there are
#' multiple lists completed by each participant or multiple list versions),
#' you should use this function several times, once on each list/answer key.
#'
#' @param data a dataframe of the scored free recall that you would
#' like to calculate - use prop_correct_free() for best formatting.
#' @param position a column name in the dataframe that contains
#' answered position of each response in quotes (i.e., "column")
#' @param answer a column name of the answer given for that position
#' in the original dataframe.
#' @param id a column name of the participant id in the original
#' dataframe.
#' @param key a vector containing the scoring key or data column name.
#' This column does not have to be included in the original dataframe.
#' We assume your answer key is in the tested position order. You should
#' not include duplicates in your answer key.
#' @param scored a column in the original dataframe indicating if the
#' participant got the answer correct (1) or incorrect (0).
#'
#' @return
#' \item{DF_CRP}{A dataframe of the proportion correct for each
#' conditional lag position including any other between subjects
#' variables present in the data.}
#'
#' @keywords proportion correct scoring free recall serial position
#' @export
#' @examples
#'
#' data(free_data)
#' data(answer_key_free2)
#'
#' free_data <- subset(free_data,
#'  List_Type == "Cat_Recall_L1")
#'
#' DF_long <- arrange_data(data = free_data,
#'  responses = "Response",
#'  sep = " ",
#'  id = "Username")
#'
#' scored_output <- prop_correct_free( data = DF_long,
#'  responses = "response",
#'  key = answer_key_free2$Answer_Key,
#'  id = "Sub.ID",
#'  cutoff = 1,
#'  flag = TRUE,
#'  group.by = "Version")
#'
#' crp_output <- crp(data = scored_output$DF_Scored,
#'  position = "position",
#'  answer = "Answer",
#'  id = "Sub.ID",
#'  key = answer_key_free2$Answer_Key,
#'  scored = "Scored")
#'
#'  head(crp_output)
#'
crp <- function(data, position, answer, id,
                key, scored){

  # for cran check
  Sub.ID <- NULL

  #create answer key with order
  if (sum(duplicated(key)) > 0){
    stop("Your answer key contains duplicates. Please check your data.")
  }
  key <- data.frame("Answer" = key, "Tested.Position" = 1:length(key))

  #merge that with the data
  DF <- as.data.frame(data)
  colnames(DF)[grepl(answer, colnames(DF))] <- "Answer"
  colnames(DF)[grepl(position, colnames(DF))] <- "Answered.Position"
  colnames(DF)[grepl(scored, colnames(DF))] <- "Scored"
  colnames(DF)[grepl(id, colnames(DF))] <- "Sub.ID"
  DF <- merge(DF, key, by = "Answer")

  #calculate the number of times within window
  DF$Answered.Position <- as.numeric(DF$Answered.Position)
  DF$Tested.Position <- as.numeric(DF$Tested.Position)
  DF$Lag <- DF$Tested.Position - DF$Answered.Position

  number_spots <- 1:max(DF$Tested.Position)
  DF_final <- NULL

  # Get all possible lags for 0 options
  all_lags <- sort(c((1:(nrow(key) - 1)) * -1, 1:(nrow(key) - 1)))

  # Create a dataframe of all possible options and ids
  merge_lags <- data.frame(Sub.ID = rep(unique(DF$Sub.ID), each = length(all_lags)),
                           participant_lags = rep(all_lags, length(unique(DF$Sub.ID))))

  #for each participant calculate the possible lags
  for (i in unique(DF$Sub.ID)){

    temp_part <- subset(DF,
                        Sub.ID == i)
    temp_part <- temp_part[order(temp_part$Answered.Position), ]

    participant_lags <- diff(temp_part$Tested.Position)
    possible_lags <- c()

    if (nrow(temp_part) > 1){
      #participant lags
      for (j in 1:nrow(temp_part)){

        #take up to the current answered position
        current_used <- temp_part$Tested.Position[1:j]
        answers_left <- setdiff(number_spots, current_used)
        current_spot <- temp_part$Tested.Position[j]

        possible_lags <- c(possible_lags, answers_left - current_spot)


      } #answers loop

      table_part_lags <- as.data.frame(table(participant_lags))
      table_possible_lags <- as.data.frame(table(possible_lags))
      colnames(table_possible_lags) <- c("participant_lags", "Possible.Freq")
      table_part_lags <- merge(table_part_lags,
                               table_possible_lags,
                               by = "participant_lags")

      table_part_lags$Sub.ID <- i

      if (is.null(DF_final)){
        DF_final <- table_part_lags
      } else {
        DF_final <- rbind(DF_final, table_part_lags)
      }

    } #close nrow check

  } #participant loop

  # create other columns in merge_lags
  other.columns <- setdiff(colnames(DF),
                           c("Responses", "Sub.ID", "Answer",
                             "Scored", "Answered.Position",
                             "Tested.Position", "Lag",
                             colnames(merge_lags)))
  for (col in other.columns){
    DF_temp <- unique(DF[ , c("Sub.ID", col)])
    if (sum(duplicated(DF_temp$Sub.ID)) == 0){
      merge_lags <- merge(merge_lags, DF_temp, by = "Sub.ID")
    }
  }

  # Merge all possible lags with real lags
  DF_final$participant_lags <- as.numeric(as.character(DF_final$participant_lags))
  DF_final <- merge(DF_final, merge_lags, by = c("Sub.ID", "participant_lags"), all = T)

  # Create CRP
  DF_final$CRP <- DF_final$Freq / DF_final$Possible.Freq

  # Add zeroes back in
  DF_final$Freq[is.na(DF_final$Freq)] <- 0
  DF_final$Possible.Freq[is.na(DF_final$Possible.Freq)] <- 0
  DF_final$CRP[is.na(DF_final$CRP)] <- 0

  return(DF_final)

}

#' @rdname crp