#' Proportion Correct for Sentences
#'
#' This function computes the proportion of correct sentence responses
#' per participant. Proportions can either be separated by
#' condition or collapsed across conditions. You will need to ensure
#' each trial is marked with a unique id to correspond to the answer
#' key.
#'
#' Note: other columns included in the dataframe will be found
#' in the final scored dataset. If these other columns are
#' between subjects data, they will also be included in the
#' participant dataset (i.e., there's a one to one match of
#' participant ID and column information).
#'
#' @param data a dataframe of the variables you would like to return.
#' Other variables will be included in the scored output and
#' in the participant output if they are a one to one match with
#' the participant id.
#' @param responses a column name in the dataframe that contains
#' the participant answers for each item in quotes (i.e., "column")
#' @param key a vector containing the scoring key or data column name.
#' This column does not have to be included in the original dataframe.
#' @param key.trial a vector containing the trial numbers for each answer.
#' Note: If you input long data (i.e., repeating trial-answer responses),
#' we will take the unique combination of the responses. If a trial number
#' is repeated, you will receive an error. Key and key.trial can also be
#' a separate dataframe, depending on how your output data is formatted.
#' @param id a column name containing participant ID numbers from
#' the original dataframe
#' @param id.trial a column name containing the trial numbers
#' for the participant data from the original dataframe
#' @param cutoff a numeric value that determines the criteria for
#' scoring (i.e., 0 = strictest, 5 = is most lenient). The scoring
#' criteria uses a Levenshtein distance measure to match participant
#' responses to the answer key.
#' @param flag a logical argument if you want to flag participant scores
#' that are outliers using z-scores away from the mean score for group
#' @param group.by an optional argument that can be used to group the
#' output by condition columns. These columns should be in the original
#' dataframe and concatenated c() if there are multiple columns
#' @param token.split an optional argument that can be used to delineate
#' how to separate tokens. The default is a space after punctuation and
#' additional spacing is removed.
#'
#' @return
#' \item{DF_Scored}{The dataframe of the original response, answer,
#' scoring, and any other or grouping variables. This dataframe can
#' be used to determine if the cutoff score and scoring matched your
#' answer key as intended. Distance measures are not perfect! Issues
#' and suggestions for improvement are welcome.}
#' \item{DF_Participant}{A dataframe of the proportion correct by
#' participant, which also includes optional z-scoring, grouping, and
#' other variables.}
#' \item{DF_Group}{A dataframe of the summary scores by any optional
#' grouping variables, along with overall total proportion correct
#' scoring.}
#'
#' @keywords proportion correct scoring recall sentences
#' @import stats
#' @import utils
#' @export
#' @examples
#'
#' #This data contains sentence recall test with responses and answers together.
#' #You can use a separate answer key, but this example will show you an
#' #embedded answer key. This example also shows how you can use different
#' #stimuli across participants (i.e., each person sees a randomly selected
#' #set of trials from a larger set).
#'
#' data(sentence_data)
#'
#' scored_output <- prop_correct_sentence(data = sentence_data,
#' responses = "Response",
#' key = "Sentence",
#' key.trial = "Trial.ID",
#' id = "Sub.ID",
#' id.trial = "Trial.ID",
#' cutoff = 1,
#' flag = TRUE,
#' group.by = "Condition",
#' token.split = " ")
#'
#' head(scored_output$DF_Scored)
#'
#' head(scored_output$DF_Participant)
#'
#' head(scored_output$DF_Group)
#'
prop_correct_sentence <- function(data, responses,
key, key.trial, id, id.trial,
cutoff = 0, flag = FALSE,
group.by = NULL,
token.split = " "){
#create data from inputs ----
#grab the input dataframe and convert to our names
DF <- as.data.frame(data)
colnames(DF)[grepl(responses, colnames(DF))] <- "Responses"
colnames(DF)[grepl(id, colnames(DF))] <- "Sub.ID"
colnames(DF)[grepl(id.trial, colnames(DF))] <- "Trial.ID"
#create the answer key
#if the length > 1, assume it's a vector to merge together
if (length(key) > 1){
answer_key <- data.frame("Answer" = key, "Trial.ID" = key.trial)
} else { #assume it is in the original dataframe
answer_key <- data.frame("Answer" = data[ , key],
"Trial.ID" = data[ , key.trial])
}
#find unique keys
answer_key <- unique(answer_key)
#make sure no trial IDs are repeated because then bork
dups <- duplicated(answer_key$Trial.ID)
if(sum(dups) > 0){
stop("You have duplicate trial ids for your answer key. Please check your data.")
}
#now merge key and data
DF <- merge(DF, answer_key, by = "Trial.ID")
#create the scored data ----
#lower case everything
DF$Answer <- tolower(DF$Answer)
DF$Responses <- tolower(DF$Responses)
#remove punctuation
DF$Answer <- gsub("[[:punct:]]", "", DF$Answer)
DF$Responses <- gsub("[[:punct:]]", "", DF$Responses)
#crunch white space
DF$Answer <- gsub("\\s+", " ", DF$Answer)
DF$Answer <- trimws(DF$Answer)
DF$Responses <- gsub("\\s+", " ", DF$Responses)
DF$Responses <- trimws(DF$Responses)
#create blank columns
DF$Proportion.Match <- NA
DF$Shared.Items <- NA
DF$Corrected.Items <- NA
DF$Omitted.Items <- NA
DF$Extra.Items <- NA
#dear lord a loop is the best idea
for (i in 1:nrow(DF)){
#cheap tokenization
answer.tokens <- unlist(strsplit(DF$Answer[i], split = token.split))
response.tokens <- unlist(strsplit(DF$Responses[i], split = token.split))
#figure out number of words in each
shared <- c()
omitted <- c()
extras <- c()
corrected <- c()
shared <- intersect(answer.tokens, response.tokens)
omitted <- setdiff(answer.tokens, response.tokens)
extras <- setdiff(response.tokens, answer.tokens)
omitted_final <- omitted
extras_final <- extras
#figure out if extra words are actually omitted words
#need both to check this
if (length(extras) > 0 & length(omitted) > 0){
for (j in 1:length(extras)){
lev_score <- adist(extras[j], omitted)
names(lev_score) <- omitted
#Find the minimum value for best match
#Figure out if the min score is within the cut off
if(min(lev_score) <= cutoff) {
corrected <- c(corrected, extras[j])
extras_final <- extras_final[!grepl(extras[j], extras_final)]
omitted_final <- omitted_final[!grepl(attr(which.min(lev_score), "names"), omitted_final)]
} #figure out min lev
} #going through extra words
} #only if extras is a thing
if(length(shared) > 0){ DF$Shared.Items[i] <- paste(shared, collapse = " ") } else {
DF$Shared.Items[i] <- NA
}
if(length(omitted_final) > 0){ DF$Omitted.Items[i] <- paste(omitted_final, collapse = " ") } else {
DF$Omitted.Items[i] <- NA
}
if(length(corrected) > 0){ DF$Corrected.Items[i] <- paste(corrected, collapse = " ") } else {
DF$Corrected.Items[i] <- NA
}
if(length(extras_final) > 0){ DF$Extra.Items[i] <- paste(extras_final, collapse = " ") } else {
DF$Extra.Items[i] <- NA
}
DF$Proportion.Match[i] <- (length(shared) + length(corrected)) / length(answer.tokens)
}
#create participant summary ----
k <- tapply(DF$Trial.ID, DF$Sub.ID, length)
if(min(k) != max(k)){
warning("The number of trials is not the same for every participant.
This summary represents an average of the avaliable trials
for each participant.")
}
#create participant data frame ----
if (!is.null(group.by)){
DF_participant <- aggregate(DF$Proportion.Match,
by = DF[ , c(group.by, "Sub.ID")],
mean)
colnames(DF_participant) <- c(group.by, "Sub.ID", "Proportion.Correct")
} else {
DF_participant <- aggregate(DF$Proportion.Match,
list(DF$Sub.ID), mean)
colnames(DF_participant) <- c("Sub.ID", "Proportion.Correct")
}
#add back in other columns that are one to one
other.columns <- setdiff(colnames(DF),
c("Responses", "Sub.ID", "Answer", "Scored",
colnames(DF_participant)))
for (col in other.columns){
DF_temp <- unique(DF[ , c("Sub.ID", col)])
if (sum(duplicated(DF_temp$Sub.ID)) == 0){
DF_participant <- merge(DF_participant, DF_temp, by = "Sub.ID")
}
}
#if they want to flag participants ----
if (flag) {
#flag by group
if (!is.null(group.by)){
DF_participant$Z.Score.Group <- ave(DF_participant$Proportion.Correct,
DF_participant[ , group.by],
FUN = scale)
}
DF_participant$Z.Score.Participant <- scale(DF_participant$Proportion.Correct)
}
#group summary ----
#if they want a grouping variable
if (!is.null(group.by)){
#summarize participant scores by group
DF_group_person <- aggregate(DF$Proportion.Match,
by = DF[ , c(group.by, "Sub.ID")],
mean)
colnames(DF_group_person) <- c(group.by,"Sub.ID", "Mean")
#why does aggregate do this
#if one variable by has to be a list
#if more than one, no list allowed
if (length(group.by) > 1){
DF_group <- aggregate(DF_group_person$Mean,
by = DF_group_person[ , group.by], mean)
DF_group$SD <- aggregate(DF_group_person$Mean,
by = DF_group_person[ , group.by], sd)$x
DF_group$N <- aggregate(DF_group_person$Mean,
by = DF_group_person[ , group.by], length)$x
} else {
DF_group <- aggregate(DF_group_person$Mean,
by = list(DF_group_person[ , group.by]), mean)
DF_group$SD <- aggregate(DF_group_person$Mean,
by = list(DF_group_person[ , group.by]), sd)$x
DF_group$N <- aggregate(DF_group_person$Mean,
by = list(DF_group_person[ , group.by]), length)$x
}
colnames(DF_group) <- c(group.by, "Mean", "SD", "N")
return(list(DF_Scored = DF,
DF_Participant = DF_participant,
DF_Group = DF_group))
} else {
return(list(DF_Scored = DF,
DF_Participant = DF_participant))
}
}
#' @rdname prop_correct_sentence
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.