```r
# Set root dir to the project directory to ensure that code is always run relative
# to the project directory, whether it is run using `knitr` or interactively
knitr::opts_knit$set(root.dir = rprojroot::find_root(rprojroot::has_file("DESCRIPTION")))

# Attach tidyverse package to enable access to the pipe (%>%)
require(tidyverse)
```
This notebook performs cleaning and quality assessment of an individual participant's data. It takes a participant's raw trial log file as input and produces as output an overview of which task performance criteria are met and unmet, a visualization of the indifference points estimated from the calibration task, and multiple CSV files written to disk containing cleaned data from different subsets of trials.
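The notebook is parameterized (it references `params$title`, `params$participant_id`, and `params$visualize`, which are presumably defined in the notebook's YAML header). As a hedged illustration only, the assumed parameter structure could be set up like this when running the code interactively; the values shown are hypothetical:

```r
# Hypothetical sketch of the `params` object this notebook assumes; the actual
# values (especially `title`) are defined elsewhere and are not reproduced here.
params <- list(
  title          = "Quality assessment",  # placeholder; notebook_name is derived from it below
  participant_id = 1,                     # used to locate the raw trial log file
  visualize      = TRUE                   # whether to plot the calibration indifference points
)
```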
Before reading the data, the following is specified:

The notebook name (`notebook_name`), which is used to save output to a notebook-specific directory:

```r
notebook_name <- stringr::str_to_lower(stringr::str_replace_all(params$title, " ", "_"))
notebook_name
```

The directories from which raw data are read and to which cleaned data are written:

```r
# Raw trial log files are read from here
raw_data_dir <- file.path("data", "raw")

# Cleaned data will be written here
preproc_data_dir <- file.path("data", "derivatives", notebook_name)
preproc_incl_data_dir <- file.path(preproc_data_dir, "included")
preproc_excl_performance_data_dir <- file.path(preproc_data_dir, "excluded_performance")
preproc_excl_software_error_data_dir <- file.path(preproc_data_dir, "excluded_software_error")

# Create directories that do not yet exist
cmdsddfeitc::check_dir(all_dirs = c(raw_data_dir,
                                    preproc_data_dir,
                                    preproc_incl_data_dir,
                                    preproc_excl_performance_data_dir,
                                    preproc_excl_software_error_data_dir))
```
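`cmdsddfeitc::check_dir()` is part of the project's companion package; based on the comment above, it is assumed here to simply create any directory in `all_dirs` that does not yet exist. A minimal base-R sketch under that assumption (the real function may do more):

```r
# Minimal sketch, assuming check_dir() only needs to create missing directories
check_dir_sketch <- function(all_dirs) {
  for (d in all_dirs) {
    if (!dir.exists(d)) {
      dir.create(d, recursive = TRUE)
    }
  }
  invisible(all_dirs)
}
```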
The trial log filename (`trial_log_filename`) and task version (`task_version`):

```r
# Determine which file should be read
trial_log_filename <- list.files(path = raw_data_dir,
                                 pattern = sprintf("trial_log.*%.3d.csv", params$participant_id[[1]]))

# Determine task version, because factor levels (e.g. framing, trial_type) are task-dependent
task_version <- ifelse(stringr::str_detect(trial_log_filename, "defer_speedup"), "defer_speedup",
                       ifelse(stringr::str_detect(trial_log_filename, "date_delay"), "date_delay",
                              NA))

trial_log_filename
task_version
```
The performance criteria (`crit`; see the preregistration document, https://osf.io/cmdp2/) that are used to benchmark performance:

```r
crit <- list()

## Trial numbers
crit$n_trial$practice = 10
crit$n_trial$calibration$defer_speedup = 70   # 7 delays * (7 standard + 3 control trials)
crit$n_trial$calibration$date_delay = 63      # 7 delays * (7 standard + 2 control trials)
crit$n_trial$experiment$defer_speedup = 276   # 3 blocks of 92 trials
crit$n_trial$experiment$date_delay = 184      # 2 blocks of 92 trials

## Minimum accuracy on control trials
crit$min_accuracy_control = 0.75

## Maximum proportion of fast responses (RT < 1500 ms)
crit$max_proportion_too_fast_rt = 0.25

## Maximum proportion of omission errors (no response recorded)
crit$max_proportion_omission_error = 0.25

## Additional criterion: maximum proportion of SS or LL choices, collapsed across
## framing conditions, delays, etc. (i.e. to detect extreme choice patterns)
crit$max_proportion_any_choice = 0.95

## Additional criterion: maximum number of standard trials on which m_s > m_l
## (this relates to a bug in the task code that affected participants with
## indifference points close to m_l at long delays; for the fix, see
## https://github.com/bramzandbelt/itch_time_framing_task/commit/9723ef2628a1f7cd87b029545ebba39df8cd7b40)
crit$max_n_standard_trials_ms_greater_than_ml = 0

crit
```

The criteria are:

- number of practice trials: `r crit$n_trial$practice`
- number of calibration trials: `r crit$n_trial$calibration[task_version]`
- number of experiment trials: `r crit$n_trial$experiment[task_version]`
- minimum accuracy on control trials: `r crit$min_accuracy_control`
- maximum proportion of fast responses (RT < 1500 ms): `r crit$max_proportion_too_fast_rt`
- maximum proportion of omission errors: `r crit$max_proportion_omission_error`
Having specified all relevant variables, the raw trial log is read:
```r
# Read (and print) the data, using task-dependent column specifications
(raw_data <- readr::read_csv(file = file.path(raw_data_dir, trial_log_filename),
                             col_types = cmdsddfeitc::get_col_types(stringr::str_c("trial_log_data_", task_version))))
```
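`cmdsddfeitc::get_col_types()` returns the task-dependent column specification; its exact contents are defined in the companion package and are not reproduced here. If it were unavailable, a hedged fallback is to let readr guess the column types and rely on the explicit coercions in the cleaning step below:

```r
# Fallback sketch, assuming cmdsddfeitc::get_col_types() is not available:
# let readr guess every column type instead of using the task-specific specification
raw_data_guessed <- readr::read_csv(file = file.path(raw_data_dir, trial_log_filename),
                                    col_types = readr::cols(.default = readr::col_guess()))
```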
Data cleaning encompasses the following steps, for the following reasons:

- recoding index variables ending in `_ix` (e.g. `block_ix`, `iter_ix`) from double to integer;
- adding the variable `stage_id`, to enable grouping and computing of quality assessment and summary statistics separately for the practice, calibration, and experiment stages;
- adding the variable `omission_error`, to ease quality assessment;
- renaming `framing` to `frame`, as the latter is used in the `itchmodel` package.

```r
all_trials_cleaned <- raw_data %>%
  # Code integers as integers
  dplyr::mutate_at(vars(dplyr::ends_with("ix")), as.integer) %>%
  # Add variable stage_id
  dplyr::mutate(stage_id = dplyr::case_when(stringr::str_detect(raw_data$block_id, "^p") ~ "practice",
                                            stringr::str_detect(raw_data$block_id, "^i") ~ "calibration",
                                            stringr::str_detect(raw_data$block_id, "^e") ~ "experiment")) %>%
  dplyr::mutate(stage_id = factor(stage_id,
                                  levels = c("practice", "calibration", "experiment"),
                                  ordered = TRUE)) %>%
  # Add variable omission_error
  dplyr::mutate(omission_error = is.infinite(rt)) %>%
  # Rename framing to frame
  dplyr::rename(frame = framing) %>%
  # Exclude repeated stages (e.g. due to failure to meet calibration criteria or
  # accidentally quitting the experiment, as in participant_id = 92)
  dplyr::group_by(stage_id) %>%
  dplyr::filter(session_ix == max(unique(session_ix)),
                iter_ix == max(unique(iter_ix))) %>%
  dplyr::ungroup()
```
```r
# First identify relevant columns for assessing the data from the calibration and experiment task
rel_cols <- c("subject_ix",     # participant index
              "stage_id",       # practice, calibration (IP procedure), experiment
              "block_id",       # (p)ractice, (i)p procedure, (e)xperiment
              "block_ix",       # block index
              "iter_ix",        # iteration index
              "frame",          # time frame used
              "m_s",            # amount of smaller-sooner option
              "m_s_cat",        # category: below, at, or above IP
              "m_l",            # amount of larger-later option
              "t_s",            # delay to the smaller-sooner option
              "t_l",            # delay to the larger-later option
              "trial_type",     # standard, catch, or instruction manipulation check
              "choice",         # smaller-sooner (SS) or larger-later (LL) option
              "response",       # upper (LL) or lower (SS) bound
              "rt",             # response time
              "trial_correct",  # whether or not the response was correct
              "too_fast",       # whether or not the response was too fast
              "omission_error"  # whether or not a response is lacking
              )

# Smaller-sooner amount category (m_s_cat, with levels below, at, and above IP) does not exist at the calibration stage
rel_cols_clbr <- rel_cols[!(rel_cols %in% c("m_s_cat", "response"))]

# The iteration index is only used in the calibration task (to count how often participants
# have to repeat the task in order to comply with performance criteria)
rel_cols_expt <- rel_cols[rel_cols %in% c("subject_ix", "frame", "m_s_cat", "m_s", "m_l",
                                          "t_s", "t_l", "choice", "response", "rt")]
```
Data selection involves taking the cleaned data (`all_trials_cleaned`) and defining three additional subsets, for the following purposes.

Standard trials from the calibration task (`clbr_trials_standard`) will be used to visualize the estimated indifference points and written to disk for further (exploratory) analysis:

```r
# Standard trials from the calibration task will be written to disk for further analysis
(clbr_trials_standard <- all_trials_cleaned %>%
   dplyr::filter(stage_id == "calibration") %>%
   # # Only the last block needed: performance did not meet criteria in earlier blocks
   # dplyr::filter(iter_ix == max(unique(iter_ix))) %>%
   dplyr::select(rel_cols_clbr) %>%
   dplyr::filter(trial_type %in% "standard"))
```
Standard trials from the experiment (`expt_trials_standard`) will be written to disk for the modeling analysis:

```r
# Standard trials from the experiment will be written to disk for further analysis
(expt_trials_standard <- all_trials_cleaned %>%
   dplyr::filter(stage_id == "experiment",
                 trial_type %in% "standard",
                 too_fast == FALSE,
                 omission_error == FALSE) %>%
   # # Only the last block, in case an error occurred (e.g. in participant ID 92)
   # dplyr::filter(iter_ix == max(unique(iter_ix))) %>%
   dplyr::mutate(response = ifelse(choice == "ll", "upper",
                            ifelse(choice == "ss", "lower", NA))) %>%
   dplyr::select(rel_cols_expt))
```
Control trials (`control_trials`) will be used to assess whether participants paid attention:

```r
# Control trials will be used for assessing whether participants paid attention
(control_trials <- all_trials_cleaned %>%
   dplyr::filter(!trial_type %in% "standard"))
```
Indifference points in the calibration task are defined as the smaller-sooner amount (`m_s`) on the last standard trial of each delay.
```r
(calibration_indifference_points <- clbr_trials_standard %>%
   # Keep the delay (t_l) and the smaller-sooner amount (m_s)
   dplyr::select(subject_ix, t_l, m_s) %>%
   # The last standard trial per delay approximates the indifference point
   dplyr::group_by(subject_ix, t_l) %>%
   dplyr::summarize(ip = dplyr::last(m_s)) %>%
   dplyr::mutate(relative_diff = c(NA, diff(ip)) / 43.52,
                 relative_diff_crit_met = (relative_diff < 0.2 | is.na(relative_diff)),
                 last_ip_within_bounds = c(rep(TRUE, 6),
                                           dplyr::last(ip) >= 0.17 & dplyr::last(ip) <= 43.52 - 0.17),
                 monotonic_decrease = relative_diff_crit_met & last_ip_within_bounds))
```
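To make the monotonicity check concrete, here is a small illustration with made-up indifference points (assuming 43.52 is the larger-later amount `m_l`, as suggested by the criteria above): an increase of more than 20% of that amount between successive delays would violate the criterion.

```r
# Hypothetical indifference points for the seven delays (made-up values)
ip_example <- c(40, 35, 30, 28, 20, 12, 5)

# Successive differences, expressed relative to the (assumed) larger-later amount of 43.52
relative_diff_example <- c(NA, diff(ip_example)) / 43.52

# TRUE where the decrease criterion is met; the first element has no predecessor, so it passes
relative_diff_example < 0.2 | is.na(relative_diff_example)
```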
We benchmark the following performance criteria:
- number of trials (`n_trial`) in each stage: to verify that all required data were collected (e.g. if a participant fails to meet the performance criteria in the calibration task three times, no further experimental data are collected);
- accuracy on control trials (`accuracy_control`) in the calibration and experiment stages: to verify that participants paid attention to the choice options and trial instructions;
- proportion of fast responses (`proportion_too_fast_rt`) in the calibration and experiment stages: to verify that participants considered each scenario carefully;
- proportion of omission errors (`proportion_omission_error`) in the calibration and experiment stages: to verify that participants responded within the 10,000 ms response deadline.

```r
# Benchmark trial numbers
trial_numbers <- all_trials_cleaned %>%
  dplyr::group_by(stage_id) %>%
  # dplyr::filter(iter_ix == max(unique(iter_ix))) %>%
  dplyr::summarize(criterion = "n_trial",
                   value = n()) %>%
  tidyr::complete(stage_id, fill = list(criterion = "n_trial", value = 0)) %>%
  dplyr::mutate(criterion_met = dplyr::case_when(
    (.$stage_id == "practice" & .$value == 10) ~ TRUE,
    (.$stage_id == "practice" & .$value != 10) ~ FALSE,
    (.$stage_id == "calibration" & .$value %% crit$n_trial$calibration[[task_version]] == 0) ~ TRUE,
    (.$stage_id == "calibration" & .$value %% crit$n_trial$calibration[[task_version]] != 0) ~ FALSE,
    (.$stage_id == "experiment" & .$value == crit$n_trial$experiment[[task_version]]) ~ TRUE,
    (.$stage_id == "experiment" & .$value != crit$n_trial$experiment[[task_version]]) ~ FALSE
  ))

# Benchmark control trial accuracy
accuracy_control_trials <- control_trials %>%
  dplyr::group_by(stage_id) %>%
  dplyr::summarize(criterion = "accuracy_control",
                   value = sum(trial_correct) / n()) %>%
  dplyr::mutate(criterion_met = ifelse(value >= crit$min_accuracy_control, TRUE, FALSE))

# Benchmark proportion of fast responses
proportion_too_fast_rt <- all_trials_cleaned %>%
  dplyr::filter(stage_id %in% c("calibration", "experiment"),
                trial_type == "standard") %>%
  dplyr::group_by(stage_id) %>%
  dplyr::summarize(n = n(),
                   n_too_fast = sum(too_fast == TRUE),
                   criterion = "proportion_too_fast",
                   value = n_too_fast / n) %>%
  dplyr::select(-n, -n_too_fast) %>%
  dplyr::mutate(criterion_met = ifelse(value <= crit$max_proportion_too_fast_rt, TRUE, FALSE))

# Benchmark proportion of omission errors
proportion_omission_error <- all_trials_cleaned %>%
  dplyr::filter(stage_id %in% c("calibration", "experiment"),
                trial_type == "standard") %>%
  dplyr::group_by(stage_id) %>%
  dplyr::summarize(n = n(),
                   n_omission_error = sum(omission_error == TRUE),
                   criterion = "proportion_omission_error",
                   value = n_omission_error / n) %>%
  dplyr::select(-n, -n_omission_error) %>%
  dplyr::mutate(criterion_met = ifelse(value <= crit$max_proportion_omission_error, TRUE, FALSE))

# Benchmark proportion of SS and LL choices
proportion_choice_options <- expt_trials_standard %>%
  dplyr::count(response) %>%
  dplyr::mutate(value = n / sum(n)) %>%
  dplyr::arrange(-value) %>%
  dplyr::slice(1) %>%
  dplyr::mutate(stage_id = factor("experiment",
                                  levels = c("practice", "calibration", "experiment"),
                                  ordered = TRUE),
                criterion = "max_proportion_any_choice",
                criterion_met = ifelse(value < crit$max_proportion_any_choice, TRUE, FALSE)) %>%
  dplyr::select(stage_id, criterion, value, criterion_met)

# Benchmark number of standard trials on which m_s > m_l
n_standard_trials_ms_greater_than_ml <- expt_trials_standard %>%
  dplyr::filter(m_s > m_l) %>%
  dplyr::count() %>%
  dplyr::rename(value = n) %>%
  dplyr::mutate(stage_id = factor("experiment",
                                  levels = c("practice", "calibration", "experiment"),
                                  ordered = TRUE),
                criterion = "max_n_standard_trials_ms_greater_than_ml",
                criterion_met = ifelse(value <= crit$max_n_standard_trials_ms_greater_than_ml, TRUE, FALSE))

# Show all benchmarks
(all_benchmarks <- dplyr::bind_rows(trial_numbers,
                                    accuracy_control_trials,
                                    proportion_too_fast_rt,
                                    proportion_omission_error,
                                    proportion_choice_options,
                                    n_standard_trials_ms_greater_than_ml))

# Determine whether all criteria were met
all_criterion_met <- all(all_benchmarks$criterion_met)

if (all_criterion_met) {
  print("All criteria were met: participant INCLUDED for further analysis")
} else {
  print("Not all criteria were met: participant EXCLUDED from further analysis")
}
```
We visualize the indifference points estimated in the calibration task:
```r
if (params$visualize) {

  title_str <- ifelse(all(calibration_indifference_points$monotonic_decrease),
                      "Indifference points decrease monotonically with delay",
                      expression(paste("Indifference points ", bold("do not"), " decrease monotonically with delay")))

  if (all(calibration_indifference_points$monotonic_decrease)) {
    color_vals <- c("black", "red")  # Two values must be provided, but only the first will be used
  } else {
    color_vals <- c("red", "black")
  }

  ggplot2::ggplot(calibration_indifference_points,
                  ggplot2::aes(x = t_l, y = ip, color = monotonic_decrease)) +
    # Geoms
    ggplot2::geom_point(shape = 19, size = 3) +
    ggplot2::geom_hline(yintercept = 43.52, color = "gray50", linetype = "solid", size = 0.5, alpha = 0.5) +
    ggplot2::geom_hline(yintercept = 0, color = "gray50", linetype = "solid", size = 0.5, alpha = 0.5) +
    # Scales
    ggplot2::scale_color_manual(name = "Monotonic decrease", values = color_vals) +
    ggplot2::scale_x_continuous(name = "Delay (days)",
                                breaks = c(2, 4, 8, 16, 32, 64, 128),
                                labels = c(2, 4, 8, 16, 32, 64, 128)) +
    ggplot2::scale_y_continuous(name = "Indifference point (€)", limits = c(0, 43.52)) +
    # Title
    ggplot2::ggtitle(title_str) +
    # Theme
    ggplot2::theme_minimal() +
    ggplot2::theme(panel.grid.minor = ggplot2::element_blank())
}
```
The directory in which the cleaned data will be written depends on whether the dataset was compromised because of a coding error in the stimulus presentation software (`preproc_excl_software_error_data_dir`) and, if not, whether all performance criteria were met (`preproc_incl_data_dir`) or not (`preproc_excl_performance_data_dir`).
```r
# Define output directory based on the software-error criterion and, if that is met,
# on whether all performance criteria were met
data_output_dir <- ifelse(!n_standard_trials_ms_greater_than_ml$criterion_met,
                          preproc_excl_software_error_data_dir,
                          ifelse(all_criterion_met,
                                 preproc_incl_data_dir,
                                 preproc_excl_performance_data_dir))
```
We write 4 files to disk:
```r
# Calibration task: used for exploratory data analysis
clbr_trials_standard_file <- file.path(data_output_dir,
                                       sprintf("calibration_standard_trials_task-%s_pid-%.3d.csv",
                                               task_version,
                                               params$participant_id))
readr::write_csv(clbr_trials_standard, path = clbr_trials_standard_file)
print(sprintf("%s", clbr_trials_standard_file))
```
```r
# Standard trials from the experiment: used for modeling
expt_trials_standard_file <- file.path(data_output_dir,
                                       sprintf("experiment_standard_trials_task-%s_pid-%.3d.csv",
                                               task_version,
                                               params$participant_id))
readr::write_csv(expt_trials_standard, path = expt_trials_standard_file)
print(sprintf("%s", expt_trials_standard_file))
```
```r
# Indifference points estimated in the calibration task
calibration_indifference_points_file <- file.path(data_output_dir,
                                                  sprintf("calibration_indifference_points_pid-%.3d.csv",
                                                          params$participant_id))
readr::write_csv(calibration_indifference_points %>%
                   dplyr::select(subject_ix, t_l, ip),
                 path = calibration_indifference_points_file)
print(sprintf("%s", calibration_indifference_points_file))
```
```r
# Benchmark results: which performance criteria were met
benchmark_results_file <- file.path(data_output_dir,
                                    sprintf("benchmark_results_task-%s_pid-%.3d.csv",
                                            task_version,
                                            params$participant_id))
readr::write_csv(all_benchmarks %>%
                   dplyr::mutate(participant_id = params$participant_id) %>%
                   dplyr::select(participant_id, dplyr::everything()),
                 path = benchmark_results_file)
print(sprintf("%s", benchmark_results_file))
```