# Set the root dir to the project directory so that code always runs relative to the project directory, whether it is executed via `knitr` or interactively.
knitr::opts_knit$set(root.dir = rprojroot::find_root(rprojroot::has_file("DESCRIPTION")))

# Attach the tidyverse package to enable access to the pipe (%>%)
require(tidyverse)

Overview

This notebook performs cleaning and quality assessment of an individual participant's data. It takes as input a participant's raw trial log file and produces as output an overview of the task performance criteria that are met and unmet, a visualization of the indifference points estimated from the calibration task, and multiple csv files written to disk containing cleaned data from different subsets of trials.
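
The code chunks below reference params$title, params$participant_id, and params$visualize, which are normally supplied via the R Markdown parameter mechanism. When running the chunks interactively rather than by knitting, a minimal placeholder list along these lines (all values hypothetical) makes them executable:

# Hypothetical params list for interactive use only; when knitting, these values
# are supplied through the notebook's YAML header instead
params <- list(title = "Quality assessment",
               participant_id = 1,
               visualize = TRUE)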

Preliminaries

Before reading the data, the following is specified:

notebook_name <- 
   stringr::str_to_lower(stringr::str_replace_all(params$title, " ", "_"))
# Raw trial log files are read from here
raw_data_dir <- 
  file.path("data","raw")

# Cleaned data will be written here
preproc_data_dir <- 
  file.path("data","derivatives", notebook_name)
preproc_incl_data_dir <- 
  file.path(preproc_data_dir, "included")
preproc_excl_performance_data_dir <- 
  file.path(preproc_data_dir, "excluded_performance")
preproc_excl_software_error_data_dir <- 
  file.path(preproc_data_dir, "excluded_software_error")

# Create the output directories if they do not yet exist
cmdsddfeitc::check_dir(all_dirs = c(raw_data_dir, 
                                    preproc_data_dir, 
                                    preproc_incl_data_dir, 
                                    preproc_excl_performance_data_dir,
                                    preproc_excl_software_error_data_dir)
                       )
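
# Note: cmdsddfeitc::check_dir is assumed here to create any directories that
# are missing. Should that helper be unavailable, a base-R sketch with the same
# intent (not necessarily the package's actual implementation) would be:
# for (d in c(raw_data_dir, preproc_data_dir, preproc_incl_data_dir,
#             preproc_excl_performance_data_dir,
#             preproc_excl_software_error_data_dir)) {
#   if (!dir.exists(d)) dir.create(d, recursive = TRUE)
# }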
# Determine which file should be read
trial_log_filename <- list.files(path = raw_data_dir, 
                                 pattern = sprintf("trial_log.*%.3d.csv", params$participant_id[[1]])
                                 )
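# Note: the pattern zero-pads the participant ID to three digits; for a
# hypothetical participant_id of 7, sprintf() yields "trial_log.*007.csv",
# which would match e.g. a file named "trial_log_defer_speedup_007.csv".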

# Determine task version, because factor levels (e.g. framing, trial_type) are task-dependent
task_version <- ifelse(stringr::str_detect(trial_log_filename, "defer_speedup"),
                    "defer_speedup",
                    ifelse(stringr::str_detect(trial_log_filename, "date_delay"),
                           "date_delay",
                           NA
                           )
                    )
crit <- list()

## Trial numbers
crit$n_trial$practice = 10
crit$n_trial$calibration$defer_speedup = 70 # 7 delays * (7 standard + 3 control trials)
crit$n_trial$calibration$date_delay = 63 # 7 delays * (7 standard + 2 control trials)
crit$n_trial$experiment$defer_speedup = 276 # 3 blocks of 92 trials
crit$n_trial$experiment$date_delay = 184 # 2 blocks of 92 trials

## Minimum accuracy on control trials
crit$min_accuracy_control = 0.75

## Maximum proportion of fast responses (RT < 1500 ms)
crit$max_proportion_too_fast_rt = 0.25

## Maximum proportion of omission errors (no response recorded)
crit$max_proportion_omission_error = 0.25

## Additional criterion: maximum proportion of SS or LL choice option, collapsed across framing conditions, delays, etc. (i.e. to detect extreme choice patterns)
crit$max_proportion_any_choice = 0.95

## Additional criterion: maximum number of standard trials on which m_s > m_l (i.e. this relates to a bug in the task code that affected participants with indifference points close to m_l at long delays, for fix see https://github.com/bramzandbelt/itch_time_framing_task/commit/9723ef2628a1f7cd87b029545ebba39df8cd7b40)
crit$max_n_standard_trials_ms_greater_than_ml = 0
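
# These criteria are looked up by task version further down, e.g. via
# crit$n_trial$calibration[[task_version]]; with task_version "defer_speedup"
# this evaluates to 70, and with "date_delay" to 63.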

Read data

Having specified all relevant variables, the raw trial log is read:

# Read (and print) the data, using task-dependent column specifications
(raw_data <-
    readr::read_csv(file = file.path(raw_data_dir, trial_log_filename),
                    col_types = cmdsddfeitc::get_col_types(stringr::str_c("trial_log_data_", task_version)))
  )
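
Note that cmdsddfeitc::get_col_types supplies the task-dependent column specification. Purely as a fallback sketch for interactive exploration when that helper is unavailable, one could let readr guess the column types instead (the explicit specification above remains the one to use):

# Fallback sketch only: let readr guess all column types
raw_data_guessed <-
  readr::read_csv(file = file.path(raw_data_dir, trial_log_filename),
                  col_types = readr::cols(.default = readr::col_guess()))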

Preprocess data

Data cleaning

Data cleaning comprises the following steps, each for the reason indicated:

  1. correct the erroneous coding of variables whose names end in _ix (e.g. block_ix, iter_ix) from double to integer;
  2. add a factor variable stage_id to enable grouping and computing quality assessment and summary statistics separately for the practice, calibration, and experiment stages;
  3. add a logical variable omission_error to ease quality assessment;
  4. rename the variable framing to frame, as the latter name is used in the itchmodel package.
all_trials_cleaned <- 
  raw_data %>%
  # Code integers as integers
  dplyr::mutate_at(vars(dplyr::ends_with("ix")), 
                  as.integer) %>%
  # Add variable stage_id
  dplyr::mutate(stage_id =
                  dplyr::case_when(stringr::str_detect(block_id, "^p")
                                   ~ "practice",
                                   stringr::str_detect(block_id, "^i")
                                   ~ "calibration",
                                   stringr::str_detect(block_id, "^e")
                                   ~ "experiment"
                                   )
                ) %>%
  dplyr::mutate(stage_id = factor(stage_id, 
                                  levels = c("practice", 
                                             "calibration",
                                             "experiment"), 
                                  ordered = TRUE)) %>%
  # Add variable omission_error
  dplyr::mutate(omission_error = is.infinite(rt)) %>%
  dplyr::rename(frame = framing) %>%
  # Exclude repeated stages (e.g. due to failure to meet calibration criteria or accidentally quitting the experiment, as happened for participant_id = 92)
  dplyr::group_by(stage_id) %>% 
  dplyr::filter(session_ix == max(unique(session_ix)), 
                iter_ix == max(unique(iter_ix))) %>% 
  dplyr::ungroup()
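
As a quick optional check of the cleaning step, the number of trials per stage can be tallied and compared against the crit$n_trial values defined above:

# Optional sanity check: trial counts per stage after cleaning
all_trials_cleaned %>%
  dplyr::count(stage_id)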

Data selection

# First identify relevant columns for assessing the data from the calibration and experiment task
rel_cols <- c("subject_ix", # participant index
              "stage_id", # practice, calibration (ip procedure), experiment
              "block_id", # (p)ractice, (i)p procedure, (e)xperiment
              "block_ix", # block index
              "iter_ix",  # iteration index
              "frame",    # time frame used
              "m_s",      # amount of smaller-sooner option
              "m_s_cat",  # category: below, at, or above IP
              "m_l",      # amount of larger-later option
              "t_s",      # delay to the smaller-sooner option
              "t_l",      # delay to the larger-later option
              "trial_type", # standard, catch, or instruction manipulation check
              "choice",  # smaller-sooner (SS) or larger-later (LL) option
              "response", # upper (LL) or lower (SS) bound
              "rt",      # response time
              "trial_correct", # whether or not response was correct
              "too_fast",  # whether or not response was fast
              "omission_error" # whether or not a response is lacking
              )

# The smaller-sooner amount category (m_s_cat, with levels below, at, and above IP) does not exist at the calibration stage; the response variable is also not retained for the calibration subset
rel_cols_clbr <- rel_cols[!(rel_cols %in% c("m_s_cat", "response"))]

# For the experiment, only the columns needed for modeling are retained; for example, the iteration index is only used in the calibration task (to count how often participants had to repeat it in order to comply with the performance criteria)
rel_cols_expt <- rel_cols[rel_cols %in% c("subject_ix", "frame", "m_s_cat", "m_s", "m_l", "t_s", "t_l", "choice", "response", "rt")]

Data selection takes the cleaned data (all_trials_cleaned) and defines three additional subsets, for the following purposes:

  1. standard trials from the calibration task (clbr_trials_standard) will be used to visualize estimated indifference points and written to disk for further (exploratory) analysis
# Standard trials from the calibration task will be written to disk for further analysis
(clbr_trials_standard <- 
    all_trials_cleaned %>%
    dplyr::filter(stage_id == "calibration") %>%
    # # Only the last block needed: performance did not meet criteria in earlier blocks
    # dplyr::filter(iter_ix == max(unique(iter_ix))) %>%
    dplyr::select(rel_cols_clbr) %>%
    dplyr::filter(trial_type %in% "standard"))
  2. standard trials from the experiment task (expt_trials_standard) will be written to disk for modeling analysis
# Standard trials from the experiment will be written to disk for further analysis
(expt_trials_standard <- 
   all_trials_cleaned %>%
   dplyr::filter(stage_id == "experiment",
                 trial_type %in% "standard",
                 too_fast == FALSE,
                 omission_error == FALSE
                 ) %>%
   # # Only the last block, in case an error occurred (e.g. in participant ID 92)
   # dplyr::filter(iter_ix == max(unique(iter_ix))) %>%
   dplyr::mutate(response = ifelse(choice == "ll",
                                  "upper",
                                  ifelse(choice == "ss",
                                         "lower",
                                         NA)
                                  )
                 ) %>%
   dplyr::select(rel_cols_expt)
 )
  3. control trials from both the calibration task and the experiment task (control_trials) will be used to assess whether participants paid attention
# Control trials will be used for assessing whether participants paid attention
(control_trials <- 
    all_trials_cleaned %>%
    dplyr::filter(!trial_type %in% "standard"))

Identify calibration task indifference points

Indifference points in the calibration task are defined as the smaller-sooner amount (m_s) on the last standard trial of each delay.

(calibration_indifference_points <- 
  clbr_trials_standard %>%
  # Keep only the columns needed to derive the indifference points
  dplyr::select(subject_ix, t_l, m_s) %>%
  # Last trial is approximate indifference point
  dplyr::group_by(subject_ix, t_l) %>%
  dplyr::summarize(ip = dplyr::last(m_s)) %>%
  dplyr::mutate(relative_diff = c(NA, diff(ip))/ 43.52,
                relative_diff_crit_met = (relative_diff < 0.2 | is.na(relative_diff)),
                last_ip_within_bounds = c(rep(TRUE,6),
                                          dplyr::last(ip) >= 0.17 &
                                          dplyr::last(ip) <= 43.52 - 0.17),
                monotonic_decrease = relative_diff_crit_met & last_ip_within_bounds))
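
To make the monotonic-decrease check concrete, here is a small worked example with hypothetical indifference points, using the same 43.52 divisor as the chunk above:

# Hypothetical indifference points for the 7 delays
ip_example <- c(30, 24, 20, 26, 14, 8, 5)
relative_diff_example <- c(NA, diff(ip_example)) / 43.52
# The increase from 20 to 26 gives a relative_diff of about 0.14, still below the
# 0.2 criterion; only an increase larger than 0.2 * 43.52 (about 8.70) between
# successive delays would flag a violation
relative_diff_example < 0.2 | is.na(relative_diff_example)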

Analyze data

We benchmark the following performance criteria:

# Benchmark trial numbers
trial_numbers <- 
  all_trials_cleaned %>%
  dplyr::group_by(stage_id) %>%
  # dplyr::filter(iter_ix == max(unique(iter_ix))) %>%
  dplyr::summarize(criterion = "n_trial",
                   value = n()) %>%
  tidyr::complete(stage_id, fill = list(criterion = "n_trial", value = 0)) %>%
  dplyr::mutate(criterion_met = 
                  dplyr::case_when((stage_id == "practice" & 
                                      value == crit$n_trial$practice) 
                                   ~ TRUE,
                                   (stage_id == "practice" & 
                                      value != crit$n_trial$practice) 
                                   ~ FALSE,
                                   (stage_id == "calibration" & 
                                      value %% crit$n_trial$calibration[[task_version]] == 0)
                                   ~ TRUE,
                                   (stage_id == "calibration" & 
                                      value %% crit$n_trial$calibration[[task_version]] != 0)
                                   ~ FALSE,
                                   (stage_id == "experiment" & 
                                      value == crit$n_trial$experiment[[task_version]])
                                   ~ TRUE,
                                   (stage_id == "experiment" & 
                                      value != crit$n_trial$experiment[[task_version]])
                                   ~ FALSE
                                   )
                )

# Benchmark control trial accuracy
accuracy_control_trials <- 
  control_trials %>%
  dplyr::group_by(stage_id) %>%
  dplyr::summarize(criterion = "accuracy_control",
                   value = sum(trial_correct) / n()) %>%
  dplyr::mutate(criterion_met = 
                  ifelse(value >= crit$min_accuracy_control,
                         TRUE,
                         FALSE)
                  )

# Benchmark proportion of fast responses
proportion_too_fast_rt <- 
  all_trials_cleaned %>%
  dplyr::filter(stage_id %in% c("calibration", "experiment"),
                trial_type == "standard") %>%
  dplyr::group_by(stage_id) %>%
  dplyr::summarize(n = n(),
                   n_too_fast = sum(too_fast == TRUE),
                   criterion = "proportion_too_fast",
                   value = n_too_fast / n) %>%
  dplyr::select(-n, -n_too_fast) %>%
  dplyr::mutate(criterion_met = 
                  ifelse(value <= crit$max_proportion_too_fast_rt,
                         TRUE,
                         FALSE)
                  )

# Benchmark proportion of omission errors
proportion_omission_error <- 
  all_trials_cleaned %>%
  dplyr::filter(stage_id %in% c("calibration", "experiment"),
                trial_type == "standard") %>%
  dplyr::group_by(stage_id) %>%
  dplyr::summarize(n = n(),
                   n_omission_error = sum(omission_error == TRUE),
                   criterion = "proportion_omission_error",
                   value = n_omission_error / n) %>%
  dplyr::select(-n, -n_omission_error) %>%
  dplyr::mutate(criterion_met = 
                  ifelse(value <= crit$max_proportion_omission_error,
                         TRUE,
                         FALSE)
                  )

# Benchmark proportion of SS and LL choices
proportion_choice_options <- 
  expt_trials_standard %>%
  dplyr::count(response) %>%
  dplyr::mutate(value = n / sum(n)) %>% 
  dplyr::arrange(-value) %>%
  dplyr::slice(1) %>%
  dplyr::mutate(stage_id = factor("experiment", levels = c("practice", "calibration", "experiment"), ordered = TRUE),
                criterion = "max_proportion_any_choice",
                criterion_met = 
                  ifelse(value < crit$max_proportion_any_choice,
                         TRUE,
                         FALSE)
                ) %>%
  dplyr::select(stage_id, criterion, value, criterion_met)

# Benchmark number of standard trials on which m_s > m_l
n_standard_trials_ms_greater_than_ml <- 
  expt_trials_standard %>%
  dplyr::filter(m_s > m_l) %>%
  dplyr::count() %>%
  dplyr::rename(value = n) %>%
  dplyr::mutate(stage_id = factor("experiment", levels = c("practice", "calibration", "experiment"), ordered = TRUE),
                criterion = "max_n_standard_trials_ms_greater_than_ml",
                criterion_met = 
                  ifelse(value <= crit$max_n_standard_trials_ms_greater_than_ml,
                         TRUE,
                         FALSE)
                )

# Show all benchmarks
(all_benchmarks <- 
dplyr::bind_rows(trial_numbers,
                 accuracy_control_trials,
                 proportion_too_fast_rt,
                 proportion_omission_error,
                 proportion_choice_options,
                 n_standard_trials_ms_greater_than_ml
                 ))

# Determine if all criteria were met
all_criterion_met <- all(all_benchmarks$criterion_met)

if (all_criterion_met) {
    print("All criteria were met: participant INCLUDED for further analysis")
  } else {
    print("Not all criteria were met: participant EXCLUDED for further analysis")
  }

Visualize data

Visualize the calibration task indifference points

if (params$visualize) {
 title_str <- if (all(calibration_indifference_points$monotonic_decrease)) {
   "Indifference points decrease monotonically with delay"
 } else {
   # if/else is used rather than ifelse(), because ifelse() does not reliably
   # return an expression object
   expression(paste("Indifference points ", bold("do not"), " decrease monotonically with delay"))
 }

if (all(calibration_indifference_points$monotonic_decrease)) {
  color_vals <- c("black", "red") # Two values must be provided, but only first will be used
} else {
  color_vals <- c("red", "black")
}

ggplot2::ggplot(calibration_indifference_points,
                ggplot2::aes(x = t_l,
                             y = ip,
                             color = monotonic_decrease)
                ) + 

  # Geoms
  ggplot2::geom_point(shape = 19,
                      size = 3) +

  ggplot2::geom_hline(yintercept = 43.52, color = "gray50", linetype = "solid", size = 0.5, alpha = 0.5) +
  ggplot2::geom_hline(yintercept = 0, color = "gray50", linetype = "solid", size = 0.5, alpha = 0.5) +

  # Scales
  ggplot2::scale_color_manual(name = "Monotonic decrease",
                                values = color_vals) +

  ggplot2::scale_x_continuous(name = "Delay (days)",
                              breaks = c(2,4,8,16,32,64,128),
                              labels = c(2,4,8,16,32,64,128)) +
  ggplot2::scale_y_continuous(name = "Indifference point (€)",
                              limits = c(0,43.52)) +

  # Title
  ggplot2::ggtitle(title_str) + 

  # Theme
  ggplot2::theme_minimal() + 
  ggplot2::theme(panel.grid.minor = ggplot2::element_blank()) 
}

Write data to disk

The directory in which the cleaned data will be written depends on whether the dataset was compromised because of a coding error in the stimulus presentation software (preproc_excl_software_error_data_dir) and if not, whether all performance criteria were met (preproc_incl_data_dir) or not (preproc_excl_performance_data_dir).

# Define output directory based on whether all performance criteria were met
data_output_dir <- 
  ifelse(!n_standard_trials_ms_greater_than_ml$criterion_met,
         preproc_excl_software_error_data_dir,
         ifelse(all_criterion_met,
                preproc_incl_data_dir,
                preproc_excl_performance_data_dir
                )
         )

We write 4 files to disk:

  1. calibration task data (final block only), which will be used for exploratory analysis of the indifference point procedure:
# Calibration task: used for exploratory data analysis
clbr_trials_standard_file <- 
  file.path(data_output_dir,
            sprintf("calibration_standard_trials_task-%s_pid-%.3d.csv",
                    task_version,
                    params$participant_id # pid
                    )
            )
readr::write_csv(clbr_trials_standard,
                 path = clbr_trials_standard_file
            )
print(sprintf("%s", clbr_trials_standard_file))
  2. experimental task data, which will be used for computational modeling and control analyses:
# Standard trials from experiment: used for modeling
expt_trials_standard_file <- 
  file.path(data_output_dir,
            sprintf("experiment_standard_trials_task-%s_pid-%.3d.csv", 
                    task_version,
                    params$participant_id # pid
                    )
            )

readr::write_csv(expt_trials_standard,
                 path = expt_trials_standard_file
            )
print(sprintf("%s", expt_trials_standard_file))
  3. indifference points from the calibration task:
# Indifference points from the calibration task
calibration_indifference_points_file <- 
  file.path(data_output_dir,
            sprintf("calibration_indifference_points_pid-%.3d.csv", 
                    params$participant_id # pid
                    )
            )

readr::write_csv(calibration_indifference_points %>% 
                   dplyr::select(subject_ix, t_l, ip),
                 path = calibration_indifference_points_file
            )
print(sprintf("%s", calibration_indifference_points_file))
  4. benchmark results:
benchmark_results_file <- 
  file.path(data_output_dir,
            sprintf("benchmark_results_task-%s_pid-%.3d.csv", 
                    task_version,
                    params$participant_id # pid
                    )
            )

readr::write_csv(all_benchmarks %>% 
                   dplyr::mutate(participant_id = params$participant_id) %>%
                   dplyr::select(participant_id, dplyr::everything()),
                 path = benchmark_results_file
            )
print(sprintf("%s", benchmark_results_file))


