# Set root dir to the project directory so that code always runs relative to the project directory, whether executed via knitr or interactively.
knitr::opts_knit$set(root.dir = rprojroot::find_root(rprojroot::has_file("DESCRIPTION")))
knitr::opts_knit$set(eval.after = 'fig.cap')
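# eval.after = 'fig.cap' defers evaluation of the fig.cap chunk option until
# after the chunk has run, so captions can reference objects created in the chunk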

# Attach the tidyverse packages to enable access to the pipe (%>%)
library(tidyverse)

Overview

This notebook contains visualizations of performance on control trials (the analyses were performed at the individual level in the notebook 01_preprocessing_idv.Rmd). In addition, this notebook contains visualizations of exclusions and inclusions (given that the same data are used).

Preliminaries

Before reading the data, the following is specified:

notebook_name <- 
   stringr::str_to_lower(stringr::str_replace_all(params$title, " ", "_"))
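# For illustration (hypothetical title): a params$title of "02 Control analyses"
# would yield notebook_name = "02_control_analyses"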
# Data will be read from here --------------------------------------------------

# Directory containing preprocessed data 
preprocessed_data_dir <- 
  file.path("data","derivatives", "01_preprocessing")

# Directory containing preprocessed data from included participants only
preprocessed_included_data_dir <- 
  file.path(preprocessed_data_dir, "included")

# Data will be written here ----------------------------------------------------

# Derived data will be written here
data_derivatives_dir <- 
  file.path("data","derivatives", notebook_name)

# Figures will be written here
figures_dir <- 
  file.path("figures", notebook_name)

# Create output directories if they do not exist yet
cmdsddfeitc::check_dir(all_dirs = c(data_derivatives_dir, figures_dir))
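# (check_dir is assumed to wrap something like:
#  for (d in all_dirs) if (!dir.exists(d)) dir.create(d, recursive = TRUE))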
# File names of benchmark data, all participants
benchmark_all_filenames <- list.files(path = preprocessed_data_dir, 
                                      pattern = sprintf("^benchmark_results_task-%s.*\\.csv$",
                                                        params$task[[1]]),
                                      recursive = TRUE
                                      )

# File names of benchmark data, included participants only
benchmark_included_filenames <- 
  list.files(path = preprocessed_included_data_dir, 
             pattern = sprintf("^benchmark_results_task-%s.*\\.csv$",
                               params$task[[1]])
             )
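
As an illustration of what the filename pattern matches (the filename below is hypothetical):

# Hypothetical example: with params$task = "defer_speedup", the pattern matches
grepl("^benchmark_results_task-defer_speedup.*\\.csv$",
      "benchmark_results_task-defer_speedup_run-1.csv")  # returns TRUE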

Read data

Having specified all relevant variables, the benchmark data (generated by the notebook 01_preprocessing.Rmd) are read:

# Read the data, using task-dependent column specifications

# Benchmark data - all
benchmark_all_data <- tibble::tibble()

for (i_file in seq_along(benchmark_all_filenames)) {
  benchmark_all_data <- 
    dplyr::bind_rows(benchmark_all_data,
                     readr::read_csv(file = file.path(preprocessed_data_dir, benchmark_all_filenames[[i_file]]),
                                     col_types = cmdsddfeitc::get_col_types("benchmark_data"))
                     )
}

# Benchmark data - included
benchmark_included_data <- tibble::tibble()

for (i_file in seq_along(benchmark_included_filenames)) {
  benchmark_included_data <- 
    dplyr::bind_rows(benchmark_included_data,
                     readr::read_csv(file = file.path(preprocessed_included_data_dir, benchmark_included_filenames[[i_file]]),
                                     col_types = cmdsddfeitc::get_col_types("benchmark_data"))
                     )
}
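
As an aside, the loops above grow the tibbles row by row; a minimal equivalent sketch using purrr (assuming identical columns across files, and analogously for the included data) would be:

# Sketch: read all benchmark files and row-bind them in one call
benchmark_all_data <- 
  purrr::map_dfr(file.path(preprocessed_data_dir, benchmark_all_filenames),
                 readr::read_csv,
                 col_types = cmdsddfeitc::get_col_types("benchmark_data"))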

Preprocess data

The benchmark data are analyzed for two purposes:

  1. visualization of performance on control trials
  2. visualizations of exclusions and inclusions

For the first purpose, benchmark data from included participants (benchmark_included_data) are tidied and prepared for plotting.

# Tidy the benchmark_included_data data, so that it's ready for plotting
benchmark_included_data_tidy <- 
  benchmark_included_data %>% 
  dplyr::filter(!(criterion %in% c("n_trial", 
                                   "max_n_standard_trials_ms_greater_than_ml",
                                   "max_proportion_any_choice")), 
                stage_id == "experiment") %>%
  # These columns have no added value anymore (all rows have stage_id = experiment & criterion_met = TRUE)
  dplyr::select(-criterion_met, -stage_id) %>%
  # Reshape to wide so that control trial accuracy can be reframed as an error 
  # rate (i.e. 1 - accuracy), for compatibility with the other measures
  tidyr::spread(key = criterion, value = value) %>%
  dplyr::mutate(proportion_control_error = 1 - accuracy_control) %>%
  # This column is now redundant
  dplyr::select(-accuracy_control) %>%
  tidyr::gather(key = "criterion",
                value = "value",
                proportion_control_error, proportion_too_fast, 
                proportion_omission_error) %>%
  # Relabel, for plotting purposes
  dplyr::mutate(participant_id = factor(participant_id),
                criterion = factor(criterion, 
                                   levels = c("proportion_too_fast",
                                              "proportion_omission_error",
                                              "proportion_control_error"),
                                   labels = c("Fast responses",
                                              "Omission errors",
                                              "Commission errors"
                                              ),
                                   ordered = TRUE
                                   )
                )

# Print for inspection
benchmark_included_data_tidy %>% dplyr::arrange(participant_id, criterion)
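
As an aside, spread() and gather() were superseded by pivot_wider() and pivot_longer() in tidyr 1.0; a minimal toy sketch of the correspondence (not using the benchmark data):

# Toy example of the wide/long round trip used above
toy <- tibble::tibble(id = c(1, 1), criterion = c("a", "b"), value = c(0.1, 0.2))
toy %>%
  tidyr::pivot_wider(names_from = criterion, values_from = value) %>%
  tidyr::pivot_longer(cols = c(a, b), names_to = "criterion", values_to = "value")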

For the second purpose, benchmark data from both excluded and included participants (benchmark_all_data) are preprocessed. Specifically, I identify four groups of participants, each with its own lookup table: (1) those who failed the calibration task performance criteria, (2) those who failed the experiment task performance criteria, (3) those affected by a technical error in the task, and (4) those included in the analysis.

Note: lt_failed_experiment takes precedence over lt_affected_by_technical_error.

# Identify groups --------------------------------------------------------------

# We identify 4 groups, for each we make a lookup table

# Group 1 - participants who failed calibration task performance criteria
lt_failed_calibration <- 
  benchmark_all_data %>% 
  # Identify these participants by whether they failed the n_trial criterion 
  # for the experiment (i.e. n_trial = 0)
  dplyr::filter(stage_id == "experiment", criterion == "n_trial") %>% 
  dplyr::select(participant_id, criterion_met) %>%
  dplyr::mutate(failed_calibration = ifelse(!criterion_met,
                                            "Y",
                                            "N")) %>%
  dplyr::mutate(task = params$task) %>%
  dplyr::select(participant_id, task, failed_calibration)

# Group 2 - participants who failed experiment task performance criteria
lt_failed_experiment <- 
  benchmark_all_data %>% 
  dplyr::filter(stage_id == "experiment", criterion %in% c("accuracy_control", 
                                                           "proportion_too_fast",
                                                           "proportion_omission_error",
                                                           "max_proportion_any_choice")
                ) %>% 
  dplyr::group_by(participant_id) %>%
  dplyr::summarize(failed_experiment = ifelse(!all(criterion_met),
                                              "Y",
                                              "N"
                                              )) %>%
  dplyr::arrange(participant_id)


# Group 3 - participants who were affected by a technical error in the task

# Note that failure to meet performance criteria (control trials) takes 
# precedence over technical error (which affected standard trials only)
pid_failed_calibration_or_experiment <- 
  dplyr::left_join(lt_failed_calibration,
                   lt_failed_experiment,
                   by = "participant_id") %>%
  dplyr::filter((failed_calibration == "Y") | (failed_experiment == "Y")) %>%
  dplyr::pull(participant_id)

lt_affected_by_technical_error <- 
  benchmark_all_data %>% 
  dplyr::filter(!(participant_id %in% pid_failed_calibration_or_experiment),
                stage_id == "experiment", 
                criterion == "max_n_standard_trials_ms_greater_than_ml") %>%
  dplyr::mutate(affected_by_technical_error = ifelse(!criterion_met,
                                                     "Y",
                                                     "N")
                ) %>%
  dplyr::select(participant_id, affected_by_technical_error) %>%
  dplyr::arrange(participant_id)

# Group 4 - participants who were included in the analysis

lt_included_in_analysis <- 
  benchmark_included_data %>%
  dplyr::filter(stage_id == "experiment", criterion == "n_trial") %>%
  dplyr::mutate(included = ifelse(criterion_met,
                                  "Y",
                                  "N")
                ) %>%
  dplyr::select(participant_id, included) 

# Join tibbles together for plotting -------------------------------------------

sankey_data <- 
  dplyr::left_join(lt_failed_calibration, 
                   lt_included_in_analysis, 
                   by = "participant_id") %>%
  dplyr::left_join(., 
                   lt_failed_experiment, 
                   by = "participant_id") %>%
  dplyr::left_join(., 
                   lt_affected_by_technical_error, 
                   by = "participant_id") %>%
  # Replace NAs with meaningful labels, depending on column
  tidyr::replace_na(list(included = "N",
                         failed_calibration = "NA",
                         failed_experiment = "NA",
                         affected_by_technical_error = "NA")) %>%
  dplyr::arrange(participant_id) %>%
  dplyr::select(participant_id, task, failed_calibration, failed_experiment, 
                affected_by_technical_error, included) %>%
  dplyr::group_by(task, failed_calibration, failed_experiment, 
                  affected_by_technical_error, included) %>%
  # Compute frequencies
  dplyr::summarize(freq = n()) %>%
  dplyr::ungroup() %>%
  # Make group labels for plotting purposes
  dplyr::mutate(grp_label = 
                  ifelse(included == "Y",
                         "included",
                         ifelse(affected_by_technical_error == "Y",
                                "technical_error",
                                ifelse((failed_calibration == "N") & 
                                         (failed_experiment == "Y"),
                                       "failed_experiment",
                                       ifelse(failed_calibration == "Y",
                                              "failed_calibration",
                                              NA)
                                       )
                                )
                         )
                ) %>%
  # Convert variables to factors to make sure that the plot looks as desired
  dplyr::mutate(failed_calibration = factor(failed_calibration,
                                            levels = c("N", "Y"),
                                            labels = c("N", "Y"),
                                            ordered = TRUE),
                failed_experiment = factor(failed_experiment,
                                           levels = c("Y", "N", "NA"),
                                           labels = c("Y", "N", "NA"),
                                           ordered = TRUE),
                affected_by_technical_error = factor(affected_by_technical_error,
                                                     levels = c("Y", "N", "NA"),
                                                     labels = c("Y", "N", "NA"),
                                                     ordered = TRUE),
                included = factor(included,
                                  levels = c("N", "Y"),
                                  labels = c("N", "Y"),
                                  ordered = TRUE),
                grp_label = 
                  factor(grp_label,
                         levels = c("included", "failed_calibration", 
                                    "failed_experiment", "technical_error"),
                         labels = c("included", "failed calibration", 
                                    "failed experiment", "technical error") 
                  )
  )

# Print for inspection
sankey_data %>% dplyr::arrange(dplyr::desc(freq))
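
The nested ifelse() calls that assign grp_label encode the same precedence logic described above; an equivalent, flatter formulation with dplyr::case_when() would be (a sketch, shown as comments):

# dplyr::mutate(grp_label = dplyr::case_when(
#   included == "Y"                                      ~ "included",
#   affected_by_technical_error == "Y"                   ~ "technical_error",
#   failed_calibration == "N" & failed_experiment == "Y" ~ "failed_experiment",
#   failed_calibration == "Y"                            ~ "failed_calibration",
#   TRUE                                                 ~ NA_character_
# ))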

Visualize data

Figure \@ref(fig:sancheck) shows performance on control trials. Figure \@ref(fig:figsankey) shows the flow of participants through the experiment and the number of included and excluded datasets.

Performance on control trials

In section 2.2. of the preregistration, we listed the criteria for excluding participants after study enrollment, including:

  1. accuracy < 75% on catch trials and instruction manipulation check trials (see also section 2.6.1. of preregistration)
  2. no response on > 25% of standard intertemporal choice trials
  3. fast response time (< 1500 ms) on > 25% of standard intertemporal choice trials

Figure \@ref(fig:sancheck) shows these criteria as commission error rate (criterion 1; note that a minimum accuracy of 75% corresponds to a maximum error rate of 25%), omission error rate (criterion 2), and fast response rate (criterion 3).

# Source: https://stackoverflow.com/questions/10035786/ggplot2-y-axis-label-decimal-precision
fmt_decimals <- function(decimals = 0){
   # Return a function responsible for formatting the 
   # axis labels with a given number of decimals 
   function(x) format(x, nsmall = decimals, scientific = FALSE)
}
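# Quick illustration: fmt_decimals(2)(c(0, 0.1, 0.25)) returns
# "0.00" "0.10" "0.25"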

expt_control_trial_performance <- 
  ggplot2::ggplot(benchmark_included_data_tidy,
                  mapping = ggplot2::aes(x = criterion,
                                         y = value)) + 
    ggbeeswarm::geom_quasirandom(color = "darkgrey",
                                 shape = 21) +

    # Red dashed line represents the value above which a participant would be excluded
    ggplot2::geom_hline(yintercept = 0.25,
                        color = "red",
                        linetype = "dashed") +
    ggplot2::stat_summary(fun.ymin = function(z) {quantile(z, 0.25)},
                          fun.ymax = function(z) {quantile(z, 0.75)},
                          fun.y = median
                          ) +
    ggplot2::coord_flip() +
    ggplot2::scale_x_discrete(name = "Criterion") +
    ggplot2::scale_y_continuous(name = "Rate",
                                limits = c(0,.3),
                                breaks = seq(0,.3,.1),
                                labels = fmt_decimals(2)) +

    cmdsddfeitc::theme_cmfsddfeitc() +
    ggplot2::theme(aspect.ratio = 2,
                   panel.background = ggplot2::element_rect(fill = "white"))

expt_control_trial_performance

if (params$task == "defer_speedup") {
  ctrl_trials <- "catch"
} else if (params$task == "date_delay") {
  ctrl_trials <- "catch and instruction manipulation check"
}

cap_expt_control_trial_performance <- sprintf("Rates of commission errors on control (%s) trials, omission errors (response failures), and fast (< 1.5 s) responses. For each variable, individual data (grey open circles), the group median (black filled circle), and the interquartile range (black line) are shown. The red dashed line indicates the maximum permissible rate, above which participants were excluded.", ctrl_trials)

Sankey diagram of participant inclusions and exclusions

n_included <- 
  sankey_data %>% 
  dplyr::filter(grp_label == "included") %>% 
  dplyr::pull(freq)
n_tech_error <- 
  sankey_data %>% 
  dplyr::filter(grp_label == "technical error") %>% 
  dplyr::pull(freq)
n_failed_calibration <- 
  sankey_data %>% 
  dplyr::filter(grp_label == "failed calibration") %>% 
  dplyr::pull(freq)
n_failed_experiment <- 
  sankey_data %>% 
  dplyr::filter(grp_label == "failed experiment") %>% 
  dplyr::pull(freq)
n_expt <- n_included + n_tech_error + n_failed_experiment
n_total <- n_failed_calibration + n_expt
n_excluded <- n_total - n_included
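
# Defensive check (an addition; assumes every group occurs at least once):
# dplyr::pull() returns a zero-length vector for an absent group, which would
# silently propagate through the sums above
stopifnot(length(n_included) == 1, length(n_tech_error) == 1,
          length(n_failed_calibration) == 1, length(n_failed_experiment) == 1)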

nodes <- data.frame("name" = c(sprintf("Calibration (%d)", n_total),   # 0
                               sprintf("Experiment (%d)", n_expt),     # 1
                               sprintf("Excluded (%d)", n_excluded),   # 2
                               sprintf("Included (%d)", n_included)),  # 3
                    "group" = c("stage",
                                "stage",
                                "status_ex",
                                "status_in"
                                )
                    )

# Give a color for each group:
my_color <- 'd3.scaleOrdinal() .domain(["stage", "status_in", "status_ex", "in", "tech_ex", "perf_ex"]) .range(["grey", "blue", "red", "lightblue", "pink", "orange"])'

links <- as.data.frame(matrix(c(0, 1, n_expt,               # calibration -> experiment
                                0, 2, n_failed_calibration, # calibration -> excluded
                                1, 2, n_failed_experiment,  # experiment -> excluded (performance)
                                1, 2, n_tech_error,         # experiment -> excluded (technical error)
                                1, 3, n_included            # experiment -> included
                                ),
                              byrow = TRUE, 
                              ncol = 3)
                       )

names(links) <- c("source", "target", "value")
links$group <- c("in", "perf_ex", "perf_ex", "tech_ex", "in")
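
# Note: networkD3 expects zero-based node indices, so source = 0 refers to the
# first row of `nodes` (Calibration) and target = 3 to the fourth (Included)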
sn <- networkD3::sankeyNetwork(Links = links,
                         Nodes = nodes,
                         Source = "source",
                         Target = "target",
                         Value = "value",
                         NodeID = "name",
                         NodeGroup = "group",
                         LinkGroup = "group",
                         colourScale = my_color,
                         fontSize = 18,
                         fontFamily = "Helvetica",
                         width = 476,
                         nodeWidth = 5,
                         nodePadding = 20,
                         sinksRight = FALSE)

# Print Sankey diagram
sn

# Caption text
cap_sn <- sprintf("Sankey diagram of exclusions and inclusions for the %s task. A total of %d participants were enrolled in the experiment and participated in the calibration task (left grey node). After the calibration task, %d participants were excluded because they failed to meet preset performance criteria (orange band connecting left grey node and right red node), leaving %d participants in the experiment. After the experiment, %d additional participants were excluded: %d failed to meet preset performance criteria in the experiment (orange band connecting middle grey node and right red node) and %d had data affected by a technical error (pink band connecting middle grey node and right red node). Consequently, data from %d participants were included in the analyses (blue band connecting middle grey node and right blue node).",
                  stringr::str_replace(params$task, pattern = "_", replacement = "-"), 
                  n_total, 
                  n_failed_calibration, 
                  n_total - n_failed_calibration, 
                  n_failed_experiment + n_tech_error, 
                  n_failed_experiment, 
                  n_tech_error, 
                  n_included
                  )

Write data

# General parameters ===========================================================

# Resolution
dpi <- 300

Write plots of performance on control trials in the experiment to disk

# Performance on control trials in the experiment ==============================

# Filename
fn <- sprintf("expt_control_trial_performance_task-%s_dpi-%d", params$task, dpi)

# As raster image --------------------------------------------------------------
filename <- paste0(fn, ".png")
ggplot2::ggsave(path = figures_dir,
                filename = filename,
                plot = expt_control_trial_performance,
                width = 8.5,
                units = "cm",
                dpi = dpi)

print(file.path(figures_dir, filename))

# As vector image --------------------------------------------------------------
filename <- paste0(fn, ".pdf")
ggplot2::ggsave(path = figures_dir,
                filename = filename,
                plot = expt_control_trial_performance,
                width = 8.5,
                units = "cm",
                dpi = dpi)

# Print filename, so it's clear from the static report where to find the file
print(file.path(figures_dir, filename))

Write Sankey diagram of exclusions and inclusions to disk

# Sankey diagram of exclusions and inclusions ==================================

# Filename
fn <- sprintf("sankey_diagram_exclusions_inclusions_task-%s_dpi-%d", params$task, dpi)

# As raster image --------------------------------------------------------------

# N.B. this requires PhantomJS (e.g. installed via Homebrew on macOS: brew install phantomjs) and rbokeh
filename <- paste0(fn, ".png")
rbokeh::widget2png(sn, file.path(figures_dir, filename))

# Print filename, so it's clear from the static report where to find the file
print(file.path(figures_dir, filename))
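
rbokeh::widget2png() is one route to a PNG; a possible alternative sketch, assuming the htmlwidgets and webshot packages are installed (webshot also relies on PhantomJS), saves the widget as HTML and rasterizes it:

# Alternative (sketch): save widget as self-contained HTML, then screenshot it
html_file <- file.path(figures_dir, paste0(fn, ".html"))
htmlwidgets::saveWidget(sn, file = html_file, selfcontained = TRUE)
webshot::webshot(url = html_file, file = file.path(figures_dir, filename))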

