```r
# Set root dir to the project directory to ensure that code is always run
# relative to the project directory, no matter whether it is run using `knitr`
# or interactively
knitr::opts_knit$set(root.dir = rprojroot::find_root(rprojroot::has_file("DESCRIPTION")))
knitr::opts_knit$set(eval.after = 'fig.cap')

# Attach the tidyverse package to enable access to the pipe (%>%)
require(tidyverse)
```
This notebook contains visualizations of performance on control trials (the analyses were performed at the individual level in the notebook 01_preprocessing_idv.Rmd). In addition, this notebook contains visualizations of exclusions and inclusions (given that the same data are used).
Before reading the data, the notebook name and the input and output directories are specified.
```r
notebook_name <- stringr::str_to_lower(stringr::str_replace_all(params$title, " ", "_"))
```

The notebook name (`notebook_name`) is used to save output to a notebook-specific directory.

```r
# Data will be read from here --------------------------------------------------
# Directory containing preprocessed data
preprocessed_data_dir <- file.path("data", "derivatives", "01_preprocessing")

# Directory containing preprocessed data from included participants only
preprocessed_included_data_dir <- file.path(preprocessed_data_dir, "included")

# Data will be written here ----------------------------------------------------
# Derived data will be written here
data_derivatives_dir <- file.path("data", "derivatives", notebook_name)

# Figures will be written here
figures_dir <- file.path("figures", notebook_name)

# Create dirs if they don't exist
cmdsddfeitc::check_dir(all_dirs = c(data_derivatives_dir, figures_dir))
```
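`cmdsddfeitc::check_dir()` is a helper from the project's own analysis package and its implementation is not shown here. A minimal base-R sketch of what such a helper presumably does (an assumption, not the actual implementation):

```r
# Assumed behavior: create any of the given directories that do not yet exist
check_dir <- function(all_dirs) {
  for (d in all_dirs) {
    if (!dir.exists(d)) {
      dir.create(d, recursive = TRUE)
    }
  }
  invisible(all_dirs)
}
```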
```r
# File names of benchmark data, all participants
benchmark_all_filenames <- list.files(path = preprocessed_data_dir,
                                      pattern = sprintf("^benchmark_results_task-%s.*.csv$", params$task[[1]]),
                                      recursive = TRUE)

# File names of benchmark data, included participants only
benchmark_included_filenames <- list.files(path = preprocessed_included_data_dir,
                                           pattern = sprintf("^benchmark_results_task-%s.*.csv$", params$task[[1]]))
```
These are the file names of the benchmark data for all participants (`benchmark_all_filenames`) and for the subset of included participants (`benchmark_included_filenames`). Having specified all relevant variables, the benchmark data (generated by the notebook 01_preprocessing.Rmd) are read:
```r
# Read the data, using task-dependent column specifications

# Benchmark data - all
benchmark_all_data <- tibble::tibble()
for (i_file in 1:length(benchmark_all_filenames)) {
  benchmark_all_data <-
    dplyr::bind_rows(benchmark_all_data,
                     readr::read_csv(file = file.path(preprocessed_data_dir,
                                                      benchmark_all_filenames[[i_file]]),
                                     col_types = cmdsddfeitc::get_col_types("benchmark_data")))
}

# Benchmark data - included
benchmark_included_data <- tibble::tibble()
for (i_file in 1:length(benchmark_included_filenames)) {
  benchmark_included_data <-
    dplyr::bind_rows(benchmark_included_data,
                     readr::read_csv(file = file.path(preprocessed_included_data_dir,
                                                      benchmark_included_filenames[[i_file]]),
                                     col_types = cmdsddfeitc::get_col_types("benchmark_data")))
}
```
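As a design note, the read-and-bind loops above could equivalently be written with `purrr`; a minimal sketch under the same assumptions (same directories and the same `cmdsddfeitc::get_col_types()` column specification):

```r
# Equivalent to the first loop above: read each file and row-bind the results
benchmark_all_data <- purrr::map_dfr(
  benchmark_all_filenames,
  ~ readr::read_csv(file = file.path(preprocessed_data_dir, .x),
                    col_types = cmdsddfeitc::get_col_types("benchmark_data"))
)
```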
The benchmark data are analyzed for two purposes:

1. to visualize performance on control trials for the included participants;
2. to visualize exclusions and inclusions across all participants.
For the first purpose, benchmark data from included participants (`benchmark_included_data`) are tidied and prepared for plotting.
```r
# Tidy the benchmark_included_data data, so that it's ready for plotting
benchmark_included_data_tidy <-
  benchmark_included_data %>%
  dplyr::filter(!(criterion %in% c("n_trial",
                                   "max_n_standard_trials_ms_greater_than_ml",
                                   "max_proportion_any_choice")),
                stage_id == "experiment") %>%
  # These columns have no added value anymore
  # (all rows have stage_id = experiment & criterion_met = TRUE)
  dplyr::select(-criterion_met, -stage_id) %>%
  # To make plotting easier, frame control trial accuracy as error rate
  # (i.e. 1 - accuracy) for compatibility with the other measures
  tidyr::spread(key = criterion, value = value) %>%
  dplyr::mutate(proportion_control_error = 1 - accuracy_control) %>%
  # This column is now redundant
  dplyr::select(-accuracy_control) %>%
  tidyr::gather(key = "criterion", value = "value",
                proportion_control_error, proportion_too_fast, proportion_omission_error) %>%
  # Relabel, for plotting purposes
  dplyr::mutate(participant_id = factor(participant_id),
                criterion = factor(criterion,
                                   levels = c("proportion_too_fast",
                                              "proportion_omission_error",
                                              "proportion_control_error"),
                                   labels = c("Fast responses",
                                              "Omission errors",
                                              "Commission errors"),
                                   ordered = TRUE))

# Print for inspection
benchmark_included_data_tidy %>%
  dplyr::arrange(participant_id, criterion)
```
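As an optional check (not part of the original notebook), the group-level statistics that the figure below visualizes, the median and interquartile range per criterion, can be computed directly from the tidied data:

```r
# Per-criterion median and interquartile range (matches what stat_summary()
# displays in the control-trial performance figure)
benchmark_included_data_tidy %>%
  dplyr::group_by(criterion) %>%
  dplyr::summarize(median = median(value),
                   q25 = quantile(value, 0.25),
                   q75 = quantile(value, 0.75))
```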
For the second purpose, benchmark data from both excluded and included participants (`benchmark_all_data`) are preprocessed. Specifically, I:

- identify participants who failed the calibration task performance criteria (`lt_failed_calibration`);
- identify participants who failed the experiment task performance criteria (`lt_failed_experiment`);
- identify participants whose data were affected by a technical error in the task (`lt_affected_by_technical_error`);
- identify participants who were included in the analysis (`lt_included_in_analysis`);
- join these lookup tables into a single tibble (`sankey_data`) for plotting.

Note: `lt_failed_experiment` takes precedence over `lt_affected_by_technical_error`.
```r
# Identify groups --------------------------------------------------------------
# We identify 4 groups; for each, we make a lookup table

# Group 1 - participants who failed calibration task performance criteria
lt_failed_calibration <-
  benchmark_all_data %>%
  # Identify these participants as those who fail the n_trial criterion for
  # the experiment (i.e. n_trial = 0)
  dplyr::filter(stage_id == "experiment",
                criterion == "n_trial") %>%
  dplyr::select(participant_id, criterion_met) %>%
  dplyr::mutate(failed_calibration = ifelse(!criterion_met, "Y", "N")) %>%
  dplyr::mutate(task = params$task) %>%
  dplyr::select(participant_id, task, failed_calibration)

# Group 2 - participants who failed experiment task performance criteria
lt_failed_experiment <-
  benchmark_all_data %>%
  dplyr::filter(stage_id == "experiment",
                criterion %in% c("accuracy_control",
                                 "proportion_too_fast",
                                 "proportion_omission_error",
                                 "max_proportion_any_choice")) %>%
  dplyr::group_by(participant_id) %>%
  dplyr::summarize(failed_experiment = ifelse(!all(criterion_met), "Y", "N")) %>%
  dplyr::arrange(participant_id)

# Group 3 - participants who were affected by a technical error in the task
# Note that failure to meet performance criteria (control trials) takes
# precedence over technical error (which affected standard trials only)
pid_failed_calibration_or_experiment <-
  dplyr::left_join(lt_failed_calibration, lt_failed_experiment,
                   by = "participant_id") %>%
  dplyr::filter((failed_calibration == "Y") | (failed_experiment == "Y")) %>%
  dplyr::pull(participant_id)

lt_affected_by_technical_error <-
  benchmark_all_data %>%
  dplyr::filter(!(participant_id %in% pid_failed_calibration_or_experiment),
                stage_id == "experiment",
                criterion == "max_n_standard_trials_ms_greater_than_ml") %>%
  dplyr::mutate(affected_by_tecnical_error = ifelse(!criterion_met, "Y", "N")) %>%
  dplyr::select(participant_id, affected_by_tecnical_error) %>%
  dplyr::arrange(participant_id)

# Group 4 - participants who were included in the analysis
lt_included_in_analysis <-
  benchmark_included_data %>%
  dplyr::filter(stage_id == "experiment",
                criterion == "n_trial") %>%
  dplyr::mutate(included = ifelse(criterion_met, "Y", "N")) %>%
  dplyr::select(participant_id, included)

# Join tibbles together for plotting -------------------------------------------
sankey_data <-
  dplyr::left_join(lt_failed_calibration, lt_included_in_analysis,
                   by = "participant_id") %>%
  dplyr::left_join(., lt_failed_experiment, by = "participant_id") %>%
  dplyr::left_join(., lt_affected_by_technical_error, by = "participant_id") %>%
  # Replace NAs with meaningful labels, depending on column
  tidyr::replace_na(list(included = "N",
                         failed_calibration = "NA",
                         failed_experiment = "NA",
                         affected_by_tecnical_error = "NA")) %>%
  dplyr::arrange(participant_id) %>%
  dplyr::select(participant_id, task, failed_calibration, failed_experiment,
                affected_by_tecnical_error, included) %>%
  dplyr::group_by(task, failed_calibration, failed_experiment,
                  affected_by_tecnical_error, included) %>%
  # Compute frequencies
  dplyr::summarize(freq = n()) %>%
  dplyr::ungroup() %>%
  # Make group labels for plotting purposes
  dplyr::mutate(grp_label = ifelse(included == "Y", "included",
                            ifelse(affected_by_tecnical_error == "Y", "technical_error",
                            ifelse((failed_calibration == "N") & (failed_experiment == "Y"), "failed_experiment",
                            ifelse(failed_calibration == "Y", "failed_calibration",
                                   NA))))) %>%
  # Convert variables to factors to make sure that the plot looks as desired
  dplyr::mutate(failed_calibration = factor(failed_calibration,
                                            levels = c("N", "Y"),
                                            labels = c("N", "Y"),
                                            ordered = TRUE),
                failed_experiment = factor(failed_experiment,
                                           levels = c("Y", "N", "NA"),
                                           labels = c("Y", "N", "NA"),
                                           ordered = TRUE),
                affected_by_tecnical_error = factor(affected_by_tecnical_error,
                                                    levels = c("Y", "N", "NA"),
                                                    labels = c("Y", "N", "NA"),
                                                    ordered = TRUE),
                included = factor(included,
                                  levels = c("N", "Y"),
                                  labels = c("N", "Y"),
                                  ordered = TRUE),
                grp_label = factor(grp_label,
                                   levels = c("included", "failed_calibration",
                                              "failed_experiment", "technical_error"),
                                   labels = c("included", "failed calibration",
                                              "failed experiment", "technical error")
                                   # ordered = TRUE
                                   ))

# Print for inspection
sankey_data %>%
  dplyr::arrange(dplyr::desc(freq))
```
Figure \@ref(fig:sancheck) shows the sanity check results. Figure \@ref(fig:figsankey) shows the flow of participants through the experiment and the number of included and excluded datasets.
In section 2.2 of the preregistration, we listed the exclusion criteria applied after study enrollment, including:

1. accuracy on control trials below 75%;
2. an omission error (response failure) rate above 25%;
3. a fast response (faster than 1.5 s) rate above 25%.
Figure \@ref(fig:sancheck) shows these criteria as the commission error rate (criterion 1; note that a minimum of 75% accuracy corresponds to a maximum of 25% errors), the omission error rate (criterion 2), and the fast response rate (criterion 3).
```r
# Source: https://stackoverflow.com/questions/10035786/ggplot2-y-axis-label-decimal-precision
fmt_dcimals <- function(decimals = 0) {
  # Return a function responsible for formatting the axis labels with a given
  # number of decimals
  function(x) format(x, nsmall = decimals, scientific = FALSE)
}

expt_control_trial_performance <-
  ggplot2::ggplot(benchmark_included_data_tidy,
                  mapping = ggplot2::aes(x = criterion, y = value)) +
  ggbeeswarm::geom_quasirandom(color = "darkgrey", shape = 21) +
  # Red dashed line represents the value above which a participant would be excluded
  ggplot2::geom_hline(yintercept = 0.25, color = "red", linetype = "dashed") +
  ggplot2::stat_summary(fun.ymin = function(z) {quantile(z, 0.25)},
                        fun.ymax = function(z) {quantile(z, 0.75)},
                        fun.y = median) +
  ggplot2::coord_flip() +
  ggplot2::scale_x_discrete(name = "Criterion") +
  ggplot2::scale_y_continuous(name = "Rate",
                              limits = c(0, .3),
                              breaks = seq(0, .3, .1),
                              labels = fmt_dcimals(2)) +
  cmdsddfeitc::theme_cmfsddfeitc() +
  ggplot2::theme(aspect.ratio = 2,
                 panel.background = ggplot2::element_rect(fill = "white"))

expt_control_trial_performance

if (params$task == "defer_speedup") {
  ctrl_trials <- "catch"
} else if (params$task == "date_delay") {
  ctrl_trials <- "catch and instruction manipulation check"
}

cap_expt_control_trial_performance <-
  sprintf("Rates of commission errors on control (%s) trials, omission errors (response failures), and fast (< 1.5 s) responses. For each variable, individual data (grey open circles), the group median (black filled circle), and the interquartile range (black line) are shown. The red dashed line indicates the maximum permissible rate, above which participants were excluded.",
          ctrl_trials)
```
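For reference, `fmt_dcimals()` returns a formatter function rather than formatted values; a quick illustration (not part of the original notebook):

```r
# fmt_dcimals(2) yields a function that formats numbers with two decimals
fmt_dcimals(2)(c(0, 0.1, 0.25))
#> [1] "0.00" "0.10" "0.25"
```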
```r
n_included <- sankey_data %>%
  dplyr::filter(grp_label == "included") %>%
  dplyr::pull(freq)
n_tech_error <- sankey_data %>%
  dplyr::filter(grp_label == "technical error") %>%
  dplyr::pull(freq)
n_failed_calibration <- sankey_data %>%
  dplyr::filter(grp_label == "failed calibration") %>%
  dplyr::pull(freq)
n_failed_experiment <- sankey_data %>%
  dplyr::filter(grp_label == "failed experiment") %>%
  dplyr::pull(freq)

n_expt <- n_included + n_tech_error + n_failed_experiment
n_total <- n_failed_calibration + n_expt
n_excluded <- n_total - n_included

nodes = data.frame("name" = c(sprintf("Calibration (%d)", n_total),  # 0
                              sprintf("Experiment (%d)", n_expt),    # 1
                              sprintf("Excluded (%d)", n_excluded),  # 2
                              sprintf("Included (%d)", n_included)), # 3
                   "group" = c("stage", "stage", "status_ex", "status_in"))

# Give a color for each group:
my_color <- 'd3.scaleOrdinal() .domain(["stage", "status_in", "status_ex", "in", "tech_ex", "perf_ex"]) .range(["grey", "blue", "red", "lightblue", "pink", "orange"])'

links = as.data.frame(matrix(c(0, 1, n_expt,               # from calibration to experiment
                               0, 2, n_failed_calibration, # from calibration to excluded
                               1, 2, n_failed_experiment,  # from experiment to excluded
                               1, 2, n_tech_error,         # from experiment to excluded
                               1, 3, n_included            # from experiment to included
                               ),
                             byrow = TRUE, ncol = 3))
names(links) = c("source", "target", "value")
links$group = c("in", "perf_ex", "perf_ex", "tech_ex", "in")

sn <- networkD3::sankeyNetwork(Links = links, Nodes = nodes,
                               Source = "source", Target = "target",
                               Value = "value", NodeID = "name",
                               NodeGroup = "group", LinkGroup = "group",
                               colourScale = my_color,
                               fontSize = 18, fontFamily = "Helvetica",
                               width = 476, nodeWidth = 5, nodePadding = 20,
                               sinksRight = FALSE)

# Print Sankey diagram
sn

# Caption text
cap_sn <- sprintf("Sankey diagram of exclusions and inclusions for the %s task. A total of %d participants were enrolled in the experiment and participated in the calibration task (left grey node). After the calibration task, %d participants were excluded because they failed to meet preset performance criteria (orange band connecting left grey node and right red node), leaving %d participants in the experiment. After the experiment, %d additional participants were excluded: %d failed to meet preset performance criteria in the experiment (orange band connecting middle grey node and right red node) and %d because their data were affected by a technical error (pink band connecting middle grey node and right red node). Consequently, data from %d participants were included in the analyses (blue band connecting middle grey node and right blue node).",
                  stringr::str_replace(params$task, pattern = "_", replacement = "-"),
                  n_total, n_failed_calibration, n_total - n_failed_calibration,
                  n_failed_experiment + n_tech_error, n_failed_experiment,
                  n_tech_error, n_included)
```
```r
# General parameters ===========================================================
# Resolution
dpi <- 300
```
```r
# Performance on control trials in the experiment ==============================
# Filename
fn <- sprintf("expt_control_trial_performance_task-%s_dpi-%d", params$task, dpi)

# As raster image --------------------------------------------------------------
filename <- paste0(fn, ".png")
ggplot2::ggsave(path = figures_dir, filename = filename,
                plot = expt_control_trial_performance,
                width = 8.5, units = "cm", dpi = dpi)
# Print filename, so it's clear from the static report where to find the file
print(sprintf("%s", file.path(figures_dir, filename)))

# As vector image --------------------------------------------------------------
filename <- paste0(fn, ".pdf")
ggplot2::ggsave(path = figures_dir, filename = filename,
                plot = expt_control_trial_performance,
                width = 8.5, units = "cm", dpi = dpi)
# Print filename, so it's clear from the static report where to find the file
print(sprintf("%s", file.path(figures_dir, filename)))

# Sankey diagram of exclusions and inclusions ==================================
# Filename
fn <- sprintf("sankey_diagram_exclusions_inclusions_task-%s_dpi-%d", params$task, dpi)

# As raster image --------------------------------------------------------------
# N.B. this requires phantomjs (e.g. using Homebrew for OS X: brew install phantomjs) and rbokeh
filename <- paste0(fn, ".png")
rbokeh::widget2png(sn, file.path(figures_dir, filename))
# Print filename, so it's clear from the static report where to find the file
print(sprintf("%s", file.path(figures_dir, filename)))
```
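If phantomjs is not available, a possible alternative (an assumption, not part of the original pipeline) is to save the interactive widget as a standalone HTML file with `htmlwidgets::saveWidget()`:

```r
# Alternative export that does not require phantomjs: save the interactive
# Sankey diagram as a self-contained HTML file
# (note: some htmlwidgets versions handle relative paths to subdirectories poorly)
htmlwidgets::saveWidget(sn,
                        file = file.path(figures_dir, paste0(fn, ".html")),
                        selfcontained = TRUE)
```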