knitr::opts_chunk$set(echo = TRUE) if (is.null(params$subdir) || (params$subdir == "")) { stop("You need to specify the subdirectory containing the raw data") } if (!dir.exists(params$subdir)) { stop("subdirectory '", params$subdir, "' does not exist") } if (is.null(params$anondir) || (params$anondir == "")) { stop("You need to specify the subdirectory for the anonymized data") } library("truthiness") library("dplyr") library("tibble") library("tidyr") library("readr")
Importing data from files in subdirectory r params$subdir
.
anondir <- normalize_path(params$anondir) # no trailing slash ## pretty print big integers .pi <- function(x) prettyNum(x, big.mark=',') ## check for manual exclusions man_exclude_part_fname <- file.path(dirname(anondir), "manually_exclude_participants.csv") man_exclude_phs_fname <- file.path(dirname(anondir), "manually_exclude_phases.csv") if (file.exists(man_exclude_part_fname)) { man_exclude_part <- read_csv(man_exclude_part_fname, col_types = "cc") if (nrow(man_exclude_part)) { cat("Read in ", nrow(man_exclude_part), " manual participant exclusions from ", man_exclude_part_fname, "\n", sep = "") } else { cat("No manual exclusions found in the file `", man_exclude_part_fname, "`.\n", sep = "") } } else { write_csv(tibble(PID = character(0), reason = character(0)), man_exclude_part_fname) cat("Created the file `", man_exclude_part_fname, "`. Enter any manual participant-level exclusions in this file.\n", sep = "") man_exclude_part <- read_csv(man_exclude_part_fname, col_types = "cc") } if (file.exists(man_exclude_phs_fname)) { man_exclude_phs <- read_csv(man_exclude_phs_fname, col_types = "cic") %>% mutate(phase_id = factor(phase_id, levels = 1:4)) if (nrow(man_exclude_phs)) { cat("Read in ", nrow(man_exclude_phs), " manual phase exclusions from ", man_exclude_phs_fname, "\n", sep = "") } else { cat("No manual exclusions found in the file `", man_exclude_phs_fname, "`.\n", sep = "") } } else { write_csv(tibble(PID = character(0), phase_id = integer(0), reason = character(0)), man_exclude_phs_fname) cat("Created the file `", man_exclude_phs_fname, "`. Enter any manual phase-level exclusions in this file.\n", sep = "") man_exclude_phs <- read_csv(man_exclude_phs_fname, col_types = "cic") %>% mutate(phase_id = factor(phase_id, levels = 1:4)) }
## all the pre-processing is done in this chunk private_sess_fname <- file.path(dirname(anondir), paste0(basename(anondir), "_NOT_ANONYMIZED_sessions.rds")) private_phase_fname <- file.path(dirname(anondir), paste0(basename(anondir), "_NOT_ANONYMIZED_phases.rds")) if (dir.exists(anondir)) { unlink(anondir, TRUE, TRUE) } dir.create(anondir) if (check_fake(params$subdir)) {flag_fake(anondir)} sess <- import_sessions_simulated(params$subdir) phase <- import_phase_info_simulated(params$subdir) ratings <- import_tratings_simulated(params$subdir) cjudgments <- import_cjudgments_simulated(params$subdir) ## destroy data from non-consenting participants sess_consent <- sess %>% mutate(chk_consent_all = grepl("^Yes", ConsentAll, ignore.case = TRUE)) phase2 <- semi_join(phase, sess_consent %>% filter(chk_consent_all), "PID") phase_consent <- phase2 %>% mutate(chk_consent = grepl("^Yes", Consent, ignore.case = TRUE)) ## TODO remove ratings from non-consenting participants ## remove non-native speakers sess_native <- sess_consent %>% mutate(chk_native = grepl("English", nativelang, ignore.case = TRUE)) ## phase-level exclusion: was the phase completed? phase_finished <- phase_consent %>% mutate(chk_finished = Finished == "TRUE") ## identify anyone who looked up answers; remove them from all phases cheaters <- phase_finished %>% filter(grepl("^Yes", cheat, ignore.case = TRUE)) %>% distinct(PID) %>% mutate(chk_nocheat = FALSE) sess_cheat <- sess_native %>% left_join(cheaters, "PID") %>% replace_na(list(chk_nocheat = TRUE)) dur_cutoffs <- tibble(phase_id = factor(1:4, levels = 1:4), min_dur = c(3L * 60L, rep(1L * 60L, 3)), max_dur = c(40L * 60L, rep(30L * 60L, 3))) ## identify participants who were too fast or too slow on *any* phase phase_speed <- phase_finished %>% inner_join(dur_cutoffs, "phase_id") %>% mutate(chk_dur_phase = (as.integer(`Duration (in seconds)`) >= min_dur) & (as.integer(`Duration (in seconds)`) <= max_dur)) %>% select(-min_dur, -max_dur) ## manual phase exclusions phase_man <- phase_speed %>% left_join(man_exclude_phs %>% mutate(chk_notmanex = FALSE) %>% rename(reason_for_manual_exclusion = reason) %>% select(PID, phase_id, chk_notmanex, reason_for_manual_exclusion), c("PID", "phase_id")) %>% replace_na(list(chk_notmanex = TRUE, reason_for_manual_exclusion = NA_character_)) sess_dur <- sess_cheat %>% left_join(phase_man %>% filter(!chk_dur_phase) %>% distinct(PID) %>% mutate(chk_dur_all = FALSE), "PID") %>% replace_na(list(chk_dur_all = TRUE)) ## now find any flatliners ## cjudgments scores ispt <- split(cjudgments[["category"]], cjudgments[["PID"]]) res <- sapply(ispt, function(.x) {length(unique(.x)) == 1L}) flat_cjudgments <- names(res)[res] ## truth ratings tspt <- split(ratings[["trating"]], list(ratings[["PID"]], ratings[["phase_id"]]), sep = ",") res <- sapply(tspt, function(.x) {length(unique(.x)) == 1L}) flat_truth <- unique(sapply(names(res)[res], function(.x) {strsplit(.x, ",")[[1]][1]}, USE.NAMES = FALSE)) flatliners <- union(flat_cjudgments, flat_truth) sess_noflat <- sess_dur %>% mutate(chk_noflatline = !(PID %in% flatliners)) sess_keep <- sess_noflat %>% left_join(man_exclude_part %>% mutate(chk_notmanex = FALSE) %>% rename(reason_for_manual_exclusion = reason) %>% select(PID, chk_notmanex, reason_for_manual_exclusion), "PID") %>% replace_na(list(chk_notmanex = TRUE, reason_for_manual_exclusion = NA_character_)) %>% mutate(keep = chk_consent_all & chk_native & chk_nocheat & chk_dur_all & chk_noflatline & chk_notmanex) %>% select(!starts_with("reason_for_manual_exclusion"), reason_for_manual_exclusion) phase_keep <- semi_join(phase_man, sess_keep %>% filter(keep), "PID") %>% mutate(keep = chk_consent & chk_finished & chk_notmanex) %>% select(!starts_with("reason_for_manual_exclusion"), reason_for_manual_exclusion) %>% select(-chk_dur_phase) ## done with participant level + phase level exclusions ## from here on, use phase_keep and sess_keep n_item <- nrow(distinct(ratings, stim_id))
r warn(params$subdir)
The raw data files contain r .pi(nrow(ratings))
truth ratings of
r .pi(n_item)
statements from r .pi(nrow(sess))
participants.
First, we excluded anyone who did not give consent for the entire study.
There were four additional exclusion criteria for participants, applied in the following order:
Particpants are excluded if one or more of these variables is FALSE
:
chk_consent_all
: gave consent for all phaseschk_native
: is native speakerchk_nocheat
: never reported looking up answerschk_dur_all
: never completed a phase too quickly or too slowlychk_noflatline
: never produced a 'flatline' responsechk_notmanex
: not manually excluded (see reason_for_manual_exclusion
)Note: n
is the number of participants excluded for that combination of reasons:
excluded_part <- sess_keep %>% filter(!keep) %>% select(PID, list_id, starts_with("chk_"), reason_for_manual_exclusion) tots <- count(excluded_part, chk_consent_all, chk_native, chk_nocheat, chk_dur_all, chk_noflatline, chk_notmanex) tots
r .pi(nrow(sess))
r .pi(sum(pull(tots, n)))
r .pi(nrow(sess) - nrow(excluded_part))
DT::datatable(excluded_part)
Phase-level exclusions apply after any participant-level exclusions; i.e., they apply only on any phases that remain after removing subjects.
The only automatically applied exclusion criteria applied at the phase-level were (1) failure to give consent (chk_consent
) and (2) failure to complete all of the ratings in the phase (chk_finished
). Manual exclusions are listed in the file r man_exclude_phs_fname
and tagged by the variable chk_notmanex
.
excluded_phs <- phase_keep %>% filter(!keep) %>% select(PID, phase_id, chk_consent, chk_finished, chk_notmanex) tots_phs <- count(excluded_phs, chk_consent, chk_finished, chk_notmanex) tots_phs
r .pi(nrow(phase))
r .pi(nrow(phase) - nrow(phase %>% semi_join(sess_keep %>% filter(keep), "PID")))
r .pi(sum(pull(tots_phs, n)))
r .pi(nrow(phase_keep %>% filter(keep)))
DT::datatable(excluded_phs)
Comments left by participants (if any) will appear below.
phase_keep %>% filter(!(comments %in% c("", "xxxx"))) %>% DT::datatable()
## if we have all phase data, anonymize sess_keep[["ID"]] <- sprintf("S%04d", sample(seq_len(nrow(sess_consent)))) share_cols <- c("list_id", "age", "gender", "nationality", "nativelang", "chk_consent_all", "chk_native", "chk_nocheat", "chk_dur_all", "chk_noflatline", "chk_notmanex", "keep", "reason_for_manual_exclusion") sess_private <- sess_keep[, c("PID", "ID", setdiff(names(sess_keep), c("PID", "ID", "ConsentAll", share_cols)))] sess_share <- sess_keep[, c("ID", share_cols)] %>% arrange(ID) %>% filter(chk_consent_all) %>% select(-chk_consent_all) pshare_cols <- c("phase_id", "Duration (in seconds)", "chk_consent", "chk_finished", "chk_notmanex", "keep", "reason_for_manual_exclusion") pkeep <- phase_keep %>% inner_join(sess_private[, c("ID", "PID")], "PID") phase_share <- pkeep[, c("ID", pshare_cols)] %>% arrange(ID, phase_id) %>% filter(chk_consent) %>% select(-chk_consent) colnames(phase_share)[colnames(phase_share) == "Duration (in seconds)"] <- "duration_secs" phase_private <- pkeep[, c("PID", "ID", "phase_id", setdiff(names(phase_keep), c("PID", "ID", "phase_id", "list_id", "Duration (in seconds)", pshare_cols)))] ratings2 <- ratings %>% group_by(PID, phase_id) %>% ungroup() %>% semi_join(sess_keep %>% filter(keep), "PID") %>% semi_join(phase_keep %>% filter(keep), c("PID", "phase_id")) %>% inner_join(phase_private[, c("PID", "ID", "phase_id")], c("PID", "phase_id")) ratings_share <- ratings2[, c("ID", "phase_id", "stim_id", "trating")] %>% arrange(ID, phase_id, stim_id) cjudgments2 <- cjudgments %>% group_by(PID) %>% ungroup() %>% semi_join(sess_keep %>% filter(keep), "PID") %>% semi_join(phase_keep %>% filter(phase_id == 1L, keep), "PID") %>% inner_join(phase_private %>% filter(phase_id == 1L) %>% select("PID", "ID"), "PID") cjudgments_share <- cjudgments2[, c("ID", "stim_id", "category")] %>% arrange(ID, stim_id) saveRDS(sess_private, private_sess_fname) cat("Wrote non-anonymized pre-processed session data to `", private_sess_fname, "`.\n", sep = "") saveRDS(phase_private, private_phase_fname) cat("\nWrote non-anonymized pre-processed phase data to `", private_phase_fname, "`\n", sep = "") readr::write_csv(sess_share, file.path(anondir, "ANON_sessions.csv")) readr::write_csv(phase_share, file.path(anondir, "ANON_phases.csv")) readr::write_csv(ratings_share, file.path(anondir, "ANON_ratings.csv")) readr::write_csv(cjudgments_share, file.path(anondir, "ANON_categories.csv")) cat("\nWrote anonymized data to files ", "`ANON_sessions.csv`, `ANON_phases.csv`, `ANON_ratings.csv`, and ", "`ANON_categories.csv` in subdirectory `", anondir, "`.\n", sep = "")
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.