The Truth Trajectory: Data Preprocessing and Anonymization

knitr::opts_chunk$set(echo = TRUE)

## the raw-data directory must be named and must already exist
if (is.null(params$subdir) || params$subdir == "") {
  stop("You need to specify the subdirectory containing the raw data")
} else if (!dir.exists(params$subdir)) {
  stop("subdirectory '", params$subdir, "' does not exist")
}

## the output directory must be named (its existence is not checked here)
if (is.null(params$anondir) || params$anondir == "") {
  stop("You need to specify the subdirectory for the anonymized data")
}

library("truthiness")
library("dplyr")
library("tibble")
library("tidyr")
library("readr")
library("forcats")

Importing data from files in subdirectory `r params$subdir`.

## Output directory for the anonymized data. Files that must stay private
## (manual exclusion lists, non-anonymized .rds files) are placed next to
## it, in dirname(anondir).
## NOTE(review): normalize_path() is presumably provided by {truthiness};
## confirm that it strips a trailing slash as the comment below states.
anondir <- normalize_path(params$anondir) # no trailing slash

## format numbers with a comma as the thousands separator
.pi <- function(x) {
  prettyNum(x, big.mark = ",")
}

## Manual exclusions are kept in two CSV files stored next to `anondir`.
## If a file does not exist yet, an empty template is written so that the
## researcher can fill it in; either way the file is then read back in.
man_exclude_part_fname <-
  file.path(dirname(anondir), "manually_exclude_participants.csv")
man_exclude_phs_fname <-
  file.path(dirname(anondir), "manually_exclude_phases.csv")

## participant-level exclusions: columns PID and reason (both character)
if (!file.exists(man_exclude_part_fname)) {
  write_csv(tibble(PID = character(0), reason = character(0)),
            man_exclude_part_fname)
  cat("Created the file `", man_exclude_part_fname,
      "`. Enter any manual participant-level exclusions in this file.\n",
      sep = "")
  man_exclude_part <- read_csv(man_exclude_part_fname, col_types = "cc")
} else {
  man_exclude_part <- read_csv(man_exclude_part_fname, col_types = "cc")
  if (nrow(man_exclude_part) > 0L) {
    cat("Read in ", nrow(man_exclude_part),
        " manual participant exclusions from ",
        man_exclude_part_fname, "\n", sep = "")
  } else {
    cat("No manual exclusions found in the file `",
        man_exclude_part_fname, "`.\n", sep = "")
  }
}

## phase-level exclusions: columns PID (character), phase_id (integer in
## the file, converted to a factor with levels 1-4) and reason (character)
if (!file.exists(man_exclude_phs_fname)) {
  write_csv(tibble(PID = character(0),
                   phase_id = integer(0),
                   reason = character(0)),
            man_exclude_phs_fname)
  cat("Created the file `", man_exclude_phs_fname,
      "`. Enter any manual phase-level exclusions in this file.\n", sep = "")
  man_exclude_phs <-
    read_csv(man_exclude_phs_fname, col_types = "cic") %>%
    mutate(phase_id = factor(phase_id, levels = 1:4))
} else {
  man_exclude_phs <-
    read_csv(man_exclude_phs_fname, col_types = "cic") %>%
    mutate(phase_id = factor(phase_id, levels = 1:4))
  if (nrow(man_exclude_phs) > 0L) {
    cat("Read in ", nrow(man_exclude_phs),
        " manual phase exclusions from ",
        man_exclude_phs_fname, "\n", sep = "")
  } else {
    cat("No manual exclusions found in the file `",
        man_exclude_phs_fname, "`.\n", sep = "")
  }
}

## comma-separated list of the raw data files, each wrapped in backticks
.allfiles <- paste0(
  "`",
  paste0(locate_data_files(params$subdir), collapse = "`, `"),
  "`"
)

Importing data from the following data files: `r .allfiles`

## Set up the output locations, then read in the raw data.

## the non-anonymized outputs are written beside (not inside) `anondir`
private_sess_fname <- file.path(
  dirname(anondir),
  paste0(basename(params$anondir), "_NOT_ANONYMIZED_sessions.rds")
)

private_phase_fname <- file.path(
  dirname(anondir),
  paste0(basename(params$anondir), "_NOT_ANONYMIZED_phases.rds")
)

## always start from an empty output directory
if (dir.exists(anondir)) {
  unlink(anondir, recursive = TRUE, force = TRUE)
}
dir.create(anondir)

## raw tables: sessions, phase information, truth ratings and
## category judgments
sess <- import_sessions(params$subdir)
phase <- import_phase_info(params$subdir)
ratings_pre <- import_tratings(params$subdir)
cjudgments_pre <- import_cjudgments(params$subdir)

Participant-level exclusions

## THIS CODE CHUNK APPLIES ALL PARTICIPANT-LEVEL AND PHASE-LEVEL EXCLUSIONS

## participants (PIDs) that occur in more than one session row
sess_mult <- count(sess, PID, name = "sessions") %>%
  filter(sessions > 1L) %>%
  mutate(chk_noduplicates = FALSE,
         excl_reason = "Duplicate sessions",
         excl_phase = "1")

## drop every session row of such a participant and put back one flagged
## row per PID; the other session columns are NA for those rows
sess_nodups <- anti_join(sess, sess_mult, by = "PID") %>%
  full_join(select(sess_mult, -sessions), by = "PID") %>%
  replace_na(list(chk_noduplicates = TRUE))

## PID/phase combinations recorded more than once, looked for only among
## the participants who logged multiple sessions
phase_mult <- phase %>%
  semi_join(sess_mult, by = "PID") %>%
  count(PID, phase_id) %>%
  filter(n > 1L)

## keep a single row per duplicated PID/phase: the one sorting first on
## descending Finished, then descending Progress
phase_nodups <- bind_rows(
  anti_join(phase, phase_mult, by = c("PID", "phase_id")),
  semi_join(phase, phase_mult, by = c("PID", "phase_id")) %>%
    arrange(PID, phase_id, desc(Finished), desc(Progress)) %>%
    group_by(PID, phase_id) %>%
    slice(1L) %>%
    ungroup()
)

## PID/phase/statement combinations that were rated more than once
ratings_mult <- count(ratings_pre, PID, phase_id, stim_id) %>%
  filter(n > 1L) %>%
  select(-n)

## each duplicated combination is replaced by one key-only row, so every
## other column (including the rating) is NA for it
ratings_nodups <- bind_rows(
  anti_join(ratings_pre, ratings_mult,
            by = c("PID", "phase_id", "stim_id")),
  ratings_mult
)

## same treatment for the category judgments, keyed by PID and statement
cjudgments_mult <- count(cjudgments_pre, PID, stim_id) %>%
  filter(n > 1L) %>%
  select(-n)

cjudgments_nodups <- bind_rows(
  anti_join(cjudgments_pre, cjudgments_mult, by = c("PID", "stim_id")),
  cjudgments_mult
)

## Flag participants whose ConsentAll answer does not start with "Yes"
## (case-insensitive; a missing answer counts as no consent).
## An exclusion reason/phase is recorded only when no earlier criterion
## has already set one, so the first failed check wins.
sess_consent <- sess_nodups %>%
  mutate(
    chk_consent_all = grepl("^Yes", ConsentAll, ignore.case = TRUE),
    excl_reason = if_else(is.na(excl_reason) & !chk_consent_all,
                          "Did not consent to all phases, or consent missing",
                          excl_reason),
    excl_phase = if_else(is.na(excl_phase) & !chk_consent_all,
                         "1",
                         excl_phase)
  )

## Flag participants whose NativeLang answer does not mention "English"
## (case-insensitive)
sess_native <- sess_consent %>%
  mutate(
    chk_native = grepl("English", NativeLang, ignore.case = TRUE),
    excl_reason = if_else(is.na(excl_reason) & !chk_native,
                          "Non-native English speaker",
                          excl_reason),
    excl_phase = if_else(is.na(excl_phase) & !chk_native,
                         "1",
                         excl_phase)
  )

###########################################################################
## phase-level exclusion

## Consent for the individual phase: any Consent value that does not start
## with "Yes" (including a missing value) is treated as no consent.
## NOTE(review): this match is case-sensitive, whereas the ConsentAll and
## Cheat matches use ignore.case = TRUE -- confirm this is intended.
phase_consent <- phase_nodups %>%
  mutate(
    chk_consent = grepl("^Yes", Consent),
    p_excl_reason = if_else(
      chk_consent,
      NA_character_,
      "Did not give consent for phase (or consent missing)"
    )
  )

## Completion of the phase: Finished must read "TRUE" in any letter case.
## The reason is only filled in if consent did not already set one.
phase_finished <- phase_consent %>%
  mutate(
    chk_finished = toupper(Finished) == "TRUE",
    p_excl_reason = if_else(is.na(p_excl_reason) & !chk_finished,
                            "Did not complete phase",
                            p_excl_reason)
  )

###########################################################################
## session-level exclusions that are triggered by phase-level information

## participants who answered "Yes..." to Cheat in any phase, together with
## the earliest phase in which they did so
cheaters <- phase_finished %>%
  filter(grepl("^Yes", Cheat, ignore.case = TRUE)) %>%
  arrange(PID, phase_id) %>%
  group_by(PID) %>%
  slice(1L) %>%
  ungroup() %>%
  transmute(PID,
            excl_phase_id2 = as.character(phase_id),
            chk_nocheat = FALSE)

## attach the flag to the sessions; anyone not listed passes the check
sess_cheat <- sess_native %>%
  left_join(cheaters, by = "PID") %>%
  replace_na(list(chk_nocheat = TRUE)) %>%
  mutate(
    excl_reason = if_else(is.na(excl_reason) & !chk_nocheat,
                          "Looked up answers",
                          excl_reason),
    excl_phase = if_else(is.na(excl_phase) & !chk_nocheat,
                         excl_phase_id2,
                         excl_phase)
  ) %>%
  select(-excl_phase_id2)

## now find any flatliners: participants with exactly one distinct
## response across their category judgments, or across the truth ratings
## of a single phase
##
## vapply() is used instead of sapply() so that the results keep a fixed
## type (logical / character) even when there are no data; sapply() would
## return an empty list in that case and break the code that follows

## cjudgments scores: one logical per participant
ispt <- split(cjudgments_nodups[["category"]], cjudgments_nodups[["PID"]])
res <- vapply(ispt, function(.x) {length(unique(.x)) == 1L}, logical(1))
flat_cjudgments <- as.character(names(res)[res])

## category judgments are attributed to phase 1
phs_flat_c <-
  tibble(PID = as.character(names(res)),
         phase_id = factor("1", levels = 1:4),
         chk_noflat_c = !res)

## truth ratings: one logical per "PID,phase" combination; combinations
## without any data have zero distinct values and are never flagged
tspt <- split(ratings_nodups[["trating"]],
              list(ratings_nodups[["PID"]],
                   as.character(ratings_nodups[["phase_id"]])),
              sep = ",")
res <- vapply(tspt, function(.x) {length(unique(.x)) == 1L}, logical(1))

## split each "PID,phase" key once and reuse the two pieces below
res_keys <- strsplit(as.character(names(res)), ",", fixed = TRUE)
res_pid <- vapply(res_keys, function(.x) {.x[[1]]}, character(1))
res_phs <- vapply(res_keys, function(.x) {.x[[2]]}, character(1))

flat_truth <- unique(res_pid[res])
flatliners <- union(flat_cjudgments, flat_truth)

phs_flat_t <-
  tibble(PID = res_pid,
         phase_id = factor(res_phs, levels = 1:4),
         chk_noflat_t = !res)

## first phase with flatlining, one row per flatlining participant;
## PID/phase combinations without category judgments pass that check
first_phs_flat <- left_join(phs_flat_t, phs_flat_c,
                            by = c("PID", "phase_id")) %>%
  replace_na(list(chk_noflat_c = TRUE)) %>%
  mutate(chk_noflatline = chk_noflat_t & chk_noflat_c) %>%
  select(-chk_noflat_t, -chk_noflat_c) %>%
  filter(!chk_noflatline) %>%
  arrange(PID, phase_id) %>%
  group_by(PID) %>%
  slice(1) %>%
  ungroup()

## note: some flatliners were invited back (!)
## first_phs_flat gives the phase at which they *should* have been excluded

## attach the flatlining flag to the sessions; participants without an
## entry in first_phs_flat pass the check
sess_noflat <- sess_cheat %>%
  left_join(rename(first_phs_flat, excl_phase2 = phase_id), by = "PID") %>%
  replace_na(list(chk_noflatline = TRUE)) %>%
  mutate(
    excl_reason = if_else(is.na(excl_reason) & !chk_noflatline,
                          "Flatlining",
                          excl_reason),
    excl_phase = if_else(is.na(excl_phase) & !chk_noflatline,
                         as.character(excl_phase2),
                         excl_phase)
  ) %>%
  select(-excl_phase2)

## acceptable duration per phase, in seconds:
## phase 1 between 3 and 40 minutes, phases 2-4 between 1 and 30 minutes
dur_cutoffs <- tibble(
  phase_id = factor(1:4, levels = 1:4),
  min_dur = 60L * c(3L, 1L, 1L, 1L),
  max_dur = 60L * c(40L, 30L, 30L, 30L)
)

## per-phase duration check (TRUE = within the acceptable range)
phase_speed <- phase_finished %>%
  inner_join(dur_cutoffs, by = "phase_id") %>%
  mutate(
    chk_dur_phase =
      (as.integer(`Duration (in seconds)`) >= min_dur) &
      (as.integer(`Duration (in seconds)`) <= max_dur)
  ) %>%
  select(-min_dur, -max_dur)

## manual phase exclusions: phases listed in the exclusion file get
## chk_notmanex = FALSE plus the stated reason; all others pass
phase_man <- phase_speed %>%
  left_join(
    man_exclude_phs %>%
      mutate(chk_notmanex = FALSE) %>%
      rename(reason_for_manual_exclusion = reason) %>%
      select(PID, phase_id, chk_notmanex, reason_for_manual_exclusion),
    by = c("PID", "phase_id")
  ) %>%
  replace_na(list(chk_notmanex = TRUE,
                  reason_for_manual_exclusion = NA_character_))

## earliest phase per participant whose duration failed the range check
first_dur_excl <- phase_man %>%
  filter(!chk_dur_phase) %>%
  arrange(PID, phase_id) %>%
  group_by(PID) %>%
  slice(1) %>%
  ungroup() %>%
  transmute(PID,
            chk_dur_all = chk_dur_phase,
            excl_phase2 = as.character(phase_id))

## attach the flag to the sessions; participants without a failing phase
## pass the check
sess_dur <- sess_noflat %>%
  left_join(first_dur_excl, by = "PID") %>%
  replace_na(list(chk_dur_all = TRUE)) %>%
  mutate(
    excl_reason = if_else(is.na(excl_reason) & !chk_dur_all,
                          "Phase duration was outside acceptable range",
                          excl_reason),
    excl_phase = if_else(is.na(excl_phase) & !chk_dur_all,
                         excl_phase2,
                         excl_phase)
  ) %>%
  select(-excl_phase2)

## manual participant exclusions: the session row of every listed PID is
## replaced by the PID/reason row from the exclusion file, so all other
## session columns are NA for manually excluded participants
sess_notmanex <- bind_rows(
  sess_dur %>%
    anti_join(man_exclude_part, by = "PID") %>%
    mutate(chk_notmanex = TRUE),
  man_exclude_part
) %>%
  replace_na(list(chk_notmanex = FALSE)) %>%
  mutate(
    excl_reason = if_else(is.na(excl_reason) & !chk_notmanex,
                          paste0("Other: ", reason),
                          excl_reason),
    excl_phase = if_else(is.na(excl_phase) & !chk_notmanex,
                         "1",
                         excl_phase)
  ) %>%
  select(-reason)

## exclude phases missing consent (or with manual exclusion) by filling
## with NAs: for those phases only the identifying and check columns are
## kept, so every other column becomes NA when the rows are bound back
phase_zap <- phase_man %>%
  filter(chk_consent & chk_notmanex) %>%
  bind_rows(phase_man %>%
            filter(!chk_consent | !chk_notmanex) %>%
            select(PID, list_id, phase_id, chk_consent, p_excl_reason,
                   chk_finished, chk_dur_phase, chk_notmanex,
                   reason_for_manual_exclusion))

## eliminate truth ratings / category judgments for phases where
## consent was not given or where there was manual exclusion
## this may not really be necessary because I think prolific terminated
## sessions where consent was not given...
ratings_zap <- ratings_nodups %>%
  semi_join(phase_zap %>% filter(chk_consent & chk_notmanex),
            c("PID", "phase_id")) %>%
  bind_rows(semi_join(ratings_nodups,
                      phase_zap %>%
                      filter(!chk_consent | !chk_notmanex),
                      c("PID", "phase_id")) %>%
            select(-trating))

## category judgments are tied to phase 1; they are blanked under the same
## rule as the ratings, i.e. also when phase 1 was manually excluded (the
## manual-exclusion flag used to be ignored here, which left those
## judgments in the data)
cjudgments_zap <- cjudgments_nodups %>%
  semi_join(phase_zap %>%
            filter(chk_consent & chk_notmanex, phase_id == "1"),
            "PID") %>%
  bind_rows(semi_join(cjudgments_nodups,
                      phase_zap %>%
                      filter(!chk_consent | !chk_notmanex,
                             phase_id == "1"),
                      "PID") %>%
            select(-category))

## sessions whose data must be blanked: no blanket consent, or manually
## excluded
sess_discard <- filter(sess_notmanex, !chk_consent_all | !chk_notmanex)

## phase rows of those sessions keep only their identifying and check
## columns; every other column becomes NA when the rows are bound back
phase_zap2 <- bind_rows(
  anti_join(phase_zap, sess_discard, by = "PID"),
  semi_join(phase_zap, sess_discard, by = "PID") %>%
    select(PID, list_id, phase_id,
           chk_consent, p_excl_reason, chk_finished,
           chk_dur_phase, chk_notmanex, reason_for_manual_exclusion)
)

## truth ratings of those sessions are set to NA
ratings <- bind_rows(
  anti_join(ratings_zap, sess_discard, by = "PID"),
  semi_join(ratings_zap, sess_discard, by = "PID") %>%
    select(-trating)
)

## category judgments of those sessions are set to NA
cjudgments <- bind_rows(
  anti_join(cjudgments_zap, sess_discard, by = "PID"),
  semi_join(cjudgments_zap, sess_discard, by = "PID") %>%
    select(-category)
)

## participants with at least one non-missing rating in a phase that
## passes every phase-level check, among the sessions that pass every
## session-level check so far
have_ratings_data <- ratings %>%
  filter(!is.na(trating)) %>%
  semi_join(
    phase_zap2 %>%
      semi_join(
        filter(sess_notmanex,
               chk_noduplicates, chk_consent_all, chk_native,
               chk_nocheat, chk_noflatline, chk_dur_all, chk_notmanex),
        by = "PID"
      ) %>%
      filter(chk_consent, chk_finished, chk_dur_phase, chk_notmanex),
    by = c("PID", "phase_id")
  ) %>%
  distinct(PID)

## sessions that pass every check but have no usable ratings left
missing_data <- sess_notmanex %>%
  filter(chk_noduplicates, chk_consent_all, chk_native,
         chk_nocheat, chk_noflatline, chk_dur_all, chk_notmanex) %>%
  anti_join(have_ratings_data, by = "PID") %>%
  mutate(
    chk_anydata = FALSE,
    excl_reason = if_else(is.na(excl_reason) & !chk_anydata,
                          "No ratings data remaining",
                          excl_reason),
    excl_phase = if_else(is.na(excl_phase) & !chk_anydata,
                         "1",
                         excl_phase)
  )

## final session table: `keep` is TRUE only if every check passed; the
## exclusion reasons are ordered as the criteria were applied
sess_keep <- bind_rows(
  anti_join(sess_notmanex, missing_data, by = "PID"),
  missing_data
) %>%
  replace_na(list(chk_anydata = TRUE)) %>%
  mutate(
    keep = chk_noduplicates & chk_consent_all & chk_native &
      chk_nocheat & chk_dur_all & chk_noflatline &
      chk_anydata & chk_notmanex,
    excl_reason = fct_relevel(
      factor(excl_reason),
      "Duplicate sessions",
      "Did not consent to all phases, or consent missing",
      "Non-native English speaker",
      "Looked up answers",
      "Flatlining",
      "Phase duration was outside acceptable range",
      "No ratings data remaining"
    )
  )

## final phase table: a phase is kept only if it passes its own checks
## and its session is kept
phase_keep <- phase_zap2 %>%
  inner_join(select(sess_keep, PID, keep), by = "PID") %>%
  rename(sess_keep = keep) %>%
  mutate(
    p_excl_reason = if_else(is.na(p_excl_reason) & !sess_keep,
                            "Session was excluded",
                            p_excl_reason),
    keep = chk_consent & chk_finished & chk_dur_phase &
      chk_notmanex & sess_keep
  ) %>%
  select(-sess_keep)


## done with participant level + phase level exclusions
## from here on, use phase_keep, sess_keep, ratings, and cjudgments
## number of rows in stimulus_materials, reported in the text as the
## number of statements
n_item <- nrow(stimulus_materials)

The raw data files contain `r .pi(nrow(ratings))` truth ratings of `r .pi(n_item)` statements from `r .pi(nrow(sess_keep))` participants.

There were eight exclusion criteria for participants, applied in the following order, and indicated in the data by the value of the corresponding logical data variable (TRUE = not excluded, FALSE = excluded):

  1. Duplicate sessions recorded, i.e., subjects started two sessions at once for a single phase (chk_noduplicates);
  2. Consent to data collection across all phases was absent (chk_consent_all);
  3. Not being a (self-reported) native speaker of English (chk_native);
  4. Reporting having looked up answers in at least one phase of the study (chk_nocheat);
  5. Flat lining; i.e., using only one response category across an entire phase of the study (chk_noflatline);
  6. Failing to complete all phases in a reasonable amount of time---for Phase 1, between 3 and 40 minutes; for all other phases, between 1 and 30 minutes (chk_dur_all);
  7. No ratings data, usually due to phase-level exclusions (chk_anydata);
  8. Any other reason determined by the researcher (chk_notmanex).

Summary of participant-level exclusions

Note: n is the number of participants excluded for the stated reason:

## number of excluded participants per exclusion reason
tots <- count(filter(sess_keep, !keep), excl_reason)

tots

List of excluded participants

## interactive table of the excluded participants
sess_keep %>%
  filter(!keep) %>%
  select(PID, excl_phase, excl_reason) %>%
  DT::datatable()

Phase-level exclusions

Phase-level exclusions apply after any participant-level exclusions; i.e., they apply only to phases that remain after removing subjects.

The exclusion criteria applied at the phase level were: 1. failure to give consent (chk_consent); 2. failure to complete all of the ratings in the phase (chk_finished); 3. any other reason determined by the researcher (chk_notmanex). The first two were applied automatically; the third was applied manually.

Manual exclusions are listed in the file `r man_exclude_phs_fname` and tagged by the variable chk_notmanex.

Summary of phase-level exclusions

## rows of phase_keep for one phase that fail a phase-level check
## (consent, completion or manual exclusion), leaving out participants
## whose session-level exclusion is recorded at that same phase
.phase_excl_rows <- function(.phs) {
  phase_keep %>%
    filter(phase_id == .phs) %>%
    anti_join(filter(sess_keep, excl_phase == .phs), by = "PID") %>%
    filter(!(chk_consent & chk_finished & chk_notmanex))
}

phase1_excl <- .phase_excl_rows("1")
phase2_excl <- .phase_excl_rows("2")
phase3_excl <- .phase_excl_rows("3")
phase4_excl <- .phase_excl_rows("4")

excluded_p <- bind_rows(phase1_excl, phase2_excl,
                        phase3_excl, phase4_excl)

## number of excluded phases per reason, consent first
tots_phs <- excluded_p %>%
  mutate(
    p_excl_reason = fct_relevel(
      factor(p_excl_reason),
      "Did not give consent for phase (or consent missing)",
      "Did not complete phase"
    )
  ) %>%
  count(p_excl_reason)

tots_phs

List of excluded phases

## interactive table of the excluded phases
excluded_p %>%
  select(PID, phase_id, StartDate, EndDate, p_excl_reason) %>%
  DT::datatable()

Comments

Comments in the TechDiff field

## non-missing entries of the TechDiff field, latest phase first
DT::datatable(
  phase_nodups %>%
    filter(!is.na(TechDiff)) %>%
    select(phase_id, PID, TechDiff) %>%
    arrange(desc(phase_id), TechDiff, PID)
)
## heading for the end-of-study comments, only if that column exists
if ("Comments" %in% colnames(sess_keep)) {
  cat("### Comments left at end of final phase")
}
## table of the non-missing comments, under the same condition
if ("Comments" %in% colnames(sess_keep)) {
  DT::datatable(
    sess_keep %>%
      filter(!is.na(Comments)) %>%
      select(PID, Comments)
  )
}

Dropouts and exclusions by phase

## number of distinct participants with a row for each phase
n_starting <- phase_keep %>%
  distinct(phase_id, PID) %>%
  count(phase_id, name = "n_attempted")

## session-level exclusions, counted by the phase they are attributed to;
## .drop = FALSE keeps phases with a zero count
n_excluded_s <- sess_keep %>%
  filter(!is.na(excl_phase)) %>%
  mutate(excl_phase = factor(excl_phase, levels = as.character(1:4))) %>%
  count(phase_id = excl_phase, .drop = FALSE, name = "n_subjects_excl")

## number of rows of one phase that fail a phase-level check (consent,
## completion or manual exclusion), leaving out participants whose
## session-level exclusion is recorded at that same phase
.n_phase_excl <- function(.phs) {
  phase_keep %>%
    filter(phase_id == .phs) %>%
    anti_join(filter(sess_keep, excl_phase == .phs), by = "PID") %>%
    filter(!(chk_consent & chk_finished & chk_notmanex)) %>%
    nrow()
}

phase1_excl <- .n_phase_excl("1")
phase2_excl <- .n_phase_excl("2")
phase3_excl <- .n_phase_excl("3")
phase4_excl <- .n_phase_excl("4")

## key the counts by phase explicitly: distinct(phase_id) returns phases
## in order of first appearance in phase_keep, and possibly fewer than
## four of them, so it cannot be paired safely with a fixed 1-4 vector
n_excluded_p <-
  tibble(phase_id = factor(as.character(1:4), levels = as.character(1:4)),
         n_phase_excl = c(phase1_excl, phase2_excl,
                          phase3_excl, phase4_excl))

## one row per phase; n_invited is taken from the previous row via lag(),
## so the rows must be in phase order (it is NA for the first phase)
n_remain <- inner_join(n_starting, n_excluded_s, by = "phase_id") %>%
  inner_join(n_excluded_p, by = "phase_id") %>%
  arrange(phase_id) %>%
  mutate(n_retained_phases = n_attempted - n_subjects_excl - n_phase_excl,
         n_invited = lag(n_attempted - n_subjects_excl),
         dropout = sprintf(
           "%0.1f%%",
           100 * ((n_invited - n_attempted) / n_invited)),
         exclusion = sprintf(
           "%0.1f%%",
           100 * ((n_subjects_excl + n_phase_excl) / n_attempted)),
         attrition = sprintf(
           "%0.1f%%",
           100 * (n_invited - n_retained_phases) / n_invited)) %>%
  select(phase_id, n_invited, n_attempted,
         n_subj_ex = n_subjects_excl,
         n_phase_ex = n_phase_excl,
         n_retained = n_retained_phases,
         dropout, exclusion, attrition)

n_remain

Anonymized data

TODO: load in the data from the RDS files and run usethis::use_data() to add to the {truthiness} package.

## give every session row a random anonymous ID: "S" followed by a
## zero-padded number drawn without replacement from 1..nrow(sess_keep)
sess_keep[["ID"]] <- sprintf("S%04d", sample(seq_len(nrow(sess_keep))))

## session columns that go into the shared (anonymized) data
share_cols <- c(
  "list_id",
  "Age", "Gender", "Nationality", "NativeLang",
  "keep", "excl_phase", "excl_reason",
  "chk_noduplicates", "chk_consent_all", "chk_native", "chk_nocheat",
  "chk_dur_all", "chk_noflatline", "chk_anydata", "chk_notmanex"
)

## private session data: PID, ID and every column that is neither shared
## nor ConsentAll
sess_private_cols <- setdiff(names(sess_keep),
                             c("PID", "ID", "ConsentAll", share_cols))
sess_private <- sess_keep[, c("PID", "ID", sess_private_cols)]

## shared session data, sorted by anonymous ID; missing Gender values
## become an explicit factor level
sess_share <- sess_keep[, c("ID", share_cols)] %>%
  arrange(ID) %>%
  mutate(
    Gender = fct_explicit_na(
      fct_relevel(Gender,
                  c("Female", "Male", "Gender variant",
                    "Prefer not to say"))
    ),
    list_id = fct_relevel(list_id,
                          levels(stimulus_conditions[["list_id"]]))
  )

## phase columns that go into the shared data
pshare_cols <- c(
  "phase_id", "Duration (in seconds)",
  "keep", "p_excl_reason",
  "chk_consent", "chk_finished", "chk_notmanex",
  "reason_for_manual_exclusion"
)

## phase rows with the anonymous ID attached
pkeep <- inner_join(phase_keep, sess_private[, c("ID", "PID")], by = "PID")

## shared phase data, with the duration column given a syntactic name
phase_share <- pkeep[, c("ID", pshare_cols)] %>%
  arrange(ID, phase_id) %>%
  rename(duration_secs = `Duration (in seconds)`)

## private phase data: PID, ID, phase_id and every remaining column that
## is neither shared nor list_id
phase_private_cols <- setdiff(names(phase_keep),
                              c("PID", "ID", "phase_id", "list_id",
                                "Duration (in seconds)",
                                pshare_cols))
phase_private <- pkeep[, c("PID", "ID", "phase_id", phase_private_cols)]

## truth ratings keyed by the anonymous ID instead of the PID
ratings2 <- ratings %>%
  inner_join(select(sess_keep, PID, ID), by = "PID") %>%
  select(-PID) %>%
  select(ID, everything())

ratings_share <-
  ratings2[, c("ID", "phase_id", "stim_id", "trating")] %>%
  arrange(ID, phase_id, stim_id)

## category judgments keyed by the anonymous ID instead of the PID
cjudgments2 <- cjudgments %>%
  inner_join(select(sess_keep, PID, ID), by = "PID") %>%
  select(-PID) %>%
  select(ID, everything())

cjudgments_share <- cjudgments2[, c("ID", "stim_id", "category")] %>%
  arrange(ID, stim_id)

## non-anonymized tables (these still contain the PID) are written next
## to the anonymized directory, not into it
saveRDS(sess_private, private_sess_fname)
cat("Wrote non-anonymized pre-processed session data to `",
    private_sess_fname, "`.\n", sep = "")
saveRDS(phase_private, private_phase_fname)
cat("\nWrote non-anonymized pre-processed phase data to `",
    private_phase_fname, "`\n", sep = "")

## the free-text reason for a manual exclusion is not shared
phase_share_public <- select(phase_share, -reason_for_manual_exclusion)

## every anonymized table is written twice: as CSV and as RDS
readr::write_csv(sess_share, file.path(anondir, "ANON_sessions.csv"))
saveRDS(sess_share, file.path(anondir, "ANON_sessions.rds"))

readr::write_csv(phase_share_public, file.path(anondir, "ANON_phases.csv"))
saveRDS(phase_share_public, file.path(anondir, "ANON_phases.rds"))

readr::write_csv(ratings_share, file.path(anondir, "ANON_ratings.csv"))
saveRDS(ratings_share, file.path(anondir, "ANON_ratings.rds"))

readr::write_csv(cjudgments_share,
                 file.path(anondir, "ANON_categories.csv"))
saveRDS(cjudgments_share, file.path(anondir, "ANON_categories.rds"))

cat("\nWrote anonymized data to text files ",
    "`ANON_sessions.csv`, `ANON_phases.csv`, `ANON_ratings.csv`, and ",
    "`ANON_categories.csv` in subdirectory `", anondir, "`.\n", sep = "")

cat("\nWrote anonymized data to binary files ",
    "`ANON_sessions.rds`, `ANON_phases.rds`, `ANON_ratings.rds`, and ",
    "`ANON_categories.rds` in subdirectory `", anondir, "`.\n", sep = "")


Try the truthiness package in your browser

Any scripts or data that you put into this service are public.

truthiness documentation built on May 24, 2021, 9:07 a.m.