library("knitr")
opts_chunk$set(echo = FALSE)
opts_chunk$set(collapse = TRUE, comment = "#>", message = FALSE)

wd <- rprojroot::find_rstudio_root_file()
opts_knit$set(root.dir = wd)
library("L2TDatabase")
library("dplyr")
library("tidyr")
library("stringr")

# Load external dependencies
wd <- rprojroot::find_rstudio_root_file()
source(file.path(wd, "inst/paths.R"))
source(paths$GetSiteInfo, chdir = TRUE)
source(file.path(wd, "inst/migrations/dates.R"))

# Collect dates/scores
df_dates <- collect_dates(paths$score_dates, recursive = TRUE) %>% 
  mutate(Study = ifelse(Study == "Medu", "MaternalEd", Study))

df_no_visits <- df_dates %>% 
  filter(Variable == "Visited") %>% 
  select(-Variable) %>% 
  rename(Visited = Value) %>% 
  filter(Visited == FALSE)

columns_in_dirt <- df_dates %>% 
  select(Variable) %>% 
  distinct

# Columns used the 2 sites x 3 timepoints spreadsheets
df_dates$Variable %>% unique() %>% sort()

# Get T1 scores for both sites. Function sourced via paths$GetSiteInfo
t1 <- get_study_info("TimePoint1")
t2 <- get_study_info("TimePoint2")
t3 <- get_study_info("TimePoint3")
ci1 <- get_study_info("CochlearV1")
ci2 <- get_study_info("CochlearV2")
ci_age <- get_study_info("CochlearMatching")
lt <- get_study_info("LateTalker")
medu <- get_study_info("MaternalEd")
dialect <- get_study_info("DialectSwitch") %>% 
  lapply(. %>% mutate(Study = "Dialect"))

# Select necessary columns
get_scores_from_info <- function(df) {
  # Select and rename columns we want to compare
  df %>%
    select(ParticipantID = Participant_ID, Study, Source,
           # use maybe_starts_with so that we don't get an error when the column is not found
           EVT_Form = maybe_starts_with("EVT_Form"),
           EVT_Date = maybe_starts_with("EVT_COMPLETION"),
           EVT_Raw = maybe_starts_with("EVT_raw"),
           EVT_Standard = maybe_starts_with("EVT_standard"),
           EVT_GSV = maybe_starts_with("EVT_GSV"),
           PPVT_Form = maybe_starts_with("PPVT_Form"),
           PPVT_Date = maybe_starts_with("PPVT_COMPLETION"),
           PPVT_Raw = maybe_starts_with("PPVT_raw"),
           PPVT_Standard = maybe_starts_with("PPVT_standard"),
           PPVT_GSV = maybe_starts_with("PPVT_GSV"),
           FruitStroop_Date = maybe_starts_with("FruitStroop_Date"),
           FruitStroop_Score = maybe_matches("fruitstroop_time|FruitStroop_Score"),
           VerbalFluency_Date = maybe_starts_with("VerbalFluency_Date"),
           VerbalFluency_Score = maybe_matches("verbalfluency_raw|VerbalFluency_Score"),
           VerbalFluency_AgeEquivalent = maybe_matches("verbalfluency_AE|VerbalFluency_AgeEquivalent"),
           CTOPPElision_Date = maybe_starts_with("CTOPPElision_Date"),
           CTOPPElision_Raw = maybe_matches("CTOPP_Elision_raw|CTOPPElision_Raw"),
           CTOPPElision_Scaled = maybe_matches("CTOPP_Elision_scaled|CTOPPElision_Scaled"),
           CTOPPBlending_Date = maybe_starts_with("CTOPPBlending_Date"),
           CTOPPBlending_Raw = maybe_matches("CTOPP_Blending_raw|CTOPPBlending_Raw"),
           CTOPPBlending_Scaled = maybe_matches("CTOPP_Blending_scaled|CTOPPBlending_Scaled"),
           CTOPPMemory_Date = maybe_matches("CTOPP_MemoryforDigits_COMPLETION|CTOPPMemory_Date"),
           CTOPPMemory_Raw = maybe_matches("CTOPP_MemoryforDigits_raw|CTOPPMemory_Raw"),
           CTOPPMemory_Scaled = maybe_matches("CTOPP_MemoryforDigits_scaled|CTOPPMemory_Scaled"),
           KBIT_Date = maybe_starts_with("KBIT_Date"),
           KBIT_Nonverbal_Raw = maybe_starts_with("KBIT_Nonverbal_Raw"),
           KBIT_Nonverbal_Standard = maybe_starts_with("KBIT_Nonverbal_Standard"),
           GFTA_Date = maybe_matches("GFTA_COMPLETION_DATE|GFTA_Date"),
           DELV_Date = maybe_starts_with("DELV_Date"),
           DELV_LanguageVar_ColumnAScore = maybe_starts_with("DELV_LanguageVar_ColumnAScore"),
           DELV_LanguageVar_ColumnBScore = maybe_starts_with("DELV_LanguageVar_ColumnBScore"),
           DELV_DegreeLanguageVar = maybe_matches("DELV_DegreeLanguageVar"),
           DELV_LanguageRisk_DiagnosticErrorScore = maybe_starts_with("DELV_LanguageRisk_DiagnosticErrorScore"),
           DELV_LanguageRisk = maybe_matches("DELV_LanguageRisk$"),
           BRIEFP_Date = maybe_matches("BRIEFP_Date")) %>%
    # Set dates to strings
    mutate_each(funs(format), maybe_matches("Date"))
}


# Combine scores in the UMN/UW spreadsheets, convert to long format
normalize_sheets <- . %>% 
  lapply(get_scores_from_info) %>% 
  Filter(function(x) nrow(x) != 0, .) %>% 
  bind_rows() %>% 
  gather(Variable, Value, -ParticipantID, -Study, -Source)

# Normalize data from all three studies, combine to single study
df_info_long <- list(t1, t2, t3, ci1, ci2, ci_age, lt, medu, dialect) %>% 
  lapply(normalize_sheets) %>% 
  bind_rows()

# Which columns need to be compared?
columns_in_info_sheets <- df_info_long %>% 
  select(Study, Variable) %>% 
  distinct()

# Get the re-entered scores into the same format
df_dirt <- df_dates %>%
  select(-Site) %>%
  semi_join(columns_in_info_sheets) %>% 
  mutate(Source = "DIRT")

In Spring 2016, we had our data-entry team re-enter test scores gathered in our studies, so that we could find data-entry discrepancies. This script compares the original to the re-entered scores.

Studies under consideration

Data from the following studies are checked:

unique(df_info_long$Study)

Participant pool comparison

Do the same participants contribute scores in each set?

df_both <- bind_rows(df_dirt, df_info_long)
df_sources_per_id <- df_both %>% 
  select(-Variable, -Value) %>% 
  distinct() %>% 
  mutate(Found = TRUE) %>% 
  spread(Source, Found) %>% 
  arrange(Study, ParticipantID)

Participants in original score-set ("ParticipantInfo") not in the re-entered score-set ("DIRT"):

df_sources_per_id %>% 
  filter(is.na(DIRT)) %>% 
  anti_join(df_no_visits)

Participants in re-entered score-set ("DIRT") who visited the lab but are not in the original score-set ("ParticipantInfo").

df_sources_per_id %>% 
  filter(is.na(ParticipantInfo)) %>% 
  anti_join(df_no_visits)

Value Comparison

We now compare the scores in each score-set. This check is only being performed on participants in both score-sets.

# Find which kids appear in both sources
df_children_to_compare <- df_sources_per_id %>% 
  filter(!is.na(DIRT), !is.na(ParticipantInfo)) %>% 
  select(Study, ParticipantID) %>% 
  distinct()

df_comparing_scores <- df_both %>% 
  semi_join(df_children_to_compare) 

discrepancies <- df_comparing_scores %>%
  # select(-Study) %>%
  split(.$Variable) %>%
  lapply(readr::type_convert) %>%
  lapply(spread_, "Source", "Value") %>% 
  # lapply(function(df) select(df, DIRT)) %>%
  lapply(function(df) filter(df, DIRT %!==% ParticipantInfo)) %>%
  Filter(function(df) nrow(df) != 0, .) %>% 
  lapply(as.data.frame)

Summary

This table lists all the fields that were checked and whether any discrepancies were found in that field.

columns_with_discrepancies <- discrepancies %>% names()

results <- data_frame(
  Check = "Discrepancies", 
  Date = format(Sys.Date()),
  Passing = nrow(bind_rows(discrepancies)) == 0)

readr::write_csv(results, "./inst/audit/results_integrity.csv")

columns_with_errors <- discrepancies %>% 
  lapply(. %>% select(Study, Variable)) %>% 
  bind_rows() %>% 
  distinct() %>% 
  arrange(Study, Variable) %>% 
  mutate(Passing = FALSE)

# Make a mock-error table if there are no errors
if (nrow(columns_with_errors) == 0) {
  columns_with_errors <- data_frame(
    Study = NA_character_, Variable = NA_character_, Passing = FALSE
  )
}

columns_in_info_sheets %>% 
  select(Study, Variable) %>% 
  distinct %>% 
  left_join(columns_with_errors) %>% 
  mutate(Passing = if_else(is.na(Passing), TRUE, Passing)) %>% 
  arrange(Study, Variable) %>% 
  mutate(Status = if_else(Passing, ":white_check_mark:", ":x:")) %>% 
  as.data.frame() %>% 
  rename(` ` = Status) %>% 
  kable()

Details

These are all the mismatching values.

discrepancies

Unchecked fields

The following columns in DIRT were not checked because there is not a matching column in the participant info spreadsheets

columns_in_dirt_by_study <- df_dates %>% 
  select(Study, Variable) %>% 
  distinct()

columns_in_dirt_by_study %>% 
  anti_join(columns_in_info_sheets) %>% 
  filter(Variable != "DOB", Variable != "Visited") %>% 
  as.data.frame() %>% 
  mutate(Status = ":grey_question:") %>% 
  rename(` ` = Status) %>% 
  kable()


LearningToTalk/L2TDatabase documentation built on June 24, 2020, 3:45 a.m.