library("knitr") opts_chunk$set(echo = FALSE) opts_chunk$set(collapse = TRUE, comment = "#>", message = FALSE) wd <- rprojroot::find_rstudio_root_file() opts_knit$set(root.dir = wd)
library("L2TDatabase") library("dplyr") library("tidyr") library("stringr") # Load external dependencies wd <- rprojroot::find_rstudio_root_file() source(file.path(wd, "inst/paths.R")) source(paths$GetSiteInfo, chdir = TRUE) source(file.path(wd, "inst/migrations/dates.R")) # Collect dates/scores df_dates <- collect_dates(paths$score_dates, recursive = TRUE) %>% mutate(Study = ifelse(Study == "Medu", "MaternalEd", Study)) df_no_visits <- df_dates %>% filter(Variable == "Visited") %>% select(-Variable) %>% rename(Visited = Value) %>% filter(Visited == FALSE) columns_in_dirt <- df_dates %>% select(Variable) %>% distinct # Columns used the 2 sites x 3 timepoints spreadsheets df_dates$Variable %>% unique() %>% sort() # Get T1 scores for both sites. Function sourced via paths$GetSiteInfo t1 <- get_study_info("TimePoint1") t2 <- get_study_info("TimePoint2") t3 <- get_study_info("TimePoint3") ci1 <- get_study_info("CochlearV1") ci2 <- get_study_info("CochlearV2") ci_age <- get_study_info("CochlearMatching") lt <- get_study_info("LateTalker") medu <- get_study_info("MaternalEd") dialect <- get_study_info("DialectSwitch") %>% lapply(. %>% mutate(Study = "Dialect")) # Select necessary columns get_scores_from_info <- function(df) { # Select and rename columns we want to compare df %>% select(ParticipantID = Participant_ID, Study, Source, # use maybe_starts_with so that we don't get an error when the column is not found EVT_Form = maybe_starts_with("EVT_Form"), EVT_Date = maybe_starts_with("EVT_COMPLETION"), EVT_Raw = maybe_starts_with("EVT_raw"), EVT_Standard = maybe_starts_with("EVT_standard"), EVT_GSV = maybe_starts_with("EVT_GSV"), PPVT_Form = maybe_starts_with("PPVT_Form"), PPVT_Date = maybe_starts_with("PPVT_COMPLETION"), PPVT_Raw = maybe_starts_with("PPVT_raw"), PPVT_Standard = maybe_starts_with("PPVT_standard"), PPVT_GSV = maybe_starts_with("PPVT_GSV"), FruitStroop_Date = maybe_starts_with("FruitStroop_Date"), FruitStroop_Score = maybe_matches("fruitstroop_time|FruitStroop_Score"), VerbalFluency_Date = maybe_starts_with("VerbalFluency_Date"), VerbalFluency_Score = maybe_matches("verbalfluency_raw|VerbalFluency_Score"), VerbalFluency_AgeEquivalent = maybe_matches("verbalfluency_AE|VerbalFluency_AgeEquivalent"), CTOPPElision_Date = maybe_starts_with("CTOPPElision_Date"), CTOPPElision_Raw = maybe_matches("CTOPP_Elision_raw|CTOPPElision_Raw"), CTOPPElision_Scaled = maybe_matches("CTOPP_Elision_scaled|CTOPPElision_Scaled"), CTOPPBlending_Date = maybe_starts_with("CTOPPBlending_Date"), CTOPPBlending_Raw = maybe_matches("CTOPP_Blending_raw|CTOPPBlending_Raw"), CTOPPBlending_Scaled = maybe_matches("CTOPP_Blending_scaled|CTOPPBlending_Scaled"), CTOPPMemory_Date = maybe_matches("CTOPP_MemoryforDigits_COMPLETION|CTOPPMemory_Date"), CTOPPMemory_Raw = maybe_matches("CTOPP_MemoryforDigits_raw|CTOPPMemory_Raw"), CTOPPMemory_Scaled = maybe_matches("CTOPP_MemoryforDigits_scaled|CTOPPMemory_Scaled"), KBIT_Date = maybe_starts_with("KBIT_Date"), KBIT_Nonverbal_Raw = maybe_starts_with("KBIT_Nonverbal_Raw"), KBIT_Nonverbal_Standard = maybe_starts_with("KBIT_Nonverbal_Standard"), GFTA_Date = maybe_matches("GFTA_COMPLETION_DATE|GFTA_Date"), DELV_Date = maybe_starts_with("DELV_Date"), DELV_LanguageVar_ColumnAScore = maybe_starts_with("DELV_LanguageVar_ColumnAScore"), DELV_LanguageVar_ColumnBScore = maybe_starts_with("DELV_LanguageVar_ColumnBScore"), DELV_DegreeLanguageVar = maybe_matches("DELV_DegreeLanguageVar"), DELV_LanguageRisk_DiagnosticErrorScore = maybe_starts_with("DELV_LanguageRisk_DiagnosticErrorScore"), DELV_LanguageRisk = maybe_matches("DELV_LanguageRisk$"), BRIEFP_Date = maybe_matches("BRIEFP_Date")) %>% # Set dates to strings mutate_each(funs(format), maybe_matches("Date")) } # Combine scores in the UMN/UW spreadsheets, convert to long format normalize_sheets <- . %>% lapply(get_scores_from_info) %>% Filter(function(x) nrow(x) != 0, .) %>% bind_rows() %>% gather(Variable, Value, -ParticipantID, -Study, -Source) # Normalize data from all three studies, combine to single study df_info_long <- list(t1, t2, t3, ci1, ci2, ci_age, lt, medu, dialect) %>% lapply(normalize_sheets) %>% bind_rows() # Which columns need to be compared? columns_in_info_sheets <- df_info_long %>% select(Study, Variable) %>% distinct() # Get the re-entered scores into the same format df_dirt <- df_dates %>% select(-Site) %>% semi_join(columns_in_info_sheets) %>% mutate(Source = "DIRT")
In Spring 2016, we had our data-entry team re-enter test scores gathered in our studies, so that we could find data-entry discrepancies. This script compares the original to the re-entered scores.
Data from the following studies are checked:
unique(df_info_long$Study)
Do the same participants contribute scores in each set?
df_both <- bind_rows(df_dirt, df_info_long) df_sources_per_id <- df_both %>% select(-Variable, -Value) %>% distinct() %>% mutate(Found = TRUE) %>% spread(Source, Found) %>% arrange(Study, ParticipantID)
Participants in original score-set ("ParticipantInfo") not in the re-entered score-set ("DIRT"):
df_sources_per_id %>% filter(is.na(DIRT)) %>% anti_join(df_no_visits)
Participants in re-entered score-set ("DIRT") who visited the lab but are not in the original score-set ("ParticipantInfo").
df_sources_per_id %>% filter(is.na(ParticipantInfo)) %>% anti_join(df_no_visits)
We now compare the scores in each score-set. This check is only being performed on participants in both score-sets.
# Find which kids appear in both sources df_children_to_compare <- df_sources_per_id %>% filter(!is.na(DIRT), !is.na(ParticipantInfo)) %>% select(Study, ParticipantID) %>% distinct() df_comparing_scores <- df_both %>% semi_join(df_children_to_compare) discrepancies <- df_comparing_scores %>% # select(-Study) %>% split(.$Variable) %>% lapply(readr::type_convert) %>% lapply(spread_, "Source", "Value") %>% # lapply(function(df) select(df, DIRT)) %>% lapply(function(df) filter(df, DIRT %!==% ParticipantInfo)) %>% Filter(function(df) nrow(df) != 0, .) %>% lapply(as.data.frame)
This table lists all the fields that were checked and whether any discrepancies were found in that field.
columns_with_discrepancies <- discrepancies %>% names() results <- data_frame( Check = "Discrepancies", Date = format(Sys.Date()), Passing = nrow(bind_rows(discrepancies)) == 0) readr::write_csv(results, "./inst/audit/results_integrity.csv") columns_with_errors <- discrepancies %>% lapply(. %>% select(Study, Variable)) %>% bind_rows() %>% distinct() %>% arrange(Study, Variable) %>% mutate(Passing = FALSE) # Make a mock-error table if there are no errors if (nrow(columns_with_errors) == 0) { columns_with_errors <- data_frame( Study = NA_character_, Variable = NA_character_, Passing = FALSE ) } columns_in_info_sheets %>% select(Study, Variable) %>% distinct %>% left_join(columns_with_errors) %>% mutate(Passing = if_else(is.na(Passing), TRUE, Passing)) %>% arrange(Study, Variable) %>% mutate(Status = if_else(Passing, ":white_check_mark:", ":x:")) %>% as.data.frame() %>% rename(` ` = Status) %>% kable()
These are all the mismatching values.
discrepancies
The following columns in DIRT were not checked because there is not a matching column in the participant info spreadsheets
columns_in_dirt_by_study <- df_dates %>% select(Study, Variable) %>% distinct() columns_in_dirt_by_study %>% anti_join(columns_in_info_sheets) %>% filter(Variable != "DOB", Variable != "Visited") %>% as.data.frame() %>% mutate(Status = ":grey_question:") %>% rename(` ` = Status) %>% kable()
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.