\newpage
write("Export screening data and run corresponding quality checks", stderr())
db_name <- "screening" qc_root <- "DQC_00_0SG" qc_incorrect_date_setup <- paste(qc_root, "01", sep = "_") qc_nontimely_submission_id <- paste(qc_root, "02", sep = "_") qc_nontimely_completion_id <- paste(qc_root, "03", sep = "_") qc_screening_before_startdate <- paste(qc_root, "04", sep = "_") qc_screening_after_lockdate <- paste(qc_root, "05", sep = "_") qc_screening_w_nonvalid_deviceid <- paste(qc_root, "06", sep = "_") qc_screening_before_facility_startdate <- paste(qc_root, "07", sep = "_") qc_nonvalid_fid <- paste(qc_root, "08", sep = "_") qc_screening_other_fids <- paste(qc_root, "09", sep = "_") qc_inconsistent_facility_info1 <- paste(qc_root, "10", sep = "_") qc_inconsistent_facility_info2 <- paste(qc_root, "11", sep = "_") qc_inconsistent_facility_info3 <- paste(qc_root, "12", sep = "_") qc_inconsistent_age_info <- paste(qc_root, "13", sep = "_") qc_underaged_cg_id <- paste(qc_root, "14", sep = "_") qc_dup_repeats_id <- paste(qc_root, "15", sep = "_") qc_other_repeats_id <- paste(qc_root, "16", sep = "_")
n_deviceid_edit_records <- 0 n_nonvalid_deviceid_records <- 0 n_other_fid_records <- 0 n_incorrect_date_setup_records <- 0 n_late_submissions <- 0 n_late_completions <- 0 n_before_startdate_records <- 0 n_before_facility_startdate_records <- 0 n_after_lockdate_records <- 0 n_ineligible_cg_records <- 0 n_repeat_edit_records <- 0 n_inconsistent_age_info <- 0 n_cleaned_screening_records <- 0 n_edit_nonvalid_fid_records <- 0 n_edit_inconsistent_fid_records <- 0 n_edit_inconsistent_fid1a <- 0 n_edit_inconsistent_fid1b <- 0 n_edit_inconsistent_fid2 <- 0 n_edit_inconsistent_fid3 <- 0 n_inconsistent_fid_records <- 0 n_inconsistent_facility_info1 <- 0 n_inconsistent_facility_info1z <- 0 n_inconsistent_facility_info2 <- 0 n_inconsistent_facility_info3 <- 0
dictionary <- timci::import_country_specific_xls_dict("main_dict.xlsx", Sys.getenv('TIMCI_COUNTRY')) screening_dictionary <- subset(dictionary, screening == 1) n_screening_dictionary_vars <- nrow(screening_dictionary)
There are r n_screening_dictionary_vars
variables exported from the raw r db_name
database.
screening_dictionary %>% dplyr::select(new, old) %>% knitr::kable(col.names = c("Database reference", "ODK reference"))
# Screening data raw_screening_data <- timci::extract_screening_data(facility_data, is_pilot) n_raw_screening_records <- nrow(raw_screening_data) # Split into PII and deidentified raw data out <- timci::extract_enrolled_participants(facility_data, is_pilot) raw_day0_data <- out[[1]] raw_pii <- out[[2]] raw_repeat_data <- facility_data %>% dplyr::filter(repeat_consult == 1)
Initially, there are r n_raw_screening_records
record(s) in the raw r db_name
database from the start of the study on r start_date
.
write(" o Date and time checks", stderr())
facility_cols <- colnames(facility_data) facility_data <- facility_data %>% timci::allocate_screening_facility(research_facilities)
r qc_incorrect_date_setup
]write(" o Incorrect data and time setup on the device", stderr())
qc_description <- "The enrolment date is defined as the creation (start) date of the form. However if for any reason the date is not set up correctly on the tablet, the enrolment date is not correct and follow-up is not triggered on the right date." qc_rule <- "If the server submission date is anterior to the creation date (start of the form on the tablet) in the database, the creation date is considered incorrect and is replaced by the server submission date." qc_type <- "date_discrepancy" df <- facility_data col_date1 <- "submission_date" col_date2 <- "start" cleaning <- "replace_by_start_date" qc_text <- "an incorrect date setup on the tablet" qc_idx <- qc_incorrect_date_setup qc_export_label <- "incorrect_date_setup" qc_export_description <- "the date setup is not correct" cat(knitr::knit_child('database_export_sub_quality_check.Rmd', envir = environment(), quiet = TRUE)) n_incorrect_date_setup_records <- n_detected
facility_data <- cleaned_df
fig_df <- qc_df %>% dplyr::mutate(week = lubridate::floor_date(as.Date(start), "week", week_start = getOption("lubridate.week.start", 1))) fig_caption <- "Incorrect date and time setup on the device" facility_col <- "fid_from_device" date_col <- "week" date_lbl <- "Weeks" date_format <- "%b%y" comparison <- "area" fill_col <- "" cat(knitr::knit_child('database_export_sub_facet_bar_plot.Rmd', envir = environment(), quiet = TRUE))
r qc_nontimely_submission_id
]write(" o Non-timely submission", stderr())
qc_description <- "Transfer of a finalised submission to the ODK Central server not done on the day the submission was finalised (i.e. transfer time superior to 0 day). A delay superior to 12 days mean that the participant has never appeared in the Day 7 follow-up log on the tablet." qc_rule <- action_alert_no_modification qc_type <- "date_discrepancy" df <- facility_data col_date1 <- "end" col_date2 <- "submission_date" cleaning <- "none" qc_text <- "an actual transfer date later than the finalisation date" qc_idx <- qc_nontimely_submission_id qc_export_label <- "nontimely_day0_submission" qc_export_description <- "the submission was not transferred on the same day it was finalised" cat(knitr::knit_child('database_export_sub_quality_check.Rmd', envir = environment(), quiet = TRUE)) n_late_submissions <- n_detected
fig_df <- qc_df %>% dplyr::mutate(week = as.Date(lubridate::floor_date(as.Date(end), "week", week_start = getOption("lubridate.week.start", 1)))) %>% dplyr::mutate(Difference = dplyr::case_when( diff < 3 ~ "a) Less than 3 days", diff >= 3 & diff < 7 ~ "b) 3-6 days", diff >= 7 ~ "c) 7 days and above")) fig_caption <- "Submissions not transferred on the same day they were finalised" facility_col <- "fid_from_device" date_col <- "week" date_lbl <- "Weeks" date_format <- "%b%y" comparison <- "area" fill_col <- "Difference" cat(knitr::knit_child('database_export_sub_facet_bar_plot.Rmd', envir = environment(), quiet = TRUE))
r qc_nontimely_completion_id
]write(" o Non-timely completion", stderr())
qc_description <- "Finalisation of the submission not done on the same day the submission was started (i.e., duration from start to end strictly superior to 0 day)." qc_rule <- action_alert_no_modification qc_type <- "date_discrepancy" df <- facility_data col_date1 <- "start" col_date2 <- "end" cleaning <- "none" qc_text <- "an actual finalisation date later than the start date" qc_idx <- qc_nontimely_completion_id qc_export_label <- "nontimely_day0_completion" qc_export_description <- "the submission was not finalised on the same day it was started" cat(knitr::knit_child('database_export_sub_quality_check.Rmd', envir = environment(), quiet = TRUE)) n_late_completions <- n_detected
fig_df <- qc_df %>% dplyr::mutate(week = as.Date(lubridate::floor_date(as.Date(start), "week", week_start = getOption("lubridate.week.start", 1)))) %>% dplyr::mutate(Difference = dplyr::case_when( diff <= 1 ~ "a) 1 day", diff > 1 & diff < 3 ~ "b) 1-2 days", diff >= 3 ~ "c) 3 days and above")) fig_caption <- "Number of submissions not finalised on the same day they were started" facility_col <- "fid_from_device" date_col <- "week" date_lbl <- "Weeks" date_format <- "%b%y" comparison <- "type" fill_col <- "Difference" cat(knitr::knit_child('database_export_sub_facet_bar_plot.Rmd', envir = environment(), quiet = TRUE))
r qc_screening_before_startdate
]write(" o Study start date context", stderr())
qc_description <- "Screening data are considered valid only from the study start date. Data may have been entered before this date for training purposes." qc_rule <- paste0("Records entered before the study start date on ", start_date, " are deleted") qc_type <- "anterior_to_startdate" df <- facility_data qc_text <- paste0("an entry date anterior to the study start date on **", start_date, "**") qc_idx <- qc_screening_before_startdate qc_export_label <- "anterior_to_startdate" qc_export_description <- "the entry date is anterior to the study start date" cat(knitr::knit_child('database_export_sub_quality_check.Rmd', envir = environment(), quiet = TRUE)) facility_data_before_start <- qc_df %>% dplyr::select(date_visit, child_id) n_before_startdate_records <- n_detected
facility_data <- cleaned_df
r qc_screening_after_lockdate
]write(" o Study lock date context", stderr())
qc_description <- "Screening data are considered valid only until the date for the lock. Note that follow-up data will be managed differently, since they are considered valid after the lock data as soon as they correspond to a participant enrolled before the date of the lock." qc_rule <- paste0("Records entered after the lock date on ", lock_date, " are deleted") qc_type <- "posterior_to_lockdate" df <- facility_data qc_text <- paste0("an entry date posterior to the lock date on **", lock_date, "**") qc_idx <- qc_screening_after_lockdate qc_export_label <- "posterior_to_lockdate" qc_export_description <- "the entry date is posterior to the lock date" cat(knitr::knit_child('database_export_sub_quality_check.Rmd', envir = environment(), quiet = TRUE)) facility_data_after_lock <- qc_df %>% dplyr::select(date_visit, child_id) n_after_lockdate_records <- n_detected
facility_data <- cleaned_df
r qc_screening_after_lockdate
]write(" o Data entry times", stderr())
fig_df <- facility_data %>% dplyr::mutate(start_time1 = as.POSIXct(paste0("2021-01-01 ", start_time), tz = Sys.getenv("TZ"))) %>% dplyr::mutate(week = lubridate::floor_date(as.Date(end), "week", week_start = getOption("lubridate.week.start", 1))) fig_caption <- "Spatiotemporal pattern of data entry times" facility_col <- "fid_from_device" date_col <- "week" date_lbl <- "Weeks" date_format <- "%b%y" y_col <- "start_time1" y_lbl <- "Start time of record entry" y_is_time <- TRUE time_break_str <- "4 hours" time_format <- "%H:%M" comparison <- "type" cat(knitr::knit_child('database_export_sub_facet_scatter_plot.Rmd', envir = environment(), quiet = TRUE))
r qc_underaged_cg_id
]write(" o Caregiver eligibility", stderr())
qc_description <- "Caregivers must be above 18 years old to be able provide consent for the child to be part in the study." qc_rule <- "Children with a caregiver below 18 years old are kept in the screening database but excluded from the study." qc_type <- "underaged_cg" df <- facility_data qc_text <- "a consenting caregiver below 18 years old" qc_idx <- qc_underaged_cg_id qc_export_label <- "underaged_cg" qc_export_description <- "the consenting caregiver is below 18 years old" cat(knitr::knit_child('database_export_sub_quality_check.Rmd', envir = environment(), quiet = TRUE)) n_ineligible_cg_records <- n_detected
facility_data <- cleaned_df
r paste0(qc_dup_repeats_id, "a")
]write(" o Repeat visits incorrectly recorded as new enrolments", stderr())
::: {custom-style="redparagraph"}
Initial check r qc_dup_repeats_id
a
:::
qc_description <- "Participants who came back to the facility between 1 and 28 days after their enrolment should be recorded as repeat visits. If they are recorded as new enrolments with the same participant ID they were initially given, this artificially creates duplicates in the database." qc_rule <- "Repeat visits incorrectly recorded as new enrolments are detected and corrected proactively were possible." qc_type <- "repeat_duplicates" df <- facility_data col_id <- "child_id" col_date <- "start" cleaning <- "none" qc_text <- "duplicated IDs" qc_idx <- paste0(qc_dup_repeats_id, "a") qc_export_label <- "timci_repeat_visit_duplicates" qc_export_description <- "the repeat visit was allocated the same ID that has been used for another participant in the database" cat(knitr::knit_child('database_export_sub_quality_check.Rmd', envir = environment(), quiet = TRUE)) n_repeat_visit_records <- n_detected
fig_df <- qc_df %>% dplyr::mutate(week = as.Date(lubridate::floor_date(as.Date(date_2), "week", week_start = getOption("lubridate.week.start", 1)))) %>% dplyr::mutate(fid = substr(id, 3, 7)) fig_caption <- "Repeat visits incorrectly recorded as new enrolments" facility_col <- "fid" date_col <- "week" date_lbl <- "Weeks" date_format <- "%b%y" cat(knitr::knit_child('database_export_sub_bar_plot.Rmd', envir = environment(), quiet = TRUE))
# Parameters for manual corrections mc_description <- paste0('Manual correction of repeat visits recorded as duplicated IDs. Confirmed repeat records are manually edited in the ', db_name, ' database as described below: variable *repeat_consult* set to *1* variable *consent* set to *NA* variable *enrolled* set to *NA*') to_correct_df <- facility_data correction_type <- "convert_screening_from_day0_to_repeat" # Parameters for the quality check following manual corrections qc_description <- "Remaining repeat visits incorrectly recorded as new enrolments are deleted from the database." qc_idx <- paste0(qc_dup_repeats_id, "b") cat(knitr::knit_child('database_export_sub_corrections.Rmd', envir = environment(), quiet = TRUE)) n_repeat_edit_records <- n_mc facility_data <- corrected_df
r qc_inconsistent_age_info
]write(" o Discrepancy in child age and categories", stderr())
qc_description <- "Age group consistent with young infants flag and categories as specified in the codebook." qc_rule <- action_alert_no_modification qc_type <- "blank_value" df <- facility_data %>% dplyr::filter(yg_infant == 1) col_value <- "who_age_ctg" qc_text <- "missing young infant age category" qc_idx <- qc_inconsistent_age_info qc_export_label <- "inconsistent_age_info" qc_export_description <- "age information is inconsistent" cat(knitr::knit_child('database_export_sub_quality_check.Rmd', envir = environment(), quiet = TRUE)) n_inconsistent_age_info <- n_detected
ggplot2::ggplot(facility_data, ggplot2::aes(x = who_age_ctg, fill = yg_infant)) + ggplot2::geom_bar(position = "stack") + ggplot2::theme(text = element_text(size = tsize), panel.grid.major.x = element_blank(), panel.grid.minor.x = element_blank())
r qc_inconsistent_facility_info1
]write(" o Discrepancy in facility IDs (vs. device)", stderr())
This last check is to ensure that there are no remaining inconsistencies between the variables fid (when available, i.e. on enrolled children only) and the derived variable fid_from_device (all screened children) since the content of fid is then replaced by the content of fid_from_device for facilitating analyses.
qc_description <- "Check the facility ID from the first 5 characters of the child ID (**fid**) versus from the device(s) which has(ve) been used in this facility on that day (**fid_from_device**)." qc_rule <- action_alert_no_modification qc_type <- "inconsistent_facility_info1" df <- facility_data qc_text <- "inconsistent facility info" qc_idx <- paste0(qc_inconsistent_facility_info1, "z") qc_export_label <- "inconsistent_facility_info1" qc_export_description <- "facility information between the child ID and the device is inconsistent" cat(knitr::knit_child('database_export_sub_quality_check.Rmd', envir = environment(), quiet = TRUE)) n_inconsistent_facility_info1z <- n_detected
write(" o Pseudonymisation", stderr())
Pseudonymisation is performed using a cryptographic hash function (md5) that takes strings as input and produces a random-like fixed-length output.
facility_data$fid <- facility_data$fid_from_device
screening_data_nopseudo <- timci::extract_screening_data(facility_data, is_pilot) %>% dplyr::select(-fid_from_device)
screening_data <- screening_data_nopseudo %>% dplyr::rowwise() %>% dplyr::mutate(uuid = ifelse(uuid != "", digest(uuid, algo = crypto_algo), ""), prev_id = ifelse(prev_id != "", digest(prev_id, algo = crypto_algo), ""), device_id = ifelse(device_id != "", digest(device_id, algo = crypto_algo), "")) %>% dplyr::ungroup()
n_cleaned_screening_records <- nrow(screening_data)
write(" o Data cleaning summary", stderr())
timci::create_screening_qc_flowchart(n_raw_screening_records, n_nonvalid_deviceid_records, n_other_fid_records, n_before_startdate_records, n_before_facility_startdate_records, n_after_lockdate_records, n_ineligible_cg_records, n_nonvalid_fid_records, n_edit_nonvalid_fid_records, n_inconsistent_fid_records, n_edit_inconsistent_fid_records, n_repeat_visit_records, n_repeat_edit_records, n_incorrect_date_setup_records, n_late_submissions, n_late_completions, n_inconsistent_age_info, n_cleaned_screening_records)
write(" o Data overview", stderr())
fig_df <- screening_data %>% dplyr::mutate(Status = dplyr::case_when( repeat_consult == 1 ~ "Repeat visit", consent == 1 ~ "Newly enrolled", .default = "Not enrolled")) %>% dplyr::mutate(week = lubridate::floor_date(as.Date(start), "week", week_start = getOption("lubridate.week.start", 1))) fig_caption <- "Screening data overview over time by facility" facility_col <- "fid" date_col <- "week" date_lbl <- "Weeks" date_format <- "%b%y" fill_col <- "Status" comparison <- "type" cat(knitr::knit_child('database_export_sub_facet_bar_plot.Rmd', envir = environment(), quiet = TRUE))
write(" o Cast data types", stderr())
screening_data <- screening_data %>% dplyr::mutate(across(c(form_version, fid, who_age_ctg, dob_knwn, age_mo_knwn, consult_reason, main_cg, main_cg_lbl, repeat_consult, consent, enrolled), factor)) %>% dplyr::mutate(across(c(age_yr, age_mo), as.integer)) %>% dplyr::mutate(across(c(date_visit), ~format(as.POSIXct(.),"%Y-%m-%d"))) %>% dplyr::mutate(across(c(start), as.POSIXct))
write(" o Skim data", stderr())
skimr::skim(screening_data)
write(" o Data export", stderr())
timci::dataset_export(facility_data, "01", "timci_facility_data", rctls_dir, "Raw screening data")
timci::dataset_export(raw_screening_data, "01", "timci_screening_data", rctls_dir, "Raw screening data")
timci::dataset_export(screening_data_nopseudo, "01", "timci_screening_data_without_pseudonymisation", rctls_dir, "Cleaned screening data")
timci::dataset_export(screening_data, "01", "timci_screening_data", locked_db_dir, "Cleaned screening data")
timci::dataset_export(facility_data_audit, "00", "timci_facility_audit_data", rctls_dir, "Screening audit data")
rm(raw_screening_data, screening_data_nopseudo, screening_data, facility_data_audit) gc()
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.