\newpage

# Progress message for the console/log (stderr), not for the rendered report.
write("Export hospital/hospitalisation data and run corresponding quality checks",
      file = stderr())

Hospitalisation data quality checks

# Identifiers, placeholders and counters used by the hospitalisation follow-up
# quality checks in this section.
db_name <- "hospitalisation follow-up"
qc_root <- "DQC_HS"

# Check identifiers, built as "<qc_root>_<two-digit index>".
qc_nontimely_submission_hospitfu <- paste(qc_root, "01", sep = "_")
qc_nontimely_completion_hospitfu <- paste(qc_root, "02", sep = "_")
qc_other_fids_hospitfu <- paste(qc_root, "03", sep = "_")
qc_pids_out_lock_range_hospitfu <- paste(qc_root, "04", sep = "_")
qc_nonvalid_hospitfu <- paste(qc_root, "05", sep = "_")
qc_nonconsistent_name_hospitfu <- paste(qc_root, "06", sep = "_")
qc_duplicated_hospitfu <- paste(qc_root, "07", sep = "_")
qc_hospit_before_enrolment <- paste(qc_root, "08", sep = "_")
qc_death_before_enrolment <- paste(qc_root, "09", sep = "_")
qc_discharge_before_enrolment <- paste(qc_root, "10", sep = "_")

# Placeholders reset to NULL before the checks run.
hospit_data_uids <- NULL
locked_hospit_data <- NULL
hospit_fu <- NULL
hospitfu_pii_drops_is_not_empty <- NULL

# Counters passed to the data cleaning flowchart at the end of this section.
# Every counter the flowchart reads is initialised to 0 here, so that the
# flowchart call does not fail with "object not found" when one of the checks
# is not executed (for instance when the dataset to check is empty).
n_cdsa_pilot_hospitfu_records <- 0
n_afterlock_pids_hospitfu_records <- 0
n_nonvalid_pids_hospitfu <- 0
n_edit_nonvalid_pid_hospitfu_records <- 0
n_drop_nonvalid_pid_hospitfu_records <- 0
# The "day7fu" in this name is historical: in this section the counter holds
# the number of hospitalisation follow-up records with inconsistent names.
n_inconsistent_names_day7fu_records <- 0
n_edit_inconsistent_name_hospitfu_records <- 0
n_drop_inconsistent_name_hospitfu_records <- 0
n_duplicates_hospitfu <- 0
n_death_prior_day0_hospitfu <- 0
n_hospit_prior_day0_hospitfu <- 0
n_discharge_prior_day0_hospitfu <- 0

Format raw data

# Format the export while keeping identifying columns (is_deidentified = FALSE).
hospit_data <- timci::format_hospital_data(
  raw_hospit_data,
  is_deidentified = FALSE
)
# From here on, raw_hospit_data holds the formatted export, not the unformatted
# input it was read from.
raw_hospit_data <- hospit_data

There are `r if ( !is.null(raw_hospit_data) ) { nrow(raw_hospit_data) } else { 'NA' }` record(s) in the raw `r db_name` database.

# Load the dictionary for the country given by the TIMCI_COUNTRY environment
# variable and count its rows (one row per exported variable, according to the
# report text that follows).
hospitfu_dictionary <- timci::import_country_specific_xls_dict(
  "hospit_dict.xlsx",
  Sys.getenv("TIMCI_COUNTRY")
)
n_hospitfu_dictionary_vars <- nrow(hospitfu_dictionary)

There are `r n_hospitfu_dictionary_vars` variables exported from the raw `r db_name` database.

# Table mapping each database variable name (`new`) to its ODK name (`old`).
knitr::kable(
  dplyr::select(hospitfu_dictionary, new, old),
  col.names = c("Database reference", "ODK reference")
)

Dates and times

Late submissions [Compliance check r qc_nontimely_submission_hospitfu]

# Late submission check: progress message, then parameters for the child
# documents. The children are knitted with envir = environment(), so they
# presumably read the variables below by name — confirm against
# 'database_export_sub_quality_check.Rmd'.
write(" o Inconsistent finalisation and submission", stderr())
qc_description <- "Transfer of a finalised submission to the ODK Central server not done on the day the submission was finalised (i.e. transfer time superior to 0 day)."
qc_rule <- action_alert_no_modification
qc_type <- "date_discrepancy"
df <- raw_hospit_data
# Date columns compared by the check: finalisation (`end`) vs. submission.
col_date1 <- "end"
col_date2 <- "submission_date"
# Alert only: no cleaning is requested for this check.
cleaning <- "none"
qc_text <- "the submission was not transferred on the same day it was finalised"
qc_idx <- qc_nontimely_submission_hospitfu
qc_export_label <- "nontimely_hospit_submission"
qc_export_description <- "the submission was not transferred on the same day it was finalised"
cat(knitr::knit_child('database_export_sub_quality_check.Rmd',
                      envir = environment(),
                      quiet = TRUE))
# Parameters for the facet bar plot child document.
fig_caption <- "Submissions not transferred on the same day they were finalised"
facility_col <- "fid"
date_col <- "week"
date_lbl <- "Weeks"
date_format <- "%b%y"
comparison <- "area"
fill_col <- "Difference"
# qc_df is not defined in this section; presumably it is produced by the
# quality-check child document above — confirm.
# week: `end` floored to the start of its week; fid: characters 3 to 7 of
# child_id; Difference: `diff` binned into three labelled ranges.
fig_df <- qc_df %>% 
  dplyr::mutate(week = as.Date(lubridate::floor_date(as.Date(end), "week", week_start = getOption("lubridate.week.start", 1)))) %>% 
  dplyr::mutate(fid = substr(child_id, 3,7)) %>% 
  dplyr::mutate(Difference = dplyr::case_when(
    diff < 3 ~ "Less than 3 days",
    diff >= 3 & diff < 7 ~ "3-6 days",
    diff >= 7 ~ "7 days and above"))

cat(knitr::knit_child('database_export_sub_facet_bar_plot.Rmd',
                      envir = environment(),
                      quiet = TRUE))

Late completions [Compliance check r qc_nontimely_completion_hospitfu]

# Late completion check: progress message, then parameters for the child
# documents. The children are knitted with envir = environment(), so they
# presumably read the variables below by name — confirm against
# 'database_export_sub_quality_check.Rmd'.
write(" o Non-timely completion", stderr())
qc_description <- "Finalisation of the submission not done on the same day the submission was started (i.e., duration from start to end strictly superior to 0 day)."
qc_rule <- action_alert_no_modification
qc_type <- "date_discrepancy"
df <- raw_hospit_data
# Date columns compared by the check: form start vs. form finalisation.
col_date1 <- "start"
col_date2 <- "end"
# Alert only: no cleaning is requested for this check.
cleaning <- "none"
qc_text <- "the submission was not finalised on the same day it was started"
qc_idx <- qc_nontimely_completion_hospitfu
qc_export_label <- "nontimely_hospit_completion"
qc_export_description <- "the submission was not finalised on the same day it was started"
cat(knitr::knit_child('database_export_sub_quality_check.Rmd',
                      envir = environment(),
                      quiet = TRUE))
# Parameters for the facet bar plot child document.
fig_caption <- "Number of submissions not finalised on the same day they were started"
facility_col <- "fid"
date_col <- "week"
date_lbl <- "Weeks"
date_format <- "%b%y"
comparison <- "area"
fill_col <- "Difference"
# qc_df is not defined in this section; presumably it is produced by the
# quality-check child document above — confirm.
# week: `start` floored to the start of its week; fid: characters 3 to 7 of
# child_id.
# NOTE(review): the bin labels overlap ("1 day" for diff <= 1, "1-2 days" for
# 1 < diff < 3) — confirm the intended wording of the second label.
fig_df <- qc_df %>%
  dplyr::mutate(week = as.Date(lubridate::floor_date(as.Date(start), "week", week_start = getOption("lubridate.week.start", 1)))) %>% 
  dplyr::mutate(fid = substr(child_id, 3,7)) %>% 
  dplyr::mutate(Difference = dplyr::case_when(
    diff <= 1 ~ "1 day",
    diff > 1 & diff < 3 ~ "1-2 days",
    diff >= 3 ~ "3 days and above"))

cat(knitr::knit_child('database_export_sub_facet_bar_plot.Rmd',
                      envir = environment(),
                      quiet = TRUE))

Participant identification

# Working copy that the participant identification checks below clean step by
# step; raw_hospit_data itself is left untouched.
hospit_data <- raw_hospit_data

Follow-ups corresponding to participants enrolled in the CDSA pilot (India only) [Context check r qc_other_fids_hospitfu]

# Progress message for the console/log (stderr).
write(" o Follow-ups corresponding to participants enrolled in the CDSA pilot",
      file = stderr())

`r if (!is_india) {paste0("This check is only implemented for India.")}`

# Parameters for the CDSA pilot exclusion check. The child document is knitted
# with envir = environment(), so it presumably reads these variables by name —
# confirm against 'database_export_sub_quality_check.Rmd'.
qc_description <- paste0("Exclude hospital follow-ups that correspond to participants enrolled in the CDSA pilot.")
qc_rule <- "Discard follow-up records of participants who are enrolled in CDSA pilot facilities."
qc_type <- "nonvalid_ids2"
df <- hospit_data
idcol1 <- "child_id"
# Reference table: rows of other_timci_study_df (defined outside this section)
# with a non-missing child_id.
refdf <- rbind(other_timci_study_df) %>%
  dplyr::filter( !is.na(child_id) )
idcol2 <- "child_id"
qc_text <- "follow-up records corresponding to participants enrolled for the CDSA pilot"
qc_idx <- qc_other_fids_hospitfu
qc_export_label <- "pids_from_cdsa_pilot"
qc_export_description <- "the follow-up corresponds to a child who has been enrolled in the CDSA pilot"
cat(knitr::knit_child('database_export_sub_quality_check.Rmd',
                      envir = environment(),
                      quiet = TRUE))
# n_detected and cleaned_df are not assigned in this section; presumably they
# are set by the child document — confirm.
n_cdsa_pilot_hospitfu_records <- n_detected
hospit_data <- cleaned_df

Follow-ups corresponding to participants enrolled outside the lock date range [Context check `r qc_pids_out_lock_range_hospitfu`]

# Lock date range check: progress message, then parameters for the child
# document, which is knitted with envir = environment() and so presumably
# reads these variables by name — confirm.
write(" o Follow-ups corresponding to participants enrolled outside the lock date range", stderr())
# start_date and lock_date are defined outside this section.
qc_description <- paste0("Hospitalisation follow-ups are relevant only if associated to participants enrolled between the start date ", start_date, " and the lock date on ", lock_date, ".")
qc_rule <- "Discard follow-up records of participants who are enrolled in the Day 0 database outside the date range considered for the lock."
qc_type <- "nonvalid_ids2"
df <- hospit_data
idcol1 <- "child_id"
# Reference table: rows of facility_data_after_lock (defined outside this
# section) with a non-missing child_id.
refdf <- facility_data_after_lock %>%
  dplyr::filter( !is.na(child_id) )
idcol2 <- "child_id"
col_date <- "date"
qc_text <- "participant IDs outside the lock date range"
qc_idx <- qc_pids_out_lock_range_hospitfu
qc_export_label <- "pids_outside_lock_range"
qc_export_description <- "the follow-up correspond to a child who has been enrolled outside the date range for the lock"
cat(knitr::knit_child('database_export_sub_quality_check.Rmd',
                      envir = environment(),
                      quiet = TRUE))
# n_detected and cleaned_df are not assigned in this section; presumably they
# are set by the child document — confirm.
n_afterlock_pids_hospitfu_records <- n_detected
hospit_data <- cleaned_df

Non-valid participant IDs [Compliance check `r qc_nonvalid_hospitfu`]

# Progress message for the console/log (stderr).
write(" o Non-valid participant IDs", file = stderr())

::: {custom-style="redparagraph"}
Initial check `r qc_nonvalid_hospitfu`a
:::

# Initial check (index suffix "a"): participant IDs are compared with
# allday0_data (defined outside this section). cleaning = "none", so no
# cleaning is requested at this stage. The child document is knitted with
# envir = environment() and so presumably reads these variables by name —
# confirm.
qc_description <- "Hospitalisation follow-ups are relevant only if they can be reconciled with a participant enrolled at Day 0. Reconciliation is based on the participant ID."
qc_rule <- "Keep only IDs of participant who are found in the locked Day 0 database."
qc_type <- "nonvalid_ids"
df <- hospit_data
idcol1 <- "child_id"
refdf <- allday0_data
idcol2 <- "child_id"
datecol <- "date_enrol"
# NOTE(review): presumably a window (in days) around the enrolment date used
# when searching for matching records — confirm in the child document.
ldate_diff <- -2
udate_diff <- 2
matched_names <- TRUE
cleaning <- "none"
qc_text <- "non-valid participant IDs"
qc_idx <- paste0(qc_nonvalid_hospitfu, "a")
qc_export_label <- "nonvalid_pids_hospitfu_and_matched_names"
qc_export_description <- "the child ID does not correspond to any ID found the locked Day 0 database"

cat(knitr::knit_child('database_export_sub_quality_check.Rmd',
                      envir = environment(),
                      quiet = TRUE))
# n_detected is not assigned in this section; presumably set by the child
# document — confirm.
n_nonvalid_pids_hospitfu <- n_detected
# Inputs for the manual corrections child document.
mc_description <- ""
to_correct_df <- hospit_data
correction_type <- "correct_hospit_non_valid_ids"

# Parameters for the quality check following manual corrections
qc_rule <- "Keep only IDs of children who can be found in the initial Day 0 database."
qc_idx <- paste0(qc_nonvalid_hospitfu, "b")
cleaning <- "drop_all"

cat(knitr::knit_child('database_export_sub_corrections.Rmd',
                      envir = environment(),
                      quiet = TRUE))

# n_mc, n_detected and cleaned_df are presumably set by the corrections child
# document — confirm.
n_edit_nonvalid_pid_hospitfu_records <- n_mc
n_drop_nonvalid_pid_hospitfu_records <- n_detected
hospit_data <- cleaned_df

Non-consistent participant names [Context check r qc_nonconsistent_name_hospitfu]

# Progress message for the console/log (stderr).
write(" o Non-consistent participant names", file = stderr())

`r paste0('[Initial check ', qc_nonconsistent_name_hospitfu, 'a]{custom-style="underlined"}')`

# Initial check (index suffix "a"): participant names are compared with
# allday0_data (defined outside this section). cleaning = "none", so no
# cleaning is requested at this stage. The child documents are knitted with
# envir = environment() and so presumably read these variables by name —
# confirm.
qc_description <- "Hospital follow-ups are relevant only if the name of the participant matches the name of the participant enrolled at Day 0. Some errors can be introduced when participant IDs have been duplicated or follow-ups have been entered manually."
qc_rule <- "Keep only IDs of children whose name matches the name in the initial Day 0 database."
qc_type <- "inconsistent_names"
df <- hospit_data
refdf <- allday0_data
datecol <- "date_enrol"
ldate_diff <- -2
udate_diff <- 2
matched_names <- TRUE
cleaning <- "none"
qc_text <- "non-consistent participant names"
qc_idx <- paste0(qc_nonconsistent_name_hospitfu, "a")
qc_export_label <- "nonconsistent_names_hospitfu_with_matched_names"
qc_export_description <- paste0("the child name found in the ", db_name, " is not consistent with the name of the child with the same ID found in the locked Day 0 database")

cat(knitr::knit_child('database_export_sub_quality_check.Rmd',
                      envir = environment(),
                      quiet = TRUE))

# Despite the "day7fu" in its name, this counter is the one passed to the
# hospitalisation flowchart at the end of this section.
n_inconsistent_names_day7fu_records <- n_detected
# First round of manual corrections (index suffix "b").
mc_description <- ""
to_correct_df <- hospit_data
correction_type <- "correct_hospit_inconsistent_names"

# Parameters for the quality check following manual corrections
qc_idx <- paste0(qc_nonconsistent_name_hospitfu, "b")
qc_export_label <- "nonconsistent_names_hospitfu_with_matched_names"

cat(knitr::knit_child('database_export_sub_corrections.Rmd',
                      envir = environment(),
                      quiet = TRUE))

# corrected_df and n_mc are not assigned in this section; presumably they are
# set by the corrections child document — confirm.
hospit_data <- corrected_df
n_edit_inconsistent_name_hospitfu_records <- n_mc
# Second round of manual corrections (index suffix "c"): record deletion.
mc_description <- "Delete records where it cannot be ascertained that the child enrolled and followed-up are the same individual (manual evaluation)."
to_correct_df <- hospit_data
correction_type <- "delete_hospit_records"

# Parameters for the quality check following manual corrections
qc_idx <- paste0(qc_nonconsistent_name_hospitfu, "c")
qc_rule <- "Remaining records with inconsistent names are kept in the database (i.e. some similarity have been found despite the low automated score)."
qc_export_label <- "nonconsistent_names_hospitfu_with_matched_names"
cleaning <- "none"

cat(knitr::knit_child('database_export_sub_corrections.Rmd',
                      envir = environment(),
                      quiet = TRUE))

hospit_data <- corrected_df
n_drop_inconsistent_name_hospitfu_records <- n_mc

The following checks will focus solely on records where the child has been found in hospital records, i.e. records where the variable found is set to 1.

# Keep only the follow-ups where the child was found in hospital records
# (found == 1), and record whether any such row exists.
found_hospit_data <- dplyr::filter(hospit_data, found == 1)
found_hospit_data_is_not_empty <- timci::is_not_empty(found_hospit_data)

Duplicate management [Compliance check `r qc_duplicated_hospitfu`]

# Duplicate check on successful follow-ups: progress message, then parameters
# for the child document, which is knitted with envir = environment() and so
# presumably reads these variables by name — confirm.
write(" o Duplicate management", stderr())
# NOTE(review): as written this message is printed unconditionally, right
# before the check runs. Presumably it belongs to a separate chunk evaluated
# only when found_hospit_data is empty — confirm against the chunk options.
cat("The dataset is empty. No check performed")
qc_description <- "It is possible to have more than one successful follow-up records (**found** = 1) available for the same participant."
qc_rule <- action_alert_no_modification
qc_type <- "duplicates"
df <- found_hospit_data
col_id <- "child_id"
col_date <- "start"
# Alert only: no cleaning is requested for this check.
cleaning <- "none"
qc_text <- "duplicated IDs"
qc_idx <- qc_duplicated_hospitfu
qc_export_label <- "duplicated_hospitfu"
qc_export_description <- "successful hospitalisation follow-ups are duplicated"

cat(knitr::knit_child('database_export_sub_quality_check.Rmd',
                      envir = environment(),
                      quiet = TRUE))

# n_detected is not assigned in this section; presumably set by the child
# document — confirm.
n_duplicates_hospitfu <- n_detected

Dates and times

Invalid date of hospitalisation [Context check r qc_hospit_before_enrolment]

# Hospitalisation date check: progress message, then parameters for the child
# document, which is knitted with envir = environment() and so presumably
# reads these variables by name — confirm.
write(" o Invalid date of hospitalisation", stderr())
# NOTE(review): as written this message is printed unconditionally, right
# before the check runs. Presumably it belongs to a separate chunk evaluated
# only when found_hospit_data is empty — confirm against the chunk options.
cat("The dataset is empty. No check performed")
qc_description <- "The reported hospital visit should have happened between enrolment at Day 0 and the hospitalisation follow-up."
qc_rule <- action_alert_no_modification
qc_type <- "date_discrepancy"
# Left merge (all.x = TRUE): every found follow-up is kept and gains `fid` and
# `date_visit` from allday0_data, matched on child_id.
df <- found_hospit_data %>% 
  merge(allday0_data %>%
          dplyr::select(child_id,
                        fid,
                        date_visit),
        by = "child_id",
        all.x = TRUE)
# Date columns compared by the check: `date_visit_rhf` vs. `date_visit`.
# `cleaning` is not set here; it keeps the value assigned earlier in the
# document.
col_date1 <- "date_visit_rhf"
col_date2 <- "date_visit"
qc_text <- "a date of hospitalisation before the enrolment date"
qc_idx <- qc_hospit_before_enrolment
qc_export_label <- "hospit_before_enrolment"
qc_export_description <- "the reported date of hospitalisation was before the enrolment date"

cat(knitr::knit_child('database_export_sub_quality_check.Rmd',
                      envir = environment(),
                      quiet = TRUE))
# n_detected is not assigned in this section; presumably set by the child
# document — confirm.
n_hospit_prior_day0_hospitfu <- n_detected

Invalid date of death [Context check r qc_death_before_enrolment]

# Death date check: progress message, then parameters for the child document,
# which is knitted with envir = environment() and so presumably reads these
# variables by name — confirm.
write(" o Invalid date of death", stderr())
# NOTE(review): as written this message is printed unconditionally, right
# before the check runs. Presumably it belongs to a separate chunk evaluated
# only when found_hospit_data is empty — confirm against the chunk options.
cat("The dataset is empty. No check performed")
qc_description <- "The reported death should have happened between enrolment at Day 0 and the hospitalisation follow-up."
qc_rule <- action_alert_no_modification
qc_type <- "date_discrepancy"
# Left merge (all.x = TRUE): every found follow-up is kept and gains `fid` and
# `date_visit` from allday0_data, matched on child_id.
df <- found_hospit_data %>% 
  merge(allday0_data %>%
          dplyr::select(child_id,
                        fid,
                        date_visit),
        by = "child_id",
        all.x = TRUE)
# Date columns compared by the check: `date_death` vs. `date_visit`.
# `cleaning` is not set here; it keeps the value assigned earlier in the
# document.
col_date1 <- "date_death"
col_date2 <- "date_visit"
qc_text <- "a date of death before the enrolment date"
qc_idx <- qc_death_before_enrolment
qc_export_label <- "death_before_enrolment"
qc_export_description <- "the reported date of death was before the enrolment date"
cat(knitr::knit_child('database_export_sub_quality_check.Rmd',
                      envir = environment(),
                      quiet = TRUE))
# n_detected is not assigned in this section; presumably set by the child
# document — confirm.
n_death_prior_day0_hospitfu <- n_detected

Invalid date of discharge [Context check r qc_discharge_before_enrolment]

# Discharge date check: progress message, then parameters for the child
# document, which is knitted with envir = environment() and so presumably
# reads these variables by name — confirm.
write(" o Invalid date of discharge", stderr())
# NOTE(review): as written this message is printed unconditionally, right
# before the check runs. Presumably it belongs to a separate chunk evaluated
# only when found_hospit_data is empty — confirm against the chunk options.
cat("The dataset is empty. No check performed")
# The description refers to the discharge, which is what this check compares
# (it previously repeated the wording of the hospital visit check).
qc_description <- "The reported discharge should have happened between enrolment at Day 0 and the hospitalisation follow-up."
qc_rule <- action_alert_no_modification
qc_type <- "date_discrepancy"
# Left merge (all.x = TRUE): every found follow-up is kept and gains `fid` and
# `date_visit` from allday0_data, matched on child_id.
df <- found_hospit_data %>%
  merge(allday0_data %>%
          dplyr::select(child_id,
                        fid,
                        date_visit),
        by = "child_id",
        all.x = TRUE)
# Date columns compared by the check: `date_discharge` vs. `date_visit`.
# `cleaning` is not set here; it keeps the value assigned earlier in the
# document.
col_date1 <- "date_discharge"
col_date2 <- "date_visit"
qc_text <- "a date of discharge before the enrolment date"
qc_idx <- qc_discharge_before_enrolment
qc_export_label <- "discharge_before_enrolment"
qc_export_description <- "the reported date of discharge was before the enrolment date"
cat(knitr::knit_child('database_export_sub_quality_check.Rmd',
                      envir = environment(),
                      quiet = TRUE))
# n_detected is not assigned in this section; presumably set by the child
# document — confirm.
n_discharge_prior_day0_hospitfu <- n_detected

Pseudonymisation

# Progress message for the console/log (stderr).
write(" o Pseudonymisation", file = stderr())
# Dictionary rows flagged deidentified == 0: the variables left out of the
# de-identified export.
hospitfu_pii_drops <- dplyr::filter(hospitfu_dictionary, deidentified == 0)

There are `r nrow(hospitfu_pii_drops)` variables deleted from the anonymised cleaned `r db_name` database.

# Table listing the variables dropped from the cleaned export.
knitr::kable(
  dplyr::select(hospitfu_pii_drops, new),
  row.names = FALSE,
  col.names = c("Database reference"),
  caption = "Columns dropped for the cleaned data export"
)
# Dictionary rows flagged deidentified == 1: the variables that are kept.
hospitfu_deidentified_dict <- dplyr::filter(hospitfu_dictionary, deidentified == 1)

There are `r nrow(hospitfu_deidentified_dict)` variables exported from the cleaned anonymised `r db_name` database.

# Keep only the columns listed in the de-identified dictionary. any_of() skips
# dictionary names that are absent from hospit_data instead of failing.
hospit_data_nopseudo <- dplyr::select(
  hospit_data,
  dplyr::any_of(c(hospitfu_deidentified_dict$new))
)

Pseudonymisation is performed using a cryptographic hash function (md5) that takes strings as input and produces a random-like fixed-length output.

# Replace uuid, child_id and device_id by their hash, one row at a time
# (rowwise), leaving empty strings empty. The algorithm is taken from
# crypto_algo, which is defined outside this section.
# NOTE(review): digest() is called unqualified, so the digest package is
# presumably attached elsewhere — confirm.
# NOTE(review): digest() is called with its default arguments; per the digest
# documentation the default serialize = TRUE hashes the serialized R object
# rather than the bare string — confirm this is intended before comparing
# hashes with other tools.
hospit_data_no_pii <- hospit_data_nopseudo %>%
  dplyr::rowwise() %>%
  dplyr::mutate(uuid = ifelse(uuid != "", digest(uuid, algo = crypto_algo), ""),
                child_id = ifelse(child_id != "", digest(child_id, algo = crypto_algo), ""),
                device_id = ifelse(device_id != "", digest(device_id, algo = crypto_algo), "")) %>%
  dplyr::ungroup()

Data cleaning summary

# Data cleaning flowchart: number of raw records, the counters collected by the
# checks above (passed by position), then the number of records remaining in
# hospit_data.
timci::create_hospit_qc_flowchart(
  nrow(raw_hospit_data),
  n_cdsa_pilot_hospitfu_records,
  n_afterlock_pids_hospitfu_records,
  n_nonvalid_pids_hospitfu,
  n_edit_nonvalid_pid_hospitfu_records,
  n_drop_nonvalid_pid_hospitfu_records,
  n_inconsistent_names_day7fu_records,
  n_edit_inconsistent_name_hospitfu_records,
  n_drop_inconsistent_name_hospitfu_records,
  n_duplicates_hospitfu,
  n_death_prior_day0_hospitfu,
  n_hospit_prior_day0_hospitfu,
  n_discharge_prior_day0_hospitfu,
  nrow(hospit_data)
)

Data overview

# Coerce column types for the overview (the column sets are disjoint), then
# print a skimr summary of the de-identified data.
hospit_data_no_pii <- hospit_data_no_pii %>%
  dplyr::mutate(
    # Categorical variables
    dplyr::across(
      c(
        device_id, form_version, rhf_id, rhf_name, sex_hf, dob_knwn_hf, found,
        time_visit_hf, rr, spo2, hospit, ward, o2, o2_type, dx_admission,
        adm_outcome, dx_discharge, o2_duration, death_cause, facility_trans
      ),
      factor
    ),
    # Numeric measurements
    dplyr::across(c(spo2_meas, rr_meas), as.numeric),
    # Dates rendered as "YYYY-MM-DD" strings
    dplyr::across(
      c(date, date_visit_rhf, date_discharge, date_death),
      ~format(as.Date(.), "%Y-%m-%d")
    ),
    # NOTE(review): as.Date() discards the time of day, so the "%H:%M:%S" part
    # of this format always renders as 00:00:00 — confirm whether the time of
    # `start` is meant to be kept.
    dplyr::across(start, ~format(as.Date(.), "%Y-%m-%d %H:%M:%S")),
    # Free-text fields
    dplyr::across(
      c(dx_admission_oth, dx_discharge_oth, facility_trans_oth, ward_oth),
      as.character
    )
  )
skimr::skim(hospit_data_no_pii)

Data export

# Export the three versions of the dataset, then free memory.
# NOTE(review): all three exports use the same index "05a", and the first and
# third share the base name "timci_followup_hospit_data" (they go to different
# directories: rctls_dir vs. locked_db_dir) — confirm this is intended.
timci::dataset_export(
  raw_hospit_data,
  "05a",
  "timci_followup_hospit_data",
  rctls_dir,
  "Raw hospitalisation data"
)
timci::dataset_export(
  hospit_data_nopseudo,
  "05a",
  "timci_cleaned_followup_hospit_data_without_pseudonymisation",
  rctls_dir,
  "Cleaned hospitalisation data"
)
timci::dataset_export(
  hospit_data_no_pii,
  "05a",
  "timci_followup_hospit_data",
  locked_db_dir,
  "Cleaned de-identified hospitalisation data"
)
# Drop the datasets that still contain identifying columns and trigger a
# garbage collection.
rm(raw_hospit_data, hospit_data_nopseudo)
gc()


Thaliehln/timci documentation built on April 8, 2024, 3:38 p.m.