# Chunk defaults for this section: do not echo the source code and do not
# print warnings or messages in the rendered output.
knitr::opts_chunk$set(
  echo = FALSE,
  warning = FALSE,
  message = FALSE
)

[Check description:]{custom-style="underlined"} `r qc_description`

[Action:]{custom-style="underlined"} `r qc_rule`

# ---- Initialise the state used by the quality check ----

# Result holders, all NULL until a check assigns them
qc_df <- qc_df2 <- cleaned_df <- NULL
qc_reuse_df <- NULL # reuse non de-identified dataframe for subsequent checks

# Column names and number of rows of the input data frame
cols <- colnames(df)
n_df <- nrow(df)

# Counters and status sentence, reset to their neutral values
n_detected <- 0
n_to_remove <- 0
cleaned_df_status_update <- ""

# Check types that are treated as duplicate checks
duplicate_checks <- c(
  "duplicates",
  "duplicates_with_names",
  "true_duplicates",
  "day0_duplicates",
  "repeat_duplicates"
)
is_duplicate_check <- qc_type %in% duplicate_checks
# Facility-based filtering: timci::remove_facilities_for_other_studies() is
# called on df with all_facilities and excluded_facilities. Element 1 of the
# returned list is stored as qc_df and element 2 as cleaned_df
# (presumably the flagged records and the cleaned data -- confirm in timci).
out <- timci::remove_facilities_for_other_studies(df,
                                                  all_facilities,
                                                  excluded_facilities)
qc_df <- out[[1]]
cleaned_df <- out[[2]]
# Same pattern with timci::allocate_screening_facility2() and
# research_facilities.
# NOTE(review): as laid out here this call overwrites the qc_df and
# cleaned_df assigned just above -- presumably only one of the two steps is
# meant to run for a given check; confirm.
out <- timci::allocate_screening_facility2(df,
                                           research_facilities)
qc_df <- out[[1]]
cleaned_df <- out[[2]]
# Disabled column selection, kept for reference:
# outcols <- c("date_visit", "child_id", "fid_from_device", "child_id_manual", "device_id", "uuid")
# qc_df <- qc_df %>%
#   dplyr::select(outcols)
# Number of whole days between start_date and each visit date
# (negative when the visit is dated before start_date)
df <- df %>%
  dplyr::mutate(
    study_start_date = start_date,
    days_since_start = floor(difftime(as.Date(date_visit, "%Y-%m-%d"),
                                      study_start_date,
                                      units = "days"))
  )

# Records dated before the start date, ordered by visit date
qc_df <- df %>%
  dplyr::filter(days_since_start < 0) %>%
  dplyr::select(date_visit, child_id, uuid, days_since_start) %>%
  dplyr::arrange(as.Date(date_visit, "%Y-%m-%d"))

# Records dated on or after the start date, ordered by visit date
cleaned_df <- df %>%
  dplyr::filter(days_since_start >= 0) %>%
  dplyr::arrange(as.Date(date_visit, "%Y-%m-%d"))
# Table of the research facilities that have a non-missing facility_start.
# filter() is namespace-qualified like the other dplyr calls in this file, so
# it cannot resolve to stats::filter() when dplyr is not attached.
research_facilities %>%
  dplyr::select(facility_id,
                facility_start) %>%
  dplyr::filter(!is.na(facility_start)) %>%
  knitr::kable()
# Attach each facility's start date to the records by matching
# fid_from_device against research_facilities$facility_id, then compute the
# number of whole days between that start date and the visit date.
# all.x = TRUE keeps the records whose facility has no row in
# research_facilities: the default inner join dropped them from df (and so
# from cleaned_df) without listing them in qc_df. They now get an NA
# facility_start and are covered by the NA handling below.
# NOTE(review): assumes one row per facility_id in research_facilities;
# duplicated IDs would duplicate records in the merge -- confirm.
df <- df %>%
  merge(research_facilities %>%
          dplyr::select(facility_id,
                        facility_start),
        by.x = "fid_from_device",
        by.y = "facility_id",
        all.x = TRUE) %>%
  dplyr::mutate(days_since_start = floor(difftime(as.Date(date_visit, "%Y-%m-%d"),
                                                  facility_start,
                                                  units = "days")))

# Records dated before their facility's start date, ordered by visit date
qc_df <- df %>%
  dplyr::filter(!is.na(days_since_start) & days_since_start < 0) %>%
  dplyr::select(date_visit, child_id, uuid, days_since_start) %>%
  dplyr::arrange(as.Date(date_visit, "%Y-%m-%d"))

# Records kept: no computable day difference, or dated on/after the start
cleaned_df <- df %>%
  dplyr::filter(is.na(days_since_start) | days_since_start >= 0) %>%
  dplyr::arrange(as.Date(date_visit, "%Y-%m-%d"))
# Number of whole days between lock_date and each visit date
# (positive when the visit is dated after lock_date)
df <- df %>%
  dplyr::mutate(
    study_lock_date = lock_date,
    days_to_lock = floor(difftime(as.Date(date_visit, "%Y-%m-%d"),
                                  study_lock_date,
                                  units = "days"))
  )

# Records dated after the lock date, ordered by visit date
qc_df <- df %>%
  dplyr::filter(days_to_lock > 0) %>%
  dplyr::select(date_visit, child_id, uuid, days_to_lock) %>%
  dplyr::arrange(as.Date(date_visit, "%Y-%m-%d"))

# Records dated on or before the lock date, ordered by visit date
cleaned_df <- df %>%
  dplyr::filter(days_to_lock <= 0) %>%
  dplyr::arrange(as.Date(date_visit, "%Y-%m-%d"))
# Call timci::identify_ids_outside_lock_range() on df, using idcol as the ID
# column, day0_data as the Day 0 reference and the [start_date, lock_date]
# window. Only element 1 of the returned list is used (stored as qc_df);
# cleaned_df is not assigned here.
out <- timci::identify_ids_outside_lock_range(df = df,
                                              col_id = idcol,
                                              day0_df = day0_data,
                                              start_date = start_date,
                                              end_date = lock_date)
qc_df <- out[[1]]
# Records flagged as enrolled (enrolled == 1) although cg_eligibility == 0.
# which() keeps only the rows where the condition is TRUE: indexing with the
# raw logical vector turned every NA (e.g. a missing cg_eligibility) into an
# all-NA phantom row in qc_df, whose NA uuid could then match NA uuids below.
qc_df <- df[which(df$enrolled == 1 & df$cg_eligibility == 0), ]

# In the cleaned data, reset consent and enrolment to 0 for the flagged records
cleaned_df <- df
cleaned_df$consent[cleaned_df$uuid %in% qc_df$uuid] <- 0
cleaned_df$enrolled[cleaned_df$uuid %in% qc_df$uuid] <- 0
# Non-valid ID checks: each step compares column idcol1 of df against column
# idcol2 of refdf through a timci helper and reads the returned list.
# NOTE(review): as laid out here each step overwrites the results of the
# previous one -- presumably only one runs for a given check; confirm.

# Variant that only reports: element 1 -> qc_df, cleaned_df left untouched
out <- timci::identify_nonvalid_ids(df,
                                    idcol1,
                                    refdf,
                                    idcol2)
qc_df <- out[[1]]
# Same call, also taking element 2 as cleaned_df
out <- timci::identify_nonvalid_ids(df,
                                    idcol1,
                                    refdf,
                                    idcol2)
qc_df <- out[[1]]
cleaned_df <- out[[2]]
# Same pattern with timci::identify_nonvalid_ids2()
out <- timci::identify_nonvalid_ids2(df,
                                     idcol1,
                                     refdf,
                                     idcol2)
qc_df <- out[[1]]
cleaned_df <- out[[2]]
# Same pattern with timci::identify_nonvalid_ids_flagged()
out <- timci::identify_nonvalid_ids_flagged(df,
                                            idcol1,
                                            refdf,
                                            idcol2)
qc_df <- out[[1]]
cleaned_df <- out[[2]]
# Non-valid ID check with name matching: calls
# timci::identify_nonvalid_ids_with_matched_names() with df/idcol1, the
# reference refdf/idcol2, the date column datecol, the bounds ldate_diff and
# udate_diff, and the matched_names and cleaning settings.
# Element 1 of the result -> qc_df, element 2 -> cleaned_df.
out <- timci::identify_nonvalid_ids_with_matched_names(df1 = df,
                                                       col_id1 = idcol1,
                                                       df2 = refdf,
                                                       col_id2 = idcol2,
                                                       col_date1 = datecol,
                                                       ldate_diff = ldate_diff,
                                                       udate_diff = udate_diff,
                                                       matched_names = matched_names,
                                                       cleaning = cleaning)
qc_df <- out[[1]]
cleaned_df <- out[[2]]
# Inconsistent date checks via timci::detect_inconsistent_dates(), using
# col_date1 as the start date column and col_date2 as the end date column.
# `cleaning` is passed positionally after the named arguments.
# Element 1 of the result -> qc_df, element 2 -> cleaned_df.
out <- timci::detect_inconsistent_dates(df,
                                        col_date_start = col_date1,
                                        col_date_end = col_date2,
                                        cleaning)
qc_df <- out[[1]]
cleaned_df <- out[[2]]
# Same call with the additional list_of_cols = fu_cols argument
out <- timci::detect_inconsistent_dates(df,
                                        col_date_start = col_date1,
                                        col_date_end = col_date2,
                                        cleaning,
                                        list_of_cols = fu_cols)
qc_df <- out[[1]]
cleaned_df <- out[[2]]
# Duplicate checks: each step calls one timci helper and reads the returned
# list (element 1 -> qc_df, element 2 -> cleaned_df, and element 3 -> qc_df2
# for the first three helpers).
# NOTE(review): as laid out here each step overwrites the results of the
# previous one -- presumably only one runs for a given check; confirm.

# Duplicates identified from the ID column col_id and date column col_date
out <- timci::identify_duplicates_by_dates(df,
                                           col_id,
                                           col_date,
                                           cleaning)
qc_df <- out[[1]]
cleaned_df <- out[[2]]
qc_df2 <- out[[3]]
# Same arguments with timci::identify_duplicates_with_names()
out <- timci::identify_duplicates_with_names(df,
                                             col_id,
                                             col_date,
                                             cleaning)
qc_df <- out[[1]]
cleaned_df <- out[[2]]
qc_df2 <- out[[3]]
# Day 0 duplicates, with ref_df passed as the Day 7 follow-up data
out <- timci::identify_day0_duplicates_and_fu(df = df,
                                              day7fu_df = ref_df,
                                              cleaning = cleaning)
qc_df <- out[[1]]
cleaned_df <- out[[2]]
qc_df2 <- out[[3]]
# timci::identify_repeat_duplicate(): only elements 1 and 2 are read
out <- timci::identify_repeat_duplicate(df,
                                        col_id,
                                        col_date,
                                        cleaning)
qc_df <- out[[1]]
cleaned_df <- out[[2]]
# timci::identify_true_duplicate(): only elements 1 and 2 are read
out <- timci::identify_true_duplicate(df,
                                      col_id,
                                      col_date,
                                      cleaning)
qc_df <- out[[1]]
cleaned_df <- out[[2]]
# Records whose fid is filled in and differs from both fid_from_device and
# fid_from_main_device
qc_df <- df %>%
  dplyr::filter(fid != "" & !is.na(fid),
                fid != fid_from_device,
                fid != fid_from_main_device) # This row has been added for the Tanzania
# When the input data has a longitude column, pass the flagged records
# through timci::find_closest_facility()
if ("longitude" %in% cols) {
  qc_df <- qc_df %>%
    timci::find_closest_facility(research_facilities)
}

# Output columns: the fixed ones first, then every optional column present in
# qc_df (an `if` without `else` yields NULL, which c() drops), then the IDs.
# district_ra is added together with fid_ra without its own presence check.
qccols <- colnames(qc_df)
outcols <- c("date_visit",
             "child_id",
             "fid",
             "district",
             "fid_from_device",
             "fid_from_main_device",
             if ("facility_id" %in% qccols) "facility_id",
             if ("fid_ra" %in% qccols) c("fid_ra", "district_ra"),
             if ("child_id_manual" %in% qccols) "child_id_manual",
             if ("latitude" %in% qccols) "latitude",
             if ("longitude" %in% qccols) "longitude",
             if ("gps_accuracy" %in% qccols) "gps_accuracy",
             if ("dist" %in% qccols) "dist",
             "uuid",
             "device_id")

# all_of() states explicitly that outcols is an external character vector
# (a bare vector inside select() is deprecated and ambiguous)
qc_df <- qc_df %>%
  dplyr::select(dplyr::all_of(outcols)) %>%
  dplyr::arrange(fid)
# Records where fid and fid_ra are both filled in and differ
qc_df <- df %>%
  dplyr::filter(fid != "" & !is.na(fid),
                fid_ra != "" & !is.na(fid_ra),
                fid != fid_ra)
# When the input data has a longitude column, pass the flagged records
# through timci::find_closest_facility() and keep those with a non-empty
# facility_id
if ("longitude" %in% cols) {
  qc_df <- qc_df %>%
    timci::find_closest_facility(research_facilities) %>%
    dplyr::filter(facility_id != "" & !is.na(facility_id))
}

# Output columns: the fixed ones first, then every optional column present in
# qc_df (an `if` without `else` yields NULL, which c() drops), then the IDs
qccols <- colnames(qc_df)
outcols <- c("date_visit",
             "child_id",
             "fid",
             "district",
             if ("fid_from_device" %in% qccols) "fid_from_device",
             if ("fid_from_main_device" %in% qccols) "fid_from_main_device",
             if ("fid_ra" %in% qccols) "fid_ra",
             if ("district_ra" %in% qccols) "district_ra",
             if ("facility_id" %in% qccols) "facility_id",
             if ("child_id_manual" %in% qccols) "child_id_manual",
             if ("latitude" %in% qccols) "latitude",
             if ("longitude" %in% qccols) "longitude",
             if ("gps_accuracy" %in% qccols) "gps_accuracy",
             if ("dist" %in% qccols) "dist",
             "uuid",
             "device_id")

# all_of() states explicitly that outcols is an external character vector
# (a bare vector inside select() is deprecated and ambiguous)
qc_df <- qc_df %>%
  dplyr::select(dplyr::all_of(outcols)) %>%
  dplyr::arrange(fid)
# Records whose fid is filled in and differs from fid_from_main_device
qc_df <- df %>%
  dplyr::filter(!is.na(fid) & fid != "") %>%
  dplyr::filter(fid != fid_from_main_device)
# When the input data has a longitude column, pass the flagged records
# through timci::find_closest_facility()
if ("longitude" %in% cols) {
  qc_df <- qc_df %>%
    timci::find_closest_facility(research_facilities)
}

# Output columns: the fixed ones first, then every optional column present in
# qc_df (an `if` without `else` yields NULL, which c() drops), then the IDs.
# district_ra is added together with fid_ra without its own presence check.
qccols <- colnames(qc_df)
outcols <- c("date_visit",
             "child_id",
             "fid",
             "district",
             "fid_from_device",
             "fid_from_main_device",
             if ("facility_id" %in% qccols) "facility_id",
             if ("fid_ra" %in% qccols) c("fid_ra", "district_ra"),
             if ("child_id_manual" %in% qccols) "child_id_manual",
             if ("latitude" %in% qccols) "latitude",
             if ("longitude" %in% qccols) "longitude",
             if ("gps_accuracy" %in% qccols) "gps_accuracy",
             if ("dist" %in% qccols) "dist",
             "uuid",
             "device_id")

# all_of() states explicitly that outcols is an external character vector
# (a bare vector inside select() is deprecated and ambiguous)
qc_df <- qc_df %>%
  dplyr::select(dplyr::all_of(outcols)) %>%
  dplyr::arrange(fid)
# Records where fid and child_id are both filled in and fid differs from
# characters 3 to 7 of child_id
qc_df <- df %>%
  dplyr::filter(fid != "" & !is.na(fid),
                child_id != "" & !is.na(child_id),
                fid != substr(child_id, 3, 7))

# Output columns; facility_id is included only when present in qc_df
# (an `if` without `else` yields NULL, which c() drops)
qccols <- colnames(qc_df)
outcols <- c("date_visit",
             "child_id",
             "fid",
             "district",
             if ("facility_id" %in% qccols) "facility_id",
             "uuid",
             "device_id")

# all_of() states explicitly that outcols is an external character vector
# (a bare vector inside select() is deprecated and ambiguous)
qc_df <- qc_df %>%
  dplyr::select(dplyr::all_of(outcols)) %>%
  dplyr::arrange(fid)
# Same comparison on the hyphenated column names: records where the facility
# code and the child ID are both filled in and the facility code differs from
# characters 3 to 7 of the child ID
qc_df <- df %>%
  dplyr::filter(`facility_identification-fcode` != "" & !is.na(`facility_identification-fcode`),
                `child_identification-pid` != "" & !is.na(`child_identification-pid`),
                `facility_identification-fcode` != substr(`child_identification-pid`, 3, 7))

# Output columns; facility_id is included only when present in qc_df
# (an `if` without `else` yields NULL, which c() drops)
qccols <- colnames(qc_df)
outcols <- c("date",
             "child_identification-pid",
             "facility_identification-fcode",
             "facility_identification-district",
             if ("facility_id" %in% qccols) "facility_id",
             "meta-instanceID",
             "DeviceID")

# all_of() states explicitly that outcols is an external character vector
# (a bare vector inside select() is deprecated and ambiguous);
# meta-instanceID is then renamed to uuid
qc_df <- qc_df %>%
  dplyr::select(dplyr::all_of(outcols)) %>%
  dplyr::rename(uuid = `meta-instanceID`) %>%
  dplyr::arrange(`facility_identification-fcode`)
# Value checks on column col_value. Each step calls one timci detector and
# keeps child_id, fid and uuid, ordered by fid.
# all_of() states explicitly that outcols is an external character vector
# (a bare vector inside select() is deprecated and ambiguous).
# NOTE(review): as laid out here each step overwrites qc_df from the previous
# one -- presumably only one runs for a given check; confirm.

# timci::detect_missing_value()
qc_df <- timci::detect_missing_value(df, col_value)
outcols <- c("child_id", "fid", "uuid")
qc_df <- qc_df %>%
  dplyr::select(dplyr::all_of(outcols)) %>%
  dplyr::arrange(fid)

# timci::detect_blank_value()
qc_df <- timci::detect_blank_value(df, col_value)
outcols <- c("child_id", "fid", "uuid")
qc_df <- qc_df %>%
  dplyr::select(dplyr::all_of(outcols)) %>%
  dplyr::arrange(fid)

# timci::detect_negative_value()
qc_df <- timci::detect_negative_value(df, col_value)
outcols <- c("child_id", "fid", "uuid")
qc_df <- qc_df %>%
  dplyr::select(dplyr::all_of(outcols)) %>%
  dplyr::arrange(fid)
# Number of steps with a negative duration per PARENT_KEY, largest count
# first. count() only needs PARENT_KEY, so the former select() of the other
# step columns had no effect on the result and is dropped; desc() is
# namespace-qualified like the other dplyr calls in this file.
qc_df <- df %>%
  dplyr::filter(step_dur_sec < 0) %>%
  dplyr::count(PARENT_KEY) %>%
  dplyr::arrange(dplyr::desc(n))

# In the cleaned data, negative durations are replaced by NA
cleaned_df <- df %>%
  dplyr::mutate(step_dur_sec = ifelse(step_dur_sec >= 0, step_dur_sec, NA))
# Missing clinical data checks. Each step calls one timci detector on df.
# all_of() states explicitly that outcols is an external character vector
# (a bare vector inside select() is deprecated and ambiguous).
# NOTE(review): as laid out here each step overwrites qc_df from the previous
# one -- presumably only one runs for a given check; confirm.

# timci::detect_missing_clinical_presentation(): result kept with all columns
qc_df <- timci::detect_missing_clinical_presentation(df)

# timci::detect_missing_diagnosis(): keep the ID columns, ordered by fid
qc_df <- timci::detect_missing_diagnosis(df)
outcols <- c("child_id", "fid", "date_visit", "uuid")
qc_df <- qc_df %>%
  dplyr::select(dplyr::all_of(outcols)) %>%
  dplyr::arrange(fid)

# timci::detect_missing_referral(): keep the ID columns, ordered by fid
qc_df <- timci::detect_missing_referral(df)
outcols <- c("child_id", "fid", "date_visit", "uuid")
qc_df <- qc_df %>%
  dplyr::select(dplyr::all_of(outcols)) %>%
  dplyr::arrange(fid)

# timci::detect_missing_treatment(): keep the ID columns, ordered by fid
qc_df <- timci::detect_missing_treatment(df)
outcols <- c("child_id", "fid", "date_visit", "uuid")
qc_df <- qc_df %>%
  dplyr::select(dplyr::all_of(outcols)) %>%
  dplyr::arrange(fid)
# Records where rx_amoxicillin is missing while rx_misc_oth is filled in, and
# likewise rx_amoxicillin_hf is missing while rx_misc_oth_hf is filled in
# (both conditions must hold). is.na() never returns NA, so the logical index
# cannot introduce NA rows.
qc_df <- df[(is.na(df$rx_amoxicillin) & !is.na(df$rx_misc_oth)) & (is.na(df$rx_amoxicillin_hf) & !is.na(df$rx_misc_oth_hf)),]
# NOTE(review): outcols is assigned but not applied to qc_df in the visible
# code -- confirm whether a select() step is missing.
outcols <- c("child_id", "fid", "date_visit", "uuid")
# Free-text entries of the records where column `col_id` equals 1;
# free_text1 is exposed under the common name free_text
drug_disp_df1 <- df %>%
  dplyr::filter(!!rlang::sym(col_id) == 1) %>%
  dplyr::select(free_text = free_text1,
                uuid,
                submission_date)

# Same extraction for the "<col_id>_hf" column and free_text2
drug_disp_df2 <- df %>%
  dplyr::filter(!!rlang::sym(paste0(col_id, "_hf")) == 1) %>%
  dplyr::select(free_text = free_text2,
                uuid,
                submission_date)

# Stack both extracts
qc_df <- rbind(drug_disp_df1, drug_disp_df2)
# Free-text entries of the records whose `col_id` column, a ";"-separated
# list converted to integers, contains `value`. The test is done row by row,
# hence rowwise(); ungroup() is namespace-qualified like the other dplyr
# calls in this file.
drug_disp_df1 <- df %>%
  dplyr::rowwise() %>%
  dplyr::mutate(selected = ifelse(value %in% as.integer(unlist(strsplit(!!rlang::sym(col_id), split = ";"))),
                                  1,
                                  0)) %>%
  dplyr::ungroup() %>%
  dplyr::filter(selected == 1) %>%
  dplyr::select(free_text1,
                uuid,
                submission_date) %>%
  dplyr::rename(free_text = free_text1)

# Same extraction for the "<col_id>_hf" column and free_text2
drug_disp_df2 <- df %>%
  dplyr::rowwise() %>%
  dplyr::mutate(selected = ifelse(value %in% as.integer(unlist(strsplit(!!rlang::sym(paste0(col_id, "_hf")), split = ";"))),
                                  1,
                                  0)) %>%
  dplyr::ungroup() %>%
  dplyr::filter(selected == 1) %>%
  dplyr::select(free_text2,
                uuid,
                submission_date) %>%
  dplyr::rename(free_text = free_text2)

# Stack both extracts
qc_df <- rbind(drug_disp_df1,
               drug_disp_df2)
# Call timci::detect_drug() on df with the drug value column, the drug
# free-text column and the vector drug_vec
qc_df <- timci::detect_drug(df,
                            drug_val_col,
                            drug_text_col,
                            drug_vec)

# Keep the ID columns plus the two drug columns named by drug_val_col and
# drug_text_col. all_of() states explicitly that outcols is an external
# character vector (a bare vector inside select() is deprecated and ambiguous).
outcols <- c("child_id",
             "fid",
             "date_visit",
             drug_val_col,
             drug_text_col,
             "uuid")
qc_df <- qc_df %>%
  dplyr::select(dplyr::all_of(outcols))
# Name consistency check between visits: calls
# timci::detect_inconsistent_names_between_visits() with refdf as reference
# data and df as follow-up data, the date column datecol, the bounds
# ldate_diff and udate_diff, and the matched_names and cleaning settings.
# Element 1 of the result -> qc_df, element 2 -> cleaned_df.
out <- timci::detect_inconsistent_names_between_visits(refdf = refdf,
                                                       fudf = df,
                                                       col_date = datecol,
                                                       ldate_diff = ldate_diff,
                                                       udate_diff = udate_diff,
                                                       matched_names = matched_names,
                                                       cleaning = cleaning)
qc_df <- out[[1]]
cleaned_df <- out[[2]]
# Same call with repeats = TRUE
out <- timci::detect_inconsistent_names_between_visits(refdf = refdf,
                                                       fudf = df,
                                                       col_date = datecol,
                                                       ldate_diff = ldate_diff,
                                                       udate_diff = udate_diff,
                                                       matched_names = matched_names,
                                                       cleaning = cleaning,
                                                       repeats = TRUE)
qc_df <- out[[1]]
cleaned_df <- out[[2]]
# Records with a positive `edits` value
qc_df <- dplyr::filter(df, edits > 0)
# Give the two hyphenated columns short names. rename() is namespace-qualified
# like the other dplyr calls in this file, so it does not depend on dplyr
# being attached.
df <- df %>%
  dplyr::rename(child_id = "child_identification-pid",
                back_from_lab = "child_identification-back_from_lab")

# Split the records on the back_from_lab flag (0 vs 1)
df1 <- df %>%
  dplyr::filter(back_from_lab == 0)
df2 <- df %>%
  dplyr::filter(back_from_lab == 1)

# back_from_lab == 1 records whose child_id has no back_from_lab == 0 record
qc_df <- df2[!df2$child_id %in% df1$child_id, ]
# Default count: one detection per row of qc_df
n_detected <- nrow(qc_df)

# Alternative count based on the date columns of qc_df: columns whose name
# contains "date", excluding those whose name contains "_date"
remove_date_cols <- grep("_date", names(qc_df))
date_cols <- setdiff(grep("date", names(qc_df)), remove_date_cols)

# Every non-missing value in those columns counts as one detection
x <- unname(unlist(qc_df[colnames(qc_df)[date_cols]]))
n_detected <- sum(!is.na(x))
n_to_remove <- n_detected - nrow(qc_df)

[Check output:]{custom-style="underlined"} The check has detected `r n_detected` record(s) (i.e. `r sprintf('%.1f', 100 * n_detected / n_df)` % of all records) with `r qc_text` in the `r db_name` database.

# Display flags, off by default
partial_disp <- FALSE
full_disp <- FALSE
qc_df_is_not_empty <- timci::is_not_empty(qc_df)

# Export the detected records through timci::quality_check_export()
timci::quality_check_export(df = qc_df,
                            idx = qc_idx,
                            label = qc_export_label,
                            cdir = qc_dir,
                            description = qc_export_description)

# Keep a copy of qc_df for later reuse
qc_reuse_df <- qc_df

# More than 10 rows -> partial display, otherwise full display.
# && short-circuits, so the flags stay a single FALSE when qc_df is empty or
# NULL instead of becoming logical(0) through `FALSE & (NULL > 10)`.
# Assumes timci::is_not_empty() returns a single TRUE/FALSE -- confirm.
partial_disp <- qc_df_is_not_empty && (nrow(qc_df) > 10)
full_disp <- qc_df_is_not_empty && (nrow(qc_df) <= 10)
cols <- colnames(qc_df)

# Drop every column whose name contains "name".
# all_of() states explicitly that names_to_remove is an external vector of
# positions (a bare vector inside select() is deprecated and ambiguous).
names_to_remove <- grep("name", colnames(qc_df))
if (length(names_to_remove) > 0) {
  qc_df <- qc_df %>%
    dplyr::select(-dplyr::all_of(names_to_remove))
}

# For wide results (more than 10 columns), except for the "date_discrepancy"
# check, keep only a few key columns. Both operands are scalars, hence &&.
reformat_cond <- (length(qc_df) > 10) && (qc_type != "date_discrepancy")
if (reformat_cond) {
  # `cols` holds the column names before the "name" columns were dropped;
  # the reduction only applies when date_visit is among them
  if ("date_visit" %in% cols) {
    kcols <- c("date_visit")
    # Child identifier: child_id if available, otherwise prev_id
    if ("child_id" %in% cols) {
      kcols <- c(kcols, "child_id")
    } else if ("prev_id" %in% cols) {
      kcols <- c(kcols, "prev_id")
    }
    # Facility identifier: fid if available, otherwise fid_from_device
    if ("fid" %in% cols) {
      kcols <- c(kcols, "fid")
    } else if ("fid_from_device" %in% cols) {
      kcols <- c(kcols, "fid_from_device")
    }
    qc_df <- qc_df %>%
      dplyr::select(dplyr::all_of(kcols))
  }
}
# The tables show at most the first five columns of qc_df.
# seq_len() yields an empty selection when qc_df has no column, whereas
# 1:min(n_cols, 5) would expand to c(1, 0); select() is namespace-qualified
# like the other dplyr calls in this file.
n_cols <- length(qc_df)

# First five rows
qc_df %>%
  dplyr::select(dplyr::all_of(seq_len(min(n_cols, 5)))) %>%
  head(5) %>%
  knitr::kable(row.names = FALSE,
               caption = "Five first rows")

# Last five rows
qc_df %>%
  dplyr::select(dplyr::all_of(seq_len(min(n_cols, 5)))) %>%
  tail(5) %>%
  knitr::kable(row.names = FALSE,
               caption = "Five last rows")

# All rows
qc_df %>%
  dplyr::select(dplyr::all_of(seq_len(min(n_cols, 5)))) %>%
  knitr::kable(row.names = FALSE)
# Number of records in the cleaned data and the matching status sentence:
# one wording when records were removed (fewer rows than the input),
# another when the row count did not decrease.
n_cleaned_df <- nrow(cleaned_df)
cleaned_df_status_update <- ifelse(
  n_cleaned_df < n_df,
  paste0("After deletion of all the records detected by this check, there are now **",
         n_cleaned_df,
         "** record(s) in the ",
         db_name,
         " database."),
  paste0("There are still **",
         n_cleaned_df,
         "** record(s) in the ",
         db_name,
         " database.")
)

`r cleaned_df_status_update`



Thaliehln/timci documentation built on April 8, 2024, 3:38 p.m.