# R/clean_timci_data.R

# Defines functions correct_tf_inconsistent_facilities correct_spo2_values selective_multi_replace selective_replace correct_day0_drug_data correct_spa_sco_all correct_spa_sco_fids correct_spa_sco_hcp_ids correct_hospit_ids correct_day28_all correct_day28_duplicates correct_day7_all correct_day7_duplicates correct_day0_all correct_repeat_ids delete_repeat_records delete_day0_records edit_day0_to_repeat edit_day0_child_ids_spa_sco edit_day0_child_ids correct_day0_inconsistent_facilities correct_day0_non_valid_facilities correct_device_ids

# Documented in correct_day0_all correct_day0_drug_data correct_day0_inconsistent_facilities correct_day0_non_valid_facilities correct_day28_all correct_day28_duplicates correct_day7_all correct_day7_duplicates correct_device_ids correct_hospit_ids correct_repeat_ids correct_spa_sco_all correct_spa_sco_fids correct_spa_sco_hcp_ids correct_spo2_values correct_tf_inconsistent_facilities delete_day0_records delete_repeat_records edit_day0_child_ids edit_day0_child_ids_spa_sco edit_day0_to_repeat selective_multi_replace selective_replace

#' Correct device IDs in Day 0 data entries (TIMCI-specific function)
#'
#' Reads the country-specific device ID correction file from the
#' `extdata/cleaning` folder of the `timci` package (a file is only defined
#' for Tanzania) and replaces `device_id` by `new_device_id` for every record
#' whose (`device_id`, `uuid`) pair matches an (`old_device_id`, `uuid`) pair
#' of the file. The country is read from the `TIMCI_COUNTRY` environment
#' variable.
#'
#' @param df dataframe that contains at least the columns `device_id` and `uuid`
#' @return This function returns a list of 3 elements: the dataframe with
#'   corrections, the edits read from the correction file, and the merged
#'   records whose `device_id` was an empty string before the correction. The
#'   last two elements are `NULL` when no correction file is found.
#' @import dplyr
#' @export
correct_device_ids <- function(df) {

  country <- Sys.getenv("TIMCI_COUNTRY")
  csv_filename <- switch(country,
                         Tanzania = "day0_deviceid_correction_from_field_tanzania.csv",
                         "")

  # Guard clause: no correction file is defined for this country
  if (csv_filename == "") {
    return(list(df, NULL, NULL))
  }

  csv_pathname <- system.file(file.path("extdata", "cleaning", csv_filename),
                              package = "timci")

  # Guard clause: the correction file is not shipped with the package
  if (!file.exists(csv_pathname)) {
    return(list(df, NULL, NULL))
  }

  edits <- readr::read_csv(csv_pathname, show_col_types = FALSE)

  # Left join: every record is kept, matched ones gain `new_device_id`
  merged <- merge(df,
                  edits[, c("old_device_id", "uuid", "new_device_id")],
                  by.x = c("device_id", "uuid"),
                  by.y = c("old_device_id", "uuid"),
                  all.x = TRUE)

  # Records with an empty device ID, captured before the correction is applied
  empty_id_records <- dplyr::filter(merged, device_id == "")

  # Apply the correction wherever an edit was matched
  has_edit <- !is.na(merged$new_device_id)
  merged$device_id <- ifelse(has_edit, merged$new_device_id, merged$device_id)

  # The helper column is no longer needed once the correction is applied
  corrected <- merged[, !(names(merged) %in% "new_device_id")]

  list(corrected, edits, empty_id_records)

}

#' Edit non-valid facilities in Day 0 data entries (TIMCI-specific function)
#'
#' Reads the country-specific correction file from the `extdata/cleaning`
#' folder of the `timci` package (files are defined for Tanzania and Kenya)
#' and, for every record whose (`child_id`, `uuid`) pair matches an
#' (`old_child_id`, `uuid`) pair of the file, replaces `child_id` by
#' `new_child_id` and sets `fid` (and `fid_from_device` when that column
#' exists) to characters 3 to 7 of `new_child_id`. The country is read from
#' the `TIMCI_COUNTRY` environment variable.
#'
#' @param df dataframe that contains at least the columns `child_id`, `uuid` and `fid`
#' @return This function returns a list of 3 elements: the dataframe with
#'   corrections, the list of edits read from the correction file (`NULL` when
#'   no correction file is found) and `NULL`.
#' @import dplyr
#' @export
correct_day0_non_valid_facilities <- function(df) {

  csv_filename <- switch(Sys.getenv("TIMCI_COUNTRY"),
                         Tanzania = "day0_non_valid_facility_correction_tanzania.csv",
                         Kenya = "day0_non_valid_facility_correction_kenya.csv",
                         "")

  # Always a 3-element list, so that the no-op path has the same shape as the
  # path where corrections are applied
  out <- list(df, NULL, NULL)

  if (csv_filename == "") {
    return(out)
  }

  csv_pathname <- system.file(file.path("extdata", "cleaning", csv_filename),
                              package = "timci")
  if (!file.exists(csv_pathname)) {
    return(out)
  }

  edits <- readr::read_csv(csv_pathname, show_col_types = FALSE)

  # Left join: every record is kept, matched ones gain `new_child_id`
  df <- merge(df,
              edits[, c("old_child_id", "uuid", "new_child_id")],
              by.x = c("child_id", "uuid"),
              by.y = c("old_child_id", "uuid"),
              all.x = TRUE)

  has_edit <- !is.na(df$new_child_id)
  # The facility code is taken from characters 3 to 7 of the new child ID
  new_fid <- substr(df$new_child_id, 3, 7)

  df$child_id <- ifelse(has_edit, df$new_child_id, df$child_id)
  df$fid <- ifelse(has_edit, new_fid, df$fid)
  if ("fid_from_device" %in% colnames(df)) {
    df$fid_from_device <- ifelse(has_edit, new_fid, df$fid_from_device)
  }

  # Remove the helper column new_child_id from the dataframe
  df <- df[, !(names(df) %in% "new_child_id"), drop = FALSE]

  list(df, edits, NULL)

}

#' Edit inconsistent facilities in Day 0 data entries (TIMCI-specific function)
#'
#' Reads the country-specific correction file `<csv_prefix>_<country>.csv`
#' from the `extdata/cleaning` folder of the `timci` package and, for every
#' record whose (`child_id`, `uuid`) pair is listed in the file, replaces
#' `fid` (and `fid_from_device` when that column exists) by `new_fid`. The
#' country is read from the `TIMCI_COUNTRY` environment variable.
#'
#' @param df dataframe that contains at least the columns `child_id`, `uuid` and `fid`
#' @param csv_prefix A string value indicating the prefix of the CSV file from which to read the corrections (default is "day0_facility_correction1").
#' @return This function returns a list of 3 elements: the dataframe with
#'   corrections, the list of edits read from the correction file and the
#'   list of discarded edits. The last two elements are `NULL` when no
#'   correction file is found.
#' @import dplyr
#' @export
correct_day0_inconsistent_facilities <- function(df,
                                                 csv_prefix = "day0_facility_correction1") {

  country_suffix <- switch(Sys.getenv("TIMCI_COUNTRY"),
                           Senegal = "senegal.csv",
                           Tanzania = "tanzania.csv",
                           Kenya = "kenya.csv",
                           India = "india.csv",
                           "")

  # Guard clause: no correction file is defined for this country
  if (country_suffix == "") {
    return(list(df, NULL, NULL))
  }

  csv_filename <- paste(csv_prefix, country_suffix, sep = "_")
  csv_pathname <- system.file(file.path("extdata", "cleaning", csv_filename),
                              package = "timci")

  # Guard clause: the correction file is not shipped with the package
  if (!file.exists(csv_pathname)) {
    return(list(df, NULL, NULL))
  }

  edits <- readr::read_csv(csv_pathname, show_col_types = FALSE)
  edit_cols <- edits[, c("child_id", "uuid", "new_fid")]

  # Right join on the edits, restricted to rows with an empty facility ID.
  # NOTE(review): edits without a matching record get `fid = NA` from the
  # merge and filter() drops rows whose condition is NA, so only matched
  # records with an empty `fid` end up here - confirm this is intended.
  all_edits <- merge(df,
                     edit_cols,
                     by = c("child_id", "uuid"),
                     all.y = TRUE)
  empty_fid_edits <- dplyr::filter(all_edits, fid == "")
  discarded_edits <- dplyr::select(empty_fid_edits,
                                   child_id,
                                   uuid,
                                   new_fid)

  # Left join: every record is kept, matched ones gain `new_fid`
  merged <- merge(df,
                  edit_cols,
                  by = c("child_id", "uuid"),
                  all.x = TRUE)

  has_edit <- !is.na(merged$new_fid)
  merged$fid <- ifelse(has_edit, merged$new_fid, merged$fid)
  if ("fid_from_device" %in% colnames(merged)) {
    merged$fid_from_device <- ifelse(has_edit,
                                     merged$new_fid,
                                     merged$fid_from_device)
  }

  # Remove the helper column new_fid from the dataframe
  corrected <- merged[, !(names(merged) %in% "new_fid")]

  list(corrected, edits, discarded_edits)

}

#' Edit incorrect child IDs in Day 0 data entries (TIMCI-specific function)
#'
#' This function can be used to correct documented child ID duplicates, incorrect facility codes, or typos in Day 0 data entries.
#' It reads the correction file `<csv_prefix>_<country>.csv` from the
#' `extdata/cleaning` folder of the `timci` package and, for every record
#' whose (`child_id`, `uuid`) pair matches an (`old_child_id`, `uuid`) pair of
#' the file, replaces `child_id` by `new_child_id` and sets `fid` (and
#' `fid_from_device` when that column exists) to characters 3 to 7 of
#' `new_child_id`. The country is read from the `TIMCI_COUNTRY` environment
#' variable.
#'
#' @param df A dataframe containing the Day 0 data entries to be corrected (columns `child_id`, `uuid` and `fid` are used).
#' @param csv_prefix A string value indicating the prefix of the CSV file from which to read the corrections (default is "day0_duplicate_correction").
#' @return A list of 3 elements: the edited dataframe, the corrections from the
#'   file that match a record of `df` (`NULL` when no correction file is
#'   found) and `NULL`.
#' @import dplyr
#' @import readr
#' @export
edit_day0_child_ids <- function(df,
                                csv_prefix = "day0_duplicate_correction") {

  country_suffix <- switch(Sys.getenv("TIMCI_COUNTRY"),
                           Tanzania = "tanzania.csv",
                           Kenya = "kenya.csv",
                           Senegal = "senegal.csv",
                           India = "india.csv",
                           "")

  # Guard clause: no correction file is defined for this country
  if (country_suffix == "") {
    return(list(df, NULL, NULL))
  }

  csv_filename <- paste(csv_prefix, country_suffix, sep = "_")
  csv_pathname <- system.file(file.path("extdata", "cleaning", csv_filename),
                              package = "timci")

  # Guard clause: the correction file is not shipped with the package
  if (!file.exists(csv_pathname)) {
    return(list(df, NULL, NULL))
  }

  edits <- readr::read_csv(csv_pathname, show_col_types = FALSE)
  edit_cols <- edits[, c("old_child_id", "uuid", "new_child_id")]

  # Inner join: only the corrections that match a record of df
  found_edits <- merge(edit_cols,
                       df[, c("child_id", "uuid")],
                       by.x = c("old_child_id", "uuid"),
                       by.y = c("child_id", "uuid"),
                       all = FALSE)

  # Left join: every record is kept, matched ones gain `new_child_id`
  merged <- merge(df,
                  edit_cols,
                  by.x = c("child_id", "uuid"),
                  by.y = c("old_child_id", "uuid"),
                  all.x = TRUE)

  has_edit <- !is.na(merged$new_child_id)
  # The facility code is taken from characters 3 to 7 of the new child ID
  fid_from_new_id <- substr(merged$new_child_id, 3, 7)

  merged$child_id <- as.character(ifelse(has_edit,
                                         merged$new_child_id,
                                         merged$child_id))
  merged$fid <- ifelse(has_edit, fid_from_new_id, merged$fid)
  if ("fid_from_device" %in% colnames(merged)) {
    merged$fid_from_device <- ifelse(has_edit,
                                     fid_from_new_id,
                                     merged$fid_from_device)
  }

  # Remove the helper column new_child_id from the dataframe
  corrected <- merged[, !(names(merged) %in% "new_child_id")]

  list(corrected, found_edits, NULL)

}

#' Edit incorrect child IDs based on Day 0 data correction in SPA observation entries (TIMCI-specific function)
#'
#' This function propagates the documented Day 0 child ID corrections to SPA
#' sick child observation entries. It reads the correction file
#' `<csv_prefix>_<country>.csv` from the `extdata/cleaning` folder of the
#' `timci` package and, for every record whose `child_identification-pid`
#' matches an `old_child_id` of the file, replaces it by `new_child_id` and
#' sets `facility_identification-fcode` to characters 3 to 7 of the corrected
#' child ID. Records are matched on the child ID only. The country is read
#' from the `TIMCI_COUNTRY` environment variable.
#'
#' @param df A dataframe containing the entries to be corrected (columns `child_identification-pid`, `meta-instanceID` and `facility_identification-fcode` are used).
#' @param csv_prefix A string value indicating the prefix of the CSV file from which to read the corrections (default is "day0_duplicate_correction").
#' @return A list of 3 elements: the edited dataframe, the corrections from the
#'   file that match a record of `df` (`NULL` when no correction file is
#'   found) and `NULL`.
#' @import dplyr
#' @import readr
#' @export
edit_day0_child_ids_spa_sco <- function(df,
                                        csv_prefix = "day0_duplicate_correction") {

  country_suffix <- switch(Sys.getenv("TIMCI_COUNTRY"),
                           Tanzania = "tanzania.csv",
                           Kenya = "kenya.csv",
                           Senegal = "senegal.csv",
                           India = "india.csv",
                           "")

  # Guard clause: no correction file is defined for this country
  if (country_suffix == "") {
    return(list(df, NULL, NULL))
  }

  csv_filename <- paste(csv_prefix, country_suffix, sep = "_")
  csv_pathname <- system.file(file.path("extdata", "cleaning", csv_filename),
                              package = "timci")

  # Guard clause: the correction file is not shipped with the package
  if (!file.exists(csv_pathname)) {
    return(list(df, NULL, NULL))
  }

  edits <- readr::read_csv(csv_pathname, show_col_types = FALSE)
  edit_cols <- edits[, c("old_child_id", "new_child_id")]

  # Inner join: only the corrections that match a record of df
  found_edits <- merge(edit_cols,
                       df[, c("child_identification-pid", "meta-instanceID")],
                       by.x = "old_child_id",
                       by.y = "child_identification-pid",
                       all = FALSE)

  # Left join: every record is kept, matched ones gain `new_child_id`
  merged <- merge(df,
                  edit_cols,
                  by.x = "child_identification-pid",
                  by.y = "old_child_id",
                  all.x = TRUE)

  # Step 1: replace the child ID wherever a correction was matched
  merged <- dplyr::mutate(
    merged,
    `child_identification-pid` = as.character(ifelse(is.na(new_child_id),
                                                     `child_identification-pid`,
                                                     new_child_id))
  )

  # Step 2: the facility code is read from the already corrected child ID
  merged <- dplyr::mutate(
    merged,
    `facility_identification-fcode` = ifelse(is.na(new_child_id),
                                             `facility_identification-fcode`,
                                             substr(`child_identification-pid`, 3, 7))
  )

  # Remove the helper column new_child_id from the dataframe
  corrected <- merged[, !(names(merged) %in% "new_child_id")]

  list(corrected, found_edits, NULL)

}

#' Convert documented Day 0 data entries into repeat visits (TIMCI-specific function)
#'
#' Reads the country-specific correction file from the `extdata/cleaning`
#' folder of the `timci` package. Every record whose (`child_id`, `uuid`) pair
#' matches an (`old_child_id`, `uuid`) pair of the file with a non-missing
#' `new_child_id` is flagged as a repeat visit: `prev_enrl` and
#' `repeat_consult` are set to 1, `prev_id` to the record's `child_id`,
#' `prev_hf_name_card` to the record's `facility`, `consent` and `enrolled` to
#' `NA`, and `child_id_scan` and `child_id_manual` to 0. The country is read
#' from the `TIMCI_COUNTRY` environment variable.
#'
#' @param df dataframe
#' @return This function returns a list of 3 elements: the edited dataframe,
#'   the edits from the file that match a record of `df` (`NULL` when no
#'   correction file is found) and `NULL`.
#' @import dplyr
#' @export
edit_day0_to_repeat <- function(df) {

  csv_filename <- switch(Sys.getenv("TIMCI_COUNTRY"),
                         Tanzania = "day0_repeat_correction_same_id_tanzania.csv",
                         Senegal = "day0_repeat_correction_senegal.csv",
                         Kenya = "day0_repeat_correction_kenya.csv",
                         India = "day0_repeat_correction_india.csv",
                         "")

  # Guard clause: no correction file is defined for this country
  if (csv_filename == "") {
    return(list(df, NULL, NULL))
  }

  csv_pathname <- system.file(file.path("extdata", "cleaning", csv_filename),
                              package = "timci")

  # Guard clause: the correction file is not shipped with the package
  if (!file.exists(csv_pathname)) {
    return(list(df, NULL, NULL))
  }

  edits <- readr::read_csv(csv_pathname, show_col_types = FALSE)

  # Inner join: only the edits that match a record of df
  found_edits <- merge(edits[, c("old_child_id", "uuid")],
                       df[, c("child_id", "uuid")],
                       by.x = c("old_child_id", "uuid"),
                       by.y = c("child_id", "uuid"),
                       all = FALSE)

  # Left join: every record is kept, matched ones gain `new_child_id`
  df <- merge(df,
              edits[, c("old_child_id", "uuid", "new_child_id")],
              by.x = c("child_id", "uuid"),
              by.y = c("old_child_id", "uuid"),
              all.x = TRUE)

  # Records without an edit keep their current value, the others receive
  # the replacement
  no_edit <- is.na(df$new_child_id)
  keep_unless_edited <- function(current, replacement) {
    ifelse(no_edit, current, replacement)
  }

  df$prev_enrl <- keep_unless_edited(df$prev_enrl, 1)
  df$prev_id <- keep_unless_edited(df$prev_id, df$child_id)
  df$prev_hf_name_card <- keep_unless_edited(df$prev_hf_name_card, df$facility)
  df$repeat_consult <- as.integer(keep_unless_edited(df$repeat_consult, 1))
  df$consent <- keep_unless_edited(df$consent, NA)
  df$enrolled <- keep_unless_edited(df$enrolled, NA)
  df$child_id_scan <- as.integer(keep_unless_edited(df$child_id_scan, 0))
  df$child_id_manual <- as.integer(keep_unless_edited(df$child_id_manual, 0))

  # Remove the helper column new_child_id from the dataframe
  df <- df[, !(names(df) %in% "new_child_id")]

  list(df, found_edits, NULL)

}

#' Drop documented records from Day 0 data entries (TIMCI-specific function)
#'
#' Reads the deletion file `<csv_prefix>_<country>.csv` from the
#' `extdata/cleaning` folder of the `timci` package and removes from `df`
#' every record whose `uuid` belongs to a (`child_id`, `uuid`) pair listed in
#' the file and present in `df`. The country is read from the `TIMCI_COUNTRY`
#' environment variable.
#'
#' @param df dataframe that contains at least the columns `child_id` and `uuid`
#' @param csv_prefix A string value indicating the prefix of the CSV file from which to read the corrections (default is "day0_training_deletion").
#' @return This function returns a list of 3 elements: the cleaned dataframe,
#'   the list of dropped records (`NULL` when no deletion file is found) and
#'   `NULL`.
#' @import dplyr
#' @export
delete_day0_records <- function(df,
                                csv_prefix = "day0_training_deletion") {

  country_suffix <- switch(Sys.getenv("TIMCI_COUNTRY"),
                           Tanzania = "tanzania.csv",
                           Senegal = "senegal.csv",
                           Kenya = "kenya.csv",
                           India = "india.csv",
                           "")

  # Guard clause: no deletion file is defined for this country
  if (country_suffix == "") {
    return(list(df, NULL, NULL))
  }

  csv_filename <- paste(csv_prefix, country_suffix, sep = "_")
  csv_pathname <- system.file(file.path("extdata", "cleaning", csv_filename),
                              package = "timci")

  # Guard clause: the deletion file is not shipped with the package
  if (!file.exists(csv_pathname)) {
    return(list(df, NULL, NULL))
  }

  records_to_drop <- readr::read_csv(csv_pathname, show_col_types = FALSE)

  # Inner join: only the listed records that are actually present in df
  found_records <- merge(records_to_drop,
                         df[, c("child_id", "uuid")],
                         by = c("child_id", "uuid"),
                         all = FALSE)

  # Keep the records whose uuid is not among the records found above
  is_dropped <- df$uuid %in% found_records$uuid
  kept <- df[!is_dropped, ]

  list(kept, found_records, NULL)

}

#' Drop documented records from repeat data entries (TIMCI-specific function)
#'
#' Reads the deletion file `<csv_prefix>_<country>.csv` from the
#' `extdata/cleaning` folder of the `timci` package and removes from `df`
#' every record whose `uuid` belongs to a pair listed in the file
#' (`child_id`, `uuid`) that matches a (`prev_id`, `uuid`) pair of `df`. The
#' country is read from the `TIMCI_COUNTRY` environment variable.
#'
#' @param df dataframe that contains at least the columns `prev_id` and `uuid`
#' @param csv_prefix A string value indicating the prefix of the CSV file from which to read the corrections (default is "day0_repeat_inconsistent_names_deletion").
#' @return This function returns a list of 3 elements: the cleaned dataframe,
#'   the list of dropped records (`NULL` when no deletion file is found) and
#'   `NULL`.
#' @import dplyr
#' @export
delete_repeat_records <- function(df,
                                  csv_prefix = "day0_repeat_inconsistent_names_deletion") {

  country_suffix <- switch(Sys.getenv("TIMCI_COUNTRY"),
                           Tanzania = "tanzania.csv",
                           Senegal = "senegal.csv",
                           Kenya = "kenya.csv",
                           India = "india.csv",
                           "")

  out <- list(df, NULL, NULL)

  if (country_suffix == "") {
    return(out)
  }

  csv_filename <- paste(csv_prefix, country_suffix, sep = "_")
  csv_pathname <- system.file(file.path("extdata", "cleaning", csv_filename),
                              package = "timci")
  if (!file.exists(csv_pathname)) {
    return(out)
  }

  records_to_drop <- readr::read_csv(csv_pathname, show_col_types = FALSE)

  # Inner join: only the listed records that are actually present in df
  # (the file's child_id is compared with the previous ID of the repeat visit)
  found_records <- merge(records_to_drop,
                         df[, c("prev_id", "uuid")],
                         by.x = c("child_id", "uuid"),
                         by.y = c("prev_id", "uuid"),
                         all = FALSE)

  df <- df[!(df$uuid %in% found_records$uuid), ]

  # Report the records that were actually dropped rather than the whole
  # deletion file, as documented and as done in delete_day0_records()
  list(df, found_records, NULL)

}

#' Correct repeat follow-up visit IDs (TIMCI-specific function)
#'
#' Reads the correction file `<csv_prefix>_<country>.csv` from the
#' `extdata/cleaning` folder of the `timci` package and, for every record
#' whose (`prev_id`, `uuid`) pair matches an (`old_child_id`, `uuid`) pair of
#' the file, replaces `prev_id` by `new_child_id`. The country is read from
#' the `TIMCI_COUNTRY` environment variable.
#'
#' @param df dataframe that contains at least the columns `prev_id` and `uuid`
#' @param csv_prefix A string value indicating the prefix of the CSV file from which to read the corrections (default is "repeat_non_valid_pid_correction").
#' @return This function returns a list of 3 elements: the dataframe with
#'   corrections, the list of edits read from the correction file (`NULL` when
#'   no correction file is found) and `NULL`.
#' @import dplyr
#' @export
correct_repeat_ids <- function(df,
                               csv_prefix = "repeat_non_valid_pid_correction") {

  country_suffix <- switch(Sys.getenv("TIMCI_COUNTRY"),
                           Kenya = "kenya.csv",
                           Tanzania = "tanzania.csv",
                           Senegal = "senegal.csv",
                           India = "india.csv",
                           "")

  out <- list(df, NULL, NULL)

  if (country_suffix == "") {
    return(out)
  }

  csv_filename <- paste(csv_prefix, country_suffix, sep = "_")
  csv_pathname <- system.file(file.path("extdata", "cleaning", csv_filename),
                              package = "timci")
  if (!file.exists(csv_pathname)) {
    return(out)
  }

  # show_col_types = FALSE silences the column specification message, as in
  # the Day 0 correction functions
  edits <- readr::read_csv(csv_pathname, show_col_types = FALSE)

  # Left join: every record is kept, matched ones gain `new_child_id`
  df <- merge(df,
              edits[, c("old_child_id", "uuid", "new_child_id")],
              by.x = c("prev_id", "uuid"),
              by.y = c("old_child_id", "uuid"),
              all.x = TRUE)

  has_edit <- !is.na(df$new_child_id)
  df$prev_id <- ifelse(has_edit, df$new_child_id, df$prev_id)

  # Remove the helper column new_child_id from the dataframe
  df <- df[, !(names(df) %in% "new_child_id"), drop = FALSE]

  list(df, edits, NULL)

}

#' Edit Day 0 data for all errors that were detected by quality checks (TIMCI-specific function)
#'
#' Chains the Day 0 cleaning steps and keeps only the dataframe returned by
#' each of them: correction of non-valid facilities, deletion of the records
#' listed in the "day0_training_deletion" files and correction of the child
#' IDs listed in the "day0_duplicate_correction" files. When the
#' `TIMCI_COUNTRY` environment variable is "Kenya", the dataframe is finally
#' replaced by the second element returned by
#' `timci::detect_inconsistent_dates()`.
#'
#' @param df dataframe
#' @return This function returns an edited dataframe with corrections
#' @import dplyr
#' @export
correct_day0_all <- function(df) {

  # Step 1: correct incorrect facility of enrolment
  facility_step <- timci::correct_day0_non_valid_facilities(df)
  df <- facility_step[[1]]

  # Step 2: delete dummy/test data
  deletion_step <- timci::delete_day0_records(df,
                                              csv_prefix = "day0_training_deletion")
  df <- deletion_step[[1]]

  # Step 3: correct duplicated child IDs
  duplicate_step <- timci::edit_day0_child_ids(df,
                                               csv_prefix = "day0_duplicate_correction")
  df <- duplicate_step[[1]]

  # Step 4 (Kenya only): handle inconsistencies between the submission date
  # and the start date, and keep the second element of the result
  is_kenya <- Sys.getenv("TIMCI_COUNTRY") == "Kenya"
  if (is_kenya) {
    df <- timci::detect_inconsistent_dates(df,
                                           "submission_date",
                                           "start",
                                           cleaning = "replace_by_start_date")[[2]]
  }

  df

}

#' Correct Day 7 duplicates (TIMCI-specific function)
#'
#' Reads the correction file `<csv_prefix>_<country>.csv` from the
#' `extdata/cleaning` folder of the `timci` package and replaces the child ID
#' of every record that matches an (`old_child_id`, `uuid`) pair of the file
#' by `new_child_id`; the facility ID is set to characters 3 to 7 of
#' `new_child_id`. Records are matched on (`a1-pid`, `meta-instanceID`) and
#' `a1-pid` / `a1-fid` are edited when `df` has an `a1-pid` column; otherwise
#' records are matched on (`child_id`, `uuid`) and `child_id` / `fid` are
#' edited when `df` has a `child_id` column. The country is read from the
#' `TIMCI_COUNTRY` environment variable.
#'
#' @param df dataframe
#' @param csv_prefix A string value indicating the prefix of the CSV file from which to read the corrections (default is "day7_non_valid_pid_correction").
#' @return This function returns a list of 3 elements: the dataframe with
#'   corrections, the list of edits read from the correction file (`NULL` when
#'   no correction file is found) and `NULL`.
#' @import dplyr
#' @export
correct_day7_duplicates <- function(df,
                                    csv_prefix = "day7_non_valid_pid_correction") {

  country_suffix <- switch(Sys.getenv("TIMCI_COUNTRY"),
                           Kenya = "kenya.csv",
                           Tanzania = "tanzania.csv",
                           Senegal = "senegal.csv",
                           India = "india.csv",
                           "")

  out <- list(df, NULL, NULL)

  if (country_suffix == "") {
    return(out)
  }

  csv_filename <- paste(csv_prefix, country_suffix, sep = "_")
  csv_pathname <- system.file(file.path("extdata", "cleaning", csv_filename),
                              package = "timci")
  if (!file.exists(csv_pathname)) {
    return(out)
  }

  # show_col_types = FALSE silences the column specification message, as in
  # the Day 0 correction functions
  edits <- readr::read_csv(csv_pathname, show_col_types = FALSE)
  edit_cols <- edits[, c("old_child_id", "uuid", "new_child_id")]

  if ("a1-pid" %in% colnames(df)) {
    # Raw column names
    df <- merge(df,
                edit_cols,
                by.x = c("a1-pid", "meta-instanceID"),
                by.y = c("old_child_id", "uuid"),
                all.x = TRUE)
    has_edit <- !is.na(df$new_child_id)
    df$`a1-pid` <- ifelse(has_edit, df$new_child_id, df$`a1-pid`)
    df$`a1-fid` <- ifelse(has_edit, substr(df$new_child_id, 3, 7), df$`a1-fid`)
  } else if ("child_id" %in% colnames(df)) {
    # Renamed column names
    df <- merge(df,
                edit_cols,
                by.x = c("child_id", "uuid"),
                by.y = c("old_child_id", "uuid"),
                all.x = TRUE)
    has_edit <- !is.na(df$new_child_id)
    df$child_id <- ifelse(has_edit, df$new_child_id, df$child_id)
    df$fid <- ifelse(has_edit, substr(df$new_child_id, 3, 7), df$fid)
  }

  # Remove the helper column new_child_id from the dataframe; drop = FALSE
  # keeps a dataframe even when a single column is left
  df <- df[, !(names(df) %in% "new_child_id"), drop = FALSE]

  list(df, edits, NULL)

}

#' Edit Day 7 follow-up data for all errors that were detected by quality checks (TIMCI-specific function)
#'
#' Applies `timci::correct_day7_duplicates()` and keeps only the corrected
#' dataframe (first element of its result).
#'
#' @param df dataframe
#' @return This function returns an edited dataframe with corrections
#' @import dplyr
#' @export
correct_day7_all <- function(df) {

  # Correct duplicated child IDs
  df <- timci::correct_day7_duplicates(df)[[1]]

  # Return the dataframe explicitly: ending the function on the assignment
  # above would return the value invisibly
  df

}

#' Correct Day 28 duplicates (TIMCI-specific function)
#'
#' Reads the correction file `<csv_prefix>_<country>.csv` from the
#' `extdata/cleaning` folder of the `timci` package (files are defined for
#' Tanzania and India) and replaces the child ID of every record that matches
#' an (`old_child_id`, `uuid`) pair of the file by `new_child_id`; the
#' facility ID is set to characters 3 to 7 of `new_child_id`. Records are
#' matched on (`a1-pid`, `meta-instanceID`) and `a1-pid` / `a1-fid` are edited
#' when `df` has an `a1-pid` column; otherwise records are matched on
#' (`child_id`, `uuid`) and `child_id` / `fid` are edited when `df` has a
#' `child_id` column. The country is read from the `TIMCI_COUNTRY`
#' environment variable.
#'
#' @param df dataframe
#' @param csv_prefix A string value indicating the prefix of the CSV file from which to read the corrections (default is "day28_non_valid_pid_correction").
#' @return This function returns a list of 3 elements: the dataframe with
#'   corrections, the list of edits read from the correction file (`NULL` when
#'   no correction file is found) and `NULL`.
#' @import dplyr
#' @export
correct_day28_duplicates <- function(df,
                                     csv_prefix = "day28_non_valid_pid_correction") {

  country_suffix <- switch(Sys.getenv("TIMCI_COUNTRY"),
                           Tanzania = "tanzania.csv",
                           India = "india.csv",
                           "")

  out <- list(df, NULL, NULL)

  if (country_suffix == "") {
    return(out)
  }

  csv_filename <- paste(csv_prefix, country_suffix, sep = "_")
  csv_pathname <- system.file(file.path("extdata", "cleaning", csv_filename),
                              package = "timci")
  if (!file.exists(csv_pathname)) {
    return(out)
  }

  # show_col_types = FALSE silences the column specification message, as in
  # the Day 0 correction functions
  edits <- readr::read_csv(csv_pathname, show_col_types = FALSE)
  edit_cols <- edits[, c("old_child_id", "uuid", "new_child_id")]

  if ("a1-pid" %in% colnames(df)) {
    # Raw column names
    df <- merge(df,
                edit_cols,
                by.x = c("a1-pid", "meta-instanceID"),
                by.y = c("old_child_id", "uuid"),
                all.x = TRUE)
    has_edit <- !is.na(df$new_child_id)
    df$`a1-pid` <- ifelse(has_edit, df$new_child_id, df$`a1-pid`)
    df$`a1-fid` <- ifelse(has_edit, substr(df$new_child_id, 3, 7), df$`a1-fid`)
  } else if ("child_id" %in% colnames(df)) {
    # Renamed column names
    df <- merge(df,
                edit_cols,
                by.x = c("child_id", "uuid"),
                by.y = c("old_child_id", "uuid"),
                all.x = TRUE)
    has_edit <- !is.na(df$new_child_id)
    df$child_id <- ifelse(has_edit, df$new_child_id, df$child_id)
    df$fid <- ifelse(has_edit, substr(df$new_child_id, 3, 7), df$fid)
  }

  # Remove the helper column new_child_id from the dataframe; drop = FALSE
  # keeps a dataframe even when a single column is left
  df <- df[, !(names(df) %in% "new_child_id"), drop = FALSE]

  list(df, edits, NULL)

}

#' Edit Day 28 follow-up data for all errors that were detected by quality checks (TIMCI-specific function)
#'
#' Applies `timci::correct_day28_duplicates()` and keeps only the corrected
#' dataframe (first element of its result).
#'
#' @param df dataframe
#' @return This function returns an edited dataframe with corrections
#' @import dplyr
#' @export
correct_day28_all <- function(df) {

  # Correct duplicated child IDs
  df <- timci::correct_day28_duplicates(df)[[1]]

  # Return the dataframe explicitly: ending the function on the assignment
  # above would return the value invisibly
  df

}

#' Correct hospital follow-up IDs (TIMCI-specific function)
#'
#' Reads the correction file `<csv_prefix>_<country>.csv` from the
#' `extdata/cleaning` folder of the `timci` package and replaces the child ID
#' of every record that matches an (`old_child_id`, `uuid`) pair of the file
#' by `new_child_id`. Records are matched on (`a1-id`, `meta-instanceID`) and
#' `a1-id` is edited when `df` has an `a1-id` column; otherwise records are
#' matched on (`child_id`, `uuid`) and `child_id` is edited when `df` has a
#' `child_id` column. The country is read from the `TIMCI_COUNTRY`
#' environment variable.
#'
#' @param df dataframe
#' @param csv_prefix A string value indicating the prefix of the CSV file from which to read the corrections (default is "hospit_non_valid_pid_correction").
#' @return This function returns a list of 3 elements: the dataframe with
#'   corrections, the list of edits read from the correction file (`NULL` when
#'   no correction file is found) and `NULL`.
#' @import dplyr
#' @export
correct_hospit_ids <- function(df,
                               csv_prefix = "hospit_non_valid_pid_correction") {

  country_suffix <- switch(Sys.getenv("TIMCI_COUNTRY"),
                           Kenya = "kenya.csv",
                           Tanzania = "tanzania.csv",
                           Senegal = "senegal.csv",
                           India = "india.csv",
                           "")

  out <- list(df, NULL, NULL)

  if (country_suffix == "") {
    return(out)
  }

  csv_filename <- paste(csv_prefix, country_suffix, sep = "_")
  csv_pathname <- system.file(file.path("extdata", "cleaning", csv_filename),
                              package = "timci")
  if (!file.exists(csv_pathname)) {
    return(out)
  }

  # show_col_types = FALSE silences the column specification message, as in
  # the Day 0 correction functions
  edits <- readr::read_csv(csv_pathname, show_col_types = FALSE)
  edit_cols <- edits[, c("old_child_id", "uuid", "new_child_id")]

  if ("a1-id" %in% colnames(df)) {
    # Raw column names
    df <- merge(df,
                edit_cols,
                by.x = c("a1-id", "meta-instanceID"),
                by.y = c("old_child_id", "uuid"),
                all.x = TRUE)
    has_edit <- !is.na(df$new_child_id)
    # Records without an edit keep their own `a1-id`. The fallback used to
    # read `a1-pid`, which is not the column being corrected here.
    df$`a1-id` <- ifelse(has_edit, df$new_child_id, df$`a1-id`)
  } else if ("child_id" %in% colnames(df)) {
    # Renamed column names
    df <- merge(df,
                edit_cols,
                by.x = c("child_id", "uuid"),
                by.y = c("old_child_id", "uuid"),
                all.x = TRUE)
    has_edit <- !is.na(df$new_child_id)
    df$child_id <- ifelse(has_edit, df$new_child_id, df$child_id)
  }

  # Remove the helper column new_child_id from the dataframe; drop = FALSE
  # keeps a dataframe even when a single column is left
  df <- df[, !(names(df) %in% "new_child_id"), drop = FALSE]

  list(df, edits, NULL)

}

#' Edit incorrect healthcare provider (HCP) IDs in SPA sick child observation entries (TIMCI-specific function)
#'
#' This function can be used to correct documented HCP IDs. It reads the
#' country-specific correction file from the `extdata/cleaning` folder of the
#' `timci` package (a file is only defined for Kenya) and, for every record
#' whose (`hcp_identification-hcpid`, `meta-instanceID`) pair matches an
#' (`old_hcp_id`, `uuid`) pair of the file, replaces
#' `hcp_identification-hcpid` by `new_hcp_id`. The country is read from the
#' `TIMCI_COUNTRY` environment variable.
#'
#' @param df dataframe that contains at least the columns `hcp_identification-hcpid` and `meta-instanceID`
#' @return This function returns a list of 3 elements: the dataframe with
#'   corrections, the list of edits read from the correction file (`NULL` when
#'   no correction file is found) and `NULL`.
#' @import dplyr
#' @export
correct_spa_sco_hcp_ids <- function(df) {

  csv_filename <- switch(Sys.getenv("TIMCI_COUNTRY"),
                         Kenya = "spa_sco_hcp_correction_kenya.csv",
                         "")

  out <- list(df, NULL, NULL)

  if (csv_filename == "") {
    return(out)
  }

  csv_pathname <- system.file(file.path("extdata", "cleaning", csv_filename),
                              package = "timci")

  # system.file() returns "" when the file is not shipped with the package;
  # return the data unchanged instead of failing in read_csv(), as the other
  # correction functions do
  if (!file.exists(csv_pathname)) {
    return(out)
  }

  # show_col_types = FALSE silences the column specification message, as in
  # the Day 0 correction functions
  edits <- readr::read_csv(csv_pathname, show_col_types = FALSE)

  # Left join: every record is kept, matched ones gain `new_hcp_id`
  df <- merge(df,
              edits[, c("old_hcp_id", "uuid", "new_hcp_id")],
              by.x = c("hcp_identification-hcpid", "meta-instanceID"),
              by.y = c("old_hcp_id", "uuid"),
              all.x = TRUE)

  has_edit <- !is.na(df$new_hcp_id)
  df$`hcp_identification-hcpid` <- ifelse(has_edit,
                                          df$new_hcp_id,
                                          df$`hcp_identification-hcpid`)

  # Remove the helper column new_hcp_id from the dataframe
  df <- df[, !(names(df) %in% "new_hcp_id"), drop = FALSE]

  list(df, edits, NULL)

}

#' Edit non-valid facilities in SPA sick child observation data entries (TIMCI-specific function)
#'
#' Reads the correction file `<csv_prefix>_<country>.csv` from the
#' `extdata/cleaning` folder of the `timci` package and, for every record
#' whose (`facility_identification-fcode`, `meta-instanceID`) pair matches an
#' (`old_fid`, `uuid`) pair of the file, replaces
#' `facility_identification-fcode` by `new_fid`. The country is read from the
#' `TIMCI_COUNTRY` environment variable.
#'
#' @param df dataframe that contains at least the columns `facility_identification-fcode` and `meta-instanceID`
#' @param csv_prefix A string value indicating the prefix of the CSV file from which to read the corrections (default is "spa_sco_facility_correction").
#' @return This function returns a list of 3 elements: the dataframe with
#'   corrections, the list of edits read from the correction file and the
#'   list of discarded edits. The last two elements are `NULL` when no
#'   correction file is found.
#' @import dplyr
#' @export
correct_spa_sco_fids <- function(df,
                                 csv_prefix = "spa_sco_facility_correction") {

  country_suffix <- switch(Sys.getenv("TIMCI_COUNTRY"),
                           Senegal = "senegal.csv",
                           Tanzania = "tanzania.csv",
                           Kenya = "kenya.csv",
                           India = "india.csv",
                           "")

  # Guard clause: no correction file is defined for this country
  if (country_suffix == "") {
    return(list(df, NULL, NULL))
  }

  csv_filename <- paste(csv_prefix, country_suffix, sep = "_")
  csv_pathname <- system.file(file.path("extdata", "cleaning", csv_filename),
                              package = "timci")

  # Guard clause: the correction file is not shipped with the package
  if (!file.exists(csv_pathname)) {
    return(list(df, NULL, NULL))
  }

  edits <- readr::read_csv(csv_pathname, show_col_types = FALSE)
  edit_cols <- edits[, c("old_fid", "uuid", "new_fid")]

  # Right join on the edits, restricted to rows with an empty facility code.
  # NOTE(review): the facility code is a merge key, so for edits without a
  # matching record it holds the `old_fid` of the edit - confirm that
  # filtering on an empty string is the intended definition of a discarded
  # edit.
  all_edits <- merge(df,
                     edit_cols,
                     by.x = c("facility_identification-fcode", "meta-instanceID"),
                     by.y = c("old_fid", "uuid"),
                     all.y = TRUE)
  empty_fcode_edits <- dplyr::filter(all_edits,
                                     `facility_identification-fcode` == "")
  discarded_edits <- dplyr::select(empty_fcode_edits,
                                   `facility_identification-fcode`,
                                   `meta-instanceID`,
                                   new_fid)

  # Left join: every record is kept, matched ones gain `new_fid`
  merged <- merge(df,
                  edit_cols,
                  by.x = c("facility_identification-fcode", "meta-instanceID"),
                  by.y = c("old_fid", "uuid"),
                  all.x = TRUE)
  merged <- dplyr::mutate(
    merged,
    `facility_identification-fcode` = ifelse(is.na(new_fid), `facility_identification-fcode`, new_fid)
  )

  # Remove the helper column new_fid from the dataframe
  corrected <- merged[, !(names(merged) %in% "new_fid")]

  list(corrected, edits, discarded_edits)

}

#' Edit SPA sick child observation data for all errors that were detected by quality checks (TIMCI-specific function)
#'
#' Chains the correction of healthcare provider IDs and the correction of
#' facility IDs, keeping only the corrected dataframe (first element) returned
#' by each step.
#'
#' @param df dataframe
#' @return This function returns an edited dataframe with corrections
#' @import dplyr
#' @export
correct_spa_sco_all <- function(df) {

  # Step 1: edit incorrect HCP IDs
  hcp_step <- timci::correct_spa_sco_hcp_ids(df)
  hcp_corrected <- hcp_step[[1]]

  # Step 2: edit incorrect facility IDs on the output of step 1
  fid_step <- timci::correct_spa_sco_fids(hcp_corrected)

  fid_step[[1]]

}

#' Edit drug data in Day 0 data entries (TIMCI-specific function - Kenya and Senegal only)
#'
#' Left-joins the structured drug data to the Day 0 data on `uuid` (every Day 0
#' row is kept) and then updates the Day 0 drug columns: numeric drug
#' indicators through `selective_replace()` and text columns through
#' `selective_multi_replace()`. Drug columns that are absent from `day0_df` are
#' skipped. The result holds exactly the columns of `day0_df`, in their
#' original order; because the join relies on `merge()`, row order may differ
#' from the input.
#'
#' @param day0_df dataframe that contains Day 0 data and needs to be corrected
#' @param drug_df dataframe that contains corrected (structured) drug data to edit in Day 0 data
#' @return This function returns a list that contains a dataframe with corrections, the drug dataframe that was used as the source of edits, and `NULL`
#' @import dplyr
#' @export

correct_day0_drug_data <- function(day0_df,
                                   drug_df) {

  # Remove columns from drug_df that must not take part in the merge
  unused_cols <- c("start",
                   "end",
                   "free_text1",
                   "free_text2",
                   "rx_type2",
                   "rx_othtype2",
                   "rx_type_hf2",
                   "rx_othtype_hf2",
                   "child_id")
  drug_df1 <- drug_df[, !(names(drug_df) %in% unused_cols), drop = FALSE]

  # Cast the text drug columns to character. Only columns that actually exist
  # are touched: an unguarded `df$col <- as.character(df$col)` on a missing
  # column either fails or silently partial-matches a longer column name.
  text_cols <- c("rx_antibio_oth",
                 "rx_antibio_oth_hf",
                 "rx_antimalarials",
                 "rx_antimalarials_hf",
                 "rx_consumables",
                 "rx_consumables_hf")
  for (text_col in intersect(text_cols, colnames(day0_df))) {
    day0_df[[text_col]] <- as.character(day0_df[[text_col]])
  }

  # Numeric drug indicators handled by selective_replace()
  indicator_cols <- c("rx_amoxicillin", "rx_amoxicillin_hf",
                      "rx_penicillinG", "rx_penicillinG_hf",
                      "rx_ceftriaxone", "rx_ceftriaxone_hf",
                      "rx_cef_antibiotics", "rx_cef_antibiotics_hf",
                      "rx_ciprofloxacin", "rx_ciprofloxacin_route",
                      "rx_ciprofloxacin_hf", "rx_ciprofloxacin_route_hf",
                      "rx_gentamicin", "rx_gentamicin_route",
                      "rx_gentamicin_hf", "rx_gentamicin_route_hf",
                      "rx_metronidazol", "rx_metronidazol_route",
                      "rx_metronidazol_hf", "rx_metronidazol_route_hf",
                      "rx_ampicillin", "rx_ampicillin_hf",
                      "rx_azithromycin", "rx_azithromycin_hf",
                      "rx_benzathinepeniG", "rx_benzathinepeniG_hf",
                      "rx_aclav", "rx_aclav_hf",
                      "rx_cotrimoxazole", "rx_cotrimoxazole_hf")

  # Text columns handled by selective_multi_replace()
  # NOTE(review): the "_hf" text columns are cast to character above but are
  # not merged here (same as the original implementation) - confirm whether
  # this is intended.
  multi_cols <- c("rx_antibio_oth",
                  "rx_antimalarials",
                  #"rx_artesunate_route",
                  #"rx_quinine_route",
                  "rx_imci",
                  "rx_creams",
                  "rx_consumables")

  # Columns of the Day 0 data: used both to decide which drug columns can be
  # edited and to restore the original column set after the merge
  cols <- colnames(day0_df)

  # Suffix every drug column with "1" so that it cannot collide with a Day 0
  # column and so that the helpers can find the companion column "<col>1"
  colnames(drug_df1) <- paste0(colnames(drug_df1), "1")
  df <- merge(day0_df,
              drug_df1,
              by.x = "uuid",
              by.y = "uuid1",
              all.x = TRUE)

  # Each helper call only edits its own column, so the order is irrelevant
  for (indicator_col in indicator_cols) {
    df <- timci::selective_replace(df, indicator_col, cols)
  }
  for (multi_col in multi_cols) {
    df <- timci::selective_multi_replace(df, multi_col, cols)
  }

  # Drop the companion columns; all_of() is required to select with an
  # external character vector without triggering a tidyselect warning
  df <- dplyr::select(df, dplyr::all_of(cols))

  out <- list(df, drug_df, NULL)
  out

}

#' Update a numeric column from its companion column suffixed with "1".
#'
#' When `col` is listed in `cols`, the column `col` of `df` is rewritten as a
#' numeric column using the companion column named `col` followed by "1":
#' a value of 0 becomes 1 when the companion is 1, a value that is `NA` (once
#' converted to numeric) takes the companion value when that value is not `NA`,
#' and every other value is kept (converted to numeric). When `col` is not in
#' `cols`, `df` is returned unchanged.
#'
#' @param df A data frame.
#' @param col A character string indicating the name of the column to replace.
#' @param cols A character vector of column names to check for the existence of the `col` column.
#'
#' @return The input data frame with specified column values replaced.
#'
#' @import dplyr rlang
#'
#' @export

selective_replace <- function(df, col, cols) {

  # Nothing to edit when the column is not one of the expected columns
  if (!(col %in% cols)) {
    return(df)
  }

  # Symbols for the column to edit and for its "1"-suffixed companion
  target <- rlang::sym(col)
  companion <- rlang::sym(paste0(col, "1"))

  dplyr::mutate(df, !!target := dplyr::case_when(
    ( !!target == 0 ) & ( !!companion == 1 )                      ~ 1,
    is.na(as.numeric(!!target)) & !is.na(as.numeric(!!companion)) ~ as.numeric(!!companion),
    .default = as.numeric(!!target)
  ))

}

#' Merge a text column with its companion column suffixed with "1".
#'
#' When `col` is listed in `cols`, the column `col` of `df` is combined with the
#' companion column named `col` followed by "1", using these rules (first match
#' wins):
#' * an empty current value is replaced by the companion value (if not `NA`);
#' * if neither value is "96" and the companion is not empty, both values are
#'   concatenated with ";" as separator;
#' * a current value of "96" is replaced by the companion value when the
#'   companion is not "96";
#' * in every other case (including `NA` on either side) the current value is
#'   kept.
#'
#' When `col` is not in `cols`, `df` is returned unchanged.
#'
#' @param df A data frame.
#' @param col A character string indicating the name of the column to replace.
#' @param cols A character vector of column names to check for the existence of the `col` column.
#'
#' @return The input data frame with specified column values replaced.
#'
#' @import dplyr rlang
#'
#' @export

selective_multi_replace <- function(df, col, cols) {

  out <- df
  if (col %in% cols) {
    # Quote the arguments that refer to data frame columns
    current <- rlang::sym(col)
    companion <- rlang::sym(paste0(col, "1"))

    out <- dplyr::mutate(out, !!current := dplyr::case_when(
      # Must be tested first: an empty string also satisfies `!= "96"`, so the
      # concatenation rule below would otherwise produce a leading ";"
      ( !!current == "" ) & !is.na(!!companion) ~ !!companion,
      # An empty companion adds nothing, so it must not append a trailing ";"
      ( !!current != "96" ) & ( !!companion != "96" ) & ( !!companion != "" ) ~ paste0(!!current, ";", !!companion),
      ( !!current == "96" ) & ( !!companion != "96" ) ~ !!companion,
      .default = !!current)
    )
  }

  out

}

#' Correct SpO2 measurement values (TIMCI-specific function)
#'
#' Reads a country-specific CSV file of corrections shipped with the `timci`
#' package (folder `extdata/cleaning`, file name built from `csv_prefix` and
#' the country set in the `TIMCI_COUNTRY` environment variable) and replaces
#' the SpO2 value of every record matched on `child_id` and `uuid` by the
#' `new_spo2` value of the file. If the country is not supported or the file
#' does not exist, the data are returned unchanged.
#'
#' @param df dataframe
#' @param csv_prefix A string value indicating the prefix of the CSV file from which to read the corrections (default is "day0_spo2_meas1").
#' @param meas integer with value 1 or 2 to indicate the measurement variable to replace (`spo2_meas1` or `spo2_meas2`); any other value leaves both variables untouched
#' @return This function returns a list that contains a dataframe with corrections, the list of edits (`NULL` if no correction file was found) and `NULL`
#' @import dplyr
#' @export

correct_spo2_values <- function(df,
                               csv_prefix = "day0_spo2_meas1",
                               meas = 1) {

  # File name suffix for each supported country
  country_suffixes <- c(Kenya = "kenya.csv",
                        Tanzania = "tanzania.csv",
                        Senegal = "senegal.csv",
                        India = "india.csv")

  country <- Sys.getenv("TIMCI_COUNTRY")
  csv_filename <- ""
  if (country %in% names(country_suffixes)) {
    csv_filename <- paste(csv_prefix, country_suffixes[[country]], sep = "_")
  }

  out <- list(df, NULL, NULL)
  if (csv_filename != "") {
    # system.file() returns "" when the file is not part of the package
    csv_pathname <- system.file(file.path("extdata", "cleaning", csv_filename), package = "timci")
    if (file.exists(csv_pathname)) {
      # show_col_types = FALSE silences the column specification message, as
      # done by the other correction functions of this file
      edits <- readr::read_csv(csv_pathname, show_col_types = FALSE)

      # Left join: records without a correction get new_spo2 = NA
      df <- merge(df,
                  edits[, c("child_id", "uuid", "new_spo2")],
                  by = c("child_id", "uuid"),
                  all.x = TRUE)

      # Overwrite the selected measurement only where a correction exists
      if (meas == 1) {
        df$spo2_meas1 <- ifelse(is.na(df$new_spo2), df$spo2_meas1, df$new_spo2)
      } else if (meas == 2) {
        df$spo2_meas2 <- ifelse(is.na(df$new_spo2), df$spo2_meas2, df$new_spo2)
      }

      # Remove the temporary column new_spo2 from the dataframe
      df <- df[, !(names(df) %in% "new_spo2"), drop = FALSE]

      out <- list(df, edits, NULL)
    }
  }
  out

}

#' Edit inconsistent facilities in time-flow data entries (TIMCI-specific function)
#'
#' Looks up a country-specific CSV file of facility corrections shipped with
#' the `timci` package (folder `extdata/cleaning`, name built from `csv_prefix`
#' and the `TIMCI_COUNTRY` environment variable). Records matched on
#' (`fid`, `uuid`) against (`old_fid`, `uuid`) get their `fid` replaced by
#' `new_fid`. When the country is not supported or the file does not exist,
#' the data are returned unchanged.
#'
#' @param df dataframe
#' @param csv_prefix A string value indicating the prefix of the CSV file from which to read the corrections (default is "tf_facility_correction").
#' @return This function returns a list that contains a dataframe with corrections, the list of edits and the list of discarded edits (the last two are `NULL` when no correction file was found)
#' @import dplyr
#' @export

correct_tf_inconsistent_facilities <- function(df,
                                               csv_prefix = "tf_facility_correction") {

  # Result returned whenever no correction can be applied
  unchanged <- list(df, NULL, NULL)

  # File name suffix for each supported country
  country_suffixes <- c(Senegal = "senegal.csv",
                        Tanzania = "tanzania.csv",
                        Kenya = "kenya.csv",
                        India = "india.csv")
  country <- Sys.getenv('TIMCI_COUNTRY')
  if ( !(country %in% names(country_suffixes)) ) {
    return(unchanged)
  }

  csv_filename <- paste(csv_prefix, country_suffixes[[country]], sep = "_")
  csv_pathname <- system.file(file.path('extdata', 'cleaning', csv_filename), package = 'timci')
  if ( !file.exists(csv_pathname) ) {
    return(unchanged)
  }

  edits <- readr::read_csv(csv_pathname, show_col_types = FALSE)

  # Both merges below use the same subset of the edits and the same keys
  fid_edits <- edits[, c("old_fid", "uuid", "new_fid")]
  df_keys <- c("fid", "uuid")
  edit_keys <- c("old_fid", "uuid")

  # Edits reported as discarded: rows of the edit-side join with an empty fid.
  # NOTE(review): with all.y = TRUE the key column fid of an unmatched edit is
  # filled with old_fid, so this filter looks unlikely to catch edits that
  # matched no record - confirm the intended definition of a discarded edit.
  edit_side <- merge(df,
                     fid_edits,
                     by.x = df_keys,
                     by.y = edit_keys,
                     all.y = TRUE)
  empty_fid_rows <- dplyr::filter(edit_side, fid == "")
  discarded_edits <- dplyr::select(empty_fid_rows, fid, uuid, new_fid)

  # Left join: records without a correction get new_fid = NA and keep their fid
  corrected <- merge(df,
                     fid_edits,
                     by.x = df_keys,
                     by.y = edit_keys,
                     all.x = TRUE)
  corrected$fid <- ifelse(is.na(corrected$new_fid), corrected$fid, corrected$new_fid)

  # Remove the temporary column new_fid from the dataframe
  corrected <- corrected[,!(names(corrected) %in% c("new_fid"))]

  list(corrected, edits, discarded_edits)

}
# Thaliehln/timci documentation built on April 8, 2024, 3:38 p.m.