hisreportr: hisreportr: Automating MSF OCA EMR reporting

Documented in prepare_hcv_report_data

#' Prepare raw HCV data
#'
#' @inheritParams prepare_tb_report_data
#'
#' @importFrom dplyr inner_join left_join
#' @importFrom assertthat assert_that
#' @importFrom purrr map2_df
#'

prepare_hcv_report_data <- function(x, 
                                   categorise_age = TRUE) {
  
  patient_id <- NULL
  starttre <- NULL
  samp_date <- NULL
  assessment_date <- NULL
  visit_date <- NULL
  end_date <- NULL
  
  # Check arg ----
  assertthat::assert_that(is.list(x),
                          is.data.frame(x$hcv_start))
  
  # no missing start date or patient ID values
  assertthat::are_equal(sum(is.na(x$patient_id)), 0)
  
  # Impute missing start tx dates ----
  # use consult date where starttre is missing
  x$hcv_start$starttre <- as.Date(ifelse(is.na(x$hcv_start$starttre),
                                        x$hcv_start$consult_date,
                                        x$hcv_start$starttre), 
                                 origin = "1970-01-01")
  
  # remove consult date variable
  x$hcv_start$consult_date <- NULL

  # Clean variables ----
  # * clean age and gender ----
  age_gender <- clean_adm_demographics(x$hcv_characteristics)
  
  # * merge age/gender data with start treatment data 
  age_gender_start <- dplyr::left_join(x$hcv_start, 
                                       age_gender, 
                                       by = "patient_id")
  
  # * Recalculate age at start of treatment episode
  start_tx_age_gender <- recalculate_age_tx_start(age_gender_start,
                                  start_var = starttre,
                                  categorise = categorise_age)
  
  
  # * Baseline genotype and fibrosis ----
  # concat hcv genotype variables
  x$hcv_genotype <- merge_hcv_genotype(x$hcv_genotype, 
                            drop_levels = TRUE)
  
  base_fib_geno <- merge_to_episode(df_episode = x$hcv_start[, c("patient_id", "starttre")],
                                    df_info = x$hcv_genotype,
                                    key_var = patient_id,
                                    start_date = starttre,
                                    eval_date = samp_date,
                                    post_start = FALSE)
  
  # * Baseline performance status
  base_ecog <- merge_to_episode(df_episode = x$hcv_start[, c("patient_id", "starttre")],
                                df_info = x$hcv_perf_status,
                                key_var = patient_id,
                                start_date = starttre,
                                eval_date = assessment_date,
                                post_start = FALSE)
  # rename date variable to avoid later conflict
  names(base_ecog)[names(base_ecog) == "assessment_date"] <- "ecog_date"
  
  # * Baseline treatment ----
  # convert long hcv_treatment datafrome to wide
  # each visit per patient is unique - also restricted to visits
  # where treatment is started or restarted
  hcv_tx_df<- drug_tx_by_visit(x$hcv_treatment)
  
  # Treatment data with start dates
  ## Use data after treatment start date - closest or equal to start date
  base_treatment <- merge_to_episode(df_episode = x$hcv_start[, c("patient_id", "starttre")],
                                     df_info = hcv_tx_df,
                                     key_var = patient_id,
                                     start_date = starttre,
                                     eval_date = visit_date,
                                     post_start = TRUE)
  
  # Baseline weight and height ----
  base_weight_height <- merge_to_episode(df_episode = x$hcv_start[, c("patient_id", "starttre")],
                                         df_info = x$hcv_weight_height,
                                         key_var = patient_id,
                                         start_date = starttre, 
                                         eval_date = assessment_date,
                                         post_start = FALSE)
  
  # * Outcome data ----
  end_outcome <- merge_to_episode(df_episode = x$hcv_start[, c("patient_id", "starttre")],
                                  df_info = x$hcv_end[c("patient_id", "end_date", "outcome")],
                                  key_var = patient_id,
                                  start_date = starttre,
                                  eval_date = end_date,
                                  post_start = TRUE)
  
  # ** Rename end_date to prevent duplicate variable names ----
  names(end_outcome)[names(end_outcome) == "end_date"] <- "end_tx_date"
  
  # * Cohort exit data where end_fu == "Yes" ----
  end_cohort_outcome <- merge_to_episode(df_episode = x$hcv_start[, c("patient_id", "starttre")],
                                    df_info = x$hcv_end[x$hcv_end$end_fu == "Yes" &
                                                          !is.na(x$hcv_end$end_fu) , 
                                                        c("patient_id", "end_date", "end_fu", "end_fu_reason")],
                                    key_var = patient_id,
                                    start_date = starttre,
                                    eval_date = end_date,
                                    post_start = TRUE)
  ## ** Rename end_date to prevent duplicate variable names ----
  names(end_cohort_outcome)[names(end_cohort_outcome) == "end_date"] <- "end_fu_date"
  
  

  # Merge data frames ----

  # * Fibrosis, genotype with ECOG ----
  fg_ecog <- dplyr::left_join(base_fib_geno, 
                         base_ecog,
                         by = c("patient_id", "starttre"))
  
  # * Merge treatment ----
  fg_ecog_tx <- dplyr::left_join(fg_ecog,
                           base_treatment,
                           by = c("patient_id", "starttre"))
  
  # * Merge age ----
  fg_ecog_tx_ag <- dplyr::left_join(fg_ecog_tx,
                                    start_tx_age_gender,
                                    by = c("patient_id", "starttre"))
  
  # * Merge episode outcome ----
  fg_ecog_tx_epout <- dplyr::left_join(fg_ecog_tx_ag,
                                       end_outcome, 
                                       by = c("patient_id", "starttre"))
  
  # * Merge cohort outcome ----
  fg_ecog_tx_epout_cout <- dplyr::left_join(fg_ecog_tx_epout,
                                            end_cohort_outcome,
                                            by = c("patient_id", "starttre"))
  
  # * Merge weight and height ----
  fg_ecog_tx_epout_cout_wh <- dplyr::left_join(fg_ecog_tx_epout_cout,
                                               base_weight_height,
                                               by = c("patient_id", "starttre"))
  

  # Generate variables ----
  new <- fg_ecog_tx_epout_cout_wh
  
  # check if duplicate treatment episode rows are present
  assertthat::assert_that(nrow(new) == nrow(unique(new[c("patient_id", "starttre")])))
  
  # convert variables to factors
  new[hcv_levels$var_names] <- purrr::map2_df(.x = new[hcv_levels$var_names],
                                            .y = hcv_levels$var_levels,
                                            .f = ~ factor(.x, 
                                                          levels = .y))
  
  # convert HCV treatment variable to factor for reporting
  new$drug_factor <- ifelse(is.na(new$drug_tx_prescribed), 
                           NA_character_,
                           ifelse(new$drug_tx_prescribed %in% core_hcv_formulations, 
                                  new$drug_tx_prescribed, 
                                  "Other"))
  new$drug_factor <- factor(new$drug_factor, levels = core_hcv_formulations)
  
  # add genotype factor variable to manage number of reporting options
  new$factor_full_genotype <- factor(new$hcv_full_genotype, 
                                     levels = core_hcv_genotypes)
  
  # create start month variable
  new$start_month <- factor(month.name[as.numeric(format(new$starttre, "%m"))],
                           levels = month.name)
  
  # * New start treatment quarter variable ----
  new$start_quarter <- lubridate::quarter(new$starttre, with_year = TRUE)
  
  # * BMI ----
  new$bmi <- new$weight / (new$height / 100) ^ 2
  
  # * BMI factor ----
  new$bmi_factor <- factor(ifelse(new$bmi >= 18.5, 0, 1),
                           levels = c(0, 1),
                           labels = c("\u2265 18.5", "< 18.5"))
  
  # add reporting time
  new <- calculate_reporting_period(new, disease = "hcv")

  # check output
  assertthat::assert_that(nrow(x$hcv_start) == nrow(new))
  
  # check for duplicated variable names
    assertthat::assert_that(isFALSE(any(grepl("\\.", names(new)))))

  new
}