#' Prepare raw HCV data
#'
#' @inheritParams prepare_tb_report_data
#'
#' @importFrom dplyr inner_join left_join
#' @importFrom assertthat assert_that
#' @importFrom purrr map2_df
#'
prepare_hcv_report_data <- function(x,
categorise_age = TRUE) {
patient_id <- NULL
starttre <- NULL
samp_date <- NULL
assessment_date <- NULL
visit_date <- NULL
end_date <- NULL
# Check arg ----
assertthat::assert_that(is.list(x),
is.data.frame(x$hcv_start))
# no missing start date or patient ID values
assertthat::are_equal(sum(is.na(x$patient_id)), 0)
# Impute missing start tx dates ----
# use consult date where starttre is missing
x$hcv_start$starttre <- as.Date(ifelse(is.na(x$hcv_start$starttre),
x$hcv_start$consult_date,
x$hcv_start$starttre),
origin = "1970-01-01")
# remove consult date variable
x$hcv_start$consult_date <- NULL
# Clean variables ----
# * clean age and gender ----
age_gender <- clean_adm_demographics(x$hcv_characteristics)
# * merge age/gender data with start treatment data
age_gender_start <- dplyr::left_join(x$hcv_start,
age_gender,
by = "patient_id")
# * Recalculate age at start of treatment episode
start_tx_age_gender <- recalculate_age_tx_start(age_gender_start,
start_var = starttre,
categorise = categorise_age)
# * Baseline genotype and fibrosis ----
# concat hcv genotype variables
x$hcv_genotype <- merge_hcv_genotype(x$hcv_genotype,
drop_levels = TRUE)
base_fib_geno <- merge_to_episode(df_episode = x$hcv_start[, c("patient_id", "starttre")],
df_info = x$hcv_genotype,
key_var = patient_id,
start_date = starttre,
eval_date = samp_date,
post_start = FALSE)
# * Baseline performance status
base_ecog <- merge_to_episode(df_episode = x$hcv_start[, c("patient_id", "starttre")],
df_info = x$hcv_perf_status,
key_var = patient_id,
start_date = starttre,
eval_date = assessment_date,
post_start = FALSE)
# rename date variable to avoid later conflict
names(base_ecog)[names(base_ecog) == "assessment_date"] <- "ecog_date"
# * Baseline treatment ----
# convert long hcv_treatment datafrome to wide
# each visit per patient is unique - also restricted to visits
# where treatment is started or restarted
hcv_tx_df<- drug_tx_by_visit(x$hcv_treatment)
# Treatment data with start dates
## Use data after treatment start date - closest or equal to start date
base_treatment <- merge_to_episode(df_episode = x$hcv_start[, c("patient_id", "starttre")],
df_info = hcv_tx_df,
key_var = patient_id,
start_date = starttre,
eval_date = visit_date,
post_start = TRUE)
# Baseline weight and height ----
base_weight_height <- merge_to_episode(df_episode = x$hcv_start[, c("patient_id", "starttre")],
df_info = x$hcv_weight_height,
key_var = patient_id,
start_date = starttre,
eval_date = assessment_date,
post_start = FALSE)
# * Outcome data ----
end_outcome <- merge_to_episode(df_episode = x$hcv_start[, c("patient_id", "starttre")],
df_info = x$hcv_end[c("patient_id", "end_date", "outcome")],
key_var = patient_id,
start_date = starttre,
eval_date = end_date,
post_start = TRUE)
# ** Rename end_date to prevent duplicate variable names ----
names(end_outcome)[names(end_outcome) == "end_date"] <- "end_tx_date"
# * Cohort exit data where end_fu == "Yes" ----
end_cohort_outcome <- merge_to_episode(df_episode = x$hcv_start[, c("patient_id", "starttre")],
df_info = x$hcv_end[x$hcv_end$end_fu == "Yes" &
!is.na(x$hcv_end$end_fu) ,
c("patient_id", "end_date", "end_fu", "end_fu_reason")],
key_var = patient_id,
start_date = starttre,
eval_date = end_date,
post_start = TRUE)
## ** Rename end_date to prevent duplicate variable names ----
names(end_cohort_outcome)[names(end_cohort_outcome) == "end_date"] <- "end_fu_date"
# Merge data frames ----
# * Fibrosis, genotype with ECOG ----
fg_ecog <- dplyr::left_join(base_fib_geno,
base_ecog,
by = c("patient_id", "starttre"))
# * Merge treatment ----
fg_ecog_tx <- dplyr::left_join(fg_ecog,
base_treatment,
by = c("patient_id", "starttre"))
# * Merge age ----
fg_ecog_tx_ag <- dplyr::left_join(fg_ecog_tx,
start_tx_age_gender,
by = c("patient_id", "starttre"))
# * Merge episode outcome ----
fg_ecog_tx_epout <- dplyr::left_join(fg_ecog_tx_ag,
end_outcome,
by = c("patient_id", "starttre"))
# * Merge cohort outcome ----
fg_ecog_tx_epout_cout <- dplyr::left_join(fg_ecog_tx_epout,
end_cohort_outcome,
by = c("patient_id", "starttre"))
# * Merge weight and height ----
fg_ecog_tx_epout_cout_wh <- dplyr::left_join(fg_ecog_tx_epout_cout,
base_weight_height,
by = c("patient_id", "starttre"))
# Generate variables ----
new <- fg_ecog_tx_epout_cout_wh
# check if duplicate treatment episode rows are present
assertthat::assert_that(nrow(new) == nrow(unique(new[c("patient_id", "starttre")])))
# convert variables to factors
new[hcv_levels$var_names] <- purrr::map2_df(.x = new[hcv_levels$var_names],
.y = hcv_levels$var_levels,
.f = ~ factor(.x,
levels = .y))
# convert HCV treatment variable to factor for reporting
new$drug_factor <- ifelse(is.na(new$drug_tx_prescribed),
NA_character_,
ifelse(new$drug_tx_prescribed %in% core_hcv_formulations,
new$drug_tx_prescribed,
"Other"))
new$drug_factor <- factor(new$drug_factor, levels = core_hcv_formulations)
# add genotype factor variable to manage number of reporting options
new$factor_full_genotype <- factor(new$hcv_full_genotype,
levels = core_hcv_genotypes)
# create start month variable
new$start_month <- factor(month.name[as.numeric(format(new$starttre, "%m"))],
levels = month.name)
# * New start treatment quarter variable ----
new$start_quarter <- lubridate::quarter(new$starttre, with_year = TRUE)
# * BMI ----
new$bmi <- new$weight / (new$height / 100) ^ 2
# * BMI factor ----
new$bmi_factor <- factor(ifelse(new$bmi >= 18.5, 0, 1),
levels = c(0, 1),
labels = c("\u2265 18.5", "< 18.5"))
# add reporting time
new <- calculate_reporting_period(new, disease = "hcv")
# check output
assertthat::assert_that(nrow(x$hcv_start) == nrow(new))
# check for duplicated variable names
assertthat::assert_that(isFALSE(any(grepl("\\.", names(new)))))
new
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.