#***********************************************************************
# project: LTBI screening ACE
# N Green
# Oct 2016
#
# 02-data-prep_modelling.R
# pre-process imputed dataset, from Aldridge (2016) Lancet
data("051206 - IMPUTED_sample",
package = "LTBIscreeningproject")
screen_age_range <- 18:45
year_cohort <- '2009' #largest cohort
set.seed(23456)
if (!exists("screen_age_range")) screen_age_range <- 18:45
IMPUTED_sample <- IMPUTED_IOM_ETS_WHO_merged_15_2_9
rm(IMPUTED_IOM_ETS_WHO_merged_15_2_9)
# remove duplicate and not needed ---------------------------------
rm_variables_names <-
c("firstcdt","exdt", "sex","dob","cxr","cxrdt","cxrint","cxrintdt","ssdnd","dta1","dta2","dta3",
"dtb1","dtb2","cresb2","dtb3","stres","iomgdln","ssspunob","ssnormcxr","sscxntb","ssappdec",
"famcont","cert","expdt","birthyr","ager", "cxrndr","cxrukres","examyr","exammth","dst2", "ssndr","consp","nocert",
"active","lab","clinical","ssres","scres","protocol","rclientsourceid","weight","SourceID","threshold",
"uk_caserepdate","uk_sitepulm","uk_microscopy","uk_pcr","uk_etsculture","uk_culture",
"uk_sputsmear","uk_origsputsmear","uk_anyres","uk_allfirstres","uk_secondres",
"uk_prevdiag","uk_bcgvacc","uk_anyriskfactor","uk_typed_loci","uk_atleast23",
"uk_clusteranalysis","uk_clusterno1_analysis","uk_clustered","uk_cluster_size","uk_clustersize_n",
"uk_clusterno1_analysis_24","male","active_pulm","any_culture","culture_pos","smear_pos",
"TB_IOM","lab_new","sputum_taken","time_to_cert","xray_new","clustered2", "first_screen",
"last_screen","time_first_last_screen","first_screen_TB","last_screen_TB",
"max_weight","keep_ETS_dup", "min_exdt", "next_screen","days_between_screens","cohort_end_dups",
"uk_pulm_culture","uk_pulm_smear","uk_pulm_resist","uk_extpulm_culture","uk_extrapulm_smear",
"uk_extrapulm_resist","cohort_end","cohort_end_visa","cohort_end_1yr","uk_anyriskfactor_risk",
"uk_bcgvacc_risk","uk_prevdiag_risk","uk_delayindiag_cat","uk_clustersize_cat","uk_clustersize_n_cat",
"uk_clusterno1_analysis_24_unique", "uk_max_loci_pre","uk_cluster_size_pre","uk_clustered_pre","uk_clustered_mi",
"uk_cluster_trans","uk_cluster_react","uk_first_in_cluster", "prop")
IMPUTED_sample <- IMPUTED_sample[ ,!names(IMPUTED_sample) %in% rm_variables_names]
# remove duplicate pre-entry screened
IMPUTED_sample <- dplyr::filter(IMPUTED_sample,
dup_ref_id_orig == 0)
# remove duplicate notification dates
IMPUTED_sample <- dplyr::filter(IMPUTED_sample,
is.na(rNotificationDate) | uk_tb == 1)
##TODO: why are there missing issdt uk entry dates?
# for now just remove them...
IMPUTED_sample <- dplyr::filter(IMPUTED_sample,
!is.na(issdt))
# remove death before entry to UK
##TODO: are deaths before entry the same for all imputation samples?
IMPUTED_sample <- dplyr::filter(IMPUTED_sample,
date_death1 >= issdt)
# eligible screening age range only
IMPUTED_sample <- dplyr::filter(IMPUTED_sample,
age_at_entry %in% screen_age_range)
# create LTBI probs by WHO active TB group ---------------------------------
who_levels <- c("(0,50]", "(50,150]", "(150,250]", "(250,350]", "(350,1e+05]")
# match active TB prevalence groups in dataset to Pareek (2011)
IMPUTED_sample$who_prev_cat_Pareek2011 <- cut(IMPUTED_sample$who_prevalence,
breaks = c(0, 50, 150, 250, 350, 100000))
# ref. Pareek M et al. Lancet Infect Dis. Elsevier Ltd; 2011;11(6)
# ages 18-35 pooled
pLatentTB.who <- c(0.03, 0.13, 0.2, 0.3, 0.3) %>% setNames(who_levels)
### assume >35 == 35 year olds ###
# i.e. age independent
pLatentTB.who_age <-
matrix(data = pLatentTB.who,
ncol = 28,
nrow = length(pLatentTB.who)) %>%
data.frame(who_levels, .) %>%
purrr::set_names("who_prev_cat_Pareek2011", as.character(screen_age_range))
# join with main data set
pLatentTB.who_age.long <- reshape2:::melt.data.frame(data = pLatentTB.who_age,
id.vars = "who_prev_cat_Pareek2011",
value.name = "pLTBI",
variable.name = "age_at_entry")
IMPUTED_sample <- merge(x = IMPUTED_sample,
y = pLatentTB.who_age.long,
by = c("age_at_entry",
"who_prev_cat_Pareek2011"))
IMPUTED_sample$LTBI <- sample_tb(prob = 1 - IMPUTED_sample$pLTBI)
# extract uk entry year only
IMPUTED_sample$issdt_year <- format(IMPUTED_sample$issdt, '%Y')
# create time-to-events -------------------------
# from uk entry to event dates
IMPUTED_sample <-
IMPUTED_sample %>%
dplyr::mutate(date_death1_issdt = date_death1 - issdt,
date_exit_uk1_issdt = date_exit_uk1 - issdt,
date_death1_issdt.years = as.numeric(date_death1_issdt)/365.25,
date_exit_uk1_issdt.years = as.numeric(date_exit_uk1_issdt)/365.25,
date_exit_uk1_issdt = ifelse(date_exit_uk1_issdt.years == 100, Inf, date_exit_uk1_issdt),
date_exit_uk1_issdt.years = ifelse(date_exit_uk1_issdt.years == 100, Inf, date_exit_uk1_issdt.years))
# uk entry to follow-up days -------------------------------------------------
FUP_DATE <- as.Date("2013-12-31")
# days from 1960-01-01 to 2013-12-31
date_origin <- as.Date("1960-01-01") #STATA
fup_limit <- FUP_DATE - date_origin
IMPUTED_sample <-
IMPUTED_sample %>%
dplyr::mutate(fup_limit_issdt_days = FUP_DATE - issdt,
fup_limit_issdt = fup_limit_issdt_days/365.25,
fup1_date = as.Date(fup1, origin = date_origin),
fup_issdt_days = fup1_date - issdt,
fup_issdt = fup_issdt_days/365.25,
cens1 = fup1_date == FUP_DATE,
death1 = fup1_date == date_death1,
exit_uk1 = fup1_date == date_exit_uk1)
# remove indiv follow-up date before entry
IMPUTED_sample <- dplyr::filter(IMPUTED_sample,
fup1_date > issdt)
# remove tb before entry
IMPUTED_sample <-
IMPUTED_sample %>%
dplyr::mutate(rNotificationDate_issdt = rNotificationDate - issdt,
uk_tb = ifelse(rNotificationDate_issdt < 0 | is.na(rNotificationDate_issdt),
yes = 0,
no = 1),
rNotificationDate_issdt = ifelse(uk_tb,
yes = rNotificationDate_issdt,
no = NA),
rNotificationDate_issdt.years = as.numeric(rNotificationDate_issdt)/365.25)
# total sample size
n.pop_screen <- nrow(IMPUTED_sample)
# total sample sizes for each yearly cohort
n.popyear_screen <-
aggregate(x = rep(1, n.pop_screen),
by = list(IMPUTED_sample$issdt_year),
sum) %>%
purrr::set_names(c("year", "pop"))
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.