ADTools: Alzheimer Disease Data Tools

Documented in adt_get_biocard

#' Import BIOCARD Data
#'
#' Import BIOCARD data from source files and generate the analysis dataset.
#'
#' @inheritParams parameters
#'
#' @return
#'
#' Returned the analysis dataset with: patients' ids, baseline times,
#' corresponding biomarkers, biomarker test times, etc.
#'
#' @examples
#' \dontrun{
#' ## with default unoverlaped window
#' dt_biocard <- get_biocard(path)
#'
#' ## with costomized window
#' dt_biocard <- get_biocard(path,
#'                           window_setting = window_Set)
#'
#' ## with dictionary provided by user
#' dt_biocard <- get_biocard(path,
#'                           src_tables = "dict_src_tables.xlsx")
#' }
#'
#' @export
#'
#'
adt_get_biocard <- function(path     = ".",
                            reference_time = NULL, 
                            window_setting = NULL, 
                            pattern    = "*.xls",
                            src_files  = NULL,
                            src_tables = NULL,
                            ..., 
                            verbose    = TRUE) {

    ## --------- internal functions -------------------------------------
    
    ## window setting
    if (is.null(window_setting)) {
        window_setting = list(cog   = c(730, FALSE), 
                              diag  = c(730, FALSE), 
                              csf   = c(730, FALSE), 
                              hippo = c(730, FALSE), 
                              amy   = c(730, FALSE), 
                              ec    = c(730, FALSE))
    }
    
    ## convert date
    f_date <- function(code, date_name, dta) {
        mvar <- a_map_var("BIOCARD", code, date_name, dict_src_tables)
        dfmt <- dict_src_files %>%
            filter(adt_table_code == code)

        stopifnot(1 == nrow(dfmt))

        dta %>%
            mutate(!!date_name :=
                       as.Date(!!as.name(mvar),
                               dfmt[["src_date_format"]])) %>%
            select(- all_of(mvar))
    }

    ## map var
    f_map <- function(code, var, dta, fc = NULL) {
        mvar <- a_map_var("BIOCARD", code, var, dict_src_tables)
        dta  <- dta %>%
            rename(!!var := mvar)

        if (!is.null(fc))
            dta[var] <- lapply(dta[var], fc)

        dta
    }

    ## --------- prepare files  -------------------------------------

    ## list all file names matching the pattern
    file_names <- list.files(path       = path,
                             pattern    = pattern,
                             full.names = TRUE)

    ## dictionary of src files
    dict_src_files  <- adt_get_dict("src_files",  csv_fname = src_files)
    dict_src_tables <- adt_get_dict("src_tables", csv_fname = src_tables)
    dict_data       <- adt_get_dict("ana_data")

    ## --------- read tables -------------------------------------
    
    vec_tbls <- c("COG", "DIAG", "CSF", "DEMO",
                  "HIPPO", "AMY", "EC", "GE")

    chk_all <- NULL
    for (i in vec_tbls) {
        cur_dat  <- a_read_file(i, file_names, dict_src_files, verbose)
        cur_chk  <- a_check_src(i, cur_dat,    dict_src_tables)

        assign(paste("dat_", tolower(i), sep = ""),
               cur_dat)

        chk_all <- rbind(chk_all, cur_chk)
    }

    if (dim(chk_all)[1] > 0) {
        err_msg <- a_err_msg("biocard_load_error")
        message(err_msg)
        print(chk_all)
    }

    dat_lsta  <- a_read_file("LIST_A", file_names, dict_src_files, verbose)
    dat_lstb  <- a_read_file("LIST_B", file_names, dict_src_files, verbose)

    ## ----------  manipulation ----------------------------------
    
    a_print("Formatting data ...", verbose)

    dat_cog   <- f_date("COG",  "date_cog",   dat_cog)
    dat_cog   <- f_map("COG",   "subject_id", dat_cog)

    dat_diag  <- f_date("DIAG", "date_diag",  dat_diag)
    dat_diag  <- f_map("DIAG",  "subject_id", dat_diag)

    dat_csf   <- f_date("CSF",  "date_csf",   dat_csf)
    dat_csf   <- f_map("CSF",   "abeta",      dat_csf)
    dat_csf   <- f_map("CSF",   "subject_id", dat_csf)

    dat_demo  <- f_map("DEMO", "subject_id", dat_demo)
    dat_demo  <- dat_demo %>%
        select(-c("jhuanonid", "lettercode", "nihid"))

    dat_hippo <- f_date("HIPPO", "date_hippo", dat_hippo)
    dat_hippo <- f_map("HIPPO",  "subject_id", dat_hippo)

    dat_hippo <- f_map("HIPPO",  "intracranial_vol_hippo",
                       dat_hippo, as.numeric)

    dat_hippo <- f_map("HIPPO",  "l_hippo", dat_hippo, as.numeric)
    dat_hippo <- f_map("HIPPO",  "r_hippo", dat_hippo, as.numeric)

    ## MRI amygdala
    dat_amy <- f_date("AMY", "date_amy",        dat_amy)
    dat_amy <- f_map("AMY", "subject_id",       dat_amy)

    dat_amy <- f_map("AMY", "intracranial_vol_amy",
                     dat_amy, as.numeric)

    dat_amy <- f_map("AMY", "l_amy", dat_amy, as.numeric)
    dat_amy <- f_map("AMY", "r_amy", dat_amy, as.numeric)


    ## MRI EC volume
    dat_ec  <- f_date("EC", "date_ec",         dat_ec)
    dat_ec  <- f_map("EC", "subject_id",       dat_ec)
    dat_ec  <- f_map("EC", "intracranial_vol_ec", dat_ec, as.numeric)
    dat_ec  <- f_map("EC", "l_ec_vol",         dat_ec, as.numeric)
    dat_ec  <- f_map("EC", "r_ec_vol",         dat_ec, as.numeric)
    dat_ec  <- f_map("EC", "l_ec_thick",       dat_ec, as.numeric)
    dat_ec  <- f_map("EC", "r_ec_thick",       dat_ec, as.numeric)

    ## process data
    dat_hippo$bi_hippo <- (dat_hippo$l_hippo + dat_hippo$r_hippo) / 2
    dat_amy$bi_amy     <- (dat_amy$l_amy     + dat_amy$r_amy)     / 2
    dat_ec$bi_ec_vol   <- (dat_ec$l_ec_vol   + dat_ec$r_ec_vol)   / 2
    dat_ec$bi_ec_thick <- (dat_ec$l_ec_thick + dat_ec$r_ec_thick) / 2

    ## race
    dat_ge <- dat_ge %>%
        select(-c("jhuanonid", "lettercode", "nihid"))

    dat_ge <- f_map("GE", "subject_id", dat_ge)

    ## exclude subjects from list A and list B
    id_name <- a_map_var("BIOCARD", "LIST_A", "subject_id", dict_src_tables)

    exid <- c(dat_lsta[[id_name]],
              dat_lstb[[id_name]])

    ## ------------- merge all data -------------------------------------
    
    # use vec_tbl
    a_print("Combining all data ...", verbose)
    dat_all <- dat_cog %>% 
        select(subject_id, date = date_cog) %>% 
        rbind(dat_diag  %>% select(subject_id, date = date_diag )) %>% 
        rbind(dat_amy   %>% select(subject_id, date = date_amy  )) %>% 
        rbind(dat_csf   %>% select(subject_id, date = date_csf  )) %>% 
        rbind(dat_ec    %>% select(subject_id, date = date_ec   )) %>% 
        rbind(dat_hippo %>% select(subject_id, date = date_hippo)) %>% 
        distinct() %>% 
        arrange(subject_id, date)
    
    s_cog   <- a_add_suffix(dat_cog,   "_cog")
    s_diag  <- a_add_suffix(dat_diag,  "_diag")
    s_amy   <- a_add_suffix(dat_amy,   "_amy")
    s_csf   <- a_add_suffix(dat_csf,   "_csf")
    s_ec    <- a_add_suffix(dat_ec,    "_ec")
    s_hippo <- a_add_suffix(dat_hippo, "_hippo")
    
    variable_list <- list(cog   = names(s_cog), 
                          diag  = names(s_diag), 
                          amy   = names(s_amy), 
                          csf   = names(s_csf) , 
                          ec    = names(s_ec) , 
                          hippo = names(s_hippo))
    
    dat_info <- dat_all %>% 
        left_join(s_cog   , by = c("subject_id", "date")) %>% 
        left_join(s_diag  , by = c("subject_id", "date")) %>% 
        left_join(s_amy   , by = c("subject_id", "date")) %>% 
        left_join(s_csf   , by = c("subject_id", "date")) %>% 
        left_join(s_ec    , by = c("subject_id", "date")) %>% 
        left_join(s_hippo , by = c("subject_id", "date")) %>% 
        left_join(dat_demo, by = c("subject_id")) %>% 
        left_join(dat_ge  , by = c("subject_id")) %>%
        select(!(ends_with(".x") | ends_with(".y")))
    
    if (is.null(reference_time)) {
        reference_time <- s_cog %>% select(subject_id, date)
    }
    if (reference_time == "COG") {
      reference_time <- s_cog %>% select(subject_id, date)
    }
    if (reference_time == "DIAG") {
      reference_time <- s_diag %>% select(subject_id, date)
    }

    ## ------------- combine data --------------------------------------
    
    a_print("Merging analysis dataset...", verbose)
    dat_se <- reference_time %>% 
        rename(impute_date = date)

    for (bmk in tolower(setdiff(vec_tbls, c("DEMO", "GE")))) { 
        a_print(sprintf("Extracting biomarker from %s", bmk), verbose)
        dat <- dat_info %>% 
            select(variable_list[[bmk]]) %>% 
            filter(!!as.name(paste0("indicator_", bmk)) == 1)
        window <- window_setting[[bmk]][1]
        window_overlap <- window_setting[[bmk]][2] == 1
        dat_bmk <- a_select_bmk(reference_time, dat, window, window_overlap) %>% 
            rename("{paste0('date_', bmk)}" := date)
        dat_se <- dat_se %>% 
            left_join(dat_bmk, by = c("subject_id", "impute_date"))
    }
    a_print("Merging demo ...", verbose)
    a_print("Merging ge ...", verbose)
    dat_se <- dat_se %>%
        left_join(dat_demo, by = c("subject_id")) %>%
        left_join(dat_ge, by = c("subject_id")) 

    ## load ApoE-4
    dat_se$apoe4 <- adt_apoe(dat_se$apoecode, ...) 

    ## drop duplicates
    dat_se <- dat_se %>%
        select(!(ends_with(".x") | ends_with(".y")))

    ## add exid
    dat_se <- dat_se %>%
        rowwise() %>%
        mutate(exclude = subject_id %in% exid)
 
    # format variable
    
    a_print("Done.", verbose)
    ## return
    rtn <- list(ana_dt = dat_se, 
                dat_ty = "biocard", 
                window_setting = window_setting, 
                reference_time = reference_time)
    class(rtn) <- "ad_ana_data"
    return(rtn)
}