R/importers.R

Defines functions import_blood_mrna

Documented in import_blood_mrna

#' Read and clean TVS blood mRNA data file for analysis in R.
#' @param blood_file A full path to the TVS blood mRNA data file.
#' @param samples_file A full path to the TVS samples metadata file.
#' @details The samples metadata file is used to convert the original sample_id identifiers  
#' to the new, within-project, id system. The samples-file also contains more precise sample_type information.
#'
#' @return A dataset object of class tbl_df/data.frame.
#'
#' @importFrom magrittr "%>%"
#' @importFrom magrittr "%$%"
#' @export

import_blood_mrna <- function(blood_file = "tvs_blood_gwe_loess.txt",
                              samples_file = "samples.csv") {
    # new identifiers
    samples <- samples_file %>%
        read_csv(col_types = cols(.default = col_character())) %>%
        select(patient_id, sample_id, sample_type)
    
    mrna <- blood_file %>%
        readr::read_tsv(progress = FALSE, col_types = cols(.default = col_character())) %>%
        dplyr::select(2:99) %>%
        # add sample type to sample ids
        rename_at(vars(-PROBE_ID), function(x) stringr::str_c(x, "/Blood"))
    
    mrna <- mrna %>%
        dplyr::select(-PROBE_ID) %>%
        # transpose
        t() %>%
        # fix column names
        tibble::as_tibble(
            rownames = "sample_id", 
            .name_repair = ~ stringr::str_to_lower(mrna$PROBE_ID)
        ) %>% 
        # sample type from the file is used to recode sample_ids
        tidyr::separate(
            col = sample_id, 
            into = c("sample_id_file", "sample_type_file"), 
            sep = "/"
        )
    
    # recode sample_ids and sample_types from the samples-metadata
    sample_to_patient_map <- samples %>%
        select(patient_id, sample_id) %>%
        filter(str_detect(sample_id, "T_X|M_(?!G)|B_(?!G)")) %$%
        set_names(patient_id, sample_id)
    
    sample_to_type_map <- samples %>%
        select(sample_id, sample_type) %>%
        filter(str_detect(sample_id, "T_X|M_(?!G)|B_(?!G)")) %$%
        set_names(sample_type, sample_id)
    
    mrna_ids_types <- mrna %>%
        magrittr::extract( , 1:2) %>% # much faster than dplyr::select (?)
        # convert to the new id format in samples-metadata
        # to make sample_ids unique across datasets
        mutate(sample_id = case_when(
            sample_type_file == "Blood" ~ str_c("B_", sample_id_file),
            sample_type_file == "Monocyte" ~ str_c("M_", sample_id_file),
            sample_type_file == "Artery" ~ str_c("T_", sample_id_file),
            TRUE ~ NA_character_
        )) %>%
        mutate(patient_id = recode(sample_id, !!!sample_to_patient_map)) %>%
        mutate(sample_type = recode(sample_id, !!!sample_to_type_map)) %>%
        select(patient_id, sample_id, sample_type)
    
    mrna <- mrna %>%
        magrittr::extract( , -(1:2)) %>% # much faster than dplyr::select
        bind_cols(mrna_ids_types, .)
    
    # add empty rows for missing samples and patients
    # need to transpose with t() to make this fast again
    mrna_colnames <- colnames(mrna)
    
    missing_patients <- samples %>%
        pull(patient_id) %>%
        setdiff(mrna_ids_types$patient_id) %>%
        enframe(name = NULL, value = "patient_id") %>%
        t() %>%
        rbind(matrix(nrow = length(mrna_colnames) - 1, ncol = ncol(.)))
    
    mrna <- mrna %>%
        t() %>%
        cbind(missing_patients) %>%
        t() %>%
        as_tibble(.name_repair = ~ mrna_colnames) %>%
        arrange(patient_id, sample_id)
    
    # convert probe class to numeric
    probe_names <- magrittr::extract(colnames(mrna), -(1:3))
    mrna_probes <- mrna %>%
        magrittr::extract( , -(1:3)) %>%
        t() %>%
        as_tibble(.name_repair = "unique") %>%
        mutate_all(as.numeric) %>%
        t() %>%
        as_tibble(.name_repair = ~ probe_names)
    
    # recombine probes with ids and sample type
    mrna <- mrna %>%
        magrittr::extract(1:3) %>%
        bind_cols(mrna_probes)
    
    return(mrna)
    
}

#' Read and clean TVS monocyte mRNA data file for analysis in R.
#' @param monocyte_file A full path to the TVS monocyte mRNA data file.
#' @param samples_file A full path to the TVS samples metadata file.
#' @details The samples metadata file is used to convert the original sample_id identifiers  
#' to the new, within-project, id system. The samples-file also contains more precise sample_type information.
#'
#' @return A dataset object of class tbl_df/data.frame.
#'
#' @importFrom magrittr "%>%"
#' @importFrom magrittr "%$%"
#' @export

import_monocyte_mrna <- function(monocyte_file = "tvs_monocyte_gwe_loess.txt",
                                 samples_file = "samples.csv") {

    samples <- samples_file %>%
        read_csv(col_types = cols(.default = col_character())) %>%
        select(patient_id, sample_id, sample_type)
    
    mrna <- monocyte_file %>%
        readr::read_tsv(progress = FALSE, col_types = cols(.default = col_character())) %>%
        dplyr::select(2:100) %>%
        rename_at(vars(-PROBE_ID), function(x) stringr::str_c(x, "/Monocyte"))

    # transpose, fix column names
    mrna <- mrna %>%
        dplyr::select(-PROBE_ID) %>%
        t() %>%
        tibble::as_tibble(
            rownames = "sample_id", 
            .name_repair = ~ stringr::str_to_lower(mrna$PROBE_ID)
        ) %>% 
        # sample_type from the file is used to recode sample_ids to match samples-metadata
        tidyr::separate(
            col = sample_id, 
            into = c("sample_id_file", "sample_type_file"), 
            sep = "/"
        )
    
    # recode sample_ids and sample_types from the samples-metadata
    sample_to_patient_map <- samples %>%
        select(patient_id, sample_id) %>%
        filter(str_detect(sample_id, "T_X|M_(?!G)|B_(?!G)")) %$%
        set_names(patient_id, sample_id)
    
    sample_to_type_map <- samples %>%
        select(sample_id, sample_type) %>%
        filter(str_detect(sample_id, "T_X|M_(?!G)|B_(?!G)")) %$%
        set_names(sample_type, sample_id)
    
    mrna_ids_types <- mrna %>%
        magrittr::extract( , 1:2) %>% # much faster than dplyr::select (?)
        # convert to the new id format in samples-metadata
        # to make sample_ids unique across datasets
        mutate(sample_id = case_when(
            sample_type_file == "Blood" ~ str_c("B_", sample_id_file),
            sample_type_file == "Monocyte" ~ str_c("M_", sample_id_file),
            sample_type_file == "Artery" ~ str_c("T_", sample_id_file),
            TRUE ~ NA_character_
        )) %>%
        mutate(patient_id = recode(sample_id, !!!sample_to_patient_map)) %>%
        mutate(sample_type = recode(sample_id, !!!sample_to_type_map)) %>%
        select(patient_id, sample_id, sample_type)
    
    mrna <- mrna %>%
        magrittr::extract( , -(1:2)) %>% # much faster than dplyr::select
        bind_cols(mrna_ids_types, .)
    
    # add empty rows for missing samples and patients
    # need to transpose with t() to make this fast again
    mrna_colnames <- colnames(mrna)
    
    missing_patients <- samples %>%
        pull(patient_id) %>%
        setdiff(mrna_ids_types$patient_id) %>%
        enframe(name = NULL, value = "patient_id") %>%
        t() %>%
        rbind(matrix(nrow = length(mrna_colnames) - 1, ncol = ncol(.)))
    
    mrna <- mrna %>%
        t() %>%
        cbind(missing_patients) %>%
        t() %>%
        as_tibble(.name_repair = ~ mrna_colnames) %>%
        arrange(patient_id, sample_id)
    
    # convert probe class to numeric
    probe_names <- magrittr::extract(colnames(mrna), -(1:3))
    mrna_probes <- mrna %>%
        magrittr::extract( , -(1:3)) %>%
        t() %>%
        as_tibble(.name_repair = "unique") %>%
        mutate_all(as.numeric) %>%
        t() %>%
        as_tibble(.name_repair = ~ probe_names)
    
    # recombine probes with ids and sample type
    mrna <- mrna %>%
        magrittr::extract(1:3) %>%
        bind_cols(mrna_probes)
    
    return(mrna)
    
}

#' Read and clean TVS artery mRNA data file for analysis in R.
#' @param artery_file A full path to the TVS artery mRNA data file.
#' @param samples_file A full path to the TVS samples metadata file.
#' @details The samples metadata file is used to convert the original sample_id identifiers  
#' to the new, within-project, id system. The samples-file also contains more precise sample_type information.
#'
#' @return A dataset object of class tbl_df/data.frame.
#'
#' @importFrom magrittr "%>%"
#' @importFrom magrittr "%$%"
#' @export

import_artery_mrna <- function(artery_file = "tvs_plaque_gwe_loess_copy.csv",
                               samples_file = "samples.csv") {
    
    samples <- samples_file %>%
        read_csv(col_types = cols(.default = col_character())) %>%
        select(patient_id, sample_id, sample_type)
    
    mrna <- artery_file %>%
        readr::read_csv(progress = FALSE, col_types = cols(.default = col_character())) %>%
        dplyr::select(2:95) %>%
        rename_at(vars(-PROBE_ID), function(x) stringr::str_c(x, "/Artery"))
    # transpose, fix column names
    mrna <- mrna %>%
        dplyr::select(-PROBE_ID) %>%
        t() %>%
        tibble::as_tibble(
            rownames = "sample_id", 
            .name_repair = ~ stringr::str_to_lower(mrna$PROBE_ID)
        ) %>% 
        # sample_type from the file is used to recode sample_ids to match samples-metadata
        tidyr::separate(
            col = sample_id, 
            into = c("sample_id_file", "sample_type_file"), 
            sep = "/"
        )
    
    # recode sample_ids and sample_types from the samples-metadata
    sample_to_patient_map <- samples %>%
        select(patient_id, sample_id) %>%
        filter(str_detect(sample_id, "T_X|M_(?!G)|B_(?!G)")) %$%
        set_names(patient_id, sample_id)
    
    sample_to_type_map <- samples %>%
        select(sample_id, sample_type) %>%
        filter(str_detect(sample_id, "T_X|M_(?!G)|B_(?!G)")) %$%
        set_names(sample_type, sample_id)
    
    mrna_ids_types <- mrna %>%
        magrittr::extract( , 1:2) %>% # much faster than dplyr::select (?)
        # convert to the new id format in samples-metadata
        # to make sample_ids unique across datasets
        mutate(sample_id = case_when(
            sample_type_file == "Blood" ~ str_c("B_", sample_id_file),
            sample_type_file == "Monocyte" ~ str_c("M_", sample_id_file),
            sample_type_file == "Artery" ~ str_c("T_", sample_id_file),
            TRUE ~ NA_character_
        )) %>%
        mutate(patient_id = recode(sample_id, !!!sample_to_patient_map)) %>%
        mutate(sample_type = recode(sample_id, !!!sample_to_type_map)) %>%
        select(patient_id, sample_id, sample_type)
    
    mrna <- mrna %>%
        magrittr::extract( , -(1:2)) %>% # much faster than dplyr::select
        bind_cols(mrna_ids_types, .)
    
    # add empty rows for missing samples and patients
    # need to transpose with t() to make this fast again
    mrna_colnames <- colnames(mrna)
    
    missing_patients <- samples %>%
        pull(patient_id) %>%
        setdiff(mrna_ids_types$patient_id) %>%
        enframe(name = NULL, value = "patient_id") %>%
        t() %>%
        rbind(matrix(nrow = length(mrna_colnames) - 1, ncol = ncol(.)))
    
    mrna <- mrna %>%
        t() %>%
        cbind(missing_patients) %>%
        t() %>%
        as_tibble(.name_repair = ~ mrna_colnames) %>%
        arrange(patient_id, sample_id)
    
    # convert probe class to numeric
    probe_names <- magrittr::extract(colnames(mrna), -(1:3))
    mrna_probes <- mrna %>%
        magrittr::extract( , -(1:3)) %>%
        t() %>%
        as_tibble(.name_repair = "unique") %>%
        mutate_all(as.numeric) %>%
        t() %>%
        as_tibble(.name_repair = ~ probe_names)
    
    # recombine probes with ids and sample type
    mrna <- mrna %>%
        magrittr::extract(1:3) %>%
        bind_cols(mrna_probes)
    
    return(mrna)
    
}

#' Read and clean TVS mRNA data files for analysis in R.
#'
#' @param blood_file A full path to the TVS blood mRNA data file.
#' @param monocyte_file A full path to the TVS monocyte mRNA data file.
#' @param artery_file A full path to the TVS artery mRNA data file.
#' @param samples_file A full path to the TVS samples metadata file.
#'
#' @details The "samples.csv" metadata-file is used to convert the original "sample_id" identifiers  
#' to the new, within-project, id-system. The "samples.csv"-file also contains more precise "sample_type" information.
#'
#' @return A dataset object of class "tbl_df" (tibble) and "data.frame" (base).
#'
#' @importFrom magrittr "%>%"
#' @importFrom magrittr "%$%"
#' @export

import_mrna <- function(blood_file = "tvs_blood_gwe_loess.txt",
                        monocyte_file = "tvs_monocyte_gwe_loess.txt",
                        artery_file = "tvs_plaque_gwe_loess_copy.csv",
                        samples_file = "samples.csv") {

    samples <- samples_file %>%
        read_csv(col_types = cols(.default = col_character())) %>%
        select(patient_id, sample_id, sample_type)
    
    # read the three mRNA datasets and
    # concatenate sample_type to sample_id (which are column names in the raw data)
    # this is an ugly hack to speed up the transposing step
    mrna_blood <- blood_file %>%
        readr::read_tsv(progress = FALSE, col_types = cols(.default = col_character())) %>%
        dplyr::select(2:99) %>%
        rename_at(vars(-PROBE_ID), function(x) stringr::str_c(x, "/Blood"))
    
    mrna_artery <- artery_file %>%
        readr::read_csv(progress = FALSE, col_types = cols(.default = col_character())) %>%
        dplyr::select(2:95) %>%
        rename_at(vars(-PROBE_ID), function(x) stringr::str_c(x, "/Artery"))
    
    mrna_mono <- monocyte_file %>%
        readr::read_tsv(progress = FALSE, col_types = cols(.default = col_character())) %>%
        dplyr::select(2:100) %>%
        rename_at(vars(-PROBE_ID), function(x) stringr::str_c(x, "/Monocyte"))
    
    # combine datasets by
    # listing unique probes and joining data sequentially by PROBE_ID
    mrna <- list(mrna_artery, mrna_blood, mrna_mono) %>%
        purrr::map(dplyr::pull, PROBE_ID) %>%
        unlist() %>%
        unique() %>%
        tibble::enframe(name = NULL, value = "PROBE_ID") %>%
        dplyr::left_join(mrna_blood, by = "PROBE_ID") %>%
        dplyr::left_join(mrna_artery, by = "PROBE_ID") %>%
        dplyr::left_join(mrna_mono, by = "PROBE_ID")
    
    # transpose, fix column names
    mrna <- mrna %>%
        dplyr::select(-PROBE_ID) %>%
        t() %>%
        tibble::as_tibble(
            rownames = "sample_id", 
            .name_repair = ~ stringr::str_to_lower(mrna$PROBE_ID)
        ) %>% 
        # sample_type from the file is used to recode sample_ids to match samples-metadata
        tidyr::separate(
            col = sample_id, 
            into = c("sample_id_file", "sample_type_file"), 
            sep = "/"
        )

    # recode sample_ids and sample_types from the samples-metadata
    sample_to_patient_map <- samples %>%
        select(patient_id, sample_id) %>%
        filter(str_detect(sample_id, "T_X|M_(?!G)|B_(?!G)")) %$%
        set_names(patient_id, sample_id)
    
    sample_to_type_map <- samples %>%
        select(sample_id, sample_type) %>%
        filter(str_detect(sample_id, "T_X|M_(?!G)|B_(?!G)")) %$%
        set_names(sample_type, sample_id)
    
    mrna_ids_types <- mrna %>%
        magrittr::extract( , 1:2) %>% # much faster than dplyr::select (?)
        # convert to the new id format in samples-metadata
        # to make sample_ids unique across datasets
        mutate(sample_id = case_when(
            sample_type_file == "Blood" ~ str_c("B_", sample_id_file),
            sample_type_file == "Monocyte" ~ str_c("M_", sample_id_file),
            sample_type_file == "Artery" ~ str_c("T_", sample_id_file),
            TRUE ~ NA_character_
        )) %>%
        mutate(patient_id = recode(sample_id, !!!sample_to_patient_map)) %>%
        mutate(sample_type = recode(sample_id, !!!sample_to_type_map)) %>%
        select(patient_id, sample_id, sample_type)
    
    mrna <- mrna %>%
        magrittr::extract( , -(1:2)) %>% # much faster than dplyr::select
        bind_cols(mrna_ids_types, .)
    
    # add empty rows for missing samples and patients
    # need to transpose with t() to make this fast again
    mrna_colnames <- colnames(mrna)
    
    missing_patients <- samples %>%
        pull(patient_id) %>%
        setdiff(mrna_ids_types$patient_id) %>%
        enframe(name = NULL, value = "patient_id") %>%
        t() %>%
        rbind(matrix(nrow = length(mrna_colnames) - 1, ncol = ncol(.)))
    
    mrna <- mrna %>%
        t() %>%
        cbind(missing_patients) %>%
        t() %>%
        as_tibble(.name_repair = ~ mrna_colnames) %>%
        arrange(patient_id, sample_id)
    
    # convert probe class to numeric
    probe_names <- magrittr::extract(colnames(mrna), -(1:3))
    mrna_probes <- mrna %>%
        magrittr::extract( , -(1:3)) %>%
        t() %>%
        as_tibble(.name_repair = "unique") %>%
        mutate_all(as.numeric) %>%
        t() %>%
        as_tibble(.name_repair = "unique") %>%
        magrittr::set_colnames(probe_names)
        
    # recombine probes with ids and sample type
    mrna <- mrna %>%
        magrittr::extract(1:3) %>%
        bind_cols(mrna_probes)
    
    return(mrna)
}

#' Read and clean TVS lipids data file for analysis.
#'
#' @param lipids_file A full path to the TVS lipids data file.
#' @param samples_file A full path to the TVS samples metadata file.
#'
#' @importFrom magrittr "%>%"
#' @export

import_lipids <- function(lipids_file = "TVS_data_171220_lipidomics.xlsx",
                          samples_file = "samples.csv") {

    lipids <- lipids_file %>%
        readxl::read_excel() %>%
        dplyr::rename(sample_id = ID, sample_type = MATRIX) %>%
        # transpose dataset
        tidyr::gather(
            "variable",
            "value",
            -sample_id,
            -sample_type
        ) %>%
        tidyr::spread(variable, value) %>%
        janitor::clean_names() %>%
        dplyr::select(
            sample_id,
            sample_type,
            dplyr::everything()
        ) %>%
        # fix one missing sample_type using the code (PL/SE) in sample id
        dplyr::mutate(sample_type = dplyr::if_else(
            condition = is.na(sample_type) & stringr::str_detect(sample_id, "PL"),
            true = "Plasma",
            false = sample_type
        ))
    
    # add all patient_ids (incl. missing) from the samples-metadata
    samples <- samples_file %>%
        read_csv(col_types = cols(.default = col_character())) %>%
        select(patient_id, sample_id) 
    
    lipids <- samples %>%
        right_join(lipids, by = "sample_id")
    
    lipids <- samples %>%
        # get unique patients
        select(patient_id) %>%
        filter(patient_id != "Multiple patients") %>%
        distinct() %>%
        # -------------------
        anti_join(lipids, by = "patient_id") %>%
        bind_rows(lipids) %>%
        arrange(sample_type, patient_id)
    
    return(lipids)

}

#' Read and clean TVS lipoprotein data file for analysis.
#'
#' @param lipoprotein_file A full path to the TVS lipoprotein data file.
#' @param samples_file A full path to the TVS samples metadata file.
#' @param format Whether output dataset should be "samples" (wide, one row per sample) or "particles" (long, no serum data, one row per particle type) formatted.
#' 
#' @return A clean lipoprotein dataset of class "spec_tbl_df", "tbl_df", tbl", "data.frame".
#'
#' @importFrom magrittr "%>%"
#' @export

import_lipoproteins <- function(lipoprotein_file = "TVS0001.1 - LIPO predictions.xls",
                                samples_file = "samples.csv",
                                format = "samples") {

    lipoproteins <- lipoprotein_file %>%
        readxl::read_excel(skip = 14, col_types = "text") %>%
        dplyr::slice(-1) %>%
        dplyr::rename(sample_id = ...1) %>%
        janitor::clean_names() %>%
        # fix renaming mistakes
        dplyr::rename(
            apo_b_to_apo_a1 = apo_bto_apo_a1,
            idl_c_efr = idl_c_e_fr,
            ldl_c_efr = ldl_c_e_fr,
            vldl_tg_efr = vldl_tg_e_fr
        ) %>%
        dplyr::mutate_at(dplyr::vars(-sample_id), list(as.numeric)) %>%
        # sample_type is known to be Serum
        dplyr::mutate(sample_type = "Serum") %>%
        # unique sample_ids of type SE02_
        dplyr::mutate(sample_id = stringr::str_c("SE02_", sample_id)) %>%
        # albumin os not lipoprotein
        # remove also derived/computed variables (not original data)
        select(-alb, -apo_b_to_apo_a1) %>%
        dplyr::select(
            sample_id, 
            sample_type, 
            serum_tg, 
            serum_c,
            apo_a1,
            apo_b,
            dplyr::everything()
        )
    
    # get patient_ids (incl. missing patients to make it explicit) from samples-metadata
    samples <- samples_file %>%
        read_csv(col_types = cols(.default = col_character())) %>%
        select(patient_id, sample_id)
    
    lipoproteins <- samples %>%
        right_join(lipoproteins, by = "sample_id")
    
    lipoproteins <- samples %>%
        select(patient_id) %>%
        filter(patient_id != "Multiple patients") %>%
        distinct() %>%
        anti_join(lipoproteins, by = "patient_id") %>%
        bind_rows(lipoproteins) %>%
        arrange(patient_id)
    
    if (format == "particles") {
        # lots of data in the column names -> long-formatted version too
        lipoproteins <- lipoproteins %>%
            gather("variable", "value", -(patient_id:sample_type)) %>%
            mutate(measure = case_when(
                str_detect(variable, "_tg$") ~ "Triglycerides",
                str_detect(variable, "_l$") ~ "Lipids",
                str_detect(variable, "_pl$") ~ "Phospholipids",
                str_detect(variable, "[a-z]_c$") ~ "Cholesterol",
                str_detect(variable, "[0-9]_c$") ~ "Cholesterol Lipido",
                str_detect(variable, "_fc$") ~ "Free cholesterol",
                str_detect(variable, "_ce$") ~ "Cholesterol esters",
                str_detect(variable, "_d$") ~ "Mean diameter",
                str_detect(variable, "_p$") ~ "Particles",
                str_detect(variable, "_efr$") ~ "Particles Lipido",
                str_detect(variable, "apo_a1") ~ "apo_a1_lipido",
                str_detect(variable, "apo_b$") ~ "apo_b_lipido"
            )) %>%
            mutate(particle_density = case_when(
                str_detect(variable, "vldl_") ~ "Very low",
                str_detect(variable, "idl") ~ "Intermediate",
                str_detect(variable, "ldl_") ~ "Low",
                str_detect(variable, "hdl_") ~ "High",
                str_detect(variable, "^hdl2_") ~ "High (sub 2)",
                str_detect(variable, "^hdl3_") ~ "High (sub 3)",
                TRUE ~ "Not applicable"
            )) %>%
            mutate(compartment = case_when(
                str_detect(variable, "^serum") ~ "Serum",
                particle_density == "Not applicable" ~ "Serum",
                TRUE ~ "Particle"
            )) %>%
            mutate(unit = case_when(
                particle_density == "Not applicable" ~ "Molecule",
                TRUE ~ "Particle"
            )) %>%
            mutate(particle_size = case_when(
                str_detect(variable, "^xxl_") ~ "Chylo and extreme large",
                str_detect(variable, "^xl_") ~ "Very large",
                str_detect(variable, "^l_") ~ "Large",
                str_detect(variable, "^m_") ~ "Medium",
                str_detect(variable, "^s_") ~ "Small",
                str_detect(variable, "^xs_") ~ "Very small",
                compartment == "Particle" ~ "All",
                TRUE ~ "Not applicable"
            )) %>%
            select(-variable, -compartment) %>%
            # for spreading properly
            mutate(tmp = str_c(particle_size, particle_density)) %>%
            group_by(measure, tmp) %>%
            mutate(spreading_index = row_number()) %>%
            # ---------------------
            spread(measure, value) %>%
            ungroup() %>%
            janitor::clean_names() %>%
            select(-tmp, -spreading_index, -apo_a1_lipido, -apo_b_lipido) %>%
            select(patient_id, sample_id, sample_type, unit, everything()) %>%
            # only particle data; molecular serum concentrations 
            # can be found in the wide dataset
            filter(particle_density != "Not applicable") %>%
            arrange(patient_id, unit, particle_density)
        
        return(lipoproteins)
        
    } else if (format == "samples") {
        
        return(lipoproteins)
        
    } else {
        
        stop('argument *format* must be either "particles" or "samples"')
        
    }
    
}

#' Read and clean the TVS low molecular weight metabolites (LMWM) data file for analysis.
#'
#' @param lmwm_file A full path to the TVS low molecular weight metabolites (LMWM) data file.
#' @param samples_file A full path to the TVS samples metadata file.
#'
#' @importFrom magrittr "%>%"
#' @export

import_lmwm <- function(lmwm_file = "TVS0002.1 - LMWM predictions.xls",
                        samples_file = "samples.csv") {
    
    lmwm <- lmwm_file %>%
        readxl::read_excel(skip = 17, col_types = "text") %>%
        slice(-1) %>%
        dplyr::rename(sample_id = ...1) %>%
        janitor::clean_names() %>%
        dplyr::mutate_at(dplyr::vars(-sample_id), list(as.numeric)) %>%
        # sample_type is known to be Serum
        dplyr::mutate(sample_type = "Serum") %>%
        # unique sample_ids of type SE02_
        dplyr::mutate(sample_id = stringr::str_c("SE02_", sample_id)) %>%
        dplyr::select(
            sample_id, 
            sample_type, 
            dplyr::everything()
        )
    
    # get patient_ids (incl. missing patients to make it explicit) from samples-metadata
    samples <- samples_file %>%
        read_csv(col_types = cols(.default = col_character())) %>%
        select(patient_id, sample_id)
    
    lmwm <- samples %>%
        right_join(lmwm, by = "sample_id")
    
    lmwm <- samples %>%
        select(patient_id) %>%
        filter(patient_id != "Multiple patients") %>%
        distinct() %>%
        anti_join(lmwm, by = "patient_id") %>%
        bind_rows(lmwm) %>%
        arrange(patient_id)

    return(lmwm)
    
}

#' Read and clean TVS tissue/histology data from TVS database file for analysis.
#'
#' @param database_file A full path to the TVS database file.
#' @param unique_values_file A full path to the TVS unique values metadata file.
#' @param samples_file A full path to hte TVS samples metadata file.
#'
#' @importFrom magrittr "%>%"
#' @importFrom magrittr "%$%"
#' @export

import_histology <- function(database_file = "TVS 26.xlsx",
                             unique_values_file = "unique-values.csv",
                             samples_file = "samples.csv") {
    
    # create mappings from codes (e.g., 1) to values (e.g., Initial lesion (Type I))
    unique_values <- readr::read_csv(
            unique_values_file, 
            col_types = cols(.default = col_character())
        ) %>% 
        filter(name_snakecase %in% c("aha", "unstable", "sample_type"))
    
    sample_type_mapping <- unique_values %>%
        filter(name_snakecase == "sample_type") %$%
        set_names(value, code)
    aha_mapping <- unique_values %>%
        filter(name_snakecase == "aha") %$%
        set_names(value, code)
    stability_mapping <- unique_values %>%
        filter(name_snakecase == "unstable") %$%
        set_names(value, code)
    
    histo <- database_file %>%
        # read -----------
        readxl::read_excel() %>%
        janitor::clean_names() %>%
        dplyr::rename(sample_type = tissuetype) %>%
        dplyr::select(
            sample_id,
            sample_type,
            aha,
            unstable
        ) %>%
        # clean ----------
        dplyr::mutate_all(as.character) %>%
        # sample 1045 is actually just red blood cells according to 
        # the "forIH"/"for_ih" variable in the database/samples-metadata
        dplyr::mutate(sample_type = dplyr::case_when(
            # use code 999 for Red blood cells, next recode codes to values
            sample_id == "1045" ~ "999",
            TRUE ~ sample_type
        )) %>%
        dplyr::mutate(
            # recode
            sample_type = recode(sample_type, !!!sample_type_mapping),
            aha = recode(aha, !!!aha_mapping),
            unstable = recode(unstable, !!!stability_mapping),
            # convert to the new sample_id system
            sample_id = stringr::str_c("T_", sample_id)
        ) %>%
        # "Blood" samples are not part of this dataset, "Red blood cells" are an error
        filter(sample_type != "Blood")
    
    # get all patient_id (incl. missing) from the samples-metadata
    samples <- samples_file %>%
        read_csv(col_types = cols(.default = col_character())) %>%
        select(patient_id, sample_id)
    
    histo <- samples %>%
        right_join(histo, by = "sample_id")
    
    histo <- samples %>%
        filter(patient_id != "Multiple patients") %>%
        distinct(patient_id) %>%
        anti_join(histo, by = "patient_id") %>%
        bind_rows(histo) %>%
        arrange(patient_id)
    
    return(histo)
}

#' Read and clean TVS physiological data from the TVS database file for analysis.
#'
#' The included variables are: diastolic and systolic blood pressure, right and left ankle-brachial index, and ejection fraction.
#'
#' @param database_file A full path to the TVS database file.
#'
#' @importFrom magrittr "%>%"
#' @export

import_physiology <- function(database_file = "TVS 26.xlsx") {

    database_file %>%
        readxl::read_excel() %>%
        janitor::clean_names() %>%
        dplyr::select(
            sample_id,
            systolic,
            diastolic,
            abi_right,
            abi_left,
            ef
        ) %>%
        # create patient_id variable following the new id system
        mutate(patient_id = dplyr::case_when(
            str_detect(sample_id, "V") ~ str_remove(sample_id, "V"),
            TRUE ~ sample_id
        )) %>%
        mutate(patient_id = dplyr::case_when(
            patient_id == "2033" | patient_id == "3039" ~ "2033_3039",
            TRUE ~ patient_id
        )) %>%
        distinct(patient_id, .keep_all = TRUE) %>%
        select(-sample_id) %>%
        # add missing patient_ids too
        bind_rows(tibble::tibble(patient_id = c("A0332", "B0550"))) %>%
        select(patient_id, everything()) %>%
        arrange(patient_id) -> physiology
    
    return(physiology)
}

#' Read and clean TVS macroanatomical data from the TVS database file for analysis.
#'
#' The included variable are: age, sex, weight, height, stenosis in the sampled artery, 
#' and coronary angiography results.
#'
#' @param database_file A full path to the TVS database file.
#' @param unique_values_file A full path to the TVS unique values metadata file.
#'
#' @importFrom magrittr "%>%"
#' @export

import_macro <- function(database_file = "TVS 26.xlsx",
                         unique_values_file = "unique-values.csv") {

    database_file %>%
        readxl::read_excel() %>%
        janitor::clean_names() %>%
        dplyr::select(
            sample_id,
            age,
            sex,
            weight,
            height,
            stenosis_sampletissue,
            ang_heart
        ) %>%
        dplyr::mutate(
            ang_heart = as.character(ang_heart),
            sex = as.character(sex)
        ) %>%
        # create patient_id variable following the new id system
        mutate(patient_id = dplyr::case_when(
            str_detect(sample_id, "V") ~ str_remove(sample_id, "V"),
            TRUE ~ sample_id
        )) %>%
        mutate(patient_id = dplyr::case_when(
            patient_id == "2033" | patient_id == "3039" ~ "2033_3039",
            TRUE ~ patient_id
        )) %>%
        distinct(patient_id, .keep_all = TRUE) %>%
        select(-sample_id) %>%
        # add missing patient_ids too
        bind_rows(tibble::tibble(patient_id = c("A0332", "B0550"))) -> macro
    
    # recode codes (f.ex. 2) to values (f.ex. female)
    recodeable_variables <- c("sex", "ang_heart")
    
    readr::read_csv(unique_values_file, col_types = cols(.default = col_character())) %>%
        filter(name_snakecase %in% recodeable_variables) -> unique_values
    
    unique_values %>%
        filter(name_snakecase == "sex") %$%
        set_names(value, code) -> sex_mapping
    
    unique_values %>%
        filter(name_snakecase == "ang_heart") %$%
        set_names(value, code) -> ang_heart_mapping
    
    macro %>%
        mutate(
            sex = recode(sex, !!!sex_mapping),
            ang_heart = recode(ang_heart, !!!ang_heart_mapping)
        ) %>%
        # represent true missing values with NA
        # (and non-applicability with "Not applicable")
        mutate(ang_heart = if_else(
            condition = ang_heart == "Not measured", 
            true = NA_character_, 
            false = ang_heart
        )) %>%
        # note: none of ascending aortas has stenosis information
        # and only few abdominal aortas has --> systematically missing. 
        mutate(stenosis_sampletissue = case_when(
            # LITA samples with identifiers starting with "4" have 0 % stenosis
            str_detect(patient_id, "^4") ~ "0",
            # other than arterial samples are not applicable; id starts with A or B
            str_detect(patient_id, "^A|^B") ~ "Not applicable",
            is.numeric(stenosis_sampletissue) ~ as.character(stenosis_sampletissue),
            TRUE ~ NA_character_
        )) %>%
        select(patient_id, everything()) %>%
        arrange(patient_id) -> macro
    
    return(macro)
}

#' Read and clean TVS exposure data from the TVS database file for analysis.
#'
#' @param database_file A full path to the TVS database file.
#' @param unique_values_file A full path to the TVS unique values metadata file.
#'
#' @importFrom magrittr "%>%"
#' @export

import_exposures <- function(database_file = "TVS 26.xlsx",
                             unique_values_file = "unique-values.csv") {

    exposures <- database_file %>%
        readxl::read_excel() %>%
        janitor::clean_names() %>%
        dplyr::select(sample_id, q_smoking:q_alcohol) %>%
        # clean values
        dplyr::mutate(q_when_quit = stringr::str_replace(q_when_quit, ",", ".")) %>%
        dplyr::mutate(q_when_quit = stringr::str_replace(q_when_quit, "^\\.$", NA_character_)) %>%
        dplyr::mutate(q_when_quit = as.numeric(q_when_quit)) %>%
        dplyr::mutate_at(dplyr::vars(-q_smoking_dur, -q_when_quit), as.character) %>%
        # create patient_id variable following the new id system
        mutate(patient_id = case_when(
            str_detect(sample_id, "V") ~ str_remove(sample_id, "V"),
            TRUE ~ sample_id
        )) %>%
        mutate(patient_id = case_when(
            patient_id == "2033" | patient_id == "3039" ~ "2033_3039",
            TRUE ~ patient_id
        )) %>%
        distinct(patient_id, .keep_all = TRUE) %>%
        select(-sample_id) %>%
        # add missing patient_ids too
        bind_rows(tibble::tibble(patient_id = c("A0332", "B0550")))
    
    # recode codes to values
    recodeable_variables <- exposures %>%
        select(-patient_id) %>%
        colnames()
    
    unique_values <- readr::read_csv(
        unique_values_file, 
        col_types = cols(.default = col_character())
        ) %>%
        filter(name_snakecase %in% recodeable_variables)
    
    q_smoking_mapping <- unique_values %>%
        filter(name_snakecase == "q_smoking") %$%
        set_names(value, code)
    q_smoking_quit_mapping <- unique_values %>%
        filter(name_snakecase == "q_smoking_quit") %$%
        set_names(value, code)
    q_smoking_pipe_mapping <- unique_values %>%
        filter(name_snakecase == "q_smoking_pipe") %$%
        set_names(value, code)
    q_smoking_cigarettes_mapping <- unique_values %>%
        filter(name_snakecase == "q_smoking_cigarettes") %$%
        set_names(value, code)
    q_alcohol_mapping <- unique_values %>%
        filter(name_snakecase == "q_alcohol") %$%
        set_names(value, code)
    
    # fix missing values
    exposures <- exposures %>%
        mutate(
            q_smoking = recode(q_smoking, !!!q_smoking_mapping),
            q_smoking_quit = recode(q_smoking_quit, !!!q_smoking_quit_mapping),
            q_smoking_pipe = recode(q_smoking_pipe, !!!q_smoking_pipe_mapping),
            q_smoking_cigarettes = recode(q_smoking_cigarettes, !!!q_smoking_cigarettes_mapping),
            q_alcohol = recode(q_alcohol, !!!q_alcohol_mapping)
        ) %>%
        # change representation of missing values
        mutate(q_smoking = case_when(
            q_smoking == "Smoked/Smoking" & q_smoking_quit == "Quit" ~ "Smoked",
            q_smoking == "Smoked/Smoking" & is.na(q_smoking_quit) ~ "Smoking",
            q_smoking == "Never" ~ "Never",
            TRUE ~ NA_character_
        )) %>%
        mutate(q_smoking_quit = case_when(
            q_smoking == "Smoking" ~ "Current",
            q_smoking == "Never" ~ "Not applicable",
            TRUE ~ q_smoking_quit
        )) %>%
        mutate(q_when_quit = case_when(
            q_smoking_quit == "Current" | q_smoking_quit == "Not applicable" ~ "Not applicable",
            q_smoking_quit == "Quit" ~ as.character(q_when_quit),
            TRUE ~ NA_character_
        )) %>%
        mutate(q_smoking_cigarettes = case_when(
            q_smoking == "Never" ~ "0 / day",
            TRUE ~ q_smoking_cigarettes
        )) %>%
        mutate(q_smoking_dur = case_when(
            q_smoking == "Never" ~ 0,
            TRUE ~ q_smoking_dur
        )) %>%
        mutate(q_smoking_pipe = case_when(
            q_smoking == "Never" ~ "No",
            TRUE ~ q_smoking_pipe
        )) %>%
        select(patient_id, everything()) %>%
        arrange(patient_id)
        

    return(exposures)
}

#' Read and clean TVS clinical chemistry variables from database file for analysis.
#'
#' @param database_file A full path to the TVS database file.
#'
#' @importFrom magrittr "%>%"
#' @export

import_clinical_chemistry <- function(database_file = "TVS 26.xlsx") {
    
    clin_labs <- database_file %>%
        readxl::read_excel() %>%
        # clean names
        janitor::clean_names() %>%
        dplyr::rename(
            l_ldlmax = l_ld_lmax,
            l_hdlmax = l_hd_lmax,
            b_crpmin = b_cr_pmin
        ) %>%
        dplyr::select(sample_id, l_totalcholmax:b_kreamin) %>%
        # clean values
        dplyr::mutate(l_totalcholmax = as.numeric(
            stringr::str_replace(l_totalcholmax, ",", ".")
        )) %>%
        # create patient_id variable following the new id system
        mutate(patient_id = case_when(
            str_detect(sample_id, "V") ~ str_remove(sample_id, "V"),
            TRUE ~ sample_id
        )) %>%
        mutate(patient_id = case_when(
            patient_id == "2033" | patient_id == "3039" ~ "2033_3039",
            TRUE ~ patient_id
        )) %>%
        distinct(patient_id, .keep_all = TRUE) %>%
        select(-sample_id) %>%
        # add missing patient_ids too
        bind_rows(tibble::tibble(patient_id = c("A0332", "B0550"))) %>%
        select(patient_id, everything()) %>%
        arrange(patient_id)
    
    return(clin_labs)

}

#' Read and clean TVS diagnosis data from database file for analysis.
#'
#' @importFrom magrittr "%>%"
#' @export

import_diagnoses <- function(database_file = "TVS 26.xlsx",
                             unique_values_file = "unique-values.csv") {
    
    diagnoses <- database_file %>%
        readxl::read_excel() %>%
        janitor::clean_names() %>%
        dplyr::select(sample_id, cad:pad, d_dm:d_dementia) %>%
        dplyr::mutate_all(as.character)
    
    # ids
    diagnoses <- diagnoses %>%
        mutate(patient_id = case_when(
            str_detect(sample_id, "V") ~ str_remove(sample_id, "V"),
            TRUE ~ sample_id
        )) %>%
        mutate(patient_id = case_when(
            patient_id == "2033" | patient_id == "3039" ~ "2033_3039",
            TRUE ~ patient_id
        )) %>%
        distinct(patient_id, .keep_all = TRUE) %>%
        select(-sample_id) %>%
        bind_rows(tibble::tibble(patient_id = c("A0332", "B0550"))) %>%
        arrange(patient_id)
    
    # recode values
    recodeable_variables <- diagnoses %>%
        select(-patient_id) %>%
        colnames()
    
    unique_values <- readr::read_csv(
        unique_values_file, 
        col_types = cols(.default = col_character())
        ) %>%
        filter(name_snakecase %in% recodeable_variables) %>%
        select(name_snakecase, code, value) %>%
        group_by(name_snakecase) %>%
        nest() %>%
        mutate(mapping = map(data, function(x) { set_names(x$value, x$code) }))
    
    recode_with <- function(var, mapping) {
        recode(var, !!!mapping)
    }
    
    diagnoses_recoded <- diagnoses %>%
        select(unique_values$name_snakecase) %>%
        map2_dfr(unique_values$mapping, recode_with)
    
    diagnoses_not_recoded <- diagnoses[ ,!(colnames(diagnoses) %in% unique(unique_values$name_snakecase))]
    
    diagnoses <- diagnoses_recoded %>%
        bind_cols(diagnoses_not_recoded) %>%
        select(patient_id, everything()) %>%
        # mutate(mi_nbr = as.numeric(mi_nbr)) %>%
        arrange(patient_id)
    
    return(diagnoses)
}

#' Read and clean TVS symptoms data from TVS database file for analysis.
#'
#' @param database_file A full path to the TVS database file.
#' @param unique_values_file A full path to the TVS unique values metadata file.
#'
#' @importFrom magrittr "%>%"
#' @export

import_symptoms <- function(database_file = "TVS 26.xlsx",
                            unique_values_file = "unique-values.csv") {

    symptoms <- database_file %>%
        readxl::read_excel() %>%
        janitor::clean_names() %>%
        dplyr::select(sample_id, nyha, ccs) %>%
        dplyr::mutate_all(as.character)
    
    # create patient_id variable following the new id system
    symptoms <- symptoms %>%
        mutate(patient_id = case_when(
            str_detect(sample_id, "V") ~ str_remove(sample_id, "V"),
            TRUE ~ sample_id
        )) %>%
        mutate(patient_id = case_when(
            patient_id == "2033" | patient_id == "3039" ~ "2033_3039",
            TRUE ~ patient_id
        )) %>%
        distinct(patient_id, .keep_all = TRUE) %>%
        select(-sample_id)
    
    # recode codes to values
    recodeable_variables <- symptoms %>%
        select(-patient_id) %>%
        colnames()
    
    unique_values <- readr::read_csv(
        unique_values_file, 
        col_types = cols(.default = col_character())
        ) %>%
        filter(name_snakecase %in% recodeable_variables)
    
    nyha_mapping <- unique_values %>%
        filter(name_snakecase == "nyha") %$%
        set_names(value, code)
    ccs_mapping <- unique_values %>%
        filter(name_snakecase == "ccs") %$%
        set_names(value, code)
    
    symptoms <- symptoms %>%
        mutate(
            nyha = recode(nyha, !!!nyha_mapping),
            ccs = recode(ccs, !!!ccs_mapping)
        ) %>%
        select(patient_id, everything())
    
    # add empty rows for two missing patient_ids
    symptoms <- symptoms %>%
        bind_rows(tibble::tibble(patient_id = c("A0332", "B0550"))) %>%
        arrange(patient_id)
    
    return(symptoms)
}

#' Read and clean TVS death/survival data from the TVS database file for analysis.
#'
#' @param database_file A full path to the TVS database file.
#' @param samples_file A full path to hte TVS samples metadata file.
#' @param format The format of the output dataset, either "patients" (wide, one row per patient) or "events" (one row per event).
#'
#' @importFrom magrittr "%>%"
#' @importFrom magrittr "%$%"
#' @export

import_deaths <- function(database_file = "TVS 26.xlsx",
                          samples_file = "samples.csv",
                          format = "patients") {

    id_mapping <- samples_file %>%
        read_csv(col_types = cols(.default = col_character())) %>%
        select(sample_id, patient_id)

    deaths <- database_file %>%
        readxl::read_excel() %>%
        janitor::clean_names() %>%
        # sample_type is needed to recode ids
        dplyr::rename(sample_type = tissuetype) %>%
        dplyr::select(sample_id, sample_type, death_date:f_follow_up_time11) %>%
        # clean dates
        dplyr::mutate(death_date = as.numeric(death_date)) %>%
        dplyr::mutate(death_date = janitor::excel_numeric_to_date(
            death_date,
            date_system = "modern"
        )) %>%
        dplyr::mutate_at(dplyr::vars(-dplyr::contains("time")), as.character) %>%
        dplyr::mutate(death_date = dplyr::case_when(
            # actual missingness
            is.na(death_date) & is.na(f_follow_up_time11) ~ NA_character_,
            # if there is no `death_date` but
            # follow up time exists, patient is alive.
            is.na(death_date) & !is.na(f_follow_up_time11) ~ "Not applicable",
            TRUE ~ death_date
        )) %>%
        dplyr::mutate(
            # derive cause of death variable from indicators
            cause_of_death = dplyr::case_when(
                # SCD
                (f_death_scd09 == "1" | f_death_scd11 == "1") ~ "Sudden cardiac death",
                # other CV
                (f_death_cv09 == "1" | f_death_cv11 == "1") &
                    !(f_death_scd09 == "1" | f_death_scd11 == "1") ~ "Other cardiovascular",
                # non-CV
                (f_death_tot09 == "1" | f_death_tot11 == "1") &
                    !(f_death_cv09 == "1" | f_death_cv11 == "1") ~ "Non-cardiovascular",
                death_date == "Not applicable" ~ "Not applicable",
                # unknown
                TRUE ~ NA_character_
            ),
            # add variable for the latest date of measurement
            # assumption: use 2011-06-15 as the reported "June 2011"
            death_last_measurement = dplyr::case_when(

                !is.na(f_death_tot11) ~ lubridate::ymd("2011-06-15"),
                is.na(f_death_tot11) ~ lubridate::as_date(NA)
            ),
            # derive variable for start of follow up
            # approximation: 30.42 days in a month by average
            follow_up_start = dplyr::case_when(

                !is.na(death_last_measurement) ~
                    death_last_measurement -
                    lubridate::days(as.integer(round(f_follow_up_time11 * 30.42, 0))),

                is.na(death_last_measurement) ~ lubridate::as_date(NA)
            )
        ) %>%
        # drop redundant variables
        dplyr::select(-(f_death_tot09:f_follow_up_time11)) %>%
        dplyr::mutate_all(as.character) %>%
        # derive logical death_status
        dplyr::mutate(death_status_last = dplyr::case_when(
            death_date == "Not applicable" ~ "Alive",
            is.na(death_date) ~ NA_character_,
            TRUE ~ "Dead"
        )) %>%
        # derive numeric follow up duration
        dplyr::mutate(follow_up_days = dplyr::case_when(
            !is.na(death_date) & death_date != "Not applicable" ~ 
                lubridate::ymd(death_date) - lubridate::ymd(follow_up_start),
            TRUE ~ lubridate::ymd(death_last_measurement) - lubridate::ymd(follow_up_start)
        )) %>%
        dplyr::mutate(follow_up_days = as.numeric(follow_up_days))

    deaths <- deaths %>%    
        # use new id system; now only patient_id needed
        mutate(sample_id = case_when(
            !(sample_type %in% c("8", "9")) ~ stringr::str_c("T_", sample_id),
            TRUE ~ sample_id
        )) %>%
        left_join(id_mapping, by = "sample_id") %>%
        # the rest of sample_ids are patient_ids
        mutate(patient_id = if_else(is.na(patient_id), sample_id, patient_id)) %>%
        select(
            patient_id, 
            follow_up_start, 
            death_last_measurement,
            death_date,
            death_status_last,
            follow_up_days,
            cause_of_death
        ) %>%
        distinct(patient_id, .keep_all = TRUE)
    
    deaths <- samples_file %>%
        read_csv(col_types = cols(.default = col_character())) %>%
        distinct(patient_id) %>%
        filter(patient_id != "Multiple patients") %>%
        anti_join(deaths, by = "patient_id") %>%
        bind_rows(deaths) %>%
        arrange(patient_id)
    
    if (format == "patients") {
        return(deaths)    
    } else if (format == "events") {
        deaths %>%
            dplyr::select(-death_status_last, -follow_up_days) %>%
            tidyr::gather(event, date, -patient_id, -cause_of_death) %>%
            dplyr::select(patient_id, event, date, cause_of_death) %>%
            dplyr::arrange(patient_id)
    } else {
        stop('parameter *format* needs to be either "patients" or "events".')
    }
    
}

#' Read and clean TVS treatment data from database file for analysis.
#'
#' @param database_file A full path to the TVS database file.
#' @param unique_values_file A full to the TVS unique values metadata file.
#'
#' @importFrom magrittr "%>%"
#' @importFrom magrittr "%$%"
#' @export

import_treatments <- function(database_file = "TVS 26.xlsx",
                              unique_values_file = "unique_values.csv") {
    
    treatments <- database_file %>%
        readxl::read_excel() %>%
        janitor::clean_names() %>%
        dplyr::rename(m_bp_med = m_b_pmed) %>%
        dplyr::select(sample_id, m_hormone_rt:cabg) %>%
        dplyr::mutate_all(as.character)
    
    # create patient_id variable following the new id system
    treatments <- treatments %>%
        mutate(patient_id = case_when(
            str_detect(sample_id, "V") ~ str_remove(sample_id, "V"),
            TRUE ~ sample_id
        )) %>%
        mutate(patient_id = case_when(
            patient_id == "2033" | patient_id == "3039" ~ "2033_3039",
            TRUE ~ patient_id
        )) %>%
        distinct(patient_id, .keep_all = TRUE) %>%
        select(-sample_id)

    # recode codes to values
    recodeable_variables <- treatments %>%
        select(-patient_id) %>%
        colnames()
    
    unique_values <- readr::read_csv(
        unique_values_file, 
        col_types = cols(.default = col_character())
        ) %>%
        filter(name_snakecase %in% recodeable_variables)
    
    m_hormone_rt_mapping <- unique_values %>%
        filter(name_snakecase == "m_hormone_rt") %$%
        set_names(value, code)
    m_diabmed_mapping <- unique_values %>%
        filter(name_snakecase == "m_diabmed") %$%
        set_names(value, code)
    m_statins_mapping <- unique_values %>%
        filter(name_snakecase == "m_statins") %$%
        set_names(value, code)
    m_statin_agent_mapping <- unique_values %>%
        filter(name_snakecase == "m_statin_agent") %$%
        set_names(value, code)
    m_bp_med_mapping <- unique_values %>%
        filter(name_snakecase == "m_bp_med") %$%
        set_names(value, code)
    pcta_mapping <- unique_values %>%
        filter(name_snakecase == "pcta") %$%
        set_names(value, code)
    cabg_mapping <- unique_values %>%
        filter(name_snakecase == "cabg") %$%
        set_names(value, code)
    
    treatments <- treatments %>%
        mutate(
            m_hormone_rt = recode(m_hormone_rt, !!!m_hormone_rt_mapping),
            m_diabmed = recode(m_diabmed, !!!m_diabmed_mapping),
            m_statins = recode(m_statins, !!!m_statins_mapping),
            m_statin_agent = recode(m_statin_agent, !!!m_statin_agent_mapping),
            m_bp_med = recode(m_bp_med, !!!m_bp_med_mapping),
            pcta = recode(pcta, !!!pcta_mapping),
            cabg = recode(cabg, !!!cabg_mapping)
        ) %>%
        select(patient_id, everything())
    
    # add empty rows for two missing patient_ids
    treatments <- treatments %>%
        bind_rows(data_frame(patient_id = c("A0332", "B0550"))) %>%
        arrange(patient_id)
    
    # fix other missing values
    treatments <- treatments %>%
        mutate(m_statin_agent = case_when(
            m_statins == "No" ~ "No",
            m_statins == "Yes" ~ m_statin_agent,
            TRUE ~ NA_character_
        ))
        # TODO: assume missing value in m_diabmed means actually missing?
    
    return(treatments)
}

#' Import TVS patients metadata file.
#'
#' A tiny wrapper for reading the patients metadata file,
#' as part of a more consistent set of import functions.
#'
#' @param patient_file A full to the TVS patients metadata file.
#' @importFrom magrittr "%>%"
#' @export

import_patients <- function(patients_file = "patients.csv") {
    readr::read_csv(
        patients_file,
        col_types = cols(.default = col_character())
    )
}

#' Import TVS variables metadata file.
#'
#' A tiny wrapper for reading the variables metadata file,
#' as part of a more consistent set of import functions.
#'
#' @param patient_file A full to the TVS variables metadata file.
#' @importFrom magrittr "%>%"
#' @export

import_variables <- function(variables_file = "variables.csv") {
    readr::read_csv(
        variables_file,
        col_types = cols(.default = col_character())
    )
}

#' Import TVS samples metadata file.
#'
#' A tiny wrapper for reading the samples metadata file,
#' as part of a more consistent set of import functions.
#'
#' @param samples_file A full to the TVS samples metadata file.
#' @importFrom magrittr "%>%"
#' @export

import_samples <- function(samples_file = "samples.csv") {
    readr::read_csv(
        samples_file,
        col_types = cols(.default = col_character())
    )
}

#' Import TVS unique values metadata file.
#'
#' A tiny wrapper for reading the unique values metadata file,
#' as part of a more consistent set of import functions.
#'
#' @param unique_values_file A full to the TVS unique values metadata file.
#' @importFrom magrittr "%>%"
#' @export

import_unique_values <- function(unique_values_file = "unique-values.csv") {
    readr::read_csv(
        unique_values_file,
        col_types = cols(.default = col_character())
    )
}

#' Read and clean TVS datasets from data and metadata directory.
#'
#' This function is a simple wrapper for running multiple TVS import-functions. 
#' Filenames are currently hardcoded and must be the original ones. 
#' Currently the following TVS datasets are NOT included: DNA, TaqMan DNA, bacteria.
#'
#' @param data_directory A full path to a directory that contains all raw/original TVS datasets.
#' @param metadata_directory A full path to a directory that contains all TVS metadata-files.
#' 
#' @return Returns nothing. Instead assigns datasets to objects in the global environment.
#' 
#' @export

import_objects <- function(data_directory, 
                           metadata_directory) {
    
    # set working directory temporarily to the input
    original_wd <- getwd()
    setwd(data_directory)
    
    samples_file <- stringr::str_c(metadata_directory, "samples.csv", sep = "/")
    unique_values_file <- stringr::str_c(metadata_directory, "unique-values.csv", sep = "/")
    
    # NOTE
    # the double arrow <<- primitive assigns objects to global environment, 
    # not just to the scope of this function
    
    # -------------- data ----------------
    lipids <<- tvsproject::import_lipids(
        lipids_file = "TVS_data_171220_lipidomics.xlsx",
        samples_file = samples_file
    )
    
    mrna_blood_probes <<- tvsproject::import_blood_mrna(
        samples_file = samples_file,
        blood_file = "tvs_blood_gwe_loess.txt"
    )
    
    mrna_mono_probes <<- tvsproject::import_monocyte_mrna(
        samples_file = samples_file,
        monocyte_file = "tvs_monocyte_gwe_loess.txt"
    )
    
    mrna_artery_probes <<- tvsproject::import_artery_mrna(
        samples_file = samples_file,
        artery_file = "tvs_plaque_gwe_loess_copy.csv"
    )
    
    mrna_all_probes <<- tvsproject::import_mrna(
        blood_file = "tvs_blood_gwe_loess.txt",
        artery_file = "tvs_plaque_gwe_loess_copy.csv",
        monocyte_file = "tvs_monocyte_gwe_loess.txt",
        samples_file = samples_file
    )
    
    lipopro_samples <<- tvsproject::import_lipoproteins(
        lipoprotein_file = "TVS0001.1 - LIPO predictions.xls",
        samples_file = samples_file,
        format = "samples"
    )
    
    lipopro_particles <<- tvsproject::import_lipoproteins(
        lipoprotein_file = "TVS0001.1 - LIPO predictions.xls",
        samples_file = samples_file,
        format = "particles"
    )
    
    lmwm <<- tvsproject::import_lmwm(
        lmwm_file = "TVS0002.1 - LMWM predictions.xls",
        samples_file = samples_file
    )
    
    histo <<- tvsproject::import_histology(
        database_file = "TVS 26.xlsx",
        unique_values_file = unique_values_file,
        samples_file = samples_file
    )
    
    deaths_patients <<- tvsproject::import_deaths(
        database_file = "TVS 26.xlsx",
        samples_file = samples_file,
        format = "patients"
    )
    
    deaths_events <<- tvsproject::import_deaths(
        database_file = "TVS 26.xlsx",
        samples_file = samples_file,
        format = "events"
    )
    
    diagnoses <<- tvsproject::import_diagnoses(
        database_file = "TVS 26.xlsx",
        unique_values_file = unique_values_file
    )
    
    treatments <<- tvsproject::import_treatments(
        database_file = "TVS 26.xlsx",
        unique_values_file = unique_values_file
    )
    
    symptoms <<- tvsproject::import_symptoms(
        database_file = "TVS 26.xlsx",
        unique_values_file = unique_values_file
    )
    
    clinchem <<- tvsproject::import_clinical_chemistry(database_file = "TVS 26.xlsx")
    
    exposures <<- tvsproject::import_exposures(
        database_file = "TVS 26.xlsx",
        unique_values_file = unique_values_file
    )
    
    macro <<- tvsproject::import_macro(
        database_file = "TVS 26.xlsx",
        unique_values_file = unique_values_file
    )
    
    physiology <<- tvsproject::import_physiology("TVS 26.xlsx")
    
    # ---------- metadata --------------
    setwd(metadata_directory)
    
    samples <<- tvsproject::import_samples("samples.csv")
    unique_values <<- tvsproject::import_unique_values("unique-values.csv")
    patients <<- tvsproject::import_patients("patients.csv")
    variables <<- tvsproject::import_variables("variables.csv")
    
    setwd(original_wd)
    
    # ----------- classify variables ----------------
    mrna_all_genes <<- tvsproject::classify_mrna_probes(
        mrna_all_probes, 
        variables, 
        summary_function = "max"
    )
    
    mrna_artery_genes <<- mrna_all_genes %>%
        filter(!(sample_type %in% c("Monocyte", "Blood"))) %>%
        bind_rows(mrna_artery_probes %>% magrittr::extract( , 1:2) %>% filter(is.na(sample_id)))
    
    mrna_blood_genes <<- mrna_all_genes %>%
        filter(sample_type == "Blood") %>%
        bind_rows(mrna_blood_probes %>% magrittr::extract( , 1:2) %>% filter(is.na(sample_id)))
    
    mrna_monocyte_genes <<- mrna_all_genes %>%
        filter(sample_type == "Monocyte") %>%
        bind_rows(mrna_mono_probes %>% magrittr::extract( , 1:2) %>% filter(is.na(sample_id)))
    
    lipids_classes <<- tvsproject::classify_lipids(
        lipids, 
        variables, 
        summary_function = "sum"
    )
    
}

#' Import TVS objects and save them in a cache file.
#' 
#' @param data_directory A full path to a directory that contains all raw/original TVS datasets.
#' @param metadata_directory A full path to a directory that contains all TVS metadata-files.
#' @param cache_directory A full path to the output directory.
#' 
#' @return Returns nothing, saves an RData-file to the given cache directory instead.
#' 
#' @export

generate_cache <- function(data_directory, 
                           metadata_directory, 
                           cache_directory) {
    
    # individual objects
    tvsproject::import_objects(data_directory, metadata_directory)
    
    cache_file_name <- stringr::str_c("tvs-objects-", lubridate::today(), ".RData")
    save.image(file = stringr::str_c(cache_directory, cache_file_name))
    
    # combine data objects into one and save another file
    tvs <- tvsproject::combine_data_objects()
    
    cache_file_name <- stringr::str_c("tvs-dataset-", lubridate::today(), ".RData")
    save(tvs, file = stringr::str_c(cache_directory, cache_file_name))
    
    # reset global environment
    rm(list = ls())
    gc()
}

#' Import TVS data as a single dataset object with a row per unique patient.
#' 
#' @param data_directory A full path to a directory that contains all raw/original TVS datasets.
#' @param metadata_directory A full path to a directory that contains all TVS metadata-files.
#' 
#' @importFrom magrittr "%>%"
#' @export

import_data <- function(data_directory, metadata_directory) {

    # set working directory temporarily to the input
    original_wd <- getwd()
    setwd(data_directory)
    
    samples_file <- stringr::str_c(metadata_directory, "samples.csv", sep = "/")
    unique_values_file <- stringr::str_c(metadata_directory, "unique-values.csv", sep = "/")
    
    # -------------- data ----------------
    lipids <- tvsproject::import_lipids(
        lipids_file = "TVS_data_171220_lipidomics.xlsx",
        samples_file = samples_file
    )
    
    mrna_blood_probes <- tvsproject::import_blood_mrna(
        samples_file = samples_file,
        blood_file = "tvs_blood_gwe_loess.txt"
    )
    
    mrna_mono_probes <- tvsproject::import_monocyte_mrna(
        samples_file = samples_file,
        monocyte_file = "tvs_monocyte_gwe_loess.txt"
    )
    
    mrna_artery_probes <- tvsproject::import_artery_mrna(
        samples_file = samples_file,
        artery_file = "tvs_plaque_gwe_loess_copy.csv"
    )
    
    mrna_all_probes <- tvsproject::import_mrna(
        blood_file = "tvs_blood_gwe_loess.txt",
        artery_file = "tvs_plaque_gwe_loess_copy.csv",
        monocyte_file = "tvs_monocyte_gwe_loess.txt",
        samples_file = samples_file
    )
    
    lipopro_samples <- tvsproject::import_lipoproteins(
        lipoprotein_file = "TVS0001.1 - LIPO predictions.xls",
        samples_file = samples_file,
        format = "samples"
    )
    
    lipopro_particles <- tvsproject::import_lipoproteins(
        lipoprotein_file = "TVS0001.1 - LIPO predictions.xls",
        samples_file = samples_file,
        format = "particles"
    )
    
    lmwm <- tvsproject::import_lmwm(
        lmwm_file = "TVS0002.1 - LMWM predictions.xls",
        samples_file = samples_file
    )
    
    histo <- tvsproject::import_histology(
        database_file = "TVS 26.xlsx",
        unique_values_file = unique_values_file,
        samples_file = samples_file
    )
    
    deaths_patients <- tvsproject::import_deaths(
        database_file = "TVS 26.xlsx",
        samples_file = samples_file,
        format = "patients"
    )
    
    deaths_events <- tvsproject::import_deaths(
        database_file = "TVS 26.xlsx",
        samples_file = samples_file,
        format = "events"
    )
    
    diagnoses <- tvsproject::import_diagnoses(
        database_file = "TVS 26.xlsx",
        unique_values_file = unique_values_file
    )
    
    treatments <- tvsproject::import_treatments(
        database_file = "TVS 26.xlsx",
        unique_values_file = unique_values_file
    )
    
    symptoms <- tvsproject::import_symptoms(
        database_file = "TVS 26.xlsx",
        unique_values_file = unique_values_file
    )
    
    clinchem <- tvsproject::import_clinical_chemistry(database_file = "TVS 26.xlsx")
    
    exposures <- tvsproject::import_exposures(
        database_file = "TVS 26.xlsx",
        unique_values_file = unique_values_file
    )
    
    macro <- tvsproject::import_macro(
        database_file = "TVS 26.xlsx",
        unique_values_file = unique_values_file
    )
    
    physiology <- tvsproject::import_physiology("TVS 26.xlsx")
    
    setwd(original_wd)
    
    # ----------- classify variables ----------------
    mrna_all_genes <- tvsproject::classify_mrna_probes(
        mrna_all_probes, 
        variables, 
        summary_function = "max"
    )
    
    mrna_artery_genes <- mrna_all_genes %>%
        filter(!(sample_type %in% c("Monocyte", "Blood"))) %>%
        bind_rows(mrna_artery_probes %>% magrittr::extract( , 1:2) %>% filter(is.na(sample_id)))
    
    mrna_blood_genes <- mrna_all_genes %>%
        filter(sample_type == "Blood") %>%
        bind_rows(mrna_blood_probes %>% magrittr::extract( , 1:2) %>% filter(is.na(sample_id)))
    
    mrna_monocyte_genes <- mrna_all_genes %>%
        filter(sample_type == "Monocyte") %>%
        bind_rows(mrna_mono_probes %>% magrittr::extract( , 1:2) %>% filter(is.na(sample_id)))
    
    lipids_classes <- tvsproject::classify_lipids(
        lipids, 
        variables, 
        summary_function = "sum"
    )
    
    # ------------- combine dataframes ----------------
    tvs <- samples_file %>%    
        tvsproject::import_samples() %>%
        select(patient_id) %>%
        distinct() %>%
        left_join(macro, by = "patient_id") %>%
        left_join(physiology, by = "patient_id") %>%
        left_join(exposures, by = "patient_id") %>%
        left_join(clinchem, by = "patient_id") %>%
        left_join(symptoms, by = "patient_id") %>%
        left_join(treatments, by = "patient_id") %>%
        left_join(diagnoses, by = "patient_id") %>%
        left_join(deaths_patients, by = "patient_id") %>%
        left_join(
            rename(histo, sample_id_artery = sample_id),
            by = "patient_id"
        ) %>%
        left_join(lipids) %>%
        left_join(lipopro_samples, by = "patient_id", suffix = c("_lipids", "")) %>%
        left_join(
            lmwm,
            by = "patient_id",
            suffix = c("_lipopro", "")
        ) %>%
        left_join(
            select(lipids_classes, -sample_id, -sample_type), 
            by = "patient_id"
        ) %>%
        left_join(
            mrna_artery_genes,
            by = "patient_id",
            suffix = c("_lmwm", "")
        ) %>%
        left_join(
            mrna_blood_genes,
            by = "patient_id",
            suffix = c("_artery", "")
        ) %>%
        left_join(
            mrna_monocyte_genes,
            by = "patient_id",
            suffix = c("_blood", "_monocyte")
        ) %>%
        select(contains("_id"), contains("_type"), everything())
    
    return(tvs)
    
}

#' Combine TVS data objects in the global environment into a single dataset object.
#' 
#' The function assumes that the objects are named exatcly like the function "import_objects" assigns them.
#' When the function doesn't find the objects from its own environment, it searches them from the global environment.
#' 
#' @return A dataset of class "tbl_df" (tibble) and "data.frame" (base).
#' 
#' @importFrom magrittr "%>%"
#' @export

combine_data_objects <- function() {
    
    tvs <- samples %>%
        select(patient_id) %>%
        distinct() %>%
        left_join(macro, by = "patient_id") %>%
        left_join(physiology, by = "patient_id") %>%
        left_join(exposures, by = "patient_id") %>%
        left_join(clinchem, by = "patient_id") %>%
        left_join(symptoms, by = "patient_id") %>%
        left_join(treatments, by = "patient_id") %>%
        left_join(diagnoses, by = "patient_id") %>%
        left_join(deaths_patients, by = "patient_id") %>%
        left_join(
            rename(histo, sample_id_artery = sample_id),
            by = "patient_id"
        ) %>%
        left_join(lipids) %>%
        left_join(lipopro_samples, by = "patient_id", suffix = c("_lipids", "")) %>%
        left_join(
            lmwm,
            by = "patient_id",
            suffix = c("_lipopro", "")
        ) %>%
        left_join(
            select(lipids_classes, -sample_id, -sample_type), 
            by = "patient_id"
        ) %>%
        left_join(
            mrna_artery_genes,
            by = "patient_id",
            suffix = c("_lmwm", "")
        ) %>%
        left_join(
            mrna_blood_genes,
            by = "patient_id",
            suffix = c("_artery", "")
        ) %>%
        left_join(
            mrna_monocyte_genes,
            by = "patient_id",
            suffix = c("_blood", "_monocyte")
        ) %>%
        select(contains("_id"), contains("_type"), everything())
    
    return(tvs)
}

#' Read and clean TVS DNA data files for analysis.
#'
#' @importFrom magrittr "%>%"
#' @export

import_dna <- function(artery_file = "tvs_team16_artery.txt",
                       blood_file = "tvs_team16_blood.txt") {
    
    dna_artery <- artery_file %>%
        readr::read_tsv(col_types = stringr::str_dup("c", 68)) %>%
        # SNP-metadata in meta_variables, remove here
        dplyr::select(-Chr, -Position, -Alleles)
    
    dna_blood <- blood_file %>%
        readr::read_tsv(col_types = stringr::str_dup("c", 88)) %>%
        # SNP-metadata in meta_variables, remove here
        dplyr::select(-Chr, -Position, -Alleles)
    
    dna <- dna_artery %>%
        dplyr::full_join(dna_blood) %>%
        # ugly gather-spread-workaround
        dplyr::bind_rows(rlang::set_names(colnames(.), colnames(.)), .) %>%
        as.matrix() %>%
        t() %>%
        dplyr::as_tibble(.name_repair = "unique") %>%
        # ugly but working row_to_names -workaround
        magrittr::set_colnames((( . %>% magrittr::extract(1, ) %>% as.character() )(.))) %>%
        magrittr::extract(-1, ) %>%
        dplyr::rename(sample_id = SNP_ID)
    
    return(dna)
}

#' Read and clean TVS TaqMan DNA data file for analysis.
#'
#' This is an additional smaller SNP set (Furin, ApoE, IL, and ADAM)
#' measured with the AppliedBioSystems' Taqman assay.
#'
#' TODO: Recoding from numbers to bases.
#'
#' @importFrom magrittr "%>%" "%$%"
#' @export

import_dna_taqman <- function(taqman_file = "all TVS TaqMan genotypes August 2011.xls",
                              variable_metadata_file = "variables.txt") {
    
    dna_taqman <- taqman_file %>%
        readxl::read_excel(col_types = "text") %>%
        dplyr::rename(sample_id = SampleID)
    
    meta_variables <- variable_metadata_file %>%
        readr::read_csv(col_types = stringr::str_dup("c", 22))
    
    # create a map from gene_symbols to SNP identifiers
    rename_map_dna_taqman <- colnames(dna_taqman) %>%
        tibble::enframe() %>%
        dplyr::rename(gene_symbol = value) %>%
        dplyr::slice(9:20) %>%
        # sorry, nesting to avoid creating intermediate object...
        dplyr::left_join(
            dplyr::mutate(
                dplyr::select(
                    .data = meta_variables,
                    name_snakecase,
                    gene_symbol
                ),
                gene_symbol = stringr::str_remove_all(gene_symbol, "-")
            ),
            by = "gene_symbol"
        ) %$% # note the different pipe, see ?magrittr
        rlang::set_names(gene_symbol, name_snakecase)
    
    dna_taqman <- dna_taqman %>%
        dplyr::rename(!!!rename_map_dna_taqman)
    
    return(dna_taqman)
    
}

#' Read and clean TVS miRNA data file for analysis.
#'
#' @importFrom magrittr "%>%"
#' @export

import_mirna <- function(mirna_file = "TVS.miRNA.xls") {
    
    mirna <- mirna_file %>%
        readxl::read_excel() %>%
        tidyr::gather("sample_id", "value", -1) %>%
        tidyr::spread(Probe, value) %>%
        dplyr::rename_all(
            . %>%
                stringr::str_to_lower() %>%
                stringr::str_replace_all(c("-" = "_", "\\*" = "_star"))) %>%
        dplyr::select(sample_id, darkcorner, mirnabrightcorner30:scorner3, dplyr::everything())
    
    return(mirna)
}
eteppo/tvs-project documentation built on Aug. 13, 2019, 8:53 a.m.