#' Read and clean TVS blood mRNA data file for analysis in R.
#' @param blood_file A full path to the TVS blood mRNA data file.
#' @param samples_file A full path to the TVS samples metadata file.
#' @details The samples metadata file is used to convert the original sample_id identifiers
#' to the new, within-project, id system. The samples-file also contains more precise sample_type information.
#'
#' @return A dataset object of class tbl_df/data.frame.
#'
#' @importFrom magrittr "%>%"
#' @importFrom magrittr "%$%"
#' @export
import_blood_mrna <- function(blood_file = "tvs_blood_gwe_loess.txt",
samples_file = "samples.csv") {
# new identifiers
samples <- samples_file %>%
read_csv(col_types = cols(.default = col_character())) %>%
select(patient_id, sample_id, sample_type)
mrna <- blood_file %>%
readr::read_tsv(progress = FALSE, col_types = cols(.default = col_character())) %>%
dplyr::select(2:99) %>%
# add sample type to sample ids
rename_at(vars(-PROBE_ID), function(x) stringr::str_c(x, "/Blood"))
mrna <- mrna %>%
dplyr::select(-PROBE_ID) %>%
# transpose
t() %>%
# fix column names
tibble::as_tibble(
rownames = "sample_id",
.name_repair = ~ stringr::str_to_lower(mrna$PROBE_ID)
) %>%
# sample type from the file is used to recode sample_ids
tidyr::separate(
col = sample_id,
into = c("sample_id_file", "sample_type_file"),
sep = "/"
)
# recode sample_ids and sample_types from the samples-metadata
sample_to_patient_map <- samples %>%
select(patient_id, sample_id) %>%
filter(str_detect(sample_id, "T_X|M_(?!G)|B_(?!G)")) %$%
set_names(patient_id, sample_id)
sample_to_type_map <- samples %>%
select(sample_id, sample_type) %>%
filter(str_detect(sample_id, "T_X|M_(?!G)|B_(?!G)")) %$%
set_names(sample_type, sample_id)
mrna_ids_types <- mrna %>%
magrittr::extract( , 1:2) %>% # much faster than dplyr::select (?)
# convert to the new id format in samples-metadata
# to make sample_ids unique across datasets
mutate(sample_id = case_when(
sample_type_file == "Blood" ~ str_c("B_", sample_id_file),
sample_type_file == "Monocyte" ~ str_c("M_", sample_id_file),
sample_type_file == "Artery" ~ str_c("T_", sample_id_file),
TRUE ~ NA_character_
)) %>%
mutate(patient_id = recode(sample_id, !!!sample_to_patient_map)) %>%
mutate(sample_type = recode(sample_id, !!!sample_to_type_map)) %>%
select(patient_id, sample_id, sample_type)
mrna <- mrna %>%
magrittr::extract( , -(1:2)) %>% # much faster than dplyr::select
bind_cols(mrna_ids_types, .)
# add empty rows for missing samples and patients
# need to transpose with t() to make this fast again
mrna_colnames <- colnames(mrna)
missing_patients <- samples %>%
pull(patient_id) %>%
setdiff(mrna_ids_types$patient_id) %>%
enframe(name = NULL, value = "patient_id") %>%
t() %>%
rbind(matrix(nrow = length(mrna_colnames) - 1, ncol = ncol(.)))
mrna <- mrna %>%
t() %>%
cbind(missing_patients) %>%
t() %>%
as_tibble(.name_repair = ~ mrna_colnames) %>%
arrange(patient_id, sample_id)
# convert probe class to numeric
probe_names <- magrittr::extract(colnames(mrna), -(1:3))
mrna_probes <- mrna %>%
magrittr::extract( , -(1:3)) %>%
t() %>%
as_tibble(.name_repair = "unique") %>%
mutate_all(as.numeric) %>%
t() %>%
as_tibble(.name_repair = ~ probe_names)
# recombine probes with ids and sample type
mrna <- mrna %>%
magrittr::extract(1:3) %>%
bind_cols(mrna_probes)
return(mrna)
}
#' Read and clean TVS monocyte mRNA data file for analysis in R.
#' @param monocyte_file A full path to the TVS monocyte mRNA data file.
#' @param samples_file A full path to the TVS samples metadata file.
#' @details The samples metadata file is used to convert the original sample_id identifiers
#' to the new, within-project, id system. The samples-file also contains more precise sample_type information.
#'
#' @return A dataset object of class tbl_df/data.frame.
#'
#' @importFrom magrittr "%>%"
#' @importFrom magrittr "%$%"
#' @export
import_monocyte_mrna <- function(monocyte_file = "tvs_monocyte_gwe_loess.txt",
samples_file = "samples.csv") {
samples <- samples_file %>%
read_csv(col_types = cols(.default = col_character())) %>%
select(patient_id, sample_id, sample_type)
mrna <- monocyte_file %>%
readr::read_tsv(progress = FALSE, col_types = cols(.default = col_character())) %>%
dplyr::select(2:100) %>%
rename_at(vars(-PROBE_ID), function(x) stringr::str_c(x, "/Monocyte"))
# transpose, fix column names
mrna <- mrna %>%
dplyr::select(-PROBE_ID) %>%
t() %>%
tibble::as_tibble(
rownames = "sample_id",
.name_repair = ~ stringr::str_to_lower(mrna$PROBE_ID)
) %>%
# sample_type from the file is used to recode sample_ids to match samples-metadata
tidyr::separate(
col = sample_id,
into = c("sample_id_file", "sample_type_file"),
sep = "/"
)
# recode sample_ids and sample_types from the samples-metadata
sample_to_patient_map <- samples %>%
select(patient_id, sample_id) %>%
filter(str_detect(sample_id, "T_X|M_(?!G)|B_(?!G)")) %$%
set_names(patient_id, sample_id)
sample_to_type_map <- samples %>%
select(sample_id, sample_type) %>%
filter(str_detect(sample_id, "T_X|M_(?!G)|B_(?!G)")) %$%
set_names(sample_type, sample_id)
mrna_ids_types <- mrna %>%
magrittr::extract( , 1:2) %>% # much faster than dplyr::select (?)
# convert to the new id format in samples-metadata
# to make sample_ids unique across datasets
mutate(sample_id = case_when(
sample_type_file == "Blood" ~ str_c("B_", sample_id_file),
sample_type_file == "Monocyte" ~ str_c("M_", sample_id_file),
sample_type_file == "Artery" ~ str_c("T_", sample_id_file),
TRUE ~ NA_character_
)) %>%
mutate(patient_id = recode(sample_id, !!!sample_to_patient_map)) %>%
mutate(sample_type = recode(sample_id, !!!sample_to_type_map)) %>%
select(patient_id, sample_id, sample_type)
mrna <- mrna %>%
magrittr::extract( , -(1:2)) %>% # much faster than dplyr::select
bind_cols(mrna_ids_types, .)
# add empty rows for missing samples and patients
# need to transpose with t() to make this fast again
mrna_colnames <- colnames(mrna)
missing_patients <- samples %>%
pull(patient_id) %>%
setdiff(mrna_ids_types$patient_id) %>%
enframe(name = NULL, value = "patient_id") %>%
t() %>%
rbind(matrix(nrow = length(mrna_colnames) - 1, ncol = ncol(.)))
mrna <- mrna %>%
t() %>%
cbind(missing_patients) %>%
t() %>%
as_tibble(.name_repair = ~ mrna_colnames) %>%
arrange(patient_id, sample_id)
# convert probe class to numeric
probe_names <- magrittr::extract(colnames(mrna), -(1:3))
mrna_probes <- mrna %>%
magrittr::extract( , -(1:3)) %>%
t() %>%
as_tibble(.name_repair = "unique") %>%
mutate_all(as.numeric) %>%
t() %>%
as_tibble(.name_repair = ~ probe_names)
# recombine probes with ids and sample type
mrna <- mrna %>%
magrittr::extract(1:3) %>%
bind_cols(mrna_probes)
return(mrna)
}
#' Read and clean TVS artery mRNA data file for analysis in R.
#' @param artery_file A full path to the TVS artery mRNA data file.
#' @param samples_file A full path to the TVS samples metadata file.
#' @details The samples metadata file is used to convert the original sample_id identifiers
#' to the new, within-project, id system. The samples-file also contains more precise sample_type information.
#'
#' @return A dataset object of class tbl_df/data.frame.
#'
#' @importFrom magrittr "%>%"
#' @importFrom magrittr "%$%"
#' @export
import_artery_mrna <- function(artery_file = "tvs_plaque_gwe_loess_copy.csv",
samples_file = "samples.csv") {
samples <- samples_file %>%
read_csv(col_types = cols(.default = col_character())) %>%
select(patient_id, sample_id, sample_type)
mrna <- artery_file %>%
readr::read_csv(progress = FALSE, col_types = cols(.default = col_character())) %>%
dplyr::select(2:95) %>%
rename_at(vars(-PROBE_ID), function(x) stringr::str_c(x, "/Artery"))
# transpose, fix column names
mrna <- mrna %>%
dplyr::select(-PROBE_ID) %>%
t() %>%
tibble::as_tibble(
rownames = "sample_id",
.name_repair = ~ stringr::str_to_lower(mrna$PROBE_ID)
) %>%
# sample_type from the file is used to recode sample_ids to match samples-metadata
tidyr::separate(
col = sample_id,
into = c("sample_id_file", "sample_type_file"),
sep = "/"
)
# recode sample_ids and sample_types from the samples-metadata
sample_to_patient_map <- samples %>%
select(patient_id, sample_id) %>%
filter(str_detect(sample_id, "T_X|M_(?!G)|B_(?!G)")) %$%
set_names(patient_id, sample_id)
sample_to_type_map <- samples %>%
select(sample_id, sample_type) %>%
filter(str_detect(sample_id, "T_X|M_(?!G)|B_(?!G)")) %$%
set_names(sample_type, sample_id)
mrna_ids_types <- mrna %>%
magrittr::extract( , 1:2) %>% # much faster than dplyr::select (?)
# convert to the new id format in samples-metadata
# to make sample_ids unique across datasets
mutate(sample_id = case_when(
sample_type_file == "Blood" ~ str_c("B_", sample_id_file),
sample_type_file == "Monocyte" ~ str_c("M_", sample_id_file),
sample_type_file == "Artery" ~ str_c("T_", sample_id_file),
TRUE ~ NA_character_
)) %>%
mutate(patient_id = recode(sample_id, !!!sample_to_patient_map)) %>%
mutate(sample_type = recode(sample_id, !!!sample_to_type_map)) %>%
select(patient_id, sample_id, sample_type)
mrna <- mrna %>%
magrittr::extract( , -(1:2)) %>% # much faster than dplyr::select
bind_cols(mrna_ids_types, .)
# add empty rows for missing samples and patients
# need to transpose with t() to make this fast again
mrna_colnames <- colnames(mrna)
missing_patients <- samples %>%
pull(patient_id) %>%
setdiff(mrna_ids_types$patient_id) %>%
enframe(name = NULL, value = "patient_id") %>%
t() %>%
rbind(matrix(nrow = length(mrna_colnames) - 1, ncol = ncol(.)))
mrna <- mrna %>%
t() %>%
cbind(missing_patients) %>%
t() %>%
as_tibble(.name_repair = ~ mrna_colnames) %>%
arrange(patient_id, sample_id)
# convert probe class to numeric
probe_names <- magrittr::extract(colnames(mrna), -(1:3))
mrna_probes <- mrna %>%
magrittr::extract( , -(1:3)) %>%
t() %>%
as_tibble(.name_repair = "unique") %>%
mutate_all(as.numeric) %>%
t() %>%
as_tibble(.name_repair = ~ probe_names)
# recombine probes with ids and sample type
mrna <- mrna %>%
magrittr::extract(1:3) %>%
bind_cols(mrna_probes)
return(mrna)
}
#' Read and clean TVS mRNA data files for analysis in R.
#'
#' @param blood_file A full path to the TVS blood mRNA data file.
#' @param monocyte_file A full path to the TVS monocyte mRNA data file.
#' @param artery_file A full path to the TVS artery mRNA data file.
#' @param samples_file A full path to the TVS samples metadata file.
#'
#' @details The "samples.csv" metadata-file is used to convert the original "sample_id" identifiers
#' to the new, within-project, id-system. The "samples.csv"-file also contains more precise "sample_type" information.
#'
#' @return A dataset object of class "tbl_df" (tibble) and "data.frame" (base).
#'
#' @importFrom magrittr "%>%"
#' @importFrom magrittr "%$%"
#' @export
import_mrna <- function(blood_file = "tvs_blood_gwe_loess.txt",
monocyte_file = "tvs_monocyte_gwe_loess.txt",
artery_file = "tvs_plaque_gwe_loess_copy.csv",
samples_file = "samples.csv") {
samples <- samples_file %>%
read_csv(col_types = cols(.default = col_character())) %>%
select(patient_id, sample_id, sample_type)
# read the three mRNA datasets and
# concatenate sample_type to sample_id (which are column names in the raw data)
# this is an ugly hack to speed up the transposing step
mrna_blood <- blood_file %>%
readr::read_tsv(progress = FALSE, col_types = cols(.default = col_character())) %>%
dplyr::select(2:99) %>%
rename_at(vars(-PROBE_ID), function(x) stringr::str_c(x, "/Blood"))
mrna_artery <- artery_file %>%
readr::read_csv(progress = FALSE, col_types = cols(.default = col_character())) %>%
dplyr::select(2:95) %>%
rename_at(vars(-PROBE_ID), function(x) stringr::str_c(x, "/Artery"))
mrna_mono <- monocyte_file %>%
readr::read_tsv(progress = FALSE, col_types = cols(.default = col_character())) %>%
dplyr::select(2:100) %>%
rename_at(vars(-PROBE_ID), function(x) stringr::str_c(x, "/Monocyte"))
# combine datasets by
# listing unique probes and joining data sequentially by PROBE_ID
mrna <- list(mrna_artery, mrna_blood, mrna_mono) %>%
purrr::map(dplyr::pull, PROBE_ID) %>%
unlist() %>%
unique() %>%
tibble::enframe(name = NULL, value = "PROBE_ID") %>%
dplyr::left_join(mrna_blood, by = "PROBE_ID") %>%
dplyr::left_join(mrna_artery, by = "PROBE_ID") %>%
dplyr::left_join(mrna_mono, by = "PROBE_ID")
# transpose, fix column names
mrna <- mrna %>%
dplyr::select(-PROBE_ID) %>%
t() %>%
tibble::as_tibble(
rownames = "sample_id",
.name_repair = ~ stringr::str_to_lower(mrna$PROBE_ID)
) %>%
# sample_type from the file is used to recode sample_ids to match samples-metadata
tidyr::separate(
col = sample_id,
into = c("sample_id_file", "sample_type_file"),
sep = "/"
)
# recode sample_ids and sample_types from the samples-metadata
sample_to_patient_map <- samples %>%
select(patient_id, sample_id) %>%
filter(str_detect(sample_id, "T_X|M_(?!G)|B_(?!G)")) %$%
set_names(patient_id, sample_id)
sample_to_type_map <- samples %>%
select(sample_id, sample_type) %>%
filter(str_detect(sample_id, "T_X|M_(?!G)|B_(?!G)")) %$%
set_names(sample_type, sample_id)
mrna_ids_types <- mrna %>%
magrittr::extract( , 1:2) %>% # much faster than dplyr::select (?)
# convert to the new id format in samples-metadata
# to make sample_ids unique across datasets
mutate(sample_id = case_when(
sample_type_file == "Blood" ~ str_c("B_", sample_id_file),
sample_type_file == "Monocyte" ~ str_c("M_", sample_id_file),
sample_type_file == "Artery" ~ str_c("T_", sample_id_file),
TRUE ~ NA_character_
)) %>%
mutate(patient_id = recode(sample_id, !!!sample_to_patient_map)) %>%
mutate(sample_type = recode(sample_id, !!!sample_to_type_map)) %>%
select(patient_id, sample_id, sample_type)
mrna <- mrna %>%
magrittr::extract( , -(1:2)) %>% # much faster than dplyr::select
bind_cols(mrna_ids_types, .)
# add empty rows for missing samples and patients
# need to transpose with t() to make this fast again
mrna_colnames <- colnames(mrna)
missing_patients <- samples %>%
pull(patient_id) %>%
setdiff(mrna_ids_types$patient_id) %>%
enframe(name = NULL, value = "patient_id") %>%
t() %>%
rbind(matrix(nrow = length(mrna_colnames) - 1, ncol = ncol(.)))
mrna <- mrna %>%
t() %>%
cbind(missing_patients) %>%
t() %>%
as_tibble(.name_repair = ~ mrna_colnames) %>%
arrange(patient_id, sample_id)
# convert probe class to numeric
probe_names <- magrittr::extract(colnames(mrna), -(1:3))
mrna_probes <- mrna %>%
magrittr::extract( , -(1:3)) %>%
t() %>%
as_tibble(.name_repair = "unique") %>%
mutate_all(as.numeric) %>%
t() %>%
as_tibble(.name_repair = "unique") %>%
magrittr::set_colnames(probe_names)
# recombine probes with ids and sample type
mrna <- mrna %>%
magrittr::extract(1:3) %>%
bind_cols(mrna_probes)
return(mrna)
}
#' Read and clean TVS lipids data file for analysis.
#'
#' @param lipids_file A full path to the TVS lipids data file.
#' @param samples_file A full path to the TVS samples metadata file.
#'
#' @importFrom magrittr "%>%"
#' @export
import_lipids <- function(lipids_file = "TVS_data_171220_lipidomics.xlsx",
samples_file = "samples.csv") {
lipids <- lipids_file %>%
readxl::read_excel() %>%
dplyr::rename(sample_id = ID, sample_type = MATRIX) %>%
# transpose dataset
tidyr::gather(
"variable",
"value",
-sample_id,
-sample_type
) %>%
tidyr::spread(variable, value) %>%
janitor::clean_names() %>%
dplyr::select(
sample_id,
sample_type,
dplyr::everything()
) %>%
# fix one missing sample_type using the code (PL/SE) in sample id
dplyr::mutate(sample_type = dplyr::if_else(
condition = is.na(sample_type) & stringr::str_detect(sample_id, "PL"),
true = "Plasma",
false = sample_type
))
# add all patient_ids (incl. missing) from the samples-metadata
samples <- samples_file %>%
read_csv(col_types = cols(.default = col_character())) %>%
select(patient_id, sample_id)
lipids <- samples %>%
right_join(lipids, by = "sample_id")
lipids <- samples %>%
# get unique patients
select(patient_id) %>%
filter(patient_id != "Multiple patients") %>%
distinct() %>%
# -------------------
anti_join(lipids, by = "patient_id") %>%
bind_rows(lipids) %>%
arrange(sample_type, patient_id)
return(lipids)
}
#' Read and clean TVS lipoprotein data file for analysis.
#'
#' @param lipoprotein_file A full path to the TVS lipoprotein data file.
#' @param samples_file A full path to the TVS samples metadata file.
#' @param format Whether output dataset should be "samples" (wide, one row per sample) or "particles" (long, no serum data, one row per particle type) formatted.
#'
#' @return A clean lipoprotein dataset of class "spec_tbl_df", "tbl_df", tbl", "data.frame".
#'
#' @importFrom magrittr "%>%"
#' @export
import_lipoproteins <- function(lipoprotein_file = "TVS0001.1 - LIPO predictions.xls",
samples_file = "samples.csv",
format = "samples") {
lipoproteins <- lipoprotein_file %>%
readxl::read_excel(skip = 14, col_types = "text") %>%
dplyr::slice(-1) %>%
dplyr::rename(sample_id = ...1) %>%
janitor::clean_names() %>%
# fix renaming mistakes
dplyr::rename(
apo_b_to_apo_a1 = apo_bto_apo_a1,
idl_c_efr = idl_c_e_fr,
ldl_c_efr = ldl_c_e_fr,
vldl_tg_efr = vldl_tg_e_fr
) %>%
dplyr::mutate_at(dplyr::vars(-sample_id), list(as.numeric)) %>%
# sample_type is known to be Serum
dplyr::mutate(sample_type = "Serum") %>%
# unique sample_ids of type SE02_
dplyr::mutate(sample_id = stringr::str_c("SE02_", sample_id)) %>%
# albumin os not lipoprotein
# remove also derived/computed variables (not original data)
select(-alb, -apo_b_to_apo_a1) %>%
dplyr::select(
sample_id,
sample_type,
serum_tg,
serum_c,
apo_a1,
apo_b,
dplyr::everything()
)
# get patient_ids (incl. missing patients to make it explicit) from samples-metadata
samples <- samples_file %>%
read_csv(col_types = cols(.default = col_character())) %>%
select(patient_id, sample_id)
lipoproteins <- samples %>%
right_join(lipoproteins, by = "sample_id")
lipoproteins <- samples %>%
select(patient_id) %>%
filter(patient_id != "Multiple patients") %>%
distinct() %>%
anti_join(lipoproteins, by = "patient_id") %>%
bind_rows(lipoproteins) %>%
arrange(patient_id)
if (format == "particles") {
# lots of data in the column names -> long-formatted version too
lipoproteins <- lipoproteins %>%
gather("variable", "value", -(patient_id:sample_type)) %>%
mutate(measure = case_when(
str_detect(variable, "_tg$") ~ "Triglycerides",
str_detect(variable, "_l$") ~ "Lipids",
str_detect(variable, "_pl$") ~ "Phospholipids",
str_detect(variable, "[a-z]_c$") ~ "Cholesterol",
str_detect(variable, "[0-9]_c$") ~ "Cholesterol Lipido",
str_detect(variable, "_fc$") ~ "Free cholesterol",
str_detect(variable, "_ce$") ~ "Cholesterol esters",
str_detect(variable, "_d$") ~ "Mean diameter",
str_detect(variable, "_p$") ~ "Particles",
str_detect(variable, "_efr$") ~ "Particles Lipido",
str_detect(variable, "apo_a1") ~ "apo_a1_lipido",
str_detect(variable, "apo_b$") ~ "apo_b_lipido"
)) %>%
mutate(particle_density = case_when(
str_detect(variable, "vldl_") ~ "Very low",
str_detect(variable, "idl") ~ "Intermediate",
str_detect(variable, "ldl_") ~ "Low",
str_detect(variable, "hdl_") ~ "High",
str_detect(variable, "^hdl2_") ~ "High (sub 2)",
str_detect(variable, "^hdl3_") ~ "High (sub 3)",
TRUE ~ "Not applicable"
)) %>%
mutate(compartment = case_when(
str_detect(variable, "^serum") ~ "Serum",
particle_density == "Not applicable" ~ "Serum",
TRUE ~ "Particle"
)) %>%
mutate(unit = case_when(
particle_density == "Not applicable" ~ "Molecule",
TRUE ~ "Particle"
)) %>%
mutate(particle_size = case_when(
str_detect(variable, "^xxl_") ~ "Chylo and extreme large",
str_detect(variable, "^xl_") ~ "Very large",
str_detect(variable, "^l_") ~ "Large",
str_detect(variable, "^m_") ~ "Medium",
str_detect(variable, "^s_") ~ "Small",
str_detect(variable, "^xs_") ~ "Very small",
compartment == "Particle" ~ "All",
TRUE ~ "Not applicable"
)) %>%
select(-variable, -compartment) %>%
# for spreading properly
mutate(tmp = str_c(particle_size, particle_density)) %>%
group_by(measure, tmp) %>%
mutate(spreading_index = row_number()) %>%
# ---------------------
spread(measure, value) %>%
ungroup() %>%
janitor::clean_names() %>%
select(-tmp, -spreading_index, -apo_a1_lipido, -apo_b_lipido) %>%
select(patient_id, sample_id, sample_type, unit, everything()) %>%
# only particle data; molecular serum concentrations
# can be found in the wide dataset
filter(particle_density != "Not applicable") %>%
arrange(patient_id, unit, particle_density)
return(lipoproteins)
} else if (format == "samples") {
return(lipoproteins)
} else {
stop('argument *format* must be either "particles" or "samples"')
}
}
#' Read and clean the TVS low molecular weight metabolites (LMWM) data file for analysis.
#'
#' @param lmwm_file A full path to the TVS low molecular weight metabolites (LMWM) data file.
#' @param samples_file A full path to the TVS samples metadata file.
#'
#' @importFrom magrittr "%>%"
#' @export
import_lmwm <- function(lmwm_file = "TVS0002.1 - LMWM predictions.xls",
samples_file = "samples.csv") {
lmwm <- lmwm_file %>%
readxl::read_excel(skip = 17, col_types = "text") %>%
slice(-1) %>%
dplyr::rename(sample_id = ...1) %>%
janitor::clean_names() %>%
dplyr::mutate_at(dplyr::vars(-sample_id), list(as.numeric)) %>%
# sample_type is known to be Serum
dplyr::mutate(sample_type = "Serum") %>%
# unique sample_ids of type SE02_
dplyr::mutate(sample_id = stringr::str_c("SE02_", sample_id)) %>%
dplyr::select(
sample_id,
sample_type,
dplyr::everything()
)
# get patient_ids (incl. missing patients to make it explicit) from samples-metadata
samples <- samples_file %>%
read_csv(col_types = cols(.default = col_character())) %>%
select(patient_id, sample_id)
lmwm <- samples %>%
right_join(lmwm, by = "sample_id")
lmwm <- samples %>%
select(patient_id) %>%
filter(patient_id != "Multiple patients") %>%
distinct() %>%
anti_join(lmwm, by = "patient_id") %>%
bind_rows(lmwm) %>%
arrange(patient_id)
return(lmwm)
}
#' Read and clean TVS tissue/histology data from TVS database file for analysis.
#'
#' @param database_file A full path to the TVS database file.
#' @param unique_values_file A full path to the TVS unique values metadata file.
#' @param samples_file A full path to hte TVS samples metadata file.
#'
#' @importFrom magrittr "%>%"
#' @importFrom magrittr "%$%"
#' @export
import_histology <- function(database_file = "TVS 26.xlsx",
unique_values_file = "unique-values.csv",
samples_file = "samples.csv") {
# create mappings from codes (e.g., 1) to values (e.g., Initial lesion (Type I))
unique_values <- readr::read_csv(
unique_values_file,
col_types = cols(.default = col_character())
) %>%
filter(name_snakecase %in% c("aha", "unstable", "sample_type"))
sample_type_mapping <- unique_values %>%
filter(name_snakecase == "sample_type") %$%
set_names(value, code)
aha_mapping <- unique_values %>%
filter(name_snakecase == "aha") %$%
set_names(value, code)
stability_mapping <- unique_values %>%
filter(name_snakecase == "unstable") %$%
set_names(value, code)
histo <- database_file %>%
# read -----------
readxl::read_excel() %>%
janitor::clean_names() %>%
dplyr::rename(sample_type = tissuetype) %>%
dplyr::select(
sample_id,
sample_type,
aha,
unstable
) %>%
# clean ----------
dplyr::mutate_all(as.character) %>%
# sample 1045 is actually just red blood cells according to
# the "forIH"/"for_ih" variable in the database/samples-metadata
dplyr::mutate(sample_type = dplyr::case_when(
# use code 999 for Red blood cells, next recode codes to values
sample_id == "1045" ~ "999",
TRUE ~ sample_type
)) %>%
dplyr::mutate(
# recode
sample_type = recode(sample_type, !!!sample_type_mapping),
aha = recode(aha, !!!aha_mapping),
unstable = recode(unstable, !!!stability_mapping),
# convert to the new sample_id system
sample_id = stringr::str_c("T_", sample_id)
) %>%
# "Blood" samples are not part of this dataset, "Red blood cells" are an error
filter(sample_type != "Blood")
# get all patient_id (incl. missing) from the samples-metadata
samples <- samples_file %>%
read_csv(col_types = cols(.default = col_character())) %>%
select(patient_id, sample_id)
histo <- samples %>%
right_join(histo, by = "sample_id")
histo <- samples %>%
filter(patient_id != "Multiple patients") %>%
distinct(patient_id) %>%
anti_join(histo, by = "patient_id") %>%
bind_rows(histo) %>%
arrange(patient_id)
return(histo)
}
#' Read and clean TVS physiological data from the TVS database file for analysis.
#'
#' The included variables are: diastolic and systolic blood pressure, right and left ankle-brachial index, and ejection fraction.
#'
#' @param database_file A full path to the TVS database file.
#'
#' @importFrom magrittr "%>%"
#' @export
import_physiology <- function(database_file = "TVS 26.xlsx") {
database_file %>%
readxl::read_excel() %>%
janitor::clean_names() %>%
dplyr::select(
sample_id,
systolic,
diastolic,
abi_right,
abi_left,
ef
) %>%
# create patient_id variable following the new id system
mutate(patient_id = dplyr::case_when(
str_detect(sample_id, "V") ~ str_remove(sample_id, "V"),
TRUE ~ sample_id
)) %>%
mutate(patient_id = dplyr::case_when(
patient_id == "2033" | patient_id == "3039" ~ "2033_3039",
TRUE ~ patient_id
)) %>%
distinct(patient_id, .keep_all = TRUE) %>%
select(-sample_id) %>%
# add missing patient_ids too
bind_rows(tibble::tibble(patient_id = c("A0332", "B0550"))) %>%
select(patient_id, everything()) %>%
arrange(patient_id) -> physiology
return(physiology)
}
#' Read and clean TVS macroanatomical data from the TVS database file for analysis.
#'
#' The included variable are: age, sex, weight, height, stenosis in the sampled artery,
#' and coronary angiography results.
#'
#' @param database_file A full path to the TVS database file.
#' @param unique_values_file A full path to the TVS unique values metadata file.
#'
#' @importFrom magrittr "%>%"
#' @export
import_macro <- function(database_file = "TVS 26.xlsx",
unique_values_file = "unique-values.csv") {
database_file %>%
readxl::read_excel() %>%
janitor::clean_names() %>%
dplyr::select(
sample_id,
age,
sex,
weight,
height,
stenosis_sampletissue,
ang_heart
) %>%
dplyr::mutate(
ang_heart = as.character(ang_heart),
sex = as.character(sex)
) %>%
# create patient_id variable following the new id system
mutate(patient_id = dplyr::case_when(
str_detect(sample_id, "V") ~ str_remove(sample_id, "V"),
TRUE ~ sample_id
)) %>%
mutate(patient_id = dplyr::case_when(
patient_id == "2033" | patient_id == "3039" ~ "2033_3039",
TRUE ~ patient_id
)) %>%
distinct(patient_id, .keep_all = TRUE) %>%
select(-sample_id) %>%
# add missing patient_ids too
bind_rows(tibble::tibble(patient_id = c("A0332", "B0550"))) -> macro
# recode codes (f.ex. 2) to values (f.ex. female)
recodeable_variables <- c("sex", "ang_heart")
readr::read_csv(unique_values_file, col_types = cols(.default = col_character())) %>%
filter(name_snakecase %in% recodeable_variables) -> unique_values
unique_values %>%
filter(name_snakecase == "sex") %$%
set_names(value, code) -> sex_mapping
unique_values %>%
filter(name_snakecase == "ang_heart") %$%
set_names(value, code) -> ang_heart_mapping
macro %>%
mutate(
sex = recode(sex, !!!sex_mapping),
ang_heart = recode(ang_heart, !!!ang_heart_mapping)
) %>%
# represent true missing values with NA
# (and non-applicability with "Not applicable")
mutate(ang_heart = if_else(
condition = ang_heart == "Not measured",
true = NA_character_,
false = ang_heart
)) %>%
# note: none of ascending aortas has stenosis information
# and only few abdominal aortas has --> systematically missing.
mutate(stenosis_sampletissue = case_when(
# LITA samples with identifiers starting with "4" have 0 % stenosis
str_detect(patient_id, "^4") ~ "0",
# other than arterial samples are not applicable; id starts with A or B
str_detect(patient_id, "^A|^B") ~ "Not applicable",
is.numeric(stenosis_sampletissue) ~ as.character(stenosis_sampletissue),
TRUE ~ NA_character_
)) %>%
select(patient_id, everything()) %>%
arrange(patient_id) -> macro
return(macro)
}
#' Read and clean TVS exposure data from the TVS database file for analysis.
#'
#' @param database_file A full path to the TVS database file.
#' @param unique_values_file A full path to the TVS unique values metadata file.
#'
#' @importFrom magrittr "%>%"
#' @export
import_exposures <- function(database_file = "TVS 26.xlsx",
unique_values_file = "unique-values.csv") {
exposures <- database_file %>%
readxl::read_excel() %>%
janitor::clean_names() %>%
dplyr::select(sample_id, q_smoking:q_alcohol) %>%
# clean values
dplyr::mutate(q_when_quit = stringr::str_replace(q_when_quit, ",", ".")) %>%
dplyr::mutate(q_when_quit = stringr::str_replace(q_when_quit, "^\\.$", NA_character_)) %>%
dplyr::mutate(q_when_quit = as.numeric(q_when_quit)) %>%
dplyr::mutate_at(dplyr::vars(-q_smoking_dur, -q_when_quit), as.character) %>%
# create patient_id variable following the new id system
mutate(patient_id = case_when(
str_detect(sample_id, "V") ~ str_remove(sample_id, "V"),
TRUE ~ sample_id
)) %>%
mutate(patient_id = case_when(
patient_id == "2033" | patient_id == "3039" ~ "2033_3039",
TRUE ~ patient_id
)) %>%
distinct(patient_id, .keep_all = TRUE) %>%
select(-sample_id) %>%
# add missing patient_ids too
bind_rows(tibble::tibble(patient_id = c("A0332", "B0550")))
# recode codes to values
recodeable_variables <- exposures %>%
select(-patient_id) %>%
colnames()
unique_values <- readr::read_csv(
unique_values_file,
col_types = cols(.default = col_character())
) %>%
filter(name_snakecase %in% recodeable_variables)
q_smoking_mapping <- unique_values %>%
filter(name_snakecase == "q_smoking") %$%
set_names(value, code)
q_smoking_quit_mapping <- unique_values %>%
filter(name_snakecase == "q_smoking_quit") %$%
set_names(value, code)
q_smoking_pipe_mapping <- unique_values %>%
filter(name_snakecase == "q_smoking_pipe") %$%
set_names(value, code)
q_smoking_cigarettes_mapping <- unique_values %>%
filter(name_snakecase == "q_smoking_cigarettes") %$%
set_names(value, code)
q_alcohol_mapping <- unique_values %>%
filter(name_snakecase == "q_alcohol") %$%
set_names(value, code)
# fix missing values
exposures <- exposures %>%
mutate(
q_smoking = recode(q_smoking, !!!q_smoking_mapping),
q_smoking_quit = recode(q_smoking_quit, !!!q_smoking_quit_mapping),
q_smoking_pipe = recode(q_smoking_pipe, !!!q_smoking_pipe_mapping),
q_smoking_cigarettes = recode(q_smoking_cigarettes, !!!q_smoking_cigarettes_mapping),
q_alcohol = recode(q_alcohol, !!!q_alcohol_mapping)
) %>%
# change representation of missing values
mutate(q_smoking = case_when(
q_smoking == "Smoked/Smoking" & q_smoking_quit == "Quit" ~ "Smoked",
q_smoking == "Smoked/Smoking" & is.na(q_smoking_quit) ~ "Smoking",
q_smoking == "Never" ~ "Never",
TRUE ~ NA_character_
)) %>%
mutate(q_smoking_quit = case_when(
q_smoking == "Smoking" ~ "Current",
q_smoking == "Never" ~ "Not applicable",
TRUE ~ q_smoking_quit
)) %>%
mutate(q_when_quit = case_when(
q_smoking_quit == "Current" | q_smoking_quit == "Not applicable" ~ "Not applicable",
q_smoking_quit == "Quit" ~ as.character(q_when_quit),
TRUE ~ NA_character_
)) %>%
mutate(q_smoking_cigarettes = case_when(
q_smoking == "Never" ~ "0 / day",
TRUE ~ q_smoking_cigarettes
)) %>%
mutate(q_smoking_dur = case_when(
q_smoking == "Never" ~ 0,
TRUE ~ q_smoking_dur
)) %>%
mutate(q_smoking_pipe = case_when(
q_smoking == "Never" ~ "No",
TRUE ~ q_smoking_pipe
)) %>%
select(patient_id, everything()) %>%
arrange(patient_id)
return(exposures)
}
#' Read and clean TVS clinical chemistry variables from database file for analysis.
#'
#' @param database_file A full path to the TVS database file.
#'
#' @importFrom magrittr "%>%"
#' @export
import_clinical_chemistry <- function(database_file = "TVS 26.xlsx") {
clin_labs <- database_file %>%
readxl::read_excel() %>%
# clean names
janitor::clean_names() %>%
dplyr::rename(
l_ldlmax = l_ld_lmax,
l_hdlmax = l_hd_lmax,
b_crpmin = b_cr_pmin
) %>%
dplyr::select(sample_id, l_totalcholmax:b_kreamin) %>%
# clean values
dplyr::mutate(l_totalcholmax = as.numeric(
stringr::str_replace(l_totalcholmax, ",", ".")
)) %>%
# create patient_id variable following the new id system
mutate(patient_id = case_when(
str_detect(sample_id, "V") ~ str_remove(sample_id, "V"),
TRUE ~ sample_id
)) %>%
mutate(patient_id = case_when(
patient_id == "2033" | patient_id == "3039" ~ "2033_3039",
TRUE ~ patient_id
)) %>%
distinct(patient_id, .keep_all = TRUE) %>%
select(-sample_id) %>%
# add missing patient_ids too
bind_rows(tibble::tibble(patient_id = c("A0332", "B0550"))) %>%
select(patient_id, everything()) %>%
arrange(patient_id)
return(clin_labs)
}
#' Read and clean TVS diagnosis data from database file for analysis.
#'
#' @importFrom magrittr "%>%"
#' @export
import_diagnoses <- function(database_file = "TVS 26.xlsx",
unique_values_file = "unique-values.csv") {
diagnoses <- database_file %>%
readxl::read_excel() %>%
janitor::clean_names() %>%
dplyr::select(sample_id, cad:pad, d_dm:d_dementia) %>%
dplyr::mutate_all(as.character)
# ids
diagnoses <- diagnoses %>%
mutate(patient_id = case_when(
str_detect(sample_id, "V") ~ str_remove(sample_id, "V"),
TRUE ~ sample_id
)) %>%
mutate(patient_id = case_when(
patient_id == "2033" | patient_id == "3039" ~ "2033_3039",
TRUE ~ patient_id
)) %>%
distinct(patient_id, .keep_all = TRUE) %>%
select(-sample_id) %>%
bind_rows(tibble::tibble(patient_id = c("A0332", "B0550"))) %>%
arrange(patient_id)
# recode values
recodeable_variables <- diagnoses %>%
select(-patient_id) %>%
colnames()
unique_values <- readr::read_csv(
unique_values_file,
col_types = cols(.default = col_character())
) %>%
filter(name_snakecase %in% recodeable_variables) %>%
select(name_snakecase, code, value) %>%
group_by(name_snakecase) %>%
nest() %>%
mutate(mapping = map(data, function(x) { set_names(x$value, x$code) }))
recode_with <- function(var, mapping) {
recode(var, !!!mapping)
}
diagnoses_recoded <- diagnoses %>%
select(unique_values$name_snakecase) %>%
map2_dfr(unique_values$mapping, recode_with)
diagnoses_not_recoded <- diagnoses[ ,!(colnames(diagnoses) %in% unique(unique_values$name_snakecase))]
diagnoses <- diagnoses_recoded %>%
bind_cols(diagnoses_not_recoded) %>%
select(patient_id, everything()) %>%
# mutate(mi_nbr = as.numeric(mi_nbr)) %>%
arrange(patient_id)
return(diagnoses)
}
#' Read and clean TVS symptoms data from TVS database file for analysis.
#'
#' @param database_file A full path to the TVS database file.
#' @param unique_values_file A full path to the TVS unique values metadata file.
#'
#' @importFrom magrittr "%>%"
#' @export
import_symptoms <- function(database_file = "TVS 26.xlsx",
unique_values_file = "unique-values.csv") {
symptoms <- database_file %>%
readxl::read_excel() %>%
janitor::clean_names() %>%
dplyr::select(sample_id, nyha, ccs) %>%
dplyr::mutate_all(as.character)
# create patient_id variable following the new id system
symptoms <- symptoms %>%
mutate(patient_id = case_when(
str_detect(sample_id, "V") ~ str_remove(sample_id, "V"),
TRUE ~ sample_id
)) %>%
mutate(patient_id = case_when(
patient_id == "2033" | patient_id == "3039" ~ "2033_3039",
TRUE ~ patient_id
)) %>%
distinct(patient_id, .keep_all = TRUE) %>%
select(-sample_id)
# recode codes to values
recodeable_variables <- symptoms %>%
select(-patient_id) %>%
colnames()
unique_values <- readr::read_csv(
unique_values_file,
col_types = cols(.default = col_character())
) %>%
filter(name_snakecase %in% recodeable_variables)
nyha_mapping <- unique_values %>%
filter(name_snakecase == "nyha") %$%
set_names(value, code)
ccs_mapping <- unique_values %>%
filter(name_snakecase == "ccs") %$%
set_names(value, code)
symptoms <- symptoms %>%
mutate(
nyha = recode(nyha, !!!nyha_mapping),
ccs = recode(ccs, !!!ccs_mapping)
) %>%
select(patient_id, everything())
# add empty rows for two missing patient_ids
symptoms <- symptoms %>%
bind_rows(tibble::tibble(patient_id = c("A0332", "B0550"))) %>%
arrange(patient_id)
return(symptoms)
}
#' Read and clean TVS death/survival data from the TVS database file for analysis.
#'
#' @param database_file A full path to the TVS database file.
#' @param samples_file A full path to hte TVS samples metadata file.
#' @param format The format of the output dataset, either "patients" (wide, one row per patient) or "events" (one row per event).
#'
#' @importFrom magrittr "%>%"
#' @importFrom magrittr "%$%"
#' @export
import_deaths <- function(database_file = "TVS 26.xlsx",
samples_file = "samples.csv",
format = "patients") {
id_mapping <- samples_file %>%
read_csv(col_types = cols(.default = col_character())) %>%
select(sample_id, patient_id)
deaths <- database_file %>%
readxl::read_excel() %>%
janitor::clean_names() %>%
# sample_type is needed to recode ids
dplyr::rename(sample_type = tissuetype) %>%
dplyr::select(sample_id, sample_type, death_date:f_follow_up_time11) %>%
# clean dates
dplyr::mutate(death_date = as.numeric(death_date)) %>%
dplyr::mutate(death_date = janitor::excel_numeric_to_date(
death_date,
date_system = "modern"
)) %>%
dplyr::mutate_at(dplyr::vars(-dplyr::contains("time")), as.character) %>%
dplyr::mutate(death_date = dplyr::case_when(
# actual missingness
is.na(death_date) & is.na(f_follow_up_time11) ~ NA_character_,
# if there is no `death_date` but
# follow up time exists, patient is alive.
is.na(death_date) & !is.na(f_follow_up_time11) ~ "Not applicable",
TRUE ~ death_date
)) %>%
dplyr::mutate(
# derive cause of death variable from indicators
cause_of_death = dplyr::case_when(
# SCD
(f_death_scd09 == "1" | f_death_scd11 == "1") ~ "Sudden cardiac death",
# other CV
(f_death_cv09 == "1" | f_death_cv11 == "1") &
!(f_death_scd09 == "1" | f_death_scd11 == "1") ~ "Other cardiovascular",
# non-CV
(f_death_tot09 == "1" | f_death_tot11 == "1") &
!(f_death_cv09 == "1" | f_death_cv11 == "1") ~ "Non-cardiovascular",
death_date == "Not applicable" ~ "Not applicable",
# unknown
TRUE ~ NA_character_
),
# add variable for the latest date of measurement
# assumption: use 2011-06-15 as the reported "June 2011"
death_last_measurement = dplyr::case_when(
!is.na(f_death_tot11) ~ lubridate::ymd("2011-06-15"),
is.na(f_death_tot11) ~ lubridate::as_date(NA)
),
# derive variable for start of follow up
# approximation: 30.42 days in a month by average
follow_up_start = dplyr::case_when(
!is.na(death_last_measurement) ~
death_last_measurement -
lubridate::days(as.integer(round(f_follow_up_time11 * 30.42, 0))),
is.na(death_last_measurement) ~ lubridate::as_date(NA)
)
) %>%
# drop redundant variables
dplyr::select(-(f_death_tot09:f_follow_up_time11)) %>%
dplyr::mutate_all(as.character) %>%
# derive logical death_status
dplyr::mutate(death_status_last = dplyr::case_when(
death_date == "Not applicable" ~ "Alive",
is.na(death_date) ~ NA_character_,
TRUE ~ "Dead"
)) %>%
# derive numeric follow up duration
dplyr::mutate(follow_up_days = dplyr::case_when(
!is.na(death_date) & death_date != "Not applicable" ~
lubridate::ymd(death_date) - lubridate::ymd(follow_up_start),
TRUE ~ lubridate::ymd(death_last_measurement) - lubridate::ymd(follow_up_start)
)) %>%
dplyr::mutate(follow_up_days = as.numeric(follow_up_days))
deaths <- deaths %>%
# use new id system; now only patient_id needed
mutate(sample_id = case_when(
!(sample_type %in% c("8", "9")) ~ stringr::str_c("T_", sample_id),
TRUE ~ sample_id
)) %>%
left_join(id_mapping, by = "sample_id") %>%
# the rest of sample_ids are patient_ids
mutate(patient_id = if_else(is.na(patient_id), sample_id, patient_id)) %>%
select(
patient_id,
follow_up_start,
death_last_measurement,
death_date,
death_status_last,
follow_up_days,
cause_of_death
) %>%
distinct(patient_id, .keep_all = TRUE)
deaths <- samples_file %>%
read_csv(col_types = cols(.default = col_character())) %>%
distinct(patient_id) %>%
filter(patient_id != "Multiple patients") %>%
anti_join(deaths, by = "patient_id") %>%
bind_rows(deaths) %>%
arrange(patient_id)
if (format == "patients") {
return(deaths)
} else if (format == "events") {
deaths %>%
dplyr::select(-death_status_last, -follow_up_days) %>%
tidyr::gather(event, date, -patient_id, -cause_of_death) %>%
dplyr::select(patient_id, event, date, cause_of_death) %>%
dplyr::arrange(patient_id)
} else {
stop('parameter *format* needs to be either "patients" or "events".')
}
}
#' Read and clean TVS treatment data from database file for analysis.
#'
#' @param database_file A full path to the TVS database file.
#' @param unique_values_file A full to the TVS unique values metadata file.
#'
#' @importFrom magrittr "%>%"
#' @importFrom magrittr "%$%"
#' @export
import_treatments <- function(database_file = "TVS 26.xlsx",
unique_values_file = "unique_values.csv") {
treatments <- database_file %>%
readxl::read_excel() %>%
janitor::clean_names() %>%
dplyr::rename(m_bp_med = m_b_pmed) %>%
dplyr::select(sample_id, m_hormone_rt:cabg) %>%
dplyr::mutate_all(as.character)
# create patient_id variable following the new id system
treatments <- treatments %>%
mutate(patient_id = case_when(
str_detect(sample_id, "V") ~ str_remove(sample_id, "V"),
TRUE ~ sample_id
)) %>%
mutate(patient_id = case_when(
patient_id == "2033" | patient_id == "3039" ~ "2033_3039",
TRUE ~ patient_id
)) %>%
distinct(patient_id, .keep_all = TRUE) %>%
select(-sample_id)
# recode codes to values
recodeable_variables <- treatments %>%
select(-patient_id) %>%
colnames()
unique_values <- readr::read_csv(
unique_values_file,
col_types = cols(.default = col_character())
) %>%
filter(name_snakecase %in% recodeable_variables)
m_hormone_rt_mapping <- unique_values %>%
filter(name_snakecase == "m_hormone_rt") %$%
set_names(value, code)
m_diabmed_mapping <- unique_values %>%
filter(name_snakecase == "m_diabmed") %$%
set_names(value, code)
m_statins_mapping <- unique_values %>%
filter(name_snakecase == "m_statins") %$%
set_names(value, code)
m_statin_agent_mapping <- unique_values %>%
filter(name_snakecase == "m_statin_agent") %$%
set_names(value, code)
m_bp_med_mapping <- unique_values %>%
filter(name_snakecase == "m_bp_med") %$%
set_names(value, code)
pcta_mapping <- unique_values %>%
filter(name_snakecase == "pcta") %$%
set_names(value, code)
cabg_mapping <- unique_values %>%
filter(name_snakecase == "cabg") %$%
set_names(value, code)
treatments <- treatments %>%
mutate(
m_hormone_rt = recode(m_hormone_rt, !!!m_hormone_rt_mapping),
m_diabmed = recode(m_diabmed, !!!m_diabmed_mapping),
m_statins = recode(m_statins, !!!m_statins_mapping),
m_statin_agent = recode(m_statin_agent, !!!m_statin_agent_mapping),
m_bp_med = recode(m_bp_med, !!!m_bp_med_mapping),
pcta = recode(pcta, !!!pcta_mapping),
cabg = recode(cabg, !!!cabg_mapping)
) %>%
select(patient_id, everything())
# add empty rows for two missing patient_ids
treatments <- treatments %>%
bind_rows(data_frame(patient_id = c("A0332", "B0550"))) %>%
arrange(patient_id)
# fix other missing values
treatments <- treatments %>%
mutate(m_statin_agent = case_when(
m_statins == "No" ~ "No",
m_statins == "Yes" ~ m_statin_agent,
TRUE ~ NA_character_
))
# TODO: assume missing value in m_diabmed means actually missing?
return(treatments)
}
#' Import TVS patients metadata file.
#'
#' A tiny wrapper for reading the patients metadata file,
#' as part of a more consistent set of import functions.
#'
#' @param patient_file A full to the TVS patients metadata file.
#' @importFrom magrittr "%>%"
#' @export
import_patients <- function(patients_file = "patients.csv") {
readr::read_csv(
patients_file,
col_types = cols(.default = col_character())
)
}
#' Import TVS variables metadata file.
#'
#' A tiny wrapper for reading the variables metadata file,
#' as part of a more consistent set of import functions.
#'
#' @param patient_file A full to the TVS variables metadata file.
#' @importFrom magrittr "%>%"
#' @export
import_variables <- function(variables_file = "variables.csv") {
readr::read_csv(
variables_file,
col_types = cols(.default = col_character())
)
}
#' Import TVS samples metadata file.
#'
#' A tiny wrapper for reading the samples metadata file,
#' as part of a more consistent set of import functions.
#'
#' @param samples_file A full to the TVS samples metadata file.
#' @importFrom magrittr "%>%"
#' @export
import_samples <- function(samples_file = "samples.csv") {
readr::read_csv(
samples_file,
col_types = cols(.default = col_character())
)
}
#' Import TVS unique values metadata file.
#'
#' A tiny wrapper for reading the unique values metadata file,
#' as part of a more consistent set of import functions.
#'
#' @param unique_values_file A full to the TVS unique values metadata file.
#' @importFrom magrittr "%>%"
#' @export
import_unique_values <- function(unique_values_file = "unique-values.csv") {
readr::read_csv(
unique_values_file,
col_types = cols(.default = col_character())
)
}
#' Read and clean TVS datasets from data and metadata directory.
#'
#' This function is a simple wrapper for running multiple TVS import-functions.
#' Filenames are currently hardcoded and must be the original ones.
#' Currently the following TVS datasets are NOT included: DNA, TaqMan DNA, bacteria.
#'
#' @param data_directory A full path to a directory that contains all raw/original TVS datasets.
#' @param metadata_directory A full path to a directory that contains all TVS metadata-files.
#'
#' @return Returns nothing. Instead assigns datasets to objects in the global environment.
#'
#' @export
import_objects <- function(data_directory,
metadata_directory) {
# set working directory temporarily to the input
original_wd <- getwd()
setwd(data_directory)
samples_file <- stringr::str_c(metadata_directory, "samples.csv", sep = "/")
unique_values_file <- stringr::str_c(metadata_directory, "unique-values.csv", sep = "/")
# NOTE
# the double arrow <<- primitive assigns objects to global environment,
# not just to the scope of this function
# -------------- data ----------------
lipids <<- tvsproject::import_lipids(
lipids_file = "TVS_data_171220_lipidomics.xlsx",
samples_file = samples_file
)
mrna_blood_probes <<- tvsproject::import_blood_mrna(
samples_file = samples_file,
blood_file = "tvs_blood_gwe_loess.txt"
)
mrna_mono_probes <<- tvsproject::import_monocyte_mrna(
samples_file = samples_file,
monocyte_file = "tvs_monocyte_gwe_loess.txt"
)
mrna_artery_probes <<- tvsproject::import_artery_mrna(
samples_file = samples_file,
artery_file = "tvs_plaque_gwe_loess_copy.csv"
)
mrna_all_probes <<- tvsproject::import_mrna(
blood_file = "tvs_blood_gwe_loess.txt",
artery_file = "tvs_plaque_gwe_loess_copy.csv",
monocyte_file = "tvs_monocyte_gwe_loess.txt",
samples_file = samples_file
)
lipopro_samples <<- tvsproject::import_lipoproteins(
lipoprotein_file = "TVS0001.1 - LIPO predictions.xls",
samples_file = samples_file,
format = "samples"
)
lipopro_particles <<- tvsproject::import_lipoproteins(
lipoprotein_file = "TVS0001.1 - LIPO predictions.xls",
samples_file = samples_file,
format = "particles"
)
lmwm <<- tvsproject::import_lmwm(
lmwm_file = "TVS0002.1 - LMWM predictions.xls",
samples_file = samples_file
)
histo <<- tvsproject::import_histology(
database_file = "TVS 26.xlsx",
unique_values_file = unique_values_file,
samples_file = samples_file
)
deaths_patients <<- tvsproject::import_deaths(
database_file = "TVS 26.xlsx",
samples_file = samples_file,
format = "patients"
)
deaths_events <<- tvsproject::import_deaths(
database_file = "TVS 26.xlsx",
samples_file = samples_file,
format = "events"
)
diagnoses <<- tvsproject::import_diagnoses(
database_file = "TVS 26.xlsx",
unique_values_file = unique_values_file
)
treatments <<- tvsproject::import_treatments(
database_file = "TVS 26.xlsx",
unique_values_file = unique_values_file
)
symptoms <<- tvsproject::import_symptoms(
database_file = "TVS 26.xlsx",
unique_values_file = unique_values_file
)
clinchem <<- tvsproject::import_clinical_chemistry(database_file = "TVS 26.xlsx")
exposures <<- tvsproject::import_exposures(
database_file = "TVS 26.xlsx",
unique_values_file = unique_values_file
)
macro <<- tvsproject::import_macro(
database_file = "TVS 26.xlsx",
unique_values_file = unique_values_file
)
physiology <<- tvsproject::import_physiology("TVS 26.xlsx")
# ---------- metadata --------------
setwd(metadata_directory)
samples <<- tvsproject::import_samples("samples.csv")
unique_values <<- tvsproject::import_unique_values("unique-values.csv")
patients <<- tvsproject::import_patients("patients.csv")
variables <<- tvsproject::import_variables("variables.csv")
setwd(original_wd)
# ----------- classify variables ----------------
mrna_all_genes <<- tvsproject::classify_mrna_probes(
mrna_all_probes,
variables,
summary_function = "max"
)
mrna_artery_genes <<- mrna_all_genes %>%
filter(!(sample_type %in% c("Monocyte", "Blood"))) %>%
bind_rows(mrna_artery_probes %>% magrittr::extract( , 1:2) %>% filter(is.na(sample_id)))
mrna_blood_genes <<- mrna_all_genes %>%
filter(sample_type == "Blood") %>%
bind_rows(mrna_blood_probes %>% magrittr::extract( , 1:2) %>% filter(is.na(sample_id)))
mrna_monocyte_genes <<- mrna_all_genes %>%
filter(sample_type == "Monocyte") %>%
bind_rows(mrna_mono_probes %>% magrittr::extract( , 1:2) %>% filter(is.na(sample_id)))
lipids_classes <<- tvsproject::classify_lipids(
lipids,
variables,
summary_function = "sum"
)
}
#' Import TVS objects and save them in a cache file.
#'
#' @param data_directory A full path to a directory that contains all raw/original TVS datasets.
#' @param metadata_directory A full path to a directory that contains all TVS metadata-files.
#' @param cache_directory A full path to the output directory.
#'
#' @return Returns nothing, saves an RData-file to the given cache directory instead.
#'
#' @export
generate_cache <- function(data_directory,
metadata_directory,
cache_directory) {
# individual objects
tvsproject::import_objects(data_directory, metadata_directory)
cache_file_name <- stringr::str_c("tvs-objects-", lubridate::today(), ".RData")
save.image(file = stringr::str_c(cache_directory, cache_file_name))
# combine data objects into one and save another file
tvs <- tvsproject::combine_data_objects()
cache_file_name <- stringr::str_c("tvs-dataset-", lubridate::today(), ".RData")
save(tvs, file = stringr::str_c(cache_directory, cache_file_name))
# reset global environment
rm(list = ls())
gc()
}
#' Import TVS data as a single dataset object with a row per unique patient.
#'
#' @param data_directory A full path to a directory that contains all raw/original TVS datasets.
#' @param metadata_directory A full path to a directory that contains all TVS metadata-files.
#'
#' @importFrom magrittr "%>%"
#' @export
import_data <- function(data_directory, metadata_directory) {
# set working directory temporarily to the input
original_wd <- getwd()
setwd(data_directory)
samples_file <- stringr::str_c(metadata_directory, "samples.csv", sep = "/")
unique_values_file <- stringr::str_c(metadata_directory, "unique-values.csv", sep = "/")
# -------------- data ----------------
lipids <- tvsproject::import_lipids(
lipids_file = "TVS_data_171220_lipidomics.xlsx",
samples_file = samples_file
)
mrna_blood_probes <- tvsproject::import_blood_mrna(
samples_file = samples_file,
blood_file = "tvs_blood_gwe_loess.txt"
)
mrna_mono_probes <- tvsproject::import_monocyte_mrna(
samples_file = samples_file,
monocyte_file = "tvs_monocyte_gwe_loess.txt"
)
mrna_artery_probes <- tvsproject::import_artery_mrna(
samples_file = samples_file,
artery_file = "tvs_plaque_gwe_loess_copy.csv"
)
mrna_all_probes <- tvsproject::import_mrna(
blood_file = "tvs_blood_gwe_loess.txt",
artery_file = "tvs_plaque_gwe_loess_copy.csv",
monocyte_file = "tvs_monocyte_gwe_loess.txt",
samples_file = samples_file
)
lipopro_samples <- tvsproject::import_lipoproteins(
lipoprotein_file = "TVS0001.1 - LIPO predictions.xls",
samples_file = samples_file,
format = "samples"
)
lipopro_particles <- tvsproject::import_lipoproteins(
lipoprotein_file = "TVS0001.1 - LIPO predictions.xls",
samples_file = samples_file,
format = "particles"
)
lmwm <- tvsproject::import_lmwm(
lmwm_file = "TVS0002.1 - LMWM predictions.xls",
samples_file = samples_file
)
histo <- tvsproject::import_histology(
database_file = "TVS 26.xlsx",
unique_values_file = unique_values_file,
samples_file = samples_file
)
deaths_patients <- tvsproject::import_deaths(
database_file = "TVS 26.xlsx",
samples_file = samples_file,
format = "patients"
)
deaths_events <- tvsproject::import_deaths(
database_file = "TVS 26.xlsx",
samples_file = samples_file,
format = "events"
)
diagnoses <- tvsproject::import_diagnoses(
database_file = "TVS 26.xlsx",
unique_values_file = unique_values_file
)
treatments <- tvsproject::import_treatments(
database_file = "TVS 26.xlsx",
unique_values_file = unique_values_file
)
symptoms <- tvsproject::import_symptoms(
database_file = "TVS 26.xlsx",
unique_values_file = unique_values_file
)
clinchem <- tvsproject::import_clinical_chemistry(database_file = "TVS 26.xlsx")
exposures <- tvsproject::import_exposures(
database_file = "TVS 26.xlsx",
unique_values_file = unique_values_file
)
macro <- tvsproject::import_macro(
database_file = "TVS 26.xlsx",
unique_values_file = unique_values_file
)
physiology <- tvsproject::import_physiology("TVS 26.xlsx")
setwd(original_wd)
# ----------- classify variables ----------------
mrna_all_genes <- tvsproject::classify_mrna_probes(
mrna_all_probes,
variables,
summary_function = "max"
)
mrna_artery_genes <- mrna_all_genes %>%
filter(!(sample_type %in% c("Monocyte", "Blood"))) %>%
bind_rows(mrna_artery_probes %>% magrittr::extract( , 1:2) %>% filter(is.na(sample_id)))
mrna_blood_genes <- mrna_all_genes %>%
filter(sample_type == "Blood") %>%
bind_rows(mrna_blood_probes %>% magrittr::extract( , 1:2) %>% filter(is.na(sample_id)))
mrna_monocyte_genes <- mrna_all_genes %>%
filter(sample_type == "Monocyte") %>%
bind_rows(mrna_mono_probes %>% magrittr::extract( , 1:2) %>% filter(is.na(sample_id)))
lipids_classes <- tvsproject::classify_lipids(
lipids,
variables,
summary_function = "sum"
)
# ------------- combine dataframes ----------------
tvs <- samples_file %>%
tvsproject::import_samples() %>%
select(patient_id) %>%
distinct() %>%
left_join(macro, by = "patient_id") %>%
left_join(physiology, by = "patient_id") %>%
left_join(exposures, by = "patient_id") %>%
left_join(clinchem, by = "patient_id") %>%
left_join(symptoms, by = "patient_id") %>%
left_join(treatments, by = "patient_id") %>%
left_join(diagnoses, by = "patient_id") %>%
left_join(deaths_patients, by = "patient_id") %>%
left_join(
rename(histo, sample_id_artery = sample_id),
by = "patient_id"
) %>%
left_join(lipids) %>%
left_join(lipopro_samples, by = "patient_id", suffix = c("_lipids", "")) %>%
left_join(
lmwm,
by = "patient_id",
suffix = c("_lipopro", "")
) %>%
left_join(
select(lipids_classes, -sample_id, -sample_type),
by = "patient_id"
) %>%
left_join(
mrna_artery_genes,
by = "patient_id",
suffix = c("_lmwm", "")
) %>%
left_join(
mrna_blood_genes,
by = "patient_id",
suffix = c("_artery", "")
) %>%
left_join(
mrna_monocyte_genes,
by = "patient_id",
suffix = c("_blood", "_monocyte")
) %>%
select(contains("_id"), contains("_type"), everything())
return(tvs)
}
#' Combine TVS data objects in the global environment into a single dataset object.
#'
#' The function assumes that the objects are named exatcly like the function "import_objects" assigns them.
#' When the function doesn't find the objects from its own environment, it searches them from the global environment.
#'
#' @return A dataset of class "tbl_df" (tibble) and "data.frame" (base).
#'
#' @importFrom magrittr "%>%"
#' @export
combine_data_objects <- function() {
tvs <- samples %>%
select(patient_id) %>%
distinct() %>%
left_join(macro, by = "patient_id") %>%
left_join(physiology, by = "patient_id") %>%
left_join(exposures, by = "patient_id") %>%
left_join(clinchem, by = "patient_id") %>%
left_join(symptoms, by = "patient_id") %>%
left_join(treatments, by = "patient_id") %>%
left_join(diagnoses, by = "patient_id") %>%
left_join(deaths_patients, by = "patient_id") %>%
left_join(
rename(histo, sample_id_artery = sample_id),
by = "patient_id"
) %>%
left_join(lipids) %>%
left_join(lipopro_samples, by = "patient_id", suffix = c("_lipids", "")) %>%
left_join(
lmwm,
by = "patient_id",
suffix = c("_lipopro", "")
) %>%
left_join(
select(lipids_classes, -sample_id, -sample_type),
by = "patient_id"
) %>%
left_join(
mrna_artery_genes,
by = "patient_id",
suffix = c("_lmwm", "")
) %>%
left_join(
mrna_blood_genes,
by = "patient_id",
suffix = c("_artery", "")
) %>%
left_join(
mrna_monocyte_genes,
by = "patient_id",
suffix = c("_blood", "_monocyte")
) %>%
select(contains("_id"), contains("_type"), everything())
return(tvs)
}
#' Read and clean TVS DNA data files for analysis.
#'
#' @importFrom magrittr "%>%"
#' @export
import_dna <- function(artery_file = "tvs_team16_artery.txt",
blood_file = "tvs_team16_blood.txt") {
dna_artery <- artery_file %>%
readr::read_tsv(col_types = stringr::str_dup("c", 68)) %>%
# SNP-metadata in meta_variables, remove here
dplyr::select(-Chr, -Position, -Alleles)
dna_blood <- blood_file %>%
readr::read_tsv(col_types = stringr::str_dup("c", 88)) %>%
# SNP-metadata in meta_variables, remove here
dplyr::select(-Chr, -Position, -Alleles)
dna <- dna_artery %>%
dplyr::full_join(dna_blood) %>%
# ugly gather-spread-workaround
dplyr::bind_rows(rlang::set_names(colnames(.), colnames(.)), .) %>%
as.matrix() %>%
t() %>%
dplyr::as_tibble(.name_repair = "unique") %>%
# ugly but working row_to_names -workaround
magrittr::set_colnames((( . %>% magrittr::extract(1, ) %>% as.character() )(.))) %>%
magrittr::extract(-1, ) %>%
dplyr::rename(sample_id = SNP_ID)
return(dna)
}
#' Read and clean TVS TaqMan DNA data file for analysis.
#'
#' This is an additional smaller SNP set (Furin, ApoE, IL, and ADAM)
#' measured with the AppliedBioSystems' Taqman assay.
#'
#' TODO: Recoding from numbers to bases.
#'
#' @importFrom magrittr "%>%" "%$%"
#' @export
import_dna_taqman <- function(taqman_file = "all TVS TaqMan genotypes August 2011.xls",
variable_metadata_file = "variables.txt") {
dna_taqman <- taqman_file %>%
readxl::read_excel(col_types = "text") %>%
dplyr::rename(sample_id = SampleID)
meta_variables <- variable_metadata_file %>%
readr::read_csv(col_types = stringr::str_dup("c", 22))
# create a map from gene_symbols to SNP identifiers
rename_map_dna_taqman <- colnames(dna_taqman) %>%
tibble::enframe() %>%
dplyr::rename(gene_symbol = value) %>%
dplyr::slice(9:20) %>%
# sorry, nesting to avoid creating intermediate object...
dplyr::left_join(
dplyr::mutate(
dplyr::select(
.data = meta_variables,
name_snakecase,
gene_symbol
),
gene_symbol = stringr::str_remove_all(gene_symbol, "-")
),
by = "gene_symbol"
) %$% # note the different pipe, see ?magrittr
rlang::set_names(gene_symbol, name_snakecase)
dna_taqman <- dna_taqman %>%
dplyr::rename(!!!rename_map_dna_taqman)
return(dna_taqman)
}
#' Read and clean TVS miRNA data file for analysis.
#'
#' @importFrom magrittr "%>%"
#' @export
import_mirna <- function(mirna_file = "TVS.miRNA.xls") {
mirna <- mirna_file %>%
readxl::read_excel() %>%
tidyr::gather("sample_id", "value", -1) %>%
tidyr::spread(Probe, value) %>%
dplyr::rename_all(
. %>%
stringr::str_to_lower() %>%
stringr::str_replace_all(c("-" = "_", "\\*" = "_star"))) %>%
dplyr::select(sample_id, darkcorner, mirnabrightcorner30:scorner3, dplyr::everything())
return(mirna)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.