#' compute_phewas
#'
#' computes the PheWAS analysis on concepts (UMLS concepts extracted from free text)
#'
#' @param concepts concepts data.frame from the `get_data_from_num` function
#' @param patients patients data.frame from the `get_data_from_num` function
#' @param concepts_min_occur integer. minimum number of occurrences of the concept in the dataset to be analyzed
#' @param zero_controls_add integer. number of simulated patients to add to the control group for a given if there is no occurrence of this concept in the control group. This trick allows to compute a pvalue in this situation.
#' @param min_concept integer. minimum number of unique concept for a patient to be included in the analysis
#' @param with_boot logical. perform a bootstrap on the results
#' @param perm integer. number of permutations for the bootstrap
#' @param boot_all logical. perform a bootstrap on all the results or only on the results with FDR corrected p.value < fdr_threshold
#' @param fdr_threshold maximum FDR corrected p.value to perform the bootstrap
#' @param progress progress bar object
#' @export
compute_phewas <- function(concepts,
patients,
concepts_min_occur = 5,
zero_controls_add = 2,
min_concept = NULL,
with_boot = FALSE,
perm = 1000,
boot_all = FALSE,
fdr_threshold = 0.05,
progress = NULL,
cores = 1) {
require(dplyr, quietly = TRUE)
#require(parallel, quietly = TRUE)
require(tidyr, quietly = TRUE)
require(broom, quietly = TRUE)
require(future)
if (!is.null(min_concept)) {
patients <- dplyr::filter(patients, UNIQ_CONCEPTS >= min_concept)
concepts <- concepts[concepts$PATIENT_NUM %in% patients$PATIENT_NUM,]
}
if(nrow(concepts) == 0) {return(NULL)}
if(!is.null(progress)) {
progress$set(1, detail = 'Computing phewas')
} else {
cat(stringr::str_interp('Computing phewas on concepts ...'))
}
n_patients <- concepts %>% dplyr::group_by(PATIENT_NUM,group) %>%
dplyr::summarise(bla = n()) %>% dplyr::group_by(group) %>% dplyr::summarise(n = n())
#n_patients <- patients %>% dplyr::group_by(group) %>% dplyr::summarise(n = n())
n_cases <- n_patients %>% dplyr::filter(group == 'cases') %>% dplyr::select(n)
n_control <- n_patients %>% dplyr::filter(group == 'control') %>% dplyr::select(n)
concepts_list <- concepts %>%
dplyr::group_by(CODE, CODE_LABEL, group, CAT) %>%
dplyr::summarize(count = sum(count))
concepts_list <-reshape2::dcast(concepts_list,
formula = CODE+ CODE_LABEL+ CAT ~ group,
value.var = 'count',
fill = 0)
#cl <- tidyr::spread(concepts_list,group, count)
concepts_list <- concepts_list %>%
dplyr::ungroup() %>%
dplyr::mutate(cases0 = as.numeric(n_cases) - cases,
control0 = as.numeric(n_control) - control,
total = cases + control) %>% ungroup()
concepts_keep <- concepts_list %>%
dplyr::filter(cases > 0, total >= concepts_min_occur)
concepts_with_0 <- concepts_list %>%
dplyr::filter(control == 0 , total >= concepts_min_occur) %>%
dplyr::select(CODE)
adjust_facts <- concepts %>% group_by(CODE) %>%
dplyr::filter(CODE %in% concepts_with_0$CODE) %>%
dplyr::slice(zero_controls_add) %>%
dplyr::mutate(group = "control", PATIENT_NUM = 0) %>% data.frame()
concepts <- rbind(concepts, adjust_facts)
concepts_wide <- concepts %>%
dplyr::filter(CODE %in% concepts_keep$CODE) %>%
dplyr::select(PATIENT_NUM, CODE,group, count, SEX)
concepts_wide <- reshape2::dcast(concepts_wide,
PATIENT_NUM + group + SEX ~ CODE,
value.var = 'count',
fill = 0)
concepts_wide <- concepts_wide %>%
dplyr::mutate(group = factor(group, levels =c('control','cases')))
result = NULL
#cores <- availableCores()
if (cores > 1 & Sys.info()[['sysname']] != 'Windows') {
plan(multiprocess)
} else {
plan(sequential)
}
cstart = 4
clist <- cstart:ncol(concepts_wide)
seq_along(clist)
chunks <- split(clist, ceiling(seq_along(clist)/ceiling(length(clist)/cores)))
f <- list()
for (i in 1:cores) {
f[[i]] <- future({
res_temp <- lapply(chunks[[i]], FUN= function(x, concepts_wide) {
concept = names(concepts_wide[x])
form <- as.formula(stringr::str_interp("${concept} ~ group",
list(concept = concept)))
fit <- glm(form, data=concepts_wide, family= binomial())
p <- fit %>%
tidy() %>%
dplyr::filter(term == 'groupcases') %>%
dplyr::select(p.value) %>%
as.numeric()
res <- exp(cbind(coef(fit), suppressMessages(confint.default(fit, trace = TRUE)))) %>%
tidy() %>% as.data.frame() %>%
rename(term = .rownames, OR = V1, CI_inf = X2.5.., CI_sup = X97.5..) %>%
dplyr::filter(term == 'groupcases') %>%
mutate(CODE = concept, p.value = p, fit = list(tidy((fit)))) %>% dplyr::select(-term)
res
}, concepts_wide = concepts_wide)
lapply(res_temp, FUN=rbind) %>% data.table::rbindlist()
})
}
res_temp <- lapply(f, value)
result <- lapply(res_temp, FUN=rbind) %>% data.table::rbindlist()
result <- lapply(res_temp, FUN=rbind) %>% data.table::rbindlist()
result <- result %>%
mutate( p_fdr = p.adjust(p.value, method = 'fdr'),
p_bonf =p.adjust(p.value, method = 'bonferroni') ) %>%
left_join(concepts_list, by = 'CODE') %>%
dplyr::select(CODE, CODE_LABEL, cases, control, cases0,control0, OR, CI_inf, CI_sup, p.value, p_fdr, p_bonf,CAT) %>%
arrange(p_fdr)
result <- result[!duplicated(result[,c('CODE', 'CODE_LABEL')]),]
if (with_boot) {
if(!is.null(progress)) {
progress$set(2, detail = 'Bootstrapping')
} else {
cat('Bootstrapping on concepts ...')
}
result <- bootstrap_results(result, fdr_threshold = fdr_threshold, concepts_wide, family = 'binomial',perm = perm, boot_all = boot_all, cores = cores)
}
result <- rename(result, CODE_LABEL = CODE_LABEL)
list(result = result, boot = with_boot)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.