#' compute_cim_phewas
#'
#' computes the PheWAS analysis on ICD codes
#'
#' @param cim cim data.frame from the `get_data_from_num` function
#' @param patients patients data.frame from the `get_data_from_num` function
#' @param cim_min_occur integer. minimum number of occurrences of the biological test result in the dataset to be analyzed
#' @param zero_controls_add integer. number of simulated patients to add to the control group for a given if there is no occurrence of this concept in the control group. This trick allows to compute a pvalue in this situation.
#' @param with_boot logical. perform a bootstrap on the results
#' @param perm integer. number of permutations for the bootstrap
#' @param boot_all logical. perform a bootstrap on all the results or only on the results with FDR corrected p.value < fdr_threshold
#' @param fdr_threshold maximum FDR corrected p.value to perform the bootstrap
#' @param progress progress bar object
#' @export
compute_cim_phewas <- function(cim,
patients,
cim_min_occur = 5,
zero_controls_add = 2 ,
with_boot = FALSE,
perm = 1000,
boot_all = FALSE,
fdr_threshold = 0.05,
progress = NULL,
cores = 1) {
require(dplyr, quietly = TRUE)
#require(parallel, quietly = TRUE)
require(tidyr, quietly = TRUE)
require(broom, quietly = TRUE)
require(future)
if(!is.null(progress)) {
progress$set(1, detail = 'Computing phewas')
} else {
cat('Computing phewas on ICD ...')
}
n_patients <- cim %>% dplyr::group_by(PATIENT_NUM,group) %>%
dplyr::summarise(bla = n()) %>% dplyr::group_by(group) %>% dplyr::summarise(n = n())
#n_patients <- patients %>% dplyr::group_by(group) %>% dplyr::summarise(n = n())
n_cases <- n_patients %>% dplyr::filter(group == 'cases') %>% dplyr::select(n)
n_control <- n_patients %>% dplyr::filter(group == 'control') %>% dplyr::select(n)
cim_list <- cim %>%
group_by(PARENT_CODE, PARENT_LABEL, group) %>%
summarize(count = n()) %>%
spread(group,count) %>%
ungroup() %>%
mutate(cases = ifelse(is.na(cases), 0, cases),
control = ifelse(is.na(control), 0, control),
cases0 = as.numeric(n_cases) - cases,
control0 = as.numeric(n_control) - control,
total = cases + control)
cim_keep <- cim_list %>%
dplyr::filter(cases > 0, total >= cim_min_occur, stringr::str_detect(PARENT_CODE, '^[A-Z]'))
cim_with_0 <- cim_list %>%
dplyr::filter(control == 0 , total >= cim_min_occur) %>%
dplyr::select(PARENT_CODE)
adjust_facts <- cim %>% group_by(PARENT_CODE) %>%
dplyr::filter(PARENT_CODE %in% cim_with_0$PARENT_CODE) %>%
slice(zero_controls_add) %>%
mutate(group = "control", PATIENT_NUM = 0) %>% data.frame()
cim <- rbind(cim %>% ungroup(), adjust_facts)
cim_wide <- cim %>%
dplyr::filter(PARENT_CODE %in% cim_keep$PARENT_CODE) %>%
dplyr::select(PATIENT_NUM, PARENT_CODE,group, count, SEX) %>%
mutate(count = 1) %>%
distinct(PATIENT_NUM, PARENT_CODE, .keep_all = TRUE) %>%
spread(key =PARENT_CODE,value = count, fill = 0) %>%
mutate(group = factor(group, levels =c('control','cases')))
result = NULL
#cores <- availableCores()
if (cores > 1 & Sys.info()[['sysname']] != 'Windows') {
plan(multiprocess)
} else {
plan(sequential)
}
cstart = 4
clist <- cstart:ncol(cim_wide)
seq_along(clist)
chunks <- split(clist, ceiling(seq_along(clist)/ceiling(length(clist)/cores)))
f <- list()
for (i in 1:cores) {
f[[i]] <- future({
res_temp <- lapply(chunks[[i]], FUN= function(x, cim_wide) {
concept = names(cim_wide)[x]
form <- as.formula(stringr::str_interp("`${concept}` ~ group",
list(concept = concept)))
fit <- glm(form, data=cim_wide, family= binomial())
p <- fit %>%
tidy() %>%
dplyr::filter(term == 'groupcases') %>%
dplyr::select(p.value) %>%
as.numeric()
res <- exp(cbind(coef(fit), suppressMessages(confint.default(fit, trace = TRUE)))) %>%
tidy() %>% as.data.frame() %>%
rename(term = .rownames, OR = V1, CI_inf = X2.5.., CI_sup = X97.5..) %>%
dplyr::filter(term == 'groupcases') %>%
mutate(PARENT_CODE = concept, p.value = p, fit = list(tidy((fit)))) %>% dplyr::select(-term)
res
}, cim_wide = cim_wide)
lapply(res_temp, FUN=rbind) %>% data.table::rbindlist()
})
}
res_temp <- lapply(f, value)
result <- lapply(res_temp, FUN=rbind) %>% data.table::rbindlist()
result <- result %>%
mutate( p_fdr = p.adjust(p.value, method = 'fdr'),
p_bonf =p.adjust(p.value, method = 'bonferroni') ) %>%
left_join(cim_list, by = 'PARENT_CODE') %>%
dplyr::select(PARENT_CODE, PARENT_LABEL, cases, control, cases0,control0, OR, CI_inf, CI_sup, p.value, p_fdr, p_bonf,fit) %>%
dplyr::rename(CODE = PARENT_CODE, CODE_LABEL = PARENT_LABEL) %>%
arrange(p_fdr)
result <- result[!duplicated(result[,c('CODE', 'CODE_LABEL')]),]
if (with_boot) {
if(!is.null(progress)) {
progress$set(2, detail = 'Bootstrapping')
} else {
cat('Bootstrapping on ICD ...')
}
result <- bootstrap_results(result, fdr_threshold = fdr_threshold, concepts_wide = cim_wide, family = 'binomial',perm = perm, boot_all = boot_all, cores = cores)
}
result <- rename(result, CODE_LABEL = CODE_LABEL)
list(result = result, boot = with_boot)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.