R/compute_phewas.R

#' compute_phewas
#' 
#' computes the PheWAS analysis on concepts (UMLS concepts extracted from free text)
#' 
#' @param concepts concepts data.frame from the `get_data_from_num` function
#' @param patients patients data.frame from the `get_data_from_num` function
#' @param concepts_min_occur integer. minimum number of occurrences of the concept in the dataset to be analyzed
#' @param zero_controls_add integer. number of simulated patients to add to the control group for a given if there is no occurrence of this concept in the control group. This trick allows to compute a pvalue in this situation. 
#' @param min_concept integer. minimum number of unique concept for a patient to be included in the analysis
#' @param with_boot logical. perform a bootstrap on the results
#' @param perm integer. number of permutations for the bootstrap
#' @param boot_all logical. perform a bootstrap on all the results or only on the results with FDR corrected p.value < fdr_threshold
#' @param fdr_threshold maximum FDR corrected p.value to perform the bootstrap 
#' @param progress  progress bar object 
#' @export
compute_phewas <- function(concepts, 
                           patients, 
                           concepts_min_occur = 5, 
                           zero_controls_add = 2, 
                           min_concept = NULL, 
                           with_boot = FALSE, 
                           perm = 1000, 
                           boot_all = FALSE, 
                           fdr_threshold = 0.05,
                           progress = NULL,
                           cores = 1) {
  require(dplyr, quietly = TRUE)
  #require(parallel, quietly = TRUE)
  require(tidyr, quietly = TRUE)
  require(broom, quietly = TRUE)
  require(future)
  
  if (!is.null(min_concept)) {
    patients <- dplyr::filter(patients, UNIQ_CONCEPTS >= min_concept)
    concepts <- concepts[concepts$PATIENT_NUM %in% patients$PATIENT_NUM,]
  }
  

  if(nrow(concepts) == 0) {return(NULL)}
  
  if(!is.null(progress)) {
    progress$set(1, detail = 'Computing phewas') 
  } else {
    cat(stringr::str_interp('Computing phewas on concepts ...'))
  }
  
  n_patients <- concepts %>% dplyr::group_by(PATIENT_NUM,group) %>% 
    dplyr::summarise(bla = n()) %>% dplyr::group_by(group) %>% dplyr::summarise(n = n())
  #n_patients <- patients %>% dplyr::group_by(group) %>% dplyr::summarise(n = n())
  n_cases <- n_patients %>% dplyr::filter(group == 'cases') %>% dplyr::select(n)
  n_control <- n_patients %>% dplyr::filter(group == 'control') %>% dplyr::select(n)
  
  
  
  concepts_list <- concepts %>% 
    dplyr::group_by(CODE, CODE_LABEL, group, CAT) %>% 
    dplyr::summarize(count = sum(count)) 
  
  
  concepts_list <-reshape2::dcast(concepts_list,  
                                  formula = CODE+ CODE_LABEL+ CAT ~ group, 
                                  value.var = 'count',
                                  fill = 0)
  
  #cl <- tidyr::spread(concepts_list,group, count) 
  
  
  concepts_list <- concepts_list %>% 
    dplyr::ungroup() %>%
    dplyr::mutate(cases0 = as.numeric(n_cases) - cases,
                  control0 = as.numeric(n_control) - control, 
                  total = cases + control) %>% ungroup()
  
  
  concepts_keep <- concepts_list %>% 
    dplyr::filter(cases > 0, total >= concepts_min_occur)
  
  concepts_with_0 <- concepts_list %>% 
    dplyr::filter(control == 0 , total >= concepts_min_occur) %>% 
    dplyr::select(CODE) 
  
  adjust_facts <- concepts %>% group_by(CODE) %>%  
    dplyr::filter(CODE %in% concepts_with_0$CODE) %>% 
    dplyr::slice(zero_controls_add) %>% 
    dplyr::mutate(group = "control", PATIENT_NUM = 0) %>% data.frame()
  
  concepts <- rbind(concepts, adjust_facts)
  
  
  
  concepts_wide <- concepts %>% 
    dplyr::filter(CODE %in% concepts_keep$CODE) %>%
    dplyr::select(PATIENT_NUM, CODE,group, count, SEX)
  
  concepts_wide <- reshape2::dcast(concepts_wide, 
                                   PATIENT_NUM + group + SEX ~ CODE, 
                                   value.var = 'count',
                                   fill = 0)
  
  concepts_wide <- concepts_wide %>%
    dplyr::mutate(group = factor(group, levels =c('control','cases')))
  
  
  result = NULL
  
  #cores <- availableCores() 
  if (cores > 1 & Sys.info()[['sysname']] != 'Windows') {
    plan(multiprocess)
  } else {
    plan(sequential)
  }
  
  cstart = 4
  clist <- cstart:ncol(concepts_wide)
  seq_along(clist)
  
  chunks <- split(clist, ceiling(seq_along(clist)/ceiling(length(clist)/cores)))
  
  f <- list()
  for (i in 1:cores) {
    f[[i]] <- future({
      res_temp <- lapply(chunks[[i]], FUN= function(x, concepts_wide) {
          concept = names(concepts_wide[x])


          form <- as.formula(stringr::str_interp("${concept} ~ group",
                                                 list(concept = concept)))

          fit <- glm(form, data=concepts_wide, family= binomial())

          p <- fit %>%
            tidy() %>%
            dplyr::filter(term == 'groupcases') %>%
            dplyr::select(p.value) %>%
            as.numeric()

          res <- exp(cbind(coef(fit), suppressMessages(confint.default(fit, trace = TRUE)))) %>%
            tidy() %>% as.data.frame() %>%
            rename(term = .rownames, OR = V1, CI_inf = X2.5.., CI_sup = X97.5..) %>%
            dplyr::filter(term == 'groupcases')  %>%
            mutate(CODE = concept, p.value = p, fit = list(tidy((fit))))  %>% dplyr::select(-term)
          res



        }, concepts_wide = concepts_wide)
      lapply(res_temp, FUN=rbind) %>% data.table::rbindlist()
    })
  }
  
  res_temp <- lapply(f, value)
  result <- lapply(res_temp, FUN=rbind) %>% data.table::rbindlist()
  
  result <- lapply(res_temp, FUN=rbind) %>% data.table::rbindlist()
  
  result <- result %>%
    mutate( p_fdr = p.adjust(p.value, method = 'fdr'),
            p_bonf =p.adjust(p.value, method = 'bonferroni') ) %>%
    left_join(concepts_list, by = 'CODE') %>%
    dplyr::select(CODE, CODE_LABEL, cases, control, cases0,control0, OR, CI_inf, CI_sup, p.value, p_fdr, p_bonf,CAT) %>%
    arrange(p_fdr)
  
  result <- result[!duplicated(result[,c('CODE', 'CODE_LABEL')]),]
  
  if (with_boot) {

    if(!is.null(progress)) {
      progress$set(2, detail = 'Bootstrapping')
    } else {
      cat('Bootstrapping on concepts ...')
    }

    result <- bootstrap_results(result, fdr_threshold = fdr_threshold, concepts_wide, family = 'binomial',perm = perm,  boot_all = boot_all, cores = cores)

  }
  
  result <- rename(result, CODE_LABEL = CODE_LABEL)

  
  list(result = result, boot = with_boot)
  
}
aneuraz/multiWAS documentation built on May 14, 2019, 2:37 p.m.