R/compute_bio_phewas.R

Defines functions compute_bio_phewas

Documented in compute_bio_phewas

#' compute_bio_phewas

#' computes the PheWAS analysis on numeric biological test results
#' 
#' @param bio bio data.frame from the `get_data_from_num` function
#' @param patients patients data.frame from the `get_data_from_num` function
#' @param direction one of c('SUP', 'INF') to perform the analysis on results above the normal range (SUP) or below the normal range(INF)
#' @param bio_min_occur integer. minimum number of occurrences of the biological test result in the dataset to be analyzed
#' @param zero_controls_add integer. number of simulated patients to add to the control group for a given if there is no occurrence of this concept in the control group. This trick allows to compute a pvalue in this situation. 
#' @param patients_min_concept integer. minimum number of unique concept for a patient to be included in the analysis
#' @param with_boot logical. perform a bootstrap on the results
#' @param perm integer. number of permutations for the bootstrap
#' @param boot_all logical. perform a bootstrap on all the results or only on the results with FDR corrected p.value < fdr_threshold
#' @param fdr_threshold maximum FDR corrected p.value to perform the bootstrap 
#' @param analysis_type one of c('poisson', 'binomial') for the type of analysis to perform
#' @param progress  progress bar object 
#' @export
compute_bio_phewas <- function(bio, 
                               patients, 
                               direction = "SUP",
                               bio_min_occur = 1, 
                               zero_controls_add = 2, 
                               patients_min_concept = NULL, 
                               with_boot = FALSE, 
                               perm = 1000, 
                               boot_all = FALSE, 
                               fdr_threshold = 0.05,
                               analysis_type = 'poisson',
                               progress= NULL,
                               cores = 1) {
  require(dplyr, quietly = TRUE)
  #require(parallel, quietly = TRUE)
  require(tidyr, quietly = TRUE)
  require(broom, quietly = TRUE)
  require(future)
  
  if(!is.null(progress)) {
    progress$set(1, detail = 'Computing phewas') 
  } else {
    cat('Computing phewas on bio ...')
  }
  
  dir_to_suppress <- ifelse(direction == "SUP", "INF", "SUP")
  
  n_patients <- bio %>% dplyr::group_by(PATIENT_NUM,group) %>% 
    dplyr::summarise(bla = n()) %>% dplyr::group_by(group) %>% dplyr::summarise(n = n())
  #n_patients <- patients %>% dplyr::group_by(group) %>% dplyr::summarise(n = n())
  n_cases <- n_patients %>% dplyr::filter(group == 'cases') %>% dplyr::select(n)
  n_control <- n_patients %>% dplyr::filter(group == 'control') %>% dplyr::select(n)
  
  bio <- dplyr::rename_(bio, VALUE = direction)
  #bio$VALUE = bio[,direction]
  bio$count = 1
  
  bio_list <- bio %>% 
    distinct(CODE,CODE_LABEL,PARENT_LABEL, PATIENT_NUM, group) %>%
    group_by(CODE, CODE_LABEL,PARENT_LABEL, group)
  
  if (analysis_type == 'poisson') {
    bio_list <- summarize(bio_list, count = n())
    
  } else if (analysis_type == 'binomial') {
    bio_list <- summarize(bio_list, count = 1)
  }
  
  bio_list <-  tidyr::spread(bio_list, group,count) %>% 
    ungroup() %>%
    mutate(cases = ifelse(is.na(cases), 0, cases),
           control = ifelse(is.na(control), 0, control),
           cases0 = as.numeric(n_cases) - cases,
           control0 = as.numeric(n_control) - control, 
           total = cases + control) 
  
  
  bio_keep <- bio_list %>% 
    dplyr::filter(cases > 0, total >= bio_min_occur)
  
  
  bio_with_0 <- bio_list %>% 
    dplyr::filter(control == 0 , total >= bio_min_occur) %>% 
    dplyr::select(CODE) 
  
  if (nrow(bio_with_0)>0) {
    adjust_facts <- bio %>% group_by(CODE) %>%  
      dplyr::filter(CODE %in% bio_with_0$CODE) %>% 
      slice(zero_controls_add) %>% 
      mutate(group = "control", PATIENT_NUM = 0) %>% data.frame()
    
    bio <- rbind(bio %>% ungroup(), adjust_facts)
  }
  
  
  bio_wide <- bio %>% 
    dplyr::filter(CODE %in% bio_keep$CODE) %>%
    dplyr::select(PATIENT_NUM, CODE,group, VALUE, SEX) %>% 
    group_by(PATIENT_NUM, CODE, group, SEX) %>%
    summarise(VALUE = sum(VALUE)) %>%
    ungroup()
  
  if (analysis_type == 'binomial') {
    bio_wide$VALUE = 1
  } 
  
  bio_wide <- spread(bio_wide, key =CODE,value = VALUE,  fill = 0) %>%
    mutate(group = factor(group, levels =c('control','cases')))
  
  
  result = NULL
  
  #cores <- availableCores() 
  if (cores > 1 & Sys.info()[['sysname']] != 'Windows') {
    plan(multiprocess)
  } else {
    plan(sequential)
  }
  
  cstart = 4
  clist <- cstart:ncol(bio_wide)
  seq_along(clist)
  
  chunks <- split(clist, ceiling(seq_along(clist)/ceiling(length(clist)/cores)))
  
  f <- list()
  for (i in 1:cores) {
    f[[i]] <- future({
      
      res_temp <- lapply(chunks[[i]], FUN= function(x, bio_wide, family) {
        concept = names(bio_wide[x])
        
        form <- as.formula(stringr::str_interp("`${concept}`~ group",
                                               list(concept = concept)))
        
        fit <- glm(form, data=bio_wide, family= family) 
        
        p <- fit %>% 
          tidy() %>% 
          dplyr::filter(term == 'groupcases') %>% 
          dplyr::select(p.value) %>% 
          as.numeric()
        
        res <- exp(cbind(coef(fit), suppressMessages(confint.default(fit, trace = TRUE)))) %>%
          tidy() %>% as.data.frame() %>%
          rename(term = .rownames, OR = V1, CI_inf = X2.5.., CI_sup = X97.5..) %>%
          dplyr::filter(term == 'groupcases')  %>%
          mutate(CODE = concept, p.value = p, fit = list(tidy((fit))))  %>% dplyr::select(-term)
        res
      },bio_wide = bio_wide, family = ifelse(analysis_type == 'poisson', poisson, binomial))
      
      lapply(res_temp, FUN=rbind) %>% data.table::rbindlist()

    })
  }
  
  res_temp <- lapply(f, value)
  result <- lapply(res_temp, FUN=rbind) %>% data.table::rbindlist()
  
  result <- result %>%
    mutate( p_fdr = p.adjust(p.value, method = 'fdr'),
            p_bonf =p.adjust(p.value, method = 'bonferroni'),
            direction = direction) %>%
    left_join(bio_list, by = 'CODE') %>%
    dplyr::select(CODE, CODE_LABEL, cases, control, cases0,control0, OR, CI_inf, CI_sup, p.value, p_fdr, p_bonf, PARENT_LABEL,fit, direction) %>%
    arrange(p_fdr)
  
  result <- result[!duplicated(result[,c('CODE', 'CODE_LABEL')]),]
  
  if (with_boot) {
    
    if(!is.null(progress)) {
      progress$set(2, detail = 'Bootstrapping') 
    } else {
      cat('Bootstrapping on ICD ...')
    }
    
    result <- bootstrap_results(result, fdr_threshold = fdr_threshold, concepts_wide = bio_wide, family = analysis_type,perm = perm,  boot_all = boot_all, cores = cores)
  }
  
  result <- rename(result, CODE_LABEL = CODE_LABEL, CAT = PARENT_LABEL)

  
  list(result = result, boot = with_boot)
  
}
aneuraz/multiWAS documentation built on July 29, 2017, 3:39 a.m.