R_tmp/merge_coloc_results_each.R

#' Iterate merging of coloc results across files
#'
#' @param coloc_rda_files A list of full paths to coloc .RData/.rda results files generated by Jack's coloc scripts.
#' @param save_path File path where the merged results will be saved.
#' @param ppH4_thresh Threshold to filter summary-level results.
#' @param filter Flexible row filtering.
#' @param force_new If the \code{save_path} file already exists, overwrite it.
#' @inheritParams merge_coloc_results
#' @family coloc
#' @keywords internal
#' @examples
#' # List RDS files you want to extract info from
#' coloc_rds <- list.files("/sc/hydra/projects/ad-omics/microglia_omics/COLOC", pattern = "*_COLOC.RData", recursive = TRUE, full.names = TRUE)
#' coloc_rds <- coloc_rds[!grepl("*_sQTL_*|Microglia_all_regions_Young",coloc_rds)]
#' coloc_rds <- coloc_rds[grep("*Microglia_all_regions_*",coloc_rds)]
#'
#' # Summary-level
#' coloc_summary <- merge_coloc_results_each(coloc_rds_files=coloc_rds, save_path="/sc/arion/projects/pd-omics/brian/all_COLOC_results.Microglia_all_regions.summary-level.tsv.gz", results_level="summary")
#' # SNP-level
#' coloc_snp <- merge_coloc_results_each(coloc_rds_files=coloc_rds, save_path="/sc/arion/projects/pd-omics/brian/all_COLOC_results.Microglia_all_regions.snp-level.tsv.gz", results_level="snp", filter="leadSNP==T")
merge_coloc_results_each <- function(coloc_rda_files,
                                     save_path=FALSE,
                                     save_each=FALSE,
                                     results_level="summary",
                                     ppH4_thresh=0,
                                     return_filter=FALSE,
                                     force_new=TRUE,
                                     nThread=1,
                                     verbose=TRUE){
  if(file.exists(save_path) & force_new==FALSE){
    messager("Importing pre-existing file:",save_path, v=verbose)
    merged_COLOC <- data.table::fread(save_path, nThread = nThread)
  } else {
    merged_COLOC <- lapply(coloc_rda_files, function(f,
                                                     .results_level=results_level,
                                                     .ppH4_thresh=ppH4_thresh,
                                                     .return_filter=return_filter,
                                                     .save_path=save_path,
                                                     .save_each=save_each,
                                                     .nThread=nThread,
                                                     .verbose=verbose){
      i <- grep(f,coloc_rda_files)
      message(paste0("(",i,"/",length(coloc_rda_files),") "), f)
      load(f)
      # Save a study-specific file with the full results
      if(save_each){
        subset_path <- file.path(dirname(.save_path),
                                 gsub("_COLOC.RData$",paste0("_COLOC.",.results_level,"-level.tsv.gz"), basename(f)))
        messager("+ Saving study-specific results ==>",subset_path,v=.verbose)
      }  else {subset_path <- F}
      merged_coloc <- merge_coloc_results(all_obj = all_obj,
                                          results_level = .results_level,
                                          nThread = .nThread,
                                          save_path = subset_path,
                                          verbose  = FALSE)
      if(.results_level=="summary"){
        merged_coloc <- subset(merged_coloc,  PP.H4.abf > .ppH4_thresh)
        messager("+",nrow(merged_coloc),"Locus-Gene pairs identified at",paste0("PP.H4>",.ppH4_thresh), v=verbose)
      }
      merged_coloc$Study <- basename(dirname(f))
      merged_coloc$Dataset <- gsub("_COLOC.tsv|_COLOC.RData","",basename(f))
      merged_coloc$Path <- f
      merged_coloc <- echodata::standardize_gene(dat = merged_coloc,
                                                 Gene = "gene")
      merged_coloc <- subset(merged_coloc, !is.na(Gene))
      # Give each Comparison-Locus-Gene combination a unique ID
      merged_coloc <- mutate(merged_coloc, id=paste(Dataset,Locus,Gene,sep="."))
      # Get lead SNPs
      if(.results_level=="snp"){
        # Assign lead snps
        ## Have to do this via merging to avoid assigning lead SNP in one gene as leadSNP in all genes.
        messager("Assigning Locus-Gene specific lead SNPs",v=.verbose)
        lead.df <- merged_coloc %>%
          dplyr::group_by(Dataset, Locus, Gene) %>%
          dplyr::arrange(qtl.pvalues,dplyr::desc(abs(qtl.beta))) %>%
          dplyr::slice(1) %>%
          dplyr::mutate(leadSNP=TRUE) %>%
          dplyr::select(c("Study","Dataset","Locus","Gene","snp","leadSNP"))
        merged_coloc <- data.table::merge.data.table(merged_coloc,
                                                     lead.df,
                                                     by = c("Study","Dataset","Locus","Gene","snp"),
                                                     all.x = TRUE)
        merged_coloc[is.na(merged_coloc$leadSNP),"leadSNP"] <- F
      }
      # Filter
      if(.return_filter!=FALSE) merged_coloc <- subset(merged_coloc, eval(parse(text = .return_filter)))
      return(merged_coloc)
    }) %>% data.table::rbindlist(fill=TRUE)
    if(save_path!=FALSE){
      messager("+ Saving merged results ==>", save_path, v=verbose)
      dir.create(dirname(save_path), showWarnings = FALSE, recursive = TRUE)
      data.table::fwrite(merged_COLOC, save_path, nThread = nThread)
    }
  }
  return(merged_COLOC)
}
RajLabMSSM/echolocatoR documentation built on Jan. 29, 2023, 6:04 a.m.