R_tmp/mergeQTL.R

################## QTL Data ##################
# V- psychENCODE
# V- Fairfax
# V- MESA
# V- Cardiogenics
# V- Brain_xQTL_Serve
# - ROSMAP + CommmonMind (larget than Brain_xQTL_Serve)
# - STARNET
# V- GTEx V7 & V8 (49 tissues)


mergeQTL.add_SNP_id <- function(FM_all){
  FM_all <- FM_all %>%
    dplyr::mutate(SNP_id=paste0(gsub("chr","",CHR),":",POS)) %>%
    data.table::data.table()
  return(FM_all)
}


mergeQTL.psychENCODE_SNPinfo <- function(QTL.sub,
                                         SNPinfo_path="/sc/orga/projects/ad-omics/data/psychENCODE/SNP_Information_Table_with_Alleles.txt.gz"){
  messager("+ mergeQTL: Merging external SNP info...")
  SNPinfo <- data.table::fread(SNPinfo_path, nThread = 4)
  SNPinfo.sub <- subset(SNPinfo, PEC_id %in% QTL.sub$SNP_id, select=c("PEC_id","Rsid","REF","ALT"))
  snp_merge <- data.table:::merge.data.table(QTL.sub, SNPinfo.sub,
                                             by.x = "SNP_id", by.y = "PEC_id", all.x = T)
  return(snp_merge)
}

# psychENCODE
## Need: StdErr, MAF
mergeQTL.psychENCODE <- function(FM_all=merge_finemapping_results(minimum_support = 0),
                                 local_files=T,
                                 force_new_subset=FALSE,
                                 ASSAYS=c("eQTL","cQTL","isoQTL","tQTL") #
                                 ){
  FM_all <- mergeQTL.add_SNP_id(FM_all)
  for(assay in ASSAYS){
    messager("psychENCODE:: Checking for overlap with",assay)
    output_path <- file.path("./Data/QTL/psychENCODE", assay, paste0("psychENCODE_",assay,".finemap.txt"))
    output_path_gz <- paste0(output_path,".gz")
    if(file.exists(output_path_gz) & force_new_subset==FALSE){
      messager("psychENCODE:: Importing pre-existing file.")
      QTL.sub <- data.table::fread(output_path_gz, nThread = 4)
      # Drop duplicate columns (Peak_center)
      QTL.sub <- QTL.sub[,unique(names(QTL.sub)),with=F]
    } else {
      server_file <- Directory_info(dataset_name = paste0("psychENCODE_",assay), "fullSS.local")
      QTL <- data.table::fread(server_file, nThread = 4)
      fullSS_nrow <- get_nrows(server_file)
      # Subset QTL data
      if(assay %in% c("fQTL")){
        QTL <- QTL %>% dplyr::rename(gene_chr=Chromosome_of_variant,
                                      gene_start=Locus_of_variant,
                                      regression_slope=Regression_slope) %>%
        dplyr::mutate(FDR=p.adjust(Nominal_p_val_of_association,method = "fdr", n = n_snps),
                      SNP_id=paste0(gsub("chr","",gene_chr),":",gene_start))
        }
      QTL.sub <- QTL %>% subset(SNP_id %in% FM_all$SNP_id)
      QTL.sub <- mergeQTL.psychENCODE_SNPinfo(QTL.sub)

      # Write
      dir.create(dirname(output_path), showWarnings = F, recursive = T)
      data.table::fwrite(QTL.sub, output_path, sep="\t", nThread = 4)
      R.utils::gzip(output_path, overwrite=T, remove=TRUE)
    }

    if(!("gene_id" %in% colnames(QTL.sub))){QTL.sub$gene_id <- NA}
    # QTLs have can multiple probes per SNP location.
    ## Pick only the best one (lowest FDR and highest Effect) keep each row as a unique genomic position:
    QTL.sub <- QTL.sub %>%
      dplyr::group_by(SNP_id) %>%
     dplyr::arrange(FDR,dplyr::desc(regression_slope)) %>%
      dplyr::slice(1) %>%
      data.table::data.table()
    # Select and rename columns
    QTL.sub <-  (QTL.sub %>%
      dplyr::mutate(SNP=Rsid,
                    QTL.Effect=regression_slope,
                    QTL.StdErr=NA,
                    QTL.P=nominal_pval,
                    QTL.FDR=FDR,
                    QTL.MAF=NA,
                    QTL.A1=REF,
                    QTL.A2=ALT,
                    QTL.SampleSize=1866, #Assuming same size for each (not sure if true)
                    QTL.Gene=gene_id) %>%
      dplyr::select(SNP, QTL.Effect, QTL.StdErr, QTL.P, QTL.FDR, QTL.MAF, QTL.A1, QTL.A2, QTL.SampleSize, QTL.Gene) %>%
     dplyr::arrange(QTL.FDR,dplyr::desc(QTL.Effect)) ) %>%
      data.table::data.table()
    FM_all <- data.table:::merge.data.table(FM_all,
                                            QTL.sub,
                                            by = "SNP",
                                            all.x = T, allow.cartesian = T)
  }
  return(FM_all)
}



mergeQTL.Fairfax_probemap <- function(gene_list, probe_path.="./Data/QTL/Fairfax_2014/gene.ILMN.map"){
  messager("++ Extracting probe info")
  cmd <- paste0("grep -E '", paste0(gene_list, collapse="|"),"' ",probe_path.)
  col_names <- data.table::fread(probe_path., sep="\t", nrows = 0) %>% colnames()
  probe_map <- data.table::fread(text = system(cmd, intern = T),
                                 sep = "\t", header = F, col.names = col_names)
  if(dim(probe_map)[1]==0){
    stop("Could not identify gene in the probe mapping file: ",paste(gene_list, collapse=", "))
  }
  # Subset just to make sure you didn't accidentally grep other genes
  probe_map <- subset(probe_map, GENE %in% gene_list)
  probe_map <- setNames(probe_map$GENE, probe_map$PROBE_ID)
  return(probe_map)
}

mergeQTL.Fairfax_extract_SNPinfo <- function(genotype_path="/sc/orga/projects/ad-omics/data/fairfax/volunteer_421.impute2.dosage",
                                             rsid.key_path="/sc/orga/projects/ad-omics/data/fairfax/rsid.key.tsv.gz"){
  ### Extract SNP, CHR, and POS from first several cols of Fairfax genotype file
  messager("mergeQTL::Fairfax: Extracting RSID, CHR, POS, REF and ALT from:",basename(genotype_path))
  if(file.exists(rsid.key_path)){
    messager("+ mergeQTL::Fairfax: Pre-existing file detected. Importing...")
  } else {
    system(paste0("awk -F ' ' 'BEGIN { print \"RSID\tCHR\tPOS\tREF\tALT\" }; { print $2\"\t\"$1\"\t\"$3\"\t\"$4\"\t\"$5 }' ", genotype_path," | gzip > ",rsid.key_path))
    messager("+ mergeQTL::Fairfax: File saved to ==>", rsid.key_path)
  }
  rsid.key <- data.table::fread(rsid.key_path, nThread = 4)
  return(rsid.key)
}

mergeQTL.Fairfax_extract_MAF <- function(dat,
                                         MAF_dir=file.path("/sc/orga/projects/ad-omics/data/fairfax/",
                                                           "imputation/imputation-hrc","MAF")){
  #NOTE!: There still a lot of mismatched alleles. Need to figure out how to fix.
  dat <- dat %>% tidyr::separate(SNP_id, c('CHR', 'POS'), sep=":", extra="drop", remove=FALSE)
  messager("mergeQTL::Fairfax: Merging with imputation files to get REF(0), ALT(1) and MAF...")
  merged.DAT <- lapply(unique(dat$CHR), function(chr){
    messager("mergeQTL::Fairfax: Chrom",chr)
    dat.chr = subset(dat, CHR==chr)
    MAF_path <- file.path(MAF_dir,paste0("chr",chr,".info"))
    # MAF_path <- file.path(MAF_dir,paste0("chr",chr,".filtered.tab"))
    maf.info <- data.table::fread(MAF_path, nThread = 4)
    maf.info <- maf.info %>% dplyr::rename(SNP_id=SNP)
    merge.dat <- data.table:::merge.data.table(dat.chr,
                                  maf.info[,c("SNP_id","REF(0)","ALT(1)","MAF")],
                                  by="SNP_id", all.x = T)
    # Check if these are the same ref/alt in the other Fairfax files
    ref_bool <- merge.dat$REF!=merge.dat$`REF(0)`
    alt_bool <- merge.dat$ALT!=merge.dat$`ALT(1)`
    messager(" +", sum(ref_bool, na.rm = T), "REF allleles did not match.")
    messager(" +", sum(alt_bool, na.rm = T), "ALT allleles did not match.")
    return(merge.dat)
  }) %>% data.table::rbindlist(fill=TRUE)
  return(merged.DAT)
}


# Fairfax: eQTL
## All fields complete for coloc!
mergeQTL.Fairfax <- function(FM_all, CONDITIONS=c("CD14","IFN","LPS2","LPS24"), force_new_subset=FALSE){
  FM_all <- mergeQTL.add_SNP_id(FM_all)
  for(condition in CONDITIONS){
    dataset <- paste0("Fairfax_2014_",condition)
    messager("mergeQTL::Fairfax:: Processing:",dataset)
    server_path <- Directory_info(paste0("Fairfax_2014_", condition),variable = "fullSS.local")
    output_path <- file.path("./Data/QTL/Fairfax_2014",condition,paste0(dataset,".finemap.txt"))
    output_path_gz <- paste0(output_path,".gz")
    dir.create(dirname(output_path), showWarnings = F, recursive = T)

    if(file.exists(output_path_gz)& force_new_subset==FALSE){
      messager("+ mergeQTL::Fairfax: Pre-existing file detected. Importing",output_path_gz,"...")
      dat <- data.table::fread(output_path_gz, nThread = 4)
    } else {
      probe_map <- mergeQTL.Fairfax_probemap(gene_list = unique(FM_all$Gene), probe_path.="./Data/QTL/Fairfax_2014/gene.ILMN.map")
      header <- echodata::get_header(server_path)
      messager("+ mergeQTL::Fairfax: Subsetting full summary stats file...")
      system(paste0("grep -E '", paste(c(header, unique(names(probe_map))), collapse="|"),"' " ,server_path," > ",output_path))

      # Import data
      dat <- data.table::fread(output_path, nThread = 4)

      ## Rename vars
      dat <- dplyr::rename(dat, SNP_id=SNP, probe=gene)
      dat$QTL.Gene <- probe_map[dat$probe]
      ## Add SNP col
      rsid.key <- mergeQTL.Fairfax_extract_SNPinfo()
      rsid.key <- rsid.key %>% dplyr::mutate(SNP_id=paste0(CHR,":",POS))
      ### Extract SNPs from FM_all
      dat <- data.table:::merge.data.table(dat, rsid.key[,c("RSID","REF","ALT","SNP_id")],
                                                  by="SNP_id", all.x = T)
      ### Extract MAF from imputation files
      merged.dat <- mergeQTL.Fairfax_extract_MAF(dat)
      dat <- merged.dat
      ## Overwrite merged subset
      data.table::fwrite(dat, output_path, sep = "\t", nThread = 4)
      R.utils::gzip(output_path, overwrite=TRUE)
    }
    dat$StdErr <- dat$beta / dat$`t-stat`
    dat.sub <- subset(dat, SNP_id %in% FM_all$SNP_id)
    # Take only the top QTL per SNP location
    ## Rename columns
    dat.sub <- (dplyr::mutate(dat.sub,
                             SNP_id,
                             QTL.Effect=beta,
                             QTL.StdErr=StdErr,
                             QTL.P=`p-value`,
                             QTL.FDR=FDR,
                             QTL.MAF=MAF,
                             QTL.A1=`REF(0)`,
                             QTL.A2=`ALT(1)`,
                             QTL.SampleSize=432,
                             QTL.Gene=QTL.Gene) %>%
      dplyr::select(SNP_id, QTL.Effect, QTL.StdErr, QTL.P, QTL.FDR, QTL.MAF, QTL.A1, QTL.A2, QTL.SampleSize, QTL.Gene) %>%
      dplyr::group_by(SNP_id) %>%
     dplyr::arrange(QTL.FDR,dplyr::desc(QTL.Effect)) )%>%
      dplyr::slice(1) %>%
      data.table::data.table()
    FM_all <- data.table:::merge.data.table(FM_all, dat.sub,
                                              by="SNP_id",
                                              all.x = T, allow.cartesian = T)

  }
  return(FM_all)
}


# MESA
## Need: MAF
mergeQTL.MESA <- function(FM_all, force_new_subset=FALSE, POPULATIONS=c("AFA","CAU","HIS")){
  FM_all <- mergeQTL.add_SNP_id(FM_all)
  sample_size.dict <- list(AFA=233, CAU=578, HIS=352)
  for(pop in POPULATIONS){
    messager("MESA:: Extracting",pop,"data.")
    dataset <- paste0("MESA_",pop)
    output_path <- file.path("./Data/QTL/MESA",pop,paste0(dataset,".finemap.txt.gz"))

    if(file.exists(output_path) & force_new_subset==FALSE){
      messager("MESA:: Pre-existing file detect. Importing...")
      # for some reason, saved files lose their header. Have to explicitly provide them instead...
      dat <- data.table::fread(output_path, nThread = 4,
                               col.names = c("snps","gene","statistic","pvalue","FDR","beta",
                                             "chr","gene_name","start","end","gene_type","pos_snps","ref","alt"))
    } else {
      server_path <- Directory_info(paste0("MESA_",pop), "fullSS.local")
      output_path_txt <- gsub(".gz","",output_path)
      system(paste0("grep -E '", paste(unique(FM_all$Gene), collapse="|"),"' " ,server_path," > ",output_path_txt))
      # Import data
      dat <- data.table::fread(output_path_txt,
                               col.names = colnames(data.table::fread(server_path, nrow=0)),
                               nThread = 4)#file_path
      dat <- subset(dat, snps %in% FM_all$SNP)
      ## Overwrite subset w/ fixed header (and smaller size)
      dir.create(dirname(output_path), recursive = T, showWarnings = F)
      data.table::fwrite(dat, output_path, sep = "\t", col.names = T)
      R.utils::gzip(output_path_txt, overwrite=T, remove=TRUE)
    }

    # Calculare StdErr
    dat$QTL.StdErr <- dat$beta / dat$statistic
    # Take only the top QTL per SNP location
    ## Rename columns
    dat.sub <- (dplyr::mutate(dat,
                             SNP=snps,
                             QTL.Effect=beta,
                             QTL.StdErr=QTL.StdErr,
                             QTL.P=pvalue,
                             QTL.FDR=FDR,
                             QTL.MAF=NA,
                             QTL.A1=ref,
                             QTL.A2=alt,
                             QTL.SampleSize=sample_size.dict[[pop]],
                             QTL.Gene=gene_name) %>%
      dplyr::select(SNP, QTL.Effect, QTL.StdErr, QTL.P, QTL.FDR, QTL.MAF, QTL.A1, QTL.A2, QTL.SampleSize, QTL.Gene) %>%
      dplyr::group_by(SNP) %>%
     dplyr::arrange(QTL.FDR,dplyr::desc(QTL.Effect)) ) %>%
      dplyr::slice(1) %>%
      data.table::data.table()
    FM_all <- data.table:::merge.data.table(FM_all, dat.sub,
                                            by="SNP",
                                            all.x = T)
  }
  return(FM_all)
}


# Cardiogenics
## Need: MAF, A1, A2
mergeQTL.Cardiogenics <- function(FM_all,
                                  force_new_subset=FALSE,
                                  cis_only=T,
                                  CELLTYPES=c("macrophages","monocytes")){
  FM_all <- mergeQTL.add_SNP_id(FM_all)
  for(celltype in CELLTYPES){
    messager("Cardiogenics:: Processing",celltype,"data...")
    dataset <- paste0("Cardiogenics_",celltype)
    output_path <- file.path("./Data/QTL/Cardiogenics",celltype,paste0(dataset,".finemap.txt"))
    output_path_gz <- paste0(output_path, ".gz")
    if(file.exists(output_path_gz) & force_new_subset==FALSE){
      print("Cardiogenics:: Pre-existing file detected. Importing...")
      dat.sub <- data.table::fread(output_path_gz, nThread = 4)
    } else{
      server_path <- Directory_info(paste0("Cardiogenics_",celltype), "fullSS.local")
      dir.create(dirname(output_path), recursive = T, showWarnings = F)
      DAT <- data.table::fread(server_path,  nThread = 4)
      dat.sub <- subset(DAT, SNPID %in% unique(FM_all$SNP))
      data.table::fwrite(dat.sub, output_path, sep="\t", nThread = 4)
      R.utils::gzip(output_path)
    }

    dat.sub$StdErr <- dat.sub$beta / dat.sub$t.test

    if(cis_only){dat.sub <- subset(dat.sub, relativePosition=="cis")}
    dat.sub <- (dplyr::mutate(dat.sub,
                             SNP=SNPID,
                             QTL.Effect=beta,
                             QTL.StdErr=StdErr,
                             QTL.P=NA,
                             QTL.FDR=FDR,
                             QTL.MAF=NA,
                             QTL.A1=NA,
                             QTL.A2=NA,
                             QTL.SampleSize=n.samples, # 758 individuals
                             QTL.Gene=reporterID) %>%
      dplyr::select(SNP, QTL.Effect, QTL.StdErr, QTL.P, QTL.FDR, QTL.MAF, QTL.A1, QTL.A2, QTL.SampleSize, QTL.Gene) %>%
      dplyr::group_by(SNP) %>%
     dplyr::arrange(QTL.FDR,dplyr::desc(QTL.Effect)) ) %>%
      dplyr::slice(1) %>%
      data.table::data.table()
    FM_all <- data.table:::merge.data.table(FM_all, dat.sub,
                                            by="SNP",
                                            all.x = T)
  }
  return(FM_all)
}


mergeQTL.GTEx_list_files <- function(output_dir,
                                     server_path,
                                     GTEx_version="GTEx_V7",
                                     fuzzy_search=FALSE,
                                     local_files=TRUE){
  messager("GTEx:: Constructing reference file of available single-tissue eQTL files...")
  if(local_files){
    SS_files <- list.files(output_dir, pattern="*.finemap.txt.gz")

  } else {
    SS_files <- list.files(server_path,
                           pattern=ifelse(GTEx_version=="GTEx_V7",".allpairs.txt.gz","*.egenes.*"),
                           full.names = T)
  }

  all_tissues <- lapply(SS_files, function(e){strsplit(basename(e),"[.]")[[1]][1] }) %>% unlist() %>% unique()
  all_tissues <- gsub(paste0(GTEx_version,"_"),"", all_tissues)
  dir.create(output_dir, showWarnings = F, recursive = T)
  tissues_df <- data.frame(tissue=all_tissues, sum_stats=SS_files)
  if(fuzzy_search!=FALSE){
    tissues_df = tissues_df[base::grep(fuzzy_search, tissues_df$tissue),]
  }
  return(tissues_df)
}

mergeQTL.HGNC_to_ENS <- function(geneList){
  print("mergeQTL:: Gathering Ensembl_gene_id mappings for HGNC symbols...")
  info <- biomart_geneInfo(geneList = geneList)
  geneDict <- setNames(info$ensembl_gene_id, info$hgnc_symbol)
  converted_genes <- geneDict[geneList]
  na_genes <-  info$hgnc_symbol[as.logical(is.na(converted_genes))]
  messager("Could not find mappings for",length(na_genes),"genes:",paste(na_genes, collapse=", "))
  return(converted_genes)
}

mergeQTL.GTEx_snp_dict <- function(snp_list){
  ref <- data.table::fread(file.path(server_path,"GTEx_Analysis_2016-01-15_v7_WholeGenomeSeq_635Ind_PASS_AB02_GQ20_HETX_MISS15_PLINKQC.lookup_table.txt.gz"), nThread = 4)
  snp_dict <- setNames(ref$rs_id_dbSNP147_GRCh37p13, ref$variant_id)
  return(snp_dict)
}


# GTEx
## All fields complete for coloc!
mergeQTL.GTEx <- function(FM_all, fuzzy_search="Brain", GTEx_version="GTEx_V7", force_new_subset=FALSE, local_files=TRUE){
  FM_all <- mergeQTL.add_SNP_id(FM_all)
  server_path <- Directory_info(GTEx_version, "fullSS.local")
  output_dir <- file.path("./Data/QTL",GTEx_version)
  tissues_df <- mergeQTL.GTEx_list_files(output_dir, server_path, GTEx_version, fuzzy_search, local_files = local_files)

  # Gather QTL data
  for(tiss in tissues_df$tissue){
    messager(GTEx_version,":: Processsing eQTL for",tiss)
    dataset <- paste0(GTEx_version,"_",tiss)
    output_path <- file.path(output_dir, paste0(dataset,".finemap.txt"))
    output_path_gz <- paste0(output_path,".gz")
    if(file.exists(output_path_gz) & force_new_subset==FALSE){
      dat <- data.table::fread(output_path_gz, nThread = 4)
    } else {
      geneDict <- mergeQTL.HGNC_to_ENS(geneList = unique(FM_all$Gene))
      ensembl_genes <- as.character(geneDict)[!is.na(geneDict)]
      server_file <- subset(tissues_df, tissue==tiss)$sum_stats %>% as.character()
      header <- echodata::get_header(server_file)
      fullSS_nrows <- get_nrows(server_file)#Differs for each SS file, so need to calculate for each.
      system(paste0("zcat ",server_file," | grep -E '",paste(c(header, ensembl_genes), collapse="|"),"' > ",output_path))

      dat <- data.table::fread(output_path, nThread = 4)
      # Rewrite with FDR included
      dat <-  dat %>% tidyr::separate(variant_id, c('CHR', 'POS', 'A1','A2','build'), sep="_", extra="drop") %>%
                      dplyr::mutate(FDR=p.adjust(pval_nominal, method = "fdr", n=fullSS_nrows),
                                    CHR=as.numeric(CHR),
                                    POS=as.numeric(POS))
      data.table::fwrite(dat, output_path)
      R.utils::gzip(output_path, overwrite=T, remove=TRUE)
    }
    # Subset
    dat.sub <- (dat %>%
      dplyr::select(CHR, POS,
                    QTL.Effect=slope,
                    QTL.StdErr=slope_se,
                    QTL.P=pval_nominal,
                    QTL.FDR=FDR,
                    QTL.MAF=maf,
                    QTL.A1=A1,
                    QTL.A2=A2,
                    QTL.SampleSize=ma_samples, # 1000?
                    QTL.Gene=gene_id) %>%
       dplyr::arrange(QTL.FDR,dplyr::desc(QTL.Effect))) %>%
      data.table::data.table()
    FM_all <- data.table:::merge.data.table(FM_all, dat.sub,
                                            by=c("CHR","POS"),
                                            all.x = T, allow.cartesian = T)
  }
  return(FM_all)
}


# Brain_xQTL_Serve
## Need: MAF, A1, A2, StdErr, Sample Size

mergeQTL.Brain_xQTL_Serve <- function(FM_all,
                                      force_new_subset=FALSE,
                                      ASSAYS=c("eQTL","haQTL","mQTL","cell-specificity-eQTL")){
  FM_all <- mergeQTL.add_SNP_id(FM_all)
  for(assay in ASSAYS){
    messager("Brain_xQTL_Serve:: Processing",assay,"...")
    dataset <- paste0("Brain_xQTL_Serve_",assay)
    output_path <- file.path("./Data/QTL/Brain_xQTL_Serve",assay,paste0("Brain_xQTL_Serve.",assay,".finemap.txt"))
    output_path_gz <- paste0(output_path,".gz")
    if(file.exists(output_path_gz) & force_new_subset==FALSE){
      messager("+ Brain_xQTL_Serve:: Pre-existing file detected. Importing...")
      QTL <- data.table::fread(output_path_gz, nThread = 4)
    } else {
      messager("+ Brain_xQTL_Serve:: Importing and subsettting full",assay,"dataset...")
      dir.create(dirname(output_path), showWarnings = F, recursive = T)
      server_file <- Directory_info(paste0("Brain.xQTL.Serve_",assay), "fullSS.local")
      header <- echodata::get_header(server_file)
      col1 <- strsplit(header, "\t")[[1]][1]

      if(assay %in% c("haQTL","mQTL")){
        messager("+ Brain_xQTL_Serve:: Importing large",assay,"file and subsetting via data.table...")
        QTL <- data.table::fread(server_file, nThread = 4)
        fullSS_nrows <- nrow(QTL)
        QTL <- subset(QTL, SNPid %in% unique(FM_all$SNP))
        QTL <- (QTL %>% dplyr::group_by(SNPid) %>%dplyr::arrange(pValue,dplyr::desc(SpearmanRho))) %>%
          dplyr::slice(1) %>% data.table()
        data.table::fwrite(QTL, output_path, nThread=1, sep="\t")
      } else {
        fullSS_nrows <- get_nrows(server_file)
        system(paste0("zcat ",server_file," | grep -E '",paste(c(col1, unique(FM_all$Gene)), collapse="|"),
                    "' > ",output_path))
      }

      # Calculate FDR
      QTL <- data.table::fread(output_path, nThread = 4)
      if(assay=="cell-specificity-eQTL"){
        QTL <- QTL %>% dplyr::rename(SNP=`SNP name`,
                                     FDR=`FDR (specific to each cell type)`,
                                     cell_type=`Cell type`)
      } else {
        QTL$FDR <- p.adjust(QTL$pValue, method="fdr", n=fullSS_nrows)
        QTL <- QTL %>% dplyr::rename(SNP=SNPid)
        }
      # Rewrite after adding FDR
      data.table::fwrite(QTL, output_path, nThread = 4, sep="\t")
      R.utils::gzip(output_path, overwrite=T, remove=TRUE)
    }

    # Subset
    dat.sub <- subset(QTL, SNP %in% unique(FM_all$SNP))
    if(assay=="cell-specificity-eQTL"){
      dat.sub$Effect <- NA
      dat.sub <- (dat.sub %>% dplyr::mutate(SNP,
                                            QTL.Effect=NA,
                                            QTL.StdErr=NA,
                                            QTL.P=`Interaction pvalue`,
                                            QTL.FDR=FDR,
                                            QTL.MAF=NA,
                                            QTL.A1=NA,
                                            QTL.A2=NA,
                                            QTL.SampleSize=NA,
                                            # QTL.CellType=cell_type,
                                            QTL.Gene=Gene) %>%
                    dplyr::select(SNP, QTL.Effect, QTL.StdErr, QTL.P, QTL.FDR, QTL.MAF, QTL.A1, QTL.A2, QTL.SampleSize, QTL.Gene) %>%
       dplyr::arrange(QTL.FDR,dplyr::desc(QTL.Effect))) %>%
        data.table::data.table()
    } else {
      dat.sub <- (dat.sub %>%
        dplyr::mutate(SNP,
                      QTL.Effect=SpearmanRho,
                      QTL.StdErr=NA,
                      QTL.P=pValue,
                      QTL.FDR=FDR,
                      QTL.MAF=NA,
                      QTL.A1=NA,
                      QTL.A2=NA,
                      QTL.SampleSize=NA,
                      QTL.Gene=featureName) %>%
          dplyr::select(SNP, QTL.Effect, QTL.StdErr, QTL.P, QTL.FDR, QTL.MAF, QTL.A1, QTL.A2, QTL.SampleSize, QTL.Gene) %>%
       dplyr::arrange(QTL.FDR,dplyr::desc(QTL.Effect))) %>%
        data.table::data.table()
    }
    FM_all <- data.table:::merge.data.table(FM_all, dat.sub,
                                            by="SNP",
                                            all.x = T, allow.cartesian = T)
  }
  return(FM_all)
}




############### Post-processing ################

mergeQTL.melt_FDR <- function(FM_merge){
  dim(FM_merge)
  Effect.cols <- base::grep(".Effect", colnames(FM_merge), value = T)
  FDR.cols <- base::grep(".FDR", colnames(FM_merge), value = T)
  id.vars <- c("Gene","SNP","CHR","POS","P","Consensus_SNP","Support","leadSNP")
  FM_sub <- subset(FM_merge, select=c(id.vars, FDR.cols))
  colnames(FM_sub)[colnames(FM_sub) %in% FDR.cols] <- gsub(".FDR","",FDR.cols)
  FM_melt <- data.table::melt.data.table(FM_sub,
                                         id.vars = id.vars,
                                         variable.name = "QTL.Source",
                                         value.name = "FDR")
  FM_melt[is.infinite(FM_melt$FDR),"FDR"] <- NA
  FM_melt[FM_melt$FDR %in% c(Inf,-Inf),"FDR"] <- NA
  FM_melt[FM_melt$FDR==0,"FDR"] <- .Machine$double.xmin
  return(FM_melt)
}

####### PLOTS #######


mergeQTL.count_overlap <- function(){
  library(dplyr)
  # FM_merge <- FM_all
  FM_merge <- data.table::fread(file.path("./Data/GWAS/Nalls23andMe_2019/_genome_wide",
                                          "Nalls23andMe_2019.QTL_overlaps.txt.gz"),
                                nThread = 4)
  FM_melt <- mergeQTL.melt_FDR(FM_merge)

  mergedQTL.get_count <- function(SNP.subset){
    count.df <- subset(SNP.subset, (FDR<0.05), drop=FALSE) %>%
      dplyr::group_by(QTL.Source, .drop=FALSE) %>%
      tally(sort = F) %>%
     dplyr::arrange(QTL.Source)
    count.df$all.SNPs <- nrow(SNP.subset)
    count.df$Proportion <- count.df$n / count.df$all.SNPs
    return(count.df)
  }

  consensus <- mergedQTL.get_count(subset(FM_melt, (Consensus_SNP==TRUE)))
  cred.set <- mergedQTL.get_count(subset(FM_melt, (Support>0)))
  GWAS.lead.SNP <- mergedQTL.get_count(subset(FM_melt, (leadSNP==TRUE)))
  GWAS.sig.SNP <- mergedQTL.get_count(subset(FM_melt, (P<=0.05)))


  merged.data <- data.table::data.table(QTL.Source = consensus$QTL.Source,
                                        Consensus = consensus$Proportion,
                                        Credible.Set = cred.set$Proportion,
                                        GWAS.lead.SNP = GWAS.lead.SNP$Proportion,
                                        GWAS.sig.SNP = GWAS.sig.SNP$Proportion) %>%
    data.table::melt.data.table(id.vars = "QTL.Source",
                                variable.name = "SNP.Group",
                                value.name = "Proportion.Overlap")
  # Plot all groups
  library(ggplot2)
  ggplot(merged.data, aes(x=QTL.Source, y=Proportion.Overlap*100, fill=SNP.Group)) +
    geom_col(position = position_dodge(preserve = "single")) +
    theme_classic() +
    theme(axis.text.x = element_text(angle = 55, hjust = 1, size = 9),
          plot.title = element_text(hjust = 0.5),
          plot.subtitle = element_text(hjust = 0.5)) +
    labs(title="Proportion of Overlapping QTL", y="% Overlap", x="QTL Source")
}


mergeQTL.stacked_manhattan_plot <- function(FM_melt, SNP.Group=""){
  GENE_df <- FM_melt
  op <- ggplot(GENE_df, aes(x=POS, y=-log10(FDR), color=-log10(FDR))) +
    geom_point() +
    facet_grid(QTL.Source~., drop = F) +
    theme(strip.text.y = element_text(angle=0),
          strip.background = element_rect(fill="slategray1"),
          strip.text = element_text(colour = 'black'),
          plot.title = element_text(hjust = 0.5),
          plot.subtitle = element_text(hjust = 0.5)) +
    labs(subtitle=paste0(SNP.Group,": Significant QTL Overlap")) +
    geom_hline(yintercept = -log10(0.05), linetype="dashed", alpha=.9, color="purple", size=.2) +
    ylim(c(0, -log10(min(GENE_df$FDR))*1.2  )) +
    geom_point(data = subset(GENE_df, FDR>=0.05), aes(x=POS, y=-log10(FDR)), color="gray")
  return(op)
}

mergeQTL.QTL_distributions_plot <- function(){
  FM_merge <- data.table::fread(file.path("./Data/GWAS/Nalls23andMe_2019/_genome_wide",
                                          "Nalls23andMe_2019.QTL_overlaps.txt.gz"),
                                nThread = 4)
  FM_melt <- mergeQTL.melt_FDR(FM_merge)


  plot.QTL_distributions.subset <- function(SNP.Group="Consensus_SNP", cardiogenics=FALSE){
    FM_melt <-  FM_melt %>%dplyr::arrange(QTL.Source, FDR)
    if(cardiogenics==FALSE){
      FM_melt <- subset(FM_melt, !(QTL.Source %in% c("Cardiogenics_macrophages","Cardiogenics_monocytes")))
    }

    if(SNP.Group=="Consensus_SNP"){
      GENE_df <- subset(FM_melt, (Consensus_SNP==TRUE) & (FDR<=0.05), drop=FALSE)
    }
    if(SNP.Group=="CS"){
      GENE_df <- subset(FM_melt, (Support>0) & (FDR<=0.05), drop=FALSE)
    }
    if(SNP.Group=="GWAS_lead_SNP"){
      GENE_df <- subset(FM_melt, (leadSNP==TRUE) & (FDR<=0.05), drop=F)
    }
    if(SNP.Group=="GWAS_sig_SNP"){
      GENE_df <- subset(FM_melt, (P<=0.05) & (FDR<=0.05), drop=F)
    }
    GENE_df$SNP <- factor(GENE_df$SNP, levels = unique(GENE_df$SNP), ordered = T)

    # op <- plot.QTL_overlap(GENE_df, SNP.Group = SNP.Group)
    # print(op)

    gp <- ggplot(GENE_df , aes(x=QTL.Source, y=-log10(FDR), fill=QTL.Source, group=SNP)) +
      geom_col(position = position_dodge(preserve = "single"), show.legend = F, aes(group=SNP), alpha=.7) +
      geom_hline(yintercept = -log10(0.05), linetype="dashed", alpha=.9, color="purple", size=.2) +
      theme_classic() +
      theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 9),
            plot.title = element_text(hjust = 0.5),
            plot.subtitle = element_text(hjust = 0.5)) +
      labs(title=NULL, subtitle=paste0(SNP.Group, " : Significant QTL Overlap"), x=NULL) +
      scale_fill_discrete(drop=F) +
      scale_x_discrete(drop=F)
    # ylim(c(0,-log10( min(FM_melt$FDR, na.rm = T)) ))
    # ylim(c(0,150))
    # print(gp)
    return(gp)
  }
  consensus <- plot.QTL_distributions.subset(SNP.Group = "Consensus_SNP")
  cred.set <- plot.QTL_distributions.subset(SNP.Group = "CS")
  lead.snp <- plot.QTL_distributions.subset(SNP.Group = "GWAS_lead_SNP")
  sig.snp <- plot.QTL_distributions.subset(SNP.Group = "GWAS_sig_SNP")

  patchwork::wrap_plots(consensus, cred.set, lead.snp, ncol = 1)
  # cowplot::plot_grid(, sig.snp, ncol = 1, align = "h")
}




##################################################

####----------- Gather QTL Overlap -----------####

##################################################

mergeQTL.flip_alleles <- function(FM_merge){
  messager("mergeQTL:: Checking ref/alt allele matchup between GWAS and QTL data...")
  flip_check <- toupper(FM_merge$A1) == toupper(FM_merge$QTL.A1)
  messager("mergeQTL:: Flipping alleles at", sum(!flip_check, na.rm = T),"SNPs")
  FM_merge[!flip_check,"QTL.Effect"] <- -FM_merge[!flip_check,"QTL.Effect"]
  return(FM_merge)
}


mergeQTL.merge_handler <- function(FM_all, qtl_file, force_new_subset=F){
    qtl_name <- gsub(".finemap.txt.gz","",basename(qtl_file))
    message("mergeQTL::",qtl_name)

    if(grepl("GTEx_V7",qtl_name)){
      tissue <- gsub("GTEx_V7_","",qtl_name)
      FM_merge <- mergeQTL.GTEx(FM_all,
                                GTEx_version="GTEx_V7",
                                fuzzy_search=tissue,
                                force_new_subset=force_new_subset)

    } else if(grepl("GTEx_V8",qtl_name)){
      tissue <- gsub("GTEx_V8_","",qtl_name)
      FM_merge <- mergeQTL.GTEx(FM_all,
                                GTEx_version="GTEx_V8",
                                fuzzy_search=tissue,
                                force_new_subset=force_new_subset)

    } else if(grepl("psychENCODE",qtl_name)){
      assay = strsplit(qtl_name,"_")[[1]][2]
      FM_merge <- mergeQTL.psychENCODE(FM_all,
                                       ASSAYS=assay,
                                       force_new_subset=force_new_subset)

    } else if(grepl("Brain_xQTL_Serve",qtl_name)){
      assay = strsplit(qtl_name, "[.]")[[1]][2]
      FM_merge <- mergeQTL.Brain_xQTL_Serve(FM_all,
                                            ASSAYS=assay,
                                            force_new_subset=force_new_subset)

    } else if(grepl("MESA",qtl_name)){
      pop <- strsplit(qtl_name,"_")[[1]][2]
      FM_merge <- mergeQTL.MESA(FM_all,
                                POPULATIONS=pop,
                                force_new_subset=force_new_subset)

    } else if(grepl("Fairfax_2014",qtl_name)){
      condition <- strsplit(qtl_name,"_")[[1]][3]
      FM_merge <- mergeQTL.Fairfax(FM_all,
                                   CONDITIONS=condition,
                                   force_new_subset=force_new_subset)

    } else if(grepl("Cardiogenics",qtl_name)){
      celltype <- strsplit(qtl_name,"_")[[1]][2]
      FM_merge <- mergeQTL.Cardiogenics(FM_all,
                                        CELLTYPES=celltype,
                                        force_new_subset=force_new_subset)
    }
    return(FM_merge)
}

mergeQTL <- function(dataset = "./Data/GWAS/Nalls23andMe_2019"){
  # Gather all Fine-mapping results
  FM_all <- merge_finemapping_results(minimum_support = 0,
                                      include_leadSNPs = T,
                                      dataset = "./Data/GWAS/Nalls23andMe_2019",
                                      xlsx_path = F)
  FM_orig <- FM_all
  # List all datasets
  messager("+ mergeQTL:: Listing available QTL datasets:")
  list_Data_dirs() %>% dplyr::filter(grepl("QTL",type)) %>% dplyr::select(Dataset, type)

  # psychENCODE eQTL, cQTL, isoQTL, tQTL, fQTL, HiC: DLPFC
  FM_merge <- mergeQTL.merge_handler(FM_all = FM_all, )
  # FM_merge <- mergeQTL.psychENCODE(FM_all=FM_all, local_files = T,  force_new_subset = T)
  # Fairfax eQTL: monocytes
  FM_merge <- mergeQTL.Fairfax(FM_all, force_new_subset = F)
  # MESA eQTL: monocytes in AFA, CAU, HIS
  FM_merge <- mergeQTL.MESA(FM_merge, force_new_subset = F)
  # Cardiogenics eQTL: macrophages, monocytes
  FM_merge <- mergeQTL.Cardiogenics(FM_merge, force_new_subset = F, cis_only = T)
  # Brain_xQTL_Serve
  FM_merge <- mergeQTL.Brain_xQTL_Serve(FM_merge, force_new_subset = F)
  ## Write file and compress
  QTL_merged_path <- file.path(dataset,"_genome_wide",
                               "Nalls23andMe_2019.QTL_overlaps.txt")
  data.table::fwrite(FM_merge, QTL_merged_path, sep="\t", nThread = 4)
  R.utils::gzip(QTL_merged_path, overwrite=T, remove=TRUE)


  #-------------------------------#
  # GTEx eQTL: 49 different tissues
  FM_GTEx <- mergeQTL.GTEx_merge_subset(FM_all, GTEx_version="GTEx_V7", fuzzy_search="nigra")

  # FM_GTEx <- mergeQTL.GTEx(FM_all, fuzzy_search="Brain", GTEx_version = "GTEx_V7", force_new_subset = F)
  QTL_merged_path.GTEx <- file.path("./Data/GWAS/Nalls23andMe_2019/_genome_wide",
                               "Nalls23andMe_2019.GTEx_QTL_overlaps.txt")
  data.table::fwrite(FM_GTEx, QTL_merged_path.GTEx, sep="\t", nThread = 4)
  R.utils::gzip(QTL_merged_path.GTEx, overwrite=T, remove=TRUE)

  return(FM_merge)
}
RajLabMSSM/echolocatoR documentation built on Jan. 29, 2023, 6:04 a.m.