catalogueR: QTL data extraction and colocalization

Documented in COLOC_run

#' Iteratively run coloc on merged GWAS-QTL datatables
#'
#' Runs colocalization tests (\link[coloc]{coloc.abf})
#' on merged GWAS-QTL \emph{data.tables}
#' generated by \link[catalogueR]{eQTLcatalogue_query}.
#' Iteratively runs coloc across each:
#' \itemize{
#' \item{QTL dataset}
#' \item{GWAS locus}
#' \item{QTL gene}
#' }
#' \emph{NOTE:} Assumes that each file is within a subfolder named after
#'  the QTL dataset it came from.
#'  
#' @param save_path Where to save results to.
#' @param top_snp_only Only include the SNP (with the highest SNP-wise PP.H4, 
#' which is usually the one with the smallest p-value) instead of all SNPs. 
#' Can be useful for reducing data size.
#' @param split_by_group Split files by QTL group when saving. 
#' @inheritParams COLOC_merge_res
#' @inheritParams eQTLcatalogue_query
#' @inheritParams echodata::get_sample_size
#' 
#' @return
#' If \code{top_snp_only=TRUE}, returns SNP-level stats for only the SNP
#' with the highest colocalization probability (\emph{SNP.PP.H4})
#' If \code{top_snp_only=FALSE}, returns SNP-level stats for every SNP.
#' In either case, summary-level coloc stats are added in the columns
#' \emph{PP.H0}, \emph{PP.H1}, \emph{PP.H2}, \emph{PP.H3}, \emph{PP.H4}.
#' @family coloc
#' @export
#' @importFrom echodata get_sample_size
#' @importFrom data.table data.table rbindlist fwrite
#' @importFrom dplyr top_n
#' @importFrom parallel mclapply
#' @examples
#' gwas.qtl_paths <- catalogueR::eQTLcatalogue_example_queries()
#' coloc_QTLs <- catalogueR::COLOC_run(gwas.qtl_paths = gwas.qtl_paths)
COLOC_run <- function(gwas.qtl_paths,
                      save_path = tempfile(pattern = "coloc_results",
                                           fileext = ".tsv.gz"), 
                      top_snp_only = TRUE,
                      split_by_group = FALSE,
                      method = "abf",
                      coloc_thresh = .8,
                      compute_n = NULL,
                      nThread = 1,
                      verbose = TRUE) {
  
    qtl.groups <- unique(unlist(lapply(unname(gwas.qtl_paths),
                                       parse_gwas.qtl_path)))
    #### Iterate over QTL groups ####
    coloc_QTLs <- timeit(lapply(qtl.groups, function(group) {
        messager("+ QTL Group = ", group, v=verbose)
        gwas.qtl_paths_select <- gwas.qtl_paths[
            basename(dirname(gwas.qtl_paths)) == group
        ]

        # ---- Iterate over QTL datasets
        coloc_qtls <- parallel::mclapply(
            gwas.qtl_paths_select,
            function(qtl.path) {
                qtl_ID <- parse_gwas.qtl_path(
                    gwas.qtl_path = qtl.path,
                    get_locus = FALSE,
                    get_qtl_id = TRUE
                )
                gwas.locus <- parse_gwas.qtl_path(
                    gwas.qtl_path = qtl.path,
                    get_locus = TRUE,
                    get_qtl_id = FALSE
                )
                coloc_eGenes <- data.table::data.table()
                try({
                    qtl.dat <- data.table::fread(qtl.path)
                    messager(
                        "++ GWAS =", gwas.locus, "x",
                        formatC(length(unique(qtl.dat$gene.QTL)),
                                big.mark = ","),
                        "eGenes",
                        v = verbose
                    )
                    #### Check column names ####

                    #### Check sample size ####
                    # infer N from N_cases and N_controls
                    qtl.dat <- echodata::get_sample_size(
                        dat = qtl.dat,
                        compute_n = compute_n,
                        verbose = verbose
                    )
                    qtl.dat <- check_sample_size(
                        qtl.dat = qtl.dat,
                        sample_size = compute_n
                    )
                    if (!"qtl_id" %in% colnames(qtl.dat)) {
                        qtl.dat <- cbind(qtl.dat, qtl_id = qtl_ID)
                    }
                    qtl.dataset <- subset(qtl.dat, qtl_id == qtl_ID &
                        !is.na(gene.QTL) & gene.QTL != "")
                    remove(qtl.dat)
                    #### Iterate over QTL eGenes ####
                    coloc_eGenes <- lapply(unique(qtl.dataset$gene.QTL),
                                           function(eGene) {
                            messager("+++ QTL eGene =", eGene, v = verbose)
                            #### Extract QTL cols ####
                            qtl.egene <- subset(qtl.dataset, gene.QTL == eGene)
                            # Extract the GWAS cols
                            if ("Effect" %in% colnames(qtl.egene)) {
                                gwas_cols <- c(
                                    "Locus", "Locus.GWAS", "SNP", "CHR", "POS",
                                    "P", "Effect", "StdErr", "Freq",
                                    "MAF", "N", "N_cases", "N_controls",
                                    "proportion_cases", "A1", "A2"
                                )
                                gwas_cols <- gwas_cols[
                                    gwas_cols %in% colnames(qtl.egene)
                                ]
                                gwas.region <- subset(qtl.egene, 
                                                      select = gwas_cols)
                            }
                            #### Run coloc ####
                            coloc_res <- COLOC_get_res(
                                qtl.egene = qtl.egene,
                                gwas.region = gwas.region,
                                merge_by_rsid = TRUE,
                                coloc_thresh = coloc_thresh,
                                method = method,
                                verbose = FALSE
                            )
                            coloc_summary <- as.list(coloc_res$summary)
                            coloc_results <- coloc_res$results
                            if (top_snp_only) {
                                coloc_results <- (
                                    dplyr::top_n(coloc_results,
                                        n = 1, wt = SNP.PP.H4
                                ))[1, ]
                            }
                            #### Rename columns coloc_results ####
                            colnames(coloc_results) <- gsub(
                                "\\.df1", ".QTL",
                                colnames(coloc_results)
                            )
                            colnames(coloc_results) <- gsub(
                                "\\.df2", ".GWAS",
                                colnames(coloc_results)
                            )
                            #### Create results data.table ####
                            coloc_DT <- data.table::data.table(
                                Locus.GWAS = coloc_res$Locus,
                                qtl_id = qtl_ID,
                                gene.QTL = eGene,
                                P = gwas.region$P,
                                CHR = gwas.region$CHR,
                                POS = gwas.region$POS,
                                pvalue.QTL = qtl.egene$pvalue.QTL,
                                coloc_results,
                                PP.H0 = coloc_summary$PP.H0.abf,
                                PP.H1 = coloc_summary$PP.H1.abf,
                                PP.H2 = coloc_summary$PP.H2.abf,
                                PP.H3 = coloc_summary$PP.H3.abf,
                                PP.H4 = coloc_summary$PP.H4.abf
                            )
                            return(coloc_DT)
                        }
                    ) |> data.table::rbindlist(fill = TRUE)
                    # END ITERATE OVER eGenes
                }) # end try()
                return(coloc_eGenes)
            },
            mc.cores = nThread
        ) |> data.table::rbindlist(fill = TRUE)
        # END ITERATE OVER gwas.qtl_paths_select

        if (split_by_group) {
            split_path <- file.path(dirname(save_path), group)
            messager("Saving split file ==>", split_path)
            dir.create(dirname(split_path),
                showWarnings = FALSE, recursive = TRUE
            )
            data.table::fwrite(
                x = coloc_qtls,
                file = split_path,
                nThread = 1
            )
            return(split_path)
        } else {
            return(coloc_qtls)
        }
    })) # end timeit()

    if (split_by_group) {
        messager("+ Returning split file paths.")
    } else {
        coloc_QTLs <- data.table::rbindlist(coloc_QTLs, fill = TRUE)
        # Save all coloc results in one data.table
        if (save_path != FALSE) {
            dir.create(dirname(save_path),
                showWarnings = FALSE, recursive = TRUE
            )
            data.table::fwrite(coloc_QTLs,
                file = save_path,
                sep = "\t",
                nThread = 1
            )
        }
    }
    return(coloc_QTLs)
}

#### Deprecation function #####
run_coloc <- function(...){
  .Deprecated("COLOC_run")
  COLOC_run(...)
}