R/ctwas.R

Defines functions ctwas

Documented in ctwas

#' Causal inference for TWAS
#' 
#' @param pgenfs A character vector of .pgen or .bed files. One file for one
#' chromosome, in the order of 1 to 22. Therefore, the length of this vector
#' needs to be 22. If .pgen files are given, then .pvar and .psam are assumed
#' to present in the same directory. If .bed files are given, then .bim and
#' .fam files are assumed to present in the same directory.
#'
#' @param exprfs A character vector of .`expr` or `.expr.gz` files. One file for
#' one chromosome, in the order of 1 to 22. Therefore, the length of this vector
#' needs to be 22.  `.expr.gz` file is gzip compressed `.expr` files. `.expr` is
#' a matrix of imputed expression values, row is for each sample, column is for
#' each gene. Its sample order is same as in files provided by `.pgenfs`. We also
#' assume corresponding `.exprvar` files are present in the same directory.
#' `.exprvar` files are just tab delimited text files, with columns:
#' \describe{
#'   \item{chrom}{chromosome number, numeric}
#'   \item{p0}{gene boundary position, the smaller value}
#'   \item{p1}{gene boundary position, the larger value}
#'   \item{id}{gene id}
#' }
#' Its rows should be in the same order as the columns for corresponding `.expr`
#' files.
#'
#' @param Y a vector of length n, phenotype, the same order as provided
#' by `.pgenfs` (defined in .psam or .fam files).
#'
#' @param ld_regions A string representing the population to use for defining
#' LD regions. These LD regions were previously defined by ldetect. The user can also
#' provide custom LD regions matching genotype data, see
#' \code{ld_regions_custom}.
#'
#' @param ld_regions_version A string representing the genome reference build ("b37", "b38") to use for defining
#' LD regions. See \code{ld_regions}.
#'  
#' @param ld_regions_custom A bed format file defining LD regions. The default
#' is \code{NULL}; when specified, \code{ld_regions} and \code{ld_regions_version} will be ignored.
#'  
#' @param thin The proportion of SNPs to be used for the parameter estimation and initial fine 
#' mapping steps. Smaller \code{thin} parameters reduce runtime at the expense of accuracy. The fine mapping step is rerun using full SNPs 
#' for regions with strong gene signals; see \code{rerun_gene_PIP}.
#'
#' @param prob_single Blocks with probability greater than \code{prob_single} of having 1 or fewer effects will be
#' used for parameter estimation
#'
#' @param max_snp_region Inf or integer. Maximum number of SNPs in a region. Default is
#' Inf, no limit. This can be useful if there are many SNPs in a region and you don't
#' have enough memory to run the program. This applies to the last rerun step
#' (using full SNPs and rerun susie for regions with strong gene signals) only.
#'
#' @param rerun_gene_PIP if thin <1, will rerun blocks with the max gene PIP
#' > \code{rerun_gene_PIP} using full SNPs. if \code{rerun_gene_PIP} is 0, then
#' all blocks will rerun with full SNPs
#' 
#' @param niter1 the number of iterations of the E-M algorithm to perform during the initial parameter estimation step
#' 
#' @param niter2 the number of iterations of the E-M algorithm to perform during the complete parameter estimation step
#' 
#' @param L the number of effects for susie during the fine mapping steps
#' 
#' @param group_prior a vector of two prior inclusion probabilities for SNPs and genes. This is ignored 
#' if \code{estimate_group_prior = T}
#' 
#' @param group_prior_var a vector of two prior variances for SNPs and gene effects. This is ignored 
#' if \code{estimate_group_prior_var = T}
#' 
#' @param estimate_group_prior TRUE/FALSE. If TRUE, the prior inclusion probabilities for SNPs and genes are estimated
#' using the data. If FALSE, \code{group_prior} must be specified
#' 
#' @param estimate_group_prior_var TRUE/FALSE. If TRUE, the prior variances for SNPs and genes are estimated
#' using the data. If FALSE, \code{group_prior_var} must be specified
#' 
#' @param use_null_weight TRUE/FALSE. If TRUE, allow for a probability of no effect in susie
#' 
#' @param coverage A number between 0 and 1 specifying the \dQuote{coverage} of the estimated confidence sets
#' 
#' @param standardize TRUE/FALSE. If TRUE, all variables are standardized to unit variance
#' 
#' @param ncore The number of cores used to parallelize susie over regions
#' 
#' @param outputdir a string, the directory to store output
#' 
#' @param outname a string, the output name
#' 
#' @param logfile the log file, if NULL will print log info on screen
#'
#' @importFrom logging addHandler loginfo
#' @importFrom tools file_ext
#'
#' @export
ctwas <- function(pgenfs,
                  exprfs,
                  Y,
                  ld_regions = c("EUR", "ASN", "AFR"),
                  ld_regions_version = c("b37", "b38"),
                  ld_regions_custom = NULL,
                  thin = 1,
                  prob_single = 0.8,
                  max_snp_region = Inf,
                  rerun_gene_PIP = 0.8,
                  niter1 = 3,
                  niter2 = 30,
                  L= 5,
                  group_prior = NULL,
                  group_prior_var = NULL,
                  estimate_group_prior = T,
                  estimate_group_prior_var = T,
                  use_null_weight = T,
                  coverage = 0.95,
                  standardize = T,
                  ncore = 1,
                  outputdir = getwd(),
                  outname = NULL,
                  logfile = NULL){

  if (!is.null(logfile)){
      addHandler(writeToFile, file= logfile, level='DEBUG')
  }

  loginfo('ctwas started ... ')

  if (length(pgenfs) != 22){
    warning("Not all pgen files for 22 chromosomes are provided.")
  }

  if (length(exprfs) != 22){
    warning("Not all imputed expression files for 22 chromosomes are provided.")
  }

  if (length(exprfs) != length(pgenfs)){
    stop("Genotype pgen file and imputation files are not matching...")
  }

  if (is.null(ld_regions_custom)){
    ld_regions <- match.arg(ld_regions)
    ld_regions_version <- match.arg(ld_regions_version)
    regionfile <- system.file("extdata", "ldetect",
                              paste0(ld_regions, "." , ld_regions_version, ".bed"), package="ctwas")
  } else {
    regionfile <- ld_regions_custom
  }

  loginfo("LD region file: %s", regionfile)

  pvarfs <- sapply(pgenfs, prep_pvar, outputdir = outputdir)
  exprvarfs <- sapply(exprfs, prep_exprvar)

  if (thin <=0 | thin > 1){
    stop("thin value needs to be in (0,1]")
  }

  regionlist <- index_regions(regionfile = regionfile, exprvarfs = exprvarfs,
                              pvarfs = pvarfs,
                              thin = thin)

  temp_regs <- lapply(1:22, function(x) cbind(x,
                   unlist(lapply(regionlist[[x]], "[[", "start")),
                     unlist(lapply(regionlist[[x]], "[[", "stop"))))
                 
  regs <- do.call(rbind, lapply(temp_regs, function(x) if (ncol(x) == 3){x}))

  write.table(regs , file= paste0(outputdir,"/", outname, ".regions.txt")
              , row.names=F, col.names=T, sep="\t", quote = F)

  if (isTRUE(estimate_group_prior) | isTRUE(estimate_group_prior_var)){

    loginfo("Run susie iteratively, getting rough estimate ...")

    if (!is.null(group_prior)){
      group_prior[2] <- group_prior[2]/thin
    }

    pars <- susieI(pgenfs = pgenfs, exprfs = exprfs, Y = Y,
                   regionlist = regionlist,
                   niter = niter1,
                   L = 1,
                   group_prior = group_prior,
                   group_prior_var = group_prior_var,
                   estimate_group_prior = estimate_group_prior,
                   estimate_group_prior_var = estimate_group_prior_var,
                   use_null_weight = use_null_weight,
                   coverage = coverage,
                   standardize = standardize,
                   ncore = ncore,
                   outputdir = outputdir,
                   outname = paste0(outname, ".s1"))


    group_prior <- pars[["group_prior"]]
    group_prior_var <- pars[["group_prior_var"]]

    # filter blocks
    regionlist2 <- filter_regions(regionlist,
                                  group_prior,
                                  prob_single= prob_single)

    loginfo("Blocks are filtered: %s blocks left",
             sum(unlist(lapply(regionlist2, length))))

    loginfo("Run susie iteratively, getting accurate estimate ...")

    pars <- susieI(pgenfs = pgenfs, exprfs = exprfs, Y = Y,
                   regionlist = regionlist2,
                   niter = niter2,
                   L = 1,
                   group_prior = group_prior,
                   group_prior_var = group_prior_var,
                   estimate_group_prior = estimate_group_prior,
                   estimate_group_prior_var = estimate_group_prior_var,
                   use_null_weight = use_null_weight,
                   coverage = coverage,
                   standardize = standardize,
                   ncore = ncore,
                   outputdir = outputdir,
                   outname = paste0(outname, ".s2"))

    group_prior <- pars[["group_prior"]]
    group_prior_var <- pars[["group_prior_var"]]
  }

  loginfo("Run susie for all regions.")

  pars <- susieI(pgenfs = pgenfs, exprfs = exprfs, Y = Y,
                 regionlist = regionlist,
                 niter = 1,
                 L = L,
                 group_prior = group_prior,
                 group_prior_var = group_prior_var,
                 estimate_group_prior = estimate_group_prior,
                 estimate_group_prior_var = estimate_group_prior_var,
                 use_null_weight = use_null_weight,
                 coverage = coverage,
                 standardize = standardize,
                 ncore = ncore,
                 outputdir = outputdir,
                 outname = paste0(outname, ".temp"),
                 report_parameters = F)

  group_prior[2] <- group_prior[2] * thin # convert snp pi1

  if (thin == 1){
    file.rename(paste0(file.path(outputdir, outname), ".temp.susieI.txt"),
                paste0(file.path(outputdir, outname), ".susieI.txt"))
  } else {
    # get full SNPs
    # TODO: should use z score to trim down to maxSNP in the future.

    regionlist <- index_regions(regionfile = regionfile, exprvarfs = exprvarfs,
                                pvarfs = pvarfs,
                                thin = 1, maxSNP = max_snp_region)

    res <- data.table::fread(paste0(file.path(outputdir, outname), ".temp.susieI.txt"))

    # filter out regions based on max gene PIP of the region
    res.keep <- NULL
    for (b in 1: length(regionlist)){
      for (rn in names(regionlist[[b]])){
        gene_PIP <- max(res[res$type == "gene" & res$region_tag1 == b & res$region_tag2 == rn, ]$susie_pip, 0)
        if (gene_PIP < rerun_gene_PIP) {
          regionlist[[b]][[rn]] <- NULL
          res.keep <- rbind(res.keep, res[res$region_tag1 ==b & res$region_tag2 == rn, ])
        }
      }
    }

    nreg <- sum(unlist(lapply(regionlist, length)))

    loginfo("Number of regions that contains strong gene signals: %s", nreg)

    if (nreg == 0){

      file.rename(paste0(file.path(outputdir, outname), ".temp.susieI.txt"),
                  paste0(file.path(outputdir, outname), ".susieI.txt"))

    } else{

      loginfo("Rerun susie for regions with strong gene signals using full SNPs.")

      pars <- susieI(pgenfs = pgenfs, exprfs = exprfs, Y = Y,
                     regionlist = regionlist,
                     niter = 1,
                     L = L,
                     group_prior = group_prior,
                     group_prior_var = group_prior_var,
                     estimate_group_prior = estimate_group_prior,
                     estimate_group_prior_var = estimate_group_prior_var,
                     use_null_weight = use_null_weight,
                     coverage = coverage,
                     standardize = standardize,
                     ncore = ncore,
                     outputdir = outputdir,
                     outname = paste0(outname, ".s3"),
                     report_parameters = F)

      res.rerun <- data.table::fread(paste0(file.path(outputdir, outname), ".s3.susieI.txt"))

      res <- rbind(res.keep, res.rerun)

      data.table::fwrite(res, file = paste0(file.path(outputdir, outname), ".susieI.txt"),
                         sep = "\t", quote = F)
      file.remove((paste0(file.path(outputdir, outname), ".temp.susieI.txt")))
    }
  }

  list("group_prior" = group_prior,
       "group_prior_var" = group_prior_var)
}
simingz/ctwas documentation built on Sept. 17, 2024, 10:55 p.m.