R/eqtl_lib.R

Defines functions eqtl eqtl.run trans.qtl convert.vcf add.chr.len.anno manhattan.qtl eqtl.min.p myWaitForJobs find.eGenes find.eSNPs get.egenes.from.esnps qquniform get.peer.factors normalize.quantile

Documented in add.chr.len.anno convert.vcf eqtl eqtl.min.p eqtl.run manhattan.qtl myWaitForJobs trans.qtl

#' A package to facilitate the full pipeline of cis eQTL analysis including
#' permutations on a HPC cluster supported by BatchJobs
#'
#' \tabular{ll}{
#' Package: \tab eQTLpipeline\cr
#' Type: \tab Package\cr
#' Version: \tab 0.1\cr
#' Date: \tab 2015-11-03\cr
#' License: \tab LGPL \cr
#' LazyLoad: \tab yes\cr
#' }
#'
#' @name eQTLpipeline
#' @aliases eQTLpipeline
#' @docType package
#' @title full pipeline of eQTL analysis including permutations on HPC systems
#' @docType package
#' @references 
#' @keywords package
#' @import MatrixEQTL
#' @import data.table
#' @import BatchJobs
#' @examples
#' #insertexample()
NULL



#' Basic interface to MatrixEQTL passing all data in files
#'
#' @family eqtl functions
#' @title basic eQTL interface
#' @param expression_file_name filename of a tab separated file with expression
#'        values in a matrix ngene x nsample
#' @param genotype_file_name filename of a tab separated file with genotype
#'        dosage values (usually between 0 and 2) in a matrix nsnp x nsample
#' @param covariates_file_name filename of a tab separated file with covariate
#'        values in a matrix ncovar x nsample
#' @param gene.position data.frame with gene positions in the first 4 columns
#'        named "chrom", "start", "end", "gene_id"
#' @param snp.pos data.frame with SNP positions in the first three columns
#'        named "snp_id", "chrom", "snp_pos"
#' @param prefix name prefix for the outputfilenames
#' @param redo logical, if TRUE the results will always be recomputed, if FALSE
#'        the results will be loaded if they exist (default FALSE)
#' @param threshold significance threshold for eQTLs (default 1e-5)
#' @param compute.all MatrixEQTL reports only significant results, if results
#'        for all tests are required set this to TRUE (default FALSE)
#' @param what flag to determine what should be returned (default "table"). Can
#'        be set to "object" to get the MatrixEQTL result object.
#' @return if what is "table" the function returns the eQTL results as a table
#'         if what is "object" the function returns the MatrixEQTL object.
#' @references
#' @export
eqtl <- function(expression_file_name, genotype_file_name, covariates_file_name, gene.position, snp.pos, prefix, redo=FALSE, threshold=1e-5, compute.all=FALSE, what="table") {
  ## require(MatrixEQTL)
  ## require(data.table)
  
  # we use this check sum to store intermediate results
  if (compute.all) {
    eqtl.file = paste(prefix, "_eQTL_results_R.txt", sep="")
  } else {
    eqtl.file = paste(prefix, "_eQTL_results_R_significant.txt", sep="")
  }
  if (file.exists(eqtl.file) && !redo) {
    eqtl = fread(eqtl.file, sep="\t", stringsAsFactors=F)
    return(eqtl)
  }
  
  dir.create(dirname(prefix), recursive=T)
 
    
  # compute eqtl
  useModel = modelLINEAR ; # modelANOVA, modelLINEAR, or modelLINEAR_CROSS

  # Only associations significant at this level will be saved
  if (compute.all) {
    pvOutputThreshold_cis = 1
  } else {
    pvOutputThreshold_cis = threshold
  }
  
  pvOutputThreshold_tra = 0 # only look at cis eqtl
  
  errorCovariance = numeric()
  # errorCovariance = read.table("Sample_Data/errorCovariance.txt")
  cisDist = 1e6

  # load the expression data for matrix eqtl
  gene = SlicedData$new();
  gene$fileDelimiter = "\t"; # the TAB character
  gene$fileOmitCharacters = "NA"; # denote missing values;
  gene$fileSkipRows = 1; # one row of column labels
  gene$fileSkipColumns = 1; # one column of row labels
  gene$fileSliceSize = 2000; # read file in pieces of 2,000 rows
  gene$LoadFile(expression_file_name );

 

  # load the covariate data for matrix eqtl
  cvrt = SlicedData$new();
  cvrt$fileDelimiter = "\t"; # the TAB character
  cvrt$fileOmitCharacters = "NA"; # denote missing values;
  cvrt$fileSkipRows = 1; # one row of column labels
  cvrt$fileSkipColumns = 1; # one column of row labels
  cvrt$fileSliceSize = 2000; # read file in one piece
  if(length(covariates_file_name)>0) {
    cvrt$LoadFile(covariates_file_name);
  }  
  

  ## Load genotype data
      
  snps = SlicedData$new();
  snps$fileDelimiter = "\t"; # the TAB character
  snps$fileOmitCharacters = "NA"; # denote missing values;
  snps$fileSkipRows = 1   ; # one row of column labels
  snps$fileSkipColumns = 1; # one col of snp ids
  snps$fileSliceSize = 2000; # read file in pieces of 2,000 rows
  snps$LoadFile(genotype_file_name);
      
  ## Run the analysis
  me = Matrix_eQTL_main(
    snps = snps,
    gene = gene,
    cvrt = cvrt,
    output_file_name = "",
    pvOutputThreshold = pvOutputThreshold_tra,
    useModel = useModel,
    errorCovariance = errorCovariance,
    verbose = TRUE,
    output_file_name.cis = eqtl.file,
    pvOutputThreshold.cis = pvOutputThreshold_cis,
    snpspos = snp.pos[,1:3],
    genepos = gene.position[,1:4],
    cisDist = cisDist,
    pvalue.hist = "qqplot",
    min.pv.by.genesnp=(what != "table"));

  pdf(file=paste(prefix, "_qqplot.pdf", sep=""))
  plot(me)
  dev.off()

  if (what == "table") {
    eqtl = fread(eqtl.file, sep="\t", stringsAsFactors=F)

    ## add information about the snp position to the eqtl data
    eqtl = cbind(eqtl, snp.pos[match(eqtl$SNP, snp.pos[,"snp_id"]),-1], gene.position[match(eqtl$gene, gene.position[,1]),-1])
    eqtl = eqtl[order(eqtl$chrom, eqtl$snp_pos, eqtl$gene),]
    write.table(eqtl, eqtl.file, sep="\t", row.names=F, quote=F)
    
    ## also create a filtered version
    if (compute.all) {
      significant = eqtl[eqtl[["p-value"]] < threshold,]
      sig.file = gsub("_eQTL_results_R.txt$", "_eQTL_results_R_significant.txt", eqtl.file)
      write.table(significant, sig.file, sep="\t", row.names=F, quote=F)
    }
    return(invisible(eqtl))
  } else {
    return(me)
  }
}

## they use min(p) per gene as the test statistic
## then they permute phenotypes to derive the null distribution
## they run 1000 permutations for all genes
## the exit criterion is > 15 permuted min(p) > actual min(p)


#' Basic interface to MatrixEQTL passing expression and covariate data in
#' matrices, and genotypes as a file uses the \code{\link{eqtl}} function
#'
#' @family eqtl functions
#' @title basic eQTL interface
#' @param expr expression values in a matrix ngene x nsample
#' @param covar covariate values in a matrix ncovar x nsample
#' @param genotype_file_name filename of a tab separated file with genotype
#'        dosage values (usually between 0 and 2) in a matrix nsnp x nsample
#' @param gene.position data.frame with gene positions in the first 4 columns
#'        named "chrom", "start", "end", "gene_id"
#' @param snp.pos data.frame with SNP positions in the first three columns
#'        named "snp_id", "chrom", "snp_pos"
#' @param id character indicating an id used for tempfile creation
#' @return the same output as the \code{\link{eqtl}} function
#' @references
#' @export
eqtl.run <- function(expr, covar, genotype_file_name, gene.position, snp.pos, id, ...) {

  ## prepare the input files
  outdir = tempfile(pattern=id)
  cat("creating temp dir for eqtl run", outdir, "\n")
  dir.create(outdir)

  prefix = file.path(outdir, id)
  expr.file = paste(prefix, "_expression.txt", sep="")
  write.table(expr, file=expr.file, sep="\t", quote=F)
  
  if (!is.null(covar)) {
    covar.file = paste(prefix, "_covariates.txt", sep="")
    write.table(covar, file=covar.file, sep="\t", quote=F)
  } else {
    covar.file = NULL
  }

  ## run the eqtl analysis
  eqtls = eqtl(expr.file, genotype_file_name, covar.file, gene.position, snp.pos, prefix, ...)

  return(eqtls)
}

#' Interface to the MatrixEQTL Matrix_eQTL_engine()
#' This is a simplified version for more general (trans)QTL analyses,
#' e.g. GWAS or metaboliteQTL and does not need gene annotations
#'
#' @family eqtl functions
#' @title transQTL interface
#' @param prefix path and prefix for the outputfiles (default NULL)
#' @param genotype_file_name filename of a tab separated file with genotype
#'        dosage values (usually between 0 and 2) in a matrix nsnp x nsample
#' @param vcf is the genotype file a vcf file that needs to be converted?
#'        (default FALSE)
#' @param snp.pos data.frame with SNP positions to be added to matrixEQTL object
#'        in the end. Columns named "snps", "chr", "pos", containing
#'        information in that order (default NULL)
#'        if vcf=TRUE and snp.pos=TRUE | snp.pos=NULL, information is extracted
#'        from vcf file unless save.memory=TRUE.
#'        if snp.pos=FALSE, no annotation is added
#' @param expression_file_name filename of a tab separated file with expression
#'        values in a matrix ngene x nsample.
#'        if expression_file_name is a numeric vector (lenght nsample),
#'        matrix (ngene x nsample) or data.frame (ngene x nsample), the
#'        expression file is created with the name prefix_gene.txt.
#' @param covariates_file_name filename of a tab separated file with covariate
#'        values in a matrix ncovar x nsample
#'        to not use any covariates, set covariates_file_name=character()
#'        if covariates_file_name is a numeric vector (lenght nsample),
#'        matrix (ngene x nsample) or data.frame (ngene x nsample), the
#'        covariate file is created with the name prefix_covariates.txt.
#' @param threshold significance threshold for eQTLs (default 1e-5), set to 1 to
#'        get all QTLs. threshold set to 1 for compute.all=TRUE
#' @param compute.all MatrixEQTL reports only significant results, if results
#'        for all tests are required set this to TRUE (default FALSE)
#' @param save.memory prevent FDR correction (default FALSE). only functional in
#'        the github-version of MatrixEQTL, see also
#'        https://github.com/andreyshabalin/MatrixEQTL/issues/8
#' @param what flag to determine if a table or the MatrixEQTL object should be
#'        returned (default NULL). If unspecified, a table is returned except
#'        either save.memory=TRUE or min.pv.by.genesnp=TRUE, then the MatrixEQTL
#'        result object is returned.
#' @param min.pv.by.genesnp option to report the minimal pvalue per SNP and per
#'        gene in the MatrixEQTL object (default FALSE). If you want to use this
#'        option, make sure to set what="object" to get the results.
#' @param verbose print comments of progress (default TRUE)
#' @param redo logical, if TRUE the results will always be recomputed, if FALSE
#'        the results will be loaded if they exist (default TRUE)
#' @param load.qtls (default FALSE) only relevant for save.memory=TRUE which is
#'        mostly used for big datasets. In that case, QTL results are not
#'        reported in the MatrixEQTL object but can be read in with
#'        load.qtls=TRUE.
#' @param keep.outfile unless compute.all=T or save.memory=T, results are written
#'        to a temp file that is deleted. (default FALSE)
#' 
#' Parameters /options I would like to add:
#' @param id.check check if ids and order of ids match in genotype, expression
#'        and covariate data (default FALSE)
#' @param threshold.cis
#' @param threshold.trans
#'
#' Parameters/options in the old wrapper that do not work currently:
#' @param gene.position data.frame with gene positions in the first 4 columns
#'        named "gene_id", "chrom", "start", "end"
#'        
#'
#' @return if what="table", the data.frame contained in the MatrixEQTL object
#'         me$all$eqtls is returned. If what="object", the whole MatrixEQTL
#'         object is returned.
#' @author Ines Assum (2019-03-21)
#' @references
#' @export
trans.qtl <- function(prefix,
                      genotype_file_name,
                      vcf=FALSE,
                      snp.pos=NULL,
                      expression_file_name,
                      covariates_file_name=character(),
                      threshold=1e-5,
                      compute.all=FALSE,
                      what=NULL,
                      min.pv.by.genesnp=FALSE,
                      save.memory=FALSE,
                      verbose=TRUE,
                      redo=TRUE,
                      load.qtls=FALSE,
                      keep.outfile=FALSE,
                      id.check=FALSE,
                      gene.position=NULL) {

  # TODO:
  # implement id.check:
  # check if ids and order of ids match in genotype, expression and covariate data

  # libraries:
  require(MatrixEQTL)
  require(data.table)
  if(vcf){
    require(dplyr)
    require(VariantAnnotation)
    require(snpStats)
  }
  
  if(is.null(prefix)){
    if(!dir.exists(paste0(getwd(), "/matrixEQTL"))){
      dir.create(paste0(getwd(), "/matrixEQTL"), showWarnings = F, recursive = T)
    }
    prefix <- paste0(getwd(), "/matrixEQTL/test")
  }
  
  if (is.null(what)){
    what <- "table"
    if (save.memory | min.pv.by.genesnp){
      what <- "object"
    }
  }
  
  # Output file name
  if (save.memory | compute.all | keep.outfile){
    output_file_name = paste0(prefix, "_QTL_results.txt");
  } else {
    output_file_name = tempfile();
  }
  
  if (file.exists(output_file_name) && !redo) {
    cat("QTL_results already exist in file\n", output_file_name, "\n",
        "and will not be recomputed because redo=F\n")
    eqtl = fread(output_file_name, sep="\t", stringsAsFactors=F)
    colnames(me$all$eqtls) <- c("snps", "gene", "beta", "statistic", "pvalue")
    return(eqtl)
  }
  
  
  ## Location of the package with the data files.
  base.dir = prefix;
  
  # data prep

  if(vcf){
    vcf.file <- readVcf(genotype_file_name)
    geno.file <- paste0(prefix, "_SNP.txt")
    gt.mat <- t(as(genotypeToSnpMatrix(vcf.file)$genotype, "numeric")) %>% as.data.frame
    write.table(cbind(snpid=rownames(gt.mat), gt.mat),
                row.names = F, col.names = T, quote = F, sep = "\t",
                file = geno.file)
    if(verbose){
      cat("Created genotype file from .vcf file at:\n", geno.file, "\n")
    }
    genotype_file_name <- geno.file
    if(is.null(snp.pos) & vcf==T & isFALSE(save.memory)){
      snp.pos <- T
    }
    if(isTRUE(snp.pos)){
      snp.pos <- rowRanges(vcf.file)
      snp.pos <- data.frame(snps=names(snp.pos),
                            chr=as.numeric(as.character(seqnames(snp.pos))),
                            pos=start(snp.pos),
                            stringsAsFactors = F)
      if(verbose){
        cat("SNP annotations extracted from the .vcf file.\n")
      }
    }
  }
  if(is.data.frame(snp.pos)){
    if(verbose){
      cat("SNP position annotation will be added to the results in the end.\n")
    }
  }
  
  if (is.character(expression_file_name) && length(expression_file_name)>0) {
    if(verbose){
      cat("Expression file will be read from:\n", expression_file_name, "\n")
    }
  } else if (is.data.frame(expression_file_name)){
    expr.file <- paste0(prefix, "_gene.txt")
    write.table(cbind(geneid=rownames(expression_file_name), expression_file_name),
                row.names = F, col.names = T, quote = F, sep = "\t",
                file = expr.file)
    if(verbose){
      dim <- dim(expression_file_name)
      cat("Created expression file from data.frame for ",
          dim[1], " genes by ",
          dim[2], " individuals at:\n", expr.file, "\n")
    }
    expression_file_name <- expr.file
  } else if (is.matrix(expression_file_name)){
    expr.file <- paste0(prefix, "_gene.txt")
    write.table(cbind(geneid=rownames(expression_file_name), expression_file_name),
                row.names = F, col.names = T, quote = F, sep = "\t",
                file = expr.file)
    if(verbose){
      dim <- dim(expression_file_name)
      cat("Created expression file from matrix for ",
          dim[1], " genes by ",
          dim[2], " individuals at:\n", expr.file, "\n")
    }
    expression_file_name <- expr.file
  } else if (is.vector(expression_file_name) && is.numeric(expression_file_name)) {
    expr.file <- paste0(prefix, "_gene.txt")
    write.table(rbind(c("geneid", paste0("sample", 1:length(expression_file_name))),
                      c("gene1", expression_file_name)),
                row.names = F, col.names = F, quote = F, sep = "\t",
                file = expr.file)
    if(verbose){
      dim <- length(expression_file_name)
      cat("Created expression file for a single gene for", dim,
          "individuals at:\n", expr.file, "\n")
    }
    expression_file_name <- expr.file
  } else {
    error("Something went wrong with importing your expression data.")
  }
  
  if (is.character(covariates_file_name) && length(covariates_file_name)>0) {
    if(verbose){
      cat("Covariate file will be read from:\n", covariates_file_name, "\n")
    }
  } else if (is.data.frame(covariates_file_name)){
    cov.file <- paste0(prefix, "_covariate.txt")
    write.table(cbind(covid=rownames(covariates_file_name), covariates_file_name),
                row.names = F, col.names = T, quote = F, sep = "\t",
                file = cov.file)
    if(verbose){
      dim <- dim(covariates_file_name)
      cat("Created covariate file from data.frame for ",
          dim[1], " genes by ",
          dim[2], " individuals at:\n", cov.file, "\n")
    }
    covariates_file_name <- cov.file
  } else if (is.matrix(covariates_file_name)){
    cov.file <- paste0(prefix, "_covariate.txt")
    write.table(cbind(covid=rownames(covariates_file_name), covariates_file_name),
                row.names = F, col.names = T, quote = F, sep = "\t",
                file = cov.file)
    if(verbose){
      dim <- dim(covariates_file_name)
      cat("Created covariate file from matrix for ",
          dim[1], " genes by ",
          dim[2], " individuals at:\n", cov.file, "\n")
    }
    covariates_file_name <- cov.file
  } else if (is.vector(covariates_file_name) && is.numeric(covariates_file_name)) {
    cov.file <- paste0(prefix, "_covariate.txt")
    write.table(rbind(c("covid", paste0("sample", 1:length(covariates_file_name))),
                      c("cov1", covariates_file_name)),
                row.names = F, col.names = F, quote = F, sep = "\t",
                file = cov.file)
    if(verbose){
      dim <- length(covariates_file_name)
      cat("Created covariate file for a single gene for", dim,
          "individuals at:\n", cov.file, "\n")
    }
    covariates_file_name <- cov.file
  } else if (length(covariates_file_name)==0) {
    if(verbose){
      cat("No covariates used in this analysis\n")
    }
  } else {
    error("Something went wrong with importing your covariate data.")
  }

  ## Settings
  
  # Linear model to use, modelANOVA, modelLINEAR, or modelLINEAR_CROSS
  useModel = modelLINEAR; # modelANOVA, modelLINEAR, or modelLINEAR_CROSS
  
  
  # Only associations significant at this level will be saved
  if (compute.all) {
    pvOutputThreshold = 1
  } else {
    pvOutputThreshold = threshold
  }
  
  # Error covariance matrix
  # Set to numeric() for identity.
  errorCovariance = numeric();
  # errorCovariance = read.table("Sample_Data/errorCovariance.txt");
  
  
  ## Load genotype data
  
  snps = SlicedData$new();
  snps$fileDelimiter = "\t";      # the TAB character
  snps$fileOmitCharacters = "NA"; # denote missing values;
  snps$fileSkipRows = 1;          # one row of column labels
  snps$fileSkipColumns = 1;       # one column of row labels
  snps$fileSliceSize = 2000;      # read file in slices of 2,000 rows
  snps$LoadFile(genotype_file_name);
  
  ## Load gene expression data
  
  gene = SlicedData$new();
  gene$fileDelimiter = "\t";      # the TAB character
  gene$fileOmitCharacters = "NA"; # denote missing values;
  gene$fileSkipRows = 1;          # one row of column labels
  gene$fileSkipColumns = 1;       # one column of row labels
  gene$fileSliceSize = 2000;      # read file in slices of 2,000 rows
  gene$LoadFile(expression_file_name);
  
  ## Load covariates
  
  cvrt = SlicedData$new();
  cvrt$fileDelimiter = "\t";      # the TAB character
  cvrt$fileOmitCharacters = "NA"; # denote missing values;
  cvrt$fileSkipRows = 1;          # one row of column labels
  cvrt$fileSkipColumns = 1;       # one column of row labels
  if(length(covariates_file_name)>0) {
    cvrt$LoadFile(covariates_file_name);
  }
  
  ## Run the analysis
  
  me = Matrix_eQTL_engine(
    snps = snps,
    gene = gene,
    cvrt = cvrt,
    output_file_name = output_file_name,
    pvOutputThreshold = pvOutputThreshold,
    useModel = useModel, 
    errorCovariance = errorCovariance, 
    verbose = verbose,
    pvalue.hist = TRUE,
    min.pv.by.genesnp = min.pv.by.genesnp,
    noFDRsaveMemory = save.memory);
  
  if(save.memory & load.qtls){
    # noFDRsaveMemory logical.
    # Set noFDRsaveMemory = TRUE to save significant gene-SNP pairs directly to
    # the output files, reduce memory footprint and skip FDR calculation.
    # The eQTLs are not recorded in the returned object if noFDRsaveMemory = TRUE.
    
    # let's add them manually
    cat("Read QTL_results from file\n", output_file_name, "\n")
    me$all$eqtls <- data.frame(fread(output_file_name));
    colnames(me$all$eqtls) <- c("snps", "gene", "beta", "statistic", "pvalue")
  }
  
  if(!(compute.all |save.memory)){
    unlink(output_file_name);
  }

  
  ## Results:
  
  cat('Analysis of ', me$all$ntests, 'QTLs done in: ', me$time.in.sec, ' seconds', '\n');
  cat('Detected eQTLs: ', me$all$neqtls, '\n');

  if(is.data.frame(snp.pos)){
    me$all$eqtls <- merge(snp.pos, me$all$eqtls,
                all.y=T)
    me$all$eqtls$chr <- as.character(me$all$eqtls$chr)
    me$all$eqtls$gene <- as.character(me$all$eqtls$gene)
    if(verbose){
      cat("SNP position annotation added to the QTL results.\n")
    }
  }
  
  if(what=="table" & min.pv.by.genesnp==FALSE){
    me <- data.frame(me$all$eqtls, stringsAsFactors = F)
  }
  
  return(me)
}



#' convenience function to convert genotypes from .vcf files to the MatrixEQTL format
#'
#' @family data conversion functions
#' @title convert .vcf files to MatrixEQTL genotype input
#' @param vcf.file vcf.file containing genotypes
#' @param which which field of the .vcf file to use: "GT" = genotype,
#'        "DS" = genotype dosage from MaCH/Thunder, "GL" = genotype likelihoods
#'        (default "GT")
#' @param map should the map object extracted from 
#' @param snp.pos names for colums with annotations for custom build,
#'        order: chromosome (chr), length (pos) (default NULL)
#' @param genotype_file_name
#' 
#' @return either a matrix nsnps x nsample containing genotypes coded as 0,1,2
#'         or a list with slots "gt.mat", "map", "snp.pos"
#' @author Ines Assum (2019-03-21)
#' @references
#' @export
convert.vcf <- function(vcf.file, which="GT", map=FALSE, snp.pos=FALSE, genotype_file_name=NULL){
  require(VariantAnnotation)
  require(dplyr)
  
  vcf <- readVcf(vcf.file)
  if(map | snp.pos){
    res <- list()
    snpMat <- genotypeToSnpMatrix(vcf)
    res[["gt.mat"]] <- t(as(snpMat$genotype, "numeric")) %>% as.data.frame
    if(snp.pos){
      snp.pos <- rowRanges(vcf)
      res[["snp.pos"]] <- data.frame(snps=names(snp.pos),
                                     chr=as.numeric(as.character(seqnames(snp.pos))),
                                     pos=start(snp.pos),
                                     stringsAsFactors = F)
    }
    if(map){
      res[["map"]] <- snpMat$map
    }
    if(!is.null(genotype_file_name)){
      write.table(cbind(snpid=rownames(res[["gt.mat"]]), res[["gt.mat"]]),
                  row.names = F, col.names = T, quote = F, sep = "\t",
                  file = genotype_file_name)
      cat("Created genotype file from .vcf file at:\n", genotype_file_name, "\n")
    }
  } else {
    res <- t(as(genotypeToSnpMatrix(vcf)$genotype, "numeric")) %>% as.data.frame
    if(!is.null(genotype_file_name)){
      write.table(cbind(snpid=rownames(res), res),
                  row.names = F, col.names = T, quote = F, sep = "\t",
                  file = genotype_file_name)
      cat("Created genotype file from .vcf file at:\n", genotype_file_name, "\n")
    }
  }
  return(res)
}


#' convenience function to get cumulative chromosome lengths for manhattan plots
#'
#' @family annotation functions
#' @title cumulative chromosome lengths
#' @param build genome build to use, options: 'b37' for GRCh37.p13,
#'        'b38' for GRCh38.p10, 'custom' (default 'b37')
#' @param df data.frame containing custom annotations (default NULL)
#' @param build.names names for colums with annotations for custom build,
#'        order: chromosome (chr), length (pos) (default NULL)
#' "
#' @return data frame with columns "chr", "chrLen", "cumSum2", "cumSum", "tick"
#'         and one row for each chr value, in ascending order
#'         chr = chromosome id, chrLen = length of the chromosome,
#'         cumSum2 = cumulative sum of chromosome lengths, cumSum = cumulative
#'         sum of chromosome lengths (starting at 0), tick = positions that
#'         indicate the middle of each chromosome (for axis label)
#' @author Ines Assum (2019-03-21)
#' @references
#' @export
add.chr.len.anno <- function(build="b37", df=NULL, build.names=NULL){
  chromosomeLength38 <- c(248956422, 242193529, 198295559, 190214555, 181538259,
                          170805979, 159345973, 145138636, 138394717, 133797422,
                          135086622, 133275309, 114364328, 107043718, 101991189,
                          90338345, 83257441, 80373285, 58617616, 64444167,
                          46709983, 50818468) # bp length from Ensembl GRCh38.p10
  chromosomeLength37 <- c(249250621, 243199373, 198022430, 191154276, 180915260,
                          171115067, 159138663, 146364022, 141213431, 135534747,
                          135006516, 133851895, 115169878, 107349540, 102531392,
                          90354753, 81195210, 78077248, 59128983, 63025520,
                          48129895, 51304566) # bp length from Ensembl GRCh37.p13
  if (build == 'b37') {
    pos <- data.frame(chr=1:22,
                      chrLen=chromosomeLength37)
  } else if (build == 'b38') {
    pos <- data.frame(chr=1:22,
                      chrLen=chromosomeLength38)
  } else if (build == 'custom') {
    library(data.table)
    df[, c("chr", "pos")] <- df[, build.names]
    pos <- as.data.table(df[, c("chr", "pos")])
    pos <- pos[, chrLen := max(pos), by = chr]
    pos <- pos[!duplicated(pos$chr), ]
    pos <- pos[order(pos$chr), ]
  } else {
    stop("Build not supported")
  }
  pos$cumSum2 <- cumsum(pos$chrLen)
  pos$cumSum <- pos$cumSum2-pos$chrLen
  pos$tick <- pos$cumSum + (pos$chrLen)/2
  return(pos)
}


#' convenience function to create ggplot2 manhattan plots of MatrixEQTL objects
#'
#' @family plot functions
#' @title make manhattan plots of MatrixEQTL results
#' @param me MatrixEQTL results as list or data.frame
#' @param build genome build to use, options: "b37", "b38", "custom"
#'        (default "b37")
#' @param build.names names for colums with annotations for custom build,
#'        order: chr, length (default NULL)
#' @param snp.pos data.frame with SNP annotations if not in me, SNP by cols:
#'        colnames(snp.pos) <- c("snps", "chr", "pos")
#' @param vcf vcf file to extract SNP annotations from (default NULL)
#' @param trait character vector of gene/trait names to plot
#' 
#' @return ggplot2 object (list) with the plot
#' @author Ines Assum (2019-03-21)
#' @references
#' @export
manhattan.qtl <- function(me, build="b37", snp.pos=NULL, trait=NULL, vcf=NULL, build.names=NULL) {
  require(ggplot2)
  require(RColorBrewer)
  require(data.table)


  if (!is.data.frame(me)){
    me <- data.frame(me$all$eqtls,
                     stringsAsFactors = F)
  }
  
  if(!is.null(trait)){
    me <- me[me$gene %in% trait, ]
  }
  if(!is.null(snp.pos)){
    me <- merge(snp.pos, me,
                all.y=T)
  }
  pos <- add.chr.len.anno(build, df=me, build.names)
  me <- merge(me, pos,
              all.x = T)
  me$cumPos <- me$pos + me$cumSum
  qtl.plot <- ggplot(me, aes(x=cumPos, y=-log10(pvalue), col = chr)) +
    geom_point() +
    theme_bw() +
    # scale_color_manual(values = rep(c("grey", "skyblue"), 22),
    #                    guide=FALSE) +
    scale_color_manual(values = rep(brewer.pal(n = 3, "Set2"), 22),
                       guide=FALSE) +
    scale_x_continuous(label = pos$chr, breaks = pos$tick)
  return(qtl.plot)
}


#' Basic interface to obtain minimal P-values per gene from MatrixEQTL passing
#' expression and covariate data in matrices, and genotypes as a file. The
#' function uses \code{\link{eqtl}}.
#'
#' @family eqtl functions
#' @title basic eQTL interface
#' @param expr expression values in a matrix ngene x nsample
#' @param covar covariate values in a matrix ncovar x nsample
#' @param genotype_file_name filename of a tab separated file with genotype
#'        dosage values (usually between 0 and 2) in a matrix nsnp x nsample
#' @param gene.position data.frame with gene positions in the first 4 columns
#'        named "chrom", "start", "end", "gene_id"
#' @param snp.pos data.frame with SNP positions in the first three columns
#'        named "snp_id", "chrom", "snp_pos"
#' @param id character indicating an id used for tempfile creation
#' @return the same output as the \code{\link{eqtl}} function
#' @references
#' @export
eqtl.min.p <- function(expr, ...) {

  ## report the min(p) for each gene
  eqtls = eqtl.run(expr, ..., what="object")
  min.p = eqtls$cis$min.pv.gene

  ## make sure the order is the same as the input expression matrix
  min.p = min.p[rownames(expr)]
  
  return(min.p)
}



#' Extension to the waitForJobs function of the BatchJobs package which shows
#' some strange behaviour when waiting for jobs (database locked)
#' so we need to make it extra failsafe.
#'
#' @family eqtl functions
#' @title basic eQTL interface
#' @param reg BatchJobs registry
#' @param waittime time to wait before updating job status
#' @param nretry number of time to retry getting the job status before throwing #'        an error
myWaitForJobs <- function(reg, waittime=3, nretry=100) {
  success = FALSE
  while (nretry > 0 && !success) {
    status = tryCatch({
      while (TRUE) {
        status = showStatus(reg)
        if (status$done + status$expired == status$n) {
          cat("done\n")
          return(list(success=TRUE, nretry=nretry))
        }
        Sys.sleep(waittime)
      }
      return(list(success=FALSE, nretry=nretry))
    }, error=function(e) {
      cat("Error while waiting for jobs:\n")
      print(e)
      cat("\nnumber of retries left: ", nretry - 1, "\n")
      Sys.sleep(waittime + runif(1, 0, 3))
      return(list(success=FALSE, nretry=nretry - 1))
    })
    success = status$success
    nretry = status$nretry
    cat("success after the tryCatch block:", success, "\n")
    cat("nretry after the tryCatch block:", nretry, "\n")
  }
  

  if (!success) {
    err.msg = paste("Error during batch processing in registry")
    save(envir=sys.frame(), list=ls(envir=sys.frame()), file=file.path(dir, "error_image.RData"))
    stop(err.msg)
  }
}

#' Run eQTL permutation analysis using BatchJobs to distribute jobs on a HPC.
#' This function uses \code{\link{eqtl.run}} function.
#'
#' @family eqtl HPC functions
#' @title identification of eGenes
#' @param expr expression values in a matrix ngene x nsample
#' @param covar covariate values in a matrix ncovar x nsample
#' @param gene.position data.frame with gene positions in the first 4 columns
#'        named "chrom", "start", "end", "gene_id"
#' @param snp.pos data.frame with SNP positions in the first three columns
#'        named "snp_id", "chrom", "snp_pos"
#' @param genotype_file_name filename of a tab separated file with genotype
#'        dosage values (usually between 0 and 2) in a matrix nsnp x nsample
#' @param dir work directory where the BatchJobs registry is stored to. This
#'        path must be accessible by / visible to all compute nodes
#' @param min.perm minimal number of permutations
#' @param max.perm maximal number of permutations
#' @param seed random seed
#' @param exit.criterion number of times a more extreme test statistic has to
#'        be observed before a gene is not permuted further
#' @param actual.min.p if the actual minimal P-value per gene has already been
#'        computed it can be passed
#' @return a table with minimal P-values per gene and the empirical P-values
#' @references GTEx Consortium, Ardlie, K. G., Wright, F. A., & Dermitzakis, E. T. (2015). The Genotype-Tissue Expression (GTEx) pilot analysis: multitissue gene regulation in humans. Science, 348(6235), 648–660. \url{http://doi.org/10.1126/science.1262110}
#' @export
find.eGenes <- function(expr, covar, gene.position, snp.pos, genotype_file_name, dir, min.perm=1000, max.perm=10000, seed=0, exit.criterion=15, actual.min.p=NULL) {
  require(BatchJobs)
  ## initilaize the random generator
  set.seed(seed)
  
  ## check if we need to determine the actual min.p
  if (is.null(actual.min.p)) {
    actual.min.p = eqtl.min.p(expr, covar, genotype_file_name, gene.position, snp.pos, "actual")
  }
  
  
  ## submit batches of min.perm permutation runs to the cluster
  ## evaluate which genes go to the next round
  
  rfun <- function(pcount, new.order, expr, covar, genotype_file_name, gene.position, snp.pos) {
    source("R/eqtl_lib.R")
    id = paste("permutation", pcount, "___", sep="")

    o = new.order[pcount,]
    pexpr = expr[,o]
    pcovar = covar[,o]
    colnames(pexpr) = colnames(expr)
    colnames(pcovar) = colnames(covar)

    min.p = eqtl.min.p(pexpr, pcovar, genotype_file_name, gene.position, snp.pos, id)
    return(min.p)
  }

  
  genes.to.permute = 1:nrow(expr)
  pcount = 0
  count = rep(0, nrow(expr))
  nperm.per.gene = rep(0, nrow(expr))
  
  while (pcount < max.perm && length(genes.to.permute) > 0) {

    cat("\n\n\n\npermutation run", pcount, "analysing", length(genes.to.permute), "genes\n")
    
    ## permute sample labels
    new.order = t(sapply(1:min.perm, function(i) sample(1:ncol(expr), ncol(expr))))

    ## submit jobs
    rdir = file.path(dir, "batchjobs")
    reg = makeRegistry(id="permutations", seed=seed, file.dir=rdir, packages=c("MatrixEQTL", "data.table"))

    resources = list(memory="8G", queue="standard", time="4:00:00", longrun="False")
    more.args = list(new.order=new.order, expr=expr[genes.to.permute,,drop=F], covar=covar, genotype_file_name=genotype_file_name, gene.position=gene.position, snp.pos=snp.pos)
    
    batchMap(reg, rfun, 1:min.perm, more.args=more.args)
    
    submitJobs(reg, resources=resources, job.delay=function(n, i) runif(1, 0, 0.5))

    ## there is a problem with concurrent acess to the sqlite database
    ## so we need to wrap this in a loop with a number of retries
    myWaitForJobs(reg, waittime=30, nretry=100)
    
    
    ## reduce matrix is concatenating results row wise
    min.p = reduceResultsMatrix(reg)
    
    ## remove the registry
    system(paste("rm -rf", rdir))

    ## count for each gene how many times the permuted minp was smaller
    smaller = min.p <= rep(actual.min.p[genes.to.permute], each=nrow(min.p))
    count[genes.to.permute] = count[genes.to.permute] + colSums(smaller)

    ## increment the counter
    pcount = pcount + min.perm

    ## remember for each gene how many permutations were done
    nperm.per.gene[genes.to.permute] = pcount

    ## check which genes need to go to the next round
    genes.to.permute = which(count < exit.criterion)
    save(envir=sys.frame(), list=ls(envir=sys.frame()), file=file.path(dir, "current_image.RData"))
  }

  ## return the results
  tab = data.frame(actual.min.p, count, nperm.per.gene, empirical.p=count/nperm.per.gene)
  return(tab)
}


#' Run eQTL permutation analysis using BatchJobs to distribute jobs on a HPC.
#' This function uses \code{\link{eqtl.run}} function.
#'
#' @family eqtl HPC functions
#' @title identification of eSNPs
#' @param expr expression values in a matrix ngene x nsample
#' @param covar covariate values in a matrix ncovar x nsample
#' @param gene.position data.frame with gene positions in the first 4 columns
#'        named "chrom", "start", "end", "gene_id"
#' @param snp.pos data.frame with SNP positions in the first three columns
#'        named "snp_id", "chrom", "snp_pos"
#' @param genotype_file_name filename of a tab separated file with genotype
#'        dosage values (usually between 0 and 2) in a matrix nsnp x nsample
#' @param dir work directory where the BatchJobs registry is stored to. This
#'        path must be accessible by / visible to all compute nodes
#' @param min.perm minimal number of permutations
#' @param max.perm maximal number of permutations
#' @param seed random seed
#' @param exit.criterion number of times a more extreme test statistic has to
#'        be observed before a gene is not permuted further
#' @param actual.eqtls if the actual eQTLs have already been computed they can
#'        be passed
#' @param blocksize the number of permutations collected at the same time. This
#'        parameter tunes the balance between speed and memory usage. The more
#'        permutation runs are collected in the same block, the faster, but
#'        also more memory consuming.
#' @return a table with eQTL P-values and the empirical P-values for eSNPs
#' @references GTEx Consortium, Ardlie, K. G., Wright, F. A., & Dermitzakis, E. T. (2015). The Genotype-Tissue Expression (GTEx) pilot analysis: multitissue gene regulation in humans. Science, 348(6235), 648–660. \url{http://doi.org/10.1126/science.1262110}
#' @export
find.eSNPs <- function(expr, covar, gene.position, snp.pos, genotype_file_name, dir, threshold, min.perm=1000, max.perm=10000, seed=0, exit.criterion=15, actual.eqtls=NULL, blocksize=10) {

  ## as input we need the permutation threshold which is the empirical min(p)
  ## that corresponds to the FDR threshold across genes

  ## then we compute gene wise nominal p-values that correspond to the
  ## empirical threshold
  
  require(BatchJobs)
  ## initilaize the random generator
  set.seed(seed)
  
  ## check if we need to determine the actual min.p
  if (is.null(actual.eqtls)) {
    actual.eqtls= eqtl.run(expr, covar, genotype_file_name, gene.position, snp.pos, "actual")
  }
  
  
  ## submit batches of min.perm permutation runs to the cluster
  ## evaluate which genes go to the next round
  
  rfun <- function(pcount, new.order, expr, covar, genotype_file_name, gene.position, snp.pos) {
    source("R/eqtl_lib.R")
    id = paste("permutation", pcount, "___", sep="")

    o = new.order[pcount,]
    pexpr = expr[,o]
    pcovar = covar[,o]
    colnames(pexpr) = colnames(expr)
    colnames(pcovar) = colnames(covar)

    min.p = eqtl.min.p(pexpr, pcovar, genotype_file_name, gene.position, snp.pos, id)
    return(min.p)
  }


  ## we do not load all min.perm vectors at once
  ## instead we will aggregate by counting if the permuted min(p) value
  ## for a gene was smaller than the actual p value of a pair
  
  afun <- function(aggr, job, res, actual.pvalue, gene) {
    
    ## from the BatchJobs man package (reduceResults)
    ## Here, 'job' is the
    ## current job descriptor (see 'Job'), 'result' is the current
    ## result object and 'aggr' are the so far aggregated results.
    ## When using 'reduceResults', your function should add the
    ## stuff you want to have from 'job' and 'result' to 'aggr' and
    ## return that

    ## res is the min(p) per gene
    minp = res[gene]
    
    ## aggr is just the sum of permutations that was smaller
    aggr = aggr + as.numeric(minp <= actual.pvalue)

    return(aggr)
  }
  
  pairs.to.permute = 1:nrow(actual.eqtls)
  genes.to.permute = 1:nrow(expr)
  pcount = 0
  count = rep(0, nrow(actual.eqtls))
  nperm.per.gene = rep(0, nrow(actual.eqtls))
  
  while (pcount < max.perm && length(genes.to.permute) > 0) {

    cat("\n\n\n\npermutation run", pcount, "analysing", length(genes.to.permute), "genes\n")
    
    ## permute sample labels
    new.order = t(sapply(1:min.perm, function(i) sample(1:ncol(expr), ncol(expr))))

    ## submit jobs
    rdir = file.path(dir, "batchjobs")
    reg = makeRegistry(id="permutations", seed=seed, file.dir=rdir, packages=c("MatrixEQTL", "data.table"))

    resources = list(memory="8G", queue="standard", time="4:00:00", longrun="False")
    more.args = list(new.order=new.order, expr=expr[genes.to.permute,,drop=F], covar=covar, genotype_file_name=genotype_file_name, gene.position=gene.position, snp.pos=snp.pos)
    
    batchMap(reg, rfun, 1:min.perm, more.args=more.args)
    
    submitJobs(reg, resources=resources, job.delay=function(n, i) runif(1, 0, 0.5))

    ## there is a problem with concurrent acess to the sqlite database
    ## so we need to wrap this in a loop with a number of retries
    myWaitForJobs(reg, waittime=30, nretry=100)
    

    ## somehow reducing results one by one is very slow, so we do it block
    ## wise not to waste too much memory

    gene = actual.eqtls[pairs.to.permute, "gene"]
    actual.pvalue = actual.eqtls[pairs.to.permute, "p-value"]

    ## reshape to the size of the block
    actual.pvalue = matrix(rep(actual.pvalue, each=blocksize), nrow=blocksize)

    ids = findDone(reg)
    nsuccess = length(ids)
    while (length(ids) > 0) {
      ## get the block of results
      min.p = reduceResultsMatrix(reg, ids=head(ids, blocksize), progressbar=FALSE)

      ## reshape to the size of the eqtl results
      min.p = min.p[,gene,drop=F]

      ## check if the block is of full size
      if (nrow(min.p) < blocksize) {
        ## if not reduce the size of the actual pvalues matrix
        actual.pvalue = actual.pvalue[1:nrow(min.p),]
      }
      
      ## count
      smaller = min.p <= actual.pvalue
      count[pairs.to.permute] = count[pairs.to.permute] + colSums(smaller)

      ## remove the job ids that were just processed
      ids = tail(ids, -blocksize)
    }
    
    ## remove the registry
    system(paste("rm -rf", rdir))
    
    ## increment the counter
    pcount = pcount + nsuccess

    ## remember for each gene how many permutations were done
    nperm.per.gene[genes.to.permute] = pcount

    ## check which genes need to go to the next round
    pairs.to.permute = which(count < exit.criterion)
    genes.to.permute = which(rownames(expr) %in% actual.eqtls[pairs.to.permute,"gene"])
    
    save(envir=sys.frame(1), list=ls(envir=sys.frame(1)), file=file.path(dir, "current_image.RData"))
  }

  ## return the results
  nperm.per.gene = nperm.per.gene[match(actual.eqtls[,"gene"], rownames(expr))]
  tab = data.frame(as.data.frame(actual.eqtls), count, nperm.per.gene, empirical.p=count/nperm.per.gene)
  return(tab)
}



#' Get egenes from a esnp table
#'
#' @family eqtl HPC functions
#' @param esnps an esnps table obtained from \code{\link{find.esnps}}
#' @param fdr false discovery rate threshold
#' @return a table of egenes with columns for minimal P-values and emprical
#' P-values
#' @export
get.egenes.from.esnps <- function(esnps, fdr=0.05) {
  require(qvalue)
  
  ## get the gene wise min(p)
  min.p = tapply(1:nrow(esnps), esnps[,"gene"], function(idx) idx[which.min(esnps[idx,"p.value"])])
  egenes = esnps[min.p,c("gene", "p.value", "chrom", "start", "end", "gene_name", "gene_type", "count", "nperm.per.gene", "empirical.p")]
  
  ## compute the fdr from the empirical p values
  qval = qvalue(egenes[,"empirical.p"])
  egenes = cbind(egenes, qvalue=qval$qvalues)

  max.p = max(egenes[egenes[,"qvalue"] < fdr, "empirical.p"])
  
  ## determine the gene specific p value threshold that corresponds to an fdr 5%
  threshold = tapply(1:nrow(esnps), esnps[,"gene"], function(idx) {
    sig = idx[esnps[idx,"empirical.p"] < max.p]
    return(max(c(0, egenes[sig, "p.value"])))
  })
  
  egenes = cbind(egenes, gene.specific.threshold=threshold[egenes[,"gene"]])
  return(egenes)
}

#' Quantile - quantile plot against the uniform distribution
#'
#' @param q P-values
#' @export
qquniform <- function(p) {
  p = sort(p)
  expected = (1:length(p)) / length(p)
  plot(-log10(expected), -log10(p))
}

#' Estimate latent factors using the PEER method
#' @param k the number of latent factors
#' @param expr expression values in a matrix ngene x nsample
#' @param covar measured covariate values in a matrix  nsample x ncovar.
#'        If covar is NULL, no measured covariates will be used.
#' @return the latent factors in a  nsample x k matrix
get.peer.factors <- function(k, expr, covar) {
  require(peer)
  model = PEER()
  PEER_setPhenoMean(model, t(expr))
  PEER_setNk(model, k)
  if (!is.null(covar)) {
    PEER_setCovariates(model, as.matrix(covar))
  }
  PEER_update(model)
  factors = PEER_getX(model)
  weights = PEER_getW(model)
  return(factors)
}


#' Quantile normalization
#'
#' @param x ngenes x nsamples matrix to be normalized
#' @return quantile normalized matrix
#' @export
normalize.quantile <- function(x) {
  x = as.matrix(x)
  o = apply(x, 2, order)
  xsort = x
  for (col in 1:ncol(x)) {
    xsort[,col] = x[o[,col], col]
  }
  means = apply(xsort, 1, mean)
  normalized = matrix(NA, nrow=nrow(x), ncol=ncol(x), dimnames=dimnames(x))
  for (col in 1:ncol(x)) {
    normalized[o[,col], col] = means
  }
  return(normalized)
}
matthiasheinig/eQTLpipeline documentation built on May 21, 2019, 1:16 p.m.