sQTLseekeR2: R package for splicing QTL mapping

Documented in compute.empirical.pv compute.nominal.pv sqtl.seeker.p

##' \code{sqtl.seeker.p} performs the same test as \code{sqtl.seeker} between SNPs and
##' relative transcript expression values, prior permutation of individual labels (see
##' Details). 
##' 
##' \code{sqtl.seeker.p} implements an adaptive permutation procedure to control for 
##' multiple testing (i.e. multiple genetic variants are tested per gene, see also 
##' \code{\link{sqtls.p}}). The outcome of the permutations is then modeled using beta
##' distributions, as in FastQTL (Ongen et al., 2015), allowing to compute an adjusted 
##' empirical P-value per gene.
##' 
##' @title Permuted sQTL seeker
##' @param tre.df a data.frame with transcript relative expression
##' produced by \code{prepare.trans.exp}. Same as in \code{sqtl.seeker}.
##' @param genotype.f the name of the genotype file. This file needs to
##' be ordered by position, compressed and indexed using \code{index.genotype} or externally using tabix (samtools).
##' Must have column 'snpId'. Same as in \code{sqtl.seeker}.
##' @param gene.loc a data.frame with the genes location. Columns 'chr', 'start',
##' 'end' and 'geneId' are required. Same as in \code{sqtl.seeker}.
##' @param covariates a data.frame with covariate information per sample (samples x covariates).
##' Rownames should be the sample ids. Covariates can be either \code{numeric} or \code{factor}. 
##' When provided, they are regressed out before testing the genotype effect. Default is \code{NULL}.
##' @param genic.window the window(bp) around the gene in which the SNPs are tested. Default is 5000 (i.e. 5kb).
##' @param min.nb.ext.scores the minimum number of permuted  nominal P-values lower than
##' the lowest observed nominal P-value to allow the computation to stop. Default is 100. 
##' @param nb.perm.min the minimum number of permutations. Default is 100.
##' @param nb.perm.max the maximum number of permutations. Default is 1000. 
##' @param min.nb.ind.geno SNPs with less samples than \code{min.nb.ind.geno} in any genotype group
##' are filtered out. Default is 10.
##' @param verbose Default is \code{FALSE}. 
##' @return A data.frame with columns:
##' \item{geneId}{the gene name.}
##' \item{variants.cis}{the number of variants tested in cis.}
##' \item{LD}{a linkage disequilibrium estimate for the genomic window (median r2).}
##' \item{best.snp}{ID of the SNP with the smallest observed nominal P-value.}
##' \item{best.nominal.pv}{P-value corresponding to the best SNP.}
##' \item{shape1}{Beta distribution parameter shape1.}
##' \item{shape2}{Beta distribution parameter shape2.}
##' \item{nb.perms}{the number of permutations used for the empirical P-value computation.}
##' \item{pv.emp}{empirical P-value based on permutations.}
##' \item{pv.emp.beta}{empirical P-value based on the beta approximation.}
##' \item{runtime}{approximated computation time per gene.}
##' @author Diego Garrido-Martín
##' @export
sqtl.seeker.p <- function(tre.df, genotype.f, gene.loc, covariates = NULL, 
                          genic.window = 5000, nb.perm.min = 100, 
                          nb.perm.max = 1000, min.nb.ext.scores = 100, 
                          min.nb.ind.geno = 10, verbose = FALSE)
{
    . <- nb.groups <- snpId <- NULL ## Uglily appease R checks (dplyr)
    analyze.gene.f <- function(tre.gene){
        if(verbose) message(tre.gene$geneId[1])
        if(sum(duplicated(gene.loc$geneId)) > 1){
            stop(tre.gene$geneId[1], " Repeated gene in gene location file.")
        }
        gr.gene <- with(gene.loc[which(gene.loc$geneId == tre.gene$geneId[1]), ], 
                        GenomicRanges::GRanges(chr, IRanges::IRanges(start, end)))
        if(genic.window > 0){
            gr.gene <- GenomicRanges::resize(gr.gene, GenomicRanges::width(gr.gene) + 2 * genic.window, fix = "center")
        }
        if(length(gr.gene) > 0){
            if(!is.null(covariates)){
                if(!all(rownames(covariates) %in% colnames(tre.gene))){
                    stop("All samples should have covariate information (either a value or NA).")                  
                }
                cov.na <- apply(covariates, 1, function(x){any(is.na(x))})
                covariates <- covariates[!cov.na, , drop = FALSE]                                     
                if(sum(cov.na) > 0){
                    warning(sprintf("%s samples with NA values for at least one covariate have been removed.", 
                                    sum(cov.na)))     
                }
                if(sum(cov.na) / nrow(covariates) > 0.05){
                    stop("More than 5% of the samples contain NA values for at least one covariate")  
                }
            }
            tre.gene <- tre.gene[, !is.na(tre.gene[1, ])]
            genotype.headers <- as.character(utils::read.table(genotype.f, 
                                                               as.is = TRUE, nrows = 1))
            if(!is.null(covariates)){
                com.samples <- Reduce(intersect, list(colnames(tre.gene), 
                                                      genotype.headers, rownames(covariates)))   
                if(length(com.samples) == 0){
                    stop("No common samples between genotype, covariate and transcript files.")
                }
            } else{
                com.samples <- intersect(colnames(tre.gene), genotype.headers)
                if(length(com.samples) == 0){
                    stop("No common samples between genotype and transcript files.")
                }
            }
            tre.gene <- tre.gene[, c("trId", "geneId", com.samples)]
            allzero <- apply(tre.gene[, com.samples], 1, function(x){ sum(x) == 0 })
            if (any(allzero)){
              tr.out <- tre.gene$trId[allzero]
              tre.gene <- tre.gene[!allzero,]
              warning(sprintf("Transcript(s) %s is(are) removed due to zero expression in all common samples.", 
                              paste(tr.out, collapse = ", ")) )
            }
            tre.tc <- t(sqrt(tre.gene[, com.samples]))
            colnames(tre.tc) <- tre.gene$tr
            if(!is.null(covariates)){
                covariates <- covariates[com.samples, , drop = FALSE]
                multiclass <- apply(covariates, 2, function(x){length(table(x)) > 1})
                covariates <- covariates[, multiclass, drop = FALSE]
                if (verbose){
                  message("\t", "Covariates removed due to only one value: ", 
                          paste(names(multiclass)[!multiclass], collapse = ", "))
                }
                fit <- stats::lm(tre.tc ~ ., data = covariates)
                if (ncol(covariates) > 1){
                    vifs <- car::vif(stats::lm(tre.tc[, 1] ~ ., data = covariates))
                    if (verbose){
                        message("\t", "Covariates VIF - ", 
                                paste(names(vifs), round(vifs, 2), sep = ": ", collapse = ", "))
                    }
                    if (any(vifs > 5)){
                        warning("Check multicollinearity. VIF > 5 for some covariates:", "\n",
                                paste(names(vifs), round(vifs, 2), sep = ": ", collapse = ", "))
                    }
                }
                tre.tc <- fit$residual
            }
            res.df <- data.frame()
            genotype.gene <- read.bedix(genotype.f, gr.gene) 
            if (verbose & is.null(genotype.gene)) message("\tNo SNPs in the genomic range.")
            if (!is.null(genotype.gene)) {
                snps.to.keep <- check.genotype(genotype.gene[, com.samples], 
                                               tre.gene[, com.samples], 
                                               min.nb.ind.geno = min.nb.ind.geno)
                if (verbose) {
                    snps.to.keep.t <- table(snps.to.keep)
                    message("\t", paste(names(snps.to.keep.t), snps.to.keep.t, 
                                        sep = ": ", collapse = ", "))
                }
                if (any(snps.to.keep == "PASS")) {
                    genotype.gene <- genotype.gene[snps.to.keep == "PASS", ]
                    observed <- dplyr::do(dplyr::group_by(genotype.gene, snpId), 
                                          compute.nominal.pv(., tre.mt = tre.tc))
                    min.pv.obs <- min(observed$pv.snp) 
                    best.snp <- observed$snpId[which.min(observed$pv.snp)]
                    res.df <- compute.empirical.pv(genotype.gene = genotype.gene, 
                                                   tre.mt = tre.tc, 
                                                   min.nb.ext.scores = min.nb.ext.scores, 
                                                   nb.perm.min = nb.perm.min,
                                                   nb.perm.max = nb.perm.max, 
                                                   min.pv.obs = min.pv.obs, 
                                                   best.snp = best.snp, verbose = verbose)
                    return(data.frame(done = TRUE, res.df))
                }
            }
        } else {
            if (verbose) {
                warning("Issue with the gene location.")
            }
        }
        return(data.frame(done = FALSE))
    }
    ret.df <- lapply(unique(tre.df$geneId), function(gene.i){
        df <- tre.df[which(tre.df$geneId == gene.i), ]
        data.frame(geneId = gene.i, analyze.gene.f(df))
    })
    done <- which(unlist(lapply(ret.df, ncol)) > 2)
    if(length(done) > 0){
        ret.df <- ret.df[done]
        ret.df <- do.call(rbind, ret.df)
        ret.df$done <- NULL
        return(ret.df)
    } else {
        return(NULL)
    }
}

##' Computes nominal P-value for the association between the genotype of a locus (e.g. SNP) and the splicing ratios of a gene.
##' @title Compute nominal P-value
##' @param geno.df a data.frame of one row with the genotype information for each sample.
##' @param tre.mt a matrix of splicing ratios (samples x transcripts).
##' @param permute should the rows of the splicing ratio matrix be permuted. Default is \code{FALSE}.
##' @param seed if \code{permute} is TRUE, value provided to \code{\link{set.seed}}. Default is 1.
##' @param item.acc accuracy for P-value computation. Passed to \code{pcqf} function. Default is 1e-14.
##' Minimum accuracy allowed is 1e-14.
##' @param eigen.tol eigenvalues below this threshold are considered 0. Default is 1e-12.
##' @return A data.frame containing a P-value for the association.
##' @author Diego Garrido-Martín 
##' @keywords internal
compute.nominal.pv <- function(geno.df, tre.mt, permute = FALSE, seed = 1, 
                               item.acc = 1e-14, eigen.tol = 1e-12)
{
    if (nrow(geno.df) > 1) {
        stop(geno.df$snpId[1], " SNP is duplicated in the genotype file.")
    }
    geno.snp <- as.numeric(geno.df[, rownames(tre.mt)])
    names(geno.snp) <- rownames(tre.mt)
    if (any(geno.snp == -1)) {
        non.na <- geno.snp > -1
        geno.snp <- geno.snp[non.na]
        tre.mt <- tre.mt[non.na, ]
    }
    groups.snp.f <- factor(as.numeric(geno.snp))
    n <- nrow(tre.mt)
    if (permute){
        set.seed(seed)
        perm <- sample(1:n)
        tre.mt <- tre.mt[perm, ]
    }
    fit <- stats::lm(tre.mt ~ groups.snp.f)
    tre.mt <- scale(tre.mt, center = TRUE, scale = FALSE)
    G <- tcrossprod(tre.mt)
    X <- stats::model.matrix(fit) # Note contrast type
    H <- tcrossprod(tcrossprod(X, solve(crossprod(X))), X)
    numer <- crossprod(c(H), c(G))
    trG <- sum(diag(G))
    denom <- trG - numer
    f.tilde <- as.numeric(numer/denom)  
    R <- fit$residuals
    df.e <- fit$df.residual
    df.i <- nlevels(groups.snp.f) - 1
    e <- eigen(stats::cov(R)*(n-1)/df.e, symmetric = TRUE, only.values = TRUE)$values
    lambda <- abs(e[abs(e) > eigen.tol])
    pv.snp <- pcqf(q = f.tilde, lambda = lambda, 
                   df.i = df.i, df.e = df.e, acc = item.acc)
    while (length(pv.snp) > 1) {
        item.acc <- item.acc * 10
        pv.snp <- pcqf(q = f.tilde, lambda = lambda, 
                       df.i = df.i, df.e = df.e, acc = item.acc)
    }
    if (pv.snp < item.acc) {
        pv.snp <- item.acc
    }
    return(data.frame(pv.snp))
} 

##' Computes empirical P-value for a gene.
##' @title Compute empirical P-value
##' @param genotype.gene a data.frame of genotypes produced by \code{\link{read.bedix}}.
##' @param tre.mt a matrix of splicing ratios (samples x transcripts).
##' @param best.snp SNP with the smallest observed nominal P-value, computed by \code{\link{compute.nominal.pv}}.
##' @param min.pv.obs smallest observed nominal P-value.
##' @param nb.perm.min the minimum number of permutations. Default is 100.
##' @param nb.perm.max the maximum number of permutations. Default is 1000. 
##' @param min.nb.ext.scores the minimum number of permuted  nominal P-values lower than
##' the smallest observed nominal P-value to allow the computation to stop. Default is 100. 
##' @param comp.ld should linkage disequilibrium estimates be computed (median r2). Default is \code{TRUE}.
##' @param verbose Default is \code{FALSE}.
##' @return a data.frame with columns:
##' \item{variants.cis}{the number of variants tested in cis.}
##' \item{LD}{a linkage disequilibrium estimate for the genomic window (median r2).}
##' \item{best.snp}{ID of the SNP with the smallest observed nominal P-value.}
##' \item{best.nominal.pv}{P-value corresponding to the best SNP.}
##' \item{shape1}{Beta distribution parameter shape1.}
##' \item{shape2}{Beta distribution parameter shape2.}
##' \item{nb.perms}{the number of permutations used for the empirical P-value computation.}
##' \item{pv.emp}{empirical P-value based on permutations.}
##' \item{pv.emp.beta}{empirical P-value based on the beta approximation.}
##' \item{runtime}{approximated computation time per gene (mins).}
##' @author Diego Garrido-Martín 
##' @keywords internal
compute.empirical.pv <- function(genotype.gene, tre.mt, best.snp, min.pv.obs, 
                                 nb.perm.min = 100, nb.perm.max = 1000, 
                                 min.nb.ext.scores = 100, comp.ld = TRUE, 
                                 verbose = FALSE)
{
    if(comp.ld) {
        ld <- compute.median.ld(genotype.gene[, rownames(tre.mt)])
    }
    variants.cis <- dim(genotype.gene)[1]
    genotype.gene <- LD.filter(genotype.gene = genotype.gene, tre.mt = tre.mt, 
                               th = 1, tol = 0)             
    genotype.gene$LD <- NULL
    if (verbose) {
        message(sprintf("\tPASS not in perfect LD: %s", nrow(genotype.gene)))
        message ("\tAdaptative permutation scheme")
    }
    t0 <- Sys.time()
    store.perm = c()
    i <- 1 
    ext <- 0
    pv <- 1
    while ( i <= nb.perm.min || (ext < min.nb.ext.scores && i <= nb.perm.max) ) {
        # Note that i starts in 1. Thus "<=" instead of "<" in the while loop last condition
        if (verbose && i%%100 == 0) message (sprintf("\t\tpermutation %s",i))
        min.pv.perm <- min(dplyr::do(dplyr::group_by(genotype.gene, snpId),
                                     compute.nominal.pv(., tre.mt, permute = TRUE, seed = i))$pv.snp)
        store.perm <- c(store.perm, min.pv.perm)
        ext <- sum(store.perm <= min.pv.obs)
        pv <- (ext + 1)/(i + 1)
        i <- i + 1
    }
    fit <- fitdistrplus::mledist(store.perm, "beta")  # Estimate beta parameters on permutations 
    shape1 <- fit$estimate[1]                                               
    shape2 <- fit$estimate[2]
    names(shape1) <- names(shape2) <- NULL
    pv.beta <- stats::pbeta(min.pv.obs, shape1 = shape1, shape2 = shape2)  
    t1 <- Sys.time()
    t.run <- as.numeric(difftime(t1, t0, units = "mins"))
    res.df <- data.frame(variants.cis = variants.cis, LD = ld, best.snp = best.snp, 
                         best.nominal.pv = min.pv.obs, shape1 = shape1, 
                         shape2 = shape2, nb.perm = length(store.perm), 
                         pv.emp.perm = pv, pv.emp.beta = pv.beta, runtime = t.run) 
    return(res.df)
} 

compute.median.ld <- function(df)
{
    M <- as.matrix(df)
    ns <- dim(M)[1]
    if(ns == 1) {
        return(NA)
    }
    R <- matrix(NA, ncol = ns, nrow = ns)               
    for (i in 1:(ns - 1)){
        for (j in (1 + i):ns){
            R[i, j] <- stats::cor(M[i, ], M[j, ])^2
        }
    }
    return(stats::median(R, na.rm = TRUE)) 
}