R/tcgsa_seq.R
In tcgsaseq: Time-Course Gene Set Analysis for RNA-Seq Data

Documented in tcgsa_seq

#'Time-course Gene Set Analysis
#'
#'Wrapper function for performing gene set analysis of (potentially longitudinal) RNA-seq data
#'
#'@param y a numeric matrix of size \code{G x n} containing the raw RNA-seq counts or
#'preprocessed expressions from \code{n} samples for \code{G} genes.
#'
#'@param x a numeric matrix of size \code{n x p} containing the model covariates from
#'\code{n} samples (design matrix). Usually, its first column is the intercept (full of
#'\code{1}s).
#'
#'@param phi a numeric design matrix of size \code{n x K} containing the \code{K} variables
#'to be tested
#'
#'@param weights_phi_condi a logical flag indicating whether heteroscedasticity
#'weights computation should be conditional on both the variable(s) to be tested
#'\code{phi} and on covariate(s) \code{x}, or on \code{x} alone. #'Default is \code{TRUE}
#'in which case conditional means are estimated conditionally on both \code{x} and \code{phi}.
#'
#'@param genesets either a vector of index or subscripts that defines which rows of \code{y}
#'constitute the investigated gene set (when only 1 gene set is being tested).
#'Can also be a \code{list} of index (or \code{rownames} of \code{y}) when several
#'gene sets are tested at once, such as the first element of a
#'\code{\link[GSA:GSA.read.gmt]{gmt}} object. If \code{NULL}, then gene-wise p-values are returned.
#'
#'@param indiv a vector of length \code{n} containing the information for
#'attributing each sample to one of the studied individuals. Coerced
#'to be a \code{factor}. Default is \code{NULL} in which case each sample is considered
#'as coming from independent subjects.
#'
#'@param Sigma_xi a matrix of size \code{K x K} containing the covariance matrix
#'of the \code{K} random effects. Only used if \code{homogen_traj} is \code{FALSE}.
#'Default assume diagonal correlation matrix, i.e. independence of random effects.
#'
#'@param which_weights a character string indicating which method to use to estimate
#'the mean-variance relationship weights. Possibilities are \code{"loclin"},
#'\code{"voom"} or \code{"none"} (in which case no weighting is performed).
#'Default is \code{"loclin"}.
#'See \code{\link{sp_weights}} and \code{\link{voom_weights}} for details.
#'
#'@param which_test a character string indicating which method to use to approximate
#'the variance component score test, either \code{"permutation"} or \code{"asymptotic"}.
#'Default is \code{"permutation"}.
#'
#'@param n_perm the number of perturbations. Default is \code{1000}.

#'@param progressbar logical indicating whether a progress bar should be displayed
#'when computing permutations (only in interactive mode).
#'
#'@param parallel_comp a logical flag indicating whether parallel computation
#'should be enabled. Only Linux and MacOS are supported, this is ignored on Windows.
#'Default is \code{TRUE}.
#'
#'@param nb_cores an integer indicating the number of cores to be used when
#'\code{parallel_comp} is \code{TRUE}.
#'Only Linux and MacOS are supported, this is ignored on Windows.
#'Default is \code{parallel::detectCores() - 1}.
#'
#'@param preprocessed a logical flag indicating whether the expression data have
#'already been preprocessed (e.g. log2 transformed). Default is \code{FALSE}, in
#'which case \code{y} is assumed to contain raw counts and is normalized into
#'log(counts) per million.
#'
#'@param doPlot a logical flag indicating whether the mean-variance plot should be drawn.
#' Default is \code{FALSE}.
#'
#'@param gene_based_weights a logical flag used for \code{"loclin"} weights, indicating whether to estimate
#'weights at the gene-level, or rather at the observation-level. Default is \code{TRUE},
#'and weights are then estimated at the gene-level.
#'
#'@param bw a character string indicating the smoothing bandwidth selection method to use. See
#'\code{\link[stats]{bandwidth}} for details. Possible values are \code{"ucv"}, \code{"SJ"},
#'\code{"bcv"}, \code{"nrd"} or \code{"nrd0"}
#'
#'@param kernel a character string indicating which kernel should be used.
#'Possibilities are \code{"gaussian"}, \code{"epanechnikov"}, \code{"rectangular"},
#'\code{"triangular"}, \code{"biweight"}, \code{"tricube"}, \code{"cosine"},
#'\code{"optcosine"}. Default is \code{"gaussian"} (NB: \code{"tricube"} kernel
#'corresponds to the loess method).
#'
#'@param exact a logical flag indicating whether the non-parametric weights accounting
#'for the mean-variance relationship should be computed exactly or extrapolated
#'from the interpolation of local regression of the mean against the
#'variance. Default is \code{FALSE}, which uses interpolation (faster computation).
#'
#'@param transform a logical flag used for \code{"loclin"} weights, indicating whether values should be
#'transformed to uniform for the purpose of local linear smoothing. This may be helpful if tail
#'observations are sparse and the specified bandwidth gives suboptimal performance there.
#'Default is \code{TRUE}.
#'
#'@param padjust_methods multiple testing correction method used if \code{genesets}
#'is a list. Default is "BH", i.e. Benjamini-Hochberg procedure for controlling the FDR.
#'Other possibilities are: \code{"holm"}, \code{"hochberg"}, \code{"hommel"},
#'\code{"bonferroni"} or \code{"BY"} (for Benjamini-Yekutieli procedure).
#'
#'@param lowess_span smoother span for the lowess function, between 0 and 1. This gives
#'the proportion of points in the plot which influence the smooth at each value.
#'Larger values give more smoothness. Only used if \code{which_weights} is \code{"voom"}.
#'Default is \code{0.5}.
#'
#'@param R library.size (optional, important to provide if \code{preprocessed = TRUE}).
#'Default is \code{NULL}
#'
#'@param homogen_traj a logical flag indicating whether trajectories should be considered homogeneous.
#'Default is \code{FALSE} in which case trajectories are not only tested for trend, but also for heterogeneity.
#'
#'@param na.rm_tcgsaseq logical: should missing values in \code{y} (including
#'\code{NA} and \code{NaN}) be omitted from the calculations?
#'Default is \code{TRUE}.
#'
#'@param verbose logical: should informative messages be printed during the
#'computation? Default is \code{TRUE}.
#'
#'@return A list with the following elements:\itemize{
#'   \item \code{which_test}: a character string carrying forward the value of the '\code{which_test}' argument
#'    indicating which test was perform (either "asymptotic" or "permutation").
#'   \item \code{preprocessed}: a logical flag carrying forward the value of the '\code{preprocessed}' argument
#'   indicating whether the expression data were already preprocessed, or were provided as raw counts and
#'   transformed into log-counts per million.
#'   \item \code{n_perm}: an integer carrying forward the value of the '\code{n_perm}' argument indicating
#'   the number of perturbations performed (\code{NA} if asymptotic test was performed).
#'   \item \code{genesets}: carrying forward the value of the '\code{genesets}' argument defining the gene sets
#'   of interest (\code{NULL} for gene-wise testing).
#'   \item \code{pval}: computed p-values. A \code{data.frame} with one raw for each each gene set, or
#'   for each gene if \code{genesets} argument is \code{NULL}, and with 2 columns: the first one '\code{rawPval}'
#'   contains the raw p-values, the second one contains the FDR adjusted p-values (according to
#'   the '\code{padjust_methods}' argument) and is named '\code{adjPval}'.
#' }
#'
#'@seealso \code{\link{sp_weights}} \code{\link{vc_test_perm}} \code{\link{vc_test_asym}} \code{\link{p.adjust}}
#'
#'@references Agniel D & Hejblum BP (2017). Variance component score test for
#'time-course gene set analysis of longitudinal RNA-seq data, \emph{Biostatistics},
#'18(4):589-604. \href{https://doi.org/10.1093/biostatistics/kxx005}{10.1093/biostatistics/kxx005}.
#'\href{https://arxiv.org/abs/1605.02351}{arXiv:1605.02351}.
#'
#'@references Law, C. W., Chen, Y., Shi, W., & Smyth, G. K. (2014). voom: Precision
#'weights unlock linear model analysis tools for RNA-seq read counts. \emph{Genome
#'Biology}, 15(2), R29.
#'
#'@importFrom stats p.adjust
#'
#'@examples
#'#rm(list=ls())
#'nsims <- 2 #100
#'res_quant <- list()
#'for(i in 1:2){
#'n <- 2000#0
#'nr <- 3
#'r <- nr*20#4*nr#100*nr
#'t <- matrix(rep(1:nr), r/nr, ncol=1, nrow=r)
#'sigma <- 0.4
#'b0 <- 1
#'
#'#under the null:
#'b1 <- 0
#'
#'y.tilde <- b0 + b1*t + rnorm(r, sd = sigma)
#'y <- t(matrix(rnorm(n*r, sd = sqrt(sigma*abs(y.tilde))), ncol=n, nrow=r) +
#'       matrix(rep(y.tilde, n), ncol=n, nrow=r))
#'x <- matrix(1, ncol=1, nrow=r)
#'
#'#run test
#'res <- tcgsa_seq(y, x, phi=t, genesets=lapply(0:9, function(x){x*10+(1:10)}),
#'                         Sigma_xi=matrix(1), indiv=rep(1:(r/nr), each=nr), which_test="asymptotic",
#'                         which_weights="none", preprocessed=TRUE)
#'res_genes <- tcgsa_seq(y, x, phi=cbind(t),#, rnorm(r)), #t^2
#'                       genesets=NULL,
#'                       Sigma_xi=diag(1), indiv=rep(1:(r/nr), each=nr), which_test="asymptotic",
#'                       which_weights="none", preprocessed=TRUE)
#'length(res_genes$pvals[, "rawPval"])
#'quantile(res_genes$pvals[, "rawPval"])
#'res_quant[[i]] <- res_genes$pvals[, "rawPval"]
#'}
#'#round(rowMeans(sapply(res_quant, quantile)), 3)
#'#plot(density(unlist(res_quant)))
#'#mean(unlist(res_quant)<0.05)
#'
#'\dontrun{
#'res_genes <- tcgsa_seq(y, x, phi=t, genesets=NULL,
#'                       Sigma_xi=matrix(1), indiv=rep(1:(r/nr), each=nr), which_test="permutation",
#'                       which_weights="none", preprocessed=TRUE, n_perm=1000, parallel_comp = FALSE)
#'
#'mean(res_genes$pvals$rawPval < 0.05)
#'summary(res_genes$pvals$FDR)
#'}
#'@export
tcgsa_seq <- function(y, x, phi, weights_phi_condi = TRUE,
                      genesets,
                      indiv = NULL,
                      Sigma_xi = diag(ncol(phi)),
                      which_test = c("permutation", "asymptotic"),
                      which_weights = c("loclin", "voom", "none"),
                      n_perm = 1000, progressbar = TRUE, parallel_comp = TRUE,
                      nb_cores = parallel::detectCores() - 1,
                      preprocessed = FALSE, doPlot = TRUE, gene_based_weights = TRUE,
                      bw = "nrd",
                      kernel = c("gaussian", "epanechnikov", "rectangular", "triangular", "biweight", "tricube", "cosine", "optcosine"),
                      exact = FALSE, transform = TRUE,
                      padjust_methods = c("BH", "BY", "holm", "hochberg", "hommel", "bonferroni"),
                      lowess_span = 0.5,
                      R = NULL,
                      homogen_traj = FALSE,
                      na.rm_tcgsaseq = TRUE,
                      verbose = TRUE){

  stopifnot(is.matrix(y))
  stopifnot(is.matrix(x) | is.data.frame(x))
  stopifnot(is.matrix(phi) | is.data.frame(phi))

  if(sum(is.na(y))>1 & na.rm_tcgsaseq){
    warning(paste("\n\n!!!!!\n'y' contains", sum(is.na(y)), "NA values.",
                  "\nCurrently they are ignored in the computations but you should think carefully about where do those NA/NaN come from...",
                  "\nIf you don't want to ignore those NA/NaN values, set the 'na.rm_tcgsaseq' argument to 'FALSE' (this may lead to errors).\n!!!!!\n"))
  }

  # checking for 0 variance genes
  v_g <- apply(X = y, MARGIN = 1, FUN = stats::var)
  if(sum(v_g==0) > 0){
    warning(paste0("Removing ", sum(v_g==0), " genes with 0 variance from the testing procedure.\n",
                   "  Those genes should probably have been removed beforehand..."))
    y <- y[v_g>0, ]
  }

  # normalization if needed
  if(!preprocessed){
    R <- colSums(y, na.rm = TRUE)
    y_lcpm <- apply(y, MARGIN=2, function(v){log2((v+0.5)/(sum(v)+1)*10^6)})
  }else{
    y_lcpm <- y
  }
  rm(y)

  if(is.data.frame(x)){
    warning("design matrix 'x' is a data.frame instead of a matrix:\n all variables (including factors) are converted to numeric...")
    x <- as.matrix(as.data.frame(lapply(x, as.numeric)))
  }

  if(is.data.frame(phi)){
    warning("design matrix 'phi' is a data.frame instead of a matrix:\n all variables (including factors) are converted to numeric... ")
    phi <- as.matrix(as.data.frame(lapply(phi, as.numeric)))
  }

  if(det(crossprod(cbind(x, phi)))==0){
    stop("crossprod(cbind(x, phi)) cannot be inversed. 'x' and 'phi' are likely colinear...")
  }

  if(length(padjust_methods)>1){
    padjust_methods <- padjust_methods[1]
  }
  stopifnot(padjust_methods %in% c("BH", "BY", "holm", "hochberg", "hommel", "bonferroni"))

  if(length(which_weights)>1){
    which_weights <- which_weights[1]
  }
  stopifnot(which_weights %in% c("loclin", "voom", "none"))

  if(length(which_test)>1){
    which_test <- which_test[1]
  }
  stopifnot(which_test %in% c("asymptotic", "permutation"))


  if(which_test == "asymptotic"){
    n_perm <- NA

    if(nrow(x) < 10)
      warning("Less than 10 samples: asymptotics likely not reached \nYou should probably run permutation test instead...")
  }



  if(which_test == "permutation"){
    if(is.null(indiv)){
      options(warn = -1)
      N_possible_perms <- factorial(ncol(y_lcpm))
      options(warn = 0)
    }else{
      options(warn = -1)
      N_possible_perms <- prod(sapply(table(indiv), factorial))
      options(warn = 0)
    }

    if(n_perm > N_possible_perms){
      warning(paste0("The number of permutations requested 'n_perm' is ", n_perm, "which is larger than the total number of existing permutations ", N_possible_perms,
                 ". Try a lower number for 'n_perm' (currently running with 'nperm=", N_possible_perms, "')."))
      n_perm <- N_possible_perms
    }
  }

  # Computing the weights
  if(which_weights != "none" & verbose){message("Computing the weights... ")}
  w <-  switch(which_weights,
               loclin = sp_weights(y = y_lcpm, x = x, phi = phi, use_phi = weights_phi_condi,
                                   preprocessed = TRUE, doPlot = doPlot,
                                   gene_based = gene_based_weights,
                                   bw = bw, kernel = kernel,
                                   exact = exact, transform = transform, verbose = verbose, na.rm = na.rm_tcgsaseq),
               voom = voom_weights(y = y_lcpm, x = if(weights_phi_condi){cbind(x, phi)}else{x},
                                   preprocessed = TRUE, doPlot = doPlot,
                                   lowess_span = lowess_span, R = R),
               none = matrix(1, ncol=ncol(y_lcpm), nrow=nrow(y_lcpm),
                             dimnames = list(rownames(y_lcpm),
                                             colnames(y_lcpm)
                             )
               )
  )
  if(which_weights != "none" & verbose){message("Done!\n")}



  if(is.null(genesets)){
    if(verbose){
      message("'genesets' argument not provided => only gene-wise p-values are computed\n")
    }
    if(which_test == "asymptotic"){
      if(is.null(indiv)){
        indiv <- 1:nrow(x)
      }

      rawPvals <- vc_test_asym(y = y_lcpm, x = x, indiv = indiv, phi = phi,
                               w = w, Sigma_xi = Sigma_xi,
                               genewise_pvals = TRUE, homogen_traj = homogen_traj,
                               na.rm = na.rm_tcgsaseq)$gene_pvals
    }else if(which_test == "permutation"){
      if(is.null(indiv)){
        indiv <- rep(1, nrow(x))
      }

      # constructing residuals
      if(na.rm_tcgsaseq){
        y_lcpm0 <- y_lcpm
        y_lcpm0[is.na(y_lcpm0)] <- 0
        y_lcpm_res <- y_lcpm - t(x%*%solve(crossprod(x))%*%t(x)%*%t(y_lcpm0))
      }else{
        y_lcpm_res <- y_lcpm - t(x%*%solve(crossprod(x))%*%t(x)%*%t(y_lcpm))
      }
      rm(y_lcpm0)
      x_res <- matrix(1, nrow=nrow(x), ncol=1)
      perm_result <- vc_test_perm(y = y_lcpm_res, x = x_res, indiv = indiv, phi = phi,
                                  w = w, Sigma_xi = Sigma_xi,
                                  n_perm=n_perm, progressbar = progressbar,
                                  parallel_comp = parallel_comp, nb_cores = nb_cores,
                                  genewise_pvals = TRUE, homogen_traj = homogen_traj,
                                  na.rm = na.rm_tcgsaseq)
      rawPvals <- perm_result$gene_pvals
      }

      pvals <- data.frame("rawPval" = rawPvals, "adjPval" = stats::p.adjust(rawPvals, padjust_methods))


    if(!is.null(rownames(y_lcpm))){
      rownames(pvals) <- rownames(y_lcpm)
    }
  }else if(is.list(genesets)){

    if(class(genesets[[1]])=="character"){
      if(is.null(rownames(y_lcpm))){
        stop("Gene sets specified as character but no rownames available for the expression matrix")
      }
      gene_names_measured <- rownames(y_lcpm)
      prop_meas <- sapply(genesets, function(x){length(intersect(x, gene_names_measured))/length(x)})
      if(sum(prop_meas)!=length(prop_meas)){
        warning("Some transcripts in the investigated gene sets were not measured:\nremoving those transcripts from the gene set definition...")
        genesets <- lapply(genesets, function(x){x[which(x %in% gene_names_measured)]})
      }
    }

    if(which_test == "asymptotic"){
      if(is.null(indiv)){
        indiv <- 1:nrow(x)
      }

      rawPvals <- sapply(seq_along(genesets), FUN = function(i_gs){
        gs <- genesets[[i_gs]]
        e <- try(y_lcpm[gs, 1, drop=FALSE], silent = TRUE)
        if(inherits(e, "try-error") | length(e)==0){
          warning(paste("Gene set", i_gs, "contains 0 measured transcript: associated p-value cannot be computed"))
          NA
        }else{
          vc_test_asym(y = y_lcpm[gs, , drop=FALSE], x = x, indiv = indiv, phi = phi,
                       w = w[gs, , drop=FALSE], Sigma_xi = Sigma_xi,
                       genewise_pvals = FALSE, homogen_traj = homogen_traj,
                       na.rm = na.rm_tcgsaseq)$set_pval
        }
      }
      )
    } else if(which_test == "permutation"){
      if(is.null(indiv)){
        indiv <- rep(1, nrow(x))
      }

      if(na.rm_tcgsaseq){
        y_lcpm0 <- y_lcpm
        y_lcpm0[is.na(y_lcpm0)] <- 0
        y_lcpm_res <- y_lcpm - t(x%*%solve(crossprod(x))%*%t(x)%*%t(y_lcpm0))
      }else{
        y_lcpm_res <- y_lcpm - t(x%*%solve(crossprod(x))%*%t(x)%*%t(y_lcpm))
      }
      rm(y_lcpm0)

      x_res <- matrix(1, nrow=nrow(x), ncol=1)
      rawPvals <- sapply(seq_along(genesets), FUN = function(i_gs){
        gs <- genesets[[i_gs]]
        e <- try(y_lcpm[gs, 1, drop = FALSE], silent = TRUE)
        if(inherits(e, "try-error") | length(e) < 1){
          warning(paste("Gene set", i_gs, "contains 0 measured transcript: associated p-value cannot be computed"))
          NA
        }else{
          vc_test_perm(y = y_lcpm[gs, ], x = x, indiv = indiv, phi = phi,
                       w = w[gs, , drop=FALSE], Sigma_xi = Sigma_xi,
                       n_perm = n_perm, progressbar = progressbar,
                       parallel_comp = parallel_comp, nb_cores = nb_cores,
                       genewise_pvals = FALSE, homogen_traj = homogen_traj,
                       na.rm = na.rm_tcgsaseq)$set_pval
        }
      }
      )
    }

    pvals <- data.frame("rawPval" = rawPvals, "adjPval" = stats::p.adjust(rawPvals, padjust_methods))
    if(!is.null(names(genesets))){
      rownames(pvals) <- names(genesets)
    }

  }else{

    if(!is.vector(genesets)){
      stop("'genesets' argument provided but is neither a list nor a vector")
    }

    if(class(genesets)=="character"){
      if(is.null(rownames(y_lcpm))){
        stop("Gene sets specified as character but no rownames available for the expression matrix")
      }
      gene_names_measured <- rownames(y_lcpm)
      if((length(intersect(genesets, gene_names_measured))/length(x)) != 1){
        warning("Some transcripts in the investigated gene sets were not measured:\n removing those transcripts from the gene set definition...")
        genesets <- genesets[which(genesets %in% gene_names_measured)]
      }
    }

    res_test <- switch(which_test,
                       asymptotic = vc_test_asym(y = y_lcpm[genesets, ], x = x,
                                                 indiv = indiv, phi = phi,
                                                 w = w[genesets, ],
                                                 Sigma_xi = Sigma_xi,
                                                 genewise_pvals = FALSE,
                                                 homogen_traj = homogen_traj,
                                                 na.rm = na.rm_tcgsaseq),
                       permutation = vc_test_perm(y = y_lcpm[genesets, ], x = x,
                                                  indiv = indiv, phi = phi,
                                                  w = w[genesets, ],
                                                  Sigma_xi = Sigma_xi,
                                                  n_perm = n_perm,
                                                  progressbar = progressbar,
                                                  parallel_comp = parallel_comp,
                                                  nb_cores = nb_cores,
                                                  genewise_pvals = FALSE,
                                                  homogen_traj = homogen_traj,
                                                  na.rm = na.rm_tcgsaseq)
    )
    pvals <- data.frame("rawPval" = res_test$set_pval, "adjPval" = NA)
    padjust_methods <- NA

  }

  return(list("which_test" = which_test, "preprocessed" = preprocessed, "n_perm" = n_perm,
              "genesets" = genesets, "pvals" = pvals, "w" = w))

}