R/class_dmDStest.R

#' @include class_dmDSfit.R
NULL

###############################################################################
### dmDStest class
###############################################################################

#' dmDStest object
#' 
#' dmDStest extends the \code{\linkS4class{dmDSfit}} class by adding the null 
#' model Dirichlet-multinomial (DM) and beta-binomial (BB) likelihoods and the 
#' gene-level and feature-level results of testing for differential 
#' exon/transcript usage. Result of calling the \code{\link{dmTest}} function.
#' 
#' @return
#' 
#' \itemize{ \item \code{results(x)}: get a data frame with gene-level or 
#' feature-level results.}
#' 
#' @param x,object dmDStest object.
#' @param ... Other parameters that can be defined by methods using this 
#'   generic.
#'   
#' @slot design_fit_null Numeric matrix of the design used to fit the null 
#'   model.
#' @slot lik_null Numeric vector of the per gene DM null model likelihoods.
#' @slot lik_null_bb Numeric vector of the per gene BB null model likelihoods.
#' @slot results_gene Data frame with the gene-level results including: 
#'   \code{gene_id} - gene IDs, \code{lr} - likelihood ratio statistics based on
#'   the DM model, \code{df} - degrees of freedom, \code{pvalue} - p-values and 
#'   \code{adj_pvalue} - Benjamini & Hochberg adjusted p-values.
#' @slot results_feature Data frame with the feature-level results including: 
#'   \code{gene_id} - gene IDs, \code{feature_id} - feature IDs, \code{lr} - 
#'   likelihood ratio statistics based on the BB model, \code{df} - degrees of 
#'   freedom, \code{pvalue} - p-values and \code{adj_pvalue} - Benjamini & 
#'   Hochberg adjusted p-values.
#'   
#' @examples 
#' # --------------------------------------------------------------------------
#' # Create dmDSdata object 
#' # --------------------------------------------------------------------------
#' ## Get kallisto transcript counts from the 'PasillaTranscriptExpr' package
#' 
#' library(PasillaTranscriptExpr)
#' 
#' data_dir  <- system.file("extdata", package = "PasillaTranscriptExpr")
#' 
#' ## Load metadata
#' pasilla_metadata <- read.table(file.path(data_dir, "metadata.txt"), 
#' header = TRUE, as.is = TRUE)
#' 
#' ## Load counts
#' pasilla_counts <- read.table(file.path(data_dir, "counts.txt"), 
#' header = TRUE, as.is = TRUE)
#' 
#' ## Create a pasilla_samples data frame
#' pasilla_samples <- data.frame(sample_id = pasilla_metadata$SampleName, 
#'   group = pasilla_metadata$condition)
#' levels(pasilla_samples$group)
#' 
#' ## Create a dmDSdata object
#' d <- dmDSdata(counts = pasilla_counts, samples = pasilla_samples)
#' 
#' ## Use a subset of genes, which is defined in the following file
#' gene_id_subset <- readLines(file.path(data_dir, "gene_id_subset.txt"))
#' 
#' d <- d[names(d) %in% gene_id_subset, ]
#' 
#' # --------------------------------------------------------------------------
#' # Differential transcript usage analysis - simple two group comparison 
#' # --------------------------------------------------------------------------
#' 
#' ## Filtering
#' ## Check what is the minimal number of replicates per condition 
#' table(samples(d)$group)
#' 
#' d <- dmFilter(d, min_samps_gene_expr = 7, min_samps_feature_expr = 3,
#'   min_gene_expr = 10, min_feature_expr = 10)
#' 
#' plotData(d)
#' 
#' ## Create the design matrix
#' design_full <- model.matrix(~ group, data = samples(d))
#' 
#' ## To make the analysis reproducible
#' set.seed(123)
#' ## Calculate precision
#' d <- dmPrecision(d, design = design_full)
#' 
#' plotPrecision(d)
#' 
#' head(mean_expression(d))
#' common_precision(d)
#' head(genewise_precision(d))
#' 
#' ## Fit full model proportions
#' d <- dmFit(d, design = design_full)
#' 
#' ## Get fitted proportions
#' head(proportions(d))
#' ## Get the DM regression coefficients (gene-level) 
#' head(coefficients(d))
#' ## Get the BB regression coefficients (feature-level) 
#' head(coefficients(d), level = "feature")
#' 
#' ## Fit null model proportions and perform the LR test to detect DTU
#' d <- dmTest(d, coef = "groupKD")
#' 
#' ## Plot the gene-level p-values
#' plotPValues(d)
#' 
#' ## Get the gene-level results
#' head(results(d))
#' 
#' ## Plot feature proportions for a top DTU gene
#' res <- results(d)
#' res <- res[order(res$pvalue, decreasing = FALSE), ]
#' 
#' top_gene_id <- res$gene_id[1]
#' 
#' plotProportions(d, gene_id = top_gene_id, group_variable = "group")
#' 
#' plotProportions(d, gene_id = top_gene_id, group_variable = "group", 
#'   plot_type = "lineplot")
#' 
#' plotProportions(d, gene_id = top_gene_id, group_variable = "group", 
#'   plot_type = "ribbonplot")
#' 
#' @author Malgorzata Nowicka
#' @seealso \code{\linkS4class{dmDSdata}}, \code{\linkS4class{dmDSprecision}},
#'   \code{\linkS4class{dmDSfit}}
setClass("dmDStest", 
  contains = "dmDSfit",
  representation(design_fit_null = "matrix",
    lik_null = "numeric",
    lik_null_bb = "numeric",
    results_gene = "data.frame",
    results_feature = "data.frame"))


# ------------------------------------------------------------------------------

setValidity("dmDStest", function(object){
  # Has to return TRUE when valid object!
  
  if(!length(object@lik_null) == length(object@counts))
    return("Different number of genes in 'counts' and 'lik_null'")
  
  if(length(object@lik_null_bb) > 0){
    if(!length(object@lik_null_bb) == nrow(object@counts))
      return("Different number of features in 'counts' and 'lik_null_bb'")
  }
  
  # TODO: Add more checks for results
  
  return(TRUE)
  
})

###############################################################################
### accessing methods
###############################################################################


#' @rdname dmDStest-class
#' @inheritParams dmDSprecision-class
#' @export
setMethod("design", "dmDStest", function(object, type = "null_model"){
  
  stopifnot(type %in% c("precision", "full_model", "null_model"))
  
  if(type == "precision")
    object@design_precision
  else if(type == "full_model")
    object@design_fit_full
  else
    object@design_fit_null
  
})


#' @rdname dmDStest-class
#' @export
setGeneric("results", function(x, ...) standardGeneric("results"))

#' @rdname dmDStest-class
#' @inheritParams dmDSfit-class
#' @export
setMethod("results", "dmDStest", function(x, level = "gene"){
  stopifnot(length(level) == 1)
  stopifnot(level %in% c("gene", "feature"))
  slot(x, paste0("results_", level))
})


# -----------------------------------------------------------------------------

setMethod("show", "dmDStest", function(object){
  
  callNextMethod(object)
  
  cat("  results()\n")
  
})

###############################################################################
### dmTest
###############################################################################

#' Likelihood ratio test to detect differential transcript/exon usage
#' 
#' First, estimate the null Dirichlet-multinomial and beta-binomial model 
#' parameters and likelihoods using the null model design. Second, perform the 
#' gene-level (DM model) and feature-level (BB model) likelihood ratio tests. In
#' the differential exon/transcript usage analysis, the null model is defined by
#' the null design matrix. In the exon/transcript usage QTL analysis, null
#' models are defined by a design with intercept only. Currently, beta-binomial
#' model is implemented only in the differential usage analysis.
#' 
#' @param x \code{\linkS4class{dmDSfit}} or \code{\linkS4class{dmSQTLfit}} 
#'   object.
#' @param ... Other parameters that can be defined by methods using this 
#'   generic.
#' @export
setGeneric("dmTest", function(x, ...) standardGeneric("dmTest"))


# -----------------------------------------------------------------------------


#' @inheritParams dmFit
#' @param coef Integer or character vector indicating which coefficients of the 
#'   linear model are to be tested equal to zero. Values must indicate column 
#'   numbers or column names of the \code{design} used in \code{\link{dmFit}}.
#' @param design Numeric matrix defining the null model.
#' @param contrast Numeric vector or matrix specifying one or more contrasts of 
#'   the linear model coefficients to be tested equal to zero. For a matrix, 
#'   number of rows (for a vector, its length) must equal to the number of 
#'   columns of \code{design} used in \code{\link{dmFit}}.
#'   
#' @details One must specify one of the arguments: \code{coef}, \code{design} or
#'   \code{contrast}.
#'   
#'   When \code{contrast} is used to define the null model, the null design 
#'   matrix is recalculated using the same approach as in 
#'   \code{\link[edgeR]{glmLRT}} function from \code{\link{edgeR}}.
#'   
#' @return Returns a \code{\linkS4class{dmDStest}} or 
#'   \code{\linkS4class{dmSQTLtest}} object.
#' @examples 
#' # --------------------------------------------------------------------------
#' # Create dmDSdata object 
#' # --------------------------------------------------------------------------
#' ## Get kallisto transcript counts from the 'PasillaTranscriptExpr' package
#' 
#' library(PasillaTranscriptExpr)
#' \donttest{
#' data_dir  <- system.file("extdata", package = "PasillaTranscriptExpr")
#' 
#' ## Load metadata
#' pasilla_metadata <- read.table(file.path(data_dir, "metadata.txt"), 
#' header = TRUE, as.is = TRUE)
#' 
#' ## Load counts
#' pasilla_counts <- read.table(file.path(data_dir, "counts.txt"), 
#' header = TRUE, as.is = TRUE)
#' 
#' ## Create a pasilla_samples data frame
#' pasilla_samples <- data.frame(sample_id = pasilla_metadata$SampleName, 
#'   group = pasilla_metadata$condition)
#' levels(pasilla_samples$group)
#' 
#' ## Create a dmDSdata object
#' d <- dmDSdata(counts = pasilla_counts, samples = pasilla_samples)
#' 
#' ## Use a subset of genes, which is defined in the following file
#' gene_id_subset <- readLines(file.path(data_dir, "gene_id_subset.txt"))
#' 
#' d <- d[names(d) %in% gene_id_subset, ]
#' 
#' # --------------------------------------------------------------------------
#' # Differential transcript usage analysis - simple two group comparison 
#' # --------------------------------------------------------------------------
#' 
#' ## Filtering
#' ## Check what is the minimal number of replicates per condition 
#' table(samples(d)$group)
#' 
#' d <- dmFilter(d, min_samps_gene_expr = 7, min_samps_feature_expr = 3,
#'   min_gene_expr = 10, min_feature_expr = 10)
#' 
#' plotData(d)
#' 
#' ## Create the design matrix
#' design_full <- model.matrix(~ group, data = samples(d))
#' 
#' ## To make the analysis reproducible
#' set.seed(123)
#' ## Calculate precision
#' d <- dmPrecision(d, design = design_full)
#' 
#' plotPrecision(d)
#' 
#' head(mean_expression(d))
#' common_precision(d)
#' head(genewise_precision(d))
#' 
#' ## Fit full model proportions
#' d <- dmFit(d, design = design_full)
#' 
#' ## Get fitted proportions
#' head(proportions(d))
#' ## Get the DM regression coefficients (gene-level) 
#' head(coefficients(d))
#' ## Get the BB regression coefficients (feature-level) 
#' head(coefficients(d), level = "feature")
#' 
#' ## Fit null model proportions and perform the LR test to detect DTU
#' d <- dmTest(d, coef = "groupKD")
#' 
#' ## Plot the gene-level p-values
#' plotPValues(d)
#' 
#' ## Get the gene-level results
#' head(results(d))
#' 
#' ## Plot feature proportions for a top DTU gene
#' res <- results(d)
#' res <- res[order(res$pvalue, decreasing = FALSE), ]
#' 
#' top_gene_id <- res$gene_id[1]
#' 
#' plotProportions(d, gene_id = top_gene_id, group_variable = "group")
#' 
#' plotProportions(d, gene_id = top_gene_id, group_variable = "group", 
#'   plot_type = "lineplot")
#' 
#' plotProportions(d, gene_id = top_gene_id, group_variable = "group", 
#'   plot_type = "ribbonplot")
#' }
#' @author Malgorzata Nowicka
#' @seealso \code{\link{plotPValues}} \code{\link[edgeR]{glmLRT}}
#' @references McCarthy, DJ, Chen, Y, Smyth, GK (2012). Differential expression 
#'   analysis of multifactor RNA-Seq experiments with respect to biological 
#'   variation. Nucleic Acids Research 40, 4288-4297.
#' @rdname dmTest
#' @importFrom limma nonEstimable
#' @export
setMethod("dmTest", "dmDSfit", function(x, 
  coef = NULL, design = NULL, contrast = NULL, 
  one_way = TRUE, bb_model = TRUE,
  prop_mode = "constrOptim", prop_tol = 1e-12, 
  coef_mode = "optim", coef_tol = 1e-12,
  verbose = 0, BPPARAM = BiocParallel::SerialParam()){
  
  # Check parameters
  stopifnot(is.logical(one_way))
  
  stopifnot(length(prop_mode) == 1)
  stopifnot(prop_mode %in% c("constrOptim"))
  stopifnot(length(prop_tol) == 1)
  stopifnot(is.numeric(prop_tol) && prop_tol > 0)
  
  stopifnot(length(coef_mode) == 1)
  stopifnot(coef_mode %in% c("optim", "nlminb", "nlm"))
  stopifnot(length(coef_tol) == 1)
  stopifnot(is.numeric(coef_tol) && coef_tol > 0)
  
  stopifnot(verbose %in% 0:2)
  
  if(!sum(!unlist(lapply(list(coef, design, contrast), is.null))) == 1)
    stop(paste0("Only one of the ways to define the null model 'coef', 
      'design' or 'contrast' can be used!"))
  
  # Check coef
  if(!is.null(coef)){
    
    # Check the full model design matrix
    nbeta <- ncol(x@design_fit_full)
    if(nbeta < 2) 
      stop("Need at least two columns for design, usually the first is 
        the intercept column!")
    
    if(length(coef) > 1) 
      coef <- unique(coef)
    
    if(is.numeric(coef)){
      stopifnot(max(coef) <= nbeta)
    }else if(is.character(coef)){
      if(!all(coef %in% colnames(x@design_fit_full))) 
        stop("'coef' does not match the columns of the design matrix!")
      coef <- match(coef, colnames(x@design_fit_full))
    }
    
    # Null design matrix
    design0 <- x@design_fit_full[, -coef, drop = FALSE]
    
  }
  
  # Check design
  if(!is.null(design)){
    
    # Check design as in edgeR
    design <- as.matrix(design)
    stopifnot(nrow(design) == ncol(x@counts))
    
    ne <- limma::nonEstimable(design)
    if(!is.null(ne)) 
      stop(paste("Design matrix not of full rank. 
        The following coefficients not estimable:\n", 
        paste(ne, collapse = " ")))
    
    # Null design matrix
    design0 <- design
    
  }
  
  # Check contrast exactly as in edgeR in glmLRT()
  if(!is.null(contrast)){
    
    design <- x@design_fit_full 
    contrast <- as.matrix(contrast)
    stopifnot(nrow(contrast) == ncol(design))
    
    qrc <- qr(contrast)
    ncontrasts <- qrc$rank
    
    if(ncontrasts == 0) 
      stop("Contrasts are all zero!")
    
    coef <- 1:ncontrasts
    
    nlibs <- nrow(design)
    Dvec <- rep.int(1, nlibs)
    Dvec[coef] <- diag(qrc$qr)[coef]
    Q <- qr.Q(qrc, complete = TRUE, Dvec = Dvec)
    design <- design %*% Q
    
    # Null design matrix
    design0 <- design[, -coef, drop = FALSE]
    
  }
  
  # Fit the DM null model: proportions and likelihoods
  fit0 <- dmDS_fit(counts = x@counts, design = design0, 
    precision = x@genewise_precision,
    one_way = one_way,
    prop_mode = prop_mode, prop_tol = prop_tol, 
    coef_mode = coef_mode, coef_tol = coef_tol, 
    verbose = verbose, BPPARAM = BPPARAM)
  
  # Calculate the DM degrees of freedom for the LR test: df_full - df_null
  df <- (ncol(x@design_fit_full) - ncol(design0)) * 
    (elementNROWS(x@coef_full) - 1)
  
  results_gene <- dm_LRT(lik_full = x@lik_full, 
    lik_null = fit0[["lik"]], df = df, verbose = verbose)
  
  results_gene <- data.frame(gene_id = rownames(results_gene), 
    results_gene, stringsAsFactors = FALSE, row.names = NULL)
  
  
  # Calculate the Beta-Binomial null likelihoods for each feature
  if(bb_model && length(x@lik_full_bb) > 0){
    
    fit0_bb <- bbDS_fit(counts = x@counts, fit = fit0[["fit"]], 
      design = design0, precision = x@genewise_precision,
      one_way = one_way, verbose = verbose, BPPARAM = BPPARAM)
    
    # Calculate the BB degrees of freedom for the LR test
    df <- rep.int(ncol(x@design_fit_full) - ncol(design0), 
      length(x@lik_full_bb))
    
    results_feature <- dm_LRT(lik_full = x@lik_full_bb, 
      lik_null = fit0_bb[["lik"]], df = df, verbose = verbose)
    
    results_feature <- data.frame(
      gene_id = rep.int(names(x@counts), elementNROWS(x@counts)), 
      feature_id = rownames(results_feature), 
      results_feature, stringsAsFactors = FALSE, row.names = NULL)
    
    return(new("dmDStest", 
      results_gene = results_gene, results_feature = results_feature,
      design_fit_null = design0, 
      lik_null = fit0[["lik"]],
      lik_null_bb = fit0_bb[["lik"]],
      design_fit_full = x@design_fit_full, 
      fit_full = x@fit_full, lik_full = x@lik_full, coef_full = x@coef_full,
      lik_full_bb = x@lik_full_bb,  coef_full_bb = x@coef_full_bb,
      mean_expression = x@mean_expression, 
      common_precision = x@common_precision, 
      genewise_precision = x@genewise_precision, 
      design_precision = x@design_precision,
      counts = x@counts, samples = x@samples))
    
  }else{
    
    if(bb_model && length(x@lik_full_bb) == 0)
      message("Beta-Binomial model is not fitted 
        because bb_model=FALSE in dmFit! Rerun dmFit with bb_model=TRUE.")
    
    return(new("dmDStest", 
      results_gene = results_gene,
      design_fit_null = design0, 
      lik_null = fit0[["lik"]],
      design_fit_full = x@design_fit_full, 
      fit_full = x@fit_full, lik_full = x@lik_full, coef_full = x@coef_full,
      lik_full_bb = x@lik_full_bb,  coef_full_bb = x@coef_full_bb,
      mean_expression = x@mean_expression, 
      common_precision = x@common_precision, 
      genewise_precision = x@genewise_precision, 
      design_precision = x@design_precision,
      counts = x@counts, samples = x@samples))
    
  }
  
  
})


###############################################################################
### plotPValues
###############################################################################

#' Plot p-value distribution
#' 
#' @return Plot a histogram of p-values.
#' 
#' @param x \code{\linkS4class{dmDStest}} or \code{\linkS4class{dmSQTLtest}}
#'   object.
#' @export
setGeneric("plotPValues", function(x, ...) standardGeneric("plotPValues"))



# ----------------------------------------------------------------------------




#' @inheritParams results
#' @examples
#' # --------------------------------------------------------------------------
#' # Create dmDSdata object 
#' # --------------------------------------------------------------------------
#' ## Get kallisto transcript counts from the 'PasillaTranscriptExpr' package
#' 
#' library(PasillaTranscriptExpr)
#' \donttest{
#' data_dir  <- system.file("extdata", package = "PasillaTranscriptExpr")
#' 
#' ## Load metadata
#' pasilla_metadata <- read.table(file.path(data_dir, "metadata.txt"), 
#' header = TRUE, as.is = TRUE)
#' 
#' ## Load counts
#' pasilla_counts <- read.table(file.path(data_dir, "counts.txt"), 
#' header = TRUE, as.is = TRUE)
#' 
#' ## Create a pasilla_samples data frame
#' pasilla_samples <- data.frame(sample_id = pasilla_metadata$SampleName, 
#'   group = pasilla_metadata$condition)
#' levels(pasilla_samples$group)
#' 
#' ## Create a dmDSdata object
#' d <- dmDSdata(counts = pasilla_counts, samples = pasilla_samples)
#' 
#' ## Use a subset of genes, which is defined in the following file
#' gene_id_subset <- readLines(file.path(data_dir, "gene_id_subset.txt"))
#' 
#' d <- d[names(d) %in% gene_id_subset, ]
#' 
#' # --------------------------------------------------------------------------
#' # Differential transcript usage analysis - simple two group comparison 
#' # --------------------------------------------------------------------------
#' 
#' ## Filtering
#' ## Check what is the minimal number of replicates per condition 
#' table(samples(d)$group)
#' 
#' d <- dmFilter(d, min_samps_gene_expr = 7, min_samps_feature_expr = 3,
#'   min_gene_expr = 10, min_feature_expr = 10)
#' 
#' plotData(d)
#' 
#' ## Create the design matrix
#' design_full <- model.matrix(~ group, data = samples(d))
#' 
#' ## To make the analysis reproducible
#' set.seed(123)
#' ## Calculate precision
#' d <- dmPrecision(d, design = design_full)
#' 
#' plotPrecision(d)
#' 
#' head(mean_expression(d))
#' common_precision(d)
#' head(genewise_precision(d))
#' 
#' ## Fit full model proportions
#' d <- dmFit(d, design = design_full)
#' 
#' ## Get fitted proportions
#' head(proportions(d))
#' ## Get the DM regression coefficients (gene-level) 
#' head(coefficients(d))
#' ## Get the BB regression coefficients (feature-level) 
#' head(coefficients(d), level = "feature")
#' 
#' ## Fit null model proportions and perform the LR test to detect DTU
#' d <- dmTest(d, coef = "groupKD")
#' 
#' ## Plot the gene-level p-values
#' plotPValues(d)
#' 
#' ## Get the gene-level results
#' head(results(d))
#' 
#' ## Plot feature proportions for a top DTU gene
#' res <- results(d)
#' res <- res[order(res$pvalue, decreasing = FALSE), ]
#' 
#' top_gene_id <- res$gene_id[1]
#' 
#' plotProportions(d, gene_id = top_gene_id, group_variable = "group")
#' 
#' plotProportions(d, gene_id = top_gene_id, group_variable = "group", 
#'   plot_type = "lineplot")
#' 
#' plotProportions(d, gene_id = top_gene_id, group_variable = "group", 
#'   plot_type = "ribbonplot")
#' }
#' @author Malgorzata Nowicka
#' @seealso \code{\link{plotData}}, \code{\link{plotPrecision}},
#'   \code{\link{plotProportions}}
#' @rdname plotPValues
#' @export
setMethod("plotPValues", "dmDStest", function(x, level = "gene"){
  
  stopifnot(length(level) == 1)
  stopifnot(level %in% c("gene", "feature"))
  
  res <- slot(x, paste0("results_", level))
  
  if(nrow(res) > 0)
    ggp <- dm_plotPValues(pvalues = res[, "pvalue"])
  else
    stop("Feature-level results are not available! Set bb_model=TRUE in 
      dmFit and dmTest")
  
  return(ggp)  
  
})
markrobinsonuzh/DRIMSeq documentation built on May 21, 2019, 12:23 p.m.