R/differentialNbinomWaldTestUMI4C.R

Defines functions deseq2UMI4C nbinomWaldTestUMI4C .smoothMonotone smoothMonotoneUMI4C vstUMI4C UMI4C2dds differentialNbinomWaldTestUMI4C

Documented in deseq2UMI4C differentialNbinomWaldTestUMI4C nbinomWaldTestUMI4C .smoothMonotone smoothMonotoneUMI4C UMI4C2dds vstUMI4C

#' Differential UMI4C contacts using DESeq2 Wald Test
#' 
#' Using a \code{UMI4C} object, infers the differences between conditions 
#' specified in \code{design} of the smooth monotone fitted values using a 
#' Wald Test from \code{DESeq2} package. 
#' @param umi4c UMI4C object as generated by \code{makeUMI4C} or the
#' \code{UMI4C} constructor.
#' @param design A \code{formula} or \code{matrix}. The formula expresses how
#' the counts for each fragment end depend on the variables in \code{colData}.
#' See  \code{\link[DESeq2]{DESeqDataSet}}.
#' @param query_regions \code{GRanges} object or \code{data.frame} containing
#' the coordinates of the genomic regions you want to use to perform the
#' analysis in specific genomic intervals. Default: NULL.
#' @param normalized Logical indicating if the function should return normalized
#'  or raw UMI counts.  Default: TRUE.
#' @param padj_method The method to use for adjusting p-values, see
#' \code{\link[stats]{p.adjust}}.  Default: fdr.
#' @param padj_threshold Numeric indicating the adjusted p-value threshold to
#' use to define significant differential contacts.  Default: 0.05.
#' @param alpha Approximate number of fragments desired for every basis function
#'  of the B-spline basis. \code{floor((max(number of fragments)) / alpha)} is 
#'  passed to \code{create.bspline.basis} as nbasis argument. 4 is the minimum
#'  allowed value. Default: 20.
#' @param penalty Amount of smoothing to be applied to the estimated functional 
#' parameter.  Default: 0.1.
#' @return \code{UMI4C} object with the DESeq2 Wald Test results.
#' @details This function fits the signal trend of a variance stabilized count 
#' values using a symmetric monotone fit for the distance dependency. Then 
#' scales the raw counts across the samples to obtain normalized factors. 
#' Finally, it detects differences between conditions applying the DESeq2 Wald 
#' Test.
#' @import GenomicRanges
#' @importFrom stats formula predict
#' @examples
#' 
#'  files <- list.files(system.file("extdata", "CIITA", "count", package="UMI4Cats"),
#'                      pattern = "*_counts.tsv.gz",
#'                      full.names = TRUE
#'  )
#' # Create colData including all relevant information
#' colData <- data.frame(
#'   sampleID = gsub("_counts.tsv.gz", "", basename(files)),
#'   file = files,
#'   stringsAsFactors = FALSE
#' )
#' 
#' library(tidyr)
#' colData <- colData %>%
#'   separate(sampleID,
#'            into = c("condition", "replicate", "viewpoint"),
#'            remove = FALSE
#'   )
#'   
#' # Make UMI-4C object including grouping by condition
#' umi <- makeUMI4C(
#'   colData = colData,
#'   viewpoint_name = "CIITA",
#'   grouping = NULL,
#'   bait_expansion = 2e6
#' )
#' 
#' umi_wald <- differentialNbinomWaldTestUMI4C(umi4c=umi,
#'                                             design=~condition,
#'                                             alpha = 100)
#' @export
differentialNbinomWaldTestUMI4C <- function(umi4c,
                                            design=~condition,
                                            normalized=TRUE,
                                            padj_method="fdr",
                                            query_regions=NULL,
                                            padj_threshold=0.05,
                                            penalty=0.1,
                                            alpha=20){
  # umi4c object to DDS object
  dds <- UMI4C2dds(umi4c=umi4c,
                   design=~condition,
                   query_regions=query_regions)
  
  # variance stabilizing transformation
  dds <- vstUMI4C(dds=dds)
  
  # monotone smoothing of the DDS object VST counts
  dds <- smoothMonotoneUMI4C(dds=dds,
                             alpha=alpha,
                             penalty=penalty)
  
  # differential contacts using DESeq2 Wald Test
  dds <- nbinomWaldTestUMI4C(dds=dds)
  
  # DDS object to umi4c object
  umi4c <- deseq2UMI4C(umi4c=umi4c,
                       dds=dds,
                       design=~condition,
                       normalized=normalized,
                       padj_method=padj_method,
                       padj_threshold=padj_threshold)
  
  return(umi4c)
}
#' UMI4Cats object to DDS object.
#' 
#' Transforms an UMI4C object to a DDS object
#' @param ... Other arguments to be passed to \code{\link[DESeq2]{DESeq}}
#' function.
#' @return DDS object.
#' @inheritParams differentialNbinomWaldTestUMI4C
#' @import GenomicRanges
UMI4C2dds <- function(umi4c,
                      design = ~condition,
                      query_regions=NULL,
                      ...){
  
  stopifnot(is(umi4c, "UMI4C"))
  
  # transform UMI4Cats object to DDS
  dds <- DESeq2::DESeqDataSetFromMatrix(
    countData = assays(umi4c)$umi,
    colData = colData(umi4c),
    rowRanges = rowRanges(umi4c),
    metadata = metadata(umi4c),
    design = ~ condition)
  
  # subset DDS taking in account query regions
    if (!is.null(query_regions)) {
      dds <- subsetByOverlaps(dds, query_regions)
      query_regions <- rowRanges(dds)
      colnames(mcols(query_regions))[1] <- "id"
    } else {
      query_regions <- rowRanges(umi4c)
      colnames(mcols(query_regions))[1] <- "id"
    }
  
  rowRanges(dds) <- query_regions
    
    return(dds)
}
#' Variance stabilizing transformation
#' 
#' Using a DDS object performs a variance stabilizing transformation from DESeq2 
#' package to the UMI4C counts
#' @param dds DDS object generated by \code{UMI4C2dds} 
#' @return DDS object with variance stabilizing transformation counts
#' @details This function estimate the size factors and dispersions of the counts
#' base on \code{\link[DESeq2]{DESeq}} for infering the VST distribution and
#' transform raw UMI4C counts.
vstUMI4C <- function(dds){
  if (!c("counts") %in% names(assays(dds))) 
    stop("No assay 'counts' found. Check your input.")
  
  message(paste(
    paste0("\n[", Sys.time(), "]"),
    "Starting vstUMI4C\n",
    "> Samples of DDS object:\n", paste(colnames(dds), sep="", collapse=", "), "\n"
  ))
  # learn the dispersion function of a dataset
  dds <- DESeq2::estimateSizeFactors(dds)
  #sizeFactors(dds)
  dds <- DESeq2::estimateDispersions(dds)
  
  # extract dispersion function
  if (attr(DESeq2::dispersionFunction(dds), "fitType") != "parametric") {
    stop("Failed to estimate the parameters of the Variance stabilizing transformation.
         Try increasing bait_expansion to include more fragments.")
  }
  coefs <- attr(DESeq2::dispersionFunction(dds), "coefficients")
  attr(dds, "vst") <- function(q) {
    log((1 + coefs["extraPois"] + 2 * coefs["asymptDisp"] * 
           q + 2 * sqrt(coefs["asymptDisp"] * q * (1 + coefs["extraPois"] + 
                                                     coefs["asymptDisp"] * q)))/(4 * coefs["asymptDisp"]))/log(2)
  }
  # transform counts using vst
  trafo <- attr(dds, "vst")(counts(dds))
  assays(dds)[['trafo']] <- trafo
  
  message(
    paste0("[", Sys.time(), "] "),
    "Finished vstUMI4C"
  )
  
  return(dds)
}
#' Monotone smoothing of the DDS object VST counts
#' 
#' Takes the variance stabilized count values and calculates a symmetric
#' monotone fit for the distance dependency. The signal trend is fitted using 
#' the \href{https://CRAN.R-project.org/package=fda}{fda} package. The position 
#' information about the viewpoint have to be stored in the metadata as 
#' \code{metadata(dds)[['bait']]}.
#' @param dds DDS object as generated by \code{vstUMI4C} with the 
#' variance stabilized count values
#' @return DDS object with monotone smoothed fit counts. 
#' @details This function computes the smoothing function for the VST values, based on
#' \href{https://CRAN.R-project.org/package=fda}{fda} package, and calculates a symmetric monotone fit counts for the distance dependency
#' @inheritParams differentialNbinomWaldTestUMI4C
smoothMonotoneUMI4C <- function(dds,
                                alpha=20,
                                penalty=0.1){
  if (!c("trafo") %in% names(assays(dds))) 
    stop("No assay 'trafo' found. Check your input.")
  
  if (length(metadata(dds)[['bait']]) != 1) 
    stop("None or more than one viewpoint are contained in the dds object. Check your input.")
  
  message(paste(
    paste0("\n[", Sys.time(), "]"),
    "Starting smoothMonotoneUMI4C using:\n",
    "> Samples of DDS object:\n", paste(colnames(dds), sep="", collapse=", "), "\n",
    "> Alpha:\n", alpha,"\n",
    "> Penalty:\n", penalty,"\n"
  ))
  
  # calculate mid of the viewpoint
  frag_viewpoint <- metadata(dds)[['bait']]
  frag_viewpoint$mid = start(frag_viewpoint) + (end(frag_viewpoint) - start(frag_viewpoint))%/%2
  
  # calculate mid of frag coord
  frag_data <- rowRanges(dds)
  mcols(frag_data, level="within")$mid <- start(frag_data) + (end(frag_data) - start(frag_data))%/%2
  mcols(frag_data, level="within")$dist <- mid(frag_data) - mid(frag_viewpoint)
  mcols(frag_data, level="within")$log_dist <- log(abs(mcols(frag_data)[['dist']]))
  frag_data = as.data.frame(frag_data)
  
  # calculate smooth monotone fit and fit counts
  fit <- apply(assays(dds)[['trafo']], 2, .smoothMonotone, alpha,
               penalty,frag_data)
  
  fit <- as.data.frame(fit)
  
  row.names(fit) <- unlist(dimnames(dds)[1])
  
  assays(dds)[['fit']] <- fit
  
  message(
    paste0("[", Sys.time(), "] "),
    "Finished smoothMonotoneUMI4C"
  )
  return(dds)
  
}
#' Monotone smoothing of the VST counts
#' 
#' Takes the variance stabilized count values and calculates a symmetric 
#' monotone fit for the distance dependency. The signal trend is fitted using 
#' the \href{https://CRAN.R-project.org/package=fda}{fda} package.
#' @param trafo_counts Variance stabilized count values assay from DDS object.
#' @param frag_data Data frame with all the information on restriction fragments
#'  and the interval around the viewpoint.
#' @return A dataframe with monotone smoothed fit counts. 
#' @details This function computes the smoothing function for the VST values, 
#' based on \href{https://CRAN.R-project.org/package=fda}{fda} package, and 
#' calculates a symmetric monotone fit counts for the distance dependency
#' @inheritParams differentialNbinomWaldTestUMI4C
.smoothMonotone <- function(trafo_counts,
                            alpha=20,
                            penalty=0.1,
                            frag_data){
 
  basisobj <- fda::create.bspline.basis(rangeval=c(min(frag_data$log_dist),max(frag_data$log_dist)),
                                        nbasis = floor(max(nrow(frag_data)/alpha)))
  
  fdParobj <- fda::fdPar(basisobj, 2, penalty)
  
  mono <- fda::smooth.monotone(argvals = frag_data$log_dist,
                         y = trafo_counts, 
                         WfdParobj = fdParobj)
  
  fit <- predict(mono, frag_data$log_dist)
}
#' Differential UMI4C contacts using DESeq2 Wald Test
#' 
#' Takes the smooth monotone fit count values and infers the differential UMI4C 
#' contacts using DESeq2 Wald Test from 
#' \code{DESeq2} package. 
#' @param dds DDS object as generated by \code{smoothMonotoneUMI4C} with the 
#' smooth monotone fit counts
#' @return DDS object with the DESeq2 Wald Test results, 
#' with results columns accessible with the \code{results} function.
#' @details This function back-transform fitted values to the
#' scale of raw counts and scale them across the samples to
#' obtin normalizaed factors unsing \code{normalizationFactors} from 
#' \code{DESeq2} package. To detect differences between conditions, the DESeq2 
#' Wald Test is applied to fitted counts with the normalization factors. 
nbinomWaldTestUMI4C <- function(dds){
  if (!c("fit") %in% names(assays(dds))) 
    stop("No assay 'fit' found. Check your input.")
  
  message(paste(
    paste0("\n[", Sys.time(), "]"),
    "Starting nbinomWaldTestUMI4C using:\n",
    "> Samples of DDS object:\n", paste(colnames(dds), sep="", collapse=", "), "\n"
  ))
  
  # define inverse vst function
  coefs <- attr(DESeq2::dispersionFunction(dds), "coefficients")
  attr(dds, "inverse-vst") <- function(q) {
    (4 * coefs["asymptDisp"] * 2^q - (1 + coefs["extraPois"]))^2/(4 * 
                                                                    coefs["asymptDisp"] * (1 + coefs["extraPois"] +                                                                                          (4 * coefs["asymptDisp"] * 2^q - (1 + coefs["extraPois"]))))
  }
  
  # backtransform fitted counts
  fit <-  attr(dds, "inverse-vst")(assays(dds)[["fit"]])
  # get normalization factors
  loggeomeans <- rowMeans(log(fit))
  normFactors <- exp(log(fit) - loggeomeans)
  DESeq2::normalizationFactors(dds) <- as.matrix(normFactors)
  # wald test
  design(dds) <- formula(~condition)
  dds <- try(estimateDispersions(dds))
  dds <- DESeq2::nbinomWaldTest(dds)
  message(
    paste0("[", Sys.time(), "] "),
    "Finished sample nbinomWaldTestUMI4C"
  )
  
  return(dds)
}
#' DDS object to UMI4Cats object.
#' 
#' Transforms an DDS object to a UMI4C object after applying 
#' \code{nbinomWaldTestUMI4C}.
#' @inheritParams differentialNbinomWaldTestUMI4C
#' @param dds DDS object as generated by \code{nbinomWaldTestUMI4C} 
#' with the DESeq2 Wald Test results
#' @return UMI4C object with the DESeq2 Wald Test results.
deseq2UMI4C <- function(umi4c,
                        dds,
                        design = ~condition,
                        normalized = TRUE,
                        padj_method = "fdr",
                        padj_threshold = 0.05) {
  
  res <- DESeq2::results(dds,
                         pAdjustMethod = padj_method
  )
  
  res <- data.frame(res[, c(5, 2, 6)])
  res$query_id <- rownames(res)
  res$sign <- FALSE
  res$sign[res$padj <= padj_threshold] <- TRUE
  
  counts <- as.data.frame(counts(dds, normalized = normalized))
  counts$query_id <- rownames(counts)
  counts <- counts[, c(ncol(counts), seq_len(ncol(counts) - 1))]
  
  umi4c@results <- S4Vectors::SimpleList(
    test = "DESeq2 Test based on the Negative Binomial distribution",
    ref = DESeq2::resultsNames(dds)[2],
    padj_threshold = padj_threshold,
    results = res[, c(4, 1, 2, 3, 5)],
    query = rowRanges(dds),
    counts = counts
  )
  
  return(umi4c)
}

Try the UMI4Cats package in your browser

Any scripts or data that you put into this service are public.

UMI4Cats documentation built on Dec. 31, 2020, 2:01 a.m.