R/CTSSs.R
In CAGEfightR: Analysis of Cap Analysis of Gene Expression (CAGE) data using Bioconductor

#### Mappers ####

bg_mapper <- function(file, seqinfo, strand = "*"){
    # Import
    o <- import.bedGraph(file, genome=seqinfo)

    # Instead of conversion, stop if not proper CTSSs-like
    stopifnot(all(width(o) == 1))
    # o <- as(o, "StitchedGPos")
    # score(o) <- as.integer(score(o))

    # Set strand
    strand(o) <- strand

    # Return
    o
}

bw_mapper <- function(range, file, seqinfo, strand = "*"){
  # Prune levels
  pruned <- range
  seqlevels(pruned, pruning.mode = "coarse") <- seqlevels(file)

  # Check if there is nothing to import
  if(length(pruned) == 0){
    o <- GRanges(seqinfo=seqinfo)
    score(o) <- numeric()
  }else{
    # Import
    o <- import.bw(con=file,
                   format="bigwig",
                   which=pruned,
                   as="GRanges")

    # Instead of conversion, stop if not proper CTSSs-like
    stopifnot(all(width(o) == 1))

    # Force seqlevels and strand
    seqlevels(o) <- seqlevels(seqinfo)
    seqinfo(o) <- seqinfo
    strand(o) <- strand
  }

  # Return
  o
}

bw_mapper2 <- function(range, file, seqinfo, strand = "*"){
    # Import
    o <- suppressWarnings(import.bw(con=file,
                                    format="bigwig",
                                    which=range,
                                    as="GRanges"))

    # Instead of conversion, stop if not proper CTSSs-like
    stopifnot(all(width(o) == 1))
    # o <- as(o, "StitchedGPos")
    # score(o) <- as.integer(score(o)) # Not used by dgC anyway

    # Force seqlevels and strand
    seqlevels(o) <- seqlevels(seqinfo)
    seqinfo(o) <- seqinfo
    strand(o) <- strand

    # Return
    o
}

#### Reducers ####

gf_reducer2 <- function(mapped){
    # Make table of j = sample indicies
    o <- unlist(GRangesList(mapped), use.names=FALSE)
    o$j <- rep(seq_along(mapped), elementNROWS(mapped))

    # Find unique sites
    unique_pos <- granges(o)
    unique_pos <- unique(unique_pos)
    unique_pos <- sort(unique_pos)

    # Overlap to get i = position indices
    o$i <- match(o, unique_pos)

    # Determine the shape of the sparse matrix
    mat_dim <- c(length(unique_pos), length(mapped))
    mat_names <- list(NULL, names(mapped))

    # Build sparse matrix
    if (length(o) > 0) {
      o <- Matrix::sparseMatrix(i = o$i,
                                     j = o$j,
                                     x = score(o),
                                     dims = mat_dim,
                                     dimnames = mat_names)
    } else {
      # Simply create an empty matrix
      o <- Matrix::sparseMatrix(i = integer(0),
                                j = integer(0),
                                x = integer(0),
                                dims = c(0, mat_dim[2]),
                                dimnames = mat_names)
    }

    # # Simplify so sparse matrix encoding
    # indices <- as.matrix(mcols(indices)[,c("i", "j", "score")])
    # rle_dim <- c(length(unique_pos), length(mapped))
    # o <- indices[,1:2]
    # o <- SparseArraySeed(dim=rle_dim,
    #                      aind=o,
    #                      nzdata=indices[,3])
    #
    # # To DataFrame of Rles via RleMatrix
    # # o <- as(o, "RleMatrix")
    # o <- DelayedArray(o)
    # o <- as(o, "DataFrame")

    # Build RSE
    o <- SummarizedExperiment(assays=list(counts=o),
                              rowRanges=unique_pos)
    colnames(o) <- names(mapped)

    # Return
    o
}

gf_wrapper2 <- function(files, ranges, seqinfo, strand) {
    # Run GenomicFiles
    o <- GenomicFiles::reduceByRange(ranges = ranges,
                                     files = files,
                                     MAP = pryr::partial(bw_mapper,
                                                         seqinfo = seqinfo,
                                                         strand = strand),
                                     REDUCE = gf_reducer2,
                                     iterate = FALSE)

    # Merge output
    #message("Attempting to bind chunks...")
    o <- do.call(rbind, o)

    # Return
    o
}

#### Other helpers ####

setGeneric("cleanDesign", function(design, SampleIDs) {
    standardGeneric("cleanDesign")
})

setMethod("cleanDesign", signature(design = "NULL"), function(design, SampleIDs) {
    DataFrame(row.names = SampleIDs)
})

setMethod("cleanDesign", signature(design = "DataFrame"), function(design, SampleIDs) {
    assert_that(nrow(design) == length(SampleIDs),
                all(rownames(design) == SampleIDs))
    design
})

setMethod("cleanDesign", signature(design = "data.frame"), function(design, SampleIDs) {
    assert_that(nrow(design) == length(SampleIDs),
                all(rownames(design) == SampleIDs))
    methods::as(design, "DataFrame")
})

setMethod("cleanDesign", signature(design = "ANY"), function(design, SampleIDs) {
    stop("design must either NULL or a DataFrame-/data.frame-object!")
})

# pseudoSparsity <- function(x){
#     stopifnot(methods::is(x, "DataFrame"))
#
#     # Find non-zero
#     o <- vapply(X=x,
#                 FUN=function(x) sum(x == 0),
#                 FUN.VALUE=integer(1))
#
#     # Calc sparsity
#     o <- sum(o) / (ncol(x) * nrow(x))
#
#     # Return
#     o
# }

# clean_design <- function(ids, design=NULL){
#     # Decide what to
#     if (is.null(design)) {
#         design <- DataFrame(row.names = ids)
#     } else if (methods::is(design, "DataFrame")) {
#         assert_that(all(rownames(design) == ids))
#     } else if (is.data.frame(design)) {
#         assert_that(all(rownames(design) == ids))
#         design <- methods::as(design, "DataFrame")
#     } else {
#         stop("design must either NULL or a DataFrame-/data.frame-object!")
#     }
#
#     # Return
# }

#### Main function ####

#' Quantify CAGE Transcriptions Start Sites (CTSSs)
#'
#' This function reads in CTSS count data from a series of BigWig-files (or bedGraph-files) and
#' returns a CTSS-by-library count matrix. For efficient processing, the count
#' matrix is stored as a sparse matrix (dgCMatrix from the Matrix package), and CTSSs are compressed to a GPos object if possible.
#'
#' @param plusStrand BigWigFileList or character: BigWig/bedGraph files with plus-strand CTSS data.
#' @param minusStrand BigWigFileList or character: BigWig/bedGraph files with minus-strand CTSS data.
#' @param design DataFrame or data.frame: Additional information on samples which will be added to the ouput
#' @param genome Seqinfo: Genome information. If NULL the smallest common genome
#'   will be found using bwCommonGenome when BigWig-files are analyzed.
#' @param nTiles integer: Number of genomic tiles to parallelize over.
#' @param ... additional arguments passed to methods.
#'
#' @return RangedSummarizedExperiment, where assay is a sparse matrix
#'   (dgCMatrix) of CTSS counts and design stored in colData.
#' @family Quantification functions
#' @importClassesFrom Matrix dgCMatrix
#' @export
#' @examples
#' \dontrun{
#' # Load the example data
#' data('exampleDesign')
#' # Use the BigWig-files included with the package:
#' bw_plus <- system.file('extdata', exampleDesign$BigWigPlus,
#'                        package = 'CAGEfightR')
#' bw_minus <- system.file('extdata', exampleDesign$BigWigMinus,
#'                         package = 'CAGEfightR')
#'
#' # Create two named BigWigFileList-objects:
#' bw_plus <- BigWigFileList(bw_plus)
#' bw_minus <- BigWigFileList(bw_minus)
#' names(bw_plus) <- exampleDesign$Name
#' names(bw_minus) <- exampleDesign$Name
#'
#' # Quantify CTSSs, by default this will use the smallest common genome:
#' CTSSs <- quantifyCTSSs(plusStrand=bw_plus,
#'                        minusStrand=bw_minus,
#'                        design=exampleDesign)
#'
#' # Alternatively, a genome can be specified:
#' si <- seqinfo(bw_plus[[1]])
#' si <- si['chr18']
#' CTSSs_subset <- quantifyCTSSs(plusStrand=bw_plus,
#'                        minusStrand=bw_minus,
#'                        design=exampleDesign,
#'                        genome=si)
#'
#' # Quantification can be speed up by using multiple cores:
#' library(BiocParallel)
#' register(MulticoreParam(workers=3))
#' CTSSs_subset <- quantifyCTSSs(plusStrand=bw_plus,
#'                        minusStrand=bw_minus,
#'                        design=exampleDesign,
#'                        genome=si)
#'
#' # CAGEfightR also support bedGraph files, first BigWig is converted
#' bg_plus <- replicate(n=length(bw_plus), tempfile(fileext="_plus.bedGraph"))
#' bg_minus <- replicate(n=length(bw_minus), tempfile(fileext="_minus.bedGraph"))
#' names(bg_plus) <- names(bw_plus)
#' names(bg_minus) <- names(bw_minus)
#'
#' convertBigWig2BedGraph(input=sapply(bw_plus, resource), output=bg_plus)
#' convertBigWig2BedGraph(input=sapply(bw_minus, resource), output=bg_minus)
#'
#' # Then analyze: Note a genome MUST be supplied here!
#' si <- bwCommonGenome(bw_plus, bw_minus)
#' CTSSs_via_bg <- quantifyCTSSs(plusStrand=bg_plus,
#'                         minusStrand=bg_minus,
#'                         design=exampleDesign,
#'                         genome=si)
#'
#' # Confirm that the two approaches yield the same results
#' all(assay(CTSSs_via_bg) == assay(CTSSs))
#' }
setGeneric("quantifyCTSSs", function(plusStrand, minusStrand, design=NULL, genome=NULL, ...) {
    standardGeneric("quantifyCTSSs")
})

#' @rdname quantifyCTSSs
#' @export
setMethod("quantifyCTSSs",
          signature(plusStrand = "BigWigFileList",
                    minusStrand = "BigWigFileList"),
  function(plusStrand, minusStrand, design=NULL, genome=NULL, nTiles=1L) {
      # Pre-checks
      assert_that(length(plusStrand) == length(minusStrand),
                  bwValid(plusStrand),
                  bwValid(minusStrand),
                  !is.null(names(plusStrand)),
                  !is.null(names(minusStrand)),
                  all(names(plusStrand) == names(minusStrand)),
                  is.count(nTiles))

      # Set design
      message("Checking design...")
      design <- cleanDesign(design, SampleIDs=names(plusStrand))

      # Aquire seqinfo if missing.
      if (is.null(genome)) {
          message("Finding common genome...")
          genome <- bwCommonGenome(plusStrand, minusStrand, method = "intersect")
      } else if (methods::is(genome, "Seqinfo")) {
          message("Checking supplied genome compatibility...")
          bwGenomeCompatibility(plusStrand, minusStrand, genome)
      } else {
          stop("genome must either NULL or a Seqinfo-object!")
      }

      # Setup tiles and workers
      grl <- tileGenome(genome, ntile=nTiles)
      n_workers <- BiocParallel::bpworkers()
      n_workers <- ifelse(is.numeric(n_workers), n_workers, 1)

      message("Iterating over ", length(grl),
              " genomic tiles in ", length(plusStrand),
              " samples using ", n_workers, " worker(s)...")

      # Load data
      message("Importing CTSSs from plus strand...")
      plus_strand <- gf_wrapper2(files = plusStrand,
                                 ranges = grl,
                                 seqinfo = genome,
                                 strand = "+")

      message("Importing CTSSs from minus strand...")
      minus_strand <- gf_wrapper2(files = minusStrand,
                                  ranges = grl,
                                  seqinfo = genome,
                                  strand = "-")

      # Merge and format
      message("Merging strands...")
      o <- rbind(plus_strand, minus_strand)
      rm(plus_strand, minus_strand)

      # Attach design
      colData(o) <- design

      message("Formatting output...")
      # Calculate sparsity before seed
      zero_frac <- 1 - (Matrix::nnzero(assay(o)) / length(assay(o)))

      # Wrap in DelayedArray
      #assay(o) <- DelayedArray(assay(o))

      # Convert to GPos if possible:
      if(length(o) < .Machine$integer.max){
        rowRanges(o) <- methods::as(rowRanges(o), "StitchedGPos")
      }

      # Attach design

      # Sparsity
      #  Convert to appropriate seed
      # message("Formatting to specified seed...")
      # if(seed == "DataFrame"){
      #   assays(o) <- List(counts=DelayedArray(assay(o)))
      # }else if(seed == "dgCMatrix"){
      #   assays(o) <- List(counts=DelayedArray(assay(o)))
      #   assay(o) <- DelayedArray(as(assay(o), "dgCMatrix"))
      # }else if(seed == "RleMatrix"){
      #   assays(o) <- List(counts=RleArray(assay(o))) # Chunk by column
      # }else if(seed == "RleMatrix2"){
      #   assays(o) <- List(counts=as(assay(o), "RleMatrix")) # High compression
      # }else if(seed == "SparseArraySeed"){
      #   assays(o) <- List(counts=DelayedArray(assay(o)))
      #   assay(o) <- DelayedArray(as(assay(o), "SparseArraySeed"))
      # }else{
      #   warning("Did not wrap seed in DelayedArray!")
      # }

      # Calculate sparsity
      #pseudo_sparsity <- pseudoSparsity(assay(o))

      # Final conversion
      #assays(o) <- List(counts=assayCompression(assay(o, "counts")))

      # # Calculate sparsity
      # sparse_percent <- vapply(X=assay(o),
      #                          FUN=function(x) sum(x == 0),
      #                          FUN.VALUE=integer(1))
      # sparse_percent <- sum(sparse_percent) / (ncol(o) * nrow(o))

      # Final conversion
      # if(assayStorage == "RleMatrix"){
      #     assays(o) <- List(counts=RleArray(assay(o, "counts")))
      # }else if(assayStorage == "DelayedArray") {
      #     assays(o) <- List(counts=DelayedArray(assay(o, "counts")))
      # }else if(assayStorage == "DataFrame") {
      #     warning("Output is a simple DataFrame of Rle's: ",
      #     "This is mainly intended for debugging and will not work with remaining CAGEfightR functions!")
      # }else{
      #     stop("assayStorage must be in: RleMatrix, DelayedArray, DataFrame")
      # }

      # Post warnings
      if(!length(plusStrand) == ncol(o)){
        warning("Output has the wrong dimensions! ",
                "Check input files with checkCTSSs.")
      }

      if(!identical(seqinfo(o), genome)){
        warning("Seqinfo was not properly set on output! ",
                "Check input files with checkCTSSs")
      }

      # Basic Summary
      message("### CTSS summary ###")
      message("Number of samples: ", ncol(o))
      message("Number of CTSSs: ", format(nrow(o) / 1e6L, digits = 4), " millions")
      message("Sparsity: ", format(zero_frac * 100, digits=4), " %")
      #message("Type of DelayedArray seed: dgCMatrix")
      message("Type of rowRanges: ", class(rowRanges(o)))
      message("Final object size: ", utils::capture.output(pryr::object_size(o)))

      # Return
      o
})

#' @rdname quantifyCTSSs
#' @export
setMethod("quantifyCTSSs",
          signature(plusStrand = "character",
                    minusStrand = "character"),
          function(plusStrand, minusStrand, design=NULL, genome=NULL) {
              # Pre-checks
              assert_that(all(checkFileSeries(plusStrand)),
                          all(checkFileSeries(minusStrand)),
                          all(checkExtensions(plusStrand, "bedGraph")),
                          all(checkExtensions(minusStrand, "bedGraph")),
                          length(plusStrand) == length(minusStrand),
                          !is.null(names(plusStrand)),
                          !is.null(names(minusStrand)),
                          all(names(plusStrand) == names(minusStrand)),
                          methods::is(genome, "Seqinfo"))

              # Set design
              message("Checking design...")
              design <- cleanDesign(design, SampleIDs=names(plusStrand))

              # Setup tiles and workers
              n_workers <- BiocParallel::bpworkers()
              n_workers <- ifelse(is.numeric(n_workers), n_workers, 1)

              message("Iterating over ", length(plusStrand),
                      " samples using ", n_workers, " worker(s)...")

              # Load data
              message("Importing CTSSs from plus strand...")
              plus_strand <- BiocParallel::bplapply(plusStrand,
                                                    bg_mapper,
                                                    seqinfo=genome,
                                                    strand="+")
              plus_strand <- gf_reducer2(plus_strand)

              message("Importing CTSSs from minus strand...")
              minus_strand <- BiocParallel::bplapply(minusStrand,
                                                    bg_mapper,
                                                    seqinfo=genome,
                                                    strand="-")
              minus_strand <- gf_reducer2(minus_strand)

              # Merge and format
              message("Merging strands...")
              o <- rbind(plus_strand, minus_strand)
              rm(plus_strand, minus_strand)

              # Attach design
              colData(o) <- design

              message("Formatting output...")
              # Calculate sparsity before seed
              zero_frac <- 1 - (Matrix::nnzero(assay(o)) / length(assay(o)))

              # Wrap in DelayedArray
              #assay(o) <- DelayedArray(assay(o))

              # Convert to GPos if possible:
              if(length(o) < .Machine$integer.max){
                rowRanges(o) <- methods::as(rowRanges(o), "StitchedGPos")
              }

              # Basic Summary
              message("### CTSS summary ###")
              message("Number of samples: ", ncol(o))
              message("Number of CTSSs: ", format(nrow(o) / 1e6L, digits = 4), " millions")
              message("Sparsity: ", format(zero_frac * 100, digits=4), " %")
              #message("Type of DelayedArray seed: dgCMatrix")
              message("Type of rowRanges: ", class(rowRanges(o)))
              message("Final object size: ", utils::capture.output(pryr::object_size(o)))

              # Return
              o
})
Any scripts or data that you put into this service are public.
CAGEfightR documentation built on Nov. 8, 2020, 5:42 p.m.
rdrr.io home R language documentation Run R code online
CRAN packages Bioconductor packages R-Forge packages GitHub packages
Note that we can't provide technical support on individual packages. You should contact the package authors for that.
CAGEfightR
Analysis of Cap Analysis of Gene Expression (CAGE) data using Bioconductor

R/CTSSs.R
In CAGEfightR: Analysis of Cap Analysis of Gene Expression (CAGE) data using Bioconductor

Defines functions gf_wrapper2 gf_reducer2 bw_mapper2 bw_mapper bg_mapper

Try the CAGEfightR package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

CAGEfightR Analysis of Cap Analysis of Gene Expression (CAGE) data using Bioconductor

R/CTSSs.R In CAGEfightR: Analysis of Cap Analysis of Gene Expression (CAGE) data using Bioconductor

Defines functions gf_wrapper2 gf_reducer2 bw_mapper2 bw_mapper bg_mapper

Try the CAGEfightR package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

CAGEfightR
Analysis of Cap Analysis of Gene Expression (CAGE) data using Bioconductor

R/CTSSs.R
In CAGEfightR: Analysis of Cap Analysis of Gene Expression (CAGE) data using Bioconductor