# Run the BiocStyle markdown set-up for this document
BiocStyle::markdown()
# Fetch the current version of the SE diagram and store it locally as SE.svg
download.file(
  url = "https://docs.google.com/feeds/download/drawings/Export?id=1kiC8Qlo1mhSnLDqkGiRNPSo6GWn3C2duBszCFbJCB-g&exportFormat=svg",
  destfile = "SE.svg"
)

Summarized Experiment

TODO: A modified version of the above figure with sparseAssays added?

Background

Motivating use case

Parts of a SparseSummarizedExperiment

sparseAssays

SimpleListSparseAssays

Putting it together

Often, SparseSummarizedExperiment objects are returned by functions from other packages. However, it is also possible to create a SparseSummarizedExperiment 'by hand'.

# TODO: Do I want to demonstrate how to construct an SSE 'by hand' using
#       simulated or 'real' data? Simulated is clunky, real increases package 
#       size.
# 
# NOTE: Need S4Vectors for DataFrame(), GenomicRanges for GRanges(),
#       SummarizedExperiment for SummarizedExperiment(),
#       SparseSummarizedExperiment for coercion methods.
library(S4Vectors)
library(GenomicRanges)
library(SummarizedExperiment)
library(SparseSummarizedExperiment)

#' Simulate data as a SimpleListSparseAssays object
#'
#' Each sample is simulated as an `m` x `p` integer matrix that starts out
#' entirely `NA`; `floor(d * m)` distinct rows are then filled with
#' Poisson(lambda = 10) draws. Every sample is wrapped with `SparseAssays()`,
#' given the sample name "s<i>" and the sparse assay name "sa1", and the
#' per-sample objects are finally combined with `cbind()`.
#'
#' @param m Integer scalar giving the number of features.
#' @param n Integer scalar giving the number of samples.
#' @param d Numeric scalar in [0, 1] giving the proportion of non-missing data.
#' @param p An integer scalar giving the ncol of the sparse assay.
#'
#' @return A SimpleListSparseAssays object
simSLSA <- function(m, n, d, p) {
  # Number of rows per sample that receive non-missing data
  n_observed <- floor(d * m)
  v <- replicate(n, {
    val <- matrix(NA_integer_, ncol = p, nrow = m)
    # Sample rows WITHOUT replacement: duplicated row indices would overwrite
    # each other and leave fewer than floor(d * m) non-missing rows, so the
    # realised proportion of non-missing data would fall short of `d`.
    i <- sample(m, n_observed, replace = FALSE)
    val[i, ] <- rpois(n_observed * p, lambda = 10)
    val
  }, simplify = FALSE)
  # Add sample names. Map() always returns a plain list, so the result does
  # not depend on how mapply() would try to simplify the returned objects.
  v <- Map(function(vv, i) {
    tmp <- SparseAssays(vv)
    names(tmp[[1]]) <- paste0("s", i)
    tmp
  }, v, seq_along(v))
  # Add sparse assay name
  v <- lapply(v, function(vv) {
    names(vv) <- "sa1"
    vv
  })
  # cbind (don't need to combine because know each element of v is a single
  # sample and has same dimensions by construction)
  do.call(cbind, v)
}

#' Simulate data as a GRanges object
#'
#' Every range is placed on a chromosome drawn from "chr1" to "chr22", has a
#' width of 100, a start drawn from `floor(runif(m, 1, 1e6))`, and a strand
#' drawn from "+" / "-". A `feature_id` column ("f1", "f2", ...) is attached.
#'
#' @param m Integer scalar giving the number of genomic ranges.
#'
#' @return A GRanges object.
simGR <- function(m) {
  # Deterministic pieces can safely be computed up front
  chromosomes <- paste0("chr", seq_len(22))
  feature_ids <- paste0("f", seq_len(m))
  # The random draws are deliberately left inline as arguments so that they
  # are evaluated in whatever order GRanges() forces them.
  GRanges(
    seqnames = sample(chromosomes, m, replace = TRUE),
    ranges = IRanges(floor(runif(m, min = 1, max = 1e6)), width = 100),
    strand = sample(c("+", "-"), m, replace = TRUE),
    feature_id = feature_ids
  )
}

#' Simulate data as a RangedSummarizedExperiment object
#'
#' The object carries a single assay named "counts" filled with
#' `floor(runif(m * n, 0, 1e4))`, a `Genotype` column alternating "KO" and
#' "WT" across samples "s1", "s2", ..., and row ranges from `simGR(m)`.
#'
#' @param m Integer scalar giving the number of genomic ranges.
#' @param n Integer scalar giving the number of samples (assumed/forced even).
#'
#' @return A RangedSummarizedExperiment object.
simRSE <- function(m, n) {
  # An odd n is bumped up to the next even number
  n <- if (n %% 2 == 1) n + 1 else n
  # Draw the counts first, then the ranges, so the random number stream is
  # consumed in a fixed order.
  count_values <- floor(runif(m * n, min = 0, max = 1e4))
  sample_info <- DataFrame(
    Genotype = rep(c("KO", "WT"), times = n / 2),
    row.names = paste0("s", seq_len(n))
  )
  feature_ranges <- simGR(m)
  SummarizedExperiment(
    assays = SimpleList(counts = matrix(count_values, nrow = m)),
    rowRanges = feature_ranges,
    colData = sample_info
  )
}

#' Simulate data as a SummarizedExperiment object
#'
#' Builds a ranged object with `simRSE()` and coerces it to the
#' "SummarizedExperiment" class.
#'
#' @param m Integer scalar giving the number of features.
#' @param n Integer scalar giving the number of samples (assumed/forced even).
#'
#' @return A SummarizedExperiment object.
simSE <- function(m, n) {
  ranged <- simRSE(m, n)
  as(ranged, "SummarizedExperiment")
}

# NOTE: The data are generated using pseudorandom numbers, so the seed must be
#       set to make the results reproducible.
set.seed(666)

# Simulation settings, passed to simRSE() and simSLSA() below:
#   m = number of features, n = number of samples,
#   d = proportion of non-missing data, p = ncol of the sparse assay.
m <- 10000
n <- 6
d <- 0.7
p <- 8

# Simulate a RangedSummarizedExperiment and print it
rse <- simRSE(m, n)
rse
# Coerce it to a RangedSparseSummarizedExperiment
rsse <- as(rse, "RangedSparseSummarizedExperiment")
rsse
# Simulate the sparse assays (this draws from the random number stream after
# simRSE(), so the order of these calls affects the simulated values)
sa <- simSLSA(m, n, d, p)
sa
# Attach the simulated sparse assays to the object
sparseAssays(rsse) <- sa
rsse
# Coerce to the SparseSummarizedExperiment class
sse <- as(rsse, "SparseSummarizedExperiment")
sse
# Assign names "F1", "F2", ..., one per row (presumably the feature names)
names(sse) <- paste0("F", seq_len(nrow(sse)))
sse
# Same again for a plain SummarizedExperiment derived from rse
se <- as(rse, "SummarizedExperiment")
se
names(se) <- paste0("F", seq_len(nrow(se)))
se

Common operations on SparseSummarizedExperiment

Subsetting

# Keep only the first five features (rows) and the first three samples
# (columns)
sse[seq_len(5), seq_len(3)]
# Keep only the samples whose Genotype is knockout ("KO")
sse[, sse$Genotype == "KO"]

Getters and setters

# TODO: Examples using rowRanges(), rowData(), colData(), and metadata()
# TODO: Illustrate the difference between sparseAssays() and sparseAssay, 
#       and the effect of withDimnames

Range-based operations

# Subset for only rows which overlap the interval 100,000 to 1,100,000 of
# chromosome 1. The region of interest is built as one explicit range
# (start/end) rather than by passing the integer vector 100000:1100000,
# which would materialise more than a million positions.
roi <- GRanges(seqnames = "chr1",
               ranges = IRanges(start = 100000, end = 1100000))
subsetByOverlaps(rsse, roi)

Case study revisited -- SparseSummarizedExperiment in action

TODO: Worth including?

Advanced: Extending SparseSummarizedExperiment

TODO



PeteHaitch/SparseSummarizedExperiment documentation built on May 8, 2019, 1:31 a.m.