#' @include generics.R
#'
NULL
#' Compute TSS enrichment score per cell
#'
#' Compute the transcription start site (TSS) enrichment score for each cell,
#' as defined by ENCODE:
#' \url{https://www.encodeproject.org/data-standards/terms/}.
#'
#' The computed score will be added to the object metadata as "TSS.enrichment".
#'
#' @param object A Seurat object
#' @param assay Name of assay to use
#' @param tss.positions A GRanges object containing the TSS positions. If NULL,
#' use the genomic annotations stored in the assay.
#' @param n Number of TSS positions to use. This will select the first _n_
#' TSSs from the set. If NULL, use all TSSs (slower).
#' @param cells A vector of cells to include. If NULL (default), use all cells
#' in the object
#' @param fast Just compute the TSS enrichment score, without storing the
#' base-resolution matrix of integration counts at each site. This reduces the
#' memory required to store the object but does not allow plotting the
#' accessibility profile at the TSS.
#' @param process_n Number of regions to process at a time if using \code{fast}
#' option.
#' @param verbose Display messages
#'
#' @importFrom Matrix rowMeans
#' @importFrom methods slot
#' @importFrom stats ecdf
#' @importFrom GenomeInfoDb seqnames
#' @importFrom IRanges IRanges
#' @importFrom GenomicRanges start width strand
#' @importFrom SeuratObject DefaultAssay
#'
#' @return Returns a \code{\link[SeuratObject]{Seurat}} object
#' @export
#' @concept qc
#' @examples
#' print("see https://satijalab.org/signac/reference/tssenrichment")
TSSEnrichment <- function(
object,
tss.positions = NULL,
n = NULL,
fast = TRUE,
assay = NULL,
cells = NULL,
process_n = 2000,
verbose = TRUE
) {
assay <- SetIfNull(x = assay, y = DefaultAssay(object = object))
if (!inherits(x = object[[assay]], what = "ChromatinAssay")) {
stop("The requested assay is not a ChromatinAssay")
}
# first check that fragments are present
frags <- Fragments(object = object[[assay]])
if (length(x = frags) == 0) {
stop("No fragment files present in assay")
}
if (is.null(x = tss.positions)) {
if (verbose) {
message("Extracting TSS positions")
}
# work out TSS positions from gene annotations
annotations <- Annotation(object = object[[assay]])
if (is.null(x = annotations)) {
stop("No gene annotations present in assay")
}
tss.positions <- GetTSSPositions(ranges = annotations)
}
if (!is.null(x = n)) {
if (n > length(x = tss.positions)) {
n <- length(x = tss.positions)
}
tss.positions <- tss.positions[seq(n), ]
}
# exclude chrM
sn <- seqnames(x = tss.positions)
tss.positions <- tss.positions[!as.character(sn) %in% c("chrM", "Mt", "MT")]
if (fast) {
# just compute the TSS enrichment score without storing the full matrix
object <- TSSFast(
object = object,
assay = assay,
tss.positions = tss.positions,
process_n = process_n,
verbose = verbose
)
return(object)
}
tss.positions <- Extend(
x = tss.positions,
upstream = 1000,
downstream = 1000,
from.midpoint = TRUE
)
cutmatrix <- CreateRegionPileupMatrix(
object = object,
regions = tss.positions,
assay = assay,
cells = cells,
verbose = verbose
)
# compute mean read counts in 100 bp at each flank for each cell
# (200 bp total averaged)
if (verbose) {
message("Computing mean insertion frequency in flanking regions")
}
flanking.mean <- rowMeans(x = cutmatrix[, c(seq(from = 1, to = 100),
seq(from = 1902, to = 2001))])
# if the flanking mean is 0 for any cells, the enrichment score will be zero.
# instead replace with the mean from the whole population
flanking.mean[is.na(x = flanking.mean)] <- 0
flanking.mean[flanking.mean == 0] <- mean(flanking.mean, na.rm = TRUE)
# compute fold change at each position relative to flanking mean
# (flanks should start at 1)
if (verbose) {
message("Normalizing TSS score")
}
norm.matrix <- cutmatrix / flanking.mean
# Take signal value at center of distribution after normalization as
# TSS enrichment score, average the 1000 bases at the center
object$TSS.enrichment <- rowMeans(x = norm.matrix[, 500:1500], na.rm = TRUE)
e.dist <- ecdf(x = object$TSS.enrichment)
object$TSS.percentile <- round(
x = e.dist(object$TSS.enrichment),
digits = 2
)
# store expected as one additional row in the matrix
expected.insertions <- rep(1, ncol(x = cutmatrix))
expected.insertions <- t(x = as.matrix(x = expected.insertions))
rownames(x = expected.insertions) <- "expected"
# encode motif position as additional row in matrix
motif.vec <- t(x = matrix(
data = c(
rep(x = 0, 1000),
1,
rep(x = 0, 1000)
)
)
)
rownames(x = motif.vec) <- "motif"
# append
norm.matrix <- rbind(norm.matrix, expected.insertions)
norm.matrix <- rbind(norm.matrix, motif.vec)
# store the normalized TSS matrix
object <- suppressWarnings(SetAssayData(
object = object,
assay = assay,
slot = "positionEnrichment",
new.data = norm.matrix,
key = "TSS"
))
return(object)
}
#' @importFrom Rsamtools TabixFile
#' @importFrom GenomeInfoDb seqlevels keepSeqlevels
#' @importFrom stats ecdf
#' @importFrom Matrix rowSums
#' @importFrom SeuratObject DefaultAssay
#' @importFrom future nbrOfWorkers
#' @importFrom future.apply future_lapply
#' @importFrom pbapply pblapply
#' @importFrom utils txtProgressBar
TSSFast <- function(
object,
tss.positions,
assay = NULL,
process_n = 2000,
verbose = TRUE
) {
assay <- SetIfNull(x = assay, y = DefaultAssay(object = object))
# extract fragments
frags <- Fragments(object = object[[assay]])
if (length(x = frags) == 0) {
stop("Fragments file not set for assay ", assay)
}
# get regions
upstream.flank <- Extend(
x = tss.positions,
upstream = 1000,
downstream = -901,
from.midpoint = TRUE
)
downstream.flank <- Extend(
x = tss.positions,
upstream = -901,
downstream = 1000,
from.midpoint = TRUE
)
centers <- Extend(
x = tss.positions,
upstream = 500,
downstream = 500,
from.midpoint = TRUE
)
# chunk ranges
process_n <- SetIfNull(x = process_n, y = length(x = centers))
nchunk <- ceiling(x = length(x = upstream.flank) / process_n)
upstream.flank <- ChunkGRanges(
granges = upstream.flank,
nchunk = nchunk
)
downstream.flank <- ChunkGRanges(
granges = downstream.flank,
nchunk = nchunk
)
centers <- ChunkGRanges(
granges = centers,
nchunk = nchunk
)
# iterate over fragment files and parts of region
if (verbose) {
message("Extracting fragments at TSSs")
pb <- txtProgressBar(
min = 1,
max = length(x = centers),
style = 3,
file = stderr()
)
}
if (nbrOfWorkers() > 1) {
mylapply <- future_lapply
} else {
mylapply <- ifelse(test = verbose, yes = pblapply, no = lapply)
}
center.counts <- vector(mode = "numeric", length = ncol(x = object))
flank.counts <- vector(mode = "numeric", length = ncol(x = object))
for (i in seq_along(along.with = frags)) {
# open fragment file
tbx.path <- GetFragmentData(object = frags[[i]], slot = "path")
cellmap <- GetFragmentData(object = frags[[i]], slot = "cells")
if (is.null(x = cellmap)) {
cellmap <- colnames(x = object)
names(x = cellmap) <- cellmap
} else {
cellmap <- cellmap[intersect(names(x = cellmap), colnames(x = object))]
}
tbx <- TabixFile(file = tbx.path)
# iterate over chunked ranges
res <- mylapply(
X = seq_along(along.with = centers),
FUN = function(x) {
extract_tss_counts(
cellnames = colnames(x = object),
region.centers = centers[[x]],
upstream = upstream.flank[[x]],
downstream = downstream.flank[[x]],
tabix.file = tbx,
cell.name.map = cellmap
)
}
)
# sum results from each chunk of granges
cc <- lapply(X = res, FUN = `[[`, 1)
fc <- lapply(X = res, FUN = `[[`, 2)
center.counts <- center.counts + Reduce(f = `+`, x = cc)
flank.counts <- flank.counts + Reduce(f = `+`, x = fc)
}
if (verbose) {
message("\nComputing TSS enrichment score")
}
# take mean accessibility per base
flank.mean <- flank.counts / 200
flank.mean[flank.counts == 0] <- mean(x = flank.mean, na.rm = TRUE)
center.norm <- center.counts / flank.mean
# compute TSS enrichment score and add to object
object$TSS.enrichment <- center.norm / 1001
e.dist <- ecdf(x = object$TSS.enrichment)
object$TSS.percentile <- round(
x = e.dist(object$TSS.enrichment),
digits = 2
)
return(object)
}
## Not exported ##
#' @importFrom GenomeInfoDb seqlevels keepSeqlevels
#' @importFrom Rsamtools seqnamesTabix
#' @importFrom Matrix rowSums
extract_tss_counts <- function(
cellnames,
region.centers,
tabix.file,
upstream,
downstream,
cell.name.map
) {
tabix.file <- open(con = tabix.file)
# initialize vectors
fc <- vector(mode = "numeric", length = length(x = cellnames))
names(x = fc) <- cellnames
cc <- vector(mode = "numeric", length = length(x = cellnames))
names(x = cc) <- cellnames
# remove seqlevels not present in fragment file
common.seqlevels <- intersect(
x = seqlevels(x = region.centers),
y = seqnamesTabix(file = tabix.file)
)
if (length(x = common.seqlevels) == 0) {
close(con = tabix.file)
return(list(cc, fc))
}
uflanks.use <- keepSeqlevels(
x = upstream,
value = common.seqlevels,
pruning.mode = "coarse"
)
dflanks.use <- keepSeqlevels(
x = downstream,
value = common.seqlevels,
pruning.mode = "coarse"
)
centers.use <- keepSeqlevels(
x = region.centers,
value = common.seqlevels,
pruning.mode = "coarse"
)
# count integration events
cuts.center <- SingleFileCutMatrix(
cellmap = cell.name.map,
tabix.file = tabix.file,
region = centers.use,
verbose = FALSE
)
counts.center <- rowSums(x = cuts.center)
cuts.flank <- SingleFileCutMatrix(
cellmap = cell.name.map,
tabix.file = tabix.file,
region = c(uflanks.use, dflanks.use),
verbose = FALSE
)
counts.flank <- rowSums(x = cuts.flank)
cc[names(x = counts.center)] <- cc + as.vector(x = counts.center)
fc[names(x = counts.flank)] <- fc + as.vector(x = counts.flank)
close(con = tabix.file)
return(list(cc, fc))
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.