#' @include generics.R
#' @importFrom utils globalVariables
NULL
#' Add chromatin module
#'
#' Compute chromVAR deviations for groups of peaks. The goal of this function is
#' similar to that of \code{\link[Seurat]{AddModuleScore}} except that it is
#' designed for single-cell chromatin data. The chromVAR deviations for each
#' group of peaks will be added to the object metadata.
#'
#' @param object A Seurat object
#' @param features A named list of features to include in each module. The name
#' of each element in the list will be used to name the modules computed, which
#' will be stored in the object metadata.
#' @param genome A BSgenome object
#' @param assay Name of assay to use. If NULL, use the default assay.
#' @param verbose Display messages
#' @param ... Additional arguments passed to \code{RunChromVAR}
#'
#' @return Returns a Seurat object
#'
#' @importFrom fastmatch fmatch
#' @importFrom Matrix sparseMatrix
#' @importFrom SeuratObject DefaultAssay GetAssayData AddMetaData
#'
#' @export
#' @concept utilities
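#' @examples
#' \dontrun{
#' # illustrative sketch only: assumes the BSgenome.Hsapiens.UCSC.hg19 package
#' # is installed and that peaks in the object use hg19 coordinates; the module
#' # name and peak selection are arbitrary
#' library(BSgenome.Hsapiens.UCSC.hg19)
#' atac_small <- AddChromatinModule(
#'   object = atac_small,
#'   features = list(mod1 = head(x = rownames(x = atac_small), n = 10)),
#'   genome = BSgenome.Hsapiens.UCSC.hg19
#' )
#' head(x = atac_small[["mod1"]])
#' }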
AddChromatinModule <- function(
object,
features,
genome,
assay = NULL,
verbose = TRUE,
...
) {
assay <- SetIfNull(x = assay, y = DefaultAssay(object = object))
if (!inherits(x = object[[assay]], what = "ChromatinAssay")) {
stop("The requested assay is not a ChromatinAssay.")
}
# first find index of each feature
feat.idx <- sapply(X = features, FUN = fmatch, rownames(x = object[[assay]]))
j <- sapply(X = seq_along(along.with = features), FUN = function(x) {
rep(x = x, length(x = features[[x]]))
})
# construct sparse matrix with features
mat <- sparseMatrix(
i = unlist(x = feat.idx, use.names = FALSE),
j = unlist(x = j, use.names = FALSE),
x = 1,
dims = c(nrow(x = object[[assay]]), length(x = features))
)
rownames(x = mat) <- rownames(x = object[[assay]])
colnames(x = mat) <- names(x = features)
# run chromVAR
cv <- RunChromVAR(
object = object[[assay]],
motif.matrix = mat,
genome = genome,
verbose = verbose,
...
)
# add module scores to metadata
chromvar.data <- GetAssayData(object = cv, slot = "data")
object <- AddMetaData(
object = object,
metadata = as.data.frame(x = t(x = chromvar.data))
)
return(object)
}
globalVariables(names = c("group", "readcount"), package = "Signac")
#' Average Counts
#'
#' Compute the mean counts per group of cells for a given assay
#'
#' @param object A Seurat object
#' @param assay Name of assay to use. Default is the active assay
#' @param group.by Grouping variable to use. Default is the active identities
#' @param verbose Display messages
#'
#' @importFrom SeuratObject DefaultAssay Idents GetAssayData
#' @importFrom dplyr group_by summarize
#' @export
#' @concept utilities
#' @return Returns a named vector of mean counts per group
#' @examples
#' AverageCounts(atac_small)
AverageCounts <- function(
object,
assay = NULL,
group.by = NULL,
verbose = TRUE
) {
assay <- SetIfNull(x = assay, y = DefaultAssay(object = object))
if (is.null(x = group.by)) {
group.by <- Idents(object = object)
} else {
group.by <- object[[group.by, drop = TRUE]]
}
# pull nCount_ column
col.use <- paste0("nCount_", assay)
total.df <- object[[col.use]]
colnames(x = total.df) <- "readcount"
total.df$cell <- rownames(x = total.df)
total.df$group <- group.by[total.df$cell]
total.df <- group_by(total.df, group)
if (verbose) {
message("Computing average counts per group")
}
group.means <- summarize(.data = total.df, mn = mean(x = readcount))
results <- group.means$mn
names(x = results) <- group.means$group
return(results)
}
#' Accessible peaks
#'
#' Find accessible peaks in a set of cells
#'
#' @param object A Seurat object
#' @param assay Name of assay to use
#' @param idents A set of identity classes to find accessible peaks for
#' @param cells A vector of cells to find accessible peaks for
#' @param min.cells Minimum number of cells with the peak accessible (>0 counts)
#' for the peak to be called accessible
#' @export
#' @concept utilities
#' @importFrom SeuratObject WhichCells DefaultAssay GetAssayData
#' @importFrom Matrix rowSums
#' @return Returns a vector of peak names
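#' @examples
#' \dontrun{
#' # sketch: names of peaks detected (>0 counts) in more than 5 cells
#' AccessiblePeaks(object = atac_small, min.cells = 5)
#' }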
AccessiblePeaks <- function(
object,
assay = NULL,
idents = NULL,
cells = NULL,
min.cells = 10
) {
assay <- SetIfNull(x = assay, y = DefaultAssay(object = object))
cells <- SetIfNull(x = cells, y = WhichCells(object, idents = idents))
open.peaks <- GetAssayData(
object = object,
assay = assay,
slot = "counts"
)[, cells]
peaks <- names(x = which(x = rowSums(x = open.peaks > 0) > min.cells))
return(peaks)
}
#' Cells per group
#'
#' Count the number of cells in each group
#'
#' @param object A Seurat object
#' @param group.by A grouping variable. Default is the active identities
#' @importFrom SeuratObject Idents
#' @export
#' @concept utilities
#' @return Returns a vector
#' @examples
#' CellsPerGroup(atac_small)
CellsPerGroup <- function(
object,
group.by = NULL
) {
if (is.null(x = group.by)) {
cellgroups <- Idents(object = object)
} else {
meta.data <- object[[]]
cellgroups <- meta.data[[group.by]]
}
cells.per.group <- table(cellgroups, useNA = "always")
lut <- as.vector(x = cells.per.group)
names(x = lut) <- names(x = cells.per.group)
return(lut)
}
#' Closest Feature
#'
#' Find the closest feature to a given set of genomic regions
#'
#' @param object A Seurat object
#' @param regions A set of genomic regions to query
#' @param annotation A GRanges object containing annotation information. If
#' NULL, use the annotations stored in the object.
#' @param ... Additional arguments passed to \code{\link{StringToGRanges}}
#'
#' @importMethodsFrom GenomicRanges distanceToNearest
#' @importFrom S4Vectors subjectHits mcols
#' @importFrom methods is
#' @importFrom SeuratObject DefaultAssay
#' @importFrom GenomeInfoDb dropSeqlevels
#' @return Returns a dataframe with the name of each region, the closest feature
#' in the annotation, and the distance to the feature.
#' @export
#' @concept utilities
#' @examples
#' \donttest{
#' ClosestFeature(
#' object = atac_small,
#' regions = head(granges(atac_small))
#' )
#' }
ClosestFeature <- function(
object,
regions,
annotation = NULL,
...
) {
if (!is(object = regions, class2 = 'GRanges')) {
regions <- StringToGRanges(regions = regions, ...)
}
if (inherits(x = object, what = "Seurat")) {
# running on Seurat object, extract the assay
assay <- DefaultAssay(object = object)
object <- object[[assay]]
}
if (!inherits(x = object, what = "ChromatinAssay")) {
stop("The requested assay is not a ChromatinAssay.")
}
if (length(x = regions) == 0) {
stop("No query regions supplied")
}
annotation <- SetIfNull(x = annotation, y = Annotation(object = object))
missing_seqlevels <- setdiff(
x = seqlevels(x = regions), y = seqlevels(x = annotation)
)
if (length(x = missing_seqlevels) > 0) {
warning(
"The following seqlevels present in query regions are not present\n ",
"in the supplied gene annotations and will be removed: ",
paste(missing_seqlevels, collapse = ", ")
)
regions <- dropSeqlevels(
x = regions,
value = missing_seqlevels,
pruning.mode = "coarse"
)
if (length(x = regions) == 0) {
stop("None of the supplied regions were found in the supplied annotation")
}
}
nearest_feature <- distanceToNearest(x = regions, subject = annotation)
feature_hits <- annotation[subjectHits(x = nearest_feature)]
df <- as.data.frame(x = mcols(x = feature_hits))
df$closest_region <- GRangesToString(grange = feature_hits, ...)
df$query_region <- GRangesToString(grange = regions, ...)
df$distance <- mcols(x = nearest_feature)$distance
return(df)
}
#' Create gene activity matrix
#'
#' Compute counts per cell in gene body and promoter region.
#'
#' @param object A Seurat object
#' @param assay Name of assay to use. If NULL, use the default assay
#' @param features Genes to include. If NULL, use all protein-coding genes in
#' the annotations stored in the object
#' @param extend.upstream Number of bases to extend upstream of the TSS
#' @param extend.downstream Number of bases to extend downstream of the TTS
#' @param biotypes Gene biotypes to include. If NULL, use all biotypes in the
#' gene annotation.
#' @param max.width Maximum allowed gene width for a gene to be quantified.
#' Setting this parameter can avoid quantifying extremely long transcripts that
#' would substantially increase run time. If NULL, do not filter genes based
#' on width.
#' @param process_n Number of regions to load into memory at a time, per thread.
#' Processing more regions at once can be faster but uses more memory.
#' @param gene.id Record gene IDs in output matrix rather than gene name.
#' @param verbose Display messages
#'
#' @return Returns a sparse matrix
#'
#' @concept utilities
#' @export
#' @importFrom SeuratObject DefaultAssay
#' @examples
#' fpath <- system.file("extdata", "fragments.tsv.gz", package="Signac")
#' fragments <- CreateFragmentObject(
#' path = fpath,
#' cells = colnames(atac_small),
#' validate.fragments = FALSE
#' )
#' Fragments(atac_small) <- fragments
#' GeneActivity(atac_small)
GeneActivity <- function(
object,
assay = NULL,
features = NULL,
extend.upstream = 2000,
extend.downstream = 0,
biotypes = "protein_coding",
max.width = 500000,
process_n = 2000,
gene.id = FALSE,
verbose = TRUE
) {
if (!is.null(x = features)) {
if (length(x = features) == 0) {
stop("Empty list of features provided")
}
}
# collapse to longest protein coding transcript
assay <- SetIfNull(x = assay, y = DefaultAssay(object = object))
if (!inherits(x = object[[assay]], what = "ChromatinAssay")) {
stop("The requested assay is not a ChromatinAssay.")
}
annotation <- Annotation(object = object[[assay]])
# replace NA names with gene ID
annotation$gene_name <- ifelse(
test = is.na(x = annotation$gene_name) | (annotation$gene_name == ""),
yes = annotation$gene_id,
no = annotation$gene_name
)
if (length(x = annotation) == 0) {
stop("No gene annotations present in object")
}
if (verbose) {
message("Extracting gene coordinates")
}
transcripts <- CollapseToLongestTranscript(ranges = annotation)
if (gene.id) {
transcripts$gene_name <- transcripts$gene_id
}
if (!is.null(x = biotypes)) {
transcripts <- transcripts[transcripts$gene_biotype %in% biotypes]
if (length(x = transcripts) == 0) {
stop("No genes remaining after filtering for requested biotypes")
}
}
# filter genes if provided
if (!is.null(x = features)) {
transcripts <- transcripts[transcripts$gene_name %in% features]
if (length(x = transcripts) == 0) {
stop("None of the requested genes were found in the gene annotation")
}
}
if (!is.null(x = max.width)) {
transcript.keep <- which(x = width(x = transcripts) < max.width)
transcripts <- transcripts[transcript.keep]
if (length(x = transcripts) == 0) {
stop("No genes remaining after filtering for max.width")
}
}
# extend to include promoters
transcripts <- Extend(
x = transcripts,
upstream = extend.upstream,
downstream = extend.downstream
)
# quantify
frags <- Fragments(object = object[[assay]])
if (length(x = frags) == 0) {
stop("No fragment information found for requested assay")
}
cells <- colnames(x = object[[assay]])
counts <- FeatureMatrix(
fragments = frags,
features = transcripts,
process_n = process_n,
cells = cells,
verbose = verbose
)
# set row names
gene.key <- transcripts$gene_name
names(x = gene.key) <- GRangesToString(grange = transcripts)
rownames(x = counts) <- as.vector(x = gene.key[rownames(x = counts)])
counts <- counts[rownames(x = counts) != "", ]
return(counts)
}
#' Extract genomic ranges from EnsDb object
#'
#' Pulls the transcript information for all chromosomes from an EnsDb object.
#' This wraps \code{\link[biovizBase]{crunch}} and applies the extractor
#' function to all chromosomes present in the EnsDb object.
#'
#' @param ensdb An EnsDb object
#' @param standard.chromosomes Keep only standard chromosomes
#' @param biotypes Biotypes to keep
#' @param verbose Display messages
#'
#' @importFrom GenomeInfoDb keepStandardChromosomes seqinfo
#' @importFrom pbapply pblapply
#' @concept utilities
#' @export
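#' @examples
#' \dontrun{
#' # sketch: requires an EnsDb annotation package, for example EnsDb.Hsapiens.v75
#' library(EnsDb.Hsapiens.v75)
#' annotations <- GetGRangesFromEnsDb(ensdb = EnsDb.Hsapiens.v75)
#' }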
GetGRangesFromEnsDb <- function(
ensdb,
standard.chromosomes = TRUE,
biotypes = c("protein_coding", "lincRNA", "rRNA", "processed_transcript"),
verbose = TRUE
) {
if (!requireNamespace("biovizBase", quietly = TRUE)) {
stop("Please install biovizBase\n",
"https://www.bioconductor.org/packages/biovizBase/")
}
# convert seqinfo to granges
whole.genome <- as(object = seqinfo(x = ensdb), Class = "GRanges")
if (standard.chromosomes) {
whole.genome <- keepStandardChromosomes(whole.genome, pruning.mode = "coarse")
}
# extract genes from each chromosome
my_lapply <- ifelse(test = verbose, yes = pblapply, no = lapply)
tx <- my_lapply(X = seq_along(whole.genome), FUN = function(x){
suppressMessages(expr = biovizBase::crunch(
obj = ensdb,
which = whole.genome[x],
columns = c("tx_id", "gene_name", "gene_id", "gene_biotype")))
})
# combine
tx <- do.call(what = c, args = tx)
tx <- tx[tx$gene_biotype %in% biotypes]
return(tx)
}
#' Find transcriptional start sites
#'
#' Get the TSS positions from a set of genomic ranges containing gene positions.
#' Ranges can contain exons, introns, UTRs, etc., rather than the whole
#' transcript. By default, only protein-coding gene biotypes are included in
#' the output.
#'
#' @param ranges A GRanges object containing gene annotations.
#' @param biotypes Gene biotypes to include. If NULL, use all biotypes in the
#' supplied gene annotation.
#' @importFrom GenomicRanges resize
#' @importFrom S4Vectors mcols
#' @export
#' @concept utilities
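#' @examples
#' \dontrun{
#' # sketch: assumes gene annotations (with gene_biotype) are stored in the
#' # object and can be retrieved with Annotation()
#' tss <- GetTSSPositions(ranges = Annotation(object = atac_small))
#' }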
GetTSSPositions <- function(ranges, biotypes = "protein_coding") {
if (!("gene_biotype" %in% colnames(x = mcols(x = ranges)))) {
stop("Gene annotation does not contain gene_biotype information")
}
  if (!is.null(x = biotypes)) {
    ranges <- ranges[ranges$gene_biotype %in% biotypes]
}
gene.ranges <- CollapseToLongestTranscript(ranges = ranges)
# shrink to TSS position
tss <- resize(gene.ranges, width = 1, fix = 'start')
return(tss)
}
#' Find intersecting regions between two objects
#'
#' Intersects the regions stored in the rownames of two objects and
#' returns a vector containing the names of rows that intersect
#' for each object. The order of the row names returned corresponds
#' to the intersecting regions, i.e. the nth feature of the first vector
#' will intersect the nth feature in the second vector. A distance
#' parameter can be given, in which case features within the given
#' distance will be called as intersecting.
#'
#' @param object.1 The first Seurat object
#' @param object.2 The second Seurat object
#' @param assay.1 Name of the assay to use in the first object. If NULL, use
#' the default assay
#' @param assay.2 Name of the assay to use in the second object. If NULL, use
#' the default assay
#' @param distance Maximum distance between regions allowed for an intersection
#' to be recorded. Default is 0.
#' @param verbose Display messages
#'
#' @importMethodsFrom GenomicRanges distanceToNearest
#' @importFrom S4Vectors subjectHits queryHits mcols
#' @importFrom SeuratObject DefaultAssay
#' @export
#' @concept utilities
#' @return Returns a list of two character vectors containing the row names
#' in each object that overlap each other.
#' @examples
#' GetIntersectingFeatures(
#' object.1 = atac_small,
#' object.2 = atac_small,
#' assay.1 = 'peaks',
#' assay.2 = 'bins'
#' )
GetIntersectingFeatures <- function(
object.1,
object.2,
assay.1 = NULL,
assay.2 = NULL,
distance = 0,
verbose = TRUE
) {
assay.1 <- SetIfNull(x = assay.1, y = DefaultAssay(object = object.1))
assay.2 <- SetIfNull(x = assay.2, y = DefaultAssay(object = object.2))
if (!inherits(x = object.1[[assay.1]], what = "ChromatinAssay")) {
stop("Requested assay in object 1 is not a ChromatinAssay.")
}
if (!inherits(x = object.2[[assay.2]], what = "ChromatinAssay")) {
stop("Requested assay in object 2 is not a ChromatinAssay")
}
regions.1 <- GetAssayData(object = object.1, assay = assay.1, slot = "ranges")
regions.2 <- GetAssayData(object = object.2, assay = assay.2, slot = "ranges")
if (verbose) {
message("Intersecting regions across objects")
}
region.intersections <- distanceToNearest(x = regions.1, subject = regions.2)
keep.intersections <- mcols(x = region.intersections)$distance <= distance
region.intersections <- region.intersections[keep.intersections, ]
  intersect.object1 <- queryHits(x = region.intersections)
  intersect.object2 <- subjectHits(x = region.intersections)
  return(list(
    rownames(x = object.1[[assay.1]])[intersect.object1],
    rownames(x = object.2[[assay.2]])[intersect.object2]
  ))
}
#' String to GRanges
#'
#' Convert a genomic coordinate string to a GRanges object
#'
#' @param regions Vector of genomic region strings
#' @param sep Vector of separators to use for genomic string. First element is
#' used to separate chromosome and coordinates, second separator is used to
#' separate start and end coordinates.
#' @param ... Additional arguments passed to
#' \code{\link[GenomicRanges]{makeGRangesFromDataFrame}}
#' @return Returns a GRanges object
#' @importFrom GenomicRanges makeGRangesFromDataFrame
#' @importFrom tidyr separate
#' @examples
#' regions <- c('chr1-1-10', 'chr2-12-3121')
#' StringToGRanges(regions = regions)
#' @export
#' @concept utilities
StringToGRanges <- function(regions, sep = c("-", "-"), ...) {
ranges.df <- data.frame(ranges = regions)
ranges.df <- separate(
data = ranges.df,
col = "ranges",
sep = paste0(sep[[1]], "|", sep[[2]]),
into = c("chr", "start", "end")
)
granges <- makeGRangesFromDataFrame(df = ranges.df, ...)
return(granges)
}
#' GRanges to String
#'
#' Convert GRanges object to a vector of strings
#'
#' @param grange A GRanges object
#' @param sep Vector of separators to use for genomic string. First element is
#' used to separate chromosome and coordinates, second separator is used to
#' separate start and end coordinates.
#' @importMethodsFrom GenomicRanges start end seqnames
#' @examples
#' GRangesToString(grange = blacklist_hg19)
#' @return Returns a character vector
#' @export
#' @concept utilities
GRangesToString <- function(grange, sep = c("-", "-")) {
regions <- paste0(
as.character(x = seqnames(x = grange)),
sep[[1]],
start(x = grange),
sep[[2]],
end(x = grange)
)
return(regions)
}
#' Extend
#'
#' Resize GenomicRanges upstream and/or downstream.
#' From \url{https://support.bioconductor.org/p/78652/}
#'
#' @param x A range
#' @param upstream Length to extend upstream
#' @param downstream Length to extend downstream
#' @param from.midpoint Count bases from region midpoint,
#' rather than the 5' or 3' end for upstream and downstream
#' respectively.
#'
#' @importFrom GenomicRanges trim
#' @importFrom BiocGenerics start strand end width
#' @importMethodsFrom GenomicRanges strand start end width
#' @importFrom IRanges ranges IRanges "ranges<-"
#' @export
#' @concept utilities
#' @return Returns a \code{\link[GenomicRanges]{GRanges}} object
#' @examples
#' Extend(x = blacklist_hg19, upstream = 100, downstream = 100)
Extend <- function(
x,
upstream = 0,
downstream = 0,
from.midpoint = FALSE
) {
if (any(strand(x = x) == "*")) {
warning("'*' ranges were treated as '+'")
}
on_plus <- strand(x = x) == "+" | strand(x = x) == "*"
if (from.midpoint) {
midpoints <- start(x = x) + (width(x = x) / 2)
new_start <- midpoints - ifelse(
test = on_plus, yes = upstream, no = downstream
)
new_end <- midpoints + ifelse(
test = on_plus, yes = downstream, no = upstream
)
} else {
new_start <- start(x = x) - ifelse(
test = on_plus, yes = upstream, no = downstream
)
new_end <- end(x = x) + ifelse(
test = on_plus, yes = downstream, no = upstream
)
}
ranges(x = x) <- IRanges(start = new_start, end = new_end)
x <- trim(x = x)
return(x)
}
#' Get cells in a region
#'
#' Extract the names of cells with reads mapped within a given genomic region
#'
#' @param tabix Tabix object
#' @param region A string giving the region to extract from the fragments file
#' @param cells Vector of cells to include in output. If NULL, include all cells
#'
#' @importFrom Rsamtools scanTabix
#' @importFrom methods is
#' @importFrom fastmatch fmatch
#' @export
#' @concept utilities
#' @return Returns a list
#' @examples
#' fpath <- system.file("extdata", "fragments.tsv.gz", package="Signac")
#' GetCellsInRegion(tabix = fpath, region = "chr1-10245-762629")
GetCellsInRegion <- function(tabix, region, cells = NULL) {
if (!is(object = region, class2 = "GRanges")) {
region <- StringToGRanges(regions = region)
}
reads <- scanTabix(file = tabix, param = region)
reads <- lapply(X = reads, FUN = ExtractCell)
if (!is.null(x = cells)) {
reads <- sapply(X = reads, FUN = function(x) {
x <- x[fmatch(x = x, table = cells, nomatch = 0L) > 0L]
if (length(x = x) == 0) {
return(NULL)
} else {
return(x)
}
}, simplify = FALSE)
}
return(reads)
}
#' Counts in region
#'
#' Count reads per cell overlapping a given set of regions
#'
#' @param object A Seurat object
#' @param assay Name of a chromatin assay in the object to use
#' @param regions A GRanges object
#' @param ... Additional arguments passed to \code{\link[IRanges]{findOverlaps}}
#'
#' @importFrom IRanges findOverlaps
#' @importFrom S4Vectors queryHits
#' @importFrom Matrix colSums
#' @importFrom SeuratObject GetAssayData
#'
#' @export
#' @concept utilities
#' @return Returns a numeric vector
#' @examples
#' \donttest{
#' CountsInRegion(
#' object = atac_small,
#' assay = 'bins',
#' regions = blacklist_hg19
#' )
#' }
CountsInRegion <- function(
object,
assay,
regions,
...
) {
if (!is(object = object[[assay]], class2 = "ChromatinAssay")) {
stop("Must supply a ChromatinAssay")
}
obj.granges <- GetAssayData(object = object, assay = assay, slot = "ranges")
overlaps <- findOverlaps(query = obj.granges, subject = regions, ...)
hit.regions <- queryHits(x = overlaps)
data.matrix <- GetAssayData(
object = object, assay = assay, slot = "counts"
)[hit.regions, , drop = FALSE]
return(colSums(data.matrix))
}
#' Fraction of counts in a genomic region
#'
#' Find the fraction of counts per cell that overlap a given set of genomic
#' ranges
#'
#' @param object A Seurat object
#' @param assay Name of assay to use
#' @param regions A GRanges object containing a set of genomic regions
#' @param ... Additional arguments passed to \code{\link{CountsInRegion}}
#' @importFrom Matrix colSums
#' @importFrom SeuratObject GetAssayData DefaultAssay
#'
#' @export
#' @concept utilities
#' @return Returns a numeric vector
#' @examples
#' \dontrun{
#' FractionCountsInRegion(
#' object = atac_small,
#' assay = 'bins',
#' regions = blacklist_hg19
#' )
#' }
FractionCountsInRegion <- function(
object,
regions,
assay = NULL,
...
) {
assay <- SetIfNull(x = assay, y = DefaultAssay(object = object))
reads.in.region <- CountsInRegion(
object = object,
regions = regions,
assay = assay,
...
)
total.reads <- colSums(x = GetAssayData(
object = object, assay = assay, slot = "counts"
))
return(reads.in.region / total.reads)
}
#' Intersect genomic coordinates with matrix rows
#'
#' Remove or retain matrix rows that intersect given genomic regions
#'
#' @param matrix A matrix with genomic regions in the rows
#' @param regions A set of genomic regions to intersect with regions in the
#' matrix. Either a vector of strings encoding the genomic coordinates, or a
#' GRanges object.
#' @param invert Discard rows intersecting the genomic regions supplied, rather
#' than retain.
#' @param sep A length-2 character vector containing the separators to be used
#' for extracting genomic coordinates from a string. The first element will be
#' used to separate the chromosome name from coordinates, and the second element
#' used to separate start and end coordinates.
#' @param verbose Display messages
#' @param ... Additional arguments passed to \code{\link[IRanges]{findOverlaps}}
#'
#' @importFrom IRanges findOverlaps
#' @importFrom S4Vectors queryHits
#'
#' @export
#' @concept utilities
#' @return Returns a sparse matrix
#' @examples
#' counts <- matrix(data = rep(0, 12), ncol = 2)
#' rownames(counts) <- c("chr1-565107-565550","chr1-569174-569639",
#' "chr1-713460-714823","chr1-752422-753038",
#' "chr1-762106-763359","chr1-779589-780271")
#' IntersectMatrix(matrix = counts, regions = blacklist_hg19)
IntersectMatrix <- function(
matrix,
regions,
invert = FALSE,
sep = c("-", "-"),
verbose = TRUE,
...
) {
if (is(object = regions, class2 = "character")) {
regions <- StringToGRanges(regions = regions, sep = sep)
}
rowranges <- StringToGRanges(regions = rownames(x = matrix), sep = sep)
if (verbose) {
message("Intersecting genomic regions")
}
region.overlaps <- findOverlaps(query = rowranges, subject = regions, ...)
keep.rows <- queryHits(x = region.overlaps)
if (invert) {
all.rows <- seq_len(length.out = nrow(matrix))
keep.rows <- setdiff(x = all.rows, y = keep.rows)
}
if (verbose) {
message("Subsetting matrix")
}
matrix <- matrix[keep.rows, ]
return(matrix)
}
#' Get gene coordinates
#'
#' Extract the coordinates of the longest transcript for a gene stored in the
#' annotations within an object.
#'
#' @param object A Seurat object
#' @param gene Name of a gene to extract
#' @param assay Name of assay to use
#'
#' @importFrom SeuratObject DefaultAssay
#' @importFrom GenomicRanges GRanges
#' @importFrom IRanges IRanges start end
#' @importFrom GenomeInfoDb seqnames
#' @export
#' @concept utilities
#' @examples
#' LookupGeneCoords(atac_small, gene = "MIR1302-10")
LookupGeneCoords <- function(object, gene, assay = NULL) {
assay <- SetIfNull(x = assay, y = DefaultAssay(object = object))
if (!inherits(x = object[[assay]], what = "ChromatinAssay")) {
stop("The requested assay is not a ChromatinAssay")
}
annotations <- Annotation(object = object[[assay]])
isgene <- annotations$gene_name == gene
isgene <- !is.na(x = isgene) & isgene
annot.sub <- annotations[isgene]
if (length(x = annot.sub) == 0) {
return(NULL)
} else {
gr <- GRanges(seqnames = as.character(x = seqnames(x = annot.sub))[[1]],
ranges = IRanges(start = min(start(x = annot.sub)),
end = max(end(x = annot.sub))))
return(gr)
}
}
#' Match DNA sequence characteristics
#'
#' Return a vector of genomic regions that match the distribution of a set of
#' query regions for any given set of characteristics, specified in the input
#' \code{meta.feature} dataframe.
#'
#' For each requested feature to match, a density
#' distribution is estimated using the \code{\link[stats]{density}} function,
#' and a weight for each feature in the dataset is computed from that
#' distribution. If multiple features are to be matched (for example,
#' GC content and overall accessibility), a joint density is computed by
#' multiplying the individual feature weights. A set of features with
#' characteristics matching the query regions is then selected using the
#' \code{\link[base]{sample}} function, with the probability of selecting each
#' feature proportional to its joint density weight.
#'
#' @param meta.feature A dataframe containing DNA sequence information for
#' features to choose from
#' @param query.feature A dataframe containing DNA sequence information for
#' features to match.
#' @param n Number of regions to select, with characteristics matching the query
#' @param features.match Which features of the query to match when selecting a
#' set of regions. A vector of column names present in the feature metadata can
#' be supplied to match multiple characteristics at once. Default is GC content.
#' @param verbose Display messages
#' @param ... Arguments passed to other functions
#' @return Returns a character vector
#'
#' @importFrom stats density approx
#' @export
#' @concept utilities
#' @concept motifs
#' @examples
#' metafeatures <- SeuratObject::GetAssayData(
#' object = atac_small[['peaks']], slot = 'meta.features'
#' )
#' query.feature <- metafeatures[1:10, ]
#' features.choose <- metafeatures[11:nrow(metafeatures), ]
#' MatchRegionStats(
#' meta.feature = features.choose,
#' query.feature = query.feature,
#' features.match = "percentile",
#' n = 10
#' )
MatchRegionStats <- function(
meta.feature,
query.feature,
features.match = c("GC.percent"),
n = 10000,
verbose = TRUE,
...
) {
if (!inherits(x = meta.feature, what = 'data.frame')) {
stop("meta.feature should be a data.frame")
}
if (!inherits(x = query.feature, what = "data.frame")) {
stop("query.feature should be a data.frame")
}
if (length(x = features.match) == 0) {
stop("Must supply at least one sequence characteristic to match")
}
if (nrow(x = meta.feature) < n) {
n <- nrow(x = meta.feature)
warning("Requested more features than present in supplied data.
Returning ", n, " features")
}
for (i in seq_along(along.with = features.match)) {
featmatch <- features.match[[i]]
if (!(featmatch %in% colnames(x = query.feature))) {
if (featmatch == "GC.percent") {
stop("GC.percent not present in meta.features.",
" Run RegionStats to compute GC.percent for each feature.")
} else {
stop(featmatch, " not present in meta.features")
}
}
if (verbose) {
message("Matching ", featmatch, " distribution")
}
density.estimate <- density(
x = query.feature[[featmatch]], kernel = "gaussian", bw = 1
)
mf.use <- meta.feature[!is.na(x = meta.feature[[featmatch]]), ]
weights <- approx(
x = density.estimate$x,
y = density.estimate$y,
xout = mf.use[[featmatch]],
yright = 0.0001,
yleft = 0.0001
)$y
if (i > 1) {
feature.weights <- feature.weights * weights
} else {
feature.weights <- weights
}
}
feature.select <- sample.int(
n = nrow(x = mf.use),
size = n,
prob = feature.weights
)
feature.select <- rownames(x = mf.use)[feature.select]
return(feature.select)
}
#' Unify genomic ranges
#'
#' Create a unified set of non-overlapping genomic ranges
#' from multiple Seurat objects containing single-cell
#' chromatin data.
#'
#' @param object.list A list of Seurat objects or ChromatinAssay objects
#' @param mode Function to use when combining genomic ranges. Can be "reduce"
#' (default) or "disjoin".
#' See \code{\link[GenomicRanges]{reduce}}
#' and \code{\link[GenomicRanges]{disjoin}}
#' for more information on these functions.
#' @importFrom GenomicRanges reduce disjoin
#' @export
#' @concept utilities
#' @concept preprocessing
#' @return Returns a GRanges object
#' @examples
#' UnifyPeaks(object.list = list(atac_small, atac_small))
UnifyPeaks <- function(object.list, mode = "reduce") {
peak.ranges <- list()
for (i in seq_along(along.with = object.list)) {
peak.ranges[[i]] <- granges(x = object.list[[i]])
}
peak.ranges <- Reduce(f = c, x = peak.ranges)
if (mode == "reduce") {
return(reduce(x = peak.ranges))
} else if (mode == "disjoin") {
return(disjoin(x = peak.ranges))
} else {
stop("Unknown mode requested")
}
}
#' Subset matrix rows and columns
#'
#' Subset the rows and columns of a matrix by removing
#' rows and columns with fewer than the specified number of
#' non-zero elements.
#'
#' @param mat A matrix
#' @param min.rows Minimum number of non-zero elements for
#' the row to be retained
#' @param min.cols Minimum number of non-zero elements for
#' the column to be retained
#' @param max.row.val Maximum allowed value in a row for the
#' row to be retained. If NULL, don't set any limit.
#' @param max.col.val Maximum allowed value in a column for
#' the column to be retained. If NULL, don't set any limit.
#'
#' @return Returns a matrix
#' @export
#' @concept utilities
#' @importFrom Matrix colSums rowSums
#' @examples
#' SubsetMatrix(mat = volcano)
SubsetMatrix <- function(
mat,
min.rows = 1,
min.cols = 1,
max.row.val = 10,
max.col.val = NULL
) {
rowcount <- rowSums(mat > 0)
colcount <- colSums(mat > 0)
keeprows <- rowcount > min.rows
keepcols <- colcount > min.cols
if (!is.null(x = max.row.val)) {
rowmax <- apply(X = mat, MARGIN = 1, FUN = max)
keeprows <- keeprows & (rowmax < max.row.val)
}
if (!is.null(x = max.col.val)) {
colmax <- apply(X = mat, MARGIN = 2, FUN = max)
keepcols <- keepcols & (colmax < max.col.val)
}
return(mat[keeprows, keepcols])
}
#### Not exported ####
#' @importFrom IRanges isDisjoint
NonOverlapping <- function(x, all.features) {
# x is list of assays
diff.features <- names(x = all.features[all.features < length(x = x)])
if (length(x = diff.features) == 0) {
return(TRUE)
} else {
diff.ranges <- StringToGRanges(regions = diff.features)
return(isDisjoint(x = diff.ranges))
}
}
#' @importFrom Matrix sparseMatrix
AddMissingCells <- function(x, cells) {
# add columns with zeros for cells not in matrix
missing.cells <- setdiff(x = cells, y = colnames(x = x))
if (!(length(x = missing.cells) == 0)) {
null.mat <- sparseMatrix(
i = c(),
j = c(),
dims = c(nrow(x = x), length(x = missing.cells))
)
rownames(x = null.mat) <- rownames(x = x)
colnames(x = null.mat) <- missing.cells
x <- cbind(x, null.mat)
}
x <- x[, cells, drop = FALSE]
return(x)
}
#' @importFrom SeuratObject DefaultAssay GetAssayData
#' @importFrom Matrix Diagonal tcrossprod rowSums
AverageCountMatrix <- function(
object,
assay = NULL,
group.by = NULL,
idents = NULL
) {
  assay <- SetIfNull(x = assay, y = DefaultAssay(object = object))
countmatrix <- GetAssayData(object = object[[assay]], slot = "counts")
ident.matrix <- BinaryIdentMatrix(
object = object,
group.by = group.by,
idents = idents
)
collapsed.counts <- tcrossprod(x = countmatrix, y = ident.matrix)
avg.counts <- tcrossprod(
x = collapsed.counts,
y = Diagonal(x = 1 / rowSums(x = ident.matrix))
)
return(as.matrix(x = avg.counts))
}
# Create binary cell x class matrix of group membership
#' @importFrom Matrix sparseMatrix
BinaryIdentMatrix <- function(object, group.by = NULL, idents = NULL) {
group.idents <- GetGroups(object = object, group.by = group.by, idents = idents)
cell.idx <- seq_along(along.with = names(x = group.idents))
unique.groups <- as.character(x = unique(x = group.idents))
ident.idx <- seq_along(along.with = unique.groups)
names(x = ident.idx) <- unique.groups
ident.matrix <- sparseMatrix(
i = ident.idx[as.character(x = group.idents)],
j = cell.idx,
x = 1
)
colnames(x = ident.matrix) <- names(x = group.idents)
rownames(x = ident.matrix) <- unique.groups
ident.matrix <- as(object = ident.matrix, Class = "CsparseMatrix")
return(ident.matrix)
}
# Calculate nCount and nFeature
#
# From Seurat
#
# @param object An Assay object
#
# @return A named list with nCount and nFeature
#
#' @importFrom Matrix colSums
#
CalcN <- function(object) {
if (IsMatrixEmpty(x = GetAssayData(object = object, slot = "counts"))) {
return(NULL)
}
return(list(
nCount = colSums(x = object, slot = "counts"),
nFeature = colSums(x = GetAssayData(object = object, slot = "counts") > 0)
))
}
globalVariables(
names = c("start", "end", "seqnames", "strand", "gene_biotype"),
package = "Signac"
)
#' @importFrom GenomicRanges makeGRangesFromDataFrame
#' @import data.table
CollapseToLongestTranscript <- function(ranges) {
range.df <- as.data.table(x = ranges)
range.df$strand <- as.character(x = range.df$strand)
range.df$strand <- ifelse(
test = range.df$strand == "*",
yes = "+",
no = range.df$strand
)
collapsed <- range.df[
, .(unique(seqnames),
min(start),
max(end),
strand[[1]],
gene_biotype[[1]],
gene_name[[1]]),
"gene_id"
]
colnames(x = collapsed) <- c(
"gene_id", "seqnames", "start", "end", "strand", "gene_biotype", "gene_name"
)
collapsed$gene_name <- make.unique(names = collapsed$gene_name)
gene.ranges <- makeGRangesFromDataFrame(
df = collapsed,
keep.extra.columns = TRUE
)
return(gene.ranges)
}
# Chunk GRanges
#
# Split a genomic ranges object into evenly sized chunks
# @param granges A GRanges object
# @param nchunk Number of chunks to split into
#
# @return Returns a list of GRanges objects
# @examples
# ChunkGRanges(blacklist_hg19, nchunk = 10)
ChunkGRanges <- function(granges, nchunk) {
if (length(x = granges) < nchunk) {
nchunk <- length(x = granges)
}
chunksize <- as.integer(x = (length(granges) / nchunk))
range.list <- sapply(X = seq_len(length.out = nchunk), FUN = function(x) {
chunkupper <- (x * chunksize)
if (x == 1) {
chunklower <- 1
} else {
chunklower <- ((x - 1) * chunksize) + 1
}
if (x == nchunk) {
chunkupper <- length(x = granges)
}
return(granges[chunklower:chunkupper])
})
return(range.list)
}
# Extract cell
#
# Extract cell barcode from list of tab delimited character
# vectors (output of \code{\link{scanTabix}})
#
# @param x List of character vectors
# @return Returns a string
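# @examples
# # each element of x is one tab-delimited fragment record (chrom, start, end,
# # barcode, count); the barcode shown here is illustrative
# ExtractCell(x = "chr1\t10\t20\tAAACGAAAGCGCAATG-1\t2")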
#' @importFrom stringi stri_split_fixed
ExtractCell <- function(x) {
if (length(x = x) == 0) {
return(NULL)
} else {
x <- stri_split_fixed(str = x, pattern = "\t")
n <- length(x = x)
x <- unlist(x = x)
return(unlist(x = x)[5 * (1:n) - 1])
}
}
# Run groupCommand for the first n lines, convert the cell barcodes in the file
# to the cell names that appear in the fragment object, and subset the output to
# cells present in the fragment object
#
# Every cell in the fragment file will be present in the output dataframe. If
# the cell information is not set, every cell barcode that appears in the first
# n lines will be present.
#
# @param fragments A Fragment object
# @param n Number of lines to read from the beginning of the fragment file
# @param verbose Display messages
#
# @return Returns a data.frame
ExtractFragments <- function(fragments, n = NULL, verbose = TRUE) {
fpath <- GetFragmentData(object = fragments, slot = "path")
if (isRemote(x = fpath)) {
stop("Remote fragment files not supported")
}
fpath <- normalizePath(path = fpath, mustWork = TRUE)
cells <- GetFragmentData(object = fragments, slot = "cells")
if (!is.null(x = cells)) {
cells.use <- as.character(x = cells)
} else {
cells.use <- NULL
}
verbose <- as.logical(x = verbose)
n <- SetIfNull(x = n, y = 0)
n <- as.numeric(x = n)
n <- round(x = n, digits = 0)
counts <- groupCommand(
fragments = fpath,
some_whitelist_cells = cells.use,
max_lines = n,
verbose = verbose
)
# convert cell names
if (!is.null(x = cells)) {
# every cell will be present in the output, even if 0 counts
converter <- names(x = cells)
names(x = converter) <- cells
counts$CB <- converter[counts$CB]
}
return(counts)
}
# convert region argument to genomic coordinates
# region can be a string, name of a gene, or GRanges object
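# @examples
# # region can be a coordinate string, a gene name present in the object
# # annotation, or a GRanges object; gene names are resolved via LookupGeneCoords
# FindRegion(object = atac_small, region = "chr1-713460-714823")
# FindRegion(object = atac_small, region = "MIR1302-10")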
FindRegion <- function(
object,
region,
sep = c("-", "-"),
assay = NULL,
extend.upstream = 0,
extend.downstream = 0
) {
if (!is(object = region, class2 = "GRanges")) {
# first try to convert to coordinates, if not lookup gene
region <- tryCatch(
expr = suppressWarnings(
expr = StringToGRanges(regions = region, sep = sep)
),
error = function(x) {
region <- LookupGeneCoords(
object = object,
assay = assay,
gene = region
)
return(region)
}
)
if (is.null(x = region)) {
stop("Gene not found")
}
}
region <- suppressWarnings(expr = Extend(
x = region,
upstream = extend.upstream,
downstream = extend.downstream
)
)
return(region)
}
# GetReadsInRegion
#
# Extract reads for each cell within a given genomic region or set of regions
#
# @param cellmap A mapping of cell names in the fragment file to cell names in
# the Seurat object. Should be a named vector where each element is a cell name
# that appears in the fragment file and the name of each element is the
# name of the cell in the Seurat object.
# @param region A genomic region, specified as a string in the format
# 'chr:start-end'. Can be a vector of regions.
# @param tabix.file A TabixFile object.
# @param cells Cells to include. Default is all cells present in the object.
# @param verbose Display messages
# @param ... Additional arguments passed to \code{\link{StringToGRanges}}
#
#' @importFrom Rsamtools TabixFile scanTabix
#' @importFrom SeuratObject Idents
#' @importFrom fastmatch fmatch
#
# @return Returns a data frame
GetReadsInRegion <- function(
cellmap,
region,
tabix.file,
cells = NULL,
verbose = TRUE,
...
) {
file.to.object <- names(x = cellmap)
names(x = file.to.object) <- cellmap
if (verbose) {
message("Extracting reads in requested region")
}
if (!is(object = region, class2 = "GRanges")) {
region <- StringToGRanges(regions = region, ...)
}
# remove regions that aren't in the fragment file
common.seqlevels <- intersect(
x = seqlevels(x = region),
y = seqnamesTabix(file = tabix.file)
)
if (length(common.seqlevels) != 0) {
region <- keepSeqlevels(
x = region,
value = common.seqlevels,
pruning.mode = "coarse"
)
reads <- scanTabix(file = tabix.file, param = region)
reads <- TabixOutputToDataFrame(reads = reads)
reads <- reads[
fmatch(x = reads$cell, table = cellmap, nomatch = 0L) > 0,
]
# convert cell names to match names in object
reads$cell <- file.to.object[reads$cell]
if (!is.null(x = cells)) {
reads <- reads[reads$cell %in% cells, ]
}
if (nrow(reads) == 0) {
reads$ident <- integer()
reads$length <- numeric()
return(reads)
}
reads$length <- reads$end - reads$start
} else {
reads <- data.frame(
"chr" = character(),
"start" = numeric(),
"end" = numeric(),
"cell" = character(),
"count" = numeric(),
"ident" = integer(),
"length" = numeric()
)
}
return(reads)
}
# Get vector of cell names and associated identity
# @param object A Seurat object
# @param group.by Identity class to group cells by
# @param idents which identities to include
# @return Returns a named vector
#' @importFrom SeuratObject Idents
GetGroups <- function(
object,
group.by,
idents
) {
if (is.null(x = group.by)) {
obj.groups <- Idents(object = object)
} else {
obj.md <- object[[group.by]]
obj.groups <- obj.md[, 1]
names(obj.groups) <- rownames(x = obj.md)
}
if (!is.null(idents)) {
obj.groups <- obj.groups[obj.groups %in% idents]
}
return(obj.groups)
}
# Check if path is remote
# @param x path/s to check
isRemote <- function(x) {
return(grepl(pattern = "^http|^ftp", x = x))
}
# row merge list of matrices
# @param mat.list list of sparse matrices
# @param new.rownames rownames to assign merged matrix
#' @importFrom SeuratObject RowMergeSparseMatrices
MergeMatrixParts <- function(mat.list, new.rownames) {
# RowMergeSparseMatrices only exported in Seurat release Dec-2019 (3.1.2)
merged.all <- mat.list[[1]]
  for (i in seq_along(along.with = mat.list)[-1]) {
merged.all <- RowMergeSparseMatrices(
mat1 = merged.all,
mat2 = mat.list[[i]]
)
}
# reorder rows to match genomic ranges
merged.all <- merged.all[new.rownames, ]
return(merged.all)
}
# Run GetReadsInRegion for a list of Fragment objects
# concatenate the output dataframes and return
# @param object A Seurat or ChromatinAssay object
# @param region Genomic region to extract fragments for
# @param fragment.list A list of Fragment objects. If NULL, pull them from the
# object
# @param assay Name of assay to use if supplying a Seurat object
#' @importFrom SeuratObject DefaultAssay
#' @importFrom Rsamtools TabixFile
#' @importFrom GenomeInfoDb keepSeqlevels
MultiGetReadsInRegion <- function(
object,
region,
fragment.list = NULL,
assay = NULL,
...
) {
if (inherits(x = object, what = "Seurat")) {
# pull the assay
assay <- SetIfNull(x = assay, y = DefaultAssay(object = object))
object <- object[[assay]]
}
fragment.list <- SetIfNull(
x = fragment.list,
y = Fragments(object = object)
)
if (length(x = fragment.list) == 0) {
# no fragments set
stop("No fragment files found")
}
res <- data.frame()
for (i in seq_along(along.with = fragment.list)) {
tbx.path <- GetFragmentData(object = fragment.list[[i]], slot = "path")
cellmap <- GetFragmentData(object = fragment.list[[i]], slot = "cells")
tabix.file <- TabixFile(file = tbx.path)
open(con = tabix.file)
reads <- GetReadsInRegion(
cellmap = cellmap,
region = region,
tabix.file = tabix.file,
...
)
res <- rbind(res, reads)
close(con = tabix.file)
}
return(res)
}
# Generate matrix of integration sites
#
# Generates a cell-by-position matrix of Tn5 integration sites
# centered on a given region (usually a DNA sequence motif). This
# matrix can be used for downstream footprinting analysis.
#
# @param cellmap A mapping of cell names in the fragment file to cell names in
# the Seurat object. Should be a named vector where each element is a cell name
# that appears in the fragment file and the name of each element is the
# name of the cell in the Seurat object.
# @param region A set of GRanges containing the regions of interest
# @param cells Which cells to include in the matrix. If NULL, use all cells in
# the cellmap
# @param tabix.file A \code{\link[Rsamtools]{TabixFile}} object.
# @param verbose Display messages
#' @importFrom Matrix sparseMatrix
#' @importFrom Rsamtools TabixFile
#' @importMethodsFrom GenomicRanges width start end
# @return Returns a sparse matrix
SingleFileCutMatrix <- function(
cellmap,
region,
cells = NULL,
tabix.file,
verbose = TRUE
) {
# if multiple regions supplied, must be the same width
cells <- SetIfNull(x = cells, y = names(x = cellmap))
if (length(x = region) == 0) {
return(NULL)
}
fragments <- GetReadsInRegion(
region = region,
cellmap = cellmap,
cells = cells,
tabix.file = tabix.file,
verbose = verbose
)
start.lookup <- start(x = region)
names(start.lookup) <- seq_along(region)
# if there are no reads in the region
# create an empty matrix of the correct dimension
if (nrow(x = fragments) == 0) {
cut.matrix <- sparseMatrix(
i = NULL,
j = NULL,
dims = c(length(x = cells), width(x = region)[[1]])
)
} else {
fragstarts <- start.lookup[fragments$ident] + 1
cut.df <- data.frame(
position = c(fragments$start, fragments$end) - fragstarts,
cell = c(fragments$cell, fragments$cell),
stringsAsFactors = FALSE
)
cut.df <- cut.df[
(cut.df$position > 0) & (cut.df$position <= width(x = region)[[1]]),
]
cell.vector <- seq_along(along.with = cells)
names(x = cell.vector) <- cells
cell.matrix.info <- cell.vector[cut.df$cell]
cut.matrix <- sparseMatrix(
i = cell.matrix.info,
j = cut.df$position,
x = 1,
dims = c(length(x = cells), width(x = region)[[1]])
)
}
rownames(x = cut.matrix) <- cells
colnames(x = cut.matrix) <- seq_len(width(x = region)[[1]])
return(cut.matrix)
}
# Generate matrix of integration sites
#
# Generates a cell-by-position matrix of Tn5 integration sites.
#
# @param object A Seurat object
# @param region A GRanges object containing the region of interest
# @param assay A name of assay to use. Must be a \code{\link{ChromatinAssay}}
# containing a list of \code{\link{Fragment}} objects.
# @param cells Which cells to include in the matrix. If NULL (default), use all
# cells in the object
# @param group.by Name of grouping variable to use
# @param verbose Display messages
# @return Returns a sparse matrix
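# @examples
# # sketch: assumes fragment files have been set for the default assay; the
# # region is an arbitrary coordinate string converted to a GRanges object
# CutMatrix(object = atac_small, region = StringToGRanges("chr1-713460-714823"))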
#' @importFrom SeuratObject DefaultAssay
#' @importFrom Rsamtools TabixFile seqnamesTabix
#' @importFrom GenomeInfoDb keepSeqlevels
CutMatrix <- function(
object,
region,
group.by = NULL,
assay = NULL,
cells = NULL,
verbose = TRUE
) {
# run SingleFileCutMatrix for each fragment file and combine results
assay <- SetIfNull(x = assay, y = DefaultAssay(object = object))
cells <- SetIfNull(x = cells, y = colnames(x = object))
fragments <- Fragments(object = object[[assay]])
if (length(x = fragments) == 0) {
stop("No fragment information found for requested assay")
}
res <- list()
for (i in seq_along(along.with = fragments)) {
fragment.path <- GetFragmentData(object = fragments[[i]], slot = "path")
cellmap <- GetFragmentData(object = fragments[[i]], slot = "cells")
tabix.file <- TabixFile(file = fragment.path)
open(con = tabix.file)
# remove regions that aren't in the fragment file
seqnames.in.both <- intersect(
x = seqnames(x = region),
y = seqnamesTabix(file = tabix.file)
)
region <- keepSeqlevels(
x = region,
value = seqnames.in.both,
pruning.mode = "coarse"
)
if (length(x = region) != 0) {
cm <- SingleFileCutMatrix(
region = region,
cellmap = cellmap,
tabix.file = tabix.file,
cells = cells,
verbose = FALSE
)
res[[i]] <- cm
}
close(con = tabix.file)
}
res <- Reduce(f = `+`, x = res)
return(res)
}
# Generate cut matrix for many regions
#
# Run CutMatrix on multiple regions and add them together.
# Assumes regions are pre-aligned.
#
# @param object A Seurat object
# @param regions A set of GRanges
# @param group.by Name of grouping variable to use
# @param fragments A list of Fragment objects
# @param assay Name of the assay to use
# @param cells Vector of cells to include
# @param verbose Display messages
#' @importFrom Rsamtools TabixFile seqnamesTabix
#' @importFrom SeuratObject DefaultAssay
#' @importFrom GenomeInfoDb keepSeqlevels
MultiRegionCutMatrix <- function(
object,
regions,
group.by = NULL,
fragments = NULL,
assay = NULL,
cells = NULL,
verbose = FALSE
) {
if (inherits(x = object, what = "Seurat")) {
assay <- SetIfNull(x = assay, y = DefaultAssay(object = object))
object <- object[[assay]]
}
fragments <- SetIfNull(x = fragments, y = Fragments(object = object))
res <- list()
if (length(x = fragments) == 0) {
stop("No fragment files present in assay")
}
for (i in seq_along(along.with = fragments)) {
frag.path <- GetFragmentData(object = fragments[[i]], slot = "path")
cellmap <- GetFragmentData(object = fragments[[i]], slot = "cells")
if (is.null(x = cellmap)) {
cellmap <- colnames(x = object)
names(x = cellmap) <- cellmap
}
tabix.file <- TabixFile(file = frag.path)
open(con = tabix.file)
# remove regions that aren't in the fragment file
common.seqlevels <- intersect(
x = seqlevels(x = regions),
y = seqnamesTabix(file = tabix.file)
)
regions <- keepSeqlevels(
x = regions,
value = common.seqlevels,
pruning.mode = "coarse"
)
cm <- SingleFileCutMatrix(
cellmap = cellmap,
tabix.file = tabix.file,
region = regions,
verbose = verbose
)
close(con = tabix.file)
res[[i]] <- cm
}
# each matrix contains data for different cells at same positions
# bind all matrices together
res <- do.call(what = rbind, args = res)
return(res)
}
# Create cut site pileup matrix
#
# For a set of aligned genomic ranges, find the total number of
# integration sites per cell per base.
#
# @param object A Seurat object
# @param regions A GRanges object
# @param assay Name of the assay to use
# @param cells Which cells to include. If NULL, use all cells
# @param verbose Display messages
#' @importMethodsFrom GenomicRanges strand
CreateRegionPileupMatrix <- function(
object,
regions,
assay = NULL,
cells = NULL,
verbose = TRUE
) {
if (length(x = regions) == 0) {
stop("No regions supplied")
}
# split into strands
on_plus <- strand(x = regions) == "+" | strand(x = regions) == "*"
plus.strand <- regions[on_plus, ]
minus.strand <- regions[!on_plus, ]
# get cut matrices for each strand
if (verbose) {
message("Finding + strand cut sites")
}
cut.matrix.plus <- MultiRegionCutMatrix(
regions = plus.strand,
object = object,
assay = assay,
cells = cells,
verbose = FALSE
)
if (verbose) {
message("Finding - strand cut sites")
}
cut.matrix.minus <- MultiRegionCutMatrix(
regions = minus.strand,
object = object,
assay = assay,
cells = cells,
verbose = FALSE
)
# reverse minus strand and add together
if (is.null(x = cut.matrix.plus)) {
full.matrix <- cut.matrix.minus[, rev(x = colnames(x = cut.matrix.minus))]
} else if (is.null(x = cut.matrix.minus)) {
full.matrix <- cut.matrix.plus
} else {
full.matrix <- cut.matrix.plus + cut.matrix.minus[, rev(
x = colnames(x = cut.matrix.minus)
)]
}
# rename so 0 is center
region.width <- width(x = regions)[[1]]
midpoint <- round(x = (region.width / 2))
colnames(full.matrix) <- seq_len(length.out = region.width) - midpoint
return(full.matrix)
}
# Apply function to integration sites per base per group
#
# Perform colSums on a cut matrix with cells in the rows
# and position in the columns, for each group of cells
# separately.
#
# @param mat A cut matrix. See \code{\link{CutMatrix}}
# @param groups A vector of group identities, with the name
# of each element in the vector set to the cell name.
# @param fun Function to apply to each group of cells.
# For example, colSums or colMeans.
# @param group.scale.factors Scaling factor for each group. Should
# be computed using the number of cells in the group and the average number of
# counts in the group.
# @param normalize Perform sequencing depth and cell count normalization
# @param scale.factor Scaling factor to use. If NULL (default), will use the
# median normalization factor for all the groups.
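# @examples
# # sketch with hypothetical inputs: `cutmat` is a cell x position cut matrix
# # (e.g. from CutMatrix), `groups` a named vector of cell identities, and
# # `gsf` a per-group scaling factor
# ApplyMatrixByGroup(mat = cutmat, groups = groups, fun = colSums,
#                    group.scale.factors = gsf, scale.factor = median(gsf))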
ApplyMatrixByGroup <- function(
mat,
groups,
fun,
normalize = TRUE,
group.scale.factors = NULL,
scale.factor = NULL
) {
if (normalize) {
    if (is.null(x = group.scale.factors)) {
stop("If normalizing counts, supply group scale factors")
}
}
all.groups <- as.character(x = unique(x = groups))
if (any(is.na(x = groups))) {
all.groups <- c(all.groups, NA)
}
ngroup <- length(x = all.groups)
npos <- ncol(x = mat)
group <- unlist(
x = lapply(X = all.groups, FUN = function(x) rep(x, npos))
)
position <- rep(x = as.numeric(x = colnames(x = mat)), ngroup)
count <- vector(mode = "numeric", length = npos * ngroup)
for (i in seq_along(along.with = all.groups)) {
grp <- all.groups[[i]]
if (is.na(x = grp)) {
pos.cells <- names(x = groups)[is.na(x = groups)]
} else {
pos.cells <- names(x = groups)[groups == all.groups[[i]]]
}
if (length(x = pos.cells) > 1) {
totals <- fun(x = mat[pos.cells, ])
} else {
totals <- mat[pos.cells, ]
}
count[((i - 1) * npos + 1):((i * npos))] <- totals
}
# construct dataframe
coverages <- data.frame(
"group" = group, "position" = position, "count" = count,
stringsAsFactors = FALSE
)
if (normalize) {
scale.factor <- SetIfNull(
x = scale.factor, y = median(x = group.scale.factors)
)
coverages$norm.value <- coverages$count /
group.scale.factors[coverages$group] * scale.factor
} else {
coverages$norm.value <- coverages$count
}
return(coverages)
}
# TabixOutputToDataFrame
#
# Create a single dataframe from list of character vectors
#
# @param reads List of character vectors (the output of \code{\link{scanTabix}})
# @param record.ident Add a column recording which region the reads overlapped
# with
#' @importFrom stringi stri_split_fixed
#' @importFrom S4Vectors elementNROWS
TabixOutputToDataFrame <- function(reads, record.ident = TRUE) {
if (record.ident) {
nrep <- elementNROWS(x = reads)
}
  original_names <- names(x = reads)
reads <- unlist(x = reads, use.names = FALSE)
if (length(x = reads) == 0 | is.null(x = original_names)) {
df <- data.frame(
"chr" = "",
"start" = "",
"end" = "",
"cell" = "",
"count" = ""
)
df <- df[-1, ]
return(df)
}
reads <- stri_split_fixed(str = reads, pattern = "\t")
n <- length(x = reads[[1]])
unlisted <- unlist(x = reads)
e1 <- unlisted[n * (seq_along(along.with = reads)) - (n - 1)]
e2 <- as.numeric(x = unlisted[n * (seq_along(along.with = reads)) - (n - 2)])
e3 <- as.numeric(x = unlisted[n * (seq_along(along.with = reads)) - (n - 3)])
e4 <- unlisted[n * (seq_along(along.with = reads)) - (n - 4)]
e5 <- as.numeric(x = unlisted[n * (seq_along(along.with = reads)) - (n - 5)])
df <- data.frame(
"chr" = e1,
"start" = e2,
"end" = e3,
"cell" = e4,
"count" = e5,
stringsAsFactors = FALSE,
check.rows = FALSE,
check.names = FALSE
)
if (record.ident) {
df$ident <- rep(x = seq_along(along.with = nrep), nrep)
}
return(df)
}
# Extract delimiter information from a string.
#
# From Seurat
#
# Parses a string (usually a cell name) and extracts fields based on a delimiter
#
# @param string String to parse.
# @param field Integer(s) indicating which field(s) to extract. Can be a vector
# of multiple numbers.
# @param delim Delimiter to use, set to underscore by default.
#
# @return A new string, that parses out the requested fields, and (if multiple),
# rejoins them with the same delimiter
#' @importFrom stringi stri_split_fixed
#
ExtractField <- function(string, field = 1, delim = "_") {
fields <- as.numeric(
x = unlist(x = stri_split_fixed(
str = as.character(x = field), pattern = ",")
)
)
if (length(x = fields) == 1) {
return(stri_split_fixed(str = string, pattern = delim)[[1]][field])
}
return(paste(
stri_split_fixed(str = string, pattern = delim)[[1]][fields],
collapse = delim))
}
# Set a default value if an object is null
#
# @param x An object to set if it's null
# @param y The value to provide if x is null
# @return Returns y if x is null, otherwise returns x.
SetIfNull <- function(x, y) {
if (is.null(x = x)) {
return(y)
} else {
return(x)
}
}
# Check if a matrix is empty
#
# From Seurat
#
# Takes a matrix and asks if it's empty (either 0x0 or 1x1 with a value of NA)
#
# @param x A matrix
#
# @return Whether or not \code{x} is empty
#
IsMatrixEmpty <- function(x) {
matrix.dims <- dim(x = x)
matrix.na <- all(matrix.dims == 1) && all(is.na(x = x))
return(all(matrix.dims == 0) || matrix.na)
}
# Find matrix indices corresponding to overlapping genomic ranges
# @param assay.list A list of ChromatinAssay objects
# @param all.ranges Combined genomic ranges for all objects. This should be the
# set of ranges that \code{reduce} was run on to get \code{reduced.ranges}
# @param reduced.ranges A set of reduced genomic ranges containing the rev.map
# information
GetRowsToMerge <- function(assay.list, all.ranges, reduced.ranges) {
revmap <- as.vector(x = reduced.ranges$revmap)
# get indices of ranges that changed
revmap.lengths <- sapply(X = revmap, FUN = length)
changed.ranges <- which(x = revmap.lengths > 1)
grange.string <- GRangesToString(grange = reduced.ranges[changed.ranges])
# preallocate
offsets <- list()
results <- list()
matrix.indices <- vector(
mode = "numeric",
length = length(x = changed.ranges) * 2
)
granges <- vector(
mode = "character",
length = length(x = changed.ranges) * 2
)
for (i in seq_along(along.with = assay.list)) {
indices <- which(x = all.ranges$dataset == i)
offsets[[i]] <- min(indices) - 1
offsets[[i]][[2]] <- max(indices) + 1
results[['matrix']][[i]] <- matrix.indices
results[['grange']][[i]] <- granges
}
# find sets of ranges for each dataset
counter <- vector(mode = "numeric", length = length(x = assay.list))
for (x in seq_along(along.with = changed.ranges)) {
idx <- changed.ranges[[x]]
all.assay <- revmap[[idx]]
for (i in seq_along(along.with = assay.list)) {
this.assay <- all.assay[
(all.assay > offsets[[i]][1]) & (all.assay < offsets[[i]][2])
]
mat.idx <- this.assay - offsets[[i]][1]
mat.idx <- mat.idx[mat.idx < offsets[[i]][2] & mat.idx > 0]
for (y in seq_along(along.with = mat.idx)) {
counter[i] <- counter[i] + 1
results[['matrix']][[i]][[counter[i]]] <- mat.idx[[y]]
results[['grange']][[i]][[counter[i]]] <- grange.string[[x]]
}
}
}
# remove trailing extra values in each vector
for (i in seq_along(along.with = assay.list)) {
results$matrix[[i]] <- results$matrix[[i]][seq_len(length.out = counter[i])]
results$grange[[i]] <- results$grange[[i]][seq_len(length.out = counter[i])]
}
return(results)
}
# Merge rows of count matrices with overlapping genomic ranges
# @param mergeinfo The output of GetRowsToMerge: a list of matrix indices
# and matrix rownames to be merged for each assay
# @param assay.list List of assays
# @param slot Name of the assay slot to pull the matrix from
# @param verbose Display messages
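# @return Returns a list of count matrices, one per assay, with overlapping
# rows summed and rows renamed to the merged intervals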
#' @importFrom utils txtProgressBar setTxtProgressBar
#' @importFrom Matrix rowSums
#' @importMethodsFrom Matrix t
MergeOverlappingRows <- function(
mergeinfo,
assay.list,
slot = "counts",
verbose = TRUE
) {
merge.counts <- list()
for (i in seq_along(along.with = assay.list)) {
# get count matrix
counts <- GetAssayData(object = assay.list[[i]], slot = slot)
if (nrow(x = counts) == 0) {
# no counts, only data
# skip row merge and return empty counts matrices
merge.counts <- lapply(
X = seq_along(along.with = assay.list),
FUN = matrix,
nrow = 0,
ncol = 0
)
return(merge.counts)
}
# transpose for faster access since matrix is column major
counts <- t(x = counts)
# get rows to merge
mrows <- mergeinfo$matrix[[i]]
new.rownames <- mergeinfo$grange[[i]]
nrep <- rle(x = new.rownames)
# allocate
todelete <- c()
newmat <- vector(
mode = "list",
length = length(new.rownames)
)
newmat.names <- vector(
mode = "character",
length = length(x = new.rownames)
)
x <- 1 # row index for matrix
y <- 1 # counter for list index
if (verbose && length(x = nrep$lengths) > 1) {
pb <- txtProgressBar(
min = 1,
max = length(x = nrep$lengths),
style = 3,
file = stderr()
)
}
to.rename.idx <- vector(
mode = "numeric", length = length(x = nrep$lengths)
)
to.rename.names <- vector(
mode = "character", length = length(x = nrep$lengths)
)
idx.counter <- 0
for (j in seq_along(along.with = nrep$lengths)) {
rowrun <- nrep$lengths[[j]]
new.feature.name <- nrep$values[[j]]
index.range <- x:(x + rowrun - 1)
matrix.index <- mrows[index.range]
if (rowrun < 2) {
idx.counter <- idx.counter + 1
# no merge needed, just rename row in-place
# store row indices and names to do the change in one step at the end
to.rename.idx[idx.counter] <- matrix.index
to.rename.names[idx.counter] <- new.feature.name
} else {
# merge multiple rows and add to list
newmat[[y]] <- rowSums(x = counts[, matrix.index])
# mark merged row for deletion
todelete <- c(todelete, matrix.index)
# add row names
newmat.names[y] <- new.feature.name
y <- y + 1
}
if (verbose && length(x = nrep$lengths) > 1) {
setTxtProgressBar(pb = pb, value = j)
}
x <- x + rowrun
}
# remove extra elements in vectors
to.rename.idx <- to.rename.idx[seq_len(length.out = idx.counter)]
to.rename.names <- to.rename.names[seq_len(length.out = idx.counter)]
newmat <- newmat[seq_len(length.out = y - 1)]
newmat.names <- newmat.names[seq_len(length.out = y - 1)]
# transpose back
counts <- t(x = counts)
# rename matrix rows that weren't merged
rownames(counts)[to.rename.idx] <- to.rename.names
if (y == 1) {
# no rows were merged, can return counts
merge.counts[[i]] <- counts
} else if (y == 2) {
# only one element
tomerge <- matrix(data = newmat[[1]], nrow = 1)
colnames(x = tomerge) <- names(x = newmat[[1]])
rownames(x = tomerge) <- newmat.names
tomerge <- tomerge[, colnames(x = counts), drop = FALSE]
counts <- rbind(counts, tomerge)
merge.counts[[i]] <- counts
} else {
# construct sparse matrix
if (verbose) {
message("\nBinding matrix rows")
}
merged.mat <- do.call(what = rbind, args = newmat)
rownames(merged.mat) <- newmat.names
merged.mat <- as(object = merged.mat, Class = "CsparseMatrix")
# remove rows from count matrix that were merged
mat.rows <- seq_len(length.out = nrow(x = counts))
tokeep <- setdiff(mat.rows, todelete)
counts <- counts[tokeep, ]
# add new merged rows to counts
counts <- rbind(counts, merged.mat)
merge.counts[[i]] <- counts
}
}
return(merge.counts)
}
#' @importFrom Matrix sparseMatrix
#' @importFrom S4Vectors elementNROWS
PartialMatrix <- function(tabix, regions, sep = c("-", "-"), cells = NULL) {
# construct sparse matrix for one set of regions
# names of the cells vector can be ignored here, conversion is handled in
# the parent functions
open(con = tabix)
cells.in.regions <- GetCellsInRegion(
tabix = tabix,
region = regions,
cells = cells
)
close(con = tabix)
gc(verbose = FALSE)
nrep <- elementNROWS(x = cells.in.regions)
if (all(nrep == 0) && !is.null(x = cells)) {
# no fragments
# zero for all requested cells
featmat <- sparseMatrix(
dims = c(length(x = regions), length(x = cells)),
i = NULL,
j = NULL
)
rownames(x = featmat) <- GRangesToString(grange = regions)
colnames(x = featmat) <- cells
featmat <- as(object = featmat, Class = "CsparseMatrix")
return(featmat)
} else if (all(nrep == 0)) {
# no fragments, no cells requested
# create empty matrix
featmat <- sparseMatrix(
dims = c(length(x = regions), 0),
i = NULL,
j = NULL
)
rownames(x = featmat) <- GRangesToString(grange = regions)
featmat <- as(object = featmat, Class = "CsparseMatrix")
return(featmat)
} else {
# fragments detected
if (is.null(x = cells)) {
all.cells <- unique(x = unlist(x = cells.in.regions))
cell.lookup <- seq_along(along.with = all.cells)
names(x = cell.lookup) <- all.cells
} else {
cell.lookup <- seq_along(along.with = cells)
names(cell.lookup) <- cells
}
# convert cell name to integer
cells.in.regions <- unlist(x = cells.in.regions)
cells.in.regions <- unname(obj = cell.lookup[cells.in.regions])
all.features <- GRangesToString(grange = regions, sep = sep)
feature.vec <- rep(x = seq_along(along.with = all.features), nrep)
featmat <- sparseMatrix(
i = feature.vec,
j = cells.in.regions,
x = rep(x = 1, length(x = cells.in.regions))
)
featmat <- as(Class = "CsparseMatrix", object = featmat)
rownames(x = featmat) <- all.features[1:max(feature.vec)]
colnames(x = featmat) <- names(x = cell.lookup)[1:max(cells.in.regions)]
# add zero columns for missing cells
if (!is.null(x = cells)) {
featmat <- AddMissingCells(x = featmat, cells = cells)
}
# add zero rows for missing features
missing.features <- all.features[!(all.features %in% rownames(x = featmat))]
if (length(x = missing.features) > 0) {
null.mat <- sparseMatrix(
i = c(),
j = c(),
dims = c(length(x = missing.features), ncol(x = featmat))
)
rownames(x = null.mat) <- missing.features
null.mat <- as(object = null.mat, Class = "CsparseMatrix")
featmat <- rbind(featmat, null.mat)
}
return(featmat)
}
}
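# Example usage (a hedged sketch, not run; the fragment file path, regions, and
# cell barcodes are hypothetical):
# tbx <- Rsamtools::TabixFile(file = "fragments.tsv.gz")
# regions <- StringToGRanges(regions = c("chr1-1-1000", "chr1-2000-3000"))
# mat <- PartialMatrix(
#   tabix = tbx,
#   regions = regions,
#   cells = c("AAACGAAAGCGC-1", "AAACGAAAGGGT-1")
# )
# # mat is a region-by-cell sparse matrix of fragment counts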
# Convert a PFMatrix to a list containing the position matrix and motif name
# @param x A PFMatrix
PFMatrixToList <- function(x) {
if (!requireNamespace("TFBSTools", quietly = TRUE)) {
stop("Please install TFBSTools.
https://www.bioconductor.org/packages/TFBSTools/")
}
position.matrix <- TFBSTools::Matrix(x = x)
name.use <- TFBSTools::name(x = x)
return(list("matrix" = position.matrix, "name" = name.use))
}
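# Example usage (a hedged sketch, not run; the motif ID, name, and counts are
# made up):
# pfm <- TFBSTools::PFMatrix(
#   ID = "MA0001.1",
#   name = "ExampleTF",
#   profileMatrix = matrix(
#     data = c(10L, 0L, 5L, 5L, 0L, 10L, 5L, 5L),
#     nrow = 4,
#     dimnames = list(c("A", "C", "G", "T"))
#   )
# )
# PFMatrixToList(x = pfm)
# # returns list(matrix = <position frequency matrix>, name = "ExampleTF")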
#' @importFrom Matrix rowMeans rowSums
SparseRowVar <- function(x) {
return(rowSums(x = (x - rowMeans(x = x)) ^ 2) / (dim(x = x)[2] - 1))
}
#' @importMethodsFrom Matrix t
SparseColVar <- function(x) {
return(SparseRowVar(x = t(x = x)))
}
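# Example usage (a minimal sketch, not run): both return sample variances
# (denominator n - 1) and should agree with the dense equivalents
# m <- Matrix::rsparsematrix(nrow = 5, ncol = 100, density = 0.2)
# all.equal(SparseRowVar(x = m), apply(X = as.matrix(x = m), MARGIN = 1, FUN = var))
# all.equal(SparseColVar(x = m), apply(X = as.matrix(x = m), MARGIN = 2, FUN = var))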
# Replace non-zero entries in a sparse matrix with shifted non-zero ranks
#
# This method creates a rank matrix for a sparse matrix X using the following
# approach:
# 1. Rank the non-zero entries within each column
# 2. Add (z - 1) / 2 to those ranks, where z is the number of zeros in the
#    column, leaving the zero entries at zero
# In the full rank matrix the z zeros of a column would all share the average
# rank (z + 1) / 2 (assuming the non-zero entries are positive, as for count
# data), so every entry of the shifted column differs from its full rank by the
# same per-column constant (z + 1) / 2. Column-wise covariances and Pearson
# correlations of the shifted matrix therefore equal those of the full rank
# matrix, so it can be used to compute Spearman correlation.
SparsifiedRanks <- function(X) {
if (!inherits(x = X, what = "CsparseMatrix")) {
X <- as(object = X, Class = "CsparseMatrix")
}
non_zeros_per_col <- diff(x = X@p)
n_zeros_per_col <- nrow(x = X) - non_zeros_per_col
offsets <- (n_zeros_per_col - 1) / 2
x <- X@x
## split entries to columns
col_lst <- split(
x = x,
f = rep.int(x = 1:ncol(x = X), times = non_zeros_per_col)
)
## calculate sparsified ranks and do shifting
sparsified_ranks <- unlist(
x = lapply(
X = seq_along(col_lst),
FUN = function(i) rank(x = col_lst[[i]]) + offsets[i]
)
)
## Create template rank matrix
X.ranks <- X
X.ranks@x <- sparsified_ranks
return(X.ranks)
}
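# Worked example of the shift (illustrative): a column c(0, 0, 0, 2, 5) has
# z = 3 zeros; its full ranks are c(2, 2, 2, 4, 5), with the zeros sharing the
# average rank (z + 1) / 2 = 2. SparsifiedRanks leaves the zeros at 0 and
# stores rank(c(2, 5)) + (z - 1) / 2 = c(2, 3) for the non-zero entries, i.e.
# every entry of the full rank column minus the constant 2, so per-column
# correlations and covariances are unchanged.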
SparseSpearmanCor <- function(X, Y = NULL, cov = FALSE) {
if (!requireNamespace(package = "qlcMatrix", quietly = TRUE)) {
stop("Please install qlcMatrix: install.packages('qlcMatrix')")
}
# Get sparsified ranks
rankX <- SparsifiedRanks(X = X)
if (is.null(x = Y)) {
# calculate Pearson correlation on the rank matrix
return(qlcMatrix::corSparse(X = rankX, cov = cov))
}
rankY <- SparsifiedRanks(X = Y)
return(qlcMatrix::corSparse(X = rankX, Y = rankY, cov = cov))
}
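# Example usage (a hedged sketch, not run): for non-negative count-like data
# the result should match base Spearman correlation computed on dense matrices
# m1 <- Matrix::Matrix(data = rpois(n = 300, lambda = 0.5), nrow = 100, sparse = TRUE)
# m2 <- Matrix::Matrix(data = rpois(n = 400, lambda = 0.5), nrow = 100, sparse = TRUE)
# all.equal(
#   target = SparseSpearmanCor(X = m1, Y = m2),
#   current = cor(x = as.matrix(x = m1), y = as.matrix(x = m2), method = "spearman"),
#   check.attributes = FALSE
# )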