Nothing
#' Count Overlap of ATAC-seq Fragments
#'
#' @param file Filename of the file for ATAC-seq fragments.
#' The file must be block gzipped (using the \code{bgzip} command)
#' and accompanied with the index file (made using the \code{tabix} command).
#' The uncompressed file must be a tab delimited file,
#' where each row represents one fragment.
#' The first four columns are chromosome name, start position, end position,
#' and barcode (i.e., name) of the cell including the fragment.
#' The remaining columns are ignored.
#' See vignette for details.
#' @param targetregions GRanges object for the regions where overlaps are counted.
#' Usually all of the autosomes.
#' If there is memory problem, split a chromosome into smaller chunks,
#' for example by 10 Mb.
#' The function loads each element of \code{targetregions} sequentially,
#' and smaller elements require less memory.
#' @param excluderegions GRanges object for the regions to be excluded.
#' Simple repeats in the genome should be listed here,
#' because repeats can cause false overlaps.
#' A fragment is discarded if its 5' or 3' end is located in \code{excluderegions}.
#' If \code{NULL}, fragments are not excluded by this criterion.
#' @param targetbarcodes Character vector for the barcodes of cells to be analyzed,
#' such as those passing quality control.
#' If \code{NULL}, all barcodes in the input file are analyzed.
#' @param Tn5offset Numeric vector of length two.
#' The enzyme for ATAC-seq is a homodimer of Tn5.
#' The transposition sites of two Tn5 proteins are 9 bp apart,
#' and the (representative) site of accessibility is in between.
#' If the start and end position of your input file is taken from BAM file,
#' set the paramater to \code{c(4, -5)} to adjust the offset.
#' Alternatively, values such as \code{c(0, -9)} could generate similar results;
#' what matters the most is the difference between the two numbers.
#' The fragments.tsv.gz file generated by 10x Cell Ranger already adjusts the shift
#' but is recorded as a BED file. In this case, use \code{c(1, 0)} (default value).
#' If unsure, set to \code{"guess"},
#' in which case the program returns a guess.
#' @param barcodesuffix Add suffix to barcodes per targetregions.
#' @param dobptonext (experimental feature) Whether to compute smoothed distance
#' to the next fragment (irrelevant to BC) as bptonext,
#' which is the inverse of chromatin accessibility, and append as 9th to 14th columns.
#' @return A tibble with each row corresponding to a cell.
#' For each cell, its barcode, the total count of the fragments \code{nfrag},
#' and the count distinguished by overlap depth are given.
#'
#' @importFrom dplyr arrange desc distinct filter group_by lag left_join mutate n rename summarize ungroup
#' @importFrom GenomicRanges findOverlaps makeGRangesFromDataFrame
#' @importFrom rlang .data
#' @importFrom tidyr pivot_wider
#' @importFrom Rsamtools scanTabix TabixFile
#' @importFrom stats ksmooth
#' @importFrom utils read.csv setTxtProgressBar txtProgressBar
#' @export
fragmentoverlapcount = function (file,
targetregions,
excluderegions = NULL,
targetbarcodes = NULL,
Tn5offset = c(1, 0),
barcodesuffix = NULL,
dobptonext = FALSE) {
fragmentoverlaplist = list()
tbx = TabixFile(file = file)
pb = txtProgressBar(max = length(targetregions), style = 3)
for (i in 1:length(targetregions)) {
# Load fragments.
res = scanTabix(tbx, param = targetregions[i])
if (length(res[[1]]) == 0) { next() }
frags = read.csv(textConnection(res[[1]]),
sep = "\t",
header = FALSE)
frags = frags[, 1:4]
colnames(frags) = c("chr", "start", "end", "BC")
rm(res)
if (! is.null(targetbarcodes)) {
frags = frags[frags$BC %in% targetbarcodes, ]
}
if (nrow(frags) == 0) { next() }
# Discard "semi-duplicate" fragments;
# Hypothesized Tn5 transposition only at one strand.
frags = frags %>%
group_by(.data$BC) %>%
# Between chrX:100-200 and chrX:100-300, only retain first
arrange(.data$start, .data$end) %>%
distinct(.data$start, .keep_all = TRUE) %>%
# Between chrX:100-300 and chrX:200-300, only retain last
arrange(desc(.data$end), desc(.data$start)) %>%
distinct(.data$end, .keep_all = TRUE) %>%
ungroup() %>%
arrange(.data$start, .data$end, .data$BC)
# Discard fragment if 5' or 3' is located in excluderegions.
if (! is.null(excluderegions)) {
query = makeGRangesFromDataFrame(
data.frame(
seqnames = frags$chr,
start = frags$start,
end = frags$start))
x = is.na(findOverlaps(query, excluderegions, select = "first"))
query = makeGRangesFromDataFrame(
data.frame(
seqnames = frags$chr,
start = frags$end,
end = frags$end))
x = x & is.na(findOverlaps(query, excluderegions, select = "first"))
frags = frags[x, ]
if (nrow(frags) == 0) { next() }
}
# Adjust Tn5 site offset
if (identical(Tn5offset, "guess")) {
x = frags %>%
group_by(.data$BC) %>%
mutate(overlap = (lag(.data$end) - .data$start + 1)) %>%
ungroup()
x = x$overlap
x = x[abs(x) <= 18]
if (length(x) < 20) {
stop('Error: datasize is too small for guessing Tn5offset')
}
x = table(x)
x = as.numeric(names(x)[which.max(x)])
x = c(0, -x)
x = x - round(mean(x))
setTxtProgressBar(pb, length(targetregions))
close(pb)
return(x)
} else {
frags$start = frags$start + Tn5offset[1]
frags$end = frags$end + Tn5offset[2]
}
# Count overlap at 5' end of each fragment.
frags = frags %>%
group_by(.data$BC) %>%
mutate(overlapcount = .overlapwithprecedingcount(.data$start, .data$end, TRUE)) %>%
ungroup()
# Summarize per BC.
fragsbyBC = frags %>%
group_by(.data$BC) %>%
summarize(nfrags = n(),
depth1 = sum(.data$overlapcount == 0),
depth2 = sum(.data$overlapcount == 1),
depth3 = sum(.data$overlapcount == 2),
depth4 = sum(.data$overlapcount == 3),
depth5 = sum(.data$overlapcount == 4),
depth6 = sum(.data$overlapcount == 5))
if (dobptonext) {
# Compute smoothed distance to the next fragment (irrelevant to BC) as bptonext,
# which is the inverse of chromatin accessibility.
compute_smoothed_distance =
function(frags) {
if (nrow(frags) > 1) {
smoothed_starts = ksmooth(
1:nrow(frags),
frags$start,
bandwidth = 200,
n.points = nrow(frags))$y
differences = diff(smoothed_starts)
differences = c(differences, differences[length(differences)])
frags$bptonext = pmax(differences, 0)
} else {
frags$bptonext = Inf
}
return(frags)
}
# Convert bptonext into a list format
bptonext_as_list =
function(frags) {
list_data = frags %>%
mutate(depth = .data$overlapcount + 1) %>%
dplyr::filter(.data$depth <= 6) %>%
group_by(.data$BC, .data$depth) %>%
summarize(bptonext = list(.data$bptonext), .groups = "drop")
widened_data = pivot_wider(
list_data,
names_prefix = "bptonextdepth",
names_from = "depth",
values_from = "bptonext"
)
for (i in setdiff(paste0("bptonextdepth", 1:6), colnames(widened_data))) {
widened_data[[i]] = list(NULL)
}
return(widened_data)
}
frags = compute_smoothed_distance(frags)
bptonext_list_data = bptonext_as_list(frags)
fragsbyBC = left_join(fragsbyBC, bptonext_list_data, by = 'BC')
}
if (! is.null(barcodesuffix)) {
fragsbyBC$BC = paste0(fragsbyBC$BC, barcodesuffix[i])
}
fragmentoverlaplist = c(fragmentoverlaplist, list(fragsbyBC))
setTxtProgressBar(pb, i)
}
close(pb)
if (identical(fragmentoverlaplist, list())) {
stop('Error: no fragments remained after filtering')
}
if (dobptonext) {
fragmentoverlap =
do.call(rbind, fragmentoverlaplist) %>%
group_by(.data$BC) %>%
summarize(nfrags = sum(.data$nfrags),
depth1 = sum(.data$depth1),
depth2 = sum(.data$depth2),
depth3 = sum(.data$depth3),
depth4 = sum(.data$depth4),
depth5 = sum(.data$depth5),
depth6 = sum(.data$depth6),
bptonextdepth1 = list(do.call(c, .data$bptonextdepth1)),
bptonextdepth2 = list(do.call(c, .data$bptonextdepth2)),
bptonextdepth3 = list(do.call(c, .data$bptonextdepth3)),
bptonextdepth4 = list(do.call(c, .data$bptonextdepth4)),
bptonextdepth5 = list(do.call(c, .data$bptonextdepth5)),
bptonextdepth6 = list(do.call(c, .data$bptonextdepth6)))
} else {
fragmentoverlap =
do.call(rbind, fragmentoverlaplist) %>%
group_by(.data$BC) %>%
summarize(nfrags = sum(.data$nfrags),
depth1 = sum(.data$depth1),
depth2 = sum(.data$depth2),
depth3 = sum(.data$depth3),
depth4 = sum(.data$depth4),
depth5 = sum(.data$depth5),
depth6 = sum(.data$depth6))
}
fragmentoverlap = fragmentoverlap %>%
rename(barcode = "BC")
return(fragmentoverlap)
}
# A utility function.
# The fragments must be sorted by start, end.
.overlapwithprecedingcount =
function (start, end, include) {
ct = rep(NA, length(start))
if (length(include) == 1) {
include = rep(include, length(start))
}
unfinishedends = c()
for (i in 1:length(start)) {
if (!is.na(include[i]) & include[i]) {
unfinishedends = unfinishedends[unfinishedends >= start[i]]
ct[i] = length(unfinishedends)
unfinishedends = c(unfinishedends, end[i])
}
}
return(ct)
}
# A utility function.
# Computes a "capped" version of the log-transformed T2T1 values.
# The capping is done based on the interquartile range of the log-transformed values.
.cap = function (logT2T1) {
if (is.finite(quantile(logT2T1, 0.25))) {
x = 2 * quantile(logT2T1, 0.75) -
quantile(logT2T1, 0.5)
logT2T1capped = pmin(logT2T1, x)
x = 2 * quantile(logT2T1, 0.25) -
quantile(logT2T1, 0.5)
logT2T1capped = pmax(logT2T1capped, x)
# Although unlikely, adjust if -Inf remains.
x = min(logT2T1capped[is.finite(logT2T1capped)])
logT2T1capped = pmax(logT2T1capped, x)
} else {
warning("1st Qu. == -Inf")
x = min(logT2T1[is.finite(logT2T1)])
logT2T1capped = pmax(logT2T1, x)
}
return(logT2T1capped)
}
#' Infer Ploidy from ATAC-seq Fragment Overlap
#'
#' @param fragmentoverlap Frequency of fragment overlap in each cell
#' computed by the function \code{fragmentoverlapcount}.
#' @param levels Possible values of ploidy. For example,
#' \code{c(2, 4)} if the cells can be diploids or tetraploids.
#' The values must be larger than one.
#' @param s Seed for random numbers used in EM algorithm.
#' @param epsilon Convergence criterion for the EM algorithm.
#' @param subsamplesize EM algorithm becomes difficult to converge
#' when the number of cells is very large.
#' By setting the parameter (e.g. to 1e4),
#' we can run EM algorithm iteratively,
#' first for \code{subsamplesize} randomly sampled cells,
#' next for twice the number of cells in repetition.
#' The inferred lambda/theta parameters are used as the initial value
#' in the next repetition.
#' @param dobayes (experimental feature) Whether to perform Bayesian inference,
#' which takes long computation time.
#' @param prop Proportion of peaks that can be fitted with binomal
#' distribution in ploidy.bayes. The rest of peaks are allowed to
#' have depth larger than the ploidy.
#' @return A data.frame with each row corresponding to a cell.
#' For each cell, its barcode, ploidy inferred by 1) moment method,
#' 2) the same with additional K-means clustering,
#' 3) EM algorithm of mixture, and, optionally,
#' 4) Bayesian inference are given.
#' I recommend using \code{ploidy.moment} or \code{ploidy.em}.
#' When \code{fragmentoverlapcount} was computed with \code{dobptonext=TRUE},
#' we only use the chromosomal sites with chromatin accessibility in top 10%.
#' This requires longer computation time.
#'
#' @importFrom matrixStats colMins
#' @importFrom mixtools multmixEM
#' @import nimble
#' @importFrom stats dbinom dpois kmeans optimize quantile
#' @export
ploidy = function (fragmentoverlap,
levels,
s = 100,
epsilon = 1e-08,
subsamplesize = NULL,
dobayes = FALSE,
prop = 0.9) {
if (min(levels) <= 1) {
stop('Error: elements of levels must be larger than one')
}
### SUBTOTAL fragmentoverlap BY bptonext CLASSES
if (ncol(fragmentoverlap) > 8) {
# Unnest bptonext, classify by breaks, and count.
countbybptonext = function (data, breaks) {
colnames(data) = c("barcode", "bptonext")
data = tidyr::unnest(data, "bptonext")
if (nrow(data) > 0) {
data$bptonextclass = cut(data$bptonext, breaks, include.lowest = TRUE)
} else {
data$bptonextclass = character(0)
}
data = data %>%
group_by(.data$barcode, .data$bptonextclass) %>%
summarize(n = n(), .groups = "drop")
return(data)
}
# Compute deciles of bptonextall.
bptonextall =
do.call(c,
lapply(fragmentoverlap[, 9:14],
function (x) { do.call(c, x) }))
breaks = unique(quantile(bptonextall, seq(0, 1, 0.1)))
rm(bptonextall)
gc()
# grouping in rows by value: barcode, bptonextclass, depth
# value: n
fragmentoverlapbybptonext =
do.call(
rbind,
lapply(
1:6,
function (d) {
x = countbybptonext(
fragmentoverlap[, c("barcode", paste0("bptonextdepth", d))],
breaks)
x$depth = d
return(x)
}))
# grouping in rows by value: barcode, bptonextclass
# grouping in columns by name: depth
# value: n
fragmentoverlapbybptonext =
pivot_wider(
fragmentoverlapbybptonext,
names_from = "depth",
names_prefix = "depth",
values_from = "n")
fragmentoverlapbybptonext[is.na(fragmentoverlapbybptonext)] = 0
bptonextlevels =
levels(fragmentoverlapbybptonext$bptonextclass)
# grouping in list: bptonextclass
# grouping in rows by value: barcode
# grouping in columns by name: depth
# value: n
fragmentoverlapbybptonext = lapply(
bptonextlevels,
function (l) {
x =
fragmentoverlapbybptonext[
fragmentoverlapbybptonext$bptonextclass == l, ]
# Make barcode consistent with fragmentoverlap
x = x[match(fragmentoverlap$barcode, x$barcode), ]
x[is.na(x)] = 0
m = as.matrix(x[, -c(1:2)])
rownames(m) = x$barcode
return(m)
})
names(fragmentoverlapbybptonext) = bptonextlevels
}
if (ncol(fragmentoverlap) <= 8) {
x = as.matrix(fragmentoverlap[, 3:8])
T1 = as.numeric(x %*% seq(1, ncol(x)))
T2 = as.numeric(x %*% (seq(1, ncol(x))^2))
logT2T1 = log(T2 / T1 - 1)
x = inferpmoment(.cap(logT2T1), levels)
p.moment = x$p.moment
offset = x$offset
# exp(offset) is the estimate for 1/s
p.momentfractional = exp(logT2T1) * exp(offset) + 1
p.kmeans = inferpkmeans(fragmentoverlap, levels, p.moment)
p.em = inferpem(fragmentoverlap, levels, s, epsilon, subsamplesize)
if (dobayes) {
ploidy.bayes = inferpbayes(as.matrix(fragmentoverlap[, 3:8]), levels, prop, p.moment)
}
} else {
# grouping in list: bptonextclass
# name: barcode
# value: logT2T1
logT2T1bybptonext = lapply(
fragmentoverlapbybptonext,
function (x) {
T1 = as.numeric(x %*% seq(1, ncol(x)))
T2 = as.numeric(x %*% (seq(1, ncol(x))^2))
T2T1 = T2 / T1 - 1
logT2T1 = log(T2T1)
names(logT2T1) = rownames(x)
return(logT2T1)
}
)
x = inferpmoment(.cap(logT2T1bybptonext[[1]]), levels)
p.moment = x$p.moment
offset = x$offset
p.momentfractional = exp(logT2T1bybptonext[[1]]) * exp(offset) + 1
fragmentoverlap1 = cbind(
data.frame(
barcode = fragmentoverlap$barcode,
nfrags = rowSums(fragmentoverlapbybptonext[[1]])),
fragmentoverlapbybptonext[[1]])
p.kmeans = inferpkmeans(fragmentoverlap1, levels, p.moment)
p.em = inferpem(fragmentoverlap1, levels, s, epsilon, subsamplesize)
if (dobayes) {
x = fragmentoverlapbybptonext[[1]]
for (j in setdiff(1:6, 1:ncol(x))) { x = cbind(x, 0) } # pad if max depth < 6
ploidy.bayes = inferpbayes(x, levels, prop, p.moment)
}
}
output = data.frame(
barcode = fragmentoverlap$barcode,
ploidy.moment = p.moment,
ploidy.momentfractional = p.momentfractional,
ploidy.kmeans = p.kmeans,
ploidy.em = p.em)
if (dobayes) {
output$ploidy.bayes = ploidy.bayes$ploidy.bayes
}
return(output)
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.