Nothing
#' Given a set of regions for a chromosome, compute the coverage matrix for a
#' given SRA study.
#'
#' Given a set of genomic regions as created by [expressed_regions], this
#' function computes the coverage matrix for a library size of 40 million 100 bp
#' reads for a given SRA study.
#'
#' @inheritParams expressed_regions
#' @param regions A [GRanges-class][GenomicRanges::GRanges-class] object with regions
#' for `chr` for which to calculate the coverage matrix.
#' @param chunksize A single integer vector defining the chunksize to use for
#' computing the coverage matrix. Regions will be split into different chunks
#' which can be useful when using a parallel instance as defined by
#' `bpparam`.
#' @param bpparam A [BiocParallelParam-class][BiocParallel::BiocParallelParam-class] instance which
#' will be used to calculate the coverage matrix in parallel. By default,
#' [SerialParam-class][BiocParallel::SerialParam-class] will be used.
#' @param verboseLoad If `TRUE` basic status updates for loading the data
#' will be printed.
#' @param scale If `TRUE`, the coverage counts will be scaled to read
#' counts based on a library size of 40 million reads. Set `scale` to
#' `FALSE` if you want the raw coverage counts. The scaling method is by
#' AUC, as in the default option of [scale_counts].
#' @param round If `TRUE`, the counts are rounded to integers. Set to
#' `TRUE` if you want to match the defaults of [scale_counts].
#'
#'
#' @return A [RangedSummarizedExperiment-class][SummarizedExperiment::RangedSummarizedExperiment-class]
#' object with the counts stored in the assays slot.
#'
#' @details When using `outdir = NULL` the information will be accessed
#' from the web on the fly. If you encounter internet access problems, it might
#' be best to first download the BigWig files using [download_study]. This
#' might be the best option if you are accessing all chromosomes for a given
#' project and/or are thinking of using different sets of `regions` (for
#' example, from different cutoffs applied to [expressed_regions]).
#' Alternatively check the `SciServer` section on the vignette to see
#' how to access all the recount data via a R Jupyter Notebook.
#'
#' If you have `bwtool` installed, you can use
#' <https://github.com/LieberInstitute/recount.bwtool> for faster results.
#' Note that you will need to run [scale_counts] after running
#' `coverage_matrix_bwtool()`.
#'
#' @author Leonardo Collado-Torres
#' @export
#'
#' @importFrom utils read.table
#' @import derfinder GenomicRanges RCurl BiocParallel
#' SummarizedExperiment S4Vectors
#'
#' @seealso [download_study], [findRegions][derfinder::findRegions],
#' [railMatrix][derfinder::railMatrix]
#'
#' @examples
#'
#' if (.Platform$OS.type != "windows") {
#' ## Reading BigWig files is not supported by rtracklayer on Windows
#' ## Define expressed regions for study DRP002835, chrY
#' regions <- expressed_regions("DRP002835", "chrY",
#' cutoff = 5L,
#' maxClusterGap = 3000L
#' )
#'
#' ## Now calculate the coverage matrix for this study
#' rse <- coverage_matrix("DRP002835", "chrY", regions)
#'
#' ## One row per region
#' identical(length(regions), nrow(rse))
#' }
coverage_matrix <- function(project, chr, regions, chunksize = 1000,
bpparam = NULL, outdir = NULL, chrlen = NULL, verbose = TRUE,
verboseLoad = verbose, scale = TRUE, round = FALSE, ...) {
## Check inputs
stopifnot(is.character(project) & length(project) == 1)
stopifnot(is.character(chr) & length(chr) == 1)
stopifnot((is.numeric(chunksize) | is.integer(chunksize)) & length(chunksize) == 1)
## Windows-specific info
if (.Platform$OS.type == "windows") {
warning("rtracklayer does not support importing BigWig files on Windows, so this function might not work")
}
## Use table from the package
url_table <- recount::recount_url
## Subset url data
url_table <- url_table[url_table$project == project, ]
if (nrow(url_table) == 0) {
stop("Invalid 'project' argument. There's no such 'project' in the recount_url data.frame.")
}
## Find chromosome length if absent
if (is.null(chrlen)) {
chrinfo <- read.table("https://raw.githubusercontent.com/nellore/runs/master/gtex/hg38.sizes",
col.names = c("chr", "size"), colClasses = c(
"character",
"integer"
)
)
chrlen <- chrinfo$size[chrinfo$chr == chr]
stopifnot(length(chrlen) == 1)
}
samples_i <- which(grepl("[.]bw$", url_table$file_name) & !grepl(
"mean",
url_table$file_name
))
## Check if data is present, otherwise download it
if (!is.null(outdir)) {
## Check sample files
sampleFiles <- sapply(samples_i, function(i) {
file.path(outdir, "bw", url_table$file_name[i])
})
if (any(!file.exists(sampleFiles))) {
download_study(
project = project, type = "samples", outdir = outdir,
download = TRUE, ...
)
}
## Check phenotype data
phenoFile <- file.path(outdir, paste0(project, ".tsv"))
if (!file.exists(phenoFile)) {
download_study(
project = project, type = "phenotype",
outdir = outdir, download = TRUE, ...
)
}
} else {
sampleFiles <- download_study(
project = project, type = "samples",
download = FALSE
)
phenoFile <- download_study(
project = project, type = "phenotype",
download = FALSE
)
}
## Read pheno data
pheno <- .read_pheno(phenoFile, project)
## Get sample names
m <- match(url_table$file_name[samples_i], pheno$bigwig_file)
## Match the pheno with the samples
pheno <- pheno[m, ]
if (project != "TCGA") {
names(sampleFiles) <- pheno$run
} else {
names(sampleFiles) <- pheno$gdc_file_id
}
## Define library size normalization factor
if (scale) {
targetSize <- 40e6
totalMapped <- pheno$auc[m]
mappedPerXM <- totalMapped / targetSize
} else {
mappedPerXM <- rep(1, nrow(pheno))
}
## Keep only regions from the chr in question
regions <- regions[seqnames(regions) == chr]
## Split regions into chunks
nChunks <- length(regions) %/% chunksize
if (length(regions) %% chunksize > 0) nChunks <- nChunks + 1
names(regions) <- seq_len(length(regions))
## Split regions into chunks
if (nChunks == 1) {
regs_split <- list(regions)
names(regs_split) <- "1"
} else {
regs_split <- split(regions, cut(seq_len(length(regions)),
breaks = nChunks, labels = FALSE
))
}
## Define bpparam
if (is.null(bpparam)) bpparam <- BiocParallel::SerialParam()
## Load coverage data
resChunks <- lapply(regs_split, derfinder:::.railMatChrRegion,
sampleFiles = sampleFiles, chr = chr, mappedPerXM = mappedPerXM,
L = 1, verbose = verbose, BPPARAM.railChr = bpparam,
verboseLoad = verboseLoad, chrlen = chrlen
)
## Group results from chunks
coverageMatrix <- do.call(rbind, lapply(resChunks, "[[", "coverageMatrix"))
if (round) coverageMatrix <- round(coverageMatrix, 0)
## Build a RSE object
rse <- SummarizedExperiment::SummarizedExperiment(
assays = list("counts" = coverageMatrix),
colData = DataFrame(pheno), rowRanges = regions
)
## Finish
return(rse)
}
## Helper function for reading the phenotype files
.read_pheno <- function(phenoFile, project) {
if (project %in% c("SRP012682", "TCGA")) {
subsets <- c("SRP012682" = "gtex", "TCGA" = "tcga")
res <- all_metadata(subsets[project], verbose = FALSE)
} else {
info <- readLines(phenoFile)
res <- read.table(
text = info[grepl(
paste0("^project|^", project),
info
)], header = TRUE, stringsAsFactors = FALSE, sep = "\t",
comment.char = "", quote = ""
)
}
return(res)
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.