id <- function(cov) BiocGenerics::match(cov, unique(cov))
#' Add 'phase' column to a set of reads, given a vector of cds starts
#'
#' Given rpfs, and named vectors of starts and stops, this adds the 'phase'
#' to the reads
#' @keywords Ribostan
#' @author Dermot Harnett, \email{dermot.p.harnett@gmail.com}
#'
#' @param gr GRanges object with RPFs
#' @param cdsstarts named vector of cds starts
#'
#' @return the rpf granges but wth a 'phase' column
# TODO update this so it works with uORFs
addphase <- function(gr, cdsstarts) {
cdsstarts <- cdsstarts[as.vector(seqnames(gr))]
gr$phase <- unlist((GenomicRanges::start(gr) - cdsstarts) %% 3)
gr
}
#' Select those read lengths which make up 95% of the reads
#'
#' This function takes in a vector of the lengths of each read, and determines
#' which read lengths should be included in order to filter out the 2.5%
#' shortest/longest reads.
#'
#' @keywords Ribostan
#' @author Dermot Harnett, \email{dermot.p.harnett@gmail.com}
#'
#' @param readlens Numeric; numeric vector
#' @return a numeric integer vector, such that selecting reads with these
#' lengths will preserve at least 95%
#' of reads
get_readlens <- function(readlens) {
readlens %>%
as.numeric() %>%
table() %>%
cumsum() %>%
{
. / max(.)
} %>%
Filter(f = function(x) x > 0.025) %>%
{
Filter(f = function(x) 1 - x > 0.025, .)
} %>%
names() %>%
as.numeric()
}
#' Read a bam file containing Ribosomal Footprints
#'
#' This function
#'
#' @keywords Ribostan
#' @author Dermot Harnett, \email{dermot.p.harnett@gmail.com}
#'
#' @param ribobam String; full path to html report file.
#' @param strip_seqnames whether the function should remove all text after the
#' first '|', useful if aligning to gencode fastas#
#' Defaults to \code{TRUE}
#' @param which Select which reads to include (e.g. those that overlap
#' with start and stop positions)
#' @return This function returns a granges object with the read names in the
#' names slot and a metadata column
#' denoting readlength.
#'
#' @details This function reads in a bam file containing ribosomal footprints.
#' It uses only reads without splice sites and reads which align to the positive
#' strand, as it's designed to work on transcriptomic alignments.
#'
#' @seealso \code{\link{get_cds_reads}}, \code{\link{get_readlens}}
read_ribobam <- function(ribobam, which, strip_seqnames = TRUE) {
# What does which do here? - Gabriel
#
flags <- Rsamtools::scanBamFlag(
isUnmappedQuery = FALSE,
isMinusStrand = FALSE
)
bparam <- Rsamtools::ScanBamParam(simpleCigar = TRUE, flags)
ribogr <- GenomicAlignments::readGAlignments(
ribobam,
use.names = TRUE, param = bparam
)
#
readlens <- get_readlens(GenomicAlignments::qwidth(ribogr))
wfilt <- GenomicAlignments::qwidth(ribogr) <= max(readlens) &
(min(readlens) <= GenomicAlignments::qwidth(ribogr))
ribogr <- ribogr[wfilt]
# this is the number get_bamdf gets in py 4628644
mcols(ribogr)$readlen <- GenomicAlignments::qwidth(ribogr)
ribogr <- as(ribogr, "GenomicRanges")
# strip seqnames for when bam files contains a large fasta header
if (strip_seqnames) {
seqlevels(ribogr) <- str_replace(seqlevels(ribogr), "\\|.*", "")
}
# name the reads with integers as they appear in the sorted object
ribogr <- sort(ribogr)
names(ribogr) <- id(names(ribogr))
ribogr
}
#' Filter an RPF GR for overlap with coding sequences
#'
#' @keywords Ribostan
#' @author Dermot Harnett, \email{dermot.p.harnett@gmail.com}
#'
#' @param cov GRanges; A granges object with
#' @param anno An annotation object
#'
#' @return This function returns a granges object with the read names in the
#' names slot and a metadata column
#' denoting readlength.
#'
#' @details This function reads in a bam file containing ribosomal footprints.
#' It uses only reads without splice sites
#' and reads which align to the positive strand, as it's designed to work on
#' transcriptomic alignments.
#'
#' @seealso \code{\link{get_cds_reads}}, \code{\link{get_readlens}}
get_cds_reads <- function(cov, anno) {
trspacecds <- anno$trspacecds
# load genomic or transcriptomic bam
genomicbam <- seqnames(cov) %>%
head(1) %>%
as.vector() %>%
str_detect("chr")
if (genomicbam) {
cov <- cov %>%
resize(1) %>%
GenomicFeatures::mapToTranscripts()
cov$readlen <- mcols(cov)$readlen[cov$xHits]
cov$name <- names(cov)[cov$xHits]
cov <- subset(cov, between(readlen, min(readlens), max(readlens)))
cov <- resize(cov, 1, "start")
cov <- sort(cov)
} else {
cov <- cov
cov <- sort(cov)
}
sharedseqnames <- unique(seqnames(trspacecds)) %>%
unlist() %>%
intersect(unique(seqnames(cov)))
cov <- cov %>% keepSeqlevels(sharedseqnames, pruning.mode = "coarse")
seqlevels(cov) <- seqlevels(trspacecds)
seqinfo(cov) <- seqinfo(trspacecds)
cov <- cov %>% IRanges::subsetByOverlaps(trspacecds)
}
#' Merge the seqlevels of two Granges objects
#' This function checks two GRanges objects have compatible ranges, and then
#' adds in to gr1 what's missing from gr2.
#'
#' @keywords Ribostan
#' @author Dermot Harnett, \email{dermot.p.harnett@gmail.com}
#'
#' @param gr1 - granges object
#' @param gr2 - granges object
#' @return gr1 but with expanded seqinfo
mergeseqlevels <- function(gr1, gr2) {
unseqs <- union(seqlevels(gr1), seqlevels(gr2))
shared <- intersect(seqlevels(gr1), seqlevels(gr2))
s1unique <- setdiff(seqlevels(gr1), shared)
stopifnot(
all(seqinfo(gr1)[shared]@seqlengths == seqinfo(gr2)[shared]@seqlengths)
)
mergeseqinfo <- bind_rows(
as.data.frame(seqinfo(gr1)[s1unique]),
as.data.frame(seqinfo(gr2))
)
mergeseqinfo <- as(mergeseqinfo, "Seqinfo")
seqlevels(gr1) <- seqlevels(mergeseqinfo)
seqinfo(gr1) <- mergeseqinfo
gr1
}
#' convert a granges object of footprints to a psites object
#'
#' This function applies offsets to RPF data, attributing to each RPF
#' an offset specific a readlength and phase. Where phase is ambigous,
#' because more than one ORF overlaps the RPF, the RPF has both offsets
#' applied, and in rare cases where more than one possible psite location
#' applies, one is randomly selected.
#' @keywords Ribostan
#' @author Dermot Harnett, \email{dermot.p.harnett@gmail.com}
#'
#' @param rpfs - granges object with positions of footprints, in transcript
#' space
#' @param offsets_df - a data frame with numeric columns readlen,phase,offset
#' @param anno - an annotation object with at least one ORF per transcript,
#' the function uses the 'uORF' slot if it's available
#' @return a GRanges object of width-1 psites with columns readlen,
#' phase and offset
#' @examples
#' data(chr22_anno)
#' data(rpfs)
#' data(offsets_df)
#' psites <- get_psite_gr(rpfs, offsets_df, anno)
#' @export
get_psite_gr <- function(rpfs, offsets_df, anno) {
orfs <- c(anno$trspacecds)
#
rpfs <- mergeseqlevels(rpfs, orfs)
orfs <- mergeseqlevels(orfs, rpfs)
#
orfov_ind <- findOverlaps(rpfs, orfs, select = "first", ignore.strand = TRUE)
rpfs$orf <- S4Vectors::Rle(names(orfs)[orfov_ind])
rpfs$phase <- GenomicRanges::start(rpfs) - GenomicRanges::start(orfs)[orfov_ind]
rpfs$phase <- rpfs$phase %% 3
#
# now get offsets for all
offsetcols <- c("readlen", "phase", "p_offset")
rpfs$p_offset <- rpfs %>%
{
as.data.frame(mcols(.)[, c("readlen", "phase")])
} %>%
left_join(
offsets_df %>% select(one_of(offsetcols)),
by = c("readlen", "phase")
) %>%
.$p_offset
rpfs <- rpfs %>% subset(!is.na(p_offset))
orfov <- GenomicRanges::countOverlaps(rpfs, orfs, ignore.strand = TRUE)
is_mov_rpf <- orfov > 1
# get phase for multi orf alignments
mov_rpfs <- rpfs[is_mov_rpf]
mov_rpfs_ov <- orfov[is_mov_rpf]
mov_rpfs_ind <- findOverlaps(mov_rpfs, orfs, ignore.strand = TRUE)
mov_rpfs <- rep(mov_rpfs, mov_rpfs_ov)
mov_rpfs$orf <- names(orfs)[subjectHits(mov_rpfs_ind)]
mov_rpfs$phase <- GenomicRanges::start(mov_rpfs) - GenomicRanges::start(orfs)[subjectHits(mov_rpfs_ind)]
mov_rpfs$phase <- mov_rpfs$phase %% 3
# get offsets for our multi orf alignments
mov_rpfs$p_offset <- mov_rpfs %>%
{
as.data.frame(mcols(.)[, c("readlen", "phase")])
} %>%
left_join(
offsets_df %>% select(one_of(offsetcols)),
by = c("readlen", "phase")
) %>%
.$p_offset
#
# for the remaining ambigous ones choose randomly
uniq_mov_rpfs <- mov_rpfs %>%
resize(1, "start") %>%
shift(.$p_offset) %>%
IRanges::subsetByOverlaps(orfs) %>%
sample() %>%
{
.[match(unique(names(.)), names(.))]
} %>%
.[order(as.numeric(names(.)))] %>%
shift(-.$p_offset) %>%
sort()
# add in phase to the ambiguous ones
uniq_mov_rpfs$p_offset <- NULL
rpfs[names(uniq_mov_rpfs)] <- uniq_mov_rpfs
psites <- rpfs %>%
subset(!is.na(p_offset)) %>%
resize(1, "start") %>%
shift(., .$p_offset)
psites <- psites[!is_out_of_bounds(psites)]
# phaseshift the psites
rl_phs_ <- mcols(psites)[, c("phase", "readlen")] %>%
as.data.frame() %>%
group_by(.data$phase, .data$readlen) %>%
tally()
shiftdf <- rl_phs_ %>%
group_by(.data$readlen) %>%
mutate(shft = rank(-.data$n) - 1) %>%
as.data.frame()
phaseshifts <- merge(
mcols(psites)[, c("phase", "readlen")],
shiftdf,
all.x = TRUE
)
#
psites <- psites %>%
GenomicRanges::shift(-.$phase) %>%
GenomicRanges::shift(phaseshifts$shft)
psites
}
#' Read and filter a bam file of RPFs
#'
#' @keywords Ribostan
#' @author Dermot Harnett, \email{dermot.p.harnett@gmail.com}
#'
#' @param ribobam GRanges; A bam file with RPFs
#' @param anno An annotation object
#' @param startstop option to select only reads that overlapt he start/stop
#' for offset determination,Defaults to FALSE
#' @param strip_seqnames whether the function should remove all text after the
#' first '|', useful if aligning to gencode fastas Defaults to \code{TRUE}
#' @param offsets_df - a data frame with numeric columns readlen,phase,offset
#'
#' @return This function returns a granges object with the read names in the
#' names slot and a metadata column
#' denoting readlength.
#'
#' @details This function reads in a bam file containing ribosomal footprints.
#' It uses only reads without splice sites and reads which align to the
#' positive strand, as it's designed to work on transcriptomic alignments.
#'
#' @seealso \code{\link{get_cds_reads}}, \code{\link{get_readlens}}
#' @examples
#' data(chr22_anno)
#' testbam <- system.file("extdata", "chr22.bam", package = "Ribostan", mustWork = TRUE)
#' rpfs <- get_readgr(testbam, chr22_anno)
#' @export
get_readgr <- function(ribobam, anno, offsets_df = NULL, startstop = FALSE, strip_seqnames = TRUE) {
bamseqnames <- seqinfo(Rsamtools::BamFile(ribobam))@seqnames
if (strip_seqnames) bamseqnames <- str_replace(bamseqnames, "\\|.*", "")
orf_trs <- unlist(unique(seqnames(anno$trspacecds)))
stopifnot(
mean(orf_trs %in% bamseqnames) > .5
)
if (startstop) {
seltrs <- intersect(names(anno$trspacecds), bamseqnames)
which <- c(
anno$trspacecds[seltrs] %>% resize(1, "start"),
anno$trspacecds[seltrs] %>% resize(1, "end")
)
strand(which) <- "+"
} else {
which <- NULL
}
cov <- read_ribobam(ribobam, which)
if (!is.null(offsets_df)) {
cov <- get_psite_gr(cov, offsets_df, anno)
} else {
cov <- get_cds_reads(cov, anno)
}
cov
}
#' get_read_spmat
#'
#' get optimal ritpms using stan
#'
#' @param psites GRanges object of
#' @param anno an annotation object
#'
#' @return A matrix of the infile
get_read_spmat <- function(psites, anno) {
stopifnot(length(psites) > 0)
orfs <- c(anno$trspacecds)
#
psites <- mergeseqlevels(psites, orfs)
orfs <- mergeseqlevels(orfs, psites)
orfs <- IRanges::subsetByOverlaps(orfs, psites)
#
spmat <- Matrix::sparseMatrix(
i = names(psites) %>% id(),
j = psites$orf %>% id(),
x = 1
)
colnames(spmat) <- psites$orf %>% unique()
rownames(spmat) <- names(psites) %>% unique()
spmat <- spmat / Matrix::rowSums(spmat)
spmat
}
#' Use a sparse mapping matrix to optimize ritpms
#'
#' get optimal ritpms using stan
#'
#' @param spmat a sparse numeric matrix
#' @param anno an annotation object
#' @param iternum how many iterations to optimize TPMs for;
#' Defaults to 500
#' @param verbose whether to show optimization messages from stan;
#' Defaults to FALSE
#' @return A matrix of the infile
optimize_ritpms <- function(spmat, anno, iternum = 500, verbose = FALSE) {
trlens <- anno$trspacecds %>%
width() %>%
unlist() %>%
setNames(names(anno$trspacecds))
# now let's try the whole shebang in rstan
setdiff(colnames(spmat),names(trlens))
stopifnot(colnames(spmat)%in%names(trlens))
sptrlens <- trlens[colnames(spmat)]
fdata <- list(nonorm_trlen = sptrlens)
fdata <- c(fdata, spmat %>% rstan::extract_sparse_parts(.))
fdata$trlen <- fdata$nonorm_trlen %>%
{
. / sum(.)
}
fdata$TR <- spmat %>% ncol()
fdata$R <- spmat %>% nrow()
fdata$V <- fdata$w %>% length()
fdata$Ulen <- fdata$u %>% length()
fdata$classweights <- rep(1, fdata$R)
init <- list(ritpm = spmat %>%
{
Matrix::colSums(.)
} %>% `/`(fdata$trlen) %>%
{
. / sum(.)
})
#
modelcode <- "
data {
int TR;// number of TRs
int R;// number of reads
int V;
int Ulen;
vector [V] w ;
int v [V];
int u [Ulen];
vector [R] classweights;
}
parameters {
simplex [TR] n;
}
model {
target += log(
csr_matrix_times_vector(R, TR, w, v, u, n)
).* classweights;
}
"
eqritpm_mod <- rstan::stan_model(model_code = modelcode)
message('optimizing...')
opt <- rstan::optimizing(
eqritpm_mod,
data = fdata,
init = init,
verbose = verbose,
iter = iternum
)
opt$seqnames <- colnames(spmat)
opt$trlen <- fdata$nonorm_trlen[opt$seqnames]
opt
}
#' sample_cols_spmat
#'
#' This function takes in a sparse matrix, and then samples a single value
#' from each column, with sampling weights within each column equal to
#' the columns values
#'
#' @param spmat a sparse numeric matrix describing RPF's multimapping
#' @param return_mat whether to a matrix for selecting the elements,
#' as opposed to the summary data frame;defaults to TRUE
#' @return A matrix of the
sample_cols_spmat <- function(spmat, return_mat = TRUE) {
#
# so I think p gives us element which is the final one for each
matsum <- Matrix::summary(spmat)
cs <- cumsum(spmat@x)
colsums <- Matrix::colSums(spmat)
pts <- spmat@p
prevcs <- c(0, cs[pts %>% tail(-1)])[matsum$j]
cs <- cs - prevcs
xold <- spmat@x
spmat@x <- cs
spmatnm <- Matrix::t(Matrix::t(spmat) / colsums)
nvals <- spmat@x %>% length()
passrand <- runif(nvals) < spmatnm@x
spmat@x <- xold
spmat@x <- spmat@x * passrand
pass_summ <- Matrix::summary(spmat) %>% as.data.frame()
pass_summ <- pass_summ[pass_summ$x != 0, ]
pass_summ <- pass_summ[diff(c(0, pass_summ$j)) > 0, ]
# returnmatrix or the summary
if (return_mat) {
outmat <- Matrix::sparseMatrix(
i = pass_summ$i, j = pass_summ$j, x = pass_summ$x,
dims = dim(spmat)
)
stopifnot(dim(outmat) == dim(spmat))
outmat
} else {
pass_summ
}
}
################################################################################
#' get_ritpms
#'
#' This function takes in a sparse matrix, and then samples a single value
#' from each column, with sampling weights within each column equal to
#' the columns values
#' @param psites A GRanges object containing RPFs
#' @param anno An annotation object with a gene-transcript table
#' @return a vector of normalized footprint densities
#' @examples
#' data(chr22_anno)
#' data(rpfs)
#' data(offsets_df)
#' data(ms_df)
#'
#' psites <- get_psite_gr(rpfs, offsets_df, chr22_anno)
#' # now, use Stan to estimate normalized p-site densities for our data
#' ritpms <- get_ritpms(psites, chr22_anno)
#' @export
get_ritpms <- function(psites, anno) {
psites <- psites%>%subset(orf %in% names(anno$trspacecds))
# quantify_orfs
spmat <- get_read_spmat(psites, anno)
#
ritpm_opt <- optimize_ritpms(spmat, anno, iternum = 100)
#
if (ritpm_opt$par %>% names() %>% str_detect("^ritpm\\[\\d+\\]$") %>% any()) {
ritpmpars <- ritpm_opt$par %>%
names() %>%
str_subset("^ritpm\\[\\d+\\]$")
ritpms <- ritpm_opt$par[ritpmpars]
names(ritpms) <- ritpm_opt$seqnames
ritpms <- ritpms * 1e6
ritpms
} else {
ritpmpars <- ritpm_opt$par %>%
names() %>%
str_subset("^n\\[\\d+\\]$")
ritpms <- ritpm_opt$par[ritpmpars]
names(ritpms) <- ritpm_opt$seqnames
ritpms <- ritpms / ritpm_opt$trlen
ritpms <- ritpms / sum(ritpms)
ritpms <- ritpms * 1e6
ritpms
}
}
#' sample_cov_gr
#'
#' This function takes in a coverage GR and takes one out of each multimap
#' weighting according to the ritpms
#'
#' @param psites A GRanges object containing psites
#' @param anno An annotation object with a gene-transcript table
#' @param ritpms a vector of ribosoome densities
#' @return a vector of normalized footprint densities
#' @export
sample_cov_gr <- function(psites, anno, ritpms) {
spmat <- get_read_spmat(psites, anno)
#
spmat <- Matrix::t(spmat) * ritpms[colnames(spmat)]
matsample <- sample_cols_spmat(spmat,return_mat = FALSE)
#
iddf <- tibble(
rind = seq_along(psites),
j = names(psites) %>% id(),
i = as.numeric(id(psites$orf))
)
#
rinds <- iddf %>%
inner_join(matsample, by = c("j", "i")) %>%
.$rind
#
sampcov <- psites[rinds]
#
sampcov
}
#' Aggregate transcript level RiboPMs to the gene-level
#' Ignores uORF expression
#'
#' @keywords Ribostan
#' @author Dermot Harnett, \email{dermot.p.harnett@gmail.com}
#'
#' @param ripms - named vector of ribosome densities
#' @param anno An annotation object with a gene-transcript table
#' @return a data frame with columns gene_id, expr
#' @details This function reads in a vector of transcript-level
#' ribosome footprint densitiies and aggregates values from
#' transcripts of the same gene and outputs gene-level values
#' @examples
#' data(chr22_anno)
#' data(rpfs)
#' data(offsets_df)
#' data(ms_df)
#'
#' psites <- get_psite_gr(rpfs, offsets_df, chr22_anno)
#' # now, use Stan to estimate normalized p-site densities for our data
#' ritpms <- get_ritpms(psites, chr22_anno)
#' # and get these at the gene level (ignoring uORFs)
#' gritpms <- gene_level_expr(ritpms, chr22_anno)
#' @export
gene_level_expr <- function(ripms, anno) {
trgiddf <- anno$trgiddf
if('uORF'%in%colnames(trgiddf)){
trgiddf <- trgiddf %>% subset(!uORF)
}
#
gn_expr <- left_join(
trgiddf,
tibble::enframe(ripms, "orf_id", "ritpm"),
by = "orf_id"
)
#
gn_expr <- gn_expr %>%
group_by(.data$gene_id) %>%
summarise(expr = sum(replace_na(.data$ritpm, 0)))
gn_expr %>% select('gene_id', 'expr')
}
#' Given a bam file, and transcript fasta, output a file of ribosome densities
#'
#' @keywords Ribostan
#' @author Dermot Harnett, \email{dermot.p.harnett@gmail.com}
#'
#' @param ribobam A bam file with RPFs
#' @param ribofasta A gencode style fasta to which RPFs were aligned
#' @param outfile the file to which the RPF densitites should be saved
#' @return the file to which the table was saved
#'
#' @details The Ribosome densities are saved in salmon format
#' @export
#' @examples
#' get_exprfile(ribobam, ribofasta, outfile)
get_exprfile <- function(ribobam, ribofasta, outfile) {
#
anno <- get_ribofasta_anno(ribofasta)
#
rpfs <- get_readgr(ribobam, anno)
# determine offsets by maximum CDS occupancy
offsets_df <- get_offsets(rpfs, anno)
# use our offsets to determine p-site locations
psites <- get_psite_gr(rpfs, offsets_df, anno)
# now, use Stan to estimate normalized p-site densities for our data
ritpms <- get_ritpms(psites, anno)
#
n_reads <- n_distinct(names(rpfs))
lengths <- width(anno$trspacecds[names(ritpms)])
nucfracs <- (ritpms * lengths)
nucfracs <- nucfracs / sum(nucfracs)
counts <- n_reads * nucfracs
counts <- tibble::enframe(counts, "Name", "NumReads")
ritpmdf <- tibble::enframe(ritpms, "Name", "ritpm")
#
cdslens <- anno$trspacecds %>%
width() %>%
setNames(names(anno$trspacecds)) %>%
tibble::enframe("Name", "Length") %>%
mutate(EffectiveLength = .data$Length)
output <- cdslens %>%
left_join(ritpmdf) %>%
left_join(counts)
#
output <- output %>%
select('Name', 'Length', 'EffectiveLength', 'ritpm', 'NumReads')
output %>% write_tsv(outfile)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.