##' Read a DBN file.
##'
##' Read a DBN file. See 'Details'.
##'
##' The function reads in a DBN (dot-bracket) structure file, and
##' returns a \code{dataframe} with the following data columns:
##' \enumerate{
##' \item Column 1: Sequence ID
##' \item Column 2: Length of the sequence (in nt)
##' \item Column 3: Mean free energy (MFE)
##' }
##'
##' @param file A character string; specifies the input DBN file.
##'
##' @return A \code{dataframe} object. See 'Details'.
##'
##' @author Maurits Evers, \email{maurits.evers@@anu.edu.au}
#ReadDBN <- function(file) {
# if (!file.exists(file)) {
# ss <- sprintf("Could not open %s.", file)
# stop(ss)
# }
# d <- as.data.frame(matrix(readLines(file), ncol = 3, byrow = TRUE),
# stringsAsFactors = FALSE)
# d[, 1] <- gsub("^>", "", d[, 1])
# d[, 2] <- nchar(d[, 2])
# d[, 3] <- gsub("^[\\(\\.\\)]+\\s", "", d[, 3])
# d[ ,3] <- as.numeric(gsub("[\\(\\)]", "", d[, 3]))
# colnames(d) <- c("id", "siteSeqLength", "siteMFE")
# return(d)
#}
##' Fold sequences.
##'
##' Fold sequences. See 'Details'.
##'
##' The function takes a \code{dataframe}, extracts sequences from a
##' column specified by \code{colSeq}, and predicts secondary structures
##' using RNAfold \url{http://rna.tbi.univie.ac.at/}.
##' An optional column containing sequence IDs may be specified by
##' \code{colId}.
##' The function returns a \code{dataframe} with three columns:
##' \enumerate{
##' \item Column 1: Sequence ID
##' \item Column 2: Length of the sequence (in nt)
##' \item Column 3: Mean free energy (MFE)
##' }
##'
##' @param data A \code{dataframe} object. See 'Details'.
##' @param colSeq An integer scalar; specifies the column in
##' \code{data} containing the sequences.
##' @param colId An integer scalar; specifies the column in
##' \code{data} containing sequence IDs; if \code{NULL} IDs
##' are generated automatically; default is \code{NULL}.
##'
##' @return A \code{dataframe} object. See 'Details'.
##'
##' @author Maurits Evers, \email{maurits.evers@@anu.edu.au}
##'
##' @import Biostrings GenomicRanges IRanges
#GetMFE <- function(data, colSeq, colId = NULL) {
# sq <- DNAStringSet(data[, colSeq])
# if (is.null(colId) || length(data[, colId]) != length(sq)) {
# id <- sprintf("seq%i", seq(1, length(sq)))
# } else {
# id <- data[, colId]
# }
# names(sq) <- id
# writeXStringSet(sq, filepath = "tmp.fa", format = "fasta")
# cmd <- sprintf("RNAfold --noPS < tmp.fa > tmp.dbn")
# system(sprintf(cmd))
# str <- ReadDBN("tmp.dbn")
# file.remove(c("tmp.fa", "tmp.dbn"))
# return(str)
#}
#
## #' Plot relative distance distribution of loci from \code{txLoc}
## #' object to splice sites.
## #'
## #' Plot relative distance distribution of loci from \code{txLoc}
## #' object to splice sites.
## #'
## #' @param locus A \code{txLoc} object.
## #' @param ss A list of two \code{GRangesList} objects.
## #' @param flank An integer scalar; specifies the absolute maximum
## #' relative distance used as a cutoff; default is 1000.
## #' @param binWidth An integer scalar; specifies the spatial width
## #' by which distances will be binned; default is 20.
## #'
## #' @export
## PlotRelDistSSDistribution <- function(locus, ss, flank = 1000, binWidth = 20) {
## dist <- GetRelDistSS(locus, ss, flank)
## breaks <- seq(-flank, flank, by = binWidth)
## bwString <- sprintf("bw = %i nt", binWidth)
## h0 <- hist(dist, breaks = breaks, plot = FALSE)
## plot(h0$mids, h0$counts,
## type = "s",
## lwd = 2,
## xlab = "Relative distance from 1st CDS splice site [nt]",
## ylab = "Abundance",
## xlim = c(-1000, 1000),
## font.main = 1)
## CIFromBS <- EstimateCIFromBS(dist,
## breaks = breaks,
## nBS = 5000)
## x1 <- c(CIFromBS$x[1],
## rep(CIFromBS$x[-1], each = 2),
## CIFromBS$x[length(CIFromBS$x)])
## CI <- cbind(c(x1,
## rev(x1)),
## c(rep(CIFromBS$y.low, each = 2),
## rev(rep(CIFromBS$y.high, each = 2))))
## polygon(CI[, 1], CI[, 2], col = rgb(1, 0, 0, 0.2),
## lwd = 1, border = NA, lty = 1)
## # Loess smoothing of boostrap CI
## lines(lowess(CIFromBS$x, CIFromBS$y.low, f = 1/5),
## col = "red", lty = 2, lwd = 1)
## lines(lowess(CIFromBS$x, CIFromBS$y.high, f = 1/5),
## col = "red", lty = 2, lwd = 1)
## legend("topleft",
## c(sprintf("Abundance (%s)", bwString),
## "95%CI (empirical bootstrap)",
## "Lowess-smoothed 95%CI"),
## lwd = c(2, 5, 1),
## col = c("black", rgb(1, 0, 0, 0.2), "red"),
## lty = c(1, 1, 2),
## bty = "n")
## }
##
## #' Plot relative distance enrichment of sites from \code{locPos}
## #' and \code{locPos} to splice sites.
## #'
## #' Plot relative distance enrichment of sites from \code{locPos}
## #' and \code{locPos} to splice sites.
## #'
## #' @param locPos A \code{txLoc} object; specifies the positive
## #' sites.
## #' @param locNeg A \code{txLoc} object; specifies the negative
## #' sites.
## #' @param ss A list of two \code{GRangesList} objects.
## #' @param flank An integer scalar; specifies the absolute maximum
## #' relative distance used as a cutoff; default is 1000.
## #' @param binWidth An integer scalar; specifies the spatial width
## #' by which distances will be binned; default is 20.
## #'
## #' @export
## PlotRelDistSSEnrichment <- function(locPos, locNeg, ss, flank = 1000, binWidth = 20) {
## idPos <- GetId(locPos)
## idNeg <- GetId(locNeg)
## distPos <- GetRelDistSS(locPos, ss, flank)
## distNeg <- GetRelDistSS(locNeg, ss, flank)
## breaks <- seq(-flank, flank, by = binWidth)
## ctsPos <- table(cut(distPos, breaks = breaks))
## ctsNeg <- table(cut(distNeg, breaks = breaks))
## ctsMat <- as.matrix(rbind(ctsPos, ctsNeg))
## rownames(ctsMat) <- c("pos", "neg")
## title <- sprintf("N(%s) = %i, N(%s) = %i\n(bw = %i nt)",
## idPos, sum(ctsPos),
## idNeg, sum(ctsNeg),
## binWidth)
## tmp <- PlotEnrichment.Generic(ctsMat,
## title = title,
## x.las = 2, x.cex = 0.8, x.padj = 0.8)
## }
## #' Plot distribution of relative distances to exon-exon junctions.
## #'
## #' Plot distribution of relative distances to exon-exon junctions.
## #'
## #' @param dist A list of integer vectors.
## #' @param flank An integer scalar; specifies the absolute maximum
## #' relative distance used as a cutoff; default is 1000.
## #' @param binWidth An integer scalar; specifies the spatial width
## #' by which distances will be binned; default is 20.
## #' @param doBootstrap A logical scalar; if \code{YES} calculate
## #' 95% CI based on empirical bootstrap of sites within transcript
## #' region; default is \code{TRUE}.
## #'
## #' @export
## PlotRelDistEEJDistribution <- function(dist,
## flank = 1000,
## binWidth = 20,
## doBootstrap = TRUE) {
## if (length(dist) < 4) {
## par(mfrow = c(1, length(dist)))
## } else {
## par(mfrow = c(ceiling(length(dist) / 2), 2))
## }
## breaks <- seq(-flank, flank, by = binWidth)
## bwString <- sprintf("bw = %3.2f", binWidth)
## for (i in 1:length(dist)) {
## if (flank > 0) {
## dist[[i]] <- dist[[i]][abs(dist[[i]]) <= flank]
## }
## title <- sprintf("%s (N=%i)",
## names(dist)[i],
## length(dist[[i]]))
## xlab <- "Relative distance to exon-exon junction [nt]"
## PlotAbundance.generic(dist[[i]],
## xmin = -flank, xmax = flank,
## binWidth = binWidth,
## title = title,
## xlab = xlab,
## doBootstrap = doBootstrap)
## }
## }
## #' Perform and plot enrichment analysis of relative distances of
## #' "positive" and "negative" control sites to exon-exon junctions.
## #'
## #' Perform and plot enrichment analysis of relative distances of
## #' positive and negative control sites to exon-exon junctions.
## #'
## #' @param distPos A list of integer vectors; specifies distances
## #' of "positive" sites to exon-exon junctions.
## #' @param distNeg A list of integer vectors; specifies distances
## #' of "negative" sites to exon-exon junctions.
## #' @param flank An integer scalar; specifies the absolute maximum
## #' relative distance used as a cutoff; default is 1000.
## #' @param binWidth An integer scalar; specifies the spatial width
## #' by which distances will be binned; default is 20.
## #'
## #' @export
## PlotRelDistEEJEnrichment <- function(distPos,
## distNeg,
## flank = 1000,
## binWidth = 20) {
## if (length(distPos) < 4) {
## par(mfrow = c(1, length(distPos)))
## } else {
## par(mfrow = c(ceiling(length(distPos) / 2), 2))
## }
## breaks <- seq(-flank, flank, by = binWidth)
## bwString <- sprintf("bw = %3.2f", binWidth)
## for (i in 1:length(distPos)) {
## if (flank > 0) {
## distPos[[i]] <- distPos[[i]][abs(distPos[[i]]) <= flank]
## distNeg[[i]] <- distNeg[[i]][abs(distNeg[[i]]) <= flank]
## }
## ctsPos <- table(cut(distPos[[i]], breaks = breaks))
## ctsNeg <- table(cut(distNeg[[i]], breaks = breaks))
## ctsMat <- as.matrix(rbind(ctsPos, ctsNeg))
## rownames(ctsMat) <- c("pos", "neg")
## title <- sprintf("N(%s) = %i, N(%s) = %i\n(bw = %i nt)",
## "pos", sum(ctsPos),
## "neg", sum(ctsNeg),
## binWidth)
## tmp <- PlotEnrichment.Generic(ctsMat,
## title = title,
## x.las = 2, x.cex = 0.8, x.padj = 0.8)
## }
## }
## #' Calculate relative distances between loci from \code{txLoc}
## #' object to splice sites.
## #'
## #' Calculate relative distances between loci from \code{txLoc}
## #' and splice sites. See 'Details'.
## #' This is a low-level function that is being called from
## #' \code{PlotRelDistSSDistribution} and
## #' \code{PlotRelDistSSEnrichment}.
## #'
## #' The function calculates relative distances between 5' splice
## #' sites from \code{ss} and loci from \code{locus}. Distances
## #' are taken to be negative (positive) if a locus from
## #' \code{txLoc} is upstream (downstream) of a splice site.
## #' Distances flank <= d <= flank are considered. The fun
## #'
## #' @param locus A \code{txLoc} object.
## #' @param ss A list of two \code{GRangesList} objects.
## #' @param flank An integer scalar; specifies the absolute maximum
## #' relative distance used as a cutoff; default is 1000.
## #'
## #' @return An integer vector.
## #'
## #' @keywords internal
## #'
## #' @export
## GetRelDistSS <- function(locus,
## ss,
## flank = 1000) {
## CheckClass(locus, "txLoc")
## CheckClass(ss, "list", "GRangesList")
## grLoc <- unlist(
## TxLoc2GRangesList(locus,
## filter = c("5'UTR", "CDS", "3'UTR")))
## # Select 5' splice sites in CDS
## grSS <- unlist(ss[[grep("5p", names(ss))]])
## grSS <- grSS[grep("(CDS|coding)",
## grSS$section,
## ignore.case = TRUE)]
## # Filter genes with ss _and_ modification site
## genes <- intersect(grSS$gene, grLoc$gene)
## grSS <- grSS[which(grSS$gene %in% genes)]
## grLoc <- grLoc[which(grLoc$gene %in% genes)]
## # Select first 5' splice site upstream of start codon in CDS
## grSSPos <- grSS[which(strand(grSS) == "+")]
## grSSNeg <- grSS[which(strand(grSS) == "-")]
## dupIDPos <- duplicated(grSSPos$gene,
## fromLast = FALSE)
## dupIDNeg <- duplicated(grSSNeg$gene,
## fromLast = TRUE)
## grSSPos <- grSSPos[!dupIDPos]
## grSSNeg <- grSSNeg[!dupIDNeg]
## grSS <- sort(append(grSSPos, grSSNeg))
## # rtracklayer::export(grSS, "ss5p_firstCDS.bed")
## # Get distances
## d <- as.data.frame(distanceToNearest(grLoc, grSS))
## idx <- d$subjectHits
## dist <- ifelse(end(grLoc) > start(grSS[idx]), d$distance, -d$distance)
## dist <- ifelse(strand(grLoc) == "+", dist, -dist)
## dist <- dist[abs(dist) <= flank]
## return(dist)
## }
## #' Calculate mutual distances between entries for every transcript
## #' region from two txLoc objects.
## #'
## #' Calculate mutual distances between entries for every transcript
## #' region from two txLoc objects.
## #'
## #' The function calculates relative distances between loci from
## #' \code{loc1} and \code{loc2} within the same transcript section.
## #' Positive distances refer to loc1 > loc2, negative distances to
## #' loc1 < loc2. By default only the smallest distance for every
## #' locus from \code{loc1} is kept (\code{method = "nearest"}).
## #' By default relative distances correspond to distances between
## #' start coordinates (\code{refPoint = "ss"}).
## #'
## #' @param loc1 A \code{txLoc} object.
## #' @param loc2 A \code{txLoc} object.
## #' @param filter A character vector; only consider transcript sections
## #' specified in \code{filter}; if \code{NULL} consider all sections.
## #' @param method A character strings; specifies the distance
## #' calculation method; possible arguments are "all", "nearest";
## #' default is "nearest".
## #' @param refPoint A character string; specifies the reference point
## #' for calculating relative distances; possible arguments are "ss"
## #' (start-start), "mm" (midpoint-midpoint), "se" (start-end), "es"
## #' (end-start), "ee" (end-end); default is "ss".
## #'
## #' @return List of upper and lower 95% confidence interval
## #' bounds for every bin value.
## #'
## #' @export
## GetRelDist <- function(loc1,
## loc2,
## filter = NULL,
## method = c("nearest", "all"),
## refPoint = c("ss", "mm", "se", "es", "ee")) {
## CheckClass(loc1, "txLoc")
## CheckClass(loc2, "txLoc")
## CheckClassTxLocRef(loc1, loc2)
## refPoint <- match.arg(refPoint)
## method <- match.arg(method)
## id1 <- GetId(loc1)
## id2 <- GetId(loc2)
## refGenome <- GetRef(loc1)
## sec <- intersect(names(GetLoci(loc1)), names(GetLoci(loc2)))
## loc1 <- FilterTxLoc(loc1, sec)
## loc2 <- FilterTxLoc(loc2, sec)
## loc1 <- GetLoci(loc1)
## loc2 <- GetLoci(loc2)
## dist.list <- list()
## for (i in 1:length(loc1)) {
## txID <- intersect(loc1[[i]]$REFSEQ, loc2[[i]]$REFSEQ)
## if (length(txID) == 0) {
## dist.list[[length(dist.list)+1]] <- 0
## } else {
## dist <- vector()
## for (j in 1:length(txID)) {
## loc1.sel <- loc1[[i]][which(loc1[[i]]$REFSEQ == txID[j]), ]
## loc2.sel <- loc2[[i]][which(loc2[[i]]$REFSEQ == txID[j]), ]
## distMat <- switch(refPoint,
## "ss" = outer(loc1.sel$TXSTART,
## loc2.sel$TXSTART,
## "-"),
## "mm" = outer((loc1.sel$TXSTART + loc1.sel$TXEND) / 2,
## (loc2.sel$TXSTART + loc2.sel$TXEND) / 2,
## "-"),
## "se" = outer(loc1.sel$TXSTART,
## loc2.sel$TXEND,
## "-"),
## "es" = outer(loc1.sel$TXEND,
## loc2.sel$TXSTART,
## "-"),
## "ee" = outer(loc1.sel$TXEND,
## loc2.sel$TXEND,
## "-"))
## if (method == "nearest") {
## dist <- c(dist, apply(distMat, 1, function(x) x[which.min(abs(x))]))
## } else {
## dist <- c(dist, as.vector(distMat))
## }
## }
## dist.list[[length(dist.list) + 1]] <- dist
## }
## }
## names(dist.list) <- names(loc1)
## return(dist.list)
## }
## #' Get 5'/3' splice sites from transcriptome.
## #'
## #' Get 5'/3' splice sites from transcriptome. See 'Details'.
## #'
## #' This function extracts splice site positions from a reference
## #' transcriptome, and returns a list of two \code{GRangesList}
## #' objects of all 5' and 3' splice sites per gene.
## #'
## #' @param refGenome A character string; specifies a specific
## #' reference genome assembly version based on which the matching
## #' transcriptome is loaded; default is \code{"hg38"}.
## #' @param writeBED A logical scalar; if \code{TRUE} write 5'/3'
## #' splice sites to two BED files; default is \code{FALSE}.
## #'
## #' @import GenomicRanges IRanges
## #' @importFrom rtracklayer export
## #'
## #' @return A list of two \code{GRangesList} objects. See 'Details'.
## #'
## #' @export
## GetSpliceSites <- function(refGenome = "hg38", writeBED = FALSE) {
## refTx <- sprintf("tx_%s.RData", refGenome)
## if (!file.exists(refTx)) {
## ss <- sprintf("Reference transcriptome for %s not found.", refGenome)
## ss <- sprintf("%s\nRunning BuildTx(\"%s\") might fix that.",
## ss, refGenome)
## stop(ss)
## }
## load(refTx)
## requiredObj <- c("geneXID", "seqBySec", "txBySec")
## if (!all(requiredObj %in% ls())) {
## ss <- sprintf("Mandatory transcript objects not found.")
## ss <- sprintf("%s\nNeed all of the following: %s",
## ss, paste0(requiredObj, collapse = ", "))
## ss <- sprintf("%s\nRunning BuildTx(\"%s\") might fix that.",
## ss, refGenome)
## stop(ss)
## }
## geneXID <- get("geneXID")
## seqBySec <- get("seqBySec")
## txBySec <- get("txBySec")
## secWithSpliceSites <- c("5'UTR", "CDS", "3'UTR")
## sel <- which(names(txBySec) %in% secWithSpliceSites)
## ss5p.all <- GRanges()
## ss3p.all <- GRanges()
## for (i in 1:length(sel)) {
## junct <- psetdiff(range(txBySec[[sel[i]]]), txBySec[[sel[i]]])
## junct <- unlist(junct[elementLengths(junct) > 0])
## ss5p <- GRanges(
## seqnames(junct),
## IRanges(ifelse(strand(junct) == "+",
## start(junct),
## end(junct)),
## ifelse(strand(junct) == "+",
## start(junct),
## end(junct))),
## strand(junct),
## type = "ss5p",
## gene = names(junct),
## section = names(txBySec)[sel[i]])
## ss3p <- GRanges(
## seqnames(junct),
## IRanges(ifelse(strand(junct) == "+",
## end(junct),
## start(junct)),
## ifelse(strand(junct) == "+",
## end(junct),
## start(junct))),
## strand(junct),
## type = "ss3p",
## gene = names(junct),
## section = names(txBySec)[sel[i]])
## ss5p.all <- append(ss5p.all, ss5p)
## ss3p.all <- append(ss3p.all, ss3p)
## }
## ss5p.all <- sort(ss5p.all)
## ss3p.all <- sort(ss3p.all)
## ss5p.all$name <- paste(ss5p.all$type, ss5p.all$gene, ss5p.all$section, sep = "|")
## ss3p.all$name <- paste(ss3p.all$type, ss3p.all$gene, ss3p.all$section, sep = "|")
## if (writeBED) {
## rtracklayer::export(ss5p.all, "ss5p.bed")
## rtracklayer::export(ss3p.all, "ss3p.bed")
## }
## ret <- list(GenomicRanges::split(ss5p.all, ss5p.all$gene),
## GenomicRanges::split(ss3p.all, ss3p.all$gene))
## names(ret) <- c("ss5p", "ss3p")
## return(ret)
## }
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.