#' @title Retrieve sequences flanking back-spliced junctions
#'
#' @description The function getSeqsFromGRs() includes 3 modules to retrieve
#' 3 types of sequences. Sequences of the introns flanking back-spliced
#' junctions, sequences from a defined genomic window surrounding the
#' back-spliced junctions and sequences of the back-spliced exons.
#'
#' @param annotatedBSJs A data frame with the annotated back-spliced junctions.
#' This data frame can be generated with \code{\link{annotateBSJs}}.
#'
#' @param genome A BSgenome object containing the genome sequences.
#' It can be generated with in \code{\link{getBSgenome}}.
#' See \code{\link[BSgenome]{available.genomes}} to see the BSgenome package
#' currently available
#'
#' @param lIntron An integer indicating how many nucleotides are taken from
#' the introns flanking the back-spliced junctions. This number must be
#' positive. Default value is 100.
#'
#' @param lExon An integer indicating how many nucleotides are taken from
#' the back-spliced exons starting from the back-spliced junctions. This number
#' must be positive. Default value is 10.
#'
#' @param type A character string specifying the sequences to retrieve.
#' If type = "ie" the sequences are retrieved from the the genomic ranges
#' defined by using the lIntron and lExon given in input. If type = "bse"
#' the sequences of the back-spliced exons are retrieved. If type = "fi"
#' the sequences of the introns flanking the back-spliced exons are retrieved.
#' Default value is "ie".
#'
#' @return A list.
#'
#' @examples
#' # Load data frame containing predicted back-spliced junctions
#' data("mergedBSJunctions")
#'
#' # Load short version of the gencode v19 annotation file
#' data("gtf")
#'
#' # Annotate the first back-spliced junctions
#' annotatedBSJs <- annotateBSJs(mergedBSJunctions[1, ], gtf)
#'
#' # Get genome
#' if (requireNamespace("BSgenome.Hsapiens.UCSC.hg19", quietly = TRUE)){
#' genome <- BSgenome::getBSgenome("BSgenome.Hsapiens.UCSC.hg19")
#'
#' # Retrieve target sequences
#' targets <- getSeqsFromGRs(
#' annotatedBSJs,
#' genome,
#' lIntron = 200,
#' lExon = 10,
#' type = "ie"
#' )
#' }
#'
#' @importFrom Biostrings reverseComplement
#' @importFrom BSgenome getSeq
#' @import BSgenome.Hsapiens.UCSC.hg19
#' @export
getSeqsFromGRs <-
function(annotatedBSJs,
genome,
lIntron = 100,
lExon = 10,
type = "ie") {
# only 3 options are possible for the argument type
match.arg(type, c("ie", "bse", "fi"))
# Retrieve the coordinates from whcih to extract the sequences.
grCoords <- switch(
type,
bse = .getBSEcoords(annotatedBSJs),
fi = .getFIcoords(annotatedBSJs),
ie = .getIECoords(annotatedBSJs, lIntron, lExon)
)
# Create a list with 2 elements to store the sequences extracted from
# the upstream and downstream genomic ranges
targets <- vector("list", 2)
names(targets)[1] <- "upGR"
names(targets)[2] <- "downGR"
for (i in seq_along(targets)) {
# Create an empty list of data frames
targets[[i]] <- .getTargetsDF(annotatedBSJs)
if (i == 1) {
indexStart <- which(colnames(grCoords) == "startUpGR")
indexEnd <- which(colnames(grCoords) == "endUpGR")
} else{
indexStart <- which(colnames(grCoords) == "startDownGR")
indexEnd <- which(colnames(grCoords) == "endDownGR")
}
# Fill the data frame with the extracted genomic range coordinates
targets[[i]]$id <- grCoords$id
targets[[i]]$gene <- grCoords$gene
targets[[i]]$transcript <- grCoords$transcript
targets[[i]]$strand <- grCoords$strand
targets[[i]]$chrom <- grCoords$chrom
targets[[i]]$startGR <- grCoords[, indexStart]
targets[[i]]$endGR <- grCoords[, indexEnd]
targets[[i]]$type <- rep(type, nrow(targets[[i]]))
for (j in seq_along(targets[[1]]$id)) {
chrom <- targets[[i]]$chrom[j]
startGR <- targets[[i]]$startGR[j]
endGR <- targets[[i]]$endGR[j]
strand <- targets[[i]]$strand[j]
seq <-
.getSeqFromBSgenome(genome, chrom, startGR, endGR)
# Check coordinates and retrieve the seq
if (strand == "-" &
!is.na(startGR) & !is.na(endGR)) {
# For a circRNA that arises from a gene transcribed from
# negative strand we take the reverse complement
targets[[i]]$seq[j] <-
gsub("T",
"U",
as.character(Biostrings::reverseComplement(seq)))
targets[[i]]$length[j] <-
nchar(targets[[i]]$seq[j])
} else if (strand == "+" &
!is.na(startGR) & !is.na(endGR)) {
# For the positive strand no modification is needed because
# the sense strand corresponds to the positive strand that
# is the genome reference.
targets[[i]]$seq[j] <-
gsub("T", "U", as.character(seq))
targets[[i]]$length[j] <-
nchar(targets[[i]]$seq[j])
}
}
}
return(targets)
}
# The function getIECoords() retrieves the coordinates of the
# genomic ranges defined by the values of the arguments lIntron and lExon.
# The start point are the back-spliced junction coordinates reported in the
# annotateBSJs data frame.
.getIECoords <- function(annotatedBSJs,
lIntron = 100,
lExon = 10) {
# lIntron and lExon must be positive numbers
if (lIntron < 0) {
stop("lIntron must be a positive number")
}
if (lExon < 0) {
stop("lExon must be a positive number")
}
# Create an empty dataframe
grCoords <- .getGRcoordsDF(annotatedBSJs)
for (i in seq_along(annotatedBSJs$id)) {
grCoords$id[i] <- annotatedBSJs$id[i]
grCoords$gene[i] <- annotatedBSJs$gene[i]
grCoords$transcript[i] <- annotatedBSJs$transcript[i]
grCoords$strand[i] <- annotatedBSJs$strand[i]
grCoords$chrom[i] <- annotatedBSJs$chrom[i]
if (!is.na(annotatedBSJs$transcript[i]) &
!is.na(annotatedBSJs$strand[i])) {
annotatedBSJsToAnalyze <- annotatedBSJs[i,]
# Get upstream and dowstream genomic ranges coordinates
# for POSITIVE STRAND
if (annotatedBSJsToAnalyze$strand == "+") {
grCoordsToAnalyze <-
.grCoordsForPositive(annotatedBSJsToAnalyze, lIntron, lExon)
grCoords$startUpGR[i] <- grCoordsToAnalyze$startUpGR
grCoords$endUpGR[i] <- grCoordsToAnalyze$endUpGR
grCoords$startDownGR[i] <-
grCoordsToAnalyze$startDownGR
grCoords$endDownGR[i] <- grCoordsToAnalyze$endDownGR
}
# Get upstream and dowstream genomic ranges coordinates
# for NEGATIVE STRAND
if (annotatedBSJsToAnalyze$strand == "-") {
grCoordsToAnalyze <-
.grCoordsForNegative(annotatedBSJsToAnalyze, lIntron, lExon)
grCoords$startUpGR[i] <- grCoordsToAnalyze$startUpGR
grCoords$endUpGR[i] <- grCoordsToAnalyze$endUpGR
grCoords$startDownGR[i] <-
grCoordsToAnalyze$startDownGR
grCoords$endDownGR[i] <- grCoordsToAnalyze$endDownGR
}
}
}
return (grCoords)
}
# Get upstream and dowstream genomic ranges coordinates for positive strand
.grCoordsForPositive <-
function(annotatedBSJsToAnalyze,
lIntron = 100,
lExon = 10) {
# Create an empty dataframe
grCoords <- .getGRcoordsDF(annotatedBSJsToAnalyze)
# When the BSEs are NOT the FIRST and LAST of the transcript
if (!is.na(annotatedBSJsToAnalyze$endUpIntron) &
!is.na(annotatedBSJsToAnalyze$startDownIntron)) {
# For the upstream genomic range
grCoords$startUpGR <-
(annotatedBSJsToAnalyze$startUpBSE) - lIntron
grCoords$endUpGR <-
annotatedBSJsToAnalyze$startUpBSE + lExon
# For the downstream genomic range
grCoords$startDownGR <-
annotatedBSJsToAnalyze$endDownBSE - lExon
grCoords$endDownGR <-
(annotatedBSJsToAnalyze$endDownBSE) + lIntron
# When the upstream back-spliced exon IS the FIRST of the
# transcript
} else if (is.na(annotatedBSJsToAnalyze$endUpIntron) &
!is.na(annotatedBSJsToAnalyze$startDownIntron)) {
# For the upstream genomic range
grCoords$startUpGR <-
(annotatedBSJsToAnalyze$startUpBSE)
grCoords$endUpGR <-
annotatedBSJsToAnalyze$startUpBSE + lExon
# For the downstream genomic range
grCoords$startDownGR <-
annotatedBSJsToAnalyze$endDownBSE - lExon
grCoords$endDownGR <-
(annotatedBSJsToAnalyze$endDownBSE) + lIntron
# When the downstream back-spliced exon IS the LAST of the
# transcript
} else if (!is.na(annotatedBSJsToAnalyze$endUpIntron) &
is.na(annotatedBSJsToAnalyze$startDownIntron)) {
# For the upstream genomic range
grCoords$startUpGR <-
(annotatedBSJsToAnalyze$startUpBSE) - lIntron
grCoords$endUpGR <-
annotatedBSJsToAnalyze$startUpBSE + lExon
# For the downstream genomic range
grCoords$startDownGR <-
annotatedBSJsToAnalyze$endDownBSE - lExon
grCoords$endDownGR <-
(annotatedBSJsToAnalyze$endDownBSE)
# When the back-spliced exons ARE the FIRST and LAST of the
# transcript
} else if (is.na(annotatedBSJsToAnalyze$endUpIntron) &
is.na(annotatedBSJsToAnalyze$startDownIntron)) {
# For the upstream genomic range
grCoords$startUpGR <-
(annotatedBSJsToAnalyze$startUpBSE)
grCoords$endUpGR <-
annotatedBSJsToAnalyze$startUpBSE + lExon
# For the downstream genomic range
grCoords$startDownGR <-
annotatedBSJsToAnalyze$endDownBSE - lExon
grCoords$endDownGR <-
(annotatedBSJsToAnalyze$endDownBSE)
}
grCoords <- grCoords %>%
dplyr::select(startUpGR,
endUpGR,
startDownGR,
endDownGR)
return(grCoords)
}
# Get startUpGR and endUpGR for negative strand
.grCoordsForNegative <-
function(annotatedBSJsToAnalyze,
lIntron = 100,
lExon = 10) {
# Create an empty dataframe
grCoords <- .getGRcoordsDF(annotatedBSJsToAnalyze)
# When the back-spliced exons are NOT the FIRST and LAST of the
# transcript
if (!is.na(annotatedBSJsToAnalyze$endUpIntron) &
!is.na(annotatedBSJsToAnalyze$startDownIntron)) {
# For the upstream genomic range
grCoords$startUpGR <-
annotatedBSJsToAnalyze$startUpBSE - lExon
grCoords$endUpGR <-
annotatedBSJsToAnalyze$startUpBSE + lIntron
# For the downstream genomic range
grCoords$startDownGR <-
annotatedBSJsToAnalyze$endDownBSE - lIntron
grCoords$endDownGR <-
annotatedBSJsToAnalyze$endDownBSE + lExon
# When the upstream back-spliced exon IS the FIRST of the
# transcript
} else if (is.na(annotatedBSJsToAnalyze$endUpIntron) &
!is.na(annotatedBSJsToAnalyze$startDownIntron)) {
# For the upstream genomic range
grCoords$startUpGR <-
annotatedBSJsToAnalyze$startUpBSE - lExon
grCoords$endUpGR <- annotatedBSJsToAnalyze$startUpBSE
# For the downstream genomic range
grCoords$startDownGR <-
annotatedBSJsToAnalyze$endDownBSE - lIntron
grCoords$endDownGR <-
annotatedBSJsToAnalyze$endDownBSE + lExon
# When the downstream the back-spliced exon IS the LAST of
# the transcript
} else if (!is.na(annotatedBSJsToAnalyze$endUpIntron) &
is.na(annotatedBSJsToAnalyze$startDownIntron)) {
# For the upstream genomic range
grCoords$startUpGR <-
annotatedBSJsToAnalyze$startUpBSE - lExon
grCoords$endUpGR <-
annotatedBSJsToAnalyze$startUpBSE + lIntron
# For the downstream genomic range
grCoords$startDownGR <-
annotatedBSJsToAnalyze$endDownBSE
grCoords$endDownGR <-
annotatedBSJsToAnalyze$endDownBSE + lExon
# When the back-spliced exons ARE the FIRST and LAST of the
# transcript
} else if (is.na(annotatedBSJsToAnalyze$endUpIntron) &
is.na(annotatedBSJsToAnalyze$startDownIntron)) {
# For the upstream genomic range
grCoords$startUpGR <-
annotatedBSJsToAnalyze$startUpBSE - lExon
grCoords$endUpGR <- annotatedBSJsToAnalyze$startUpBSE
# For the downstream genomic range
grCoords$startDownGR <-
annotatedBSJsToAnalyze$endDownBSE
grCoords$endDownGR <-
annotatedBSJsToAnalyze$endDownBSE + lExon
}
grCoords <- grCoords %>%
dplyr::select(startUpGR,
endUpGR,
startDownGR,
endDownGR)
return(grCoords)
}
# The function getBSEcoords() retrieves the coordinates of the bse
.getBSEcoords <- function(annotatedBSJs) {
# Create an empty dataframe
grCoords <- .getGRcoordsDF(annotatedBSJs)
for (i in seq_along(annotatedBSJs$id)) {
grCoords$id[i] <- annotatedBSJs$id[i]
grCoords$gene[i] <- annotatedBSJs$gene[i]
grCoords$transcript[i] <- annotatedBSJs$transcript[i]
grCoords$strand[i] <- annotatedBSJs$strand[i]
grCoords$chrom[i] <- annotatedBSJs$chrom[i]
if (!is.na(grCoords$transcript[i])) {
# POSITIVE STRAND
if (annotatedBSJs$strand[i] == "+") {
# For the upstream genomic range
grCoords$startUpGR[i] <- annotatedBSJs$startUpBSE[i]
grCoords$endUpGR[i] <- annotatedBSJs$endUpBSE[i]
# For the downstream genomic range
grCoords$startDownGR[i] <-
annotatedBSJs$startDownBSE[i]
grCoords$endDownGR[i] <- annotatedBSJs$endDownBSE[i]
# NEGATIVE STRAND
} else if (annotatedBSJs$strand[i] == "-") {
# For the upstream genomic range
grCoords$startUpGR[i] <- annotatedBSJs$endUpBSE[i]
grCoords$endUpGR[i] <- annotatedBSJs$startUpBSE[i]
# For the downstream genomic range
grCoords$startDownGR[i] <-
annotatedBSJs$endDownBSE[i]
grCoords$endDownGR[i] <-
annotatedBSJs$startDownBSE[i]
}
}
}
return (grCoords)
}
#The function getFIcoords() retrieves the coordinates of
# the introns flanking the back-spliced exons from the annotateBSJs data frame.
.getFIcoords <- function(annotatedBSJs) {
# Create an empty dataframe
grCoords <- .getGRcoordsDF(annotatedBSJs)
for (i in seq_along(annotatedBSJs$id)) {
grCoords$id[i] <- annotatedBSJs$id[i]
grCoords$gene[i] <- annotatedBSJs$gene[i]
grCoords$transcript[i] <- annotatedBSJs$transcript[i]
grCoords$strand[i] <- annotatedBSJs$strand[i]
grCoords$chrom[i] <- annotatedBSJs$chrom[i]
if (!is.na(grCoords$transcript[i])) {
# POSITIVE STRAND
if (annotatedBSJs$strand[i] == "+") {
# For the upstream genomic range
grCoords$startUpGR[i] <-
annotatedBSJs$startUpIntron[i]
grCoords$endUpGR[i] <- annotatedBSJs$endUpIntron[i]
# For the downstream genomic range
grCoords$startDownGR[i] <-
annotatedBSJs$startDownIntron[i]
grCoords$endDownGR[i] <-
annotatedBSJs$endDownIntron[i]
# For NEGATIVE strand
} else if (annotatedBSJs$strand[i] == "-") {
# For the upstream genomic range
grCoords$startUpGR[i] <-
annotatedBSJs$endUpIntron[i]
grCoords$endUpGR[i] <-
annotatedBSJs$startUpIntron[i]
# For the downstream genomic range
grCoords$startDownGR[i] <-
annotatedBSJs$endDownIntron[i]
grCoords$endDownGR[i] <-
annotatedBSJs$startDownIntron[i]
}
}
}
return (grCoords)
}
# The function getGRcolNames() returns the column names
.getGRcolNames <- function() {
grColNames <- c(
"id",
"gene",
"transcript",
"strand",
"chrom",
"startUpGR",
"endUpGR",
"startDownGR",
"endDownGR"
)
return(grColNames)
}
# Create grCoords data frame
.getGRcoordsDF <- function(annotatedBSJs) {
grCoords <-
data.frame(matrix(
nrow = nrow(annotatedBSJs),
ncol = length(.getGRcolNames())
))
colnames(grCoords) <- .getGRcolNames()
return(grCoords)
}
# get sequences from BS genome
.getSeqFromBSgenome <- function(genome, chrom, startGR, endGR) {
seq <- BSgenome::getSeq(genome,
names = chrom,
startGR,
endGR)
return(seq)
}
# If the function you are looking for is not here check supportFunction.R
# Functions in supportFunction.R are used by multiple functions.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.