R/getSeqsFromGRs.R

Defines functions .getSeqFromBSgenome .getGRcoordsDF .getGRcolNames .getFIcoords .getBSEcoords .grCoordsForNegative .grCoordsForPositive .getIECoords getSeqsFromGRs

Documented in getSeqsFromGRs

#' @title Retrieve sequences flanking back-spliced junctions
#'
#' @description The function getSeqsFromGRs() includes 3 modules to retrieve
#' 3 types of sequences. Sequences of the introns flanking back-spliced
#' junctions, sequences from a defined genomic window surrounding the
#' back-spliced junctions and sequences of the back-spliced exons.
#'
#' @param annotatedBSJs A data frame with the annotated back-spliced junctions.
#' This data frame can be generated with \code{\link{annotateBSJs}}.
#'
#' @param genome A BSgenome object containing the genome sequences.
#' It can be generated with in \code{\link{getBSgenome}}.
#' See \code{\link[BSgenome]{available.genomes}} to see the BSgenome package
#' currently available
#'
#' @param lIntron An integer indicating how many nucleotides are taken from
#' the introns flanking the back-spliced junctions. This number must be
#' positive. Default value is 100.
#'
#' @param lExon An integer indicating how many nucleotides are taken from
#' the back-spliced exons starting from the back-spliced junctions. This number
#' must be positive. Default value is 10.
#'
#' @param type A character string specifying the sequences to retrieve.
#' If type = "ie" the sequences are retrieved from the the genomic ranges
#' defined by using the lIntron and lExon given in input. If type = "bse"
#' the sequences of the back-spliced exons are retrieved. If type = "fi"
#' the sequences of the introns flanking the back-spliced exons are retrieved.
#' Default value is "ie".
#'
#' @return A list.
#'
#' @examples
#' # Load data frame containing predicted back-spliced junctions
#' data("mergedBSJunctions")
#'
#' # Load short version of the gencode v19 annotation file
#' data("gtf")
#'
#' # Annotate the first back-spliced junctions
#' annotatedBSJs <- annotateBSJs(mergedBSJunctions[1, ], gtf)
#'
#' # Get genome
#' if (requireNamespace("BSgenome.Hsapiens.UCSC.hg19", quietly = TRUE)){
#' genome <- BSgenome::getBSgenome("BSgenome.Hsapiens.UCSC.hg19")
#'
#' # Retrieve target sequences
#' targets <- getSeqsFromGRs(
#'     annotatedBSJs,
#'     genome,
#'     lIntron = 200,
#'     lExon = 10,
#'     type = "ie"
#'     )
#' }
#' 
#' @importFrom Biostrings reverseComplement
#' @importFrom BSgenome getSeq
#' @import BSgenome.Hsapiens.UCSC.hg19
#' @export
getSeqsFromGRs <-
    function(annotatedBSJs,
        genome,
        lIntron = 100,
        lExon = 10,
        type = "ie") {
        # only 3 options are possible for the argument type
        match.arg(type, c("ie", "bse", "fi"))

        # Retrieve the coordinates from whcih to extract the sequences.
        grCoords <- switch(
            type,
            bse = .getBSEcoords(annotatedBSJs),
            fi = .getFIcoords(annotatedBSJs),
            ie = .getIECoords(annotatedBSJs, lIntron, lExon)
        )
        # Create a list with 2 elements to store the sequences extracted from
        # the upstream and downstream genomic ranges
        targets <- vector("list", 2)
        names(targets)[1] <- "upGR"
        names(targets)[2] <- "downGR"

        for (i in seq_along(targets)) {
            # Create an empty list of data frames
            targets[[i]] <- .getTargetsDF(annotatedBSJs)

            if (i == 1) {
                indexStart <- which(colnames(grCoords) == "startUpGR")
                indexEnd <- which(colnames(grCoords) == "endUpGR")
            } else{
                indexStart <- which(colnames(grCoords) == "startDownGR")
                indexEnd <- which(colnames(grCoords) == "endDownGR")
            }
            # Fill the data frame with the extracted genomic range coordinates
            targets[[i]]$id <- grCoords$id
            targets[[i]]$gene <- grCoords$gene
            targets[[i]]$transcript <- grCoords$transcript
            targets[[i]]$strand <- grCoords$strand
            targets[[i]]$chrom <- grCoords$chrom
            targets[[i]]$startGR <- grCoords[, indexStart]
            targets[[i]]$endGR <- grCoords[, indexEnd]
            targets[[i]]$type <- rep(type, nrow(targets[[i]]))

            for (j in seq_along(targets[[1]]$id)) {
                chrom <- targets[[i]]$chrom[j]
                startGR <- targets[[i]]$startGR[j]
                endGR <- targets[[i]]$endGR[j]
                strand <- targets[[i]]$strand[j]
                seq <-
                    .getSeqFromBSgenome(genome, chrom, startGR, endGR)

                # Check coordinates and retrieve the seq
                if (strand == "-" &
                        !is.na(startGR) & !is.na(endGR)) {
                    # For a circRNA that arises from a gene transcribed from
                    # negative strand we take the reverse complement
                    targets[[i]]$seq[j] <-
                        gsub("T",
                            "U",
                            as.character(Biostrings::reverseComplement(seq)))
                    targets[[i]]$length[j] <-
                        nchar(targets[[i]]$seq[j])

                } else if (strand == "+" &
                        !is.na(startGR) & !is.na(endGR)) {
                    # For the positive strand no modification is needed because
                    # the sense strand corresponds to the positive strand that
                    # is the genome reference.
                    targets[[i]]$seq[j] <-
                        gsub("T", "U", as.character(seq))
                    targets[[i]]$length[j] <-
                        nchar(targets[[i]]$seq[j])
                }
            }
        }
        return(targets)
    }


# The function getIECoords() retrieves the coordinates of the
# genomic ranges defined by the values of the arguments lIntron and lExon.
# The start point are the back-spliced junction coordinates reported in the
# annotateBSJs data frame.
.getIECoords <- function(annotatedBSJs,
    lIntron = 100,
    lExon = 10) {
    # lIntron and lExon must be positive numbers
    if (lIntron < 0) {
        stop("lIntron must be a positive number")
    }
    if (lExon < 0) {
        stop("lExon must be a positive number")
    }
    # Create an empty dataframe
    grCoords <- .getGRcoordsDF(annotatedBSJs)

    for (i in seq_along(annotatedBSJs$id)) {
        grCoords$id[i] <- annotatedBSJs$id[i]
        grCoords$gene[i] <- annotatedBSJs$gene[i]
        grCoords$transcript[i] <- annotatedBSJs$transcript[i]
        grCoords$strand[i] <- annotatedBSJs$strand[i]
        grCoords$chrom[i] <- annotatedBSJs$chrom[i]

        if (!is.na(annotatedBSJs$transcript[i]) &
                !is.na(annotatedBSJs$strand[i])) {
            annotatedBSJsToAnalyze <- annotatedBSJs[i,]
            # Get upstream and dowstream genomic ranges coordinates
            # for POSITIVE STRAND
            if (annotatedBSJsToAnalyze$strand == "+") {
                grCoordsToAnalyze <-
                    .grCoordsForPositive(annotatedBSJsToAnalyze, lIntron, lExon)
                grCoords$startUpGR[i] <- grCoordsToAnalyze$startUpGR
                grCoords$endUpGR[i] <- grCoordsToAnalyze$endUpGR
                grCoords$startDownGR[i] <-
                    grCoordsToAnalyze$startDownGR
                grCoords$endDownGR[i] <- grCoordsToAnalyze$endDownGR

            }
            # Get upstream and dowstream genomic ranges coordinates
            # for NEGATIVE STRAND
            if (annotatedBSJsToAnalyze$strand == "-") {
                grCoordsToAnalyze <-
                    .grCoordsForNegative(annotatedBSJsToAnalyze, lIntron, lExon)
                grCoords$startUpGR[i] <- grCoordsToAnalyze$startUpGR
                grCoords$endUpGR[i] <- grCoordsToAnalyze$endUpGR
                grCoords$startDownGR[i] <-
                    grCoordsToAnalyze$startDownGR
                grCoords$endDownGR[i] <- grCoordsToAnalyze$endDownGR
            }
        }
    }
    return (grCoords)
}

# Get upstream and dowstream genomic ranges coordinates for positive strand
.grCoordsForPositive <-
    function(annotatedBSJsToAnalyze,
        lIntron = 100,
        lExon = 10) {
        # Create an empty dataframe
        grCoords <- .getGRcoordsDF(annotatedBSJsToAnalyze)
        # When the BSEs are NOT the FIRST and LAST of the transcript
        if (!is.na(annotatedBSJsToAnalyze$endUpIntron) &
                !is.na(annotatedBSJsToAnalyze$startDownIntron)) {
            # For the upstream genomic range
            grCoords$startUpGR <-
                (annotatedBSJsToAnalyze$startUpBSE) - lIntron
            grCoords$endUpGR <-
                annotatedBSJsToAnalyze$startUpBSE + lExon

            # For the downstream genomic range
            grCoords$startDownGR <-
                annotatedBSJsToAnalyze$endDownBSE - lExon
            grCoords$endDownGR <-
                (annotatedBSJsToAnalyze$endDownBSE) + lIntron

            # When the upstream back-spliced exon IS the FIRST of the
            # transcript
        } else if (is.na(annotatedBSJsToAnalyze$endUpIntron) &
                !is.na(annotatedBSJsToAnalyze$startDownIntron)) {
            # For the upstream genomic range
            grCoords$startUpGR <-
                (annotatedBSJsToAnalyze$startUpBSE)
            grCoords$endUpGR <-
                annotatedBSJsToAnalyze$startUpBSE + lExon

            # For the downstream genomic range
            grCoords$startDownGR <-
                annotatedBSJsToAnalyze$endDownBSE - lExon
            grCoords$endDownGR <-
                (annotatedBSJsToAnalyze$endDownBSE) + lIntron

            # When the downstream back-spliced exon IS the LAST of the
            # transcript
        } else if (!is.na(annotatedBSJsToAnalyze$endUpIntron) &
                is.na(annotatedBSJsToAnalyze$startDownIntron)) {
            # For the upstream genomic range
            grCoords$startUpGR <-
                (annotatedBSJsToAnalyze$startUpBSE) - lIntron
            grCoords$endUpGR <-
                annotatedBSJsToAnalyze$startUpBSE + lExon

            # For the downstream genomic range
            grCoords$startDownGR <-
                annotatedBSJsToAnalyze$endDownBSE - lExon
            grCoords$endDownGR <-
                (annotatedBSJsToAnalyze$endDownBSE)

            # When the back-spliced exons ARE the FIRST and LAST of the
            # transcript
        } else if (is.na(annotatedBSJsToAnalyze$endUpIntron) &
                is.na(annotatedBSJsToAnalyze$startDownIntron)) {
            # For the upstream genomic range
            grCoords$startUpGR <-
                (annotatedBSJsToAnalyze$startUpBSE)
            grCoords$endUpGR <-
                annotatedBSJsToAnalyze$startUpBSE + lExon

            # For the downstream genomic range
            grCoords$startDownGR <-
                annotatedBSJsToAnalyze$endDownBSE - lExon
            grCoords$endDownGR <-
                (annotatedBSJsToAnalyze$endDownBSE)
        }

        grCoords <- grCoords %>%
            dplyr::select(.data$startUpGR,
                .data$endUpGR,
                .data$startDownGR,
                .data$endDownGR)
        return(grCoords)
    }


# Get startUpGR and endUpGR for negative strand
.grCoordsForNegative <-
    function(annotatedBSJsToAnalyze,
        lIntron = 100,
        lExon = 10) {
        # Create an empty dataframe
        grCoords <- .getGRcoordsDF(annotatedBSJsToAnalyze)
        # When the back-spliced exons are NOT the FIRST and LAST of the
        # transcript
        if (!is.na(annotatedBSJsToAnalyze$endUpIntron) &
                !is.na(annotatedBSJsToAnalyze$startDownIntron)) {
            # For the upstream genomic range
            grCoords$startUpGR <-
                annotatedBSJsToAnalyze$startUpBSE - lExon
            grCoords$endUpGR <-
                annotatedBSJsToAnalyze$startUpBSE + lIntron

            # For the downstream genomic range
            grCoords$startDownGR <-
                annotatedBSJsToAnalyze$endDownBSE - lIntron
            grCoords$endDownGR <-
                annotatedBSJsToAnalyze$endDownBSE + lExon

            # When the upstream back-spliced exon IS the FIRST of the
            # transcript
        } else if (is.na(annotatedBSJsToAnalyze$endUpIntron) &
                !is.na(annotatedBSJsToAnalyze$startDownIntron)) {
            # For the upstream genomic range
            grCoords$startUpGR <-
                annotatedBSJsToAnalyze$startUpBSE - lExon
            grCoords$endUpGR <- annotatedBSJsToAnalyze$startUpBSE

            # For the downstream genomic range
            grCoords$startDownGR <-
                annotatedBSJsToAnalyze$endDownBSE - lIntron
            grCoords$endDownGR <-
                annotatedBSJsToAnalyze$endDownBSE + lExon

            # When the downstream the back-spliced exon IS the LAST of
            # the transcript
        } else if (!is.na(annotatedBSJsToAnalyze$endUpIntron) &
                is.na(annotatedBSJsToAnalyze$startDownIntron)) {
            # For the upstream genomic range
            grCoords$startUpGR <-
                annotatedBSJsToAnalyze$startUpBSE - lExon
            grCoords$endUpGR <-
                annotatedBSJsToAnalyze$startUpBSE + lIntron

            # For the downstream genomic range
            grCoords$startDownGR <-
                annotatedBSJsToAnalyze$endDownBSE
            grCoords$endDownGR <-
                annotatedBSJsToAnalyze$endDownBSE + lExon

            # When the back-spliced exons ARE the FIRST and LAST of the
            # transcript
        } else if (is.na(annotatedBSJsToAnalyze$endUpIntron) &
                is.na(annotatedBSJsToAnalyze$startDownIntron)) {
            # For the upstream genomic range
            grCoords$startUpGR <-
                annotatedBSJsToAnalyze$startUpBSE - lExon
            grCoords$endUpGR <- annotatedBSJsToAnalyze$startUpBSE

            # For the downstream genomic range
            grCoords$startDownGR <-
                annotatedBSJsToAnalyze$endDownBSE
            grCoords$endDownGR <-
                annotatedBSJsToAnalyze$endDownBSE + lExon
        }

        grCoords <- grCoords %>%
            dplyr::select(.data$startUpGR,
                .data$endUpGR,
                .data$startDownGR,
                .data$endDownGR)

        return(grCoords)
    }



# The function getBSEcoords() retrieves the coordinates of the
.getBSEcoords <- function(annotatedBSJs) {
    # Create an empty dataframe
    grCoords <- .getGRcoordsDF(annotatedBSJs)

    for (i in seq_along(annotatedBSJs$id)) {
        grCoords$id[i] <- annotatedBSJs$id[i]
        grCoords$gene[i] <- annotatedBSJs$gene[i]
        grCoords$transcript[i] <- annotatedBSJs$transcript[i]
        grCoords$strand[i] <- annotatedBSJs$strand[i]
        grCoords$chrom[i] <- annotatedBSJs$chrom[i]

        if (!is.na(grCoords$transcript[i])) {
            # POSITIVE STRAND
            if (annotatedBSJs$strand[i] == "+") {
                # For the upstream genomic range
                grCoords$startUpGR[i] <- annotatedBSJs$startUpBSE[i]
                grCoords$endUpGR[i] <- annotatedBSJs$endUpBSE[i]
                # For the downstream genomic range
                grCoords$startDownGR[i] <-
                    annotatedBSJs$startDownBSE[i]
                grCoords$endDownGR[i] <- annotatedBSJs$endDownBSE[i]

                # NEGATIVE STRAND
            } else if (annotatedBSJs$strand[i] == "-") {
                # For the upstream genomic range
                grCoords$startUpGR[i] <- annotatedBSJs$endUpBSE[i]
                grCoords$endUpGR[i] <- annotatedBSJs$startUpBSE[i]
                # For the downstream genomic range
                grCoords$startDownGR[i] <-
                    annotatedBSJs$endDownBSE[i]
                grCoords$endDownGR[i] <-
                    annotatedBSJs$startDownBSE[i]

            }
        }
    }
    return (grCoords)

}


#The function getFIcoords() retrieves the coordinates of
# the introns flanking the back-spliced exons from the annotateBSJs data frame.
.getFIcoords <- function(annotatedBSJs) {
    # Create an empty dataframe
    grCoords <- .getGRcoordsDF(annotatedBSJs)

    for (i in seq_along(annotatedBSJs$id)) {
        grCoords$id[i] <- annotatedBSJs$id[i]
        grCoords$gene[i] <- annotatedBSJs$gene[i]
        grCoords$transcript[i] <- annotatedBSJs$transcript[i]
        grCoords$strand[i] <- annotatedBSJs$strand[i]
        grCoords$chrom[i] <- annotatedBSJs$chrom[i]

        if (!is.na(grCoords$transcript[i])) {
            # POSITIVE STRAND
            if (annotatedBSJs$strand[i] == "+") {
                # For the upstream genomic range
                grCoords$startUpGR[i] <-
                    annotatedBSJs$startUpIntron[i]
                grCoords$endUpGR[i] <- annotatedBSJs$endUpIntron[i]

                # For the downstream genomic range
                grCoords$startDownGR[i] <-
                    annotatedBSJs$startDownIntron[i]
                grCoords$endDownGR[i] <-
                    annotatedBSJs$endDownIntron[i]

                # For NEGATIVE strand
            } else if (annotatedBSJs$strand[i] == "-") {
                # For the upstream genomic range
                grCoords$startUpGR[i] <-
                    annotatedBSJs$endUpIntron[i]
                grCoords$endUpGR[i] <-
                    annotatedBSJs$startUpIntron[i]

                # For the downstream genomic range
                grCoords$startDownGR[i] <-
                    annotatedBSJs$endDownIntron[i]
                grCoords$endDownGR[i] <-
                    annotatedBSJs$startDownIntron[i]
            }
        }
    }
    return (grCoords)
}


# The function getGRcolNames() returns the column names
.getGRcolNames <- function() {
    grColNames <- c(
        "id",
        "gene",
        "transcript",
        "strand",
        "chrom",
        "startUpGR",
        "endUpGR",
        "startDownGR",
        "endDownGR"
    )
    return(grColNames)
}

# Create grCoords data frame
.getGRcoordsDF <- function(annotatedBSJs) {
    grCoords <-
        data.frame(matrix(
            nrow = nrow(annotatedBSJs),
            ncol = length(.getGRcolNames())
        ))
    colnames(grCoords) <- .getGRcolNames()
    return(grCoords)
}


# get sequences from BS genome
.getSeqFromBSgenome <- function(genome, chrom, startGR, endGR) {
    seq <- BSgenome::getSeq(genome,
        names = chrom,
        startGR,
        endGR)
    return(seq)
}


# If the function you are looking for is not here check supportFunction.R
# Functions in supportFunction.R are used by multiple functions.

Try the circRNAprofiler package in your browser

Any scripts or data that you put into this service are public.

circRNAprofiler documentation built on March 6, 2021, 2 a.m.