R/getSeqsFromGRs.R

Defines functions .getSeqFromBSgenome .getGRcoordsDF .getGRcolNames .getFIcoords .getBSEcoords .grCoordsForNegative .grCoordsForPositive .getIECoords getSeqsFromGRs

Documented in getSeqsFromGRs

#' @title Retrieve sequences flanking back-spliced junctions
#'
#' @description The function getSeqsFromGRs() includes 3 modules to retrieve
#' 3 types of sequences. Sequences of the introns flanking back-spliced
#' junctions, sequences from a defined genomic window surrounding the
#' back-spliced junctions and sequences of the back-spliced exons.
#'
#' @param annotatedBSJs A data frame with the annotated back-spliced junctions.
#' This data frame can be generated with \code{\link{annotateBSJs}}.
#'
#' @param genome A BSgenome object containing the genome sequences.
#' It can be generated with in \code{\link{getBSgenome}}.
#' See \code{\link[BSgenome]{available.genomes}} to see the BSgenome package
#' currently available
#'
#' @param lIntron An integer indicating how many nucleotides are taken from
#' the introns flanking the back-spliced junctions. This number must be
#' positive. Default value is 100.
#'
#' @param lExon An integer indicating how many nucleotides are taken from
#' the back-spliced exons starting from the back-spliced junctions. This number
#' must be positive. Default value is 10.
#'
#' @param type A character string specifying the sequences to retrieve.
#' If type = "ie" the sequences are retrieved from the the genomic ranges
#' defined by using the lIntron and lExon given in input. If type = "bse"
#' the sequences of the back-spliced exons are retrieved. If type = "fi"
#' the sequences of the introns flanking the back-spliced exons are retrieved.
#' Default value is "ie".
#'
#' @return A list.
#'
#' @examples
#' # Load data frame containing predicted back-spliced junctions
#' data("mergedBSJunctions")
#'
#' # Load short version of the gencode v19 annotation file
#' data("gtf")
#'
#' # Annotate the first back-spliced junctions
#' annotatedBSJs <- annotateBSJs(mergedBSJunctions[1, ], gtf)
#'
#' # Get genome
#' if (requireNamespace("BSgenome.Hsapiens.UCSC.hg19", quietly = TRUE)){
#' genome <- BSgenome::getBSgenome("BSgenome.Hsapiens.UCSC.hg19")
#'
#' # Retrieve target sequences
#' targets <- getSeqsFromGRs(
#'     annotatedBSJs,
#'     genome,
#'     lIntron = 200,
#'     lExon = 10,
#'     type = "ie"
#'     )
#' }
#' 
#' @importFrom Biostrings reverseComplement
#' @importFrom BSgenome getSeq
#' @import BSgenome.Hsapiens.UCSC.hg19
#' @export
getSeqsFromGRs <-
    function(annotatedBSJs,
        genome,
        lIntron = 100,
        lExon = 10,
        type = "ie") {
        # only 3 options are possible for the argument type
        match.arg(type, c("ie", "bse", "fi"))

        # Retrieve the coordinates from whcih to extract the sequences.
        grCoords <- switch(
            type,
            bse = .getBSEcoords(annotatedBSJs),
            fi = .getFIcoords(annotatedBSJs),
            ie = .getIECoords(annotatedBSJs, lIntron, lExon)
        )
        # Create a list with 2 elements to store the sequences extracted from
        # the upstream and downstream genomic ranges
        targets <- vector("list", 2)
        names(targets)[1] <- "upGR"
        names(targets)[2] <- "downGR"

        for (i in seq_along(targets)) {
            # Create an empty list of data frames
            targets[[i]] <- .getTargetsDF(annotatedBSJs)

            if (i == 1) {
                indexStart <- which(colnames(grCoords) == "startUpGR")
                indexEnd <- which(colnames(grCoords) == "endUpGR")
            } else{
                indexStart <- which(colnames(grCoords) == "startDownGR")
                indexEnd <- which(colnames(grCoords) == "endDownGR")
            }
            # Fill the data frame with the extracted genomic range coordinates
            targets[[i]]$id <- grCoords$id
            targets[[i]]$gene <- grCoords$gene
            targets[[i]]$transcript <- grCoords$transcript
            targets[[i]]$strand <- grCoords$strand
            targets[[i]]$chrom <- grCoords$chrom
            targets[[i]]$startGR <- grCoords[, indexStart]
            targets[[i]]$endGR <- grCoords[, indexEnd]
            targets[[i]]$type <- rep(type, nrow(targets[[i]]))

            for (j in seq_along(targets[[1]]$id)) {
                chrom <- targets[[i]]$chrom[j]
                startGR <- targets[[i]]$startGR[j]
                endGR <- targets[[i]]$endGR[j]
                strand <- targets[[i]]$strand[j]
                seq <-
                    .getSeqFromBSgenome(genome, chrom, startGR, endGR)

                # Check coordinates and retrieve the seq
                if (strand == "-" &
                        !is.na(startGR) & !is.na(endGR)) {
                    # For a circRNA that arises from a gene transcribed from
                    # negative strand we take the reverse complement
                    targets[[i]]$seq[j] <-
                        gsub("T",
                            "U",
                            as.character(Biostrings::reverseComplement(seq)))
                    targets[[i]]$length[j] <-
                        nchar(targets[[i]]$seq[j])

                } else if (strand == "+" &
                        !is.na(startGR) & !is.na(endGR)) {
                    # For the positive strand no modification is needed because
                    # the sense strand corresponds to the positive strand that
                    # is the genome reference.
                    targets[[i]]$seq[j] <-
                        gsub("T", "U", as.character(seq))
                    targets[[i]]$length[j] <-
                        nchar(targets[[i]]$seq[j])
                }
            }
        }
        return(targets)
    }


# The function getIECoords() retrieves the coordinates of the
# genomic ranges defined by the values of the arguments lIntron and lExon.
# The start point are the back-spliced junction coordinates reported in the
# annotateBSJs data frame.
.getIECoords <- function(annotatedBSJs,
    lIntron = 100,
    lExon = 10) {
    # lIntron and lExon must be positive numbers
    if (lIntron < 0) {
        stop("lIntron must be a positive number")
    }
    if (lExon < 0) {
        stop("lExon must be a positive number")
    }
    # Create an empty dataframe
    grCoords <- .getGRcoordsDF(annotatedBSJs)

    for (i in seq_along(annotatedBSJs$id)) {
        grCoords$id[i] <- annotatedBSJs$id[i]
        grCoords$gene[i] <- annotatedBSJs$gene[i]
        grCoords$transcript[i] <- annotatedBSJs$transcript[i]
        grCoords$strand[i] <- annotatedBSJs$strand[i]
        grCoords$chrom[i] <- annotatedBSJs$chrom[i]

        if (!is.na(annotatedBSJs$transcript[i]) &
                !is.na(annotatedBSJs$strand[i])) {
            annotatedBSJsToAnalyze <- annotatedBSJs[i,]
            # Get upstream and dowstream genomic ranges coordinates
            # for POSITIVE STRAND
            if (annotatedBSJsToAnalyze$strand == "+") {
                grCoordsToAnalyze <-
                    .grCoordsForPositive(annotatedBSJsToAnalyze, lIntron, lExon)
                grCoords$startUpGR[i] <- grCoordsToAnalyze$startUpGR
                grCoords$endUpGR[i] <- grCoordsToAnalyze$endUpGR
                grCoords$startDownGR[i] <-
                    grCoordsToAnalyze$startDownGR
                grCoords$endDownGR[i] <- grCoordsToAnalyze$endDownGR

            }
            # Get upstream and dowstream genomic ranges coordinates
            # for NEGATIVE STRAND
            if (annotatedBSJsToAnalyze$strand == "-") {
                grCoordsToAnalyze <-
                    .grCoordsForNegative(annotatedBSJsToAnalyze, lIntron, lExon)
                grCoords$startUpGR[i] <- grCoordsToAnalyze$startUpGR
                grCoords$endUpGR[i] <- grCoordsToAnalyze$endUpGR
                grCoords$startDownGR[i] <-
                    grCoordsToAnalyze$startDownGR
                grCoords$endDownGR[i] <- grCoordsToAnalyze$endDownGR
            }
        }
    }
    return (grCoords)
}

# Get upstream and dowstream genomic ranges coordinates for positive strand
.grCoordsForPositive <-
    function(annotatedBSJsToAnalyze,
        lIntron = 100,
        lExon = 10) {
        # Create an empty dataframe
        grCoords <- .getGRcoordsDF(annotatedBSJsToAnalyze)
        # When the BSEs are NOT the FIRST and LAST of the transcript
        if (!is.na(annotatedBSJsToAnalyze$endUpIntron) &
                !is.na(annotatedBSJsToAnalyze$startDownIntron)) {
            # For the upstream genomic range
            grCoords$startUpGR <-
                (annotatedBSJsToAnalyze$startUpBSE) - lIntron
            grCoords$endUpGR <-
                annotatedBSJsToAnalyze$startUpBSE + lExon

            # For the downstream genomic range
            grCoords$startDownGR <-
                annotatedBSJsToAnalyze$endDownBSE - lExon
            grCoords$endDownGR <-
                (annotatedBSJsToAnalyze$endDownBSE) + lIntron

            # When the upstream back-spliced exon IS the FIRST of the
            # transcript
        } else if (is.na(annotatedBSJsToAnalyze$endUpIntron) &
                !is.na(annotatedBSJsToAnalyze$startDownIntron)) {
            # For the upstream genomic range
            grCoords$startUpGR <-
                (annotatedBSJsToAnalyze$startUpBSE)
            grCoords$endUpGR <-
                annotatedBSJsToAnalyze$startUpBSE + lExon

            # For the downstream genomic range
            grCoords$startDownGR <-
                annotatedBSJsToAnalyze$endDownBSE - lExon
            grCoords$endDownGR <-
                (annotatedBSJsToAnalyze$endDownBSE) + lIntron

            # When the downstream back-spliced exon IS the LAST of the
            # transcript
        } else if (!is.na(annotatedBSJsToAnalyze$endUpIntron) &
                is.na(annotatedBSJsToAnalyze$startDownIntron)) {
            # For the upstream genomic range
            grCoords$startUpGR <-
                (annotatedBSJsToAnalyze$startUpBSE) - lIntron
            grCoords$endUpGR <-
                annotatedBSJsToAnalyze$startUpBSE + lExon

            # For the downstream genomic range
            grCoords$startDownGR <-
                annotatedBSJsToAnalyze$endDownBSE - lExon
            grCoords$endDownGR <-
                (annotatedBSJsToAnalyze$endDownBSE)

            # When the back-spliced exons ARE the FIRST and LAST of the
            # transcript
        } else if (is.na(annotatedBSJsToAnalyze$endUpIntron) &
                is.na(annotatedBSJsToAnalyze$startDownIntron)) {
            # For the upstream genomic range
            grCoords$startUpGR <-
                (annotatedBSJsToAnalyze$startUpBSE)
            grCoords$endUpGR <-
                annotatedBSJsToAnalyze$startUpBSE + lExon

            # For the downstream genomic range
            grCoords$startDownGR <-
                annotatedBSJsToAnalyze$endDownBSE - lExon
            grCoords$endDownGR <-
                (annotatedBSJsToAnalyze$endDownBSE)
        }

        grCoords <- grCoords %>%
            dplyr::select(startUpGR,
                endUpGR,
                startDownGR,
                endDownGR)
        return(grCoords)
    }


# Get startUpGR and endUpGR for negative strand
.grCoordsForNegative <-
    function(annotatedBSJsToAnalyze,
        lIntron = 100,
        lExon = 10) {
        # Create an empty dataframe
        grCoords <- .getGRcoordsDF(annotatedBSJsToAnalyze)
        # When the back-spliced exons are NOT the FIRST and LAST of the
        # transcript
        if (!is.na(annotatedBSJsToAnalyze$endUpIntron) &
                !is.na(annotatedBSJsToAnalyze$startDownIntron)) {
            # For the upstream genomic range
            grCoords$startUpGR <-
                annotatedBSJsToAnalyze$startUpBSE - lExon
            grCoords$endUpGR <-
                annotatedBSJsToAnalyze$startUpBSE + lIntron

            # For the downstream genomic range
            grCoords$startDownGR <-
                annotatedBSJsToAnalyze$endDownBSE - lIntron
            grCoords$endDownGR <-
                annotatedBSJsToAnalyze$endDownBSE + lExon

            # When the upstream back-spliced exon IS the FIRST of the
            # transcript
        } else if (is.na(annotatedBSJsToAnalyze$endUpIntron) &
                !is.na(annotatedBSJsToAnalyze$startDownIntron)) {
            # For the upstream genomic range
            grCoords$startUpGR <-
                annotatedBSJsToAnalyze$startUpBSE - lExon
            grCoords$endUpGR <- annotatedBSJsToAnalyze$startUpBSE

            # For the downstream genomic range
            grCoords$startDownGR <-
                annotatedBSJsToAnalyze$endDownBSE - lIntron
            grCoords$endDownGR <-
                annotatedBSJsToAnalyze$endDownBSE + lExon

            # When the downstream the back-spliced exon IS the LAST of
            # the transcript
        } else if (!is.na(annotatedBSJsToAnalyze$endUpIntron) &
                is.na(annotatedBSJsToAnalyze$startDownIntron)) {
            # For the upstream genomic range
            grCoords$startUpGR <-
                annotatedBSJsToAnalyze$startUpBSE - lExon
            grCoords$endUpGR <-
                annotatedBSJsToAnalyze$startUpBSE + lIntron

            # For the downstream genomic range
            grCoords$startDownGR <-
                annotatedBSJsToAnalyze$endDownBSE
            grCoords$endDownGR <-
                annotatedBSJsToAnalyze$endDownBSE + lExon

            # When the back-spliced exons ARE the FIRST and LAST of the
            # transcript
        } else if (is.na(annotatedBSJsToAnalyze$endUpIntron) &
                is.na(annotatedBSJsToAnalyze$startDownIntron)) {
            # For the upstream genomic range
            grCoords$startUpGR <-
                annotatedBSJsToAnalyze$startUpBSE - lExon
            grCoords$endUpGR <- annotatedBSJsToAnalyze$startUpBSE

            # For the downstream genomic range
            grCoords$startDownGR <-
                annotatedBSJsToAnalyze$endDownBSE
            grCoords$endDownGR <-
                annotatedBSJsToAnalyze$endDownBSE + lExon
        }

        grCoords <- grCoords %>%
            dplyr::select(startUpGR,
                endUpGR,
                startDownGR,
                endDownGR)

        return(grCoords)
    }



# The function getBSEcoords() retrieves the coordinates of the bse
.getBSEcoords <- function(annotatedBSJs) {
    # Create an empty dataframe
    grCoords <- .getGRcoordsDF(annotatedBSJs)

    for (i in seq_along(annotatedBSJs$id)) {
        grCoords$id[i] <- annotatedBSJs$id[i]
        grCoords$gene[i] <- annotatedBSJs$gene[i]
        grCoords$transcript[i] <- annotatedBSJs$transcript[i]
        grCoords$strand[i] <- annotatedBSJs$strand[i]
        grCoords$chrom[i] <- annotatedBSJs$chrom[i]

        if (!is.na(grCoords$transcript[i])) {
            # POSITIVE STRAND
            if (annotatedBSJs$strand[i] == "+") {
                # For the upstream genomic range
                grCoords$startUpGR[i] <- annotatedBSJs$startUpBSE[i]
                grCoords$endUpGR[i] <- annotatedBSJs$endUpBSE[i]
                # For the downstream genomic range
                grCoords$startDownGR[i] <-
                    annotatedBSJs$startDownBSE[i]
                grCoords$endDownGR[i] <- annotatedBSJs$endDownBSE[i]

                # NEGATIVE STRAND
            } else if (annotatedBSJs$strand[i] == "-") {
                # For the upstream genomic range
                grCoords$startUpGR[i] <- annotatedBSJs$endUpBSE[i]
                grCoords$endUpGR[i] <- annotatedBSJs$startUpBSE[i]
                # For the downstream genomic range
                grCoords$startDownGR[i] <-
                    annotatedBSJs$endDownBSE[i]
                grCoords$endDownGR[i] <-
                    annotatedBSJs$startDownBSE[i]

            }
        }
    }
    return (grCoords)

}


#The function getFIcoords() retrieves the coordinates of
# the introns flanking the back-spliced exons from the annotateBSJs data frame.
.getFIcoords <- function(annotatedBSJs) {
    # Create an empty dataframe
    grCoords <- .getGRcoordsDF(annotatedBSJs)

    for (i in seq_along(annotatedBSJs$id)) {
        grCoords$id[i] <- annotatedBSJs$id[i]
        grCoords$gene[i] <- annotatedBSJs$gene[i]
        grCoords$transcript[i] <- annotatedBSJs$transcript[i]
        grCoords$strand[i] <- annotatedBSJs$strand[i]
        grCoords$chrom[i] <- annotatedBSJs$chrom[i]

        if (!is.na(grCoords$transcript[i])) {
            # POSITIVE STRAND
            if (annotatedBSJs$strand[i] == "+") {
                # For the upstream genomic range
                grCoords$startUpGR[i] <-
                    annotatedBSJs$startUpIntron[i]
                grCoords$endUpGR[i] <- annotatedBSJs$endUpIntron[i]

                # For the downstream genomic range
                grCoords$startDownGR[i] <-
                    annotatedBSJs$startDownIntron[i]
                grCoords$endDownGR[i] <-
                    annotatedBSJs$endDownIntron[i]

                # For NEGATIVE strand
            } else if (annotatedBSJs$strand[i] == "-") {
                # For the upstream genomic range
                grCoords$startUpGR[i] <-
                    annotatedBSJs$endUpIntron[i]
                grCoords$endUpGR[i] <-
                    annotatedBSJs$startUpIntron[i]

                # For the downstream genomic range
                grCoords$startDownGR[i] <-
                    annotatedBSJs$endDownIntron[i]
                grCoords$endDownGR[i] <-
                    annotatedBSJs$startDownIntron[i]
            }
        }
    }
    return (grCoords)
}


# The function getGRcolNames() returns the column names
.getGRcolNames <- function() {
    grColNames <- c(
        "id",
        "gene",
        "transcript",
        "strand",
        "chrom",
        "startUpGR",
        "endUpGR",
        "startDownGR",
        "endDownGR"
    )
    return(grColNames)
}

# Create grCoords data frame
.getGRcoordsDF <- function(annotatedBSJs) {
    grCoords <-
        data.frame(matrix(
            nrow = nrow(annotatedBSJs),
            ncol = length(.getGRcolNames())
        ))
    colnames(grCoords) <- .getGRcolNames()
    return(grCoords)
}


# get sequences from BS genome
.getSeqFromBSgenome <- function(genome, chrom, startGR, endGR) {
    seq <- BSgenome::getSeq(genome,
        names = chrom,
        startGR,
        endGR)
    return(seq)
}


# If the function you are looking for is not here check supportFunction.R
# Functions in supportFunction.R are used by multiple functions.
Aufiero/circRNAprofiler documentation built on Oct. 31, 2023, 1:18 a.m.