R/annotateRepeats.R

Defines functions .getComplRepeats .getRepeatsColNames annotateRepeats

Documented in annotateRepeats

#' @title Annotate repetitive elements
#'
#' @description The function annotateRepeats() annotates repetitive elements
#' located in the region flanking the back-spliced junctions of each circRNA.
#' Repetitive elements are provided by AnnotationHub storage which
#' collected repeats from RepeatMasker database. See \code{\link{AnnotationHub}}
#' and \url{http://www.repeatmasker.org} for more details.
#' An empty list is returned if none overlapping repeats  are found.
#'
#' @param targets A list containing the target regions to analyze.
#' It can be generated with \code{\link{getSeqsFromGRs}}.
#'
#' @param annotationHubID A string specifying the AnnotationHub id to use.
#' Type data(ahRepeatMasker) to see all possible options. E.g. if AH5122 is
#' specified, repetitive elements from Homo sapiens, genome hg19 will be
#' downloaded and annotated. Default value is "AH5122".
#'
#' @param complementary A logical specifying whether to filter and report
#' only back-spliced junctions of circRNAs which flanking introns contain
#' complementary repeats, that is, repeats belonging to a same family but
#' located on opposite strands.
#'
#' @return A list.
#'
#' @examples
#' # Load data frame containing detected back-spliced junctions
#' data("mergedBSJunctions")
#'
#' # Load short version of the gencode v19 annotation file
#' data("gtf")
#'
#' # Annotate the first back-spliced junctions
#' annotatedBSJs <- annotateBSJs(mergedBSJunctions[1, ], gtf)
#'
#' # Get genome
#' if (requireNamespace("BSgenome.Hsapiens.UCSC.hg19", quietly = TRUE)){
#'
#' genome <- BSgenome::getBSgenome("BSgenome.Hsapiens.UCSC.hg19")
#'
#' # Retrieve targets
#' targets <- getSeqsFromGRs(
#'     annotatedBSJs,
#'     genome,
#'     lIntron = 200,
#'     lExon = 10,
#'     type = "ie"
#'     )
#'
#' # Annotate repeats
#'
#'  repeats <- annotateRepeats(targets, annotationHubID  = "AH5122",
#'  complementary = TRUE)
#'
#' }
#'
#'
#' @import dplyr
#' @import AnnotationHub
#' @importFrom GenomicRanges makeGRangesFromDataFrame
#' @importFrom GenomicRanges findOverlaps
#' @importFrom magrittr %>%
#' @importFrom S4Vectors subjectHits
#' @importFrom S4Vectors queryHits
#' @importFrom rlang .data
#' @importFrom stats setNames
#' @export
annotateRepeats <-
    function(targets,
             annotationHubID = "AH5122",
             complementary = TRUE) {
        if (length(targets) == 2 &
            names(targets)[[1]] == "upGR") {
            # Create an empty list of 2 elements
            repeats <- vector("list", 2)
            names(repeats)[1] <- "upGR"
            names(repeats)[2] <- "downGR"
            
        }  else {
            stop("target sequences not valid, only upstream and downtream GRs
                are allowed.")
        }
        
        ah <- AnnotationHub::AnnotationHub()
        rm <- ah[[annotationHubID]]
        
        # Clean targets from NA value
        targets <- .cleanTargets(targets)
        
        for (i in seq_along(repeats)) {
            # Create an empty list of 2 elements to store the extracted
            # information
            repeats[[i]] <- vector("list", 2)
            names(repeats[[i]])[1] <- "targets"
            names(repeats[[i]])[2] <- "repeats"
            
            targetsToAnalyze <- targets[[i]]
            overlaps <-
                .findOverlappingRepeats(rm, targetsToAnalyze)
            repeats[[i]]$targets <- overlaps$targets
            repeats[[i]]$repeats <- overlaps$repeats
            
        }
        
        # Find repeats of the same family located in the upstream and
        # downstream genomic ranges and located on different strands
        
        if (complementary) {
            repeats <- .getComplRepeats(repeats)
        }
        
        return(repeats)
    }



# The function getRepeatsColNames() returns the column names.
.getRepeatsColNames <- function() {
    repeatsColumns <- c("id",
                        "name",
                        "chrom",
                        "start",
                        "end",
                        "width",
                        "strand",
                        "score")
    return(repeatsColumns)
    
}

# The function getRepeatsColNames() returns complementary repeats.
.getComplRepeats <- function(repeats) {
    upGRs <-
        base::cbind(repeats$upGR$repeats,
                    rep("up", nrow(repeats$upGR$repeats)))
    colnames(upGRs)[9] <- "gr"
    downGRs <-
        base::cbind(repeats$downGR$repeats,
                    rep("down", nrow(repeats$downGR$repeats)))
    colnames(downGRs)[9] <- "gr"
    
    
    # Report only instances where the same repeats (same family) is
    # present in the upstream and downstream genomic ranges and are
    # located on different strands.
    overlaps <-
        S4Vectors::findMatches(upGRs$id, downGRs$id, select = "all")
    
    if (length(overlaps) > 0) {
        df <-
            data.frame(upGRs[S4Vectors::queryHits(overlaps),],
                       downGRs[S4Vectors::subjectHits(overlaps),])
        
        
        matchingRepeats <- df %>%
            dplyr::mutate_if(is.factor, as.character) %>%
            dplyr::filter(
                .data$name == .data$name.1 &
                    .data$gr != .data$gr.1 &
                    .data$strand != .data$strand.1
            )
        
        repeats$upGR$repeats <-
            matchingRepeats[, c(1, 2, 4, 5, 6, 7, 8)] %>%
            dplyr::arrange(.data$id)
        
        repeats$upGR$targets <-
            repeats$upGR$targets %>%
            dplyr::filter(.data$id %in% unique(matchingRepeats$id)) %>%
            dplyr::arrange(.data$id)
        
        
        repeats$downGR$repeats <-
            matchingRepeats[, c(10, 11, 12, 13, 14, 15, 16, 17)] %>%
            stats::setNames(gsub(".1", "", names(.))) %>%
            dplyr::arrange(.data$id)
        
        
        repeats$downGR$targets <-
            repeats$downGR$targets %>%
            dplyr::filter(.data$id %in% unique(matchingRepeats$id)) %>%
            dplyr::arrange(.data$id)
        
    } else {
        repeats[[1]]$repeats <-
            data.frame(matrix(nrow = 0, ncol = 8))
        colnames(repeats[[1]]$repeats) <-
            .getRepeatsColNames()
        
        repeats[[2]]$repeats <-
            data.frame(matrix(nrow = 0, ncol = 8))
        colnames(repeats[[2]]$repeats) <-
            .getRepeatsColNames()
    }
    
    return(repeats)
}

# Select the needed column and rename repeats data frame
.renameRepeats <- function(repeats) {
    repeats <- repeats %>%
        dplyr::select(
            .data$id,
            .data$name,
            .data$seqnames.1,
            .data$start.1,
            .data$end.1,
            .data$width.1,
            .data$strand.1,
            .data$score
        ) %>%
        dplyr::rename(
            chrom = .data$seqnames.1,
            start = .data$start.1,
            end = .data$end.1,
            width = .data$width.1,
            strand = .data$strand.1,
            score = .data$score
        )
    
    repeats <- repeats[!duplicated(repeats),]
    return(repeats)
}


.findOverlappingRepeats <- function(rm, targetsToAnalyze) {
    # Make GR object for the upstream region of the circRNAs
    genRanges <- GenomicRanges::makeGRangesFromDataFrame(
        targetsToAnalyze,
        keep.extra.columns = TRUE,
        ignore.strand = FALSE,
        seqinfo = NULL,
        seqnames.field = c("chrom"),
        start.field = c("startGR"),
        end.field = c("endGR"),
        strand.field = "strand",
        starts.in.df.are.0based = FALSE
    )
    
    # Find Overlaps
    overlaps <-
        suppressWarnings(GenomicRanges::findOverlaps(rm, genRanges, ignore.strand =
                                                         TRUE))
    if (length(overlaps) == 0) {
        # No genomic ranges in common
        repeats <- data.frame(matrix(nrow = 0, ncol = 8))
        colnames(repeats) <- .getRepeatsColNames()
        
        targets <- targetsToAnalyze[NULL, ]
        
    } else{
        repeats <- data.frame(genRanges[S4Vectors::subjectHits(overlaps)],
                              rm[S4Vectors::queryHits(overlaps)])
        repeats <- .renameRepeats(repeats)
        
        # Keep only targets where a hit is found
        targets <-
            repeats[S4Vectors::subjectHits(overlaps), ] %>%
            dplyr::filter(!duplicated(.))
    }
    
    overlaps <- vector("list", 2)
    names(overlaps)[1] <- "repeats"
    names(overlaps)[2] <- "targets"
    
    overlaps$repeats <- repeats
    overlaps$targets <- targets
    return(overlaps)
}


# If the function you are looking for is not here check supportFunction.R
# Functions in supportFunction.R are used by multiple functions.

Try the circRNAprofiler package in your browser

Any scripts or data that you put into this service are public.

circRNAprofiler documentation built on March 6, 2021, 2 a.m.