R/annotation-data-helper.R

Defines functions parseTxdb annotateIntronRanges annotateReducedExonRanges getPromoterMetadata getIntronIdPerPromoter getIntronTable annotateUniqueIntronRanges annotateAllIntronRanges getIntronRanks getUniqueIntronRanges getIntronRangesByTx getReducedExonRanges getFirstExonRanges getExonRangesByTx getTssRanges getTranscriptRanges

# Get the transcript ranges with metadata and transcript lengths
#' @importFrom GenomeInfoDb keepStandardChromosomes 'seqlevelsStyle<-'
#' @importFrom GenomicFeatures transcripts
getTranscriptRanges <- function(txdb, species = 'Homo_sapiens') {
    # Retrieve transcript ranges with detailed metadata
    transcriptRanges <- transcripts(txdb, columns = 
                            c('TXID', 'TXSTART', 'TXEND', 'GENEID', 'TXNAME'))
    names(transcriptRanges) <- transcriptRanges$TXNAME
    transcriptRanges <- 
        tryCatch(keepStandardChromosomes(x = transcriptRanges, 
                                        species = species, 
                                        pruning.mode = "tidy"), 
                error = function(cond) keepStandardChromosomes(transcriptRanges,
                                        pruning.mode = "tidy"))
    GenomeInfoDb::seqlevelsStyle(transcriptRanges) <- 'UCSC'
    return(transcriptRanges)
}

# Get transcript start site coordinates
#' @importFrom GenomeInfoDb keepStandardChromosomes 'seqlevelsStyle<-'
#' @importFrom GenomicFeatures promoters
getTssRanges <- function(transcriptRanges) {
    # Annotate transcription start sites (TSSs) with ids
    tssCoordinates <- promoters(transcriptRanges, upstream = 0, downstream = 1)
    seqlevelsStyle(tssCoordinates) <- 'UCSC'
    tssCoordinates.unique <- unique(tssCoordinates)
    tssCoordinates.unique$tssId <- paste0('tss.',
                                        seq_len(length(tssCoordinates.unique)))
    tssCoordinates.overlap <- findOverlaps(tssCoordinates, 
                                            tssCoordinates.unique, 
                                            type = 'equal')
    tssCoordinates$tssId <- tssCoordinates.unique$tssId[
                                            subjectHits(tssCoordinates.overlap)]
    return(tssCoordinates)
}

# Get exon ranges for each transcript
#' @importFrom GenomeInfoDb keepStandardChromosomes
#' @importFrom GenomeInfoDb 'seqlevelsStyle<-'
#' @importFrom GenomicFeatures exonsBy
getExonRangesByTx <- function(txdb, species = 'Homo_sapiens') {
    exonRangesByTx <- exonsBy(txdb, by = 'tx', use.names = TRUE)
    exonRangesByTx <- 
        tryCatch(keepStandardChromosomes(x = exonRangesByTx, 
                                        species = species, 
                                        pruning.mode = "tidy"), 
                error = function(cond) keepStandardChromosomes(exonRangesByTx,
                                        pruning.mode = "tidy"))
    seqlevelsStyle(exonRangesByTx) <- 'UCSC'
    return(exonRangesByTx)
}

# Get the first exon of each transcript
getFirstExonRanges <- function(exonRangesByTx.unlist) {
    exonRanges.firstExon <- exonRangesByTx.unlist[
                                        exonRangesByTx.unlist$exon_rank == 1]
    exonRanges.firstExon$tx_name <- names(exonRanges.firstExon)
    exonRanges.firstExon$customId <- seq_len(length(exonRanges.firstExon))
    return(exonRanges.firstExon)
}

# Reduce all first exons for each gene to identify transcripts 
# belonging to each promoter
#' @importFrom rlang .data
#' @importFrom dplyr as_tibble '%>%' arrange group_by mutate lead summarise 
#'   ungroup select n 
#' @importFrom S4Vectors mcols 'mcols<-'
#' @importFrom GenomicRanges makeGRangesFromDataFrame
getReducedExonRanges <- function(exonRanges.firstExon, 
                                exonRanges.firstExon.geneId) {
    exonRanges.firstExon$geneId <- exonRanges.firstExon.geneId
    exonReducedRanges <- as_tibble(exonRanges.firstExon) %>% 
        dplyr::arrange(.data$geneId, .data$start) %>% 
        dplyr::group_by(.data$geneId) %>%
        dplyr::mutate(exonClass = 
                c(0, cumsum(lead(.data$start) > cummax(.data$end))[-n()])) %>%
        dplyr::group_by(.data$geneId, .data$exonClass, 
                        .data$seqnames, .data$strand) %>%
        dplyr::summarise(start = min(.data$start), 
                            end = max(.data$end), 
                            customId = list(.data$customId),
                            .groups = 'drop') %>%
        dplyr::ungroup() %>%
        dplyr::select(.data$seqnames, .data$start, 
                        .data$end, .data$strand, .data$customId)
    exonReducedRanges$promoterId <- seq_len(nrow(exonReducedRanges))
    exonReducedRanges <- makeGRangesFromDataFrame(exonReducedRanges, 
                                                    keep.extra.columns = TRUE)
    names(mcols(exonReducedRanges)) <- c('revmap', 'promoterId')
    return(exonReducedRanges)
}

# Get intron ranges for each promoter
#' @importFrom GenomeInfoDb keepStandardChromosomes 'seqlevelsStyle<-'
#' @importFrom GenomicFeatures intronsByTranscript
getIntronRangesByTx <- function(txdb, transcriptRanges, 
                                species = 'Homo_sapiens') {
    # Retrieve intron ranges for each promoter
    intronRangesByTx <- intronsByTranscript(txdb, use.names = TRUE)
    intronRangesByTx <- 
        tryCatch(keepStandardChromosomes(x = intronRangesByTx, 
                                        species = species, 
                                        pruning.mode = "tidy"), 
                error = function(cond) keepStandardChromosomes(intronRangesByTx,
                                        pruning.mode = "tidy"))
    seqlevelsStyle(intronRangesByTx) <- 'UCSC'
    intronRangesByTx <- intronRangesByTx[names(transcriptRanges)]
    return(intronRangesByTx)
}

# Get unique intron ranges with custom ids
getUniqueIntronRanges <- function(intronRangesByTx.unlist) {
    # Annotate unique intron ranges  with unique ids
    intronRanges.unique <- unique(intronRangesByTx.unlist)
    intronRanges.unique <- sort(intronRanges.unique)
    intronRanges.unique$INTRONID <- seq_len(length(intronRanges.unique))
    names(intronRanges.unique) <- NULL
    return(intronRanges.unique)
}

# Get the rank of each intron within the transcript (similar to exon ranges)
#' @importFrom rlang .data
#' @importFrom dplyr as_tibble mutate group_by '%>%' row_number
getIntronRanks <- function(intronRangesByTx) {
    intronRankByTx <- as_tibble(intronRangesByTx) %>% 
        dplyr::group_by(.data$group_name) %>% 
        dplyr::mutate(rank = ifelse(.data$strand == '+', 
                                    row_number(), rev(row_number())))
    intronRank <- intronRankByTx$rank
    return(intronRank)
}

# Annotate all the intron ranges with the metadata
annotateAllIntronRanges <- function(intronRankByTx, intronRangesByTx.unlist, 
                                    transcriptRanges, promoterIdMapping) {
    # Annotate all intron ranges with corresponding ids
    intronRangesByTx.unlist$INTRONRANK <- intronRankByTx
    intronRangesByTx.unlist$TXID <- transcriptRanges$TXID[match(
                    names(intronRangesByTx.unlist), transcriptRanges$TXNAME)]
    intronRangesByTx.unlist$TXSTART <- transcriptRanges$TXSTART[match(
                    names(intronRangesByTx.unlist), transcriptRanges$TXNAME)]
    intronRangesByTx.unlist$TXEND <- transcriptRanges$TXEND[match(
                    names(intronRangesByTx.unlist), transcriptRanges$TXNAME)]
    intronRangesByTx.unlist$GENEID <- as.character(transcriptRanges$GENEID)[
                    match(names(intronRangesByTx.unlist), 
                    transcriptRanges$TXNAME)]
    intronRangesByTx.unlist$TXNAME <- transcriptRanges$TXNAME[match(
                    names(intronRangesByTx.unlist), transcriptRanges$TXNAME)]
    intronRangesByTx.unlist$TxWidth <- transcriptRanges$TxWidth[match(
                    names(intronRangesByTx.unlist), transcriptRanges$TXNAME)]
    intronRangesByTx.unlist$promoterId <- promoterIdMapping$promoterId[match(
                    intronRangesByTx.unlist$TXID, 
                    promoterIdMapping$transcriptId)]
    names(intronRangesByTx.unlist) <- NULL
    return(intronRangesByTx.unlist)
}

# Annotate unique intron ranges with metadata
annotateUniqueIntronRanges <- function(intronRanges.unique, 
                                        intronRangesByTx.unlist) {
    # Annotate unique intron ranges with corresponding ids
    intronRanges.unique$INTRONRANK <- as(split(
                                            intronRangesByTx.unlist$INTRONRANK, 
                                            intronRangesByTx.unlist$INTRONID), 
                                            'IntegerList')
    intronRanges.unique$TXID <- as(split(intronRangesByTx.unlist$TXID, 
                                            intronRangesByTx.unlist$INTRONID), 
                                            'IntegerList')
    intronRanges.unique$TXSTART <- as(split(intronRangesByTx.unlist$TXSTART, 
                                            intronRangesByTx.unlist$INTRONID), 
                                            'IntegerList') 
    intronRanges.unique$TXEND <- as(split(intronRangesByTx.unlist$TXEND, 
                                            intronRangesByTx.unlist$INTRONID), 
                                            'IntegerList')
    intronRanges.unique$GENEID <- as(split(intronRangesByTx.unlist$GENEID, 
                                            intronRangesByTx.unlist$INTRONID), 
                                            'CharacterList')
    intronRanges.unique$TXNAME <- as(split(intronRangesByTx.unlist$TXNAME, 
                                            intronRangesByTx.unlist$INTRONID), 
                                            'CharacterList')
    intronRanges.unique$TxWidth <- as(split(intronRangesByTx.unlist$TxWidth, 
                                            intronRangesByTx.unlist$INTRONID), 
                                            'IntegerList')
    intronRanges.unique$promoterId <- as(split(
                                            intronRangesByTx.unlist$promoterId, 
                                            intronRangesByTx.unlist$INTRONID), 
                                            'CharacterList')
    return(intronRanges.unique)
}

# Prepare metadata for each unique intron range considering all of its uses 
# across transcripts
#' @importFrom rlang .data
#' @importFrom dplyr as_tibble mutate group_by '%>%' filter distinct inner_join
#' @importFrom S4Vectors mcols
getIntronTable <- function(intronRanges.unique, intronRangesByTx.unlist) {
    intronTable <- as_tibble(as.data.frame(intronRanges.unique)) 
    # Prepare metadata for each intron
    intronRangesByTx.metadata <- as_tibble(as.data.frame(
                                            mcols(intronRangesByTx.unlist)))
    intronRangesByTx.metadata <- mutate(
                group_by(intronRangesByTx.metadata, .data$TXNAME), 
                IntronEndRank = max(.data$INTRONRANK) - .data$INTRONRANK + 1)
    intronRangesByTx.metadata <- mutate(
                group_by(intronRangesByTx.metadata, .data$INTRONID),
                MinIntronRank = min(.data$INTRONRANK))
    intronRangesByTx.metadata <- mutate(
                group_by(intronRangesByTx.metadata, .data$INTRONID), 
                MaxIntronRank = max(.data$INTRONRANK))

    intronRangesByTx.metadata.unique <- group_by(intronRangesByTx.metadata, 
                                                    .data$INTRONID) %>%
        filter(.data$INTRONRANK == .data$MinIntronRank) %>%
        mutate(TxWidthMax = max(.data$TxWidth)) %>%
        dplyr::select(.data$INTRONID, .data$MinIntronRank, 
                        .data$MaxIntronRank, .data$TxWidthMax) %>%
        dplyr::rename(MinIntronRankTxWidthMax = .data$TxWidthMax) %>%
        distinct()

    intronTable <- inner_join(intronTable, 
                            intronRangesByTx.metadata.unique, by = 'INTRONID')
    return(intronTable)
}

# Get the ids of introns contributing to the transcription for each promoter
getIntronIdPerPromoter <- function(intronRangesByTx.unlist, promoterIdMapping) {
    intronRanges.firstIntron <- intronRangesByTx.unlist[
                                intronRangesByTx.unlist$INTRONRANK == 1]
    intronIdByPromoter.firstIntron <- split(intronRanges.firstIntron$INTRONID, 
                                factor(intronRanges.firstIntron$promoterId, 
                                        levels = seq_len(length(
                                        unique(promoterIdMapping$promoterId)))))
    intronIdByPromoter.firstIntron <- lapply(intronIdByPromoter.firstIntron, 
                                                unique)
    return(intronIdByPromoter.firstIntron)
}

# Get the metadata for each promoter regarding the use of introns
#' @importFrom rlang .data
#' @importFrom dplyr group_by mutate filter distinct n
getPromoterMetadata <- function(intronTable.firstIntron) {
    ## Extract per promoter information from intronTable
    promoterMetadata <- group_by(intronTable.firstIntron, .data$promoterId) %>%
        mutate(MinMergedIntronRank = min(.data$MinIntronRank),
            MaxMergedIntronRank = max(.data$MaxIntronRank),
            MaxMergedIntronTxWidth = max(.data$MinIntronRankTxWidthMax)) %>%
        filter(.data$MinIntronRank == min(.data$MinIntronRank)) %>%
        dplyr::select(.data$promoterId, .data$MinMergedIntronRank, 
                .data$MaxMergedIntronRank, .data$MaxMergedIntronTxWidth) %>%
        distinct()
    return(promoterMetadata)
}

# Annotate the reduced exon ranges for each promoter with the intron metadata
#' @importFrom IRanges IRanges 
#' @importFrom S4Vectors mcols<- mcols
annotateReducedExonRanges <- function(exonReducedRanges, promoterMetadata, 
                                        intronIdByPromoter.firstIntron) {
    intronReducedTable <- cbind(as.data.frame(exonReducedRanges), 
            as.data.frame(promoterMetadata[match(exonReducedRanges$promoterId, 
                                            promoterMetadata$promoterId), -1]))
    exonReducedRanges <- GRanges(intronReducedTable$seqnames,
                            ranges = IRanges(start = intronReducedTable$start, 
                                            intronReducedTable$end),
                                            strand = intronReducedTable$strand)
    exonReducedRanges$intronId <- as(intronIdByPromoter.firstIntron, 
                                            'IntegerList')
    mcols(exonReducedRanges) <- cbind(mcols(exonReducedRanges), 
            intronReducedTable[, c('promoterId', 'MinMergedIntronRank', 
                            'MaxMergedIntronRank', 'MaxMergedIntronTxWidth')])
    return(exonReducedRanges)
}

# Annotate all unique introns with the promoter metadata
#' @importFrom methods as
annotateIntronRanges <- function(intronRanges.unique, intronTable) {
    # Annotate unique intron ranges
    intronRanges.annotated <- intronRanges.unique
    intronRanges.annotated$MinIntronRank <- intronTable$MinIntronRank
    intronRanges.annotated$MaxIntronRank <- intronTable$MaxIntronRank
    intronRanges.annotated$MinIntronRankTxWidthMax <- 
        intronTable$MinIntronRankTxWidthMax
    return(intronRanges.annotated)
}

# Helper to parse input file
#' @importFrom AnnotationDbi loadDb
#' @importFrom GenomicFeatures makeTxDbFromGFF
parseTxdb <- function(txdb, file, species) {
    
    if (missing(txdb) & missing(file)) {
        stop('Either `txdb` or `file` argument must be provided')
    } else if (!missing(txdb) & !missing(file)) {
        stop('Both `txdb` and `file` arguments are provided: 
                Please specify only one')
    } else if (missing(txdb)) {
        ## Parse input file
        checkFile <- file.exists(file)
        if (!checkFile) {
            stop(paste0('Error: Please specify a valid file path'))
        }
        ext <- tools::file_ext(file)
        if (ext %in% c('gz', 'bz2', 'xz')){
            file.tmp <- gsub(paste0('\\.', ext), '', file)
            ext <- unique(tools::file_ext(file.tmp))
        }
        if (!(ext %in% c('gtf', 'sqlite', 'gff3'))) {
            stop('File path must link to a GTF/GFF or TxDb object')
        }
        message('Parsing input file...')
        if (ext == 'sqlite') {
            txdb <- AnnotationDbi::loadDb(file)
        } else if (ext %in% c('gtf', 'gff3')) {
            txdb <- GenomicFeatures::makeTxDbFromGFF(file = file,
                                            format = ext,
                                            organism = sub('_', ' ', species))
        }
    }
    return(txdb)
}
GoekeLab/proActiv documentation built on Jan. 30, 2022, 3:52 a.m.