R/auxillary-functions-annotation.R

Defines functions decorateWithExons decorateWithGenes getAnnotationDataFrame getCDSVector getCDSFromAnnotation getTranscriptsVector getTranscriptsFromAnnotation getExonsVector getExonsFromAnnotation getGenesVector getGenesFromAnnotation

Documented in decorateWithExons decorateWithGenes getAnnotationDataFrame getCDSFromAnnotation getCDSVector getExonsFromAnnotation getExonsVector getGenesFromAnnotation getGenesVector getTranscriptsFromAnnotation getTranscriptsVector

#'@include ASEset-class.R
NULL

#' AnnotationDb wrappers
#' 
#' These functions acts as wrappers to retrieve information from annotation
#' database objects (\code{annotationDb objects}) or (\code{transcriptDb
#' objects})
#' 
#' These functions retrieve regional annotation from OrgDb or TxDb objects,
#' when given GRanges objects.
#' 
#' @name annotation-wrappers
#' @rdname annotation-wrappers
#' @aliases getGenesFromAnnotation getGenesVector getExonsFromAnnotation
#' getTranscriptsFromAnnotation getCDSFromAnnotation getExonsVector
#' getTranscriptsVector getCDSVector getAnnotationDataFrame
#' @param OrgDb An \code{OrgDb} object
#' @param GR A \code{GenomicRanges} object with sample area
#' @param leftFlank An \code{integer} specifying number of additional
#' nucleotides around the SNPs for the leftFlank
#' @param rightFlank An \code{integer} specifying number of additional
#' nucleotides around the SNPs for the rightFlank
#' @param getUCSC A \code{logical} indicating if UCSC transcript IDs should
#' also be retrieved
#' @param TxDb A \code{transcriptDb} object
#' @param strand Two options,'+' or '-'
#' @param annotationType select one or more from 'gene', 'exon', 'transcript',
#' 'cds'.
#' @param verbose A \code{logical} making the functions more talkative
#' @return %OrgDb The \code{getGenesFromAnnotation} function will return a
#' \code{GRanges object} with ranges over the genes in the region.
#' 
#' The \code{getGenesVector} function will return a character vector where each
#' element are gene symbols separated by comma
#' 
#' %transcriptDb The \code{getExonsFromAnnotation} function will return a
#' \code{GRanges object} with ranges over the exons in the region.
#' 
#' The \code{getTranscriptsFromAnnotation} function will return a \code{GRanges
#' object} with ranges over the transcripts in the region.
#' 
#' The \code{getCDSFromAnnotation} function will return a \code{GRanges object}
#' with ranges over the CDSFs in the region.
#' 
#' The \code{getExonsVector} function will return a character vector where each
#' element are exons separated by comma
#' 
#' The \code{getTranscriptsVector} function will return a character vector
#' where each element are transcripts separated by comma
#' 
#' The \code{getCDSVector} function will return a character vector where each
#' element are CDSs separated by comma
#' 
#' The \code{getAnnotationDataFrame} function will return a data.frame with
#' annotations. This function is used internally by i.e. the barplot-function
#' @author Jesper R. Gadin, Lasse Folkersen
#' @keywords genes exons transcripts CDS annotation
#' @examples
#' 
#' 
#'   data(ASEset)
#'   require(org.Hs.eg.db)
#'   require(TxDb.Hsapiens.UCSC.hg19.knownGene)
#'   OrgDb <- org.Hs.eg.db
#'   TxDb <- TxDb.Hsapiens.UCSC.hg19.knownGene
#' 
#'   #use for example BcfFiles as the source for SNPs of interest
#'   GR <- rowRanges(ASEset)
#'   #get annotation
#'   g <- getGenesFromAnnotation(OrgDb,GR)
#'   e <- getExonsFromAnnotation(TxDb,GR)
#'   t <- getTranscriptsFromAnnotation(TxDb,GR)
#'   c <- getCDSFromAnnotation(TxDb,GR)
#'   
#' @export getGenesFromAnnotation
#' @export getExonsFromAnnotation
#' @export getTranscriptsFromAnnotation
#' @export getCDSFromAnnotation
#' @export getGenesVector
#' @export getExonsVector
#' @export getTranscriptsVector
#' @export getCDSVector
#' @export getAnnotationDataFrame
NULL

#' @rdname annotation-wrappers
getGenesFromAnnotation <- function(OrgDb, GR, leftFlank = 0, rightFlank = 0, getUCSC = FALSE, 
    verbose = FALSE) {
    # checks
    if (class(OrgDb)[1] != "OrgDb") 
        stop(paste("OrgDb must of class OrgDb, not", class(OrgDb)[1]))
    
    if (class(GR)[1] != "GRanges") 
        stop(paste("GR must of class GRanges, not", class(GR)[1]))
    
    if (!class(leftFlank)[1] %in% c("numeric")) 
        stop(paste("leftFlank must be of class numeric, not:", class(leftFlank)[1]))
    if (length(leftFlank) != 1) 
        stop(paste("leftFlank must be of length 1, not:", length(leftFlank)))
    if (leftFlank < 0) 
        stop(paste("leftFlank must be equal to or larger than 0"))
    
    if (!class(rightFlank)[1] %in% c("numeric")) 
        stop(paste("rightFlank must be of class numeric, not:", class(rightFlank)[1]))
    if (length(rightFlank) != 1) 
        stop(paste("rightFlank must be of length 1, not:", length(rightFlank)))
    if (rightFlank < 0) 
        stop(paste("rightFlank must be equal to or larger than 0"))
    
    if (!class(getUCSC)[1] %in% c("logical")) 
        stop(paste("getUCSC must be of class logical, not:", class(getUCSC)[1]))
    if (length(getUCSC) != 1) 
        stop(paste("getUCSC must be of length 1, not:", length(getUCSC)))
    
    if (!"UCSCKG" %in% columns(OrgDb)) {
        if (verbose) 
            message("Unable to retrieve UCSCKG column from OrgDb. Omitting")
        getUCSC <- FALSE
    }
    
    if (!class(verbose)[1] %in% c("logical")) 
        stop(paste("verbose must be of class logical, not:", class(verbose)[1]))
    if (length(verbose) != 1) 
        stop(paste("verbose must be of length 1, not:", length(verbose)))
    
    # remove chr in seqnames
    seqLevels <- sub("^chr", "", seqlevels(GR))
    seqlevels(GR) <- seqLevels
    
    # pre-filtering to local region +/- 1MB (for speed purposes)
    startFilter <- max(c(1, start(range(GR)) - 10^6))
    endFilter <- end(range(GR)) + 10^6
    colsFilter <- c("CHR", "CHRLOC", "CHRLOCEND", "SYMBOL")
    sFilter <- suppressWarnings(select(OrgDb, keys = seqLevels, columns = colsFilter, 
        keytype = "CHR"))
    symbolsToGet <- sFilter[abs(sFilter[, "CHRLOC"]) > startFilter & abs(sFilter[, 
        "CHRLOCEND"]) < endFilter & !is.na(sFilter[, "CHRLOCEND"]) & !is.na(sFilter[, 
        "CHRLOC"]), "SYMBOL"]
    
    
    # then create an annGR for genes in range
    if (getUCSC) {
        cols <- c("SYMBOL", "CHR", "CHRLOC", "CHRLOCEND", "ENSEMBL", "UCSCKG")
    } else {
        cols <- c("SYMBOL", "CHR", "CHRLOC", "CHRLOCEND", "ENSEMBL")
    }
    s <- suppressWarnings(select(OrgDb, keys = symbolsToGet, columns = cols, keytype = "SYMBOL"))
    
    
    # remove Symbols with NAs
    TFminusStrand <- s[["CHRLOC"]] < 0
    TFplusStrand <- s[["CHRLOC"]] > 0
    sNoNas <- s[c(which(TFminusStrand), which(TFplusStrand)), ]
    
    # make Strand vector
    TFminusStrand2 <- sNoNas[["CHRLOC"]] < 0
    strand <- rep("+", length = (dim(sNoNas)[1]))
    
    strand[TFminusStrand2] <- "-"
    
    # make start and end vector
    sNonNegative <- sNoNas
    sNonNegative[TFminusStrand2, c("CHRLOC", "CHRLOCEND")] <- -sNonNegative[TFminusStrand2, 
        c("CHRLOC", "CHRLOCEND")]
    start <- sNonNegative[["CHRLOC"]]
    end <- sNonNegative[["CHRLOCEND"]]
    
    # make seqnames
    seqnames <- sNonNegative[["CHR"]]
    
    # make the annGR containing all genes
    if (getUCSC) {
        annGR <- GRanges(seqnames = Rle(seqnames), ranges = IRanges(start, end), 
            strand = Rle(strand), Symbol = sNonNegative[["SYMBOL"]], Ensembl = sNonNegative[["ENSEMBL"]], 
            UCSCKG = sNonNegative[["UCSCKG"]])
    } else {
        annGR <- GRanges(seqnames = Rle(seqnames), ranges = IRanges(start, end), 
            strand = Rle(strand), Symbol = sNonNegative[["SYMBOL"]], Ensembl = sNonNegative[["ENSEMBL"]])
        
    }
    
    # check that all levels in GR exist in annGR, if not exclude these levels
    if (sum(!levels(seqnames(GR)) %in% levels(seqnames(annGR))) > 0) {
        TFkeepLevels <- levels(seqnames(GR)) %in% levels(seqnames(annGR))
        seqlevels(GR) <- seqlevels(GR)[TFkeepLevels]
        
    }
    
    # check that all levels in annGR exist in GR, if not exclude these levels
    if (sum(!levels(seqnames(annGR)) %in% levels(seqnames(GR))) > 0) {
        TFkeepLevels <- levels(seqnames(annGR)) %in% levels(seqnames(GR))
        seqlevels(annGR, pruning.mode="coarse") <- seqlevels(annGR)[TFkeepLevels]
    }
    
    # the seqlevels comes in different orders. This will give the correct order.
    seqlevels(GR) <- seqlevels(annGR)
    
    # find overlaps between annGR and Snps incl. flank region
    GenesInRegion <- subsetByOverlaps(annGR, GR)  #put in flankSize here when you have time ;)
    seqlengths(GenesInRegion) <- seqlengths(GR)
    GenesInRegion
}

#' @rdname annotation-wrappers
getGenesVector <- function(OrgDb, GR, leftFlank = 0, rightFlank = 0, verbose = FALSE) {
    if (verbose) {
        cat("start gene extraction\n")
    }
    GenesInRegion <- getGenesFromAnnotation(OrgDb, GR, leftFlank = leftFlank, rightFlank = rightFlank, 
        verbose = FALSE)
    
    seqlevels(GR) <- seqlevels(GenesInRegion)
    
    # remove duplicate symbol names if same symbol merge regions
    symbolList <- unique(mcols(GenesInRegion)[["Symbol"]])
    newGenesInRegion <- GRanges()
    
    # check if GenesInRegion is zero
    if (!length(GenesInRegion) == 0) {
        for (i in 1:length(symbolList)) {
            symbol <- symbolList[i]
            TF <- mcols(GenesInRegion)[["Symbol"]] == symbol
            
            G <- GRanges(seqnames = unique(seqnames(GenesInRegion[TF])), ranges = IRanges(min(start(GenesInRegion[TF])), 
                max(end(GenesInRegion[TF]))), strand = unique(strand(GenesInRegion[TF])))
            mcols(G) <- unique(mcols(GenesInRegion[TF])[, "Symbol", drop = FALSE])
            
            newGenesInRegion <- c(newGenesInRegion, G)
        }
    }
    
    # half-vectorized solution
    h <- findOverlaps(newGenesInRegion, GR)
    symbolVec <- vector()
    for (i in 1:length(GR)) {
        symbolVec[i] <- paste(mcols(newGenesInRegion[queryHits(h)[subjectHits(h) == 
            i]])[["Symbol"]], collapse = ",")
    }
    # set NAs where appropriate
    symbolVec[symbolVec == ""] <- NA
    # return list with symbols
    symbolVec
    
    # return list with symbols
    symbolVec
}

#' @rdname annotation-wrappers
getExonsFromAnnotation <- function(TxDb, GR, leftFlank = 0, rightFlank = 0, verbose = FALSE) {
    
    # checks
    if (class(TxDb)[1] != "TxDb") 
        stop(paste("GR must of class TxDb, not", class(TxDb)[1]))
    
    if (class(GR)[1] != "GRanges") 
        stop(paste("GR must of class GRanges, not", class(GR)[1]))
    
    if (!class(leftFlank)[1] %in% c("numeric")) 
        stop(paste("leftFlank must be of class numeric, not:", class(leftFlank)[1]))
    if (length(leftFlank) != 1) 
        stop(paste("leftFlank must be of length 1, not:", length(leftFlank)))
    if (leftFlank < 0) 
        stop(paste("leftFlank must be equal to or larger than 0"))
    
    if (!class(rightFlank)[1] %in% c("numeric")) 
        stop(paste("rightFlank must be of class numeric, not:", class(rightFlank)[1]))
    if (length(rightFlank) != 1) 
        stop(paste("rightFlank must be of length 1, not:", length(rightFlank)))
    if (rightFlank < 0) 
        stop(paste("rightFlank must be equal to or larger than 0"))
    
    if (!class(verbose)[1] %in% c("logical")) 
        stop(paste("verbose must be of class logical, not:", class(verbose)[1]))
    if (length(verbose) != 1) 
        stop(paste("verbose must be of length 1, not:", length(verbose)))
    
    # remove chr in seqnames for GR
    seqLevels <- sub("^chr", "", seqlevels(GR))
    seqlevels(GR) <- seqLevels
    
    seqlevels(TxDb, pruning.mode="coarse") <- paste("chr", names(seqlengths(GR)), sep = "")
    
    # Get all exons from the active chromosomes By creating a GRanges from TxDb
    annGR <- exons(TxDb, columns = c("exon_id", "tx_name"))
    
    # remove chr in seqnames for annGR
    seqLevels <- sub("^chr", "", seqlevels(annGR))
    seqlevels(annGR) <- seqLevels
    
    # check that all levels in GR exist in annGR, if not exclude these levels
    if (sum(!levels(seqnames(GR)) %in% levels(seqnames(annGR))) > 0) {
        TFkeepLevels <- levels(seqnames(GR)) %in% levels(seqnames(annGR))
        seqlevels(GR) <- seqlevels(GR)[TFkeepLevels]
        
    }
    
    # check that all levels in annGR exist in GR, if not exclude these levels
    if (sum(!levels(seqnames(annGR)) %in% levels(seqnames(GR))) > 0) {
        TFkeepLevels <- levels(seqnames(annGR)) %in% levels(seqnames(GR))
        seqlevels(annGR, pruning.mode="coarse") <- seqlevels(annGR)[TFkeepLevels]
    }
    
    # the seqlevels comes in different orders. This will give the correct order.
    seqlevels(GR) <- seqlevels(annGR)
    
    # add flanking regions
    lf <- flank(GR, leftFlank, start = TRUE)
    rf <- flank(GR, rightFlank, start = FALSE)
    GR <- c(lf, GR, rf)
    start <- max(c(1, min(start(GR))))
    end <- max(end(GR))
    
    # speed-increasing coarse subset to +/- 1MB before extracting (because smaller
    # annGR is faster and we have to access several times)
    annGR <- annGR[start(ranges(annGR)) > max(c(1, start - 10^6)) & end(ranges(annGR)) < 
        (end + 10^6)]
    
    # extract names of all transcripts with exons in plotting window
    tx_names <- unique(unlist(as.list(mcols(annGR[start(ranges(annGR)) > start & 
        end(ranges(annGR)) < end])[["tx_name"]])))
    
    # retrieve all transcript annotation, including potential off-plot tails and
    # heads
    ExonsInRegion <- annGR[sapply(mcols(annGR)[["tx_name"]], function(x) {
        any(tx_names %in% x)
    })]
    
    seqlengths(ExonsInRegion) <- seqlengths(GR)
    ExonsInRegion
    
}

#' @rdname annotation-wrappers
getExonsVector <- function(TxDb, GR, leftFlank = 0, rightFlank = 0, verbose = FALSE) {
    if (verbose) {
        cat("start exon extraction\n")
    }
    
    ExonsInRegion <- getExonsFromAnnotation(TxDb, GR, leftFlank, rightFlank, verbose = verbose)
    
    seqlevels(GR) <- seqlevels(ExonsInRegion)
    
    # half-vectorized solution
    h <- findOverlaps(ExonsInRegion, GR)
    ExonVec <- vector()
    for (i in 1:length(GR)) {
        ExonVec[i] <- paste(mcols(ExonsInRegion[queryHits(h)[subjectHits(h) == i]])[["exon_id"]], 
            collapse = ",")
    }
    # set NAs where appropriate
    ExonVec[ExonVec == ""] <- NA
    # return list with symbols
    ExonVec
    
}

#' @rdname annotation-wrappers
getTranscriptsFromAnnotation <- function(TxDb, GR, leftFlank = 0, rightFlank = 0, 
    verbose = FALSE) {
    
    # checks
    if (class(TxDb)[1] != "TxDb") 
        stop(paste("GR must of class TxDb, not", class(TxDb)[1]))
    
    if (class(GR)[1] != "GRanges") 
        stop(paste("GR must of class GRanges, not", class(GR)[1]))
    
    if (!class(leftFlank)[1] %in% c("numeric")) 
        stop(paste("leftFlank must be of class numeric, not:", class(leftFlank)[1]))
    if (length(leftFlank) != 1) 
        stop(paste("leftFlank must be of length 1, not:", length(leftFlank)))
    if (leftFlank < 0) 
        stop(paste("leftFlank must be equal to or larger than 0"))
    
    if (!class(rightFlank)[1] %in% c("numeric")) 
        stop(paste("rightFlank must be of class numeric, not:", class(rightFlank)[1]))
    if (length(rightFlank) != 1) 
        stop(paste("rightFlank must be of length 1, not:", length(rightFlank)))
    if (rightFlank < 0) 
        stop(paste("rightFlank must be equal to or larger than 0"))
    
    if (!class(verbose)[1] %in% c("logical")) 
        stop(paste("verbose must be of class logical, not:", class(verbose)[1]))
    if (length(verbose) != 1) 
        stop(paste("verbose must be of length 1, not:", length(verbose)))
    
    
    
    # remove chr in seqnames for GR
    seqLevels <- sub("^chr", "", seqlevels(GR))
    seqlevels(GR) <- seqLevels
    
    seqlevels(TxDb, pruning.mode="coarse") <- paste("chr", names(seqlengths(GR)), sep = "")
    
    
    # Get all exons from the active chromosomes By creating a GRanges from TxDb
    annGR <- transcripts(TxDb)
    
    
    # remove chr in seqnames for annGR
    seqLevels <- sub("^chr", "", seqlevels(annGR))
    seqlevels(annGR) <- seqLevels
    
    # check that all levels in GR exist in annGR, if not exclude these levels
    if (sum(!levels(seqnames(GR)) %in% levels(seqnames(annGR))) > 0) {
        TFkeepLevels <- levels(seqnames(GR)) %in% levels(seqnames(annGR))
        seqlevels(GR) <- seqlevels(GR)[TFkeepLevels]
        
    }
    
    # check that all levels in annGR exist in GR, if not exclude these levels
    if (sum(!levels(seqnames(annGR)) %in% levels(seqnames(GR))) > 0) {
        TFkeepLevels <- levels(seqnames(annGR)) %in% levels(seqnames(GR))
        seqlevels(annGR, pruning.mode="coarse") <- seqlevels(annGR)[TFkeepLevels]
    }
    
    # the seqlevels comes in different orders. This will give the correct order.
    seqlevels(GR) <- seqlevels(annGR)
    
    # add flanking regions
    lf <- flank(GR, leftFlank, start = TRUE)
    rf <- flank(GR, rightFlank, start = FALSE)
    GR <- c(lf, GR, rf)
    # find overlaps between annGR and Snps incl. flank region
    TxInRegion <- subsetByOverlaps(annGR, GR)  #put in flankSize here when you have time ;)
    seqlengths(TxInRegion) <- seqlengths(GR)
    TxInRegion
}

#' @rdname annotation-wrappers
getTranscriptsVector <- function(TxDb, GR, leftFlank = 0, rightFlank = 0, verbose = FALSE) {
    if (verbose) {
        cat("start transcript extraction\n")
    }
    
    TxInRegion <- getTranscriptsFromAnnotation(TxDb, GR, leftFlank, rightFlank)
    
    seqlevels(GR) <- seqlevels(TxInRegion)
    
    # half-vectorized solution
    h <- findOverlaps(TxInRegion, GR)
    TxVec <- vector()
    for (i in 1:length(GR)) {
        TxVec[i] <- paste(mcols(TxInRegion[queryHits(h)[subjectHits(h) == i]])[["tx_id"]], 
            collapse = ",")
    }
    # set NAs where appropriate
    TxVec[TxVec == ""] <- NA
    # return list with symbols
    TxVec
    
}

#' @rdname annotation-wrappers
getCDSFromAnnotation <- function(TxDb, GR, leftFlank = 0, rightFlank = 0, verbose = FALSE) {
    # CDS are the coding regions that do not only code for proteins, but other also
    # other types like RNA.
    
    # checks
    if (class(TxDb)[1] != "TxDb") 
        stop(paste("GR must of class TxDb, not", class(TxDb)[1]))
    
    if (class(GR)[1] != "GRanges") 
        stop(paste("GR must of class GRanges, not", class(GR)[1]))
    
    if (!class(leftFlank)[1] %in% c("numeric")) 
        stop(paste("leftFlank must be of class numeric, not:", class(leftFlank)[1]))
    if (length(leftFlank) != 1) 
        stop(paste("leftFlank must be of length 1, not:", length(leftFlank)))
    if (leftFlank < 0) 
        stop(paste("leftFlank must be equal to or larger than 0"))
    
    if (!class(rightFlank)[1] %in% c("numeric")) 
        stop(paste("rightFlank must be of class numeric, not:", class(rightFlank)[1]))
    if (length(rightFlank) != 1) 
        stop(paste("rightFlank must be of length 1, not:", length(rightFlank)))
    if (rightFlank < 0) 
        stop(paste("rightFlank must be equal to or larger than 0"))
    
    if (!class(verbose)[1] %in% c("logical")) 
        stop(paste("verbose must be of class logical, not:", class(verbose)[1]))
    if (length(verbose) != 1) 
        stop(paste("verbose must be of length 1, not:", length(verbose)))
    
    # remove chr in seqnames for GR
    seqLevels <- sub("^chr", "", seqlevels(GR))
    seqlevels(GR) <- seqLevels
    
    seqlevels(TxDb, pruning.mode="coarse") <- paste("chr", names(seqlengths(GR)), sep = "")
    
    
    # Get all exons from the active chromosomes By creating a GRanges from TxDb
    annGR <- cds(TxDb)
    
    
    # remove chr in seqnames for annGR
    seqLevels <- sub("^chr", "", seqlevels(annGR))
    seqlevels(annGR) <- seqLevels
    
    # check that all levels in GR exist in annGR, if not exclude these levels
    if (sum(!levels(seqnames(GR)) %in% levels(seqnames(annGR))) > 0) {
        TFkeepLevels <- levels(seqnames(GR)) %in% levels(seqnames(annGR))
        seqlevels(GR) <- seqlevels(GR)[TFkeepLevels]
        
    }
    
    # check that all levels in annGR exist in GR, if not exclude these levels
    if (sum(!levels(seqnames(annGR)) %in% levels(seqnames(GR))) > 0) {
        TFkeepLevels <- levels(seqnames(annGR)) %in% levels(seqnames(GR))
        seqlevels(annGR, pruning.mode="coarse") <- seqlevels(annGR)[TFkeepLevels]
    }
    
    # the seqlevels comes in different orders. This will give the correct order.
    seqlevels(GR) <- seqlevels(annGR)
    
    # add flanking regions
    lf <- flank(GR, leftFlank, start = TRUE)
    rf <- flank(GR, rightFlank, start = FALSE)
    GR <- c(lf, GR, rf)
    # find overlaps between annGR and Snps incl. flank region
    CDSInRegion <- subsetByOverlaps(annGR, GR)  #put in flankSize here when you have time ;)
    seqlengths(CDSInRegion) <- seqlengths(GR)
    CDSInRegion
    
}



#' @rdname annotation-wrappers
getCDSVector <- function(TxDb, GR, leftFlank = 0, rightFlank = 0, verbose = FALSE) {
    if (verbose) {
        cat("start CDS extraction\n")
    }
    
    CDSInRegion <- getCDSFromAnnotation(TxDb, GR, leftFlank, rightFlank)
    
    seqlevels(GR) <- seqlevels(CDSInRegion)
    
    # half-vectorized solution
    h <- findOverlaps(CDSInRegion, GR)
    CDSVec <- vector()
    for (i in 1:length(GR)) {
        CDSVec[i] <- paste(mcols(CDSInRegion[queryHits(h)[subjectHits(h) == i]])[["cds_id"]], 
            collapse = ",")
    }
    # set NAs where appropriate
    CDSVec[CDSVec == ""] <- NA
    # return list with symbols
    CDSVec
}

#' @rdname annotation-wrappers
getAnnotationDataFrame <- function(GR, strand = "+", annotationType = NULL, OrgDb = NULL, 
    TxDb = NULL, verbose = FALSE) {
    # main checks
    if (sum(!(annotationType %in% c("gene", "exon", "cds", "transcript"))) > 0) {
        stop("annotationType must be one or more of these arguments 'gene','exon','cds','transcript'")
    }
    
    if (is.null(OrgDb) & is.null(TxDb)) {
        stop("at least one of parameters OrgDb or TxDb must be used")
    }
    
    # nr of columns for return df
    ncol <- 0
    if (!is.null(OrgDb)) {
        ncol <- ncol + 1
    }
    if (!is.null(TxDb)) {
        ncol <- ncol + (length(annotationType) - 1)
    }
    
    # return dataframe
    df <- data.frame(row.names = 1:length(GR))
    
    # set strand
    strand(GR) <- strand
    
    # extract annotation
    if (!is.null(OrgDb)) {
        if ("gene" %in% annotationType) {
            gene <- getGenesVector(OrgDb = OrgDb, GR = GR, verbose = verbose)
            df[["symbol"]] <- gene
        }
        if (is.null(annotationType)) {
            gene <- getGenesVector(OrgDb = OrgDb, GR = GR, verbose = verbose)
            df[["symbol"]] <- gene
        }
    }
    if (!is.null(TxDb)) {
        if ("exon" %in% annotationType) {
            df[["exon_id"]] <- getExonsVector(TxDb = TxDb, GR = GR, verbose = verbose)
        }
        if ("transcript" %in% annotationType) {
            df[["tx_id"]] <- getTranscriptsVector(TxDb = TxDb, GR = GR, verbose = verbose)
        }
        if ("cds" %in% annotationType) {
            df[["cds_id"]] <- getCDSVector(TxDb = TxDb, GR = GR, verbose = verbose)
        }
        
        if (is.null(annotationType)) {
            df[["exon_id"]] <- getExonsVector(TxDb = TxDb, GR = GR, verbose = verbose)
            df[["tx_id"]] <- getTranscriptsVector(TxDb = TxDb, GR = GR, verbose = verbose)
            df[["cds_id"]] <- getCDSVector(TxDb = TxDb, GR = GR, verbose = verbose)
        }
    }
    df
}

#' decorateWithGenes
#' 
#' Internal function that can draw gene regions on pre-specified surfaces.
#' Necessary for the genomic-location plots.
#' 
#' The main intention of this function is to be used when plotting several bar
#' plots in the same window. This function add gene regions under the bars.
#' 
#' @param x \code{ASEset} object
#' @param genesInRegion \code{GRanges} object with gene regions. Can be
#' obtained using \code{getGenesFromAnnotation}
#' @param xlim xlim values for the pre-specified surface
#' @param ylim ylim values for the pre-specified surface
#' @param chromosome character
#' @return \code{decorateWithGenes} returns nothing, but draws genes
#' @author Jesper R. Gadin, Lasse Folkersen
#' @seealso \itemize{ \item The \code{\link{locationplot}} which is uses this
#' function internally.  \item The \code{\link{decorateWithExons}} which is
#' another similar function that \code{\link{locationplot}} uses internally.  }
#' @keywords internal
#' @examples
#' 
#'   data(ASEset)
#' 
#' @export decorateWithGenes
decorateWithGenes <- function(x, genesInRegion, xlim, ylim, chromosome) {
    
    # check the input variables
    if (class(xlim)[1] != "integer") 
        xlim <- as.numeric(xlim)
    if (class(xlim)[1] != "numeric") 
        stop(paste("xlim must be of class numeric, not", class(xlim)[1]))
    if (length(xlim) != 2) 
        stop(paste("xlim must be of length 2, not", length(xlim)))
    if (class(ylim)[1] != "integer") 
        ylim <- as.numeric(ylim)
    if (class(ylim)[1] != "numeric") 
        stop(paste("ylim must be of class numeric, not", class(ylim)[1]))
    if (length(ylim) != 2) 
        stop(paste("ylim must be of length 2, not", length(ylim)))
    if (class(chromosome)[1] != "character") 
        stop(paste("chromosome must be of class character, not", class(chromosome)[1]))
    if (length(chromosome) != 1) 
        stop(paste("chromosome must be of length 1, not", length(chromosome)))
    if (!chromosome %in% unique(seqnames(genesInRegion))) {
        if (sub("^chr", "", chromosome) %in% unique(seqnames(genesInRegion))) {
            chromosome <- sub("^chr", "", chromosome)
        } else {
            stop(paste("chromosome", chromosome, "was not found amongst the seqnames of the genesInRegion object:", 
                paste(sort(unique(seqnames(genesInRegion))), collapse = ", ")))
        }
    }
    
    
    # only work with genes on current chromosome
    genesInRegion <- genesInRegion[seqnames(genesInRegion) == chromosome]
    
    # calculate how many 'rows' have to be made available
    maxCoverage <- max(coverage(genesInRegion)[[chromosome]])
    
    # loop over all unique genes, drawing them as specified
    uniqueGenes <- unique(mcols(genesInRegion)[["Symbol"]])
    
    for (i in 1:length(uniqueGenes)) {
        # getting the name of the gene and the height on the Y-axis
        genesymbol <- uniqueGenes[i]
        yPos <- ylim[1] + (i - 1)%%maxCoverage * ((ylim[2] - ylim[1])/maxCoverage)
        
        # this block checks for double instances and just arbitrarily take the first
        # (typically miRNAs with two locations)
        if (sum(mcols(genesInRegion)[["Symbol"]] %in% genesymbol) > 1) {
            geneData <- genesInRegion[which(mcols(genesInRegion)[["Symbol"]] %in% 
                genesymbol)[1], ]
        } else {
            geneData <- genesInRegion[which(mcols(genesInRegion)[["Symbol"]] %in% 
                genesymbol), ]
        }
        
        # draw and label
        start <- max(c(xlim[1] - (xlim[2] - xlim[1])/10, start(geneData)))
        end <- min(c(xlim[2] + (xlim[2] - xlim[1])/10, end(geneData)))
        lines(x = c(start, end), y = c(yPos, yPos), lwd = 2)
        text(x = start + (end - start)/2, y = yPos + (ylim[2] - ylim[1])/6, label = genesymbol, 
            cex = 0.8)
    }
}



#' decorateWithExons
#' 
#' Internal function that can draw gene regions on pre-specified surfaces.
#' Necessary for the genomic-location plots.
#' 
#' The main intention of this function is to be used when plotting several bar
#' plots in the same window. This function add gene regions under the bars.
#' 
#' @param x \code{ASEset} object
#' @param exonsInRegion \code{GRanges} object with generegions. Can be obtained
#' using \code{getExonsFromAnnotation}. Must contain a column 'tx_name'
#' @param xlim xlim values for the pre-specified surface
#' @param ylim ylim values for the pre-specified surface
#' @param chromosome character
#' @return \code{decorateWithExons} returns nothing, but draws genes
#' @author Jesper R. Gadin, Lasse Folkersen
#' @seealso \itemize{ \item The \code{\link{locationplot}} which is uses this
#' function internally.  \item The \code{\link{decorateWithGenes}} which is
#' another similar function that \code{\link{locationplot}} uses internally.  }
#' @keywords internal
#' @examples
#' 
#'   data(ASEset)
#' 
#' @export decorateWithExons
decorateWithExons <- function(x, exonsInRegion, xlim, ylim, chromosome) {
    
    # check the input variables
    if (class(exonsInRegion)[1] != "GRanges") 
        stop(paste("exonsInRegion must be of class GRanges, not", class(exonsInRegion)[1]))
    if (!"tx_name" %in% colnames(mcols(exonsInRegion))) 
        stop("exonsInRegion must contain an mcol variable named 'tx_name'")
    if (class(xlim)[1] != "integer") 
        xlim <- as.numeric(xlim)
    if (class(xlim)[1] != "numeric") 
        stop(paste("xlim must be of class numeric, not", class(xlim)[1]))
    if (length(xlim) != 2) 
        stop(paste("xlim must be of length 2, not", length(xlim)))
    if (class(ylim)[1] != "integer") 
        ylim <- as.numeric(ylim)
    if (class(ylim)[1] != "numeric") 
        stop(paste("ylim must be of class numeric, not", class(ylim)[1]))
    if (length(ylim) != 2) 
        stop(paste("ylim must be of length 2, not", length(ylim)))
    if (class(chromosome)[1] != "character") 
        stop(paste("chromosome must be of class character, not", class(chromosome)[1]))
    if (length(chromosome) != 1) 
        stop(paste("chromosome must be of length 1, not", length(chromosome)))
    if (!chromosome %in% unique(seqnames(exonsInRegion))) {
        if (sub("^chr", "", chromosome) %in% unique(seqnames(exonsInRegion))) {
            chromosome <- sub("^chr", "", chromosome)
        } else {
            stop(paste("chromosome", chromosome, "was not found amongst the seqnames of the exonInRegion object:", 
                paste(sort(unique(seqnames(exonsInRegion))), collapse = ", ")))
        }
    }
    # only work with exons on current chromosome
    exonsInRegion <- exonsInRegion[seqnames(exonsInRegion) == chromosome]
    
    
    # calculate how many 'rows' have to be made available (corresponding to the
    # number of unique transcripts in exonsInRegion
    uniqueGenes <- unique(unlist(as.list(mcols(exonsInRegion)[["tx_name"]])))
    maxCoverage <- length(uniqueGenes)
    
    
    for (i in 1:length(uniqueGenes)) {
        # getting the name of the gene and the height on the Y-axis
        tx_name <- uniqueGenes[i]
        yPos <- ylim[1] + (i - 1)%%maxCoverage * ((ylim[2] - ylim[1])/maxCoverage)
        
        # extracting and drawing all exons for each transcript
        exonsInTranscript <- which(sapply(as.list(mcols(exonsInRegion)[["tx_name"]]), 
            function(x) {
                tx_name %in% x
            }))
        for (exonInTranscript in exonsInTranscript) {
            x1 <- start(exonsInRegion[exonInTranscript])
            x2 <- end(exonsInRegion[exonInTranscript])
            lines(x = c(x1, x2), y = c(yPos, yPos), lwd = 2)
        }
        
        
        # draw a thin connecting line for each transcript
        end <- max(end(exonsInRegion[exonsInTranscript]))
        start <- min(start(exonsInRegion[exonsInTranscript]))
        lines(x = c(start, end), y = c(yPos, yPos), lwd = 0.5)
        
        # label with the tx_name
        start <- max(c(start, xlim[1]))
        end <- min(c(end, xlim[2]))
        text(x = start + (end - start)/2, y = yPos + (ylim[2] - ylim[1])/6, label = tx_name, 
            cex = 0.8)
        
    }
} 
pappewaio/AllelicImbalance documentation built on April 11, 2020, 2:58 a.m.