R/readAnnotate.R

Defines functions heatTargetAnnotation .nearest.2bed distance2NearestFeature annotatGrWithGeneParts checkBedValidity bed12ToIntrons bed12ToExons

Documented in annotatGrWithGeneParts bed12ToExons bed12ToIntrons checkBedValidity distance2NearestFeature heatTargetAnnotation

# PART THAT DEALS with annotation of genomic features

##############################################################################
# SECTION 1:
# reading annotation to GRanges
# makes GRanges object from a given bed6 or bed12 file to granges object
##############################################################################

#######################################
# SECTION 1: S3 functions
#######################################

# ---------------------------------------------------------------------------- #

#' bed12ToExons function
#' 
#' extracts exons from a bed12 file and puts them into GRanges object
#'
#' @param ref data.frame object
#' @keywords internal
bed12ToExons<-function(ref){

    ref=unique(ref)

    # apply strsplit on columns 11 and 12 to extract exon start positions and exon sizes
    b.start.size = cbind(as.integer(unlist(strsplit(as.character(ref$V12),"," ))),
                       as.integer(unlist(strsplit(as.character(ref$V11),"," ))))
    rep.ref = ref[rep(1:nrow(ref),ref[,10]),] # replicate rows occurs as many as its exon number


    exon.id = unlist( mapply( function(x,y)
                              if(x=="+"){return(1:y)}
                              else{return(y:1)}, 
                              ref[,6],ref[,10]))
    rep.ref$V5=exon.id

    rep.ref$V3 = rep.ref$V2+b.start.size[,1]+b.start.size[,2] # calculate exon start and ends
    rep.ref$V2 = rep.ref$V2+b.start.size[,1]

    strands = as.character(rep.ref$V6)
    strands[strands=="."]="*"
    grange.exons = GenomicRanges::GRanges(seqnames=as.character(rep.ref$V1),
    ranges = IRanges(start=rep.ref$V2+1, end=rep.ref$V3),
    strand = strands, 
    score=rep.ref$V5,
    name=rep.ref$V4)
    return(grange.exons)
} 

# ---------------------------------------------------------------------------- #
#' bed12ToIntrons function
#' 
#' extracts introns from a bed12 file and puts them into GRanges object
#'
#' @param ref data.frame object
#' @keywords internal
bed12ToIntrons<-function(ref){

    #remove the genes with one exon only (they won't have any introns)
    ref=ref[ref[,10]>1,]
    ids=paste(ref[,1],ref[,2],ref[,3],ref[,4],sep="")
    ref=cbind(ref,id=ids)
    ref=unique(ref)

    # apply strsplit on columns 11 and 12 to extract exon start positions and exon sizes
    b.start.size=cbind(as.integer(unlist(strsplit(as.character(ref$V12),"," ))),
                     as.integer(unlist(strsplit(as.character(ref$V11),"," ))))

    # replicate rows occurs as many as its exon number
    rep.ref = ref[rep(1:nrow(ref),ref[,10]),] 

    exon.id = unlist( mapply( function(x,y) 
                                if(x=="+"){return(1:y)}
                                else{return(y:1)} ,
                                ref[,6],ref[,10]  ) )
    rep.ref$V5 = exon.id

    # calculate exon start and ends
    rep.ref$V3 = rep.ref$V2+b.start.size[,1]+b.start.size[,2] 
    rep.ref$V2 = rep.ref$V2+b.start.size[,1]
    rep.ref = rep.ref[,c(1:6,13)]

    # now try to get the exons by cbinding consecutive exons
    temp.ref        = cbind(rep.ref[1:(nrow(rep.ref)-1),],rep.ref[2:nrow(rep.ref),])
    temp.ref        = temp.ref[temp.ref[,7]==temp.ref[,14],]
    temp.ref[,2]    = temp.ref[,3]
    temp.ref[,3]    = temp.ref[,9]
    rep.ref            = temp.ref[,1:6]

    # subtract 1 from - strand exon numbers
    rep.ref[rep.ref[,6]=="-",5]=rep.ref[rep.ref[,6]=="-",5]-1 

    strands=as.character(rep.ref$V6)
    strands[strands=="."]="*"
    grange.exons=GenomicRanges::GRanges(seqnames=as.character(rep.ref$V1),
    ranges=IRanges(start=rep.ref$V2+1, end=rep.ref$V3),
    strand=strands, score=rep.ref$V5,name=rep.ref$V4)
    return(grange.exons)
} 

# ---------------------------------------------------------------------------- #
#' checkBedValidity function
#' 
#' checks the validity of the bed data.frame if it is a legitimate bed columns
#'
#' @param bed.df data.frame object
#' @param type type
#' @keywords internal
checkBedValidity<-function(bed.df,type="none"){

    # does it have at least 3 columns
    num.col=(ncol(bed.df)>=3)

    # does it have 1st and 2nd column as numbers
    col1.2= (is.integer(bed.df[,2]) & is.integer(bed.df[,3]) )

    # does it have chr string in 1st column
    #chr= sum(grepl("chr",bed.df[,1]) )

    if(type=="exons" | type=="introns") {
        #does it have 12>= columns
        ex=(ncol(bed.df)>=12) 

        return(num.col & col1.2 & ex)
    }
    return(num.col & col1.2)
}




#######################################
# SECTION 1: S4 functions
#######################################


# ---------------------------------------------------------------------------- #
#' convert a data frame read-in from a bed file to a GRanges object
#'  
#' @param bed  a data.frame where column order and content 
#'             resembles a bed file with 12 columns
#' @usage convertBedDf(bed)
#' @return \code{\link{GRanges}} object
#'
#' @note one bed track per file is only accepted, the bed 
#'       files with multiple tracks will cause en error
#' @note bed files are expected not to have header lines
#'
#' @export
#' @docType methods
#' @rdname convertBedDf-methods
setGeneric("convertBedDf",function(bed) standardGeneric("convertBedDf"))

#' @aliases convertBedDf,data.frame-method
#' @rdname convertBedDf-methods
setMethod("convertBedDf" ,
          signature(bed = "data.frame" ),
          function(bed){

            if(! checkBedValidity(bed))
                stop("this is not a valid bed file")
                
            if(ncol(bed)>=6){
            
                # check if 6th column is really strand
                if( sum( unique(bed[,6]) %in% c("+","-",".") ) != length(unique(bed[,6])) )
                    stop("Strand column of the bed file or data.frame is wrong")
                        
                #convert to granges
                strands=as.character(bed$V6)
                strands[strands=="."]="*"
                grange=GRanges(seqnames=as.character(bed$V1),
                               ranges=IRanges(start=bed$V2+1, end=bed$V3),
                               strand=strands, 
                               score=bed$V5,
                               name=bed$V4)
            }

            if(ncol(bed)==5){
                grange = GRanges(seqnames=bed$V1,
                         ranges=IRanges(start=bed$V2+1, end=bed$V3),
                         strand=rep("*",nrow(bed)), score=bed$V5,name=bed$V4 )
            }
            if(ncol(bed)==4){
                grange = GRanges(seqnames=bed$V1,
                         ranges=IRanges(start=bed$V2+1, end=bed$V3),
                         strand=rep("*",nrow(bed)),name=bed$V4)
            }    
            if(ncol(bed)==3){
                grange = GRanges(seqnames=bed$V1,
                         ranges=IRanges(start=bed$V2+1, end=bed$V3),
                         strand=rep("*",nrow(bed)))
            }                           
            return(grange)
})

# ---------------------------------------------------------------------------- #
#' convert a data frame read-in from a bed file to a GRanges object for exons
#'  
#' @param bed.df a data.frame where column order and content resembles a bed file with 12 columns
#' @usage convertBed2Exons(bed.df)
#' @return \code{\link{GRanges}} object
#'
#' @note one bed track per file is only accepted, the bed files with multiple tracks will cause en error
#'
#'
#' @examples
#' file = system.file('extdata/chr21.refseq.hg19.bed', package='genomation')
#' bed12 = read.table(file)
#' exons = convertBed2Exons(bed12)
#' head(exons)
#' 
#' @export
#' @docType methods
#' @rdname convertBed2Exons-methods
setGeneric("convertBed2Exons",
                function(bed.df) standardGeneric("convertBed2Exons"))

#' @aliases convertBed2Exons,data.frame-method
#' @rdname convertBed2Exons-methods
setMethod("convertBed2Exons" ,
            signature(bed.df = "data.frame" ),
            function(bed.df){

            if(! checkBedValidity(bed.df,"exon"))
                stop("this is not a valid bed file")
                
            bed12ToExons(bed.df)
})


# ---------------------------------------------------------------------------- #
#' convert a data frame read-in from a bed file to a GRanges object for introns
#'  
#' @param bed.df  a data.frame where column order and content 
#'                resembles a bed file with 12 columns
#' @usage convertBed2Introns(bed.df)
#' @return \code{\link{GRanges}} object
#'
#' @note one bed track per file is only accepted, the bed files with 
#'       multiple tracks will cause en error
#'
#' @examples
#' file = system.file('extdata/chr21.refseq.hg19.bed', package='genomation')
#' bed12 = read.table(file)
#' introns = convertBed2Introns(bed12)
#' head(introns)
#'
#' @export
#' @docType methods
#' @rdname convertBed2Introns-methods
setGeneric("convertBed2Introns",
                  function(bed.df) standardGeneric("convertBed2Introns"))

#' @aliases convertBed2Introns,data.frame-method
#' @rdname convertBed2Introns-methods
setMethod("convertBed2Introns",
            signature(bed.df = "data.frame" ),
            function(bed.df){

            if(! checkBedValidity(bed.df,"exon"))
                stop("this is not a valid bed file")
                
            bed12ToIntrons(bed.df)
})



# ---------------------------------------------------------------------------- #
#' Function to get upstream and downstream adjecent regions to a genomic feature such as CpG islands
#' 
#' @param grange GRanges object for the feature
#' @param flank  number of basepairs for the flanking regions
#' @param clean  If set to TRUE, flanks overlapping with other main features 
#'               will be trimmed, and overlapping flanks will be removed. 
#'               This will remove multiple counts when other features overlap 
#'               with flanks
#'
#' @usage getFlanks(grange,flank=2000,clean=TRUE)
#' 
#' @examples
#' data(cpgi)
#' cpgi.flanks = getFlanks(cpgi)
#' head(cpgi.flanks)
#' 
#' @return GRanges object for flanking regions
#' @export
#' @docType methods
#' @rdname getFlanks-methods
setGeneric("getFlanks", 
                function(grange,flank=2000,clean=TRUE) 
                          standardGeneric("getFlanks"))

#' @aliases getFlanks,GRanges-method
#' @rdname getFlanks-methods
setMethod("getFlanks", signature(grange= "GRanges"),
            function(grange,flank=2000,clean=TRUE){

            shores = c( IRanges::flank(grange,flank),
                        IRanges::flank(grange,flank, FALSE) )
            
            # merge overlapping shores remove CpG coordinates from all shores, das ist so cool!!
            if(clean)
                shores=IRanges::reduce(IRanges::setdiff(shores, grange)) 
            
            shores
})




##############################################################################
# SECTION 2:
# annotate granges objects with annotations that read-in and converted to GRanges objects
##############################################################################

#' @rdname show-methods
#' @aliases show,AnnotationByGeneParts-method
setMethod("show", "AnnotationByGeneParts", 
            function(object) {

                message("Summary of target set annotation with genic parts");
                message("Rows in target set: ", nrow(object@members))
                message("-----------------------")
                
                message("percentage of target features overlapping with annotation:")
                print(round(object@annotation,2))
                message()
                
                message("percentage of target features overlapping with annotation:")
                message("(with promoter > exon > intron precedence):"); 
                print(round(object@precedence,2))
                message()
                
                message("percentage of annotation boundaries with feature overlap:")
                print(round(object@perc.of.OlapFeat,2))
                message()  
                
                message("summary of distances to the nearest TSS:")
                print(summary(abs(object@dist.to.TSS[,2])))
                message()
})


#' @rdname show-methods
#' @aliases show,AnnotationByFeature-method
setMethod("show", "AnnotationByFeature",
          function(object) {
            
            message("summary of target set annotation with feature annotation:")
            message("Rows in target set: ",nrow(object@members))
            message("----------------------------")
            
            message("percentage of target elements overlapping with features:")
            print(round(object@annotation,2))
            message()
            
            message("percentage of feature elements overlapping with target:")
            print(round(object@perc.of.OlapFeat,2))
            message()
})



#######################################
# SECTION 2: S3 FUNCTIONS 
# these shouldn't be exported
#######################################

# ---------------------------------------------------------------------------- #
#' annotatGrWithGeneParts function
#'
#' @param gr target object
#' @param prom promotors 
#' @param exon exons
#' @param intron introns
#' @param strand logical
#' @keywords internal
annotatGrWithGeneParts <- function(gr, prom, exon, intron, strand=FALSE){

    if( ! strand){strand(gr)="*"}
    memb = data.frame(matrix(rep(0,length(gr)*3),ncol=3) )
    colnames(memb)=c("prom","exon","intron")
    memb[countOverlaps(gr,prom) > 0,1] = 1
    memb[countOverlaps(gr,exon) > 0,2] = 1
    memb[countOverlaps(gr,intron) > 0,3] = 1

    # percentage of overlap for each region
    annotation = c( promoter     = 100*sum(memb$prom>0)/nrow(memb),
                    exon        = 100*sum(memb$exon>0)/nrow(memb),
                    intron        = 100*sum(memb$intron>0)/nrow(memb),
                    intergenic    = 100*sum(rowSums(memb)==0)/nrow(memb) )

    
    num.annotation = c( promoter    = sum(memb$prom>0),
                        exon        = sum(memb$exon>0),
                        intron        = sum(memb$intron>0),
                        intergenic    = sum(rowSums(memb)==0) )

    precedence = c( promoter  = 100*sum(memb$prom>0)/nrow(memb),
                    exon      = 100*sum(memb$exon>0 & memb$prom==0)/nrow(memb),
                    intron    = 100*sum(memb$intron>0 & memb$exon==0 & memb$prom==0)/nrow(memb),
                    intergenic= 100*sum(rowSums(memb)==0)/nrow(memb) )

    num.precedence = c( promoter  = sum(memb$prom>0),
                        exon      = sum(memb$exon>0 & memb$prom==0),
                        intron    = sum(memb$intron>0 & memb$exon==0 & memb$prom==0),
                        intergenic= sum(rowSums(memb)==0) )  

    numberOfOlapFeat = c( promoter=sum(countOverlaps(prom,gr)>0),
                          exon=sum(countOverlaps(exon,gr)>0),
                          intron=sum(countOverlaps(intron,gr)>0) )
    percOfOlapFeat =100*numberOfOlapFeat/c(length(prom),length(exon),length(intron))
    
    return(list(members = memb,
                annotation=annotation,
                precedence=precedence,
                num.annotation=num.annotation,
                num.precedence=num.precedence,
                numberOfOlapFeat=numberOfOlapFeat,percOfOlapFeat=percOfOlapFeat))
}

# ---------------------------------------------------------------------------- #
#' distance2NearestFeature function
#'
#' @param g.idh target object
#' @param tss TSSes
#' @keywords internal
distance2NearestFeature<-function(g.idh,tss){

    elementMetadata(g.idh) = data.frame(elementMetadata(g.idh),orig.row=1:length(g.idh))
    # get the row number column
    id.col = ncol( elementMetadata(g.idh))+5 

    # get the id column for tss in the merged data.frame below
    tss.id.col = ncol( elementMetadata(g.idh))+5+7 
    
    # get the id column for tss
    strnd.col=ncol( elementMetadata(g.idh))+5+7-2 

    # a merged data.frame is returned by this function
    met.tss = .nearest.2bed(g.idh, tss) 

    dist.col=ncol(met.tss)
    met.tss[met.tss$end < met.tss$start.y & met.tss$strand.y=="+", dist.col] = 
    -1* met.tss[met.tss$end<met.tss$start.y & met.tss$strand.y=="+",dist.col]

    met.tss[met.tss$end > met.tss$start.y & met.tss$strand.y=="-",dist.col] = 
    -1* met.tss[met.tss$end>met.tss$start.y & met.tss$strand.y=="-",dist.col]

    res = met.tss[order(met.tss[,id.col]),c(id.col,dist.col,tss.id.col,strnd.col)]
    names(res)=c("target.row", "dist.to.feature",  "feature.name", "feature.strand")

    return(res)
}


# ---------------------------------------------------------------------------- #
# get the nearest features between a subject and query bed file
# get grange objects in bed zormat
# g.bed is GenomicRanges object, biatch!!!
# subject is also GenomicRanges object
# I think this function can be easily simplified
.nearest.2bed<-function(g.bed,subject){
    
    chrs1 = levels(seqnames(g.bed))
    chrs2 = levels(seqnames(subject))
    chrs  = intersect(chrs1, chrs2)
    res.df=NULL
    
    for(i in 1:length(chrs)){
        
        # find the nearest for this range
        ind = nearest(ranges(g.bed[seqnames(g.bed)==chrs[i],]),
                  ranges(subject[seqnames(subject)==chrs[i],]))
        sbdf1 = IRanges::as.data.frame(g.bed[seqnames(g.bed)==chrs[i],] )
        sbdf2 = IRanges::as.data.frame(subject[seqnames(subject)==chrs[i],] )
        sbdf2 = sbdf2[ind,]
        names(sbdf2)=paste(names(sbdf2),".y",sep="")
        temp  = cbind(sbdf1,sbdf2)
        res.df = rbind(res.df,temp)
    }

    res.dist = rep(0,nrow(res.df))
    dist1 = abs(res.df$start-res.df$end.y)+1
    dist2 = abs(res.df$start.y-res.df$end)+1
    
    #total length
    totti = res.df$width + res.df$width.y 
    res.dist[pmax(dist1,dist2)>totti]=pmin(dist1,dist2)[pmax(dist1, dist2) > totti] 
    res.df = cbind(res.df,dist=res.dist)
    return(res.df)
}





#######################################
# SECTION 2: S4 FUNCTIONS 
#######################################


# ---------------------------------------------------------------------------- #
#' Annotate given  object with promoter, exon, intron and intergenic regions
#' 
#' The function annotates \code{GRangesList} or \code{GRanges} object as 
#' overlapping with promoter,exon,intron or intergenic regions.
#'
#' @param target  \code{\link{GRanges}} or \code{\link{GRangesList}}
#'                object storing chromosome locations
#'                to be annotated (e.g. chipseq peaks)
#' @param feature GRangesList object containing GRanges object for
#'                promoter, exons, introns and transcription start sites, 
#'                or simply output of readTranscriptFeatures function
#' @param strand If set to TRUE, annotation features and target features will be 
#'        overlapped based on strand  (def:FALSE)
#' @param intersect.chr boolean, whether to select only chromosomes that are 
#'        common to feature and target. FALSE by default
#' 
#' @return \code{AnnotationByGeneParts} object or a list of 
#'         \code{AnnotationByGeneParts}
#'         objects if target is a  \code{\link{GRangesList}} object.
#' 
#' @examples
#' data(cage)
#' bed.file = system.file("extdata/chr21.refseq.hg19.bed", package = "genomation")
#' gene.parts = readTranscriptFeatures(bed.file)
#' cage.annot = annotateWithGeneParts(cage, gene.parts, intersect.chr=TRUE)
#' cage.annot
#' @export
#' @docType methods
#' @rdname annotateWithGeneParts-methods
setGeneric("annotateWithGeneParts", 
                function(target,feature,strand=FALSE, intersect.chr=FALSE)
                    standardGeneric("annotateWithGeneParts") )

#' @aliases annotateWithGeneParts,GRanges,GRangesList-method
#' @rdname annotateWithGeneParts-methods
setMethod("annotateWithGeneParts", 
          signature(target = "GRanges", feature = "GRangesList"),
          function(target, feature, strand, intersect.chr){

          if(intersect.chr){
            message('intersecting chromosomes...')
            chrs = intersect(unique(as.character(seqnames(target))), 
                         unique(unlist(
                           lapply(feature, 
                                  function(x)as.character(seqnames(x))))))
        
            if(length(chrs) == 0)
              stop('target and feature do not have intersecting chromosomes')
            target=target[seqnames(target) %in% chrs]
            feature = lapply(feature, function(x)x[seqnames(x) %in% chrs])
          }  
        
            a.list = annotatGrWithGeneParts(target, 
                                            feature$promoters,
                                            feature$exons,
                                            feature$introns,
                                            strand=strand)
            dist2TSS = distance2NearestFeature(target,feature$TSSes)

            new("AnnotationByGeneParts",
                members         = as.matrix(a.list$members),
                annotation      = a.list$annotation,
                precedence        = a.list$precedence,
                num.annotation  = a.list$num.annotation,
                num.precedence    = a.list$num.precedence,
                no.of.OlapFeat  = a.list$numberOfOlapFeat,
                perc.of.OlapFeat= a.list$percOfOlapFeat,
                dist.to.TSS     = dist2TSS )
})



#' @aliases annotateWithGeneParts,GRangesList,GRangesList-method
#' @rdname annotateWithGeneParts-methods
setMethod("annotateWithGeneParts",
          signature(target = "GRangesList", feature= "GRangesList"),
          function(target, feature, strand, intersect.chr){
          
        
      l = list()
      if(is.null(names(target)))
          names(target) = 1:length(target)
      for(i in 1:length(target)){
      
        name = names(target)[i]
        message(paste('Working on:',name))
        x = target[[name]]
        l[[name]] = annotateWithGeneParts(x, feature, strand, intersect.chr)
        
      }                          
            return(l)
})




# ---------------------------------------------------------------------------- #
#' Function to annotate a given GRanges object with promoter,exon,intron & intergenic values
#'  
#' @param target    a granges object storing chromosome locations to be annotated
#' @param feature   a granges object storing chromosome locations of a feature 
#'                  (can be CpG islands, ChIP-seq peaks, etc)
#' @param flank     a granges object storing chromosome locations of 
#'                  the flanks of the feature
#' @param feature.name     string for the name of the feature
#' @param flank.name     string for the name of the flanks
#' @param strand   If set to TRUE, annotation features and target features 
#'                 will be overlapped based on strand  (def:FAULT)
#' @param intersect.chr boolean, whether to select only chromosomes that are 
#'                      common to feature and target. FALSE by default
#'                      
#'                      
#' @return returns an \code{AnnotationByFeature} object
#' 
#' 
#' @examples
#' data(cpgi)
#' data(cage)
#' cpgi.flanks = getFlanks(cpgi)
#' flank.annot = annotateWithFeatureFlank(cage, cpgi, cpgi.flanks)
#' 
#' @export
#' @docType methods
#' @rdname annotateWithFeatureFlank-methods
setGeneric("annotateWithFeatureFlank", 
                          function(target,
                                   feature,
                                   flank,
                                   feature.name=NULL,
                                   flank.name="flank",
                                   strand=FALSE, 
                                   intersect.chr=FALSE) 
                            standardGeneric("annotateWithFeatureFlank") )

#' @aliases annotateWithFeatureFlank,GRanges,GRanges,GRanges-method
#' @rdname annotateWithFeatureFlank-methods
setMethod( "annotateWithFeatureFlank", 
            signature(target = "GRanges",feature="GRanges",flank="GRanges"),
            function(target, feature, flank, feature.name, flank.name, strand, 
               intersect.chr){

              if(is.null(feature.name))
                feature.name= deparse(substitute(feature))
        
              if(intersect.chr){
                message('intersecting chromosomes...')
                chrs = intersect(unique(as.character(seqnames(target))), 
                                 unique(as.character(seqnames(feature))))
                if(length(chrs) == 0)
                  stop('target and feature do not have intersecting chromosomes')
                target  = target[seqnames(target) %in% chrs]
                feature = feature[seqnames(feature) %in% chrs]
              }
        
                if( ! strand )
                    strand(target) = "*"
                    memb=data.frame(matrix(rep(0,length(target)*2),ncol=2) )
                    colnames(memb) = c(feature.name,flank.name)
                    memb[countOverlaps(target,feature)>0,1]=1
                    memb[countOverlaps(target,flank)>0,2]=1

                    annotation=c(100*sum(memb[,1]>0)/nrow(memb),
                                 100*sum(memb[,2]>0)/nrow(memb),
                                 100*sum(rowSums(memb)==0)/nrow(memb) )
                    names(annotation)=c(feature.name, flank.name, "other")

                    num.annotation=c( sum(memb[,1]>0), 
                                      sum(memb[,2]>0), 
                                      sum(rowSums(memb)==0) )
                    names(num.annotation)=c(feature.name,flank.name,"other")                      

                    precedence=c(100*sum(memb[,1]>0)/nrow(memb) ,
                                 100*sum(memb[,2]>0 & memb[,1]==0)/nrow(memb) ,
                                 100*sum(rowSums(memb)==0)/nrow(memb) )
                    names(precedence)=c(feature.name,flank.name,"other")

                    num.precedence=c( sum(memb[,1]>0), 
                                      sum(memb[,2]>0 & memb[,1]==0), 
                                      sum(rowSums(memb)==0)  )
                    names(num.precedence)=c(feature.name,flank.name,"other")                      

                    numberOfOlapFeat = c(sum(countOverlaps(feature,target)>0),
                    sum(countOverlaps(flank,target)>0) )
                    names(numberOfOlapFeat) = c(feature.name,flank.name)
                    percOfOlapFeat = 100*numberOfOlapFeat/c(length(feature),length(flank) )

                    new("AnnotationByFeature",
                        members         = as.matrix(memb),
                        annotation      = annotation,
                        precedence        = precedence,
                        num.annotation  = num.annotation,
                        num.precedence    = num.precedence,
                        no.of.OlapFeat  = numberOfOlapFeat,
                        perc.of.OlapFeat= percOfOlapFeat)

})


# ---------------------------------------------------------------------------- #
#' Function to annotate given GRanges object with a given genomic feature
#' 
#' @param target   a GRanges object storing chromosome locations to be annotated
#' @param feature  a GRanges object storing chromosome locations of a feature 
#'                 (can be CpG islands, ChIP-seq peaks, etc)
#' @param strand   If set to TRUE, annotation features and target features will 
#'                 be overlapped based on strand  (def:FAULT)
#' @param extend   specifiying a positive value will extend the feature on both 
#'                 sides as much as \code{extend}
#' @param feature.name name of the annotation feature. 
#'                     For example: H3K4me1,CpGisland etc.
#'                     by default the name is taken from the given variable
#' @param intersect.chr boolean, whether to select only chromosomes that are 
#'                      common to feature and target. FALSE by default
#'                      
#' @return returns an \code{AnnotationByFeature} object
#' 
#' 
#' @examples
#' data(cpgi)
#' data(promoters)
#' annot = annotateWithFeature(cpgi, promoters)
#' 
#' 
#' @export
#' @docType methods
#' @rdname annotateWithFeature-methods
setGeneric("annotateWithFeature", 
                    function(target,
                             feature,
                             strand=FALSE,
                             extend=0,
                             feature.name=NULL, 
                             intersect.chr=FALSE) 
                      standardGeneric("annotateWithFeature") )

#' @aliases annotateWithFeature,GRanges,GRanges-method
#' @rdname annotateWithFeature-methods
setMethod("annotateWithFeature", 
           signature(target = "GRanges",feature="GRanges"),
           function(target, feature, strand,extend,feature.name,intersect.chr){

                if(is.null(feature.name))
          feature.name= deparse(substitute(feature))

                if(extend>0){
                  message('extending features...')
          feature = resize(feature, width=(width(feature)+2*extend),
                           fix='center')
                }
        
        # selects common chromosomes for target and feature
        if(intersect.chr){
          message('intersecting chromosomes...')
          chrs = intersect(unique(as.character(seqnames(target))), 
                           unique(as.character(seqnames(feature))))
          if(length(chrs) == 0)
            stop('target and feature do not have intersecting chromosomes')
          target=target[seqnames(target) %in% chrs]
          feature = feature[seqnames(feature) %in% chrs]
        }
        
                if( ! strand)
          strand(target)="*"
                memb=rep(0,length(target))
        
                memb[countOverlaps(target,feature) > 0] = 1

                annotation = c( 100*sum(memb >  0)/length(memb) ,
                                        100*sum(memb == 0)/length(memb) )
                
                num.annotation = c( sum( memb >  0),
                                              sum( memb == 0) )
                names(annotation) = c(feature.name, "other")

                numberOfOlapFeat = c(sum(countOverlaps(feature,target)>0))
                percOfOlapFeat = 100*numberOfOlapFeat/c(length(feature))

                new("AnnotationByFeature",
                    members         = as.matrix(memb),
                    annotation      = annotation,
                    precedence          = annotation,
                    num.annotation  = num.annotation,
                    num.precedence    = num.annotation,
                    no.of.OlapFeat  = numberOfOlapFeat,
                    perc.of.OlapFeat= percOfOlapFeat)
})



#' Annotate given ranges with genomic features
#' 
#' The function annotates a target GRangesList or GRanges object as overlapping
#' or not with 
#' the elements of a named GRangesList. This is useful to annotate your regions
#' of interest with genomic features with arbitrary categories such as repeat
#' classes or families, or output from genome segmentation alogorithms such
#' as chromHMM.
#' 
#'
#' @param target GRanges or GRangesList object storing chromosome locations to be 
#'        annotated (e.g. chipseq peaks)
#' @param features a named GRangesList object containing GRanges objects different set
#'        of features. The function calculates percent overlaps with and without
#'        precendence at the same time. The order of objects in GRangesList
#'        defines their precedence. If a range in \code{target} overlaps with
#'        a more precedent range in an element of \code{features}, the other 
#'        overlaps from other less precedent elments will be discarded. This is
#'        useful for getting piecharts where percentages should add up to 100.
#'        
#' @param strand.aware	if set to TRUE, annotation features and target features will 
#' be overlapped based on strand (def:FALSE)
#' @param intersect.chr	logical value, whether to select only chromosomes 
#'        that are common to feature and target. FALSE by default
#' @return returns an \code{AnnotationByFeature} object or if target is 
#'         a GRangesList, a list of \code{AnnotationByFeature} objects.
#'      
#' @export
#' @examples
#' library(GenomicRanges)
#' data(cage)
#' data(cpgi)
#' cage$tpm=NULL
#' gl = GRangesList(cage=cage, cpgi=cpgi)
#' 
#' bed.file = system.file("extdata/chr21.refseq.hg19.bed", package = "genomation")
#' gene.parts = readTranscriptFeatures(bed.file)
#' annot = annotateWithFeatures(gl, gene.parts, intersect.chr=TRUE)
#' 
#' @seealso 
#' see  \code{\link{getMembers}}, 
#'            \code{\link{heatTargetAnnotation}}, 
#'            \code{\link{plotTargetAnnotation}}
#' 
#' 
#' @docType methods
#' @rdname annotateWithFeatures-methods
setGeneric("annotateWithFeatures", 
           function(target,features,strand.aware=FALSE, intersect.chr=FALSE)
             standardGeneric("annotateWithFeatures") )

#' @aliases annotateWithFeatures,GRanges,GRangesList-method
#' @rdname annotateWithFeatures-methods
setMethod("annotateWithFeatures", 
          signature(target = "GRanges", features = "GRangesList"),
          function(target, features, strand.aware, intersect.chr){
            
            if(intersect.chr){
              message('intersecting chromosomes...')
              chrs = intersect(unique(as.character(seqnames(target))), 
                               unique(unlist(
                                 lapply(features, 
                                        function(x)as.character(seqnames(x))))))
              
              if(length(chrs) == 0)
                stop('target and feature do not have intersecting chromosomes')
              target=target[seqnames(target) %in% chrs]
              features = endoapply(features, function(x)x[seqnames(x) %in% chrs])
            }  
            
            
            x=unlist(features)
            
            # get overlap matrix
            overlaps <- as.matrix(GenomicRanges::findOverlaps(target  ,x,
                                                              ignore.strand= !strand.aware ))
            
            # get a datable from overlaps and the annotation category names
            dt=data.table(cbind(overlaps,annot=names(x)[overlaps[,2]]))
            
            # order dt by order of feature gategories in features GRangesList
            dt=dt[order(match(dt$annot,names(features))),]
            
            # get the number of target ranges overlapping with feature categories
            # and the number of features with target ranges
            annot=queryHits=subjectHits=NULL
            hits=dt[, list(targetHits=length(unique(queryHits )),
                           featureHits=length(unique(subjectHits ))) ,by=annot]
            
            # get the number of target ranges overlapping with feature categories
            # and the number of features with target ranges
            # with precendence
            
            p.hits=dt[!duplicated(dt$queryHits), 
                      list(targetHits=length(unique(queryHits )),
                           featureHits=length(unique(subjectHits ))) ,by=annot]
            
            
            # get a matrix of overlaps for targets
            # this matrix shows overlap status for each element in target
            dmemb=dcast(dt,queryHits~annot, 
                        fun.aggregate=function(x) ifelse(length(x)>0,1,0),
                        value.var="annot")
            
            # order dmemb by order of feature categories in features GRangesList
            col.order=c(colnames(dmemb)[1],
                        colnames(dmemb)[-1][order(match(colnames(dmemb)[-1],
                                                        names(features)))])
            dmemb=dmemb[,col.order,with=FALSE]
            
            # get an empty matrix of zeros, not all target ranges will be overlapping
            # with an annotation
            memb=matrix(0,nrow=length(target),ncol=ncol(dmemb)-1)
            
            # fill in the matrix of 0s with values from  the overlap matrix
            memb[as.numeric(dmemb$queryHits),]=as.matrix(dmemb[,2:ncol(dmemb),
                                                               with=FALSE])
            colnames(memb)=colnames(dmemb)[-1]
            
            # calculate number of overlaps etc before hand
            # populate stats for features that didn't overlap with the target
            no.of.OlapFeat=num.annotation=num.precedence=rep(0,length(features))
            
            names(no.of.OlapFeat)=names(num.precedence)=names(num.annotation)=names(features)
            
            no.of.OlapFeat[match( hits$annot,names(features))]=hits$featureHits
            num.annotation[match( hits$annot,names(features))]=hits$targetHits
            num.precedence[match( p.hits$annot,names(features))]=p.hits$targetHits
            
            # output the object
            new("AnnotationByFeature",
                members         = memb,
                annotation      = 100*num.annotation/length(target),
                precedence        = 100*num.precedence/length(target),
                num.annotation  =  num.annotation,
                num.precedence    =  num.precedence,
                no.of.OlapFeat  = no.of.OlapFeat ,
                perc.of.OlapFeat= 100*no.of.OlapFeat/sapply(features,length) 
            )
          })



#' @aliases annotateWithFeatures,GRangesList,GRangesList-method
#' @rdname annotateWithFeatures-methods
setMethod("annotateWithFeatures",
          signature(target = "GRangesList", features= "GRangesList"),
          function(target, features, strand.aware, intersect.chr){
            
            
            l = list()
            if(is.null(names(target)))
              names(target) = 1:length(target)
            for(i in 1:length(target)){
              
              name = names(target)[i]
              message(paste('Working on:',name))
              x = target[[name]]
              l[[name]] = annotateWithFeatures(x, features, strand.aware, intersect.chr)
              
            }                          
            return(l)
          })





# ACCESSOR FUNCTIONS
#AnnotationByFeature
#AnnotationByGeneParts


# ---------------------------------------------------------------------------- #
#' Get the membership slot of AnnotationByFeature
#'
#' Membership slot defines the overlap of target features with annotation features
#'  For example, if a target feature overlaps with an exon
#' 
#' @param x a \code{AnnotationByFeature}  object
#' 
#' @return matrix showing overlap of target features with annotation features. 
#'         1 for overlap, 0 for non-overlap
#' 
#' @usage getMembers(x)
#'
#' @export
#' @docType methods
#' @rdname getMembers-methods                      
setGeneric("getMembers", def=function(x) standardGeneric("getMembers"))

#' @aliases getMembers,AnnotationByFeature-method
#' @rdname getMembers-methods
setMethod("getMembers", 
            signature(x = "AnnotationByFeature"),
            function(x){
                return(x@members)
})

# ---------------------------------------------------------------------------- #
#' Get the percentage of target features overlapping with annotation from AnnotationByFeature
#'
#' This function retrieves percentage/number of target features 
#' overlapping with annotation
#'  
#' @param x a \code{AnnotationByFeature}  object
#' @param percentage TRUE|FALSE. If TRUE percentage of target 
#'                   features will be returned. If FALSE, 
#'                   number of target features will be returned
#' @param precedence TRUE|FALSE. If TRUE there will be a hierachy of annotation 
#'                   features when calculating numbers 
#'                   (with promoter>exon>intron precedence)
#'                   
#' That means if a feature overlaps with a promoter it will be counted as 
#' promoter overlapping only, or if it is overlapping with a an exon 
#' but not a promoter, #' it will be counted as exon overlapping only whether or
#'  not it overlaps with an intron.
#'
#' @usage getTargetAnnotationStats(x,percentage=TRUE,precedence=TRUE)
#' 
#' @examples
#' data(cage)
#' bed.file=system.file("extdata/chr21.refseq.hg19.bed", package = "genomation")
#' gene.parts = readTranscriptFeatures(bed.file)
#' cage.annot=annotateWithGeneParts(cage, gene.parts, intersect.chr=TRUE)
#' getTargetAnnotationStats(cage.annot)
#'
#' @return a vector of percentages or counts showing quantity of target features
#'         overlapping with annotation
#' 
#' @export
#' @docType methods
#' @rdname getTargetAnnotationStats-methods
setGeneric("getTargetAnnotationStats", 
                  function(x,percentage=TRUE,precedence=TRUE) 
                    standardGeneric("getTargetAnnotationStats"))

#' @rdname getTargetAnnotationStats-methods
#' @aliases getTargetAnnotationStats,AnnotationByFeature-method
setMethod("getTargetAnnotationStats", 
            signature(x = "AnnotationByFeature"),
            function(x, percentage ,precedence ){                      
            
            if(percentage){
                if(precedence){
                    return(round(x@precedence,2))
                }else{
                    return(round(x@annotation,2))
                }
            }else{
                if(precedence){
                    return(round(x@num.precedence,2))
                }else{
                    return(round(x@num.annotation,2))
                }
            }
})


# ---------------------------------------------------------------------------- #
#' Get the percentage/count of annotation features overlapping with target 
#' features from AnnotationByFeature
#'
#' This function retrieves percentage/number of 
#' annotation features overlapping with targets. 
#' For example, if \code{AnnotationByFeature}  object is containing 
#' statistics of differentially methylated 
#' regions overlapping with gene annotation. This function will return 
#' number/percentage of introns,exons and promoters
#' overlapping with differentially methylated regions.
#'  
#' @param x a \code{AnnotationByFeature}  object
#' @param percentage TRUE|FALSE. If TRUE percentage of annotation features will 
#'        be returned. If FALSE, number of annotation features will be returned
#'
#' @return RETURNS  a vector of percentages or counts showing quantity 
#'         of annotation features overlapping with target features
#'      
#'         
#' @examples
#' data(cage)
#' bed.file=system.file("extdata/chr21.refseq.hg19.bed", package = "genomation")
#' gene.parts = readTranscriptFeatures(bed.file)
#' cage.annot = annotateWithGeneParts(cage, gene.parts, intersect.chr=TRUE)
#' getFeatsWithTargetsStats(cage.annot)
#' 
#' @usage getFeatsWithTargetsStats(x,percentage=TRUE)
#' @export
#' @docType methods
#' @rdname getFeatsWithTargetsStats-methods
setGeneric("getFeatsWithTargetsStats", 
                  function(x,percentage=TRUE) 
                    standardGeneric("getFeatsWithTargetsStats"))


#' @rdname getFeatsWithTargetsStats-methods
#' @aliases getFeatsWithTargetsStats,AnnotationByFeature-method
setMethod("getFeatsWithTargetsStats", 
            signature(x = "AnnotationByFeature" ),
            function( x,percentage ){                      
            
                if(percentage){
                    return(round(x@perc.of.OlapFeat,2))
                }else{
                    return(round(x@no.of.OlapFeat,2))                        
                }
})


# ---------------------------------------------------------------------------- #
#' Get distance to nearest TSS and gene id from AnnotationByGeneParts
#'
#' This accessor function gets the nearest TSS, its distance to target feature,
#' strand and name of TSS/gene from AnnotationByGeneParts object
#' 
#' @param x a \code{AnnotationByGeneParts}  object
#' 
#' @return RETURNS a data.frame containing row number of the target features,
#'         distance of target to nearest TSS, TSS/Gene name, TSS strand
#' @usage getAssociationWithTSS(x)
#' @aliases getAssociationWithTSS,-methods getAssociationWithTSS,
#'          AnnotationByGeneParts-method
#' 
#' @examples
#' data(cage)
#' bed.file = system.file("extdata/chr21.refseq.hg19.bed", package = "genomation")
#' gene.parts = readTranscriptFeatures(bed.file)
#' cage.annot = annotateWithGeneParts(cage, gene.parts, intersect.chr=TRUE)        
#' head(getAssociationWithTSS(cage.annot))
#'  
#' @export
#' @docType methods
#' @rdname AnnotationByGeneParts-methods
setGeneric("getAssociationWithTSS", 
                  function(x) 
                    standardGeneric("getAssociationWithTSS"))

#' @rdname AnnotationByGeneParts-methods
#' @docType methods
#' @aliases getAssociationWithTSS,AnnotationByGeneParts-method
setMethod("getAssociationWithTSS", 
            signature(x = "AnnotationByGeneParts"),
            function(x){
                return(x@dist.to.TSS)
})



# PLOTTING FUNCTIONS

# ---------------------------------------------------------------------------- #
#' Plot annotation categories from AnnotationByGeneParts or AnnotationByFeature
#'
#' This function plots a pie or bar chart for showing percentages of targets 
#' annotated by genic parts or other query features
#' 
#' @param x a \code{AnnotationByFeature} or  
#'        \code{AnnotationByGeneParts} object
#' @param precedence TRUE|FALSE. If TRUE there will be a hierachy of annotation 
#'        features when calculating numbers 
#'        (with promoter>exon>intron precedence). 
#'        This option is only valid when x is a 
#'        \code{AnnotationByGeneParts} object
#' @param col a vector of colors for piechart or the bar plot
#' @param cex.legend a numeric value of length 1 to specify the size of the legend. By default 1.
#' @param ... graphical parameters to be passed to \code{pie} 
#'            or \code{barplot} functions
#'
#'
#' @examples
#' data(cage)
#' 
#' bed.file = system.file("extdata/chr21.refseq.hg19.bed", package = "genomation")
#' gene.parts = readTranscriptFeatures(bed.file)
#' annot = annotateWithGeneParts(cage, gene.parts, intersect.chr=TRUE)
#' \donttest{
#' plotTargetAnnotation(annot)
#' }
#'
#' @return plots a piechart or a barplot for percentage of 
#'         the target features overlapping with annotation
#' 
#' @export
#' @docType methods
#' @rdname plotTargetAnnotation-methods
setGeneric("plotTargetAnnotation", 
              function(x,
                       precedence=TRUE,
                       col=getColors(length(x@annotation)),
                       cex.legend=1,...) 
                       standardGeneric("plotTargetAnnotation"))

#' @rdname plotTargetAnnotation-methods
#' @docType methods
#' @aliases plotTargetAnnotation,AnnotationByFeature-method
setMethod("plotTargetAnnotation", 
            signature(x = "AnnotationByFeature"),
            function(x,precedence,col,cex.legend,...){
                
                props=getTargetAnnotationStats(x,precedence)

            if(precedence | ( is(x,"AnnotationByFeature") & 
                       !is(x,"AnnotationByGeneParts")) ){
                slice.names=names(props)
                names(props)=paste( paste(round(props),"%"),sep=" ")

                pie(props,cex=0.9,col=col,...)
                legend("topright",legend=slice.names,fill=col, cex=cex.legend)

            }else{

                mids = barplot(props,col=col,...)  
                text(mids, y = props,labels=paste(paste(round(props),"%",sep="")),pos=1) 
            }
})



# ---------------------------------------------------------------------------- #
#' Plots the percentage of overlapping intervals with genomic features in a heatmap
#'
#' This function plots a heatmap for percentage of overlapping ranges 
#' with provided genomic features. The input object is a list of 
#' \code{AnnotationByFeature} objects, which contains necessary information
#' about overlap statistics to make the plot.
#' 
#' @param l a \code{list} of  \code{AnnotationByFeature} objects. This 
#'        could be returned by \code{\link{annotateWithFeatures}} function.
#' @param cluster TRUE/FALSE. If TRUE the heatmap is going to be clustered 
#'        using a default hierarchical clustering scheme.
#' @param col a vector of two colors that will be used for interpolation. 
#'        The first color is the lowest one, the second is the highest one
#' @param precedence TRUE|FALSE. If TRUE the precedence of annotation 
#'        features will be used when plotting. The precedence will be taken
#'        from the GRangesList used as annotation. The first GRanges will be 
#'        treated as most important, and the second as the second most important
#'        and so on. Such that, if an interval overlaps with features on that 
#'        is part of the first GRanges object in the annotation GRangesList,
#'        the rest of its overlaps with other elements in the annotation 
#'        GRangesList will be ignored. This feature is important to have if 
#'        the user desires that percentage of overlaps adds up to 100. This is
#'        only possible when the annotation features are non-overlaping with
#'        eachother or there is a hierarchy/precedence among them such as 
#'        (with promoter>exon>intron precedence). In this case, anything that
#'        overlaps with a promoter annotation will only be counted as promoter
#'        even if it overlaps with exons or introns.
#'    
#'        
#' @param plot If FALSE, does not plot the heatmap just returns the matrix
#'        used to make the heatmap
#' 
#' 
#' @return returns the matrix used to make the heatmap when plot FALSE, 
#' otherwise returns ggplot2 object which can be modified further.
#'          
#' 
#' @examples
#' 
#' library(GenomicRanges)
#' data(cage)
#' data(cpgi)
#' cage$tpm=NULL
#' gl = GRangesList(cage=cage, cpgi=cpgi)
#' 
#' bed.file = system.file("extdata/chr21.refseq.hg19.bed", package = "genomation")
#' gene.parts = readTranscriptFeatures(bed.file)
#' annot = annotateWithFeatures(gl, gene.parts, intersect.chr=TRUE)
#' 
#'             
#' \donttest{  
#' heatTargetAnnotation(annot)
#' }
#' 
#' @seealso 
#' see  \code{\link{getMembers}}, 
#'            \code{\link{annotateWithFeatures}} 
#'          
#'            
#' @export
#' @docType methods
#' @rdname heatTargetAnnotation-methods
#' @aliases plotGeneAnnotation
#' @docType methods
heatTargetAnnotation<-function(l, cluster=FALSE, col=c("white","blue"),
                               precedence=FALSE,plot=TRUE){
  
  if(!all(unlist(lapply(l, class)) == "AnnotationByFeature"))
    stop("All elements of the input list need to be AnnotationByFeature-class")
  if(precedence){
    d = data.frame(do.call(rbind, lapply(l, function(x)x@precedence)))
  }else{
    d = data.frame(do.call(rbind, lapply(l, function(x)x@annotation)))
  }
  
  if(!plot){
    return(as.matrix(d))    
  }
  d = data.frame('SampleName'=rownames(d), d)
  
  ind = 1:nrow(d)
  if(cluster == TRUE){
    h = hclust(dist(d[,-1]))
    ind = h$order
  }
  
  m = reshape2::melt(d, id.vars='SampleName')
  colnames(m)[2] = 'Feature'

  Feature=SampleName=value=NULL
  p = ggplot(m, aes(x=Feature, y=SampleName, fill=value)) +
    theme(
      axis.title.x = element_text(colour='white'), 
      axis.text.x  = element_text(colour='black', face='bold'), 
      axis.text.y  = element_text(colour='black'), 
      axis.title.y = element_text(colour='white', face='bold')) 
  p = p + geom_tile(aes(fill = value)) +
    geom_text(aes(fill = m$value, label = round(m$value, 1))) +
    scale_fill_gradient(low =col[1], high = col[2]) +
    theme(axis.text.x = element_text(angle = 90, hjust = 1)) 
  p
}

Try the genomation package in your browser

Any scripts or data that you put into this service are public.

genomation documentation built on Nov. 8, 2020, 5:21 p.m.