R/rank_rbm.R

Defines functions rank_rbm

Documented in rank_rbm

#' rank_rbm() 
#'   This function subsets the provided methylRaw objects by the specified
#'   GRange and returns a list of dataframes containing the GRange regions
#'   ranked by methylation site density.
#'
#' @param mrobjscd A methylKit methylRaw or methylRawList object for scd sites
#' @param mrobjhsm A methylKit methylRaw or methylRawList object for hsm sites
#' @param region.gr A GRanges object (typically based on the relevant genome
#'   annotation)
#' @param rlabel A string to identify the type of region analyzed
#' @param minrwidth Minimal region width for region to be included in plot
#' @param maxrwidth Maximal region width for region to be included in plot
#' @param minbrsites Minimal number of methylation sites in region for region
#'   to be included in plot
#' @param withglink Either NCBIgene or "" to indicate inclusion of a gene link
#'   column in the output if available
#' @param outflabel A string to identify the output dataframe 
#' 
#' @return A list of lists of dataframes containing (for each sample) the GRange
#'   regions ranked by overall methylation percentage (over all scd sites) or
#'   methylation hsm site density
#' 
#' @importFrom GenomicRanges findOverlaps
#' @importFrom S4Vectors subjectHits queryHits
#' @importFrom dplyr %>% group_by summarise
#' @importFrom ggplot2 ggplot aes geom_col
#'
#' @examples
#'   mydatf <- system.file("extdata","Am.dat",package="BWASPR")
#'   myparf <- system.file("extdata","Am.par",package="BWASPR")
#'   myfiles <- setup_BWASPR(datafile=mydatf,parfile=myparf)
#'   AmHEscd <- mcalls2mkobj(myfiles$datafiles,species="Am",study="HE",
#'                           type="CpGscd",mincov=1,assembly="Amel-4.5")
#'   AmHEhsm <- mcalls2mkobj(myfiles$datafiles,species="Am",study="HE",
#'                           type="CpGhsm",mincov=1,assembly="Amel-4.5")
#'   genome_ann <- get_genome_annotation(myfiles$parameters)
#'   rgL <- rank_rbm(AmHEscd,AmHEhsm,region.gr=genome_ann$gene,rlabel="genes",
#'                   withglink="NCBIgene",outflabel="Am_HE")
#'
#' @export

rank_rbm <- function(mrobjscd,mrobjhsm,region.gr,rlabel="",minrwidth=1,
                     maxrwidth=1000000,minnbrsites=1,withglink="",
                     outflabel="") {
    message('... rank_rbm ...')
    # read basic information:
    #
    sample_list <- getSampleID(mrobjscd)

    # get regional counts of methylation calls:
    #
    rcL <- regionCounts(mrobjscd,region.gr)

    # determinine methylation (percent of calls) in specified regions ...
    #
    message('   ... subset individual samples ...')
    #
    MpRegion <- lapply(1:length(getSampleID(mrobjscd)), function(i) {
        rcd           <- getData(rcL[[i]])
        rcd           <- cbind(rcd,round(100.*rcd[,6]/rcd[,5],2))
        names(rcd)[8] <- "prcntM"
        rcg   <- makeGRangesFromDataFrame(rcd,keep.extra.columns=TRUE)
        match <- findOverlaps(rcg,region.gr,type="equal")
        r     <- as.data.frame(rcg[queryHits(match)])
        a     <- as.data.frame(region.gr[subjectHits(match)])
        colnames(a) <- lapply(colnames(a), function(i) paste('region',i,sep='_'))
        ra    <- cbind(r,a)
        c     <- c("region_ID","seqnames","start","end","width","strand",
                   "coverage","numCs","numTs","prcntM")
        ra    <- ra[c]
        wtoutfile <- paste("ranked",rlabel,"byPrcntM",outflabel,sep="-")
        wtoutfile <- paste(wtoutfile,sample_list[i],sep="_")
        wtoutfile <- paste(wtoutfile,"txt",sep=".")
        write.table(ra[order(- ra$prcntM),], wtoutfile, sep='\t', row.names=FALSE, quote=FALSE)
        return(ra)
    })

    # determinine methylation site density in specified regions ...
    #
    SpRegion <- lapply(sample_list, function(sample) {
        message(paste('      ... rank ',sample,' regions of interest ...',sep=''))
        # subset mrobjhsm ...
        #
        sites               <- reorganize(mrobjhsm,
                                          sample.ids=list(sample),
                                          treatment=c(0)
                                         )[[1]]
        sites.gr            <- as(sites,'GRanges')
        sites.gr$perc_meth  <- (sites.gr$numCs/sites.gr$coverage) * 100

        # identify the msites within the specified regions ...
        #
        match               <- suppressWarnings(findOverlaps(sites.gr,region.gr,ignore.strand=TRUE))
        sites.gr            <- sites.gr[queryHits(match)]
        region.gr           <- region.gr[subjectHits(match)]
        sites.df            <- as.data.frame(sites.gr)
        region.df           <- as.data.frame(region.gr)
        colnames(region.df) <- lapply(colnames(region.df),
                                      function(i) paste('region',i,sep='_'))
        sites_region        <- cbind(sites.df,region.df)

        wtoutfile <- paste("sites-in",rlabel,outflabel,sep="-")
        wtoutfile <- paste(wtoutfile,sample,sep="_")
        wtoutfile <- paste(wtoutfile,"txt",sep=".")
        write.table(sites_region, wtoutfile, sep='\t', row.names=FALSE, quote=FALSE)
      
        # calculate site densities for each region ...
        #
        if (withglink == "NCBIgene"){
            rSstats <- sites_region %>% group_by(region_ID) %>%
                summarise(rwidth = round(mean(region_width),2),
                          nbrsites = dplyr::n(),
                          nbrper10kb = round((nbrsites/rwidth)*10000,2),
                          pmsum = round(sum(perc_meth),2),
                          pmpersite = round(pmsum/nbrsites,2),
                          pmpernucl = round(pmsum/rwidth,2),
                          pglink = paste("https://www.ncbi.nlm.nih.gov/gene/?term",
                                         unique(region_Name),sep="=")
                         )
        }
        else {
            rSstats <- sites_region %>% group_by(region_ID) %>%
                summarise(rwidth = round(mean(region_width),2),
                          nbrsites = dplyr::n(),
                          nbrper10kb = round((nbrsites/rwidth)*10000,2),
                          pmsum = round(sum(perc_meth),2),
                          pmpersite = round(pmsum/nbrsites,2),
                          pmpernucl = round(pmsum/rwidth,2)
                         )
        }
        # order the regions by nbrper10kb ...
        #
        rSstats <- rSstats[order(- rSstats$nbrper10kb),]
        rSstats <- subset(rSstats, select = -c(pmsum))

        # prepare the regional Site & Methylation statistics data frame:
        #
        rSMdf <- merge(rSstats,MpRegion[match(sample,sample_list)],by="region_ID")
        rSMdf <- rSMdf[order(- rSMdf$nbrper10kb),]
        rSMdf$width <- NULL	# ... remove the duplicated width information

        outfile   <- paste("ranked",rlabel,"bySiteDensity",outflabel,sep="-")
        outfile   <- paste(outfile,sample,sep="_")
        wtoutfile <- paste(outfile,"txt",sep=".")
        write.table(rSMdf, wtoutfile, sep='\t',
                    row.names=FALSE, quote=FALSE)

        outfile   <- paste("plot",rlabel,"prcntM-vs-SiteDensity",outflabel,sep="-")
        outfile   <- paste(outfile,sample,sep="_")
        ptoutfile <- paste(outfile,"pdf",sep=".")
        pdf(ptoutfile)
        print(ggplot(rSMdf[rSMdf$rwidth   >= minrwidth &
                           rSMdf$rwidth   <= maxrwidth &
                           rSMdf$nbrsites >= minnbrsites,],
              aes(x=nbrper10kb,y=prcntM)) +
              labs(title=outfile) +
              geom_point(size=2,shape=23,color="red") +
              geom_smooth(method="lm", se=TRUE, fullrange=FALSE, level=0.95)
             )

        dev.off()
        return(rSstats)
     })

    message('... rank_rbm finished ...')
    names(SpRegion) = sample_list
    names(MpRegion) = sample_list
    return(list(MpRegion,SpRegion))
}
BrendelGroup/BWASPR documentation built on Feb. 6, 2022, 9:09 a.m.