R/explore_dmsg.R

Defines functions explore_dmsg

Documented in explore_dmsg

#' explore_dmsg() 
#' This function sorts genes by meth_level per 10kb for individual samples
#' and sorts genes by meth_diff per 10kb for each comparison; sample
#' percent methylation per site values are compared with the Wilcoxon
#' signed rank test
#'
#' @param mrobj A methylKit methylRaw or methylRawList object
#' @param genome_ann Genome annotation returned by get_genome_annotation()
#' @param dmgprp A list of data frames containing dmsg data generated by
#'   show_dmsg()
#' @param maxgwidth An integer specifying the maximum gene width for a gene
#'   to be listed in the output tables [default: -1, interpreted to mean
#'   no restriction]
#' @param minnbrdmsites An integer specifying the minimum number of
#'   differentially methylated sites for a gene to be listed in the pairwise
#'   sample comparisons [default: 1]
#' @param withglink Either NCBIgene or "" to indicate inclusion of a
#'   gene link column in the output
#' @param outflabel A string to identify the study in the output file
#' 
#' @return A list of data frames summarizing single sample and pairwise
#'   comparison gene properties
#' 
#' @importFrom methylKit reorganize unite
#' @importFrom GenomicRanges findOverlaps
#' @importFrom S4Vectors subjectHits queryHits
#' @importFrom dplyr %>% group_by summarise

#'
#' @examples
#'   mydatf <- system.file("extdata","Am.dat",package="BWASPR")
#'   myparf <- system.file("extdata","Am.par",package="BWASPR")
#'   myfiles <- setup_BWASPR(datafile=mydatf,parfile=myparf)
#'   AmHE <- mcalls2mkobj(myfiles$datafiles,species="Am",study="HE",
#'                        type="CpGhsm",mincov=1,assembly="Amel-4.5")
#'   genome_ann <- get_genome_annotation(myfiles$parameters)
#'   dmsgList <- det_dmsg(AmHE,genome_ann,
#'                        threshold=25.0,qvalue=0.01,mc.cores=4,
#'                        destrand=TRUE,
#'                        outfile1="AmHE-dmsites.txt", 
#'                        outfile2="AmHE-dmgenes.txt")
#'   dmgprp <- show_dmsg(AmHE,dmsgList,min.nsites=10,destrand=TRUE,
#'                       outflabel="Am_HE")
#'   summaries <- explore_dmsg(AmHE,genome_ann,dmgprp,maxgwidth=20000,
#'                             minnbrdmsites=2, withglink="NCBIgene",
#'                             outflabel="Am_HE")
#'
#' @export


explore_dmsg <- function(mrobj,genome_ann,dmgprp,maxgwidth,minnbrdmsites,
                         withglink="NCBIgene",outflabel="") {
    message('... explore_dmsg ...')
    # read basic information
    #
    sample_list     <- getSampleID(mrobj)
    gene.gr         <- genome_ann$gene
    comparison_list <- names(dmgprp)
    # for each sample, return a list of genes sorted by meth level per 10kb
    #
    message('   ... explore individual samples ...')
    ss_summaries <- lapply(sample_list, function(sample) {
        message(paste('      ... explore ',sample,' ...',sep=''))
        # subset the mrobj
        #
        sites             <- reorganize(mrobj,
                                        sample.ids=list(sample),
                                        treatment=c(0))[[1]]
        sites.gr          <- as(sites,'GRanges')
        sites.gr$perc_meth <- (sites.gr$numCs/sites.gr$coverage) * 100
        # annotate the msites with genes ....
        #
        match             <- suppressWarnings(findOverlaps(sites.gr,gene.gr,ignore.strand=TRUE))
        sites.gr          <- sites.gr[queryHits(match)]
        gene.gr           <- gene.gr[subjectHits(match)]
        # combine
        sites.df          <- as.data.frame(sites.gr)
        gene.df           <- as.data.frame(gene.gr)
        colnames(gene.df) <- lapply(colnames(gene.df),
                                    function(i) paste('gene',i,sep='_'))
        sites_gene        <- cbind(sites.df,gene.df)
        # calculate site densities for each gene ...
        #
        if (withglink == "NCBIgene") {
            ss_summary <- sites_gene %>% group_by(gene_ID) %>%
                summarise(glink =
                            paste("https://www.ncbi.nlm.nih.gov/gene/?term",
                                   unique(gene_Name),sep="="),
                          gwidth = round(mean(gene_width),2),
                          nbrsites = dplyr::n(),
                          nbrper10kb = round((nbrsites/gwidth)*10000,2),
                          pmsum = round(sum(perc_meth),2),
                          pmpersite = round(pmsum/nbrsites,2),
                          pmpernucl = round(pmsum/gwidth,2)
                         )
        }
        else {
            ss_summary <- sites_gene %>% group_by(gene_ID) %>%
                summarise(gwidth = round(mean(gene_width),2),
                          nbrsites = dplyr::n(),
                          nbrper10kb = round((nbrsites/gwidth)*10000,2),
                          pmsum = round(sum(perc_meth),2),
                          pmpersite = round(pmsum/nbrsites,2),
                          pmpernucl = round(pmsum/gwidth,2)
                         )
        }
        # select by specified maxgwidth ...
        #
        if (maxgwidth > 0) {
          ss_summary <- ss_summary[ss_summary$gwidth <= maxgwidth,]
        }
        # order the genes by pmpernucl ...
        #
        ss_summary <- ss_summary[order(- ss_summary$pmpernucl),]
        ss_summary <- subset(ss_summary, select = -c(pmsum))
        if (nchar(withglink) > 0) {
            colnames(ss_summary) <- c("gene_ID","gene_link","gwidth",
                                      "#Sites","#per10Kb","%perSite","%pNucl")
        }
        else {
            colnames(ss_summary) <- c("gene_ID","gwidth",
                                      "#Sites","#per10Kb","%perSite","%pNucl")
        }
        outfile <- paste("ogl",outflabel,sep="-")
        outfile <- paste(outfile,sample,sep="_")
        wtoutfile <- paste(outfile,"txt",sep=".")
        write.table(ss_summary, wtoutfile, sep='\t',
                    row.names=FALSE, quote=FALSE)
        return(ss_summary)
    })

    # for each comparison, return a list of ordered (ranked) genes ...
    #
    message('   ... explore comparisons ...')

    wcoxfile <- paste("wrt",outflabel,sep="-")
    wcoxfile <- paste(wcoxfile,"txt",sep=".")
    sink(wcoxfile)

    pw_summaries <- lapply(comparison_list, function(comparison) {
        message(paste('      ... explore ',comparison,' ...',sep=''))
        sites            <- dmgprp[[comparison]]
        comparison       <- toString(comparison)
        sample1          <- unlist(strsplit(comparison,'\\.'))[1]
        sample2          <- unlist(strsplit(comparison,'\\.'))[3]
        sites['sample1'] <- sites[sample1]
        sites['sample2'] <- sites[sample2]
        # calc a set of parameters for each gene
        if (withglink == "NCBIgene") {
            pw_summary <- sites %>% group_by(gene_ID) %>%
                summarise(glink =
                            paste("https://www.ncbi.nlm.nih.gov/gene/?term",
                                   unique(gene_Name),sep="="),
                          gwidth = round(mean(gene_width),2),
                          nbrsites = dplyr::n(),
                          nbrper10kb = round((nbrsites/gwidth)*10000,2),
                          nbrdmsites = sum(is.dm),
                          nbrdmper10kb = round((nbrdmsites/gwidth)*10000,2),
                          prcntdm = round((nbrdmsites/nbrsites)*100,2),
                          pmpersite1 = round(sum(sample1)/nbrsites,2),
                          pmpersite2 = round(sum(sample2)/nbrsites,2),
                          dmpersite = round((sum(sample1)-sum(sample2))/nbrsites,2),
                          admpersite = round(abs(sum(sample1)-sum(sample2))/nbrsites,2),
                          dmpernucl = round((sum(sample1)-sum(sample2))/gwidth,2),
                          admpernucl = round(abs(sum(sample1)-sum(sample2))/gwidth,2)
                         )
        }
        else {
            pw_summary <- sites %>% group_by(gene_ID) %>%
                summarise(gwidth = round(mean(gene_width),2),
                          nbrsites = dplyr::n(),
                          nbrper10kb = round((nbrsites/gwidth)*10000,2),
                          nbrdmsites = sum(is.dm),
                          nbrdmper10kb = round((nbrdmsites/gwidth)*10000,2),
                          prcntdm = round((nbrdmsites/nbrsites)*100,2),
                          pmpersite1 = round(sum(sample1)/nbrsites,2),
                          pmpersite2 = round(sum(sample2)/nbrsites,2),
                          dmpersite = round((sum(sample1)-sum(sample2))/nbrsites,2),
                          admpersite = round(abs(sum(sample1)-sum(sample2))/nbrsites,2),
                          dmpernucl = round((sum(sample1)-sum(sample2))/gwidth,2),
                          admpernucl = round(abs(sum(sample1)-sum(sample2))/gwidth,2)
                         )
        }
        # select by specified maxgwidth and minnbrdmsites ...
        #
        if (maxgwidth > 0) {
          pw_summary <- pw_summary[pw_summary$gwidth <= maxgwidth,]
        }
        pw_summary <- pw_summary[pw_summary$nbrdmsites >= minnbrdmsites,]
        # order the genes by prcntdm ...
        #
        pw_summary <- pw_summary[order(- pw_summary$dmpersite),]
        if (nchar(withglink) > 0) {
            colnames(pw_summary) <-
              c("gene_ID","gene_link","gwidth","#Sites","#per10Kb",
                "#dmSites","#dmsp10kb","%dmSites","%pSite1","%pSite2","DMpSite","ADMpSite","DMpNucl","ADMpNucl")
        }
        else {
            colnames(pw_summary) <-
              c("gene_ID","gwidth","#Sites","#per10Kb",
                "#dmSites","#dmsp10kb","%dmSites","%pSite1","%pSite2","DMpSite","ADMpSite","DMpNucl","ADMpNucl")
        }
        outfile <- paste("ogl",outflabel,sep="-")
        outfile <- paste(outfile,comparison,sep="_")
        wtoutfile <- paste(outfile,"txt",sep=".")
        write.table(pw_summary, wtoutfile, sep='\t', row.names=FALSE, quote=FALSE)

        # ... comparing samples with the Wilcoxon signed rank test:
        #
	nbrgenes <- length(pw_summary$`%pSite1`)
        cat( paste('... comparing ',comparison,'...\n',sep=' ') )
        if (maxgwidth > 0) {
	  cat( paste('\nNumber of genes <= ',maxgwidth,' and with #dmsites >= ', minnbrdmsites,': ',nbrgenes ))
        }
        else {
	  cat( paste('\nNumber of genes with #dmsites >= ',minnbrdmsites,': ', nbrgenes,'\n') )
        }
	if (nbrgenes > 1) {
	  cat( sprintf("\nsample1\tsum of percent methylation: %8.2f\taverage: %8.2f\n",sum(pw_summary$`%pSite1`),
	               sum(pw_summary$`%pSite1`)/length(pw_summary$`%pSite1`) ) )
	  cat( sprintf(  "sample2\tsum of percent methylation: %8.2f\taverage: %8.2f\n",sum(pw_summary$`%pSite2`),
	               sum(pw_summary$`%pSite2`)/length(pw_summary$`%pSite2`) ) )
	  cat( sprintf("\nsum difference: %8.2f\taverage difference: %8.4f\n",sum(pw_summary$DMpSite),
	               sum(pw_summary$DMpSite)/length(pw_summary$DMpSite) ) )
          print(wilcox.test(pw_summary$`%pSite1`,pw_summary$`%pSite2`,paired=T,exact=F))
	}
	else {
	  cat( '\nOnly 1 gene, so there is nothing to test here ...\n' )
	}
        cat( "\n\n" )

        return(pw_summary)
    })

    sink()
    names(pw_summaries) <- comparison_list
    message('... explore_dmsg finished ...')
    return(list(ss_summaries,pw_summaries))
}
BrendelGroup/BWASPR documentation built on Feb. 6, 2022, 9:09 a.m.