#' explore_dmsg()
#' This function sorts genes by meth_level per 10kb for individual samples
#' and sorts genes by meth_diff per 10kb for each comparison; sample
#' percent methylation per site values are compared with the Wilcoxon
#' signed rank test
#'
#' @param mrobj A methylKit methylRaw or methylRawList object
#' @param genome_ann Genome annotation returned by get_genome_annotation()
#' @param dmgprp A list of data frames containing dmsg data generated by
#' show_dmsg()
#' @param maxgwidth An integer specifying the maximum gene width for a gene
#' to be listed in the output tables [default: -1, interpreted to mean
#' no restriction]
#' @param minnbrdmsites An integer specifying the minimum number of
#' differentially methylated sites for a gene to be listed in the pairwise
#' sample comparisons [default: 1]
#' @param withglink Either NCBIgene or "" to indicate inclusion of a
#' gene link column in the output
#' @param outflabel A string to identify the study in the output file
#'
#' @return A list of data frames summarizing single sample and pairwise
#' comparison gene properties
#'
#' @importFrom methylKit reorganize unite
#' @importFrom GenomicRanges findOverlaps
#' @importFrom S4Vectors subjectHits queryHits
#' @importFrom dplyr %>% group_by summarise
#'
#' @examples
#' mydatf <- system.file("extdata","Am.dat",package="BWASPR")
#' myparf <- system.file("extdata","Am.par",package="BWASPR")
#' myfiles <- setup_BWASPR(datafile=mydatf,parfile=myparf)
#' AmHE <- mcalls2mkobj(myfiles$datafiles,species="Am",study="HE",
#' type="CpGhsm",mincov=1,assembly="Amel-4.5")
#' genome_ann <- get_genome_annotation(myfiles$parameters)
#' dmsgList <- det_dmsg(AmHE,genome_ann,
#' threshold=25.0,qvalue=0.01,mc.cores=4,
#' destrand=TRUE,
#' outfile1="AmHE-dmsites.txt",
#' outfile2="AmHE-dmgenes.txt")
#' dmgprp <- show_dmsg(AmHE,dmsgList,min.nsites=10,destrand=TRUE,
#' outflabel="Am_HE")
#' summaries <- explore_dmsg(AmHE,genome_ann,dmgprp,maxgwidth=20000,
#' minnbrdmsites=2, withglink="NCBIgene",
#' outflabel="Am_HE")
#'
#' @export
explore_dmsg <- function(mrobj,genome_ann,dmgprp,maxgwidth,minnbrdmsites,
withglink="NCBIgene",outflabel="") {
message('... explore_dmsg ...')
# read basic information
#
sample_list <- getSampleID(mrobj)
gene.gr <- genome_ann$gene
comparison_list <- names(dmgprp)
# for each sample, return a list of genes sorted by meth level per 10kb
#
message(' ... explore individual samples ...')
ss_summaries <- lapply(sample_list, function(sample) {
message(paste(' ... explore ',sample,' ...',sep=''))
# subset the mrobj
#
sites <- reorganize(mrobj,
sample.ids=list(sample),
treatment=c(0))[[1]]
sites.gr <- as(sites,'GRanges')
sites.gr$perc_meth <- (sites.gr$numCs/sites.gr$coverage) * 100
# annotate the msites with genes ....
#
match <- suppressWarnings(findOverlaps(sites.gr,gene.gr,ignore.strand=TRUE))
sites.gr <- sites.gr[queryHits(match)]
gene.gr <- gene.gr[subjectHits(match)]
# combine
sites.df <- as.data.frame(sites.gr)
gene.df <- as.data.frame(gene.gr)
colnames(gene.df) <- lapply(colnames(gene.df),
function(i) paste('gene',i,sep='_'))
sites_gene <- cbind(sites.df,gene.df)
# calculate site densities for each gene ...
#
if (withglink == "NCBIgene") {
ss_summary <- sites_gene %>% group_by(gene_ID) %>%
summarise(glink =
paste("https://www.ncbi.nlm.nih.gov/gene/?term",
unique(gene_Name),sep="="),
gwidth = round(mean(gene_width),2),
nbrsites = dplyr::n(),
nbrper10kb = round((nbrsites/gwidth)*10000,2),
pmsum = round(sum(perc_meth),2),
pmpersite = round(pmsum/nbrsites,2),
pmpernucl = round(pmsum/gwidth,2)
)
}
else {
ss_summary <- sites_gene %>% group_by(gene_ID) %>%
summarise(gwidth = round(mean(gene_width),2),
nbrsites = dplyr::n(),
nbrper10kb = round((nbrsites/gwidth)*10000,2),
pmsum = round(sum(perc_meth),2),
pmpersite = round(pmsum/nbrsites,2),
pmpernucl = round(pmsum/gwidth,2)
)
}
# select by specified maxgwidth ...
#
if (maxgwidth > 0) {
ss_summary <- ss_summary[ss_summary$gwidth <= maxgwidth,]
}
# order the genes by pmpernucl ...
#
ss_summary <- ss_summary[order(- ss_summary$pmpernucl),]
ss_summary <- subset(ss_summary, select = -c(pmsum))
if (nchar(withglink) > 0) {
colnames(ss_summary) <- c("gene_ID","gene_link","gwidth",
"#Sites","#per10Kb","%perSite","%pNucl")
}
else {
colnames(ss_summary) <- c("gene_ID","gwidth",
"#Sites","#per10Kb","%perSite","%pNucl")
}
outfile <- paste("ogl",outflabel,sep="-")
outfile <- paste(outfile,sample,sep="_")
wtoutfile <- paste(outfile,"txt",sep=".")
write.table(ss_summary, wtoutfile, sep='\t',
row.names=FALSE, quote=FALSE)
return(ss_summary)
})
# for each comparison, return a list of ordered (ranked) genes ...
#
message(' ... explore comparisons ...')
wcoxfile <- paste("wrt",outflabel,sep="-")
wcoxfile <- paste(wcoxfile,"txt",sep=".")
sink(wcoxfile)
pw_summaries <- lapply(comparison_list, function(comparison) {
message(paste(' ... explore ',comparison,' ...',sep=''))
sites <- dmgprp[[comparison]]
comparison <- toString(comparison)
sample1 <- unlist(strsplit(comparison,'\\.'))[1]
sample2 <- unlist(strsplit(comparison,'\\.'))[3]
sites['sample1'] <- sites[sample1]
sites['sample2'] <- sites[sample2]
# calc a set of parameters for each gene
if (withglink == "NCBIgene") {
pw_summary <- sites %>% group_by(gene_ID) %>%
summarise(glink =
paste("https://www.ncbi.nlm.nih.gov/gene/?term",
unique(gene_Name),sep="="),
gwidth = round(mean(gene_width),2),
nbrsites = dplyr::n(),
nbrper10kb = round((nbrsites/gwidth)*10000,2),
nbrdmsites = sum(is.dm),
nbrdmper10kb = round((nbrdmsites/gwidth)*10000,2),
prcntdm = round((nbrdmsites/nbrsites)*100,2),
pmpersite1 = round(sum(sample1)/nbrsites,2),
pmpersite2 = round(sum(sample2)/nbrsites,2),
dmpersite = round((sum(sample1)-sum(sample2))/nbrsites,2),
admpersite = round(abs(sum(sample1)-sum(sample2))/nbrsites,2),
dmpernucl = round((sum(sample1)-sum(sample2))/gwidth,2),
admpernucl = round(abs(sum(sample1)-sum(sample2))/gwidth,2)
)
}
else {
pw_summary <- sites %>% group_by(gene_ID) %>%
summarise(gwidth = round(mean(gene_width),2),
nbrsites = dplyr::n(),
nbrper10kb = round((nbrsites/gwidth)*10000,2),
nbrdmsites = sum(is.dm),
nbrdmper10kb = round((nbrdmsites/gwidth)*10000,2),
prcntdm = round((nbrdmsites/nbrsites)*100,2),
pmpersite1 = round(sum(sample1)/nbrsites,2),
pmpersite2 = round(sum(sample2)/nbrsites,2),
dmpersite = round((sum(sample1)-sum(sample2))/nbrsites,2),
admpersite = round(abs(sum(sample1)-sum(sample2))/nbrsites,2),
dmpernucl = round((sum(sample1)-sum(sample2))/gwidth,2),
admpernucl = round(abs(sum(sample1)-sum(sample2))/gwidth,2)
)
}
# select by specified maxgwidth and minnbrdmsites ...
#
if (maxgwidth > 0) {
pw_summary <- pw_summary[pw_summary$gwidth <= maxgwidth,]
}
pw_summary <- pw_summary[pw_summary$nbrdmsites >= minnbrdmsites,]
# order the genes by prcntdm ...
#
pw_summary <- pw_summary[order(- pw_summary$dmpersite),]
if (nchar(withglink) > 0) {
colnames(pw_summary) <-
c("gene_ID","gene_link","gwidth","#Sites","#per10Kb",
"#dmSites","#dmsp10kb","%dmSites","%pSite1","%pSite2","DMpSite","ADMpSite","DMpNucl","ADMpNucl")
}
else {
colnames(pw_summary) <-
c("gene_ID","gwidth","#Sites","#per10Kb",
"#dmSites","#dmsp10kb","%dmSites","%pSite1","%pSite2","DMpSite","ADMpSite","DMpNucl","ADMpNucl")
}
outfile <- paste("ogl",outflabel,sep="-")
outfile <- paste(outfile,comparison,sep="_")
wtoutfile <- paste(outfile,"txt",sep=".")
write.table(pw_summary, wtoutfile, sep='\t', row.names=FALSE, quote=FALSE)
# ... comparing samples with the Wilcoxon signed rank test:
#
nbrgenes <- length(pw_summary$`%pSite1`)
cat( paste('... comparing ',comparison,'...\n',sep=' ') )
if (maxgwidth > 0) {
cat( paste('\nNumber of genes <= ',maxgwidth,' and with #dmsites >= ', minnbrdmsites,': ',nbrgenes ))
}
else {
cat( paste('\nNumber of genes with #dmsites >= ',minnbrdmsites,': ', nbrgenes,'\n') )
}
if (nbrgenes > 1) {
cat( sprintf("\nsample1\tsum of percent methylation: %8.2f\taverage: %8.2f\n",sum(pw_summary$`%pSite1`),
sum(pw_summary$`%pSite1`)/length(pw_summary$`%pSite1`) ) )
cat( sprintf( "sample2\tsum of percent methylation: %8.2f\taverage: %8.2f\n",sum(pw_summary$`%pSite2`),
sum(pw_summary$`%pSite2`)/length(pw_summary$`%pSite2`) ) )
cat( sprintf("\nsum difference: %8.2f\taverage difference: %8.4f\n",sum(pw_summary$DMpSite),
sum(pw_summary$DMpSite)/length(pw_summary$DMpSite) ) )
print(wilcox.test(pw_summary$`%pSite1`,pw_summary$`%pSite2`,paired=T,exact=F))
}
else {
cat( '\nOnly 1 gene, so there is nothing to test here ...\n' )
}
cat( "\n\n" )
return(pw_summary)
})
sink()
names(pw_summaries) <- comparison_list
message('... explore_dmsg finished ...')
return(list(ss_summaries,pw_summaries))
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.