R/methyAnalysis.R
In methyAnalysis: DNA methylation data analysis and visualization

Documented in annotateDMRInfo annotateGRanges detectDMR.slideWin estimateCMR.methylation export.DMRInfo export.methyGenoSet getContinuousRegion identifySigDMR MethyLumiM2GenoSet smoothMethyData

## get the methylation probe location information, and return as a GRanges object
getMethyProbeLocation <- function(probes, lib="FDb.InfiniumMethylation.hg19") {
	
	if (!require(lib, character.only=TRUE)) stop(paste(lib, 'is not installed!'))
	if (is(probes, 'eSet')) probes <- rownames(probes)
	
	if (exists(lib)) {
		lib <- get(lib)
		if (is(lib, 'FeatureDb')) {
			allAnnotation <- features(lib)
		}
		if (any(probes %in% names(allAnnotation))) {
			probes <- probes[probes %in% names(allAnnotation)]
			warnings('Some probes does not exist in the annotation library!')
		}
		methyGrange <- allAnnotation[probes]
		values(methyGrange) <- DataFrame(ProbeID=probes)
		return(methyGrange)
	} 
	
	## For old annotation libraries: IlluminaHumanMethylation450k.db
	## Check whether multiple versions of chromosome information is available,
	## If so, only use the latest version.
	chrPattern <- paste(sub("\\.db$", "", lib), "CHR", sep="")
	chrObjs <- ls(paste("package:", lib, sep=""), pattern=paste(chrPattern, "[0-9]+$", sep=""))
	if (length(chrObjs) > 1) {
		chrVersion <- sub(".*[^0-9]([0-9]+)$", "\\1", chrObjs)
		obj <- get(chrObjs[which.max(as.numeric(chrVersion))])
	} else {
		obj <- get(chrPattern)
	}
	pp <- probes[probes %in% keys(obj)]
  chr[pp] <- sapply(AnnotationDbi::mget(pp, obj), function(x) x[1])

  obj <- get(paste(sub("\\.db$", "", lib), "CPGCOORDINATE", sep=""))
	pp <- probes[probes %in% keys(obj)]
  loc[pp] <- sapply(AnnotationDbi::mget(pp, obj), function(x) x[1])
	methyGrange <- GRanges(seqnames=chr, ranges=IRanges(start=loc, width=1), strand='*', ProbeID=probes)
	names(methyGrange) <- probes
	return(methyGrange)
}



## convert MethyLumiM class object to GenoSet class object
MethyLumiM2GenoSet <- function(methyLumiM, lib="FDb.InfiniumMethylation.hg19", bigMatrix=FALSE, dir.bigMatrix='.', savePrefix.bigMatrix) {
	oldFeatureData <- fData(methyLumiM)
	if (!identical(rownames(oldFeatureData), rownames(methyLumiM))) {
	  warnings('The rownames of featureData is inconsistent with the methyLumiM!')
  	rownames(oldFeatureData) <- rownames(methyLumiM)
	}
	methyLumiM <- addAnnotationInfo(methyLumiM, lib=lib)
	ff <- fData(methyLumiM)
	if (is.null(ff$CHROMOSOME))
		stop('Chromosome information is not available!\n')
		
	## retrieve the hgVersion information from the annotation library
  hgVersion <- ''
	if (require(lib, character.only=TRUE)) {
		dbinfo <- paste(sub('.db$', '', lib), '_dbInfo', sep='')
		if (exists(dbinfo)) {
			dbinfo <- do.call(dbinfo, list())
			rownames(dbinfo) <- dbinfo$name
			hgVersion <- strsplit(dbinfo['GPSOURCEURL', 'value'], '/')[[1]]
			hgVersion <- hgVersion[length(hgVersion)]
		} else if (class(get(lib)) == "FeatureDb") {
		  metaInfo <- metadata(get(lib))
			hgVersion <- metaInfo$value[metaInfo$name == 'Genome']
		}
	}

	## remove those without chromosome location
	rmInd <- (is.na(ff$CHROMOSOME) | ff$CHROMOSOME == '')
	if (any(rmInd)) {
		ff <- ff[!rmInd,]		
		cat(paste(length(which(rmInd)), 'probes were removed because of lack of chromosome location information!\n'))
	}
	ff$CHROMOSOME <- checkChrName(as.character(ff$CHROMOSOME))
	locdata <- GRanges(seqnames=ff$CHROMOSOME, ranges=IRanges(start=ff$POSITION, width=1, names=rownames(methyLumiM)))
	genome(locdata) <- hgVersion
	names(locdata) <- rownames(methyLumiM)[!rmInd]
	# sort the locdata
	locdata <- genoset::toGenomeOrder(locdata, strict=TRUE)

	if ((any(rmInd) && is(exprs(methyLumiM), 'BigMatrix')) || bigMatrix) {
		if (missing(savePrefix.bigMatrix)) {
			warnings('savePrefix.bigMatrix is missing! GenoSet_bigMatrixFiles is used!')
			savePrefix.bigMatrix <- 'GenoSet_bigMatrixFiles'
		}
	  dimNames <- list(names(locdata), colnames(methyLumiM))
		## predefine a big matrix
		methyLumiM.new <- lumi::asBigMatrix(methyLumiM[names(locdata),1], nCol=ncol(methyLumiM), dimNames=dimNames, saveDir=dir.bigMatrix, savePrefix=savePrefix.bigMatrix)
		## fill in the subsetted data
		for (ad.name in assayNames(methyLumiM.new)) {
      matrix.new <- assayElement(methyLumiM.new, ad.name)
      matrix.i <- assayElement(methyLumiM, ad.name)[names(locdata), ]
			matrix.new[, colnames(matrix.i)] <- matrix.i
		}
		methyLumiM <- methyLumiM.new		
		rm(methyLumiM.new)
	} else {
		# methyLumiM <- methyLumiM[!rmInd,]
		methyLumiM <- methyLumiM[names(locdata), ]
	}
	
	
	mcols(locdata) <- oldFeatureData[rownames(methyLumiM), ]
	methyGenoSet <- MethyGenoSet(rowRanges=locdata, pData=pData(methyLumiM), annotation=as.character(lib), assays=as.list(assayData(methyLumiM)))
	#fData(methyGenoSet) <- oldFeatureData[rownames(methyGenoSet), ]
	methyGenoSet@history <- methyLumiM@history
	
	## set smoothing attributes if exists
	if (!is.null(attr(methyLumiM, 'windowIndex')))
		attr(methyGenoSet, 'windowIndex') <- attr(methyLumiM, 'windowIndex')
	if (!is.null(attr(methyLumiM, 'windowRange')))
		attr(methyGenoSet, 'windowRange') <- attr(methyLumiM, 'windowRange')
	if (!is.null(attr(methyLumiM, 'windowSize')))
		attr(methyGenoSet, 'windowSize') <- attr(methyLumiM, 'windowSize')

	return(methyGenoSet)
}


## smooth the methylation data (MethyLumiM or GenoSet objects) using slide window with fixed windowsize (in bp)
smoothMethyData <- function(methyData, winSize=250, lib='FDb.InfiniumMethylation.hg19', p.value.detection.th=0.05, bigMatrix=FALSE, dir.bigMatrix='.', savePrefix.bigMatrix=NULL, ...) {

	if (!is(methyData, 'GenoSet') & !is(methyData, 'MethyLumiM')) {
		stop("methyData should be a GenoSet or MethyLumiM object!")
	}
	if (is(methyData, 'GenoSet')) {
		assaydata <- as.list(assays(methyData))
	} else {
		assaydata <- assayData(methyData)
	}
	if (is(methyData, 'MethyLumiM')) {
		chrInfo <- getChrInfo(methyData, lib=lib)
	} else {
		chrInfo <- data.frame(PROBEID=rownames(methyData), CHROMOSOME=genoset::chr(methyData), POSITION=start(methyData), END=end(methyData))
	}
	if (is.character(chrInfo$POSITION)) chrInfo$POSITION = as.numeric(chrInfo$POSITION)
	
	ratioData.old <- ratioData <- assaydata$exprs
	if (is(ratioData, 'BigMatrix')) { 
		bigMatrix <- TRUE
	}

	index <- 1:nrow(ratioData)
	# remove those probes lack of position information
	rmInd <- which(is.na(chrInfo$POSITION) | chrInfo$CHROMOSOME == '' )
	if (length(rmInd) > 0)	index <- index[-rmInd]
	
	# Sort the ratioData by chromosome and location
	isOrdered <- with(chrInfo, sapply(split(POSITION, CHROMOSOME), function(x) all(x == sort(x, decreasing=FALSE))))
	if (!all(isOrdered)) {
		ord <- order(chrInfo$CHROMOSOME[index], chrInfo$POSITION[index], decreasing=FALSE)
		index <- index[ord]
	}
	
	if (bigMatrix) {
		if (!is(ratioData, 'BigMatrix') | !is.null(savePrefix.bigMatrix) | !all(index == 1:length(index)))
			methyData <- asBigMatrix(methyData, rowInd=index, savePrefix=savePrefix.bigMatrix, saveDir=dir.bigMatrix)
	} else {
		methyData <- methyData[index,]
	}
	ratioData <- assaydata$exprs
	## set NA for those probes having high p-values
	if (!is.null(assaydata$detection)) {
	  naInd <- which(assaydata$detection > p.value.detection.th)
	  if (length(naInd) > 0) ratioData[naInd] <- NA
	}
	chrInfo <- chrInfo[index,]
	index <- 1:nrow(ratioData)
	
	# split data by Chromosome 
	index.chrList <- split(index, as.character(chrInfo$CHROMOSOME))
	chrInfoList <- split(chrInfo, as.character(chrInfo$CHROMOSOME))
	windowIndex <- vector(mode='list', length=length(chrInfoList))
	windowRange <- vector(mode='list', length=length(chrInfoList))
	
	for (i in 1:length(index.chrList)) {
		index.i <- index.chrList[[i]]
		chrInfo.i <- chrInfoList[[i]]
		chr.i <- chrInfo.i$CHROMOSOME[1]
		cat(paste("Smoothing Chromosome", chr.i, "...\n"))
    
		# return the index of sorted probes in each slide window
		windowIndex.i <- eval(call(".setupSlidingTests", pos_data=chrInfo.i$POSITION, winSize=winSize))
		names(windowIndex.i) <- chrInfoList[[i]]$PROBEID
		smooth.ratio.i <- sapply(windowIndex.i, function(ind) {
			# average the values in slide window and perform test
			smooth.ij <- colMeans(ratioData[index.i[ind],,drop=FALSE], na.rm=TRUE)
			return(smooth.ij)
		})
		smooth.ratio.i <- t(smooth.ratio.i) 
		## replace the data with smoothed ones	
		ratioData[index.i,] <- smooth.ratio.i
		
		windowIndex[[i]] <- windowIndex.i
		windowRange.i <- sapply(windowIndex.i, function(ind) {
			# average the values in slide window and perform test
			range.ij <- range(chrInfo.i$POSITION[ind])
			return(range.ij)
		})
		windowRange[[i]] <- t(windowRange.i)
	}
	windowRange <- do.call('rbind', windowRange)
	colnames(windowRange) <- c('startLocation', 'endLocation')

	exprs(methyData) <- ratioData
	if (is(ratioData, 'BigMatrix')) {
		assayData(methyData) <- bigmemoryExtras::attachAssayDataElements(assaydata)
	}
		
		
	# methyData <- suppressMessages(methyData[rownames(smooth.ratioData),colnames(smooth.ratioData)])
	# exprs(methyData) <- smooth.ratioData[rownames(methyData),]

	attr(methyData, 'windowIndex') <- windowIndex
	attr(methyData, 'windowRange') <- windowRange
	attr(methyData, 'windowSize') <- winSize
	return(methyData)
}


## export a MethyGenoSet object as a 'gct' (for IGV) or 'bw' (big-wig) file
export.methyGenoSet <- function(methyGenoSet, file.format=c('gct', 'bw'), exportValue=c('beta', 'M', 'intensity'), hgVersion.default='hg19', savePrefix=NULL, outputFile=NULL) {
	
	# exportValue <- match.arg(exportValue)
	exportValue <- exportValue[1]
	file.format <- match.arg(file.format)
	## get the annotation version
	hgVersion <- unname(genome(methyGenoSet)[1])
	if (is.null(hgVersion) || unique(hgVersion) == '') {
		warnings(paste('hgVersion information is not available in methyGenoSet!', hgVersion.default, ' will be used.'))
		genome(methyGenoSet) <- hgVersion <- hgVersion.default
	}
	
	# chr <- space(rowRanges(methyGenoSet))
	chr <- genoset::chr(methyGenoSet)
	start <- start(methyGenoSet)
	## Sort the rows of ratios.obj
	methyGenoSet <- methyGenoSet[order(chr, start),]
	
	## check overlap probes and average them
	locationID <- paste(chr, start, sep='_')
	dupInd <- which(duplicated(locationID))
	if (length(dupInd) > 0) {
		dupID <- unique(locationID[dupInd])
		for (dupID.i in dupID) {
			selInd.i <- which(locationID == dupID.i)
			exprs(methyGenoSet)[selInd.i[1],] <- colMeans(exprs(methyGenoSet)[selInd.i,], na.rm=TRUE)
		}
		methyGenoSet <- methyGenoSet[-dupInd,]
	}
	## attach chr prefix in the chromosome names if it does not include it
	if (length(grep('^chr', chr)) == 0) {
		names(rowRanges(methyGenoSet)) <- paste('chr', names(rowRanges(methyGenoSet)), sep='')
	}
	
	if (exportValue %in% c('beta', 'M')) {
  	methyData <- assayElement(methyGenoSet, 'exprs')
  	if (exportValue == 'beta') {
  		methyData <- m2beta(methyData) # - 0.5
  	} 
	} else if (exportValue %in% assayNames(methyGenoSet)) {
	  methyData <- assayElement(methyGenoSet, exportValue) 
	} else if (exportValue == 'intensity') {
	  methyData <- log2(assayElement(methyGenoSet, 'methylated') + assayElement(methyGenoSet, 'unmethylated') + 1)
	} else {
	  stop("exportValue doesn't exist in the data!")
	}
	
	## only keep 3 digit to save space
	methyData <- signif(methyData, 3)
	
	if (file.format == 'gct') {
		chrInfo.all <- data.frame(PROBEID=rownames(methyGenoSet), CHROMOSOME=genoset::chr(methyGenoSet), START=start(methyGenoSet), END=end(methyGenoSet),	stringsAsFactors=FALSE)

		# remove those probes lack of position information
		rmInd <- which(is.na(chrInfo.all$START))
		if (length(rmInd) > 0)	{
			chrInfo.all <- chrInfo.all[-rmInd,]
			methyData <- methyData[-rmInd,]
		}
		
		description <- with(chrInfo.all, paste("|@", CHROMOSOME, ":", START, "-", END, "|", sep=""))
		gct <- cbind(Name=chrInfo.all[,'PROBEID'], Description=description, methyData)
		
		## check version hgVersion is included in the filename
		if (is.null(outputFile))
		  outputFile <- paste(savePrefix, "_", exportValue, "_", hgVersion, ".gct", sep='')
		
		cat("#1.2\n", file=outputFile)
		cat(paste(dim(gct), collapse='\t', sep=''), "\n", file=outputFile, append=TRUE)
		suppressWarnings(write.table(gct, sep="\t", file=outputFile, row.names=FALSE, append=TRUE, quote=FALSE))
		return(invisible(outputFile))

	} else if (file.format == 'bw') {
 		chrInfo.all <- getChromInfoFromUCSC(hgVersion)
		rownames(chrInfo.all) <- chrInfo.all[,'chrom']
 
		samplenames <- colnames(methyGenoSet)
		pdata <- pData(methyGenoSet)
		chr.info <- chrInfo(methyGenoSet)
		seq.lengths <- chr.info[,"stop"] - chr.info[,"offset"]
		for (i in 1:ncol(methyData)) {		
			score.i <- methyData[,i]
			cn.data.i <- GRanges(seqnames=genoset::chr(methyGenoSet), ranges=IRanges(start=start(methyGenoSet),end=end(methyGenoSet)), strand='*', score=score.i)
			genome(cn.data.i) <- hgVersion
			cn.data.i <- cn.data.i[!is.na(score.i), ]
			if (savePrefix == '' || is.null(savePrefix)) {
				savePrefix.i <- samplenames[i]
			} else {
				savePrefix.i <- paste(savePrefix, samplenames[i], sep='_')
			}
			
			seqlengths(cn.data.i) <- chrInfo.all[as.character(seqlevels(cn.data.i)), 'size']
			if (length(outputFile) == ncol(methyData)) {
        filename.i <- outputFile[i]
			} else {
  			filename.i <- paste(savePrefix.i, "_", exportValue, "_", hgVersion[1], ".bw", sep='')
			}
			export.bw(cn.data.i, filename.i, dataFormat="auto")
		}
		return(invisible(filename.i))
	}
	
}


# This is the sliding test
## ratios.obj inlcudes c("PROBEID","CHROMOSOME","POSITION"), values 
## winSize is the half slide window size, by default it is 250
## 
detectDMR.slideWin <- function(methyGenoSet, sampleType, winSize=250, testMethod=c('ttest', 'wilcox'), p.adjust.method='fdr', p.value.detection.th=0.05, ...) {
	
	testMethod <- match.arg(testMethod)
	if (is(methyGenoSet, 'MethyLumiM')) {
		methyGenoSet <- MethyLumiM2GenoSet(methyGenoSet, ...)
	}
	
	smooth <- TRUE
	if (!is.null(attr(methyGenoSet, 'windowIndex'))) {
		smooth <- FALSE
		if (!is.null(attr(methyGenoSet, 'windowSize'))) {
			if (attr(methyGenoSet, 'windowSize') != winSize) smooth <- TRUE
		}
	} 
	if (smooth) {
		methyGenoSet <- smoothMethyData(methyGenoSet, winSize=winSize, p.value.detection.th=p.value.detection.th, ...)
	}
	windowIndex <- attr(methyGenoSet, 'windowIndex')	
	windowRange <- attr(methyGenoSet, 'windowRange')
	
	chrInfo <- suppressWarnings(as(rowRanges(methyGenoSet), 'GRanges'))
	smoothData <- assays(methyGenoSet)$exprs
	## set NA for those probes having high p-values
	if (!is.null(assays(methyGenoSet)$detection)) {
	  naInd <- which(assays(methyGenoSet)$detection > p.value.detection.th)
	  if (length(naInd) > 0) smoothData[naInd] <- NA
	}
	
	## calculate class means
	uniClass <- unique(sampleType)
	selInd1 <- which(sampleType == uniClass[1])
	selInd2 <- which(sampleType == uniClass[2])
	classMean <- data.frame(rowMeans(smoothData[, selInd1, drop=FALSE], na.rm=TRUE), rowMeans(smoothData[, selInd2, drop=FALSE], na.rm=TRUE))
	names(classMean) <- paste('mean_', uniClass, sep='')

	if (!is.factor(sampleType)) sampleType <- as.factor(sampleType)

	## perform t-test
	if (is.function(testMethod)) {
		testInfo <- testMethod(smoothData, sampleType, ... )
	} else {
		if (testMethod == 'ttest') {
		  naInd <- which(apply(smoothData, 1, function(x) any(is.na(x))))
		  if (length(naInd) > 0) {
        ## do t.test for those with nas
  			p.value <- difference <- tscore <- rep(NA, nrow(smoothData))
  			## check the non-NA sample number for each class
  			nonNA.num <- apply(smoothData[naInd,,drop=FALSE], 1, function(x) table(sampleType[!is.na(x)]))
  			## only include those with more than 2 samples in each class
  			tooFewSample.ind <- (apply(nonNA.num, 2, min) < 2)
  			if (length(naInd[!tooFewSample.ind]) > 0) {
          t.test.res <- apply(smoothData[naInd[!tooFewSample.ind],,drop=FALSE], 1, function(x) t.test(x ~ sampleType))
    			p.value[naInd[!tooFewSample.ind]] <- sapply(t.test.res, function(x) x$p.value)
    			difference[naInd[!tooFewSample.ind]] <- sapply(t.test.res, function(x) x$estimate[1] -  x$estimate[2])
    			tscore[naInd[!tooFewSample.ind]] <- sapply(t.test.res, function(x) x$statistic)
  			}
  			tstats <- rowttests(smoothData[-naInd,,drop=FALSE], sampleType)
  			p.value[-naInd] <- tstats$p.value
  			difference[-naInd] <- tstats$dm
  			tscore[-naInd] <- tstats$statistic
		  } else {
  			tstats <- rowttests(smoothData, sampleType)
  			p.value <- tstats$p.value
  			difference <- tstats$dm
  			tscore <- tstats$statistic
		  }
		} else if (testMethod == 'wilcox') {
			tmp <- split(as.data.frame(t(smoothData)), sampleType)
			xx <- t(tmp[[1]])
			yy <- t(tmp[[2]])
			p.value <- apply(cbind(xx, yy), 1, function(x) wilcox.test(x[1:ncol(xx)], x[(ncol(xx)+1):length(x)])$p.value, ...)
			difference <- classMean[,1]	- classMean[,2]	
		} else {
			stop('Unsupported "testMethod"!')
		}
		p.adjust <- p.adjust(p.value, method=p.adjust.method)
		testInfo <- data.frame(PROBEID=rownames(smoothData), difference=difference, p.value=p.value, p.adjust=p.adjust)
		if (testMethod == 'ttest') testInfo <- data.frame(testInfo, tscore=tscore)
	}

	startInd <- unlist(lapply(windowIndex, function(x) sapply(x, min)))[rownames(smoothData)]
	endInd <- unlist(lapply(windowIndex, function(x) sapply(x, max)))[rownames(smoothData)]
	testInfo <- data.frame(testInfo, startWinIndex=startInd, endWinIndex=endInd, windowRange, classMean)
	testResult <- chrInfo
	
	## export as a GRanges object
	values(testResult) <- testInfo
	return(testResult)
}


identifySigDMR <- function(detectResult, p.adjust.method="fdr", pValueTh=0.01, fdrTh=pValueTh, diffTh=1, 
			minProbeNum=1, maxGap=2000, minGap=100, oppositeMethylation=FALSE, topNum=NULL) {
	
	if (is(detectResult, 'GRanges')) {
		detectResult.g <- detectResult
		detectResult <- data.frame(CHROMOSOME=as.character(seqnames(detectResult)), POSITION=start(detectResult), as(values(detectResult), 'data.frame'))
		rownames(detectResult) <- names(detectResult.g)
	} else if (is(detectResult, 'data.frame')) {
		if (!all(c("PROBEID","CHROMOSOME","POSITION") %in% names(detectResult)))
			stop(" PROBEID, CHROMOSOME and POSITION are required columns in the detectResult object!")
		# remove those probes lack of position information
		rmInd <- which(is.na(detectResult$POSITION))
		if (length(rmInd) > 0) {
			detectResult <- detectResult[-rmInd,]
		}
		## convert it as a GRanges object
		tmp <- with(detectResult, GRanges(seqnames=CHROMOSOME, ranges=IRanges(start=POSITION, end=POSITION), strand='*'))
		rmInd <- which(colnames(detectResult) %in% c('CHROMOSOME', 'POSITION', 'END', 'width'))
		values(tmp) <- detectResult[,-rmInd, drop=FALSE]
		detectResult.g <- tmp
		names(detectResult.g) <- rownames(detectResult)
	} else {
		stop('detectResult should be either a GRanges object or a data.frame with required columns!')
	}

	## No pValue constraint if we only interested in the top CpG-sites
	if (!is.null(topNum) && !is.na(topNum)) pValueTh <- 1
	
	## if there is no isSignificant column, then DMR should be identified first
	if (is.null(detectResult$isSignificant) || is.null(topNum) || is.na(topNum)) {
		detectResult <- .identifySigProbe(detectResult, p.adjust.method=p.adjust.method, pValueTh=pValueTh, fdrTh=fdrTh, diffTh=diffTh)
	}

	## Select the significant CpG-sites
	sigInd <- which(detectResult$isSignificant)

	## check the methylation direction of two conditions for each probe
	classMeanCol <- grep('^mean_', names(detectResult), value=TRUE)
	# if (length(classMeanCol) == 2) {
	# 	classMean <- detectResult[,classMeanCol]
	# 	if (oppositeMethylation) {
	# 		sigInd <- sigInd[classMean[sigInd,1] * classMean[sigInd,2] < 0]
	# 	}
	# } 

	if (!is.null(topNum) && !is.na(topNum)) {
		ord <- order(detectResult$p.value[sigInd], decreasing=FALSE)
		sigInd <- sigInd[ord]
		if (topNum < length(sigInd)) {
			rmInd <- sigInd[-(1:topNum)]
			detectResult$isSignificant[rmInd] <- FALSE
			sigInd <- sigInd[1:topNum]
		}
	} else {
		if (length(sigInd) == 0) {
			cat('No significant CpG-sites were identified based on current criteria!\n')
			return(NULL)
		}
	}
	sigProbe <- rownames(detectResult)[sigInd]
	detectResult$status <- rep(FALSE, nrow(detectResult))
	detectResult[sigProbe, 'status'] <- TRUE

	if (length(sigProbe) == 0) {
		return(list(sigDMRInfo=NULL, sigDataInfo=NULL))
	}

	## ---------------------------------------
	## identify the boundary of DMR (return a GRanges object)
	scoreFuns <- list(p.value=c(min=min), difference=c(max=function(x) x[which.max(abs(x))]), p.adjust=c(min=min), tscore=c(max=function(x) x[which.max(abs(x))]))
	scoreColumns <- c('p.value', 'difference', 'p.adjust', 'tscore', classMeanCol)
	if (is.data.frame(detectResult)) {
		scoreColumns <- scoreColumns[scoreColumns %in% names(detectResult)]
	} else {
		scoreColumns <- scoreColumns[scoreColumns %in% names(values(detectResult))]
	}
	scoreFuns <- scoreFuns[names(scoreFuns) %in% scoreColumns]
	sigDMRInfo.g <- getContinuousRegion(detectResult, scoreColumns=scoreColumns, 
							 scoreFuns=scoreFuns, maxGap=maxGap, minGap=minGap)

	sigDMRInfo <- as(sigDMRInfo.g, 'data.frame')
	rownames(sigDMRInfo) <- names(sigDMRInfo.g)
	
	## filtering based on the minimum number of probes in each DMR
	if (minProbeNum > 1) {
		numProbe <- sigDMRInfo$NumOfProbe
		sigDMRInfo <- sigDMRInfo[numProbe >= minProbeNum, ,drop=FALSE]
		sigDMRInfo.g <- sigDMRInfo.g[numProbe >= minProbeNum]
	}
	
	## check oppositeMethylation direction
	if (length(classMeanCol) == 2 && oppositeMethylation) {
		dmrMean <- sigDMRInfo[, classMeanCol]	
		sigDMRInfo.g <- sigDMRInfo.g[dmrMean[,1] * dmrMean[,2] < 0]
	}

	## only keep the sigDataInfo which overlaps with sigDMRInfo
	sigDataInfo.g <- detectResult.g[sigProbe]
	sigDataInfo.g <- subsetByOverlaps(sigDataInfo.g, sigDMRInfo.g)

	return(list(sigDMRInfo=sigDMRInfo.g, sigDataInfo=sigDataInfo.g))
}


## Get continuous region from discrete probe measurements
# getContinuousRegion <- function(detectResult, scoreColumns=NULL, scoreFuns=mean, maxGap=2000, minGap=100, as.GRanges=TRUE) {
getContinuousRegion <- function(detectResult, scoreColumns=NULL, scoreFuns=c(mean=mean), maxGap=2000, minGap=100) {

	if (is.data.frame(detectResult)) {
		if (!all(c('CHROMOSOME', 'POSITION', 'status') %in% names(detectResult))) {
			stop('CHROMOSOME, POSITION and status columns are required!')
		}
		detectResult.g <- with(detectResult, GRanges(seqnames=CHROMOSOME, ranges=IRanges(start=POSITION, end=POSITION), strand='*'))
		values(detectResult.g) <- detectResult[, !(colnames(detectResult) %in% c('CHROMOSOME', 'POSITION'))]
	} else if (is(detectResult, 'GRanges')) {
		detectResult.g <- detectResult
		detectResult <- data.frame(CHROMOSOME=as.character(seqnames(detectResult)), POSITION=as.numeric(start(detectResult)), as(values(detectResult), 'data.frame'))
	}
	detectResult$POSITION <- as.numeric(detectResult$POSITION)
	## sort in chromsome order
	detectResult <- with(detectResult, detectResult[order(CHROMOSOME, POSITION, decreasing=FALSE),])
	
	if (is.null(detectResult$status)) {
		stop('"status" column is required!')
	} else {
		if (!is.logical(detectResult$status)) {
			warning('"status" column should be a logical vector! "status > 0" was used in calculation!')
			detectResult$status <- detectResult$status > 0
		}
	}
	## set NA as FALSE
	detectResult$status[is.na(detectResult$status)] <- FALSE
	if (!is.null(scoreColumns)) {
		if (!all(scoreColumns %in% colnames(detectResult))) {
			stop('Some "scoreColumns" does not match "detectResult"!')
		}
		if (is.null(scoreFuns)) scoreFuns <- c(mean=mean)
		# if (length(scoreFuns) < length(scoreColumns)) {
		# 	scoreFuns <- rep(scoreFuns, length(scoreColumns))
		# }
	}
	
	## split data by Chromosome 
	detectResult.chrList <- split(detectResult, detectResult$CHROMOSOME)
	
	## ---------------------------------------
	## identify the boundary of DMR
	dmr.list <- lapply(detectResult.chrList, function(detectResult.i) {
		len.significant.i <- length(which(detectResult.i$status))
		if (len.significant.i == 0) return(NULL)
		
		allPosition.i <- detectResult.i$POSITION
		status.i <- as.numeric(detectResult.i$status)
		diff.i <- diff(status.i)
		
		# identify the diff region
		startInd.i <- which(diff.i == 1) + 1
		if (status.i[1] == 1) 
			startInd.i <- c(1, startInd.i)
		endInd.i <- which(diff.i == -1)
		if (length(endInd.i) < length(startInd.i)) 
			endInd.i <- c(endInd.i, length(status.i))
		
		## check the gaps of sparse probe 
		gaps.i <- which(c(FALSE, diff(allPosition.i) > maxGap) & status.i == 1) 
		# the gap should be 1 in both sides
		gaps.i <- gaps.i[status.i[gaps.i + 1] == 1]
		if (length(gaps.i) > 0) {
			startInd.i <- sort(c(startInd.i, gaps.i))
			endInd.i <- sort(c(endInd.i, gaps.i))
		}
		
		if ("startWinIndex" %in% colnames(detectResult.i)) {
			startRegion.i <- detectResult.i[startInd.i, "startWinIndex"]
			endRegion.i <- detectResult.i[endInd.i, "endWinIndex"] 
			startLoc.i <- detectResult.i[startRegion.i, 'POSITION']
			endLoc.i <- detectResult.i[endRegion.i, 'POSITION']
			width.i <- endLoc.i - startLoc.i + 1
			regionSummary.i <- data.frame(CHROMOSOME=detectResult.i[startRegion.i, 'CHROMOSOME'],
				startLocation=startLoc.i, endLocation=endLoc.i, width=width.i, startInd=startRegion.i, endInd=endRegion.i)
		} else {
			startLoc.i <- detectResult.i[startInd.i, "POSITION"]
			endLoc.i <- detectResult.i[endInd.i, "POSITION"] 
			width.i <- endLoc.i - startLoc.i + 1
			regionSummary.i <- data.frame(CHROMOSOME=detectResult.i[startInd.i, 'CHROMOSOME'],
				startLocation=startLoc.i, endLocation=endLoc.i, width=width.i, startInd=startInd.i,
				endInd=endInd.i)
		}
		return(regionSummary.i)
	})
	
	sigDMRInfo <- do.call('rbind', dmr.list)
	if (length(sigDMRInfo) == 0) return(GRanges())

	## convert as GRanges
	tmp <- with(sigDMRInfo, GRanges(seqnames=CHROMOSOME, ranges=IRanges(start=startLocation, end=endLocation), strand='*'))
	rmInd <- which(colnames(sigDMRInfo) %in% c('CHROMOSOME', 'startLocation', 'endLocation', 'width'))
	values(tmp) <- sigDMRInfo[,-rmInd, drop=FALSE]
	sigDMRInfo <- tmp

	## combine overlapping regions
	sigDMRInfo.r <- reduce(sigDMRInfo, min.gapwidth=minGap) 
	
	matchInfo <- GenomicRanges::findOverlaps(detectResult.g, sigDMRInfo.r, select="first")
	ind <- which(!is.na(matchInfo))
	dmr2ind <- split(ind, matchInfo[ind])
	dmr2len <- sapply(dmr2ind, length)
	dmrInfo <- data.frame(NumOfProbe=dmr2len)

	## calculate the mean score of each region	
	if (!is.null(scoreColumns)) {
		if (!all(sapply(scoreFuns, is.function))) {
			if (!all(names(scoreFuns) %in% scoreColumns))
				stop('The names of scoreFuns list should be from scoreColumns!')
		}

		detectValue <- as.matrix(as(values(detectResult.g), 'data.frame'))[,scoreColumns, drop=FALSE]
		detectValue <- matrix(as.numeric(detectValue), nrow=nrow(detectValue), ncol=ncol(detectValue), dimnames=dimnames(detectValue))
		dmr2score <- lapply(dmr2ind, function(ind.i) {
			value.i <- NULL
			## If scoreFuns is a named list, then the named scoreColumns will be treated differently
			if (all(sapply(scoreFuns, is.function))) {
				for (fun.i in scoreFuns) {
					value.i <- cbind(value.i, apply(detectValue[ind.i,,drop=FALSE], 2, fun.i))
				}
			} else {
				for (scoreCol.j in scoreColumns) {
					scoreFuns.j <- scoreFuns[[scoreCol.j]]
					if (is.null(scoreFuns.j)) {
						value.ij <- mean(detectValue[ind.i, scoreCol.j])
						if (!grepl('^mean', scoreCol.j)) {
							names(value.ij) <- paste('mean', scoreCol.j, sep='_')
						} else {
							names(value.ij) <- scoreCol.j
						}
					} else {
						value.ij <- sapply(scoreFuns.j, function(x) x(detectValue[ind.i, scoreCol.j]))
						names(value.ij) <- paste(names(scoreFuns.j), scoreCol.j, sep='_')
					}
					value.i <- c(value.i, value.ij)
				}
			} 
			return(value.i)
		})

		dmr2score <- do.call('rbind', dmr2score)
		rownames(dmr2score) <- names(dmr2ind) 
		avg.score <- matrix(NA, nrow=length(sigDMRInfo.r), ncol=ncol(dmr2score))
		if (is.null(colnames(dmr2score))) {
			scoreFunName <- names(scoreFuns)
			if (is.null(scoreFunName)) scoreFunName <- paste('Fun', 1:length(scoreFuns), sep='')
			colnames(avg.score) <- unlist(lapply(scoreFunName, paste, scoreColumns, sep='_'))
		} else {
			colnames(avg.score) <- colnames(dmr2score)
		}
		avg.score[as.numeric(rownames(dmr2score)),] <- dmr2score
		dmrInfo <- data.frame(dmrInfo, as.data.frame(avg.score))
	}
	values(sigDMRInfo.r) <- dmrInfo
	
	return(sigDMRInfo.r)
}


annotateGRanges <- function(grange, annotationDatabase, CpGInfo=NULL, exons=FALSE, flankRange=0, promoterRange=2000, checkGeneBody=FALSE, EntrezDB='org.Hs.eg.db') {
	
	if (!is(grange, 'GRanges')) {
		stop('grange should be a GRanges object!')
	}
	grange <- checkChrName(grange, addChr=TRUE)	
	if (!is.null(names(grange))) {
		if (any(duplicated(names(grange)))) names(grange) <- NULL
	}
	
	## load human genome information and check overlap with known genes
	if (is.character(annotationDatabase)) {
		if (file.exists(annotationDatabase)) {
			annotationDatabase <- loadDb(annotationDatabase)		
		} else if (require(annotationDatabase, character.only=TRUE)) {
			annotationDatabase <- get(annotationDatabase)
		} else {
			stop('Provided annotationDatabase does not exist!')
		}
	} 

	if (is(annotationDatabase, 'TxDb')) {
		tr <- transcripts(annotationDatabase, columns=c('gene_id', 'tx_id', 'tx_name'))		
	} else if (is(annotationDatabase, 'GRanges')) {
		tr <- annotationDatabase
	} else {
		stop('Wrong type of annotationDatabase! Please check help for more details.')
	}
	tr <- checkChrName(tr, addChr=TRUE)	
  ## make sure the consistence of the genome
	if (unique(genome(grange)) == '' || is.na(unique(genome(grange)))) genome(grange) <- unique(genome(tr))[1]

	if (is.logical(exons)) {
		if (exons && is(annotationDatabase, 'TxDb')) {
			exons <- exons(annotationDatabase, columns=c('exon_id', 'exon_name', 'exon_rank', 'tx_name'))
		} else {
			exons <- NULL
		}
	} else if (!is(exons, 'GRanges')) {
		stop('"exons" should be a logical or GRanges object!')
	}
	
	## filter the transcripts without matching gene ids
	tr <- tr[elementNROWS(values(tr)$gene_id) > 0]
	names(tr) <- values(tr)$tx_name
	tr <- checkChrName(tr, addChr=TRUE)
	tx2gene <- sub('GeneID:', '', unlist(values(tr)$gene_id))
	names(tx2gene) <- unlist(values(tr)$tx_name)

	if (is.character(CpGInfo)) {
		CpG.grange <- import.bed(CpGInfo[1])
	} else if (is(CpGInfo, 'GRanges')) {
		CpG.grange <- CpGInfo
	} else if (is.null(CpGInfo) || is.na(CpGInfo)) {
		CpG.grange <- NULL
	} else {
		stop('CpGInfo should be a bed file or a GRanges object!')
	}
	if (!is.null(CpGInfo)) CpGInfo <- checkChrName(CpGInfo, addChr=TRUE)

	## expand the interested GRanges
	if (flankRange != 0) {
		grange.ext <- resize(grange, width=width(grange) + 2*flankRange, fix="center")
	} else {
		grange.ext <- grange
	}

	if (is.null(names(grange.ext))) {
		values(grange.ext)$id <- make.unique(paste(seqnames(grange), start(grange), end(grange), sep="_"))
	} else {
		values(grange.ext)$id <- names(grange.ext)
	}

	if (!is.null(values(grange)$Transcript)) {
		txName.known <- values(grange)$Transcript
		txName.known[!(txName.known %in% names(tr))] <- NA
	} else {
		txName.known <- rep(NA, length(grange))
	}
	
	## ----------------------------------------------------------
	## Find the nearest TSS and related Transcript
	tss <- flank(tr, width=-1, start=TRUE)
	if (any(is.na(txName.known))) {
		naInd <- which(is.na(txName.known))
		# nearestInfo.tr <- IRanges::as.matrix(nearest(grange[naInd], tr, select="all"))
		# 
		# nearestTx <- tapply(nearestInfo.tr[,2], nearestInfo.tr[,1], function(ind) names(tr)[ind])
		# multiMapInd <- which(sapply(nearestTx, length) > 1)
		# if (length(multiMapInd) > 0) {
		# 	multiMapInd <- as.numeric(names(nearestTx)[multiMapInd])
		# 	for (i in multiMapInd) {
		# 		nearestTx.i <- nearestTx[[i]]
		# 		nearestTx[[i]] <- nearestTx.i[nearest(grange[naInd][i], tss[nearestTx.i])]
		# 	}
		# }
		# txName.known[naInd] <- unlist(nearestTx)
		naIndGrange = grange[naInd]
		nearestInfo.tr <- IRanges::as.matrix(nearest(naIndGrange, tr, select="all"))
		naIndGrange.expanded = naIndGrange[ nearestInfo.tr[,"queryHits"] ]
		stopifnot( identical(names(tss), names(tr) ) )
		tss.expanded = tss[ nearestInfo.tr[,"subjectHits"] ]
		nearestInfo.tr = cbind( nearestInfo.tr, TSSDist=abs( start(naIndGrange.expanded) - start(tss.expanded) ) )
		nearestInfo.tr = nearestInfo.tr[ order(nearestInfo.tr[,"queryHits"], nearestInfo.tr[,"TSSDist"]),]
		nearestInfo.tr = nearestInfo.tr[ !duplicated(nearestInfo.tr[,"queryHits"]),]
		txName.known[naInd] <- names(tss)[ nearestInfo.tr[,"subjectHits"] ]
		## Suggested modification from Peter Haverty in Sept. 2015
		
		values(grange)$Transcript <- txName.known
	}
	## add related EntrezID and GeneSymbol
	if (is.null(values(grange)$EntrezID)) {
		values(grange)$EntrezID <- tx2gene[values(grange)$Transcript]
	} else {
		values(grange)$EntrezID <- sub('^GeneID:', '', values(grange)$EntrezID)
	}
	if (is.null(values(grange)$GeneSymbol))
		values(grange)$GeneSymbol <- probe2gene(values(grange)$EntrezID, lib=EntrezDB, collapse=TRUE)

	## distance to TSS
	start2tss <- start(grange) - start(tss[txName.known]) 
	end2tss <- end(grange) - end(tss[txName.known]) 
	dist2tss <- pmin(abs(start2tss), abs(end2tss)) * sign(start2tss)
	dist2tss[sign(start2tss) * sign(end2tss) < 0] <- 0
	dist2tss[as.vector(strand(tss[txName.known])) == '-'] <- dist2tss[as.vector(strand(tss[txName.known])) == '-'] * -1
	values(grange)$distance2TSS <- dist2tss

	## nearest transcript based on TSS
	nearestInd.tss <- nearest(grange, tss)
	values(grange)$nearestTx <- values(tss[nearestInd.tss])$tx_name 
	suppressWarnings(dist2tss.nearestTx <- as.data.frame(distanceToNearest(grange, tss[nearestInd.tss]))[['distance']])
	
	## mark promoter based on distance to TSS
	promoterStatus <- rep(FALSE, length(dist2tss.nearestTx))
	promoterStatus[dist2tss <= 0 & abs(dist2tss) <= promoterRange] <- TRUE
	values(grange)$PROMOTER <- promoterStatus

	## ----------------------------------------------------------
	## find the overlapping with gene body
	if (checkGeneBody) {
		OL.gene <- findOverlaps(grange.ext, tr)	
		dmr2gene <- cbind(values(grange.ext)$id[queryHits(OL.gene)], 
			sapply(values(tr)$gene_id[subjectHits(OL.gene)], paste, collapse=';'))
		dupInd <- duplicated(paste(dmr2gene[,1], dmr2gene[,2], sep='_'))
		if (any(dupInd)) dmr2gene <- dmr2gene[!dupInd,,drop=FALSE]
		dmr2gene.list <- split(dmr2gene[,2], as.factor(dmr2gene[,1]))
		dmr2gene <- sapply(dmr2gene.list, function(x) {
			x <- sub('GeneID:', '', unique(x), ignore.case=TRUE)
			paste(probe2gene(x, lib=EntrezDB, collapse=TRUE), collapse=';')
		})
		dmr2ll <- sapply(dmr2gene.list, function(x) paste(unique(x), collapse=';'))
		
		## map to Tx_name
		dmr2tx <- cbind(values(grange.ext)$id[queryHits(OL.gene)], 
			sapply(values(tr)$tx_name[subjectHits(OL.gene)], paste, collapse=';'))
		dupInd <- duplicated(paste(dmr2tx[,1], dmr2tx[,2], sep='_'))
		if (any(dupInd)) dmr2tx <- dmr2tx[!dupInd,,drop=FALSE]
		dmr2tx.list <- split(dmr2tx[,2], as.factor(dmr2tx[,1]))
		dmr2tx <- sapply(dmr2tx.list, paste, collapse=';')

		mapInfo <- matrix("", nrow=length(grange.ext), ncol=3)
		colnames(mapInfo) <- c("EntrezID_GeneBody", "GeneSymbol_GeneBody", "Tx_GeneBody")
		rownames(mapInfo) <- values(grange.ext)$id
		mapInfo[names(dmr2ll), 'EntrezID_GeneBody'] <- dmr2ll
		mapInfo[names(dmr2gene), 'GeneSymbol_GeneBody'] <- dmr2gene
		mapInfo[names(dmr2tx), 'Tx_GeneBody'] <- dmr2tx
		values(grange) <- data.frame(as(values(grange), 'data.frame'), mapInfo)		
	}
	
	## ----------------------------------------------------------
	## find the overlapping with exons
	if (is(exons, 'GRanges')) {
		exons <- checkChrName(exons, addChr=TRUE)	
		OL.exons <- findOverlaps(grange.ext, exons)	
		hitExons <- exons[subjectHits(OL.exons)]
		if (length(hitExons) > 0) {
			## combine tx_name and exon_rank to get a new exon name
			if (all(is.na(values(hitExons)$exon_name))) {
				ll <- sapply(as.list(values(hitExons)$tx_name), length)
				exon_id <- values(hitExons)$exon_id
				exon_id <- rep(exon_id, ll)
				tx_name <- unlist(values(hitExons)$tx_name)
				exon_rank <- unlist(values(hitExons)$exon_rank)
				exon_name <- paste(tx_name, exon_rank, sep='_')
				exon_name <- split(exon_name, exon_id)
				values(hitExons)$exon_name <- sapply(exon_name, paste, collapse=',')
			}
			dmr2exons <- cbind(values(grange.ext)$id[queryHits(OL.exons)], 
				values(hitExons)$exon_name)

			dupInd <- duplicated(paste(dmr2exons[,1], dmr2exons[,2], sep='_'))
			if (any(dupInd)) {
				dmr2exons <- dmr2exons[!dupInd,,drop=FALSE]
				hitExons <- hitExons[!dupInd]
			}
			dmr2exons.list <- split(dmr2exons[,2], as.factor(dmr2exons[,1]))
			dmr2exons.flat <- sapply(dmr2exons.list, function(x) paste(unique(x), collapse=';'))

			uniId <- unique(values(grange.ext)$id)
			mapInfo <- matrix("", nrow=length(uniId), ncol=2)
			colnames(mapInfo) <- c("Exons", "Exon1")
			rownames(mapInfo) <- uniId
			mapInfo[names(dmr2exons.flat), 'Exons'] <- dmr2exons.flat

			exon1_status <- sapply(as.list(values(hitExons)$exon_rank), function(x) any(x== 1))
			if (any(exon1_status)) {
				id_exon1 <- dmr2exons[exon1_status,1]
				tx_exon1 <- sapply(as.list(values(hitExons)$tx_name[exon1_status]), head, 1)
				mapInfo[id_exon1, 'Exon1'] <- tx_exon1
			}
			values(grange) <- suppressWarnings(data.frame(as(values(grange), 'data.frame'), mapInfo[values(grange.ext)$id,])) 		
		} 
	}	
	
	## ----------------------------------------------------------
	## check overlap with CpG islands
	if (!is.null(CpGInfo)) {
		OL.CpG <- findOverlaps(grange.ext, CpG.grange)	
		dmr2CpG <- cbind(values(grange.ext)$id[queryHits(OL.CpG)], 
			unlist(values(CpG.grange)$name)[subjectHits(OL.CpG)])		

		mapInfo <- rep('', length(grange.ext))
		names(mapInfo) <- values(grange.ext)$id
		mapInfo[dmr2CpG[,1]] <- dmr2CpG[,2]
		values(grange) <- data.frame(as(values(grange), 'data.frame'), CpG_island=mapInfo) 		
	}
	
	return(grange)
}


## input of annotateDMR is the output of identifySigDMR
annotateDMRInfo <- function(DMRInfo, annotationDatabase, CpGInfo=NULL, flankRange=500, promoterRange=2000, EntrezDB='org.Hs.eg.db', as.GRanges=TRUE) {
	
	if (all(c('sigDMRInfo', 'sigDataInfo') %in% names(DMRInfo))) {
		sigDMRInfo <- DMRInfo$sigDMRInfo
		sigDataInfo <- DMRInfo$sigDataInfo
	} else if (is(DMRInfo, 'GRanges')) {
		sigDMRInfo <- DMRInfo
		sigDataInfo <- NULL
	} else {
		stop('DMRInfo should be a GRanges object or a list of GRanges objects (sigDMRInfo and sigDataInfo)!')
	}
	
	## load human genome information and check overlap with known genes
	if (is.character(annotationDatabase)) {
		if (file.exists(annotationDatabase)) {
			annotationDatabase <- loadDb(annotationDatabase)		
		} else if (require(annotationDatabase, character.only=TRUE)) {
			annotationDatabase <- get(annotationDatabase)
		} else {
			stop('Provided annotationDatabase does not exist!')
		}
	} 
	
	if (is(annotationDatabase, 'TxDb')) {
		## UCSC only includes reviewed genes!!!
		tr <- transcripts(annotationDatabase, columns=c('gene_id', 'tx_id', 'tx_name'))		
	} else if (is(annotationDatabase, 'GRanges')) {
		tr <- annotationDatabase
	} else {
		stop('Wrong type of annotationDatabase! Please check help for more details.')
	}

	## filter the transcripts without matching gene ids
	tr <- tr[elementNROWS(values(tr)$gene_id) > 0]
	names(tr) <- values(tr)$tx_id

	if (is.character(CpGInfo)) {
		CpG.grange <- import.bed(CpGInfo[1])
	} else if (is(CpGInfo, 'GRanges')) {
		CpG.grange <- CpGInfo
	} else if (is.null(CpGInfo) || is.na(CpGInfo)) {
		CpG.grange <- NULL
	} else {
		stop('CpGInfo should be a bed file or a GRanges object!')
	}
	
	## expand the transcript region
	## create a GRange object and check overlap
	if (!is.null(sigDMRInfo)) {
		sigDMRInfo <- annotateGRanges(sigDMRInfo, annotationDatabase=tr, CpGInfo=CpG.grange, flankRange=flankRange, promoterRange=promoterRange, EntrezDB=EntrezDB)
	}
	
	if (!is.null(sigDataInfo)) {
		sigDataInfo <- annotateGRanges(sigDataInfo, annotationDatabase=tr, CpGInfo=CpG.grange, flankRange=flankRange, promoterRange=promoterRange, EntrezDB=EntrezDB)
	}
	
	return(list(sigDMRInfo=sigDMRInfo, sigDataInfo=sigDataInfo))
}


export.DMRInfo	<- function(DMRInfo.ann, methyData=NULL, savePrefix='') {
	
	## output the DMR data
	sigDataInfo <- as.data.frame(DMRInfo.ann$sigDataInfo)
	sigProbe <- as.character(sigDataInfo$PROBEID)
	if (!is.null(methyData)) {
		sigDMRdataInfo <- cbind(sigDataInfo, exprs(methyData)[sigProbe, , drop=FALSE])
	} else {
		sigDMRdataInfo <- sigDataInfo
	}
	ord <- order(sigDMRdataInfo$seqnames, sigDMRdataInfo$start, sigDMRdataInfo$end, decreasing=FALSE)
	sigDMRdataInfo <- sigDMRdataInfo[ord,]
	
	# if (all(c('startWinIndex', 'endWinIndex') %in% colnames(sigDMRdataInfo))) {
	# 	numProbe <- sigDMRdataInfo[,'endWinIndex'] - sigDMRdataInfo[,'startWinIndex'] + 1
	# 	ind.start <- which(colnames(sigDMRdataInfo) == 'startWinIndex')
	# 	sigDMRdataInfo[,ind.start] <- numProbe
	# 	colnames(sigDMRdataInfo)[ind.start] <- 'numberOfProbe_SlidingWindow'
	# 	sigDMRdataInfo <- sigDMRdataInfo[,-which(colnames(sigDMRdataInfo) == 'endWinIndex')]
	# }
	sigDMRdataInfo <- data.frame(CHROMOSOME=sigDMRdataInfo$seqnames, POSITION=sigDMRdataInfo$start, sigDMRdataInfo[,-c(1:5)])
	## remove columns
	rmCol <- c('isSignificant', 'p.value.adj', 'startLocation', 'endLocation')
	sigDMRdataInfo <- sigDMRdataInfo[, !(colnames(sigDMRdataInfo) %in% rmCol)]
	
	write.csv(sigDMRdataInfo, file=paste('DMRdata_', savePrefix, '_', Sys.Date(), '.csv', sep=''), row.names=FALSE)

	
	## output the sigDMRInfo as a bed file
	fileName.i <- make.names(paste('DMRInfo_', savePrefix, '_', Sys.Date(), '.bed', sep=''))
	export(DMRInfo.ann$sigDMRInfo, fileName.i, format='bed')
	
	## output as a csv file
	sigDMRInfo <- as.data.frame(DMRInfo.ann$sigDMRInfo)
	ord <- order(sigDMRInfo$seqnames, sigDMRInfo$start, sigDMRInfo$end, decreasing=FALSE)
	sigDMRInfo <- sigDMRInfo[ord,]
	# if (all(c('startInd', 'endInd') %in% colnames(sigDMRInfo))) {
	# 	numProbe <- sigDMRInfo[,'endInd'] - sigDMRInfo[,'startInd'] + 1
	# 	ind.start <- which(colnames(sigDMRInfo) == 'startInd')
	# 	sigDMRInfo[,ind.start] <- numProbe
	# 	colnames(sigDMRInfo)[ind.start] <- 'numberOfProbe_SlidingWindow'
	# 	sigDMRInfo <- sigDMRInfo[,-which(colnames(sigDMRInfo) == 'endInd')]
	# }
	sigDMRInfo <- data.frame(CHROMOSOME=sigDMRInfo$seqnames, sigDMRInfo[,-1])
	## remove columns
	rmCol <- c('width', 'strand', 'isSignificant', 'p.value.adj')
	sigDMRInfo <- sigDMRInfo[, !(colnames(sigDMRInfo) %in% rmCol)]

	write.csv(sigDMRInfo, file=paste('DMRInfo_', savePrefix, '_', Sys.Date(), '.csv', sep=''), row.names=FALSE)
	

	return(invisible(TRUE))
}


## -----------------------------------------------------
## other internal functions
# detectResult is the return of detectDMR
.identifySigProbe <- function(detectResult, p.adjust.method="fdr", pValueTh=0.01, fdrTh=pValueTh, diffTh=log2(1.5)) {

	if (!is.numeric(pValueTh) && !is.na(pValueTh)) stop("pValueTh needs to be numeric.	Use 1 or NA if you want to turn it off.")
	if (!is.numeric(diffTh) && !is.na(pValueTh)) stop("diffTh needs to be numeric.	Use 0 or NA if you want to turn it off.")
	if (is.na(pValueTh)) pValueTh <- 1
	if (is.na(diffTh)) diffTh <- 0
	annotateColumn <- c("PROBEID","CHROMOSOME","POSITION")
	
	if (!all(annotateColumn %in% names(detectResult)))
		stop(" PROBEID, CHROMOSOME and POSITION are required columns!")
	
	annotateInd <- which(names(detectResult) %in% annotateColumn)
	# multiple testing correction
	p.val.cols <- grep("^p\\.",colnames(detectResult))
	p.val.adj.cols <- grep("p\\.adjust", colnames(detectResult))
	p.val.cols <- p.val.cols[!(p.val.cols %in% p.val.adj.cols)]
	if (length(p.val.cols) > 1) {
		p.value <- apply(detectResult[,p.val.cols], 1, min, na.rm=TRUE)
	} else {
		p.value <- detectResult[,p.val.cols]
	}
	p.value.adj <- p.adjust(p.value, method=p.adjust.method)
	
	if ('difference' %in% colnames(detectResult)) {
		max.diff <- detectResult[,'difference']
	} else {
		differences <- detectResult[, -c(annotateInd, p.val.cols, p.val.adj.cols, which(colnames(detectResult) %in% c("startWinIndex", "endWinIndex"))), drop=FALSE]
		max.diff <- rowMax(abs(as.matrix(differences)))
		max.diff <- max.diff * sign(differences[,1])
	}
	isSignificant <- rep(TRUE, length(max.diff))
	if (pValueTh < 1) isSignificant <- isSignificant & (p.value < pValueTh)
	if (p.adjust.method != 'none' && fdrTh < 1) isSignificant <- isSignificant & (p.value.adj < fdrTh)
	if (diffTh > 0) isSignificant <- isSignificant & (abs(max.diff) > diffTh)

	detectResult <- data.frame(detectResult, p.value.adj=p.value.adj, difference=max.diff, isSignificant=isSignificant)
	
	return(detectResult)
}


# This helper function sets up the positions for the sliding test based on the positions.
# Returns a list of index positions that fit within the sliding window with width winSize
.setupSlidingTests <- function(pos_data, winSize=250) {
	
	len <- length(pos_data)
	probesetList <- as.vector(seq(pos_data), mode='list')
	diff.pos.left <- c(Inf, diff(pos_data))
	growingHeight.left <- diff.pos.left
	growingStatus.left <- which(growingHeight.left <= winSize)
	diff.pos.right <- c(diff.pos.left[-1], Inf)
	growingHeight.right <- diff.pos.right
	growingStatus.right <- which(growingHeight.right <= winSize)
	iter <- 1
	## growing in both left and right directions until it reaches the half-window-size winSize
	while (length(c(growingStatus.left, growingStatus.right)) > 0) {

		growingHeight.left[-(1:iter)] <- growingHeight.left[-(1:iter)] + diff.pos.left[1:(len - iter)]		
		growingHeight.right[1:(len - iter)] <- growingHeight.right[1:(len - iter)] + diff.pos.right[-(1:iter)]
		
		## in left direction
		if (length(growingStatus.left) > 0) {
			probesetList[growingStatus.left] <- lapply(1:length(growingStatus.left), function(i)	 
				c(growingStatus.left[i] - iter, probesetList[[growingStatus.left[i]]]))
		}
		
		## in right direction
		if (length(growingStatus.right) > 0) {
			probesetList[growingStatus.right] <- lapply(1:length(growingStatus.right), function(i) c(probesetList[[growingStatus.right[i]]], growingStatus.right[i] + iter))
		}
		growingStatus.left <- which(growingHeight.left <= winSize)
		growingStatus.right <- which(growingHeight.right <= winSize)	
		
		iter <- iter + 1
	}

	return(probesetList)
}

probe2gene <- function(probe, lib="org.Hs.eg.db", simplify=TRUE, collapse=FALSE) {
	if (length(probe) == 0) return(NULL)
	if (!require(lib, character.only=TRUE)) stop(paste(lib, 'should be installed first!'))
	if (length(grep('.eg.db', lib)) > 0) {
		gene <- getEntrezAnnotation(probe, lib, from='eg', to='symbol')
		if (collapse) {
			gene <- sapply(gene,	paste, collapse=';')
		}				
	} else {
		llib <- paste(gsub('\\.db$', '', lib), 'SYMBOL', sep='')
		mapWithAllProbes <- do.call('toggleProbes', list(x=as.name(llib), value="all"))
		gene <- AnnotationDbi::mget(probe, mapWithAllProbes, ifnotfound=NA)
		if (collapse) {
			gene <- sapply(gene, paste, collapse=';')
		} else {
			if (simplify) gene <- sapply(gene, function(x) x[1])
		}
	}
	return(gene)
}


getEntrezAnnotation <- function(ll=NULL, lib=NULL, from='eg', to='symbol', species=c('human', 'rat', 'mouse', 'yeast', 'fly'), collapse=FALSE) {
	
	from <- toupper(from); to <- toupper(to)
	# from: 'eg', 'accnum', 'alias', 'chr', chrlengths', 'chrloc', 'enzyme', 'genename', 'go', 'map', 'mapcounts', 
	#	'path', 'pfam', 'pmid', 'prosite', 'refseq', 'symbol', 'unigene', 
	species <- match.arg(species)
	
	if (is.null(lib)) {
		lib = switch(species,
			'rat'='org.Rn.eg.db',
			'human'='org.Hs.eg.db',
			'mouse'='org.Mm.eg.db',
			'yeast'='org.Sc.eg.db',
			'fly'='org.Dm.eg.db'
			 )
	}
	if (!require(lib, character.only=TRUE)) {
		stop(paste('Package', lib, 'cannot be loaded!'))
	}

	ll.input <- ll
	na.ind <- which(is.na(ll) | ll == "")
	if (length(na.ind) > 0) {
		ll <- ll[-na.ind]
	}
		
	if (length(to) > 1) {
		allMapping <- NULL
		for (to.i in to) {
			mapping.i <- getEntrezAnnotation(ll, lib, from, to.i, species, collapse=TRUE)
			allMapping <- cbind(allMapping, mapping.i)
		}
		colnames(allMapping) <- to
		rownames(allMapping) <- ll
		return(allMapping)
	}
	libName <- sub('.db', '', lib)
	if (from == 'EG') {
		map <- paste(libName, to, sep='')		
	} else {
		map <- paste(libName, from, '2', to, sep='')		
		if (!exists(map)) {
			if (to == 'EG') {
				map <- paste(libName, from, sep='')
			}
			if (!exists(map)) stop(paste('Cannot find', map, 'in', lib, '!'))
		}
	}

	## 
	if (!is.null(ll)) {
		map <- AnnotationDbi::mget(ll, get(map), ifnotfound=NA)
	} else {
		map <- as.list(get(map))
	}
	if (to == 'EG' & !exists(paste(libName, from, '2', to, sep=''))) {
		len <- sapply(map, length)		
		map <- tapply(rep(names(map), len), unlist(map),	function(x) unique(x))
	}
	
	if (length(na.ind) > 0) {
		map.out <- rep(NA, length(ll.input))
		map.out[-na.ind] <- map
	} else {
		map.out <- map
	}

	if (collapse) {
		map.out <- sapply(map.out,	paste, collapse=';')
	}
	attr(map.out, 'lib') <- lib
	return(map.out)
}



## estimate the averaged methylation level within a CMR (GRanges) or a provided gene (transcript) elements based on methylation probe annotation
# methyProfile <- estimateCMR.methylation('NM_145201', methyGenoSet, estimateFun=mean, probeAnnotation=methyGrange, selectGeneElement=c('exon1', 'upstream500'))
# plot(m2beta(methyProfile), exprs(selExp)['NM_145201',], ylab='RNA-Seq expression (log2 count)', xlab='DNA methylation (Beta value)', col=rgb(1,0,0, alpha=0.35), pch=19)
estimateCMR.methylation <- function(cmr, methyGenoSet, estimateFun=mean, probeAnnotation=NULL, selectGeneElement=c('exon1', 'promoter'), mc.cores=min(12, detectCores())) {
	
	if (!is(methyGenoSet, 'GenoSet')) stop('"methyGenoSet" should be a "GenoSet" object!')
	if (length(cmr) > 1) {
		methy.cmr <- mclapply(1:length(cmr), function(i) {
			estimateCMR.methylation(cmr[i], methyGenoSet, estimateFun=estimateFun, probeAnnotation=probeAnnotation, selectGeneElement=selectGeneElement)
		}, mc.cores=mc.cores)
		methy.cmr <- do.call('rbind', methy.cmr)
		rownames(methy.cmr) <- names(cmr)
		return(methy.cmr)
	}
	
	if (is(cmr, 'GRanges')) {
		ol <- as.matrix(findOverlaps(cmr, rowRanges(methyGenoSet)))
		selProbe <- unique(featureNames(methyGenoSet)[ol[,2]])
	} else if (is.character(cmr)) {
		## when cmr is a gene Entrez ID
		if (is.null(probeAnnotation)) {
			probeAnnotation <- mcols(rowRanges(methyGenoSet))
		} else if (is(probeAnnotation, 'GRanges')) {
			allProbe <- names(probeAnnotation)
			probeAnnotation <- values(probeAnnotation)
			if (!is.null(allProbe))
				rownames(probeAnnotation) <- allProbe
		}

		selectGeneElement <- tolower(selectGeneElement)
		names(probeAnnotation) <- tolower(names(probeAnnotation))
		selectGeneElement <- intersect(selectGeneElement, names(probeAnnotation))
		if (length(selectGeneElement) == 0) stop('No "selectGeneElement" is provided!')
		allProbe <- intersect(rownames(probeAnnotation), featureNames(methyGenoSet))
		probeAnnotation <- as.matrix(probeAnnotation[allProbe,,drop=FALSE])
		methyGenoSet <- methyGenoSet[allProbe,]
		selProbe <- allProbe[apply(probeAnnotation[,selectGeneElement,drop=FALSE], 1, function(x) {
			return(cmr %in% unlist(strsplit(x, ';')))
		})]
	}

	methyProfile <- apply(assays(methyGenoSet)$exprs[selProbe,,drop=FALSE], 2, estimateFun)		

	attr(methyProfile, 'selProbe') <- selProbe
	return(methyProfile)
}
Any scripts or data that you put into this service are public.
methyAnalysis documentation built on Nov. 8, 2020, 8:09 p.m.
rdrr.io home R language documentation Run R code online
CRAN packages Bioconductor packages R-Forge packages GitHub packages
Note that we can't provide technical support on individual packages. You should contact the package authors for that.
methyAnalysis
DNA methylation data analysis and visualization

R/methyAnalysis.R
In methyAnalysis: DNA methylation data analysis and visualization

Defines functions estimateCMR.methylation getEntrezAnnotation probe2gene .setupSlidingTests .identifySigProbe export.DMRInfo annotateDMRInfo annotateGRanges getContinuousRegion identifySigDMR detectDMR.slideWin export.methyGenoSet smoothMethyData MethyLumiM2GenoSet getMethyProbeLocation

Documented in annotateDMRInfo annotateGRanges detectDMR.slideWin estimateCMR.methylation export.DMRInfo export.methyGenoSet getContinuousRegion identifySigDMR MethyLumiM2GenoSet smoothMethyData

Try the methyAnalysis package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

methyAnalysis DNA methylation data analysis and visualization

R/methyAnalysis.R In methyAnalysis: DNA methylation data analysis and visualization

Defines functions estimateCMR.methylation getEntrezAnnotation probe2gene .setupSlidingTests .identifySigProbe export.DMRInfo annotateDMRInfo annotateGRanges getContinuousRegion identifySigDMR detectDMR.slideWin export.methyGenoSet smoothMethyData MethyLumiM2GenoSet getMethyProbeLocation

Documented in annotateDMRInfo annotateGRanges detectDMR.slideWin estimateCMR.methylation export.DMRInfo export.methyGenoSet getContinuousRegion identifySigDMR MethyLumiM2GenoSet smoothMethyData

Try the methyAnalysis package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

methyAnalysis
DNA methylation data analysis and visualization

R/methyAnalysis.R
In methyAnalysis: DNA methylation data analysis and visualization