R/methyAnalysis.R

Defines functions estimateCMR.methylation getEntrezAnnotation probe2gene .setupSlidingTests .identifySigProbe export.DMRInfo annotateDMRInfo annotateGRanges getContinuousRegion identifySigDMR detectDMR.slideWin export.methyGenoSet smoothMethyData MethyLumiM2GenoSet getMethyProbeLocation

Documented in annotateDMRInfo annotateGRanges detectDMR.slideWin estimateCMR.methylation export.DMRInfo export.methyGenoSet getContinuousRegion identifySigDMR MethyLumiM2GenoSet smoothMethyData

## get the methylation probe location information, and return as a GRanges object
getMethyProbeLocation <- function(probes, lib="FDb.InfiniumMethylation.hg19") {
	
	if (!require(lib, character.only=TRUE)) stop(paste(lib, 'is not installed!'))
	if (is(probes, 'eSet')) probes <- rownames(probes)
	
	if (exists(lib)) {
		lib <- get(lib)
		if (is(lib, 'FeatureDb')) {
			allAnnotation <- features(lib)
		}
		if (any(probes %in% names(allAnnotation))) {
			probes <- probes[probes %in% names(allAnnotation)]
			warnings('Some probes does not exist in the annotation library!')
		}
		methyGrange <- allAnnotation[probes]
		values(methyGrange) <- DataFrame(ProbeID=probes)
		return(methyGrange)
	} 
	
	## For old annotation libraries: IlluminaHumanMethylation450k.db
	## Check whether multiple versions of chromosome information is available,
	## If so, only use the latest version.
	chrPattern <- paste(sub("\\.db$", "", lib), "CHR", sep="")
	chrObjs <- ls(paste("package:", lib, sep=""), pattern=paste(chrPattern, "[0-9]+$", sep=""))
	if (length(chrObjs) > 1) {
		chrVersion <- sub(".*[^0-9]([0-9]+)$", "\\1", chrObjs)
		obj <- get(chrObjs[which.max(as.numeric(chrVersion))])
	} else {
		obj <- get(chrPattern)
	}
	pp <- probes[probes %in% keys(obj)]
  chr[pp] <- sapply(AnnotationDbi::mget(pp, obj), function(x) x[1])

  obj <- get(paste(sub("\\.db$", "", lib), "CPGCOORDINATE", sep=""))
	pp <- probes[probes %in% keys(obj)]
  loc[pp] <- sapply(AnnotationDbi::mget(pp, obj), function(x) x[1])
	methyGrange <- GRanges(seqnames=chr, ranges=IRanges(start=loc, width=1), strand='*', ProbeID=probes)
	names(methyGrange) <- probes
	return(methyGrange)
}



## convert MethyLumiM class object to GenoSet class object
MethyLumiM2GenoSet <- function(methyLumiM, lib="FDb.InfiniumMethylation.hg19", bigMatrix=FALSE, dir.bigMatrix='.', savePrefix.bigMatrix) {
	oldFeatureData <- fData(methyLumiM)
	if (!identical(rownames(oldFeatureData), rownames(methyLumiM))) {
	  warnings('The rownames of featureData is inconsistent with the methyLumiM!')
  	rownames(oldFeatureData) <- rownames(methyLumiM)
	}
	methyLumiM <- addAnnotationInfo(methyLumiM, lib=lib)
	ff <- fData(methyLumiM)
	if (is.null(ff$CHROMOSOME))
		stop('Chromosome information is not available!\n')
		
	## retrieve the hgVersion information from the annotation library
  hgVersion <- ''
	if (require(lib, character.only=TRUE)) {
		dbinfo <- paste(sub('.db$', '', lib), '_dbInfo', sep='')
		if (exists(dbinfo)) {
			dbinfo <- do.call(dbinfo, list())
			rownames(dbinfo) <- dbinfo$name
			hgVersion <- strsplit(dbinfo['GPSOURCEURL', 'value'], '/')[[1]]
			hgVersion <- hgVersion[length(hgVersion)]
		} else if (class(get(lib)) == "FeatureDb") {
		  metaInfo <- metadata(get(lib))
			hgVersion <- metaInfo$value[metaInfo$name == 'Genome']
		}
	}

	## remove those without chromosome location
	rmInd <- (is.na(ff$CHROMOSOME) | ff$CHROMOSOME == '')
	if (any(rmInd)) {
		ff <- ff[!rmInd,]		
		cat(paste(length(which(rmInd)), 'probes were removed because of lack of chromosome location information!\n'))
	}
	ff$CHROMOSOME <- checkChrName(as.character(ff$CHROMOSOME))
	locdata <- GRanges(seqnames=ff$CHROMOSOME, ranges=IRanges(start=ff$POSITION, width=1, names=rownames(methyLumiM)))
	genome(locdata) <- hgVersion
	names(locdata) <- rownames(methyLumiM)[!rmInd]
	# sort the locdata
	locdata <- genoset::toGenomeOrder(locdata, strict=TRUE)

	if ((any(rmInd) && is(exprs(methyLumiM), 'BigMatrix')) || bigMatrix) {
		if (missing(savePrefix.bigMatrix)) {
			warnings('savePrefix.bigMatrix is missing! GenoSet_bigMatrixFiles is used!')
			savePrefix.bigMatrix <- 'GenoSet_bigMatrixFiles'
		}
	  dimNames <- list(names(locdata), colnames(methyLumiM))
		## predefine a big matrix
		methyLumiM.new <- lumi::asBigMatrix(methyLumiM[names(locdata),1], nCol=ncol(methyLumiM), dimNames=dimNames, saveDir=dir.bigMatrix, savePrefix=savePrefix.bigMatrix)
		## fill in the subsetted data
		for (ad.name in assayNames(methyLumiM.new)) {
      matrix.new <- assayElement(methyLumiM.new, ad.name)
      matrix.i <- assayElement(methyLumiM, ad.name)[names(locdata), ]
			matrix.new[, colnames(matrix.i)] <- matrix.i
		}
		methyLumiM <- methyLumiM.new		
		rm(methyLumiM.new)
	} else {
		# methyLumiM <- methyLumiM[!rmInd,]
		methyLumiM <- methyLumiM[names(locdata), ]
	}
	
	
	mcols(locdata) <- oldFeatureData[rownames(methyLumiM), ]
	methyGenoSet <- MethyGenoSet(rowRanges=locdata, pData=pData(methyLumiM), annotation=as.character(lib), assays=as.list(assayData(methyLumiM)))
	#fData(methyGenoSet) <- oldFeatureData[rownames(methyGenoSet), ]
	methyGenoSet@history <- methyLumiM@history
	
	## set smoothing attributes if exists
	if (!is.null(attr(methyLumiM, 'windowIndex')))
		attr(methyGenoSet, 'windowIndex') <- attr(methyLumiM, 'windowIndex')
	if (!is.null(attr(methyLumiM, 'windowRange')))
		attr(methyGenoSet, 'windowRange') <- attr(methyLumiM, 'windowRange')
	if (!is.null(attr(methyLumiM, 'windowSize')))
		attr(methyGenoSet, 'windowSize') <- attr(methyLumiM, 'windowSize')

	return(methyGenoSet)
}


## smooth the methylation data (MethyLumiM or GenoSet objects) using slide window with fixed windowsize (in bp)
smoothMethyData <- function(methyData, winSize=250, lib='FDb.InfiniumMethylation.hg19', p.value.detection.th=0.05, bigMatrix=FALSE, dir.bigMatrix='.', savePrefix.bigMatrix=NULL, ...) {

	if (!is(methyData, 'GenoSet') & !is(methyData, 'MethyLumiM')) {
		stop("methyData should be a GenoSet or MethyLumiM object!")
	}
	if (is(methyData, 'GenoSet')) {
		assaydata <- as.list(assays(methyData))
	} else {
		assaydata <- assayData(methyData)
	}
	if (is(methyData, 'MethyLumiM')) {
		chrInfo <- getChrInfo(methyData, lib=lib)
	} else {
		chrInfo <- data.frame(PROBEID=rownames(methyData), CHROMOSOME=genoset::chr(methyData), POSITION=start(methyData), END=end(methyData))
	}
	if (is.character(chrInfo$POSITION)) chrInfo$POSITION = as.numeric(chrInfo$POSITION)
	
	ratioData.old <- ratioData <- assaydata$exprs
	if (is(ratioData, 'BigMatrix')) { 
		bigMatrix <- TRUE
	}

	index <- 1:nrow(ratioData)
	# remove those probes lack of position information
	rmInd <- which(is.na(chrInfo$POSITION) | chrInfo$CHROMOSOME == '' )
	if (length(rmInd) > 0)	index <- index[-rmInd]
	
	# Sort the ratioData by chromosome and location
	isOrdered <- with(chrInfo, sapply(split(POSITION, CHROMOSOME), function(x) all(x == sort(x, decreasing=FALSE))))
	if (!all(isOrdered)) {
		ord <- order(chrInfo$CHROMOSOME[index], chrInfo$POSITION[index], decreasing=FALSE)
		index <- index[ord]
	}
	
	if (bigMatrix) {
		if (!is(ratioData, 'BigMatrix') | !is.null(savePrefix.bigMatrix) | !all(index == 1:length(index)))
			methyData <- asBigMatrix(methyData, rowInd=index, savePrefix=savePrefix.bigMatrix, saveDir=dir.bigMatrix)
	} else {
		methyData <- methyData[index,]
	}
	ratioData <- assaydata$exprs
	## set NA for those probes having high p-values
	if (!is.null(assaydata$detection)) {
	  naInd <- which(assaydata$detection > p.value.detection.th)
	  if (length(naInd) > 0) ratioData[naInd] <- NA
	}
	chrInfo <- chrInfo[index,]
	index <- 1:nrow(ratioData)
	
	# split data by Chromosome 
	index.chrList <- split(index, as.character(chrInfo$CHROMOSOME))
	chrInfoList <- split(chrInfo, as.character(chrInfo$CHROMOSOME))
	windowIndex <- vector(mode='list', length=length(chrInfoList))
	windowRange <- vector(mode='list', length=length(chrInfoList))
	
	for (i in 1:length(index.chrList)) {
		index.i <- index.chrList[[i]]
		chrInfo.i <- chrInfoList[[i]]
		chr.i <- chrInfo.i$CHROMOSOME[1]
		cat(paste("Smoothing Chromosome", chr.i, "...\n"))
    
		# return the index of sorted probes in each slide window
		windowIndex.i <- eval(call(".setupSlidingTests", pos_data=chrInfo.i$POSITION, winSize=winSize))
		names(windowIndex.i) <- chrInfoList[[i]]$PROBEID
		smooth.ratio.i <- sapply(windowIndex.i, function(ind) {
			# average the values in slide window and perform test
			smooth.ij <- colMeans(ratioData[index.i[ind],,drop=FALSE], na.rm=TRUE)
			return(smooth.ij)
		})
		smooth.ratio.i <- t(smooth.ratio.i) 
		## replace the data with smoothed ones	
		ratioData[index.i,] <- smooth.ratio.i
		
		windowIndex[[i]] <- windowIndex.i
		windowRange.i <- sapply(windowIndex.i, function(ind) {
			# average the values in slide window and perform test
			range.ij <- range(chrInfo.i$POSITION[ind])
			return(range.ij)
		})
		windowRange[[i]] <- t(windowRange.i)
	}
	windowRange <- do.call('rbind', windowRange)
	colnames(windowRange) <- c('startLocation', 'endLocation')

	exprs(methyData) <- ratioData
	if (is(ratioData, 'BigMatrix')) {
		assayData(methyData) <- bigmemoryExtras::attachAssayDataElements(assaydata)
	}
		
		
	# methyData <- suppressMessages(methyData[rownames(smooth.ratioData),colnames(smooth.ratioData)])
	# exprs(methyData) <- smooth.ratioData[rownames(methyData),]

	attr(methyData, 'windowIndex') <- windowIndex
	attr(methyData, 'windowRange') <- windowRange
	attr(methyData, 'windowSize') <- winSize
	return(methyData)
}


## export a MethyGenoSet object as a 'gct' (for IGV) or 'bw' (big-wig) file
export.methyGenoSet <- function(methyGenoSet, file.format=c('gct', 'bw'), exportValue=c('beta', 'M', 'intensity'), hgVersion.default='hg19', savePrefix=NULL, outputFile=NULL) {
	
	# exportValue <- match.arg(exportValue)
	exportValue <- exportValue[1]
	file.format <- match.arg(file.format)
	## get the annotation version
	hgVersion <- unname(genome(methyGenoSet)[1])
	if (is.null(hgVersion) || unique(hgVersion) == '') {
		warnings(paste('hgVersion information is not available in methyGenoSet!', hgVersion.default, ' will be used.'))
		genome(methyGenoSet) <- hgVersion <- hgVersion.default
	}
	
	# chr <- space(rowRanges(methyGenoSet))
	chr <- genoset::chr(methyGenoSet)
	start <- start(methyGenoSet)
	## Sort the rows of ratios.obj
	methyGenoSet <- methyGenoSet[order(chr, start),]
	
	## check overlap probes and average them
	locationID <- paste(chr, start, sep='_')
	dupInd <- which(duplicated(locationID))
	if (length(dupInd) > 0) {
		dupID <- unique(locationID[dupInd])
		for (dupID.i in dupID) {
			selInd.i <- which(locationID == dupID.i)
			exprs(methyGenoSet)[selInd.i[1],] <- colMeans(exprs(methyGenoSet)[selInd.i,], na.rm=TRUE)
		}
		methyGenoSet <- methyGenoSet[-dupInd,]
	}
	## attach chr prefix in the chromosome names if it does not include it
	if (length(grep('^chr', chr)) == 0) {
		names(rowRanges(methyGenoSet)) <- paste('chr', names(rowRanges(methyGenoSet)), sep='')
	}
	
	if (exportValue %in% c('beta', 'M')) {
  	methyData <- assayElement(methyGenoSet, 'exprs')
  	if (exportValue == 'beta') {
  		methyData <- m2beta(methyData) # - 0.5
  	} 
	} else if (exportValue %in% assayNames(methyGenoSet)) {
	  methyData <- assayElement(methyGenoSet, exportValue) 
	} else if (exportValue == 'intensity') {
	  methyData <- log2(assayElement(methyGenoSet, 'methylated') + assayElement(methyGenoSet, 'unmethylated') + 1)
	} else {
	  stop("exportValue doesn't exist in the data!")
	}
	
	## only keep 3 digit to save space
	methyData <- signif(methyData, 3)
	
	if (file.format == 'gct') {
		chrInfo.all <- data.frame(PROBEID=rownames(methyGenoSet), CHROMOSOME=genoset::chr(methyGenoSet), START=start(methyGenoSet), END=end(methyGenoSet),	stringsAsFactors=FALSE)

		# remove those probes lack of position information
		rmInd <- which(is.na(chrInfo.all$START))
		if (length(rmInd) > 0)	{
			chrInfo.all <- chrInfo.all[-rmInd,]
			methyData <- methyData[-rmInd,]
		}
		
		description <- with(chrInfo.all, paste("|@", CHROMOSOME, ":", START, "-", END, "|", sep=""))
		gct <- cbind(Name=chrInfo.all[,'PROBEID'], Description=description, methyData)
		
		## check version hgVersion is included in the filename
		if (is.null(outputFile))
		  outputFile <- paste(savePrefix, "_", exportValue, "_", hgVersion, ".gct", sep='')
		
		cat("#1.2\n", file=outputFile)
		cat(paste(dim(gct), collapse='\t', sep=''), "\n", file=outputFile, append=TRUE)
		suppressWarnings(write.table(gct, sep="\t", file=outputFile, row.names=FALSE, append=TRUE, quote=FALSE))
		return(invisible(outputFile))

	} else if (file.format == 'bw') {
 		chrInfo.all <- getChromInfoFromUCSC(hgVersion)
		rownames(chrInfo.all) <- chrInfo.all[,'chrom']
 
		samplenames <- colnames(methyGenoSet)
		pdata <- pData(methyGenoSet)
		chr.info <- chrInfo(methyGenoSet)
		seq.lengths <- chr.info[,"stop"] - chr.info[,"offset"]
		for (i in 1:ncol(methyData)) {		
			score.i <- methyData[,i]
			cn.data.i <- GRanges(seqnames=genoset::chr(methyGenoSet), ranges=IRanges(start=start(methyGenoSet),end=end(methyGenoSet)), strand='*', score=score.i)
			genome(cn.data.i) <- hgVersion
			cn.data.i <- cn.data.i[!is.na(score.i), ]
			if (savePrefix == '' || is.null(savePrefix)) {
				savePrefix.i <- samplenames[i]
			} else {
				savePrefix.i <- paste(savePrefix, samplenames[i], sep='_')
			}
			
			seqlengths(cn.data.i) <- chrInfo.all[as.character(seqlevels(cn.data.i)), 'size']
			if (length(outputFile) == ncol(methyData)) {
        filename.i <- outputFile[i]
			} else {
  			filename.i <- paste(savePrefix.i, "_", exportValue, "_", hgVersion[1], ".bw", sep='')
			}
			export.bw(cn.data.i, filename.i, dataFormat="auto")
		}
		return(invisible(filename.i))
	}
	
}


# This is the sliding test
## ratios.obj inlcudes c("PROBEID","CHROMOSOME","POSITION"), values 
## winSize is the half slide window size, by default it is 250
## 
detectDMR.slideWin <- function(methyGenoSet, sampleType, winSize=250, testMethod=c('ttest', 'wilcox'), p.adjust.method='fdr', p.value.detection.th=0.05, ...) {
	
	testMethod <- match.arg(testMethod)
	if (is(methyGenoSet, 'MethyLumiM')) {
		methyGenoSet <- MethyLumiM2GenoSet(methyGenoSet, ...)
	}
	
	smooth <- TRUE
	if (!is.null(attr(methyGenoSet, 'windowIndex'))) {
		smooth <- FALSE
		if (!is.null(attr(methyGenoSet, 'windowSize'))) {
			if (attr(methyGenoSet, 'windowSize') != winSize) smooth <- TRUE
		}
	} 
	if (smooth) {
		methyGenoSet <- smoothMethyData(methyGenoSet, winSize=winSize, p.value.detection.th=p.value.detection.th, ...)
	}
	windowIndex <- attr(methyGenoSet, 'windowIndex')	
	windowRange <- attr(methyGenoSet, 'windowRange')
	
	chrInfo <- suppressWarnings(as(rowRanges(methyGenoSet), 'GRanges'))
	smoothData <- assays(methyGenoSet)$exprs
	## set NA for those probes having high p-values
	if (!is.null(assays(methyGenoSet)$detection)) {
	  naInd <- which(assays(methyGenoSet)$detection > p.value.detection.th)
	  if (length(naInd) > 0) smoothData[naInd] <- NA
	}
	
	## calculate class means
	uniClass <- unique(sampleType)
	selInd1 <- which(sampleType == uniClass[1])
	selInd2 <- which(sampleType == uniClass[2])
	classMean <- data.frame(rowMeans(smoothData[, selInd1, drop=FALSE], na.rm=TRUE), rowMeans(smoothData[, selInd2, drop=FALSE], na.rm=TRUE))
	names(classMean) <- paste('mean_', uniClass, sep='')

	if (!is.factor(sampleType)) sampleType <- as.factor(sampleType)

	## perform t-test
	if (is.function(testMethod)) {
		testInfo <- testMethod(smoothData, sampleType, ... )
	} else {
		if (testMethod == 'ttest') {
		  naInd <- which(apply(smoothData, 1, function(x) any(is.na(x))))
		  if (length(naInd) > 0) {
        ## do t.test for those with nas
  			p.value <- difference <- tscore <- rep(NA, nrow(smoothData))
  			## check the non-NA sample number for each class
  			nonNA.num <- apply(smoothData[naInd,,drop=FALSE], 1, function(x) table(sampleType[!is.na(x)]))
  			## only include those with more than 2 samples in each class
  			tooFewSample.ind <- (apply(nonNA.num, 2, min) < 2)
  			if (length(naInd[!tooFewSample.ind]) > 0) {
          t.test.res <- apply(smoothData[naInd[!tooFewSample.ind],,drop=FALSE], 1, function(x) t.test(x ~ sampleType))
    			p.value[naInd[!tooFewSample.ind]] <- sapply(t.test.res, function(x) x$p.value)
    			difference[naInd[!tooFewSample.ind]] <- sapply(t.test.res, function(x) x$estimate[1] -  x$estimate[2])
    			tscore[naInd[!tooFewSample.ind]] <- sapply(t.test.res, function(x) x$statistic)
  			}
  			tstats <- rowttests(smoothData[-naInd,,drop=FALSE], sampleType)
  			p.value[-naInd] <- tstats$p.value
  			difference[-naInd] <- tstats$dm
  			tscore[-naInd] <- tstats$statistic
		  } else {
  			tstats <- rowttests(smoothData, sampleType)
  			p.value <- tstats$p.value
  			difference <- tstats$dm
  			tscore <- tstats$statistic
		  }
		} else if (testMethod == 'wilcox') {
			tmp <- split(as.data.frame(t(smoothData)), sampleType)
			xx <- t(tmp[[1]])
			yy <- t(tmp[[2]])
			p.value <- apply(cbind(xx, yy), 1, function(x) wilcox.test(x[1:ncol(xx)], x[(ncol(xx)+1):length(x)])$p.value, ...)
			difference <- classMean[,1]	- classMean[,2]	
		} else {
			stop('Unsupported "testMethod"!')
		}
		p.adjust <- p.adjust(p.value, method=p.adjust.method)
		testInfo <- data.frame(PROBEID=rownames(smoothData), difference=difference, p.value=p.value, p.adjust=p.adjust)
		if (testMethod == 'ttest') testInfo <- data.frame(testInfo, tscore=tscore)
	}

	startInd <- unlist(lapply(windowIndex, function(x) sapply(x, min)))[rownames(smoothData)]
	endInd <- unlist(lapply(windowIndex, function(x) sapply(x, max)))[rownames(smoothData)]
	testInfo <- data.frame(testInfo, startWinIndex=startInd, endWinIndex=endInd, windowRange, classMean)
	testResult <- chrInfo
	
	## export as a GRanges object
	values(testResult) <- testInfo
	return(testResult)
}


identifySigDMR <- function(detectResult, p.adjust.method="fdr", pValueTh=0.01, fdrTh=pValueTh, diffTh=1, 
			minProbeNum=1, maxGap=2000, minGap=100, oppositeMethylation=FALSE, topNum=NULL) {
	
	if (is(detectResult, 'GRanges')) {
		detectResult.g <- detectResult
		detectResult <- data.frame(CHROMOSOME=as.character(seqnames(detectResult)), POSITION=start(detectResult), as(values(detectResult), 'data.frame'))
		rownames(detectResult) <- names(detectResult.g)
	} else if (is(detectResult, 'data.frame')) {
		if (!all(c("PROBEID","CHROMOSOME","POSITION") %in% names(detectResult)))
			stop(" PROBEID, CHROMOSOME and POSITION are required columns in the detectResult object!")
		# remove those probes lack of position information
		rmInd <- which(is.na(detectResult$POSITION))
		if (length(rmInd) > 0) {
			detectResult <- detectResult[-rmInd,]
		}
		## convert it as a GRanges object
		tmp <- with(detectResult, GRanges(seqnames=CHROMOSOME, ranges=IRanges(start=POSITION, end=POSITION), strand='*'))
		rmInd <- which(colnames(detectResult) %in% c('CHROMOSOME', 'POSITION', 'END', 'width'))
		values(tmp) <- detectResult[,-rmInd, drop=FALSE]
		detectResult.g <- tmp
		names(detectResult.g) <- rownames(detectResult)
	} else {
		stop('detectResult should be either a GRanges object or a data.frame with required columns!')
	}

	## No pValue constraint if we only interested in the top CpG-sites
	if (!is.null(topNum) && !is.na(topNum)) pValueTh <- 1
	
	## if there is no isSignificant column, then DMR should be identified first
	if (is.null(detectResult$isSignificant) || is.null(topNum) || is.na(topNum)) {
		detectResult <- .identifySigProbe(detectResult, p.adjust.method=p.adjust.method, pValueTh=pValueTh, fdrTh=fdrTh, diffTh=diffTh)
	}

	## Select the significant CpG-sites
	sigInd <- which(detectResult$isSignificant)

	## check the methylation direction of two conditions for each probe
	classMeanCol <- grep('^mean_', names(detectResult), value=TRUE)
	# if (length(classMeanCol) == 2) {
	# 	classMean <- detectResult[,classMeanCol]
	# 	if (oppositeMethylation) {
	# 		sigInd <- sigInd[classMean[sigInd,1] * classMean[sigInd,2] < 0]
	# 	}
	# } 

	if (!is.null(topNum) && !is.na(topNum)) {
		ord <- order(detectResult$p.value[sigInd], decreasing=FALSE)
		sigInd <- sigInd[ord]
		if (topNum < length(sigInd)) {
			rmInd <- sigInd[-(1:topNum)]
			detectResult$isSignificant[rmInd] <- FALSE
			sigInd <- sigInd[1:topNum]
		}
	} else {
		if (length(sigInd) == 0) {
			cat('No significant CpG-sites were identified based on current criteria!\n')
			return(NULL)
		}
	}
	sigProbe <- rownames(detectResult)[sigInd]
	detectResult$status <- rep(FALSE, nrow(detectResult))
	detectResult[sigProbe, 'status'] <- TRUE

	if (length(sigProbe) == 0) {
		return(list(sigDMRInfo=NULL, sigDataInfo=NULL))
	}

	## ---------------------------------------
	## identify the boundary of DMR (return a GRanges object)
	scoreFuns <- list(p.value=c(min=min), difference=c(max=function(x) x[which.max(abs(x))]), p.adjust=c(min=min), tscore=c(max=function(x) x[which.max(abs(x))]))
	scoreColumns <- c('p.value', 'difference', 'p.adjust', 'tscore', classMeanCol)
	if (is.data.frame(detectResult)) {
		scoreColumns <- scoreColumns[scoreColumns %in% names(detectResult)]
	} else {
		scoreColumns <- scoreColumns[scoreColumns %in% names(values(detectResult))]
	}
	scoreFuns <- scoreFuns[names(scoreFuns) %in% scoreColumns]
	sigDMRInfo.g <- getContinuousRegion(detectResult, scoreColumns=scoreColumns, 
							 scoreFuns=scoreFuns, maxGap=maxGap, minGap=minGap)

	sigDMRInfo <- as(sigDMRInfo.g, 'data.frame')
	rownames(sigDMRInfo) <- names(sigDMRInfo.g)
	
	## filtering based on the minimum number of probes in each DMR
	if (minProbeNum > 1) {
		numProbe <- sigDMRInfo$NumOfProbe
		sigDMRInfo <- sigDMRInfo[numProbe >= minProbeNum, ,drop=FALSE]
		sigDMRInfo.g <- sigDMRInfo.g[numProbe >= minProbeNum]
	}
	
	## check oppositeMethylation direction
	if (length(classMeanCol) == 2 && oppositeMethylation) {
		dmrMean <- sigDMRInfo[, classMeanCol]	
		sigDMRInfo.g <- sigDMRInfo.g[dmrMean[,1] * dmrMean[,2] < 0]
	}

	## only keep the sigDataInfo which overlaps with sigDMRInfo
	sigDataInfo.g <- detectResult.g[sigProbe]
	sigDataInfo.g <- subsetByOverlaps(sigDataInfo.g, sigDMRInfo.g)

	return(list(sigDMRInfo=sigDMRInfo.g, sigDataInfo=sigDataInfo.g))
}


## Get continuous region from discrete probe measurements
# getContinuousRegion <- function(detectResult, scoreColumns=NULL, scoreFuns=mean, maxGap=2000, minGap=100, as.GRanges=TRUE) {
getContinuousRegion <- function(detectResult, scoreColumns=NULL, scoreFuns=c(mean=mean), maxGap=2000, minGap=100) {

	if (is.data.frame(detectResult)) {
		if (!all(c('CHROMOSOME', 'POSITION', 'status') %in% names(detectResult))) {
			stop('CHROMOSOME, POSITION and status columns are required!')
		}
		detectResult.g <- with(detectResult, GRanges(seqnames=CHROMOSOME, ranges=IRanges(start=POSITION, end=POSITION), strand='*'))
		values(detectResult.g) <- detectResult[, !(colnames(detectResult) %in% c('CHROMOSOME', 'POSITION'))]
	} else if (is(detectResult, 'GRanges')) {
		detectResult.g <- detectResult
		detectResult <- data.frame(CHROMOSOME=as.character(seqnames(detectResult)), POSITION=as.numeric(start(detectResult)), as(values(detectResult), 'data.frame'))
	}
	detectResult$POSITION <- as.numeric(detectResult$POSITION)
	## sort in chromsome order
	detectResult <- with(detectResult, detectResult[order(CHROMOSOME, POSITION, decreasing=FALSE),])
	
	if (is.null(detectResult$status)) {
		stop('"status" column is required!')
	} else {
		if (!is.logical(detectResult$status)) {
			warning('"status" column should be a logical vector! "status > 0" was used in calculation!')
			detectResult$status <- detectResult$status > 0
		}
	}
	## set NA as FALSE
	detectResult$status[is.na(detectResult$status)] <- FALSE
	if (!is.null(scoreColumns)) {
		if (!all(scoreColumns %in% colnames(detectResult))) {
			stop('Some "scoreColumns" does not match "detectResult"!')
		}
		if (is.null(scoreFuns)) scoreFuns <- c(mean=mean)
		# if (length(scoreFuns) < length(scoreColumns)) {
		# 	scoreFuns <- rep(scoreFuns, length(scoreColumns))
		# }
	}
	
	## split data by Chromosome 
	detectResult.chrList <- split(detectResult, detectResult$CHROMOSOME)
	
	## ---------------------------------------
	## identify the boundary of DMR
	dmr.list <- lapply(detectResult.chrList, function(detectResult.i) {
		len.significant.i <- length(which(detectResult.i$status))
		if (len.significant.i == 0) return(NULL)
		
		allPosition.i <- detectResult.i$POSITION
		status.i <- as.numeric(detectResult.i$status)
		diff.i <- diff(status.i)
		
		# identify the diff region
		startInd.i <- which(diff.i == 1) + 1
		if (status.i[1] == 1) 
			startInd.i <- c(1, startInd.i)
		endInd.i <- which(diff.i == -1)
		if (length(endInd.i) < length(startInd.i)) 
			endInd.i <- c(endInd.i, length(status.i))
		
		## check the gaps of sparse probe 
		gaps.i <- which(c(FALSE, diff(allPosition.i) > maxGap) & status.i == 1) 
		# the gap should be 1 in both sides
		gaps.i <- gaps.i[status.i[gaps.i + 1] == 1]
		if (length(gaps.i) > 0) {
			startInd.i <- sort(c(startInd.i, gaps.i))
			endInd.i <- sort(c(endInd.i, gaps.i))
		}
		
		if ("startWinIndex" %in% colnames(detectResult.i)) {
			startRegion.i <- detectResult.i[startInd.i, "startWinIndex"]
			endRegion.i <- detectResult.i[endInd.i, "endWinIndex"] 
			startLoc.i <- detectResult.i[startRegion.i, 'POSITION']
			endLoc.i <- detectResult.i[endRegion.i, 'POSITION']
			width.i <- endLoc.i - startLoc.i + 1
			regionSummary.i <- data.frame(CHROMOSOME=detectResult.i[startRegion.i, 'CHROMOSOME'],
				startLocation=startLoc.i, endLocation=endLoc.i, width=width.i, startInd=startRegion.i, endInd=endRegion.i)
		} else {
			startLoc.i <- detectResult.i[startInd.i, "POSITION"]
			endLoc.i <- detectResult.i[endInd.i, "POSITION"] 
			width.i <- endLoc.i - startLoc.i + 1
			regionSummary.i <- data.frame(CHROMOSOME=detectResult.i[startInd.i, 'CHROMOSOME'],
				startLocation=startLoc.i, endLocation=endLoc.i, width=width.i, startInd=startInd.i,
				endInd=endInd.i)
		}
		return(regionSummary.i)
	})
	
	sigDMRInfo <- do.call('rbind', dmr.list)
	if (length(sigDMRInfo) == 0) return(GRanges())

	## convert as GRanges
	tmp <- with(sigDMRInfo, GRanges(seqnames=CHROMOSOME, ranges=IRanges(start=startLocation, end=endLocation), strand='*'))
	rmInd <- which(colnames(sigDMRInfo) %in% c('CHROMOSOME', 'startLocation', 'endLocation', 'width'))
	values(tmp) <- sigDMRInfo[,-rmInd, drop=FALSE]
	sigDMRInfo <- tmp

	## combine overlapping regions
	sigDMRInfo.r <- reduce(sigDMRInfo, min.gapwidth=minGap) 
	
	matchInfo <- GenomicRanges::findOverlaps(detectResult.g, sigDMRInfo.r, select="first")
	ind <- which(!is.na(matchInfo))
	dmr2ind <- split(ind, matchInfo[ind])
	dmr2len <- sapply(dmr2ind, length)
	dmrInfo <- data.frame(NumOfProbe=dmr2len)

	## calculate the mean score of each region	
	if (!is.null(scoreColumns)) {
		if (!all(sapply(scoreFuns, is.function))) {
			if (!all(names(scoreFuns) %in% scoreColumns))
				stop('The names of scoreFuns list should be from scoreColumns!')
		}

		detectValue <- as.matrix(as(values(detectResult.g), 'data.frame'))[,scoreColumns, drop=FALSE]
		detectValue <- matrix(as.numeric(detectValue), nrow=nrow(detectValue), ncol=ncol(detectValue), dimnames=dimnames(detectValue))
		dmr2score <- lapply(dmr2ind, function(ind.i) {
			value.i <- NULL
			## If scoreFuns is a named list, then the named scoreColumns will be treated differently
			if (all(sapply(scoreFuns, is.function))) {
				for (fun.i in scoreFuns) {
					value.i <- cbind(value.i, apply(detectValue[ind.i,,drop=FALSE], 2, fun.i))
				}
			} else {
				for (scoreCol.j in scoreColumns) {
					scoreFuns.j <- scoreFuns[[scoreCol.j]]
					if (is.null(scoreFuns.j)) {
						value.ij <- mean(detectValue[ind.i, scoreCol.j])
						if (!grepl('^mean', scoreCol.j)) {
							names(value.ij) <- paste('mean', scoreCol.j, sep='_')
						} else {
							names(value.ij) <- scoreCol.j
						}
					} else {
						value.ij <- sapply(scoreFuns.j, function(x) x(detectValue[ind.i, scoreCol.j]))
						names(value.ij) <- paste(names(scoreFuns.j), scoreCol.j, sep='_')
					}
					value.i <- c(value.i, value.ij)
				}
			} 
			return(value.i)
		})

		dmr2score <- do.call('rbind', dmr2score)
		rownames(dmr2score) <- names(dmr2ind) 
		avg.score <- matrix(NA, nrow=length(sigDMRInfo.r), ncol=ncol(dmr2score))
		if (is.null(colnames(dmr2score))) {
			scoreFunName <- names(scoreFuns)
			if (is.null(scoreFunName)) scoreFunName <- paste('Fun', 1:length(scoreFuns), sep='')
			colnames(avg.score) <- unlist(lapply(scoreFunName, paste, scoreColumns, sep='_'))
		} else {
			colnames(avg.score) <- colnames(dmr2score)
		}
		avg.score[as.numeric(rownames(dmr2score)),] <- dmr2score
		dmrInfo <- data.frame(dmrInfo, as.data.frame(avg.score))
	}
	values(sigDMRInfo.r) <- dmrInfo
	
	return(sigDMRInfo.r)
}


annotateGRanges <- function(grange, annotationDatabase, CpGInfo=NULL, exons=FALSE, flankRange=0, promoterRange=2000, checkGeneBody=FALSE, EntrezDB='org.Hs.eg.db') {
	
	if (!is(grange, 'GRanges')) {
		stop('grange should be a GRanges object!')
	}
	grange <- checkChrName(grange, addChr=TRUE)	
	if (!is.null(names(grange))) {
		if (any(duplicated(names(grange)))) names(grange) <- NULL
	}
	
	## load human genome information and check overlap with known genes
	if (is.character(annotationDatabase)) {
		if (file.exists(annotationDatabase)) {
			annotationDatabase <- loadDb(annotationDatabase)		
		} else if (require(annotationDatabase, character.only=TRUE)) {
			annotationDatabase <- get(annotationDatabase)
		} else {
			stop('Provided annotationDatabase does not exist!')
		}
	} 

	if (is(annotationDatabase, 'TxDb')) {
		tr <- transcripts(annotationDatabase, columns=c('gene_id', 'tx_id', 'tx_name'))		
	} else if (is(annotationDatabase, 'GRanges')) {
		tr <- annotationDatabase
	} else {
		stop('Wrong type of annotationDatabase! Please check help for more details.')
	}
	tr <- checkChrName(tr, addChr=TRUE)	
  ## make sure the consistence of the genome
	if (unique(genome(grange)) == '' || is.na(unique(genome(grange)))) genome(grange) <- unique(genome(tr))[1]

	if (is.logical(exons)) {
		if (exons && is(annotationDatabase, 'TxDb')) {
			exons <- exons(annotationDatabase, columns=c('exon_id', 'exon_name', 'exon_rank', 'tx_name'))
		} else {
			exons <- NULL
		}
	} else if (!is(exons, 'GRanges')) {
		stop('"exons" should be a logical or GRanges object!')
	}
	
	## filter the transcripts without matching gene ids
	tr <- tr[elementNROWS(values(tr)$gene_id) > 0]
	names(tr) <- values(tr)$tx_name
	tr <- checkChrName(tr, addChr=TRUE)
	tx2gene <- sub('GeneID:', '', unlist(values(tr)$gene_id))
	names(tx2gene) <- unlist(values(tr)$tx_name)

	if (is.character(CpGInfo)) {
		CpG.grange <- import.bed(CpGInfo[1])
	} else if (is(CpGInfo, 'GRanges')) {
		CpG.grange <- CpGInfo
	} else if (is.null(CpGInfo) || is.na(CpGInfo)) {
		CpG.grange <- NULL
	} else {
		stop('CpGInfo should be a bed file or a GRanges object!')
	}
	if (!is.null(CpGInfo)) CpGInfo <- checkChrName(CpGInfo, addChr=TRUE)

	## expand the interested GRanges
	if (flankRange != 0) {
		grange.ext <- resize(grange, width=width(grange) + 2*flankRange, fix="center")
	} else {
		grange.ext <- grange
	}

	if (is.null(names(grange.ext))) {
		values(grange.ext)$id <- make.unique(paste(seqnames(grange), start(grange), end(grange), sep="_"))
	} else {
		values(grange.ext)$id <- names(grange.ext)
	}

	if (!is.null(values(grange)$Transcript)) {
		txName.known <- values(grange)$Transcript
		txName.known[!(txName.known %in% names(tr))] <- NA
	} else {
		txName.known <- rep(NA, length(grange))
	}
	
	## ----------------------------------------------------------
	## Find the nearest TSS and related Transcript
	tss <- flank(tr, width=-1, start=TRUE)
	if (any(is.na(txName.known))) {
		naInd <- which(is.na(txName.known))
		# nearestInfo.tr <- IRanges::as.matrix(nearest(grange[naInd], tr, select="all"))
		# 
		# nearestTx <- tapply(nearestInfo.tr[,2], nearestInfo.tr[,1], function(ind) names(tr)[ind])
		# multiMapInd <- which(sapply(nearestTx, length) > 1)
		# if (length(multiMapInd) > 0) {
		# 	multiMapInd <- as.numeric(names(nearestTx)[multiMapInd])
		# 	for (i in multiMapInd) {
		# 		nearestTx.i <- nearestTx[[i]]
		# 		nearestTx[[i]] <- nearestTx.i[nearest(grange[naInd][i], tss[nearestTx.i])]
		# 	}
		# }
		# txName.known[naInd] <- unlist(nearestTx)
		naIndGrange = grange[naInd]
		nearestInfo.tr <- IRanges::as.matrix(nearest(naIndGrange, tr, select="all"))
		naIndGrange.expanded = naIndGrange[ nearestInfo.tr[,"queryHits"] ]
		stopifnot( identical(names(tss), names(tr) ) )
		tss.expanded = tss[ nearestInfo.tr[,"subjectHits"] ]
		nearestInfo.tr = cbind( nearestInfo.tr, TSSDist=abs( start(naIndGrange.expanded) - start(tss.expanded) ) )
		nearestInfo.tr = nearestInfo.tr[ order(nearestInfo.tr[,"queryHits"], nearestInfo.tr[,"TSSDist"]),]
		nearestInfo.tr = nearestInfo.tr[ !duplicated(nearestInfo.tr[,"queryHits"]),]
		txName.known[naInd] <- names(tss)[ nearestInfo.tr[,"subjectHits"] ]
		## Suggested modification from Peter Haverty in Sept. 2015
		
		values(grange)$Transcript <- txName.known
	}
	## add related EntrezID and GeneSymbol
	if (is.null(values(grange)$EntrezID)) {
		values(grange)$EntrezID <- tx2gene[values(grange)$Transcript]
	} else {
		values(grange)$EntrezID <- sub('^GeneID:', '', values(grange)$EntrezID)
	}
	if (is.null(values(grange)$GeneSymbol))
		values(grange)$GeneSymbol <- probe2gene(values(grange)$EntrezID, lib=EntrezDB, collapse=TRUE)

	## distance to TSS
	start2tss <- start(grange) - start(tss[txName.known]) 
	end2tss <- end(grange) - end(tss[txName.known]) 
	dist2tss <- pmin(abs(start2tss), abs(end2tss)) * sign(start2tss)
	dist2tss[sign(start2tss) * sign(end2tss) < 0] <- 0
	dist2tss[as.vector(strand(tss[txName.known])) == '-'] <- dist2tss[as.vector(strand(tss[txName.known])) == '-'] * -1
	values(grange)$distance2TSS <- dist2tss

	## nearest transcript based on TSS
	nearestInd.tss <- nearest(grange, tss)
	values(grange)$nearestTx <- values(tss[nearestInd.tss])$tx_name 
	suppressWarnings(dist2tss.nearestTx <- as.data.frame(distanceToNearest(grange, tss[nearestInd.tss]))[['distance']])
	
	## mark promoter based on distance to TSS
	promoterStatus <- rep(FALSE, length(dist2tss.nearestTx))
	promoterStatus[dist2tss <= 0 & abs(dist2tss) <= promoterRange] <- TRUE
	values(grange)$PROMOTER <- promoterStatus

	## ----------------------------------------------------------
	## find the overlapping with gene body
	if (checkGeneBody) {
		OL.gene <- findOverlaps(grange.ext, tr)	
		dmr2gene <- cbind(values(grange.ext)$id[queryHits(OL.gene)], 
			sapply(values(tr)$gene_id[subjectHits(OL.gene)], paste, collapse=';'))
		dupInd <- duplicated(paste(dmr2gene[,1], dmr2gene[,2], sep='_'))
		if (any(dupInd)) dmr2gene <- dmr2gene[!dupInd,,drop=FALSE]
		dmr2gene.list <- split(dmr2gene[,2], as.factor(dmr2gene[,1]))
		dmr2gene <- sapply(dmr2gene.list, function(x) {
			x <- sub('GeneID:', '', unique(x), ignore.case=TRUE)
			paste(probe2gene(x, lib=EntrezDB, collapse=TRUE), collapse=';')
		})
		dmr2ll <- sapply(dmr2gene.list, function(x) paste(unique(x), collapse=';'))
		
		## map to Tx_name
		dmr2tx <- cbind(values(grange.ext)$id[queryHits(OL.gene)], 
			sapply(values(tr)$tx_name[subjectHits(OL.gene)], paste, collapse=';'))
		dupInd <- duplicated(paste(dmr2tx[,1], dmr2tx[,2], sep='_'))
		if (any(dupInd)) dmr2tx <- dmr2tx[!dupInd,,drop=FALSE]
		dmr2tx.list <- split(dmr2tx[,2], as.factor(dmr2tx[,1]))
		dmr2tx <- sapply(dmr2tx.list, paste, collapse=';')

		mapInfo <- matrix("", nrow=length(grange.ext), ncol=3)
		colnames(mapInfo) <- c("EntrezID_GeneBody", "GeneSymbol_GeneBody", "Tx_GeneBody")
		rownames(mapInfo) <- values(grange.ext)$id
		mapInfo[names(dmr2ll), 'EntrezID_GeneBody'] <- dmr2ll
		mapInfo[names(dmr2gene), 'GeneSymbol_GeneBody'] <- dmr2gene
		mapInfo[names(dmr2tx), 'Tx_GeneBody'] <- dmr2tx
		values(grange) <- data.frame(as(values(grange), 'data.frame'), mapInfo)		
	}
	
	## ----------------------------------------------------------
	## find the overlapping with exons
	if (is(exons, 'GRanges')) {
		exons <- checkChrName(exons, addChr=TRUE)	
		OL.exons <- findOverlaps(grange.ext, exons)	
		hitExons <- exons[subjectHits(OL.exons)]
		if (length(hitExons) > 0) {
			## combine tx_name and exon_rank to get a new exon name
			if (all(is.na(values(hitExons)$exon_name))) {
				ll <- sapply(as.list(values(hitExons)$tx_name), length)
				exon_id <- values(hitExons)$exon_id
				exon_id <- rep(exon_id, ll)
				tx_name <- unlist(values(hitExons)$tx_name)
				exon_rank <- unlist(values(hitExons)$exon_rank)
				exon_name <- paste(tx_name, exon_rank, sep='_')
				exon_name <- split(exon_name, exon_id)
				values(hitExons)$exon_name <- sapply(exon_name, paste, collapse=',')
			}
			dmr2exons <- cbind(values(grange.ext)$id[queryHits(OL.exons)], 
				values(hitExons)$exon_name)

			dupInd <- duplicated(paste(dmr2exons[,1], dmr2exons[,2], sep='_'))
			if (any(dupInd)) {
				dmr2exons <- dmr2exons[!dupInd,,drop=FALSE]
				hitExons <- hitExons[!dupInd]
			}
			dmr2exons.list <- split(dmr2exons[,2], as.factor(dmr2exons[,1]))
			dmr2exons.flat <- sapply(dmr2exons.list, function(x) paste(unique(x), collapse=';'))

			uniId <- unique(values(grange.ext)$id)
			mapInfo <- matrix("", nrow=length(uniId), ncol=2)
			colnames(mapInfo) <- c("Exons", "Exon1")
			rownames(mapInfo) <- uniId
			mapInfo[names(dmr2exons.flat), 'Exons'] <- dmr2exons.flat

			exon1_status <- sapply(as.list(values(hitExons)$exon_rank), function(x) any(x== 1))
			if (any(exon1_status)) {
				id_exon1 <- dmr2exons[exon1_status,1]
				tx_exon1 <- sapply(as.list(values(hitExons)$tx_name[exon1_status]), head, 1)
				mapInfo[id_exon1, 'Exon1'] <- tx_exon1
			}
			values(grange) <- suppressWarnings(data.frame(as(values(grange), 'data.frame'), mapInfo[values(grange.ext)$id,])) 		
		} 
	}	
	
	## ----------------------------------------------------------
	## check overlap with CpG islands
	if (!is.null(CpGInfo)) {
		OL.CpG <- findOverlaps(grange.ext, CpG.grange)	
		dmr2CpG <- cbind(values(grange.ext)$id[queryHits(OL.CpG)], 
			unlist(values(CpG.grange)$name)[subjectHits(OL.CpG)])		

		mapInfo <- rep('', length(grange.ext))
		names(mapInfo) <- values(grange.ext)$id
		mapInfo[dmr2CpG[,1]] <- dmr2CpG[,2]
		values(grange) <- data.frame(as(values(grange), 'data.frame'), CpG_island=mapInfo) 		
	}
	
	return(grange)
}


## input of annotateDMR is the output of identifySigDMR
annotateDMRInfo <- function(DMRInfo, annotationDatabase, CpGInfo=NULL, flankRange=500, promoterRange=2000, EntrezDB='org.Hs.eg.db', as.GRanges=TRUE) {
	
	if (all(c('sigDMRInfo', 'sigDataInfo') %in% names(DMRInfo))) {
		sigDMRInfo <- DMRInfo$sigDMRInfo
		sigDataInfo <- DMRInfo$sigDataInfo
	} else if (is(DMRInfo, 'GRanges')) {
		sigDMRInfo <- DMRInfo
		sigDataInfo <- NULL
	} else {
		stop('DMRInfo should be a GRanges object or a list of GRanges objects (sigDMRInfo and sigDataInfo)!')
	}
	
	## load human genome information and check overlap with known genes
	if (is.character(annotationDatabase)) {
		if (file.exists(annotationDatabase)) {
			annotationDatabase <- loadDb(annotationDatabase)		
		} else if (require(annotationDatabase, character.only=TRUE)) {
			annotationDatabase <- get(annotationDatabase)
		} else {
			stop('Provided annotationDatabase does not exist!')
		}
	} 
	
	if (is(annotationDatabase, 'TxDb')) {
		## UCSC only includes reviewed genes!!!
		tr <- transcripts(annotationDatabase, columns=c('gene_id', 'tx_id', 'tx_name'))		
	} else if (is(annotationDatabase, 'GRanges')) {
		tr <- annotationDatabase
	} else {
		stop('Wrong type of annotationDatabase! Please check help for more details.')
	}

	## filter the transcripts without matching gene ids
	tr <- tr[elementNROWS(values(tr)$gene_id) > 0]
	names(tr) <- values(tr)$tx_id

	if (is.character(CpGInfo)) {
		CpG.grange <- import.bed(CpGInfo[1])
	} else if (is(CpGInfo, 'GRanges')) {
		CpG.grange <- CpGInfo
	} else if (is.null(CpGInfo) || is.na(CpGInfo)) {
		CpG.grange <- NULL
	} else {
		stop('CpGInfo should be a bed file or a GRanges object!')
	}
	
	## expand the transcript region
	## create a GRange object and check overlap
	if (!is.null(sigDMRInfo)) {
		sigDMRInfo <- annotateGRanges(sigDMRInfo, annotationDatabase=tr, CpGInfo=CpG.grange, flankRange=flankRange, promoterRange=promoterRange, EntrezDB=EntrezDB)
	}
	
	if (!is.null(sigDataInfo)) {
		sigDataInfo <- annotateGRanges(sigDataInfo, annotationDatabase=tr, CpGInfo=CpG.grange, flankRange=flankRange, promoterRange=promoterRange, EntrezDB=EntrezDB)
	}
	
	return(list(sigDMRInfo=sigDMRInfo, sigDataInfo=sigDataInfo))
}


export.DMRInfo	<- function(DMRInfo.ann, methyData=NULL, savePrefix='') {
	
	## output the DMR data
	sigDataInfo <- as.data.frame(DMRInfo.ann$sigDataInfo)
	sigProbe <- as.character(sigDataInfo$PROBEID)
	if (!is.null(methyData)) {
		sigDMRdataInfo <- cbind(sigDataInfo, exprs(methyData)[sigProbe, , drop=FALSE])
	} else {
		sigDMRdataInfo <- sigDataInfo
	}
	ord <- order(sigDMRdataInfo$seqnames, sigDMRdataInfo$start, sigDMRdataInfo$end, decreasing=FALSE)
	sigDMRdataInfo <- sigDMRdataInfo[ord,]
	
	# if (all(c('startWinIndex', 'endWinIndex') %in% colnames(sigDMRdataInfo))) {
	# 	numProbe <- sigDMRdataInfo[,'endWinIndex'] - sigDMRdataInfo[,'startWinIndex'] + 1
	# 	ind.start <- which(colnames(sigDMRdataInfo) == 'startWinIndex')
	# 	sigDMRdataInfo[,ind.start] <- numProbe
	# 	colnames(sigDMRdataInfo)[ind.start] <- 'numberOfProbe_SlidingWindow'
	# 	sigDMRdataInfo <- sigDMRdataInfo[,-which(colnames(sigDMRdataInfo) == 'endWinIndex')]
	# }
	sigDMRdataInfo <- data.frame(CHROMOSOME=sigDMRdataInfo$seqnames, POSITION=sigDMRdataInfo$start, sigDMRdataInfo[,-c(1:5)])
	## remove columns
	rmCol <- c('isSignificant', 'p.value.adj', 'startLocation', 'endLocation')
	sigDMRdataInfo <- sigDMRdataInfo[, !(colnames(sigDMRdataInfo) %in% rmCol)]
	
	write.csv(sigDMRdataInfo, file=paste('DMRdata_', savePrefix, '_', Sys.Date(), '.csv', sep=''), row.names=FALSE)

	
	## output the sigDMRInfo as a bed file
	fileName.i <- make.names(paste('DMRInfo_', savePrefix, '_', Sys.Date(), '.bed', sep=''))
	export(DMRInfo.ann$sigDMRInfo, fileName.i, format='bed')
	
	## output as a csv file
	sigDMRInfo <- as.data.frame(DMRInfo.ann$sigDMRInfo)
	ord <- order(sigDMRInfo$seqnames, sigDMRInfo$start, sigDMRInfo$end, decreasing=FALSE)
	sigDMRInfo <- sigDMRInfo[ord,]
	# if (all(c('startInd', 'endInd') %in% colnames(sigDMRInfo))) {
	# 	numProbe <- sigDMRInfo[,'endInd'] - sigDMRInfo[,'startInd'] + 1
	# 	ind.start <- which(colnames(sigDMRInfo) == 'startInd')
	# 	sigDMRInfo[,ind.start] <- numProbe
	# 	colnames(sigDMRInfo)[ind.start] <- 'numberOfProbe_SlidingWindow'
	# 	sigDMRInfo <- sigDMRInfo[,-which(colnames(sigDMRInfo) == 'endInd')]
	# }
	sigDMRInfo <- data.frame(CHROMOSOME=sigDMRInfo$seqnames, sigDMRInfo[,-1])
	## remove columns
	rmCol <- c('width', 'strand', 'isSignificant', 'p.value.adj')
	sigDMRInfo <- sigDMRInfo[, !(colnames(sigDMRInfo) %in% rmCol)]

	write.csv(sigDMRInfo, file=paste('DMRInfo_', savePrefix, '_', Sys.Date(), '.csv', sep=''), row.names=FALSE)
	

	return(invisible(TRUE))
}


## -----------------------------------------------------
## other internal functions
# detectResult is the return of detectDMR
.identifySigProbe <- function(detectResult, p.adjust.method="fdr", pValueTh=0.01, fdrTh=pValueTh, diffTh=log2(1.5)) {

	if (!is.numeric(pValueTh) && !is.na(pValueTh)) stop("pValueTh needs to be numeric.	Use 1 or NA if you want to turn it off.")
	if (!is.numeric(diffTh) && !is.na(pValueTh)) stop("diffTh needs to be numeric.	Use 0 or NA if you want to turn it off.")
	if (is.na(pValueTh)) pValueTh <- 1
	if (is.na(diffTh)) diffTh <- 0
	annotateColumn <- c("PROBEID","CHROMOSOME","POSITION")
	
	if (!all(annotateColumn %in% names(detectResult)))
		stop(" PROBEID, CHROMOSOME and POSITION are required columns!")
	
	annotateInd <- which(names(detectResult) %in% annotateColumn)
	# multiple testing correction
	p.val.cols <- grep("^p\\.",colnames(detectResult))
	p.val.adj.cols <- grep("p\\.adjust", colnames(detectResult))
	p.val.cols <- p.val.cols[!(p.val.cols %in% p.val.adj.cols)]
	if (length(p.val.cols) > 1) {
		p.value <- apply(detectResult[,p.val.cols], 1, min, na.rm=TRUE)
	} else {
		p.value <- detectResult[,p.val.cols]
	}
	p.value.adj <- p.adjust(p.value, method=p.adjust.method)
	
	if ('difference' %in% colnames(detectResult)) {
		max.diff <- detectResult[,'difference']
	} else {
		differences <- detectResult[, -c(annotateInd, p.val.cols, p.val.adj.cols, which(colnames(detectResult) %in% c("startWinIndex", "endWinIndex"))), drop=FALSE]
		max.diff <- rowMax(abs(as.matrix(differences)))
		max.diff <- max.diff * sign(differences[,1])
	}
	isSignificant <- rep(TRUE, length(max.diff))
	if (pValueTh < 1) isSignificant <- isSignificant & (p.value < pValueTh)
	if (p.adjust.method != 'none' && fdrTh < 1) isSignificant <- isSignificant & (p.value.adj < fdrTh)
	if (diffTh > 0) isSignificant <- isSignificant & (abs(max.diff) > diffTh)

	detectResult <- data.frame(detectResult, p.value.adj=p.value.adj, difference=max.diff, isSignificant=isSignificant)
	
	return(detectResult)
}


# This helper function sets up the positions for the sliding test based on the positions.
# Returns a list of index positions that fit within the sliding window with width winSize
.setupSlidingTests <- function(pos_data, winSize=250) {
	
	len <- length(pos_data)
	probesetList <- as.vector(seq(pos_data), mode='list')
	diff.pos.left <- c(Inf, diff(pos_data))
	growingHeight.left <- diff.pos.left
	growingStatus.left <- which(growingHeight.left <= winSize)
	diff.pos.right <- c(diff.pos.left[-1], Inf)
	growingHeight.right <- diff.pos.right
	growingStatus.right <- which(growingHeight.right <= winSize)
	iter <- 1
	## growing in both left and right directions until it reaches the half-window-size winSize
	while (length(c(growingStatus.left, growingStatus.right)) > 0) {

		growingHeight.left[-(1:iter)] <- growingHeight.left[-(1:iter)] + diff.pos.left[1:(len - iter)]		
		growingHeight.right[1:(len - iter)] <- growingHeight.right[1:(len - iter)] + diff.pos.right[-(1:iter)]
		
		## in left direction
		if (length(growingStatus.left) > 0) {
			probesetList[growingStatus.left] <- lapply(1:length(growingStatus.left), function(i)	 
				c(growingStatus.left[i] - iter, probesetList[[growingStatus.left[i]]]))
		}
		
		## in right direction
		if (length(growingStatus.right) > 0) {
			probesetList[growingStatus.right] <- lapply(1:length(growingStatus.right), function(i) c(probesetList[[growingStatus.right[i]]], growingStatus.right[i] + iter))
		}
		growingStatus.left <- which(growingHeight.left <= winSize)
		growingStatus.right <- which(growingHeight.right <= winSize)	
		
		iter <- iter + 1
	}

	return(probesetList)
}

probe2gene <- function(probe, lib="org.Hs.eg.db", simplify=TRUE, collapse=FALSE) {
	if (length(probe) == 0) return(NULL)
	if (!require(lib, character.only=TRUE)) stop(paste(lib, 'should be installed first!'))
	if (length(grep('.eg.db', lib)) > 0) {
		gene <- getEntrezAnnotation(probe, lib, from='eg', to='symbol')
		if (collapse) {
			gene <- sapply(gene,	paste, collapse=';')
		}				
	} else {
		llib <- paste(gsub('\\.db$', '', lib), 'SYMBOL', sep='')
		mapWithAllProbes <- do.call('toggleProbes', list(x=as.name(llib), value="all"))
		gene <- AnnotationDbi::mget(probe, mapWithAllProbes, ifnotfound=NA)
		if (collapse) {
			gene <- sapply(gene, paste, collapse=';')
		} else {
			if (simplify) gene <- sapply(gene, function(x) x[1])
		}
	}
	return(gene)
}


getEntrezAnnotation <- function(ll=NULL, lib=NULL, from='eg', to='symbol', species=c('human', 'rat', 'mouse', 'yeast', 'fly'), collapse=FALSE) {
	
	from <- toupper(from); to <- toupper(to)
	# from: 'eg', 'accnum', 'alias', 'chr', chrlengths', 'chrloc', 'enzyme', 'genename', 'go', 'map', 'mapcounts', 
	#	'path', 'pfam', 'pmid', 'prosite', 'refseq', 'symbol', 'unigene', 
	species <- match.arg(species)
	
	if (is.null(lib)) {
		lib = switch(species,
			'rat'='org.Rn.eg.db',
			'human'='org.Hs.eg.db',
			'mouse'='org.Mm.eg.db',
			'yeast'='org.Sc.eg.db',
			'fly'='org.Dm.eg.db'
			 )
	}
	if (!require(lib, character.only=TRUE)) {
		stop(paste('Package', lib, 'cannot be loaded!'))
	}

	ll.input <- ll
	na.ind <- which(is.na(ll) | ll == "")
	if (length(na.ind) > 0) {
		ll <- ll[-na.ind]
	}
		
	if (length(to) > 1) {
		allMapping <- NULL
		for (to.i in to) {
			mapping.i <- getEntrezAnnotation(ll, lib, from, to.i, species, collapse=TRUE)
			allMapping <- cbind(allMapping, mapping.i)
		}
		colnames(allMapping) <- to
		rownames(allMapping) <- ll
		return(allMapping)
	}
	libName <- sub('.db', '', lib)
	if (from == 'EG') {
		map <- paste(libName, to, sep='')		
	} else {
		map <- paste(libName, from, '2', to, sep='')		
		if (!exists(map)) {
			if (to == 'EG') {
				map <- paste(libName, from, sep='')
			}
			if (!exists(map)) stop(paste('Cannot find', map, 'in', lib, '!'))
		}
	}

	## 
	if (!is.null(ll)) {
		map <- AnnotationDbi::mget(ll, get(map), ifnotfound=NA)
	} else {
		map <- as.list(get(map))
	}
	if (to == 'EG' & !exists(paste(libName, from, '2', to, sep=''))) {
		len <- sapply(map, length)		
		map <- tapply(rep(names(map), len), unlist(map),	function(x) unique(x))
	}
	
	if (length(na.ind) > 0) {
		map.out <- rep(NA, length(ll.input))
		map.out[-na.ind] <- map
	} else {
		map.out <- map
	}

	if (collapse) {
		map.out <- sapply(map.out,	paste, collapse=';')
	}
	attr(map.out, 'lib') <- lib
	return(map.out)
}



## estimate the averaged methylation level within a CMR (GRanges) or a provided gene (transcript) elements based on methylation probe annotation
# methyProfile <- estimateCMR.methylation('NM_145201', methyGenoSet, estimateFun=mean, probeAnnotation=methyGrange, selectGeneElement=c('exon1', 'upstream500'))
# plot(m2beta(methyProfile), exprs(selExp)['NM_145201',], ylab='RNA-Seq expression (log2 count)', xlab='DNA methylation (Beta value)', col=rgb(1,0,0, alpha=0.35), pch=19)
estimateCMR.methylation <- function(cmr, methyGenoSet, estimateFun=mean, probeAnnotation=NULL, selectGeneElement=c('exon1', 'promoter'), mc.cores=min(12, detectCores())) {
	
	if (!is(methyGenoSet, 'GenoSet')) stop('"methyGenoSet" should be a "GenoSet" object!')
	if (length(cmr) > 1) {
		methy.cmr <- mclapply(1:length(cmr), function(i) {
			estimateCMR.methylation(cmr[i], methyGenoSet, estimateFun=estimateFun, probeAnnotation=probeAnnotation, selectGeneElement=selectGeneElement)
		}, mc.cores=mc.cores)
		methy.cmr <- do.call('rbind', methy.cmr)
		rownames(methy.cmr) <- names(cmr)
		return(methy.cmr)
	}
	
	if (is(cmr, 'GRanges')) {
		ol <- as.matrix(findOverlaps(cmr, rowRanges(methyGenoSet)))
		selProbe <- unique(featureNames(methyGenoSet)[ol[,2]])
	} else if (is.character(cmr)) {
		## when cmr is a gene Entrez ID
		if (is.null(probeAnnotation)) {
			probeAnnotation <- mcols(rowRanges(methyGenoSet))
		} else if (is(probeAnnotation, 'GRanges')) {
			allProbe <- names(probeAnnotation)
			probeAnnotation <- values(probeAnnotation)
			if (!is.null(allProbe))
				rownames(probeAnnotation) <- allProbe
		}

		selectGeneElement <- tolower(selectGeneElement)
		names(probeAnnotation) <- tolower(names(probeAnnotation))
		selectGeneElement <- intersect(selectGeneElement, names(probeAnnotation))
		if (length(selectGeneElement) == 0) stop('No "selectGeneElement" is provided!')
		allProbe <- intersect(rownames(probeAnnotation), featureNames(methyGenoSet))
		probeAnnotation <- as.matrix(probeAnnotation[allProbe,,drop=FALSE])
		methyGenoSet <- methyGenoSet[allProbe,]
		selProbe <- allProbe[apply(probeAnnotation[,selectGeneElement,drop=FALSE], 1, function(x) {
			return(cmr %in% unlist(strsplit(x, ';')))
		})]
	}

	methyProfile <- apply(assays(methyGenoSet)$exprs[selProbe,,drop=FALSE], 2, estimateFun)		

	attr(methyProfile, 'selProbe') <- selProbe
	return(methyProfile)
}

Try the methyAnalysis package in your browser

Any scripts or data that you put into this service are public.

methyAnalysis documentation built on Nov. 8, 2020, 8:09 p.m.