Nothing
## get the methylation probe location information, and return as a GRanges object
getMethyProbeLocation <- function(probes, lib="FDb.InfiniumMethylation.hg19") {
if (!require(lib, character.only=TRUE)) stop(paste(lib, 'is not installed!'))
if (is(probes, 'eSet')) probes <- rownames(probes)
if (exists(lib)) {
lib <- get(lib)
if (is(lib, 'FeatureDb')) {
allAnnotation <- features(lib)
}
if (any(probes %in% names(allAnnotation))) {
probes <- probes[probes %in% names(allAnnotation)]
warnings('Some probes does not exist in the annotation library!')
}
methyGrange <- allAnnotation[probes]
values(methyGrange) <- DataFrame(ProbeID=probes)
return(methyGrange)
}
## For old annotation libraries: IlluminaHumanMethylation450k.db
## Check whether multiple versions of chromosome information is available,
## If so, only use the latest version.
chrPattern <- paste(sub("\\.db$", "", lib), "CHR", sep="")
chrObjs <- ls(paste("package:", lib, sep=""), pattern=paste(chrPattern, "[0-9]+$", sep=""))
if (length(chrObjs) > 1) {
chrVersion <- sub(".*[^0-9]([0-9]+)$", "\\1", chrObjs)
obj <- get(chrObjs[which.max(as.numeric(chrVersion))])
} else {
obj <- get(chrPattern)
}
pp <- probes[probes %in% keys(obj)]
chr[pp] <- sapply(AnnotationDbi::mget(pp, obj), function(x) x[1])
obj <- get(paste(sub("\\.db$", "", lib), "CPGCOORDINATE", sep=""))
pp <- probes[probes %in% keys(obj)]
loc[pp] <- sapply(AnnotationDbi::mget(pp, obj), function(x) x[1])
methyGrange <- GRanges(seqnames=chr, ranges=IRanges(start=loc, width=1), strand='*', ProbeID=probes)
names(methyGrange) <- probes
return(methyGrange)
}
## convert MethyLumiM class object to GenoSet class object
MethyLumiM2GenoSet <- function(methyLumiM, lib="FDb.InfiniumMethylation.hg19", bigMatrix=FALSE, dir.bigMatrix='.', savePrefix.bigMatrix) {
oldFeatureData <- fData(methyLumiM)
if (!identical(rownames(oldFeatureData), rownames(methyLumiM))) {
warnings('The rownames of featureData is inconsistent with the methyLumiM!')
rownames(oldFeatureData) <- rownames(methyLumiM)
}
methyLumiM <- addAnnotationInfo(methyLumiM, lib=lib)
ff <- fData(methyLumiM)
if (is.null(ff$CHROMOSOME))
stop('Chromosome information is not available!\n')
## retrieve the hgVersion information from the annotation library
hgVersion <- ''
if (require(lib, character.only=TRUE)) {
dbinfo <- paste(sub('.db$', '', lib), '_dbInfo', sep='')
if (exists(dbinfo)) {
dbinfo <- do.call(dbinfo, list())
rownames(dbinfo) <- dbinfo$name
hgVersion <- strsplit(dbinfo['GPSOURCEURL', 'value'], '/')[[1]]
hgVersion <- hgVersion[length(hgVersion)]
} else if (class(get(lib)) == "FeatureDb") {
metaInfo <- metadata(get(lib))
hgVersion <- metaInfo$value[metaInfo$name == 'Genome']
}
}
## remove those without chromosome location
rmInd <- (is.na(ff$CHROMOSOME) | ff$CHROMOSOME == '')
if (any(rmInd)) {
ff <- ff[!rmInd,]
cat(paste(length(which(rmInd)), 'probes were removed because of lack of chromosome location information!\n'))
}
ff$CHROMOSOME <- checkChrName(as.character(ff$CHROMOSOME))
locdata <- GRanges(seqnames=ff$CHROMOSOME, ranges=IRanges(start=ff$POSITION, width=1, names=rownames(methyLumiM)))
genome(locdata) <- hgVersion
names(locdata) <- rownames(methyLumiM)[!rmInd]
# sort the locdata
locdata <- genoset::toGenomeOrder(locdata, strict=TRUE)
if ((any(rmInd) && is(exprs(methyLumiM), 'BigMatrix')) || bigMatrix) {
if (missing(savePrefix.bigMatrix)) {
warnings('savePrefix.bigMatrix is missing! GenoSet_bigMatrixFiles is used!')
savePrefix.bigMatrix <- 'GenoSet_bigMatrixFiles'
}
dimNames <- list(names(locdata), colnames(methyLumiM))
## predefine a big matrix
methyLumiM.new <- lumi::asBigMatrix(methyLumiM[names(locdata),1], nCol=ncol(methyLumiM), dimNames=dimNames, saveDir=dir.bigMatrix, savePrefix=savePrefix.bigMatrix)
## fill in the subsetted data
for (ad.name in assayNames(methyLumiM.new)) {
matrix.new <- assayElement(methyLumiM.new, ad.name)
matrix.i <- assayElement(methyLumiM, ad.name)[names(locdata), ]
matrix.new[, colnames(matrix.i)] <- matrix.i
}
methyLumiM <- methyLumiM.new
rm(methyLumiM.new)
} else {
# methyLumiM <- methyLumiM[!rmInd,]
methyLumiM <- methyLumiM[names(locdata), ]
}
mcols(locdata) <- oldFeatureData[rownames(methyLumiM), ]
methyGenoSet <- MethyGenoSet(rowRanges=locdata, pData=pData(methyLumiM), annotation=as.character(lib), assays=as.list(assayData(methyLumiM)))
#fData(methyGenoSet) <- oldFeatureData[rownames(methyGenoSet), ]
methyGenoSet@history <- methyLumiM@history
## set smoothing attributes if exists
if (!is.null(attr(methyLumiM, 'windowIndex')))
attr(methyGenoSet, 'windowIndex') <- attr(methyLumiM, 'windowIndex')
if (!is.null(attr(methyLumiM, 'windowRange')))
attr(methyGenoSet, 'windowRange') <- attr(methyLumiM, 'windowRange')
if (!is.null(attr(methyLumiM, 'windowSize')))
attr(methyGenoSet, 'windowSize') <- attr(methyLumiM, 'windowSize')
return(methyGenoSet)
}
## smooth the methylation data (MethyLumiM or GenoSet objects) using slide window with fixed windowsize (in bp)
smoothMethyData <- function(methyData, winSize=250, lib='FDb.InfiniumMethylation.hg19', p.value.detection.th=0.05, bigMatrix=FALSE, dir.bigMatrix='.', savePrefix.bigMatrix=NULL, ...) {
if (!is(methyData, 'GenoSet') & !is(methyData, 'MethyLumiM')) {
stop("methyData should be a GenoSet or MethyLumiM object!")
}
if (is(methyData, 'GenoSet')) {
assaydata <- as.list(assays(methyData))
} else {
assaydata <- assayData(methyData)
}
if (is(methyData, 'MethyLumiM')) {
chrInfo <- getChrInfo(methyData, lib=lib)
} else {
chrInfo <- data.frame(PROBEID=rownames(methyData), CHROMOSOME=genoset::chr(methyData), POSITION=start(methyData), END=end(methyData))
}
if (is.character(chrInfo$POSITION)) chrInfo$POSITION = as.numeric(chrInfo$POSITION)
ratioData.old <- ratioData <- assaydata$exprs
if (is(ratioData, 'BigMatrix')) {
bigMatrix <- TRUE
}
index <- 1:nrow(ratioData)
# remove those probes lack of position information
rmInd <- which(is.na(chrInfo$POSITION) | chrInfo$CHROMOSOME == '' )
if (length(rmInd) > 0) index <- index[-rmInd]
# Sort the ratioData by chromosome and location
isOrdered <- with(chrInfo, sapply(split(POSITION, CHROMOSOME), function(x) all(x == sort(x, decreasing=FALSE))))
if (!all(isOrdered)) {
ord <- order(chrInfo$CHROMOSOME[index], chrInfo$POSITION[index], decreasing=FALSE)
index <- index[ord]
}
if (bigMatrix) {
if (!is(ratioData, 'BigMatrix') | !is.null(savePrefix.bigMatrix) | !all(index == 1:length(index)))
methyData <- asBigMatrix(methyData, rowInd=index, savePrefix=savePrefix.bigMatrix, saveDir=dir.bigMatrix)
} else {
methyData <- methyData[index,]
}
ratioData <- assaydata$exprs
## set NA for those probes having high p-values
if (!is.null(assaydata$detection)) {
naInd <- which(assaydata$detection > p.value.detection.th)
if (length(naInd) > 0) ratioData[naInd] <- NA
}
chrInfo <- chrInfo[index,]
index <- 1:nrow(ratioData)
# split data by Chromosome
index.chrList <- split(index, as.character(chrInfo$CHROMOSOME))
chrInfoList <- split(chrInfo, as.character(chrInfo$CHROMOSOME))
windowIndex <- vector(mode='list', length=length(chrInfoList))
windowRange <- vector(mode='list', length=length(chrInfoList))
for (i in 1:length(index.chrList)) {
index.i <- index.chrList[[i]]
chrInfo.i <- chrInfoList[[i]]
chr.i <- chrInfo.i$CHROMOSOME[1]
cat(paste("Smoothing Chromosome", chr.i, "...\n"))
# return the index of sorted probes in each slide window
windowIndex.i <- eval(call(".setupSlidingTests", pos_data=chrInfo.i$POSITION, winSize=winSize))
names(windowIndex.i) <- chrInfoList[[i]]$PROBEID
smooth.ratio.i <- sapply(windowIndex.i, function(ind) {
# average the values in slide window and perform test
smooth.ij <- colMeans(ratioData[index.i[ind],,drop=FALSE], na.rm=TRUE)
return(smooth.ij)
})
smooth.ratio.i <- t(smooth.ratio.i)
## replace the data with smoothed ones
ratioData[index.i,] <- smooth.ratio.i
windowIndex[[i]] <- windowIndex.i
windowRange.i <- sapply(windowIndex.i, function(ind) {
# average the values in slide window and perform test
range.ij <- range(chrInfo.i$POSITION[ind])
return(range.ij)
})
windowRange[[i]] <- t(windowRange.i)
}
windowRange <- do.call('rbind', windowRange)
colnames(windowRange) <- c('startLocation', 'endLocation')
exprs(methyData) <- ratioData
if (is(ratioData, 'BigMatrix')) {
assayData(methyData) <- bigmemoryExtras::attachAssayDataElements(assaydata)
}
# methyData <- suppressMessages(methyData[rownames(smooth.ratioData),colnames(smooth.ratioData)])
# exprs(methyData) <- smooth.ratioData[rownames(methyData),]
attr(methyData, 'windowIndex') <- windowIndex
attr(methyData, 'windowRange') <- windowRange
attr(methyData, 'windowSize') <- winSize
return(methyData)
}
## export a MethyGenoSet object as a 'gct' (for IGV) or 'bw' (big-wig) file
export.methyGenoSet <- function(methyGenoSet, file.format=c('gct', 'bw'), exportValue=c('beta', 'M', 'intensity'), hgVersion.default='hg19', savePrefix=NULL, outputFile=NULL) {
# exportValue <- match.arg(exportValue)
exportValue <- exportValue[1]
file.format <- match.arg(file.format)
## get the annotation version
hgVersion <- unname(genome(methyGenoSet)[1])
if (is.null(hgVersion) || unique(hgVersion) == '') {
warnings(paste('hgVersion information is not available in methyGenoSet!', hgVersion.default, ' will be used.'))
genome(methyGenoSet) <- hgVersion <- hgVersion.default
}
# chr <- space(rowRanges(methyGenoSet))
chr <- genoset::chr(methyGenoSet)
start <- start(methyGenoSet)
## Sort the rows of ratios.obj
methyGenoSet <- methyGenoSet[order(chr, start),]
## check overlap probes and average them
locationID <- paste(chr, start, sep='_')
dupInd <- which(duplicated(locationID))
if (length(dupInd) > 0) {
dupID <- unique(locationID[dupInd])
for (dupID.i in dupID) {
selInd.i <- which(locationID == dupID.i)
exprs(methyGenoSet)[selInd.i[1],] <- colMeans(exprs(methyGenoSet)[selInd.i,], na.rm=TRUE)
}
methyGenoSet <- methyGenoSet[-dupInd,]
}
## attach chr prefix in the chromosome names if it does not include it
if (length(grep('^chr', chr)) == 0) {
names(rowRanges(methyGenoSet)) <- paste('chr', names(rowRanges(methyGenoSet)), sep='')
}
if (exportValue %in% c('beta', 'M')) {
methyData <- assayElement(methyGenoSet, 'exprs')
if (exportValue == 'beta') {
methyData <- m2beta(methyData) # - 0.5
}
} else if (exportValue %in% assayNames(methyGenoSet)) {
methyData <- assayElement(methyGenoSet, exportValue)
} else if (exportValue == 'intensity') {
methyData <- log2(assayElement(methyGenoSet, 'methylated') + assayElement(methyGenoSet, 'unmethylated') + 1)
} else {
stop("exportValue doesn't exist in the data!")
}
## only keep 3 digit to save space
methyData <- signif(methyData, 3)
if (file.format == 'gct') {
chrInfo.all <- data.frame(PROBEID=rownames(methyGenoSet), CHROMOSOME=genoset::chr(methyGenoSet), START=start(methyGenoSet), END=end(methyGenoSet), stringsAsFactors=FALSE)
# remove those probes lack of position information
rmInd <- which(is.na(chrInfo.all$START))
if (length(rmInd) > 0) {
chrInfo.all <- chrInfo.all[-rmInd,]
methyData <- methyData[-rmInd,]
}
description <- with(chrInfo.all, paste("|@", CHROMOSOME, ":", START, "-", END, "|", sep=""))
gct <- cbind(Name=chrInfo.all[,'PROBEID'], Description=description, methyData)
## check version hgVersion is included in the filename
if (is.null(outputFile))
outputFile <- paste(savePrefix, "_", exportValue, "_", hgVersion, ".gct", sep='')
cat("#1.2\n", file=outputFile)
cat(paste(dim(gct), collapse='\t', sep=''), "\n", file=outputFile, append=TRUE)
suppressWarnings(write.table(gct, sep="\t", file=outputFile, row.names=FALSE, append=TRUE, quote=FALSE))
return(invisible(outputFile))
} else if (file.format == 'bw') {
chrInfo.all <- getChromInfoFromUCSC(hgVersion)
rownames(chrInfo.all) <- chrInfo.all[,'chrom']
samplenames <- colnames(methyGenoSet)
pdata <- pData(methyGenoSet)
chr.info <- chrInfo(methyGenoSet)
seq.lengths <- chr.info[,"stop"] - chr.info[,"offset"]
for (i in 1:ncol(methyData)) {
score.i <- methyData[,i]
cn.data.i <- GRanges(seqnames=genoset::chr(methyGenoSet), ranges=IRanges(start=start(methyGenoSet),end=end(methyGenoSet)), strand='*', score=score.i)
genome(cn.data.i) <- hgVersion
cn.data.i <- cn.data.i[!is.na(score.i), ]
if (savePrefix == '' || is.null(savePrefix)) {
savePrefix.i <- samplenames[i]
} else {
savePrefix.i <- paste(savePrefix, samplenames[i], sep='_')
}
seqlengths(cn.data.i) <- chrInfo.all[as.character(seqlevels(cn.data.i)), 'size']
if (length(outputFile) == ncol(methyData)) {
filename.i <- outputFile[i]
} else {
filename.i <- paste(savePrefix.i, "_", exportValue, "_", hgVersion[1], ".bw", sep='')
}
export.bw(cn.data.i, filename.i, dataFormat="auto")
}
return(invisible(filename.i))
}
}
# This is the sliding test
## ratios.obj inlcudes c("PROBEID","CHROMOSOME","POSITION"), values
## winSize is the half slide window size, by default it is 250
##
detectDMR.slideWin <- function(methyGenoSet, sampleType, winSize=250, testMethod=c('ttest', 'wilcox'), p.adjust.method='fdr', p.value.detection.th=0.05, ...) {
testMethod <- match.arg(testMethod)
if (is(methyGenoSet, 'MethyLumiM')) {
methyGenoSet <- MethyLumiM2GenoSet(methyGenoSet, ...)
}
smooth <- TRUE
if (!is.null(attr(methyGenoSet, 'windowIndex'))) {
smooth <- FALSE
if (!is.null(attr(methyGenoSet, 'windowSize'))) {
if (attr(methyGenoSet, 'windowSize') != winSize) smooth <- TRUE
}
}
if (smooth) {
methyGenoSet <- smoothMethyData(methyGenoSet, winSize=winSize, p.value.detection.th=p.value.detection.th, ...)
}
windowIndex <- attr(methyGenoSet, 'windowIndex')
windowRange <- attr(methyGenoSet, 'windowRange')
chrInfo <- suppressWarnings(as(rowRanges(methyGenoSet), 'GRanges'))
smoothData <- assays(methyGenoSet)$exprs
## set NA for those probes having high p-values
if (!is.null(assays(methyGenoSet)$detection)) {
naInd <- which(assays(methyGenoSet)$detection > p.value.detection.th)
if (length(naInd) > 0) smoothData[naInd] <- NA
}
## calculate class means
uniClass <- unique(sampleType)
selInd1 <- which(sampleType == uniClass[1])
selInd2 <- which(sampleType == uniClass[2])
classMean <- data.frame(rowMeans(smoothData[, selInd1, drop=FALSE], na.rm=TRUE), rowMeans(smoothData[, selInd2, drop=FALSE], na.rm=TRUE))
names(classMean) <- paste('mean_', uniClass, sep='')
if (!is.factor(sampleType)) sampleType <- as.factor(sampleType)
## perform t-test
if (is.function(testMethod)) {
testInfo <- testMethod(smoothData, sampleType, ... )
} else {
if (testMethod == 'ttest') {
naInd <- which(apply(smoothData, 1, function(x) any(is.na(x))))
if (length(naInd) > 0) {
## do t.test for those with nas
p.value <- difference <- tscore <- rep(NA, nrow(smoothData))
## check the non-NA sample number for each class
nonNA.num <- apply(smoothData[naInd,,drop=FALSE], 1, function(x) table(sampleType[!is.na(x)]))
## only include those with more than 2 samples in each class
tooFewSample.ind <- (apply(nonNA.num, 2, min) < 2)
if (length(naInd[!tooFewSample.ind]) > 0) {
t.test.res <- apply(smoothData[naInd[!tooFewSample.ind],,drop=FALSE], 1, function(x) t.test(x ~ sampleType))
p.value[naInd[!tooFewSample.ind]] <- sapply(t.test.res, function(x) x$p.value)
difference[naInd[!tooFewSample.ind]] <- sapply(t.test.res, function(x) x$estimate[1] - x$estimate[2])
tscore[naInd[!tooFewSample.ind]] <- sapply(t.test.res, function(x) x$statistic)
}
tstats <- rowttests(smoothData[-naInd,,drop=FALSE], sampleType)
p.value[-naInd] <- tstats$p.value
difference[-naInd] <- tstats$dm
tscore[-naInd] <- tstats$statistic
} else {
tstats <- rowttests(smoothData, sampleType)
p.value <- tstats$p.value
difference <- tstats$dm
tscore <- tstats$statistic
}
} else if (testMethod == 'wilcox') {
tmp <- split(as.data.frame(t(smoothData)), sampleType)
xx <- t(tmp[[1]])
yy <- t(tmp[[2]])
p.value <- apply(cbind(xx, yy), 1, function(x) wilcox.test(x[1:ncol(xx)], x[(ncol(xx)+1):length(x)])$p.value, ...)
difference <- classMean[,1] - classMean[,2]
} else {
stop('Unsupported "testMethod"!')
}
p.adjust <- p.adjust(p.value, method=p.adjust.method)
testInfo <- data.frame(PROBEID=rownames(smoothData), difference=difference, p.value=p.value, p.adjust=p.adjust)
if (testMethod == 'ttest') testInfo <- data.frame(testInfo, tscore=tscore)
}
startInd <- unlist(lapply(windowIndex, function(x) sapply(x, min)))[rownames(smoothData)]
endInd <- unlist(lapply(windowIndex, function(x) sapply(x, max)))[rownames(smoothData)]
testInfo <- data.frame(testInfo, startWinIndex=startInd, endWinIndex=endInd, windowRange, classMean)
testResult <- chrInfo
## export as a GRanges object
values(testResult) <- testInfo
return(testResult)
}
identifySigDMR <- function(detectResult, p.adjust.method="fdr", pValueTh=0.01, fdrTh=pValueTh, diffTh=1,
minProbeNum=1, maxGap=2000, minGap=100, oppositeMethylation=FALSE, topNum=NULL) {
if (is(detectResult, 'GRanges')) {
detectResult.g <- detectResult
detectResult <- data.frame(CHROMOSOME=as.character(seqnames(detectResult)), POSITION=start(detectResult), as(values(detectResult), 'data.frame'))
rownames(detectResult) <- names(detectResult.g)
} else if (is(detectResult, 'data.frame')) {
if (!all(c("PROBEID","CHROMOSOME","POSITION") %in% names(detectResult)))
stop(" PROBEID, CHROMOSOME and POSITION are required columns in the detectResult object!")
# remove those probes lack of position information
rmInd <- which(is.na(detectResult$POSITION))
if (length(rmInd) > 0) {
detectResult <- detectResult[-rmInd,]
}
## convert it as a GRanges object
tmp <- with(detectResult, GRanges(seqnames=CHROMOSOME, ranges=IRanges(start=POSITION, end=POSITION), strand='*'))
rmInd <- which(colnames(detectResult) %in% c('CHROMOSOME', 'POSITION', 'END', 'width'))
values(tmp) <- detectResult[,-rmInd, drop=FALSE]
detectResult.g <- tmp
names(detectResult.g) <- rownames(detectResult)
} else {
stop('detectResult should be either a GRanges object or a data.frame with required columns!')
}
## No pValue constraint if we only interested in the top CpG-sites
if (!is.null(topNum) && !is.na(topNum)) pValueTh <- 1
## if there is no isSignificant column, then DMR should be identified first
if (is.null(detectResult$isSignificant) || is.null(topNum) || is.na(topNum)) {
detectResult <- .identifySigProbe(detectResult, p.adjust.method=p.adjust.method, pValueTh=pValueTh, fdrTh=fdrTh, diffTh=diffTh)
}
## Select the significant CpG-sites
sigInd <- which(detectResult$isSignificant)
## check the methylation direction of two conditions for each probe
classMeanCol <- grep('^mean_', names(detectResult), value=TRUE)
# if (length(classMeanCol) == 2) {
# classMean <- detectResult[,classMeanCol]
# if (oppositeMethylation) {
# sigInd <- sigInd[classMean[sigInd,1] * classMean[sigInd,2] < 0]
# }
# }
if (!is.null(topNum) && !is.na(topNum)) {
ord <- order(detectResult$p.value[sigInd], decreasing=FALSE)
sigInd <- sigInd[ord]
if (topNum < length(sigInd)) {
rmInd <- sigInd[-(1:topNum)]
detectResult$isSignificant[rmInd] <- FALSE
sigInd <- sigInd[1:topNum]
}
} else {
if (length(sigInd) == 0) {
cat('No significant CpG-sites were identified based on current criteria!\n')
return(NULL)
}
}
sigProbe <- rownames(detectResult)[sigInd]
detectResult$status <- rep(FALSE, nrow(detectResult))
detectResult[sigProbe, 'status'] <- TRUE
if (length(sigProbe) == 0) {
return(list(sigDMRInfo=NULL, sigDataInfo=NULL))
}
## ---------------------------------------
## identify the boundary of DMR (return a GRanges object)
scoreFuns <- list(p.value=c(min=min), difference=c(max=function(x) x[which.max(abs(x))]), p.adjust=c(min=min), tscore=c(max=function(x) x[which.max(abs(x))]))
scoreColumns <- c('p.value', 'difference', 'p.adjust', 'tscore', classMeanCol)
if (is.data.frame(detectResult)) {
scoreColumns <- scoreColumns[scoreColumns %in% names(detectResult)]
} else {
scoreColumns <- scoreColumns[scoreColumns %in% names(values(detectResult))]
}
scoreFuns <- scoreFuns[names(scoreFuns) %in% scoreColumns]
sigDMRInfo.g <- getContinuousRegion(detectResult, scoreColumns=scoreColumns,
scoreFuns=scoreFuns, maxGap=maxGap, minGap=minGap)
sigDMRInfo <- as(sigDMRInfo.g, 'data.frame')
rownames(sigDMRInfo) <- names(sigDMRInfo.g)
## filtering based on the minimum number of probes in each DMR
if (minProbeNum > 1) {
numProbe <- sigDMRInfo$NumOfProbe
sigDMRInfo <- sigDMRInfo[numProbe >= minProbeNum, ,drop=FALSE]
sigDMRInfo.g <- sigDMRInfo.g[numProbe >= minProbeNum]
}
## check oppositeMethylation direction
if (length(classMeanCol) == 2 && oppositeMethylation) {
dmrMean <- sigDMRInfo[, classMeanCol]
sigDMRInfo.g <- sigDMRInfo.g[dmrMean[,1] * dmrMean[,2] < 0]
}
## only keep the sigDataInfo which overlaps with sigDMRInfo
sigDataInfo.g <- detectResult.g[sigProbe]
sigDataInfo.g <- subsetByOverlaps(sigDataInfo.g, sigDMRInfo.g)
return(list(sigDMRInfo=sigDMRInfo.g, sigDataInfo=sigDataInfo.g))
}
## Get continuous region from discrete probe measurements
# getContinuousRegion <- function(detectResult, scoreColumns=NULL, scoreFuns=mean, maxGap=2000, minGap=100, as.GRanges=TRUE) {
getContinuousRegion <- function(detectResult, scoreColumns=NULL, scoreFuns=c(mean=mean), maxGap=2000, minGap=100) {
if (is.data.frame(detectResult)) {
if (!all(c('CHROMOSOME', 'POSITION', 'status') %in% names(detectResult))) {
stop('CHROMOSOME, POSITION and status columns are required!')
}
detectResult.g <- with(detectResult, GRanges(seqnames=CHROMOSOME, ranges=IRanges(start=POSITION, end=POSITION), strand='*'))
values(detectResult.g) <- detectResult[, !(colnames(detectResult) %in% c('CHROMOSOME', 'POSITION'))]
} else if (is(detectResult, 'GRanges')) {
detectResult.g <- detectResult
detectResult <- data.frame(CHROMOSOME=as.character(seqnames(detectResult)), POSITION=as.numeric(start(detectResult)), as(values(detectResult), 'data.frame'))
}
detectResult$POSITION <- as.numeric(detectResult$POSITION)
## sort in chromsome order
detectResult <- with(detectResult, detectResult[order(CHROMOSOME, POSITION, decreasing=FALSE),])
if (is.null(detectResult$status)) {
stop('"status" column is required!')
} else {
if (!is.logical(detectResult$status)) {
warning('"status" column should be a logical vector! "status > 0" was used in calculation!')
detectResult$status <- detectResult$status > 0
}
}
## set NA as FALSE
detectResult$status[is.na(detectResult$status)] <- FALSE
if (!is.null(scoreColumns)) {
if (!all(scoreColumns %in% colnames(detectResult))) {
stop('Some "scoreColumns" does not match "detectResult"!')
}
if (is.null(scoreFuns)) scoreFuns <- c(mean=mean)
# if (length(scoreFuns) < length(scoreColumns)) {
# scoreFuns <- rep(scoreFuns, length(scoreColumns))
# }
}
## split data by Chromosome
detectResult.chrList <- split(detectResult, detectResult$CHROMOSOME)
## ---------------------------------------
## identify the boundary of DMR
dmr.list <- lapply(detectResult.chrList, function(detectResult.i) {
len.significant.i <- length(which(detectResult.i$status))
if (len.significant.i == 0) return(NULL)
allPosition.i <- detectResult.i$POSITION
status.i <- as.numeric(detectResult.i$status)
diff.i <- diff(status.i)
# identify the diff region
startInd.i <- which(diff.i == 1) + 1
if (status.i[1] == 1)
startInd.i <- c(1, startInd.i)
endInd.i <- which(diff.i == -1)
if (length(endInd.i) < length(startInd.i))
endInd.i <- c(endInd.i, length(status.i))
## check the gaps of sparse probe
gaps.i <- which(c(FALSE, diff(allPosition.i) > maxGap) & status.i == 1)
# the gap should be 1 in both sides
gaps.i <- gaps.i[status.i[gaps.i + 1] == 1]
if (length(gaps.i) > 0) {
startInd.i <- sort(c(startInd.i, gaps.i))
endInd.i <- sort(c(endInd.i, gaps.i))
}
if ("startWinIndex" %in% colnames(detectResult.i)) {
startRegion.i <- detectResult.i[startInd.i, "startWinIndex"]
endRegion.i <- detectResult.i[endInd.i, "endWinIndex"]
startLoc.i <- detectResult.i[startRegion.i, 'POSITION']
endLoc.i <- detectResult.i[endRegion.i, 'POSITION']
width.i <- endLoc.i - startLoc.i + 1
regionSummary.i <- data.frame(CHROMOSOME=detectResult.i[startRegion.i, 'CHROMOSOME'],
startLocation=startLoc.i, endLocation=endLoc.i, width=width.i, startInd=startRegion.i, endInd=endRegion.i)
} else {
startLoc.i <- detectResult.i[startInd.i, "POSITION"]
endLoc.i <- detectResult.i[endInd.i, "POSITION"]
width.i <- endLoc.i - startLoc.i + 1
regionSummary.i <- data.frame(CHROMOSOME=detectResult.i[startInd.i, 'CHROMOSOME'],
startLocation=startLoc.i, endLocation=endLoc.i, width=width.i, startInd=startInd.i,
endInd=endInd.i)
}
return(regionSummary.i)
})
sigDMRInfo <- do.call('rbind', dmr.list)
if (length(sigDMRInfo) == 0) return(GRanges())
## convert as GRanges
tmp <- with(sigDMRInfo, GRanges(seqnames=CHROMOSOME, ranges=IRanges(start=startLocation, end=endLocation), strand='*'))
rmInd <- which(colnames(sigDMRInfo) %in% c('CHROMOSOME', 'startLocation', 'endLocation', 'width'))
values(tmp) <- sigDMRInfo[,-rmInd, drop=FALSE]
sigDMRInfo <- tmp
## combine overlapping regions
sigDMRInfo.r <- reduce(sigDMRInfo, min.gapwidth=minGap)
matchInfo <- GenomicRanges::findOverlaps(detectResult.g, sigDMRInfo.r, select="first")
ind <- which(!is.na(matchInfo))
dmr2ind <- split(ind, matchInfo[ind])
dmr2len <- sapply(dmr2ind, length)
dmrInfo <- data.frame(NumOfProbe=dmr2len)
## calculate the mean score of each region
if (!is.null(scoreColumns)) {
if (!all(sapply(scoreFuns, is.function))) {
if (!all(names(scoreFuns) %in% scoreColumns))
stop('The names of scoreFuns list should be from scoreColumns!')
}
detectValue <- as.matrix(as(values(detectResult.g), 'data.frame'))[,scoreColumns, drop=FALSE]
detectValue <- matrix(as.numeric(detectValue), nrow=nrow(detectValue), ncol=ncol(detectValue), dimnames=dimnames(detectValue))
dmr2score <- lapply(dmr2ind, function(ind.i) {
value.i <- NULL
## If scoreFuns is a named list, then the named scoreColumns will be treated differently
if (all(sapply(scoreFuns, is.function))) {
for (fun.i in scoreFuns) {
value.i <- cbind(value.i, apply(detectValue[ind.i,,drop=FALSE], 2, fun.i))
}
} else {
for (scoreCol.j in scoreColumns) {
scoreFuns.j <- scoreFuns[[scoreCol.j]]
if (is.null(scoreFuns.j)) {
value.ij <- mean(detectValue[ind.i, scoreCol.j])
if (!grepl('^mean', scoreCol.j)) {
names(value.ij) <- paste('mean', scoreCol.j, sep='_')
} else {
names(value.ij) <- scoreCol.j
}
} else {
value.ij <- sapply(scoreFuns.j, function(x) x(detectValue[ind.i, scoreCol.j]))
names(value.ij) <- paste(names(scoreFuns.j), scoreCol.j, sep='_')
}
value.i <- c(value.i, value.ij)
}
}
return(value.i)
})
dmr2score <- do.call('rbind', dmr2score)
rownames(dmr2score) <- names(dmr2ind)
avg.score <- matrix(NA, nrow=length(sigDMRInfo.r), ncol=ncol(dmr2score))
if (is.null(colnames(dmr2score))) {
scoreFunName <- names(scoreFuns)
if (is.null(scoreFunName)) scoreFunName <- paste('Fun', 1:length(scoreFuns), sep='')
colnames(avg.score) <- unlist(lapply(scoreFunName, paste, scoreColumns, sep='_'))
} else {
colnames(avg.score) <- colnames(dmr2score)
}
avg.score[as.numeric(rownames(dmr2score)),] <- dmr2score
dmrInfo <- data.frame(dmrInfo, as.data.frame(avg.score))
}
values(sigDMRInfo.r) <- dmrInfo
return(sigDMRInfo.r)
}
annotateGRanges <- function(grange, annotationDatabase, CpGInfo=NULL, exons=FALSE, flankRange=0, promoterRange=2000, checkGeneBody=FALSE, EntrezDB='org.Hs.eg.db') {
if (!is(grange, 'GRanges')) {
stop('grange should be a GRanges object!')
}
grange <- checkChrName(grange, addChr=TRUE)
if (!is.null(names(grange))) {
if (any(duplicated(names(grange)))) names(grange) <- NULL
}
## load human genome information and check overlap with known genes
if (is.character(annotationDatabase)) {
if (file.exists(annotationDatabase)) {
annotationDatabase <- loadDb(annotationDatabase)
} else if (require(annotationDatabase, character.only=TRUE)) {
annotationDatabase <- get(annotationDatabase)
} else {
stop('Provided annotationDatabase does not exist!')
}
}
if (is(annotationDatabase, 'TxDb')) {
tr <- transcripts(annotationDatabase, columns=c('gene_id', 'tx_id', 'tx_name'))
} else if (is(annotationDatabase, 'GRanges')) {
tr <- annotationDatabase
} else {
stop('Wrong type of annotationDatabase! Please check help for more details.')
}
tr <- checkChrName(tr, addChr=TRUE)
## make sure the consistence of the genome
if (unique(genome(grange)) == '' || is.na(unique(genome(grange)))) genome(grange) <- unique(genome(tr))[1]
if (is.logical(exons)) {
if (exons && is(annotationDatabase, 'TxDb')) {
exons <- exons(annotationDatabase, columns=c('exon_id', 'exon_name', 'exon_rank', 'tx_name'))
} else {
exons <- NULL
}
} else if (!is(exons, 'GRanges')) {
stop('"exons" should be a logical or GRanges object!')
}
## filter the transcripts without matching gene ids
tr <- tr[elementNROWS(values(tr)$gene_id) > 0]
names(tr) <- values(tr)$tx_name
tr <- checkChrName(tr, addChr=TRUE)
tx2gene <- sub('GeneID:', '', unlist(values(tr)$gene_id))
names(tx2gene) <- unlist(values(tr)$tx_name)
if (is.character(CpGInfo)) {
CpG.grange <- import.bed(CpGInfo[1])
} else if (is(CpGInfo, 'GRanges')) {
CpG.grange <- CpGInfo
} else if (is.null(CpGInfo) || is.na(CpGInfo)) {
CpG.grange <- NULL
} else {
stop('CpGInfo should be a bed file or a GRanges object!')
}
if (!is.null(CpGInfo)) CpGInfo <- checkChrName(CpGInfo, addChr=TRUE)
## expand the interested GRanges
if (flankRange != 0) {
grange.ext <- resize(grange, width=width(grange) + 2*flankRange, fix="center")
} else {
grange.ext <- grange
}
if (is.null(names(grange.ext))) {
values(grange.ext)$id <- make.unique(paste(seqnames(grange), start(grange), end(grange), sep="_"))
} else {
values(grange.ext)$id <- names(grange.ext)
}
if (!is.null(values(grange)$Transcript)) {
txName.known <- values(grange)$Transcript
txName.known[!(txName.known %in% names(tr))] <- NA
} else {
txName.known <- rep(NA, length(grange))
}
## ----------------------------------------------------------
## Find the nearest TSS and related Transcript
tss <- flank(tr, width=-1, start=TRUE)
if (any(is.na(txName.known))) {
naInd <- which(is.na(txName.known))
# nearestInfo.tr <- IRanges::as.matrix(nearest(grange[naInd], tr, select="all"))
#
# nearestTx <- tapply(nearestInfo.tr[,2], nearestInfo.tr[,1], function(ind) names(tr)[ind])
# multiMapInd <- which(sapply(nearestTx, length) > 1)
# if (length(multiMapInd) > 0) {
# multiMapInd <- as.numeric(names(nearestTx)[multiMapInd])
# for (i in multiMapInd) {
# nearestTx.i <- nearestTx[[i]]
# nearestTx[[i]] <- nearestTx.i[nearest(grange[naInd][i], tss[nearestTx.i])]
# }
# }
# txName.known[naInd] <- unlist(nearestTx)
naIndGrange = grange[naInd]
nearestInfo.tr <- IRanges::as.matrix(nearest(naIndGrange, tr, select="all"))
naIndGrange.expanded = naIndGrange[ nearestInfo.tr[,"queryHits"] ]
stopifnot( identical(names(tss), names(tr) ) )
tss.expanded = tss[ nearestInfo.tr[,"subjectHits"] ]
nearestInfo.tr = cbind( nearestInfo.tr, TSSDist=abs( start(naIndGrange.expanded) - start(tss.expanded) ) )
nearestInfo.tr = nearestInfo.tr[ order(nearestInfo.tr[,"queryHits"], nearestInfo.tr[,"TSSDist"]),]
nearestInfo.tr = nearestInfo.tr[ !duplicated(nearestInfo.tr[,"queryHits"]),]
txName.known[naInd] <- names(tss)[ nearestInfo.tr[,"subjectHits"] ]
## Suggested modification from Peter Haverty in Sept. 2015
values(grange)$Transcript <- txName.known
}
## add related EntrezID and GeneSymbol
if (is.null(values(grange)$EntrezID)) {
values(grange)$EntrezID <- tx2gene[values(grange)$Transcript]
} else {
values(grange)$EntrezID <- sub('^GeneID:', '', values(grange)$EntrezID)
}
if (is.null(values(grange)$GeneSymbol))
values(grange)$GeneSymbol <- probe2gene(values(grange)$EntrezID, lib=EntrezDB, collapse=TRUE)
## distance to TSS
start2tss <- start(grange) - start(tss[txName.known])
end2tss <- end(grange) - end(tss[txName.known])
dist2tss <- pmin(abs(start2tss), abs(end2tss)) * sign(start2tss)
dist2tss[sign(start2tss) * sign(end2tss) < 0] <- 0
dist2tss[as.vector(strand(tss[txName.known])) == '-'] <- dist2tss[as.vector(strand(tss[txName.known])) == '-'] * -1
values(grange)$distance2TSS <- dist2tss
## nearest transcript based on TSS
nearestInd.tss <- nearest(grange, tss)
values(grange)$nearestTx <- values(tss[nearestInd.tss])$tx_name
suppressWarnings(dist2tss.nearestTx <- as.data.frame(distanceToNearest(grange, tss[nearestInd.tss]))[['distance']])
## mark promoter based on distance to TSS
promoterStatus <- rep(FALSE, length(dist2tss.nearestTx))
promoterStatus[dist2tss <= 0 & abs(dist2tss) <= promoterRange] <- TRUE
values(grange)$PROMOTER <- promoterStatus
## ----------------------------------------------------------
## find the overlapping with gene body
if (checkGeneBody) {
OL.gene <- findOverlaps(grange.ext, tr)
dmr2gene <- cbind(values(grange.ext)$id[queryHits(OL.gene)],
sapply(values(tr)$gene_id[subjectHits(OL.gene)], paste, collapse=';'))
dupInd <- duplicated(paste(dmr2gene[,1], dmr2gene[,2], sep='_'))
if (any(dupInd)) dmr2gene <- dmr2gene[!dupInd,,drop=FALSE]
dmr2gene.list <- split(dmr2gene[,2], as.factor(dmr2gene[,1]))
dmr2gene <- sapply(dmr2gene.list, function(x) {
x <- sub('GeneID:', '', unique(x), ignore.case=TRUE)
paste(probe2gene(x, lib=EntrezDB, collapse=TRUE), collapse=';')
})
dmr2ll <- sapply(dmr2gene.list, function(x) paste(unique(x), collapse=';'))
## map to Tx_name
dmr2tx <- cbind(values(grange.ext)$id[queryHits(OL.gene)],
sapply(values(tr)$tx_name[subjectHits(OL.gene)], paste, collapse=';'))
dupInd <- duplicated(paste(dmr2tx[,1], dmr2tx[,2], sep='_'))
if (any(dupInd)) dmr2tx <- dmr2tx[!dupInd,,drop=FALSE]
dmr2tx.list <- split(dmr2tx[,2], as.factor(dmr2tx[,1]))
dmr2tx <- sapply(dmr2tx.list, paste, collapse=';')
mapInfo <- matrix("", nrow=length(grange.ext), ncol=3)
colnames(mapInfo) <- c("EntrezID_GeneBody", "GeneSymbol_GeneBody", "Tx_GeneBody")
rownames(mapInfo) <- values(grange.ext)$id
mapInfo[names(dmr2ll), 'EntrezID_GeneBody'] <- dmr2ll
mapInfo[names(dmr2gene), 'GeneSymbol_GeneBody'] <- dmr2gene
mapInfo[names(dmr2tx), 'Tx_GeneBody'] <- dmr2tx
values(grange) <- data.frame(as(values(grange), 'data.frame'), mapInfo)
}
## ----------------------------------------------------------
## find the overlapping with exons
if (is(exons, 'GRanges')) {
exons <- checkChrName(exons, addChr=TRUE)
OL.exons <- findOverlaps(grange.ext, exons)
hitExons <- exons[subjectHits(OL.exons)]
if (length(hitExons) > 0) {
## combine tx_name and exon_rank to get a new exon name
if (all(is.na(values(hitExons)$exon_name))) {
ll <- sapply(as.list(values(hitExons)$tx_name), length)
exon_id <- values(hitExons)$exon_id
exon_id <- rep(exon_id, ll)
tx_name <- unlist(values(hitExons)$tx_name)
exon_rank <- unlist(values(hitExons)$exon_rank)
exon_name <- paste(tx_name, exon_rank, sep='_')
exon_name <- split(exon_name, exon_id)
values(hitExons)$exon_name <- sapply(exon_name, paste, collapse=',')
}
dmr2exons <- cbind(values(grange.ext)$id[queryHits(OL.exons)],
values(hitExons)$exon_name)
dupInd <- duplicated(paste(dmr2exons[,1], dmr2exons[,2], sep='_'))
if (any(dupInd)) {
dmr2exons <- dmr2exons[!dupInd,,drop=FALSE]
hitExons <- hitExons[!dupInd]
}
dmr2exons.list <- split(dmr2exons[,2], as.factor(dmr2exons[,1]))
dmr2exons.flat <- sapply(dmr2exons.list, function(x) paste(unique(x), collapse=';'))
uniId <- unique(values(grange.ext)$id)
mapInfo <- matrix("", nrow=length(uniId), ncol=2)
colnames(mapInfo) <- c("Exons", "Exon1")
rownames(mapInfo) <- uniId
mapInfo[names(dmr2exons.flat), 'Exons'] <- dmr2exons.flat
exon1_status <- sapply(as.list(values(hitExons)$exon_rank), function(x) any(x== 1))
if (any(exon1_status)) {
id_exon1 <- dmr2exons[exon1_status,1]
tx_exon1 <- sapply(as.list(values(hitExons)$tx_name[exon1_status]), head, 1)
mapInfo[id_exon1, 'Exon1'] <- tx_exon1
}
values(grange) <- suppressWarnings(data.frame(as(values(grange), 'data.frame'), mapInfo[values(grange.ext)$id,]))
}
}
## ----------------------------------------------------------
## check overlap with CpG islands
if (!is.null(CpGInfo)) {
OL.CpG <- findOverlaps(grange.ext, CpG.grange)
dmr2CpG <- cbind(values(grange.ext)$id[queryHits(OL.CpG)],
unlist(values(CpG.grange)$name)[subjectHits(OL.CpG)])
mapInfo <- rep('', length(grange.ext))
names(mapInfo) <- values(grange.ext)$id
mapInfo[dmr2CpG[,1]] <- dmr2CpG[,2]
values(grange) <- data.frame(as(values(grange), 'data.frame'), CpG_island=mapInfo)
}
return(grange)
}
## input of annotateDMR is the output of identifySigDMR
annotateDMRInfo <- function(DMRInfo, annotationDatabase, CpGInfo=NULL, flankRange=500, promoterRange=2000, EntrezDB='org.Hs.eg.db', as.GRanges=TRUE) {
if (all(c('sigDMRInfo', 'sigDataInfo') %in% names(DMRInfo))) {
sigDMRInfo <- DMRInfo$sigDMRInfo
sigDataInfo <- DMRInfo$sigDataInfo
} else if (is(DMRInfo, 'GRanges')) {
sigDMRInfo <- DMRInfo
sigDataInfo <- NULL
} else {
stop('DMRInfo should be a GRanges object or a list of GRanges objects (sigDMRInfo and sigDataInfo)!')
}
## load human genome information and check overlap with known genes
if (is.character(annotationDatabase)) {
if (file.exists(annotationDatabase)) {
annotationDatabase <- loadDb(annotationDatabase)
} else if (require(annotationDatabase, character.only=TRUE)) {
annotationDatabase <- get(annotationDatabase)
} else {
stop('Provided annotationDatabase does not exist!')
}
}
if (is(annotationDatabase, 'TxDb')) {
## UCSC only includes reviewed genes!!!
tr <- transcripts(annotationDatabase, columns=c('gene_id', 'tx_id', 'tx_name'))
} else if (is(annotationDatabase, 'GRanges')) {
tr <- annotationDatabase
} else {
stop('Wrong type of annotationDatabase! Please check help for more details.')
}
## filter the transcripts without matching gene ids
tr <- tr[elementNROWS(values(tr)$gene_id) > 0]
names(tr) <- values(tr)$tx_id
if (is.character(CpGInfo)) {
CpG.grange <- import.bed(CpGInfo[1])
} else if (is(CpGInfo, 'GRanges')) {
CpG.grange <- CpGInfo
} else if (is.null(CpGInfo) || is.na(CpGInfo)) {
CpG.grange <- NULL
} else {
stop('CpGInfo should be a bed file or a GRanges object!')
}
## expand the transcript region
## create a GRange object and check overlap
if (!is.null(sigDMRInfo)) {
sigDMRInfo <- annotateGRanges(sigDMRInfo, annotationDatabase=tr, CpGInfo=CpG.grange, flankRange=flankRange, promoterRange=promoterRange, EntrezDB=EntrezDB)
}
if (!is.null(sigDataInfo)) {
sigDataInfo <- annotateGRanges(sigDataInfo, annotationDatabase=tr, CpGInfo=CpG.grange, flankRange=flankRange, promoterRange=promoterRange, EntrezDB=EntrezDB)
}
return(list(sigDMRInfo=sigDMRInfo, sigDataInfo=sigDataInfo))
}
export.DMRInfo <- function(DMRInfo.ann, methyData=NULL, savePrefix='') {
## output the DMR data
sigDataInfo <- as.data.frame(DMRInfo.ann$sigDataInfo)
sigProbe <- as.character(sigDataInfo$PROBEID)
if (!is.null(methyData)) {
sigDMRdataInfo <- cbind(sigDataInfo, exprs(methyData)[sigProbe, , drop=FALSE])
} else {
sigDMRdataInfo <- sigDataInfo
}
ord <- order(sigDMRdataInfo$seqnames, sigDMRdataInfo$start, sigDMRdataInfo$end, decreasing=FALSE)
sigDMRdataInfo <- sigDMRdataInfo[ord,]
# if (all(c('startWinIndex', 'endWinIndex') %in% colnames(sigDMRdataInfo))) {
# numProbe <- sigDMRdataInfo[,'endWinIndex'] - sigDMRdataInfo[,'startWinIndex'] + 1
# ind.start <- which(colnames(sigDMRdataInfo) == 'startWinIndex')
# sigDMRdataInfo[,ind.start] <- numProbe
# colnames(sigDMRdataInfo)[ind.start] <- 'numberOfProbe_SlidingWindow'
# sigDMRdataInfo <- sigDMRdataInfo[,-which(colnames(sigDMRdataInfo) == 'endWinIndex')]
# }
sigDMRdataInfo <- data.frame(CHROMOSOME=sigDMRdataInfo$seqnames, POSITION=sigDMRdataInfo$start, sigDMRdataInfo[,-c(1:5)])
## remove columns
rmCol <- c('isSignificant', 'p.value.adj', 'startLocation', 'endLocation')
sigDMRdataInfo <- sigDMRdataInfo[, !(colnames(sigDMRdataInfo) %in% rmCol)]
write.csv(sigDMRdataInfo, file=paste('DMRdata_', savePrefix, '_', Sys.Date(), '.csv', sep=''), row.names=FALSE)
## output the sigDMRInfo as a bed file
fileName.i <- make.names(paste('DMRInfo_', savePrefix, '_', Sys.Date(), '.bed', sep=''))
export(DMRInfo.ann$sigDMRInfo, fileName.i, format='bed')
## output as a csv file
sigDMRInfo <- as.data.frame(DMRInfo.ann$sigDMRInfo)
ord <- order(sigDMRInfo$seqnames, sigDMRInfo$start, sigDMRInfo$end, decreasing=FALSE)
sigDMRInfo <- sigDMRInfo[ord,]
# if (all(c('startInd', 'endInd') %in% colnames(sigDMRInfo))) {
# numProbe <- sigDMRInfo[,'endInd'] - sigDMRInfo[,'startInd'] + 1
# ind.start <- which(colnames(sigDMRInfo) == 'startInd')
# sigDMRInfo[,ind.start] <- numProbe
# colnames(sigDMRInfo)[ind.start] <- 'numberOfProbe_SlidingWindow'
# sigDMRInfo <- sigDMRInfo[,-which(colnames(sigDMRInfo) == 'endInd')]
# }
sigDMRInfo <- data.frame(CHROMOSOME=sigDMRInfo$seqnames, sigDMRInfo[,-1])
## remove columns
rmCol <- c('width', 'strand', 'isSignificant', 'p.value.adj')
sigDMRInfo <- sigDMRInfo[, !(colnames(sigDMRInfo) %in% rmCol)]
write.csv(sigDMRInfo, file=paste('DMRInfo_', savePrefix, '_', Sys.Date(), '.csv', sep=''), row.names=FALSE)
return(invisible(TRUE))
}
## -----------------------------------------------------
## other internal functions
# detectResult is the return of detectDMR
.identifySigProbe <- function(detectResult, p.adjust.method="fdr", pValueTh=0.01, fdrTh=pValueTh, diffTh=log2(1.5)) {
if (!is.numeric(pValueTh) && !is.na(pValueTh)) stop("pValueTh needs to be numeric. Use 1 or NA if you want to turn it off.")
if (!is.numeric(diffTh) && !is.na(pValueTh)) stop("diffTh needs to be numeric. Use 0 or NA if you want to turn it off.")
if (is.na(pValueTh)) pValueTh <- 1
if (is.na(diffTh)) diffTh <- 0
annotateColumn <- c("PROBEID","CHROMOSOME","POSITION")
if (!all(annotateColumn %in% names(detectResult)))
stop(" PROBEID, CHROMOSOME and POSITION are required columns!")
annotateInd <- which(names(detectResult) %in% annotateColumn)
# multiple testing correction
p.val.cols <- grep("^p\\.",colnames(detectResult))
p.val.adj.cols <- grep("p\\.adjust", colnames(detectResult))
p.val.cols <- p.val.cols[!(p.val.cols %in% p.val.adj.cols)]
if (length(p.val.cols) > 1) {
p.value <- apply(detectResult[,p.val.cols], 1, min, na.rm=TRUE)
} else {
p.value <- detectResult[,p.val.cols]
}
p.value.adj <- p.adjust(p.value, method=p.adjust.method)
if ('difference' %in% colnames(detectResult)) {
max.diff <- detectResult[,'difference']
} else {
differences <- detectResult[, -c(annotateInd, p.val.cols, p.val.adj.cols, which(colnames(detectResult) %in% c("startWinIndex", "endWinIndex"))), drop=FALSE]
max.diff <- rowMax(abs(as.matrix(differences)))
max.diff <- max.diff * sign(differences[,1])
}
isSignificant <- rep(TRUE, length(max.diff))
if (pValueTh < 1) isSignificant <- isSignificant & (p.value < pValueTh)
if (p.adjust.method != 'none' && fdrTh < 1) isSignificant <- isSignificant & (p.value.adj < fdrTh)
if (diffTh > 0) isSignificant <- isSignificant & (abs(max.diff) > diffTh)
detectResult <- data.frame(detectResult, p.value.adj=p.value.adj, difference=max.diff, isSignificant=isSignificant)
return(detectResult)
}
# This helper function sets up the positions for the sliding test based on the positions.
# Returns a list of index positions that fit within the sliding window with width winSize
.setupSlidingTests <- function(pos_data, winSize=250) {
len <- length(pos_data)
probesetList <- as.vector(seq(pos_data), mode='list')
diff.pos.left <- c(Inf, diff(pos_data))
growingHeight.left <- diff.pos.left
growingStatus.left <- which(growingHeight.left <= winSize)
diff.pos.right <- c(diff.pos.left[-1], Inf)
growingHeight.right <- diff.pos.right
growingStatus.right <- which(growingHeight.right <= winSize)
iter <- 1
## growing in both left and right directions until it reaches the half-window-size winSize
while (length(c(growingStatus.left, growingStatus.right)) > 0) {
growingHeight.left[-(1:iter)] <- growingHeight.left[-(1:iter)] + diff.pos.left[1:(len - iter)]
growingHeight.right[1:(len - iter)] <- growingHeight.right[1:(len - iter)] + diff.pos.right[-(1:iter)]
## in left direction
if (length(growingStatus.left) > 0) {
probesetList[growingStatus.left] <- lapply(1:length(growingStatus.left), function(i)
c(growingStatus.left[i] - iter, probesetList[[growingStatus.left[i]]]))
}
## in right direction
if (length(growingStatus.right) > 0) {
probesetList[growingStatus.right] <- lapply(1:length(growingStatus.right), function(i) c(probesetList[[growingStatus.right[i]]], growingStatus.right[i] + iter))
}
growingStatus.left <- which(growingHeight.left <= winSize)
growingStatus.right <- which(growingHeight.right <= winSize)
iter <- iter + 1
}
return(probesetList)
}
probe2gene <- function(probe, lib="org.Hs.eg.db", simplify=TRUE, collapse=FALSE) {
if (length(probe) == 0) return(NULL)
if (!require(lib, character.only=TRUE)) stop(paste(lib, 'should be installed first!'))
if (length(grep('.eg.db', lib)) > 0) {
gene <- getEntrezAnnotation(probe, lib, from='eg', to='symbol')
if (collapse) {
gene <- sapply(gene, paste, collapse=';')
}
} else {
llib <- paste(gsub('\\.db$', '', lib), 'SYMBOL', sep='')
mapWithAllProbes <- do.call('toggleProbes', list(x=as.name(llib), value="all"))
gene <- AnnotationDbi::mget(probe, mapWithAllProbes, ifnotfound=NA)
if (collapse) {
gene <- sapply(gene, paste, collapse=';')
} else {
if (simplify) gene <- sapply(gene, function(x) x[1])
}
}
return(gene)
}
getEntrezAnnotation <- function(ll=NULL, lib=NULL, from='eg', to='symbol', species=c('human', 'rat', 'mouse', 'yeast', 'fly'), collapse=FALSE) {
from <- toupper(from); to <- toupper(to)
# from: 'eg', 'accnum', 'alias', 'chr', chrlengths', 'chrloc', 'enzyme', 'genename', 'go', 'map', 'mapcounts',
# 'path', 'pfam', 'pmid', 'prosite', 'refseq', 'symbol', 'unigene',
species <- match.arg(species)
if (is.null(lib)) {
lib = switch(species,
'rat'='org.Rn.eg.db',
'human'='org.Hs.eg.db',
'mouse'='org.Mm.eg.db',
'yeast'='org.Sc.eg.db',
'fly'='org.Dm.eg.db'
)
}
if (!require(lib, character.only=TRUE)) {
stop(paste('Package', lib, 'cannot be loaded!'))
}
ll.input <- ll
na.ind <- which(is.na(ll) | ll == "")
if (length(na.ind) > 0) {
ll <- ll[-na.ind]
}
if (length(to) > 1) {
allMapping <- NULL
for (to.i in to) {
mapping.i <- getEntrezAnnotation(ll, lib, from, to.i, species, collapse=TRUE)
allMapping <- cbind(allMapping, mapping.i)
}
colnames(allMapping) <- to
rownames(allMapping) <- ll
return(allMapping)
}
libName <- sub('.db', '', lib)
if (from == 'EG') {
map <- paste(libName, to, sep='')
} else {
map <- paste(libName, from, '2', to, sep='')
if (!exists(map)) {
if (to == 'EG') {
map <- paste(libName, from, sep='')
}
if (!exists(map)) stop(paste('Cannot find', map, 'in', lib, '!'))
}
}
##
if (!is.null(ll)) {
map <- AnnotationDbi::mget(ll, get(map), ifnotfound=NA)
} else {
map <- as.list(get(map))
}
if (to == 'EG' & !exists(paste(libName, from, '2', to, sep=''))) {
len <- sapply(map, length)
map <- tapply(rep(names(map), len), unlist(map), function(x) unique(x))
}
if (length(na.ind) > 0) {
map.out <- rep(NA, length(ll.input))
map.out[-na.ind] <- map
} else {
map.out <- map
}
if (collapse) {
map.out <- sapply(map.out, paste, collapse=';')
}
attr(map.out, 'lib') <- lib
return(map.out)
}
## estimate the averaged methylation level within a CMR (GRanges) or a provided gene (transcript) elements based on methylation probe annotation
# methyProfile <- estimateCMR.methylation('NM_145201', methyGenoSet, estimateFun=mean, probeAnnotation=methyGrange, selectGeneElement=c('exon1', 'upstream500'))
# plot(m2beta(methyProfile), exprs(selExp)['NM_145201',], ylab='RNA-Seq expression (log2 count)', xlab='DNA methylation (Beta value)', col=rgb(1,0,0, alpha=0.35), pch=19)
estimateCMR.methylation <- function(cmr, methyGenoSet, estimateFun=mean, probeAnnotation=NULL, selectGeneElement=c('exon1', 'promoter'), mc.cores=min(12, detectCores())) {
if (!is(methyGenoSet, 'GenoSet')) stop('"methyGenoSet" should be a "GenoSet" object!')
if (length(cmr) > 1) {
methy.cmr <- mclapply(1:length(cmr), function(i) {
estimateCMR.methylation(cmr[i], methyGenoSet, estimateFun=estimateFun, probeAnnotation=probeAnnotation, selectGeneElement=selectGeneElement)
}, mc.cores=mc.cores)
methy.cmr <- do.call('rbind', methy.cmr)
rownames(methy.cmr) <- names(cmr)
return(methy.cmr)
}
if (is(cmr, 'GRanges')) {
ol <- as.matrix(findOverlaps(cmr, rowRanges(methyGenoSet)))
selProbe <- unique(featureNames(methyGenoSet)[ol[,2]])
} else if (is.character(cmr)) {
## when cmr is a gene Entrez ID
if (is.null(probeAnnotation)) {
probeAnnotation <- mcols(rowRanges(methyGenoSet))
} else if (is(probeAnnotation, 'GRanges')) {
allProbe <- names(probeAnnotation)
probeAnnotation <- values(probeAnnotation)
if (!is.null(allProbe))
rownames(probeAnnotation) <- allProbe
}
selectGeneElement <- tolower(selectGeneElement)
names(probeAnnotation) <- tolower(names(probeAnnotation))
selectGeneElement <- intersect(selectGeneElement, names(probeAnnotation))
if (length(selectGeneElement) == 0) stop('No "selectGeneElement" is provided!')
allProbe <- intersect(rownames(probeAnnotation), featureNames(methyGenoSet))
probeAnnotation <- as.matrix(probeAnnotation[allProbe,,drop=FALSE])
methyGenoSet <- methyGenoSet[allProbe,]
selProbe <- allProbe[apply(probeAnnotation[,selectGeneElement,drop=FALSE], 1, function(x) {
return(cmr %in% unlist(strsplit(x, ';')))
})]
}
methyProfile <- apply(assays(methyGenoSet)$exprs[selProbe,,drop=FALSE], 2, estimateFun)
attr(methyProfile, 'selProbe') <- selProbe
return(methyProfile)
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.