## NOTE(review): a stray bare token `Nothing` stood on the first line here;
## evaluating an undefined symbol at the top level errors when this file is
## sourced, so it has been removed.
# oligo functions we'd like to import once they're exported
#oligoBigObjectSupport <- oligo:::oligoBigObjectSupport
#oligoBigObjectPath <- oligo:::oligoBigObjectPath
bgindex <- oligo:::bgindex # unexported method from oligo, re-bound locally
## Register the S3 class "ff_matrix" so it can be used in S4 method signatures
setOldClass("ff_matrix")
## Subset the rows/columns of an ff_matrix but return the result as a NEW
## ff_matrix (plain `[` indexing would pull everything into an in-memory
## matrix, defeating the point of the ff backing).
## ffm     : ff_matrix to subset
## rows    : numeric row indices to keep (default: all rows)
## columns : numeric column indices to keep (default: all columns)
## Returns a freshly allocated double ff matrix with dimnames carried over.
asff <- function(ffm, rows=1:nrow(ffm), columns=1:ncol(ffm)) {
  stopifnot(is.numeric(rows), is.numeric(columns))
  ret2 <- ff(vmode="double", dim=c(length(rows), length(columns)))
  ## Copy one column at a time to keep peak RAM usage low;
  ## seq_along() is safe for zero-length `columns` (1:0 would not be).
  for (cm in seq_along(columns)) ret2[, cm] <- ffm[rows, columns[cm]]
  colnames(ret2) <- colnames(ffm)[columns]
  rownames(ret2) <- rownames(ffm)[rows]
  ret2
}
#######################
## oApply and nodeFn ##
#######################
## oApply(object1, [object2], copy, fn, ...) applies fn to blocks of columns
## of one or two parallel intensity matrices, farming the work out across
## cluster nodes; nodeFn is the per-node worker, dispatched on the matrix
## class (in-memory "matrix" vs disk-backed "ff_matrix").
setGeneric("oApply", function(object1, ...) standardGeneric("oApply"))
setGeneric("nodeFn", function(cols, object1, ...) standardGeneric("nodeFn"))
## oApply for in-memory matrices: split the columns across nodes, run fn on
## each node's share via nodeFn, and cbind the per-node results back together.
## With a second matrix object2, fn is expected to return a two-element list
## and a list of the two re-assembled matrices is returned.
## `copy` is accepted for interface compatibility with the ff_matrix method
## but is not used here (plain matrices are copied on modification anyway).
setMethod("oApply", "matrix",
function(object1, object2, copy=TRUE, fn, ...) {
samplesByNode <- splitIndicesByNode(1:ncol(object1))
if (missing(object2)) {
## single-matrix case: each node returns its processed columns
ret <- ocLapply(samplesByNode, nodeFn, object1=object1,
fn=fn, ...)
return(do.call("cbind", ret))
} else {
## two-matrix case: each node returns list(part1, part2)
ret <- ocLapply(samplesByNode, nodeFn,
object1=object1, object2=object2, fn=fn, ...)
c1 <- do.call("cbind", lapply(ret, "[[", 1))
c2 <- do.call("cbind", lapply(ret, "[[", 2))
return(list(c1, c2))
}
})
## oApply for ff-backed matrices: the workers write into the (shared) ff
## object in place, so with copy=TRUE we clone() first and hand the clones
## to the nodes; with copy=FALSE the caller's objects are modified directly.
## Requires large-dataset support to be enabled (ldStatus()).
setMethod("oApply", "ff_matrix",
function (object1, object2, copy=TRUE, fn, ...) {
stopifnot(ldStatus())
if (copy) {
open(object1)
out1 <- clone(object1)
if (!missing(object2)) {
open(object2)
out2 <- clone(object2)
}
} else {
out1 <- object1
if (!missing(object2))
out2 <- object2
}
samplesByNode <- splitIndicesByNode(1:ncol(out1))
if (missing(object2)) {
## workers mutate out1 in place; their return values are discarded
ocLapply(samplesByNode, nodeFn, object1=out1, fn=fn, ...)
return(out1)
} else {
ocLapply(samplesByNode, nodeFn, object1=out1, object2=out2, fn=fn, ...)
return(list(out1, out2))
}
})
## In these nodeFn methods the rows of the objects returned by fn must correspond to the rows of object1 and object2. If the number of rows is more or less, you'll get an error. If the number is the same but the order is wrong, then you won't (even worse). Before the correction to getM for ff_matrix objects below, an error was returned because the returned object was only a subset of the rows.
## Per-node worker for in-memory matrices. The node first subsets its own
## columns (cols), then processes them in chunks of ocSamples() columns at a
## time; theCols therefore indexes into the SUBSET, which is why the indices
## 1:length(cols) (not cols itself) are split here. Returns the processed
## sub-matrix (or a list of two when object2 is supplied).
setMethod("nodeFn", c("integer", "matrix"),
function (cols, object1, object2, fn, ...) {
if (length(cols) > 0) {
grpCols <- splitIndicesByLength(1:length(cols), ocSamples())
if (missing(object2)) {
object1 <- object1[,cols,drop=FALSE]
for (theCols in grpCols) {
object1[, theCols] <- fn(object1[, theCols, drop = FALSE], ...)
}
return(object1)
} else {
object1 <- object1[,cols,drop=FALSE]
object2 <- object2[,cols,drop=FALSE]
for (theCols in grpCols) {
ret <- fn(object1[, theCols, drop = FALSE],
object2[, theCols, drop = FALSE], ...)
object1[, theCols] <- ret[[1]]
## fn may return NULL as the second element to signal "unchanged"
if (!is.null(ret[[2]])) object2[, theCols] <- ret[[2]]
}
return(list(object1, object2))
}
}
})
## Per-node worker for ff-backed matrices. Unlike the matrix method, the ff
## object is shared and written in place at the ORIGINAL column positions,
## so `cols` themselves (not 1:length(cols)) are split into chunks.
## Returns TRUE; the caller reads the results out of the mutated ff object.
setMethod("nodeFn", c("integer", "ff_matrix"),
function (cols, object1, object2, fn, ...) {
if (length(cols) > 0) {
grpCols <- splitIndicesByLength(cols, ocSamples())
open(object1)
if (missing(object2)) {
for (theCols in grpCols) {
object1[, theCols] <- fn(object1[, theCols, drop = FALSE], ...)
}
} else {
open(object2)
for (theCols in grpCols) {
ret <- fn(object1[, theCols, drop = FALSE],
object2[, theCols, drop = FALSE], ...)
object1[, theCols] <- ret[[1]]
## fn may return NULL as the second element to signal "unchanged"
if (!is.null(ret[[2]])) object2[, theCols] <- ret[[2]]
}
close(object2)
rm(object2)
}
close(object1)
## free the node's references before returning
rm(object1, grpCols)
gc()
}
TRUE
})
##########
## getM ##
##########
## getM returns the "M value": log2(channel1) - log2(channel2).
# method for signature TilingFeatureSet
#setMethod("getM", "TilingFeatureSet",
setGeneric("getM", function(object1, ...) standardGeneric("getM"))
## getM for a TilingFeatureSet: extract the two channel matrices from the
## assayData and delegate to the matrix/ff_matrix method.
## object1 : TilingFeatureSet
## rows    : optional probe (row) indices; defaults to every row.
setMethod("getM", "TilingFeatureSet",
function(object1, rows) {
  c1 <- assayDataElement(object1, "channel1")
  c2 <- assayDataElement(object1, "channel2")
  ## seq_len() is safe for zero-row objects, unlike 1:nrow(c1)
  if (missing(rows)) rows <- seq_len(nrow(c1))
  getM(c1, c2, rows)
})
## getM for plain matrices: M = log2(channel1) - log2(channel2),
## restricted to the requested rows.
setMethod("getM", "matrix",
function(object1, object2, rows) {
  logc1 <- log2(object1[rows, ])
  logc2 <- log2(object2[rows, ])
  logc1 - logc2
})
#setMethod("getM", "ff_matrix",
# function(object1, object2, rows) {
# ret <- oApply(object1=object1, object2=object2, copy=TRUE,
# fn=function (object1, object2, rows)
# list(log2(object1[rows,])-log2(object2[rows,]), NULL),
# rows=rows)
# ret[[1]]
# })
## Corrected version of getM for ff_matrix objects:
## Compute M = log2(ch1) - log2(ch2) for ALL rows on the worker nodes (the
## returned object must have rows matching object1 -- see the nodeFn note),
## then copy only the requested rows into a fresh ff matrix.
setMethod("getM", "ff_matrix",
function(object1, object2, rows) {
  ret <- oApply(object1=object1, object2=object2, copy=TRUE,
                fn=function (object1, object2)
                  list(log2(object1)-log2(object2), NULL))
  ret2 <- ff(vmode="double", dim=c(length(rows), ncol(ret[[1]])))
  ## Copy column-by-column to limit memory use; seq_len() is safe if the
  ## result somehow has zero columns.
  for (cm in seq_len(ncol(ret[[1]]))) ret2[, cm] <- ret[[1]][rows, cm]
  colnames(ret2) <- colnames(ret[[1]])
  ret2
})
## Deep-copy the assayData channels of a FeatureSet when they are ff-backed,
## so that downstream in-place modification does not clobber the caller's
## data. In-memory matrices need no explicit clone (R copies on write).
## finalizer : passed to clone(); "delete" removes the clone's backing file
##             when it is garbage collected.
cloneFeatureSet <- function(object, finalizer="delete") {
  cn <- channelNames(object)
  ## inherits() is the idiomatic class test (class() may have length > 1)
  if (inherits(assayDataElement(object, cn[1]), "ff_matrix")) {
    for (i in seq_along(cn)) {
      chan <- assayDataElement(object, cn[i])
      open(chan)
      assayDataElement(object, cn[i]) <- clone(chan, finalizer=finalizer)
    }
  }
  return(object)
}
###########
## methp ##
###########
## methp: full preprocessing pipeline for CHARM-style two-channel DNA
## methylation arrays. Steps (each optional): spatial adjustment, background
## subtraction, within-sample normalization, between-sample normalization,
## then either percentage-methylation estimates (default) or raw M values.
##
## dat                     : TilingFeatureSet of raw two-channel data
## spatial, bgSubtract     : toggle spatial / background correction
## withinSampleNorm        : method string for normalizeWithinSamples
## scale                   : scaling quantiles passed through to scaleSamples
## betweenSampleNorm       : "quantile", "sqn", "none", or a
##                           list(m=, untreated=, enriched=) specification
## controlProbes/controlIndex : control-probe specification (one is required)
## excludeIndex            : probes to exclude from diagnostic density plots
## commonMethPercentParams : forwarded to methPercent (NULL = per-method default)
## verbose                 : progress messages; >2 additionally prints gc()
## returnM                 : TRUE to return M values instead of percentages
## plotDensity             : optional PDF file name for diagnostic plots
## plotDensityGroups       : group labels for those plots
## Returns a matrix/ff_matrix of percent methylation, or of M values.
methp <- function(dat, spatial=TRUE, bgSubtract=TRUE,
                  withinSampleNorm="loess",
                  scale=c(0.99, 0.99),
                  betweenSampleNorm="quantile",
                  controlProbes=NULL, controlIndex=NULL,
                  excludeIndex=NULL,
                  commonMethPercentParams=NULL,
                  verbose=TRUE, returnM=FALSE,
                  plotDensity=NULL, plotDensityGroups=NULL) {
  ## --- validate arguments --------------------------------------------------
  if (!is.null(excludeIndex) && !inherits(excludeIndex, c("numeric","integer")))
    stop("excludeIndex argument, if not NULL, must be a numeric or integer vector.")
  if (!is.null(controlIndex) && !inherits(controlIndex, c("numeric","integer")))
    stop("controlIndex argument, if not NULL, must be a numeric or integer vector.")
  if (is.null(controlProbes) && is.null(controlIndex))
    stop("Either controlProbes or controlIndex argument must be specified.")
  ## (fixed double negative in the original message)
  if (is.null(controlIndex) && !any(controlProbes %in% getContainer(dat)))
    stop("Invalid controlProbes argument: no such values found for this array.")
  ## returnM was historically compared as a string; accept both forms
  doReturnM <- isTRUE(returnM) || identical(returnM, "TRUE")
  ## --- diagnostic plot, page 1: raw data -----------------------------------
  if (!is.null(plotDensity)) {
    ## NB: `plotDensity` the argument is a PDF file name; the calls below
    ## still reach the plotDensity() *function* because R searches for a
    ## function binding when evaluating a call.
    pdf(file=plotDensity, height=11, width=8)
    par(mfrow=c(5,2), mar=c(2,2,4,2))
    plotDensity(dat, main="1. Raw", lab=plotDensityGroups,
                controlIndex=controlIndex, controlProbes=controlProbes,
                excludeIndex=excludeIndex)
  }
  ## --- resolve between-sample normalization settings -----------------------
  if (is.list(betweenSampleNorm)) {
    bs <- betweenSampleNorm
  } else if (betweenSampleNorm=="quantile") {
    bs <- list(m="allQuantiles", untreated="none", enriched="none")
    if (is.null(commonMethPercentParams)) commonMethPercentParams <- TRUE
  } else if (betweenSampleNorm=="sqn") {
    bs <- list(m="none", untreated="complete", enriched="sqn")
    if (is.null(commonMethPercentParams)) commonMethPercentParams <- FALSE
  } else if (betweenSampleNorm=="sqn99") {
    stop("Option sqn99 is deprecated. The same behaviour is now obtained by using sqn.")
  } else if (betweenSampleNorm=="none") {
    bs <- list(m="none", untreated="none", enriched="none")
    if (is.null(commonMethPercentParams)) commonMethPercentParams <- FALSE
  } else {
    ## Previously an unrecognized value fell through silently and caused an
    ## obscure "object 'bs' not found" error further down.
    stop("Invalid betweenSampleNorm argument: ", betweenSampleNorm)
  }
  if (verbose>2) print(gc())
  ## Work on a copy of any ff-backed channels so the caller's data survive
  dat <- cloneFeatureSet(dat)
  # Spatial bias correction
  if (spatial) {
    if (verbose) message("Spatial normalization")
    dat <- spatialAdjust(dat, copy=FALSE)
  }
  if (verbose>2) print(gc())
  # Background removal
  if (bgSubtract) {
    if (verbose) message("Background removal")
    dat <- bgAdjust(dat, copy=FALSE)
  }
  if (verbose>2) print(gc())
  if (!is.null(plotDensity)) {
    plotDensity(dat, main="2. After spatial & bg", lab=plotDensityGroups, controlIndex=controlIndex, controlProbes=controlProbes, excludeIndex=excludeIndex)
  }
  # Within sample normalization
  if (is.null(controlIndex)) {
    controlIndex <- getControlIndex(dat, controlProbes=controlProbes)
  }
  if (verbose) message("Within sample normalization: ", withinSampleNorm)
  dat <- normalizeWithinSamples(dat, copy=FALSE,
                                method=withinSampleNorm,
                                scale=scale, controlIndex=controlIndex, verbose=verbose)
  if (!is.null(plotDensity)) {
    plotDensity(dat, main="3. After within-sample norm",
                lab=plotDensityGroups, controlIndex=controlIndex,
                controlProbes=controlProbes, excludeIndex=excludeIndex)
  }
  if (verbose>2) print(gc())
  # Between sample normalization
  if (verbose) {
    message("Between sample normalization", appendLF=FALSE)
    if (is.list(betweenSampleNorm)) {
      message(". M: ", bs$m, ", Untreated channel: ", bs$untreated,
              ", Methyl-depleted channel: ", bs$enriched)
    } else {
      message(": ", betweenSampleNorm)
    }
  }
  dat <- normalizeBetweenSamples(dat, copy=FALSE,
                                 m=bs$m, untreated=bs$untreated, enriched=bs$enriched,
                                 controlProbes=controlProbes, controlIndex=controlIndex,
                                 excludeIndex=excludeIndex, verbose=verbose)
  if (verbose>2) print(gc())
  if (!is.null(plotDensity)) {
    plotDensity(dat, main="4. After between-sample norm",
                lab=plotDensityGroups, controlIndex=controlIndex,
                controlProbes=controlProbes, excludeIndex=excludeIndex)
  }
  ## --- final estimates -----------------------------------------------------
  if (doReturnM) {
    retval <- getM(dat, rows=pmindex(dat))
  } else {
    if (verbose) message("Estimating percentage methylation")
    retval <- methPercent(m=getM(dat), pmIndex=pmindex(dat),
                          commonParams=commonMethPercentParams,
                          ngc=countGC(dat))
  }
  if (verbose>2) print(gc())
  if (!is.null(plotDensity)) {
    if (is.null(controlIndex)) controlIndex <- getControlIndex(dat, controlProbes=controlProbes)
    if (!doReturnM) {
      plotDensity(retval, main="5. Percentage methylation",
                  rx=c(0,1), lab=plotDensityGroups, controlIndex=controlIndex,
                  controlProbes=controlProbes, excludeIndex=excludeIndex)
    }
    dev.off()
  }
  rm(dat)
  return(retval)
}
## Read paired two-color CHARM .xys files into a TilingFeatureSet.
## `files` must contain, for every sample, one untreated-channel file (name
## containing `ut`) and one methyl-depleted file (name containing `md`);
## pairs are matched by their common file-name stem. Optional sampleKey and
## type columns become phenoData; entries that disagree between the two
## channels are reconciled (NA filled from the other channel) or the whole
## column is dropped with a message. The returned object preserves the
## order in which the ut-channel files were listed in `files`.
readCharm <- function(files, path=".", ut="_532.xys", md="_635.xys",
sampleKey, sampleNames=NULL, pkgname,
type=NULL, ...) {
## files2 keeps the caller's original order; `files` is sorted for pairing
files <- files2 <- as.character(files)
o <- order(files)
files <- files[o]
if (!is.null(sampleNames)) sampleNames <- as.character(sampleNames[o])
utIdx <- grep(ut, files)
if (length(utIdx)==0) {
stop("No files match the untreated extension ", ut, "\nPlease use the ut option to set the correct extension\n")
}
mdIdx <- grep(md, files)
if (length(mdIdx)==0) {
stop("No files match the methyl-depleted extension ", md, "\nPlease use the md option to set the correct extension\n")
}
if(length(utIdx)+length(mdIdx) < length(files)) warning("Some file names do not contain the ut or md arguments. Those files will not be read in.")
filesUt <- files[utIdx]
filesMd <- files[mdIdx]
## after sorting, stripping the channel suffix must give identical stems
if (!all(sub(ut, "", filesUt) == sub(md, "", filesMd)))
stop(("The untreated (ut) and methyl-depleted (md) file names don't match up\n"))
if (!is.null(sampleNames)) {
sampleCheck <- sampleNames[utIdx] == sampleNames[mdIdx]
if (!all(sampleCheck))
stop("The untreated (ut) and methyl-depleted (md) sample names don't match up\n Check:",
sampleNames[utIdx][!sampleCheck])
sampleNames <- sampleNames[utIdx]
} else {
sampleNames <- sub(ut, "", filesUt)
}
if (!missing(sampleKey)) {
sampleKey <- sampleKey[o,]
## fill NAs in one channel's key row from the other channel where possible
utNA <- is.na(sampleKey[utIdx,]) & !is.na(sampleKey[mdIdx,])
mdNA <- !is.na(sampleKey[utIdx,]) & is.na(sampleKey[mdIdx,])
for(j in 1:ncol(sampleKey)) {
rp1 = utNA[,j]
rp2 = mdNA[,j]
if(any(rp1)) sampleKey[utIdx[rp1],j] = sampleKey[mdIdx[rp1],j]
if(any(rp2)) sampleKey[mdIdx[rp2],j] = sampleKey[utIdx[rp2],j]
}
## keep only columns whose values agree between the two channels
same <- sampleKey[utIdx,]==sampleKey[mdIdx,]
bothNA <- is.na(sampleKey[utIdx,]) & is.na(sampleKey[mdIdx,])
keep <- apply(same | bothNA, 2, all)
if(any(!keep)) message("The following columns in sampleKey contain discrepant values between channels and are being removed: ", paste(which(!keep),collapse=", "))
extraCols <- sampleKey[utIdx, keep]
} else {
extraCols <- matrix(nrow=length(utIdx), ncol=0)
}
if (!is.null(type)) {
type <- as.character(type[o])
pd <- data.frame(extraCols, type=type[utIdx],
arrayUT=filesUt, arrayMD=filesMd, stringsAsFactors=FALSE)
} else {
pd <- data.frame(extraCols, arrayUT=filesUt, arrayMD=filesMd, stringsAsFactors=FALSE)
}
## varMetadata: the two file-name columns are channel-specific, all other
## phenoData columns apply to both channels ("_ALL_")
vpd <- data.frame(
labelDescription=c(rep(NA, ncol(pd)-2),
"Untreated channel file name",
"Methyl-depleted channel file name"),
channel=factor(c(rep("_ALL_", ncol(pd)-2),
"channel1", "channel2"),
levels=c("channel1", "channel2", "_ALL_")))
## TEMPORARY FIX TO REMAIN BACKWARD COMPATABILITY WITH ##
## oligo 1.10.x (Bioconductor 2.5). This fix avoids ##
## a warning ##
#version <- as.numeric(sub(".*\\.(.*)\\..*", "\\1",
# packageDescription("oligo", fields="Version")))
#if (version<11) {
# vpd <- data.frame(
# labelDescription=c(rep(NA, ncol(pd)-2),
# "Untreated channel file name",
# "Methyl-depleted channel file name"))
#}
## END TEMPORARY FIX ##
##Put arrays back in the order in which their ut channels were listed in files arg:
files2 = files2[files2%in%c(pd$arrayUT,pd$arrayMD)]
files2Ut = files2[sort(grep(ut, files2))]
reord = match(files2Ut,pd$arrayUT)
files2Md = pd$arrayMD[reord]
pdd <- new("AnnotatedDataFrame", data=pd[reord,], varMetadata=vpd)
sampleNames(pdd) <- sampleNames[reord]
dat <- read.xysfiles2(channel1=file.path(path, files2Ut),
channel2=file.path(path, files2Md), pkgname=pkgname,
phenoData=pdd, sampleNames=sampleNames[reord], ...)
return(dat)
}
#################
## plotDensity ##
#################
## Diagnostic density plots of M values: one panel for all pm probes and one
## for control probes only, one curve per sample. `dat` may be either a
## TilingFeatureSet (M computed via getM) or a plain matrix of M values.
## excludeIndex removes probes from the plotted set; pdfFile, when given,
## wraps the two panels in their own PDF device.
plotDensity <- function(dat, rx=c(-4,6), controlIndex=NULL,
controlProbes=NULL, pdfFile=NULL, main=NULL, lab=NULL, excludeIndex=NULL) {
if(is.null(controlIndex) & is.null(controlProbes)) stop("Either controlIndex or controlProbes argument must be provided to plotDensity.")
if (!is.null(pdfFile)) {
pdf(pdfFile)
par(mfcol=c(2,1))
}
if ("TilingFeatureSet" %in% class(dat)) {
M <- getM(dat)
pmIndex <- pmindex(dat)
if (is.null(lab)) lab <- sampleNames(dat)
} else {
## plain matrix of M values: every row is treated as a pm probe
M <- dat
pmIndex <- 1:nrow(M)
if (is.null(lab)) lab <- colnames(dat)
}
if (is.null(controlIndex)) controlIndex <- getControlIndex(dat, controlProbes= controlProbes)
pIdx <- pmIndex
if(!is.null(excludeIndex)) pIdx <- pIdx[-excludeIndex]
## control probes, minus any excluded ones, mapped into row indices of M
cIndex <- pIdx[setdiff(controlIndex, excludeIndex)]
plotDensityMat(M, idx=pIdx, xlab="M", lab=lab,
main=paste(main,"\nAll probes"), rx=rx)
plotDensityMat(M, idx=cIndex, xlab="M", lab=lab,
main=paste(main, "\nControl probes"), rx=rx)
if (!is.null(pdfFile)) dev.off()
}
## Overlay per-sample density curves for rows `idx` of matrix x, one line
## per column, colored by the factor `lab`. Handles both in-memory matrices
## and ff_matrix objects (the latter chunked via ffcolapply so the data are
## never fully loaded into RAM).
plotDensityMat <- function (x, idx, main = NULL,
    ylab = "Density", xlab = "M", lab = NULL, rx = NULL, ry = NULL,
    legendPos = "topright", cex=0.8) {
  lab <- as.factor(lab)
  ## seq_len() is safe for a zero-row matrix, unlike 1:nrow(x)
  if (missing(idx)) idx <- seq_len(nrow(x))
  if (inherits(x, "matrix")) d <- apply(x[idx,], 2, density, na.rm = TRUE)
  if (inherits(x, "ff_matrix")) {
    i1<-i2<-NULL # To avoid R CMD CHECK "no visible binding" warning
    d <- ffcolapply(density(x[idx,i1:i2], na.rm=TRUE), X=x,
        BATCHSIZE=1, BATCHBYTES=2^30, RETURN=TRUE, CFUN="list")
  }
  ## Default axis ranges span every sample's density curve
  if (is.null(rx)) rx <- range(sapply(d, function(i) i$x))
  if (is.null(ry)) ry <- range(sapply(d, function(i) i$y))
  plot(rx, ry, type = "n", xlab = xlab, ylab = ylab, main = main)
  for (i in seq_along(d)) lines(d[[i]], col = lab[i])
  legend(legendPos, legend = levels(lab),
      text.col = seq_along(levels(lab)), cex = cex)
}
## Apply myfilter() to x separately within each level of `region` and
## return the concatenated, unnamed result.
regionFilter <- function(x, region, f) {
  perRegion <- tapply(x, region, myfilter, f)
  unname(unlist(perRegion))
}
## Uses all control and bg probes passed to it.
## Normalize PM intensities against control/background probes: for each
## sample, model that sample's deviation from the across-sample median of
## the control/bg probes as a smooth function of intensity (or of probe
## affinity, when `affinity` is supplied) and subtract the fitted bias.
## pms          : PM intensity matrix (probes x samples); returned adjusted
## bgs          : optional background-probe intensities
## controlIndex : row indices of control probes within pms
## affinity     : optional per-probe affinities for affinity-based mapping
normControlBg <- function(pms, bgs=NULL, controlIndex=NULL, affinity=NULL) {
  mBg <- NULL
  ctrl <- NULL
  mCtrl <- NULL
  if (!is.null(bgs)) {
    mBg <- rowMedians(bgs)        # across-sample reference, bg probes
  }
  if (!is.null(controlIndex)) {
    ctrl <- pms[controlIndex,]
    mCtrl <- rowMedians(ctrl)     # across-sample reference, control probes
  }
  ## message() concatenates its arguments with NO separator, so the count
  ## needs explicit surrounding spaces (the original printed "with123control")
  message(" Normalizing with ", length(c(mBg, mCtrl)), " control probes ", appendLF=FALSE)
  if (!is.null(affinity)) {
    message("(affinity-based mapping)")
  } else {
    message("(intensity-based mapping)")
  }
  for (i in seq_len(ncol(pms))) {
    if (is.null(affinity)) {
      map <- getMap(x=c(bgs[,i], ctrl[,i]), m=c(mBg, mCtrl))
      pms[,i] <- pms[,i] - map(pms[,i])
    } else {
      map <- getMap(x=ctrl[,i], m=mCtrl, affinity=affinity[controlIndex])
      pms[,i] <- pms[,i] - map(affinity)
    }
  }
  return(pms)
}
## Build a bias-correction map: fit (x - m) as a natural-spline function of
## `affinity` (defaulting to x itself, i.e. intensity-based) and return a
## function that predicts the bias at new affinity values.
getMap <- function (x, m, affinity=NULL, df = 5)
{
  if (is.null(affinity)) affinity <- x
  ## drop observation pairs with missing values before fitting
  keep <- !(is.na(x) | is.na(m))
  x <- x[keep]
  m <- m[keep]
  affinity <- affinity[keep]
  bias <- x - m
  fit <- lm(bias ~ ns(affinity, df = df))
  function(x1) predict(fit, data.frame(affinity = x1))
}
## Locate control probes for normalization. Two modes:
##  * `subject` missing: use a precomputed controlIndex.rda shipped with the
##    annotation package when present, otherwise match controlProbes against
##    the array's container names.
##  * `subject` given (a BSgenome): use probes with no CpGs within
##    noCpGWindow bp, again preferring a cached .rda when available.
## NOTE: load() is expected to define `controlIndex` in this function's frame.
getControlIndex <- function(dat, controlProbes=NULL, noCpGWindow=1000, subject,
      onlyGood=FALSE, matrix=TRUE) {
  if (missing(subject)) {
    file <- file.path(system.file(package = annotation(dat)),
        "data", "controlIndex.rda")
    if (file.exists(file)) {
      load(file)   # defines controlIndex
    } else {
      if (is.null(controlProbes)) stop("controlProbes argument must be provided to getControlIndex if subject argument not used.")
      controlIndex <- which(getContainer(dat) %in% controlProbes)
    }
  } else {
    ## inherits() (rather than class(x)!="BSgenome") also accepts
    ## subclasses and is robust to multi-element class vectors
    if (!inherits(subject, "BSgenome"))
      stop("You must supply a BSgenome object for subject if using the noCpGWindow option. E.g. Hsapiens from the BSgenome.Hsapiens.UCSC.hg18 package.\n")
    pv <- providerVersion(subject)
    file <- file.path(system.file(package = annotation(dat)),
        "data", paste(pv, "_noCpG", noCpGWindow, ".rda", sep=""))
    if (file.exists(file)) {
      load(file)   # defines controlIndex
    } else {
      chr <- pmChr(dat)
      pos <- pmPosition(dat)
      cpgd <- cpgdensity(subject, chr=chr, pos=pos,
          windowSize=noCpGWindow)
      controlIndex <- which(cpgd==0)
    }
  }
  return(controlIndex)
}
## Estimate a per-sample, GC-stratified differential-amplification
## adjustment from the control probes. For each sample: find an intensity
## breakpoint (optimize over checkBreakpoint), summarize M by GC count above
## the breakpoint, smooth across GC bins with a Gaussian kernel, and fill
## remaining gaps from neighboring bins. Returns a (max GC) x samples matrix.
diffAmpEstGC <- function(dat, method="median", controlIndex=NULL, controlProbes=NULL, smooth=2) {
if(is.null(controlIndex) & is.null(controlProbes)) stop("Either controlIndex or controlProbes argument must be provided to diffAmpEstGC.")
#kern <- function(u, type="epanechnikov", smooth=1) {
# ifelse(abs(u)<smooth, (1/smooth) * (3/4) * (1-(u/smooth)^2), 0)
#}
Ngc <- countGC(dat, "pm")
if (is.null(controlIndex)) {
#controlIndex <- getControlIndex(dat, controlProbes = controlProbes)
##same as:
controlIndex <- which(getContainer(dat)%in%controlProbes)
}
ngc <- Ngc[controlIndex]
datPm <- log2(pm(dat))
M <- datPm[,,"channel1"] - datPm[,,"channel2"]
adj <- sapply (1:ncol(M), function(samp) {
m <- M[controlIndex, samp]
## A = average log intensity of the two channels
a <- (datPm[,,"channel1"][controlIndex,samp] +
datPm[,,"channel2"][controlIndex,samp])/2
nas <- is.na(m) | is.na(a)
m <- m[!nas]
a <- a[!nas]
ngc <- ngc[!nas]
breakpoint <- optimize(checkBreakpoint, range(a), ms=m, as=a)$minimum
keep <- a>breakpoint
if (sum(keep)/length(controlIndex) < (1/10)) { # Make sure we use at least 1/10 the control probes
o <- order(a, decreasing=TRUE)
keep[o[1:(length(controlIndex)/10)]] <- TRUE
}
## summarize M per GC bin (method is a function name, e.g. "median")
med <- tapply(m[keep], ngc[keep], method)
weights <- tapply(m[keep], ngc[keep], length)
ngc_adj <- est <- w <- rep(NA, max(Ngc))
est[as.numeric(names(med))] <- med
w[as.numeric(names(weights))] <- weights
## Gaussian-kernel smoothing of the per-bin estimates, weighted by bin size
for (i in which(!is.na(est))) {
k <- dnorm(abs(i-1:length(ngc_adj))/smooth)
#k <- kern(abs(i-1:length(ngc_adj)), smooth=smooth)
ngc_adj[i] <- sum(k*w*est, na.rm=TRUE) / sum(k*w, na.rm=TRUE)
}
## forward- then backward-fill any bins that remained NA
for (i in 2:length(ngc_adj)) {
if (is.na(ngc_adj[i]) & !is.na(ngc_adj[i-1]))
ngc_adj[i] <- ngc_adj[i-1]
}
for (i in (length(ngc_adj)-1):1) {
if (is.na(ngc_adj[i]) & !is.na(ngc_adj[i+1]))
ngc_adj[i] <- ngc_adj[i+1]
}
ngc_adj
})
colnames(adj) <- sampleNames(dat)
rownames(adj) <- paste("GC=", 1:nrow(adj), sep="")
adj
}
## Apply the GC-stratified differential-amplification adjustment estimated
## by diffAmpEstGC: shift channel2 on the log2 scale, per sample, by the
## adjustment for each probe's GC count.
## adj : matrix with rows indexed by GC count, one column per sample.
diffAmpAdjustGC <- function (dat, adj){
  datPm <- log2(pm(dat))
  Ngc <- countGC(dat)
  for (samp in seq_len(dim(dat)["Samples"])) {
    datPm[,samp,"channel2"] <- datPm[,samp,"channel2"] + adj[Ngc, samp]
  }
  pm(dat) <- 2^datPm
  return(dat)
}
## Count G+C bases in each pm (or bg) probe sequence of `dat`,
## optionally restricted to the probes in `idx`.
countGC <- function(dat, type="pm", idx) {
  seqs <- switch(type,
    pm = pmSequence(dat),
    bg = bgSequence(dat),
    stop("Invalid type. Choose 'pm' or 'bg'\n"))
  if (!missing(idx))
    seqs <- seqs[idx]
  freq <- alphabetFrequency(seqs, baseOnly=TRUE)
  freq[,"C"] + freq[,"G"]
}
## Density of `sequence` (default CpG) around genomic positions: for each
## position, count occurrences within a windowSize-bp window centered on it
## and divide by windowSize. Positions on chromosomes absent from the
## BSgenome `subject` are returned as NA.
cpgdensity <- function(subject, chr, pos, windowSize=500, sequence="CG") {
  idx <- split(seq_along(chr), chr)   # probe indices grouped by chromosome
  s <- DNAString(sequence)
  cpgdensity <- rep(NA, length(pos))
  for (curchr in names(idx)) {
    if (curchr %in% names(subject)) {
      chrseq <- subject[[curchr]]
      ## inherits() with a vector matches either sequence class
      if (inherits(chrseq, c("DNAString", "MaskedDNAString"))) {
        curpos <- pos[idx[[curchr]]]
        ## suppressWarnings: windows may run off the chromosome ends
        v <- suppressWarnings(Views(chrseq, start=curpos-windowSize/2, end=curpos+windowSize/2))
        d <- suppressWarnings(DNAStringSet(v))
        numcpg <- vcountPattern(s, d, fixed=TRUE)
        cpgdensity[idx[[curchr]]] <- numcpg/windowSize
      }
    }
  }
  cpgdensity
}
##############
## qcReport ##
##############
## Quality-control report: per-sample PM signal strength (probe quality
## percentile mean) and per-channel spatial standard deviations, optionally
## rendered into a multi-page PDF (dot charts, a histogram, and pseudo-image
## plots of both channels). Returns a data.frame with columns
## pmSignal, sd1, sd2 (one row per sample).
## numProbes caps how many pm probes are sampled (-1 = use all).
qcReport <- function(dat, file=NULL, utRange=c(30,100), enRange=c(8,12), numProbes=5e+5, blockSize) {
# Calculate summary quality scores
n <- length(pmindex(dat))
if (numProbes==-1) numProbes <- n
if (n < numProbes) numProbes <- n
## evenly spaced subsample of pm probes
idx <- as.integer(seq(1, n, length.out=numProbes))
pmQual <- pmQuality(dat, idx=idx)[,,drop=FALSE]
X <- getX(dat, "pm")[idx]
Y <- getY(dat, "pm")[idx]
if(missing(blockSize)) blockSize <- getBlockSize(dat)
## pseudo-images: probe quality (channel 1) and raw log2 intensities
imgs1 <- arrayImage(X,Y, pmQual, blockSize=blockSize)
#sd1 <- unlist(lapply(imgs1, function(x) sd(as.vector(x$z), na.rm=TRUE)))
z <- assayDataElement(dat, "channel1")[pmindex(dat)[idx],, drop = FALSE]
tmp <- arrayImage(X,Y, log2(z), blockSize=blockSize)
sd1 <- unlist(lapply(tmp, function(x) sd(as.vector(x$z), na.rm=TRUE)))
rm(z, tmp)
z <- assayDataElement(dat, "channel2")[pmindex(dat)[idx],, drop = FALSE]
imgs2 <- arrayImage(X,Y, log2(z), blockSize=blockSize)
rm(z)
sd2 <- unlist(lapply(imgs2, function(x) sd(as.vector(x$z), na.rm=TRUE)))
sdRange <- c(0, max(sd1, sd2)*1.1)
## ff-backed quality matrix: compute column means one column at a time
if (any(class(pmQual)=="ff")) {
pmSignal <- sapply(1:ncol(pmQual), function(i) mean(pmQual[,i]))
names(pmSignal) <- colnames(pmQual)
} else {
pmSignal <- colMeans(pmQual, na.rm=TRUE)
}
if (!is.null(file)) {
pdf(file=file, width=8, height=10.5)
## page 1 layout: name column + three dot charts + histogram strip
lay <- rbind(cbind(rep(1,3), matrix(rep(2:4, each=6), ncol=6)), rep(5,7))
layout(lay)
#layout(matrix(c(1,1,1,4, 2,2,2,4, 3,3,3,4 ), ncol=3))
n <- length(pmSignal)
if (is.null(names(pmSignal))) {
names(pmSignal) <- paste("Sample", 1:n)
}
l <- length(pmSignal)
## shrink labels when there are many samples or long names
longestName <- max(sapply(names(pmSignal), nchar))
cexh <- ifelse (n>50, max(0.25, 1-(n-50)*.01), 1)
cexw <- ifelse (longestName>17, max(0.25, 1-(longestName-15)*0.05), 1)
cex <- min(cexh, cexw)
par(oma=c(2,2,4,2))
o <- order(names(pmSignal))
#o <- order(names(pmSignal), decreasing=TRUE)
## panel 1: sample names
par(mar=c(5,0,3,0))
plot.new()
plot.window(xlim=c(0,10), ylim=c(0,l))
text(1,l:1-0.5, names(pmSignal)[o], adj=0.1, cex=cex)
abline(h=1:(l+1)-1, lwd=0.5, lty=3)
xmin <- max(0, floor((min(pmSignal)-5)/5)*5)
xmax <- min(100, ceiling((max(pmSignal)+5)/5)*5)
## panel 2: signal strength dot chart
par(mar=c(5,1,3,1))
plot(pmSignal[o], l:1, las=1, ylab="", xlab="", type="p",
pch=19, xlim=c(xmin, xmax), ylim=c(0.5, l+0.5),
yaxt="n", frame.plot=TRUE, main="Signal strength")
abline(h=1:(l+1)-0.5, lwd=0.5, lty=3)
#axis(2, at=1:l, labels=names(pmSignal), las=1, tick=FALSE, cex.axis=cex)
#xmin <- min(sd1)-1
#xmax <- max(sd1)+1
## panel 3: channel 1 spatial SD
par(mar=c(5,1,3,1))
plot(sd1[o], l:1, las=1, ylab="", xlab="", type="p",
pch=19, xlim=sdRange, ylim=c(0.5, l+0.5),
yaxt="n", frame.plot=TRUE, main="Channel 1\nstandard deviation")
abline(h=1:(l+1)-0.5, lwd=0.5, lty=3)
#xmin <- min(sd2)-0.01
#xmax <- max(sd2)+0.01
## panel 4: channel 2 spatial SD
par(mar=c(5,1,3,1))
plot(sd2[o], l:1, las=1, ylab="", xlab="", type="p",
pch=19, xlim=sdRange, ylim=c(0.5, l+0.5),
yaxt="n", frame.plot=TRUE, main="Channel 2\nstandard deviation")
abline(h=1:(l+1)-0.5, lwd=0.5, lty=3)
#axis(1, tick=FALSE)
hist(pmSignal, main="Signal strength histogram", xlab="Signal strength")
# Plot UT channel probe quality (PM vs BG)
l <- layout(matrix(c(1,1,1,1,2:21), 6, 4, byrow = TRUE)) #
plot.new()
text(0.5, 0.2, "Untreated Channel: PM probe quality", cex=2)
par(mar=c(4.5,8,3.5,3))
arrayPlot(imgs1[o], r=utRange)
# Plot MD channel
l <- layout(matrix(c(1,1,1,1,2:21), 6, 4, byrow = TRUE)) #
plot.new()
text(0.5, 0.2, "Enriched Channel: PM signal intensity", cex=2)
par(mar=c(4.5,8,3.5,3))
arrayPlot(imgs2[o], r=enRange)
dev.off()
}
return(as.data.frame(cbind(pmSignal, sd1, sd2)))
}
## Grid a vector of values into an image list(x, y, z) by averaging all
## values landing in the same (row, col) cell of `ind`.
## na.rm=TRUE ignores NAs within each cell; na.rm=FALSE delegates to the
## plain-mean variant (NAs propagate).
as.image <- function(Z, ind, nx, ny, na.rm=TRUE) {
  if (!na.rm) {
    return(as.image.fast(Z, ind, nx, ny))
  }
  cellMeans <- tapply(Z, list(ind[,1], ind[,2]),
                      FUN=function(v) base::mean(v, na.rm=TRUE))
  list(x = 1:ny, y = 1:nx, z = cellMeans)
}
## Fast variant of as.image(): plain per-cell mean, NAs propagate.
as.image.fast <- function(Z, ind, nx, ny) {
  cellMeans <- tapply(Z, list(ind[,1], ind[,2]), FUN=mean)
  list(x = 1:ny, y = 1:nx, z = cellMeans)
}
## Summarize per-probe values z at physical coordinates (x, y) into array
## pseudo-images, one per sample (column of z).
## view="2d"  : block-average into a grid of blockSize x blockSize cells
##              (list of image lists, as produced by as.image)
## view="col"/"row" : average down columns / across rows instead
arrayImage <- function(x,y,z, view="2d", blockSize=50) {
if (is.null(dim(z))) z <- as.matrix(z)
if (view=="col") {
tmp <- vec2array(x,y,z)
ret <- apply(tmp, 3, rowMeans, na.rm=TRUE)
colnames(ret) <- colnames(z)
}
if (view=="row") {
## aperm swaps the two spatial dimensions so rowMeans averages rows
tmp <- vec2array(x,y,z)
ret <- apply(aperm(tmp, c(2,1,3)), 3, rowMeans, na.rm=TRUE)
colnames(ret) <- colnames(z)
}
if (view=="2d") {
nx <- max(x)/blockSize
ny <- max(y)/blockSize
d <- discretize.image(cbind(y,x), m=ny, n=nx)
## The "index" object returned by discretize.image() changed from a matrix
## in fields package v6.9.1 to a list in v7.1, which does not work in as.image
## below, so change it back:
d$index = do.call(cbind,d$index)
stopifnot(ncol(d$index)==2)
ret <- apply(z, 2, function(vec) {
as.image(vec, ind=d$index, nx=d$m, ny=d$n, na.rm=TRUE)
})
names(ret) <- colnames(z)
}
return(ret)
}
## Plot a list of array pseudo-images (as produced by arrayImage) on a
## shared intensity range r, clamping cell values into [r[1], r[2]].
## Non-list input is silently ignored (the historical non-list code path
## was already commented out).
arrayPlot <- function(imgs, xlab="NULL", r=NULL) {
  if (is.list(imgs)) {
    if (is.null(r)) r <- range(unlist(lapply(imgs, function(x) x$z)), na.rm=TRUE)
    for (i in seq_along(imgs)) {
      ## pmax/pmin are vectorized over the whole matrix; the previous
      ## double apply() over rows transposed the matrix twice for no gain
      curImg <- pmin(pmax(imgs[[i]]$z, r[1]), r[2])
      image.plot(curImg, main=names(imgs)[i], zlim=r, xaxt="n", yaxt="n", horizontal=TRUE)
    }
  }
}
###############
## pmQuality ##
###############
## Probe quality score: for each pm probe, the percentile (0-100) of its
## intensity within the empirical distribution of background probes having
## the closest GC count. High values mean the probe is well above background.
## Returns a (length(idx)) x samples matrix (ff-backed when the input is).
pmQuality <- function(dat, channel="channel1", verbose=FALSE, idx=NULL) {
# TODO: parallelize
if (is.null(idx)) idx <- 1:length(pmindex(dat))
Ngc <- countGC(dat, "pm", idx)
bgNgc <- countGC(dat, "bg")
pmIndex <- pmindex(dat)
bgIndex <- bgindex(dat)
x <- assayDataElement(dat, channel)
if ("ff_matrix" %in% class(x)) {
ret <- ff(vmode="double", dim=c(length(idx), ncol(x)))
} else {
ret <- matrix(nrow=length(idx), ncol=ncol(x))
}
for (i in 1:ncol(x)) {
## one empirical CDF of background intensities per bg GC bin
fn <- tapply(x[bgIndex,i], bgNgc, ecdf)
for (ngc in unique(Ngc)) {
curIdx <- Ngc==ngc
## use the bg GC bin closest to this pm probe's GC count
closestIdx <- order(abs(as.numeric(names(fn))-ngc))[1]
bgngc <- as.character(names(fn)[closestIdx])
ret[curIdx,i] <- 100 * fn[[bgngc]](x[pmIndex[idx][curIdx],i])
}
}
colnames(ret) <- sampleNames(dat)
return(ret)
}
## Deprecated alias for pmQuality(); kept for backward compatibility.
pmvsbg <- function(...) {
message("The pmvsbg function has been renamed pmQuality. Both names work for now but please update your code soon")
pmQuality(...)
}
## Per-probe dynamic range estimate: channel1 pm intensity minus the
## `prob` quantile of channel2 background intensities for the probe's GC
## count. Returns a probes x samples matrix (one column per sample).
dynamicRange <- function(dat, prob=0.8) {
Ngc <- countGC(dat, "pm")
bgNgc <- countGC(dat, "bg")
c1 <- log2(oligo::pm(dat)[,,"channel1"])
c2 <- log2(oligo::bg(dat)[,,"channel2"])
sapply(sampleNames(dat), function(i) {
## background quantile per GC bin, expanded to a lookup by GC count
tmp <- tapply(c2[,i], bgNgc, quantile, prob)
c2med <- rep(NA, max(as.numeric(names(tmp))))
c2med[as.numeric(names(tmp))] <- tmp
c1[,i] - c2med[Ngc]
})
}
## Reshape per-probe values into a (Y, X, sample) array using the probes'
## physical coordinates; grid cells with no probe are NA.
## X, Y : 1-based integer coordinates, one per probe
## Z    : probes x samples matrix (a bare vector is treated as one column)
vec2array <- function(X,Y,Z) {
  ## Accept plain vectors too: ncol() of a vector is NULL, so the old
  ## test `ncol(Z)==1` errored on vector input (if(logical(0)))
  if (is.null(dim(Z))) Z <- as.matrix(Z)
  maxY <- max(Y)
  maxX <- max(X)
  o <- order(X,Y)
  Y <- Y[o]
  X <- X[o]
  ## linear (column-major) index of each probe within the maxY x maxX grid
  tmp <- tapply(Y, X, function(y) as.numeric(y))
  tmp <- unlist(lapply(names(tmp), function(i) tmp[[i]]+((as.numeric(i)-1)*maxY)))
  Z <- as.matrix(Z[o,])
  d <- matrix(NA, nrow=maxX*maxY, ncol=ncol(Z))
  d[tmp,] <- Z
  array(d, dim=c(maxY, maxX, ncol(Z)))
}
## Count occurrences of pattern `seq` within [start, end] windows on each
## chromosome of a BSgenome `subject`. Entries whose chromosome is missing
## from `subject` remain NA.
countSeq <- function(subject, chr, start, end, seq) {
  idx <- split(seq_along(chr), chr)   # window indices grouped by chromosome
  retval <- rep(NA, length(chr))
  s <- DNAString(seq)                 # hoisted: pattern is loop-invariant
  for (curChr in names(idx)) {
    if (curChr %in% names(subject)) {
      chrSeq <- unmasked(subject[[curChr]])
      curStart <- start[idx[[curChr]]]
      curEnd <- end[idx[[curChr]]]
      ## suppressWarnings: windows may run off the chromosome ends
      v <- suppressWarnings(Views(chrSeq, start=curStart, end=curEnd))
      d <- suppressWarnings(DNAStringSet(v))
      numSeq <- vcountPattern(s, d, fixed=TRUE)
      retval[idx[[curChr]]] <- numSeq
    }
    #else cat("\ncountSeq: Skipping chr", curChr, "\n")
  }
  return(retval)
}
###################
## spatialAdjust ##
###################
## Per-column worker for spatialAdjust(): spatially detrend, on the log2
## scale, the intensities at rows I of matrix x, one sample at a time.
## d : discretized grid (from fields::discretize.image; index as a matrix)
## I : row indices (pm + bg probes) that carry real intensities
spatialAdjustAtom <- function(x, d, I) {
  for (j in seq_len(ncol(x))) {
    x[I,j] <- 2^(spatialAdjustVec(log2(x[I,j]), d)$zAdj)
  }
  return(x)
}
## Remove smooth spatial intensity trends from both channels. Probe
## coordinates (pm and bg) are mapped onto a grid of blockSize-sized cells;
## spatialAdjustAtom/spatialAdjustVec then subtract a smoothed, median-
## centered surface per sample. Work is distributed via oApply.
spatialAdjust <- function(dat, copy=TRUE, blockSize, theta=1) {
if(missing(blockSize)) blockSize <- getBlockSize(dat)
c1 <- assayDataElement(dat, "channel1")
c2 <- assayDataElement(dat, "channel2")
## build full-length coordinate vectors; rows that are neither pm nor bg
## probes stay NA and are excluded below
X <- Y <- rep(NA, nrow(c1))
pmi <- pmindex(dat)
X[pmi] <- getX(dat, "pm")
Y[pmi] <- getY(dat, "pm")
bgi <- bgindex(dat)
X[bgi] <- getX(dat, "bg")
Y[bgi] <- getY(dat, "bg")
# HJ
nax <- which(is.na(X))
nay <- which(is.na(Y))
# sanity checks
stopifnot(identical(nax, nay))
stopifnot(length(X) == length(pmi)+length(bgi)+length(nax))
I <- sort(c(pmi, bgi))
nx <- max(X, na.rm=TRUE)/blockSize
ny <- max(Y, na.rm=TRUE)/blockSize
d <- discretize.image(cbind(Y[I], X[I]), m=ny, n=nx)
## The "index" object returned by discretize.image() changed from a matrix
## in fields package v6.9.1 to a list in v7.1, which does not work in the code
## below, so change it back:
d$index = do.call(cbind,d$index)
stopifnot(ncol(d$index)==2)
# HJ
c1 <- oApply(c1, copy=copy, fn=spatialAdjustAtom, d=d, I=I)
c2 <- oApply(c2, copy=copy, fn=spatialAdjustAtom, d=d, I=I)
assayDataElement(dat, "channel1") <- c1
assayDataElement(dat, "channel2") <- c2
return(dat)
}
## Subtract a smooth spatial trend from probe intensities.
## z   : log2 intensities, ordered like the rows of d$index
## d   : list with $index (2-column matrix of grid cells), $m, $n grid dims
## ims : optional precomputed smoothed image; estimated from z when NULL
## theta : smoothing bandwidth for image.smooth (only used when ims is NULL)
## Returns list(zAdj = detrended intensities, ims = smoothed image used).
spatialAdjustVec <- function(z, d, ims=NULL, theta=1) {
  if (is.null(ims)) {
    # HJ -- accept NAs in the data at the probes of interest
    # (as.image.fast rather than as.image: NAs simply propagate)
    rawImg <- as.image.fast(z, ind=d$index, nx=d$m, ny=d$n)
    ims <- image.smooth(rawImg, theta=theta)
  }
  ## center the smoothed surface so the median adjustment is zero
  surface <- ims$z - median(ims$z)
  flat <- as.vector(t(surface))                 # row-major flattening
  cells <- (d$index[,1]-1)*d$n + d$index[,2]    # row-major cell index per probe
  list(zAdj = z - flat[cells], ims = ims)
}
## Choose a spatial block size (in probe-coordinate units) so that one
## block contains roughly probesPerBlock pm probes.
getBlockSize <- function(dat, probesPerBlock=1250) {
  xs <- getX(dat, "pm")
  ys <- getY(dat, "pm")
  probesPerUnitArea <- length(pmindex(dat)) / (max(xs) * max(ys))
  round(sqrt(probesPerBlock / probesPerUnitArea))
}
## Mode of x, estimated with an Epanechnikov kernel density.
## Returns NA when x has fewer than minPoints observations (a density
## estimate from so few points would be unreliable).
# Slightly modified version of the max.density function in affy
maxDensity <- function(x, n.pts = 2^14, minPoints=30) {
  if (length(x) < minPoints) return(NA)
  aux <- density(x, kernel = "epanechnikov", n = n.pts,
      na.rm = TRUE)
  ## which.max is clearer (and cheaper) than order(-y)[1]
  aux$x[which.max(aux$y)]
}
##############
## bgAdjust ##
##############
# Calculate RMA bg parameters using the GC-stratified RMA model
# with background probes
## pm/bgpm : pm and background probe intensities for one sample
## Ngc/bgNgc : GC counts of the pm / background probes
## Returns list(alpha = signal exponential rate, mu = background mode,
##   mu.gc = per-GC background modes (interpolated over 1:max(Ngc)),
##   sigma = background sd estimate).
bgAdjustParameters <- function (pm, bgpm, Ngc, bgNgc, n.pts = 2^14)
{
## GC bins with < 50 bg probes are too sparse for a stable mode estimate
drop <- as.numeric(names(which(table(bgNgc)<50)))
bgNgc[bgNgc %in% drop] <- NA
pmbg <- maxDensity(bgpm, n.pts)
pmbg.gc <- tapply(bgpm, bgNgc, maxDensity, n.pts) # Get mode for each GC bin
#l<-loess(pmbg.gc~as.numeric(names(pmbg.gc)), control = loess.control(surface = "direct"))
#x <- 1:max(Ngc)
#mubg.gc <- predict(l,(x))
## linearly interpolate/extrapolate (rule=2) the per-bin modes to all GC counts
mubg.gc <- approx(as.numeric(names(pmbg.gc)), pmbg.gc, 1:max(Ngc), rule=2)$y
## refine the background mode using only sub-mode data, then estimate the
## background sd from the left tail (doubled for the folded half)
bg.data <- bgpm[bgpm < pmbg]
pmbg <- maxDensity(bg.data, n.pts)
bg.data <- bgpm[bgpm < pmbg]
bg.data <- bg.data - pmbg
bgsd <- sqrt(sum(bg.data^2)/(length(bg.data) - 1)) * sqrt(2)
## signal rate alpha from the mode of above-background pm intensities
sig.data <- pm[pm > pmbg]
sig.data <- sig.data - pmbg
expmean <- maxDensity(sig.data, n.pts)
alpha <- 1/expmean
mubg <- pmbg
list(alpha = alpha, mu = mubg, mu.gc = mubg.gc, sigma = bgsd)
}
## Per-column worker for bgAdjust(): RMA-style background correction of the
## pm and bg probes of each sample, using GC-stratified parameters from
## bgAdjustParameters. The b*dnorm/pnorm term is the standard RMA
## normal-exponential posterior-mean adjustment.
bgAdjustAtom <- function(x, pmIndex, bgIndex, ngc, bgNgc) {
#cat("In bgAdjustAtom\n")
for (i in 1:ncol(x)){
pms <- x[pmIndex,i]
bgs <- x[bgIndex,i]
param <- bgAdjustParameters(pms, bgs, ngc, bgNgc)
b <- param$sigma
pms <- pms - param$mu.gc[ngc] - param$alpha * b^2
pms <- pms + b * ((1/sqrt(2 * pi)) * exp((-1/2) * ((pms/b)^2)))/pnorm(pms/b)
bgs <- bgs - param$mu.gc[bgNgc] - param$alpha * b^2
bgs <- bgs + b * ((1/sqrt(2 * pi)) * exp((-1/2) * ((bgs/b)^2)))/pnorm(bgs/b)
## pnorm underflow far below background can produce Inf; cap at the
## largest finite adjusted value
isInf <- which(is.infinite(pms))
if (length(isInf)>0) {
biggest <- max(pms[-isInf], na.rm=TRUE)
pms[isInf] <- biggest
}
isInf <- which(is.infinite(bgs))
if (length(isInf)>0) {
biggest <- max(bgs[-isInf], na.rm=TRUE)
bgs[isInf] <- biggest
}
x[pmIndex,i] <- pms
x[bgIndex,i] <- bgs
}
return(x)
}
## Background-correct both channels of `dat` with the GC-stratified RMA
## model (bgAdjustAtom), distributing columns across nodes via oApply.
bgAdjust <- function(dat, copy=TRUE) {
  pmIndex <- pmindex(dat)
  bgIndex <- bgindex(dat)
  ngc <- countGC(dat, "pm")
  bgNgc <- countGC(dat, "bg")
  for (channel in c("channel1", "channel2")) {
    adjusted <- oApply(assayDataElement(dat, channel), copy=copy,
                       fn=bgAdjustAtom, pmIndex=pmIndex,
                       bgIndex=bgIndex, ngc=ngc, bgNgc=bgNgc)
    assayDataElement(dat, channel) <- adjusted
  }
  return(dat)
}
############################
## normalizeWithinSamples ##
############################
## Within-sample (between-channel) normalization. `method` may combine
## several steps by substring: "gc" (GC-stratified differential
## amplification), "loess" (control-probe loess of M on A), and/or
## "median" (shift channel2 so control-probe median M is zero).
## Always finishes with scaleSamples().
normalizeWithinSamples <- function (dat, copy=TRUE, method = "loess",
scale = c(0.99, 0.99),
controlProbes = NULL, controlIndex = NULL,
approx = TRUE, breaks = 1000,
verbose = FALSE)
{
if(is.null(controlIndex) & is.null(controlProbes)) stop("Either controlIndex or controlProbes argument must be provided to normalizeWithinSamples.")
if (grepl("gc", method)) {
mAdj <- diffAmpEstGC(dat, method = method, controlIndex=controlIndex, controlProbes = controlProbes)
dat <- diffAmpAdjustGC(dat, mAdj)
}
if (grepl("loess", method)) {
dat <- normalizeLoess(dat, copy=copy, controlIndex = controlIndex,
controlProbes = controlProbes, approx = approx, breaks = breaks)
#gc()
}
if (grepl("median", method)) {
if (is.null(controlIndex)) {
controlIndex <- getControlIndex(dat, controlProbes = controlProbes)
}
datPm <- log2(pm(dat))
M <- datPm[, , "channel1"] - datPm[, , "channel2"]
mCtrl <- M[controlIndex, ]
## shift channel2 by each sample's control-probe median M
mAdj <- apply(mCtrl, 2, median, na.rm = TRUE)
datPm[, , "channel2"] <- sweep(datPm[, , "channel2"],
2, mAdj, FUN = "+")
pm(dat) <- 2^datPm
}
dat <- scaleSamples(dat, copy=copy, scale=scale)
gc()
return(dat)
}
normalizeLoessAtom <- function(c1, c2, controlIndex, span, approx, breaks,
                               by, pmIndex) {
    ## Per-sample loess normalization: fit M = log2(c1) - log2(c2) against
    ## either log2(c2) (by="tot") or A = (log2(c1)+log2(c2))/2 (by="A")
    ## over the control probes, then shift channel2 by the fitted trend.
    ## Channel1 is returned untouched.
    for (i in 1:ncol(c1)) {
        l1 <- log2(c1[pmIndex, i])
        l2 <- log2(c2[pmIndex, i])
        m <- l1 - l2
        if (by == "tot") {
            a <- l2
        } else if (by == "A") {
            a <- (l1 + l2) / 2
        }
        trend <- loess(m ~ a, subset = controlIndex,
                       na.action = na.exclude, degree = 1,
                       surface = "direct", span = span)
        shift <- predictLoess(trend, newdata = a, approx = approx,
                              breaks = breaks)
        c2[pmIndex, i] <- 2^(l2 + shift)
    }
    return(list(c1, c2))
}
predictLoess <- function(fit, newdata, approx=TRUE, breaks=1000) {
    ## Evaluate a loess fit at `newdata`.
    ## approx=TRUE: bin newdata into `breaks` intervals, predict once per
    ## bin mean, and map each point to its bin's prediction — far cheaper
    ## than predicting at every point for large newdata.
    ## Returns a vector the same length as newdata.
    if (approx) {
        ## BUGFIX: the argument was previously misspelled `ordered.result`;
        ## cut()'s formal is `ordered_result`, so the misspelling was
        ## silently absorbed by `...` and ignored.
        newdataBin <- cut(newdata, breaks=breaks, ordered_result=TRUE)
        binMean <- tapply(newdata, newdataBin, mean)
        isNa <- which(is.na(binMean))  # empty bins produce NA means
        adj <- rep(NA, length(binMean))
        if (length(isNa)==0) {
            adj <- predict(fit, newdata = binMean)
        } else {
            adj[-isNa] <- predict(fit, newdata = binMean[-isNa])
        }
        ## Factor codes index the per-bin predictions back onto each point.
        ret <- adj[newdataBin]
    } else {
        ret <- predict(fit, newdata=newdata)
    }
    return(ret)
}
normalizeLoess <- function(dat, copy=TRUE,
                           controlIndex=NULL, controlProbes=NULL,
                           span=0.3, by="A", approx=TRUE, breaks=1000) {
    ## Loess within-sample normalization: channel2 of each sample is shifted
    ## by a loess trend fitted over the control probes (see
    ## normalizeLoessAtom). One of controlIndex / controlProbes is required.
    if (is.null(controlIndex) && is.null(controlProbes)) {
        stop("Either controlIndex or controlProbes argument must be provided to normalizeLoess.")
    }
    if (is.null(controlIndex)) {
        controlIndex <- getControlIndex(dat, controlProbes = controlProbes)
    }
    channels <- oApply(assayDataElement(dat, "channel1"),
                       assayDataElement(dat, "channel2"),
                       copy = copy, fn = normalizeLoessAtom,
                       controlIndex = controlIndex, span = span,
                       approx = approx, breaks = breaks, by = by,
                       pmIndex = pmindex(dat))
    assayDataElement(dat, "channel1") <- channels[[1]]
    assayDataElement(dat, "channel2") <- channels[[2]]
    return(dat)
}
scaleSamplesAtom <- function(c1, c2, copy, scale, pmIndex) {
    ## Rescale each sample's log-ratio M = log2(c1) - log2(c2) so that its
    ## scale[1] quantile maps to -log(1 - scale[2]); channel2 is rewritten
    ## to reproduce the rescaled M while channel1 is left untouched.
    ## `copy` is part of the atom-function signature but unused here.
    logc1 <- log2(c1[pmIndex, , drop = FALSE])
    logc2 <- log2(c2[pmIndex, , drop = FALSE])
    ratio <- logc1 - logc2
    qx <- apply(ratio, 2, quantile, scale[1], na.rm = TRUE)
    divisor <- qx / -log(1 - scale[2])
    ratio <- sweep(ratio, 2, divisor, FUN = "/")
    c2[pmIndex, ] <- 2^(logc1 - ratio)
    return(list(c1, c2))
}
scaleSamples <- function(dat, copy=TRUE, scale=c(0.99, 0.99)) {
    ## Quantile-based rescaling of per-sample log-ratios (see
    ## scaleSamplesAtom). Only acts when `scale` has exactly two elements;
    ## any other length returns `dat` unchanged.
    if (length(scale) == 2) {
        channels <- oApply(assayDataElement(dat, "channel1"),
                           assayDataElement(dat, "channel2"),
                           copy = copy, fn = scaleSamplesAtom,
                           scale = scale, pmIndex = pmindex(dat))
        assayDataElement(dat, "channel1") <- channels[[1]]
        assayDataElement(dat, "channel2") <- channels[[2]]
    }
    return(dat)
}
#########
## SQN ##
#########
mix.qn <- function (y, log2=FALSE, ctrl.id, idx, NQ, mix.param,
                    max.q = 0.95, low) {
    ## SQN-style normalization of each column of y onto a target
    ## distribution: values in the lowest 5% of the control-probe ECDF are
    ## floored at `low`; mid-range quantiles are mapped through the
    ## empirical target NQ; the upper tail (> max.q) is mapped through the
    ## fitted normal mixture via qnorMix (nor1mix). Only rows `idx` are
    ## normalized (default: all rows).
    if(missing(idx)) idx <- 1:nrow(y)
    if(log2) y <- log2(y)
    if (!is.matrix(y)) y <- as.matrix(y)
    ## NOTE(review): `ret` is assigned but never used; the result is
    ## written back into `y` directly.
    ret <- y
    for (i in 1:ncol(y)) {
        y0 <- y[idx, i]
        ## Quantile of every value within the control-probe ECDF.
        ECDF = ecdf(y0[ctrl.id])
        Q = ECDF(y0)
        id0 = which(Q < 0.05)    # lowest tail -> floored at `low`
        id2 = which(Q > max.q)   # upper tail -> mixture-model mapping
        ## Spread the upper-tail quantiles evenly over (max.q, 1),
        ## preserving the rank order of the original values.
        B = length(id2)
        Q2 = max.q + (1 - max.q) * (1:B)/(B + 1)
        y2o = order(y0[id2])
        Q[id2][y2o] = Q2
        ynorm = vector(mode = "numeric", length = length(y0))
        ynorm[id0] = low
        ## NOTE(review): if both id0 and id2 are empty, the negative index
        ## -c(id0, id2) is integer(0) and this assignment selects nothing,
        ## leaving ynorm all zeros — presumably never hit in practice;
        ## confirm with realistic inputs.
        ynorm[-c(id0, id2)] = quantile(NQ, Q[-c(id0, id2)])
        ynorm[id2] = qnorMix(Q[id2], mix.param)
        y[idx,i] <- ynorm
    }
    if(log2) y <- 2^y
    return(y)
}
SQNff <- function (y, copy=TRUE, log2=FALSE, N.mix = 5, ctrl.id, idx,
                   model.weight = 0.9) {
    ## Subset-quantile normalization with ff support. Builds a target
    ## distribution from the control probes (ctrl.id, positions within
    ## idx), blends it with an N.mix-component normal-mixture model fit,
    ## then maps every column onto the blended target via mix.qn().
    if(missing(idx)) idx <- 1:nrow(y)
    ## Empirical target: median across samples of the sorted control-probe
    ## intensities.
    ctrl <- y[idx[ctrl.id], ]
    if(log2) ctrl <- log2(ctrl)
    QE = apply(ctrl, 2, sort)
    QN = apply(QE, 1, median)
    ## Fit a normal mixture (mclust) to the empirical target and convert
    ## it to a norMix object for quantile evaluation.
    mix.param = suppressWarnings(Mclust(QN, G = N.mix)$parameters)
    mix.param = norMix(mu = mix.param$mean, sig2 = mix.param$variance$sigmasq,
                       w = mix.param$pro)
    ## Blend empirical and model-based quantiles; model.weight leans
    ## toward the model (0.9 by default).
    qq = seq(1/(2 * length(QN)), 1 - 1/(2 * length(QN)), 1/length(QN))
    qq = qnorMix(qq, mix.param)
    QN1 = QN * (1 - model.weight) + qq * model.weight
    if ("ff_matrix" %in% class(y)) {
        ## Disk-backed path: mix.qn handles the log2 transform itself.
        ynorm = oApply(y, copy=copy, fn=mix.qn, log2=log2,
                       ctrl.id=ctrl.id, idx=idx, NQ = QN1, mix.param = mix.param,
                       max.q = 0.95, low = quantile(QN1, 0.05))
    } else {
        ## In-memory path: transform once here, normalize column-wise,
        ## then invert the transform.
        if (log2) y <- log2(y)
        ynorm = apply(y, 2, mix.qn, ctrl.id=ctrl.id, idx=idx, NQ = QN1,
                      mix.param = mix.param,
                      max.q = 0.95, low = quantile(QN1, 0.05))
        if (log2) ynorm <- 2^ynorm
    }
    return(ynorm)
}
#############################
## normalizeBetweenSamples ##
#############################
normalizeBetweenSamples <- function(dat, copy=TRUE,
                                    m="allQuantiles", untreated="none", enriched="none",
                                    controlProbes=NULL, controlIndex=NULL,
                                    excludeIndex=NULL, verbose=FALSE) {
    ## Between-sample normalization of a two-channel TilingFeatureSet.
    ## m: log-ratio normalization ("quantile"/"allQuantiles"); untreated:
    ## optional standardization of channel1 ("complete" replaces it by the
    ## across-sample median profile); enriched: optional SQN of channel2
    ## ("sqn"). excludeIndex removes pm rows from the index used for
    ## normalization. No-op when there is a single sample.
    if(is.null(controlIndex) & is.null(controlProbes) & enriched=="sqn") stop("Either controlIndex or controlProbes argument must be provided to normalizeBetweenSamples if using sqn normalization.")
    if(ncol(dat)>1) {
        c1 <- assayDataElement(dat, "channel1")
        c2 <- assayDataElement(dat, "channel2")
        ## For ff-backed data clone once up front, so the later copy=FALSE
        ## steps can work in place without touching the caller's data.
        if (copy & "ff_matrix" %in% class(c1)) {
            open(c1)
            open(c2)
            c1 <- clone(c1)
            c2 <- clone(c2)
        }
        if (is.null(excludeIndex)) {
            idx <- pmindex(dat)
        } else {
            idx <- pmindex(dat)[-excludeIndex]
        }
        ## Quantile-normalize the log-ratio M across samples; the (possibly
        ## cloned) channels are written back first so quantileNormalize can
        ## operate in place (copy=FALSE).
        if (m %in% c("quantile", "allQuantiles")){
            assayDataElement(dat, "channel1") <- c1
            assayDataElement(dat, "channel2") <- c2
            dat <- quantileNormalize(dat, copy=FALSE, idx=idx)
            c1 <- assayDataElement(dat, "channel1")
            c2 <- assayDataElement(dat, "channel2")
        }
        ## Normalize untreated (leaving M unchanged): every sample's
        ## channel1 becomes the across-sample median profile, and channel2
        ## is rebuilt so each sample's M is preserved.
        if (untreated=="complete") { # TODO: parallelize
            if ("ff_matrix" %in% class(c1)) {
                open(c1)
                open(c2)
                ## Batch size chosen so roughly ocSamples() columns' worth of
                ## data is processed per chunk.
                batchSize <- ocSamples()*nrow(c1)/ncol(c1)
                i1<-i2<-NULL # To avoid R CMD CHECK "no visible binding" warning
                med<-ffrowapply(rowMedians(log2(c1[i1:i2,,drop=FALSE])),
                                X=c1, RETURN=TRUE, CFUN="c", BATCHSIZE=batchSize)
            } else {
                med <- rowMedians(log2(c1))
            }
            M <- getM(dat)
            if ("ff_matrix" %in% class(M)) open(M)
            for (i in 1:ncol(M)) {
                c1[,i] <- 2^med
                c2[,i] <- 2^(med - M[,i])
            }
        }
        ## Normalize enriched channel via subset-quantile normalization.
        if (enriched=="sqn") {
            if(is.null(controlIndex))
                controlIndex <- getControlIndex(dat, controlProbes=controlProbes)
            c2 <- SQNff(y=c2, copy=FALSE, log2=TRUE,
                        ctrl.id=controlIndex, idx=idx)
            gc()
        }
        assayDataElement(dat, "channel1") <- c1
        assayDataElement(dat, "channel2") <- c2
    }
    return(dat)
}
##############
## quantile ##
##############
## Generic for between-sample quantile normalization. Defaults are injected
## via missing() inside the generic body so every method shares them:
## copy=TRUE and idx=all rows. (standardGeneric dispatches with the
## modified call frame, so methods see the filled-in values.)
setGeneric("quantileNormalize", function(object, copy, idx) {
    if (missing(copy)) copy <- TRUE
    if (missing(idx)) idx <- 1:nrow(object)
    standardGeneric("quantileNormalize")
})
setMethod("quantileNormalize", c(object="matrix"),
    function (object, copy, idx) {
        ## Quantile-normalize the idx rows across columns (samples) with
        ## preprocessCore::normalize.quantiles, leaving other rows as-is.
        ## BUGFIX: drop=FALSE guards the single-column case, where
        ## object[idx, ] would collapse to a vector and
        ## normalize.quantiles would fail on a non-matrix input.
        object[idx, ] <- normalize.quantiles(object[idx, , drop = FALSE],
                                             copy = copy)
        return(object)
    })
setMethod("quantileNormalize", c(object="ff_matrix"),
    function (object, copy, idx) {
        ## Disk-backed quantile normalization in two passes: first gather
        ## per-node sums of sorted columns (qnTargetStatsNode), average
        ## them into a single target distribution, then map every column
        ## onto that target in parallel (qnToTargetAtom).
        samplesByNode <- splitIndicesByNode(1:ncol(object))
        stats <- ocLapply(samplesByNode, qnTargetStatsNode, object, idx)
        ## target = mean of the sorted columns over all samples.
        totalN <- sum(sapply(stats, "[[", "n"))
        total <- rowSums(sapply(stats, "[[", "total"))
        target <- total/totalN
        oApply(object, copy=copy, fn=qnToTargetAtom,
               target=target, idx=idx)
    })
setMethod("quantileNormalize", c(object="TilingFeatureSet"),
    function (object, copy, idx) {
        ## Normalize the log-ratio M = log2(ch1) - log2(ch2) across
        ## samples, then rebuild channel2 from channel1 and the normalized
        ## M, leaving channel1's values unchanged.
        M <- getM(object)
        M <- quantileNormalize(M, copy=FALSE, idx=idx)
        c1 <- assayDataElement(object, "channel1")
        ## Clone ff-backed channel1 so the caller's data is not aliased.
        if (copy & "ff_matrix" %in% class(c1)) {
            c1 <- clone(c1)
        }
        fn <- function(x) 2^x
        ## channel2 = 2^(log2(c1) - M)
        c2 <- combine(c1, M, "-", transform1="log2", invTransform=fn)
        assayDataElement(object, "channel1") <- c1
        assayDataElement(object, "channel2") <- c2
        return(object)
    })
## Generic: per-node partial sums of sorted columns, used to build the
## quantile-normalization target distribution.
setGeneric("qnTargetStatsNode",
           function(cols, object, idx) standardGeneric("qnTargetStatsNode"))
setMethod("qnTargetStatsNode", c(cols="integer", object="matrix", idx="integer"),
    function(cols, object, idx) {
        ## Sum the sorted idx-rows of each assigned column; the caller
        ## averages these partial sums into the target distribution.
        ## BUGFIX/consistency: na.last=TRUE keeps each sorted vector at
        ## length(idx) even when NAs are present, matching the ff_matrix
        ## method; without it sort() drops NAs and `total +` recycles or
        ## errors on the shorter vector.
        total <- rep(0, length(idx))
        if (length(cols) > 0) {
            for (i in cols) total <- total + sort(object[idx, i], na.last=TRUE)
        }
        list(total = total, n = length(cols))
    })
setMethod("qnTargetStatsNode", c(cols="integer", object="ff_matrix",
                                 idx="integer"),
    function(cols, object, idx) {
        ## ff-backed variant: open the matrix, accumulate the sorted
        ## idx-rows of each assigned column, then release the handle.
        ## na.last=TRUE keeps the sorted vector at full length when NAs
        ## are present.
        open(object)
        acc <- rep(0, length(idx))
        if (length(cols) > 0) {
            for (j in cols) {
                acc <- acc + sort(object[idx, j], na.last = TRUE)
            }
        }
        close(object)
        rm(object)
        list(total = acc, n = length(cols))
    })
qnToTargetAtom <- function (object, target, idx) {
    ## Map the idx rows of every column onto the shared target
    ## distribution (preprocessCore); rows outside idx are untouched.
    normalized <- normalize.quantiles.use.target(object[idx, , drop = FALSE],
                                                 target, copy = FALSE)
    object[idx, ] <- normalized
    return(object)
}
#############
## combine ##
#############
## Generic combining two matrices elementwise: optionally transform each
## operand, apply `fun` (default "+"), then optionally back-transform the
## result. Defaults are injected via missing() in the generic body so all
## methods share them.
setGeneric("combine", function(object1, object2, fun,
                               transform1, transform2, invTransform, ...) {
    if (missing(fun)) fun <- "+"
    if (missing(transform1)) transform1 <- NULL
    if (missing(transform2)) transform2 <- NULL
    if (missing(invTransform)) invTransform <- NULL
    standardGeneric("combine")
})
setMethod("combine", "matrix",
    function(object1, object2, fun,
             transform1, transform2, invTransform, returnList=FALSE) {
        ## Apply the optional transforms to each operand, combine with
        ## `fun`, then optionally back-transform. returnList=TRUE wraps
        ## the result as list(result, NULL) for callers that expect a
        ## two-element channel list (see the ff_matrix method).
        if (!is.null(transform1)) {
            object1 <- do.call(transform1, list(object1))
        }
        if (!is.null(transform2)) {
            object2 <- do.call(transform2, list(object2))
        }
        combined <- do.call(fun, list(object1, object2))
        if (!is.null(invTransform)) {
            combined <- do.call(invTransform, list(combined))
        }
        if (returnList) {
            return(list(combined, NULL))
        } else {
            return(combined)
        }
    })
setMethod("combine", "ff_matrix",
    function(object1, object2, fun, transform1, transform2, invTransform) {
        ## Disk-backed variant: delegate to oApply, which clones the
        ## operands (copy=TRUE) and runs `combine` per node. `fun=fun`
        ## travels through oApply's `...` — presumably forwarded by nodeFn
        ## (not visible in this file) to the matrix method; TODO confirm.
        ## returnList=TRUE makes each node return list(result, NULL), so
        ## oApply yields a two-element list of which only the first (the
        ## combined matrix) is kept.
        oApply(object1, object2, copy=TRUE, fun=fun,
               fn=combine, transform1=transform1,
               transform2=transform2, invTransform=invTransform,
               returnList=TRUE)[[1]]
    })
#################
## methPercent ##
#################
methPercent <- function(m, pmIndex, ngc, commonParams=TRUE) {
    ## Convert log-ratio values into estimated percent methylation per
    ## probe and sample. Models each column's log-ratios as
    ## background-noise + exponential signal (parameters estimated by
    ## logmethParameters), computes the posterior-mean signal, and maps it
    ## to 1 - 2^(-signal). commonParams=TRUE uses the median alpha/sigma
    ## across samples for all columns.
    if (is.null(dim(m))) m <- as.matrix(m)
    if(missing(pmIndex)) pmIndex <- 1:nrow(m)
    ## TODO: add parallel support
    if ("ff_matrix" %in% class(m)) {
        open(m)
        ret <- ff(vmode="double", dim=c(length(pmIndex), ncol(m)))
    } else {
        ret <- matrix(nrow=length(pmIndex), ncol=ncol(m))
    }
    ## Per-sample model parameters (alpha = exponential rate,
    ## sigma = background sd).
    param <- t(sapply(1:ncol(m),
                      function(i) logmethParameters(m[pmIndex,i], ngc)))
    alpha <- unlist(param[,"alpha"])
    sigma <- unlist(param[,"sigma"])
    if(commonParams) {
        alpha[] <- median(alpha)
        sigma[] <- median(sigma)
    }
    for (i in 1:ncol(m)) {
        x <- m[pmIndex,i]
        alf <- alpha[i]
        sig <- sigma[i]
        mu <- 0
        ## Fraction of probes below zero = point mass at "unmethylated".
        p0 <- sum(x<0, na.rm=TRUE) / sum(!is.na(x))
        ## Posterior-mean signal under the normal+exponential model,
        ## weighted by 1-p0.
        a <- x - mu - alf * sig^2
        qPost <- (1-p0) * (a + sig * dnorm(a/sig)/pnorm(a/sig))
        ret[,i] <- 1-2^(-qPost)
    }
    colnames(ret) <- colnames(m)
    return(ret)
}
##Assume the signal is S=X+Y where X~Normal(0, s2) and Y~Exp(alpha)
## Calculate E(Y|S)
##Assume the signal is S=X+Y where X~Normal(0, s2) and Y~Exp(alpha)
## Calculate E(Y|S)
logmethParameters <- function (pm, ngc, n.pts = 2^14)
{
    ## Estimate the normal+exponential model parameters from one sample's
    ## log-ratios.
    ## pm:  log-ratio intensities; values below 0 are treated as pure
    ##      background noise, values above 0 as signal.
    ## ngc: per-probe GC counts used to stratify the background MAD.
    ## n.pts: retained for backward compatibility — it fed a now-disabled
    ##      density-mode estimator and is unused.
    ## Returns list(alpha = exponential rate, mu = background mean (0),
    ## sigma = background sd).
    pmbg <- 0
    ## Background sd: median over GC strata of the zero-centered MAD of
    ## the below-threshold intensities. (A dead local `bg.data` holding
    ## pm[pm < pmbg] was removed — it was assigned but never used.)
    idx <- pm < pmbg
    bgsd <- median(tapply(pm[idx], ngc[idx], mad, center=0, na.rm=TRUE))
    ## Signal: exponential rate = 1 / mean of above-threshold intensities
    ## (shifted by pmbg, which is 0 here).
    sig.data <- pm[pm > pmbg]
    sig.data <- sig.data - pmbg
    expmean <- mean(sig.data, na.rm=TRUE)
    alpha <- 1/expmean
    mubg <- pmbg
    list(alpha = alpha, mu = mubg, sigma = bgsd)
}
checkBreakpoint <- function(breakpoint, ms, as) {
    ## Score candidate breakpoints by fitting separate simple linear
    ## regressions of ms on as to the points on either side of each
    ## breakpoint and returning the combined residual sum of squares.
    ## A degenerate split (all points on one side) scores a flat 1000.
    scores <- sapply(breakpoint, function(bp) {
        lower <- as < bp
        upper <- as >= bp
        n <- length(as)
        if (sum(lower) == n || sum(upper) == n) {
            return(1000)
        }
        fitLeft <- lm(ms[lower] ~ as[lower])
        fitRight <- lm(ms[upper] ~ as[upper])
        resids <- c(fitLeft$residuals, fitRight$residuals)
        sum(resids^2)
    })
    names(scores) <- breakpoint
    scores
}
reshapePd <- function(data, sample, channel, fileName, direction="wide", ...) {
    ## Thin wrapper around stats::reshape that pivots a phenodata frame
    ## with one row per (sample, channel) into one row per sample
    ## (direction="wide"), spreading the fileName column by channel.
    reshape(data,
            direction = direction,
            idvar = sample,
            timevar = channel,
            v.names = fileName,
            ...)
}
myfilter <- function(x,filter,...){
    ## Weighted moving-average smoother with renormalized edges.
    ## `filter` is a numeric weight vector (assumed odd length, centered);
    ## although it shadows stats::filter as a variable, the call
    ## filter(x, filter, ...) still resolves to the *function*
    ## stats::filter, because R skips non-function bindings when looking
    ## up a call head.
    L=length(x)
    if(L>length(filter)){
        res=filter(x,filter,...)
        ##now fill out the NAs: stats::filter leaves NA where the window
        ##overhangs the series; recompute those with the truncated window,
        ##renormalized by the weights actually used.
        M=length(filter)
        N=(M- 1)/2   # half-window width
        for(i in 1:N){
            ## Left edge: keep the part of the window overlapping x.
            w=filter[(N-i+2):M]
            y=x[1:(M-N+i-1)]
            res[i] = sum(w*y)/sum(w)
            ## Right edge: mirrored truncated window.
            w=rev(w)
            ii=(L-(i-1))
            y=x[(ii-N):L]
            res[ii]<-sum(w*y)/sum(w)
        }
    } else{ res=rep(mean(x),L)}  # series shorter than window: flat mean
    return(res)
}
myfilterse <- function(x,filter,...){
    ##filter must add up to 1!!!
    ## Companion to myfilter(): propagates uncertainty through the same
    ## smoothing window and returns the standard error of the smoothed
    ## values. NOTE(review): despite the original header saying "x are the
    ## SD", the arithmetic (sum(w^2*y), sqrt at the end) treats x as
    ## *variances* — consistent with the caller get.tt(), which passes
    ## vr = ls^2/ns. Confirm before reusing elsewhere.
    L=length(x)
    if(L>length(filter)){
        ## Var(sum w_i X_i) = sum w_i^2 Var(X_i) for independent X_i.
        res=filter(x,filter^2,...)
        ##now fill out the NAs at the edges with renormalized truncated
        ##windows, mirroring myfilter().
        M=length(filter)
        N=(M- 1)/2
        for(i in 1:N){
            w=filter[(N-i+2):M]
            y=x[1:(M-N+i-1)]
            res[i] = sum(w^2*y)/sum(w)^2
            w=rev(w)
            ii=(L-(i-1))
            y=x[(ii-N):L]
            res[ii]<-sum(w^2*y)/sum(w)^2
        }
    } else { res=rep(mean(x)/L,L)}  # short series: variance of the mean
    return(sqrt(res))
}
rowMads <- function(x,center=NULL,constant=1.4826){
    ## Row-wise MAD: constant * median(|x - center|) per row, with the
    ## center defaulting to the row medians. Relies on rowMedians
    ## (Biobase/matrixStats) being in scope.
    if (is.null(center)) {
        center <- rowMedians(x)
    }
    constant * rowMedians(abs(x - center))
}
smoothTile <- function(mat,pns,filter=NULL,ws=3,verbose=TRUE,...){
    ## Smooth every column of mat within each probe region (groups of
    ## pns) using myfilter(). When no filter is supplied, a normalized
    ## Tukey biweight window of half-width ws is built. Assumes rows
    ## within each region are ordered by genomic position.
    ## NOTE(review): `...` is accepted but not forwarded to myfilter.
    if (is.null(dim(mat))) {
        mat <- as.matrix(mat)
    }
    if (is.null(filter)) {
        Tukey <- function(x) pmax(1 - x^2, 0)^2
        filter <- Tukey(seq(-ws, ws) / (ws + 1))
        filter <- filter / sum(filter)
    }
    regionRows <- split(seq(along = pns), pns)
    for (rows in regionRows) {
        for (col in 1:ncol(mat)) {
            mat[rows, col] <- myfilter(mat[rows, col], filter)
        }
    }
    return(mat)
}
comp = function(g){
    ## Build the "compare" vector used by dmrFinder: all unordered pairs
    ## of the sorted unique group labels, flattened as
    ## c(g1a, g1b, g2a, g2b, ...).
    g = sort(unique(g))
    ng = length(g)
    ## Guard: fewer than two groups means no pairwise comparisons (the
    ## previous 1:(ng-1) loop ran with invalid indices when ng < 2 and
    ## produced NA garbage).
    if (ng < 2) return(character(0))
    ## Each pair contributes TWO elements, so preallocate ng*(ng-1) slots.
    ## (Previously only ng*(ng-1)/2 were allocated and the vector silently
    ## grew during assignment.)
    v = vector("character", length = ng * (ng - 1))
    count = 1
    for (j in 1:(ng - 1)) {
        for (k in (j + 1):ng) {
            v[count] = g[j]
            v[count + 1] = g[k]
            count = count + 2
        }
    }
    return(v)
}
get.tog <- function(l,groups,compare,verbose){
    ## Per-group robust summaries of the logit matrix l:
    ##   lm = row medians per group, ls = row MADs per group,
    ##   ns = group sizes, COMPS = index pairs (one row per comparison)
    ## for the groups named in `compare`.
    #require(genefilter)
    gIndex=split(seq(along=groups),groups)
    gIndex=gIndex[which(names(gIndex)%in%compare)]
    ng=length(gIndex)
    lm=matrix(0,nrow(l),ng)
    ls=matrix(0,nrow(l),ng)
    ns=sapply(gIndex,length)
    if(verbose) message("Computing group medians and SDs for ",ng," groups:")
    for(i in seq(along=gIndex)){
        if(verbose) message("\n",i)
        Index=gIndex[[i]]
        if(length(Index)>1){
            lm[,i]=rowMedians(l[,Index,drop=FALSE])
            ls[,i]=rowMads(l[,Index,drop=FALSE],center=lm[,i])
        } else{
            ## Singleton group: no spread estimate is possible.
            message(paste(" ",names(gIndex)[i],"has only 1 array!"))
            lm[,i]=l[,Index,drop=FALSE]
            ls[,i]=NA
        }
    }
    colnames(lm)=names(gIndex)
    colnames(ls)=names(gIndex)
    ## `compare` is a flat vector of label pairs, so consecutive entries
    ## become the two columns of COMPS.
    nums <- match(compare,colnames(lm))
    COMPS <- matrix(nums,ncol=2,byrow=TRUE)
    if(verbose) message("\nDone.\n")
    return(list(lm=lm,ls=ls,ns=ns,COMPS=COMPS))
}
get.tt <- function(lm,ls,ns,filter,Indexes,COMPS,ws,verbose){
    ## Smoothed t-like statistics for each comparison in COMPS.
    ## dm = group-median differences, vr = their variances; both are
    ## smoothed within each probe region (Indexes) and combined into
    ## tt = smoothed difference / smoothed SE.
    ##this function assumes genome position of mat[Indexes[[1]] are ordered:
    if(is.null(filter)){
        ## Default window: normalized Tukey biweight of half-width ws.
        Tukey = function(x) pmax(1 - x^2,0)^2
        filter= Tukey(seq(-ws,ws)/(ws+1));filter=filter/sum(filter)
    }
    if(!isTRUE(all.equal(sum(filter),1))) stop("filter must sum to 1.")
    if(verbose) message("Smoothing:\n")
    dm=matrix(0,nrow(lm),nrow(COMPS))
    cnames = vector("character",nrow(COMPS))
    ## Column names like "grpA-grpB", one per comparison.
    for(r in 1:ncol(dm)) cnames[r]=paste(colnames(lm)[COMPS[r,]],collapse="-")
    colnames(dm) = cnames
    vr = dm
    sdm = dm
    svr = dm
    tt = dm
    for(r in 1:nrow(COMPS)){
        j=COMPS[r,1]
        k=COMPS[r,2]
        dm[,r]=lm[,j]-lm[,k]
        ## Variance of the difference of group medians (two-sample form).
        vr[,r]=(ls[,j]^2)/ns[j]+(ls[,k]^2)/ns[k]
    }
    if(verbose) pb = txtProgressBar(min=1, max=length(Indexes), initial=0, style=1)
    for(i in seq(along=Indexes)){
        #if(verbose) if(i%%1000==0) cat(".")
        Index=Indexes[[i]]
        for(r in 1:nrow(COMPS)){
            j=COMPS[r,1]
            k=COMPS[r,2]
            sdm[Index,r]=myfilter(dm[Index,r],filter)
            ## Singleton groups have no variance estimate: use SE = 1 so
            ## tt falls back to the smoothed raw difference.
            if(ns[j]==1|ns[k]==1){ svr[Index,r] = 1} else{ svr[Index,r] = myfilterse(vr[Index,r],filter) }
        }
        if(verbose) setTxtProgressBar(pb, i)
    }
    if(verbose) close(pb)
    for(r in 1:nrow(COMPS)) tt[,r] = sdm[,r]/svr[,r]
    if(verbose) message("Done.")
    return(tt)
}
## Find differentially-methylated regions (DMRs) between groups of samples.
## Input is either a TilingFeatureSet (`eset`), the name of an annotation
## package (character `eset`) plus p or l, or raw p/l + chr/pos/pns/package.
## p = percent methylation, l = logit(p). Smoothed (paired or unpaired)
## t-like statistics are segmented at a cutoff-derived threshold and the
## resulting candidate regions are summarized and sorted by `sortBy`.
dmrFinder <- function(eset=NULL, groups, p=NULL, l=NULL, chr=NULL, pos=NULL, pns=NULL, sdBins=NULL, controlIndex=NULL, controlProbes=NULL, Indexes=NULL, filter=NULL, package=NULL, ws=7, verbose=TRUE, compare="all", withinSampleNorm="loess", betweenSampleNorm="quantile", cutoff=0.995, sortBy="ttarea", removeIf=expression(nprobes<3), paired=FALSE, pairs=NULL, DD=NULL, COMPS=NULL, COMPS.names=NULL) {
    if(!is.null(removeIf) & !is.expression(removeIf)) stop("If not NULL, removeIf argument must be an expression.")
    groups = as.character(groups)
    if(paired & is.null(DD) & is.null(pairs)) stop("pairs argument must be provided if paired=TRUE.")
    if(paired & is.null(DD) & length(pairs)!=length(groups)) stop("pairs argument must be same length as groups.")
    if(!paired | is.null(DD)) { #then the compare arg will be used.
        if(identical(compare,"all")) compare=comp(groups)
        if(length(compare)%%2!=0) stop("compare must have an even number of elements.")
        if(length(cutoff)==1) cutoff <- rep(cutoff, length(compare)/2)
        if(length(compare)/2!=length(cutoff)) stop(length(compare)/2," comparisons requested but ", length(cutoff)," cutoff(s) provided.")
        if(!all(compare%in%groups)) stop("Not all groups specified in the compare argument are in the groups argument.")
    }
    args=list(filter=filter, ws=ws, betweenSampleNorm=betweenSampleNorm,
              withinSampleNorm=withinSampleNorm, sdBins=sdBins,
              cutoff=cutoff, sortBy=sortBy, compare=compare,
              paired=paired, pairs=pairs, removeIf=removeIf)
    # dmrFinder must be given either eset or p/l,chr,pos,pns, and package.
    # If eset is supplied then all the latter will be taken from it (with any
    # that were given as arguments being ignored) (except p).
    # l=logit(p). Indexes=split(seq(along=pns),pns).
    if(is.null(eset)) {
        ## BUGFIX: this previously tested is.null("p") & is.null("l") —
        ## string literals, always FALSE — so the guard never fired.
        if (is.null(p) & is.null(l)) stop("p or l must be supplied.")
        Args = c("chr","pos","pns","package") #"controlIndex" not needed
        nulls = sapply(Args,function(x) is.null(get(x)))
        if(any(nulls))
            stop(paste("The following arguments are missing:", paste(Args[nulls], collapse=", ")))
        lens = c( nrow(p), nrow(l), length(chr), length(pos), length(pns) )
        if(length(unique(lens))!=1)
            stop("p, l, chr, pos, and/or pns are incompatible.")
        stopifnot(length(groups)==max(ncol(p), ncol(l)))
        ## Drop probes with missing/duplicated genomic location, then sort
        ## by chromosome and position.
        chrpos <- paste(chr, pos)
        dup <- duplicated(chrpos) | duplicated(chrpos, fromLast = TRUE)
        index <- which(!is.na(chr) & !is.na(pos) & !is.na(pns) & !dup)
        index=index[order(chr[index],pos[index])]
        chr=chr[index]
        pos=pos[index]
        pns=pns[index]
        # controlIndex=which(index%in%controlIndex)
        if(!is.null(sdBins)) sdBins<-sdBins[index]
        if(!is.null(p)) p=p[index,]
        if(!is.null(l)) l=l[index,]
    } else if (is.character(eset)) {
        ## BUGFIX: same string-literal is.null() bug as above.
        if (is.null(p) & is.null(l)) stop("p or l must be supplied.")
        pdInfo=get(eset)
        class(pdInfo)="TilingFeatureSet" # Trick oligo so that pmChr, pmPosition, probeNames work
        chr=pmChr(pdInfo)
        pos=pmPosition(pdInfo)
        chrpos <- paste(chr, pos)
        dup <- duplicated(chrpos) | duplicated(chrpos, fromLast = TRUE)
        ## Keep complete rows, drop duplicated locations, sort genomically.
        if (!is.null(l)) {
            index=which(rowSums(is.na(l))==0 & !dup)
            index=index[order(chr[index],pos[index])]
            l=l[index,]
        } else {
            index=which(rowSums(is.na(p))==0 & !dup)
            index=index[order(chr[index],pos[index])]
            p=p[index,]
        }
        chr=chr[index]
        pos=pos[index]
        pns=probeNames(pdInfo)[index]
        # if (is.null(controlIndex)) {
        #    controlIndex <- which(getContainer(pdInfo) %in% controlProbes)
        # }
        # controlIndex=which(index%in%controlIndex)
        if(!is.null(sdBins)) sdBins<-sdBins[index]
        package = eset
        if(package=="pd.feinberg.mm8.me.hx1"){
            #add some code here to break up regions with gaps of >300 bp
        }
        if(package=="pd.feinberg.hg18.me.hx1"){
            #add some code here to break up regions with gaps of >300 bp
        }
    } else {
        stopifnot(length(groups)==ncol(eset))
        ## No p/l supplied: estimate percent methylation from the eset.
        if(is.null(p) & is.null(l)){
            p <- methp(eset,
                       withinSampleNorm=withinSampleNorm,
                       betweenSampleNorm=betweenSampleNorm,
                       controlIndex=controlIndex,
                       controlProbes=controlProbes,
                       verbose=TRUE)
        }
        chr=pmChr(eset)
        pos=pmPosition(eset)
        chrpos <- paste(chr, pos)
        dup <- duplicated(chrpos) | duplicated(chrpos, fromLast = TRUE)
        if (!is.null(l)) {
            index=which(rowSums(is.na(l))==0 & !dup)
            index=index[order(chr[index],pos[index])]
            l=l[index,]
        } else {
            index=which(rowSums(is.na(p))==0 & !dup)
            index=index[order(chr[index],pos[index])]
            p=p[index,]
        }
        chr=chr[index]
        pos=pos[index]
        pns=probeNames(eset)[index]
        # if (is.null(controlIndex)) {
        #    controlIndex=getControlIndex(eset, controlProbes=controlProbes)
        # }
        # controlIndex=which(index%in%controlIndex)
        if(!is.null(sdBins)) sdBins<-sdBins[index]
        package = annotation(eset)
        if(package=="pd.feinberg.mm8.me.hx1"){
            #add some code here to break up regions with gaps of >300 bp
        }
        if(package=="pd.feinberg.hg18.me.hx1"){
            #add some code here to break up regions with gaps of >300 bp
        }
    }
    if(is.null(l)) {
        l=log(p)-log(1-p)   # logit transform
    }
    if(is.null(Indexes)) {
        Indexes=split(seq(along=pns),pns)
    }
    ## Smoothed statistics: group-median based (unpaired) or
    ## pair-difference based (paired).
    if(!paired) {
        tog = get.tog(l=l,groups=groups,compare=compare,verbose=verbose)
        lm=tog$lm
        ls=tog$ls
        ns=tog$ns
        COMPS=tog$COMPS
        tt = get.tt(lm=lm,ls=ls,ns=ns,COMPS=COMPS,Indexes=Indexes,filter=filter,ws=ws,verbose=verbose)
    } else {
        if(is.null(DD)) {
            DD0 = get.DD(l=l, groups=groups, compare=compare, pairs=pairs)
            DD = DD0$DD
            COMPS = DD0$COMPS
            COMPS.names = DD0$COMPS.names
        } else {
            if(is.null(COMPS)|is.null(COMPS.names)) stop("If DD is provided, COMPS and COMPS.names must also be provided.")
        }
        ttpaired = get.tt.paired(DD=DD,Indexes=Indexes,filter=filter,ws=ws,verbose=verbose)
        tt = ttpaired$sT
        MD = ttpaired$MD
        sMD = ttpaired$sMD
    }
    res=vector("list",ncol(tt))
    names(res)=colnames(tt)
    if(verbose) message("Finding DMRs for each pairwise comparison.")
    for(r in 1:nrow(COMPS)) {
        j = COMPS[r,1]
        k = COMPS[r,2]
        if(verbose) message("\n",colnames(tt)[r])
        ## Degrees of freedom for the t-quantile threshold.
        if(!paired) DF=ifelse(ns[j]==1 & ns[k]==1, 1, ns[j]+ns[k]-2)
        if(paired) DF=ifelse(ncol(DD[[r]])-1==0,1,ncol(DD[[r]])-1)
        ## Threshold K: MAD of the statistic (optionally per sdBin) scaled
        ## by the requested t-quantile.
        if (length(sdBins)==0) {
            K=mad(tt[,r], na.rm=TRUE)*qt(cutoff[r],DF)
        } else {
            s <- tapply(tt[,r], sdBins, mad, na.rm=TRUE)
            K=s[sdBins]*qt(cutoff[r],DF)
        }
        ## Segment each region into runs of constant sign(|tt|>K) status.
        LAST=0
        segmentation=vector("numeric",nrow(tt))
        type=vector("numeric",nrow(tt))
        for(i in seq(along=Indexes)){
            if(verbose) if(i%%1000==0) message(".", appendLF=FALSE)
            Index=Indexes[[i]]
            y=tt[Index,r]
            if(length(sdBins)==0) {
                tmp=sign(y)*as.numeric(abs(y)>K)
            } else {
                Ki <- K[Index]
                tmp=sign(y)*as.numeric(abs(y)>Ki)
            }
            tmp2=cumsum(c(1,diff(tmp)!=0))+LAST
            segmentation[Index]=tmp2
            type[Index]=tmp
            LAST=max(tmp2)
        }
        ## Summarize each above-threshold segment as a candidate DMR.
        Index = which(type!=0)
        rows = length(unique(segmentation[Index]))
        res[[r]]=data.frame(
            chr=tapply(chr[Index],segmentation[Index],function(x) x[1]),
            start=tapply(pos[Index],segmentation[Index],min),
            end=tapply(pos[Index],segmentation[Index],max),
            p1=rep(NA, rows),
            p2=rep(NA, rows),
            regionName=tapply(pns[Index],segmentation[Index],function(x) x[1]),
            indexStart=tapply(Index,segmentation[Index],min),
            indexEnd=tapply(Index,segmentation[Index],max))
        length=res[[r]]$indexEnd-res[[r]]$indexStart+1
        res[[r]]$nprobes = length
        ## The following commented out part gives identical results to the uncommented out replacement code below if using l rather than p, and if odd number of samples in each group, when using p too, but otherwise lm's values are the means of the middle 2 values and transforming back from logit to p isn't exact so there will be very minor discrepancies.
        #    if(!paired) {
        #      if(is.null(p)) { # We return log-ratios
        #        colnames(res[[r]]) <- sub("p1", "m1", colnames(res[[r]]))
        #        colnames(res[[r]]) <- sub("p2", "m2", colnames(res[[r]]))
        #        res[[r]]$m1=tapply(lm[Index,j],segmentation[Index],mean)
        #        res[[r]]$m2=tapply(lm[Index,k],segmentation[Index],mean)
        #        diff = res[[r]]$m1 - res[[r]]$m2
        #        maxdiff=tapply(lm[Index,j]-lm[Index,k],segmentation[Index],
        #                  function(x) { x[which.max(abs(x))] })
        #      } else { # We return percentages
        #        tmp1 = 1/(1+exp(-lm[Index,j]))
        #        tmp2 = 1/(1+exp(-lm[Index,k]))
        #        res[[r]]$p1=tapply(tmp1,segmentation[Index],mean)
        #        res[[r]]$p2=tapply(tmp2,segmentation[Index],mean)
        #        diff = res[[r]]$p1 - res[[r]]$p2
        #        maxdiff=tapply(tmp1-tmp2,segmentation[Index],
        #                  function(x) { x[which.max(abs(x))] })
        #      }
        #    } else {
        if(nrow(res[[r]])>0) {
            if(!paired) {
                grp1 = which(groups==colnames(lm)[j])
                grp2 = which(groups==colnames(lm)[k])
            } else {
                grp1 = which(groups==COMPS.names[r,1])
                grp2 = which(groups==COMPS.names[r,2])
            }
            ## Region-level group means (m1/m2 on the logit scale if p was
            ## not supplied, otherwise p1/p2 percentages).
            if(is.null(p)) {
                colnames(res[[r]]) <- sub("p1", "m1", colnames(res[[r]]))
                colnames(res[[r]]) <- sub("p2", "m2", colnames(res[[r]]))
                mat = cbind(rowMedians(l[,grp1,drop=FALSE]), rowMedians(l[,grp2,drop=FALSE]))
                matr = t(apply(res[[r]][,c("indexStart","indexEnd")],1,
                               function(se) colMeans(mat[se[1]:se[2],,drop=FALSE])))
                res[[r]]$m1 = matr[,1]
                res[[r]]$m2 = matr[,2]
                diff = res[[r]]$m1 - res[[r]]$m2
            } else {
                mat = cbind(rowMedians(p[,grp1,drop=FALSE]), rowMedians(p[,grp2,drop=FALSE]))
                matr = t(apply(res[[r]][,c("indexStart","indexEnd")],1,
                               function(se) colMeans(mat[se[1]:se[2],,drop=FALSE])))
                res[[r]]$p1 = matr[,1]
                res[[r]]$p2 = matr[,2]
                diff = res[[r]]$p1 - res[[r]]$p2
            }
            ## Max diff: the single probe-level difference of largest
            ## magnitude within each region.
            matd = mat[,1]-mat[,2]
            maxdiff = apply(res[[r]][,c("indexStart","indexEnd")],1,
                            function(se) {
                                rt = matd[se[1]:se[2]]
                                rt[which.max(abs(rt))] })
            area = abs(diff)*length
            ttarea = abs(tapply(tt[Index,r],segmentation[Index],mean)) *length
            res[[r]]$area = area
            res[[r]]$ttarea = ttarea
            res[[r]]$diff = diff
            res[[r]]$maxdiff = maxdiff
            if(sortBy=="area") res[[r]]=res[[r]][order(-area),]
            if(sortBy=="ttarea") res[[r]]=res[[r]][order(-ttarea),]
            if(sortBy=="avg.diff") res[[r]]=res[[r]][order(-abs(diff)),]
            if(sortBy=="max.diff") res[[r]]=res[[r]][order(-abs(maxdiff)),]
            if(!is.null(removeIf)) res[[r]]=subset(res[[r]],subset=!eval(removeIf))
        } else {
            ## No candidates: return an empty, correctly-typed table.
            res[[r]]=data.frame(chr=character(0), start=numeric(0), end=numeric(0),
                                p1=numeric(0), p2=numeric(0), regionName=character(0),
                                indexStart=numeric(0), indexEnd=numeric(0),
                                nprobes=numeric(0), area=numeric(0), ttarea=numeric(0),
                                diff=numeric(0), maxdiff=numeric(0))
            if(is.null(p)) {
                colnames(res[[r]]) <- sub("p1", "m1", colnames(res[[r]]))
                colnames(res[[r]]) <- sub("p2", "m2", colnames(res[[r]]))
            }
        }
        if(verbose) message(nrow(res[[r]])," DMR candidates found using cutoff=",cutoff[r],".")
    }
    if(verbose) message("\nDone")
    if(!paired) {
        return(list(tabs=res, p=p, l=l, chr=chr, pos=pos, pns=pns,
                    index=index, gm=lm, #controlIndex=controlIndex,
                    groups=groups, args=args, comps=COMPS, package=package))
    } else {
        return(list(tabs=res, p=p, l=l, chr=chr, pos=pos, pns=pns,
                    index=index, DD=DD, sMD=sMD, #controlIndex=controlIndex,
                    groups=groups, args=args, comps=COMPS, comps.names=COMPS.names, package=package))
    }
}
dmrFdr <- function(dmr, compare=1, numPerms=1000, seed=NULL, verbose=TRUE) {
    ## Permutation-based q-values for the DMRs of one comparison in a
    ## dmrFinder result: group labels are reshuffled numPerms times,
    ## dmrFinder is rerun on each permutation, and the observed sortBy
    ## statistics are compared against the pooled null distribution.
    if (length(compare)!=1) stop("You must choose one comparison at a time when calculating FDRs. Please set dmr to be one of: ",
                                 paste(names(dmr$tabs), collapse=", "), "\n")
    if (is.numeric(compare)) compare <- names(dmr$tabs)[compare]
    message("Calculating q-values for DMRs between ", compare, "\n")
    # Get probe order from TilingFeatureSet object
    pdInfo=get(dmr$package)
    class(pdInfo)="TilingFeatureSet" # Trick oligo so that pmChr, pmPosition work
    chr=pmChr(pdInfo)
    pos=pmPosition(pdInfo)
    chrpos <- paste(chr, pos)
    dchrpos <- paste(dmr$chr, dmr$pos)
    #idx <- which(!(chrpos %in% dchrpos))
    ## Probes present in the annotation but absent from the dmr object
    ## (filtered out earlier) must be excluded from the reordering map.
    mis <- setdiff(chrpos, dchrpos)
    o <- order(chr, pos)
    chrpos <- chrpos[o]
    keepProbes <- !(chrpos %in% mis)
    o <- o[keepProbes]
    ## Only samples in the two compared groups are permuted.
    keep <- dmr$groups %in% unlist(strsplit(compare, "-"))
    # Recreate p or l with same sort order as in annotation package
    if (is.null(dmr$p)) {
        l <- matrix(NA, nrow=length(pos), ncol=ncol(dmr$l))
        l[o,] <- dmr$l
        l <- l[,keep]
    } else {
        p <- matrix(NA, nrow=length(pos), ncol=ncol(dmr$p))
        p[o,] <- dmr$p
        p <- p[,keep]
    }
    n <- sum(keep)
    n1 <- sum(dmr$groups==unlist(strsplit(compare, "-"))[1])
    maxPerms <- choose(n, n1)
    if (numPerms=="all") numPerms <- maxPerms
    if (numPerms>maxPerms) {
        message("Given the sample sizes in the two groups the maximum number of permutations is ", maxPerms, sep="")
        numPerms <- maxPerms
    }
    ## Reshuffled group label DMRs
    if (!is.null(seed)) set.seed(seed)
    if (maxPerms<1e6) { # Enumerate all the combinations
        s <- sample(1:maxPerms, numPerms)
        grp1 <- combinations(n,n1)[s,]
    } else {
        ## Too many to enumerate: sample label assignments directly.
        grp1 <- t(sapply(1:numPerms, function(x) sample(n,n1)))
    }
    if (verbose) message("Finding permuted data DMRs. Estimating time remaining")
    areas <- lapply(1:numPerms, function(i) {
        groups <- rep("grp2", n)
        groups[grp1[i,]] <- "grp1"
        ## Rerun dmrFinder with the permuted labels and the original
        ## settings; keep only the sortBy statistic of its candidates.
        if (is.null(dmr$p)) {
            st <- system.time(dmrPerm <- dmrFinder(dmr$package, l=l,
                                                   groups=groups, cutoff=dmr$args$cutoff,
                                                   filter=dmr$args$filter, ws=dmr$args$ws,
                                                   verbose=FALSE))[3]
        } else {
            st <- system.time(dmrPerm <- dmrFinder(dmr$package, p=p,
                                                   groups=groups, cutoff=dmr$args$cutoff,
                                                   filter=dmr$args$filter, ws=dmr$args$ws,
                                                   verbose=FALSE))[3]
        }
        if (verbose & (i %in% round(seq(1, numPerms, length.out=10)))) {
            message(i, "/", numPerms, " (", prettyTime((numPerms-i)*st),
                    " remaining)", sep="")
        }
        dmrPerm$tabs[[1]][,dmr$args$sortBy]
    })
    ## p-values from the pooled permutation null; q-values via siggenes.
    nullDist <- unlist(areas)
    fn <- ecdf(nullDist)
    pval <- 1-fn(dmr$tabs[[compare]][,dmr$args$sortBy])
    pi0<-pi0.est(pval)$p0
    qval<-qvalue.cal(pval, pi0)
    dmr$tabs[[compare]] <- cbind(dmr$tabs[[compare]], pval, qval)
    ## Record how many permutations were used for this comparison.
    if (!("numPerms" %in% names(dmr))) {
        dmr$numPerms <- rep(NA, length(dmr$tabs))
        names(dmr$numPerms) <- names(dmr$tabs)
    }
    dmr$numPerms[compare] <- numPerms
    return(dmr)
}
prettyTime <- function(seconds) {
    ## Format a duration in seconds as "H hour(s), M minute(s)"; the hour
    ## part is omitted when zero. Seconds are truncated, not rounded.
    hrs <- floor(seconds / 60 / 60)
    mins <- floor((seconds / 60) - (hrs * 60))
    out <- ""
    if (hrs > 0) {
        out <- paste0(out, hrs, if (hrs == 1) " hour" else " hours", ", ")
    }
    paste0(out, mins, if (mins == 1) " minute" else " minutes")
}
validatePd <- function(pd, fileNameColumn, sampleNameColumn,
                       groupColumn, ut = "_532.xys", md = "_635.xys") {
    ## Validate (and, where missing, guess) the key columns of a sample
    ## description data.frame: the XYS filename column, the sample ID
    ## column, and the group label column. ut/md are the filename suffixes
    ## of the untreated and methyl-depleted channels. Returns a status
    ## list; on failure, list(status="ERROR", message=...).
    msg <- ""
    message("Filenames:")
    if (missing(fileNameColumn)) {
        ## Guess: the column with the most ut/md suffix matches.
        fileNameColumn <- which.max(apply(pd, 2, function(col) {
            u <- grep(ut, col, ignore.case=TRUE)
            m <- grep(md, col, ignore.case=TRUE)
            length(c(u,m))
        }))
        message(" fileNameColumn not specified. Trying to guess.\n")
    }
    ## Sort rows by filename so ut/md rows of each sample align pairwise.
    files <- pd[,fileNameColumn]
    o <- order(files)
    files <- files[o]
    pd <- pd[o,]
    utIdx <- grep(ut, files)
    if (length(utIdx)==0) {
        msg <- paste(msg, " No files in column ", fileNameColumn, " match the untreated extension ", ut, ". Please use the ut option to set the correct extension\n", sep="")
    }
    mdIdx <- grep(md, files)
    if (length(mdIdx)==0) {
        msg <- paste(msg, " No files in column ", fileNameColumn, " match the methyl-depleted extension ", md, ". Please use the md option to set the correct extension\n", sep="")
    }
    ## Each untreated file must have a matching methyl-depleted file
    ## (same basename after stripping the suffix), and vice versa.
    filesUt <- sub(ut, "", files[utIdx])
    filesMd <- sub(md, "", files[mdIdx])
    missing <- c(filesUt[!filesUt%in%filesMd], filesMd[!filesMd%in%filesUt])
    if (length(missing)>0) {
        msg <- paste(msg, " The untreated (ut) and methyl-depleted (md) file names in column ", fileNameColumn, " don't match up. Check ", paste(missing, collapse=", "), sep="")
    }
    if (length(missing)>0 | length(utIdx)==0 | length(mdIdx)==0) {
        message(" ERROR - ", msg)
        return(list(status="ERROR", message=msg))
    } else {
        message(" OK - Found in column", fileNameColumn)
    }
    ## Column diagnostics used by the guessing heuristics below:
    ## channelMatch: same value on a sample's ut and md rows;
    ## onePerSample: one unique value per sample (candidate sample IDs).
    channelMatch <- apply(pd[utIdx,]==pd[mdIdx,], 2, all)
    numUnique <- apply(pd[utIdx,], 2, function(x) length(unique(x)))
    onePerSample <- numUnique==length(utIdx)
    message("Sample names:")
    if (missing(sampleNameColumn)) {
        message(" sampleNameColumn column not specified. Trying to guess.")
        ## A sample ID column matches across channels and is unique.
        sampleNameColumn <- which(channelMatch & onePerSample)
        if (length(sampleNameColumn)==1) {
            message(" OK - Found in column", sampleNameColumn)
        } else if (length(sampleNameColumn)==0) {
            msg <- paste(msg, "No suitable sample ID column found. This column should have the same entry for each pair of untreated and methyl-depleted filenames.\n")
            message(" ERROR - ", msg, appendLF=FALSE )
            return(list(status="ERROR", message=msg))
        } else if (length(sampleNameColumn)>1) {
            message(" WARNING - Multiple columns (", paste(sampleNameColumn, collapse=", "),
                    ") are candidates for sample ID.", sep="")
            sampleNameColumn <- sampleNameColumn[1]
        }
    } else { # User specified sampleNameColumn
        if (channelMatch[sampleNameColumn] & onePerSample[sampleNameColumn]) {
            message(" OK - Found in column", sampleNameColumn)
        } else {
            if (!channelMatch[sampleNameColumn]) {
                msg <- paste(msg, " There are samples with different values in column ", sampleNameColumn, " for their untreated and methyl-depleted rows\n", sep="")
            } else if (!onePerSample[sampleNameColumn]) {
                msg <- paste(msg, " The sample IDs in column ", sampleNameColumn, " are not unique to each sample\n", sep="")
            }
            message(" ERROR -", msg)
            return(list(status="ERROR", message=msg))
        }
    }
    message("Group labels:")
    if (missing(groupColumn)) {
        message(" groupColumn column not specified. Trying to guess.")
        ## A group column matches across channels but repeats across
        ## samples (and has more than one distinct label).
        groupColumn <- which(channelMatch & !onePerSample & numUnique>1)
        if (length(groupColumn)==1) {
            message(" OK - Found in column", groupColumn)
        } else if (length(groupColumn)==0) {
            msg <- paste(msg, "No suitable group label column found - Defaulting to sample ID column for group labels (i.e. 1 sample per group)\n")
            warning("No suitable group label column found - Defaulting to sample ID column for group labels (i.e. 1 sample per group)")
            groupColumn <- sampleNameColumn
        } else if (length(groupColumn)>1) {
            warning(" WARNING - Multiple columns (",
                    paste(groupColumn, collapse=", "),
                    ") are candidates for group labels.\n", sep="")
            groupColumn <- groupColumn[1]
        }
    } else { # User specified groupColumn
        if (channelMatch[groupColumn] & !onePerSample[groupColumn]) {
            message(" OK - Found in column", groupColumn)
        } else {
            if (!channelMatch[groupColumn]) {
                msg <- paste(msg, "There are samples with different values in column", groupColumn, "for their untreated and methyl-depleted rows\n")
                message(" ERROR -", msg, appendLF=FALSE)
                return(list(status="ERROR", message=msg))
            } else if (onePerSample[groupColumn]) {
                message(" WARNING - Each group has only 1 sample")
            }
        }
    }
    return(list(status="OK", fileNameColumn=fileNameColumn,
                sampleNameColumn=sampleNameColumn,
                groupColumn=groupColumn))
}
.onAttach <- function(libname, pkgname) {
    ## Announce the package version and set conservative defaults for the
    ## oligo/ff large-dataset machinery: one sample per batch, with
    ## ff-backed files stored under the session temp directory.
    version <- packageDescription("charm", fields = "Version")
    packageStartupMessage("Welcome to charm version ", version)
    ocSamples(1)
    ldPath(tempdir())
}
##The next 2 functions are for the paired=TRUE option of dmrFinder:
##The next 2 functions are for the paired=TRUE option of dmrFinder:
get.DD <- function(l, groups, compare, pairs){
    ## Build the per-comparison matrices of paired differences for
    ## dmrFinder(paired=TRUE). For each comparison (j, k) the columns of l
    ## are matched up by the `pairs` labels and DD[[i]] = l_j - l_k per
    ## pair; unmatched samples are dropped.
    ##Define COMPS the same as in get.tog():
    gIndex=split(seq(along=groups),groups)
    gIndex=gIndex[which(names(gIndex)%in%compare)]
    NAMES = names(gIndex)
    nums <- match(compare,NAMES)
    COMPS <- matrix(nums,ncol=2,byrow=TRUE)
    COMPS.names <- t(apply(COMPS,1,function(x) NAMES[x]))
    DD <- list() #unlike an array a list will accomodate drop-out
    for(i in 1:nrow(COMPS)){
        #Make DD[[i]], the differences for comparison i
        j=COMPS[i,1]
        k=COMPS[i,2]
        pd.j <- which(groups==NAMES[j])
        pd.k <- which(groups==NAMES[k])
        ## Match each sample in group j with its partner in group k.
        ord <- match(pairs[pd.j],pairs[pd.k])
        if(all(is.na(ord))){
            message(paste("Comparison",i,"has no pairs! Ignoring this comparison."))
            DD[[i]] = NA
            next
        }
        if(any(is.na(ord))){
            pd.j <- pd.j[-which(is.na(ord))]
            ord <- ord[-which(is.na(ord))]
        } #If any in j not in k, this will remove that one in j.
        #If any in k not in j, it's won't be included in pd.k anyway.
        pd.k <- pd.k[ord]
        stopifnot(identical(pairs[pd.j],pairs[pd.k]))
        DD[[i]] <- as.matrix(l[,pd.j] - l[,pd.k])
        #if(absdiff) DD[[i]] <- abs(DD[[i]])^1
        #^.5 is down ladder of powers and makes symmetrical, but ^1 seems to work better.
        #log makes small differences large in magnitude (in the
        #negative direction) which is not what we want.
        if(ncol(DD[[i]])==1) colnames(DD[[i]]) = colnames(l)[pd.j] #since no colname given
        names(DD)[i] = paste(NAMES[j],"-",NAMES[k],sep="")
    }
    return(list(DD=DD,COMPS=COMPS,COMPS.names=COMPS.names))
}
get.tt.paired <- function(DD,Indexes,filter=NULL,ws,verbose=TRUE) {
    ## Compute smoothed paired t-type statistics for each comparison.
    ##
    ## DD:      list of difference matrices from get.DD(); a dropped-out
    ##          comparison is stored as a bare NA rather than a matrix.
    ## Indexes: list of row-index vectors (one per region) within which the
    ##          smoothing filter is applied.
    ## filter:  optional smoothing kernel; when NULL a normalized Tukey
    ##          biweight window of width 2*ws+1 is used.
    ## ws:      half-width of the default smoothing window.
    ## verbose: emit progress messages and a progress bar.
    ##
    ## Returns list(sT, MD, sMD): smoothed t statistics, raw mean differences,
    ## and smoothed mean differences (probes x comparisons).
    ##
    ## NOTE: rowVars comes from genefilter; myfilter/myfilterse are defined
    ## elsewhere in this package.
    ok <- which(sapply(DD,length)>1) # comparisons that actually have data
    if(length(ok)==0) stop("No comparisons in DD have any pairs.")
    ## BUGFIX: size matrices from the first usable comparison. DD[[1]] may be
    ## a bare NA (dropped comparison) and nrow(NA) is NULL, which previously
    ## made matrix() fail with an obscure error.
    nr <- nrow(DD[[ok[1]]])
    MD <- matrix(NA,nrow=nr,ncol=length(DD))   # mean differences
    vars <- matrix(NA,nrow=nr,ncol=length(DD)) # their variances
    for(i in ok) {
        if(ncol(DD[[i]])<3) message(paste("Comparison",i,"is just using raw differences! May want to set lower CUTOFF."))
        MD[,i] <- rowMeans(DD[[i]])
        vars[,i] <- rowVars(DD[[i]])/ncol(DD[[i]]) # all NA if only 1 pair
    }
    ## Now get smoothed MD and its standard errors:
    sMD <- matrix(NA,nrow=nrow(MD),ncol=ncol(MD))
    ses <- matrix(NA,nrow=nrow(vars),ncol=ncol(vars)) # myfilterse for new Charm returns sqrt(var)
    if(is.null(filter)) {
        ## Default kernel: normalized Tukey biweight over the window.
        Tukey <- function(x) pmax(1 - x^2,0)^2
        fs <- Tukey(seq(-ws,ws)/(ws+1))
        fs <- fs/sum(fs)
    } else {
        fs <- filter
    }
    if(verbose) message("Smoothing mean differences using window size of ",ws*2+1,":")
    if(verbose) pb <- txtProgressBar(min=1, max=length(Indexes), initial=0, style=1)
    for(i in seq_along(Indexes)) {
        Index <- Indexes[[i]]
        for(j in ok){
            sMD[Index,j] <- myfilter( MD[Index,j],fs )
            if(ncol(DD[[j]])>2) {
                ses[Index,j] <- myfilterse( vars[Index,j],fs )
            } else {
                ## With <3 pairs the variance is undefined, so set the SE to 1
                ## and let sT reduce to the smoothed raw differences.
                ses[Index,j] <- 1
            }
        }
        if(verbose) setTxtProgressBar(pb, i)
    }
    if(verbose) close(pb)
    colnames(MD) <- names(DD)
    colnames(sMD) <- names(DD)
    ## Finally make sT:
    sT <- sMD/ses
    return(list(sT=sT, MD=MD, sMD=sMD))
}
## The following three lines are website boilerplate accidentally captured
## along with the source; they are not R code and are kept as comments so
## the file still parses:
# Any scripts or data that you put into this service are public.
# Add the following code to your website.
# For more information on customizing the embed code, read Embedding Snippets.