# R/charm.R

# oligo functions we'd like to import once they're exported
#oligoBigObjectSupport <- oligo:::oligoBigObjectSupport
#oligoBigObjectPath <- oligo:::oligoBigObjectPath
bgindex <- oligo:::bgindex # method

setOldClass("ff_matrix")

asff <- function(ffm, rows=1:nrow(ffm), columns=1:ncol(ffm)) {
    ## Subset an ff_matrix by rows/columns while keeping the result an
    ## ff_matrix (plain `[` would pull everything into RAM). Data are
    ## copied one column at a time to keep peak memory low.
    ##
    ## Args:
    ##   ffm: an ff_matrix.
    ##   rows, columns: numeric indices to keep (default: everything).
    ## Returns: a new double ff_matrix with dimnames carried over.
    stopifnot(is.numeric(rows) && is.numeric(columns))
    ret2 <- ff(vmode="double", dim=c(length(rows), length(columns)))
    # seq_along() is safe when columns is empty (1:length would give 1:0)
    for (cm in seq_along(columns)) {
        ret2[, cm] <- ffm[rows, columns[cm]]
    }
    colnames(ret2) <- colnames(ffm)[columns]
    rownames(ret2) <- rownames(ffm)[rows]
    ret2
}

#######################
## oApply and nodeFn ##
#######################
## Generic for applying a function over column chunks of a (possibly
## ff-backed) matrix, distributed across cluster nodes.
setGeneric("oApply", function(object1, ...) standardGeneric("oApply"))
## Worker-side generic: process one node's share of columns.
setGeneric("nodeFn", function(cols, object1, ...) standardGeneric("nodeFn"))

setMethod("oApply", "matrix",
	function(object1, object2, copy=TRUE, fn, ...) {
		## Apply `fn` to column chunks of an in-memory matrix, with
		## chunks distributed across cluster nodes via ocLapply().
		## With a second matrix present, `fn` must return a two-element
		## list and the per-node results are reassembled per channel.
		## (`copy` is irrelevant for plain matrices: R copies on modify.)
		chunks <- splitIndicesByNode(1:ncol(object1))
		if (missing(object2)) {
			pieces <- ocLapply(chunks, nodeFn, object1=object1,
				fn=fn, ...)
			return(do.call("cbind", pieces))
		}
		pieces <- ocLapply(chunks, nodeFn,
			object1=object1, object2=object2, fn=fn, ...)
		first <- do.call("cbind", lapply(pieces, "[[", 1))
		second <- do.call("cbind", lapply(pieces, "[[", 2))
		list(first, second)
	})

	
setMethod("oApply", "ff_matrix", 
	function (object1, object2, copy=TRUE, fn, ...) {
		## Apply `fn` to column chunks of ff-backed matrices. With
		## copy=TRUE the inputs are cloned first so the caller's data is
		## untouched; with copy=FALSE the input ff objects are modified
		## in place. The ff_matrix nodeFn method writes results back into
		## the shared ff backing files, so only the ff objects themselves
		## are returned here.
    	stopifnot(ldStatus())  # large-dataset (ff) support must be enabled
	    if (copy) {
			open(object1)
	        out1 <- clone(object1)
			if (!missing(object2)) {
				open(object2)
	        	out2 <- clone(object2)
			}
	    } else {
	        out1 <- object1
			if (!missing(object2))
				out2 <- object2
	    }
	    samplesByNode <- splitIndicesByNode(1:ncol(out1))
	    if (missing(object2)) {
			ocLapply(samplesByNode, nodeFn, object1=out1, fn=fn, ...)
			return(out1)
		} else {
			ocLapply(samplesByNode, nodeFn, object1=out1, object2=out2, fn=fn, ...)
			return(list(out1, out2))
		}	
	})

## In these nodeFn methods the rows of the objects returned by fn must correspond to the rows of object1 and object2.  If the number of rows is more or less, you'll get an error.  If the number is the same but the order is wrong, then you won't (even worse).  Before the correction to getM for ff_matrix objects below, an error was returned because the returned object was only a subset of the rows.
## In these nodeFn methods the rows of the objects returned by fn must
## correspond to the rows of object1 and object2; a wrong row count errors,
## but a wrong row ORDER goes undetected, so fn must preserve row order.
setMethod("nodeFn", c("integer", "matrix"),
	function (cols, object1, object2, fn, ...) {
		## Worker-side helper: process this node's columns in batches of
		## at most ocSamples() columns, applying `fn` batch by batch.
		if (length(cols) > 0) {
			# Indices are relative to the subset taken just below.
			batches <- splitIndicesByLength(seq_along(cols), ocSamples())
			if (missing(object2)) {
				object1 <- object1[, cols, drop=FALSE]
				for (b in batches) {
					object1[, b] <- fn(object1[, b, drop=FALSE], ...)
				}
				return(object1)
			}
			object1 <- object1[, cols, drop=FALSE]
			object2 <- object2[, cols, drop=FALSE]
			for (b in batches) {
				pair <- fn(object1[, b, drop=FALSE],
					object2[, b, drop=FALSE], ...)
				object1[, b] <- pair[[1]]
				# fn may return NULL for the second channel (unchanged)
				if (!is.null(pair[[2]])) object2[, b] <- pair[[2]]
			}
			return(list(object1, object2))
		}
	})

setMethod("nodeFn", c("integer", "ff_matrix"), 
	function (cols, object1, object2, fn, ...) {
		## Worker-side processing for ff-backed matrices: results are
		## written back into the shared ff objects (side effect), so this
		## method returns TRUE rather than data. Note that unlike the
		## matrix method, grpCols holds ABSOLUTE column indices here.
    	if (length(cols) > 0) {
	        grpCols <- splitIndicesByLength(cols, ocSamples())
	        open(object1)
			if (missing(object2)) {
		        for (theCols in grpCols) {
					object1[, theCols] <- fn(object1[, theCols, drop = FALSE], ...)
				}			
			} else {
				open(object2)
	        	for (theCols in grpCols) {
					ret <- fn(object1[, theCols, drop = FALSE], 
						object2[, theCols, drop = FALSE], ...)	
					object1[, theCols] <- ret[[1]]
					## fn may return NULL as the second element to leave
					## the second channel unchanged
					if (!is.null(ret[[2]])) object2[, theCols] <- ret[[2]]
				}
				close(object2)
				rm(object2)
			}
	        close(object1)
	        rm(object1, grpCols)
	        gc()  # free worker memory before returning
	    }
	    TRUE
	})


##########
## getM ##
##########
# method for signature TilingFeatureSet
#setMethod("getM", "TilingFeatureSet", 

## Generic for extracting log-ratio M values (log2 channel1 - log2 channel2).
setGeneric("getM", function(object1, ...) standardGeneric("getM"))

setMethod("getM", "TilingFeatureSet", 
	function(object1, rows) {
		## M = log2(channel1) - log2(channel2), computed by dispatching
		## to the matrix/ff_matrix getM method on the two assay-data
		## channels. `rows` defaults to every probe.
		chan1 <- assayDataElement(object1, "channel1")
		chan2 <- assayDataElement(object1, "channel2")
		if (missing(rows)) {
			rows <- 1:nrow(chan1)
		}
		getM(chan1, chan2, rows)
	})

setMethod("getM", "matrix", 
	function(object1, object2, rows) {
		## Elementwise log2 ratio of channel1 over channel2 for `rows`.
		logC1 <- log2(object1[rows,])
		logC2 <- log2(object2[rows,])
		logC1 - logC2
	})

#setMethod("getM", "ff_matrix", 
#	function(object1, object2, rows) {
#		ret <- oApply(object1=object1, object2=object2, copy=TRUE, 
#			fn=function (object1, object2, rows)
#			 	list(log2(object1[rows,])-log2(object2[rows,]), NULL),
#			rows=rows)
#		ret[[1]]	
#	})	
## Corrected version of getM for ff_matrix objects:
## Corrected version of getM for ff_matrix objects:
setMethod("getM", "ff_matrix", 
	function(object1, object2, rows) {
		## M values for ff-backed channels. The log ratio is computed on
		## cloned ff objects via oApply (copy=TRUE leaves the inputs
		## untouched); the requested rows are then copied into a fresh
		## ff_matrix with the in-file asff() helper, which also carries
		## over dimnames (the previous hand-rolled copy loop dropped
		## rownames).
		ret <- oApply(object1=object1, object2=object2, copy=TRUE, 
			fn=function (object1, object2)
				list(log2(object1) - log2(object2), NULL))
		asff(ret[[1]], rows=rows)
	})



cloneFeatureSet <- function(object, finalizer="delete") {
	## Deep-copy the assay-data channels of `object` when they are
	## ff-backed, so later copy=FALSE operations cannot clobber the
	## caller's data. In-memory matrices are copy-on-modify already and
	## are left alone.
	##
	## Args:
	##   object: a FeatureSet-like object with channel matrices.
	##   finalizer: passed to ff's clone(); "delete" removes the backing
	##     file when the clone is garbage collected.
	## Returns: `object`, with cloned ff channels if applicable.
	cn <- channelNames(object)
	# inherits() is the robust class test (class() may have length > 1)
	if (inherits(assayDataElement(object, cn[1]), "ff_matrix")) {
		for (i in seq_along(cn)) {
			chan <- assayDataElement(object, cn[i])
			open(chan)
			assayDataElement(object, cn[i]) <- clone(chan, finalizer=finalizer)
		}
	}
	return(object)
}

###########
## methp ##
###########
methp <- function(dat, spatial=TRUE, bgSubtract=TRUE,
		withinSampleNorm="loess",
		scale=c(0.99, 0.99),
		betweenSampleNorm="quantile", 
		controlProbes=NULL, controlIndex=NULL, 
		excludeIndex=NULL,
		commonMethPercentParams=NULL,
		verbose=TRUE, returnM=FALSE, 
		plotDensity=NULL, plotDensityGroups=NULL) {
	## Full CHARM pre-processing pipeline for a TilingFeatureSet:
	## spatial adjustment -> background subtraction -> within-sample
	## normalization -> between-sample normalization, followed by either
	## percentage-methylation estimation (default) or raw M values.
	##
	## Args:
	##   dat: TilingFeatureSet of raw two-channel intensities.
	##   spatial, bgSubtract: toggle the corresponding correction steps.
	##   withinSampleNorm, scale: passed to normalizeWithinSamples().
	##   betweenSampleNorm: "quantile", "sqn", "none", or a list with
	##     components m/untreated/enriched for fine-grained control.
	##   controlProbes/controlIndex: at least one must identify the
	##     non-CpG control probes.
	##   excludeIndex: optional probe indices excluded from density plots.
	##   commonMethPercentParams: forwarded to methPercent(); default
	##     depends on the between-sample method chosen.
	##   verbose: logical or integer; values > 2 also print gc() output.
	##   returnM: if TRUE (or "TRUE"), return M values instead of
	##     percentage methylation.
	##   plotDensity: optional PDF file name for diagnostic density
	##     plots. NOTE: this argument shadows the plotDensity() function
	##     defined in this file, but R still resolves the call position
	##     to the function.
	##   plotDensityGroups: group labels for the diagnostic plots.
	## Returns: matrix/ff_matrix of percentage methylation (or M values).

	# Validate arguments up front so failures are cheap.
	if(!is.null(excludeIndex)) if(!inherits(excludeIndex,c("numeric","integer"))) stop("excludeIndex argument, if not NULL, must be a numeric or integer vector.")
	if(!is.null(controlIndex)) if(!inherits(controlIndex,c("numeric","integer"))) stop("controlIndex argument, if not NULL, must be a numeric or integer vector.")
	if(is.null(controlProbes) && is.null(controlIndex)) stop("Either controlProbes or controlIndex argument must be specified.")
	if(!any(controlProbes %in% getContainer(dat)) && is.null(controlIndex)) stop("Invalid controlProbes argument: no such values found for this array.")

	if(!is.null(plotDensity)) {
		pdf(file=plotDensity, height=11, width=8)
		par(mfrow=c(5,2), mar=c(2,2,4,2))
		plotDensity(dat, main="1. Raw", lab=plotDensityGroups, 
			controlIndex=controlIndex, controlProbes=controlProbes,
			excludeIndex=excludeIndex)
	}

	# Resolve the between-sample normalization recipe.
	if (is.list(betweenSampleNorm)) {
		bs <- betweenSampleNorm
	} else if (betweenSampleNorm=="quantile") {
		bs <- list(m="allQuantiles", untreated="none", enriched="none")
		if(is.null(commonMethPercentParams)) commonMethPercentParams <- TRUE
	} else if (betweenSampleNorm=="sqn") {
		bs <- list(m="none", untreated="complete", enriched="sqn")
		if(is.null(commonMethPercentParams)) commonMethPercentParams <- FALSE
	} else if (betweenSampleNorm=="sqn99") {
		stop("Option sqn99 is deprecated. The same behaviour is now obtained by using sqn.")
	} else if (betweenSampleNorm=="none") {
		bs <- list(m="none", untreated="none", enriched="none")
		if(is.null(commonMethPercentParams)) commonMethPercentParams <- FALSE
	} else {
		# Previously an unknown option fell through and failed later
		# with a cryptic "object 'bs' not found"; fail fast instead.
		stop("Invalid betweenSampleNorm option: ", betweenSampleNorm)
	}
	if (verbose>2) print(gc())

	# Clone ff-backed channels so copy=FALSE steps below don't modify
	# the caller's object.
	dat <- cloneFeatureSet(dat)

	# Spatial bias correction
	if (spatial) {
		if (verbose) message("Spatial normalization")
		dat <- spatialAdjust(dat, copy=FALSE)
	}
	if (verbose>2) print(gc())
	# Background removal
	if (bgSubtract) {
		if (verbose) message("Background removal")
		dat <- bgAdjust(dat, copy=FALSE)
	}
	if (verbose>2) print(gc())
	if(!is.null(plotDensity)) {
		plotDensity(dat, main="2. After spatial & bg", lab=plotDensityGroups, controlIndex=controlIndex, controlProbes=controlProbes, excludeIndex=excludeIndex)
	}
	# Within sample normalization
	if (is.null(controlIndex)) {
		controlIndex <- getControlIndex(dat, controlProbes=controlProbes)
	}
	if (verbose) message("Within sample normalization: ", withinSampleNorm) 
	dat <- normalizeWithinSamples(dat, copy=FALSE, 
		method=withinSampleNorm,
		scale=scale, controlIndex=controlIndex, verbose=verbose)
	if(!is.null(plotDensity)) {
		plotDensity(dat, main="3. After within-sample norm", 
			lab=plotDensityGroups, controlIndex=controlIndex,
			controlProbes=controlProbes, excludeIndex=excludeIndex)
	}
	if (verbose>2) print(gc())

	# Between sample normalization
	if (verbose) {
		message("Between sample normalization", appendLF=FALSE)
		if (is.list(betweenSampleNorm)) {
			message(". M: ", bs$m, ", Untreated channel: ", bs$untreated, 
				", Methyl-depleted channel: ", bs$enriched)
		} else {
			message(": ", betweenSampleNorm)
		}
	}
	dat <- normalizeBetweenSamples(dat, copy=FALSE,
		m=bs$m, untreated=bs$untreated, enriched=bs$enriched,
		controlProbes=controlProbes, controlIndex=controlIndex,
		excludeIndex=excludeIndex, verbose=verbose)
	if (verbose>2) print(gc())
	if(!is.null(plotDensity)) {
		plotDensity(dat, main="4. After between-sample norm",
			lab=plotDensityGroups, controlIndex=controlIndex,
			controlProbes=controlProbes, excludeIndex=excludeIndex)
	}

	# String comparison is deliberate: accepts logical TRUE and "TRUE"
	# (legacy callers passed either).
	if (returnM=="TRUE") {
		retval <- getM(dat, rows=pmindex(dat))	
	} else {
		if (verbose) message("Estimating percentage methylation")
		retval <- methPercent(m=getM(dat), pmIndex=pmindex(dat),
			commonParams=commonMethPercentParams,
			ngc=countGC(dat))
	}
	if (verbose>2) print(gc())
	if(!is.null(plotDensity)) {
		if (is.null(controlIndex)) controlIndex <- getControlIndex(dat, controlProbes=controlProbes)
		if(returnM=="FALSE") {
			plotDensity(retval, main="5. Percentage methylation", 
				rx=c(0,1), lab=plotDensityGroups, controlIndex=controlIndex,
				controlProbes=controlProbes, excludeIndex=excludeIndex) 
		}
		dev.off()
	}
	rm(dat)
	return(retval)
}



## Read paired NimbleGen .xys files (untreated + methyl-depleted channel
## per sample) into a two-channel TilingFeatureSet.
##   files: character vector of file names (both channels, any order).
##   path: directory containing the files.
##   ut/md: file-name suffixes identifying the untreated and
##     methyl-depleted channel files respectively.
##   sampleKey: optional data.frame of per-file annotation, one row per
##     entry of `files`, in the same order.
##   sampleNames: optional per-file sample names; ut/md pairs must agree.
##   pkgname: pd.* annotation package name, passed to read.xysfiles2().
##   type: optional per-file type labels.
## Returns: TilingFeatureSet with arrays ordered as their ut-channel
##   files appeared in `files`.
readCharm <- function(files, path=".", ut="_532.xys", md="_635.xys", 
		sampleKey, sampleNames=NULL, pkgname,
		type=NULL, ...) {
    # Work on a sorted copy so ut/md pairs line up positionally; files2
    # keeps the original order for the final reordering step.
    files <- files2 <- as.character(files)
    o <- order(files)
    files <- files[o]
    if (!is.null(sampleNames)) sampleNames <- as.character(sampleNames[o])
    utIdx <- grep(ut, files)
	if (length(utIdx)==0) {
		stop("No files match the untreated extension ", ut, "\nPlease use the ut option to set the correct extension\n")
	}
    mdIdx <- grep(md, files)
	if (length(mdIdx)==0) {
		stop("No files match the methyl-depleted extension ", md, "\nPlease use the md option to set the correct extension\n")
	}
    if(length(utIdx)+length(mdIdx) < length(files)) warning("Some file names do not contain the ut or md arguments. Those files will not be read in.")
    filesUt <- files[utIdx]
    filesMd <- files[mdIdx]
    # Pairing check: stripping the channel suffix must yield identical
    # base names for the two channel file lists.
    if (!all(sub(ut, "", filesUt) == sub(md, "", filesMd))) 
        stop(("The untreated (ut) and methyl-depleted (md) file names don't match up\n"))
    if (!is.null(sampleNames)) {
        sampleCheck <- sampleNames[utIdx] == sampleNames[mdIdx]
        if (!all(sampleCheck)) 
            stop("The untreated (ut) and methyl-depleted (md) sample names don't match up\n Check:", 
                sampleNames[utIdx][!sampleCheck])
        sampleNames <- sampleNames[utIdx]
    } else {
        sampleNames <- sub(ut, "", filesUt)
    }

	if (!missing(sampleKey)) {
		sampleKey <- sampleKey[o,]

                # Reconcile NAs between channels: if a value is present in
                # one channel's row but NA in the other, copy it across.
                utNA <- is.na(sampleKey[utIdx,]) & !is.na(sampleKey[mdIdx,])
                mdNA <- !is.na(sampleKey[utIdx,]) & is.na(sampleKey[mdIdx,])
                for(j in 1:ncol(sampleKey)) {
                    rp1 = utNA[,j]
                    rp2 = mdNA[,j]
                    if(any(rp1)) sampleKey[utIdx[rp1],j] = sampleKey[mdIdx[rp1],j]
                    if(any(rp2)) sampleKey[mdIdx[rp2],j] = sampleKey[utIdx[rp2],j]
                }

		# Keep only columns whose values agree between the two channels
		# (or are NA in both).
		same <- sampleKey[utIdx,]==sampleKey[mdIdx,]
		bothNA <- is.na(sampleKey[utIdx,]) & is.na(sampleKey[mdIdx,])
		keep <- apply(same | bothNA, 2, all)
                if(any(!keep)) message("The following columns in sampleKey contain discrepant values between channels and are being removed: ", paste(which(!keep),collapse=", "))
		extraCols <- sampleKey[utIdx, keep]
	} else {
		extraCols <- matrix(nrow=length(utIdx), ncol=0)
	}
	if (!is.null(type)) {
		type <- as.character(type[o])
		pd <- data.frame(extraCols, type=type[utIdx],
			 arrayUT=filesUt, arrayMD=filesMd, stringsAsFactors=FALSE)
	} else {
		pd <- data.frame(extraCols, arrayUT=filesUt, arrayMD=filesMd, stringsAsFactors=FALSE)
	}
	# varMetadata: the last two phenoData columns are channel-specific
	# (file names); everything else applies to both channels.
	vpd <- data.frame(
		labelDescription=c(rep(NA, ncol(pd)-2), 
			"Untreated channel file name", 
			"Methyl-depleted channel file name"),
		channel=factor(c(rep("_ALL_", ncol(pd)-2), 
			"channel1", "channel2"), 
			levels=c("channel1", "channel2", "_ALL_")))
			
	## TEMPORARY FIX TO REMAIN BACKWARD COMPATABILITY WITH ##
	## oligo 1.10.x (Bioconductor 2.5). This fix avoids    ##
	## a warning 										   ## 		
	#version <- as.numeric(sub(".*\\.(.*)\\..*", "\\1",
	# 	packageDescription("oligo", fields="Version")))
	#if (version<11) {	
	#	vpd <- data.frame(
	#		labelDescription=c(rep(NA, ncol(pd)-2), 
	#			"Untreated channel file name", 
	#			"Methyl-depleted channel file name"))
	#}
	## END TEMPORARY FIX ##
	 
    ##Put arrays back in the order in which their ut channels were listed in files arg:
    files2 = files2[files2%in%c(pd$arrayUT,pd$arrayMD)]
    files2Ut = files2[sort(grep(ut, files2))]
    reord = match(files2Ut,pd$arrayUT)
    files2Md = pd$arrayMD[reord]

    pdd <- new("AnnotatedDataFrame", data=pd[reord,], varMetadata=vpd)
    sampleNames(pdd) <- sampleNames[reord]  
   	dat <- read.xysfiles2(channel1=file.path(path, files2Ut), 
			channel2=file.path(path, files2Md), pkgname=pkgname,
        	phenoData=pdd, sampleNames=sampleNames[reord], ...)     
	return(dat)
}


#################
## plotDensity ##
#################
## Diagnostic density plots of M values: one panel for all PM probes and
## one restricted to control probes.
##   dat: TilingFeatureSet (M computed via getM) or a plain M matrix.
##   rx: x-axis range; controlIndex/controlProbes identify controls.
##   pdfFile: optional output file (two stacked panels).
##   lab: per-sample group labels used for line colors.
##   excludeIndex: probes to drop before plotting.
plotDensity <- function(dat, rx=c(-4,6), controlIndex=NULL, 
		controlProbes=NULL, pdfFile=NULL, main=NULL, lab=NULL, excludeIndex=NULL) {

        if(is.null(controlIndex) & is.null(controlProbes)) stop("Either controlIndex or controlProbes argument must be provided to plotDensity.")
	if (!is.null(pdfFile)) {
		pdf(pdfFile)
		par(mfcol=c(2,1))
	}
	if ("TilingFeatureSet" %in% class(dat)) {
		M <- getM(dat)
		pmIndex <- pmindex(dat)
		if (is.null(lab)) lab <- sampleNames(dat)
	} else {
		M <- dat
		pmIndex <- 1:nrow(M)
		if (is.null(lab)) lab <- colnames(dat)		
	}
	if (is.null(controlIndex)) controlIndex <- getControlIndex(dat, controlProbes= controlProbes)
        pIdx <- pmIndex
	if(!is.null(excludeIndex)) pIdx <- pIdx[-excludeIndex]
	# NOTE(review): after removing excludeIndex, positions in pIdx shift,
	# yet controlIndex (positions into the un-excluded PM set) is used to
	# index the shortened pIdx below. This looks off-by-position when
	# excludeIndex is non-NULL -- confirm intended behavior.
	cIndex <- pIdx[setdiff(controlIndex, excludeIndex)]
	plotDensityMat(M, idx=pIdx, xlab="M", lab=lab, 
		main=paste(main,"\nAll probes"), rx=rx)
	plotDensityMat(M, idx=cIndex, xlab="M", lab=lab, 
		main=paste(main, "\nControl probes"), rx=rx)
	if (!is.null(pdfFile)) dev.off()
}

plotDensityMat <- function (x, idx, main = NULL, 
    ylab = "Density", xlab = "M", lab = NULL, rx = NULL, ry = NULL, 
    legendPos = "topright", cex=0.8) {
	## Overlay per-sample density curves of x[idx, ] (matrix or
	## ff_matrix), colored by the factor `lab`, with a legend of the
	## factor levels.
	lab <- as.factor(lab)
	if (missing(idx)) idx <- seq_len(nrow(x))
	if (inherits(x, "matrix")) {
		# drop=FALSE keeps a single-row subset a matrix so apply() over
		# columns still works (previously this errored for length-1 idx)
		d <- apply(x[idx, , drop = FALSE], 2, density, na.rm = TRUE)
	}
	if (inherits(x, "ff_matrix")) {
		i1<-i2<-NULL # To avoid R CMD CHECK "no visible binding" warning
		d <- ffcolapply(density(x[idx,i1:i2], na.rm=TRUE), X=x, 
			BATCHSIZE=1, BATCHBYTES=2^30, RETURN=TRUE, CFUN="list")
	}
    if (is.null(rx)) rx <- range(sapply(d, function(i) i$x))
    if (is.null(ry)) ry <- range(sapply(d, function(i) i$y))
    plot(rx, ry, type = "n", xlab = xlab, ylab = ylab, main = main)
    sapply(seq_along(d), function(i) lines(d[[i]], col = lab[i]))
    legend(legendPos, legend = levels(lab), 
		text.col = seq_along(levels(lab)), cex = cex)
}

	

regionFilter <- function(x, region, f) {
	## Smooth `x` within each region independently: apply myfilter(, f)
	## per region, then return the concatenated result stripped of the
	## region-label names that unlist(tapply(...)) attaches.
	smoothed <- tapply(x, region, myfilter, f)
	unname(unlist(smoothed))
}



## Uses all control and bg probes passed to it
normControlBg <- function(pms, bgs=NULL, controlIndex=NULL, affinity=NULL) {
    ## Subtract a smooth control/background-based bias estimate from each
    ## column (sample) of `pms`.
    ##
    ## Args:
    ##   pms: matrix of PM intensities, samples in columns.
    ##   bgs: optional matrix of background-probe intensities.
    ##   controlIndex: optional row indices of control probes in `pms`.
    ##   affinity: optional per-probe affinity values; when supplied the
    ##     bias curve is fit against affinity rather than intensity.
    ## Returns: `pms` with the fitted bias removed, column by column.
    mBg <- NULL
    ctrl <- NULL
    mCtrl <- NULL
    if (!is.null(bgs)) {
        # Per-probe reference level across samples.
        mBg <- rowMedians(bgs)
    }
    if (!is.null(controlIndex)) {
        ctrl <- pms[controlIndex,]
        mCtrl <- rowMedians(ctrl)
    }
    # (Spaces added: the old message ran the words and the count together.)
    message(" Normalizing with ", length(c(mBg, mCtrl)), " control probes ", appendLF=FALSE)
    if (!is.null(affinity)) {
        message("(affinity-based mapping)")
    } else {
        message("(intensity-based mapping)")
    }
  
    for (i in seq_len(ncol(pms))) {
        if (is.null(affinity)) {
            # Map intensity -> expected deviation, then subtract it.
            map <- getMap(x=c(bgs[,i], ctrl[,i]), m=c(mBg, mCtrl))
            pms[,i] <- pms[,i]-map(pms[,i])   
        } else {
            map <- getMap(x=ctrl[,i], m=mCtrl, affinity=affinity[controlIndex])
            pms[,i] <- pms[,i]-map(affinity)   
        }
    }
    return(pms)
}

getMap <- function (x, m, affinity=NULL, df = 5) 
{
    ## Fit a natural-spline curve to the deviations x - m as a function
    ## of `affinity` (defaulting to x itself) and return a prediction
    ## function mapping new affinity values to expected deviations.
    if (is.null(affinity)) affinity <- x
    keep <- !(is.na(x) | is.na(m))
    x <- x[keep]
    m <- m[keep]
    affinity <- affinity[keep]
    deviation <- x - m
    fit <- lm(deviation ~ ns(affinity, df = df))
    function(x1) predict(fit, data.frame(affinity = x1))
}




getControlIndex <- function(dat, controlProbes=NULL, noCpGWindow=1000, subject, 
                            onlyGood=FALSE, matrix=TRUE) {
    ## Determine the indices of control (non-CpG) probes.
    ##
    ## Two modes:
    ##  * subject missing: use a precomputed "controlIndex.rda" from the
    ##    annotation package if present, else match `controlProbes`
    ##    against the probe container names.
    ##  * subject given (a BSgenome): probes with zero CpG density in a
    ##    `noCpGWindow`-bp window are controls; a genome-version-specific
    ##    cache file is used if found.
    ## NOTE(review): `onlyGood` and `matrix` are currently unused here.
    if (missing(subject)) {
		file <- file.path(system.file(package = annotation(dat)), 
		    "data", "controlIndex.rda")
		if (file.exists(file)) {
		    # Loads a precomputed `controlIndex` object into this frame.
		    load(file)
		} else {
                    if(is.null(controlProbes)) stop("controlProbes argument must be provided to getControlIndex if subject argument not used.")
        	    controlIndex <- which(getContainer(dat) %in% controlProbes)
		}
    } else {
        # inherits() also handles S4 subclasses; class(x)=="BSgenome"
        # breaks when class(x) has length > 1 or subject is a subclass.
        if (!inherits(subject, "BSgenome"))
            stop("You must supply a BSgenome object for subject if using the noCpGWindow option. E.g. Hsapiens from the BSgenome.Hsapiens.UCSC.hg18 package.\n")
		pv <- providerVersion(subject)	
		file <- file.path(system.file(package = annotation(dat)), 
			    "data",	paste(pv, "_noCpG", noCpGWindow, ".rda", sep=""))
		if (file.exists(file)) {
		    load(file)
		} else {
	    	chr <- pmChr(dat)
	        pos <- pmPosition(dat)
	        cpgd <- cpgdensity(subject, chr=chr, pos=pos, 
				windowSize=noCpGWindow)
	        controlIndex <- which(cpgd==0)
		}
    }
	return(controlIndex)
}



## Estimate a per-sample differential-amplification adjustment as a
## function of probe GC content, from control probes only.
##   dat: TilingFeatureSet; method: summary applied per GC bin ("median").
##   controlIndex/controlProbes: identify control probes (one required).
##   smooth: bandwidth (in GC units) of the Gaussian kernel used to
##     smooth the per-GC estimates.
## Returns: matrix of adjustments, rows = GC counts 1..max, cols = samples.
diffAmpEstGC <- function(dat, method="median", controlIndex=NULL, controlProbes=NULL, smooth=2) {

        if(is.null(controlIndex) & is.null(controlProbes)) stop("Either controlIndex or controlProbes argument must be provided to diffAmpEstGC.")

    #kern <- function(u, type="epanechnikov", smooth=1) {
    #    ifelse(abs(u)<smooth, (1/smooth) * (3/4) * (1-(u/smooth)^2), 0)
    #}
    Ngc <- countGC(dat, "pm")

    if (is.null(controlIndex)) {
        #controlIndex <- getControlIndex(dat, controlProbes = controlProbes)
        ##same as:
        controlIndex <- which(getContainer(dat)%in%controlProbes)
    }

    ngc <- Ngc[controlIndex]
    datPm <- log2(pm(dat)) 
    # M = log ratio, A = average log intensity (MA-plot coordinates)
    M <- datPm[,,"channel1"] - datPm[,,"channel2"] 
    adj <- sapply (1:ncol(M), function(samp) {
        m <- M[controlIndex, samp]
        a <- (datPm[,,"channel1"][controlIndex,samp] + 
              datPm[,,"channel2"][controlIndex,samp])/2
        nas <- is.na(m) | is.na(a)
        m <- m[!nas]       
        a <- a[!nas]
        ngc <- ngc[!nas]
        # Only control probes above an intensity breakpoint are trusted;
        # the breakpoint is chosen by optimize() over checkBreakpoint().
        breakpoint <- optimize(checkBreakpoint, range(a), ms=m, as=a)$minimum
        keep <- a>breakpoint
        if (sum(keep)/length(controlIndex) < (1/10)) { # Make sure we use at least 1/10 the control probes
            o <- order(a, decreasing=TRUE)
            keep[o[1:(length(controlIndex)/10)]] <- TRUE
        }        
        # Per-GC-count summary (tapply resolves the `method` string via
        # match.fun) and bin sizes used as weights below.
        med <- tapply(m[keep], ngc[keep], method)
        weights <- tapply(m[keep], ngc[keep], length)
        ngc_adj <- est <- w <- rep(NA, max(Ngc))
        est[as.numeric(names(med))] <- med
        w[as.numeric(names(weights))] <- weights
        # Gaussian-kernel smoothing of the per-GC estimates, weighted by
        # bin size.
        for (i in which(!is.na(est))) {
            k <- dnorm(abs(i-1:length(ngc_adj))/smooth)       
            #k <- kern(abs(i-1:length(ngc_adj)), smooth=smooth)
            ngc_adj[i] <- sum(k*w*est, na.rm=TRUE) / sum(k*w, na.rm=TRUE)
        }
        # Fill remaining NAs by carrying the nearest estimate forward,
        # then backward.
        for (i in 2:length(ngc_adj)) {
            if (is.na(ngc_adj[i]) & !is.na(ngc_adj[i-1]))
                ngc_adj[i] <- ngc_adj[i-1]
        }
        for (i in (length(ngc_adj)-1):1) {
            if (is.na(ngc_adj[i]) & !is.na(ngc_adj[i+1]))
                ngc_adj[i] <- ngc_adj[i+1]
        }
        ngc_adj
    })   
    colnames(adj) <- sampleNames(dat)
    rownames(adj) <- paste("GC=", 1:nrow(adj), sep="")
    adj
}


diffAmpAdjustGC <- function (dat, adj){
    ## Apply the per-sample, per-GC-count adjustment from diffAmpEstGC()
    ## to channel2 on the log2 scale, then write intensities back.
    logPm <- log2(pm(dat))
    gcCount <- countGC(dat)
    nSamples <- dim(dat)["Samples"]
    for (s in 1:nSamples) {
        logPm[, s, "channel2"] <- logPm[, s, "channel2"] + adj[gcCount, s]
    }
    pm(dat) <- 2^logPm
    return(dat)
}


    
countGC <- function(dat, type="pm", idx) {
    ## Number of G+C bases in each probe sequence of the requested kind
    ## ("pm" or "bg"), optionally restricted to the probes in `idx`.
    seqs <- switch(type,
        pm = pmSequence(dat),
        bg = bgSequence(dat),
        stop("Invalid type. Choose 'pm' or 'bg'\n"))
    if (!missing(idx)) {
        seqs <- seqs[idx]
    }
    freq <- alphabetFrequency(seqs, baseOnly=TRUE)
    freq[, "C"] + freq[, "G"]
}

cpgdensity <- function(subject, chr, pos, windowSize=500, sequence="CG") {
    ## For each probe position, count occurrences of `sequence` (default
    ## CpG) in a window of `windowSize` bp centered on the position and
    ## divide by the window size.
    ##
    ## Args:
    ##   subject: a BSgenome-like object indexable by chromosome name.
    ##   chr, pos: parallel vectors of chromosome names and positions.
    ##   windowSize: window width in bp.
    ##   sequence: DNA pattern to count.
    ## Returns: numeric vector parallel to `pos`; NA for probes on
    ##   chromosomes absent from `subject` or without a usable sequence.
    idx <- split(seq_along(chr), chr)   # seq_along is safe for empty input
    s <- DNAString(sequence) 
    cpgdensity <- rep(NA, length(pos))
    for (curchr in names(idx)) {
            if (curchr %in% names(subject)) {
                chrseq <- subject[[curchr]]
				if (inherits(chrseq, "DNAString") || inherits(chrseq, "MaskedDNAString")){
	                curpos <- pos[idx[[curchr]]]
	                # Windows running off the chromosome ends only warn
	                v <- suppressWarnings(Views(chrseq, start=curpos-windowSize/2, end=curpos+windowSize/2))
	                d <- suppressWarnings(DNAStringSet(v))
	                numcpg <- vcountPattern(s, d, fixed=TRUE)
	                cpgdensity[idx[[curchr]]] <- numcpg/windowSize       
				}
            }
    }
    cpgdensity
}



##############
## qcReport ##
##############

## Per-array quality-control report: signal strength (PM quality score)
## and per-channel spatial standard deviation, optionally written to a
## multi-page PDF with dot plots, a histogram, and array pseudo-images.
##   dat: TilingFeatureSet.
##   file: optional PDF file name; NULL skips plotting.
##   utRange/enRange: color ranges for the two channels' array images.
##   numProbes: number of PM probes to sample (-1 = all).
##   blockSize: array-image block edge; defaults via getBlockSize().
## Returns: data.frame with columns pmSignal, sd1, sd2 (one row per array).
qcReport <- function(dat, file=NULL, utRange=c(30,100), enRange=c(8,12), numProbes=5e+5, blockSize) {
    # Calculate summary quality scores
	n <- length(pmindex(dat))
	if (numProbes==-1) numProbes <- n
	if (n < numProbes) numProbes <- n
	# Evenly spaced probe subsample to bound memory/time.
	idx <- as.integer(seq(1, n, length.out=numProbes))
    pmQual <- pmQuality(dat, idx=idx)[,,drop=FALSE] 
    X <- getX(dat, "pm")[idx]
    Y <- getY(dat, "pm")[idx]
	if(missing(blockSize)) blockSize <- getBlockSize(dat)
    imgs1 <- arrayImage(X,Y, pmQual, blockSize=blockSize)
 	#sd1 <- unlist(lapply(imgs1, function(x) sd(as.vector(x$z), na.rm=TRUE)))

	# Spatial SD per channel: block the log2 intensities into array
	# images and take the SD of the block means.
	z <- assayDataElement(dat, "channel1")[pmindex(dat)[idx],, drop = FALSE]
	tmp <- arrayImage(X,Y, log2(z),	blockSize=blockSize)
	sd1 <- unlist(lapply(tmp, function(x) sd(as.vector(x$z), na.rm=TRUE)))
	rm(z, tmp)
	z <- assayDataElement(dat, "channel2")[pmindex(dat)[idx],, drop = FALSE]
	imgs2 <- arrayImage(X,Y, log2(z), blockSize=blockSize)
	rm(z)
 	sd2 <- unlist(lapply(imgs2, function(x) sd(as.vector(x$z), na.rm=TRUE)))

	sdRange <- c(0, max(sd1, sd2)*1.1)
	# ff-backed quality matrix: average column by column to avoid
	# loading everything at once.
	if (any(class(pmQual)=="ff")) {
		pmSignal <- sapply(1:ncol(pmQual), function(i) mean(pmQual[,i]))
		names(pmSignal) <- colnames(pmQual)
	} else {
		pmSignal <- colMeans(pmQual, na.rm=TRUE) 
	}
	
    if (!is.null(file)) {
        pdf(file=file, width=8, height=10.5)    
        # Page 1 layout: names column + three dot plots + histogram strip.
        lay <- rbind(cbind(rep(1,3), matrix(rep(2:4, each=6), ncol=6)), rep(5,7))
		layout(lay)
		#layout(matrix(c(1,1,1,4, 2,2,2,4, 3,3,3,4 ), ncol=3))

	    n <- length(pmSignal)
		if (is.null(names(pmSignal))) {
			names(pmSignal) <- paste("Sample", 1:n)
		}
		l <- length(pmSignal)

		# Shrink label text when there are many samples or long names.
		longestName <- max(sapply(names(pmSignal), nchar))
	    cexh <- ifelse (n>50, max(0.25, 1-(n-50)*.01), 1)
	    cexw <- ifelse (longestName>17, max(0.25, 1-(longestName-15)*0.05), 1)
		cex <- min(cexh, cexw)
		par(oma=c(2,2,4,2))

		o <- order(names(pmSignal))
		#o <- order(names(pmSignal), decreasing=TRUE)
		par(mar=c(5,0,3,0))
		plot.new()
		plot.window(xlim=c(0,10), ylim=c(0,l))
		text(1,l:1-0.5, names(pmSignal)[o], adj=0.1, cex=cex)
	    abline(h=1:(l+1)-1, lwd=0.5, lty=3)

	    xmin <- max(0, floor((min(pmSignal)-5)/5)*5)
	    xmax <- min(100, ceiling((max(pmSignal)+5)/5)*5)
	    par(mar=c(5,1,3,1))
	    plot(pmSignal[o], l:1, las=1, ylab="", xlab="", type="p", 
	        pch=19, xlim=c(xmin, xmax), ylim=c(0.5, l+0.5),
	        yaxt="n", frame.plot=TRUE, main="Signal strength")
	    abline(h=1:(l+1)-0.5, lwd=0.5, lty=3)
	    #axis(2, at=1:l, labels=names(pmSignal), las=1, tick=FALSE, cex.axis=cex)

	    #xmin <- min(sd1)-1
	    #xmax <- max(sd1)+1
	    par(mar=c(5,1,3,1))
	    plot(sd1[o], l:1, las=1, ylab="", xlab="", type="p", 
	        pch=19, xlim=sdRange, ylim=c(0.5, l+0.5),
	        yaxt="n", frame.plot=TRUE, main="Channel 1\nstandard deviation")
	    abline(h=1:(l+1)-0.5, lwd=0.5, lty=3)

	    #xmin <- min(sd2)-0.01
	    #xmax <- max(sd2)+0.01
	    par(mar=c(5,1,3,1))
	    plot(sd2[o], l:1, las=1, ylab="", xlab="", type="p", 
	        pch=19, xlim=sdRange, ylim=c(0.5, l+0.5),
	        yaxt="n", frame.plot=TRUE, main="Channel 2\nstandard deviation")
	    abline(h=1:(l+1)-0.5, lwd=0.5, lty=3)

	    #axis(1, tick=FALSE)
	    hist(pmSignal, main="Signal strength histogram", xlab="Signal strength")

        # Plot UT channel probe quality (PM vs BG)
        l <- layout(matrix(c(1,1,1,1,2:21), 6, 4, byrow = TRUE)) # 
        plot.new()
        text(0.5, 0.2, "Untreated Channel: PM probe quality", cex=2)
        par(mar=c(4.5,8,3.5,3))
        arrayPlot(imgs1[o], r=utRange)
        # Plot MD channel
        l <- layout(matrix(c(1,1,1,1,2:21), 6, 4, byrow = TRUE)) # 
        plot.new()
        text(0.5, 0.2, "Enriched Channel: PM signal intensity", cex=2)
        par(mar=c(4.5,8,3.5,3))
        arrayPlot(imgs2[o], r=enRange)
        dev.off()
    }
    return(as.data.frame(cbind(pmSignal, sd1, sd2)))
}

as.image <- function(Z, ind, nx, ny, na.rm=TRUE) {
        ## Grid the values in Z onto an nx-by-ny image, averaging all
        ## values assigned to the same cell (`ind` holds each value's
        ## row/column cell). With na.rm=FALSE the faster NA-propagating
        ## variant is used instead.
        if (!na.rm) {
                return(as.image.fast(Z, ind, nx, ny))
        }
        cell_mean <- function(v) base::mean(v, na.rm=TRUE)
        z <- tapply(Z, list(ind[,1], ind[,2]), FUN=cell_mean)
        list(x = 1:ny, y = 1:nx, z = z)
}

as.image.fast <- function(Z, ind, nx, ny) {
        ## NA-propagating variant of as.image(): cell means use plain
        ## mean(), so any NA in a cell yields an NA cell.
        gridded <- tapply(Z, list(ind[,1], ind[,2]), FUN=mean)
        list(x = 1:ny, y = 1:nx, z = gridded)
}


## Summarize per-probe values z (samples in columns) over the physical
## array layout given by probe coordinates (x, y).
##   view="col"/"row": mean profile per array column/row.
##   view="2d" (default): block the array into blockSize x blockSize
##     cells and return one as.image() result per sample.
## Returns: matrix ("col"/"row") or named list of images ("2d").
arrayImage <- function(x,y,z, view="2d", blockSize=50) {
    if (is.null(dim(z))) z <- as.matrix(z)
    if (view=="col") {
		tmp <- vec2array(x,y,z)
        ret <- apply(tmp, 3, rowMeans, na.rm=TRUE)
        colnames(ret) <- colnames(z)
    }
    if (view=="row") {
		tmp <- vec2array(x,y,z)
        ret <- apply(aperm(tmp, c(2,1,3)), 3, rowMeans, na.rm=TRUE)  
        colnames(ret) <- colnames(z)
    }
    if (view=="2d") {
		nx <- max(x)/blockSize
		ny <- max(y)/blockSize
		d <- discretize.image(cbind(y,x), m=ny, n=nx)
                ## The "index" object returned by discretize.image() changed from a matrix 
                ## in fields package v6.9.1 to a list in v7.1, which does not work in as.image 
                ## below, so change it back:
                d$index = do.call(cbind,d$index)
                stopifnot(ncol(d$index)==2)
		ret <- apply(z, 2, function(vec) {
			as.image(vec, ind=d$index, nx=d$m, ny=d$n, na.rm=TRUE)		
        })		
        names(ret) <- colnames(z)
    }
	return(ret)
}

arrayPlot <- function(imgs, xlab="NULL", r=NULL) {
    ## Draw one image.plot() per array in `imgs` (a list of as.image()
    ## results), clamping cell values to the range `r` so all arrays
    ## share a common color scale. `r` defaults to the overall range.
    if (is.list(imgs)) {
        if(is.null(r)) r <- range(unlist(lapply(imgs, function(x) x$z)), na.rm=TRUE)
        for (i in seq_along(imgs)) {
            # Clamp directly with pmin/pmax; the previous pair of apply()
            # calls did the same clamp via two transpositions.
            curImg <- pmin(pmax(imgs[[i]]$z, r[1]), r[2])
            image.plot(curImg, main=names(imgs)[i],  zlim=r, xaxt="n", yaxt="n", horizontal=TRUE)    
        }
    }
    # Non-list input: previously handled by dead (commented-out) code;
    # intentionally a no-op.
}



###############
## pmQuality ##
###############
pmQuality <- function(dat, channel="channel1", verbose=FALSE, idx=NULL) {
	## Per-probe quality score: the percentile (0-100) of each PM probe's
	## intensity within the empirical distribution (ecdf) of background
	## probes with the closest matching GC content.
	##
	## Args:
	##   dat: TilingFeatureSet.
	##   channel: assay-data channel to score.
	##   verbose: currently unused.
	##   idx: optional positions into pmindex(dat) to score (default all).
	## Returns: probes x samples matrix (ff_matrix when the channel is
	##   ff-backed) of quality percentiles, columns named by sample.
	# TODO: parallelize
	if (is.null(idx)) idx <- seq_len(length(pmindex(dat)))
    Ngc <- countGC(dat, "pm", idx)
    bgNgc <- countGC(dat, "bg")
	pmIndex <- pmindex(dat)
	bgIndex <- bgindex(dat)
	x <- assayDataElement(dat, channel)
	# Mirror the input storage class in the result.
	if (inherits(x, "ff_matrix")) {
		ret <- ff(vmode="double", dim=c(length(idx), ncol(x)))
	} else {
		ret <- matrix(nrow=length(idx), ncol=ncol(x))
	}
	for (i in seq_len(ncol(x))) {
		# One ECDF per background GC bin, for this sample.
    	fn <- tapply(x[bgIndex,i], bgNgc, ecdf)
		for (ngc in unique(Ngc)) {
	        curIdx <- Ngc==ngc
	        # Use the background GC bin closest to this probe GC count.
	        closestIdx <- order(abs(as.numeric(names(fn))-ngc))[1]
	        bgngc <- as.character(names(fn)[closestIdx])
			ret[curIdx,i] <- 100 * fn[[bgngc]](x[pmIndex[idx][curIdx],i])   
	    }
	}
	colnames(ret) <- sampleNames(dat)
	return(ret)
}

## Deprecated alias for pmQuality(); kept for backward compatibility.
pmvsbg <- function(...) {
	message("The pmvsbg function has been renamed pmQuality. Both names work for now but please update your code soon")
	pmQuality(...)
}

dynamicRange <- function(dat, prob=0.8) {
    ## Per-probe dynamic-range estimate: log2 channel1 PM intensity minus
    ## the `prob` quantile of log2 channel2 background intensity at the
    ## matching GC content, computed per sample.
    ## NOTE(review): channel1 comes from pm() but channel2 from bg() --
    ## presumably intentional (signal vs background); confirm.
    Ngc <- countGC(dat, "pm")
    bgNgc <- countGC(dat, "bg")
    pmLog <- log2(oligo::pm(dat)[,,"channel1"])
    bgLog <- log2(oligo::bg(dat)[,,"channel2"])
    sapply(sampleNames(dat), function(sn) {
            qByGc <- tapply(bgLog[,sn], bgNgc, quantile, prob)
            ref <- rep(NA, max(as.numeric(names(qByGc))))
            ref[as.numeric(names(qByGc))] <- qByGc
            pmLog[,sn] - ref[Ngc]
    })
}

vec2array <- function(X,Y,Z) {
    ## Reshape per-probe values Z (one column per sample) into a
    ## maxY x maxX x nsamples array indexed by the probe coordinates
    ## (X, Y); grid cells with no probe remain NA.
    if (ncol(Z)==1) Z <- as.matrix(Z)
    maxY <- max(Y)
    maxX <- max(X)
    ord <- order(X, Y)
    Y <- Y[ord]
    X <- X[ord]
    # Column-major linear index of each probe in the maxY x maxX grid.
    perCol <- tapply(Y, X, function(y) as.numeric(y))
    linear <- unlist(lapply(names(perCol), function(i) perCol[[i]]+((as.numeric(i)-1)*maxY)))
    Z <- as.matrix(Z[ord,])
    flat <- matrix(NA, nrow=maxX*maxY, ncol=ncol(Z))
    flat[linear,] <- Z
    array(flat, dim=c(maxY, maxX, ncol(Z)))
}

countSeq <- function(subject, chr, start, end, seq) {
    ## Count occurrences of `seq` within [start, end] on each chromosome
    ## of `subject` (a BSgenome), for regions given by the parallel
    ## vectors chr/start/end. Regions on chromosomes absent from
    ## `subject` are returned as NA.
    idx <- split(seq_along(chr), chr)   # seq_along: safe for empty input
    retval <- rep(NA, length(chr))
    s <- DNAString(seq)                 # build the pattern once, not per chromosome
    for (curChr in names(idx)) {
        if (curChr %in% names(subject)) {
            chrSeq <- unmasked(subject[[curChr]])
            curStart <- start[idx[[curChr]]]
            curEnd <- end[idx[[curChr]]]
            # Out-of-range views only warn; suppress.
            v <- suppressWarnings(Views(chrSeq, start=curStart, end=curEnd))
            d <- suppressWarnings(DNAStringSet(v))
            numSeq <- vcountPattern(s, d, fixed=TRUE)
            retval[idx[[curChr]]] <- numSeq
        }
    }    
    return(retval)
}


###################
## spatialAdjust ##
###################
spatialAdjustAtom <- function(x, d, I) {
        ## Per-sample spatial correction: adjust the probes indexed by I
        ## in each column using spatialAdjustVec() on the log2 scale,
        ## then map back to the intensity scale.
        for (sampleCol in 1:ncol(x)) {
                corrected <- spatialAdjustVec(log2(x[I, sampleCol]), d)$zAdj
                x[I, sampleCol] <- 2^corrected
        }
        return(x)
}

## Remove smooth spatial intensity bias from both channels of `dat`.
## Probe (X, Y) array coordinates are gathered for PM and background
## probes, blocked into blockSize x blockSize cells, and each channel is
## corrected via spatialAdjustAtom()/spatialAdjustVec() through oApply
## (copy=FALSE modifies ff-backed channels in place).
spatialAdjust <- function(dat, copy=TRUE, blockSize, theta=1) {
        if(missing(blockSize)) blockSize <- getBlockSize(dat)
        c1 <- assayDataElement(dat, "channel1")
        c2 <- assayDataElement(dat, "channel2")
        X <- Y <- rep(NA, nrow(c1))

        # Fill coordinates for PM and background probes; other feature
        # rows stay NA and are excluded via I below.
        pmi <- pmindex(dat)
        X[pmi] <- getX(dat, "pm")
        Y[pmi] <- getY(dat, "pm")
        bgi <- bgindex(dat)
        X[bgi] <- getX(dat, "bg")
        Y[bgi] <- getY(dat, "bg")

        # HJ
        nax <- which(is.na(X))
        nay <- which(is.na(Y))
        # sanity checks
        stopifnot(identical(nax, nay))
        stopifnot(length(X) == length(pmi)+length(bgi)+length(nax))

        I <- sort(c(pmi, bgi))
        nx <- max(X, na.rm=TRUE)/blockSize
        ny <- max(Y, na.rm=TRUE)/blockSize
        d <- discretize.image(cbind(Y[I], X[I]), m=ny, n=nx)
        ## The "index" object returned by discretize.image() changed from a matrix 
        ## in fields package v6.9.1 to a list in v7.1, which does not work in the code 
        ## below, so change it back:
        d$index = do.call(cbind,d$index)
        stopifnot(ncol(d$index)==2)

        # HJ
        c1 <- oApply(c1, copy=copy, fn=spatialAdjustAtom, d=d, I=I)
        c2 <- oApply(c2, copy=copy, fn=spatialAdjustAtom, d=d, I=I)
        assayDataElement(dat, "channel1") <- c1
        assayDataElement(dat, "channel2") <- c2
        return(dat)
}


# Subtract a smoothed spatial surface from one sample's log2 values.
#
# z:   log2 intensities at the probes described by d$index.
# d:   discretize.image() layout (index matrix plus grid dims m, n).
# ims: optionally a precomputed smoothed image to reuse.
#
# Returns list(zAdj = corrected values, ims = the smoothed surface).
spatialAdjustVec <- function(z, d, ims=NULL, theta=1) {
        if (is.null(ims)) {
                # HJ -- as.image.fast accepts NAs at the probes of interest
                rawImage <- as.image.fast(z, ind=d$index, nx=d$m, ny=d$n)
                ims <- image.smooth(rawImage, theta=theta)
        }
        # Center the surface so the adjustment is median-zero.
        surface <- ims$z - median(ims$z)
        # Row-major flatten so the linear index below lines up.
        flat <- as.vector(t(surface))
        linearIdx <- (d$index[, 1] - 1) * d$n + d$index[, 2]
        return(list(zAdj = z - flat[linearIdx], ims = ims))
}




# Choose a square block edge length (in array coordinates) so that each
# block contains roughly 'probesPerBlock' PM probes on average, given
# the array's overall probe density.
getBlockSize <- function(dat, probesPerBlock=1250) {
	xmax <- max(getX(dat, "pm"))
	ymax <- max(getY(dat, "pm"))
	probeDensity <- length(pmindex(dat)) / (xmax * ymax)
	round(sqrt(probesPerBlock / probeDensity))
}

maxDensity <- function(x, n.pts = 2^14, minPoints=30) {
# Mode of x, estimated as the location of the peak of an Epanechnikov
# kernel density (slightly modified from affy's max.density); returns
# NA when fewer than 'minPoints' observations are available.
        if (length(x) < minPoints) {
            return(NA)
        }
        dens <- density(x, kernel = "epanechnikov", n = n.pts, na.rm = TRUE)
        dens$x[which.max(dens$y)]
}


##############
## bgAdjust ##
##############

# Calculate RMA bg parameters using the GC-stratified RMA model
# with background probes
# Estimate RMA-style background parameters from background probes,
# stratified by probe GC content.
#
# pm:    PM probe intensities for one sample.
# bgpm:  background (anti-genomic) probe intensities.
# Ngc:   GC counts for the PM probes.
# bgNgc: GC counts for the background probes.
# n.pts: grid size for the density-mode estimator.
#
# Returns list(alpha, mu, mu.gc, sigma): exponential signal rate,
# overall background mode, per-GC-count background means interpolated
# over 1:max(Ngc), and background SD.
bgAdjustParameters <- function (pm, bgpm, Ngc, bgNgc, n.pts = 2^14) 
{
    # GC bins with fewer than 50 background probes are too unstable;
    # mark them NA so tapply() skips them.
    drop <- as.numeric(names(which(table(bgNgc)<50)))
    bgNgc[bgNgc %in% drop] <- NA
    pmbg <- maxDensity(bgpm, n.pts)
    pmbg.gc <- tapply(bgpm, bgNgc, maxDensity, n.pts) # Get mode for each GC bin
    #l<-loess(pmbg.gc~as.numeric(names(pmbg.gc)), control = loess.control(surface = "direct")) 
    #x <- 1:max(Ngc)
    #mubg.gc <- predict(l,(x))
    # Linearly interpolate the per-bin modes onto every GC count seen
    # among PM probes; rule=2 extends flat beyond the observed bins.
    mubg.gc <- approx(as.numeric(names(pmbg.gc)), pmbg.gc, 1:max(Ngc), rule=2)$y
    # Background SD from the left tail (values below the mode), with
    # one refinement of the mode; sqrt(2) corrects for using only half
    # of an assumed-symmetric distribution.
    bg.data <- bgpm[bgpm < pmbg]
    pmbg <- maxDensity(bg.data, n.pts)
    bg.data <- bgpm[bgpm < pmbg]
    bg.data <- bg.data - pmbg
    bgsd <- sqrt(sum(bg.data^2)/(length(bg.data) - 1)) * sqrt(2)
    # Exponential signal rate from the mode of above-background PMs.
    sig.data <- pm[pm > pmbg]
    sig.data <- sig.data - pmbg
    expmean <- maxDensity(sig.data, n.pts)
    alpha <- 1/expmean
    mubg <- pmbg
    list(alpha = alpha, mu = mubg, mu.gc = mubg.gc, sigma = bgsd)
}

# Background-correct one intensity matrix, column by column, using the
# GC-stratified normal+exponential model from bgAdjustParameters().
# Only the pmIndex and bgIndex rows of x are modified.
bgAdjustAtom <- function(x, pmIndex, bgIndex, ngc, bgNgc) {
	for (i in 1:ncol(x)){
		pms <- x[pmIndex,i]
		bgs <- x[bgIndex,i]
		param <- bgAdjustParameters(pms, bgs, ngc, bgNgc)
        b <- param$sigma
        # RMA-style posterior-mean correction. Note the second line of
        # each pair deliberately reuses the already-shifted values.
        pms <- pms - param$mu.gc[ngc] - param$alpha * b^2
        pms <- pms + b * ((1/sqrt(2 * pi)) * exp((-1/2) * ((pms/b)^2)))/pnorm(pms/b)
        bgs <- bgs - param$mu.gc[bgNgc] - param$alpha * b^2
        bgs <- bgs + b * ((1/sqrt(2 * pi)) * exp((-1/2) * ((bgs/b)^2)))/pnorm(bgs/b)
		# pnorm() can underflow to 0 and produce Inf; clamp those
		# probes to the largest finite corrected value.
		isInf <- which(is.infinite(pms))
		if (length(isInf)>0) {
			biggest <- max(pms[-isInf], na.rm=TRUE)		
			pms[isInf] <- biggest
		}
		isInf <- which(is.infinite(bgs))
		if (length(isInf)>0) {
			biggest <- max(bgs[-isInf], na.rm=TRUE)		
			bgs[isInf] <- biggest
		}
		x[pmIndex,i] <- pms
		x[bgIndex,i] <- bgs
	}
	return(x)
}

# Background-correct both channels of a TilingFeatureSet using the
# GC-stratified RMA-style model estimated from background probes
# (see bgAdjustAtom / bgAdjustParameters).
bgAdjust <- function(dat, copy=TRUE) {
	pmIdx <- pmindex(dat)
	bgIdx <- bgindex(dat)
	gcPm <- countGC(dat, "pm")
	gcBg <- countGC(dat, "bg")
	# The two channels are corrected independently with the same
	# probe indices and GC strata.
	for (channel in c("channel1", "channel2")) {
		adjusted <- oApply(assayDataElement(dat, channel), copy=copy,
			fn=bgAdjustAtom, pmIndex=pmIdx, bgIndex=bgIdx,
			ngc=gcPm, bgNgc=gcBg)
		assayDataElement(dat, channel) <- adjusted
	}
	return(dat)
}


############################
## normalizeWithinSamples ##
############################
# Within-sample normalization of a two-channel TilingFeatureSet.
#
# 'method' is matched by substring, so compound values such as
# "gc+loess" trigger several adjustments in order:
#   "gc"     - GC-content differential-amplification adjustment,
#   "loess"  - control-probe loess normalization (see normalizeLoess),
#   "median" - per-sample median shift of the control-probe log-ratios.
# Control probes must be given either as row indices (controlIndex) or
# container names (controlProbes). Always finishes with scaleSamples().
normalizeWithinSamples <- function (dat, copy=TRUE, method = "loess",
 	scale = c(0.99, 0.99), 
	controlProbes = NULL, controlIndex = NULL, 
        approx = TRUE, breaks = 1000, 
    verbose = FALSE) 
{
    # Scalar guard: use short-circuit && rather than elementwise &.
    if (is.null(controlIndex) && is.null(controlProbes)) stop("Either controlIndex or controlProbes argument must be provided to normalizeWithinSamples.")

    if (grepl("gc", method)) {
        mAdj <- diffAmpEstGC(dat, method = method, controlIndex=controlIndex, controlProbes = controlProbes)
        dat <- diffAmpAdjustGC(dat, mAdj)
    }
    if (grepl("loess", method)) {
        dat <- normalizeLoess(dat, copy=copy, controlIndex = controlIndex, 
            controlProbes = controlProbes, approx = approx, breaks = breaks)
    }
    if (grepl("median", method)) {
        if (is.null(controlIndex)) {
            controlIndex <- getControlIndex(dat, controlProbes = controlProbes)
        }
        # Shift channel2 so each sample's median control-probe
        # log-ratio becomes zero.
        datPm <- log2(pm(dat))
        M <- datPm[, , "channel1"] - datPm[, , "channel2"]
        mCtrl <- M[controlIndex, ]
        mAdj <- apply(mCtrl, 2, median, na.rm = TRUE)
        datPm[, , "channel2"] <- sweep(datPm[, , "channel2"], 
            2, mAdj, FUN = "+")
        pm(dat) <- 2^datPm
    }
    dat <- scaleSamples(dat, copy=copy, scale=scale)
	gc()
    return(dat)
}

# Per-node worker for normalizeLoess(): for each sample, fit the
# log-ratio M = log2(c1) - log2(c2) against A = (log2(c1)+log2(c2))/2
# (by="A") or against log2(c2) (by="tot") over the control probes,
# then remove the fitted trend from M by rescaling channel2.
# Returns list(c1, c2) with only c2 modified.
normalizeLoessAtom <- function(c1, c2, controlIndex, span, approx, breaks, 
		by, pmIndex) {
	for (i in 1:ncol(c1)) {
		lc1 <- log2(c1[pmIndex,i])
		lc2 <- log2(c2[pmIndex,i])
		y <- lc1-lc2
		if (by=="tot") {
			x <- lc2
		} else if (by=="A") {
			x <- (lc1+lc2)/2
		}
		# Degree-1 loess fitted on control probes only;
		# surface="direct" so predictions outside the fitted range
		# are still computed exactly.
		fit <- loess(y ~ x, subset = controlIndex,
               na.action = na.exclude, degree=1, surface = "direct",
 			   span=span)
		adj <- predictLoess(fit, newdata=x, approx=approx, breaks=breaks)
		# New M = y - adj: multiplying c2 by 2^adj removes the trend.
		c2[pmIndex,i] <- 2^(lc2 + adj)
	}
	return(list(c1, c2))
}


# Predict from a loess fit, optionally approximating by binning
# 'newdata' into 'breaks' equal-width bins and predicting once per bin
# mean (much faster for large inputs; each point then receives its
# bin's prediction).
predictLoess <- function(fit, newdata, approx=TRUE, breaks=1000) {
	if (!approx) {
		return(predict(fit, newdata=newdata))
	}
	bins <- cut(newdata, breaks=breaks, ordered.result=TRUE)
	centers <- tapply(newdata, bins, mean)  # NA for empty bins
	perBin <- rep(NA, length(centers))
	emptyBins <- which(is.na(centers))
	if (length(emptyBins) > 0) {
		perBin[-emptyBins] <- predict(fit, newdata = centers[-emptyBins])
	} else {
		perBin <- predict(fit, newdata = centers)
	}
	# Factor codes index each point's bin prediction.
	perBin[bins]
}

# Loess-normalize channel2 against channel1 within each sample, using
# control probes to model the log-ratio trend as a function of average
# intensity (by="A") or of log2 channel2 (by="tot"). 'approx' and
# 'breaks' control the fast binned prediction (see predictLoess).
normalizeLoess <- function(dat, copy=TRUE, 
                           controlIndex=NULL, controlProbes=NULL, 
                           span=0.3, by="A", approx=TRUE, breaks=1000) {

	# Scalar guard: use short-circuit && rather than elementwise &.
	if (is.null(controlIndex) && is.null(controlProbes)) stop("Either controlIndex or controlProbes argument must be provided to normalizeLoess.")
	if (is.null(controlIndex)) {
		controlIndex <- getControlIndex(dat, controlProbes=controlProbes)
	}
	pmIndex <- pmindex(dat)
	c1 <- assayDataElement(dat, "channel1")
	c2 <- assayDataElement(dat, "channel2")
	ret <- oApply(c1, c2, copy=copy, fn=normalizeLoessAtom, 
			controlIndex=controlIndex, span=span, approx=approx,
			breaks=breaks, by=by, pmIndex=pmIndex)
	assayDataElement(dat, "channel1") <- ret[[1]]
	assayDataElement(dat, "channel2") <- ret[[2]]
	return(dat)
}

# Shrink each sample's log-ratios (M = log2 c1 - log2 c2) so that the
# scale[1] quantile of M maps to -log(1 - scale[2]), then rebuild
# channel2 from the rescaled M. Channel1 is returned unchanged.
# 'copy' is accepted for interface compatibility with oApply but is
# unused here.
scaleSamplesAtom <- function(c1, c2, copy, scale, pmIndex) {
	logC1 <- log2(c1[pmIndex, , drop=FALSE])
	logRatio <- logC1 - log2(c2[pmIndex, , drop=FALSE])
	perSampleQ <- apply(logRatio, 2, quantile, scale[1], na.rm=TRUE)
	shrink <- perSampleQ / -log(1 - scale[2])
	logRatio <- sweep(logRatio, 2, shrink, FUN="/")
	c2[pmIndex, ] <- 2^(logC1 - logRatio)
	list(c1, c2)
}
	
# Scale-normalize the per-sample log-ratios of a TilingFeatureSet
# (see scaleSamplesAtom). A 'scale' argument of length != 2 disables
# scaling and returns 'dat' untouched.
scaleSamples <- function(dat, copy=TRUE, scale=c(0.99, 0.99)) {
	if (length(scale) != 2) {
		return(dat)
	}
	pmIdx <- pmindex(dat)
	channels <- oApply(assayDataElement(dat, "channel1"),
		assayDataElement(dat, "channel2"), copy=copy,
		fn=scaleSamplesAtom, scale=scale, pmIndex=pmIdx)
	assayDataElement(dat, "channel1") <- channels[[1]]
	assayDataElement(dat, "channel2") <- channels[[2]]
	return(dat)
}

#########
## SQN ##
#########
# Mixture quantile normalization of each column (the per-sample step
# of SQN). Values in rows 'idx' are mapped through the empirical CDF
# of the control probes (ctrl.id, positions relative to idx):
#   - quantiles below 0.05 are floored at 'low',
#   - quantiles up to max.q are matched to the reference vector NQ,
#   - quantiles above max.q come from the fitted normal mixture
#     (mix.param, a nor1mix object) so the extreme tail is model-based.
# If log2=TRUE the mapping happens on the log2 scale and is inverted
# before returning. Returns y with rows idx replaced.
mix.qn <- function (y, log2=FALSE, ctrl.id, idx, NQ, mix.param, 
		max.q = 0.95, low) {
	if(missing(idx)) idx <- 1:nrow(y)
	if(log2) y <- log2(y)
	if (!is.matrix(y)) y <- as.matrix(y)
	# NOTE(review): 'ret' is assigned but never used below.
	ret <- y
	for (i in 1:ncol(y)) {
		y0 <- y[idx, i]
	    ECDF = ecdf(y0[ctrl.id])
	    Q = ECDF(y0)
	    id0 = which(Q < 0.05)
	    id2 = which(Q > max.q)
	    B = length(id2)
	    # Spread the top quantiles evenly over (max.q, 1) by rank so
	    # ECDF ties don't collapse to a single value.
	    Q2 = max.q + (1 - max.q) * (1:B)/(B + 1)
	    y2o = order(y0[id2])
	    Q[id2][y2o] = Q2
	    ynorm = vector(mode = "numeric", length = length(y0))
	    ynorm[id0] = low
	    ynorm[-c(id0, id2)] = quantile(NQ, Q[-c(id0, id2)])
	    ynorm[id2] = qnorMix(Q[id2], mix.param)
		y[idx,i] <- ynorm
 	}
	if(log2) y <- 2^y
	return(y)
}

# Subset quantile normalization (SQN) driver for matrices or
# ff_matrix objects. Builds a reference distribution from the control
# probes: the median of the per-sample sorted control vectors (QN) is
# blended with quantiles of an N.mix-component normal mixture fitted
# to it by Mclust, with weight 'model.weight' on the model. Each
# column is then normalized toward that reference via mix.qn().
SQNff <- function (y, copy=TRUE, log2=FALSE, N.mix = 5, ctrl.id, idx,
		model.weight = 0.9) {
	if(missing(idx)) idx <- 1:nrow(y)
	# Control-probe submatrix (ctrl.id indexes within idx).
	ctrl <- y[idx[ctrl.id], ]
	if(log2) ctrl <- log2(ctrl)		
    QE = apply(ctrl, 2, sort)
    QN = apply(QE, 1, median)
    mix.param = suppressWarnings(Mclust(QN, G = N.mix)$parameters)
    mix.param = norMix(mu = mix.param$mean, sig2 = mix.param$variance$sigmasq, 
        w = mix.param$pro)
    # Evenly spaced plotting positions over (0, 1), one per quantile.
    qq = seq(1/(2 * length(QN)), 1 - 1/(2 * length(QN)), 1/length(QN))
    qq = qnorMix(qq, mix.param)
    # Blend the empirical reference with the model-based quantiles.
    QN1 = QN * (1 - model.weight) + qq * model.weight
	if ("ff_matrix" %in% class(y)) { 
		# Distributed path: mix.qn handles the log2 round-trip itself.
		ynorm = oApply(y, copy=copy, fn=mix.qn, log2=log2,
			ctrl.id=ctrl.id, idx=idx, NQ = QN1, mix.param = mix.param, 
        	max.q = 0.95, low = quantile(QN1, 0.05))
    } else {
		# In-memory path: transform once, normalize per column,
		# transform back.
		if (log2) y <- log2(y)
		ynorm = apply(y, 2, mix.qn, ctrl.id=ctrl.id, idx=idx, NQ = QN1, 
			mix.param = mix.param, 
        	max.q = 0.95, low = quantile(QN1, 0.05))
		if (log2) ynorm <- 2^ynorm
	}
	return(ynorm)
}		

#############################	
## normalizeBetweenSamples ##
#############################	

# Between-sample normalization of a TilingFeatureSet.
#
# m:         "quantile"/"allQuantiles" quantile-normalizes across
#            samples (via the quantileNormalize methods).
# untreated: "complete" replaces channel1 in every sample with the
#            cross-sample median profile, rebuilding channel2 so each
#            sample's M is preserved.
# enriched:  "sqn" applies subset quantile normalization to channel2
#            using control probes.
# excludeIndex: PM rows to leave out of the index used by the
#            quantile and SQN steps.
# A single-sample object is returned unchanged.
normalizeBetweenSamples <- function(dat, copy=TRUE,
		m="allQuantiles", untreated="none", enriched="none", 
		controlProbes=NULL, controlIndex=NULL, 
                excludeIndex=NULL, verbose=FALSE) {

        if(is.null(controlIndex) & is.null(controlProbes) & enriched=="sqn") stop("Either controlIndex or controlProbes argument must be provided to normalizeBetweenSamples if using sqn normalization.")
	if(ncol(dat)>1) {
		c1 <- assayDataElement(dat, "channel1")
		c2 <- assayDataElement(dat, "channel2")
		# Clone ff-backed channels up front so the original data on
		# disk is not modified when copy=TRUE.
	    if (copy & "ff_matrix" %in% class(c1)) {
			open(c1)
			open(c2)
	        c1 <- clone(c1)
			c2 <- clone(c2)
	    }
	
		if (is.null(excludeIndex)) {
			idx <- pmindex(dat)	
		} else {
			idx <- pmindex(dat)[-excludeIndex]
		}
		
		if (m %in% c("quantile", "allQuantiles")){
			# Write the (possibly cloned) channels back so the
			# TilingFeatureSet method can operate in place.
			assayDataElement(dat, "channel1") <- c1
			assayDataElement(dat, "channel2") <- c2
			dat <- quantileNormalize(dat, copy=FALSE, idx=idx)
			c1 <- assayDataElement(dat, "channel1") 
			c2 <- assayDataElement(dat, "channel2") 
		}

	    ## Normalize untreated (leaving M unchanged)
	    if (untreated=="complete") { # TODO: parallelize
	       	if ("ff_matrix" %in% class(c1)) {
				open(c1)
				open(c2)
				batchSize <- ocSamples()*nrow(c1)/ncol(c1)
				i1<-i2<-NULL # To avoid R CMD CHECK "no visible binding" warning
				med<-ffrowapply(rowMedians(log2(c1[i1:i2,,drop=FALSE])),
				 	X=c1, RETURN=TRUE, CFUN="c", BATCHSIZE=batchSize) 	
			} else {
				med <- rowMedians(log2(c1))
			}
			M <- getM(dat)
			if ("ff_matrix" %in% class(M)) open(M)
			# Every sample gets the median channel1 profile; channel2
			# is rebuilt from it so each sample's M is unchanged.
			for (i in 1:ncol(M)) {
				c1[,i] <- 2^med
				c2[,i] <- 2^(med - M[,i])
			}
	    }
	    ## Normalize enriched 
		if (enriched=="sqn") {
			if(is.null(controlIndex)) 	
				controlIndex <- getControlIndex(dat, controlProbes=controlProbes)
			c2 <- SQNff(y=c2, copy=FALSE, log2=TRUE, 
				ctrl.id=controlIndex, idx=idx)
			gc()	
		}
		assayDataElement(dat, "channel1") <- c1
		assayDataElement(dat, "channel2") <- c2
	}
	return(dat)
} 


##############
## quantile ##
##############

# Generic for between-sample quantile normalization. Defaults are
# filled in here, in the generic body, before dispatch (an unusual but
# valid S4 pattern): copy=TRUE and idx covering every row.
setGeneric("quantileNormalize", function(object, copy, idx)	{
		if (missing(copy)) copy <- TRUE
		if (missing(idx)) idx <- 1:nrow(object)	
		standardGeneric("quantileNormalize")	
	})

# In-memory case: quantile-normalize rows 'idx' with preprocessCore's
# normalize.quantiles() and write the result back into the matrix.
setMethod("quantileNormalize", c(object="matrix"),
	function (object, copy, idx) {
		object[idx,] <- normalize.quantiles(object[idx,], copy=copy)
		return(object)
	})

# Out-of-core case: first pass computes the target distribution from
# per-node partial sums of sorted columns (qnTargetStatsNode), then a
# second distributed pass normalizes every column to that fixed target.
setMethod("quantileNormalize", c(object="ff_matrix"),
	function (object, copy, idx) {
		samplesByNode <- splitIndicesByNode(1:ncol(object))
	    stats <- ocLapply(samplesByNode, qnTargetStatsNode, object, idx)
	    totalN <- sum(sapply(stats, "[[", "n"))
	    total <- rowSums(sapply(stats, "[[", "total"))
	    target <- total/totalN
	    oApply(object, copy=copy, fn=qnToTargetAtom, 
			target=target, idx=idx)
	})

# TilingFeatureSet case: quantile-normalize only the log-ratios (M),
# then rebuild channel2 as 2^(log2(c1) - M), leaving channel1's
# values untouched (cloned first when ff-backed and copy=TRUE).
setMethod("quantileNormalize", c(object="TilingFeatureSet"),
	function (object, copy, idx) {
		M <- getM(object)
		M <- quantileNormalize(M, copy=FALSE, idx=idx)	
		c1 <- assayDataElement(object, "channel1")
		if (copy & "ff_matrix" %in% class(c1)) {
			c1 <- clone(c1)
		}
		fn <- function(x) 2^x
		c2 <- combine(c1, M, "-", transform1="log2", invTransform=fn)
		assayDataElement(object, "channel1") <- c1
		assayDataElement(object, "channel2") <- c2
		return(object)
	})


# Generic for the per-node partial statistics (sum of sorted columns,
# column count) used to build the quantile-normalization target.
setGeneric("qnTargetStatsNode", function(cols, object, idx)
	standardGeneric("qnTargetStatsNode"))

# Partial sums for the quantile-normalization target: sum the sorted
# idx-rows of each column assigned to this node.
# na.last=TRUE keeps each sorted vector at length(idx) even when NAs
# are present, matching the ff_matrix method below (a plain sort()
# would drop NAs and trigger recycling in the running sum).
setMethod("qnTargetStatsNode", c(cols="integer", object="matrix", idx="integer"), 
	function(cols, object, idx) {
	    total <- rep(0, length(idx))
	    if (length(cols) > 0) {
	        for (i in cols) total <- total + sort(object[idx, i], na.last=TRUE)
	    }
	    list(total = total, n = length(cols))
	})

# ff_matrix variant: same partial sums as the matrix method, but opens
# and closes the ff object around the reads. na.last=TRUE keeps each
# sorted vector at length(idx) even when NAs are present.
setMethod("qnTargetStatsNode", c(cols="integer", object="ff_matrix",
 		idx="integer"), 
	function(cols, object, idx) {
	    open(object)
	    total <- rep(0, length(idx))
	    if (length(cols) > 0) {
	        for (i in cols) total <- total + sort(object[idx, i], na.last=TRUE)
	    }
	    close(object)
	    rm(object)
	    list(total = total, n = length(cols))
	})

# Quantile-normalize rows 'idx' of 'object' against a fixed target
# distribution (preprocessCore), writing the result back in place.
qnToTargetAtom <- function (object, target, idx) {
	chunk <- object[idx, , drop=FALSE]
	object[idx, ] <- normalize.quantiles.use.target(chunk, target, copy=FALSE)
	return(object)
}


#############
## combine ##
#############

# Generic for elementwise combination of two matrix-like objects with
# optional pre-transforms of each operand and an inverse transform of
# the result (e.g. subtract on the log2 scale, return to linear).
# Defaults are filled in here, before dispatch.
# NOTE(review): this defines its own combine() generic -- confirm the
# interaction with Biobase/BiocGenerics combine() is intentional.
setGeneric("combine", function(object1, object2, fun, 
		transform1, transform2, invTransform, ...)	{
	if (missing(fun)) fun <- "+"
	if (missing(transform1)) transform1 <- NULL
	if (missing(transform2)) transform2 <- NULL
	if (missing(invTransform)) invTransform <- NULL
	standardGeneric("combine")	
})

setMethod("combine", "matrix", 
	function(object1, object2, fun, 
			transform1, transform2, invTransform, returnList=FALSE) {
		# Optionally transform each input, combine with 'fun'
		# (e.g. "+", "-"), then optionally back-transform the result.
		if (!is.null(transform1)) {
			object1 <- do.call(transform1, list(object1))
		}
		if (!is.null(transform2)) {
			object2 <- do.call(transform2, list(object2))
		}
		combined <- do.call(fun, list(object1, object2))
		if (!is.null(invTransform)) {
			combined <- do.call(invTransform, list(combined))
		}
		# returnList=TRUE yields the list(result, NULL) shape that
		# oApply expects from two-object node functions.
		if (returnList) list(combined, NULL) else combined
	})

# ff_matrix variant: run the matrix combine on each node's columns via
# oApply. returnList=TRUE makes each node return list(result, NULL);
# only the first element carries the combined matrix.
setMethod("combine", "ff_matrix", 
	function(object1, object2, fun, transform1, transform2, invTransform) {
		oApply(object1, object2, copy=TRUE, fun=fun,
			fn=combine, transform1=transform1,
			transform2=transform2, invTransform=invTransform,
			returnList=TRUE)[[1]]
		})
		


#################
## methPercent ##
#################
# Estimate percentage methylation from M-values.
#
# m:        matrix (or ff_matrix) of M-values, probes x samples.
# pmIndex:  rows to use (defaults to all rows).
# ngc:      per-probe GC-count stratum, passed to logmethParameters().
# commonParams: if TRUE, each sample's (alpha, sigma) is replaced by
#           the across-sample median so all samples share one model.
#
# Under the normal+exponential model of logmethParameters(), the
# posterior-mean signal qPost is converted to a percentage as
# 1 - 2^(-qPost), weighted by the fraction of probes above zero.
methPercent <- function(m, pmIndex, ngc, commonParams=TRUE) {
	if (is.null(dim(m))) m <- as.matrix(m)
	if(missing(pmIndex)) pmIndex <- 1:nrow(m)
	## TODO: add parallel support 
	if ("ff_matrix" %in% class(m)) {
		open(m)
		ret <- ff(vmode="double", dim=c(length(pmIndex), ncol(m)))
	} else {
		ret <- matrix(nrow=length(pmIndex), ncol=ncol(m))
	}
	# One (alpha, sigma) pair per sample.
	param <- t(sapply(1:ncol(m), 
		function(i) logmethParameters(m[pmIndex,i], ngc)))
	alpha <- unlist(param[,"alpha"])
	sigma <- unlist(param[,"sigma"])
	if(commonParams) {
		alpha[] <- median(alpha)
		sigma[] <- median(sigma)
	}
	for (i in 1:ncol(m)) {
		x <- m[pmIndex,i]
		alf <- alpha[i]
		sig <- sigma[i]
		mu <- 0
        # p0: empirical fraction of probes below zero (background-only).
        p0 <- sum(x<0, na.rm=TRUE) / sum(!is.na(x))
        # Posterior mean of the exponential signal given the observed
        # value (normal+exponential convolution).
        a <- x - mu - alf * sig^2
        qPost <- (1-p0) * (a + sig * dnorm(a/sig)/pnorm(a/sig))
		ret[,i] <- 1-2^(-qPost)
   	}
	colnames(ret) <- colnames(m)
	return(ret)
}

## Assume the signal is S = X + Y where X ~ Normal(0, s2) (background
## noise) and Y ~ Exp(alpha) (true signal); estimate the model
## parameters so E(Y|S) can be computed downstream.
##
## pm:    background-relative log-scale values (mode assumed at 0).
## ngc:   GC-count stratum per probe; the noise SD is the median of
##        per-stratum MADs (centered at 0) of the below-zero values.
## n.pts: retained for backward compatibility with the previous
##        density-mode estimator; unused.
##
## Returns list(alpha, mu, sigma) with mu fixed at 0.
logmethParameters <- function (pm, ngc, n.pts = 2^14) 
{
    pmbg <- 0
    ## Noise SD from the negative tail, stratified by GC content.
    ## (The previous unstratified estimators, and an unused bg.data
    ## intermediate, have been removed.)
    idx <- pm < pmbg
    bgsd <- median(tapply(pm[idx], ngc[idx], mad, center=0, na.rm=TRUE))
    ## Exponential rate from the mean of the positive part.
    sig.data <- pm[pm > pmbg]
    sig.data <- sig.data - pmbg
    expmean <- mean(sig.data, na.rm=TRUE)
    alpha <- 1/expmean
    list(alpha = alpha, mu = pmbg, sigma = bgsd)
}





# Score candidate breakpoints for a two-segment linear fit of ms ~ as.
# For each candidate b, fit separate regressions on the points left
# (as < b) and right (as >= b) of the break and return the residual
# sum of squares; candidates leaving either side empty score 1000.
# Returns a vector named by the breakpoints.
checkBreakpoint <- function(breakpoint, ms, as) {
    n <- length(as)
    scores <- sapply(breakpoint, function(b) {
        onLeft <- as < b
        onRight <- !onLeft
        # Degenerate split: everything on one side.
        if (sum(onLeft) == n || sum(onRight) == n) {
            return(1000)
        }
        fitLeft <- lm(ms[onLeft] ~ as[onLeft])
        fitRight <- lm(ms[onRight] ~ as[onRight])
        sum(c(fitLeft$residuals, fitRight$residuals)^2)
    })
    names(scores) <- breakpoint
    scores
}

# Reshape a pheno-data frame between long and wide layouts, spreading
# 'fileName' across the levels of 'channel' within each 'sample'.
# Additional arguments are forwarded to stats::reshape().
reshapePd <- function(data, sample, channel, fileName, direction="wide", ...) {
    reshape(data, idvar=sample, timevar=channel,
            v.names=fileName, direction=direction, ...)
}



# Weighted moving-average smoother. Interior points come from
# stats::filter() (two-sided convolution); the (length(filter)-1)/2
# positions at each end, where filter() yields NA, are filled with a
# renormalized truncated-window average. If x is no longer than the
# filter, every point is replaced by mean(x). 'filter' is expected to
# be an odd-length weight vector.
myfilter <- function(x,filter,...){
  L=length(x)
  if(L>length(filter)){
    res=filter(x,filter,...)
    ##now fill out the NAs at the edges
    M=length(filter)
    N=(M- 1)/2
    ## seq_len() (not 1:N) so a length-1 filter (N == 0) skips the
    ## edge fill instead of iterating with i = 0, which indexed past
    ## the result and corrupted it.
    for(i in seq_len(N)){
      # left edge: drop the weights that would fall before x[1]
      w=filter[(N-i+2):M]
      y=x[1:(M-N+i-1)]
      res[i] = sum(w*y)/sum(w)

      # right edge: mirror the truncated window
      w=rev(w)
      ii=(L-(i-1))
      y=x[(ii-N):L]
      res[ii]<-sum(w*y)/sum(w)
    }
  } else{ res=rep(mean(x),L)}
  return(res)
}

# Uncertainty-propagation companion to myfilter(): returns, per
# window, sqrt(sum(w^2 * x) / sum(w)^2) -- the SE of a weighted mean
# when x holds per-point variances. The filter must sum to 1. Edge
# positions use the same truncated, renormalized windows as
# myfilter(); if x is no longer than the filter, sqrt(mean(x)/L) is
# returned everywhere.
# NOTE(review): the original comment called x "the SD of the variable
# being smoothed", but the algebra treats x as variances (get.tt
# passes ls^2/ns terms) -- confirm with callers.
myfilterse <- function(x,filter,...){
  ##filter must add up to 1!!!
  L=length(x)
  if(L>length(filter)){
    res=filter(x,filter^2,...)
    ##now fill out the NAs at the edges
    M=length(filter)
    N=(M- 1)/2
    ## seq_len() (not 1:N) so a length-1 filter (N == 0) skips the
    ## edge fill instead of iterating with i = 0, which indexed past
    ## the result and corrupted it.
    for(i in seq_len(N)){
      w=filter[(N-i+2):M]
      y=x[1:(M-N+i-1)]
      res[i] = sum(w^2*y)/sum(w)^2

      w=rev(w)
      ii=(L-(i-1))
      y=x[(ii-N):L]
      res[ii]<-sum(w^2*y)/sum(w)^2
    }
  } else { res=rep(mean(x)/L,L)}
  return(sqrt(res))
}

# Row-wise MAD: constant * rowMedians(|x - center|). 'center' defaults
# to the row medians of x; the default constant 1.4826 makes the MAD
# consistent with the SD under normality.
rowMads <- function(x,center=NULL,constant=1.4826){
  if (is.null(center)) {
    center <- rowMedians(x)
  }
  constant * rowMedians(abs(x - center))
}

# Smooth each column of 'mat' within probe regions (groups of 'pns'),
# assuming rows are ordered by genomic position within each region.
# The default filter is a normalized Tukey (biweight) window of
# half-width 'ws'. '...' is reserved for the filter.
smoothTile <- function(mat,pns,filter=NULL,ws=3,verbose=TRUE,...){
  if (is.null(dim(mat))) {
    mat <- as.matrix(mat)
  }
  if (is.null(filter)) {
    tukey <- function(u) pmax(1 - u^2, 0)^2
    filter <- tukey(seq(-ws, ws) / (ws + 1))
    filter <- filter / sum(filter)
  }
  ##this function assumes genome position of mat[Indexes[[1]] are ordered
  regions <- split(seq(along=pns), pns)
  for (region in regions) {
    for (col in seq_len(ncol(mat))) {
      mat[region, col] <- myfilter(mat[region, col], filter)
    }
  }
  return(mat)
}

# Enumerate all unordered pairs of the distinct group labels in 'g',
# flattened as c(g1a, g1b, g2a, g2b, ...) -- the layout expected by
# dmrFinder's 'compare' argument.
comp = function(g){
    g = sort(unique(g))
    ng = length(g)
    # Fewer than two groups means no comparisons; the previous
    # 1:(ng-1) loop mis-iterated (j = 1, 0) and errored for ng == 1.
    if (ng < 2) {
        return(character(0))
    }
    # Two slots per pair: ng*(ng-1) entries in total (the previous
    # allocation of ng*(ng-1)/2 relied on R silently growing v).
    v = vector("character",length=ng*(ng-1))
    count=1
    for(j in 1:(ng-1)){
        for(k in (j+1):ng){
            v[count]   = g[j]
            v[count+1] = g[k]
            count = count+2
        }
    }
    return(v)
}
 
# Per-group location/scale summaries for dmrFinder.
#
# l:       matrix of logit-methylation values (probes x arrays).
# groups:  group label per array; only labels present in 'compare'
#          are summarized.
# compare: flat vector of group-name pairs (see comp()).
#
# Returns list(lm, ls, ns, COMPS): per-group row medians, row MADs
# (NA for singleton groups), group sizes, and a 2-column matrix of
# group-column indices for each requested comparison.
get.tog <- function(l,groups,compare,verbose){
  #require(genefilter)
  
  gIndex=split(seq(along=groups),groups)
  gIndex=gIndex[which(names(gIndex)%in%compare)]
  ng=length(gIndex)
  
  lm=matrix(0,nrow(l),ng)
  ls=matrix(0,nrow(l),ng)
  ns=sapply(gIndex,length)
  
  if(verbose) message("Computing group medians and SDs for ",ng," groups:")
  for(i in seq(along=gIndex)){
    if(verbose) message("\n",i)
    Index=gIndex[[i]]
    if(length(Index)>1){
        lm[,i]=rowMedians(l[,Index,drop=FALSE])
        ls[,i]=rowMads(l[,Index,drop=FALSE],center=lm[,i])
    } else{
        # Singleton group: its single column is the "median" and no
        # spread can be estimated.
        message(paste(" ",names(gIndex)[i],"has only 1 array!"))
        lm[,i]=l[,Index,drop=FALSE]
        ls[,i]=NA
    }
  }
  colnames(lm)=names(gIndex)
  colnames(ls)=names(gIndex)

  # Pair up consecutive entries of 'compare' as column indices into lm.
  nums  <- match(compare,colnames(lm))
  COMPS <- matrix(nums,ncol=2,byrow=TRUE)
  
  if(verbose) message("\nDone.\n")
  return(list(lm=lm,ls=ls,ns=ns,COMPS=COMPS))
}

# Smoothed t-like statistics for each group comparison in dmrFinder.
#
# lm/ls/ns: per-group medians, MADs and sizes from get.tog().
# Indexes:  list of row-index vectors, one per probe region (rows
#           assumed ordered by genomic position within each region).
# COMPS:    2-column matrix of group-column pairs to compare.
# filter/ws: smoothing window; defaults to a normalized Tukey window
#           of half-width ws. Must sum to 1.
#
# Returns a matrix of smoothed-difference / smoothed-SE statistics,
# one column per comparison, named "grp1-grp2".
get.tt <- function(lm,ls,ns,filter,Indexes,COMPS,ws,verbose){
##this function assumes genome position of mat[Indexes[[1]] are ordered:
  if(is.null(filter)){
    Tukey = function(x) pmax(1 - x^2,0)^2
    filter= Tukey(seq(-ws,ws)/(ws+1));filter=filter/sum(filter)
  }
  if(!isTRUE(all.equal(sum(filter),1))) stop("filter must sum to 1.")
  
  if(verbose) message("Smoothing:\n")
  # dm: raw group differences; vr: their variances; sdm/svr: smoothed
  # versions; tt: the final statistic. All share dim/colnames.
  dm=matrix(0,nrow(lm),nrow(COMPS))
  cnames = vector("character",nrow(COMPS))
  for(r in 1:ncol(dm)) cnames[r]=paste(colnames(lm)[COMPS[r,]],collapse="-")
  colnames(dm) = cnames
  vr = dm
  sdm = dm
  svr = dm
  tt = dm
  for(r in 1:nrow(COMPS)){
      j=COMPS[r,1]
      k=COMPS[r,2]
      dm[,r]=lm[,j]-lm[,k]
      vr[,r]=(ls[,j]^2)/ns[j]+(ls[,k]^2)/ns[k]
  }
  if(verbose) pb = txtProgressBar(min=1, max=length(Indexes), initial=0, style=1)
  # Smooth within each probe region separately.
  for(i in seq(along=Indexes)){
    Index=Indexes[[i]]
    for(r in 1:nrow(COMPS)){
        j=COMPS[r,1]
        k=COMPS[r,2]
        sdm[Index,r]=myfilter(dm[Index,r],filter)
        # If either group is a singleton, no SE is available (its MAD
        # is NA); use 1 so tt reduces to the smoothed difference.
        if(ns[j]==1|ns[k]==1){ svr[Index,r] = 1} else{ svr[Index,r] = myfilterse(vr[Index,r],filter) }
    }
    if(verbose) setTxtProgressBar(pb, i)
  }
  if(verbose) close(pb)
  for(r in 1:nrow(COMPS)) tt[,r] = sdm[,r]/svr[,r]
  if(verbose) message("Done.")
  return(tt)
}


dmrFinder <- function(eset=NULL, groups, p=NULL, l=NULL, chr=NULL, pos=NULL, pns=NULL, sdBins=NULL, controlIndex=NULL, controlProbes=NULL, Indexes=NULL, filter=NULL, package=NULL, ws=7, verbose=TRUE, compare="all",  withinSampleNorm="loess", betweenSampleNorm="quantile", cutoff=0.995, sortBy="ttarea", removeIf=expression(nprobes<3), paired=FALSE, pairs=NULL, DD=NULL, COMPS=NULL, COMPS.names=NULL) {

  if(!is.null(removeIf) & !is.expression(removeIf)) stop("If not NULL, removeIf argument must be an expression.")
  groups = as.character(groups)
  if(paired & is.null(DD) & is.null(pairs)) stop("pairs argument must be provided if paired=TRUE.")
  if(paired & is.null(DD) & length(pairs)!=length(groups)) stop("pairs argument must be same length as groups.")

  if(!paired | is.null(DD)) { #then the compare arg will be used.
      if(identical(compare,"all")) compare=comp(groups)
      if(length(compare)%%2!=0) stop("compare must have an even number of elements.")
      if(length(cutoff)==1) cutoff <- rep(cutoff, length(compare)/2)
      if(length(compare)/2!=length(cutoff)) stop(length(compare)/2," comparisons requested but ", length(cutoff)," cutoff(s) provided.")
      if(!all(compare%in%groups)) stop("Not all groups specified in the compare argument are in the groups argument.")
  }

  args=list(filter=filter, ws=ws, betweenSampleNorm=betweenSampleNorm, 
	    withinSampleNorm=withinSampleNorm, sdBins=sdBins,
            cutoff=cutoff, sortBy=sortBy, compare=compare, 
            paired=paired, pairs=pairs, removeIf=removeIf)

  # dmrFinder must be given either eset or p/l,chr,pos,pns, and package.
  # If eset is supplied then all the latter will be taken from it (with any
  # that were given as arguments being ignored) (except p).
  # l=logit(p). Indexes=split(seq(along=pns),pns).
  if(is.null(eset)) {
      if (is.null("p") & is.null("l")) stop("p or l must be supplied.")
      Args = c("chr","pos","pns","package")  #"controlIndex" not needed
      nulls = sapply(Args,function(x) is.null(get(x)))
      if(any(nulls))
        stop(paste("The following arguments are missing:", paste(Args[nulls], collapse=", ")))
      lens = c( nrow(p), nrow(l), length(chr), length(pos), length(pns) ) 
      if(length(unique(lens))!=1)
        stop("p, l, chr, pos, and/or pns are incompatible.")
      stopifnot(length(groups)==max(ncol(p), ncol(l)))
	  chrpos <- paste(chr, pos)
	  dup <- duplicated(chrpos) | duplicated(chrpos, fromLast = TRUE)
	  index <- which(!is.na(chr) & !is.na(pos) & !is.na(pns) & !dup)
      index=index[order(chr[index],pos[index])]
      chr=chr[index]
      pos=pos[index]
      pns=pns[index]
#      controlIndex=which(index%in%controlIndex)
      if(!is.null(sdBins)) sdBins<-sdBins[index]
      if(!is.null(p)) p=p[index,]
      if(!is.null(l)) l=l[index,]
  } else if (is.character(eset)) {
          if (is.null("p") & is.null("l")) stop("p or l must be supplied.")
	  pdInfo=get(eset)
	  class(pdInfo)="TilingFeatureSet" # Trick oligo so that pmChr, pmPosition, probeNames work
	  chr=pmChr(pdInfo)
	  pos=pmPosition(pdInfo)
	  chrpos <- paste(chr, pos)
	  dup <- duplicated(chrpos) | duplicated(chrpos, fromLast = TRUE)
	  if (!is.null(l)) {
		index=which(rowSums(is.na(l))==0 & !dup)	
        index=index[order(chr[index],pos[index])]
		l=l[index,]
	  } else {
		index=which(rowSums(is.na(p))==0 & !dup)	
        index=index[order(chr[index],pos[index])]
        p=p[index,]
	  }
      chr=chr[index]
      pos=pos[index]
      pns=probeNames(pdInfo)[index]
#      if (is.null(controlIndex)) {
#          controlIndex <- which(getContainer(pdInfo) %in% controlProbes)
#       }
#      controlIndex=which(index%in%controlIndex)
      if(!is.null(sdBins)) sdBins<-sdBins[index]
      package = eset
      if(package=="pd.feinberg.mm8.me.hx1"){
      #add some code here to break up regions with gaps of >300 bp
      }
      if(package=="pd.feinberg.hg18.me.hx1"){
      #add some code here to break up regions with gaps of >300 bp
      }
  } else {
      stopifnot(length(groups)==ncol(eset))
      if(is.null(p) & is.null(l)){
		p <- methp(eset, 
					withinSampleNorm=withinSampleNorm, 
					betweenSampleNorm=betweenSampleNorm,
					controlIndex=controlIndex,
					controlProbes=controlProbes, 
					verbose=TRUE)
	  }
      chr=pmChr(eset)
      pos=pmPosition(eset)
	  chrpos <- paste(chr, pos)
	  dup <- duplicated(chrpos) | duplicated(chrpos, fromLast = TRUE)
	  if (!is.null(l)) {
		index=which(rowSums(is.na(l))==0 & !dup)	
        index=index[order(chr[index],pos[index])]
		l=l[index,]
	  } else {
		index=which(rowSums(is.na(p))==0 & !dup)			
        index=index[order(chr[index],pos[index])]
        p=p[index,]
	  }
      chr=chr[index]
      pos=pos[index]
      pns=probeNames(eset)[index]
#      if (is.null(controlIndex)) {
#          controlIndex=getControlIndex(eset, controlProbes=controlProbes)
#      }
#      controlIndex=which(index%in%controlIndex)
      if(!is.null(sdBins)) sdBins<-sdBins[index]
      package = annotation(eset)
      if(package=="pd.feinberg.mm8.me.hx1"){
      #add some code here to break up regions with gaps of >300 bp
      }
      if(package=="pd.feinberg.hg18.me.hx1"){
      #add some code here to break up regions with gaps of >300 bp
      }
  }
  if(is.null(l)) {
	  l=log(p)-log(1-p)	
  }
  if(is.null(Indexes)) {
  	  Indexes=split(seq(along=pns),pns)
  }

  if(!paired) {
      tog = get.tog(l=l,groups=groups,compare=compare,verbose=verbose)
      lm=tog$lm
      ls=tog$ls
      ns=tog$ns
      COMPS=tog$COMPS
      tt = get.tt(lm=lm,ls=ls,ns=ns,COMPS=COMPS,Indexes=Indexes,filter=filter,ws=ws,verbose=verbose)
  } else {
      if(is.null(DD)) {
          DD0 = get.DD(l=l, groups=groups, compare=compare, pairs=pairs)
          DD    = DD0$DD
          COMPS = DD0$COMPS
          COMPS.names = DD0$COMPS.names
      } else { 
          if(is.null(COMPS)|is.null(COMPS.names)) stop("If DD is provided, COMPS and COMPS.names must also be provided.")
      }
      ttpaired = get.tt.paired(DD=DD,Indexes=Indexes,filter=filter,ws=ws,verbose=verbose)
      tt  = ttpaired$sT
      MD  = ttpaired$MD
      sMD = ttpaired$sMD
  }

  res=vector("list",ncol(tt))
  names(res)=colnames(tt)
  if(verbose) message("Finding DMRs for each pairwise comparison.")
  for(r in 1:nrow(COMPS)) {
      j = COMPS[r,1]
      k = COMPS[r,2]
      if(verbose) message("\n",colnames(tt)[r])
      if(!paired) DF=ifelse(ns[j]==1 & ns[k]==1, 1, ns[j]+ns[k]-2)
      if(paired)  DF=ifelse(ncol(DD[[r]])-1==0,1,ncol(DD[[r]])-1)
	
	  if (length(sdBins)==0) {
	      K=mad(tt[,r], na.rm=TRUE)*qt(cutoff[r],DF)	
	  }	else {
		  s <- tapply(tt[,r], sdBins, mad, na.rm=TRUE)
		  K=s[sdBins]*qt(cutoff[r],DF)	
	  }
      LAST=0
      segmentation=vector("numeric",nrow(tt))
      type=vector("numeric",nrow(tt))
      for(i in seq(along=Indexes)){
        if(verbose) if(i%%1000==0) message(".", appendLF=FALSE)
        Index=Indexes[[i]]
        y=tt[Index,r]
		if(length(sdBins)==0) {
			tmp=sign(y)*as.numeric(abs(y)>K)	
		} else {
			Ki <- K[Index]
			tmp=sign(y)*as.numeric(abs(y)>Ki)	
		}
        tmp2=cumsum(c(1,diff(tmp)!=0))+LAST
        segmentation[Index]=tmp2
        type[Index]=tmp
        LAST=max(tmp2)
      }
      
      Index = which(type!=0)
      rows  = length(unique(segmentation[Index]))
      res[[r]]=data.frame(
           chr=tapply(chr[Index],segmentation[Index],function(x) x[1]),
           start=tapply(pos[Index],segmentation[Index],min),
           end=tapply(pos[Index],segmentation[Index],max),
           p1=rep(NA, rows),
           p2=rep(NA, rows),
           regionName=tapply(pns[Index],segmentation[Index],function(x) x[1]),
           indexStart=tapply(Index,segmentation[Index],min),
           indexEnd=tapply(Index,segmentation[Index],max))
      length=res[[r]]$indexEnd-res[[r]]$indexStart+1
      res[[r]]$nprobes = length
## The following commented out part gives identical results to the uncommented out replacement code below if using l rather than p, and if odd number of samples in each group, when using p too, but otherwise lm's values are the means of the middle 2 values and transforming back from logit to p isn't exact so there will be very minor discrepancies.
#     if(!paired) {
#         if(is.null(p)) { #  We return log-ratios
#	      colnames(res[[r]]) <- sub("p1", "m1", colnames(res[[r]]))
#	      colnames(res[[r]]) <- sub("p2", "m2", colnames(res[[r]]))
#	      res[[r]]$m1=tapply(lm[Index,j],segmentation[Index],mean)
#             res[[r]]$m2=tapply(lm[Index,k],segmentation[Index],mean)
#             diff = res[[r]]$m1 - res[[r]]$m2
#             maxdiff=tapply(lm[Index,j]-lm[Index,k],segmentation[Index], 
#                            function(x) { x[which.max(abs(x))] })
#	  } else { # We return percentages
#             tmp1 = 1/(1+exp(-lm[Index,j]))
#             tmp2 = 1/(1+exp(-lm[Index,k]))
#	      res[[r]]$p1=tapply(tmp1,segmentation[Index],mean)
#             res[[r]]$p2=tapply(tmp2,segmentation[Index],mean)
#             diff = res[[r]]$p1 - res[[r]]$p2                
#	      maxdiff=tapply(tmp1-tmp2,segmentation[Index], 
#                            function(x) { x[which.max(abs(x))] })
#         }
#     } else {

      if(nrow(res[[r]])>0) {
          if(!paired) {
              grp1 = which(groups==colnames(lm)[j])
              grp2 = which(groups==colnames(lm)[k])
          } else {
              grp1 = which(groups==COMPS.names[r,1])
              grp2 = which(groups==COMPS.names[r,2])
          }
          if(is.null(p)) {
              colnames(res[[r]]) <- sub("p1", "m1", colnames(res[[r]]))
	      colnames(res[[r]]) <- sub("p2", "m2", colnames(res[[r]]))
              mat  = cbind(rowMedians(l[,grp1,drop=FALSE]), rowMedians(l[,grp2,drop=FALSE]))
              matr = t(apply(res[[r]][,c("indexStart","indexEnd")],1,
                             function(se) colMeans(mat[se[1]:se[2],,drop=FALSE])))
              res[[r]]$m1 = matr[,1]
              res[[r]]$m2 = matr[,2]
              diff = res[[r]]$m1 - res[[r]]$m2
          } else {
              mat  = cbind(rowMedians(p[,grp1,drop=FALSE]), rowMedians(p[,grp2,drop=FALSE]))
              matr = t(apply(res[[r]][,c("indexStart","indexEnd")],1,
                             function(se) colMeans(mat[se[1]:se[2],,drop=FALSE])))
              res[[r]]$p1 = matr[,1]
              res[[r]]$p2 = matr[,2]
              diff = res[[r]]$p1 - res[[r]]$p2
          }
          ## Max diff:
          matd = mat[,1]-mat[,2]
          maxdiff = apply(res[[r]][,c("indexStart","indexEnd")],1,
                          function(se) {
                              rt = matd[se[1]:se[2]]
                              rt[which.max(abs(rt))] })
     
          area   = abs(diff)*length
          ttarea = abs(tapply(tt[Index,r],segmentation[Index],mean)) *length
          res[[r]]$area    = area
          res[[r]]$ttarea  = ttarea
          res[[r]]$diff    = diff
          res[[r]]$maxdiff = maxdiff
          if(sortBy=="area")     res[[r]]=res[[r]][order(-area),]
          if(sortBy=="ttarea")   res[[r]]=res[[r]][order(-ttarea),]
          if(sortBy=="avg.diff") res[[r]]=res[[r]][order(-abs(diff)),]
          if(sortBy=="max.diff") res[[r]]=res[[r]][order(-abs(maxdiff)),]
          if(!is.null(removeIf)) res[[r]]=subset(res[[r]],subset=!eval(removeIf))
      } else {
           res[[r]]=data.frame(chr=character(0), start=numeric(0), end=numeric(0),
                               p1=numeric(0), p2=numeric(0), regionName=character(0),
                               indexStart=numeric(0), indexEnd=numeric(0), 
                               nprobes=numeric(0), area=numeric(0), ttarea=numeric(0), 
                               diff=numeric(0), maxdiff=numeric(0))
           if(is.null(p)) {
              colnames(res[[r]]) <- sub("p1", "m1", colnames(res[[r]]))
	      colnames(res[[r]]) <- sub("p2", "m2", colnames(res[[r]]))
           }
      }
      if(verbose) message(nrow(res[[r]])," DMR candidates found using cutoff=",cutoff[r],".")
  }
  if(verbose) message("\nDone")
  if(!paired) {
      return(list(tabs=res, p=p, l=l, chr=chr, pos=pos, pns=pns, 
              index=index, gm=lm, #controlIndex=controlIndex,
              groups=groups, args=args, comps=COMPS, package=package))
  } else {
      return(list(tabs=res, p=p, l=l, chr=chr, pos=pos, pns=pns, 
              index=index, DD=DD, sMD=sMD, #controlIndex=controlIndex,
              groups=groups, args=args, comps=COMPS, comps.names=COMPS.names, package=package))
  }
}

dmrFdr <- function(dmr, compare=1, numPerms=1000, seed=NULL, verbose=TRUE) {
	## Estimate false discovery rates (q-values) for the candidate DMRs of one
	## comparison in a dmrFinder result, using a group-label permutation null.
	##
	## dmr      - list returned by dmrFinder (unpaired analysis).
	## compare  - single comparison to process: index into, or name of, dmr$tabs.
	## numPerms - number of permutations, or "all" to enumerate every combination.
	## seed     - optional RNG seed so the permutation sample is reproducible.
	## verbose  - print progress messages with a time-remaining estimate.
	## Returns `dmr` with pval/qval columns appended to dmr$tabs[[compare]] and
	## a dmr$numPerms vector recording how many permutations were used.
	if (length(compare)!=1) stop("You must choose one comparison at a time when calculating FDRs. Please set dmr to be one of: ", 
	paste(names(dmr$tabs), collapse=", "), "\n")
	if (is.numeric(compare)) compare <- names(dmr$tabs)[compare]
	message("Calculating q-values for DMRs between ", compare, "\n")
	# Get probe order from TilingFeatureSet object
	pdInfo=get(dmr$package)
	class(pdInfo)="TilingFeatureSet" # Trick oligo so that pmChr, pmPosition work
	chr=pmChr(pdInfo)
	pos=pmPosition(pdInfo)
	# Match annotation probes to the probes kept in the dmr object via their
	# "chr pos" key; annotation probes absent from dmr stay NA in l/p below.
	chrpos <- paste(chr, pos)
	dchrpos <- paste(dmr$chr, dmr$pos)
	#idx <- which(!(chrpos %in% dchrpos))
	mis <- setdiff(chrpos, dchrpos)
	o <- order(chr, pos)
	chrpos <- chrpos[o]
	keepProbes <- !(chrpos %in% mis)
	o <- o[keepProbes]
	# Keep only the samples belonging to the two groups named in `compare`
	# (comparison names look like "group1-group2").
	keep <- dmr$groups %in% unlist(strsplit(compare, "-"))
	# Recreate p or l with same sort order as in annotation package 
	if (is.null(dmr$p)) {
		l <- matrix(NA, nrow=length(pos), ncol=ncol(dmr$l))
		l[o,] <- dmr$l
		l <- l[,keep]
	} else {
		p <- matrix(NA, nrow=length(pos), ncol=ncol(dmr$p))
		p[o,] <- dmr$p
		p <- p[,keep]
	}
	n <- sum(keep)
	n1 <- sum(dmr$groups==unlist(strsplit(compare, "-"))[1])
	# Cap the requested permutation count at the number of distinct ways to
	# choose the group-1 labels.
	maxPerms <- choose(n, n1)
	if (numPerms=="all") numPerms <- maxPerms
	if (numPerms>maxPerms) {
		message("Given the sample sizes in the two groups the maximum number of permutations is ", maxPerms, sep="")
		numPerms <- maxPerms
	} 

	## Reshuffled group label DMRs
	if (!is.null(seed)) set.seed(seed)
	if (maxPerms<1e6) { # Enumerate all the combinations
		s <- sample(1:maxPerms, numPerms)
		grp1 <- combinations(n,n1)[s,]
	} else { 
		# Too many combinations to enumerate: draw random group-1 index sets.
		grp1 <- t(sapply(1:numPerms, function(x) sample(n,n1)))
	}

	if (verbose) message("Finding permuted data DMRs. Estimating time remaining")
	# For each permutation rerun dmrFinder with the shuffled labels and collect
	# the sort statistic (dmr$args$sortBy, e.g. "area") of every permuted DMR;
	# pooled over permutations these form the null distribution.
	areas <- lapply(1:numPerms, function(i) {
		groups <- rep("grp2", n)
		groups[grp1[i,]] <- "grp1"
		if (is.null(dmr$p)) {
			st <- system.time(dmrPerm <- dmrFinder(dmr$package, l=l, 
				groups=groups, cutoff=dmr$args$cutoff, 
				filter=dmr$args$filter, ws=dmr$args$ws, 
				verbose=FALSE))[3]
		} else {
			st <- system.time(dmrPerm <- dmrFinder(dmr$package, p=p, 
				groups=groups, cutoff=dmr$args$cutoff, 
				filter=dmr$args$filter, ws=dmr$args$ws,
				verbose=FALSE))[3]
		}
		# Progress report at ~10 evenly spaced permutations; time-remaining is
		# extrapolated from the elapsed time of the current permutation only.
		if (verbose & (i %in% round(seq(1, numPerms, length.out=10)))) {
			message(i, "/", numPerms, " (", prettyTime((numPerms-i)*st), 
				" remaining)", sep="")
		}
		dmrPerm$tabs[[1]][,dmr$args$sortBy]
	})

	# Empirical p-values from the permutation null, then q-values via the pi0
	# estimate (pi0.est / qvalue.cal — presumably from siggenes; confirm the
	# package imports).
	nullDist <- unlist(areas)
	fn <- ecdf(nullDist)
	pval <- 1-fn(dmr$tabs[[compare]][,dmr$args$sortBy])
	pi0<-pi0.est(pval)$p0
	qval<-qvalue.cal(pval, pi0)
	dmr$tabs[[compare]] <- cbind(dmr$tabs[[compare]], pval, qval)
	# Bookkeeping: remember how many permutations were used per comparison.
	if (!("numPerms" %in% names(dmr))) {
		dmr$numPerms <- rep(NA, length(dmr$tabs))
		names(dmr$numPerms) <- names(dmr$tabs)
	}
	dmr$numPerms[compare] <- numPerms
	return(dmr)
}


prettyTime <- function(seconds) {
	## Format a duration given in seconds as a human-readable string such as
	## "2 hours, 5 minutes" or "3 minutes"; seconds are truncated, not rounded.
	mins_total <- floor(seconds / 60)
	hrs <- floor(mins_total / 60)
	mins <- mins_total - hrs * 60
	out <- ""
	if (hrs > 0) {
		# Hours part is only printed when non-zero, followed by ", ".
		out <- paste0(hrs, if (hrs == 1) " hour" else " hours", ", ")
	}
	paste0(out, mins, if (mins == 1) " minute" else " minutes")
}


validatePd <- function(pd, fileNameColumn, sampleNameColumn, 
		groupColumn, ut = "_532.xys", md = "_635.xys") {
	## Validate a phenodata table for two-channel (untreated / methyl-depleted)
	## arrays, guessing any column index that was not supplied.
	##
	## pd               - data.frame of sample annotation, one row per xys file.
	## fileNameColumn   - column holding the xys file names (guessed if missing).
	## sampleNameColumn - column with one unique ID per sample (guessed if missing).
	## groupColumn      - column with group labels (guessed if missing).
	## ut, md           - filename patterns for the untreated (532nm) and
	##                    methyl-depleted (635nm) channel files.
	## Returns list(status="OK", fileNameColumn=, sampleNameColumn=, groupColumn=)
	## on success, or list(status="ERROR", message=msg) on failure.
	##
	## Fixes vs. previous version: scalar conditions now use &&/|| instead of
	## vectorized &/|; local `missing` renamed to `unpaired` (it shadowed base
	## missing()); "Found in column" messages gained a separating space.
	msg <- ""
	message("Filenames:")
	if (missing(fileNameColumn)) {
		# Guess: the column with the most entries matching either channel pattern.
		fileNameColumn <- which.max(apply(pd, 2, function(col) {
			u <- grep(ut, col, ignore.case=TRUE)
			m <- grep(md, col, ignore.case=TRUE)
			length(c(u,m))
		}))
		message("  fileNameColumn not specified. Trying to guess.\n")
	}
	# Sort rows by filename so each sample's two channel rows line up.
	files <- pd[,fileNameColumn]
	o <- order(files)
	files <- files[o]
	pd <- pd[o,]
	utIdx <- grep(ut, files)
	if (length(utIdx)==0) {
		msg <- paste(msg, "  No files in column ", fileNameColumn, " match the untreated extension ", ut, ". Please use the ut option to set the correct extension\n", sep="")
	}
	mdIdx <- grep(md, files)
	if (length(mdIdx)==0) {
		msg <- paste(msg, "  No files in column ", fileNameColumn, " match the methyl-depleted extension ", md, ". Please use the md option to set the correct extension\n", sep="")
	}
	# Every untreated file must have a matching methyl-depleted file and vice versa.
	filesUt <- sub(ut, "", files[utIdx])
	filesMd <- sub(md, "", files[mdIdx])
	unpaired <- c(filesUt[!filesUt%in%filesMd], filesMd[!filesMd%in%filesUt])
	if (length(unpaired)>0) {
		msg <- paste(msg, "  The untreated (ut) and methyl-depleted (md) file names in column ", fileNameColumn, " don't match up. Check ", paste(unpaired, collapse=", "), sep="")
	}
	if (length(unpaired)>0 || length(utIdx)==0 || length(mdIdx)==0) {
		message("  ERROR - ", msg)
		return(list(status="ERROR", message=msg))
	} else {
		message("  OK - Found in column ", fileNameColumn)
	}

	# channelMatch: columns whose value agrees between the two channel rows of
	# every sample. onePerSample: columns with exactly one value per sample.
	channelMatch <- apply(pd[utIdx,]==pd[mdIdx,], 2, all)
	numUnique <- apply(pd[utIdx,], 2, function(x) length(unique(x)))
	onePerSample <- numUnique==length(utIdx)

	message("Sample names:")
	if (missing(sampleNameColumn)) {
		message("  sampleNameColumn column not specified. Trying to guess.")
		# A sample ID column matches across channels and is unique per sample.
		sampleNameColumn <- which(channelMatch & onePerSample)
		if (length(sampleNameColumn)==1) {
			message("  OK - Found in column ", sampleNameColumn)
		} else if (length(sampleNameColumn)==0) {
			msg <- paste(msg, "No suitable sample ID column found. This column should have the same entry for each pair of untreated and methyl-depleted filenames.\n")
			message("  ERROR - ", msg, appendLF=FALSE )
			return(list(status="ERROR", message=msg))
		} else if (length(sampleNameColumn)>1) {
			message("  WARNING - Multiple columns (", paste(sampleNameColumn, collapse=", "), 
				") are candidates for sample ID.", sep="")
			sampleNameColumn <- sampleNameColumn[1]
		}
	} else { # User specified sampleNameColumn
		if (channelMatch[sampleNameColumn] && onePerSample[sampleNameColumn]) {
			message("  OK - Found in column ", sampleNameColumn)
		} else {
			if (!channelMatch[sampleNameColumn]) {
				msg <- paste(msg, "  There are samples with different values in column ", sampleNameColumn, " for their untreated and methyl-depleted rows\n", sep="")
			} else if (!onePerSample[sampleNameColumn]) {
				msg <- paste(msg, "  The sample IDs in column ", sampleNameColumn, " are not unique to each sample\n", sep="")
			}
			message("  ERROR -", msg)
			return(list(status="ERROR", message=msg))
		}
	}
	message("Group labels:")
	if (missing(groupColumn)) {
		message("  groupColumn column not specified. Trying to guess.")
		# A group column matches across channels but is shared by several samples.
		groupColumn <- which(channelMatch & !onePerSample & numUnique>1)
		if (length(groupColumn)==1) {
			message("  OK - Found in column ", groupColumn)
		} else if (length(groupColumn)==0) {
			msg <- paste(msg,  "No suitable group label column found - Defaulting to sample ID column for group labels (i.e. 1 sample per group)\n")
			warning("No suitable group label column found - Defaulting to sample ID column for group labels (i.e. 1 sample per group)")
			groupColumn <- sampleNameColumn
		} else if (length(groupColumn)>1) {
			warning("  WARNING - Multiple columns (", 
				paste(groupColumn, collapse=", "), 
				") are candidates for group labels.\n", sep="")
			groupColumn <- groupColumn[1]
		}
	} else { # User specified groupColumn
		if (channelMatch[groupColumn] && !onePerSample[groupColumn]) {
			message("  OK - Found in column ", groupColumn)
		} else {
			if (!channelMatch[groupColumn]) {
				msg <- paste(msg,  "There are samples with different values in column", groupColumn, "for their untreated and methyl-depleted rows\n")
				message("  ERROR -", msg, appendLF=FALSE)
				return(list(status="ERROR", message=msg))
			} else if (onePerSample[groupColumn]) {
				message("  WARNING - Each group has only 1 sample")
			}
		}
	}
	return(list(status="OK", fileNameColumn=fileNameColumn, 
			sampleNameColumn=sampleNameColumn,
			groupColumn=groupColumn))
}


.onAttach <- function(libname, pkgname) {
	# Greet the user with the installed charm version, then set session-wide
	# defaults for out-of-core processing: one sample per batch (ocSamples)
	# and large-object storage under the session temp directory (ldPath).
	ver <- packageDescription("charm", fields = "Version")
	packageStartupMessage("Welcome to charm version ", ver)
	ocSamples(1)
	ldPath(tempdir())
}

##The next 2 functions are for the paired=TRUE option of dmrFinder:
get.DD <- function(l, groups, compare, pairs){
  ## Build the per-comparison matrices of paired differences used by the
  ## paired=TRUE branch of dmrFinder.
  ##
  ## l       - probes x samples matrix of log-ratios (columns named by sample).
  ## groups  - group label for each column of l.
  ## compare - character vector of group names, taken two at a time; element
  ##           2i-1 vs element 2i defines comparison i (same as get.tog()).
  ## pairs   - pairing ID for each column of l; columns of the two groups are
  ##           matched on this ID.
  ## Returns list(DD=, COMPS=, COMPS.names=) where DD[[i]] holds columns
  ## l[,group j] - l[,group k] for the matched pairs of comparison i (or NA
  ## when the comparison has no pairs), COMPS holds the group-index pairs and
  ## COMPS.names the corresponding group names.
  ##
  ## Fix vs. previous version: iterate with seq_len(nrow(COMPS)) instead of
  ## 1:nrow(COMPS) (the latter iterates c(1, 0) when COMPS has zero rows) and
  ## preallocate DD instead of growing a list().

  ## Define COMPS the same as in get.tog():
  gIndex <- split(seq_along(groups), groups)
  gIndex <- gIndex[which(names(gIndex) %in% compare)]
  NAMES <- names(gIndex)
  nums  <- match(compare, NAMES)
  COMPS <- matrix(nums, ncol=2, byrow=TRUE)
  COMPS.names <- t(apply(COMPS, 1, function(x) NAMES[x]))

  DD <- vector("list", nrow(COMPS)) # unlike an array a list will accommodate drop-out
  for(i in seq_len(nrow(COMPS))){
    # Make DD[[i]], the differences for comparison i (group j minus group k).
    j <- COMPS[i, 1]
    k <- COMPS[i, 2]
    pd.j <- which(groups == NAMES[j])
    pd.k <- which(groups == NAMES[k])
    ord  <- match(pairs[pd.j], pairs[pd.k])
    if(all(is.na(ord))){
        message(paste("Comparison",i,"has no pairs! Ignoring this comparison."))
        DD[[i]] <- NA
        next
    }
    drop <- is.na(ord)
    if(any(drop)){
      # Samples in j without a partner in k are dropped; unmatched samples in
      # k never enter pd.k[ord], so they are excluded automatically.
      pd.j <- pd.j[!drop]
      ord  <- ord[!drop]
    }
    pd.k <- pd.k[ord]
    stopifnot(identical(pairs[pd.j], pairs[pd.k]))

    DD[[i]] <- as.matrix(l[, pd.j] - l[, pd.k])
    # ^1 (raw differences) empirically worked better than symmetrizing
    # transforms such as ^.5; log would inflate small differences.
    if(ncol(DD[[i]]) == 1) colnames(DD[[i]]) <- colnames(l)[pd.j] # since no colname given
    names(DD)[i] <- paste(NAMES[j], "-", NAMES[k], sep="")
  }
  return(list(DD=DD, COMPS=COMPS, COMPS.names=COMPS.names))
}

get.tt.paired <- function(DD,Indexes,filter=NULL,ws,verbose=TRUE) {
  ## Smoothed paired t-like statistics for the paired=TRUE dmrFinder branch.
  ##
  ## DD      - list of paired-difference matrices from get.DD() (a skipped
  ##           comparison holds a single NA and is ignored here).
  ## Indexes - list of probe index vectors (e.g. one per region/chromosome);
  ##           smoothing is applied within each index set separately.
  ## filter  - optional smoothing kernel; default is a normalized Tukey
  ##           biweight over a window of 2*ws+1 probes.
  ## ws      - half-window size for the default kernel.
  ## Returns list(sT=, MD=, sMD=): smoothed t-statistics, raw mean differences,
  ## and smoothed mean differences (probes x comparisons).
  ok <- which(sapply(DD, length) > 1)  # comparisons that actually have pairs
  nprobe <- nrow(DD[[1]])
  ncomp  <- length(DD)
  MD   <- matrix(NA, nrow = nprobe, ncol = ncomp)  # mean differences
  vars <- matrix(NA, nrow = nprobe, ncol = ncomp)  # variances of those means
  for (i in ok) {
    if (ncol(DD[[i]]) < 3) message(paste("Comparison",i,"is just using raw differences!  May want to set lower CUTOFF."))
    MD[, i]   <- rowMeans(DD[[i]])
    vars[, i] <- rowVars(DD[[i]]) / ncol(DD[[i]])  # all NA if only one pair
  }

  # Smoothed mean differences and their standard errors.
  sMD <- matrix(NA, nrow = nprobe, ncol = ncomp)
  ses <- matrix(NA, nrow = nprobe, ncol = ncomp)  # myfilterse for new Charm returns sqrt(var)

  if (is.null(filter)) {
    # Normalized Tukey biweight kernel spanning 2*ws+1 probes.
    Tukey <- function(x) pmax(1 - x^2, 0)^2
    fs <- Tukey(seq(-ws, ws) / (ws + 1))
    fs <- fs / sum(fs)
  } else {
    fs <- filter
  }

  if (verbose) message("Smoothing mean differences using window size of ", ws * 2 + 1, ":")
  if (verbose) pb <- txtProgressBar(min = 1, max = length(Indexes), initial = 0, style = 1)
  for (i in seq_along(Indexes)) {
    Index <- Indexes[[i]]
    for (j in ok) {
      sMD[Index, j] <- myfilter(MD[Index, j], fs)
      if (ncol(DD[[j]]) > 2) {
        ses[Index, j] <- myfilterse(vars[Index, j], fs)
      } else {
        ses[Index, j] <- 1  # <3 pairs: no usable variance estimate, use raw diffs
      }
    }
    if (verbose) setTxtProgressBar(pb, i)
  }
  if (verbose) close(pb)
  colnames(MD)  <- names(DD)
  colnames(sMD) <- names(DD)
  # Smoothed t-like statistic.
  sT <- sMD / ses
  return(list(sT = sT, MD = MD, sMD = sMD))
}

Try the charm package in your browser

Any scripts or data that you put into this service are public.

charm documentation built on May 6, 2019, 2:29 a.m.