R/pipeline1.R

dmrFind <- function(p=NULL, logitp=NULL, svs=NULL, mod, mod0, coeff, pns, chr, pos, only.cleanp=FALSE, only.dmrs=FALSE, rob=TRUE, use.limma=FALSE, smoo="weighted.loess", k=3, SPAN=300, DELTA=36, use="sbeta", Q=0.99, min.probes=3, min.value=0.075, keepXY=TRUE, sortBy="area.raw", verbose=TRUE, vfilter=NULL, nsubsets=50, ...) {
    if(only.cleanp & only.dmrs) stop("only.cleanp and only.dmrs cannot both be TRUE.")
    if(!sortBy%in%c("area","area.raw","avg","max")) stop("sortBy must be area, area.raw, avg, or max.")
    if(min.value<0 | min.value>1) stop("min.value must be between 0 and 1.")
    if(Q<0 | Q>1) stop("Q must be between 0 and 1.")

    for(j in 1:ncol(mod0)) if(!all(mod0[,j]==mod[,j])) stop("This function requires that all columns of mod not in mod0 (i.e., the column(s) for your covariate of interest) come after the columns of mod0, i.e., the first n columns of mod must be the same as mod0, where n is the number of columns in mod0.") # If this function were modified to enable the covariate of interest columns to come at the start or elsewhere, e.g., changing mod[,-c(1:ncol(mod0)),drop=FALSE] below, it would require matching on column names of mod and mod0, but then the function would require column names to be the same, a new requirement and one often not met for the intercept column.

    if(is.null(p) & is.null(logitp)) stop("Either p or logitp must be provided.")
    if("ff_matrix" %in% c(class(p),class(logitp)) & is.null(vfilter)) message("If available memory is insufficient, try setting vfilter to use only a subset of probes in SVA.")
    if("ff_matrix" %in% c(class(p),class(logitp)) & use.limma) stop("use.limma=TRUE will not work if p or logitp are ff_matrix objects")
    if(!is.null(p)) {
        stopifnot(Min(p)>=0 & Max(p)<=1)
        stopifnot(ncol(p)==nrow(mod))
    }
    if(!is.null(logitp)) {
        stopifnot(ncol(logitp)==nrow(mod))
    } else logitp = logit(p)

    errmsg = "coeff must be a character or numeric index for the column of mod that is the coefficient of interest."
    if(is.character(coeff)) {
        if(!coeff%in%colnames(mod)) stop(errmsg)
        coiIndex = which(colnames(mod)==coeff)
    } else if(is.numeric(coeff)) {
        if(coeff>ncol(mod)) stop(errmsg)
        coiIndex = coeff        
    } else stop(errmsg)

    if(is.null(svs)) {
#        require(sva)
        cat("Running SVA\n")

        if (!is.null(vfilter)) {
            if (vfilter < 100 | vfilter > dim(logitp)[1]) {
                stop(paste("The number of genes used in sva (i.e., the vfilter argument) must be between 100 and", 
                     dim(logitp)[1], "\n"))
            }
            tmpv = as.vector(ffapply(X=logitp, MARGIN=1, AFUN="var", RETURN=TRUE, FF_RETURN=FALSE))
            ind = which(rank(-tmpv) < vfilter)
            svaobj = sva(logitp[ind,],mod=mod,mod0=mod0,method="irw",vfilter=NULL,...)
        } else svaobj = sva(logitp,mod=mod,mod0=mod0,method="irw",...)
        gc(); gc()
        svs = svaobj$sv
    }
    args = list(svs=svs, mod=mod, mod0=mod0, coeff=coeff, use.limma=use.limma, smoo=smoo, SPAN=SPAN, DELTA=DELTA, use=use, Q=Q, min.probes=min.probes, min.value=min.value, keepXY=keepXY, sortBy=sortBy, rob=rob, k=k, vfilter=vfilter, nsubsets=nsubsets) #save for obtaining q-values.

    svs = as.matrix(svs)
    svs = svs[,!duplicated(svs[1,]),drop=FALSE] #remove duplicate sv's
    if(identical(svs,matrix(0,1,1))) X = mod else {
        if(ncol(svs)>=0.5*nrow(svs)) warning("sva() returns an unusually large number of surrogate variables.  You may want to use fewer than the full set returned.")
        X = cbind(mod, svs)
    }
    P = ncol(X)
    if(rob) no = 1:ncol(mod) else no = c(1, which(!colnames(mod)%in%colnames(mod0)))
    if(use.limma) {
        #require(limma)
        if(verbose) cat("\nRegression (limma)\n")
        fit=limma::lmFit(logitp, X)

        ##Get cleanp:
        cleanp = ilogit(logitp-fit$coef[,-no,drop=FALSE]%*%t(X[,-no,drop=FALSE]))
        if(only.cleanp) return(cleanp)
      
        beta=fit$coef[,coiIndex]
        if(verbose) cat("Obtaining estimates for ",colnames(fit$coef)[coiIndex],"\n")
        fit2=limma::ebayes(fit, proportion=0.01)
        wald=fit2$t[,coiIndex]
        pval=fit2$p.value[,coiIndex]
        ## sigma here only gets used when smoo=="weighted.loess": 
        #sigma=fit$sigma # same as sigma when !use.limma.
        sigma=sqrt(fit2$s2.post) #could do this instead, to borrow strength.
    } else {
        ##below is just bare version of lm
        if(verbose) cat("\nRegression\n")
        Hat=solve(t(X)%*%X)%*%t(X)
        N=ncol(logitp)
        if("ff_matrix"%in%class(logitp)) {
            sp = split(1:nrow(logitp), cut(1:nrow(logitp),nsubsets))
            beta = NULL
            sigma = NULL
            cleanp <- ff(vmode = "double", dim = dim(logitp))
            colnames(cleanp) = colnames(logitp)
            rownames(cleanp) = rownames(logitp)
            for(i in sp) {
                beta2=(Hat%*%t(logitp[i,]))
                sigma2=genefilter::rowSds(logitp[i,]-t(X%*%beta2))*sqrt((N-1)/(N-P))
                beta = cbind(beta,beta2)               
                sigma = c(sigma,sigma2)                   
                ##Get cleanp:                
                cleanp[i,] = ilogit(logitp[i,]-t(X[,-no,drop=FALSE]%*%beta2[-no,,drop=FALSE]))
                rm(beta2,sigma2); gc(); gc()
            }
        } else {
            beta=(Hat%*%t(logitp))
            sigma=genefilter::rowSds(logitp-t(X%*%beta))*sqrt((N-1)/(N-P))
            ##Get cleanp:
            cleanp=ilogit(logitp-t(X[,-no,drop=FALSE]%*%beta[-no,,drop=FALSE]))
            colnames(cleanp)=colnames(logitp)
        }
        if(only.cleanp) return(cleanp)
  
        if(verbose) cat("Obtaining estimates for ",rownames(beta)[coiIndex],"\n")
        wald=beta[coiIndex,]/(sqrt(diag(solve(t(X)%*%X))[coiIndex])*sigma)
        beta=beta[coiIndex,]
        ##All one sided tests, not 2-sided:
        #pval=ifelse(wald<0, pt(wald,df=N-P), 1-pt(wald,df=N-P))
        pval=pt(-abs(wald),df=N-P)*2
    }

    ## Find DMRs:
    if(verbose) cat("Smoothing\n")
    pnsIndexes = split(seq(along = pns), pns)
    sbeta = try(dosmooth(stat=beta, pnsIndexes=pnsIndexes, pos=pos, smoo=smoo, verbose=verbose, k=k, SPAN=SPAN, DELTA=DELTA, sigma=sigma), silent=TRUE) #sigma only gets used if smoo=="weighted.loess"
    if(inherits(sbeta,"try-error")) {
        warning("Problem using smoo=weighted.loess, so loess being used instead.")
        sbeta = dosmooth(stat=beta, pnsIndexes=pnsIndexes, pos=pos, smoo="loess", verbose=verbose, k=k, SPAN=SPAN, DELTA=DELTA, sigma=NULL)
    }
    if(use=="sbeta") { ##Finding based on sbeta:
        swald = NULL
        odmrs = regionFinder(x=sbeta,pns,chr,pos,cutoff=quantile(sbeta,Q),verbose=verbose)
    } else if(use=="swald") { ##Finding based on swald:
        swald = dosmooth(stat=wald, pnsIndexes=pnsIndexes, pos=pos, smoo=smoo, verbose=verbose, k=k, SPAN=SPAN, DELTA=DELTA, sigma=NULL)
        odmrs = regionFinder(x=swald,pns,chr,pos,cutoff=quantile(swald,Q),verbose=verbose)
    }
    ##Remove dmrs in sex chromosomes if keepXY==FALSE:
    if(!keepXY) odmrs=odmrs[which(!odmrs$chr%in%c("chrX","chrY")),]
    ##Remove dmrs that don't have at least min.probes probes:
    odmrs=odmrs[which(odmrs$nprobes>min.probes),]
    ##Remove dmrs in which the average difference or correlation is not at least min.value:
    ##(Can't just use the 'value' column regionFinder(..., y=sbeta) since sbeta is from logit data)
    contin = FALSE
    if(all(mod[,coiIndex]%in%c(0,1))) { #covariate is categorical
        if(verbose) message("Covariate recognized as categorical.")
        base = which(rowSums(mod[,-c(1:ncol(mod0)),drop=FALSE])==0)
        grp1 = which(mod[,coiIndex]==1)
        if(!"ff_matrix"%in%class(cleanp)) {
            mat = cbind(rowMeans(cleanp[,grp1,drop=FALSE]), rowMeans(cleanp[,base,drop=FALSE]))
        } else {
            mat = cbind(
              as.vector(ffapply(X=asff(cleanp,columns=grp1), MARGIN=1,
                                AFUN="mean", RETURN=TRUE, FF_RETURN=FALSE)),
              as.vector(ffapply(X=asff(cleanp,columns=base), MARGIN=1,
                                AFUN="mean", RETURN=TRUE, FF_RETURN=FALSE))
              )
        }
        ## Mean diff:
        res = t(apply(odmrs[,c("indexStart","indexEnd")],1,
                      function(se) colMeans(mat[se[1]:se[2],,drop=FALSE])))
        x = res[,1]-res[,2]
        ## Max diff:
        res0 = mat[,1]-mat[,2]
        x2 = apply(odmrs[,c("indexStart","indexEnd")],1,
                       function(se) {
                           rt = res0[se[1]:se[2]]
                           rt[which.max(abs(rt))]
                       })      
    } else { #covariate is continuous
        if(verbose) message("Covariate recognized as continuous.")
        contin = TRUE
        mat = rowCor(m=cleanp,y=mod[,coiIndex], nsubsets=nsubsets)
        ## Mean correlation:
        x = apply(odmrs[,c("indexStart","indexEnd")],1,
                  function(se) median(mat[se[1]:se[2]]))
        ## Max correlation:
        x2 = apply(odmrs[,c("indexStart","indexEnd")],1,
                   function(se) {
                       rt = mat[se[1]:se[2]]
                       rt[which.max(abs(rt))]
                   })
    }
    stopifnot(length(x)==nrow(odmrs))
    odmrs$avg = x
    odmrs$max = x2
    odmrs$area.raw = abs(x*odmrs$nprobes) #unsigned area, like area column from regionFinder
    x[is.na(x)] = 1 #maximum possible so it will definitely exceed min.value
    dmrs = odmrs[abs(x)>min.value,] #If not for this there would be no difference in the dmrs returned by dmrFind when rob=TRUE or rob=FALSE.
    
    if(nrow(dmrs)==0) {
        if(verbose) cat("\nNo DMRs found (using ",use,").\n")
    } else {
        dmrs=dmrs[order(-abs(dmrs[,sortBy])),]
        if(verbose) cat("\nFound",nrow(dmrs),"potential DMRs\n")
    }
    if(!verbose) cat(".") #for qval when verbose=FALSE.
    if(only.dmrs) {
        if(!contin) return(list(dmrs=dmrs, pval=pval, pns=pns, chr=chr, pos=pos, args=args))
        if(contin)  return(list(dmrs=dmrs, pval=pval, pns=pns, chr=chr, pos=pos, args=args, beta=beta, sbeta=sbeta))        
    } else {
       if(!contin)  return(list(dmrs=dmrs, pval=pval, pns=pns, chr=chr, pos=pos, args=args, cleanp=cleanp))
       if(contin)   return(list(dmrs=dmrs, pval=pval, pns=pns, chr=chr, pos=pos, args=args, beta=beta, sbeta=sbeta, cleanp=cleanp)) #whether or not the result contains beta and sbeta is how resultMaker knows that the analysis is of continuous phenotype.
   }
}

rowCor <- function(m, y, nsubsets=50) { #faster version of apply(m,1,function(x) cor(x,y))
    #require(genefilter)
    stopifnot(is.matrix(m)|is.data.frame(m)|"ff_matrix"%in%class(m))
    stopifnot(is.vector(y))
    stopifnot(ncol(m)==length(y))
    X = cbind(1,y)
    Hat=solve(t(X)%*%X)%*%t(X)
    if(!"ff_matrix"%in%class(m)) {
        beta=t(Hat%*%t(m))[,"y"]
        beta * sd(y)/rowSds(m)
    } else {
        sp = split(1:nrow(m), cut(1:nrow(m),nsubsets))
        beta = NULL
        for(i in sp) {
            beta2 = t(Hat%*%t(m[i,]))[,"y"]
            beta2 = beta2 * sd(y)/rowSds(m[i,])
            beta = c(beta,beta2)
            rm(beta2); gc(); gc()
        }
        beta
    }
}

dosmooth <- function(stat, pnsIndexes, pos, smoo="runmed", k=3, SPAN=300, DELTA=36, sigma=NULL, verbose=TRUE) {
#SPAN  = 300 ##used by loess. we smooth 300 base pairs
#DELTA = 36  ##span parameter in loess smoothing will = SPAN/(DELTA* no. of probes in region)
    if(smoo=="runmed") { #assumes stat is already in order by genomic location
        sstat = stat
        if(verbose) { pb=txtProgressBar(min=0,max=length(pnsIndexes),initial=0); j=0 }
        for(ind in pnsIndexes) {
            if(length(ind)>k) {
                sstat[ind]=runmed(stat[ind],k,endrule="constant")  
            } else sstat[ind]=median(stat[ind])
            if(verbose) { j=j+1; setTxtProgressBar(pb,j) }
        }
        if(verbose) close(pb)        
    } else if(smoo=="loess") {
        sstat = stat
        if(verbose) { pb=txtProgressBar(min=0,max=length(pnsIndexes),initial=0); j=0 }
        for(ind in pnsIndexes) {
            if(length(ind)>4) {
                thespan=SPAN/(DELTA*length(pos[ind]))
                fit1=loess(stat[ind]~pos[ind],span=thespan,degree=1,family="symmetric")    
                sstat[ind]=fit1$fitted
            } else sstat[ind]=median(stat[ind])
            if(verbose) { j=j+1; setTxtProgressBar(pb,j) }
        }
        if(verbose) close(pb)        
    } else if(smoo=="weighted.loess") { #use Andrew's way
        sstat = loessPns(stat=stat, se=sigma, pnsIndexes=pnsIndexes, pos=pos, verbose=verbose)
    } else stop("Choices for smoo are runmed, loess, and weighted.loess.")
    sstat
}

loessPns <- function(stat, se=NULL, pnsIndexes, pos, numProbes = 8, verbose=TRUE) {
        ##pnsIndexes = split(seq(along = pns), pns) #takes a second, so do outside function for speed
	#require(limma)
	sstat = stat

        if(verbose) pb=txtProgressBar(min=0,max=length(pnsIndexes),initial=0)        
	if(is.null(se)) {
		for(i in seq(along=pnsIndexes)) {
			#if(i %% 1e4 == 0) cat(".")
			Index = pnsIndexes[[i]]
			ss = numProbes/length(Index) # make a span
			if(ss > 1) ss = 0.75
						
			if(length(Index) > 4) {
			    sstat[Index] = loessFit(stat[Index], pos[Index], span = ss)$fitted
			} else sstat[Index] = median(stat[Index])
                        if(verbose) setTxtProgressBar(pb,i)
		}
	} else {
		for(i in seq(along=pnsIndexes)) {
			#if(i %% 1e4 == 0) cat(".")
			Index = pnsIndexes[[i]]
			ss = numProbes/length(Index) # make a span
			if(ss > 1) ss = 0.75
						
			if(length(Index) > 4) {
			    sstat[Index] = loessFit(stat[Index], pos[Index], span = ss, weights = 1/se[Index], method="locfit")$fitted
			} else sstat[Index] = median(stat[Index])
                        if(verbose) setTxtProgressBar(pb,i)
		}
	}
        if(verbose) close(pb)
	return(sstat)
}

######################################################################
######################################################################

qval <- function(p=NULL, logitp=NULL, dmr, numiter=500, seed=54256, verbose=FALSE, mc=1, return.permutations=FALSE, method=c("direct","fwer"), fwer.num=c(1,5)) {
    if(!any(c("direct","pool","fwer")%in%method)) stop("method must = direct, pool, or fwer.")
    if("fwer"%in%method) {
        stopifnot(is.numeric(fwer.num))
        if(!all(fwer.num%%1==0)) stop("fwer.num value(s) must be integers.")
        if(!all(fwer.num>0)) stop("fwer.num value(s) must be >0.")
        fwer.num = sort(fwer.num)
    }
    #require(parallel)
    if(is.null(p) & is.null(logitp)) stop("Either p or logitp must be provided (the same as you provided to dmrFind when it produced dmr).")
    if(!is.null(p) & "cleanp"%in%names(dmr)) {
        if(nrow(dmr$cleanp)!=nrow(p) | ncol(dmr$cleanp)!=ncol(p)) stop("p must be the same as the p that you provided to dmrFind when it produced dmr.")        
    }
    if(!is.null(logitp)) {
        if("cleanp"%in%names(dmr)) if(nrow(dmr$cleanp)!=nrow(logitp) | ncol(dmr$cleanp)!=ncol(logitp)) stop("logitp must be the same as the logitp that you provided to dmrFind when it produced dmr.")
    } else logitp = logit(p)
    #stopifnot(all(dmr$dmrs[,dmr$args$sortBy]>=0))

    ## Test that result is same as original:
    dmr2 = dmrFind(logitp=logitp, svs=dmr$args$svs, mod=dmr$args$mod, mod0=dmr$args$mod0, only.cleanp=FALSE, only.dmrs=TRUE, coeff=dmr$args$coeff, use.limma=dmr$args$use.limma, smoo=dmr$args$smoo, SPAN=dmr$args$SPAN, DELTA=dmr$args$DELTA, use=dmr$args$use, Q=dmr$args$Q, min.probes=dmr$args$min.probes, min.value=dmr$args$min.value, pns=dmr$pns, chr=dmr$chr, pos=dmr$pos, keepXY=dmr$args$keepXY, sortBy=dmr$args$sortBy, verbose=verbose, rob=dmr$args$rob, k=dmr$args$k, vfilter=dmr$args$vfilter, nsubsets=dmr$args$nsubsets)
    orig_tab = dmr$dmrs    
    TFvec = vector()
    for(hg in 1:ncol(dmr2$dmrs)) TFvec[hg] = isTRUE(all.equal(dmr2$dmrs[,hg],orig_tab[,hg]))
    dimsame = identical(dim(dmr2$dmrs),dim(orig_tab))
    if(any(TFvec==FALSE) | !dimsame) stop("Make sure your p or logitp argument is the same as the one you passed to dmrFinder when it produced dmr.")

    ## Now start the iterations:
    svs = as.matrix(dmr$args$svs)
    svs = svs[,!duplicated(svs[1,])] #remove duplicate sv's
    if(isTRUE(svs==0)) {
        X  = dmr$args$mod
        X0 = dmr$args$mod0
    } else {
        X  = cbind(dmr$args$mod, svs)
        X0 = cbind(dmr$args$mod0, svs)
    }
    #fit1 = limma::lmFit(logitp,X)
    #r_ij = residuals(fit1, logitp)
    #fit0 = limma::lmFit(logitp,X0)
    #n_ij = fitted(fit0)
    ## To use less memory and go faster, do this instead of lmFit:
    r_ij = lm_fit(dat=logitp,X=X,out="res",nsubsets=dmr$args$nsubsets)
    n_ij = lm_fit(dat=logitp,X=X0,out="fit",nsubsets=dmr$args$nsubsets)

    colsamp = matrix(NA, nrow=numiter, ncol=ncol(r_ij))
    set.seed(seed)
    for(j in 1:nrow(colsamp)) colsamp[j,] = sample(1:ncol(r_ij),replace=TRUE)
    fun1 <- function(h, method) {
        dmr2 = dmrFind(logitp= n_ij + r_ij[,colsamp[h,]], svs=dmr$args$svs, mod=dmr$args$mod, mod0=dmr$args$mod0, only.cleanp=FALSE, only.dmrs=TRUE, coeff=dmr$args$coeff, use.limma=dmr$args$use.limma, smoo=dmr$args$smoo, SPAN=dmr$args$SPAN, DELTA=dmr$args$DELTA, use=dmr$args$use, Q=dmr$args$Q, min.probes=dmr$args$min.probes, min.value=dmr$args$min.value, pns=dmr$pns, chr=dmr$chr, pos=dmr$pos, keepXY=dmr$args$keepXY, sortBy=dmr$args$sortBy, verbose=verbose, rob=dmr$args$rob, k=dmr$args$k, vfilter=dmr$args$vfilter, nsubsets=dmr$args$nsubsets)
        ret = vector("list",3)
        ## abs() here makes these tests 2-sided:
        stats = abs(dmr2$dmrs[,dmr$args$sortBy])
        if("direct"%in%method) {
            if(nrow(dmr2$dmrs)==0) add=0 else add = stats
            #rm(dmr2); gc(); gc()
            ret[[1]] = counts(add, abs(orig_tab[,dmr$args$sortBy]))
        }
        if("pool"%in%method) {
            if(nrow(dmr2$dmrs)>0) ret[[2]] = stats
        }
        if("fwer"%in%method) {
            if(nrow(dmr2$dmrs)==0) ret[[3]] = rep(0,length(fwer.num)) else ret[[3]] = sort(stats,decreasing=TRUE)[fwer.num] #will be NA for fwer.num[i] if length(stat)<fwer.num[i]
            ret[[3]][is.na(ret[[3]])] = 0
            names(ret[[3]]) = fwer.num
        }
        ret
    }
    message("\nBeginning the iterations")
    nullnum0 = mclapply(1:nrow(colsamp), fun1, method=method, mc.cores=mc)
    message("\nFinished the iterations")
    dig=7
    if("direct"%in%method) {
        nullnum = sapply(nullnum0, function(x) x[[1]])
        ##Obtain FDR estimates:
        stopifnot(nrow(nullnum)==nrow(orig_tab))
        qv = rowMeans(nullnum)/seq(1,nrow(orig_tab))
        orig_tab$qvalue0 = round(pmin(1,qv), dig)
        orig_tab$qvalue = round(monotonic(pmin(1,qv)), dig)
    }
    if("pool"%in%method) {
        nulldist = unlist(sapply(nullnum0, function(x) x[[2]]))
        Fn = ecdf(nulldist + (1e-9))
        pv = 1-Fn(abs(orig_tab[,dmr$args$sortBy]))
        orig_tab$pvalue.pool = round(pv,dig)
        qv = try(edge.qvalue(pv)$qvalues, silent=TRUE) #Unfortunately for some reason the error message from this function isn't actually an error message but a message, so it can't be suppressed. If the error comes up, the output of the function is 1 for some reason.  So here I catch the error by putting $qvalues here, since it won't work if the output is 1.
        if(inherits(qv,"try-error")) orig_tab$qvalue.pool = NA else orig_tab$qvalue.pool = round(qv,dig)
    }
    if("fwer"%in%method) {
        fwerm = matrix(NA,nrow=nrow(orig_tab),ncol=length(fwer.num))
        colnames(fwerm) = paste("pvalue.fwer",fwer.num,sep="")
        colnames(fwerm)[fwer.num==1] = "pvalue.fwer"
        for(i in 1:length(fwer.num)) {
            maxs = sapply(nullnum0, function(x) x[[3]][i])
            Fnn = ecdf(maxs + (1e-9))
            pvv = 1-Fnn(abs(orig_tab[,dmr$args$sortBy]))
            fwerm[,i] = round(pvv, dig)
        }
        orig_tab = cbind(orig_tab,fwerm)
    }
    
    if(return.permutations) return(list(q=orig_tab, permutations=colsamp)) else return(orig_tab)
}

lm_fit <- function(dat, X, out, nsubsets=50) {
    ## Use this instead of lmFit because this uses less memory and is faster.  Estimated betas are the same anyway--lmFit is only useful for (shrunk) se's, which we don't need for this.  This also handles dat as an ff_matrix object.
    Hat = solve(t(X)%*%X)%*%t(X)
    if(!"ff_matrix"%in%class(dat)) {
        if(out=="fit") {
            return(t(X%*% (Hat%*%t(dat)) ))
        } else if(out=="res") {
            return(dat - t(X%*% (Hat%*%t(dat)) ))
        } else stop("invalid out arg")
    } else {
        output = ff(vmode = "double", dim = dim(dat))
        colnames(output) = colnames(dat)
        rownames(output) = rownames(dat)
        sp = split(1:nrow(dat), cut(1:nrow(dat),nsubsets))      
        if(out=="fit") {
            for(i in sp) output[i,] = t( X %*% (Hat%*%t(dat[i,])) )
        } else if(out=="res") {
            for(i in sp) output[i,] = dat[i,] - t( X %*% (Hat%*%t(dat[i,])) )      
        } else stop("invalid out arg")
        return(output)
    }
}
counts <- function(x1,x2) {
    if(length(x1)==0) rep(0,length(x2)) else {
        ec = ecdf(x1+(1e-9)) #add small amount to make ecdf < rather than <=.
        out = 1-ec(x2)
        stopifnot(all(out>=0 & out<=1))
        round(out*length(x1)) # round since some numbers have very small imprecision
    }

    ##This is the slower way to do this function:
    #ret = vector()
    #for(i in 1:length(x2))  ret[i] <- sum(x1>=x2[i])
    #ret
}
monotonic <- function(x,increasing=TRUE) {
    if(length(x)>1) {
        if(increasing)  for(i in 2:length(x))  x[i] = max(x[i],x[i-1])
        if(!increasing) for(i in 2:length(x))  x[i] = min(x[i],x[i-1])
    }
    x
}

# qvalue calculator that doesn't use tcltk
edge.qvalue <- function(p,lambda = seq(0, 0.9, 0.05), pi0.method = "smoother",
                         fdr.level = NULL, robust = FALSE,smooth.df = 3, smooth.log.pi0 = FALSE, ...) {

  err.func <- "edge.qvalue"
  if (min(p) < 0 || max(p) > 1) {
    err.msg(err.func,"P-values not in valid range.")
    return(invisible(1))
  }
  if (length(lambda) > 1 && length(lambda) < 4) {
    err.msg(err.func,"If length of lambda greater than 1, you need at least 4 values.")
    return(invisible(1))
  }
  if (length(lambda) > 1 && (min(lambda) < 0 || max(lambda) >= 1)) {
    err.msg(err.func,"Lambda must be in [0,1).")
    return(invisible(1))
  }
  m <- length(p)
  if (length(lambda) == 1) {
    if (lambda < 0 || lambda >= 1) {
      err.msg(err.func,"Lambda must be in [0,1).")
      return(invisible(1))
    }
    pi0 <- mean(p >= lambda)/(1 - lambda)
    pi0 <- min(pi0, 1)
  } else {
    pi0 <- rep(0, length(lambda))
    for (i in 1:length(lambda)) {
      pi0[i] <- mean(p >= lambda[i])/(1 - lambda[i])
    }
    if (pi0.method == "smoother") {
      if (smooth.log.pi0){ 
        pi0 <- log(pi0)
        spi0 <- smooth.spline(lambda, pi0, df = smooth.df)
        pi0 <- predict(spi0, x = max(lambda))$y
      }
      if (smooth.log.pi0) {
        pi0 <- exp(pi0)
      }
      pi0 <- min(pi0, 1)
    }
    else if (pi0.method == "bootstrap") {
      minpi0 <- min(pi0)
      mse <- rep(0, length(lambda))
      pi0.boot <- rep(0, length(lambda))
      for (i in 1:100) {
        p.boot <- sample(p, size = m, replace = TRUE)
        for (i in 1:length(lambda)) {
          pi0.boot[i] <- mean(p.boot > lambda[i])/(1 - lambda[i])
        }
        mse <- mse + (pi0.boot - minpi0)^2
      }
      pi0 <- min(pi0[mse == min(mse)])
      pi0 <- min(pi0, 1)
    }
    else {
      err.msg(err.func,"'pi0.method' must be one of 'smoother' or 'bootstrap'")
      return(invisible(1))
    }
  }
  if (pi0 <= 0) {
    err.msg(err.func,"The estimated pi0 <= 0. Check that you have valid\np-values or use another lambda method.")
    return(invisible(1))
  }
  if (!is.null(fdr.level) && (fdr.level <= 0 || fdr.level > 1)) {
    err.msg(err.func,"'fdr.level' must be within (0,1].")
    return(invisible(1))
  }
  u <- order(p)
  qvalue.rank <- function(x) {
    idx <- sort.list(x)
    fc <- factor(x)
    nl <- length(levels(fc))
    bin <- as.integer(fc)
    tbl <- tabulate(bin)
    cs <- cumsum(tbl)
    tbl <- rep(cs, tbl)
    tbl[idx] <- tbl
    return(tbl)
  }
  v <- qvalue.rank(p)
  qvalue <- pi0 * m * p/v
  if (robust) {
    qvalue <- pi0 * m * p/(v * (1 - (1 - p)^m))
  }
  qvalue[u[m]] <- min(qvalue[u[m]], 1)
  for (i in (m - 1):1) {
    qvalue[u[i]] <- min(qvalue[u[i]], qvalue[u[i + 1]], 1)
  }
  if (!is.null(fdr.level)) {
    retval <- list(call = match.call(), pi0 = pi0, qvalues = qvalue, pvalues = p, fdr.level = fdr.level, significant = (qvalue <= fdr.level), lambda = lambda)
  }
  else {
    retval <- list(call = match.call(), pi0 = pi0, qvalues = qvalue, pvalues = p, lambda = lambda)
  }
  class(retval) <- "qvalue"
  return(retval)
}

err.msg <- function(err.func = "edge",msg) {
 cat('\n')
 cat('\t')
# cat('ERROR in the',err.func,'function: ','\n')
 cat('Problem in the',err.func,'function: ','\n')
 cat('\t',msg,'\n\n')
}

######################################################################
######################################################################

plotDMRs <- function(dmrs, Genome, cpg.islands, exposure, outfile="./dmr_plots.pdf", which_plot=1:50, which_lines=NULL, which_points=which_lines, ADD=3000, cols=c("black","blue","red","gray","brown","pink","orange"), legend.size=1, smoo="loess", SPAN=300, DELTA=36, point.info=FALSE, pch.groups=NULL, panel3="pvalues", G=NULL, seq=NULL) {

    args = dmrs$args
    if(panel3=="G") {      
        if(is.null(G)|is.null(seq)) stop("If panel3=G, then G and seq must be provided.")
        stopifnot(length(seq)==nrow(G))
        stopifnot(length(seq)==length(dmrs$pos))
        stopifnot(all(colnames(G)==colnames(dmrs$cleanp)))
        stopifnot(ncol(G)==nrow(args$mod))        
    }
    stopifnot(length(exposure)==ncol(dmrs$cleanp))
    stopifnot(inherits(Genome,"BSgenome"))
    stopifnot(is.data.frame(cpg.islands))
    stopifnot(all(c("chr","start","end")%in%colnames(cpg.islands)))
    ocpgi = cpg.islands
    dmrs$chr = as.character(dmrs$chr)
    ocpgi = ocpgi[ocpgi$chr%in%unique(dmrs$chr),]
    
    if("beta"%in%names(dmrs)) beta = dmrs$beta else beta = NULL
    if("sbeta"%in%names(dmrs)) sbeta = dmrs$sbeta else sbeta = NULL
    ##plot as continuous if analysis was of continuous exposure and exposure not recoded for plotting.  Otherwise plot as categorical:
    if(!is.null(beta) & all(exposure==args$mod[,args$coeff])) cont=TRUE else cont=FALSE
    if(!cont) {
        groups = as.numeric(factor(exposure))
        gNames = levels(factor(exposure))
        stopifnot(length(cols)>=length(unique(exposure)))
        if(!is.null(which_points)) if(!all(which_points%in%gNames)) stop("which_points not all in exposure")
        if(!is.null(which_lines)) if(!all(which_lines%in%gNames)) stop("which_lines not all in exposure")
        if(!is.null(pch.groups)) stopifnot(length(pch.groups)==length(exposure))
    }
    dmrs$dmrs$chr = as.character(dmrs$dmrs$chr)   

    if(panel3=="G") {
        #require(Biobase) #for rowMedians
        G = log2(preprocessCore::normalize.quantiles(G))
        # remove probe effects from G:
        G = G - rowMedians(G)
        # GC sequence effect correction:
        G = remove_gc_effect(G, seq)

        if(is.null(beta)) { #not !cont
            g1 = which(args$mod[,args$coeff]==1)
            rmP = 1:ncol(args$mod0)
            g0 = which(rowSums(args$mod[,-rmP,drop=FALSE])==0)
            mes = cbind(rowMedians(G[,g1]), rowMedians(G[,g0]))
            colsg = c(unique(cols[groups[g1]]), unique(cols[groups[g0]]))
            stopifnot(length(colsg)==2)
            stopifnot(nrow(mes)==length(dmrs$pos))           
        } else mes = rowCor(G,exposure)
    }

    if(!cont) {
        ## Legend:
        if(is.null(which_lines)|is.null(which_points)) indLeg = 1:length(gNames) else indLeg = which(gNames%in%union(which_lines,which_points))
        ## Lines:
        gIndexes=split(1:ncol(dmrs$cleanp),groups)
        if(is.null(which_lines)) indL = 1:length(gIndexes) else indL = which(gNames%in%which_lines)
        ## Points:
        if(is.null(which_points)) ind = 1:length(exposure) else ind = which(exposure%in%which_points)
        if(!is.null(pch.groups)) {
            thepch = as.numeric(factor(pch.groups))[ind]
            thepch.lb = tapply(thepch, as.character(pch.groups)[ind],unique)
        } else thepch = pch.groups #if null, keep NULL and it will plot numbers and letters.
    }
    
    ###################################################################
    cat("Making",min(length(which_plot),nrow(dmrs$dmrs)),"figures\n")
    pdf(outfile,width=11,height=8)
    for(i in intersect(which_plot,1:nrow(dmrs$dmrs))) {
      cat("Plotting DMR candidate",i,"\n")
      layout(matrix(1:3,ncol=1),heights=c(0.6,0.3,0.3))
      par(mar=c(0,3.6,1.1,1.1),oma=c(0,0,1,0),mgp=c(2.6,.5,0))

      Index = which(dmrs$chr==dmrs$dmrs$chr[i]) 
      Index = Index[which(dmrs$pos[Index]>=dmrs$dmrs$start[i]-ADD & dmrs$pos[Index]<=dmrs$dmrs$end[i]+ADD)]
      test  = Index[which(dmrs$pos[Index]>=dmrs$dmrs$start[i]     & dmrs$pos[Index]<=dmrs$dmrs$end[i]    )]
      if(length(Index)==0 | length(test)==0) stop("Region doesn't exist on array.")

      xx=dmrs$pos[Index]
      if(!cont) {
          cn = colnames(dmrs$cleanp)[ind]
          matplot(xx,dmrs$cleanp[Index,ind], col=cols[groups[ind]], cex=0.4,
                  xlim=range(xx), ylim=c(0,1), ylab="Methylation", xaxt="n", xlab="",
                  main=paste("DMR ",i," - ",dmrs$dmrs$chr[i],":",dmrs$dmrs$start[i],"-", 
                             dmrs$dmrs$end[i],sep=""),las=1, pch=thepch)
          for(j in indL) {
              yy=rowMeans(dmrs$cleanp[Index,gIndexes[[j]],drop=FALSE])
              if(smoo=="runmed") {
                  yy2 = runmed(yy,3,endrule="constant")
              } else {
                  fit1=loess(yy~xx,degree=1,span=SPAN/(DELTA*length(xx)),family="symmetric")
                  yy2 =fit1$fitted        
              }
              lines(xx,yy2,col=cols[as.numeric(names(gIndexes))[j]],lwd=2)
          }
          legend("topleft",gNames[indLeg],col=cols[indLeg],lty=1,lwd=2,cex=legend.size)
          if(!is.null(pch.groups)) legend("topright", pch=thepch.lb, legend=names(thepch.lb))
      } else {
          pc = rowCor(dmrs$cleanp[Index,],exposure)
          plot(xx,pc,ylim=c(-1,1), xlab="", xaxt="n", xlim=range(xx), las=1,
               ylab="Pearson correlation with phenotype",
               main=paste("DMR ",i," - ",dmrs$dmrs$chr[i],":",dmrs$dmrs$start[i],"-", dmrs$dmrs$end[i],sep=""))
          #plot(xx,beta[Index],cex=0.6,ylim=range(sbeta),xlab="",
          #     xaxt="n", xlim=range(xx), las=1,
          #     ylab="difference in logit(p) associated with having phenotype 1 unit larger",
          #     main=paste("DMR ",i," - ",dmrs$dmrs$chr[i],":",dmrs$dmrs$start[i],"-", dmrs$dmrs$end[i],sep=""))
          #lines(xx,sbeta[Index],lwd=2,lty=1)
          abline(h=0,lty=3)
      }
      abline(v=dmrs$dmrs$start[i], lty=2)
      abline(v=dmrs$dmrs$end[i], lty=2)

      ############
      ##PLOT CPG ISLANDS AND MCRBC RECOGNITION SITES:
      par(mar=c(0,3.6,0.5,1.1))
      plot_CpG(thechr=dmrs$dmrs$chr[i],xx=xx,ocpgi=ocpgi,Genome=Genome,mcrbc=TRUE)

      ##PLOT DIFFERENCE IN G, GENES, OR MANHATTAN PLOT:
      if(panel3=="G") {
          par(mar=c(5.1,3.6,0.5,1.1))
          if(is.null(beta)) {
              #matplot(xx, mes[Index,], col=colsg, ylab="Median corrected G", xlab="Location",
              #        ylim=c(-2,2), xlim=range(xx), type="b", las=1, pch=21)
              matplot(xx, mes[Index,1]-mes[Index,2], ylab="Difference in median G", xlab="Location",
                      ylim=c(-2.5,2.5), xlim=range(xx), type="b", las=1, pch=21)
          } else {
              matplot(xx, mes[Index], ylab="cor(corrected G, exposure)", xlab="Location",
                      ylim=c(-1,1), xlim=range(xx), type="b", las=1, pch=21)
          }
          abline(h=0,lty=3)
      } else {
          par(mar=c(4.1,3.6,0.5,1.1), mgp=c(1.6,.5,0))
          yy = -log10(dmrs$pval[Index])
          plot(xx,yy, ylab="-log10(pvalue)", xlab="Location",
               ylim=c(0,7), xlim=range(xx), las=1)
          if(smoo=="runmed") {
              yy2 = runmed(yy,3,endrule="constant")
          } else {
              fit1=loess(yy~xx,degree=1,span=SPAN/(DELTA*length(xx)),family="symmetric")
              yy2 =fit1$fitted        
          }
          lines(xx,yy2)
          #abline(h=-log10(.05),lty=2)
        }
    }
    dev.off()
    if(point.info & is.null(pch.groups) & !cont) {
        pts = c(1:9,0,letters,LETTERS)
        pts = rep(pts, ceiling(length(cn)/length(pts)))
        ret = cbind(cn, pts[1:length(cn)])
        colnames(ret) = c("sample","symbol")
        ret
    }
}

plotRegions <- function(thetable, cleanp, chr, pos, seq=NULL, Genome, cpg.islands, exposure, exposure.continuous=FALSE, outfile="./myregions.pdf", which_lines=NULL, which_points=which_lines, ADD=3000, cols=c("black","red","blue","gray","green","orange","brown"), legend.size=1, smoo="loess", SPAN=300, DELTA=36, panel3="none", G=NULL, grs=NULL) {

    if(is.factor(exposure)|!exposure.continuous) cont=FALSE else cont=TRUE  
    if(panel3=="G") {
        if(is.null(G)|is.null(grs)|is.null(seq)) stop("If panel3=G, then G ,grs and seq must be provided.")
        stopifnot(length(seq)==nrow(G))
        stopifnot(length(seq)==length(pos))
        stopifnot(all(colnames(G)==colnames(cleanp)))
        if(!cont) {
            stopifnot(length(grs)==2)
            stopifnot(all(grs%in%exposure))
        }
    }
    stopifnot(length(exposure)==ncol(cleanp))
    stopifnot(length(chr)==length(pos))
    stopifnot(length(chr)==nrow(cleanp))
    stopifnot(is.numeric(thetable$start))
    stopifnot(is.numeric(thetable$end))  
    stopifnot(inherits(Genome,"BSgenome"))
    stopifnot(is.data.frame(cpg.islands))
    stopifnot(all(c("chr","start","end")%in%colnames(thetable)))
    stopifnot(all(c("chr","start","end")%in%colnames(cpg.islands)))
    ocpgi = cpg.islands
    chr = as.character(chr)
    ocpgi = ocpgi[ocpgi$chr%in%unique(chr),]
    thetable$chr = as.character(thetable$chr)
          
    if(!cont) {
        ### for plots, this defines the groups to be shown in color.
        groups = as.numeric(factor(exposure))
        gNames = levels(factor(exposure))
        if(!is.null(which_points)) if(!all(which_points%in%gNames)) stop("which_points not all in exposure")
        if(!is.null(which_lines)) if(!all(which_lines%in%gNames)) stop("which_lines not all in exposure")
        if(length(cols)<length(unique(exposure))) stop("Not as many colors as groups.")
    }  

    if(panel3=="G") {
        #require(Biobase) #for rowMedians
        G = log2(preprocessCore::normalize.quantiles(G))
        # remove probe effects from G:
        G = G - rowMedians(G)
        # GC sequence effect correction:
        G = remove_gc_effect(G, seq)

        if(!cont) {
            g1 = which(exposure==grs[1])
            g0 = which(exposure==grs[2])        
            mes = cbind(rowMedians(G[,g1]), rowMedians(G[,g0]))
            colsg = c(unique(cols[groups[g1]]), unique(cols[groups[g0]]))
            stopifnot(length(colsg)==2)
            stopifnot(nrow(mes)==length(pos)) #pos is dmr$pos, defined above
        } else mes = rowCor(G,exposure)
    }

    if(!cont) {
        ## Legend:
        if(is.null(which_lines)|is.null(which_points)) indLeg = 1:length(gNames) else indLeg = which(gNames%in%union(which_lines,which_points))
        ## Lines:
        gIndexes=split(1:ncol(cleanp),groups)
        if(is.null(which_lines)) indL = 1:length(gIndexes) else indL = which(gNames%in%which_lines)
        ## Points:
        if(is.null(which_points)) ind = 1:length(exposure) else ind = which(exposure%in%which_points)
    }

    ###################################################################
    cat("Making",nrow(thetable),"figures\n")
    pdf(outfile,width=11,height=8)
    for(i in 1:nrow(thetable)) {
        if(panel3=="G") {
            layout(matrix(1:3,ncol=1),heights=c(0.6,0.3,0.3))
        } else layout(matrix(1:2,ncol=1),heights=c(0.6,0.3))
        par(mar=c(0,3.6,1.1,1.1),oma=c(0,0,1,0),mgp=c(2.6,.5,0))

        ##PLOT REGIONS:
        Index = which(chr==thetable$chr[i])
        Index = Index[which(pos[Index]>=thetable$start[i]-ADD & pos[Index]<=thetable$end[i]+ADD)]
        test  = Index[which(pos[Index]>=thetable$start[i]     & pos[Index]<=thetable$end[i]    )] 
        if(length(Index)==0 | length(test)==0) {
            message("Region ",i," not on the array.")
            next
        }

        xx=pos[Index]
        if(!cont) {
            matplot(xx,cleanp[Index,ind],col=cols[groups[ind]], cex=0.4,
                    xlim=range(xx), ylim=c(0,1), ylab="Methylation", xaxt="n", xlab="",
                    main=paste("Region ",i," - ",thetable$chr[i],":",thetable$start[i],"-", 
                               thetable$end[i],sep=""),las=1)
            for(j in indL){
                yy=rowMeans(cleanp[Index,gIndexes[[j]],drop=FALSE])
                if(smoo=="runmed") {
                    yy2 = runmed(yy,3,endrule="constant")
                } else {
                    fit1=loess(yy~xx,degree=1,span=SPAN/(DELTA*length(xx)),family="symmetric")
                    yy2 =fit1$fitted        
                }
                lines(xx,yy2,col=cols[as.numeric(names(gIndexes))[j]],lwd=2)
            }
            legend("topleft",gNames[indLeg],col=cols[indLeg],lty=1,lwd=2,cex=legend.size)
        } else {
            pc = rowCor(cleanp[Index,],exposure)
            plot(xx,pc,ylim=c(-1,1), xlab="", xaxt="n", xlim=range(xx), las=1,
                 ylab="Pearson correlation with phenotype",
                 main=paste("Region ",i," - ",thetable$chr[i],":",thetable$start[i],"-", thetable$end[i],sep=""))
            #plot(xx,beta[Index],cex=0.6,ylim=range(sbeta),xlab="",
            #     xaxt="n", xlim=range(xx), las=1,
            #     ylab="difference in logit(p) associated with having phenotype 1 unit larger",
            #     main=paste("Region ",i," - ",Dmrs$chr[i],":",Dmrs$start[i],"-", Dmrs$end[i],sep=""))
            #lines(xx,sbeta[Index],lwd=2,lty=1)
            abline(h=0,lty=3)
        }      
        abline(v=thetable$start[i], lty=2)
        abline(v=thetable$end[i], lty=2)

        ############
        ##PLOT CPG ISLANDS AND MCRBC RECOGNITION SITES:
        if(panel3=="G") {
            plot_CpG(thechr=thetable$chr[i],xx=xx,ocpgi=ocpgi,Genome=Genome,mcrbc=TRUE)
        } else {
            par(mar=c(4.1,3.6,0.5,1.1))
            plot_CpG(thechr=thetable$chr[i],xx=xx,ocpgi=ocpgi,Genome=Genome,mcrbc=TRUE,
                     xaxt="s",xlab="Location")
        }
        ##PLOT DIFFERENCE IN G:
        if(panel3=="G") {
            par(mar=c(4.1,3.6,0.5,1.1))
            if(!cont) {
                #matplot(xx, mes[Index,], col=colsg, ylab="Median corrected G", xlab="Location",
                #        ylim=c(-2,2), xlim=range(xx), type="b", las=1, pch=21)
                matplot(xx, mes[Index,1]-mes[Index,2], ylab="Difference in median G", xlab="Location",
                        ylim=c(-2.5,2.5), xlim=range(xx), type="b", las=1, pch=21)
            } else {
                matplot(xx, mes[Index], ylab="cor(corrected G, exposure)", xlab="Location",
                        ylim=c(-1,1), xlim=range(xx), type="b", las=1, pch=21)
            }
            abline(h=0,lty=3)
        }
    }
    dev.off()
}

###Remove sequence effect:
remove_gc_effect <- function(M, seq) {
    #require(Biostrings)
    stopifnot(nrow(M)==length(seq))
    #dset = DNAStringSet(seq)
    freq = alphabetFrequency(seq)
    gccontent = rowSums(freq[,c("C", "G")]) / seq@ranges@width
    gc = round(gccontent*100); gc[gc<=20]<-20; gc[gc>=80]<-80
    
    sapply(1:ncol(M),function(i) {
      cat(".")
      meds=tapply(M[,i],gc,median)
      mads=tapply(M[,i],gc,mad)
      Index=match(gc,as.numeric(names(meds)))
      bias=meds[Index]
      sds=mads[Index]
      return((M[,i]-bias)/sds*mad(M[,i]))
    })
}

plot_CpG <- function(thechr,xx,ocpgi,Genome,mcrbc=TRUE,xlab="",xaxt="n") {
    #Genome should be a BSgenome object, e.g., Hsapiens, Mmusculus, Rnorvegicus, Mmulatta, etc.
      ##PLOT CPG ISLANDS AND MCRBC RECOGNITION SITES:
      seq<-Genome[[thechr]]
      Index2=c(min(xx),max(xx))
      if(Index2[1]<1 | Index2[2]>length(seq)) stop("position outside length of chromosome.")
      subseq<-subseq(seq,start=Index2[1],end=Index2[2])
      cpgs=start(matchPattern("CG",subseq))+Index2[1]-1
      ##Find the C in ACG or GCG if mcrbc=TRUE:
      if(mcrbc) Mcrbc=sort(c(start(matchPattern("ACG",subseq))+Index2[1],
                             start(matchPattern("GCG",subseq))+Index2[1]))
      cuts=seq(Index2[1],Index2[2],8)
      scpgs=cut(cpgs,cuts,include.lowest=TRUE)
      x = (cuts[1:(length(cuts)-1)]+cuts[2:(length(cuts))])/2
      y = table(scpgs)/8
      SPAN2=400/diff(range(x))
      d = loess(y~x,span=SPAN2,degree=1)
      plot(x,d$fitted,type="l",ylim=c(0,0.15),xlab=xlab,xaxt=xaxt,
           ylab="CpG density",xlim=range(xx),las=1)
      Rug(cpgs)
      if(mcrbc) Rug(Mcrbc,side=3)
      Index1 = which(ocpgi[,"chr"]==as.character(thechr) &
                   ((ocpgi[,"start"] > min(xx) & ocpgi[,"start"]< max(xx)) |
                    (ocpgi[,"end"  ] > min(xx) & ocpgi[,"end"]  < max(xx))))
      if(length(Index1)>0)
          sapply(Index1,function(j) Rug(unlist(ocpgi[j,c("start","end")]),col="red",lwd=3,side=1))
}

######################################################################
######################################################################

logit <- function(x) {
    if ("ff_matrix" %in% class(x) ) {
        ret2 <- ff(vmode="double", dim=dim(x))
        for(cm in 1:ncol(x)) ret2[,cm] = log(x[,cm])-log(1-x[,cm])
        colnames(ret2) = colnames(x)
        rownames(ret2) = rownames(x)
        ret2
    } else log(x)-log(1-x)
}
ilogit <- function(x) {
    if ("ff_matrix" %in% class(x) ) {
        ret2 <- ff(vmode="double", dim=dim(x))
        for(cm in 1:ncol(x)) ret2[,cm] = 1/(1+exp(-x[,cm]))
        colnames(ret2) = colnames(x)
        rownames(ret2) = rownames(x)
        ret2
    } else 1/(1+exp(-x))
}

Min <- function(x) {
    if ("ff_matrix" %in% class(x) ) {
        ret2 = rep(NA,ncol(x))
        for(cm in 1:ncol(x)) ret2[cm] = min(x[,cm])
        min(ret2)
    } else min(x)
}
Max <- function(x) {
    if ("ff_matrix" %in% class(x) ) {
        ret2 = rep(NA,ncol(x))
        for(cm in 1:ncol(x)) ret2[cm] = max(x[,cm])
        max(ret2)
    } else max(x)
}

getSegments <- function(x,factor,cutoff=quantile(abs(x),0.99),verbose=TRUE){

  Indexes=split(seq(along=x),factor)
  regionID=vector("numeric",length(x))
  LAST = 0
  
  segmentation = vector("numeric", length(x))
  type = vector("numeric", length(x))

  for (i in seq(along = Indexes)) {
    if (verbose) if (i%%1000 == 0) cat(".")
    Index = Indexes[[i]]
    y = x[Index]
    z = sign(y) * as.numeric(abs(y) > cutoff)
    w = cumsum(c(1, diff(z) != 0)) + LAST
    segmentation[Index] = w
    type[Index] = z
    LAST = max(w)
  }
  if(verbose) cat("\n")
  ##add a vector of the pns
  res=list(upIndex=split(which(type>0),segmentation[type>0]),
    dnIndex=split(which(type<0),segmentation[type<0]),
    zeroIndex=split(which(type==0),segmentation[type==0]))
  names(res[[1]])<-NULL
  names(res[[2]])<-NULL
  names(res[[3]])<-NULL
  return(res)
}


##you can pass cutoff through the ...
regionFinder<-function(x,regionNames,chr,position,y=x,
                       summary=mean,ind=seq(along=x),order=TRUE,oneTable=TRUE,
                       ...){
  if(is.unsorted(order(chr,position))) stop("regionFinder inputs must be in order by genomic location.")

  Indexes=getSegments(x[ind],regionNames[ind],...)
  res=vector("list",2)
  for(i in 1:2){
    res[[i]]=data.frame(chr=sapply(Indexes[[i]],function(Index) chr[ind[Index[1]]]),
         start=sapply(Indexes[[i]],function(Index) min(position[ind[Index]])),
         end=sapply(Indexes[[i]],function(Index) max(position[ind[Index]])),
         value=sapply(Indexes[[i]],function(Index) summary(y[ind[Index]])),
         area=sapply(Indexes[[i]],function(Index) abs(sum(y[ind[Index]]))),
         pns=sapply(Indexes[[i]],function(Index) regionNames[ind[Index]][1]),
         indexStart=sapply(Indexes[[i]],function(Index) min(ind[Index])),
         indexEnd=sapply(Indexes[[i]],function(Index) max(ind[Index])))
    res[[i]]$nprobes=res[[i]]$indexEnd-res[[i]]$indexStart+1
    #res[[i]]$length=res[[i]]$end-res[[i]]$start+0 #could add 50 to go to end of last probe
  }
  names(res)=c("up","dn")
  if(order & !oneTable){
    if(nrow(res$up)>0) res$up=res$up[order(-res$up$area),]
    if(nrow(res$dn)>0) res$dn=res$dn[order(-res$dn$area),]
  }
  if(oneTable){
    res=rbind(res$up,res$dn)
    if(order & nrow(res)>0) res=res[order(-res$area),]
  }
  return(res)
}

regionMatch <- function(object1, object2, verbose=TRUE) {
    #require(IRanges)
    m1 = object1; m2 = object2 #keep arguments named object1,object2 for backward compatibility
    stopifnot(all(c("chr","start","end")%in%colnames(m1)))
    stopifnot(all(c("chr","start","end")%in%colnames(m2)))
    if(any(is.na(m1[,c("chr","start","end")])) | any(is.na(m2[,c("chr","start","end")]))) stop("No missing values allowed in chr, start, or end columns.")
    m1$chr = as.character(m1$chr)
    m2$chr = as.character(m2$chr)
    stopifnot(is.numeric(m1$start) & is.numeric(m1$end))
    stopifnot(is.numeric(m2$start) & is.numeric(m2$end))
    stopifnot(all(m1$end>=m1$start))
    stopifnot(all(m2$end>=m2$start))
              
    ret = matrix(NA,nrow=nrow(m1),ncol=7)
    colnames(ret) = c("dist","matchIndex","type","amountOverlap","insideDist","size1","size2")
    sp1 = split(1:nrow(m1), m1$chr)
    sp2 = split(1:nrow(m2), m2$chr)          
    for(i in names(sp1)) {
        if(verbose) cat(i," ")
        inds1 = sp1[[i]]
        if(i %in% names(sp2)) {
            inds2 = sp2[[i]]
            m1IR = IRanges(start=m1$start[inds1],end=m1$end[inds1])
            m2IR = IRanges(start=m2$start[inds2],end=m2$end[inds2])
            ret[inds1,"matchIndex"] = inds2[ nearest(m1IR,m2IR) ]
        } else ret[inds1,"matchIndex"] = NA
    }
    if(verbose) cat("\n")
    inds = ret[,"matchIndex"]
    m2b = m2[inds,]
    ## if all(is.na(inds)) & length(inds)<nrow(m2), R returns a matrix (of all NAs) with nrow(m2) rows instead of length(inds), so have to do this:
    if(all(is.na(inds)) & length(inds)<nrow(m2)) m2b = m2b[1:length(inds),]
    ret = data.frame(ret, stringsAsFactors=FALSE)
    
    ret$type[(m1$start>m2b$start & m1$end<=m2b$end) |
             (m1$start>=m2b$start & m1$end<m2b$end)] = "inside"
    ret$type[m1$start<=m2b$start & m1$end>=m2b$end] = "cover"
    ret$type[m1$start>m2b$end] = "disjointR"
    ret$type[m1$end<m2b$start] = "disjointL"
    ret$type[is.na(ret$matchIndex)] = "disjoint"
    ret$type[(m1$start>m2b$start & m1$start<=m2b$end) & m1$end>m2b$end] = "overlapR"    
    ret$type[m1$start<m2b$start & (m1$end>=m2b$start & m1$end<m2b$end)] = "overlapL"

    ret$dist = 0
    ret$dist[ret$type=="disjoint"]  = NA
    ret$dist[ret$type=="disjointR"] = m2b$end[ret$type=="disjointR"] - m1$start[ret$type=="disjointR"]
    ret$dist[ret$type=="disjointL"] = m2b$start[ret$type=="disjointL"] - m1$end[ret$type=="disjointL"]
    ret$amountOverlap[ret$type=="overlapR"] = -1*(m2b$end[ret$type=="overlapR"]-m1$start[ret$type=="overlapR"]+1)
    ret$amountOverlap[ret$type=="overlapL"] = m1$end[ret$type=="overlapL"]-m2b$start[ret$type=="overlapL"]+1
    ret$type[ret$type%in%c("disjointR","disjointL")] = "disjoint"
    ret$type[ret$type%in%c("overlapR","overlapL")] = "overlap"

    ## insideDist column:
    insideIndex = ret$type=="inside" #no missing ret$type at this point
    tmp0 = cbind(m1$end[insideIndex]  -m2b$end[insideIndex],
                 m1$start[insideIndex]-m2b$start[insideIndex])
    tmp = apply(abs(tmp0),1,which.min)
    tmpinsidedist = tmp0[,1]
    tmpIndex = tmp==2
    tmpinsidedist[tmpIndex] = tmp0[tmpIndex,2]
    ret$insideDist[insideIndex] = tmpinsidedist

    ## size1 and size2 columns:
    ret$size1 = m1$end -m1$start +1
    ret$size2 = m2b$end-m2b$start+1
    
    ret
}

clusterMaker <- function(chr,pos,order.it=TRUE,maxGap=300) {
  nonaIndex=which(!is.na(chr) & !is.na(pos))
  Indexes=split(nonaIndex,chr[nonaIndex])
  clusterIDs=rep(NA,length(chr))
  LAST=0
  for(i in seq(along=Indexes)){
    Index=Indexes[[i]]
    x=pos[Index]

    if(order.it){ Index=Index[order(x)];x=pos[Index] }

    y=as.numeric(diff(x)>maxGap)
    z=cumsum(c(1,y))
    clusterIDs[Index]=z+LAST
    LAST=max(z)+LAST
  }
  clusterIDs
}  

Try the charm package in your browser

Any scripts or data that you put into this service are public.

charm documentation built on May 6, 2019, 2:29 a.m.