# R/VMR_Finder.R

###############################
# VMR Finder
# http://biostatistics.oxfordjournals.org/content/early/2011/06/17/biostatistics.kxr013.full
# Andrew Jaffe
# Updated 10/24/2011
###############################

# p = methylation matrix of Beta values: M probes down the rows by N samples across columns
# coi = covariate of interest: a vector of 1's for one-sample VMRs, or the respective 
#		factor labels for two-sample dVMRs
# G = vector (length M) of median probe intensities. Or a matrix of intensities for each sample.
#		This is the green channel for CHARM
# pns = vector (length M) of probe group IDs (for smoothing and region finding).
#		generated from clusterMaker(chr,pos,maxGap). 
# chr = vector (length M) of chromosome information for each probe
# pos = vector (length M) of position information for each probe
# WBP = smoothing span parameter. For each probe group: 
#		span = WBP/median_probe_spacing/num_probes_in_group
# cutoffQuant = quantile of smoothed variance statistic used to define regions
# n_boot = number of bootstrap permutations 
# batches = NULL for no batch correction, or a batch variable (like processing date).
#			If provided, strip plots of the principal components, grouped by batch,
#			guide how many PCs to remove; the number is entered at a prompt
#			(see the usage sketch below)
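
# Illustrative usage sketch (hypothetical objects, not run on source): `p`, `chr`,
# `pos`, `G`, and the phenotype table `pheno` are assumed to come from your own
# CHARM preprocessing; probe groups are built with clusterMaker() defined below.
if(FALSE) {
	pns = clusterMaker(chr, pos, maxGap = 300)

	# one-sample VMRs: constant covariate of interest
	vmrs = vmrFinder(p, coi = rep(1, ncol(p)), G = G, pns = pns,
		chr = chr, pos = pos, WBP = 600, cutoffQuant = 0.99)

	# two-group dVMRs with optional batch correction (e.g. processing date)
	dvmrs = vmrFinder(p, coi = pheno$group, G = G, pns = pns,
		chr = chr, pos = pos, batches = pheno$date)
}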

vmrFinder = function(p, coi, G, pns, chr, pos,WBP=600,cutoffQuant=0.99,
	n_boot = 200, batches=NULL) {
	
	if(length(grep("chr", chr)) == 0) chr = paste("chr", chr, sep="") # add "chr" prefix if missing
	
	# load libraries
	
	pckg = try(require(corpcor))
	if(!pckg) {
		cat("Installing 'corpcor' from CRAN\n")

		getPckg("corpcor")
		require(corpcor)
	}

	pckg = try(require(limma))
	if(!pckg) {
		cat("Installing 'limma' from Bioconductor\n")

		source("http://bioconductor.org/biocLite.R")
		biocLite("limma")
		require("limma")
	}
	
	pckg = try(require(Biobase))
	if(!pckg) {
		cat("Installing 'Biobase' from Bioconductor\n")
		source("http://bioconductor.org/biocLite.R")
		biocLite("Biobase")
		require("Biobase")
	}

	if(is.matrix(G)) G = rowMedians(G)
	
	# drop sex chr
	Index = which(!chr %in% c("chrX","chrY","chrM")) #added mitochondrial chr 
	p=p[Index,]; chr=chr[Index]; pos=pos[Index];pns=pns[Index]
	if(!is.null(G)) G = G[Index]

	
	type = length(unique(coi)) 
	
	stopifnot(type %in% c(1,2)) 
	
	vmrType = ifelse(type == 1, "vmr","dvmr")
	

	# one sample VMRs
	if(type == 1) {
		
		vmrName = unique(coi)
		if(is.character(vmrName)) vmrName = paste("group",vmrName,sep="")
		
		cleanp = p
		
		if(!is.null(batches)) {
			y = logit(p)
			d = factor(batches)
			mm=rowMedians(y)
			svd=fast.svd(y-mm, tol=0)
			
			len = ceiling(sqrt(ncol(y)))
			nr = min(c(len,4))
			mypar(nr, nr)
			
			for(k in 1:min(c(nr^2,ncol(y)))) {
				stripchart(svd$v[,k] ~ d,
					method = "jitter", vertical = TRUE,
					ylab = paste("pc",k))
			}
			
			cat("How many PCs to remove? Enter Number: ")
			nsv = scan(n=1)
				
			dd = svd$d
			if(as.numeric(nsv) > 0) dd[1:as.numeric(nsv)] = 0 # zero out the first nsv PCs (guards against nsv = 0)
			y=svd$u%*%diag(dd)%*%t(svd$v)
			cleanp = ilogit(y + mm)
		}
		
	
		# smooth out the effect of G
		s = rowMads(cleanp)
		fit = loessFit(log(s), log2(G), span = 0.05)
		newmad = fit$residuals

		sf = loessSmoothStat(newmad, pns,pos,WBP = WBP)
		#cutoff = quantile(sf, cutoffQuant) 		#original code
		cutoff = quantile(sf, cutoffQuant,na.rm=TRUE) #SE edit
		vmrs = regionFinderPos(sf, pns, chr,
					pos, cutoff = cutoff)
		
		stat = newmad
	} 

	if(type == 2) {
		cat("dVMR Finder\n")
		Indexes = splitit(coi)
		coiNames = names(Indexes)
		vmrName = paste(coiNames,collapse = "_minus_")

		cleanp = p
		pList = list()

		if(!is.null(batches)) {
		
			cat("Look at boxplots and remove batches at the prompts\n")

		
			for(i in seq(along=Indexes)) {
				y = logit(p[,Indexes[[i]]])
				# mod =  matrix(rep(1), nc = 1, nr = ncol(y))
				# nsv = num.sv(y,mod,method="be")$n
				d = factor(batches[Indexes[[i]]])
				mm=rowMedians(y)
				svd=fast.svd(y-mm, tol=0)
			
				len = ceiling(sqrt(ncol(y)))
				nr = min(c(len,4))
				mypar(nr, nr)
				
				for(k in 1:min(c(nr^2,ncol(y)))) {
					stripchart(svd$v[,k] ~ d,
					# stripchart(svd$v[,k],
						method = "jitter", vertical = TRUE,
						ylab = paste("pc",k))
				}
				
				# input = file("stdin")
				# nsv = print(readLines(n=1, ok=FALSE))
	
				# close(input)
				# nsv = readline(prompt="How many PCs to remove? Enter Number: ")
				cat("How many PCs to remove? Enter Number: ")
				nsv = scan(n=1)
				
				dd = svd$d
				if(as.numeric(nsv) > 0) dd[1:as.numeric(nsv)] = 0 # zero out the first nsv PCs (guards against nsv = 0)
				y=svd$u%*%diag(dd)%*%t(svd$v)
				pList[[i]] = ilogit(y + mm)
				cleanp[,Indexes[[i]]] = pList[[i]]
				
			}
		} else {
			pList[[1]] = cleanp[,Indexes[[1]]]
			pList[[2]] = cleanp[,Indexes[[2]]]
		}
			

		mm = rowMedians(cleanp)
		p1 = pList[[1]] - mm
		p2 = pList[[2]] - mm 
		
		
		# smooth out the effect of G
		s1 = rowMads(p1)
		fit = loessFit(log(s1), log2(G), span = 0.05)
		newmad1 = fit$residuals

		s2 = rowMads(p2)		
		fit = loessFit(log(s2), log2(G), span = 0.05)
		newmad2 = fit$residuals

		lev = newmad1 - newmad2
		levf = loessSmoothStat(lev, pns, pos, WBP=WBP)
		cat("\n")
		cutoff = quantile(abs(levf),cutoffQuant,na.rm=TRUE) #added na.rm SE
		cat("Finding VMRs\n")
		vmrs = regionFinder(levf, pns, chr,
			pos, cutoff = cutoff,verbose=FALSE)
		stat = lev
	}	
	
	# generic bootstrapping (optional significance step, currently disabled)
	# null=arimaBootSmoothStat(stat,n_boot,pns,pos,WBP) 
	# nullareas = bootArea(null, cutoff=cutoff, pns, chr, pos, type = vmrType)
	# vmrs$pval = edge.pvalue(vmrs$area, nullareas)
	# vmrs$fdr = edge.qvalue(vmrs$pval,lambda=0)$qval

	# sig = vmrs[vmrs$fdr < 0.05,]
	
	#return(sig)
	return(vmrs)
}

logit=function(x) log(x)-log(1-x)
ilogit=function(x) 1/(1+exp(-x))

# generates null areas
bootArea <- function(null, cutoff, pns, chr, pos, type="vmr") {

	area.dat <- list()
	for(j in 1:ncol(null)) {
		cat(j,",")
		if(type == "vmr") {
			hold.dat <- regionFinderPos(null[,j], pns,
					chr, pos, cutoff=cutoff, verbose=FALSE)
		}
		if(type %in% c("dmr","dvmr")) {
			hold.dat <- regionFinder(null[,j], pns,
					chr, pos, cutoff=cutoff, verbose=FALSE)
		}
		
		area.dat[[j]] <- hold.dat$area
	}
	area.dat <- unlist(area.dat)
	return(area.dat)
}

# via Rafael Irizarry
# row-wise MADs, allowing NAs in the computation
rowMads <- function(x, constant = 1.4826) {
	constant * Biobase::rowMedians(abs(x - Biobase::rowMedians(x, na.rm = TRUE)), na.rm = TRUE)
}

# via Rafael Irizarry
getSegments <- function(x,factor,cutoff=quantile(abs(x),0.99),verbose=TRUE){

  Indexes=split(seq(along=x),factor)
  regionID=vector("numeric",length(x))
  LAST = 0
  
  segmentation = vector("numeric", length(x))
  type = vector("numeric", length(x))

  for (i in seq(along = Indexes)) {
    if (verbose) if (i%%1000 == 0) cat(".")
    Index = Indexes[[i]]
    y = x[Index]
    z = sign(y) * as.numeric(abs(y) > cutoff)
    w = cumsum(c(1, diff(z) != 0)) + LAST
    segmentation[Index] = w
    type[Index] = z
    LAST = max(w)
  }
  if(verbose) cat("\n")
  ##add a vector of the pns
  res=list(upIndex=split(which(type>0),segmentation[type>0]),
    dnIndex=split(which(type<0),segmentation[type<0]),
    zeroIndex=split(which(type==0),segmentation[type==0]))
  names(res[[1]])<-NULL
  names(res[[2]])<-NULL
  names(res[[3]])<-NULL
  return(res)
}

# via Rafael Irizarry
clusterMaker <- function(chr,pos,order.it=TRUE,maxGap=300){
  nonaIndex=which(!is.na(chr) & !is.na(pos))
  Indexes=split(nonaIndex,chr[nonaIndex])
  clusterIDs=rep(NA,length(chr))
  LAST=0
  for(i in seq(along=Indexes)){
    Index=Indexes[[i]]
    x=pos[Index]

    if(order.it){ Index=Index[order(x)];x=pos[Index] }

    y=as.numeric(diff(x)>maxGap)
    z=cumsum(c(1,y))
    clusterIDs[Index]=z+LAST
    LAST=max(z)+LAST
  }
  clusterIDs
}  
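
# Toy example (illustrative only): probes more than maxGap bp apart start a new cluster.
# clusterMaker(chr = rep("chr1", 6),
#              pos = c(100, 200, 350, 1000, 1100, 5000),
#              maxGap = 300)
# gives cluster IDs 1 1 1 2 2 3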

# via Rafael Irizarry
##you can pass cutoff through the ...
regionFinder<-function(x,regionNames,chr,position,y=x,
                       summary=mean,ind=seq(along=x),order=TRUE,oneTable=TRUE,
                       ...){

  Indexes=getSegments(x[ind],regionNames[ind],...)
  
  res=vector("list",2)
  for(i in 1:2){
    res[[i]]=data.frame(chr=sapply(Indexes[[i]],function(Index) chr[ind[Index[1]]]),
         start=sapply(Indexes[[i]],function(Index) min(position[ind[Index]])),
         end=sapply(Indexes[[i]],function(Index) max(position[ind[Index]])),
         value=sapply(Indexes[[i]],function(Index) summary(y[ind[Index]])),
         area=sapply(Indexes[[i]],function(Index) abs(sum(y[ind[Index]]))),
         pns=sapply(Indexes[[i]],function(Index) regionNames[ind[Index]][1]),
         indexStart=sapply(Indexes[[i]],function(Index) min(ind[Index])),
         indexEnd=sapply(Indexes[[i]],function(Index) max(ind[Index])))
    res[[i]]$L=res[[i]]$indexEnd-res[[i]]$indexStart+1
  }
  names(res)=c("up","dn")
  if(order & !oneTable){
    if(nrow(res$up)>0) res$up=res$up[order(-res$up$area),]
    if(nrow(res$dn)>0) res$dn=res$dn[order(-res$dn$area),]
  }
  if(oneTable){
    res=rbind(res$up,res$dn)
    if(order & nrow(res)>0) res=res[order(-res$area),]
  }
  return(res)
}
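
# Small illustration (hypothetical values): with cutoff = 1, the run of values above 1
# (positions 300-400) and the run below -1 (positions 600-700) each become one region.
# x = c(0.1, 0.2, 1.5, 1.8, 0.1, -2.0, -1.7, 0.0)
# regionFinder(x, regionNames = rep(1, 8), chr = rep("chr1", 8),
#	position = seq(100, 800, by = 100), cutoff = 1)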



# loess smoothing of statistic
loessSmoothStat <- function(stat,pns,pos,WBP=600) {
	require(limma)
	stat.smooth <- stat
	Indexes=split(seq(along=pns),pns)
	for(i in seq(along=Indexes)){
		if(i%%10000 == 0) cat(".")
		Index=Indexes[[i]]
		lens=median(diff(pos[Index]))
		spans=WBP/lens/length(Index) ##this is actually span for loess
		if(length(Index) > 3 & lens > 0) {
		    stat.smooth[Index] <- loessFit(stat[Index],pos[Index], span = spans)$fitted
		} else stat.smooth[Index] = median(stat[Index])
	}
	return(stat.smooth)
}
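
# Worked example of the span calculation above (illustrative numbers): a probe group
# with 20 probes and a median spacing of 35 bp gets span = 600/35/20 ~ 0.86, i.e.
# roughly 86% of the probes in the group contribute to each local loess fit.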

##you can pass cutoff through the ...
# doesn't take abs(area)
regionFinderPos <- function(x,regionNames,chr,position,y=x,
                       summary=mean,ind=seq(along=x),order=TRUE,
                       ...){

  tmp = regionFinder(x=x,regionNames=regionNames,
		chr=chr,position=position,y=y,
		summary=summary,ind=ind,order=order,oneTable=FALSE,
		...)

  out = tmp[["up"]]
  return(out)
}
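
# err.msg() is called by edge.pvalue()/edge.qvalue() below but is not defined in
# this file (the EDGE software provides its own); a minimal stand-in is assumed here.
err.msg <- function(err.func = "error", msg) cat(paste(err.func, ": ", msg, "\n", sep = ""))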

# via Jeffrey Leek
# pvalue calculator
edge.pvalue <- function(stat, stat0, pool=TRUE) {
  err.func <- "edge.pvalue"
  m <- length(stat)
  if(pool==TRUE) {
    if(is.matrix(stat0)) {stat0 <- as.vector(stat0)}
    m0 <- length(stat0) 
    v <- c(rep(T, m), rep(F, m0))
    v <- v[order(c(stat,stat0), decreasing = TRUE)]
    u <- 1:length(v)
    w <- 1:m
    p <- (u[v==TRUE]-w)/m0
    p <- p[rank(-stat)]
    p <- pmax(p,1/m0)
  } else {
    if(is.vector(stat0)) {
      err.msg(err.func,"stat0 must be a matrix.")
      return(invisible(1))
    }
    if(ncol(stat0)==m) {stat0 <- t(stat0)}
    if(nrow(stat0)!=m){
      err.msg(err.func,"Number of rows of stat0 must equal length of stat.")
      return(invisible(1))
    }
    stat0 <- (stat0 - matrix(rep(stat,ncol(stat0)),byrow=FALSE,nrow=m)) >= 0
    p <- apply(stat0,1,mean)
    p <- pmax(p,1/ncol(stat0))
  }
  return(p)
}

# via Jeffrey Leek
# qvalue calculator that doesn't use tcltk
edge.qvalue <- function(p,lambda = seq(0, 0.9, 0.05), pi0.method = "smoother",
                         fdr.level = NULL, robust = FALSE,smooth.df = 3, smooth.log.pi0 = FALSE, ...) {

  err.func <- "edge.qvalue"
  if (min(p) < 0 || max(p) > 1) {
    err.msg(err.func,"P-values not in valid range.")
    return(invisible(1))
  }
  if (length(lambda) > 1 && length(lambda) < 4) {
    err.msg(err.func,"If length of lambda greater than 1, you need at least 4 values.")
    return(invisible(1))
  }
  if (length(lambda) > 1 && (min(lambda) < 0 || max(lambda) >= 1)) {
    err.msg(err.func,"Lambda must be in [0,1).")
    return(invisible(1))
  }
  m <- length(p)
  if (length(lambda) == 1) {
    if (lambda < 0 || lambda >= 1) {
      err.msg(err.func,"Lambda must be in [0,1).")
      return(invisible(1))
    }
    pi0 <- mean(p >= lambda)/(1 - lambda)
    pi0 <- min(pi0, 1)
  } else {
    pi0 <- rep(0, length(lambda))
    for (i in 1:length(lambda)) {
      pi0[i] <- mean(p >= lambda[i])/(1 - lambda[i])
    }
    if (pi0.method == "smoother") {
      if (smooth.log.pi0) {
        pi0 <- log(pi0)
      }
      spi0 <- smooth.spline(lambda, pi0, df = smooth.df)
      pi0 <- predict(spi0, x = max(lambda))$y
      if (smooth.log.pi0) {
        pi0 <- exp(pi0)
      }
      pi0 <- min(pi0, 1)
    }
    else if (pi0.method == "bootstrap") {
      minpi0 <- min(pi0)
      mse <- rep(0, length(lambda))
      pi0.boot <- rep(0, length(lambda))
      for (i in 1:100) {
        p.boot <- sample(p, size = m, replace = TRUE)
        for (j in 1:length(lambda)) {
          pi0.boot[j] <- mean(p.boot > lambda[j])/(1 - lambda[j])
        }
        mse <- mse + (pi0.boot - minpi0)^2
      }
      pi0 <- min(pi0[mse == min(mse)])
      pi0 <- min(pi0, 1)
    }
    else {
      err.msg(err.func,"'pi0.method' must be one of 'smoother' or 'bootstrap'")
      return(invisible(1))
    }
  }
  if (pi0 <= 0) {
    err.msg(err.func,"The estimated pi0 <= 0. Check that you have valid\np-values or use another lambda method.")
    return(invisible(1))
  }
  if (!is.null(fdr.level) && (fdr.level <= 0 || fdr.level > 1)) {
    err.msg(err.func,"'fdr.level' must be within (0,1].")
    return(invisible(1))
  }
  u <- order(p)
  qvalue.rank <- function(x) {
    idx <- sort.list(x)
    fc <- factor(x)
    nl <- length(levels(fc))
    bin <- as.integer(fc)
    tbl <- tabulate(bin)
    cs <- cumsum(tbl)
    tbl <- rep(cs, tbl)
    tbl[idx] <- tbl
    return(tbl)
  }
  v <- qvalue.rank(p)
  qvalue <- pi0 * m * p/v
  if (robust) {
    qvalue <- pi0 * m * p/(v * (1 - (1 - p)^m))
  }
  qvalue[u[m]] <- min(qvalue[u[m]], 1)
  for (i in (m - 1):1) {
    qvalue[u[i]] <- min(qvalue[u[i]], qvalue[u[i + 1]], 1)
  }
  if (!is.null(fdr.level)) {
    retval <- list(call = match.call(), pi0 = pi0, qvalues = qvalue, pvalues = p, fdr.level = fdr.level, significant = (qvalue <= fdr.level), lambda = lambda)
  }
  else {
    retval <- list(call = match.call(), pi0 = pi0, qvalues = qvalue, pvalues = p, lambda = lambda)
  }
  class(retval) <- "qvalue"
  return(retval)
}
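
# Quick illustration (hypothetical simulated values, not part of the pipeline):
# empirical p-values from observed vs. pooled null areas, then q-values.
# obsArea  = abs(rnorm(50, mean = 2))
# nullArea = abs(rnorm(5000))
# pv = edge.pvalue(obsArea, nullArea)
# qv = edge.qvalue(pv, lambda = 0)$qvalues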


# arima bootstrapping/sampling, loess smoothing
arimaBootSmoothStat <- function(stat,n_boot,pns,pos,WBP=300) {
	null.dat <- matrix(NA, nrow = length(stat), ncol = n_boot)

	# lstat = log(stat)
	Indexes=split(seq(along=pns),pns)

	
	# AR process
	phi = matrix(NA, ncol = 4, nrow = length(Indexes))
	cat("calculating AR coefficient.","\n")
	for(i in seq(along = Indexes)) {
		if(i%%10000 == 0) cat(".")
		Index = Indexes[[i]]
		if(length(Index) > 100) {
			phi[i,] = ar(stat[Index],order.max=4, aic = F)$ar
		}
	}
	
	# ar model parameters
	ar1 = mean(phi[,1], na.rm = T)
	ar.sd = sd(stat, na.rm=T) #added na.rm=T
	
	cat("\nAR Bootstrapping.","\n")
	for(j in 1:n_boot) {
		cat(".")
		null.dat[,j] = arima.sim(list(ar = ar1), n = length(stat), 
			sd = ar.sd)
	}
	

	cat("\nSmoothing of null data.","\n")
	
	require(limma)
	for(i in seq(along=Indexes)){
		if(i%%10000 == 0) cat(".")
		Index=Indexes[[i]]
		lens=median(diff(pos[Index]))
		spans=WBP/lens/length(Index)
		for(j in 1:n_boot) {
			if(length(Index) > 3 & lens > 0) {
				null.dat[Index,j] <- loessFit(null.dat[Index,j],pos[Index], 
					span = spans)$fitted
			}
		}
	}
	return(null.dat)
}

getPckg = function(pckg) install.packages(pckg, repos = "http://cran.r-project.org")
splitit <- function(x) split(seq(along=x),x)
	
# via Rafael Irizarry
library(RColorBrewer)
mypar <- function(a=1,b=1,brewer.n=8,brewer.name="Dark2",...){
 par(mar=c(2.5,2.5,1.6,1.1),mgp=c(1.5,.5,0))
 par(mfrow=c(a,b),...)
 palette(brewer.pal(brewer.n,brewer.name))
}