R/utils.R
In tileHMM: Hidden Markov Models for ChIP-on-Chip Analysis

Documented in generate.data getHMM gff2index hmm.setup logSum posterior reg2gff region.length region.position remove.short shrinkt.st viterbiEM

# utils.R
# 
# Other useful methods for HMMs
#
# Author: Peter Humburg
###############################################################################

############################ auxiliary functions ##############################
## functions to handle the log space computations
## calculate the log of the sum from the log of the summands
## base: base of logarithm, 0 indicates natural log
logSum <- function(x,y=NULL,base=0){
	if(is.null(y)){
		if(is.vector(x)){
			lsum <- x[1]
			if(length(x) == 1) return(lsum)
			for(i in 2:length(x)){
				lsum <- logSum(lsum,x[i]) 
			}
			return(lsum)
		}
		if(is.matrix(x)){
			lsum <- apply(x,1,logSum)
			lsum <- logSum(lsum)
			return(lsum)
		}
	}
	if(is.matrix(x) && is.matrix(y)){
		if(sum(dim(x) != dim(y))){
			stop("logSum: Matrix dimensions do not agree.")
		}
		lsum <- x
		for(i in 1:dim(x)[1]){
			for(j in 1:dim(x)[2]){
				lsum[i,j] <- logSum(x[i,j],y[i,j])
			}
		}
		return(lsum)
	}
	if(length(x) > 1 || length(y) > 1){
		if(length(x) != length(y)){
			stop("logSum: vectors of different length.")
		}
		lsum <- numeric(length(x))
		for(i in 1:length(x)){
			lsum[i] <- logSum(x[i],y[i])
		}
		return(lsum)
	}
	
	.Call("_log_sum",x,y,as.integer(base))
}

## Use combination of Viterbi training and Baum-Welch algorithm to fit HMM
viterbiEM <- function(hmm,data,max.iter=c(5,15),eps=0.01,verbose=0,...){
	args <- list(...)
	if(is.null(args$viterbi)) vit.args <- args[names(args) != "df"] else vit.args <- args$viterbi
	if(is.null(args$baumWelch)) bw.args <- args[names(args) != "df"] else bw.args <- args$baumWelch
	df <- args$df
	
	max.iter <- rep(max.iter, length = 2)
	eps <- rep(eps, length = 2)
	
	if(class(df) == "list"){
		df.vit <- df[[1]]
		df.bw <- df[[2]]
	} else{
		df.vit <- df
		df.bw <- df
	}
	if(verbose >=1) message("Viterbi training: ", max.iter[1], " iterations\n")
	hmm.vit <- do.call("viterbiTraining",c(list(hmm,data,max.iter=max.iter[1],eps=eps[1],
	             df=df.vit,verbose=verbose-1),vit.args))
	
	## check that all states are accessible
	## get prior
	if(is.null(bw.args$trans.prior)){
		## no prior
		bw.args$trans.prior <- matrix(0,ncol=length(hmm),nrow=length(hmm))
	} else if(is.logical(bw.args$trans.prior) && bw.args$trans.prior){
		bw.args$trans.prior <- hmm@transition.matrix
	}
	if(!is.matrix(bw.args$trans.prior) || dim(bw.args$trans.prior) != dim(hmm@transition.matrix)){
		stop("Illegal prior transition distribution")
	}
	## check prior initial state distribution
	if(is.null(bw.args$init.prior)){
		## no prior
		bw.args$init.prior <- numeric(length(hmm))
	} else if(is.logical(bw.args$init.prior) && bw.args$init.prior){
		bw.args$init.prior <- hmm@init
	} 
	if(!is.vector(bw.args$init.prior) || length(bw.args$init.prior) != length(hmm)){
		stop("Illegal prior initial state distribution")
	}
	access.trans <- hmm.vit@transition.matrix + bw.args$trans.prior
	access.init <- hmm.vit@init + bw.args$init.prior
	diag(access.trans) <- access.init
	access <- apply(access.trans, 2, sum) > 0
	if(sum(access == 0)){
		warning("Detected ", sum(access == 0), " state(s) without inbound transitions (",
				paste(states(hmm.vit)[which(access == 0)], collapse=', '), ").\n",
				"Continuing with uniform prior.", call.=FALSE, immediate.=TRUE)
		bw.args$trans.prior <- bw.args$trans.prior + 1/length(hmm.vit)
		bw.args$init.prior <- bw.args$init.prior + 1/length(hmm.vit)
		## need to update current transition probabilities
		init <- .baumWelchInit(hmm.vit, data)
		trans <- .baumWelchTransition(hmm.vit, data, init$alpha, init$beta, bw.args$trans.prior, bw.args$init.prior)
		hmm.vit@transition.matrix <- trans$transition
	}
	
	if(verbose >=1) message("EM algorithm: ", max.iter[2], " iterations\n")
	hmm.bw <- do.call("baumWelch",c(list(hmm.vit,data,max.iter=max.iter[2],
				eps=eps[2],df=df.bw, verbose=verbose-1),bw.args))
				
	hmm.bw
}

## calculate posterior probability for each state of hmm
posterior <- function(data, hmm, log=TRUE){
	alpha <- forward(hmm, data)$alpha.scaled
	beta <- backward(hmm, data)
	
	post <- alpha + beta
	post <- t(t(post) - apply(post,2,logSum))
	if(!log) post <- exp(post)
	return(post)
}

## takes a matrix containing information about enriched regions, a data frame with information about probe positions,
## and a vector with posterior probabilities for each probe
## returns a data frame in gff format invisibly.
## output is saved to 'file' if a file name or connection is provided
reg2gff <- function(regions,post,probe.pos,file=NULL,score.fun=mean,source="tHMM",feature.type="posterior_prob",class="ChIP_region",name="tHMM"){
	chr <- probe.pos[regions[1,],"chromosome"]
	score <- apply(regions,2,function(reg,p)score.fun(p[reg[1]:reg[2]]),post)
	start <- probe.pos[regions[1,],"position"]
	end <- probe.pos[regions[2,],"position"]
	gff <- data.frame(chr=chr,source=I(source),type=I(feature.type),start=start,end=end,
						score=score,strand=I('.'),phase=I('.'),group=I(paste(class,paste(name,1:(dim(regions)[2]),sep='_'))))
	if(!is.null(file)){
		write.table(gff,file=file,quote=FALSE,sep='\t',row.names=FALSE,col.names=FALSE)
	}
	
	return(invisible(gff))
}

## takes data from a gff file and probe positions, returns a logical vector indicating probes in
## annotated regions
## If gff is a character string it is assumed to be the name of a gff file which is read and used subsequently
gff2index <- function(gff,pos){
	if(is.character(gff) || is(gff,"connection")){
		gff <- read.delim(gff,comment.char='#',header=FALSE)
	}
	index <- logical(dim(pos)[1])
	for(i in 1:dim(gff)[1]){
		chr <- gff[i,1]
		start <- gff[i,4]
		end <- gff[i,5]
		index <- index | (pos[,1] == chr & pos[,2] >= start & pos[,2] <= end)
	}
	index
}

## takes a logical vector indicating positive and negative probes
## returns a list with components 'positive' and 'negative' providing length
## information for positive and negative regions
region.length <- function(probes, min.len=1){
	positive <- numeric()
	negative <- numeric()
	pos.count <- as.numeric(probes[1])
	neg.count <- 0
	if(!probes[1]) neg.count <- 1
	
	for(p in probes[-1]){
		## extend current region
		if(p & pos.count > 0) pos.count <- pos.count + 1
		if(!p & neg.count > 0) neg.count <- neg.count + 1
		## start of new region
		if(p & pos.count == 0){
			if(neg.count >= min.len) negative <- c(negative,neg.count)
			neg.count <- 0
			pos.count <- 1
		}
		if(!p & neg.count == 0){
			if(pos.count >= min.len) positive <- c(positive,pos.count)
			pos.count <- 0
			neg.count <- 1
		}
	}
	## add last region
	if(pos.count > min.len) positive <- c(positive,pos.count)
	if(neg.count > min.len) negative <- c(negative,neg.count)
	
	ret <- list()
	ret[["positive"]] <- positive
	ret[["negative"]] <- negative
	
	ret
}

## Takes a vector containing calls for each probe, returns a matrix with
## start and end points of the regions of interest in its first and second row respectively. 
region.position <- function(probe.calls,region=TRUE){
	probe.index <- probe.calls == region
	idx <- 1:length(probe.calls)
	probe.index2 <- idx[probe.index]
	probe.dist <- diff(probe.index2)
	region.idx <- probe.dist > 1
	idx <- 1:length(probe.dist)
	region.idx2 <- idx[region.idx]
	end <- c(probe.index2[region.idx2],probe.index2[length(probe.index2)])
	start <- c(probe.index2[1],probe.index2[region.idx2+1])
	
	rbind(start,end)
}

## takes a matrix with information about enriched regions (in terms of probe indices),
## posterior probabilities for each probe and a data frame with probe positions
## removes regions that are shorter than min.length and have a score of less than min.score
## the score for each region is calculated by applying summary.fun to the posterior probabilities 
## of all probes in the region
remove.short <- function(regions, post, probe.pos, min.length=1000, min.score=0.8, summary.fun=mean){
	regions.pos <- matrix(probe.pos[regions, "position"], nrow = 2, ncol = dim(regions)[2])
	reg.len <- apply(regions.pos, 2, diff)
	if(max(post) < 0) post <- exp(post)
	reg.score <- apply(regions, 2, function(reg) summary.fun(post[reg[1]:reg[2]]))
	reg.idx <- reg.len >= min.length | reg.score >= min.score
	regions[ , reg.idx]
}

## generate simulated data set by sampling from real data using results of TileMap analysis
## data is a data.frame containing probe positions and measurements
## group is either a logical vector indicating the position of positive probes or
##     the name of a gff file containing information about positive regions
## the number of positive regions per sequence is between min(pos.range) and max(pos.range)
##     for each sequence the actual number is sampled from a uniform distribution
## num.seq sequences are generated
## probe positions are generated using a distance of gap between probes of the same sequence
##      and split.gap between different sequences
## returns a data matrix
generate.data <- function(data,group,pos.range=c(1,10),num.seq=100,gap=35,split.gap=1000,min.len=2){
	## read region information from file if necessary
	if(is.character(group) || is(group,"connection")){
		gff <- read.delim(group,comment.char='#',header=FALSE)
		group <- gff2index(gff,data[,c(1,2)])
	}
	## get length distributions
	reg.len <- region.length(group,min.len=min.len)
	
	## generate observation sequences
	pos.count <- runif(num.seq,min=min(pos.range),max=max(pos.range))
	probe.pos <- 1
	output <- matrix(ncol=dim(data)[2],nrow=0)
	index <- 1:dim(data)[1]
	pos.prob <- sum(group)/length(group)
	start <- sample(c(0,1),num.seq,prob=c(1-pos.prob,pos.prob),replace=TRUE)
	end <- sample(c(0,1),num.seq,prob=c(1-pos.prob,pos.prob),replace=TRUE)
	state.seq <- list()
	for(i in 1:num.seq){
		sequence <- matrix(ncol=dim(data)[2]-2,nrow=0)
		state.seq[[i]] <- logical()
		if(pos.count[i] > 0){
			if(start[i]){
				## positive region
				seq.len <- sample(reg.len$positive,1)
				seq.idx <- sample(index[group],seq.len)
				sequence <- rbind(sequence,data[seq.idx,c(-1,-2)])
				pos.count[i] <- pos.count[i]-1
				state.seq[[i]] <- c(state.seq[[i]],rep(TRUE,each=seq.len))
			}
			while(pos.count[i] > end[i]){
				## negative region
				seq.len <- sample(reg.len$negative,1)
				seq.idx <- sample(index[!group],seq.len)
				sequence <- rbind(sequence,data[seq.idx,c(-1,-2)])
				state.seq[[i]] <- c(state.seq[[i]],rep(FALSE,each=seq.len))
				## positive region
				seq.len <- sample(reg.len$positive,1)
				seq.idx <- sample(index[group],seq.len)
				sequence <- rbind(sequence,data[seq.idx,c(-1,-2)])
				pos.count[i] <- pos.count[i]-1
				state.seq[[i]] <- c(state.seq[[i]],rep(TRUE,each=seq.len))
			}
			## negative region
			seq.len <- sample(reg.len$negative,1)
			seq.idx <- sample(index[!group],seq.len)
			sequence <- rbind(sequence,data[seq.idx,c(-1,-2)])
			state.seq[[i]] <- c(state.seq[[i]],rep(FALSE,each=seq.len))
			if(end[i]){
				## positive region
				seq.len <- sample(reg.len$positive,1)
				seq.idx <- sample(index[group],seq.len)
				sequence <- rbind(sequence,data[seq.idx,c(-1,-2)])
				state.seq[[i]] <- c(state.seq[[i]],rep(TRUE,each=seq.len))
			}
		} else{
			## negative region
			seq.len <- sample(reg.len$negative,1)
			seq.idx <- sample(index[!group],seq.len)
			sequence <- rbind(sequence,data[seq.idx,c(-1,-2)])
			state.seq[[i]] <- c(state.seq[[i]],rep(FALSE,each=seq.len))
		}
		## generate probe positions
		positions <- seq(from=probe.pos,by=gap,length.out=dim(sequence)[1])
		chr <- rep("chr1",each=dim(sequence)[1])
		output <- rbind(output,cbind(chr,positions,sequence))
		probe.pos <- positions[length(positions)]+split.gap
	}
	ret <- list()
	ret[["observation"]] <- output
	ret[["regions"]] <- state.seq
	ret
} 

## get initial partition of data into states
.get.init <- function(data,nstate=2,iter.max=20,nstart=3){
	cluster <- kmeans(data,nstate,iter.max=iter.max,nstart=nstart)
	means <- cluster$centers
	vars <- cluster$withinss/(cluster$size - 1)
	
	ret <- list()
	ret[["mean"]] <- means
	ret[["var"]] <- vars
	ret[["cluster"]] <- cluster$cluster
	ret[["size"]] <- cluster$size
	ret	
}

## setting up an HMM with initial estimates from the data
hmm.setup <- function(data,state=c("enriched","non-enriched"),probe.region=35,frag.size=1000,pos.state=1,
						em.type="tDist",max.prob=1,df=9){
	comp <- numeric(length(state)) + 1
	names(comp) <- state
	## cluster data to obtain initial estimates
	if(is(data,"list")) data <- c(data,recursive=TRUE)
	cl <- .get.init(data,nstate=c(min(data),max(data)))
	
	mean.pos <- cl$mean[1]
	mean.neg <- cl$mean[2]
	var.pos <- cl$var[1]
	var.neg <- cl$var[2]
	weight.pos <- 1
	weight.neg <- 1
	emission.comp <- list()
	emission <- list()

	emission.comp[[state[1]]] <- cbind(weight.pos,mean.pos,var.pos)
	emission.comp[[state[2]]] <- cbind(weight.neg,mean.neg,var.neg)
	for(i in 1:length(state)){
		if(em.type == "tDist"){
			emission[[state[i]]] <- new("tDist",mean=emission.comp[[state[i]]][,2],
									var=emission.comp[[state[i]]][,3],df=df[(i%%length(df))+1])
		} else{
			stop(paste("Requested emission distribution of type",em.type,"is not supported."))	
		}
	}

	## set initial transition probabilities according to average fragment size
	## we don't know how many enriched regions to expect, but the number 
	## of sufficiently extreme observations should be a good starting point
	pos.index <- cl$cluster == 1
	neg.index <- !pos.index
	freq.pos <- sum(pos.index)/length(data)
	freq.pos <- freq.pos*(probe.region/frag.size)
	
	if(pos.state == 1){
		prob.aa <- min(1-probe.region/frag.size,max.prob)
		prob.bb <- min(1-freq.pos,max.prob)
	} else{
		prob.aa <- min(1-freq.pos,max.prob)
		prob.bb <- min(1-probe.region/frag.size,max.prob)
	}
	
	neg.state <- 2
	if(pos.state == 2) neg.state <- 1
	transition <- list()
	transition[[state[1]]] <- new("discDist",alpha=state,prob=c(prob.aa,1-prob.aa))
	transition[[state[2]]] <- new("discDist",alpha=state,prob=c(1-prob.bb,prob.bb))
	
	## choose initial distribution as stationary distribution of the MC with above transition matrix
	a <- transition[[state[1]]][2]
	b <- transition[[state[2]]][1]
	pi <- new("discDist",alpha=state,prob=c(b/(a+b),a/(a+b)))
	
	## create HMM with initial parameters
	hmm.init <- new("contHMM",transition=transition,emission=emission,init=pi)
	hmm.init
}

## a more convenient way to create an HMM
getHMM <- function(params,snames){
	trans <- list()
	trans[[snames[1]]] <- new("discDist",alpha=snames,prob=c(1-params$a[1],params$a[1]))
	trans[[snames[2]]] <- new("discDist",alpha=snames,prob=c(params$a[2],1-params$a[2]))

	emiss <- list()
	emiss[[snames[1]]] <- new("tDist",mean=params$mu[1],var=params$sigma[1],df=params$nu[1])
	emiss[[snames[2]]] <- new("tDist",mean=params$mu[2],var=params$sigma[2],df=params$nu[2])

	init <- new("discDist",alpha=snames,prob=c(params$a[2]/(params$a[2]+params$a[1]),
		params$a[1]/(params$a[2]+params$a[1])))

	new("contHMM",transition=trans,emission=emiss,init=init)
}

## calculate 'shrinkage t' statistic
shrinkt.st <- function(X,L,h0.mean=0,...){
	## ensure that there are at least two samples per group
	tbl <- table(L)
	if(min(tbl) == 1){
		stop("Group ", names(tbl)[which.min(tbl)], " contains only one sample. At least two samples are required per group.")
	}
	
	if(max(L) == 1){	
		shrink.var <- var.shrink(t(X))
		means <- apply(X,1,mean)
		t.stat <- mapply(function(m,v,h0,n) (m - h0)/sqrt(v/n),means,shrink.var,
			MoreArgs=list(h0.mean,length(L),...))
	} else if(max(L) == 2){
		t.stat <- shrinkt.stat(t(X),L,...)
	} else stop(paste("Illegal number of groups! Expected 1 or 2, found", max(L)))
	
	t.stat	
}

Any scripts or data that you put into this service are public.

tileHMM documentation built on May 30, 2017, 3:41 a.m.

rdrr.io home R language documentation Run R code online

CRAN packages Bioconductor packages R-Forge packages GitHub packages

Note that we can't provide technical support on individual packages. You should contact the package authors for that.

tileHMM
Hidden Markov Models for ChIP-on-Chip Analysis

R/utils.R
In tileHMM: Hidden Markov Models for ChIP-on-Chip Analysis

Defines functions logSum viterbiEM posterior reg2gff gff2index region.length region.position remove.short generate.data get.init hmm.setup getHMM shrinkt.st

Documented in generate.data getHMM gff2index hmm.setup logSum posterior reg2gff region.length region.position remove.short shrinkt.st viterbiEM

Try the tileHMM package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

tileHMM Hidden Markov Models for ChIP-on-Chip Analysis

R/utils.R In tileHMM: Hidden Markov Models for ChIP-on-Chip Analysis

Defines functions logSum viterbiEM posterior reg2gff gff2index region.length region.position remove.short generate.data get.init hmm.setup getHMM shrinkt.st

Documented in generate.data getHMM gff2index hmm.setup logSum posterior reg2gff region.length region.position remove.short shrinkt.st viterbiEM

Try the tileHMM package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

tileHMM
Hidden Markov Models for ChIP-on-Chip Analysis

R/utils.R
In tileHMM: Hidden Markov Models for ChIP-on-Chip Analysis