R/misc.R

Defines functions avgFunc tmvtfFit nbinomCLLDD nbinomFit poisFit gammaFit splitFarNeighbour maxGapminRun biomvRmgmr preClustGrp mat2list simUvSegData simSegData

Documented in biomvRmgmr maxGapminRun simSegData splitFarNeighbour

##################################################
#			segmentation functions
##################################################
simSegData<-function(nseg=10, J=3, soj, emis, seed=1234, toPlot=FALSE){
	# focus on the soj settings,
	# keep the difference between emission of different states small

	# pool of segments for each states to sample from
	#check J to see if it conforms with soj and emis
	if(!is.null(soj)){
		if(! soj$type %in% c('gamma', 'pois', 'nbinom')){
			stop("soj type not supported")
		}
		paraLen<-sapply(soj, length)
		if(! all(paraLen[names(paraLen)!='type']==J)){
			stop("incorrect length for the soj parameter")
		}
	}
	if(!is.null(emis)){
		if(! emis$type %in% c('norm', 't', 'pois','nbinom')){
			stop("emis type not supported")
		}
		paraLen<-sapply(emis, length)
		if(!all(paraLen[names(paraLen)!='type']==J)){
			stop("incorrect length for the emis parameter")
		}
	}	

	# generate pool of segments, with soj related length, and states related signal
	segLen<-sapply(1:J, function(j) simUvSegData(nseg, j, soj, seed))
	segExp<-sapply(1:J, function(j) sapply(segLen[,j], function(l) simUvSegData(l, j, emis)))
	
	sel<-sample(1:J, size=nseg, replace=T)
	states<-runValue(Rle(sel))
	nsel<-length(states)
	
	L<-segLen[(states-1)*nseg+1:nsel]
	E<-unlist(segExp[(states-1)*nseg+1:nsel])
	
	res<-list(L=L,E=E,S=states)
	if(toPlot){
		filename<-paste('Simulation.', gsub(':', '-', gsub(' ', '_', date())) ,'.pdf', sep='')
		pdf(filename)
		ts.plot(E)
		dev.off()
		res<-c(res, pdf=filename)
	}
	return(res)
}


simUvSegData<-function(n, j, param, seed=NULL){
	if(!is.null(seed))	set.seed(seed)
	x<-switch(param$type,
		norm = rnorm(n, mean=param$mean[j], sd=param$sd[j]),
		t = rt(n, ncp=param$ncp[j], df=param$df[j]),
		gamma = rgamma(n, shape=param$shape[j], scale=param$scale[j]),
		pois = rpois(n, lambda=param$lambda[j]),
		nbinom = rnbinom(n, mu=param$mu[j], size=param$size[j]))
}


mat2list<-function(x, nr=NULL){
	if(!is.null(nr)) x<-matrix(x, nrow=nr)
	L = vector(mode="list", length=nrow(x))
	for(i in 1:nrow(x)) {
		L[[i]] = cbind(estimate=x[i, 0:(i-1)]) 
	}
	return(L)
}


preClustGrp<-function(x, grp=NULL, cluster.m=NULL, deepSplit=4, minClusterSize=4){
	clustmethods<-c('ward','single','complete','average','mcquitty','median','centroid')
	nc<-ncol(x)
	if (!is.null(grp)) {
		if(length(grp)!=nc) 
			stop('length of grp differs from number of column in x !!!')
		if(!is.null(cluster.m)) 
			warning('clustering would not be performed, using the grp input instead !!!')	
		grp<-as.character(grp)
	}	else if(!is.null(cluster.m) && nc >= 4){
		# no grp and has cluster.m
		if(length(find.package('dynamicTreeCut', quiet=T))==0) {
			warning("'dynamicTreeCut' is not found, clustering result ignored, treated as one group!!!")
			grp<-rep('1', nc)
		} else{
			cluster.m<-match.arg(cluster.m, clustmethods)
			DM <- dist(t(x))
			HC <- hclust(DM, method=cluster.m)
			require(dynamicTreeCut)
			grp<-cutreeDynamic(dendro = HC, distM = as.matrix(DM), deepSplit = deepSplit, pamRespectsDendro = FALSE, minClusterSize = minClusterSize, verbose=0) 
		}
	} else {
		grp<-rep('1', nc)
	}
	return(grp)
}

## main function 
biomvRmgmr<-function(x, xPos=NULL, xRange=NULL, usePos='start', cutoff=NULL, q=0.9, high=TRUE, minrun=5, maxgap=2, splitLen=Inf, poolGrp=FALSE, grp=NULL, cluster.m=NULL, avg.m='median', trim=0, na.rm=TRUE){
	
	if (!is.numeric(x) &&  !is.matrix(x) && class(x)!='GRanges') 
        stop("'x' must be a numeric vector or matrix or a GRanges object.")
    if(class(x)=='GRanges') {
    	xid<-names(values(x))
    	xRange<-x
    	mcols(xRange)<-NULL
    	x<-as.matrix(values(x))
		if( any(sapply(unique(seqnames(xRange)), function(s) length(unique(strand(xRange[seqnames(xRange)==s]))))!=1))
    		stop('For some sequence, there are data appear on both strands !')
    } else if(length(dim(x))==2){
		xid<-colnames(x)
		x<-as.matrix(x)
	} else {
		message('No dim attributes, coercing x to a matrix with 1 column.')
		x <- matrix(as.numeric(x), ncol=1)
	}
	nr<-nrow(x) 
	nc<-ncol(x)
	if(is.null(xid)){
		xid<-paste('S', seq_len(nc), sep='')
		colnames(x)<-xid
	}
	
	## some checking on xpos and xrange, xrange exist then xpos derived from xrange,
	if(!is.null(xRange) && (class(xRange)=='GRanges' || class(xRange)=='IRanges') && !is.null(usePos) && length(xRange)==nr && usePos %in% c('start', 'end', 'mid')){
		if(usePos=='start'){
			xPos<-start(xRange)
		} else if(usePos=='end'){
			xPos<-end(xRange)
		} else {
			xPos<-(start(xRange)+end(xRange))/2
		}
	} else {
		# no valid xRange, set it to null
		message('no valid xRange and usePos found, check if you have specified xRange / usePos.')
		xRange<- NULL
	} 
	if (is.null(xPos) || !is.numeric(xPos) || length(xPos)!=nr){
		message("No valid positional information found. Re-check if you have specified any xPos / xRange.")
		xPos<-NULL
	}
 
    # check grp setting, cluster if needed, otherwise treat as one group	
   	if(!is.null(grp)) grp<-as.character(grp)
	grp<-preClustGrp(x, grp=grp, cluster.m=cluster.m)
	
	## build xRange if not a GRanges for the returning object
	if(is.null(xRange) || class(xRange) != 'GRanges'){
		if(!is.null(xRange) && class(xRange) == 'IRanges'){
			xRange<-GRanges(seqnames='sampleseq', xRange)	
		} else 	if(!is.null(xPos)){
			xRange<-GRanges(seqnames='sampleseq', IRanges(start=xPos, width=1))	
		} else {
			xRange<-GRanges(seqnames='sampleseq', IRanges(start=seq_len(nr), width=1))	
		}
	}
	# get seqnames status	
	seqs<-unique(as.character(seqnames(xRange)))
	## initialize the output vectors
	res<-GRanges()
	seqlevels(res)<-seqlevels(xRange)
	# we have more than one seq to batch
	for(s in seq_along(seqs)){
		message(sprintf("Processing sequence %s\n", seqs[s]))
		r<-which(as.character(seqnames(xRange)) == seqs[s])
		
		for(g in unique(grp)){
			gi<-grp==g
			message(sprintf("Building segmentation model for group %s ...", g))
			# fixme, worth thinking, if columns within one group should be pooled.
			if(poolGrp){
				Ilist<-maxGapminRun(x=apply(as.matrix(x[r, gi]), 1, median, na.rm=na.rm), xRange=ranges(xRange)[r], cutoff=cutoff, q=q, high=high, minrun=minrun, maxgap=maxgap, splitLen=splitLen, na.rm=na.rm)
				if(length(Ilist$IS)>0){
					tores<-GRanges(seqnames=as.character(seqs[s]), 
								IRanges(start=rep(start(xRange)[r][Ilist$IS], sum(gi)), end=rep(end(xRange)[r][Ilist$IE], sum(gi))), 
								strand=strand(xRange)[r][Ilist$IS],
								SAMPLE=rep(xid[gi], each=length(Ilist$IS)), 
								STATE=rep('HI', length(Ilist$IS)*sum(gi)), 
								AVG=sapply(which(gi), function(c) sapply(seq_along(Ilist$IS), function(t) avgFunc(x[Ilist$IS[t]:Ilist$IE[t],c], avg.m=avg.m, trim=trim, na.rm=na.rm)))
					)
					seqlevels(tores)<-seqlevels(xRange)
					res<-c(res, tores)					
				}
			} else {
				for(c in which(gi)){
				Ilist<-maxGapminRun(x=x[r,c], xRange=ranges(xRange)[r], cutoff=cutoff, q=q, high=high, minrun=minrun, maxgap=maxgap, splitLen=splitLen, na.rm=na.rm)
				if(length(Ilist$IS)>0){
					tores<-GRanges(seqnames=as.character(seqs[s]), 
						IRanges(start=start(xRange)[r][Ilist$IS], end=end(xRange)[r][Ilist$IE]), 
						strand=strand(xRange)[r][Ilist$IS],
						SAMPLE=rep(xid[c], length(Ilist$IS)), 
						STATE=rep(ifelse(high, 'HIGH', 'LOW'), length(Ilist$IS)), 
						AVG=as.numeric(sapply(seq_along(Ilist$IS),  function(t) avgFunc(x[Ilist$IS[t]:Ilist$IE[t],c], avg.m=avg.m, trim=trim, na.rm=na.rm)))
					)
					seqlevels(tores)<-seqlevels(xRange)
					res<-c(res, tores)	
				}
			}
			
			} # end c for
			message(sprintf("Building segmentation model for group %s complete.", g))
		} # end for g
		message(sprintf("Processing sequence %s complete.", seqs[s]))
	} # end for s

	values(xRange)<-DataFrame(x,  row.names = NULL)
	new("biomvRCNS",  
		x = xRange, res = res,
		param=list(maxgap=maxgap, minrun=minrun, q=q, cutoff=cutoff, splitLen=splitLen, grp=grp, cluster.m=cluster.m, poolGrp=poolGrp, avg.m=avg.m, trim=trim, na.rm=na.rm)
	)
}


##################################################
#  maxgap minrun algo, utilizing RLE, mostly for transcript detection...
##################################################
maxGapminRun<-function(x, xPos=NULL, xRange=NULL, cutoff=NULL, q=0.9, high=TRUE, minrun=5, maxgap=2, splitLen=Inf, na.rm=TRUE){
	# optional position vector, if not present, use index as postion
	# output values, index of active intervals, plus func parameters 
	
	if(is.null(cutoff)){
		cutoff<- as.numeric(quantile(x, q, na.rm=na.rm))
	} else if (cutoff >= max(x, na.rm=na.rm) || cutoff<=min(x, na.rm=na.rm)) {
		stop("cutoff level is not well within range!")
	}
	if(high){
		xl<- x>=cutoff
	} else {
		xl<- x<=cutoff
	}
	
	if(all(xl) | all(!xl)) {
		warnings("nothing to be done, all above/below cutoff !")
		intStart<-intEnd<-integer(0)
		return(list(IS=intStart, IE=intEnd, CUTOFF=cutoff, MG=maxgap, MR=minrun, SPL=splitLen))		
	}
	n<-length(x)
	
	if(!is.null(xPos) && is.null(xRange)){
		if(!is.numeric(xPos)) stop("xPos is not numeric!")
		if(n!=length(xPos)) stop("the length of xPos doesn't match the length of x!")
		if(all((xPos[-1]-xPos[-n])>maxgap) | (xPos[n]-xPos[1])<minrun)
			warnings("maxgap and minrun values are not appropriate !!")
	} else if (is.null(xPos) && !is.null(xRange)){
		if(class(xRange) !='IRanges') stop("xRange is not a IRange object!")
		if(n!=length(xRange)) stop("the length of xRange doesn't match the length of x!")
		if(all(start(xRange)[-1]-end(xRange)[-n] > maxgap) | (end(xRange)[n]- start(xRange)[1]) < minrun)
			warnings("maxgap and minrun values are not appropriate !!")
	}
	
	
	# this version of the maxgap minrun utilize Rle
	# every 2 consecutive runs will have different sign, a merit
	xlrle<-Rle(xl)
	i<-min(which(runValue(xlrle)))
	while(i < max(which(runValue(xlrle)))){

		rv<-runValue(xlrle)
		rl<-runLength(xlrle)	
		erl<-cumsum(rl) #idx of the end of each run
		#srl<-erl-rl+1 # idx of the start of each run, not quite used...
		nr<-length(runValue(xlrle))
		if(i == nr){
			break
		} else if(	(is.null(xPos) && is.null(xRange) && rl[i+1]<=maxgap) ||
						(!is.null(xPos) && is.null(xRange) && xPos[erl[i+1]]-xPos[erl[i]]<=maxgap) ||
						(is.null(xPos) && !is.null(xRange) && start(xRange)[erl[i+1]]-end(xRange)[erl[i]]<=maxgap) && (i+1)<nr ){
			# the next F interval is shorter than maxgap, and there is one T interval still hanging 
			# the gap distance is calculated as the distance between the end of the last True in run i to the last FALSE in the run i+1, half open
			if(	(is.null(xPos) && is.null(xRange) && sum(rl[i:min(i+2, nr)])<=splitLen) ||
				(!is.null(xPos) && is.null(xRange) && xPos[erl[min(i+2, nr)]]-xPos[erl[i]] <=splitLen) ||
				(is.null(xPos) && !is.null(xRange) && end(xRange)[erl[min(i+2, nr)]]-start(xRange)[erl[i]]<=splitLen) ){ 
				#the accumulative length of this interval including this F interval is shorter than the splitLen, merge
				runValue(xlrle)[(i+1)]<-TRUE
			} else {
				#move on and ignore this F interval, no merging
				i<-i+2
			}
		} else {
			i<-i+2
		}
	}
	
	rv<-runValue(xlrle)
	rl<-runLength(xlrle)
	erl<-cumsum(rl) 
	srl<-erl-rl+1 
	nr<-length(runValue(xlrle))
	if(!is.null(xPos)){
		ri<-which(rv & xPos[erl]-xPos[srl]>= minrun)
	} else if (!is.null(xRange)){
		ri<-which(rv & end(xRange)[erl]-start(xRange)[srl]>= minrun)
	} else {
		ri<-which(rv & rl>=minrun)
	}
	if(length(ri)>0){
		intStart<-sapply(ri, function(z) sum(rl[seq_len(z-1)])+1)
		intEnd<-sapply(ri, function(z) sum(rl[seq_len(z)]))
		
		## solve neighbouring probes which are both true, but too far away
		tmp<-splitFarNeighbour(intStart=intStart, intEnd=intEnd, xPos=xPos, xRange=xRange, maxgap=maxgap, minrun=minrun)
		intStart<-tmp$IS
		intEnd<-tmp$IE
	} else {
		intStart<-intEnd<-integer(0)
	}
	return(list(IS=intStart, IE=intEnd, CUTOFF=cutoff, MG=maxgap, MR=minrun, SPL=splitLen))		
}

##################################################
# helper function to split far away neighbouring items
##################################################
splitFarNeighbour<-function(intStart=NULL, intEnd=NULL, xPos=NULL, xRange=NULL, maxgap=Inf, minrun=1){
	if(!is.null(xRange) && class(xRange)=='GRanges' || class(xRange)== 'IRanges') xPos<-NULL
	if(!is.null(xPos) || !is.null(xRange) && !is.null(intStart) && !is.null(intEnd) && length(intEnd)==length(intStart) && !is.infinite(maxgap)){
		if(!is.null(xPos)){
			n<-length(xPos)
			gapStart<-which((xPos[-1]-xPos[-n])>maxgap)
		} else {
			n<-length(xRange)
			gapStart<-which((start(xRange)[-1]-end(xRange)[-n])>maxgap)	
		}
		gapEnd<-gapStart+1
		gapir<-IRanges(start=gapStart, end=gapEnd)
		gapir<-reduce(gapir)
		intN<-length(intStart)
		todel<-numeric(0)
		toadd<-numeric(0)
		for(i in 1:intN){
			splitgap<-which(intStart[i]<=gapStart & intEnd[i]>=gapEnd)
			ngap<-length(splitgap)
			if(ngap>0){
				todel<-c(todel, i)
				for(g in 1:ngap){
					if(ngap>1){
						#there are more than one
						if(g==1){
							# the 1st gap in this interval
							if(	(!is.null(xPos) && xPos[gapStart[splitgap[g]]]-xPos[intStart[i]]>=minrun) ||
								(!is.null(xRange) && end(xRange)[gapStart[splitgap[g]]]-start(xRange)[intStart[i]]>=minrun) ){
								#split first half	
								toadd<-c(toadd, intStart[i], gapStart[splitgap[g]])
							}
						}
						if (g==ngap){
							# the last gap in this interval
							if(	(!is.null(xPos) && xPos[intEnd[i]]-xPos[gapEnd[splitgap[g]]]>=minrun) ||
								(!is.null(xRange) && end(xRange)[intEnd[i]]-start(xRange)[gapEnd[splitgap[g]]]>=minrun) ){
								#split second half
								toadd<-c(toadd, gapEnd[splitgap[g]], intEnd[i])
							}
						} else{
							if(	(g+1)<ngap &&	(!is.null(xPos) && xPos[gapStart[splitgap[g+1]]]-xPos[gapEnd[splitgap[g]]]>=minrun) ||
								(!is.null(xRange) && end(xRange)[gapStart[splitgap[g+1]]]-start(xRange)[gapEnd[splitgap[g]]]>=minrun) ){
								#split second half
								toadd<-c(toadd, gapEnd[splitgap[g]], gapStart[splitgap[g+1]])
							}
						}
					} else {
						# there is only one gap within this interval
						if(	(!is.null(xPos) && xPos[gapStart[g]]-xPos[intStart[i]]>=minrun) ||
							(!is.null(xRange) && end(xRange)[gapStart[g]]-start(xRange)[intStart[i]]>=minrun) ){
							#split first half	
							toadd<-c(toadd, intStart[i], gapStart[splitgap[g]])
						}
						if( (!is.null(xPos) && xPos[intEnd[i]]-xPos[gapEnd[splitgap[g]]]>=minrun) ||
							(!is.null(xRange) && end(xRange)[intEnd[i]]-start(xRange)[gapEnd[splitgap[g]]]>=minrun) ){
							#split second half
							toadd<-c(toadd, gapEnd[splitgap[g]], intEnd[i])
						}
					}							
				}
			}
		}
		if(length(todel)>0){
			intStart<-intStart[-todel]
			intEnd<-intEnd[-todel]
		}
		if(length(toadd)>0){
			intStart<-c(intStart, toadd[seq(1, length(toadd), 2)])
			intEnd<-c(intEnd, toadd[seq(2, length(toadd), 2)])
		}	
	}
	return(list(IS=intStart, IE=intEnd))
}

gammaFit<-function(x, wt=NULL){
	if(is.null(wt)) wt <- rep(1,length(x))
	if(length(x) != length(wt)) stop("length of x and wt differ!")
	
	tmp <- cov.wt(data.frame(x),wt=wt)
    shape <- as.numeric(tmp$center/sqrt(tmp$cov))^2
    scale<-as.numeric(tmp$center)/shape
#	par<-optim(c(shape,scale),function(par) sum(dgamma(x,shape=par[1],scale=par[2], log=TRUE)*wt, na.rm=TRUE)  ,lower=.Machine$double.eps, method='L-BFGS-B', control=list(fnscale=-1,maxit=1000))$par
#	return(c(shape=par[1], scale=par[2]))
	return(c(shape=shape, scale=scale))
}

poisFit <- function(x, wt=NULL, maxshift=0) {  	
	if(maxshift>min(x)) stop("maxshift can't be greater than the minimum of x !")
	if(is.null(wt)) wt <- rep(1/length(x),length(x))
	if(length(x) != length(wt)) stop("length of x and wt differ!")
	
	shift<-which.max(sapply(0:maxshift, function(s) sum(dpois(x = x-s, lambda=sum((x-s) * wt, na.rm=T) ,log=TRUE) * wt, na.rm=T)))-1
	lambda <- sum((x-shift) * wt, na.rm=T) 
    return(c(lambda=lambda, shift=shift))
}

nbinomFit <- function(x, wt=NULL, maxshift=0) {  	
	if(maxshift>min(x)) stop("maxshift can't be greater than the minimum of x !")
	if(is.null(wt)) wt <- rep(1,length(x))
	if(length(x) != length(wt)) stop("length of x and wt differ!")
	
	shift <- which.max(sapply(0:maxshift,  function(s) nbinomCLLDD(x, wt, s)$value))-1
    res<-c(nbinomCLLDD(x, wt, shift)$par, shift)
    names(res)<-c('size', 'mu', 'shift')
    return(res)
}


nbinomCLLDD<-function(x, wt=NULL, s=0){
	if(is.null(wt)) wt <- rep(1,length(x))
	if(length(x) != length(wt)) stop("length of x and wt differ!")
	tmp <- cov.wt(matrix(as.vector(x-s)),wt=wt)
	m <- as.numeric(tmp$center)
	v <- as.numeric(tmp$cov)
	size <- if (v > m) m^2/(v - m) else 100
	value<-sum(dnbinom(x = as.vector(x-s), size=size, mu=m ,log=TRUE) * wt, na.rm=T)
	return(list(value=value, par=c(size, m)))
#	optim(c(size,m),function(par) sum(dnbinom(x-s,size=par[1],mu=par[2],log=TRUE)*wt, na.rm=TRUE) , lower=.Machine$double.eps, method='L-BFGS-B', control=list(fnscale=-1,maxit=1000))
}

tmvtfFit<-function(x, wt=NULL){
	#Aeschliman, C., Park, J., & Kak, A. (2010). A novel parameter estimation algorithm for the multivariate t-distribution and its application to computer vision. Computer Vision-ECCV 2010, 594-607.
	if(!is.matrix(x))	x<-matrix(x)
	n<-nrow(x)
	p<-ncol(x)
	if(is.null(wt)) wt <- rep(1,n)
	if(n != length(wt)) stop("rows of x and length of wt differ!")
	tmp <- cov.wt(x,wt=wt)
	c<-unname(tmp$center)
	z<-apply(x, 1, function(r) log(sum((r - c)^2)))
	zbar<-mean(z)
	b<-sum((z-zbar)^2)/n - trigamma(p/2)
	vhat<-floor((1+sqrt(1+4*b))/b)
	return(list(mu=c, df=ifelse(vhat>1, vhat, 1), var=unname(tmp$cov)))
}

avgFunc<-function(x, avg.m='median', trim=0, na.rm=TRUE){
	switch(avg.m,
		mean=mean(x, trim=trim, na.rm=na.rm),
		median=median(x, na.rm=na.rm),
		stop("invalid 'avg.m' specified!")
	)
}

Try the biomvRCNS package in your browser

Any scripts or data that you put into this service are public.

biomvRCNS documentation built on Nov. 8, 2020, 6:49 p.m.