R/biomvRseg.R
In biomvRCNS: Copy Number study and Segmentation for multivariate biological data

Documented in biomvRseg regionSegAlphaNB regionSegCost

## main function for normized numeric intensity vectors
biomvRseg<-function(x, maxk=NULL, maxbp=NULL, maxseg=NULL, xPos=NULL, xRange=NULL, usePos='start', family='norm', penalty='BIC', twoStep=TRUE, segDisp=FALSE, useMC=FALSE, useSum=TRUE, comVar=TRUE, maxgap=Inf, tol=1e-06, grp=NULL, cluster.m=NULL, avg.m='median', trim=0, na.rm=TRUE){
	
	# input to the main function is a matrix object, x, features on the same strand and same chr
	# optional grouping factor, length of which should be the same as the column of x
	# input checking and preparation
	familes<-c('norm' , 'pois' , 'nbinom')
	family<-match.arg(family, familes)
	
	if (!is.numeric(x) &&  !is.matrix(x) && class(x)!='GRanges') 
        stop("'x' must be a numeric vector or matrix or a GRanges object.")
    if(class(x)=='GRanges') {
    	xid<-names(values(x))
    	xRange<-x
    	mcols(xRange)<-NULL
    	x<-as.matrix(values(x))
    	if( any(sapply(unique(seqnames(xRange)), function(s) length(unique(strand(xRange[seqnames(xRange)==s]))))!=1))
    		stop('For some sequence, there are data appear on both strands !')
    } else if(length(dim(x))==2){
		xid<-colnames(x)
		x<-as.matrix(x)
	} else {
		message('No dim attributes, coercing x to a matrix with 1 column.')
		x <- matrix(as.numeric(x), ncol=1)
	}
	nr<-nrow(x) 
	nc<-ncol(x)
	if(is.null(xid)){
		xid<-paste('S', seq_len(nc), sep='')
		colnames(x)<-xid
	}
	
	## some checking on xpos and xrange, xrange exist then xpos drived from xrange,
	if(!is.null(xRange) && (class(xRange)=='GRanges' || class(xRange)=='IRanges') && !is.null(usePos) && length(xRange)==nr && usePos %in% c('start', 'end', 'mid')){
		if(usePos=='start'){
			xPos<-start(xRange)
		} else if(usePos=='end'){
			xPos<-end(xRange)
		} else {
			xPos<-(start(xRange)+end(xRange))/2
		}
	} else {
		# no valid xRange, set it to null
		message('no valid xRange and usePos found, check if you have specified xRange / usePos.')
		xRange<- NULL
	} 
	if (is.null(xPos) || !is.numeric(xPos) || length(xPos)!=nr){
		message("No valid positional information found. Re-check if you have specified any xPos / xRange.")
		xPos<-NULL
	}
	if (!is.null(maxbp) && (!is.numeric(maxbp) || (length(maxbp) != 1) || (maxbp <= 1) ||  ( !is.null(xPos) && maxbp > max(xPos,na.rm=na.rm)-min(xPos, na.rm=na.rm)))) 
	 	 stop(sprintf("'maxbp' must be a single integer between 2 and the maximum length of the region if xPos is avaliable."))	 
	
	# checking on maxseg and maxk
	if (is.null(maxseg) || !is.numeric(maxseg) || (length(maxseg) != 1) || (maxseg <= 1) ||  (maxseg > nr)) 
		 stop(sprintf("'maxseg' must be a single integer between 2 and the number of rows of 'x': %d.", nr))
	maxseg<-as.integer(maxseg) # this is the number for initial candidate region
	if (is.null(maxbp) && (is.null(maxk) || !is.numeric(maxk) || (length(maxk) != 1) || (maxk <= 1) ||  (maxk > nr))) 
	 	 stop(sprintf("'maxk' must be a single integer between 2 and the number of rows of 'x': %d, if maxbp is not available.", nr))
	maxk<-as.integer(maxk) # this is the maximal windows size for the initial segmentation
	# fixme, this check may not be necessary, since nr might be a sum of multiple seq
#	if(maxk*maxseg<nr) 
#		stop(sprintf("the product of 'maxseg' and 'maxk' is smaller than the number of rows of 'x': %d.", nr))

    # penalty
    penaltymethods<-c('none','AIC','AICc','BIC','SIC','HQIC', 'mBIC')
    penalty<-match.arg(penalty, penaltymethods)
    if(penalty=='SIC') penalty='BIC' 
    
    # check grp setting, cluster if needed, otherwise treat as one group	
   	if(!is.null(grp)) grp<-as.character(grp)
	grp<-preClustGrp(x, grp=grp, cluster.m=cluster.m)
	
	
	## build xRange if not a GRanges for the returning object
	if(is.null(xRange) || class(xRange) != 'GRanges'){
		if(!is.null(xRange) && class(xRange) == 'IRanges'){
			xRange<-GRanges(seqnames='sampleseq', xRange)	
		} else 	if(!is.null(xPos)){
			xRange<-GRanges(seqnames='sampleseq', IRanges(start=xPos, width=1))	
		} else {
			xRange<-GRanges(seqnames='sampleseq', IRanges(start=seq_len(nr), width=1))	
		}
	}
	# get seqnames status	
	seqs<-unique(as.character(seqnames(xRange)))
	
	## initialize the output vectors
	segStart <- vector(mode="list", length=nc)
	segMean <- vector(mode="list", length=nc)
	res<-GRanges()
	seqlevels(res)<-seqlevels(xRange)
	
	tmaxk<-maxk
	# we have more than one seq to batch
	
	for(s in seq_along(seqs)){
		message(sprintf("Processing sequence %s\n ...", seqs[s]))
		r<-which(as.character(seqnames(xRange)) == seqs[s])
		
		## update local maxk, if maxbp and xPos/xRange are given / or check if maxk is too large for current seq
		maxk<-tmaxk
		if(!is.null(maxbp) && !is.null(xPos) ){
			maxbpidx<-sapply(r, function(i) max(which(xPos[i]+maxbp >= xPos[r])))
			# find the maxk idx
			smaxk<-max(maxbpidx - seq_along(r))+1
			maxk<-smaxk
		} else if(maxk>length(r)){
			maxk<-length(r)
		}	
		
		for(g in unique(grp)){
			message(sprintf("Step 1 building segmentation model for group %s ...", g))
			gi<-grp==g
			d<-sum(gi)
			if(family=='nbinom'){
				if (segDisp){
					alpha<-regionSegAlphaNB(x[r,gi], maxk=maxk, useMC=useMC, tol=tol)
				} else {
					dispersion<-estimateSegCommonDisp(x[r,gi])
					alpha<-matrix(dispersion, maxk, sum(r))
				}
				C<-regionSegCost(x[r,gi], maxk=maxk, family=family, alpha=alpha, useSum=useSum, useMC=useMC)
			} else {
				C<-regionSegCost(x[r,gi], maxk=maxk, family=family, useSum=useSum, comVar=comVar)
			}
			Res<- .C("univaRseg", as.double(C), as.integer(maxseg), as.integer(maxk), as.integer(length(r)), cost=double(maxseg*length(r)), pos=integer((maxseg-1)*length(r)), minC=double(maxseg), segS=integer(maxseg*maxseg), PACKAGE = "biomvRCNS")
		
			if( family== 'norm'){
				# norm
				#logL<- -nr/2*(1+log(2*pi)+log(Res$minC/nr)) # in tilingArray is a mixture of common and change variance
				if(comVar){
					logL<- -length(r)/2*(d+d*log(2*pi)+log(Res$minC/length(r))) # the likelihood ratio cost, p104
				} else {
					logL<- -(length(r)+length(r)*d*log(2*pi)+Res$minC)/2 # the likelihood ratio cost, p134
				}
			} else {
				# pois and nb
				logL<- 0-Res$minC
			}
			if(family=='pois'){
				nP<-seq_len(maxseg)*2-1
			} else if ( (family=='nbinom' && !segDisp) || (family=='norm' && comVar && useSum)){
				nP<-seq_len(maxseg)*2
			} else if ( (family=='nbinom' && segDisp) || (family=='norm' && !comVar && useSum)) {
				nP<-seq_len(maxseg)*3-1
			} else if ( family=='norm' && comVar && !useSum) {
				nP<-(seq_len(maxseg)+1)*(d+1)-2
			} else if ( family=='norm' && !comVar && !useSum) {
				nP<-seq_len(maxseg)*d*2+seq_len(maxseg)-1
			}
			# mbic 2nd term,
			mbic2ndt<-sapply(mat2list(Res$segS, maxseg), function(p) sum(log(p-c(1, p[-length(p)]))))

			
			# the optimal number of final segments for series within this group
			rN<-switch(penalty,
				none = which.max(logL),
				AIC = which.min(sapply(seq_len(maxseg), function(x) -2*logL[x]+2*nP[x])),
				AICc = which.min(sapply(seq_len(maxseg), function(x) -2*logL[x]+2*nP[x]*(nP[x]+1)/(length(r)*d-nP[x]-1))),
				BIC = which.min(sapply(seq_len(maxseg), function(x) -2*logL[x]+nP[x]*log(length(r)*d))),
				mBIC = which.min(sapply(seq_len(maxseg), function(x) -2*logL[x] +mbic2ndt[x]/2- log(length(r)*d)/2 + nP[x]*log(length(r)*d))),
				HQIC = which.min(sapply(seq_len(maxseg), function(x) -2*logL[x]+2*nP[x]*log(log(length(r)*d)))),
				stop('Invalid value argument for penalty'))	
			message(sprintf("Step 1 building segmentation model for group %s complete.", g))
		
			#rN could be supplied by the user, thus only keeping the initial segment candidate
			for(c in which(gi)){
				## here add checking whether a 2nd step is needed
				if(d==1 || !twoStep || rN==maxseg){
					## no need to do 2nd step
					rIdx<-c(mat2list(Res$segS, maxseg)[[rN]])
					j<-r[c(1,rIdx)] # col, segStart
					i<-r[c(rIdx-1, length(r))] # segEnd
					k<-i-j+1 # row for C and alpha matrix
				
					segStart[[c]]<-r[rIdx]
					segMean[[c]]<-sapply(seq_len(rN), function(z) mean(x[i[z]:j[z],c]))
										
					Ilist<-splitFarNeighbour(intStart=j, intEnd=i, xRange=ranges(xRange), maxgap=maxgap)
					tores<-GRanges(seqnames=as.character(seqs[s]), 
						IRanges(start=rep(start(xRange)[Ilist$IS], 1), end=rep(end(xRange)[Ilist$IE], 1)), 
						strand=strand(xRange)[Ilist$IS], 
						SAMPLE=rep(xid[c], each=length(Ilist$IS)), 
						AVG=as.numeric(sapply(1:length(Ilist$IS),  function(t) apply(as.matrix(x[Ilist$IS[t]:Ilist$IE[t],c]), 2, avgFunc, avg.m=avg.m, trim=trim, na.rm=na.rm)))
					)
					mcols(tores)<-DataFrame(values(tores), STATE=sapply(1:length(tores), function(i) ifelse(values(tores)[i, 'AVG']>avgFunc(x[r,c], avg.m=avg.m, trim=trim, na.rm=na.rm), 'HIGH', 'LOW')), row.names = NULL)
					seqlevels(tores)<-seqlevels(xRange)
					res<-c(res, tores)
					message(sprintf("No need to run step 2 merging, processing complete for column %s from group %s.", c, g))
					
				} else {
					message(sprintf("Step 2 merging for column %s from group %s\n", c, g))
					# to run a 2nd step merging
					# get all candidates coordinates
					rRegs<-mat2list(Res$segS, maxseg)[[maxseg]]	
					# generate a regional cost for each series
					if(family=='nbinom'){
						if(segDisp){
							ralpha<-regionSegAlphaNB(x[r,c], segs=rRegs, useMC=useMC, tol=tol)
						} else {
							ralpha=matrix(dispersion,maxseg, maxseg)
						}
						rC<-regionSegCost(x[r,c], segs=rRegs, family=family, alpha=ralpha, useSum=useSum, useMC=useMC) 
					} else  {
						rC<-regionSegCost(x[r,c], segs=rRegs, family=family, useSum=T, comVar=comVar)
					}		
					# use this regional cost to do dp merging
	#				rRes<-.Call("univaRseg", rC , as.integer(rN),  PACKAGE = "biomvRCNS") # .call version
					rRes<- .C("univaRseg", as.double(rC), as.integer(rN), as.integer(maxseg), as.integer(maxseg), cost=double(rN*maxseg), pos=integer((rN-1)*maxseg),minC=double(rN*maxseg), segS=integer(rN*rN), PACKAGE = "biomvRCNS") # .C version

					rIdx<-mat2list(rRes$segS, rN)[[rN]]
					j<-r[c(1,rRegs[rIdx-1])] # col, segStart, original index, including 1
					i<-r[c(rRegs[rIdx-1]-1, length(r))] # segEnd, original index
				
					# the result here need to be put into the final returning value
					segStart[[c]]<-r[rRegs[rIdx-1]]			
					segMean[[c]]<-sapply(seq_len(rN), function(z) mean(x[i[z]:j[z],c]))	
					
					Ilist<-splitFarNeighbour(intStart=j, intEnd=i, xRange=ranges(xRange), maxgap=maxgap)
					tores<-GRanges(seqnames=as.character(seqs[s]), 
						IRanges(start=rep(start(xRange)[Ilist$IS], 1), end=rep(end(xRange)[Ilist$IE], 1)),
						strand=strand(xRange)[Ilist$IS], 
						SAMPLE=rep(xid[c], each=length(Ilist$IS)), 
						AVG=as.numeric(sapply(1:length(Ilist$IS),  function(t) apply(as.matrix(x[Ilist$IS[t]:Ilist$IE[t],c]), 2, avgFunc, avg.m=avg.m, trim=trim, na.rm=na.rm)))
					)
					mcols(tores)<-DataFrame(values(tores), STATE=sapply(1:length(tores), function(i) ifelse(values(tores)[i, 'AVG']>avgFunc(x[r,c], avg.m=avg.m, trim=trim, na.rm=na.rm), 'HIGH', 'LOW')), row.names = NULL)
					seqlevels(tores)<-seqlevels(xRange)
					res<-c(res, tores)
					message(sprintf("Step 2 merging for column %s from group %s complete.", c, g))
				} # end 2nd step if
			} # end c for
			message(sprintf("Building segmentation model for group %s complete.", g))
		} # end for g
		message(sprintf("Processing sequence %s complete.", seqs[s]))
	} # end for s

#	new("biomvRseg",
#    x = x,
#    segStart = segStart,
#    segMean = segMean,
#    group=grp,
#	family=family)

	values(xRange)<-DataFrame(x,  row.names = NULL)
	new("biomvRCNS",  
		x = xRange, res = res,
		param=list(maxk=tmaxk, maxseg=maxseg, maxbp=maxbp, family=family, penalty=penalty, grp=grp, cluster.m=cluster.m, twoStep=twoStep, segDisp=segDisp, useMC=useMC, useSum=useSum, comVar=comVar, na.rm=na.rm, avg.m=avg.m, trim=trim, tol=tol)
	)

}






##################################################
# segmentation utility functions
##################################################

regionSegCost<-function(x, maxk=NULL, segs=NULL, family=NULL, alpha=NULL, useSum=TRUE, useMC=FALSE, comVar=TRUE){
	# the same cost function for both initial DP segmentation and 2nd DP merging
	# segs, a vector of the starting index of each candidate region, except for the first position 1
	# check input x, convert to vector if necessary
	if (is.null(family) || (family != 'norm' && family != 'pois' && family != 'nbinom')) 
        stop("'family' must be specified, currently only 'norm' ,'pois' and 'nbinom' are supported !")
  
	if (!is.numeric(x) || !(is.vector(x) || is.matrix(x))) 
        stop("'x' must be a numeric vector or matrix.")
    if (is.vector(x)) {
        d <- 1
        r <- x
        if( family== 'norm'){
        	 q<- x*x
        }
        x<-matrix(x, ncol=d)
    } else {
    	d <- ncol(x)
        r <- rowSums(x)
        if( family== 'norm'){
       	 q <-rowSums(x*x)
        }
    }
    n<-length(r)
    naVal<-.Machine$double.xmax
    
    ## add a checking for useSum in norm models
    if(d==1 && !useSum){
    	useSum<-TRUE
    	message("For univariate normal data, 'useSum' reset to TRUE, though result should be identical.")
    }
    
    ## check input parameter maxk and segs
     if (is.null(segs)) {
     	if (is.null(maxk)){
     		# both not specified, offer a warning
     		warning(sprintf("Both 'maxk' and 'segs' are not specified, a full size %d x %d cost matrix would be generated.", n, n))
			maxk<-n
     	} else {
     		# cost for the fisrt DP seg step
			 if (!is.numeric(maxk) || (length(maxk) != 1) || (maxk <= 1) ||  (maxk > n)) 
			    stop(sprintf("'maxk' must be a single integer between 2 and the number of rows of 'x': %d.", n))
     	}
		segs<-seq_len(n-1)+1
		N<-n
     } else if ( is.numeric(segs) && min(segs)>1 && max(segs)<=n  ) {
     	## if no segs specified, treat every position individually, no grouping, defult for the 1st step
    	N<-length(segs)+1
     	if (!is.null(maxk)){
     		# both specified, offer a warning
     		warning(sprintf("Both 'maxk' and 'segs' are specified, maxk would be overridden with length(segs)+1=%d", N))
     	}
     	maxk<-N 
     } else {
     	stop(sprintf("'segs' must be a vector of integers between 2 and the number of rows of 'x': %d.", n))
     }
     
    ## check nb and alpha,  alpha has to be consistent with C
    if( family == 'nbinom' && (!is.matrix(alpha) || nrow(alpha) != maxk && ncol(alpha)!=N))
		stop(sprintf("'alpha' must be a matrix of %d x %d .", maxk, N))
     
    # all end position of each candidate region
	e<-c(segs-1, n)
    
    # sum(x) wrt segs, for tilingArray and likelihood ratio cost 1
	cr<-cumsum(r)
	crs<-rep(cr[e[1]],N)
	crs[2:N]<-cr[e[2:N]]-cr[e[1:(N-1)]]
	crss<-cumsum(crs)
	
	# generate the cost wrt segs and maxk
	C <- matrix(as.numeric(naVal), nrow = maxk, ncol = N)
	sk<-cumsum(c(segs-1,n)-c(1,segs)+1)
	k <- 1:maxk 
	
    if( family== 'norm'){
    	# sum(x^2) wrt segs for norm only
		cq<-cumsum(q)
		cqs<-rep(cq[e[1]],N)
		cqs[2:N]<-cq[e[2:N]]-cq[e[1:(N-1)]]
		cqss<-cumsum(cqs)
		
		## there is a difference between grand mean and mean vector, currently, we are using grand mean
		## a complete solution would be a sum vector within a sum matrix, and then sum its segments up
		if(useSum){
			if(comVar){
				C[, 1] <- cqss[k] - crss[k] * crss[k]/(sk[k]*d) # old tilingArray solution, assuming common variance and use grand sum
			} else {
				C[, 1] <- log((cqss[k] - crss[k] * crss[k]/(sk[k]*d))/sk[k])*sk[k] # likelihood ratio cost, assuming different variance and use grand sum
			}
		} else {
			## sum(x) wrt segs, for the new likelihood ration cost
			cx<-sapply(1:d, function(j) cumsum(x[,j]))
			cxs<-matrix(, N, d)
			cxs[1, ]<- cx[e[1], ]
			cxs[2:N, ]<- sapply(1:d, function(j) cx[e[2:N],j]- cx[e[1:(N-1)],j])
			cxss<-sapply(1:d, function(j) cumsum(cxs[,j]))
		
			if(comVar){
				C[, 1] <- cqss[k] - sapply(k, function(i) sum(cxss[i,]^2))/sk[k]	# likelihood ratio cost, column wise cumsum and assume common variance
			} else {
				C[, 1] <- log((cqss[k] - sapply(k, function(i) sum(cxss[i,]^2))/sk[k])/sk[k])*sk[k]	# new likelihood ratio cost, column wise cumsum and different variance
			}
		}
		for (k in 1:(ifelse(N==maxk,maxk-1,maxk)) ) {
		    i <- 1:(N - k)
		    cqssk <- cqss[i + k] - cqss[i]
		    skk<-sk[i+k]-sk[i]
	    	if(useSum){
	    		crssk <- crss[i + k] - crss[i]
	    		if(comVar){
	    			C[k, i+1] <- cqssk - crssk * crssk/(skk*d) # old tilingArray solution, use grand sum, common variace, and grand sum
	    		} else {
	    			C[k, i+1] <-  log((cqssk -  crssk * crssk/(skk*d))/skk)*skk #  likelihood ratio cost, assuming different variance and use grand sum
	    		}
	    	} else {
	    		cxssk<-matrix(t(sapply(i, function(z)  cxss[z + k,] - cxss[z,])), ncol=d, nrow=N-k)
	    		if(comVar){
	    			C[k, i+1] <- cqssk - sapply(i, function(z) sum(cxssk[z,]^2))/skk # likelihood ratio cost, column wise cumsum, common variance
	    		} else {
	    			C[k, i+1] <- log((cqssk - sapply(i, function(z) sum(cxssk[z,]^2))/skk)/skk)*skk # likelihood ratio cost, column wise cumsum, variance change
	    		}
	    	}
		}
		if(N==n) C[1,]<-naVal #  give NA/Inf to the cost of single point in the first step, to avoid a positive epsilon or NaN, thus exclude consecutive changes
		
    } else if (family== 'pois') { # for poisson
    	C[, 1] <- crss[k]*log(crss[k]/(sk[k]*d))
		for (k in 1:(ifelse(N==maxk,maxk-1,maxk)) ) {
		    i <- 1:(N - k)
		    crssk <- crss[i + k] - crss[i]
		    skk<-sk[i+k]-sk[i]
		    C[k, i+1] <- crssk*log(crssk/(skk*d))
		}
		#C[1,]<-NA ## experimental, to avoid consecutive changes, doesnot work well, and inf cause 
		C<- -C
		
    }  else { # for negative bionomial
    	if(all(alpha==0)){
    		C[, 1] <- sapply(k, function(z) NBlogL1stTerm(c(x[seq_len(e[z]),]), alpha[z,1])) + crss[k]*log(crss[k]/(sk[k]*d)) - log((1+alpha[,1]*crss[k]/(sk[k]*d))^(crss[k]+(sk[k]*d)/alpha[,1]))
    	} else if(useMC) {
    		 C[, 1] <- unlist(mclapply(sapply(k, function(y) list(y)), function(z) NBlogL1stTerm(c(x[seq_len(e[z]),]), alpha[z,1]))) + crss[k]*log(crss[k]/(sk[k]*d)) - (crss[k]+(sk[k]*d)/alpha[,1])*log(1+alpha[,1]*crss[k]/(sk[k]*d))
    	} else {
    		C[, 1] <- sapply(k, function(z) NBlogL1stTerm(c(x[seq_len(e[z]),]), alpha[z,1])) + crss[k]*log(crss[k]/(sk[k]*d)) - (crss[k]+(sk[k]*d)/alpha[,1])*log(1+alpha[,1]*crss[k]/(sk[k]*d))
    	}  	
    	if(!useSum){
    		C[, 1] <- C[, 1]/(sk[k]*d)	
    	}				
		for (k in 1:(ifelse(N==maxk,maxk-1,maxk)) ) {
			i <- 1:(N - k)
			crssk <- crss[i + k] - crss[i]
			skk<-sk[i+k]-sk[i]
			if(all(alpha==0)){
				C[k, i+1] <- sapply(i, function(z) NBlogL1stTerm(c(x[segs[z+k-1]:e[z+k],]), alpha[k, z+1])) + crssk*log(crssk/(skk*d)) - log((1+alpha[k, i+1]*crssk/(skk*d))^(crssk+(skk*d)/alpha[k, i+1]))
			} else if(useMC) {
				C[k, i+1] <- unlist(mclapply(sapply(i, function(y) list(y)), function(z) NBlogL1stTerm(c(x[segs[z+k-1]:e[z+k],]), alpha[k, z+1]))) + crssk*log(crssk/(skk*d)) - (crssk+(skk*d)/alpha[k, i+1])*log(1+alpha[k, i+1]*crssk/(skk*d))
			}	else {
				C[k, i+1] <- sapply(i, function(z) NBlogL1stTerm(c(x[segs[z+k-1]:e[z+k],]), alpha[k, z+1]))+ crssk*log(crssk/(skk*d)) - (crssk+(skk*d)/alpha[k, i+1])*log(1+alpha[k, i+1]*crssk/(skk*d))
			}
			if(!useSum){
				C[k, i+1] <- C[k, i+1]/(skk*d)		
			} 				
		}
		C<- -C
    }
	return(C)
}


NBlogL1stTerm<-function(x, alpha){
	# here alpha is a single value, estimated from the same segment of the x matrix
	# log first and sum latter
	if(alpha==0){
		0
	} else {
		sum(sapply(x, function(y) if(y==0) 1 else sum(sapply(seq_len(y)-1, function(z) log(z*alpha+1)))))
	}
}


regionSegAlphaNB<-function(x, maxk=NULL, segs=NULL, useMC=FALSE, tol=1e-06){
	# generate alpha matrix for NB model, in both steps of segmentation cost prep.
	# segs, a vector of the starting index of each candidate region, except for the first position 1
	# check input x, convert to vector if necessary
	if (!is.numeric(x) || !(is.vector(x) || is.matrix(x))) 
        stop("'x' must be a numeric vector or matrix.")
    if (is.vector(x)) {
        d <- 1
        r <- x
        x<-matrix(x, ncol=d)
    } else {
    	d <- ncol(x)
        r <- rowSums(x)
    }
    n<-length(r)
       
   if(useMC & length(find.package('parallel', quiet=T))==0) {
		warning("'parallel' is not found, use normal 'apply' function!!!")
		useMC<-FALSE
	} else if (useMC){
		require(parallel)
	}
        
    ## check input parameter maxk and segs
     if (is.null(segs)) {
     	if (is.null(maxk)){
     		# both not specified, offer a warning
     		warning("Both 'maxk' and 'segs' are not specified, a full size n x n cost matrix would be generated.")
			maxk<-n
     	} else {
     		# cost for the fisrt DP seg step
			 if (!is.numeric(maxk) || (length(maxk) != 1) || (maxk <= 1) ||  (maxk > n)) 
			    stop(sprintf("'maxk' must be a single integer between 2 and the number of rows of 'x': %d.", n))
     	}
		segs<-seq_len(n-1)+1
		N<-n
     } else if ( is.numeric(segs) && min(segs)>1 && max(segs)<=n  ) {
     	## if no segs specified, treat every position individually, no grouping, defult for the 1st step
    	N<-length(segs)+1
     	if (!is.null(maxk)){
     		# both specified, offer a warning
     		warning("Both 'maxk' and 'segs' are specified, maxk would be overrided with lenth(segs)+1")
     	}
     	maxk<-N 
     } else {
     	stop(sprintf("'segs' must be a vector of integers between 2 and the number of rows of 'x': %d.", n))
     }
     # all end position of each candidate region
	e<-c(segs-1, n)
	
	# generate alpha wrt segs and maxk
	alpha <- matrix(as.numeric(NA), nrow = maxk, ncol = N)
	k <- 1:maxk 

	if(useMC){
		alpha[, 1] <-unlist(mclapply(sapply(k, function(y) list(y)), function(z) estimateSegCommonDisp(x[seq_len(e[z[1]]),])))	
	} else {
		alpha[, 1] <-sapply(k, function(z) estimateSegCommonDisp(x[seq_len(e[z]),]))		
	}	
	for (k in 1:(ifelse(N==maxk,maxk-1,maxk)) ) {
		## need a provide a vector for alpha, this is for each segment length k,  estimated from the all segments start from i to j	
	    i <- 1:(N - k)  
		if(useMC){
			alpha[k, i+1] <-unlist(mclapply(sapply(i, function(y) list(y)), function(z) estimateSegCommonDisp(x[segs[z[1]+k-1]:e[z[1]+k],])))
		} else {
			alpha[k, i+1]<-sapply(i, function(z) estimateSegCommonDisp(x[segs[z+k-1]:e[z+k], ]))
		}
	}
	return(alpha)
}



estimateSegCommonDisp<-function(xSeg,tol=1e-06){
	# function imported from edgeR
	# xSeg is a matrix object or a vector
#		disp <- 0
#		for(i in 1:2) {
#			delta <- optimize(commonCondLogLikDerDelta, interval=c(1e-4,100/(100+1)), tol=tol, maximum=TRUE, y=list(xSeg), der=0)
#			delta <- delta$maximum
#			disp <- delta/(1-delta)
#		}
#		disp
		x<-as.numeric(xSeg)
		wt<-rep(1, length(x))
		nbinomCLLDD(x, wt, 0)$par[1]
}
Any scripts or data that you put into this service are public.
biomvRCNS documentation built on Nov. 8, 2020, 6:49 p.m.
rdrr.io home R language documentation Run R code online
CRAN packages Bioconductor packages R-Forge packages GitHub packages
Note that we can't provide technical support on individual packages. You should contact the package authors for that.
biomvRCNS
Copy Number study and Segmentation for multivariate biological data

R/biomvRseg.R
In biomvRCNS: Copy Number study and Segmentation for multivariate biological data

Defines functions estimateSegCommonDisp regionSegAlphaNB NBlogL1stTerm regionSegCost biomvRseg

Documented in biomvRseg regionSegAlphaNB regionSegCost

Try the biomvRCNS package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

biomvRCNS Copy Number study and Segmentation for multivariate biological data

R/biomvRseg.R In biomvRCNS: Copy Number study and Segmentation for multivariate biological data

Defines functions estimateSegCommonDisp regionSegAlphaNB NBlogL1stTerm regionSegCost biomvRseg

Documented in biomvRseg regionSegAlphaNB regionSegCost

Try the biomvRCNS package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

biomvRCNS
Copy Number study and Segmentation for multivariate biological data

R/biomvRseg.R
In biomvRCNS: Copy Number study and Segmentation for multivariate biological data