R/msscan_tools.R

Defines functions `dchi` `compute.var` `matrix.max` `computeBeta` `computeMoments` `vu` `computeTiltDirect` `ComputeZ.fromS.R.partial` `ComputeZ.fromS.R` `computeZ.onechange` `computeZ.onechange.sample` `computeZ.squarewave.sample` `fcompute.max.Z` `fscan.max` `getCutoffMultisampleWeightedChisq` `pmarg.sumweightedchisq` `pvalueMultisampleWeightedChisq` `mscbs.classify` `fmscbs`

`dchi` <- function(y,N)
{
    fy <- (1-N/2)*log(2) + (N-1)*log(y) - y^2/2 - lgamma(N/2)
    exp(fy)
}


`compute.var` <- function(y)
{
    T <- nrow(y)
    T <- ifelse(T %% 2==0, T, T-1)
    y.diff <- y[seq(1,T-1,by=2),] - y[seq(2,T,by=2),]
    y.var  <- apply(y.diff, 2, mad)^2/2
    return( y.var )
}


`matrix.max` <- function(Z)
{
    ## If Z has NA values treat as -Inf
    na.inds = which(is.na(Z),arr.ind=TRUE)
    Z[na.inds] = -Inf

    row.max.col = max.col(Z, ties.method=c("random", "first", "last"))
    max.row = which.max(Z[cbind(1:nrow(Z),row.max.col)])
    c(max.row, row.max.col[max.row])
}


`computeBeta` <- function(ALPHA)
{
    w<-function(u){
         exp(u^2/2)/(ALPHA+exp(u^2/2))
    }
    integrand<-function(u){
        u^2*w(u)^2*(2*u^4*w(u)*(1-w(u))+5*u^2*w(u)-3*u^2-2)*dchi(u,1)
    }
        
    numerator=integrate(integrand,lower=0,upper=10)
    
    g<-function(u){
         u^2*exp(u^2/2)/(ALPHA+exp(u^2/2))
    }
    g.moments=computeMoments(g,0)
    numerator$value/(2*g.moments$psidotdot)
}


`computeMoments` <- function(g, theta)
{

    INTLIM.THRESH = 0.001
    psidottop.int <- function(u){ g(u)*exp(theta*g(u))*exp(-(u)^2/2) }    
    for( INTLIM in 10:50 ){
        temp=psidottop.int(INTLIM)
        if (is.na(temp)){
            INTLIM=INTLIM-1
            break
        }
        if (temp<INTLIM.THRESH) break
    }
    psidottop = integrate(psidottop.int,lower=-INTLIM,upper=INTLIM)
    psidottop = psidottop$value/sqrt(2*pi)

    psidotbot.int =  function(u){ exp(theta*g(u))*exp(-(u)^2/2)}  
    for( INTLIM in 10:50 ){
        temp=psidotbot.int(INTLIM)
        if( is.na(temp)){
            INTLIM=INTLIM-1
            break
        }
        if( temp<INTLIM.THRESH ) break
    }
    psidotbot = integrate(psidotbot.int, lower=-INTLIM, upper=INTLIM)
    psidotbot = psidotbot$value/sqrt(2*pi)
    psidot = psidottop/psidotbot
    
    EgUsq.int<-function(u){ g(u)^2*exp(theta*g(u))*exp(-(u)^2/2) }    
    for( INTLIM in 10:50 ){
        temp=EgUsq.int(INTLIM)
        if( is.na(temp)){
            INTLIM=INTLIM-1
            break
        }
        if( temp<INTLIM.THRESH) break
    }
    EgUsq =  integrate(EgUsq.int,lower=-INTLIM,upper=INTLIM)
    EgUsq = EgUsq$value/sqrt(2*pi)

    psidotdot = (EgUsq*psidotbot - psidottop^2)/(psidotbot^2);
    psi = psidotbot

    list(psi=psi,psidot=psidot,psidotdot=psidotdot)
}


`vu` <- function(x, maxn=1000, do.approx=(abs(x)<0.2))
{
    x=as.matrix(x)
    vux = matrix(0,nrow=nrow(x),ncol=ncol(x))

    if(is.logical(do.approx)){
        if(sum(do.approx) > 0)   vux[do.approx] = exp(-x[do.approx]*0.583)
    } else if(length(do.approx) > 0){
        vux[do.approx] = exp(-x[do.approx]*0.583)
    }
  

    if (sum(do.approx)<length(x)){
        notdo.approx.ix = which(!do.approx)
        n = matrix(c(1:maxn), nrow=1, ncol=maxn)
        summands = pnorm( -0.5*matrix(x[notdo.approx.ix], nrow=length(notdo.approx.ix), ncol=1) %*% sqrt(n) )/
                   matrix(rep(n,length(notdo.approx.ix)), ncol=length(n), byrow=TRUE)
        expterm = -2*apply(summands,1,"sum")
        vux[notdo.approx.ix] = (2/x[notdo.approx.ix]^2)*exp(expterm)
    }

    vux
}


`computeTiltDirect` <- function(b,g,THRESH,theta0) 
{
    theta = theta0 # if start theta at 0, then dh(theta)=0 at the first step for g(u)=u^2.
    prevtheta = Inf
    prevprevtheta = Inf
    thetarec = theta
    
    while( abs(theta-prevtheta)>THRESH && abs(theta-prevprevtheta)>THRESH ){
        
        g.moments <- computeMoments(g,theta)
        htheta = g.moments$psidot-b
        dhtheta = g.moments$psidotdot

        ## cat("theta=", theta," htheta=",htheta,".\n",sep="")
        thetarec = c(thetarec, theta)
        prevprevtheta = prevtheta
        prevtheta = theta
        theta = prevtheta - htheta/dhtheta
    }

    theta=prevtheta
    psi = g.moments$psi

    list(theta=theta,psi=psi,psidot=g.moments$psidot,psidotdot=g.moments$psidotdot)
}


`ComputeZ.fromS.R.partial` <- function(this.S, this.SST, this.imap, start.inds, end.inds, ALPHA, MIN.SNPs)
{
    T <- nrow(this.S)
    N <- ncol(this.S)
    dfnum <- 1

    log.alpha <- log(ALPHA)
    g <- function(u){
        i2 <- (u < 10 + log.alpha)
        r1 <- u
        r1[i2] <- r1[i2] * exp(u[i2]/2)/(ALPHA+exp(u[i2]/2))
        return( r1 )
    }
    
    Z <- matrix(nrow=length(start.inds), ncol=length(end.inds), data=0)

    totalsnps <- this.imap[T,] - this.imap[1,]
    dfden     <- totalsnps-2
    for (i in 1:length(start.inds)) {
        for (j in 1:length(end.inds)) {
            st <- start.inds[i]
            ed <- end.inds[j]
            if (st > 0 && st < ed && ed<T) {
                nsnps <- this.imap[ed,] - this.imap[st,]
                diff1 <- this.S[ed,] - this.S[st,]
                SSb <- nsnps*(diff1/nsnps - this.S[T,]/totalsnps)^2
                SSb <- SSb + (totalsnps-nsnps)*((this.S[T,]-diff1)/(totalsnps-nsnps)-this.S[T,]/totalsnps)^2
                SSw <- this.SST - SSb
                U <- (SSb/dfnum)/(SSw/dfden)
                set.to.zero <- which(nsnps<MIN.SNPs | ((totalsnps-nsnps)<MIN.SNPs))
                U[set.to.zero] <- 0
                Z[i,j] <- sum(g(U))
            }
        }
    }

    return(Z)
}


`ComputeZ.fromS.R` <- function(this.S, this.SST, this.imap, win, ALPHA, MIN.SNPs)
{
    T <- nrow(this.S)
    N <- ncol(this.S)
    dfnum <- 1

    win = min(win, T-1)
    if (win==0) { 
        return(NULL)
    }

    log.alpha <- log(ALPHA)
    g <- function(u) {
        i2 <- (u < 10 + log.alpha)
        r1 <- u
        r1[i2] <- r1[i2] * exp(u[i2]/2)/(ALPHA+exp(u[i2]/2))
        return( r1 )
    }
    
    U <- matrix(nrow=T, ncol=win, data=0)
    Z <- U
    for (i in 1:N) {
        totalsnps <- this.imap[T,i] - this.imap[1,i]
        dfden     <- totalsnps - 2
        for (k in 1:win) {
            nsnps <- this.imap[(k+1):T,i] - this.imap[1:(T-k),i]
            diff1 <- this.S[(k+1):T,i] - this.S[1:(T-k),i]
                
            SSb <- nsnps*(diff1/nsnps - this.S[T,i]/totalsnps)^2
            SSb <- SSb + (totalsnps - nsnps)*((this.S[T,i] - diff1)/(totalsnps - nsnps) - this.S[T,i]/totalsnps)^2
            SSw <- this.SST[i] - SSb
            U[1:(T-k),k] <- (SSb/dfnum)/(SSw/dfden)
                
            set.to.zero <- which(nsnps<MIN.SNPs | ((totalsnps-nsnps)<MIN.SNPs))
            U[set.to.zero,k] <- 0
        }
        Z <- Z + g(U)
    }
    return(Z)
}


`computeZ.onechange` <- function(y, t, y.var)
{
    T    <- ncol(y)
    St   <- apply(y[,1:t, drop=FALSE], 1, sum)
    ST   <- apply(y, 1, sum)
    Zsq  <- (St - (t/T)*ST)^2/(y.var*t*(1-t/T))
    sumZ <- sum(Zsq)
    pval <- pchisq(sumZ, nrow(y), lower.tail=FALSE)
  
    list(sumZ=sumZ, pval=pval)
}


`computeZ.onechange.sample` <- function(y, t, y.var)
{
    T    <- ncol(y)
    St   <- apply(y[,1:t, drop=FALSE], 1, sum)
    ST   <- apply(y,1,sum)
    Zsq  <- (St - (t/T)*ST)^2/(y.var*t*(1-t/T))
    pval <- pchisq(Zsq, 1, lower.tail=FALSE)

    list(Zsq=Zsq, pval=pval)
}


`computeZ.squarewave.sample` <- function(this.y, seg, y.var)
{
    T  <- ncol(this.y)
    Ss <- apply(as.matrix(this.y[,1:seg[1]], nrow=nrow(this.y), ncol=seg[1]), 1, sum)
    St <- apply(this.y[,1:seg[2]], 1, sum)
    ST <- apply(this.y,1,sum)
    k  <- seg[2]-seg[1]
    Zsq <- (St-Ss - (k/T)*ST)^2/(y.var*k*(1-k/T))
    pval <- pchisq(Zsq, 1, lower.tail=FALSE)

    list(Zsq=Zsq, pval=pval)
}


`fcompute.max.Z` <- function(this.y, win, y.var, ALPHA, MIN.SNPs, SINGLECHANGE.THRESH=0.0001)
{
    N      <- ncol(this.y)
    this.T <- nrow(this.y)

    if (this.T < 2*MIN.SNPs) {
        bestZ  <- NA
        bestchpt <- c(NA, NA)
    } else {
        this.S   <- apply(this.y,2,cumsum)
        this.SST <- apply(this.y,2,var)*(this.T-1)
        
        ## Find (t1, t2] that maximizes Z using filtered scan
        temp <- fscan.max(this.S, this.SST, this.imap=NULL, MIN.SNPs=MIN.SNPs, ALPHA=ALPHA)
        bestchpt <- temp$seg
        bestZ    <- temp$maxZ
        
        ## Test the left and right change-point individually
        if (bestZ > 0) {
            if (bestchpt[1]==0) { ## change 1 => 0
                pval.L <- 1
            } else {
                pval.L <- computeZ.onechange(t(this.y[1:bestchpt[2],]), bestchpt[1], y.var)$pval
            }
            
            if (bestchpt[2]==this.T) {
                pval.R <- 1
            } else {
                pval.R <- computeZ.onechange(t(this.y[(bestchpt[1]+1):this.T,]), bestchpt[2]-bestchpt[1], y.var)$pval
            }
            
            if ( (pval.L > SINGLECHANGE.THRESH || bestchpt[1] < MIN.SNPs) &&
                 (pval.R <= SINGLECHANGE.THRESH && (this.T - bestchpt[2]) >= MIN.SNPs) ) {
                bestchpt <- c(bestchpt[2], NA)
            } else {
                if ( (pval.R > SINGLECHANGE.THRESH || (this.T - bestchpt[2]) < MIN.SNPs) &&
                     (pval.L <= SINGLECHANGE.THRESH && bestchpt[1] >= MIN.SNPs) ) {
                    bestchpt <- c(bestchpt[1], NA)
                } else {
                    if ( (pval.L > SINGLECHANGE.THRESH || bestchpt[1] < MIN.SNPs) &&
                         (pval.R > SINGLECHANGE.THRESH || (this.T - bestchpt[2]) < MIN.SNPs) ) {
                        bestchpt <- c(NA, NA)
                        bestZ <- NA
                    }
                }
            }     
        
        } else {
            bestchpt <- c(NA, NA)
            bestZ <- NA
        }
    }
    
    list(bestchpt=bestchpt, bestZ=bestZ)
}


`fscan.max` <- function(this.S, this.SST, y.var=NULL, this.imap=NULL, 
                        f=NULL, MIN.SNPs=2, ALPHA=0, beta=NULL, verbose=FALSE)
{
    T <- nrow(this.S)   ## Number of SNPs
    N <- ncol(this.S)   ## Number of samples
    
    if (T < 2*MIN.SNPs) {
        maxZ <- NA
        seg  <- c(NA, NA)
    } else {

        if (is.null(f)) {
            f.power <- seq(-floor(log10(T))+1, 0, 1)
            f <- 10^f.power
        }
      
        if (T<1000) f <- 1  ## even if a value for f is passed in, do not allow filtering for short sequences
      
        f <- sort(f)
        if (f[length(f)] != 1) f <- c(f,1)
        R <- length(f)
        L <- ceiling(f[2:R]/f[1:(R-1)])
        L <- c(ceiling(T*f[1]), L)
      
        chpts   <- matrix(ncol=2, nrow=0)
        chpts.Z <- matrix(nrow=1, ncol=0)
    
        if (is.null(this.imap)) {
            this.imap <- matrix(1:T, nrow=T, ncol=N)
        }
    
        g2 <- function(u) { u^2*exp(u^2/2)/(ALPHA+exp(u^2/2)) }
        g2.moments <- computeMoments(g2,0)
        if(is.null(beta)) beta <- rep(1,N)
      
        for (r in 1:R) {
            stepsize <- floor(1/f[r])
        
            t <- seq(0, T, stepsize)         ## t is the filtered anchor set
            if (t[length(t)]<T) t <- c(t,T)  ## always include the last datapoint in the set
    
            if (min(t)==0) {
            	f.S    <- rbind(c(0,0), this.S[t,])
            	f.imap <- rbind(c(0,0), this.imap[t,])
            } else {
            	f.S    <- this.S[t,]
            	f.imap <- this.imap[t,]
            }
                
            ## Refine previously found change-points using the denser anchor set
            if (nrow(chpts)>0) {
                for (i in 1:nrow(chpts)) {
                    ind.L <- chpts[i,1] %/% stepsize
                    ind.R <- chpts[i,2] %/% stepsize
            
                    check.win  <- f[r]/f[r-1]
                    start.inds <- (ind.L - check.win):(ind.L + check.win) 
                    end.inds   <- (ind.R - check.win):(ind.R + check.win)

                    Z.part <- ComputeZ.fromS.R.partial(f.S, this.SST, f.imap, start.inds, end.inds, ALPHA, MIN.SNPs)
                    Z.part <- (Z.part - N*g2.moments$psidot)/sqrt(g2.moments$psidotdot*N)          
    
                    maxind      <- matrix.max(Z.part)
                    improved.cp <- c(t[start.inds[maxind[1]]], t[end.inds[maxind[2]]])
                    improved.Z  <- Z.part[maxind[1], maxind[2]]
                    if (verbose) {
                        cat("fscan.max: Changepoints (", chpts[i,1], ", ",chpts[i,2], 
                            ") refined to (", improved.cp[1], ", ",improved.cp[2], 
                            "). Z-score improved from ", chpts.Z[i], " to ", improved.Z,"\n", sep="")
                    }
                    chpts[i,]  <- improved.cp
                    chpts.Z[i] <- improved.Z
                } ## for i loop
            } ## if (nrow(chpts)>0)
        
            ## Do scan with min window size MIN.SNPS and max window size L[r]
            Z <- ComputeZ.fromS.R(f.S, this.SST, f.imap, L[r], ALPHA, MIN.SNPs)
            Z <- (Z - N*g2.moments$psidot)/sqrt(g2.moments$psidotdot*N)      
    
            maxind  <- matrix.max(Z)
            newchpt <- c(t[maxind[1]], t[maxind[1] + maxind[2]])
            newZ    <- Z[maxind[1], maxind[2]]
            if (verbose) {
                cat("fscan.max: Change-point found in round ", r, ":\n")
                print(c(newchpt, newZ), digits=1)
            }
            
            if (nrow(chpts)==0) {
                chpts <- rbind(chpts, newchpt)
                if (length(chpts) == 2) chpts <- matrix(nrow=1, ncol=2, data=chpts)      
                chpts.Z <- c(chpts.Z, newZ)
            }
            if (nrow(chpts)>=1) {
            	tmp <- unique(rbind(chpts, newchpt))
            	if (nrow(tmp) == nrow(chpts) + 1) {
            		chpts <- rbind(chpts, newchpt)
            		chpts.Z <- c(chpts.Z, newZ)
            	}
            	## otherwise the newchpt have been detected in previous round
            }
            
        } ## for r loop
    
        ## Take the maximum over the rounds, return chpt and Z score
        max.ind <- which.max(chpts.Z)
        maxZ    <- chpts.Z[max.ind]
        seg     <- chpts[max.ind,]
    } ## if T<=MIN.SNPS
    
    list(maxZ = maxZ, seg = seg)
}


`getCutoffMultisampleWeightedChisq` <- function(pval, m, delta, win, N, alpha)
{
    
    cat("Computing threshold for weighted chi-square...\n")
    THRES <- 0.1*pval
    currb <- 1
    prevsmallerb <- currb
    prevlargerb  <- 200
    currpval <- 1
    
    while( abs(currpval-pval)>THRES ) {        
        if( currpval > pval){
            ## need to increase b.
            prevsmallerb <- currb
            currb <- currb + (prevlargerb-currb)/2
        } else {
            ## need to decrease b.
            prevlargerb <- currb
            currb <- currb - (currb-prevsmallerb)/2
        }

        currpval <- pvalueMultisampleWeightedChisq(currb, m, delta, win, alpha, N)
    }    
    
    return( currb )
}


`pmarg.sumweightedchisq` <- function(b, ALPHA, N)
{
    g <- function(u) {
         u^2*exp(u^2/2)/(ALPHA + exp(u^2/2))
    }
    g.moments = computeMoments(g,0)
    gnormed <- function(u) {
    	(g(u)-g.moments$psidot)/sqrt(g.moments$psidotdot)
    }
    theta0 = sqrt(g.moments$psidotdot)/2   ## need theta/sqrt(psidotdot) < 1/2 for psi(theta)<infty
    b0 = b/sqrt(N)
    THRESH = 0.0001
    marg = computeTiltDirect(b0, gnormed, THRESH, theta0)
    thetabN = marg$theta*sqrt(N)
    pmarg = exp(-thetabN*b + N*log(marg$psi))/sqrt(2*pi*marg$psidotdot)
    pmarg
}


`pvalueMultisampleWeightedChisq` <- function(b, m, delta, win, ALPHA, N)
{
    delta1 = min(1, win/m)
    beta = computeBeta(ALPHA)

    integrand<-function(u){
        vu(sqrt(2)*b*sqrt(beta)/(sqrt(m)*sqrt(u*(1-u))))^2/(u^2*(1-u))
    }
    
    integral <- integrate(integrand, lower=delta, upper=delta1)
    pmarg    <- pmarg.sumweightedchisq(b, ALPHA, N)

    pval <- b^3*beta^2*pmarg*integral$value
    pval
}


`mscbs.classify` <- function(this.y, seg, y.var, CHISQ.PVAL.THRESH=0.001, MIN.REQ.ABSDIFF=NA, MIN.SUFF.ABSDIFF=NA)
{    
    N <- ncol(this.y)
    T <- nrow(this.y)

    if (is.na(MIN.REQ.ABSDIFF)) {
        MIN.REQ.ABSDIFF <- 0.5*sqrt(y.var)
    }
    if (is.na(MIN.SUFF.ABSDIFF)) {
        MIN.SUFF.ABSDIFF <- 5*sqrt(y.var)
    }
    
    ## segment: (seg[1],seg[2]]    
    if (is.na(seg[1]) && is.na(seg[2])) {
        ## Invalid change-point
        sampleswithsegment <- rep(0,N)
        this.yhat <- matrix(0, nrow=nrow(this.y), ncol=ncol(this.y))
    } else {
        if (is.na(seg[2])) {
            ## Single change-point
            sample.pval <- computeZ.onechange.sample(t(this.y), seg[1], y.var)$pval
            pass1 <- sample.pval<CHISQ.PVAL.THRESH
            pass2 <- abs(apply(as.matrix(this.y[1:seg[1],]),2,mean) - apply(as.matrix(this.y[(seg[1]+1):T,]),2,mean)) > MIN.SUFF.ABSDIFF
            cut1  <- abs(apply(as.matrix(this.y[1:seg[1],]),2,mean) - apply(as.matrix(this.y[(seg[1]+1):T,]),2,mean)) < MIN.REQ.ABSDIFF
            sampleswithsegment <- (pass1 | pass2) & (!cut1)
            this.yhat <- matrix(0,nrow=nrow(this.y),ncol=ncol(this.y))
            this.yhat[1:seg[1], sampleswithsegment] <- 
                matrix(rep(apply(as.matrix(this.y[1:seg[1],sampleswithsegment],nrow=seg[1]),2,mean), seg[1]),
                       nrow=seg[1], byrow=TRUE)
            this.yhat[(seg[1]+1):T,sampleswithsegment] <- 
                matrix(rep(apply(as.matrix(this.y[(seg[1]+1):T,sampleswithsegment],nrow=T-seg[1]),2,mean), T-seg[1]), 
                       nrow=T-seg[1], byrow=TRUE)        

        } else {
            ## Two change-points, square wave change, see who has change in middle
            sample.pval <- computeZ.squarewave.sample(t(this.y), seg, y.var)$pval
            pass1 <- sample.pval < CHISQ.PVAL.THRESH
            pass2 <- abs( apply(as.matrix(this.y[(seg[1] + 1):seg[2],]),2,mean) - 
                          apply(as.matrix(this.y[c(1:seg[1],(seg[2]+1):T),]),2,mean) ) > MIN.SUFF.ABSDIFF
            cut1  <- abs( apply(as.matrix(this.y[(seg[1] + 1):seg[2],]),2,mean) - 
                        apply(as.matrix(this.y[c(1:seg[1],(seg[2]+1):T),]),2,mean) ) < MIN.REQ.ABSDIFF
            sampleswithsegment <- (pass1 | pass2) & (!cut1)
            this.yhat <- matrix(0, nrow=nrow(this.y), ncol=ncol(this.y))
            this.yhat[(seg[1]+1):seg[2], sampleswithsegment] <- 
                matrix(rep(apply(as.matrix(this.y[(seg[1] + 1):seg[2], sampleswithsegment], nrow=seg[2]-seg[1]),2,mean), seg[2]-seg[1]), 
                       nrow=seg[2]-seg[1], byrow=TRUE)
            this.yhat[c(1:seg[1], (seg[2] + 1):T),sampleswithsegment] <- 
                matrix(rep(apply(as.matrix(this.y[c(1:seg[1], (seg[2] + 1):T), sampleswithsegment], nrow=T-(seg[2]-seg[1])),2,mean),T-(seg[2]-seg[1])),
                       nrow=T-(seg[2]-seg[1]),byrow=TRUE)
        }
    }
    
    list(carriers=sampleswithsegment, yhat=this.yhat)
}


`fmscbs` <- function(y, 
                     win, 
                     f = c(0.01, 0.1, 1), 
                     MIN.SNPs = 3, 
                     ALPHA = 0, 
                     GLOBAL.PVAL.CUTOFF = 0.0001, 
                     MAX.CHPTS = NA, 
                     WCHISQ.CUTOFF = NA)
{

## -------------------------------------------------------------
##     win:  maximum length of CNV (can be set to nrow(y))
##     MIN.SNPs: minimum length of CNV (can be set to 1).
##     GLOBAL.PVAL.CUTOFF:  Stops segmenting if pval falls below this.
##     MAX.CHPTS:  Stops segmenting after a maximum number of 
##                 change-points is reached.
##     plots:  If true, draws heatmap after every split.  
##             If your data set size not too large, this is
##             useful for tracking progress.  But plotting takes a long time
##             and the program runs a lot faster when plots=FALSE.
##
##   If your data is not properly normalized, or if you want a really sparse
##   summary (i.e. ignore everything except for the most significant changes),
##   it may be a good idea to use MAX.CHPTS to limit the number of recursions.
##
## --------------------------------------------------------------

    N <- ncol(y)
    T <- nrow(y)
    DELTA <- MIN.SNPs/T
  
    if(is.na(MAX.CHPTS)) MAX.CHPTS <- floor(T/MIN.SNPs)
  
    if(is.na(WCHISQ.CUTOFF)) {
        WCHISQ.CUTOFF <- getCutoffMultisampleWeightedChisq(pval = GLOBAL.PVAL.CUTOFF, 
                                                           m = T, 
                                                           delta = DELTA, 
                                                           win = win, 
                                                           N = N, 
                                                           alpha = ALPHA)
        cat("MSCBS: weighted chisquare cutoff = ", WCHISQ.CUTOFF, "\n")
    }

    ## initialize
    y.var <- compute.var(y)
    yhat  <- matrix(rep(apply(y,2,mean),T), nrow=T, byrow=TRUE)
    y.r   <- y - yhat
  
    chpts <- c(0, T)
    bestZ <- fcompute.max.Z(y.r, win, y.var, ALPHA, MIN.SNPs)
    best.subchpt <- matrix(bestZ$bestchpt, ncol=1, nrow=2)
    best.Z       <- bestZ$bestZ

    splitnum  <- 0
    chpt.hist <- vector("list", MAX.CHPTS)
        
    while (TRUE) {
        max.Z      <- max(best.Z, na.rm=TRUE)
        max.region <- which.max(best.Z)
    
        if (max.Z < WCHISQ.CUTOFF) {
            cat("Maximum Z-score is ", max.Z, 
                ", which does not exceed cutoff of ", WCHISQ.CUTOFF, 
                ". Segmentation finished.\n", sep="")
            break
        }

        if (length(chpts) > MAX.CHPTS + 2) {
            cat("Maximum number of change-points reached. Segmentation finished.\n")
            break
        }

        if (length(max.region)==0) {
            cat("Optimal region has no valid change-points. Segmentation finished.\n")      
            break
        }

        if (is.na(best.subchpt[1, max.region])) {
            cat("Optimal region has no valid change-points. Segmentation finished.\n")      
            break
        }
        
        splitnum <- splitnum + 1
        newchpt  <- c(best.subchpt[1, max.region], best.subchpt[2, max.region])

        ## Classify samples and update yhat
        y.r.classify <- mscbs.classify(y.r[(chpts[max.region]+1):chpts[(max.region+1)],], 
                                       newchpt - chpts[max.region], y.var, CHISQ.PVAL.THRESH=0.001)
        y.r.hat <- matrix(0, nrow=T, ncol=N)
        y.r.hat[(chpts[max.region]+1):chpts[(max.region+1)],] <- y.r.classify$yhat  ## add +1 to chpts[max.region]
        yhat <- yhat + y.r.hat
        y.r <- y.r - y.r.hat    

        if (!is.na(newchpt[2])) { ## The added change consistes of two change-points
            cat("Split ", splitnum, ": ", newchpt[1], ", ",newchpt[2], ", Z-score = ", max.Z, ".\n", sep="")
            y.r.L <- y.r[(chpts[max.region]+1):newchpt[1],]
            y.r.M <- y.r[(newchpt[1]+1):newchpt[2],]
            y.r.R <- y.r[(newchpt[2]+1):chpts[max.region+1],]

            bestZ.L <- fcompute.max.Z(y.r.L, win, y.var, ALPHA, MIN.SNPs)
            bestZ.M <- fcompute.max.Z(y.r.M, win, y.var, ALPHA, MIN.SNPs)
            bestZ.R <- fcompute.max.Z(y.r.R, win, y.var, ALPHA, MIN.SNPs)
            best.Z.new <- c(bestZ.L$bestZ, bestZ.M$bestZ, bestZ.R$bestZ)
            best.subchpt.new <- cbind(bestZ.L$bestchpt + chpts[max.region], 
                                      bestZ.M$bestchpt + newchpt[1], 
                                      bestZ.R$bestchpt + newchpt[2])
        } else { ## The added change-point is a singleton
            newchpt <- newchpt[1]
            cat("Split ", splitnum, ": ", newchpt, ", Z-score = ", max.Z, ".\n", sep="")
            y.r.L <- y.r[(chpts[max.region] + 1):newchpt,]
            y.r.R <- y.r[(newchpt + 1):chpts[max.region+1],]
      
            bestZ.L <- fcompute.max.Z(y.r.L, win,y.var, ALPHA, MIN.SNPs)
            bestZ.R <- fcompute.max.Z(y.r.R, win,y.var, ALPHA, MIN.SNPs)
            best.Z.new <- c(bestZ.L$bestZ, bestZ.R$bestZ)
            best.subchpt.new <- cbind(bestZ.L$bestchpt + chpts[max.region], 
                                      bestZ.R$bestchpt + newchpt)
        }
    
        if (max.region > 1) {
            leftpart   <- best.subchpt[,1:(max.region-1)]
            leftpart.Z <- best.Z[1:(max.region-1)]
        } else {
            leftpart   <- matrix(0, ncol=0, nrow=2)
            leftpart.Z <- matrix(0, ncol=0, nrow=0)
        }
    
        if (max.region+1 <= ncol(best.subchpt)) {
            rightpart   <- best.subchpt[,(max.region+1):ncol(best.subchpt)]
            rightpart.Z <- best.Z[(max.region+1):length(best.Z)]
        } else {
            rightpart   <- matrix(0, ncol=0, nrow=2)
            rightpart.Z <- matrix(0, ncol=0, nrow=0)
        }
        
        chpt.hist[[splitnum]] <- list(chpts = chpts, max.region = max.region,
                                      newchpt = newchpt, max.Z = max.Z, 
                                      carriers = y.r.classify$carriers)
        best.Z <- c(leftpart.Z, best.Z.new, rightpart.Z)    
        best.subchpt <- cbind(leftpart, best.subchpt.new, rightpart)
        chpts <- c(chpts[1:max.region], newchpt, chpts[(max.region+1):length(chpts)])
        
    } ## while loop

    chpt.hist <- chpt.hist[1:splitnum]
    if (length(chpts)>2) { 
        for (i in 2:length(chpts)) {
            yhat[(chpts[i-1]+1):chpts[i],] <- matrix(nrow = chpts[i]-chpts[i-1], ncol=ncol(yhat), 
                                                     data = apply(as.matrix(y[(chpts[i-1]+1):chpts[i],]),2,mean), byrow=TRUE)    
        }    
    } else {
        chpts <- c(0, T)
    }

    feature.matrix  <- yhat[chpts[-1], ]
    feature.regions <- data.frame(start = chpts[-length(chpts)] + 1, end = chpts[-1], row.names=NULL)

    list(chpt.hist=chpt.hist, chpts=chpts, yhat=yhat, 
         feature.matrix=feature.matrix, feature.regions=feature.regions)
}

Try the saasCNV package in your browser

Any scripts or data that you put into this service are public.

saasCNV documentation built on May 1, 2019, 7:49 p.m.