R/pruneByDP.R

pruneByDP <- structure(function(# Exact segmentation of a multivariate signal using dynamic programming.
### Exact segmentation of a multivariate signal using dynamic programming.
                                Y,
### A \code{n*p} signal to be segmented
                                candCP=1:(nrow(Y)-1),
### A vector of candidate change point positions (defaults to 1:(n-1))
                                K=length(candCP),
### The maximum number of change points to find
                                allowNA=TRUE,
### A boolean value specifying whether missing values should be allowed or not.
                                verbose=FALSE
### A \code{logical} value: should extra information be output ? Defaults to \code{FALSE}.
                                ){
    ##details<<This function retrieves the maximum likelihood solution
    ##of the gaussian homoscedastic change model into segments, for
    ##\eqn{K \in {1 \dots length(candCP)}}. The dynamic programming algorithm used
    ##is quadratic in time. For signals containing more than 1000
    ##points, we recommend using a first pass segmentation (see
    ##\code{\link{segmentByRBS}}) to find a smaller number of
    ##candidates, and to run \code{pruneByDP} on these candidates only,
    ##as initially suggested by Gey and Lebarbier (2008). These two
    ##steps can be performed using \code{\link{jointSeg}} for generic
    ##multivariate signals, and using \code{\link{PSSeg}} for copy
    ##number signals from SNP array data.

    ##details<<if \code{allowNA}, the calulation of the cost of removing
    ##candidate breakpoints between i and j for i<j tolerates missing
    ##values. Method \code{!allowNA} is maintained in order to check
    ##consistency with the original dynamic programming in the absence
    ##of NA:s.
    
    ##references<<Bellman, R. (1961). On the approximation of curves by
    ##line segments using dynamic programming. Communications of the
    ##ACM, 4(6), 284.

    ##references<<Gey, S., & Lebarbier, E. (2008). Using CART to Detect
    ##Multiple Change Points in the Mean for Large
    ##Sample. http://hal.archives-ouvertes.fr/hal-00327146/
    
    ##note<<This implementation is derived from the MATLAB code
    ##by Vert and Bleakley: \url{http://cbio.ensmp.fr/GFLseg}.
    
    ##seealso<<\code{\link{jointSeg}}, \code{\link{PSSeg}}

    ## Argument 'Y'
    if (is.null(dim(Y)) || is.data.frame(Y)) {
        if (verbose) {
            print("Coercing 'Y' to a matrix")
        }
        Y <- as.matrix(Y)
    } else if (!is.matrix(Y)){
        stop("Argument 'Y' should be a matrix, vector or data.frame")
    }
    n <- nrow(Y)
    p <- ncol(Y)
    if (K*length(candCP)^2>1e9) {
        cat("Please note that 'pruneByDP' is intended to be run on a not too large set of *candidate* change points.  Running it on too many candidates can be long, as the algorithm is quadratic in the number of candidates\n")
    }
    ## Argument 'candCP'
    if (length(candCP)) {
        rg <- range(candCP);
        stopifnot(rg[1]>=1 && rg[2]<=n)
    }

    ## Compute boundaries of the smallest intervals considered
    b <- as.integer(sort(c(0, candCP, n)))
    k <- length(candCP) + 1 # number of such intervals
    
    ## Compute the k*k matrix J such that J[i,j] for i<=j is the RSE
    ## when intervals i to j are merged
    J <- matrix(numeric(k*k), ncol=k)
    if (allowNA) {
        for (pp in 1:p) {
            Jpp <- getUnivJ(Y[, pp], candCP)
            Jpp[is.nan(Jpp)] <- 0  ## happens when all values are NA between two successive candidates
            ## then the corresponding dimension contributes 0 (ie no cost/gain of merging)
            J <- J + Jpp
        }
    } else {
        s <- rbind(rep(0,p), apply(Y, 2, cumsum))
        v <- c(0, cumsum(rowSums(Y^2)))
        for (ii in 1:k) {
            Istart <- b[ii] +1
            for(jj in ii:k){
                Iend <- b[jj+1]
                J[ii,jj] <- v[Iend+1] - v[Istart] - sum((s[Iend+1,]-s[Istart,])^2)/(Iend-Istart+1)
            }
        } ## for (ii ...
    } ## if (!allowNA)
    
    ## Dynamic programming
    V <- matrix(numeric((K+1)*k), ncol = k)
    ## V[i,j] is the best RSE for segmenting intervals 1 to j
    ## with at most i-1 change points
    bkp <-   matrix(integer(K*k), ncol = k)
    ## With no change points, V[i,j] is juste the precomputed RSE
    ## for intervals 1 to j
    V[1, ] <- J[1, ]
    KK <- seq(length=K)
    ## Then we apply the recursive formula
    for (ki in KK){
        for (jj in (ki+seq(length=k-ki))){
            obj <- V[ki,ki:(jj-1)] + J[(ki+1):jj, jj]
            val <- min(obj)
            ind <- which.min(obj)
            V[ki+1, jj] <- val
            bkp[ki, jj] <- ind + ki-1L
        }
    }
    
    ## Optimal segmentation
    res.bkp <- list()
    for (ki in KK){
        res.bkp[[ki]] <- integer(ki)
        res.bkp[[ki]][ki] <- bkp[ki, k]
        if (ki!=1) {
            for (ii in (ki-seq(length=ki-1))) {
                res.bkp[[ki]][ii] <- bkp[ii, res.bkp[[ki]][ii+1]]
            }
        }
    }
    ## Convert back the index of the interval to the last position before the bkp
    rightlimit <- b[2:length(b)]
    for(ki in KK) {
        res.bkp[[ki]] <- rightlimit[res.bkp[[ki]]]
    }
    ## RSE as a function of number of change-points
    res.rse <- V[, k]
    ## Optimal number of change points
    ##  options(warn=-1) ## warnings can be useful

    ##value<< A list with elements:
    list(bkpList=res.bkp, ##<< A list of vectors of change point positions for the best model with k change points, for k=1, 2, ... K
         rse=res.rse, ##<< A vector of K+1 residual squared errors
         V=V) ##<< V[i,j] is the best RSE for segmenting intervals 1 to j
}, ex=function(){
    p <- 2
    trueK <- 10
    sim <- randomProfile(1e4, trueK, 1, p)
    Y <- sim$profile
    K <- 2*trueK
    res <- segmentByRBS(Y, K)
    resP <- pruneByDP(Y, res$bkp)

    ##   Note that all of the above can be dmethod=="other"one directly using 'jointSeg'
    resJ <- jointSeg(sim$profile, method="RBS", K=K)
    stopifnot(identical(resP$bkpList, resJ$dpBkp))

    ## check consistency when no NA
    resP2 <- pruneByDP(Y, res$bkp, allowNA=FALSE)
    max(abs(resP$rse-resP2$rse))

    plotSeg(Y, list(resP$bkp[[trueK]], sim$bkp), col=1)
})


############################################################################
## HISTORY:
## 2013-04-10
## o Added argument 'K'.
## 2013-01-09
## o Replace all jumps by bkp
## 2012-12-31
## o Added boolean argument 'allowNA' to handle missing values (NA:s).
## 2012-12-30
## o Added example.
## o Added safeguards for extreme case (e.g. candCP==NULL).
## 2012-12-27
## o Renamed argument candidatechangepoints to candCP.
## o Renamed to pruneByDP.
## o Some code and doc cleanups.
## 2012-11-27
## o Moved model selection part to 'modelSelection'.
## 2012-09-13
## o Some code cleanups.
## 2012-08-21
## o Created.
############################################################################

Try the jointseg package in your browser

Any scripts or data that you put into this service are public.

jointseg documentation built on May 2, 2019, 5:20 p.m.