R/binMissingValues.R

#' binMissingValues
#' 
#' Perform binning in order to remove missing values
#' 
#' Some segmentation methods (in particular, GFLars) do not natively handle the 
#' situation when some observations have missing values in one or more 
#' dimensions. In order to avoid dropping the corresponding observations 
#' entirely, \code{binMissingValues} bins the signal values of the last complete
#' observation before a (range of) observations with missing entries using the 
#' \code{\link[matrixStats]{binMeans}} function.
#' 
#' In the specific case when the first row has NA values, the first non-missing 
#' entry is replicated in order to make smoothing possible.  This choice is 
#' arbitrary but some arbitrary choice is needed in that case.
#' 
#' @param Y A numeric matrix
#' @param verbose A \code{logical} value: should extra information be output ? 
#'   Defaults to \code{FALSE}.
#' @note Currently this function is only used by \code{\link{doGFLars}} in order
#'   to make it possible to run GFLars segmentation on SNP array data where most
#'   markers (on the order of 2/3 to 5/6) have missing values, because of 
#'   uninformative or missing allelic ratio signals.
#'   
#'   The \code{binMissingValues} function may be used for other segmentation 
#'   methods suffering from the same limitation.  However, we emphasize that 
#'   handling missing values natively in the segmentation method would be a 
#'   better solution.
#' @importFrom matrixStats binMeans
#' @author Morgane Pierre-Jean and Pierre Neuvial
#' @references Bleakley, K., & Vert, J. P. (2011). The group fused lasso for 
#'   multiple change-point detection. arXiv preprint arXiv:1106.4199.
#'   
#'   Vert, J. P., & Bleakley, K. (2010). Fast detection of multiple 
#'   change-points shared by many signals using group LARS. Advances in Neural 
#'   Information Processing Systems, 23, 2343-2351.
#'   
#' @note Currently this function is only used by \code{\link{doGFLars}} in order
#'   to make it possible to run GFLars segmentation on SNP array data where most
#'   markers (on the order of 2/3 to 5/6) have missing values, because of 
#'   uninformative or missing allelic ratio signals. The \code{binMissingValues}
#'   function may be used for other segmentation methods suffering from the same
#'   limitation.  However, we emphasize that handling missing values natively in
#'   the segmentation method would be a better solution.
#' @examples
#' 
#' sim <- randomProfile(10, 1, 0.1, 3)
#' Y <- sim$profile
#' Y[c(4, 8), 2] <- NA
#' Y[c(7, 8), 3] <- NA
#' 
#' res <- binMissingValues(Y)
#' 
#' Y <- sim$profile
#' Y[1:5, 2] <- NA
#' Yb <- binMissingValues(Y)
#' 
#' Y <- sim$profile
#' Y[3:5, 2] <- NA
#' Yb <- binMissingValues(Y)
#' 
#' @export binMissingValues
binMissingValues <- function(Y, verbose=FALSE) {
    n <- nrow(Y)
    idxs <- 1:n
    nbNA <- rowSums(is.na(Y))
    wNA <- which(nbNA>0)

    if (wNA[1]==1) {
        if (verbose) {
            print("First row has NA values")
        }
        idx <- min(which(nbNA==0))
        y <- colMeans(Y[1:(idx-1),, drop=FALSE], na.rm=TRUE)
        col <- which(is.nan(y))
        y[col] <- Y[idx, col]
        Y[1:(idx-1), ] <-  matrix(y, nrow=idx-1, ncol=ncol(Y), byrow=TRUE)
        Y[2:(idx-1), col] <- NA
    }

    idxsOK <- union(1, idxs[-wNA]) ## indices of non-missing entries (+ '1')
    xOut <- union(idxsOK, n+1)
    
    Ys <- apply(Y, 2, matrixStats::binMeans, x=idxs, bx=xOut)
    attr(Ys, "idxs") <- idxsOK
    Ys  ## A matrix with the same columns as the input argument \code{Y}
    ## and as many rows as non-missing entries in \code{Y}.
}

############################################################################
## HISTORY:
## 2014-05-15
## o Created. 
############################################################################
mpierrejean/jointSeg-1 documentation built on May 23, 2019, 6:29 a.m.