R/CNclusterNcenter.R

Defines functions CNclusterNcenter

Documented in CNclusterNcenter

#' @title TODO
#' 
#' @description TODO
#' 
#' @param segrat a \code{list} containing information for one specific profile:
#' \itemize{
#' \item{\code{seg}}{ a \code{data.frame} with 3 columns:} \itemize{
#'     \item{\code{StartProbe}}{ a \code{numeric} that tabulates the (integer) 
#'     start position of each segment in internal units such as probe numbers.}
#'     \item{\code{EndProbe}}{ a \code{numeric} that tabulates the (integer) 
#'     end position of each segment in internal units such as probe numbers.}
#'     \item{\code{chrom}}{ a \code{numeric} representing the chromosome.}}
#' \item{\code{rat}}{ a \code{numeric} \code{vector} of elements that are 
#'     functions of copy number, most often log ratios of copy number to 
#'     the expected standard value, such as 2 in diploid genomes for a specific
#'     profile. }
#' \item{\code{stream}}{ a \code{character} \code{string} representing the 
#'     profile ID.}
#' \item{\code{sub}}{ a \code{numeric} representing the position of the current
#'     profile ID in a \code{vector} of profiles.}
#' }
#' 
#' @param blsize a single \code{integer} specifying the bootstrap 
#' sampling rate of segment medians to generate input for model-based 
#' clustering. The number of times a segment is sampled is then given by the 
#' (integer) division of the segment length in internal units by \code{blsize}.
#' 
#' @param minJoin a single \code{numeric} value between \code{0} and \code{1} 
#' specifying the degree of overlap above which two clusters will be joined 
#' into one.
#' 
#' @param nTrial a single \code{integer} specifying the number of times 
#' a model-based clustering is attempted for each profile in order to 
#' achieve the highest Bayesian information criterion (BIC).
#' 
#' @param bestBIC a single \code{numeric} value for initializing BIC 
#' maximization. A large negative value is recommended. 
#' 
#' @param modelNames a \code{vector} of \code{character} strings specifying 
#' the names of models to be used in model-based clustering (see package 
#' \code{mclust} for further details).
#' 
#' @param cweight a single \code{numeric} value between \code{0} and \code{1} 
#' specifying the minimal share of the central cluster in each profile.
#' 
#' @param bstimes a single \code{double} value specifying the number of 
#' time the median of each segment is sampled in order to predict the cluster 
#' assignment for the segment.
#' 
#' @param chromRange a \code{integer} vector enumerating chromosomes from 
#' which segments are to be used for initial model-based clustering.
#' 
#' @return a \code{matrix} with 9 columns:
#' \itemize{
#' \item{\code{untitled}}{ a \code{numeric}, the median value of the copy 
#'     number elements forming each segment}
#' \item{\code{untitled}}{ TODO}
#' \item{\code{mediandev}}{ a \code{numeric}, the median function of copy 
#' number relative to its central value for each segment}
#' \item{\code{segerr}}{ a \code{numeric}, the error estimate for the 
#' function of copy number for each segment}
#' \item{\code{centerz}}{ a \code{numeric} between \code{0} and \code{1}, the 
#' probability that the segment is in the central cluster}
#' \item{\code{cpb}}{ TODO}
#' \item{\code{maxz}}{ TODO}
#' \item{\code{maxzmean}}{ TODO}
#' \item{\code{maxzsigma}}{ TODO}
#' }
#'
#' @examples
#'
#' # TODO
#' 
#' ## Create a list that contain information about the segments and bins 
#' ## related to one 
#' 
#' 
#' 
#' @author Alexander Krasnitz, Guoli Sun
#' @importFrom stats median var
#' @importFrom mclust Mclust mclustBIC
#' @keywords internal
CNclusterNcenter <- function(segrat, blsize, minJoin, nTrial, bestBIC,
    modelNames, cweight, bstimes, chromRange) {


    startcol <- "StartProbe"
    endcol   <- "EndProbe"
    chromcol <- "chrom"
    medcol   <- "segmedian"
    madcol   <- "segmad"

    ## Add to the current data.frame a column with the median and a column
    ## with the median absolute deviation for each group of bins forming 
    ## a specific segment
    segrat$seg <- cbind(segrat$seg,
                        t(apply(segrat$seg[,c(startcol, endcol, chromcol),
                        drop=FALSE], 1, smedmad, v=segrat$rat)))
    
    ## Assigning the column names of the updated data.frame
    dimnames(segrat$seg)[[2]] <- c(startcol, endcol, chromcol, medcol, madcol)
    
    ## Only retain the segments for the selected chromosomes
    seguse <- segrat$seg[segrat$seg[, chromcol] %in% chromRange,, drop=FALSE]
    
    ## Identify bins that are associated to the retained segments
    ## Only those bins will be kept for the analysis
    aux <- rep(0, length(segrat$rat))
    aux[seguse[, startcol]] <- 1  ## start position == 1
    aux[seguse[, endcol]]   <- (-1) ## end position == -1
    aux <- cumsum(aux) ## bins to retained are tagged 1 (except end position)
    aux[seguse[, endcol]]   <- 1 ## end position == 1
    
    ratuse <- segrat$rat[aux == 1]
    
    ## Run trials and keep best cluster from all trial (best BIC value)
    for(j in seq_len(nTrial)) {
        aaa <- segsample(seguse, ratuse, blocksize=blsize)
        if (all(unique(aaa[,3]) == 0)) { 
            aaa[,3] <- 1e-10 
        }
        emfit <- Mclust(aaa[,3], maxG=15, modelNames=modelNames)
        if (emfit$bic >= bestBIC) {
            bestaaa <- aaa
            bestem  <- emfit
            bestBIC <- emfit$bic
        }
    }

    ## Join clusters with minimum overlap
    ## The returned object is no more a Mclust object
    newem <- consolidate(bestem, minJoin)
    
    ## Join clusters until the main cluster contain the minimum required 
    ## ratio of data
    newem <- get.center(newem, cweight)
    
    ## Get the median of the central cluster. The central cluster is
    ## the one with the mean closer to zero
    if (length(bestem$parameters$mean) == 1) { 
        profcenter <- median(bestaaa[,3]) 
    } else { 
        profcenter <- weighted.median(bestaaa[,3], newem$z[, newem$center]) 
    }

    ## Calculate the median deviation to the central cluster
    mediandev <- segrat$seg[, medcol] - profcenter
    
    ## Bin sampling
    segs <- segsample(segrat$seg, segrat$rat, times=bstimes)

    if (all(unique(aaa[,3]) == 1e-10)) { 
        segs[segs[,3] == 0, 3] <- 1e-10 
    }

    segzall <- getz(segs[,3], bestem, newem$groups, times=bstimes)
    centerz <- segzall[, newem$center]
    maxz <- segzall[nrow(segzall) * (max.col(segzall) - 1) + 
                            (seq_len(nrow(segzall)))]
    
    maxzcol <- max.col(segzall)
    maxzmean <- newem$mu[maxzcol] - newem$mu[newem$center]
    maxzsigma <- sqrt(newem$sigmasq[maxzcol])
    cpb <- centerprob(segs[,3], bestem, newem$groups, times=bstimes, 
                newem$center)
    w <- t(matrix(nrow=bstimes, data=segs[,3]))
    segerr <- sqrt(apply(w, 1, var, na.rm=TRUE))
    
    return(cbind(segrat$seg[, medcol], segrat$seg[,madcol], mediandev, segerr, 
                    centerz, cpb, maxz, maxzmean, maxzsigma))
}
KrasnitzLab/CNprep documentation built on May 28, 2022, 8:32 p.m.