R/collapseBins.R

Defines functions collapseBins

Documented in collapseBins

#' Collapse consecutive bins
#'
#' The function will collapse consecutive bins which have, for example, the same combinatorial state.
#'
#' The following tables illustrate the principle of the collapsing:
#'
#' Input data:
#' \tabular{rrrrrr}{
#' seqnames \tab start \tab end \tab column2collapseBy \tab moreColumns \tab columns2sumUp \cr
#' chr1     \tab     0 \tab 199 \tab                 2 \tab        1 10 \tab           1 3 \cr
#' chr1     \tab   200 \tab 399 \tab                 2 \tab        2 11 \tab           0 3 \cr
#' chr1     \tab   400 \tab 599 \tab                 2 \tab        3 12 \tab           1 3 \cr
#' chr1     \tab   600 \tab 799 \tab                 1 \tab        4 13 \tab           0 3 \cr
#' chr1     \tab   800 \tab 999 \tab                 1 \tab        5 14 \tab           1 3 \cr
#' }
#' Output data:
#' \tabular{rrrrrr}{
#' seqnames \tab start \tab end \tab column2collapseBy \tab moreColumns \tab columns2sumUp \cr
#' chr1     \tab     0 \tab 599 \tab                 2 \tab        1 10 \tab           2 9 \cr
#' chr1     \tab   600 \tab 999 \tab                 1 \tab        4 13 \tab           1 6 \cr
#' }
#' 
#' @param data A data.frame containing the genomic coordinates in the first three columns.
#' @param column2collapseBy The number of the column which will be used to collapse all other inputs. If a set of consecutive bins has the same value in this column, they will be aggregated into one bin with adjusted genomic coordinates.
#' @param columns2sumUp Column numbers that will be summed during the aggregation process.
#' @param columns2average Column numbers that will be averaged during the aggregation process.
#' @param columns2getMax Column numbers where the maximum will be chosen during the aggregation process.
#' @param columns2drop Column numbers that will be dropped after the aggregation process.
#' @return A data.frame.
#' @author Aaron Taudt
#' @export
#' @examples
#'## Load example data
#'## Get an example multiHMM
#'file <- system.file("data","multivariate_mode-combinatorial_condition-SHR.RData",
#'                     package="chromstaR")
#'model <- get(load(file))
#'df <- as.data.frame(model$bins)
#'shortdf <- collapseBins(df, column2collapseBy='state', columns2sumUp='width', columns2average=6:9)
#'
collapseBins = function(data, column2collapseBy=NULL, columns2sumUp=NULL, columns2average=NULL, columns2getMax=NULL, columns2drop=NULL) {

    ## Name to index
    if (is.character(column2collapseBy)) {
        column2collapseBy <- which(column2collapseBy == names(data))
    }
    if (is.character(columns2sumUp)) {
        columns2sumUp <- unlist(lapply(columns2sumUp, function(x) { which(x == names(data)) }))
    }
    if (is.character(columns2average)) {
        columns2average <- unlist(lapply(columns2average, function(x) { which(x == names(data)) }))
    }
    if (is.character(columns2getMax)) {
        columns2getMax <- unlist(lapply(columns2getMax, function(x) { which(x == names(data)) }))
    }
    if (is.character(columns2drop)) {
        columns2drop <- unlist(lapply(columns2drop, function(x) { which(x == names(data)) }))
    }
    ## Indices
    ind_coords <- 1:3
    ind_morecols <- setdiff(1:ncol(data), c(ind_coords, columns2sumUp, columns2average, columns2getMax, columns2drop))
    ind_sumcols <- columns2sumUp
    ind_meancols <- columns2average
    ind_maxcols <- columns2getMax

    ## Make the comparison vector
    ptm <- startTimedMessage('Making comparison vector ...')
    if (is.null(column2collapseBy)) {
        c <- data$start
        cShift1 <- rep(NA,length(c))
        cShift1[2:length(cShift1)] <- data$end[-length(c)] + 1
    } else {
        if (is(data[,column2collapseBy], "factor")) {
            c <- as.integer(data[,column2collapseBy])
        } else {
            c <- data[,column2collapseBy]
        }
        cShift1 <- rep(NA,length(c))
        cShift1[-1] <- c[-length(c)]
    }
    compare_custom <- c != cShift1
    ## Make the comparison vector to separate chromosomes
    c <- as.integer(data[,1])
    cShift1 <- rep(NA,length(c))
    cShift1[-1] <- c[-length(c)]
    compare_chrom <- c != cShift1
    ## Combine the vectors
    compare <- compare_custom | compare_chrom
    compare[1] <- TRUE
    numcollapsedbins <- length(which(compare==TRUE))
    numbins <- nrow(data)
    stopTimedMessage(ptm)
    if (any(is.na(compare))) {
        stop("NAs in vector 'compare'")
    }

    ## Select the collapsed rows
    ptm <- startTimedMessage('Selecting rows ...')
    collapsed.bins <- list()
    collapsed.bins[[names(data)[1]]] <- data[which(compare),1] #which to remove NAs which shouldn't be there in the first place
    collapsed.bins[[names(data)[2]]] <- data[which(compare),2]
    collapsed.bins[[names(data)[3]]] <- data[c((which(compare)-1)[-1],numbins), 3]
    if (length(ind_morecols)==1) {
        collapsed.bins[[names(data)[ind_morecols]]] <- data[which(compare), ind_morecols]
    } else if (length(ind_morecols)>1) {
        lcb <- length(collapsed.bins)
        lmc <- length(ind_morecols)
        collapsed.bins[(lcb+1):(lcb+lmc)] <- data[which(compare), ind_morecols]
        names(collapsed.bins)[(lcb+1):(lcb+lmc)] <- names(data)[ind_morecols]
    }
    stopTimedMessage(ptm)

    ## Sum up columns
    xfuns <- list(sum, mean, max)
    xstrings <- list('sum', 'mean', 'max')
    columns2xs <- list(columns2sumUp, columns2average, columns2getMax)
    inds_xcols <- list(ind_sumcols, ind_meancols, ind_maxcols)
    for (ix in 1:length(xfuns)) {
        xfun <- xfuns[[ix]]
        xstring <- xstrings[[ix]]
        columns2x <- columns2xs[[ix]]
        ind_xcols <- inds_xcols[[ix]]
        if (!is.null(columns2x)) {
            ptm <- startTimedMessage('Calculating ',xstring,' ...')
            xcols <- as.matrix(data[,columns2x])
            collapsed.xcols <- matrix(NA, nrow=numcollapsedbins, ncol=length(columns2x))
            icount <- 1
            i1_lasttrue <- 1
            for (i1 in 1:length(compare)) {
                if (compare[i1]==TRUE) {
                    if (length(columns2x)==1) {
                        collapsed.xcols[icount-1] <- xfun(xcols[i1_lasttrue:(i1-1),])
                    } else if (length(columns2x) > 1) {
                        if (i1_lasttrue==i1-1 | i1==1) {
                            collapsed.xcols[icount-1,] <- as.numeric(xcols[i1_lasttrue,])
                        } else {
                            collapsed.xcols[icount-1,] <- apply(xcols[i1_lasttrue:(i1-1),],2,xfun)
                        }
                    }
                    icount <- icount+1
                    i1_lasttrue <- i1
                }
            }
            i1 = i1+1
            if (length(columns2x)==1) {
                collapsed.xcols[icount-1] <- xfun(xcols[i1_lasttrue:(i1-1),])
            } else if (length(columns2x) > 1) {
                if (i1_lasttrue==i1-1 | i1==1) {
                    collapsed.xcols[icount-1,] <- as.numeric(xcols[i1_lasttrue,])
                } else {
                    collapsed.xcols[icount-1,] <- apply(xcols[i1_lasttrue:(i1-1),],2,xfun)
                }
            }
            if (length(ind_xcols) > 0) {
                lcb <- length(collapsed.bins)
                lsc <- length(ind_xcols)
                collapsed.bins[(lcb+1):(lcb+lsc)] <- as.data.frame(collapsed.xcols)
                names(collapsed.bins)[(lcb+1):(lcb+lsc)] <- paste(xstring, names(data)[ind_xcols], sep='.')
            }
            stopTimedMessage(ptm)
        }
    }

    return(as.data.frame(collapsed.bins))

}
ataudt/chromstaR documentation built on Dec. 26, 2021, 12:07 a.m.