# R/collapseBins.R In AneuFinder: Analysis of Copy Number Variation in Single-Cell-Sequencing Data

#### Documented in collapseBins

```r
#' Collapse consecutive bins
#'
#' The function will collapse consecutive bins which have, for example, the same combinatorial state.
#'
#' The following tables illustrate the principle of the collapsing:
#'
#' Input data:
#' \tabular{rrrrrr}{
#' seqnames \tab start \tab end \tab column2collapseBy \tab moreColumns \tab columns2sumUp \cr
#' chr1     \tab     0 \tab 199 \tab                 2 \tab        1 10 \tab           1 3 \cr
#' chr1     \tab   200 \tab 399 \tab                 2 \tab        2 11 \tab           0 3 \cr
#' chr1     \tab   400 \tab 599 \tab                 2 \tab        3 12 \tab           1 3 \cr
#' chr1     \tab   600 \tab 799 \tab                 1 \tab        4 13 \tab           0 3 \cr
#' chr1     \tab   800 \tab 999 \tab                 1 \tab        5 14 \tab           1 3 \cr
#' }
#' Output data:
#' \tabular{rrrrrr}{
#' seqnames \tab start \tab end \tab column2collapseBy \tab moreColumns \tab columns2sumUp \cr
#' chr1     \tab     0 \tab 599 \tab                 2 \tab        1 10 \tab           2 9 \cr
#' chr1     \tab   600 \tab 999 \tab                 1 \tab        4 13 \tab           1 6 \cr
#' }
#'
#' @param data A data.frame containing the genomic coordinates in the first three columns.
#' @param column2collapseBy The number of the column which will be used to collapse all other inputs. If a set of consecutive bins has the same value in this column, they will be aggregated into one bin with adjusted genomic coordinates. If \code{NULL}, directly adjacent bins will be collapsed.
#' @param columns2sumUp Column numbers that will be summed during the aggregation process.
#' @param columns2average Column numbers that will be averaged during the aggregation process.
#' @param columns2getMax Column numbers where the maximum will be chosen during the aggregation process.
#' @param columns2drop Column numbers that will be dropped after the aggregation process.
#' @return A data.frame.
#' @author Aaron Taudt
#' @export
#' @examples
#'## Get an example BED file with single-cell-sequencing reads
#'bedfile <- system.file("extdata", "KK150311_VI_07.bam.bed.gz", package="AneuFinderData")
#'## Bin the BED file into bin size 1Mb
#'binned <- binReads(bedfile, assembly='mm10', binsize=1e6,
#'                   chromosomes=c(1:19,'X','Y'))
#'## Collapse the bins by chromosome and get average, summed and maximum read count
#'df <- as.data.frame(binned[[1]])
#'# Remove one bin for illustration purposes
#'df <- df[-3,]
#'collapseBins(df, column2collapseBy='seqnames', columns2sumUp=c('width','counts'),
#'                        columns2average='counts', columns2getMax='counts',
#'                        columns2drop=c('mcounts','pcounts'))
#'collapseBins(df, column2collapseBy=NULL, columns2sumUp=c('width','counts'),
#'                        columns2average='counts', columns2getMax='counts',
#'                        columns2drop=c('mcounts','pcounts'))
#'
collapseBins <- function(data, column2collapseBy=NULL, columns2sumUp=NULL, columns2average=NULL, columns2getMax=NULL, columns2drop=NULL) {

    ## Collapses runs of consecutive rows of 'data' into one row each. A new run
    ## starts at the first row, wherever the value in column 1 changes, and
    ## wherever the collapse criterion changes (see "comparison vector" below).
    ## The result holds column 1 and column 2 of the first row of each run,
    ## column 3 of the last row of each run, the untouched columns taken from the
    ## first row, and one 'sum.', 'mean.' and 'max.' column per requested column.

    ## Name to index
    ## Names without a match contribute no index, so they are silently skipped.
    name2index <- function(cols) {
        if (is.character(cols)) {
            cols <- unlist(lapply(cols, function(x) { which(x == names(data)) }))
        }
        cols
    }
    if (!is.null(column2collapseBy)) {
        column2collapseBy <- name2index(column2collapseBy)
        ## An unmatched or ambiguous collapse column cannot give a meaningful
        ## comparison vector, so fail early with a clear message.
        if (length(column2collapseBy) != 1) {
            stop("'column2collapseBy' must identify exactly one column of 'data'")
        }
    }
    columns2sumUp <- name2index(columns2sumUp)
    columns2average <- name2index(columns2average)
    columns2getMax <- name2index(columns2getMax)
    columns2drop <- name2index(columns2drop)

    ## Indices
    ind_coords <- 1:3
    ind_morecols <- setdiff(seq_len(ncol(data)), c(ind_coords, columns2sumUp, columns2average, columns2getMax, columns2drop))
    numbins <- nrow(data)

    ## Make the comparison vector
    ptm <- startTimedMessage('Making comparison vector ...')
    ## Row i (i >= 2) is compared against row i-1; both index vectors are empty
    ## when there are fewer than two rows.
    ind_previous <- seq_len(max(numbins - 1L, 0L))
    ind_current <- ind_previous + 1L
    if (is.null(column2collapseBy)) {
        ## Directly adjacent bins: a run continues while start == previous end + 1.
        ## Columns named 'start'/'end' are used when present, otherwise columns 2 and 3.
        starts <- if ('start' %in% names(data)) data[['start']] else data[[2]]
        ends <- if ('end' %in% names(data)) data[['end']] else data[[3]]
        changed_custom <- starts[ind_current] != ends[ind_previous] + 1
    } else {
        collapse_key <- data[[column2collapseBy]]
        if (is.factor(collapse_key)) {
            collapse_key <- as.integer(collapse_key)
        }
        changed_custom <- collapse_key[ind_current] != collapse_key[ind_previous]
    }
    ## Make the comparison vector to separate chromosomes
    ## Non-factor values are compared as strings, so character seqnames such as
    ## 'chr1' no longer become NA through integer conversion.
    chrom <- data[[1]]
    chrom <- if (is.factor(chrom)) as.integer(chrom) else as.character(chrom)
    changed_chrom <- chrom[ind_current] != chrom[ind_previous]
    ## Combine the vectors; the first row always opens a run
    compare <- c(rep(TRUE, min(numbins, 1L)), changed_custom | changed_chrom)
    stopTimedMessage(ptm)
    if (any(is.na(compare))) {
        stop("NAs in vector 'compare'")
    }
    numcollapsedbins <- sum(compare)

    ## Select the collapsed rows
    ptm <- startTimedMessage('Selecting rows ...')
    ind_first <- which(compare)
    ## Each run ends one row before the next run starts; the last run ends at the last row
    ind_last <- c(ind_first[-1] - 1L, if (numbins > 0) numbins)
    collapsed.bins <- c(
        list(data[[1]][ind_first], data[[2]][ind_first], data[[3]][ind_last]),
        lapply(ind_morecols, function(j) { data[[j]][ind_first] })
    )
    names(collapsed.bins) <- names(data)[c(ind_coords, ind_morecols)]
    stopTimedMessage(ptm)

    ## Sum up, average and take the maximum of the requested columns
    run_id <- cumsum(compare)
    aggregations <- list(
        list(fun=sum, label='sum', cols=columns2sumUp),
        list(fun=mean, label='mean', cols=columns2average),
        list(fun=max, label='max', cols=columns2getMax)
    )
    for (aggregation in aggregations) {
        ind_xcols <- aggregation$cols
        if (is.null(ind_xcols)) {
            next
        }
        ptm <- startTimedMessage('Calculating ',aggregation$label,' ...')
        if (length(ind_xcols) > 0) {
            xcols <- as.matrix(data[, ind_xcols, drop=FALSE])
            collapsed.xcols <- matrix(NA, nrow=numcollapsedbins, ncol=ncol(xcols))
            if (numcollapsedbins > 0) {
                for (icol in seq_len(ncol(xcols))) {
                    ## split() orders the runs by increasing run id, i.e. in row order
                    per_run <- lapply(split(xcols[, icol], run_id), aggregation$fun)
                    collapsed.xcols[, icol] <- unlist(per_run, use.names=FALSE)
                }
            }
            ## Append by position so that the column order of the result is
            ## coordinates, untouched columns, sums, means, maxima.
            ind_new <- length(collapsed.bins) + seq_along(ind_xcols)
            collapsed.bins[ind_new] <- as.data.frame(collapsed.xcols)
            names(collapsed.bins)[ind_new] <- paste(aggregation$label, names(data)[ind_xcols], sep='.')
        }
        stopTimedMessage(ptm)
    }

    as.data.frame(collapsed.bins)

}
```

## Try the AneuFinder package in your browser

Any scripts or data that you put into this service are public.

AneuFinder documentation built on May 2, 2018, 2:49 a.m.